csv-detective 0.7.5.dev1180__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_fields/temp/date/__init__.py +5 -1
  8. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  35. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  43. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  44. csv_detective/detect_labels/other/email/__init__.py +1 -1
  45. csv_detective/detect_labels/other/float/__init__.py +1 -1
  46. csv_detective/detect_labels/other/int/__init__.py +1 -1
  47. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  48. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  49. csv_detective/detect_labels/other/url/__init__.py +1 -1
  50. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  54. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  55. csv_detective/detection/columns.py +89 -0
  56. csv_detective/detection/encoding.py +27 -0
  57. csv_detective/detection/engine.py +46 -0
  58. csv_detective/detection/headers.py +32 -0
  59. csv_detective/detection/rows.py +18 -0
  60. csv_detective/detection/separator.py +44 -0
  61. csv_detective/detection/variables.py +98 -0
  62. csv_detective/explore_csv.py +40 -110
  63. csv_detective/output/dataframe.py +55 -0
  64. csv_detective/{create_example.py → output/example.py} +10 -9
  65. csv_detective/output/profile.py +87 -0
  66. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  67. csv_detective/output/utils.py +51 -0
  68. csv_detective/parsing/columns.py +141 -0
  69. csv_detective/parsing/compression.py +11 -0
  70. csv_detective/parsing/csv.py +55 -0
  71. csv_detective/parsing/excel.py +169 -0
  72. csv_detective/parsing/load.py +97 -0
  73. csv_detective/utils.py +10 -236
  74. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +3 -0
  75. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +3 -2
  76. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +85 -71
  77. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +1 -1
  78. tests/test_fields.py +7 -6
  79. tests/test_file.py +56 -57
  80. csv_detective/detection.py +0 -618
  81. /csv_detective/{process_text.py → parsing/text.py} +0 -0
  82. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  83. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
  84. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
  85. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info/licenses}/LICENSE.AGPL.txt +0 -0
  86. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/detection.py (deleted)
@@ -1,618 +0,0 @@
-from typing import TextIO, Optional, Union
-from collections import defaultdict
-import pandas as pd
-import math
-import csv
-from cchardet import detect
-from ast import literal_eval
-import logging
-from time import time
-import openpyxl
-import xlrd
-import requests
-from io import BytesIO
-import magic
-from csv_detective.utils import display_logs_depending_process_time
-from csv_detective.detect_fields.other.float import float_casting
-
-logging.basicConfig(level=logging.INFO)
-
-NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
-OLD_EXCEL_EXT = [".xls"]
-OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
-XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
-engine_to_file = {
-    "openpyxl": "Excel",
-    "xlrd": "old Excel",
-    "odf": "OpenOffice"
-}
-
-
-def is_url(csv_file_path: str) -> bool:
-    # could be more sophisticated if needed
-    return csv_file_path.startswith('http')
-
-
-def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
-    """
-    Detects whether a column contains continuous variables. We consider a continuous column
-    one that contains a considerable amount of float values.
-    We removed the integers as we then end up with postal codes, insee codes, and all sort
-    of codes and types.
-    This is not optimal but it will do for now.
-    """
-    # if we need this again in the future, could be first based on columns detected as int/float to cut time
-
-    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
-        count = serie.value_counts().to_dict()
-        total_nb = len(serie)
-        if float in count:
-            nb_floats = count[float]
-        else:
-            return False
-        if nb_floats / total_nb >= continuous_th:
-            return True
-        else:
-            return False
-
-    def parses_to_integer(value: str):
-        try:
-            value = value.replace(",", ".")
-            value = literal_eval(value)
-            return type(value)
-        # flake8: noqa
-        except:
-            return False
-
-    if verbose:
-        start = time()
-        logging.info("Detecting continuous columns")
-    res = table.apply(
-        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
-    )
-    if verbose:
-        display_logs_depending_process_time(
-            f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
-            time() - start,
-        )
-    return res.index[res]
-
-
-def detetect_categorical_variable(
-    table: pd.DataFrame,
-    threshold_pct_categorical: float = 0.05,
-    max_number_categorical_values: int = 25,
-    verbose: bool = False,
-):
-    """
-    Heuristically detects whether a table (df) contains categorical values according to
-    the number of unique values contained.
-    As the idea of detecting categorical values is to then try to learn models to predict
-    them, we limit categorical values to at most 25 different modes or at most 5% disparity.
-    Postal code, insee code, code region and so on, may be thus not considered categorical values.
-    :param table:
-    :param threshold_pct_categorical:
-    :param max_number_categorical_values:
-    :return:
-    """
-
-    def abs_number_different_values(column_values: pd.Series):
-        return column_values.nunique()
-
-    def rel_number_different_values(column_values: pd.Series):
-        return column_values.nunique() / len(column_values)
-
-    def detect_categorical(column_values: pd.Series):
-        abs_unique_values = abs_number_different_values(column_values)
-        rel_unique_values = rel_number_different_values(column_values)
-        if (
-            abs_unique_values <= max_number_categorical_values
-            or rel_unique_values <= threshold_pct_categorical
-        ):
-            return True
-        return False
-
-    if verbose:
-        start = time()
-        logging.info("Detecting categorical columns")
-    res = table.apply(lambda serie: detect_categorical(serie))
-    if verbose:
-        display_logs_depending_process_time(
-            f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
-            time() - start,
-        )
-    return res.index[res], res
-
-
-def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
-    if verbose:
-        start = time()
-    mapping = {
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
-        'application/vnd.ms-excel': 'xlrd',
-        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
-        # all these files could be recognized as zip, may need to check all cases then
-        'application/zip': 'openpyxl',
-    }
-    # if none of the above, we move forwards with the csv process
-    if is_url(csv_file_path):
-        remote_content = requests.get(csv_file_path).content
-        engine = mapping.get(magic.from_buffer(remote_content, mime=True))
-    else:
-        engine = mapping.get(magic.from_file(csv_file_path, mime=True))
-    if verbose:
-        display_logs_depending_process_time(
-            f'File has no extension, detected {engine_to_file.get(engine, "csv")}',
-            time() - start,
-        )
-    return engine
-
-
-def detect_separator(file: TextIO, verbose: bool = False) -> str:
-    """Detects csv separator"""
-    # TODO: add a robust detection:
-    # si on a un point virgule comme texte et \t comme séparateur, on renvoie
-    # pour l'instant un point virgule
-    if verbose:
-        start = time()
-        logging.info("Detecting separator")
-    file.seek(0)
-    header = file.readline()
-    possible_separators = [";", ",", "|", "\t"]
-    sep_count = dict()
-    for sep in possible_separators:
-        sep_count[sep] = header.count(sep)
-    sep = max(sep_count, key=sep_count.get)
-    # testing that the first 10 (arbitrary) rows all have the same number of fields
-    # as the header. Prevents downstream unwanted behaviour where pandas can load
-    # the file (in a weird way) but the process is irrelevant.
-    file.seek(0)
-    reader = csv.reader(file, delimiter=sep)
-    rows_lengths = set()
-    for idx, row in enumerate(reader):
-        if idx > 10:
-            break
-        rows_lengths.add(len(row))
-    if len(rows_lengths) > 1:
-        raise ValueError('Number of columns is not even across the first 10 rows.')
-
-    if verbose:
-        display_logs_depending_process_time(
-            f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
-            time() - start,
-        )
-    return sep
-
-
-def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
-    """
-    Detects file encoding using faust-cchardet (forked from the original cchardet)
-    """
-    if verbose:
-        start = time()
-        logging.info("Detecting encoding")
-    if is_url(csv_file_path):
-        r = requests.get(csv_file_path)
-        r.raise_for_status()
-        binary_file = BytesIO(r.content)
-    else:
-        binary_file = open(csv_file_path, mode="rb")
-    encoding_dict = detect(binary_file.read())
-    if not encoding_dict["encoding"]:
-        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
-    if verbose:
-        message = f'Detected encoding: "{encoding_dict["encoding"]}"'
-        message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
-        display_logs_depending_process_time(
-            message,
-            time() - start,
-        )
-    return encoding_dict['encoding']
-
-
-def parse_table(
-    the_file: TextIO,
-    encoding: str,
-    sep: str,
-    num_rows: int,
-    skiprows: int,
-    random_state: int = 42,
-    verbose : bool = False,
-) -> tuple[pd.DataFrame, int, int]:
-    if verbose:
-        start = time()
-        logging.info("Parsing table")
-    table = None
-
-    if not isinstance(the_file, str):
-        the_file.seek(0)
-
-    total_lines = None
-    for encoding in [encoding, "ISO-8859-1", "utf-8"]:
-        if encoding is None:
-            continue
-
-        if "ISO-8859" in encoding:
-            encoding = "ISO-8859-1"
-        try:
-            table = pd.read_csv(
-                the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
-            )
-            total_lines = len(table)
-            nb_duplicates = len(table.loc[table.duplicated()])
-            if num_rows > 0:
-                num_rows = min(num_rows - 1, total_lines)
-                table = table.sample(num_rows, random_state=random_state)
-            # else : table is unchanged
-            break
-        except TypeError:
-            print("Trying encoding : {encoding}".format(encoding=encoding))
-
-    if table is None:
-        raise ValueError("Could not load file")
-    if verbose:
-        display_logs_depending_process_time(
-            f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start,
-        )
-    return table, total_lines, nb_duplicates
-
-
-def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to detect_headers for csv files, determines how many rows to skip
-    to end up with the header at the right place"""
-    idx = 0
-    if all([str(c).startswith('Unnamed:') for c in table.columns]):
-        # there is on offset between the index in the file (idx here)
-        # and the index in the dataframe, because of the header
-        idx = 1
-        while table.iloc[idx - 1].isna().all():
-            idx += 1
-        cols = table.iloc[idx - 1]
-        table = table.iloc[idx:]
-        table.columns = cols.to_list()
-    # +1 here because the headers should count as a row
-    return table, idx
-
-
-def parse_excel(
-    csv_file_path: str,
-    num_rows: int = -1,
-    engine: Optional[str] = None,
-    sheet_name: Optional[str] = None,
-    random_state: int = 42,
-    verbose : bool = False,
-) -> tuple[pd.DataFrame, int, int, str, str, int]:
-    """"Excel-like parsing is really slow, could be a good improvement for future development"""
-    if verbose:
-        start = time()
-    no_sheet_specified = sheet_name is None
-
-    if (
-        engine in ['openpyxl', 'xlrd'] or
-        any([csv_file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
-    ):
-        remote_content = None
-        if is_url(csv_file_path):
-            r = requests.get(csv_file_path)
-            r.raise_for_status()
-            remote_content = BytesIO(r.content)
-        if not engine:
-            if any([csv_file_path.endswith(k) for k in NEW_EXCEL_EXT]):
-                engine = "openpyxl"
-            else:
-                engine = "xlrd"
-        if sheet_name is None:
-            if verbose:
-                display_logs_depending_process_time(
-                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                    time() - start,
-                )
-            try:
-                if engine == "openpyxl":
-                    # openpyxl doesn't want to open files that don't have a valid extension
-                    # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
-                    # if the file is remote, we have a remote content anyway so it's fine
-                    if not remote_content and '.' not in csv_file_path.split('/')[-1]:
-                        with open(csv_file_path, 'rb') as f:
-                            remote_content = BytesIO(f.read())
-                    # faster than loading all sheets
-                    wb = openpyxl.load_workbook(remote_content or csv_file_path, read_only=True)
-                    try:
-                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
-                    except TypeError:
-                        # sometimes read_only can't get the info, so we have to open the file for real
-                        # this takes more time but it's for a limited number of files
-                        # and it's this or nothing
-                        wb = openpyxl.load_workbook(remote_content or csv_file_path)
-                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
-                else:
-                    if remote_content:
-                        wb = xlrd.open_workbook(file_contents=remote_content.read())
-                    else:
-                        wb = xlrd.open_workbook(csv_file_path)
-                    sizes = {s.name: s.nrows * s.ncols for s in wb.sheets()}
-                sheet_name = max(sizes, key=sizes.get)
-            except xlrd.biffh.XLRDError:
-                # sometimes a xls file is recognized as ods
-                if verbose:
-                    display_logs_depending_process_time(
-                        'Could not read file with classic xls reader, trying with ODS',
-                        time() - start,
-                    )
-                engine = "odf"
-
-    if engine == "odf" or any([csv_file_path.endswith(k) for k in OPEN_OFFICE_EXT]):
-        # for ODS files, no way to get sheets' sizes without
-        # loading the file one way or another (pandas or pure odfpy)
-        # so all in one
-        engine = "odf"
-        if sheet_name is None:
-            if verbose:
-                display_logs_depending_process_time(
-                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                    time() - start,
-                )
-            tables = pd.read_excel(
-                csv_file_path,
-                engine="odf",
-                sheet_name=None,
-                dtype="unicode",
-            )
-            sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
-            sheet_name = max(sizes, key=sizes.get)
-            if verbose:
-                display_logs_depending_process_time(
-                    f'Going forwards with sheet "{sheet_name}"',
-                    time() - start,
-                )
-            table = tables[sheet_name]
-        else:
-            if verbose:
-                display_logs_depending_process_time(
-                    f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                    time() - start,
-                )
-            table = pd.read_excel(
-                csv_file_path,
-                engine="odf",
-                sheet_name=sheet_name,
-                dtype="unicode",
-            )
-        table, header_row_idx = remove_empty_first_rows(table)
-        total_lines = len(table)
-        nb_duplicates = len(table.loc[table.duplicated()])
-        if num_rows > 0:
-            num_rows = min(num_rows - 1, total_lines)
-            table = table.sample(num_rows, random_state=random_state)
-        if verbose:
-            display_logs_depending_process_time(
-                f'Table parsed successfully in {round(time() - start, 3)}s',
-                time() - start,
-            )
-        return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
-
-    # so here we end up with (old and new) excel files only
-    if verbose:
-        if no_sheet_specified:
-            display_logs_depending_process_time(
-                f'Going forwards with sheet "{sheet_name}"',
-                time() - start,
-            )
-        else:
-            display_logs_depending_process_time(
-                f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                time() - start,
-            )
-    table = pd.read_excel(
-        csv_file_path,
-        engine=engine,
-        sheet_name=sheet_name,
-        dtype="unicode",
-    )
-    table, header_row_idx = remove_empty_first_rows(table)
-    total_lines = len(table)
-    nb_duplicates = len(table.loc[table.duplicated()])
-    if num_rows > 0:
-        num_rows = min(num_rows - 1, total_lines)
-        table = table.sample(num_rows, random_state=random_state)
-    if verbose:
-        display_logs_depending_process_time(
-            f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start,
-        )
-    return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
-
-
-def prevent_nan(value: float) -> Optional[float]:
-    if math.isnan(value):
-        return None
-    return value
-
-
-def create_profile(
-    table: pd.DataFrame,
-    dict_cols_fields: dict,
-    num_rows: int,
-    limited_output: bool = True,
-    verbose: bool = False,
-) -> dict:
-    if verbose:
-        start = time()
-        logging.info("Creating profile")
-    map_python_types = {
-        "string": str,
-        "int": float,
-        "float": float,
-    }
-
-    if num_rows > 0:
-        raise ValueError("To create profiles num_rows has to be set to -1")
-    safe_table = table.copy()
-    if not limited_output:
-        dict_cols_fields = {
-            k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
-            for k, v in dict_cols_fields.items()
-        }
-    dtypes = {
-        k: map_python_types.get(v["python_type"], str)
-        for k, v in dict_cols_fields.items()
-    }
-    for c in safe_table.columns:
-        if dtypes[c] == float:
-            safe_table[c] = safe_table[c].apply(
-                lambda s: float_casting(s) if isinstance(s, str) else s
-            )
-    profile = defaultdict(dict)
-    for c in safe_table.columns:
-        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
-            float,
-            int,
-        ]:
-            profile[c].update(
-                min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].min()
-                )),
-                max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].max()
-                )),
-                mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].mean()
-                )),
-                std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].std()
-                )),
-            )
-        tops_bruts = safe_table[safe_table[c].notna()][c] \
-            .value_counts(dropna=True) \
-            .reset_index() \
-            .iloc[:10] \
-            .to_dict(orient="records")
-        tops = []
-        for tb in tops_bruts:
-            tops.append({
-                "count": tb["count"],
-                "value": tb[c],
-            })
-        profile[c].update(
-            tops=tops,
-            nb_distinct=safe_table[c].nunique(),
-            nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
-        )
-    if verbose:
-        display_logs_depending_process_time(
-            f"Created profile in {round(time() - start, 3)}s",
-            time() - start,
-        )
-    return profile
-
-
-def detect_extra_columns(file: TextIO, sep: str):
-    """regarde s'il y a des colonnes en trop
-    Attention, file ne doit pas avoir de ligne vide"""
-    file.seek(0)
-    retour = False
-    nb_useless_col = 99999
-
-    for i in range(10):
-        line = file.readline()
-        # regarde si on a un retour
-        if retour:
-            assert line[-1] == "\n"
-        if line[-1] == "\n":
-            retour = True
-
-        # regarde le nombre de derniere colonne inutile
-        deb = 0 + retour
-        line = line[::-1][deb:]
-        k = 0
-        for sign in line:
-            if sign != sep:
-                break
-            k += 1
-        if k == 0:
-            return 0, retour
-        nb_useless_col = min(k, nb_useless_col)
-    return nb_useless_col, retour
-
-
-def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
-    """Tests 10 first rows for possible header (header not in 1st line)"""
-    if verbose:
-        start = time()
-        logging.info("Detecting headers")
-    file.seek(0)
-    for i in range(10):
-        header = file.readline()
-        position = file.tell()
-        chaine = [c for c in header.replace("\n", "").split(sep) if c]
-        if chaine[-1] not in ["", "\n"] and all(
-            [mot not in ["", "\n"] for mot in chaine[1:-1]]
-        ):
-            next_row = file.readline()
-            file.seek(position)
-            if header != next_row:
-                if verbose:
-                    display_logs_depending_process_time(
-                        f'Detected headers in {round(time() - start, 3)}s',
-                        time() - start,
-                    )
-                return i, chaine
-    if verbose:
-        logging.info(f'No header detected')
-    return 0, None
-
-
-def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False) -> int:
-    """Tests first 10 lines to see if there are empty heading columns"""
-    if verbose:
-        start = time()
-        logging.info("Detecting heading columns")
-    file.seek(0)
-    return_int = float("Inf")
-    for i in range(10):
-        line = file.readline()
-        return_int = min(return_int, len(line) - len(line.strip(sep)))
-        if return_int == 0:
-            if verbose:
-                display_logs_depending_process_time(
-                    f'No heading column detected in {round(time() - start, 3)}s',
-                    time() - start,
-                )
-            return 0
-    if verbose:
-        display_logs_depending_process_time(
-            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
-            time() - start,
-        )
-    return return_int
-
-
-def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False) -> int:
-    """Tests first 10 lines to see if there are empty trailing columns"""
-    if verbose:
-        start = time()
-        logging.info("Detecting trailing columns")
-    file.seek(0)
-    return_int = float("Inf")
-    for i in range(10):
-        line = file.readline()
-        return_int = min(
-            return_int,
-            len(line.replace("\n", ""))
-            - len(line.replace("\n", "").strip(sep))
-            - heading_columns,
-        )
-        if return_int == 0:
-            if verbose:
-                display_logs_depending_process_time(
-                    f'No trailing column detected in {round(time() - start, 3)}s',
-                    time() - start,
-                )
-            return 0
-    if verbose:
-        display_logs_depending_process_time(
-            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
-            time() - start,
-        )
-    return return_int
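
A note on the separator logic deleted above (per the file list, it is replaced by csv_detective/detection/separator.py): detect_separator counts the candidate delimiters in the header line, keeps the most frequent one, then checks that the first rows all parse to the same field count. The snippet below is a minimal standalone sketch of that heuristic, not the packaged implementation; the function name guess_separator and the sample data are invented for illustration.

import csv
import io

def guess_separator(text: str, candidates: str = ";,|\t") -> str:
    """Pick the candidate delimiter most frequent in the header line,
    then check that the first rows parse to a consistent field count."""
    header = text.splitlines()[0]
    sep = max(candidates, key=header.count)
    rows = list(csv.reader(io.StringIO(text), delimiter=sep))[:10]
    if len({len(row) for row in rows}) > 1:
        raise ValueError("Inconsistent number of columns across the first rows.")
    return sep

sample = "code_commune;nom;population\n01001;L'Abergement-Clémenciat;776\n01002;L'Abergement-de-Varey;248\n"
print(guess_separator(sample))  # ";"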
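
Likewise, the two table-level heuristics removed above (detect_continuous_variable and detetect_categorical_variable) reduce to simple thresholds: at least 90% of values parsing to floats for a continuous column, and at most 25 distinct values or a distinct-value ratio of at most 5% for a categorical one. A rough standalone sketch of those checks, with toy data and helper names (looks_categorical, looks_continuous) invented for the example:

import pandas as pd
from ast import literal_eval

def looks_categorical(col: pd.Series, max_values: int = 25, max_ratio: float = 0.05) -> bool:
    # same rule as detect_categorical above: few distinct values, in absolute or relative terms
    return col.nunique() <= max_values or col.nunique() / len(col) <= max_ratio

def looks_continuous(col: pd.Series, threshold: float = 0.9) -> bool:
    # same idea as check_threshold/parses_to_integer above: share of values that evaluate to a float
    def parsed_type(value):
        try:
            return type(literal_eval(str(value).replace(",", ".")))
        except (ValueError, SyntaxError):
            return None
    return (col.map(parsed_type) == float).mean() >= threshold

df = pd.DataFrame({
    "sexe": ["F", "M", "F", "M", "F"],
    "taux": ["1,5", "2.3", "0,7", "3.1", "4.2"],
})
print(looks_categorical(df["sexe"]))  # True
print(looks_continuous(df["taux"]))   # True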
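
Finally, for files served without a usable extension, the removed detect_engine relied on MIME sniffing (python-magic) to pick a pandas reader engine; per the file list this logic presumably moves to csv_detective/detection/engine.py. A condensed sketch of the same lookup for a local file; the helper name pick_engine is illustrative, not the package API.

from typing import Optional

import magic  # python-magic

MIME_TO_ENGINE = {
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
    "application/vnd.ms-excel": "xlrd",
    "application/vnd.oasis.opendocument.spreadsheet": "odf",
    "application/zip": "openpyxl",  # xlsx-like files are zip archives underneath
}

def pick_engine(path: str) -> Optional[str]:
    """Return the pandas read_excel engine for a local file, or None to fall back to CSV parsing."""
    return MIME_TO_ENGINE.get(magic.from_file(path, mime=True))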