csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1228__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (86)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_fields/other/float/__init__.py +1 -1
  8. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  35. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  43. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  44. csv_detective/detect_labels/other/email/__init__.py +1 -1
  45. csv_detective/detect_labels/other/float/__init__.py +1 -1
  46. csv_detective/detect_labels/other/int/__init__.py +1 -1
  47. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  48. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  49. csv_detective/detect_labels/other/url/__init__.py +1 -1
  50. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  54. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  55. csv_detective/detection/columns.py +89 -0
  56. csv_detective/detection/encoding.py +27 -0
  57. csv_detective/detection/engine.py +46 -0
  58. csv_detective/detection/headers.py +32 -0
  59. csv_detective/detection/rows.py +18 -0
  60. csv_detective/detection/separator.py +44 -0
  61. csv_detective/detection/variables.py +98 -0
  62. csv_detective/explore_csv.py +40 -124
  63. csv_detective/output/dataframe.py +55 -0
  64. csv_detective/{create_example.py → output/example.py} +10 -9
  65. csv_detective/output/profile.py +87 -0
  66. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  67. csv_detective/output/utils.py +51 -0
  68. csv_detective/parsing/columns.py +141 -0
  69. csv_detective/parsing/compression.py +11 -0
  70. csv_detective/parsing/csv.py +55 -0
  71. csv_detective/parsing/excel.py +169 -0
  72. csv_detective/parsing/load.py +97 -0
  73. csv_detective/utils.py +10 -236
  74. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1228.data}/data/share/csv_detective/CHANGELOG.md +3 -0
  75. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/METADATA +1 -1
  76. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/RECORD +85 -71
  77. tests/test_fields.py +8 -7
  78. tests/test_file.py +15 -14
  79. csv_detective/detection.py +0 -633
  80. /csv_detective/{process_text.py → parsing/text.py} +0 -0
  81. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1228.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  82. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1228.data}/data/share/csv_detective/README.md +0 -0
  83. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/WHEEL +0 -0
  84. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/entry_points.txt +0 -0
  85. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  86. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1228.dist-info}/top_level.txt +0 -0
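The main structural change in this release is the removal of the monolithic csv_detective/detection.py (item 79, deleted in full below) in favour of dedicated detection/, parsing/ and output/ subpackages. As a rough orientation only, the sketch below shows what imports against the new layout might look like; the module paths come from the file list above, but which helper landed in which module is an assumption inferred from the module names, not something this diff states.

    # Hypothetical imports for csv-detective 0.7.5.dev1228 (Python).
    # Module paths are taken from the file list above; the placement of each
    # helper is assumed from the module names and is not confirmed by this diff.
    from csv_detective.detection.encoding import detect_encoding    # was in detection.py
    from csv_detective.detection.separator import detect_separator  # was in detection.py
    from csv_detective.parsing.excel import parse_excel             # was in detection.py
    from csv_detective.output.profile import create_profile         # was in detection.py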
csv_detective/detection.py (deleted)
@@ -1,633 +0,0 @@
- from typing import TextIO, Optional
- from collections import defaultdict
- import pandas as pd
- import math
- import csv
- from cchardet import detect
- from ast import literal_eval
- import gzip
- import logging
- from time import time
- import openpyxl
- import xlrd
- import requests
- from io import BytesIO
- import magic
- from csv_detective.utils import display_logs_depending_process_time
- from csv_detective.detect_fields.other.float import float_casting
-
- logging.basicConfig(level=logging.INFO)
-
- NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
- OLD_EXCEL_EXT = [".xls"]
- OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
- XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
- EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
- COMPRESSION_ENGINES = ["gzip"]
- engine_to_file = {
-     "openpyxl": "Excel",
-     "xlrd": "old Excel",
-     "odf": "OpenOffice",
-     "gzip": "csv.gz",
- }
-
-
- def is_url(csv_file_path: str) -> bool:
-     # could be more sophisticated if needed
-     return csv_file_path.startswith('http')
-
-
- def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
-     """
-     Detects whether a column contains continuous variables. We consider a continuous column
-     one that contains a considerable amount of float values.
-     We removed the integers as we then end up with postal codes, insee codes, and all sort
-     of codes and types.
-     This is not optimal but it will do for now.
-     """
-     # if we need this again in the future, could be first based on columns detected as int/float to cut time
-
-     def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
-         count = serie.value_counts().to_dict()
-         total_nb = len(serie)
-         if float in count:
-             nb_floats = count[float]
-         else:
-             return False
-         if nb_floats / total_nb >= continuous_th:
-             return True
-         else:
-             return False
-
-     def parses_to_integer(value: str):
-         try:
-             value = value.replace(",", ".")
-             value = literal_eval(value)
-             return type(value)
-         # flake8: noqa
-         except:
-             return False
-
-     if verbose:
-         start = time()
-         logging.info("Detecting continuous columns")
-     res = table.apply(
-         lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
-     )
-     if verbose:
-         display_logs_depending_process_time(
-             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
-             time() - start,
-         )
-     return res.index[res]
-
-
- def detetect_categorical_variable(
-     table: pd.DataFrame,
-     threshold_pct_categorical: float = 0.05,
-     max_number_categorical_values: int = 25,
-     verbose: bool = False,
- ):
-     """
-     Heuristically detects whether a table (df) contains categorical values according to
-     the number of unique values contained.
-     As the idea of detecting categorical values is to then try to learn models to predict
-     them, we limit categorical values to at most 25 different modes or at most 5% disparity.
-     Postal code, insee code, code region and so on, may be thus not considered categorical values.
-     :param table:
-     :param threshold_pct_categorical:
-     :param max_number_categorical_values:
-     :return:
-     """
-
-     def abs_number_different_values(column_values: pd.Series):
-         return column_values.nunique()
-
-     def rel_number_different_values(column_values: pd.Series):
-         return column_values.nunique() / len(column_values)
-
-     def detect_categorical(column_values: pd.Series):
-         abs_unique_values = abs_number_different_values(column_values)
-         rel_unique_values = rel_number_different_values(column_values)
-         if (
-             abs_unique_values <= max_number_categorical_values
-             or rel_unique_values <= threshold_pct_categorical
-         ):
-             return True
-         return False
-
-     if verbose:
-         start = time()
-         logging.info("Detecting categorical columns")
-     res = table.apply(lambda serie: detect_categorical(serie))
-     if verbose:
-         display_logs_depending_process_time(
-             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
-             time() - start,
-         )
-     return res.index[res], res
-
-
- def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
-     if verbose:
-         start = time()
-     mapping = {
-         "application/gzip": "gzip",
-         "application/x-gzip": "gzip",
-         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
-         'application/vnd.ms-excel': 'xlrd',
-         'application/vnd.oasis.opendocument.spreadsheet': 'odf',
-         # all these files could be recognized as zip, may need to check all cases then
-         'application/zip': 'openpyxl',
-     }
-     # if none of the above, we move forwards with the csv process
-     if is_url(csv_file_path):
-         remote_content = requests.get(csv_file_path).content
-         engine = mapping.get(magic.from_buffer(remote_content, mime=True))
-     else:
-         engine = mapping.get(magic.from_file(csv_file_path, mime=True))
-     if verbose:
-         message = (
-             f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
-             if engine else "Processing the file as a csv"
-         )
-         display_logs_depending_process_time(
-             message,
-             time() - start,
-         )
-     return engine
-
-
- def detect_separator(file: TextIO, verbose: bool = False) -> str:
-     """Detects csv separator"""
-     # TODO: add a robust detection:
-     # si on a un point virgule comme texte et \t comme séparateur, on renvoie
-     # pour l'instant un point virgule
-     if verbose:
-         start = time()
-         logging.info("Detecting separator")
-     file.seek(0)
-     header = file.readline()
-     possible_separators = [";", ",", "|", "\t"]
-     sep_count = dict()
-     for sep in possible_separators:
-         sep_count[sep] = header.count(sep)
-     sep = max(sep_count, key=sep_count.get)
-     # testing that the first 10 (arbitrary) rows all have the same number of fields
-     # as the header. Prevents downstream unwanted behaviour where pandas can load
-     # the file (in a weird way) but the process is irrelevant.
-     file.seek(0)
-     reader = csv.reader(file, delimiter=sep)
-     rows_lengths = set()
-     for idx, row in enumerate(reader):
-         if idx > 10:
-             break
-         rows_lengths.add(len(row))
-     if len(rows_lengths) > 1:
-         raise ValueError(
-             f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
-         )
-
-     if verbose:
-         display_logs_depending_process_time(
-             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return sep
-
-
- def unzip(binary_file: BytesIO, engine: str) -> BytesIO:
-     if engine == "gzip":
-         with gzip.open(binary_file, mode="rb") as binary_file:
-             file_content = binary_file.read()
-     else:
-         raise NotImplementedError(f"{engine} is not yet supported")
-     return BytesIO(file_content)
-
-
- def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
-     """
-     Detects file encoding using faust-cchardet (forked from the original cchardet)
-     """
-     if verbose:
-         start = time()
-         logging.info("Detecting encoding")
-     encoding_dict = detect(binary_file.read())
-     if not encoding_dict["encoding"]:
-         raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
-     if verbose:
-         message = f'Detected encoding: "{encoding_dict["encoding"]}"'
-         message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
-         display_logs_depending_process_time(
-             message,
-             time() - start,
-         )
-     return encoding_dict['encoding']
-
-
- def parse_table(
-     the_file: TextIO,
-     encoding: str,
-     sep: str,
-     num_rows: int,
-     skiprows: int,
-     random_state: int = 42,
-     verbose : bool = False,
- ) -> tuple[pd.DataFrame, int, int]:
-     if verbose:
-         start = time()
-         logging.info("Parsing table")
-     table = None
-
-     if not isinstance(the_file, str):
-         the_file.seek(0)
-
-     total_lines = None
-     for encoding in [encoding, "ISO-8859-1", "utf-8"]:
-         if encoding is None:
-             continue
-
-         if "ISO-8859" in encoding:
-             encoding = "ISO-8859-1"
-         try:
-             table = pd.read_csv(
-                 the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
-             )
-             total_lines = len(table)
-             nb_duplicates = len(table.loc[table.duplicated()])
-             if num_rows > 0:
-                 num_rows = min(num_rows - 1, total_lines)
-                 table = table.sample(num_rows, random_state=random_state)
-             # else : table is unchanged
-             break
-         except TypeError:
-             print("Trying encoding : {encoding}".format(encoding=encoding))
-
-     if table is None:
-         raise ValueError("Could not load file")
-     if verbose:
-         display_logs_depending_process_time(
-             f'Table parsed successfully in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return table, total_lines, nb_duplicates
-
-
- def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-     """Analog process to detect_headers for csv files, determines how many rows to skip
-     to end up with the header at the right place"""
-     idx = 0
-     if all([str(c).startswith('Unnamed:') for c in table.columns]):
-         # there is on offset between the index in the file (idx here)
-         # and the index in the dataframe, because of the header
-         idx = 1
-         while table.iloc[idx - 1].isna().all():
-             idx += 1
-         cols = table.iloc[idx - 1]
-         table = table.iloc[idx:]
-         table.columns = cols.to_list()
-     # +1 here because the headers should count as a row
-     return table, idx
-
-
- def parse_excel(
-     csv_file_path: str,
-     num_rows: int = -1,
-     engine: Optional[str] = None,
-     sheet_name: Optional[str] = None,
-     random_state: int = 42,
-     verbose : bool = False,
- ) -> tuple[pd.DataFrame, int, int, str, str, int]:
-     """"Excel-like parsing is really slow, could be a good improvement for future development"""
-     if verbose:
-         start = time()
-     no_sheet_specified = sheet_name is None
-
-     if (
-         engine in ['openpyxl', 'xlrd'] or
-         any([csv_file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
-     ):
-         remote_content = None
-         if is_url(csv_file_path):
-             r = requests.get(csv_file_path)
-             r.raise_for_status()
-             remote_content = BytesIO(r.content)
-         if not engine:
-             if any([csv_file_path.endswith(k) for k in NEW_EXCEL_EXT]):
-                 engine = "openpyxl"
-             else:
-                 engine = "xlrd"
-         if sheet_name is None:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                     time() - start,
-                 )
-             try:
-                 if engine == "openpyxl":
-                     # openpyxl doesn't want to open files that don't have a valid extension
-                     # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
-                     # if the file is remote, we have a remote content anyway so it's fine
-                     if not remote_content and '.' not in csv_file_path.split('/')[-1]:
-                         with open(csv_file_path, 'rb') as f:
-                             remote_content = BytesIO(f.read())
-                     # faster than loading all sheets
-                     wb = openpyxl.load_workbook(remote_content or csv_file_path, read_only=True)
-                     try:
-                         sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
-                     except TypeError:
-                         # sometimes read_only can't get the info, so we have to open the file for real
-                         # this takes more time but it's for a limited number of files
-                         # and it's this or nothing
-                         wb = openpyxl.load_workbook(remote_content or csv_file_path)
-                         sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
-                 else:
-                     if remote_content:
-                         wb = xlrd.open_workbook(file_contents=remote_content.read())
-                     else:
-                         wb = xlrd.open_workbook(csv_file_path)
-                     sizes = {s.name: s.nrows * s.ncols for s in wb.sheets()}
-                 sheet_name = max(sizes, key=sizes.get)
-             except xlrd.biffh.XLRDError:
-                 # sometimes a xls file is recognized as ods
-                 if verbose:
-                     display_logs_depending_process_time(
-                         'Could not read file with classic xls reader, trying with ODS',
-                         time() - start,
-                     )
-                 engine = "odf"
-
-     if engine == "odf" or any([csv_file_path.endswith(k) for k in OPEN_OFFICE_EXT]):
-         # for ODS files, no way to get sheets' sizes without
-         # loading the file one way or another (pandas or pure odfpy)
-         # so all in one
-         engine = "odf"
-         if sheet_name is None:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                     time() - start,
-                 )
-             tables = pd.read_excel(
-                 csv_file_path,
-                 engine="odf",
-                 sheet_name=None,
-                 dtype="unicode",
-             )
-             sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
-             sheet_name = max(sizes, key=sizes.get)
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'Going forwards with sheet "{sheet_name}"',
-                     time() - start,
-                 )
-             table = tables[sheet_name]
-         else:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                     time() - start,
-                 )
-             table = pd.read_excel(
-                 csv_file_path,
-                 engine="odf",
-                 sheet_name=sheet_name,
-                 dtype="unicode",
-             )
-         table, header_row_idx = remove_empty_first_rows(table)
-         total_lines = len(table)
-         nb_duplicates = len(table.loc[table.duplicated()])
-         if num_rows > 0:
-             num_rows = min(num_rows - 1, total_lines)
-             table = table.sample(num_rows, random_state=random_state)
-         if verbose:
-             display_logs_depending_process_time(
-                 f'Table parsed successfully in {round(time() - start, 3)}s',
-                 time() - start,
-             )
-         return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
-
-     # so here we end up with (old and new) excel files only
-     if verbose:
-         if no_sheet_specified:
-             display_logs_depending_process_time(
-                 f'Going forwards with sheet "{sheet_name}"',
-                 time() - start,
-             )
-         else:
-             display_logs_depending_process_time(
-                 f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                 time() - start,
-             )
-     table = pd.read_excel(
-         csv_file_path,
-         engine=engine,
-         sheet_name=sheet_name,
-         dtype="unicode",
-     )
-     table, header_row_idx = remove_empty_first_rows(table)
-     total_lines = len(table)
-     nb_duplicates = len(table.loc[table.duplicated()])
-     if num_rows > 0:
-         num_rows = min(num_rows - 1, total_lines)
-         table = table.sample(num_rows, random_state=random_state)
-     if verbose:
-         display_logs_depending_process_time(
-             f'Table parsed successfully in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
-
-
- def prevent_nan(value: float) -> Optional[float]:
-     if math.isnan(value):
-         return None
-     return value
-
-
- def create_profile(
-     table: pd.DataFrame,
-     dict_cols_fields: dict,
-     num_rows: int,
-     limited_output: bool = True,
-     verbose: bool = False,
- ) -> dict:
-     if verbose:
-         start = time()
-         logging.info("Creating profile")
-     map_python_types = {
-         "string": str,
-         "int": float,
-         "float": float,
-     }
-
-     if num_rows > 0:
-         raise ValueError("To create profiles num_rows has to be set to -1")
-     safe_table = table.copy()
-     if not limited_output:
-         dict_cols_fields = {
-             k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
-             for k, v in dict_cols_fields.items()
-         }
-     dtypes = {
-         k: map_python_types.get(v["python_type"], str)
-         for k, v in dict_cols_fields.items()
-     }
-     for c in safe_table.columns:
-         if dtypes[c] == float:
-             safe_table[c] = safe_table[c].apply(
-                 lambda s: float_casting(s) if isinstance(s, str) else s
-             )
-     profile = defaultdict(dict)
-     for c in safe_table.columns:
-         if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
-             float,
-             int,
-         ]:
-             profile[c].update(
-                 min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                     safe_table[c].min()
-                 )),
-                 max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                     safe_table[c].max()
-                 )),
-                 mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                     safe_table[c].mean()
-                 )),
-                 std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                     safe_table[c].std()
-                 )),
-             )
-         tops_bruts = safe_table[safe_table[c].notna()][c] \
-             .value_counts(dropna=True) \
-             .reset_index() \
-             .iloc[:10] \
-             .to_dict(orient="records")
-         tops = []
-         for tb in tops_bruts:
-             tops.append({
-                 "count": tb["count"],
-                 "value": tb[c],
-             })
-         profile[c].update(
-             tops=tops,
-             nb_distinct=safe_table[c].nunique(),
-             nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
-         )
-     if verbose:
-         display_logs_depending_process_time(
-             f"Created profile in {round(time() - start, 3)}s",
-             time() - start,
-         )
-     return profile
-
-
- def detect_extra_columns(file: TextIO, sep: str):
-     """regarde s'il y a des colonnes en trop
-     Attention, file ne doit pas avoir de ligne vide"""
-     file.seek(0)
-     retour = False
-     nb_useless_col = 99999
-
-     for i in range(10):
-         line = file.readline()
-         # regarde si on a un retour
-         if retour:
-             assert line[-1] == "\n"
-         if line[-1] == "\n":
-             retour = True
-
-         # regarde le nombre de derniere colonne inutile
-         deb = 0 + retour
-         line = line[::-1][deb:]
-         k = 0
-         for sign in line:
-             if sign != sep:
-                 break
-             k += 1
-         if k == 0:
-             return 0, retour
-         nb_useless_col = min(k, nb_useless_col)
-     return nb_useless_col, retour
-
-
- def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
-     """Tests 10 first rows for possible header (header not in 1st line)"""
-     if verbose:
-         start = time()
-         logging.info("Detecting headers")
-     file.seek(0)
-     for i in range(10):
-         header = file.readline()
-         position = file.tell()
-         chaine = [c for c in header.replace("\n", "").split(sep) if c]
-         if chaine[-1] not in ["", "\n"] and all(
-             [mot not in ["", "\n"] for mot in chaine[1:-1]]
-         ):
-             next_row = file.readline()
-             file.seek(position)
-             if header != next_row:
-                 if verbose:
-                     display_logs_depending_process_time(
-                         f'Detected headers in {round(time() - start, 3)}s',
-                         time() - start,
-                     )
-                 return i, chaine
-     if verbose:
-         logging.info(f'No header detected')
-     return 0, None
-
-
- def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False) -> int:
-     """Tests first 10 lines to see if there are empty heading columns"""
-     if verbose:
-         start = time()
-         logging.info("Detecting heading columns")
-     file.seek(0)
-     return_int = float("Inf")
-     for i in range(10):
-         line = file.readline()
-         return_int = min(return_int, len(line) - len(line.strip(sep)))
-         if return_int == 0:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'No heading column detected in {round(time() - start, 3)}s',
-                     time() - start,
-                 )
-             return 0
-     if verbose:
-         display_logs_depending_process_time(
-             f'{return_int} heading columns detected in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return return_int
-
-
- def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False) -> int:
-     """Tests first 10 lines to see if there are empty trailing columns"""
-     if verbose:
-         start = time()
-         logging.info("Detecting trailing columns")
-     file.seek(0)
-     return_int = float("Inf")
-     for i in range(10):
-         line = file.readline()
-         return_int = min(
-             return_int,
-             len(line.replace("\n", ""))
-             - len(line.replace("\n", "").strip(sep))
-             - heading_columns,
-         )
-         if return_int == 0:
-             if verbose:
-                 display_logs_depending_process_time(
-                     f'No trailing column detected in {round(time() - start, 3)}s',
-                     time() - start,
-                 )
-             return 0
-     if verbose:
-         display_logs_depending_process_time(
-             f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
-             time() - start,
-         )
-     return return_int
File without changes
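For reference, the separator heuristic removed with detection.py above (and, judging by the file list, re-homed under csv_detective/detection/separator.py) amounts to counting candidate delimiters in the header line and checking that the first rows agree on the column count. A self-contained sketch of that idea, written against the standard library only and independent of csv-detective's internal API:

    import csv
    import io

    def guess_separator(sample: str) -> str:
        # Keep the candidate delimiter that occurs most often in the header line,
        # mirroring the heuristic of the removed detect_separator() above.
        header = sample.splitlines()[0]
        candidates = [";", ",", "|", "\t"]
        counts = {sep: header.count(sep) for sep in candidates}
        sep = max(counts, key=counts.get)
        # Sanity check: the first rows should all have the same number of fields.
        rows = list(csv.reader(io.StringIO(sample), delimiter=sep))[:10]
        if len({len(row) for row in rows}) > 1:
            raise ValueError(f"Uneven column count with detected separator {sep!r}")
        return sep

    print(guess_separator("a;b;c\n1;2;3\n4;5;6\n"))  # prints ';'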