csv-detective 0.7.5.dev1069__py3-none-any.whl → 0.7.5.dev1113__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +22 -16
- csv_detective/detect_fields/other/float/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +21 -37
- csv_detective/detect_fields/temp/datetime/__init__.py +19 -0
- csv_detective/detect_labels/temp/date/__init__.py +3 -1
- csv_detective/detection.py +47 -53
- csv_detective/explore_csv.py +57 -47
- csv_detective/utils.py +53 -2
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/METADATA +2 -1
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/RECORD +20 -19
- tests/test_fields.py +19 -0
- tests/test_file.py +28 -0
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/top_level.txt +0 -0
csv_detective/detect_fields/other/booleen/__init__.py
CHANGED
@@ -1,21 +1,27 @@
 PROPORTION = 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
 }
 
+liste_bool = set(bool_mapping.keys())
 
-
-
+
+def bool_casting(val: str) -> bool:
+    return bool_mapping.get(val)
+
+
+def _is(val: str) -> bool:
+    '''Détecte les booléens'''
     return isinstance(val, str) and val.lower() in liste_bool
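Note: the boolean detector now centralises its accepted values in bool_mapping. A minimal sketch of how the two helpers behave, assuming the module is imported exactly as in the diff (sample values are illustrative only):

    from csv_detective.detect_fields.other.booleen import _is, bool_casting

    # _is() lowercases its input, so detection is case-insensitive
    assert _is("Oui") and _is("FALSE") and _is("n")
    assert not _is("maybe") and not _is(1)

    # bool_casting() looks the value up as-is in bool_mapping
    assert bool_casting("oui") is True
    assert bool_casting("non") is False
    assert bool_casting("Oui") is None  # no lowercasing here, unmapped keys return None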
csv_detective/detect_fields/temp/date/__init__.py
CHANGED
@@ -1,46 +1,30 @@
-import
-from
-
-from
+from datetime import datetime
+from typing import Optional
+
+from dateparser import parse as date_parser
+from dateutil.parser import parse as dateutil_parser, ParserError
 
 PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
 
 
-def
-
-    # longest date string expected here is DD-septembre-YYYY, so 17 characters
-    if len(val) > 17:
-        return False
+def date_casting(val: str) -> Optional[datetime]:
+    """For performance reasons, we try first with dateutil and fallback on dateparser"""
     try:
-
-
-
-        return True
-    except (ParserError, ValueError, TypeError, OverflowError):
-        return False
-
-
-seps = r'[\s/\-\*_\|;.,]'
-# matches JJ-MM-AAAA with any of the listed separators
-pat = r'^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$'.replace('SEP', seps)
-# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
-tap = r'^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$'.replace('SEP', seps + '?')
-# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
-letters = (
-    r'^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr'
-    r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
-    r'mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP'
-    r'(\d{2}|\d{4})$'
-).replace('SEP', seps + '?')
+        return dateutil_parser(val)
+    except ParserError:
+        return date_parser(val)
 
 
 def _is(val):
-    '''Renvoie True si val peut être une date, False sinon
-
-
-
-
-
-
-    )
+    '''Renvoie True si val peut être une date, False sinon'''
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
+        return False
+    threshold = 0.3
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if not res or res.hour or res.minute or res.second:
+        return False
+    return True
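The regex-based date check is replaced by real parsing behind cheap early stops (length and digit ratio). A hedged sketch of the resulting behaviour, assuming dateutil and dateparser are installed as the new imports require (sample values are illustrative only):

    from csv_detective.detect_fields.temp.date import _is, date_casting

    assert _is("2024-09-23")                # parses and has no time component
    assert not _is("2024-09-23 17:32:07")   # carries a time part, left to the datetime detectors
    assert not _is("hello world!")          # rejected by the digit-ratio early stop
    # dateutil does not know French month names and raises ParserError, so dateparser takes over
    assert date_casting("23 septembre 2024") is not None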
csv_detective/detect_fields/temp/datetime/__init__.py
ADDED
@@ -0,0 +1,19 @@
+from typing import Any, Optional
+
+from csv_detective.detect_fields.temp.date import date_casting
+
+PROPORTION = 1
+
+
+def _is(val: Optional[Any]) -> bool:
+    '''Renvoie True si val peut être un datetime, False sinon'''
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
+        return False
+    threshold = 0.7
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if res and (res.hour or res.minute or res.second):
+        return True
+    return False
csv_detective/detection.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import TextIO, Optional
+from typing import TextIO, Optional, Union
+from collections import defaultdict
 import pandas as pd
 import math
 import csv
@@ -27,7 +28,7 @@ engine_to_file = {
 }
 
 
-def is_url(csv_file_path: str):
+def is_url(csv_file_path: str) -> bool:
     # could be more sophisticated if needed
     return csv_file_path.startswith('http')
 
@@ -35,17 +36,14 @@ def is_url(csv_file_path: str):
 def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
-    one that contains
-    a considerable amount of float values.
+    one that contains a considerable amount of float values.
     We removed the integers as we then end up with postal codes, insee codes, and all sort
     of codes and types.
     This is not optimal but it will do for now.
-    :param table:
-    :return:
     """
     # if we need this again in the future, could be first based on columns detected as int/float to cut time
 
-    def check_threshold(serie: pd.Series, continuous_th: float):
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
         count = serie.value_counts().to_dict()
         total_nb = len(serie)
         if float in count:
@@ -75,7 +73,7 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return res.index[res]
 
@@ -121,12 +119,12 @@ def detetect_categorical_variable(
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return res.index[res], res
 
 
-def detect_engine(csv_file_path: str, verbose=False):
+def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         start = time()
     mapping = {
@@ -145,12 +143,12 @@ def detect_engine(csv_file_path: str, verbose=False):
     if verbose:
         display_logs_depending_process_time(
             f'File has no extension, detected {engine_to_file.get(engine, "csv")}',
-            time() - start
+            time() - start,
         )
     return engine
 
 
-def detect_separator(file: TextIO, verbose: bool = False):
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
     """Detects csv separator"""
     # TODO: add a robust detection:
     # si on a un point virgule comme texte et \t comme séparateur, on renvoie
@@ -181,12 +179,12 @@ def detect_separator(file: TextIO, verbose: bool = False):
     if verbose:
         display_logs_depending_process_time(
             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return sep
 
 
-def detect_encoding(csv_file_path: str, verbose: bool = False):
+def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
     """
     Detects file encoding using faust-cchardet (forked from the original cchardet)
     """
@@ -205,7 +203,7 @@ def detect_encoding(csv_file_path: str, verbose: bool = False):
         message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
         display_logs_depending_process_time(
             message,
-            time() - start
+            time() - start,
         )
     return encoding_dict['encoding']
 
@@ -218,8 +216,7 @@ def parse_table(
     skiprows: int,
     random_state: int = 42,
     verbose : bool = False,
-):
-    # Takes care of some problems
+) -> tuple[pd.DataFrame, int, int]:
     if verbose:
         start = time()
         logging.info("Parsing table")
@@ -230,7 +227,6 @@ def parse_table(
 
     total_lines = None
     for encoding in [encoding, "ISO-8859-1", "utf-8"]:
-        # TODO : modification systematique
         if encoding is None:
             continue
 
@@ -251,17 +247,16 @@
             print("Trying encoding : {encoding}".format(encoding=encoding))
 
     if table is None:
-
-        return table, "NA", "NA"
+        raise ValueError("Could not load file")
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates
 
 
-def remove_empty_first_rows(table: pd.DataFrame):
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
@@ -274,7 +269,7 @@ def remove_empty_first_rows(table: pd.DataFrame):
     cols = table.iloc[idx - 1]
     table = table.iloc[idx:]
     table.columns = cols.to_list()
-    # +1 here because the
+    # +1 here because the headers should count as a row
     return table, idx
 
 
@@ -285,7 +280,7 @@ def parse_excel(
     sheet_name: Optional[str] = None,
     random_state: int = 42,
     verbose : bool = False,
-):
+) -> tuple[pd.DataFrame, int, int, str, str, int]:
     """"Excel-like parsing is really slow, could be a good improvement for future development"""
     if verbose:
         start = time()
@@ -309,7 +304,7 @@ def parse_excel(
         if verbose:
             display_logs_depending_process_time(
                 f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                time() - start
+                time() - start,
             )
         try:
             if engine == "openpyxl":
@@ -341,7 +336,7 @@ def parse_excel(
             if verbose:
                 display_logs_depending_process_time(
                     'Could not read file with classic xls reader, trying with ODS',
-                    time() - start
+                    time() - start,
                 )
             engine = "odf"
 
@@ -354,33 +349,33 @@ def parse_excel(
            if verbose:
                display_logs_depending_process_time(
                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                    time() - start
+                    time() - start,
                )
            tables = pd.read_excel(
                csv_file_path,
                engine="odf",
                sheet_name=None,
-                dtype="unicode"
+                dtype="unicode",
            )
            sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
            sheet_name = max(sizes, key=sizes.get)
            if verbose:
                display_logs_depending_process_time(
                    f'Going forwards with sheet "{sheet_name}"',
-                    time() - start
+                    time() - start,
                )
            table = tables[sheet_name]
        else:
            if verbose:
                display_logs_depending_process_time(
                    f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                    time() - start
+                    time() - start,
                )
            table = pd.read_excel(
                csv_file_path,
                engine="odf",
                sheet_name=sheet_name,
-                dtype="unicode"
+                dtype="unicode",
            )
            table, header_row_idx = remove_empty_first_rows(table)
            total_lines = len(table)
@@ -391,7 +386,7 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
 
@@ -400,18 +395,18 @@ def parse_excel(
        if no_sheet_specified:
            display_logs_depending_process_time(
                f'Going forwards with sheet "{sheet_name}"',
-                time() - start
+                time() - start,
            )
        else:
            display_logs_depending_process_time(
                f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                time() - start
+                time() - start,
            )
    table = pd.read_excel(
        csv_file_path,
        engine=engine,
        sheet_name=sheet_name,
-        dtype="unicode"
+        dtype="unicode",
    )
    table, header_row_idx = remove_empty_first_rows(table)
    total_lines = len(table)
@@ -422,12 +417,12 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
 
 
-def prevent_nan(value: float):
+def prevent_nan(value: float) -> Optional[float]:
     if math.isnan(value):
         return None
     return value
@@ -439,7 +434,7 @@ def create_profile(
     num_rows: int,
     limited_output: bool = True,
     verbose: bool = False,
-):
+) -> dict:
     if verbose:
         start = time()
         logging.info("Creating profile")
@@ -466,9 +461,8 @@ def create_profile(
        safe_table[c] = safe_table[c].apply(
            lambda s: float_casting(s) if isinstance(s, str) else s
        )
-    profile =
+    profile = defaultdict(dict)
    for c in safe_table.columns:
-        profile[c] = {}
        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
            float,
            int,
@@ -494,10 +488,10 @@ def create_profile(
            .to_dict(orient="records")
        tops = []
        for tb in tops_bruts:
-
-
-
-
+            tops.append({
+                "count": tb["count"],
+                "value": tb[c],
+            })
        profile[c].update(
            tops=tops,
            nb_distinct=safe_table[c].nunique(),
@@ -506,7 +500,7 @@ def create_profile(
     if verbose:
         display_logs_depending_process_time(
             f"Created profile in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return profile
 
@@ -540,7 +534,7 @@ def detect_extra_columns(file: TextIO, sep: str):
     return nb_useless_col, retour
 
 
-def detect_headers(file: TextIO, sep: str, verbose: bool = False):
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
     """Tests 10 first rows for possible header (header not in 1st line)"""
     if verbose:
         start = time()
@@ -559,7 +553,7 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False):
        if verbose:
            display_logs_depending_process_time(
                f'Detected headers in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return i, chaine
    if verbose:
@@ -567,7 +561,7 @@
     return 0, None
 
 
-def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
+def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False) -> int:
     """Tests first 10 lines to see if there are empty heading columns"""
     if verbose:
         start = time()
@@ -581,18 +575,18 @@ def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
        if verbose:
            display_logs_depending_process_time(
                f'No heading column detected in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
        )
    return return_int
 
 
-def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False):
+def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False) -> int:
     """Tests first 10 lines to see if there are empty trailing columns"""
     if verbose:
         start = time()
@@ -611,12 +605,12 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
        if verbose:
            display_logs_depending_process_time(
                f'No trailing column detected in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
        )
    return return_int
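Most of the detection.py changes above are mechanical (trailing commas, return-type hints); the behavioural ones are that parse_table now raises ValueError("Could not load file") instead of returning (None, "NA", "NA"), and that create_profile builds its output from a defaultdict. As a small sanity check, a sketch exercising the two helpers whose full bodies are visible in this diff (values are illustrative only):

    from csv_detective.detection import is_url, prevent_nan

    assert is_url("https://example.com/data.csv")   # simple startswith("http") check
    assert not is_url("data/local_file.csv")
    assert prevent_nan(float("nan")) is None         # NaN becomes None, safer for JSON output
    assert prevent_nan(0.5) == 0.5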
csv_detective/explore_csv.py
CHANGED
@@ -4,6 +4,7 @@ contenu possible des champs
 """
 
 from typing import Dict, List, Union
+from collections import defaultdict
 import json
 import numpy as np
 import os
@@ -18,7 +19,13 @@ import pandas as pd
 from csv_detective import detect_fields, detect_labels
 from csv_detective.s3_utils import download_from_minio, upload_to_minio
 from csv_detective.schema_generation import generate_table_schema
-from csv_detective.utils import
+from csv_detective.utils import (
+    cast_df,
+    display_logs_depending_process_time,
+    prepare_output_dict,
+    test_col,
+    test_label,
+)
 from .detection import (
     detect_engine,
     detect_separator,
@@ -39,7 +46,7 @@ from .detection import (
 logging.basicConfig(level=logging.INFO)
 
 
-def get_all_packages(detect_type):
+def get_all_packages(detect_type) -> list:
     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
     modules = []
     for dirpath, _, filenames in os.walk(root_dir):
@@ -58,7 +65,7 @@ def get_all_packages(detect_type):
 def return_all_tests(
     user_input_tests: Union[str, list],
     detect_type: str,
-):
+) -> list:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -110,6 +117,7 @@ def routine(
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
+    cast_json: bool = True,
     verbose: bool = False,
     sheet_name: Union[str, int] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
@@ -126,6 +134,7 @@ def routine(
     output_profile: whether or not to add the 'profile' field to the output
     output_schema: whether or not to add the 'schema' field to the output (tableschema)
     output_df: whether or not to return the loaded DataFrame along with the analysis report
+    cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
     verbose: whether or not to print process logs in console
     sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
     skipna: whether to keep NaN (empty cells) for tests
@@ -175,12 +184,10 @@ def routine(
        sep = detect_separator(str_file, verbose=verbose)
        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
        if header is None:
-
-            return return_dict
+            return {"error": True}
        elif isinstance(header, list):
            if any([x is None for x in header]):
-
-                return return_dict
+                return {"error": True}
        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
        table, total_lines, nb_duplicates = parse_table(
@@ -200,7 +207,7 @@ def routine(
     # )
 
     # Creating return dictionary
-
+    analysis = {
         "header_row_idx": header_row_idx,
         "header": header,
         "total_lines": total_lines,
@@ -212,12 +219,12 @@ def routine(
     }
     # this is only relevant for xls-like
     if is_xls_like:
-
-
+        analysis["engine"] = engine
+        analysis["sheet_name"] = sheet_name
     # this is only relevant for csv
     else:
-
-
+        analysis["encoding"] = encoding
+        analysis["separator"] = sep
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -229,25 +236,24 @@ def routine(
 
     # if no testing then return
     if not all_tests_fields and not all_tests_labels:
-        return
+        return analysis
 
     # Perform testing on fields
-
-
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-
-
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
-
+    scores_table = scores_table_fields * (
         1
-        +
-        index=
-        ).values
-        / 2
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
     )
 
     # To reduce false positives: ensure these formats are detected only if the label yields
@@ -263,12 +269,12 @@ def routine(
        "latitude_l93",
        "longitude_l93",
    ]
-
-
-
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
        0,
    )
-
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
 
    metier_to_python_type = {
        "booleen": "bool",
@@ -278,6 +284,8 @@ def routine(
        "json": "json",
        "json_geojson": "json",
        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
        "date": "date",
        "latitude": "float",
        "latitude_l93": "float",
@@ -291,7 +299,7 @@ def routine(
 
     if not limited_output:
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-
+            analysis[detection_method] = {
                 col_name: [
                     {
                         "python_type": metier_to_python_type.get(
@@ -301,32 +309,29 @@ def routine(
                    }
                    for detection in detections
                ]
-                for col_name, detections in
+                for col_name, detections in analysis[detection_method].items()
            }
    else:
        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-
+            analysis[detection_method] = {
                col_name: {
                    "python_type": metier_to_python_type.get(
                        detection["format"], "string"
                    ),
                    **detection,
                }
-                for col_name, detection in
+                for col_name, detection in analysis[detection_method].items()
            }
 
    # Add detection with formats as keys
-
-
-
-    }
-    for header, col_metadata in return_dict["columns"].items():
-        return_dict["formats"][col_metadata["format"]].append(header)
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
 
    if output_profile:
-
+        analysis["profile"] = create_profile(
            table=table,
-            dict_cols_fields=
+            dict_cols_fields=analysis["columns"],
            num_rows=num_rows,
            limited_output=limited_output,
            verbose=verbose,
@@ -343,11 +348,11 @@ def routine(
            output_path += "_sheet-" + str(sheet_name)
        output_path += ".json"
        with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(
+            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
 
    if output_schema:
-
-
+        analysis["schema"] = generate_table_schema(
+            analysis,
            save_file=False,
            verbose=verbose
        )
@@ -357,8 +362,13 @@ def routine(
            time() - start_routine
        )
    if output_df:
-        return
-
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    return analysis
 
 
 def routine_minio(
@@ -436,7 +446,7 @@ def routine_minio(
        minio_pwd=minio_pwd,
    )
 
-
+    analysis = routine(
        csv_file_path,
        num_rows,
        user_input_tests,
@@ -449,7 +459,7 @@ def routine_minio(
    # Write report JSON file.
    output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
    with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(
+        json.dump(analysis, fp, indent=4, separators=(",", ": "))
 
    upload_to_minio(
        netloc=output_minio_location["netloc"],
@@ -464,7 +474,7 @@ def routine_minio(
    os.remove(csv_file_path)
 
    generate_table_schema(
-
+        analysis,
        True,
        netloc=tableschema_minio_location["netloc"],
        bucket=tableschema_minio_location["bucket"],
@@ -473,4 +483,4 @@ def routine_minio(
        minio_pwd=minio_pwd,
    )
 
-    return
+    return analysis
csv_detective/utils.py
CHANGED
@@ -1,7 +1,13 @@
-from typing import Callable
+from typing import Callable, Optional, Union
+import json
 import pandas as pd
 import logging
 from time import time
+from datetime import date, datetime
+
+from csv_detective.detect_fields.other.booleen import bool_casting
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.detect_fields.temp.date import date_casting
 
 logging.basicConfig(level=logging.INFO)
 
@@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
 
 def full_word_strictly_inside_string(word: str, string: str):
     return (
-
+        word == string
+        or (" " + word + " " in string)
         or (string.startswith(word + " "))
         or (string.endswith(" " + word))
     )
+
+
+def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    if _type == "float":
+        return float_casting(value)
+    if _type == "bool":
+        return bool_casting(value)
+    if _type == "json":
+        # in hydra json are given to postgres as strings, conversion is done by postgres
+        return json.loads(value)
+    if _type == "date":
+        _date = date_casting(value)
+        return _date.date() if _date else None
+    if _type == "datetime":
+        return date_casting(value)
+    raise ValueError(f"Unknown type `{_type}`")
+
+
+def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+    if verbose:
+        start = time()
+    output_df = pd.DataFrame()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+            # no change if detected type is string
+            output_df[col_name] = df[col_name].copy()
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+        else:
+            output_df[col_name] = df[col_name].apply(
+                lambda col: cast(col, _type=detection["python_type"])
+            )
+        # to save RAM
+        del df[col_name]
+    if verbose:
+        display_logs_depending_process_time(
+            f'Casting columns completed in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return output_df
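A short sketch of the new casting helpers; the column metadata dict below is shaped like the "columns" section of the analysis output, and float_casting's exact behaviour is assumed from its name since its body is not shown in this diff:

    import pandas as pd
    from csv_detective.utils import cast, cast_df

    assert cast("1.9", "float") == 1.9
    assert cast("oui", "bool") is True
    assert cast("", "date") is None   # empty or non-string values fall back to None

    df = pd.DataFrame({"code": ["01", "02"], "value": ["1.5", ""]})
    columns = {
        "code": {"python_type": "string", "format": "string"},
        "value": {"python_type": "float", "format": "float"},
    }
    typed = cast_df(df, columns)
    assert typed["value"].iloc[0] == 1.5   # cast to float, empty cells become NaN
    assert typed["code"].iloc[0] == "01"   # string columns are left untouched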
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/CHANGELOG.md
RENAMED
@@ -4,6 +4,8 @@
 
 - New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
 - Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
+- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
+- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
 
 ## 0.7.4 (2024-11-15)
 
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: csv_detective
-Version: 0.7.5.dev1069
+Version: 0.7.5.dev1113
 Summary: Detect CSV column content
 Home-page: https://github.com/etalab/csv_detective
 Author: Etalab
@@ -15,6 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Description-Content-Type: text/markdown
 License-File: LICENSE.AGPL.txt
 Requires-Dist: boto3==1.34.0
+Requires-Dist: dateparser==1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas==2.2.0
 Requires-Dist: pytest==8.3.0
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/RECORD
RENAMED
@@ -1,13 +1,13 @@
 csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
 csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
 csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
-csv_detective/detection.py,sha256=
-csv_detective/explore_csv.py,sha256
+csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225
+csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
 csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
-csv_detective/utils.py,sha256=
-csv_detective/detect_fields/__init__.py,sha256=
+csv_detective/utils.py,sha256=yO9INaLh-QX-FFL2A153AlMqftE04wb0hpN6HJvsKGg,10581
+csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
@@ -55,9 +55,9 @@ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSq
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
 csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/other/booleen/__init__.py,sha256=
+csv_detective/detect_fields/other/booleen/__init__.py,sha256=1qIEI681iEaPVb9XxmH2ewxDdfmYhHe4-s3MZ6L1A9Q,489
 csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
-csv_detective/detect_fields/other/float/__init__.py,sha256=
+csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -65,7 +65,8 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
 csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
+csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
 csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
 csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
@@ -122,22 +123,22 @@ csv_detective/detect_labels/other/twitter/__init__.py,sha256=D8G4vGsFL9a99OJz-03
 csv_detective/detect_labels/other/url/__init__.py,sha256=vqUQvn5o6JZU8iRsSG3AYqggjlhzagozVYWwpuSReV8,1202
 csv_detective/detect_labels/other/uuid/__init__.py,sha256=OdMUxqvqMdGaY5nph7CbIF_Q0LSxljxE72kCMT4m-Zk,931
 csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_labels/temp/date/__init__.py,sha256
+csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6ks77DP1kw2XMBYSLrzXE,1322
 csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
 csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1113.data/data/share/csv_detective/CHANGELOG.md,sha256=S9f0BlHhNQhrJ8bbw7bThthn2AG-gP5n8eg4Eep05IA,7063
+csv_detective-0.7.5.dev1113.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1113.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
+tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
 tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1113.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1113.dist-info/METADATA,sha256=7kqAw_UnjMjoBSfLqk59j7OYdY9PB0bPC35p9QxXbFY,1178
+csv_detective-0.7.5.dev1113.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+csv_detective-0.7.5.dev1113.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1113.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1113.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -1,5 +1,7 @@
 import pandas as pd
 from numpy import random
+import pytest
+from datetime import date as _date, datetime as _datetime
 
 from csv_detective.detect_fields.FR.geo import (
     adresse,
@@ -46,6 +48,7 @@ from csv_detective.detection import (
     detetect_categorical_variable,
 )
 from csv_detective.explore_csv import return_all_tests
+from csv_detective.utils import cast
 
 
 def test_all_tests_return_bool():
@@ -504,3 +507,19 @@ def test_match_float():
 def test_not_match_float():
     for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
         assert not test_float._is(val)
+
+
+@pytest.mark.parametrize(
+    "args",
+    (
+        ("1.9", "float", float),
+        ("oui", "bool", bool),
+        ("[1, 2]", "json", list),
+        ('{"a": 1}', "json", dict),
+        ("2022-08-01", "date", _date),
+        ("2024-09-23 17:32:07", "datetime", _datetime),
+    ),
+)
+def test_cast(args):
+    value, detected_type, cast_type = args
+    assert isinstance(cast(value, detected_type), cast_type)
tests/test_file.py
CHANGED
@@ -232,3 +232,31 @@ def test_output_df():
     assert isinstance(output, dict)
     assert isinstance(df, pd.DataFrame)
     assert len(df) == 6
+    assert df["partly_empty"].dtype == pd.Int64Dtype()
+
+
+@pytest.mark.parametrize(
+    "cast_json",
+    (
+        (True, dict),
+        (False, str),
+    ),
+)
+def test_cast_json(mocked_responses, cast_json):
+    cast_json, expected_type = cast_json
+    expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
+    mocked_responses.get(
+        'http://example.com/test.csv',
+        body=expected_content,
+        status=200,
+    )
+    analysis, df = routine(
+        csv_file_path='http://example.com/test.csv',
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+        output_df=True,
+        cast_json=cast_json,
+    )
+    assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
+    assert isinstance(df["a_simple_dict"][0], expected_type)
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/LICENSE.AGPL.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/README.md
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/LICENSE.AGPL.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/entry_points.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/top_level.txt
RENAMED
File without changes