csv-detective 0.7.5.dev1069__py3-none-any.whl → 0.7.5.dev1078__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection.py +47 -53
- csv_detective/explore_csv.py +41 -46
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1078.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/RECORD +11 -11
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1078.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1078.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/top_level.txt +0 -0
csv_detective/detection.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import TextIO, Optional
+from typing import TextIO, Optional, Union
+from collections import defaultdict
 import pandas as pd
 import math
 import csv
@@ -27,7 +28,7 @@ engine_to_file = {
 }


-def is_url(csv_file_path: str):
+def is_url(csv_file_path: str) -> bool:
     # could be more sophisticated if needed
     return csv_file_path.startswith('http')

@@ -35,17 +36,14 @@ def is_url(csv_file_path: str):
 def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
-    one that contains
-    a considerable amount of float values.
+    one that contains a considerable amount of float values.
     We removed the integers as we then end up with postal codes, insee codes, and all sort
     of codes and types.
     This is not optimal but it will do for now.
-    :param table:
-    :return:
     """
     # if we need this again in the future, could be first based on columns detected as int/float to cut time

-    def check_threshold(serie: pd.Series, continuous_th: float):
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
         count = serie.value_counts().to_dict()
         total_nb = len(serie)
         if float in count:
@@ -75,7 +73,7 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return res.index[res]

@@ -121,12 +119,12 @@ def detetect_categorical_variable(
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return res.index[res], res


-def detect_engine(csv_file_path: str, verbose=False):
+def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         start = time()
     mapping = {
@@ -145,12 +143,12 @@ def detect_engine(csv_file_path: str, verbose=False):
     if verbose:
         display_logs_depending_process_time(
             f'File has no extension, detected {engine_to_file.get(engine, "csv")}',
-            time() - start
+            time() - start,
         )
     return engine


-def detect_separator(file: TextIO, verbose: bool = False):
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
     """Detects csv separator"""
     # TODO: add a robust detection:
     # si on a un point virgule comme texte et \t comme séparateur, on renvoie
@@ -181,12 +179,12 @@ def detect_separator(file: TextIO, verbose: bool = False):
     if verbose:
         display_logs_depending_process_time(
             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return sep


-def detect_encoding(csv_file_path: str, verbose: bool = False):
+def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
     """
     Detects file encoding using faust-cchardet (forked from the original cchardet)
     """
@@ -205,7 +203,7 @@ def detect_encoding(csv_file_path: str, verbose: bool = False):
         message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
         display_logs_depending_process_time(
             message,
-            time() - start
+            time() - start,
         )
     return encoding_dict['encoding']

@@ -218,8 +216,7 @@ def parse_table(
     skiprows: int,
     random_state: int = 42,
     verbose : bool = False,
-):
-    # Takes care of some problems
+) -> tuple[pd.DataFrame, int, int]:
     if verbose:
         start = time()
         logging.info("Parsing table")
@@ -230,7 +227,6 @@ def parse_table(

     total_lines = None
     for encoding in [encoding, "ISO-8859-1", "utf-8"]:
-        # TODO : modification systematique
        if encoding is None:
            continue

@@ -251,17 +247,16 @@ def parse_table(
            print("Trying encoding : {encoding}".format(encoding=encoding))

     if table is None:
-
-        return table, "NA", "NA"
+        raise ValueError("Could not load file")
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates


-def remove_empty_first_rows(table: pd.DataFrame):
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
@@ -274,7 +269,7 @@ def remove_empty_first_rows(table: pd.DataFrame):
     cols = table.iloc[idx - 1]
     table = table.iloc[idx:]
     table.columns = cols.to_list()
-    # +1 here because the
+    # +1 here because the headers should count as a row
     return table, idx


@@ -285,7 +280,7 @@ def parse_excel(
     sheet_name: Optional[str] = None,
     random_state: int = 42,
     verbose : bool = False,
-):
+) -> tuple[pd.DataFrame, int, int, str, str, int]:
     """"Excel-like parsing is really slow, could be a good improvement for future development"""
     if verbose:
         start = time()
@@ -309,7 +304,7 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-            time() - start
+            time() - start,
         )
     try:
         if engine == "openpyxl":
@@ -341,7 +336,7 @@ def parse_excel(
         if verbose:
             display_logs_depending_process_time(
                 'Could not read file with classic xls reader, trying with ODS',
-                time() - start
+                time() - start,
             )
         engine = "odf"

@@ -354,33 +349,33 @@ def parse_excel(
         if verbose:
             display_logs_depending_process_time(
                 f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                time() - start
+                time() - start,
             )
         tables = pd.read_excel(
             csv_file_path,
             engine="odf",
             sheet_name=None,
-            dtype="unicode"
+            dtype="unicode",
         )
         sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
         sheet_name = max(sizes, key=sizes.get)
         if verbose:
             display_logs_depending_process_time(
                 f'Going forwards with sheet "{sheet_name}"',
-                time() - start
+                time() - start,
             )
         table = tables[sheet_name]
     else:
         if verbose:
             display_logs_depending_process_time(
                 f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                time() - start
+                time() - start,
             )
         table = pd.read_excel(
             csv_file_path,
             engine="odf",
             sheet_name=sheet_name,
-            dtype="unicode"
+            dtype="unicode",
         )
         table, header_row_idx = remove_empty_first_rows(table)
         total_lines = len(table)
@@ -391,7 +386,7 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx

@@ -400,18 +395,18 @@ def parse_excel(
         if no_sheet_specified:
             display_logs_depending_process_time(
                 f'Going forwards with sheet "{sheet_name}"',
-                time() - start
+                time() - start,
             )
         else:
             display_logs_depending_process_time(
                 f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                time() - start
+                time() - start,
             )
     table = pd.read_excel(
         csv_file_path,
         engine=engine,
         sheet_name=sheet_name,
-        dtype="unicode"
+        dtype="unicode",
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
@@ -422,12 +417,12 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx


-def prevent_nan(value: float):
+def prevent_nan(value: float) -> Optional[float]:
     if math.isnan(value):
         return None
     return value
@@ -439,7 +434,7 @@ def create_profile(
     num_rows: int,
     limited_output: bool = True,
     verbose: bool = False,
-):
+) -> dict:
     if verbose:
         start = time()
         logging.info("Creating profile")
@@ -466,9 +461,8 @@ def create_profile(
             safe_table[c] = safe_table[c].apply(
                 lambda s: float_casting(s) if isinstance(s, str) else s
             )
-    profile =
+    profile = defaultdict(dict)
     for c in safe_table.columns:
-        profile[c] = {}
         if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
             float,
             int,
@@ -494,10 +488,10 @@ def create_profile(
                 .to_dict(orient="records")
             tops = []
             for tb in tops_bruts:
-
-
-
-
+                tops.append({
+                    "count": tb["count"],
+                    "value": tb[c],
+                })
             profile[c].update(
                 tops=tops,
                 nb_distinct=safe_table[c].nunique(),
@@ -506,7 +500,7 @@ def create_profile(
     if verbose:
         display_logs_depending_process_time(
             f"Created profile in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return profile

@@ -540,7 +534,7 @@ def detect_extra_columns(file: TextIO, sep: str):
     return nb_useless_col, retour


-def detect_headers(file: TextIO, sep: str, verbose: bool = False):
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
     """Tests 10 first rows for possible header (header not in 1st line)"""
     if verbose:
         start = time()
@@ -559,7 +553,7 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False):
         if verbose:
             display_logs_depending_process_time(
                 f'Detected headers in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
             )
         return i, chaine
     if verbose:
@@ -567,7 +561,7 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False):
     return 0, None


-def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
+def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False) -> int:
     """Tests first 10 lines to see if there are empty heading columns"""
     if verbose:
         start = time()
@@ -581,18 +575,18 @@ def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
         if verbose:
             display_logs_depending_process_time(
                 f'No heading column detected in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
             )
         return 0
     if verbose:
         display_logs_depending_process_time(
             f'{return_int} heading columns detected in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return return_int


-def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False):
+def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False) -> int:
     """Tests first 10 lines to see if there are empty trailing columns"""
     if verbose:
         start = time()
@@ -611,12 +605,12 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
        if verbose:
            display_logs_depending_process_time(
                f'No trailing column detected in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
        )
    return return_int
csv_detective/explore_csv.py
CHANGED
@@ -4,6 +4,7 @@ contenu possible des champs
 """

 from typing import Dict, List, Union
+from collections import defaultdict
 import json
 import numpy as np
 import os
@@ -39,7 +40,7 @@ from .detection import (
 logging.basicConfig(level=logging.INFO)


-def get_all_packages(detect_type):
+def get_all_packages(detect_type) -> list:
     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
     modules = []
     for dirpath, _, filenames in os.walk(root_dir):
@@ -58,7 +59,7 @@ def get_all_packages(detect_type):
 def return_all_tests(
     user_input_tests: Union[str, list],
     detect_type: str,
-):
+) -> list:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -175,12 +176,10 @@ def routine(
     sep = detect_separator(str_file, verbose=verbose)
     header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
     if header is None:
-
-        return return_dict
+        return {"error": True}
     elif isinstance(header, list):
         if any([x is None for x in header]):
-
-            return return_dict
+            return {"error": True}
     heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
     trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
     table, total_lines, nb_duplicates = parse_table(
@@ -200,7 +199,7 @@ def routine(
     # )

     # Creating return dictionary
-
+    analysis = {
         "header_row_idx": header_row_idx,
         "header": header,
         "total_lines": total_lines,
@@ -212,12 +211,12 @@ def routine(
     }
     # this is only relevant for xls-like
     if is_xls_like:
-
-
+        analysis["engine"] = engine
+        analysis["sheet_name"] = sheet_name
     # this is only relevant for csv
     else:
-
-
+        analysis["encoding"] = encoding
+        analysis["separator"] = sep

     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -229,25 +228,24 @@ def routine(

     # if no testing then return
     if not all_tests_fields and not all_tests_labels:
-        return
+        return analysis

     # Perform testing on fields
-
-
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)

     # Perform testing on labels
-
-
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)

     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
-
+    scores_table = scores_table_fields * (
         1
-        +
-        index=
-        ).values
-        / 2
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
     )

     # To reduce false positives: ensure these formats are detected only if the label yields
@@ -263,12 +261,12 @@ def routine(
         "latitude_l93",
         "longitude_l93",
     ]
-
-
-
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
         0,
     )
-
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)

     metier_to_python_type = {
         "booleen": "bool",
@@ -291,7 +289,7 @@ def routine(

     if not limited_output:
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-
+            analysis[detection_method] = {
                 col_name: [
                     {
                         "python_type": metier_to_python_type.get(
@@ -301,32 +299,29 @@ def routine(
                     }
                     for detection in detections
                 ]
-                for col_name, detections in
+                for col_name, detections in analysis[detection_method].items()
             }
     else:
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-
+            analysis[detection_method] = {
                 col_name: {
                     "python_type": metier_to_python_type.get(
                         detection["format"], "string"
                     ),
                     **detection,
                 }
-                for col_name, detection in
+                for col_name, detection in analysis[detection_method].items()
            }

     # Add detection with formats as keys
-
-
-
-    }
-    for header, col_metadata in return_dict["columns"].items():
-        return_dict["formats"][col_metadata["format"]].append(header)
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)

     if output_profile:
-
+        analysis["profile"] = create_profile(
             table=table,
-            dict_cols_fields=
+            dict_cols_fields=analysis["columns"],
             num_rows=num_rows,
             limited_output=limited_output,
             verbose=verbose,
@@ -343,11 +338,11 @@ def routine(
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
         with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(
+            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

     if output_schema:
-
-
+        analysis["schema"] = generate_table_schema(
+            analysis,
             save_file=False,
             verbose=verbose
         )
@@ -357,8 +352,8 @@ def routine(
             time() - start_routine
         )
     if output_df:
-        return
-    return
+        return analysis, table
+    return analysis


 def routine_minio(
@@ -436,7 +431,7 @@ def routine_minio(
         minio_pwd=minio_pwd,
     )

-
+    analysis = routine(
         csv_file_path,
         num_rows,
         user_input_tests,
@@ -449,7 +444,7 @@ def routine_minio(
     # Write report JSON file.
     output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(
+        json.dump(analysis, fp, indent=4, separators=(",", ": "))

     upload_to_minio(
         netloc=output_minio_location["netloc"],
@@ -464,7 +459,7 @@ def routine_minio(
     os.remove(csv_file_path)

     generate_table_schema(
-
+        analysis,
         True,
         netloc=tableschema_minio_location["netloc"],
         bucket=tableschema_minio_location["bucket"],
@@ -473,4 +468,4 @@ def routine_minio(
         minio_pwd=minio_pwd,
     )

-    return
+    return analysis
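The scoring step in `routine` is easiest to read with concrete numbers: field scores are multiplied by `1 + label_score / 2`, so a perfect field and label match caps at 1.5. A sketch of the same `reindex`-based combination, with illustrative format names and made-up scores:

```python
import pandas as pd

# Rows are detected formats, columns are the table's columns.
scores_fields = pd.DataFrame({"col": [1.0, 0.8]}, index=["siren", "code_postal"])
scores_labels = pd.DataFrame({"col": [1.0]}, index=["siren"])  # no label score for code_postal

# reindex aligns label scores to the field-score index, filling gaps with 0.
scores = scores_fields * (
    1
    + scores_labels.reindex(index=scores_fields.index, fill_value=0).values / 2
)
print(scores)
# siren       -> 1.0 * (1 + 1.0 / 2) = 1.5
# code_postal -> 0.8 * (1 + 0.0 / 2) = 0.8
```

The other behavioral change worth noting: every early `return` in `routine` now hands back the `analysis` dict (or `(analysis, table)` when `output_df` is set) instead of returning nothing.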
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1078.data}/data/share/csv_detective/CHANGELOG.md
CHANGED

@@ -4,6 +4,7 @@

 - New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
 - Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
+- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)

 ## 0.7.4 (2024-11-15)

{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/RECORD
CHANGED

@@ -1,8 +1,8 @@
 csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
 csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
 csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
-csv_detective/detection.py,sha256=
-csv_detective/explore_csv.py,sha256=
+csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225
+csv_detective/explore_csv.py,sha256=i1m1JmnMSILlGnPhXlXsUbDVcgXaJ1E2nKE7_6D2xEE,16996
 csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
@@ -126,18 +126,18 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKU
 csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
 csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1078.data/data/share/csv_detective/CHANGELOG.md,sha256=5M95hTftsY9Ic2q_jexDNp-MgAFAXuPZyWGyFABi3l4,6927
+csv_detective-0.7.5.dev1078.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1078.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
 tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
 tests/test_file.py,sha256=oQITvAxdcrqDby2wWSh_X9TCwFqdFaP34XNy92ibXyg,6725
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
 tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1078.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1078.dist-info/METADATA,sha256=NSxmqCJpApiSavZ59QEMfRuzeB_pmOk2Wm_zTy-o2eQ,1145
+csv_detective-0.7.5.dev1078.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+csv_detective-0.7.5.dev1078.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1078.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1078.dist-info/RECORD,,
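The RECORD changes follow mechanically from the two edited modules: each entry is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with `=` padding stripped, per the wheel spec. A sketch for recomputing an entry (`record_entry` is a hypothetical helper, not part of the package):

```python
import base64
import hashlib

def record_entry(path: str) -> str:
    # RECORD digests: urlsafe base64 of the raw SHA-256, padding stripped.
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# record_entry("csv_detective/detection.py") should reproduce
# "csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225"
```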
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1078.data}/data/share/csv_detective/LICENSE.AGPL.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1078.data}/data/share/csv_detective/README.md
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/LICENSE.AGPL.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/entry_points.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1078.dist-info}/top_level.txt
RENAMED
File without changes