csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/parsing/text.py
ADDED
@@ -0,0 +1,56 @@
+from re import finditer
+
+
+def camel_case_split(identifier: str):
+    matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
+    return " ".join([m.group(0) for m in matches])
+
+
+translate_dict = {
+    " ": ["-", "_", "'", ",", " "],
+    "a": ["à", "â"],
+    "c": ["ç"],
+    "e": ["é", "è", "ê", "é"],
+    "i": ["î", "ï"],
+    "o": ["ô", "ö"],
+    "u": ["ù", "û", "ü"],
+}
+
+
+# Process text
+def _process_text(val: str):
+    """Process character strings to standardize them.
+    Several alternatives were tested: .translate, unidecode.unidecode,
+    hybrid methods, but none proved more performant."""
+    val = camel_case_split(val)
+    val = val.lower()
+    for target in translate_dict:
+        for source in translate_dict[target]:
+            val = val.replace(source, target)
+    val = val.strip()
+    return val
+
+
+def is_word_in_string(word: str, string: str):
+    # if the substring is too short, the test can become irrelevant
+    return len(word) > 2 and word in string
+
+
+def header_score(header: str, words_combinations_list: list[str]) -> float:
+    """Returns:
+    - 1 if the header is exactly in the specified list
+    - 0.5 if any of the words is within the header
+    - 0 otherwise"""
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(
+        any(words_combination == processed_header for words_combination in words_combinations_list)
+    )
+    words_combination_in_header = 0.5 * (
+        any(
+            is_word_in_string(words_combination, processed_header)
+            for words_combination in words_combinations_list
+        )
+    )
+
+    return max(header_matches_words_combination, words_combination_in_header)
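For orientation, here is how the new header-scoring helpers behave on concrete inputs. This is a minimal sketch, assuming the hunk above lands at `csv_detective.parsing.text` as the file list suggests; the sample headers and label list are invented:

```python
from csv_detective.parsing.text import _process_text, header_score

# camel-case splitting, lowercasing and accent/separator folding
# all happen before any comparison
assert _process_text("CodePostal") == "code postal"

# exact match after normalization -> 1.0
print(header_score("Code_Postal", ["code postal"]))          # 1.0
# known label merely contained in the header -> 0.5
print(header_score("ancien code postal", ["code postal"]))   # 0.5
# no overlap at all -> 0.0
print(header_score("commune", ["code postal"]))              # 0.0
```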
csv_detective/utils.py
CHANGED
@@ -1,209 +1,36 @@
-import pandas as pd
 import logging
-
+
+import pandas as pd
 
 logging.basicConfig(level=logging.INFO)
+logging.addLevelName(
+    logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL)
+)
+logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))
 
+THRESHOLD_WARN = 1
+THRESHOLD_CRITICAL = 3
 
-def display_logs_depending_process_time(prompt: str, duration: float):
-    '''
-    Print colored logs according to the time the operation took.
-    '''
-    logging.addLevelName(logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL))
-    logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))
 
-
-    threshold_critical = 3
-
-    if duration < threshold_warn:
-        logging.info(prompt)
-    elif duration < threshold_critical:
-        logging.warn(prompt)
-    else:
-        logging.critical(prompt)
-
-
-def test_col_val(
-    serie, test_func, proportion=0.9, skipna=True, output_mode="ALL", verbose=False
-):
-    """Tests values of the serie using test_func.
-    - skipna : if True indicates that NaNs are not counted as False
-    - proportion : indicates the proportion of values that have to pass the test
-    for the serie to be detected as a certain format
+def display_logs_depending_process_time(prompt: str, duration: float) -> None:
     """
-
-    start = time()
-
-    # TODO : change for a cleaner method and only test columns in modules labels
-    def apply_test_func(serie, test_func, _range):
-        return serie.sample(n=_range).apply(test_func)
-    try:
-        if skipna:
-            serie = serie[serie.notnull()]
-        ser_len = len(serie)
-        if ser_len == 0:
-            return 0.0
-        if output_mode == "ALL":
-            result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
-            return result if result >= proportion else 0.0
-        else:
-            if proportion == 1:  # Then try first 1 value, then 5, then all
-                for _range in [
-                    min(1, ser_len),
-                    min(5, ser_len),
-                    ser_len,
-                ]:  # To avoid unnecessary operations, we start with 1,
-                    # then 5 values, then the whole series
-                    if all(apply_test_func(serie, test_func, _range)):
-                        # print(serie.name, ': check OK')
-                        pass
-                    else:
-                        return 0.0
-                return 1.0
-            else:
-                # if we have a proportion, statistically it's OK to analyse up to 10k rows
-                # (arbitrary number) and get a significant result
-                to_analyse = min(ser_len, 10000)
-                result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
-                return result if result >= proportion else 0.0
-    finally:
-        if verbose and time() - start > 3:
-            display_logs_depending_process_time(
-                f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
-                time() - start
-            )
-
-
-def test_col_label(label, test_func, proportion=1, output_mode="ALL"):
-    """Tests label (from header) using test_func.
-    - proportion : indicates the minimum score to pass the test for the serie
-    to be detected as a certain format
+    Print colored logs according to the time the operation took.
     """
-    if
-
+    if duration < THRESHOLD_WARN:
+        logging.info(prompt)
+    elif duration < THRESHOLD_CRITICAL:
+        logging.warning(prompt)
     else:
-
-        return result if result >= proportion else 0
-
-
-def test_col(table, all_tests, output_mode, verbose: bool = False):
-    # Initialising dict for tests
-    if verbose:
-        start = time()
-        logging.info("Testing columns to get types")
-    test_funcs = dict()
-    for test in all_tests:
-        name = test.__name__.split(".")[-1]
-        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
-    return_table = pd.DataFrame(columns=table.columns)
-    for idx, (key, value) in enumerate(test_funcs.items()):
-        if verbose:
-            start_type = time()
-            logging.info(f"\t- Starting with type '{key}'")
-        # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
-        # => the following needs to change, "apply" means all columns are tested for one type at once
-        return_table.loc[key] = table.apply(
-            lambda serie: test_col_val(
-                serie,
-                value["func"],
-                value["prop"],
-                output_mode=output_mode,
-                verbose=verbose,
-            )
-        )
-        if verbose:
-            display_logs_depending_process_time(
-                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
-            )
-    if verbose:
-        display_logs_depending_process_time(f"Done testing columns in {round(time() - start, 3)}s", time() - start)
-    return return_table
-
-
-def test_label(table, all_tests, output_mode, verbose: bool = False):
-    # Initialising dict for tests
-    if verbose:
-        start = time()
-        logging.info("Testing labels to get types")
-    test_funcs = dict()
-    for test in all_tests:
-        name = test.__name__.split(".")[-1]
-        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
-
-    return_table = pd.DataFrame(columns=table.columns)
-    for idx, (key, value) in enumerate(test_funcs.items()):
-        if verbose:
-            start_type = time()
-        return_table.loc[key] = [
-            test_col_label(
-                col_name, value["func"], value["prop"], output_mode=output_mode
-            )
-            for col_name in table.columns
-        ]
-        if verbose:
-            display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
-            )
-    if verbose:
-        display_logs_depending_process_time(f"Done testing labels in {round(time() - start, 3)}s", time() - start)
-    return return_table
-
-
-def prepare_output_dict(return_table, output_mode):
-    return_dict_cols = return_table.to_dict("dict")
-    return_dict_cols_intermediary = {}
-    for column_name in return_dict_cols:
-        return_dict_cols_intermediary[column_name] = []
-        for detected_value_type in return_dict_cols[column_name]:
-            if return_dict_cols[column_name][detected_value_type] == 0:
-                continue
-            dict_tmp = {}
-            dict_tmp["format"] = detected_value_type
-            dict_tmp["score"] = return_dict_cols[column_name][detected_value_type]
-            return_dict_cols_intermediary[column_name].append(dict_tmp)
-
-        # Clean dict using priorities
-        formats_detected = {
-            x["format"] for x in return_dict_cols_intermediary[column_name]
-        }
-        formats_to_remove = set()
-        # Deprioritise float and int detection vs others
-        if len(formats_detected - {"float", "int"}) > 0:
-            formats_to_remove = formats_to_remove.union({"float", "int"})
-        if "int" in formats_detected:
-            formats_to_remove.add("float")
-        if "latitude_wgs_fr_metropole" in formats_detected:
-            formats_to_remove.add("latitude_l93")
-            formats_to_remove.add("latitude_wgs")
-        if "longitude_wgs_fr_metropole" in formats_detected:
-            formats_to_remove.add("longitude_l93")
-            formats_to_remove.add("longitude_wgs")
-        if "longitude_wgs" in formats_detected:
-            formats_to_remove.add("longitude_l93")
-        if "code_region" in formats_detected:
-            formats_to_remove.add("code_departement")
-
-        formats_to_keep = formats_detected - formats_to_remove
+        logging.critical(prompt)
 
-        detections = return_dict_cols_intermediary[column_name]
-        detections = [x for x in detections if x["format"] in formats_to_keep]
-        if output_mode == "ALL":
-            return_dict_cols_intermediary[column_name] = detections
-        if output_mode == "LIMITED":
-            return_dict_cols_intermediary[column_name] = (
-                max(detections, key=lambda x: x["score"])
-                if len(detections) > 0
-                else {"format": "string", "score": 1.0}
-            )
 
-
+def is_url(file_path: str) -> bool:
+    # could be more sophisticated if needed
+    # using the URL detection test was considered but too broad (schema required to use requests)
+    return file_path.startswith("http")
 
 
-def
-
-    ("
-
-        or (string.endswith(" " + word))
-    )
+def cast_prevent_nan(value: float, _type: str) -> float | int | None:
+    if _type not in {"int", "float"}:
+        raise ValueError(f"Invalid type was passed: {_type}")
+    return None if pd.isna(value) else eval(_type)(value)
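The two helpers that survive in `utils.py` are small but used elsewhere in the package; a short sketch of their behaviour (values are illustrative only):

```python
from csv_detective.utils import cast_prevent_nan, is_url

# NaN becomes None instead of a float("nan") that would leak into output
print(cast_prevent_nan(3.0, "int"))           # 3
print(cast_prevent_nan(float("nan"), "int"))  # None
# any type other than "int"/"float" is rejected before the eval()-based cast
# cast_prevent_nan(3.0, "str")  # -> ValueError

# deliberately crude URL check, per the comment in the diff
print(is_url("https://example.com/data.csv"))  # True
print(is_url("./data/local.csv"))              # False
```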
csv_detective/validate.py
ADDED
@@ -0,0 +1,138 @@
+import logging
+
+import pandas as pd
+
+from csv_detective.format import FormatsManager
+from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
+
+VALIDATION_CHUNK_SIZE = int(1e5)
+logging.basicConfig(level=logging.INFO)
+
+formats = FormatsManager().formats
+
+
+def validate(
+    file_path: str,
+    previous_analysis: dict,
+    verbose: bool = False,
+    skipna: bool = True,
+) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
+    """
+    Verify that the given file has the same fields and types as in the given analysis.
+
+    Args:
+        file_path: the path of the file to validate
+        previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+        verbose: whether the code displays the steps it's going through
+        skipna: whether to ignore NaN values in the checks
+    """
+    try:
+        if previous_analysis.get("separator"):
+            # loading the table in chunks
+            chunks = pd.read_csv(
+                file_path,
+                dtype=str,
+                sep=previous_analysis["separator"],
+                encoding=previous_analysis["encoding"],
+                skiprows=previous_analysis["header_row_idx"],
+                compression=previous_analysis.get("compression"),
+                chunksize=VALIDATION_CHUNK_SIZE,
+            )
+            analysis = {
+                k: v
+                for k, v in previous_analysis.items()
+                if k
+                in ["encoding", "separator", "compression", "heading_columns", "trailing_columns"]
+                and v is not None
+            }
+        else:
+            # or chunks-like if not chunkable
+            chunks = iter(
+                [
+                    pd.read_excel(
+                        file_path,
+                        dtype=str,
+                        engine=previous_analysis["engine"],
+                        sheet_name=previous_analysis["sheet_name"],
+                    )
+                ]
+            )
+            analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+        first_chunk = next(chunks)
+        analysis.update(
+            {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
+        )
+    except Exception as e:
+        if verbose:
+            logging.warning(f"> Could not load the file with previous analysis values: {e}")
+        return False, None, None, None
+    if verbose:
+        logging.info("Comparing table with the previous analysis")
+        logging.info("- Checking if all columns match")
+    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+        list(first_chunk.columns)[k] != previous_analysis["header"][k]
+        for k in range(len(previous_analysis["header"]))
+    ):
+        if verbose:
+            logging.warning("> Columns do not match, proceeding with full analysis")
+        return False, None, None, None
+    if verbose:
+        logging.info(
+            f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
+        )
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = first_chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
+    analysis["total_lines"] = 0
+    for idx, chunk in enumerate([first_chunk, *chunks]):
+        if verbose:
+            logging.info(f"> Testing chunk number {idx}")
+        analysis["total_lines"] += len(chunk)
+        row_hashes_count = row_hashes_count.add(
+            chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
+        )
+        for col in chunk.columns:
+            col_values[col] = col_values[col].add(
+                chunk[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        for col_name, args in previous_analysis["columns"].items():
+            if verbose:
+                logging.info(f"- Testing {col_name} for {args['format']}")
+            if args["format"] == "string":
+                # no test for columns that have not been recognized as a specific format
+                continue
+            test_result: float = test_col_val(
+                serie=chunk[col_name],
+                format=formats[args["format"]],
+                skipna=skipna,
+            )
+            if not bool(test_result):
+                if verbose:
+                    logging.warning("> Test failed, proceeding with full analysis")
+                return False, first_chunk, analysis, None
+    if verbose:
+        logging.info("> All checks successful")
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
+    return (
+        True,
+        first_chunk,
+        analysis
+        | {
+            k: previous_analysis[k]
+            for k in [
+                "categorical",
+                "columns",
+                "columns_fields",
+                "columns_labels",
+                "formats",
+            ]
+        },
+        col_values,
+    )
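`validate` is essentially a fast path: given a file and the analysis dict from a previous run, it re-checks the header and the previously detected format of every column chunk by chunk, and reports failure so the caller can fall back to a full analysis. A usage sketch, assuming `previous_analysis` is the dict produced by an earlier csv_detective run (the file path is a placeholder):

```python
from csv_detective.validate import validate

ok, first_chunk, analysis, col_values = validate(
    "data.csv",
    previous_analysis,  # output of a previous analysis of this file
    verbose=True,
)
if ok:
    # structure and formats still match: the carried-over analysis can be
    # reused, topped up with freshly computed totals and duplicate counts
    print(analysis["total_lines"], analysis["nb_duplicates"])
else:
    # header or at least one column format drifted: run the full routine
    ...
```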