csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/__init__.py
CHANGED
csv_detective/cli.py
CHANGED
|
@@ -4,41 +4,53 @@ Command line client for csv_detective
|
|
|
4
4
|
|
|
5
5
|
import argparse
|
|
6
6
|
import json
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
from csv_detective.explore_csv import routine
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def run():
    """Command-line entry point: analyse a tabular file and print the result.

    Parses the command-line options, forwards them to ``routine`` and prints
    the returned analysis on stdout as indented JSON, keeping non-ASCII
    characters as-is (``ensure_ascii=False``).
    """
    explorer = argparse.ArgumentParser(description="Analyse a tabular file")
    explorer.add_argument("file_path", type=str, help="Enter path of tabular file to explore")
    explorer.add_argument(
        "-n",
        "--num_rows",
        dest="num_rows",
        type=int,
        nargs="?",
        help="Number of rows to use for detection (default 500)",
    )
    explorer.add_argument(
        "-s",
        "--sep",
        dest="sep",
        type=str,
        nargs="?",
        help="Columns separator (detected if not specified)",
    )
    explorer.add_argument(
        "--save",
        dest="save_results",
        type=int,
        nargs="?",
        help="Whether to save the resulting analysis to json (1 = save, 0 = don't)",
    )
    explorer.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        type=int,
        nargs="?",
        help="Verbose (0 = quiet, 1 = details)",
    )

    opts = explorer.parse_args()

    # An omitted -n is parsed as None. Forward num_rows only when it was given,
    # so that ``routine`` applies its own default (advertised as 500 in the help
    # text) instead of receiving an explicit None.
    # NOTE(review): assumes ``routine`` declares a default for num_rows - confirm.
    optional_kwargs = {}
    if opts.num_rows is not None:
        optional_kwargs["num_rows"] = opts.num_rows

    inspection_results = routine(
        csv_file_path=opts.file_path,
        sep=opts.sep,
        save_results=bool(opts.save_results),
        verbose=bool(opts.verbose),
        **optional_kwargs,
    )

    print(json.dumps(inspection_results, indent=4, ensure_ascii=False))
|
|
File without changes
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from time import time
|
|
3
|
+
from typing import TextIO
|
|
4
|
+
|
|
5
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def detect_extra_columns(file: TextIO, sep: str):
    """Count the useless trailing columns shared by the first 10 lines of ``file``.

    A useless column shows up as a separator at the very end of a line. The
    result is the smallest number of such trailing separators found on the
    inspected lines.

    Args:
        file: text stream to inspect; it is rewound to its start first.
        sep: column separator (compared character by character).

    Returns:
        A tuple ``(nb_useless_col, retour)`` where ``nb_useless_col`` is the
        number of trailing separators common to the inspected lines (0 as soon
        as one line has none, or when the file is empty) and ``retour`` tells
        whether a line ending with a newline character was seen.
    """
    file.seek(0)
    retour = False
    nb_useless_col = None

    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file reached before 10 lines: nothing more to inspect.
            break

        # Drop the line break (if any) so that it is not mistaken for content.
        if line.endswith("\n"):
            retour = True
            line = line[:-1]

        # Count the separators at the very end of the line.
        k = 0
        for sign in reversed(line):
            if sign != sep:
                break
            k += 1
        if k == 0:
            return 0, retour
        nb_useless_col = k if nb_useless_col is None else min(k, nb_useless_col)

    return (nb_useless_col or 0), retour
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
    """Count the empty heading columns shared by the first 10 lines of ``file``.

    An empty heading column shows up as a separator at the very start of a
    line. The result is the smallest number of leading separators found on
    the inspected lines.

    Args:
        file: text stream to inspect; it is rewound to its start first.
        sep: column separator.
        verbose: when True, log the result and the time spent.

    Returns:
        The number of leading separators common to the inspected lines
        (0 for an empty file).
    """
    if verbose:
        start = time()
        logging.info("Detecting heading columns")
    file.seek(0)
    return_int = None
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: the empty string returned here is not a data line and
            # must not be counted as "a line with no heading column".
            break
        # lstrip (not strip) so that trailing separators on a final line without
        # a line break are not counted as heading columns.
        leading = len(line) - len(line.lstrip(sep))
        return_int = leading if return_int is None else min(return_int, leading)
        if return_int == 0:
            break
    return_int = return_int or 0
    if verbose:
        message = (
            "No heading column detected"
            if return_int == 0
            else f"{return_int} heading columns detected"
        )
        display_logs_depending_process_time(
            f"{message} in {round(time() - start, 3)}s",
            time() - start,
        )
    return return_int


def detect_trailing_columns(
    file: TextIO, sep: str, heading_columns: int, verbose: bool = False
) -> int:
    """Count the empty trailing columns shared by the first 10 lines of ``file``.

    For each line, the separators found at both ends are counted and
    ``heading_columns`` is subtracted; the result is the smallest value found
    on the inspected lines.

    Args:
        file: text stream to inspect; it is rewound to its start first.
        sep: column separator.
        heading_columns: number of empty heading columns, as returned by
            ``detect_heading_columns``.
        verbose: when True, log the result and the time spent.

    Returns:
        The number of trailing separators common to the inspected lines
        (0 for an empty file, never negative).
    """
    if verbose:
        start = time()
        logging.info("Detecting trailing columns")
    file.seek(0)
    return_int = None
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: stop instead of treating it as an empty data line.
            break
        content = line.replace("\n", "")
        blank_edges = len(content) - len(content.strip(sep))
        # Clamp at 0: a negative number of columns is meaningless.
        trailing = max(blank_edges - heading_columns, 0)
        return_int = trailing if return_int is None else min(return_int, trailing)
        if return_int == 0:
            break
    return_int = return_int or 0
    if verbose:
        message = (
            "No trailing column detected"
            if return_int == 0
            else f"{return_int} trailing columns detected"
        )
        display_logs_depending_process_time(
            f"{message} in {round(time() - start, 3)}s",
            time() - start,
        )
    return return_int
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from time import time
|
|
4
|
+
|
|
5
|
+
from cchardet import detect
|
|
6
|
+
|
|
7
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
    """
    Return the encoding of ``binary_file`` as guessed by faust-cchardet
    (forked from the original cchardet).

    The whole stream is read and handed to ``detect``. A ``ValueError`` is
    raised when no encoding could be guessed. With ``verbose`` the detected
    encoding, its confidence and the time spent are logged.
    """
    if verbose:
        start = time()
        logging.info("Detecting encoding")

    guess = detect(binary_file.read())
    encoding = guess["encoding"]
    if not encoding:
        raise ValueError(
            "Could not detect the file's encoding. Consider specifying it in the routine call."
        )

    if verbose:
        confidence_pct = round(guess["confidence"] * 100)
        message = (
            f'Detected encoding: "{encoding}"'
            f" in {round(time() - start, 3)}s (confidence: {confidence_pct}%)"
        )
        display_logs_depending_process_time(message, time() - start)
    return encoding
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from time import time
|
|
2
|
+
|
|
3
|
+
import magic
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from csv_detective.utils import display_logs_depending_process_time, is_url
|
|
7
|
+
|
|
8
|
+
COMPRESSION_ENGINES = ["gzip"]
EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
# Human-readable file kind for each engine, used in log messages.
engine_to_file = {
    "openpyxl": "Excel",
    "xlrd": "old Excel",
    "odf": "OpenOffice",
    "gzip": "csv.gz",
}


def detect_engine(file_path: str, verbose=False) -> str | None:
    """Guess which engine should read ``file_path`` from its MIME type.

    The MIME type is sniffed with ``magic``: from the first chunk (at most
    1024 bytes) of the response when ``file_path`` is a URL, from the file
    itself otherwise.

    Args:
        file_path: local path or URL of the file.
        verbose: when True, log the outcome and the time spent.

    Returns:
        The engine name mapped to the MIME type ("gzip", "openpyxl", "xlrd"
        or "odf"), or None when the MIME type is not in the mapping, in which
        case the file is processed as a csv.
    """
    if verbose:
        start = time()
    mapping = {
        "application/gzip": "gzip",
        "application/x-gzip": "gzip",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
        "application/vnd.ms-excel": "xlrd",
        "application/vnd.oasis.opendocument.spreadsheet": "odf",
        # all these files could be recognized as zip, may need to check all cases then
        "application/zip": "openpyxl",
    }
    # if none of the above, we move forwards with the csv process
    if is_url(file_path):
        # Only the first chunk is needed. The context manager releases the
        # streamed connection, which would otherwise stay open because the body
        # is never read to the end. An empty body yields b"" instead of raising
        # StopIteration.
        with requests.get(file_path, stream=True) as response:
            remote_content = next(response.iter_content(chunk_size=1024), b"")
        engine = mapping.get(magic.from_buffer(remote_content, mime=True))
    else:
        engine = mapping.get(magic.from_file(file_path, mime=True))
    if verbose:
        message = (
            f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
            if engine
            else "Processing the file as a csv"
        )
        display_logs_depending_process_time(
            message,
            time() - start,
        )
    return engine
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from csv_detective.detection.variables import (
|
|
7
|
+
detect_categorical_variable,
|
|
8
|
+
# detect_continuous_variable,
|
|
9
|
+
)
|
|
10
|
+
from csv_detective.format import Format, FormatsManager
|
|
11
|
+
from csv_detective.output.utils import prepare_output_dict
|
|
12
|
+
from csv_detective.parsing.columns import (
|
|
13
|
+
MAX_NUMBER_CATEGORICAL_VALUES,
|
|
14
|
+
test_col,
|
|
15
|
+
test_col_chunks,
|
|
16
|
+
test_label,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
fmtm = FormatsManager()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def detect_formats(
    table: pd.DataFrame,
    analysis: dict,
    file_path: str,
    tags: list[str] | None = None,
    limited_output: bool = True,
    skipna: bool = True,
    verbose: bool = False,
) -> tuple[dict, dict[str, pd.Series] | None]:
    """Score the columns of ``table`` against the candidate formats and fill ``analysis``.

    Values ("fields") and column names ("labels") are scored separately, then
    combined. The keys "columns_fields", "columns_labels", "columns" and
    "formats" are written to ``analysis``, plus "categorical" when the table is
    tested in one go.

    Args:
        table: the data to analyse.
        analysis: analysis built so far; must contain "header". When it has no
            "total_lines" value, the columns are tested in chunks.
        file_path: path of the analysed file, passed on to the chunked testing.
        tags: when given, only the formats selected by these tags are tested.
        limited_output: when True each column maps to a single detection,
            otherwise to a list of detections.
        skipna: passed on to the column tests.
        verbose: passed on to the underlying detections.

    Returns:
        ``(analysis, col_values)``; ``col_values`` is None unless the columns
        were tested in chunks, and ``(analysis, None)`` is returned untouched
        when there is no format to test.
    """
    in_chunks = analysis.get("total_lines") is None

    # list testing to be performed
    formats: dict[str, Format] = (
        fmtm.get_formats_from_tags(tags) if tags is not None else fmtm.formats
    )

    # if no testing then return
    if len(formats) == 0:
        return analysis, None

    # Perform testing on fields
    if not in_chunks:
        # table is small enough to be tested in one go
        scores_table_fields = test_col(
            table=table,
            formats=formats,
            limited_output=limited_output,
            skipna=skipna,
            verbose=verbose,
        )
        res_categorical, _ = detect_categorical_variable(
            table,
            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
            verbose=verbose,
        )
        analysis["categorical"] = res_categorical
        col_values = None
    else:
        scores_table_fields, analysis, col_values = test_col_chunks(
            table=table,
            file_path=file_path,
            analysis=analysis,
            formats=formats,
            limited_output=limited_output,
            skipna=skipna,
            verbose=verbose,
        )
    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)

    # Perform testing on labels
    scores_table_labels = test_label(analysis["header"], formats, limited_output, verbose=verbose)
    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)

    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
    # This is because the fields are more important than the labels and yields a max
    # of 1.5 for the final score.
    scores_table = scores_table_fields * (
        1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
    )

    # To reduce false positives: ensure these formats are detected only if the label yields
    # a detection (skipping the ones that have been excluded by the users).
    formats_with_mandatory_label = [
        f
        for f in [
            "code_departement",
            "code_commune_insee",
            "code_postal",
            "code_fantoir",
            "latitude_wgs",
            "longitude_wgs",
            "latitude_wgs_fr_metropole",
            "longitude_wgs_fr_metropole",
            "latitude_l93",
            "longitude_l93",
            "siren",
            "siret",
        ]
        if f in scores_table.index
    ]
    # Keep the combined score where the label score is non-zero, else force it to 0.
    scores_table.loc[formats_with_mandatory_label, :] = np.where(
        scores_table_labels.loc[formats_with_mandatory_label, :],
        scores_table.loc[formats_with_mandatory_label, :],
        0,
    )
    analysis["columns"] = prepare_output_dict(scores_table, limited_output)

    metier_to_python_type = {
        "booleen": "bool",
        "int": "int",
        "float": "float",
        "string": "string",
        "json": "json",
        "geojson": "json",
        "datetime_aware": "datetime",
        "datetime_naive": "datetime",
        "datetime_rfc822": "datetime",
        "date": "date",
        "latitude_l93": "float",
        "latitude_wgs": "float",
        "latitude_wgs_fr_metropole": "float",
        "longitude_l93": "float",
        "longitude_wgs": "float",
        "longitude_wgs_fr_metropole": "float",
        "binary": "binary",
    }

    def with_python_type(detection: dict) -> dict:
        # Formats missing from the mapping default to "string"; a "python_type"
        # already present in the detection takes precedence.
        return {
            "python_type": metier_to_python_type.get(detection["format"], "string"),
            **detection,
        }

    # Each column maps to one detection (limited output) or to a list of them.
    for detection_method in ["columns_fields", "columns_labels", "columns"]:
        analysis[detection_method] = {
            col_name: (
                with_python_type(detected)
                if limited_output
                else [with_python_type(detection) for detection in detected]
            )
            for col_name, detected in analysis[detection_method].items()
        }

    # Add detection with formats as keys. Without limited output a column holds
    # a list of detections: indexing it with "format" would raise a TypeError,
    # so the column is registered under each of its detected formats instead.
    analysis["formats"] = defaultdict(list)
    for header, col_metadata in analysis["columns"].items():
        detections = col_metadata if isinstance(col_metadata, list) else [col_metadata]
        for detection in detections:
            analysis["formats"][detection["format"]].append(header)

    return analysis, col_values
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from time import time
|
|
3
|
+
from typing import TextIO
|
|
4
|
+
|
|
5
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
|
|
9
|
+
"""Tests 10 first rows for possible header (in case header is not 1st row)"""
|
|
10
|
+
if verbose:
|
|
11
|
+
start = time()
|
|
12
|
+
logging.info("Detecting headers")
|
|
13
|
+
file.seek(0)
|
|
14
|
+
for i in range(10):
|
|
15
|
+
row = file.readline()
|
|
16
|
+
position = file.tell()
|
|
17
|
+
headers = [c for c in row.replace("\n", "").split(sep) if c]
|
|
18
|
+
if not any(col == "" for col in headers):
|
|
19
|
+
next_row = file.readline()
|
|
20
|
+
file.seek(position)
|
|
21
|
+
if row != next_row:
|
|
22
|
+
if verbose:
|
|
23
|
+
display_logs_depending_process_time(
|
|
24
|
+
f"Detected headers in {round(time() - start, 3)}s",
|
|
25
|
+
time() - start,
|
|
26
|
+
)
|
|
27
|
+
return i, headers
|
|
28
|
+
raise ValueError("Could not retrieve headers")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
    """Promote the first non-empty row to header when no column has a real name.

    Analog process to detect_headers for csv files: determines how many rows to
    skip to end up with the header at the right place.

    When at least one column name does not start with "Unnamed:", the table is
    returned untouched with 0. Otherwise the leading rows made only of missing
    values are dropped, the first remaining row becomes the column names, and
    the rows below it are returned.

    Returns:
        ``(table, idx)`` where ``idx`` is the index of the header row in the
        file: one more than its position in the dataframe, because the original
        header line counts as a row.
    """
    only_default_names = all(str(name).startswith("Unnamed:") for name in table.columns)
    if not only_default_names:
        return table, 0

    # Position (in the dataframe) of the first row holding at least one value.
    header_pos = 0
    while table.iloc[header_pos].isna().all():
        header_pos += 1

    header_values = table.iloc[header_pos].to_list()
    table = table.iloc[header_pos + 1 :]
    table.columns = header_values
    return table, header_pos + 1
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import logging
|
|
3
|
+
from time import time
|
|
4
|
+
from typing import TextIO
|
|
5
|
+
|
|
6
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def detect_separator(file: TextIO, verbose: bool = False) -> str:
    """Return the csv separator of ``file``.

    The separator is the candidate (";", ",", "|" or tab) occurring most often
    in the first line; on a tie the earliest candidate in that order wins. The
    file is then parsed with this separator and a ``ValueError`` is raised when
    the header and the rows that follow do not all have the same number of
    fields.
    """
    # TODO: add a robust detection:
    # if a semicolon appears in the text while \t is the separator, a semicolon
    # is returned for now
    if verbose:
        start = time()
        logging.info("Detecting separator")

    file.seek(0)
    first_line = file.readline()
    occurrences = {candidate: first_line.count(candidate) for candidate in [";", ",", "|", "\t"]}
    sep = max(occurrences, key=occurrences.get)

    # testing that the first 10 (arbitrary) rows all have the same number of fields
    # as the header. Prevents downstream unwanted behaviour where pandas can load
    # the file (in a weird way) but the process is irrelevant.
    file.seek(0)
    fields_per_row = set()
    for row_number, fields in enumerate(csv.reader(file, delimiter=sep)):
        if row_number >= 11:
            break
        fields_per_row.add(len(fields))
    if len(fields_per_row) > 1:
        raise ValueError(
            f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
        )

    if verbose:
        display_logs_depending_process_time(
            f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
            time() - start,
        )
    return sep
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from ast import literal_eval
|
|
3
|
+
from time import time
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def detect_continuous_variable(
    table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False
):
    """
    Return the labels of the columns of ``table`` holding continuous values.

    A column is considered continuous when the share of its values that parse
    as a Python float (a comma being read as the decimal point) reaches
    ``continuous_th``.
    Integers are left out on purpose: they would bring in postal codes, insee
    codes, and all sort of codes and types.
    This is not optimal but it will do for now.
    """
    # if we need this again in the future, could be first based on columns detected as int/float to cut time

    def literal_type(raw: str):
        # Type of the Python literal written in ``raw``, False when it is not one.
        try:
            return type(literal_eval(raw.replace(",", ".")))
        except Exception:
            return False

    def is_mostly_float(column: pd.Series) -> bool:
        parsed_types = column.apply(literal_type)
        nb_floats = parsed_types.value_counts().to_dict().get(float)
        if nb_floats is None:
            return False
        return bool(nb_floats / len(parsed_types) >= continuous_th)

    if verbose:
        start = time()
        logging.info("Detecting continuous columns")
    res = table.apply(is_mostly_float)
    if verbose:
        display_logs_depending_process_time(
            f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
            time() - start,
        )
    return res.index[res]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def detect_categorical_variable(
    table: pd.DataFrame,
    threshold_pct_categorical: float = 0.05,
    max_number_categorical_values: int = 25,
    verbose: bool = False,
) -> tuple[list[str], pd.DataFrame]:
    """
    Heuristically tell which columns of ``table`` are categorical, based on
    their number of distinct values.

    The point of detecting categorical values is to later learn models that
    predict them, so a column is categorical when it has at most
    ``max_number_categorical_values`` distinct values, or when its distinct
    values make up at most ``threshold_pct_categorical`` of its rows.
    Postal code, insee code, code region and so on, may be thus not considered
    categorical values.

    :param table: the data to analyse
    :param threshold_pct_categorical: highest accepted ratio of distinct values to rows
    :param max_number_categorical_values: highest accepted number of distinct values
    :param verbose: when True, log the result and the time spent
    :return: the labels of the categorical columns, and the per-column boolean
        result of ``table.apply``
    """

    def is_categorical(column_values: pd.Series) -> bool:
        nb_distinct = column_values.nunique()
        share_distinct = nb_distinct / len(column_values)
        few_values = nb_distinct <= max_number_categorical_values
        low_disparity = share_distinct <= threshold_pct_categorical
        return bool(few_values or low_disparity)

    if verbose:
        start = time()
        logging.info("Detecting categorical columns")
    res = table.apply(is_categorical)
    if verbose:
        display_logs_depending_process_time(
            f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
            time() - start,
        )
    return list(res.index[res]), res
|