csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/explore_csv.py
CHANGED

@@ -1,413 +1,187 @@
-"""
-Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
-contenu possible des champs
-"""
-
-from typing import Dict, List, Literal, Union
-import json
-import numpy as np
-import os
-import tempfile
-from pkg_resources import resource_string
 import logging
 from time import time
 
-
-from csv_detective import detect_fields
-from csv_detective import detect_labels
-from csv_detective.s3_utils import download_from_minio, upload_to_minio
-from csv_detective.schema_generation import generate_table_schema
-from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
-from .detection import (
-    detect_separator,
-    detect_encoding,
-    detect_headers,
-    detect_heading_columns,
-    detect_trailing_columns,
-    parse_table,
-    create_profile,
-    detetect_categorical_variable,
-    # detect_continuous_variable,
-)
+import pandas as pd
 
+from csv_detective.detection.formats import detect_formats
+from csv_detective.output import generate_output
+from csv_detective.parsing.load import load_file
+from csv_detective.utils import display_logs_depending_process_time, is_url
+from csv_detective.validate import validate
 
 logging.basicConfig(level=logging.INFO)
 
-def return_all_tests(user_input_tests, detect_type="detect_fields"):
-    """
-    returns all tests that have a method _is and are listed in the user_input_tests
-    the function can select a sub_package from csv_detective
-    """
-    all_packages = resource_string(__name__, "all_packages.txt")
-    all_packages = all_packages.decode().split("\n")
-    all_packages.remove("")
-    all_packages.remove("csv_detective")
-    all_packages = [x.replace("csv_detective.", "") for x in all_packages]
-
-    if user_input_tests is None:
-        return []
-
-    if isinstance(user_input_tests, str):
-        assert user_input_tests[0] != "-"
-        if user_input_tests == "ALL":
-            tests_to_do = [detect_type]
-        else:
-            tests_to_do = [detect_type + "." + user_input_tests]
-        tests_to_not_do = []
-    elif isinstance(user_input_tests, list):
-        if "ALL" in user_input_tests:
-            tests_to_do = [detect_type]
-        else:
-            tests_to_do = [
-                detect_type + "." + x for x in user_input_tests if x[0] != "-"
-            ]
-            tests_to_not_do = [
-                detect_type + "." + x[1:] for x in user_input_tests if x[0] == "-"
-            ]
-
-    all_fields = [
-        x
-        for x in all_packages
-        if any([y == x[: len(y)] for y in tests_to_do])
-        and all([y != x[: len(y)] for y in tests_to_not_do])
-    ]
-    all_tests = [eval(field) for field in all_fields]
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]  # TODO : Fix this shit
-    return all_tests
-
 
 def routine(
-[… 1 line not recoverable from this rendering …]
+    file_path: str,
     num_rows: int = 500,
-[… 2 lines not recoverable from this rendering …]
-    save_results: bool = True,
-    encoding: str = None,
-    sep: str = None,
+    tags: list[str] | None = None,
+    limited_output: bool = True,
+    save_results: bool | str = True,
+    encoding: str | None = None,
+    sep: str | None = None,
+    skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
-[… 4 lines not recoverable from this rendering …]
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: str | int | None = None,
+) -> dict | tuple[dict, pd.DataFrame]:
+    """
+    Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
-[… 1 line not recoverable from this rendering …]
-        num_rows: number of rows to sample from the file for analysis ; -1 for analysis
-[… 3 lines not recoverable from this rendering …]
-            the most likely one for each column
-        save_results: whether or not to save the results in a json file
+        file_path: local path or URL to file
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
+        limited_output: whether or not to return all possible types or only the most likely one for each column
+        save_results: whether or not to save the results in a json file, or the path where to dump the output
        output_profile: whether or not to add the 'profile' field to the output
        output_schema: whether or not to add the 'schema' field to the output (tableschema)
-[… 1 line not recoverable from this rendering …]
+        output_df: whether or not to return the loaded DataFrame along with the analysis report
+        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+        verbose: whether or not to print process logs in console
+        sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
+        skipna: whether to keep NaN (empty cells) for tests
 
     Returns:
        dict: a dict with information about the csv and possible types for each column
    """
-    if verbose:
-        start_routine = time()
-    if csv_file_path is None:
-        raise ValueError("csv_file_path is required.")
-
-    if encoding is None:
-        binary_file = open(csv_file_path, mode="rb")
-        encoding = detect_encoding(binary_file, verbose=verbose)
-
-    with open(csv_file_path, "r", encoding=encoding) as str_file:
-        if sep is None:
-            sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-            return_dict = {"error": True}
-            return return_dict
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return_dict = {"error": True}
-                return return_dict
-        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-        table, total_lines, nb_duplicates = parse_table(
-            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-        )
-
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detetect_categorical_variable(table, verbose=verbose)
-        res_categorical = list(res_categorical)
-        # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-        # res_continuous = list(
-        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-        # )
-
-    # Creating return dictionary
-    return_dict = dict()
-    return_dict["encoding"] = encoding
-    return_dict["separator"] = sep
-    return_dict["header_row_idx"] = header_row_idx
-    return_dict["header"] = header
-    return_dict["total_lines"] = total_lines
-    return_dict["nb_duplicates"] = nb_duplicates
-
-    return_dict["heading_columns"] = heading_columns
-    return_dict["trailing_columns"] = trailing_columns
 
-[… 5 lines not recoverable from this rendering …]
-        user_input_tests, detect_type="detect_fields"
-    )  # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
-
-    # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
-        return return_dict
-
-    # Perform testing on fields
-    return_table_fields = test_col(table, all_tests_fields, output_mode, verbose=verbose)
-    return_dict_cols_fields = prepare_output_dict(return_table_fields, output_mode)
-    return_dict["columns_fields"] = return_dict_cols_fields
+    if not (
+        isinstance(save_results, bool)
+        or (isinstance(save_results, str) and save_results.endswith(".json"))
+    ):
+        raise ValueError("`save_results` must be a bool or a valid path to a json file.")
 
-[… 4 lines not recoverable from this rendering …]
+    if verbose:
+        start_routine = time()
+    if is_url(file_path):
+        logging.info("Path recognized as a URL")
 
-[… 7 lines not recoverable from this rendering …]
-        ).values
-        / 2
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
     )
 
-[… 8 lines not recoverable from this rendering …]
-        "latitude_wgs_fr_metropole",
-        "longitude_wgs_fr_metropole",
-        "latitude_l93",
-        "longitude_l93",
-    ]
-    return_table.loc[formats_with_mandatory_label, :] = np.where(
-        return_table_labels.loc[formats_with_mandatory_label, :],
-        return_table.loc[formats_with_mandatory_label, :],
-        0,
+    analysis, _col_values = detect_formats(
+        table=table,
+        analysis=analysis,
+        file_path=file_path,
+        tags=tags,
+        limited_output=limited_output,
+        skipna=skipna,
+        verbose=verbose,
     )
-    return_dict_cols = prepare_output_dict(return_table, output_mode)
-    return_dict["columns"] = return_dict_cols
 
-[… 15 lines not recoverable from this rendering …]
-        "longitude_wgs": "float",
-        "longitude_wgs_fr_metropole": "float",
-    }
-
-    if output_mode == "ALL":
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            return_dict[detection_method] = {
-                col_name: [
-                    {
-                        "python_type": metier_to_python_type.get(
-                            detection["format"], "string"
-                        ),
-                        **detection,
-                    }
-                    for detection in detections
-                ]
-                for col_name, detections in return_dict[detection_method].items()
-            }
-    if output_mode == "LIMITED":
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            return_dict[detection_method] = {
-                col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
-                    **detection,
-                }
-                for col_name, detection in return_dict[detection_method].items()
-            }
-
-    # Add detection with formats as keys
-    return_dict["formats"] = {
-        column_metadata["format"]: []
-        for column_metadata in return_dict["columns"].values()
-    }
-    for header, col_metadata in return_dict["columns"].items():
-        return_dict["formats"][col_metadata["format"]].append(header)
-
-    if output_profile:
-        return_dict["profile"] = create_profile(
-            table, return_dict["columns"],
-            sep,
-            encoding,
-            num_rows,
-            header_row_idx,
-            verbose=verbose
-        )
-
-    if save_results:
-        # Write your file as json
-        output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
-        with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-            json.dump(return_dict, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
-
-    if output_schema and output_mode != "ALL":
-        return_dict["schema"] = generate_table_schema(
-            return_dict,
-            save_file=False,
-            verbose=verbose
-        )
-    if verbose:
-        display_logs_depending_process_time(
-            f'Routine completed in {round(time() - start_routine, 3)}s',
-            time() - start_routine
+    try:
+        return generate_output(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            num_rows=num_rows,
+            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
+            cast_json=cast_json,
+            verbose=verbose,
+            sheet_name=sheet_name,
+            _col_values=_col_values,
        )
-
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
+            )
 
 
-def […]
-[… 2 lines not recoverable from this rendering …]
-    tableschema_minio_location: Dict[str, str],
-    minio_user: str,
-    minio_pwd: str,
+def validate_then_detect(
+    file_path: str,
+    previous_analysis: dict,
     num_rows: int = 500,
-[… 3 lines not recoverable from this rendering …]
+    tags: list[str] | None = None,
+    limited_output: bool = True,
+    save_results: bool | str = True,
+    skipna: bool = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
 ):
-    """
-[… 1 line not recoverable from this rendering …]
+    """
+    Performs a validation of the given file against the given analysis.
+    If the validation fails, performs a full analysis and return it.
+    Otherwise return the previous analysis (which is therefore still valid).
+    NB: if asked, the profile is recreated in both cases.
 
     Args:
-[… 8 lines not recoverable from this rendering …]
-            the […]
-[… 2 lines not recoverable from this rendering …]
-            the […]
-[… 1 line not recoverable from this rendering …]
-    Returns:
-        dict: a dict with information about the csv and possible types for each column
+        file_path: the path of the file to validate.
+        previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
+        limited_output: whether or not to return all possible types or only the most likely one for each column
+        save_results: whether or not to save the results in a json file, or the path where to dump the output
+        skipna: whether to ignore NaN values in the checks
+        output_profile: whether or not to add the 'profile' field to the output
+        output_schema: whether or not to add the 'schema' field to the output (tableschema)
+        output_df: whether or not to return the loaded DataFrame along with the analysis report
+        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+        verbose: whether the code displays the steps it's going through
     """
-[… 2 lines not recoverable from this rendering …]
-    (
-[… 7 lines not recoverable from this rendering …]
-            ]
-        ]
-    )
-    )
-        and (minio_user is None)
-        or (minio_pwd is None)
-    ):
-        raise ValueError("Minio credentials are required if using Minio")
-
-    for location_dict in [
-        csv_minio_location,
-        output_minio_location,
-        tableschema_minio_location,
-    ]:
-        if location_dict is not None:
-            if any(
-                [
-                    (location_key not in location_dict)
-                    or (location_dict[location_key] is None)
-                    for location_key in ["netloc", "bucket", "key"]
-                ]
-            ):
-                raise ValueError("Minio location dict must contain url, bucket and key")
-
-    csv_file_path = tempfile.NamedTemporaryFile(delete=False).name
-    download_from_minio(
-        netloc=csv_minio_location["netloc"],
-        bucket=csv_minio_location["bucket"],
-        key=csv_minio_location["key"],
-        filepath=csv_file_path,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    return_dict = routine(
-        csv_file_path,
-        num_rows,
-        user_input_tests,
-        output_mode="LIMITED",
-        save_results=True,
-        encoding=encoding,
-        sep=sep,
-    )
-
-    # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
-    with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(return_dict, fp, indent=4, separators=(",", ": "))
-
-    upload_to_minio(
-        netloc=output_minio_location["netloc"],
-        bucket=output_minio_location["bucket"],
-        key=output_minio_location["key"],
-        filepath=output_path_to_store_minio_file,
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
-    )
-
-    os.remove(output_path_to_store_minio_file)
-    os.remove(csv_file_path)
-
-    generate_table_schema(
-        return_dict,
-        True,
-        netloc=tableschema_minio_location["netloc"],
-        bucket=tableschema_minio_location["bucket"],
-        key=tableschema_minio_location["key"],
-        minio_user=minio_user,
-        minio_pwd=minio_pwd,
+    if verbose:
+        start_routine = time()
+    if is_url(file_path):
+        logging.info("Path recognized as a URL")
+
+    is_valid, table, analysis, col_values = validate(
+        file_path=file_path,
+        previous_analysis=previous_analysis,
+        verbose=verbose,
+        skipna=skipna,
     )
-
-
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
+        table, analysis = load_file(
+            file_path=file_path,
+            num_rows=num_rows,
+            verbose=verbose,
+        )
+    if not is_valid:
+        analysis, col_values = detect_formats(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            tags=tags,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
+    try:
+        return generate_output(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            num_rows=num_rows,
+            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
+            cast_json=cast_json,
+            verbose=verbose,
+            sheet_name=analysis.get("sheet_name"),
+            _col_values=col_values,
+        )
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
+            )
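The new routine entry point is the main user-facing change here: the old csv_file_path/user_input_tests/output_mode parameters give way to file_path, tags and limited_output, and the function can now return the cast DataFrame as well. A minimal usage sketch based on the signature above; the file path is a hypothetical example and the exact report keys depend on generate_output:

    from csv_detective.explore_csv import routine

    # Analyse a 500-row sample; with output_df=True the report dict comes
    # back together with the DataFrame cast to the detected formats.
    analysis, df = routine(
        "data/my_file.csv",  # hypothetical path, may also be a URL
        num_rows=500,
        save_results=False,  # or a path ending in ".json" to dump the report
        output_df=True,
    )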
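validate_then_detect complements it for re-runs: it checks the file against a previously stored report via validate and only falls back to a full detect_formats pass when that check fails. A sketch under the same assumptions:

    from csv_detective.explore_csv import validate_then_detect

    # Reuse the report produced by routine() above; detection only
    # re-runs if the file no longer matches the previous analysis.
    analysis = validate_then_detect(
        "data/my_file.csv",  # hypothetical path
        previous_analysis=analysis,
        save_results=False,
    )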
csv_detective/format.py
ADDED

@@ -0,0 +1,67 @@
+from typing import Any, Callable
+
+from csv_detective.parsing.text import header_score
+
+
+class Format:
+    def __init__(
+        self,
+        name: str,
+        func: Callable[[Any], bool],
+        _test_values: dict[bool, list[str]],
+        labels: list[str] = [],
+        proportion: float = 1,
+        tags: list[str] = [],
+    ) -> None:
+        """
+        Instanciates a Format object.
+
+        Args:
+            name: the name of the format.
+            func: the value test for the format (returns whether a string is valid).
+            _test_values: lists of valid and invalid values, used in the tests
+            labels: the list of hint headers for the header score
+            proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
+            tags: to allow users to submit a file to only a subset of formats
+        """
+        self.name: str = name
+        self.func: Callable = func
+        self._test_values: dict[bool, list[str]] = _test_values
+        self.labels: list[str] = labels
+        self.proportion: float = proportion
+        self.tags: list[str] = tags
+
+    def is_valid_label(self, val: str) -> float:
+        return header_score(val, self.labels)
+
+
+class FormatsManager:
+    formats: dict[str, Format]
+
+    def __init__(self) -> None:
+        import csv_detective.formats as formats
+
+        format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))]
+        self.formats = {
+            label: Format(
+                name=label,
+                func=(module := getattr(formats, label))._is,
+                _test_values=module._test_values,
+                **{
+                    attr: val
+                    for attr in ["labels", "proportion", "tags"]
+                    if (val := getattr(module, attr, None))
+                },
+            )
+            for label in format_labels
+        }
+
+    def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]:
+        return {
+            label: fmt
+            for label, fmt in self.formats.items()
+            if all(tag in fmt.tags for tag in tags)
+        }
+
+    def available_tags(self) -> set[str]:
+        return set(tag for format in self.formats.values() for tag in format.tags)
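FormatsManager discovers every module in csv_detective.formats that exposes an _is callable and wraps it in a Format, picking up the optional labels, proportion and tags attributes when present. A hypothetical module showing that contract (the file name, regex and values are invented for illustration; the real formats are the modules listed at the top of this diff):

    # csv_detective/formats/annee_mois.py -- hypothetical, not part of the package
    import re

    PATTERN = re.compile(r"^\d{4}-(0[1-9]|1[0-2])$")  # e.g. "2024-06"

    labels = ["mois", "annee mois"]  # header hints scored via Format.is_valid_label
    proportion = 0.9                 # 90% of tested values must pass _is
    tags = ["temp"]                  # lets tags=["temp"] select this format

    def _is(val):
        # value test: True when the string looks like YYYY-MM
        return bool(PATTERN.match(val))

    _test_values = {
        True: ["2024-06", "1999-12"],    # values that must pass _is
        False: ["2024-6", "June 2024"],  # values that must fail _is
    }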
csv_detective/formats/__init__.py
ADDED

@@ -0,0 +1,9 @@
+import importlib
+import os
+
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        module_name = file[:-3]
+        module = importlib.import_module(f"csv_detective.formats.{module_name}")
+        globals()[module_name] = module
+        del module
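Because this __init__.py imports every sibling module into the csv_detective.formats namespace, instantiating FormatsManager is enough to enumerate what is installed. A short inspection sketch, assuming the package is installed; the "geo" tag is borrowed from the routine docstring's example, and actual tag values depend on the format modules:

    from csv_detective.format import FormatsManager

    manager = FormatsManager()  # collects every module exposing `_is`
    print(sorted(manager.available_tags()))

    # keep only the formats carrying all the requested tags
    geo_formats = manager.get_formats_from_tags(["geo"])
    print(sorted(geo_formats))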