csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2319__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +12 -15
- csv_detective/explore_csv.py +28 -9
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
- csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
- csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
- csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
- csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
- csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
- csv_detective/formats/geojson.py +36 -0
- csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
- csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
- csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
- csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
- csv_detective/formats/sexe.py +17 -0
- csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +45 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +3 -4
- csv_detective/output/dataframe.py +3 -3
- csv_detective/output/profile.py +2 -3
- csv_detective/output/schema.py +2 -2
- csv_detective/parsing/columns.py +35 -50
- csv_detective/parsing/csv.py +2 -2
- csv_detective/parsing/load.py +4 -5
- csv_detective/validate.py +9 -4
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/METADATA +6 -5
- csv_detective-0.9.3.dev2319.dist-info/RECORD +102 -0
- tests/test_fields.py +39 -364
- tests/test_file.py +1 -1
- tests/test_labels.py +5 -3
- tests/test_structure.py +40 -36
- csv_detective/detect_fields/FR/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/__init__.py +0 -112
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/email/__init__.py +0 -10
- csv_detective/detect_fields/other/money/__init__.py +0 -11
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/percent/__init__.py +0 -9
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -14
- csv_detective/detect_fields/other/uuid/__init__.py +0 -10
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
- csv_detective/detect_labels/__init__.py +0 -94
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -8
- csv_detective/detect_labels/other/email/__init__.py +0 -20
- csv_detective/detect_labels/other/float/__init__.py +0 -8
- csv_detective/detect_labels/other/int/__init__.py +0 -8
- csv_detective/detect_labels/other/money/__init__.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_labels/other/twitter/__init__.py +0 -8
- csv_detective/detect_labels/other/url/__init__.py +0 -23
- csv_detective/detect_labels/other/uuid/__init__.py +0 -8
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -28
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
- csv_detective/detect_labels/temp/year/__init__.py +0 -19
- csv_detective/load_tests.py +0 -59
- csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/top_level.txt +0 -0
csv_detective/detection/formats.py
CHANGED

@@ -7,7 +7,7 @@ from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
-from csv_detective.…
+from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
@@ -16,12 +16,14 @@ from csv_detective.parsing.columns import (
     test_label,
 )
 
+fmtm = FormatsManager()
+
 
 def detect_formats(
     table: pd.DataFrame,
     analysis: dict,
     file_path: str,
-    …
+    tags: list[str] | None = None,
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
@@ -29,15 +31,12 @@ def detect_formats(
     in_chunks = analysis.get("total_lines") is None
 
     # list testing to be performed
-    …
-    …
-    )
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
+    formats: dict[str, Format] = (
+        fmtm.get_formats_from_tags(tags) if tags is not None else fmtm.formats
+    )
 
     # if no testing then return
-    if …
+    if len(formats) == 0:
         return analysis, None
 
     # Perform testing on fields
@@ -45,7 +44,7 @@ def detect_formats(
     # table is small enough to be tested in one go
     scores_table_fields = test_col(
         table=table,
-        …
+        formats=formats,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
@@ -62,7 +61,7 @@ def detect_formats(
         table=table,
         file_path=file_path,
         analysis=analysis,
-        …
+        formats=formats,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
@@ -70,9 +69,7 @@ def detect_formats(
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-    scores_table_labels = test_label(
-        analysis["header"], all_tests_labels, limited_output, verbose=verbose
-    )
+    scores_table_labels = test_label(analysis["header"], formats, limited_output, verbose=verbose)
     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
@@ -115,7 +112,7 @@ def detect_formats(
         "float": "float",
         "string": "string",
         "json": "json",
-        "…
+        "geojson": "json",
         "datetime_aware": "datetime",
         "datetime_naive": "datetime",
         "datetime_rfc822": "datetime",
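Note on the scoring step kept as context above ("Multiply the results of the fields by 1 + 0.5 * the results of the labels"): value-based and header-based detections are combined with that weighting. A minimal numeric sketch, using hypothetical score values that are not taken from this diff:

```python
# Hypothetical per-column, per-format scores (illustrative values only).
field_score = 0.8   # share of the column's values that passed the format's _is check
label_score = 1.0   # header similarity to the format's declared labels

# Weighting described by the comment in detect_formats:
combined_score = field_score * (1 + 0.5 * label_score)  # 0.8 * 1.5 = 1.2
```

In other words, a full header match can boost a field score by up to 50%.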
csv_detective/explore_csv.py
CHANGED

@@ -15,7 +15,7 @@ logging.basicConfig(level=logging.INFO)
 def routine(
     file_path: str,
     num_rows: int = 500,
-    …
+    tags: list[str] | None = None,
     limited_output: bool = True,
     save_results: bool | str = True,
     encoding: str | None = None,
@@ -28,14 +28,13 @@ def routine(
     verbose: bool = False,
     sheet_name: str | int | None = None,
 ) -> dict | tuple[dict, pd.DataFrame]:
-    """
-    …column contents, and if requested the DataFrame with columns cast according to analysis.
+    """
+    Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
         file_path: local path or URL to file
-        num_rows: number of rows to sample from the file for analysis ; -1 for analysis…
-        …
-        user_input_tests: tests to run on the file
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
         limited_output: whether or not to return all possible types or only the most likely one for each column
         save_results: whether or not to save the results in a json file, or the path where to dump the output
         output_profile: whether or not to add the 'profile' field to the output
@@ -74,7 +73,7 @@ def routine(
         table=table,
         analysis=analysis,
         file_path=file_path,
-        …
+        tags=tags,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
@@ -107,7 +106,7 @@ def validate_then_detect(
     file_path: str,
     previous_analysis: dict,
     num_rows: int = 500,
-    …
+    tags: list[str] | None = None,
     limited_output: bool = True,
     save_results: bool | str = True,
     skipna: bool = True,
@@ -117,6 +116,26 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
+    """
+    Performs a validation of the given file against the given analysis.
+    If the validation fails, performs a full analysis and return it.
+    Otherwise return the previous analysis (which is therefore still valid).
+    NB: if asked, the profile is recreated in both cases.
+
+    Args:
+        file_path: the path of the file to validate.
+        previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+        tags: tags to filter formats (for instance ["geo", "fr] to run only the checks related to geo and French formats)
+        limited_output: whether or not to return all possible types or only the most likely one for each column
+        save_results: whether or not to save the results in a json file, or the path where to dump the output
+        skipna: whether to ignore NaN values in the checks
+        output_profile: whether or not to add the 'profile' field to the output
+        output_schema: whether or not to add the 'schema' field to the output (tableschema)
+        output_df: whether or not to return the loaded DataFrame along with the analysis report
+        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+        verbose: whether the code displays the steps it's going through
+    """
     if verbose:
         start_routine = time()
     if is_url(file_path):
@@ -140,7 +159,7 @@ def validate_then_detect(
         table=table,
         analysis=analysis,
         file_path=file_path,
-        …
+        tags=tags,
         limited_output=limited_output,
         skipna=skipna,
         verbose=verbose,
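With `user_input_tests` replaced by `tags` in both entry points, format selection is now tag-based. A minimal usage sketch of the updated `routine` signature; the file name and tag choice below are illustrative, not part of the diff:

```python
from csv_detective.explore_csv import routine

# Analyse a 500-row sample, running only the formats tagged "fr" and "geo".
analysis = routine(
    "data.csv",           # hypothetical local path (URLs are also accepted)
    num_rows=500,
    tags=["fr", "geo"],   # replaces the former user_input_tests argument
    limited_output=True,
    save_results=False,   # keep the report in memory instead of dumping a JSON file
)
print(sorted(analysis.keys()))
```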
csv_detective/format.py
ADDED

@@ -0,0 +1,67 @@
+from typing import Any, Callable
+
+from csv_detective.parsing.text import header_score
+
+
+class Format:
+    def __init__(
+        self,
+        name: str,
+        func: Callable[[Any], bool],
+        _test_values: dict[bool, list[str]],
+        labels: list[str] = [],
+        proportion: float = 1,
+        tags: list[str] = [],
+    ) -> None:
+        """
+        Instanciates a Format object.
+
+        Args:
+            name: the name of the format.
+            func: the value test for the format (returns whether a string is valid).
+            _test_values: lists of valid and invalid values, used in the tests
+            labels: the list of hint headers for the header score
+            proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
+            tags: to allow users to submit a file to only a subset of formats
+        """
+        self.name: str = name
+        self.func: Callable = func
+        self._test_values: dict[bool, list[str]] = _test_values
+        self.labels: list[str] = labels
+        self.proportion: float = proportion
+        self.tags: list[str] = tags
+
+    def is_valid_label(self, val: str) -> float:
+        return header_score(val, self.labels)
+
+
+class FormatsManager:
+    formats: dict[str, Format]
+
+    def __init__(self) -> None:
+        import csv_detective.formats as formats
+
+        format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))]
+        self.formats = {
+            label: Format(
+                name=label,
+                func=(module := getattr(formats, label))._is,
+                _test_values=module._test_values,
+                **{
+                    attr: val
+                    for attr in ["labels", "proportion", "tags"]
+                    if (val := getattr(module, attr, None))
+                },
+            )
+            for label in format_labels
+        }
+
+    def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]:
+        return {
+            label: fmt
+            for label, fmt in self.formats.items()
+            if all(tag in fmt.tags for tag in tags)
+        }
+
+    def available_tags(self) -> set[str]:
+        return set(tag for format in self.formats.values() for tag in format.tags)
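A short sketch of how the new manager is driven, using only the attributes and methods shown above; the tag and sample value are illustrative:

```python
from csv_detective.format import FormatsManager

fmtm = FormatsManager()           # wraps every module exposing `_is` in csv_detective.formats
print(fmtm.available_tags())      # set of declared tags, e.g. including "fr", "geo", "type"

geo_formats = fmtm.get_formats_from_tags(["geo"])   # only formats carrying the "geo" tag
for name, fmt in geo_formats.items():
    # fmt.func is the module's _is check, fmt.proportion its column-level tolerance
    print(name, fmt.proportion, bool(fmt.func("75")))   # "75" is an arbitrary sample value
```

Note that `get_formats_from_tags` keeps a format only if it carries every requested tag, so `["fr", "geo"]` selects the intersection of the two tags, not their union.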
csv_detective/formats/__init__.py
ADDED

@@ -0,0 +1,9 @@
+import importlib
+import os
+
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        module_name = file[:-3]
+        module = importlib.import_module(f"csv_detective.formats.{module_name}")
+        globals()[module_name] = module
+        del module
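This loader imports every non-underscore module in `csv_detective/formats`, and `FormatsManager` then wraps each one that exposes an `_is` callable. A hypothetical new format module following that convention (name, labels and values invented for illustration, not part of this diff) would only need:

```python
# csv_detective/formats/code_exemple.py -- hypothetical module, shown to illustrate the convention

proportion = 1              # optional: share of values that must pass _is (Format defaults to 1)
tags = ["fr"]               # optional: lets users select this format through the `tags` argument
labels = ["code exemple"]   # optional: header hints used by Format.is_valid_label


def _is(val):
    """Required: return True when a single string value matches the format."""
    return isinstance(val, str) and len(val) == 4 and val.isdigit()


# Required: sample values driving the generic tests (valid -> True, invalid -> False)
_test_values = {
    True: ["1234"],
    False: ["12345", "abcd"],
}
```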
csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py}
RENAMED

@@ -1,100 +1,116 @@
-from csv_detective.parsing.text import _process_text
-… (the remaining 99 removed lines are not legible in this rendering)
+from csv_detective.parsing.text import _process_text
+
+proportion = 0.55
+tags = ["fr", "geo"]
+labels = [
+    "adresse",
+    "localisation",
+    "adresse postale",
+    "adresse geographique",
+    "adr",
+    "adresse complete",
+    "adresse station",
+]
+
+voies = {
+    "aire ",
+    "allee ",
+    "avenue ",
+    "base ",
+    "boulevard ",
+    "cami ",
+    "carrefour ",
+    "chemin ",
+    "cheminement ",
+    "chaussee ",
+    "cite ",
+    "clos ",
+    "coin ",
+    "corniche ",
+    "cote ",
+    "cour ",
+    "cours ",
+    "domaine ",
+    "descente ",
+    "ecart ",
+    "esplanade ",
+    "faubourg ",
+    "gare ",
+    "grande rue",
+    "hameau ",
+    "halle ",
+    "ilot ",
+    "impasse ",
+    "lieu dit",
+    "lotissement ",
+    "marche ",
+    "montee ",
+    "parc ",
+    "passage ",
+    "place ",
+    "plan ",
+    "plaine ",
+    "plateau ",
+    "pont ",
+    "port ",
+    "promenade ",
+    "parvis ",
+    "quartier ",
+    "quai ",
+    "residence ",
+    "ruelle ",
+    "rocade ",
+    "rond point",
+    "route ",
+    "rue ",
+    # 'sente - sentier',
+    "square ",
+    "tour ",
+    # 'terre-plein',
+    "traverse ",
+    "villa ",
+    "village ",
+    "voie ",
+    "zone artisanale",
+    "zone d’amenagement concerte",
+    "zone d’amenagement differe",
+    "zone industrielle",
+    "zone ",
+    # 'r',
+    "av ",
+    "pl ",
+    "bd ",
+    "cami ",
+    # 'che',
+    "chs ",
+    "dom ",
+    "ham ",
+    "ld ",
+    # 'pro',
+    # 'rte',
+    "vlge ",
+    "za ",
+    "zac ",
+    "zad ",
+    "zi ",
+    # 'car',
+    "fg ",
+    # 'lot',
+    "imp ",
+    # 'qu',
+    "mte",
+}
+
+
+def _is(val):
+    """Repere des adresses"""
+    if not isinstance(val, str) or len(val) > 150:
+        return False
+    val = _process_text(val)
+    return any(x in val for x in voies)
+
+
+_test_values = {
+    True: ["rue du martyr"],
+    False: ["un batiment"],
+}
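A quick illustration of the relocated check, assuming `_process_text` lowercases the value and strips accents (its exact behaviour lives in `csv_detective.parsing.text` and is not shown in this diff):

```python
from csv_detective.formats.adresse import _is, proportion

print(_is("10 Rue de la Paix"))   # True: the normalised value contains the "rue " keyword
print(_is("un batiment"))         # False: no street-type keyword from `voies`
print(proportion)                 # 0.55: at least 55% of a column must pass _is to count as an address
```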
csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py}
RENAMED

@@ -1,27 +1,35 @@
-… (the 27 removed lines are not legible in this rendering)
+proportion = 1
+tags = ["type"]
+labels = ["is ", "has ", "est "]
+
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
+}
+
+liste_bool = set(bool_mapping.keys())
+
+
+def bool_casting(val: str) -> bool:
+    return bool_mapping.get(val.lower())
+
+
+def _is(val):
+    return isinstance(val, str) and val.lower() in liste_bool
+
+
+_test_values = {
+    True: ["oui", "0", "1", "yes", "false", "True"],
+    False: ["nein", "ja", "2", "-0"],
+}
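Each relocated module now declares its own `_test_values`, which the `Format` docstring says are "used in the tests" (tests/test_fields.py drops 364 lines and adds 39 in this diff). A sketch of how such declared values could drive one generic parametrized check; this is an illustration, not necessarily how the project's test suite is actually organised:

```python
# Hypothetical generic test driven by each format's declared _test_values.
import pytest

from csv_detective.format import FormatsManager

fmtm = FormatsManager()


@pytest.mark.parametrize("name,fmt", list(fmtm.formats.items()))
def test_declared_values(name, fmt):
    for expected, values in fmt._test_values.items():
        for value in values:
            assert bool(fmt.func(value)) is expected, f"{name} failed on {value!r}"
```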
csv_detective/formats/code_commune_insee.py
ADDED

@@ -0,0 +1,26 @@
+from frformat import CodeCommuneInsee, Millesime
+
+proportion = 0.75
+tags = ["fr", "geo"]
+labels = [
+    "code commune insee",
+    "code insee",
+    "codes insee",
+    "code commune",
+    "code insee commune",
+    "insee",
+    "code com",
+    "com",
+]
+
+_code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
+
+
+def _is(val):
+    return isinstance(val, str) and _code_commune_insee.is_valid(val)
+
+
+_test_values = {
+    True: ["91471", "01053"],
+    False: ["914712", "01000"],
+}
csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py}
RENAMED

@@ -1,29 +1,36 @@
-import re
-
-from csv_detective.parsing.text import _process_text
-… (the remaining 26 removed lines are not legible in this rendering)
+import re
+
+from csv_detective.parsing.text import _process_text
+
+proportion = 1
+tags = ["fr"]
+labels = ["code csp insee", "code csp"]
+
+
+def _is(val):
+    if not isinstance(val, str):
+        return False
+    val = _process_text(val)
+    if len(val) != 4:
+        return False
+    a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val))
+    b = val in {
+        "7100",
+        "7200",
+        "7400",
+        "7500",
+        "7700",
+        "7800",
+        "8100",
+        "8300",
+        "8400",
+        "8500",
+        "8600",
+    }
+    return a or b
+
+
+_test_values = {
+    True: ["121f"],
+    False: ["121x"],
+}
csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py}
RENAMED

@@ -1,15 +1,29 @@
-from frformat import Millesime, NumeroDepartement, Options
-… (the remaining 14 removed lines are not legible in this rendering)
+from frformat import Millesime, NumeroDepartement, Options
+
+proportion = 1
+tags = ["fr", "geo"]
+labels = [
+    "code departement",
+    "code_departement",
+    "dep",
+    "departement",
+    "dept",
+]
+
+_options = Options(
+    ignore_case=True,
+    ignore_accents=True,
+    replace_non_alphanumeric_with_space=True,
+    ignore_extra_whitespace=True,
+)
+_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
+
+
+def _is(val):
+    return isinstance(val, str) and _numero_departement.is_valid(val)
+
+
+_test_values = {
+    True: ["75", "2A", "2b", "974", "01"],
+    False: ["00", "96", "101"],
+}