csv-detective 0.9.3.dev2241__py3-none-any.whl → 0.9.3.dev2319__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +12 -15
- csv_detective/detection/headers.py +6 -8
- csv_detective/explore_csv.py +28 -9
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
- csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
- csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
- csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
- csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
- csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
- csv_detective/formats/geojson.py +36 -0
- csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
- csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
- csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
- csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
- csv_detective/formats/sexe.py +17 -0
- csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +45 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +3 -4
- csv_detective/output/dataframe.py +3 -3
- csv_detective/output/profile.py +2 -3
- csv_detective/output/schema.py +2 -2
- csv_detective/parsing/columns.py +35 -50
- csv_detective/parsing/csv.py +2 -2
- csv_detective/parsing/load.py +10 -11
- csv_detective/validate.py +9 -4
- {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/METADATA +6 -5
- csv_detective-0.9.3.dev2319.dist-info/RECORD +102 -0
- tests/test_fields.py +39 -364
- tests/test_file.py +1 -1
- tests/test_labels.py +5 -3
- tests/test_structure.py +40 -36
- csv_detective/detect_fields/FR/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/__init__.py +0 -112
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/email/__init__.py +0 -10
- csv_detective/detect_fields/other/money/__init__.py +0 -11
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/percent/__init__.py +0 -9
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -14
- csv_detective/detect_fields/other/uuid/__init__.py +0 -10
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
- csv_detective/detect_labels/__init__.py +0 -94
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -8
- csv_detective/detect_labels/other/email/__init__.py +0 -20
- csv_detective/detect_labels/other/float/__init__.py +0 -8
- csv_detective/detect_labels/other/int/__init__.py +0 -8
- csv_detective/detect_labels/other/money/__init__.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_labels/other/twitter/__init__.py +0 -8
- csv_detective/detect_labels/other/url/__init__.py +0 -23
- csv_detective/detect_labels/other/uuid/__init__.py +0 -8
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -28
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
- csv_detective/detect_labels/temp/year/__init__.py +0 -19
- csv_detective/load_tests.py +0 -59
- csv_detective-0.9.3.dev2241.dist-info/RECORD +0 -166
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
- {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/top_level.txt +0 -0
tests/test_fields.py
CHANGED
|
@@ -6,87 +6,22 @@ import pandas as pd
|
|
|
6
6
|
import pytest
|
|
7
7
|
from numpy import random
|
|
8
8
|
|
|
9
|
-
from csv_detective.detect_fields.FR.geo import (
|
|
10
|
-
adresse,
|
|
11
|
-
code_commune_insee,
|
|
12
|
-
code_departement,
|
|
13
|
-
code_fantoir,
|
|
14
|
-
code_postal,
|
|
15
|
-
code_region,
|
|
16
|
-
commune,
|
|
17
|
-
departement,
|
|
18
|
-
insee_canton,
|
|
19
|
-
latitude_l93,
|
|
20
|
-
latitude_wgs_fr_metropole,
|
|
21
|
-
longitude_l93,
|
|
22
|
-
longitude_wgs_fr_metropole,
|
|
23
|
-
pays,
|
|
24
|
-
region,
|
|
25
|
-
)
|
|
26
|
-
from csv_detective.detect_fields.FR.other import (
|
|
27
|
-
code_csp_insee,
|
|
28
|
-
code_import,
|
|
29
|
-
code_rna,
|
|
30
|
-
code_waldec,
|
|
31
|
-
csp_insee,
|
|
32
|
-
date_fr,
|
|
33
|
-
insee_ape700,
|
|
34
|
-
sexe,
|
|
35
|
-
siren,
|
|
36
|
-
siret,
|
|
37
|
-
tel_fr,
|
|
38
|
-
uai,
|
|
39
|
-
)
|
|
40
|
-
from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee
|
|
41
|
-
from csv_detective.detect_fields.geo import (
|
|
42
|
-
iso_country_code_alpha2,
|
|
43
|
-
iso_country_code_alpha3,
|
|
44
|
-
iso_country_code_numeric,
|
|
45
|
-
json_geojson,
|
|
46
|
-
latitude_wgs,
|
|
47
|
-
latlon_wgs,
|
|
48
|
-
longitude_wgs,
|
|
49
|
-
lonlat_wgs,
|
|
50
|
-
)
|
|
51
|
-
from csv_detective.detect_fields.other import (
|
|
52
|
-
booleen,
|
|
53
|
-
email,
|
|
54
|
-
json,
|
|
55
|
-
money,
|
|
56
|
-
mongo_object_id,
|
|
57
|
-
percent,
|
|
58
|
-
twitter,
|
|
59
|
-
url,
|
|
60
|
-
uuid,
|
|
61
|
-
)
|
|
62
|
-
from csv_detective.detect_fields.other import (
|
|
63
|
-
float as test_float,
|
|
64
|
-
)
|
|
65
|
-
from csv_detective.detect_fields.other import (
|
|
66
|
-
int as test_int,
|
|
67
|
-
)
|
|
68
|
-
from csv_detective.detect_fields.temp import (
|
|
69
|
-
date,
|
|
70
|
-
datetime_aware,
|
|
71
|
-
datetime_naive,
|
|
72
|
-
datetime_rfc822,
|
|
73
|
-
year,
|
|
74
|
-
)
|
|
75
9
|
from csv_detective.detection.variables import (
|
|
76
10
|
detect_categorical_variable,
|
|
77
11
|
detect_continuous_variable,
|
|
78
12
|
)
|
|
79
|
-
from csv_detective.
|
|
13
|
+
from csv_detective.format import FormatsManager
|
|
80
14
|
from csv_detective.output.dataframe import cast
|
|
81
15
|
from csv_detective.output.utils import prepare_output_dict
|
|
82
16
|
from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it
|
|
83
17
|
|
|
18
|
+
fmtm = FormatsManager()
|
|
19
|
+
|
|
84
20
|
|
|
85
|
-
def
|
|
86
|
-
|
|
87
|
-
for attr in all_tests.values():
|
|
21
|
+
def test_all_format_funcs_return_bool():
|
|
22
|
+
for format in fmtm.formats.values():
|
|
88
23
|
for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
|
|
89
|
-
assert isinstance(
|
|
24
|
+
assert isinstance(format.func(tmp), bool)
|
|
90
25
|
|
|
91
26
|
|
|
92
27
|
# categorical
|
|
@@ -124,292 +59,37 @@ def test_detect_continuous_variable():
|
|
|
124
59
|
assert res2.values and res2.values[0] == "cont"
|
|
125
60
|
|
|
126
61
|
|
|
127
|
-
fields = {
|
|
128
|
-
adresse: {
|
|
129
|
-
True: ["rue du martyr"],
|
|
130
|
-
False: ["un batiment"],
|
|
131
|
-
},
|
|
132
|
-
code_commune_insee: {
|
|
133
|
-
True: ["91471", "01053"],
|
|
134
|
-
False: ["914712", "01000"],
|
|
135
|
-
},
|
|
136
|
-
code_departement: {
|
|
137
|
-
True: ["75", "2A", "2b", "974", "01"],
|
|
138
|
-
False: ["00", "96", "101"],
|
|
139
|
-
},
|
|
140
|
-
code_fantoir: {
|
|
141
|
-
True: ["7755A", "B150B", "ZA04C", "ZB03D"],
|
|
142
|
-
False: ["7755", "ZA99A"],
|
|
143
|
-
},
|
|
144
|
-
code_postal: {
|
|
145
|
-
True: ["75020", "01000"],
|
|
146
|
-
False: ["77777", "018339"],
|
|
147
|
-
},
|
|
148
|
-
code_region: {
|
|
149
|
-
True: ["32"],
|
|
150
|
-
False: ["55"],
|
|
151
|
-
},
|
|
152
|
-
commune: {
|
|
153
|
-
True: ["saint denis"],
|
|
154
|
-
False: ["new york", "lion"],
|
|
155
|
-
},
|
|
156
|
-
departement: {
|
|
157
|
-
True: ["essonne"],
|
|
158
|
-
False: ["alabama", "auvergne"],
|
|
159
|
-
},
|
|
160
|
-
insee_canton: {
|
|
161
|
-
True: ["nantua"],
|
|
162
|
-
False: ["california"],
|
|
163
|
-
},
|
|
164
|
-
latitude_l93: {
|
|
165
|
-
True: ["6037008", "7123528.5", "7124528,5"],
|
|
166
|
-
False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"],
|
|
167
|
-
},
|
|
168
|
-
longitude_l93: {
|
|
169
|
-
True: ["0", "-154", "1265783,45", "34723.4"],
|
|
170
|
-
False: ["1456669.8", "-776225", "346_3214"],
|
|
171
|
-
},
|
|
172
|
-
latitude_wgs_fr_metropole: {
|
|
173
|
-
True: ["42.5"],
|
|
174
|
-
False: ["22.5", "62.5"],
|
|
175
|
-
},
|
|
176
|
-
longitude_wgs_fr_metropole: {
|
|
177
|
-
True: ["-2.5"],
|
|
178
|
-
False: ["12.8"],
|
|
179
|
-
},
|
|
180
|
-
pays: {
|
|
181
|
-
True: ["france", "italie"],
|
|
182
|
-
False: ["amerique", "paris"],
|
|
183
|
-
},
|
|
184
|
-
region: {
|
|
185
|
-
True: ["bretagne", "ile-de-france"],
|
|
186
|
-
False: ["baviere", "overgne"],
|
|
187
|
-
},
|
|
188
|
-
code_csp_insee: {
|
|
189
|
-
True: ["121f"],
|
|
190
|
-
False: ["121x"],
|
|
191
|
-
},
|
|
192
|
-
code_rna: {
|
|
193
|
-
True: ["W751515517"],
|
|
194
|
-
False: [
|
|
195
|
-
"W111111111111111111111111111111111111",
|
|
196
|
-
"w143788974",
|
|
197
|
-
"W12",
|
|
198
|
-
"678W23456",
|
|
199
|
-
"165789325",
|
|
200
|
-
"Wa1#89sf&h",
|
|
201
|
-
],
|
|
202
|
-
},
|
|
203
|
-
code_import: {
|
|
204
|
-
True: ["123S1871092288"],
|
|
205
|
-
False: ["AA751PEE00188854", "W123456789"],
|
|
206
|
-
},
|
|
207
|
-
code_waldec: {
|
|
208
|
-
True: ["W123456789", "W2D1234567"],
|
|
209
|
-
False: ["AA751PEE00188854"],
|
|
210
|
-
},
|
|
211
|
-
csp_insee: {
|
|
212
|
-
True: ["employes de la poste"],
|
|
213
|
-
False: ["super-heros"],
|
|
214
|
-
},
|
|
215
|
-
sexe: {
|
|
216
|
-
True: ["homme"],
|
|
217
|
-
False: ["hermaphrodite"],
|
|
218
|
-
},
|
|
219
|
-
siren: {
|
|
220
|
-
True: ["552 100 554", "552100554"],
|
|
221
|
-
False: ["42"],
|
|
222
|
-
},
|
|
223
|
-
siret: {
|
|
224
|
-
True: ["13002526500013", "130 025 265 00013"],
|
|
225
|
-
False: ["13002526500012"],
|
|
226
|
-
},
|
|
227
|
-
uai: {
|
|
228
|
-
True: ["0422170F"],
|
|
229
|
-
False: ["04292E"],
|
|
230
|
-
},
|
|
231
|
-
date_fr: {
|
|
232
|
-
True: ["13 fevrier 1996"],
|
|
233
|
-
False: ["44 march 2025"],
|
|
234
|
-
},
|
|
235
|
-
insee_ape700: {True: ["0116Z"], False: ["0116A"]},
|
|
236
|
-
tel_fr: {
|
|
237
|
-
True: ["0134643467"],
|
|
238
|
-
False: ["6625388263", "01288398"],
|
|
239
|
-
},
|
|
240
|
-
jour_de_la_semaine: {
|
|
241
|
-
True: ["lundi"],
|
|
242
|
-
False: ["jour de la biere"],
|
|
243
|
-
},
|
|
244
|
-
mois_de_annee: {
|
|
245
|
-
True: ["juin", "décembre"],
|
|
246
|
-
False: ["november"],
|
|
247
|
-
},
|
|
248
|
-
iso_country_code_alpha2: {
|
|
249
|
-
True: ["FR"],
|
|
250
|
-
False: ["XX", "A", "FRA"],
|
|
251
|
-
},
|
|
252
|
-
iso_country_code_alpha3: {
|
|
253
|
-
True: ["FRA"],
|
|
254
|
-
False: ["XXX", "FR", "A"],
|
|
255
|
-
},
|
|
256
|
-
iso_country_code_numeric: {
|
|
257
|
-
True: ["250"],
|
|
258
|
-
False: ["003"],
|
|
259
|
-
},
|
|
260
|
-
json_geojson: {
|
|
261
|
-
True: [
|
|
262
|
-
'{"coordinates": [45.783753, 3.049342], "type": "63870"}',
|
|
263
|
-
'{"geometry": {"coordinates": [45.783753, 3.049342]}}',
|
|
264
|
-
],
|
|
265
|
-
False: ['{"pomme": "fruit", "reponse": 42}'],
|
|
266
|
-
},
|
|
267
|
-
latitude_wgs: {
|
|
268
|
-
True: ["43.2", "-22"],
|
|
269
|
-
False: ["100"],
|
|
270
|
-
},
|
|
271
|
-
latlon_wgs: {
|
|
272
|
-
True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
|
|
273
|
-
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
|
|
274
|
-
},
|
|
275
|
-
longitude_wgs: {
|
|
276
|
-
True: ["120", "-20.2"],
|
|
277
|
-
False: ["-200"],
|
|
278
|
-
},
|
|
279
|
-
lonlat_wgs: {
|
|
280
|
-
True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
|
|
281
|
-
False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
|
|
282
|
-
},
|
|
283
|
-
booleen: {
|
|
284
|
-
True: ["oui", "0", "1", "yes", "false", "True"],
|
|
285
|
-
False: ["nein", "ja", "2", "-0"],
|
|
286
|
-
},
|
|
287
|
-
email: {
|
|
288
|
-
True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
|
|
289
|
-
False: ["cdo@@gouv.sfd"],
|
|
290
|
-
},
|
|
291
|
-
json: {
|
|
292
|
-
True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
|
|
293
|
-
False: ["5", '{"zefib":', '{"a"}'],
|
|
294
|
-
},
|
|
295
|
-
money: {
|
|
296
|
-
True: ["120€", "-20.2$"],
|
|
297
|
-
False: ["200", "100 euros"],
|
|
298
|
-
},
|
|
299
|
-
mongo_object_id: {
|
|
300
|
-
True: ["62320e50f981bc2b57bcc044"],
|
|
301
|
-
False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
|
|
302
|
-
},
|
|
303
|
-
percent: {
|
|
304
|
-
True: ["120%", "-20.2%"],
|
|
305
|
-
False: ["200", "100 pourcents"],
|
|
306
|
-
},
|
|
307
|
-
twitter: {
|
|
308
|
-
True: ["@accueil1"],
|
|
309
|
-
False: ["adresse@mail"],
|
|
310
|
-
},
|
|
311
|
-
url: {
|
|
312
|
-
True: [
|
|
313
|
-
"www.data.gouv.fr",
|
|
314
|
-
"http://data.gouv.fr",
|
|
315
|
-
"https://www.youtube.com/@data-gouv-fr",
|
|
316
|
-
(
|
|
317
|
-
"https://tabular-api.data.gouv.fr/api/resources/"
|
|
318
|
-
"aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
|
|
319
|
-
"?score__greater=0.9&decompte__exact=13"
|
|
320
|
-
),
|
|
321
|
-
],
|
|
322
|
-
False: ["tmp@data.gouv.fr"],
|
|
323
|
-
},
|
|
324
|
-
uuid: {
|
|
325
|
-
True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
|
|
326
|
-
False: ["0610928327"],
|
|
327
|
-
},
|
|
328
|
-
test_int: {
|
|
329
|
-
True: ["1", "0", "1764", "-24"],
|
|
330
|
-
False: ["01053", "1.2", "123_456", "+35"],
|
|
331
|
-
},
|
|
332
|
-
test_float: {
|
|
333
|
-
True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
|
|
334
|
-
False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
|
|
335
|
-
},
|
|
336
|
-
date: {
|
|
337
|
-
True: [
|
|
338
|
-
"1960-08-07",
|
|
339
|
-
"12/02/2007",
|
|
340
|
-
"15 jan 1985",
|
|
341
|
-
"15 décembre 1985",
|
|
342
|
-
"02 05 2003",
|
|
343
|
-
"20030502",
|
|
344
|
-
"1993-12/02",
|
|
345
|
-
],
|
|
346
|
-
False: [
|
|
347
|
-
"1993-1993-1993",
|
|
348
|
-
"39-10-1993",
|
|
349
|
-
"19-15-1993",
|
|
350
|
-
"15 tambour 1985",
|
|
351
|
-
"12152003",
|
|
352
|
-
"20031512",
|
|
353
|
-
"02052003",
|
|
354
|
-
],
|
|
355
|
-
},
|
|
356
|
-
datetime_aware: {
|
|
357
|
-
True: [
|
|
358
|
-
"2021-06-22 10:20:10-04:00",
|
|
359
|
-
"2030-06-22 00:00:00.0028+02:00",
|
|
360
|
-
"2000-12-21 10:20:10.1Z",
|
|
361
|
-
"2024-12-19T10:53:36.428000+00:00",
|
|
362
|
-
"1996/06/22 10:20:10 GMT",
|
|
363
|
-
],
|
|
364
|
-
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
|
|
365
|
-
},
|
|
366
|
-
datetime_naive: {
|
|
367
|
-
True: [
|
|
368
|
-
"2021-06-22 10:20:10",
|
|
369
|
-
"2030/06-22 00:00:00",
|
|
370
|
-
"2030/06/22 00:00:00.0028",
|
|
371
|
-
],
|
|
372
|
-
False: [
|
|
373
|
-
"2021-06-22T30:20:10",
|
|
374
|
-
"Sun, 06 Nov 1994 08:49:37 GMT",
|
|
375
|
-
"2021-06-44 10:20:10+02:00",
|
|
376
|
-
"1999-12-01T00:00:00Z",
|
|
377
|
-
"2021-06-44",
|
|
378
|
-
"15 décembre 1985",
|
|
379
|
-
],
|
|
380
|
-
},
|
|
381
|
-
datetime_rfc822: {
|
|
382
|
-
True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
383
|
-
False: ["2021-06-22T10:20:10"],
|
|
384
|
-
},
|
|
385
|
-
year: {
|
|
386
|
-
True: ["2015"],
|
|
387
|
-
False: ["20166"],
|
|
388
|
-
},
|
|
389
|
-
}
|
|
390
|
-
|
|
391
62
|
# we could also have a function here to add all True values of (almost)
|
|
392
|
-
# each field to the False values of all others
|
|
63
|
+
# each field to the False values of all others (to do when parenthood is added)
|
|
393
64
|
|
|
394
65
|
|
|
395
66
|
def test_all_fields_have_tests():
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
67
|
+
for format in fmtm.formats.values():
|
|
68
|
+
valid = format._test_values
|
|
69
|
+
# checking structure
|
|
70
|
+
assert all(
|
|
71
|
+
isinstance(key, bool)
|
|
72
|
+
and isinstance(vals, list)
|
|
73
|
+
and all(isinstance(val, str) for val in vals)
|
|
74
|
+
for key, vals in valid.items()
|
|
75
|
+
)
|
|
76
|
+
# checking that we have valid and invalid cases for each
|
|
77
|
+
assert all(b in valid.keys() for b in [True, False])
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# this is based on the _test_values of each <format>.py file
|
|
401
81
|
@pytest.mark.parametrize(
|
|
402
82
|
"args",
|
|
403
83
|
(
|
|
404
|
-
(
|
|
405
|
-
for field in fields
|
|
84
|
+
(format.func, value, valid)
|
|
406
85
|
for valid in [True, False]
|
|
407
|
-
for
|
|
86
|
+
for format in fmtm.formats.values()
|
|
87
|
+
for value in format._test_values[valid]
|
|
408
88
|
),
|
|
409
89
|
)
|
|
410
90
|
def test_fields_with_values(args):
|
|
411
|
-
|
|
412
|
-
assert
|
|
91
|
+
func, value, valid = args
|
|
92
|
+
assert func(value) is valid
|
|
413
93
|
|
|
414
94
|
|
|
415
95
|
@pytest.mark.parametrize(
|
|
@@ -456,37 +136,32 @@ def test_priority(args):
|
|
|
456
136
|
@pytest.mark.parametrize(
|
|
457
137
|
"args",
|
|
458
138
|
(
|
|
459
|
-
("1996-02-13", date),
|
|
460
|
-
("28/01/2000", date),
|
|
461
|
-
("2025-08-20T14:30:00+02:00", datetime_aware),
|
|
462
|
-
("2025/08/20 14:30:00.2763-12:00", datetime_aware),
|
|
463
|
-
("1925_12_20T14:30:00.2763", datetime_naive),
|
|
464
|
-
("1925 12 20 14:30:00Z", datetime_aware),
|
|
139
|
+
("1996-02-13", fmtm.formats["date"]),
|
|
140
|
+
("28/01/2000", fmtm.formats["date"]),
|
|
141
|
+
("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]),
|
|
142
|
+
("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]),
|
|
143
|
+
("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]),
|
|
144
|
+
("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]),
|
|
465
145
|
),
|
|
466
146
|
)
|
|
467
147
|
def test_early_detection(args):
|
|
468
|
-
value,
|
|
469
|
-
with patch("csv_detective.
|
|
470
|
-
res =
|
|
148
|
+
value, format = args
|
|
149
|
+
with patch("csv_detective.formats.date.date_casting") as mock_func:
|
|
150
|
+
res = format.func(value)
|
|
471
151
|
assert res
|
|
472
152
|
mock_func.assert_not_called()
|
|
473
153
|
|
|
474
154
|
|
|
475
155
|
def test_all_proportion_1():
|
|
476
|
-
all_tests = return_all_tests("ALL", "detect_fields")
|
|
477
|
-
prop_1 = {
|
|
478
|
-
name: eval(name if name not in ["int", "float"] else "test_" + name)
|
|
479
|
-
for name, attr in all_tests.items()
|
|
480
|
-
if attr["prop"] == 1
|
|
481
|
-
}
|
|
482
156
|
# building a table that uses only correct values for these formats, except on one row
|
|
483
157
|
table = pd.DataFrame(
|
|
484
158
|
{
|
|
485
|
-
|
|
486
|
-
for
|
|
159
|
+
name: (format._test_values[True] * 100)[:100] + ["not_suitable"]
|
|
160
|
+
for name, format in fmtm.formats.items()
|
|
161
|
+
if format.proportion == 1
|
|
487
162
|
}
|
|
488
163
|
)
|
|
489
164
|
# testing columns for all formats
|
|
490
|
-
returned_table = col_test(table,
|
|
165
|
+
returned_table = col_test(table, fmtm.formats, limited_output=True)
|
|
491
166
|
# the analysis should have found no match on any format
|
|
492
167
|
assert all(returned_table[col].sum() == 0 for col in table.columns)
|
tests/test_file.py
CHANGED
|
@@ -49,7 +49,7 @@ def test_columns_output_on_file(chunk_size):
|
|
|
49
49
|
assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
|
|
50
50
|
assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
|
|
51
51
|
assert output["columns"]["GEO_INFO"]["python_type"] == "json"
|
|
52
|
-
assert output["columns"]["GEO_INFO"]["format"] == "
|
|
52
|
+
assert output["columns"]["GEO_INFO"]["format"] == "geojson"
|
|
53
53
|
|
|
54
54
|
|
|
55
55
|
def test_profile_output_on_file():
|
tests/test_labels.py
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
import pytest
|
|
2
2
|
|
|
3
|
-
from csv_detective.
|
|
3
|
+
from csv_detective.format import FormatsManager
|
|
4
|
+
|
|
5
|
+
fmtm = FormatsManager()
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
# money labels
|
|
7
9
|
def test_money_labels():
|
|
8
10
|
header = "Montant total"
|
|
9
|
-
assert money.
|
|
11
|
+
assert fmtm.formats["money"].is_valid_label(header) == 0.5
|
|
10
12
|
|
|
11
13
|
|
|
12
14
|
@pytest.mark.parametrize(
|
|
@@ -21,4 +23,4 @@ def test_money_labels():
|
|
|
21
23
|
)
|
|
22
24
|
def test_latitude(params):
|
|
23
25
|
header, expected = params
|
|
24
|
-
assert expected == latitude_wgs.
|
|
26
|
+
assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header)
|
tests/test_structure.py
CHANGED
|
@@ -1,41 +1,45 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
"""
|
|
9
|
-
Check that all tests are properly structured:
|
|
10
|
-
- an __init__.py file in the test folder
|
|
11
|
-
- an _is function in the __init__.py file
|
|
12
|
-
"""
|
|
13
|
-
for _type in ["fields", "labels"]:
|
|
14
|
-
_dir = f"csv_detective/detect_{_type}"
|
|
15
|
-
subfolders = []
|
|
16
|
-
for dirpath, dirnames, _ in os.walk(_dir):
|
|
17
|
-
for dirname in dirnames:
|
|
18
|
-
if "__pycache__" not in dirname:
|
|
19
|
-
subfolders.append(os.path.join(dirpath, dirname))
|
|
20
|
-
final_subfolders = [
|
|
21
|
-
sf
|
|
22
|
-
for sf in subfolders
|
|
23
|
-
if not any(other_sf.startswith(sf) for other_sf in subfolders if sf != other_sf)
|
|
24
|
-
]
|
|
25
|
-
for f_sf in final_subfolders:
|
|
26
|
-
assert "__init__.py" in os.listdir(f_sf)
|
|
27
|
-
_package = eval(
|
|
28
|
-
f_sf.replace("csv_detective/", "")
|
|
29
|
-
# locally we have "\\", but in CI for instance there is "/"
|
|
30
|
-
.replace("\\", ".")
|
|
31
|
-
.replace("/", ".")
|
|
32
|
-
)
|
|
33
|
-
assert "_is" in dir(_package)
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from csv_detective.format import Format, FormatsManager
|
|
6
|
+
|
|
7
|
+
fmtm = FormatsManager()
|
|
34
8
|
|
|
35
9
|
|
|
36
10
|
def test_all_tests_have_unique_name():
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
11
|
+
formats: list[str] = os.listdir("csv_detective/formats")
|
|
12
|
+
assert "__init__.py" in formats
|
|
13
|
+
assert len(formats) == len(set(formats))
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_conformity():
|
|
17
|
+
for name, format in fmtm.formats.items():
|
|
18
|
+
assert isinstance(name, str)
|
|
19
|
+
assert isinstance(format, Format)
|
|
20
|
+
assert all(
|
|
21
|
+
getattr(format, attr) is not None
|
|
22
|
+
for attr in [
|
|
23
|
+
"name",
|
|
24
|
+
"func",
|
|
25
|
+
"_test_values",
|
|
26
|
+
"labels",
|
|
27
|
+
"proportion",
|
|
28
|
+
"tags",
|
|
29
|
+
]
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.mark.parametrize(
|
|
34
|
+
"tags",
|
|
35
|
+
(
|
|
36
|
+
["type"],
|
|
37
|
+
["temp", "fr"],
|
|
38
|
+
),
|
|
39
|
+
)
|
|
40
|
+
def test_get_from_tags(tags):
|
|
41
|
+
fmts = fmtm.get_formats_from_tags(tags)
|
|
42
|
+
assert len(fmts)
|
|
43
|
+
for fmt in fmts.values():
|
|
44
|
+
for tag in tags:
|
|
45
|
+
assert tag in fmt.tags
|
|
File without changes
|
|
File without changes
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from frformat import Departement, Millesime, Options
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.9
|
|
4
|
-
|
|
5
|
-
_options = Options(
|
|
6
|
-
ignore_case=True,
|
|
7
|
-
ignore_accents=True,
|
|
8
|
-
replace_non_alphanumeric_with_space=True,
|
|
9
|
-
ignore_extra_whitespace=True,
|
|
10
|
-
)
|
|
11
|
-
_departement = Departement(Millesime.LATEST, _options)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def _is(val):
|
|
15
|
-
"""Match avec le nom des departements"""
|
|
16
|
-
return isinstance(val, str) and _departement.is_valid(val)
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from frformat import LatitudeL93
|
|
2
|
-
|
|
3
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
4
|
-
from csv_detective.detect_fields.other.float import float_casting
|
|
5
|
-
|
|
6
|
-
PROPORTION = 1
|
|
7
|
-
|
|
8
|
-
_latitudel93 = LatitudeL93()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def _is(val):
|
|
12
|
-
try:
|
|
13
|
-
if isinstance(val, str) and is_float(val):
|
|
14
|
-
return _latitudel93.is_valid(float_casting(val))
|
|
15
|
-
|
|
16
|
-
return False
|
|
17
|
-
|
|
18
|
-
except (ValueError, OverflowError):
|
|
19
|
-
return False
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
2
|
-
|
|
3
|
-
PROPORTION = 1
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def _is(val):
|
|
7
|
-
"""Renvoie True si val peut etre une latitude en métropole"""
|
|
8
|
-
try:
|
|
9
|
-
return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3
|
|
10
|
-
except ValueError:
|
|
11
|
-
return False
|
|
12
|
-
except OverflowError:
|
|
13
|
-
return False
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from frformat import LongitudeL93
|
|
2
|
-
|
|
3
|
-
from csv_detective.detect_fields.other.float import _is as is_float
|
|
4
|
-
from csv_detective.detect_fields.other.float import float_casting
|
|
5
|
-
|
|
6
|
-
PROPORTION = 1
|
|
7
|
-
|
|
8
|
-
_longitudel93 = LongitudeL93()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def _is(val):
|
|
12
|
-
try:
|
|
13
|
-
if isinstance(val, str) and is_float(val):
|
|
14
|
-
return _longitudel93.is_valid(float_casting(val))
|
|
15
|
-
|
|
16
|
-
return False
|
|
17
|
-
|
|
18
|
-
except (ValueError, OverflowError):
|
|
19
|
-
return False
|