csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.2549__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/__init__.py +0 -0
- csv_detective/detection/columns.py +0 -0
- csv_detective/detection/encoding.py +0 -0
- csv_detective/detection/engine.py +0 -0
- csv_detective/detection/formats.py +38 -11
- csv_detective/detection/headers.py +14 -12
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/separator.py +0 -0
- csv_detective/detection/variables.py +0 -0
- csv_detective/explore_csv.py +6 -18
- csv_detective/format.py +5 -12
- csv_detective/formats/__init__.py +0 -0
- csv_detective/formats/adresse.py +9 -9
- csv_detective/formats/binary.py +1 -2
- csv_detective/formats/booleen.py +2 -3
- csv_detective/formats/code_commune_insee.py +10 -12
- csv_detective/formats/code_csp_insee.py +1 -1
- csv_detective/formats/code_departement.py +7 -8
- csv_detective/formats/code_fantoir.py +5 -6
- csv_detective/formats/code_import.py +1 -1
- csv_detective/formats/code_postal.py +9 -10
- csv_detective/formats/code_region.py +6 -7
- csv_detective/formats/code_rna.py +6 -7
- csv_detective/formats/code_waldec.py +1 -1
- csv_detective/formats/commune.py +5 -5
- csv_detective/formats/csp_insee.py +5 -6
- csv_detective/formats/data/insee_ape700.txt +1 -1
- csv_detective/formats/data/iso_country_code_alpha2.txt +397 -153
- csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
- csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
- csv_detective/formats/date.py +18 -28
- csv_detective/formats/date_fr.py +1 -1
- csv_detective/formats/datetime_aware.py +2 -7
- csv_detective/formats/datetime_naive.py +0 -3
- csv_detective/formats/datetime_rfc822.py +0 -1
- csv_detective/formats/departement.py +15 -15
- csv_detective/formats/email.py +13 -13
- csv_detective/formats/float.py +1 -2
- csv_detective/formats/geojson.py +10 -10
- csv_detective/formats/insee_ape700.py +8 -10
- csv_detective/formats/insee_canton.py +6 -6
- csv_detective/formats/int.py +1 -2
- csv_detective/formats/iso_country_code_alpha2.py +14 -14
- csv_detective/formats/iso_country_code_alpha3.py +13 -6
- csv_detective/formats/iso_country_code_numeric.py +9 -2
- csv_detective/formats/jour_de_la_semaine.py +12 -11
- csv_detective/formats/json.py +0 -6
- csv_detective/formats/latitude_l93.py +22 -8
- csv_detective/formats/latitude_wgs.py +29 -31
- csv_detective/formats/latitude_wgs_fr_metropole.py +30 -7
- csv_detective/formats/latlon_wgs.py +28 -30
- csv_detective/formats/longitude_l93.py +13 -8
- csv_detective/formats/longitude_wgs.py +19 -34
- csv_detective/formats/longitude_wgs_fr_metropole.py +19 -6
- csv_detective/formats/lonlat_wgs.py +11 -12
- csv_detective/formats/mois_de_lannee.py +1 -1
- csv_detective/formats/money.py +1 -1
- csv_detective/formats/mongo_object_id.py +1 -1
- csv_detective/formats/pays.py +13 -11
- csv_detective/formats/percent.py +1 -1
- csv_detective/formats/region.py +13 -13
- csv_detective/formats/sexe.py +1 -1
- csv_detective/formats/siren.py +10 -9
- csv_detective/formats/siret.py +9 -9
- csv_detective/formats/tel_fr.py +13 -7
- csv_detective/formats/uai.py +18 -17
- csv_detective/formats/url.py +16 -16
- csv_detective/formats/username.py +1 -1
- csv_detective/formats/uuid.py +1 -1
- csv_detective/formats/year.py +12 -7
- csv_detective/output/__init__.py +0 -0
- csv_detective/output/dataframe.py +3 -8
- csv_detective/output/example.py +0 -0
- csv_detective/output/profile.py +2 -6
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/parsing/__init__.py +0 -0
- csv_detective/parsing/columns.py +1 -1
- csv_detective/parsing/compression.py +0 -0
- csv_detective/parsing/csv.py +0 -0
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +12 -11
- csv_detective/parsing/text.py +12 -13
- csv_detective/validate.py +36 -71
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.2549.dist-info}/METADATA +18 -15
- csv_detective-0.10.2549.dist-info/RECORD +92 -0
- csv_detective-0.10.2549.dist-info/WHEEL +4 -0
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.2549.dist-info}/entry_points.txt +1 -0
- csv_detective-0.10.3.dev7.dist-info/RECORD +0 -111
- csv_detective-0.10.3.dev7.dist-info/WHEEL +0 -5
- csv_detective-0.10.3.dev7.dist-info/licenses/LICENSE +0 -21
- csv_detective-0.10.3.dev7.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/data/a_test_file.csv +0 -407
- tests/data/a_test_file.json +0 -394
- tests/data/b_test_file.csv +0 -7
- tests/data/c_test_file.csv +0 -2
- tests/data/csv_file +0 -7
- tests/data/file.csv.gz +0 -0
- tests/data/file.ods +0 -0
- tests/data/file.xls +0 -0
- tests/data/file.xlsx +0 -0
- tests/data/xlsx_file +0 -0
- tests/test_example.py +0 -67
- tests/test_fields.py +0 -175
- tests/test_file.py +0 -468
- tests/test_labels.py +0 -26
- tests/test_structure.py +0 -45
- tests/test_validation.py +0 -163
tests/data/a_test_file.json
DELETED
|
@@ -1,394 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"encoding": "ASCII",
|
|
3
|
-
"separator": ";",
|
|
4
|
-
"header_row_idx": 2,
|
|
5
|
-
"header": [
|
|
6
|
-
"NUMCOM",
|
|
7
|
-
"NOMCOM",
|
|
8
|
-
"NUMDEP",
|
|
9
|
-
"NOMDEP",
|
|
10
|
-
"NUMEPCI",
|
|
11
|
-
"NOMEPCI",
|
|
12
|
-
"TXCOUVGLO_COM_2014",
|
|
13
|
-
"TXCOUVGLO_DEP_2014",
|
|
14
|
-
"TXCOUVGLO_EPCI_2014",
|
|
15
|
-
"STRUCTURED_INFO",
|
|
16
|
-
"GEO_INFO"
|
|
17
|
-
],
|
|
18
|
-
"total_lines": 404,
|
|
19
|
-
"nb_duplicates": 7,
|
|
20
|
-
"heading_columns": 0,
|
|
21
|
-
"trailing_columns": 0,
|
|
22
|
-
"continuous": [
|
|
23
|
-
"TXCOUVGLO_EPCI_2014"
|
|
24
|
-
],
|
|
25
|
-
"categorical": [
|
|
26
|
-
"NUMDEP",
|
|
27
|
-
"NOMDEP",
|
|
28
|
-
"TXCOUVGLO_DEP_2014",
|
|
29
|
-
"GEO_INFO"
|
|
30
|
-
],
|
|
31
|
-
"columns_fields": {
|
|
32
|
-
"NUMCOM": {
|
|
33
|
-
"python_type": "string",
|
|
34
|
-
"format": "code_commune_insee",
|
|
35
|
-
"score": 1.0
|
|
36
|
-
},
|
|
37
|
-
"NOMCOM": {
|
|
38
|
-
"python_type": "string",
|
|
39
|
-
"format": "commune",
|
|
40
|
-
"score": 1.0
|
|
41
|
-
},
|
|
42
|
-
"NUMDEP": {
|
|
43
|
-
"python_type": "int",
|
|
44
|
-
"format": "int",
|
|
45
|
-
"score": 1.0
|
|
46
|
-
},
|
|
47
|
-
"NOMDEP": {
|
|
48
|
-
"python_type": "string",
|
|
49
|
-
"format": "departement",
|
|
50
|
-
"score": 1.0
|
|
51
|
-
},
|
|
52
|
-
"NUMEPCI": {
|
|
53
|
-
"python_type": "string",
|
|
54
|
-
"format": "siren",
|
|
55
|
-
"score": 1.0
|
|
56
|
-
},
|
|
57
|
-
"NOMEPCI": {
|
|
58
|
-
"python_type": "string",
|
|
59
|
-
"format": "string",
|
|
60
|
-
"score": 1.0
|
|
61
|
-
},
|
|
62
|
-
"TXCOUVGLO_COM_2014": {
|
|
63
|
-
"python_type": "float",
|
|
64
|
-
"format": "float",
|
|
65
|
-
"score": 1.0
|
|
66
|
-
},
|
|
67
|
-
"TXCOUVGLO_DEP_2014": {
|
|
68
|
-
"python_type": "float",
|
|
69
|
-
"format": "latitude_wgs",
|
|
70
|
-
"score": 0.9951690821256038
|
|
71
|
-
},
|
|
72
|
-
"TXCOUVGLO_EPCI_2014": {
|
|
73
|
-
"python_type": "float",
|
|
74
|
-
"format": "longitude_wgs",
|
|
75
|
-
"score": 0.9444444444444444
|
|
76
|
-
},
|
|
77
|
-
"STRUCTURED_INFO": {
|
|
78
|
-
"python_type": "json",
|
|
79
|
-
"format": "json",
|
|
80
|
-
"score": 1.0
|
|
81
|
-
},
|
|
82
|
-
"GEO_INFO": {
|
|
83
|
-
"python_type": "json",
|
|
84
|
-
"format": "geojson",
|
|
85
|
-
"score": 1.0
|
|
86
|
-
}
|
|
87
|
-
},
|
|
88
|
-
"columns_labels": {
|
|
89
|
-
"NUMCOM": {
|
|
90
|
-
"python_type": "string",
|
|
91
|
-
"format": "code_commune_insee",
|
|
92
|
-
"score": 0.5
|
|
93
|
-
},
|
|
94
|
-
"NOMCOM": {
|
|
95
|
-
"python_type": "string",
|
|
96
|
-
"format": "string",
|
|
97
|
-
"score": 1.0
|
|
98
|
-
},
|
|
99
|
-
"NUMDEP": {
|
|
100
|
-
"python_type": "string",
|
|
101
|
-
"format": "string",
|
|
102
|
-
"score": 1.0
|
|
103
|
-
},
|
|
104
|
-
"NOMDEP": {
|
|
105
|
-
"python_type": "string",
|
|
106
|
-
"format": "string",
|
|
107
|
-
"score": 1.0
|
|
108
|
-
},
|
|
109
|
-
"NUMEPCI": {
|
|
110
|
-
"python_type": "string",
|
|
111
|
-
"format": "string",
|
|
112
|
-
"score": 1.0
|
|
113
|
-
},
|
|
114
|
-
"NOMEPCI": {
|
|
115
|
-
"python_type": "string",
|
|
116
|
-
"format": "string",
|
|
117
|
-
"score": 1.0
|
|
118
|
-
},
|
|
119
|
-
"TXCOUVGLO_COM_2014": {
|
|
120
|
-
"python_type": "string",
|
|
121
|
-
"format": "code_commune_insee",
|
|
122
|
-
"score": 0.5
|
|
123
|
-
},
|
|
124
|
-
"TXCOUVGLO_DEP_2014": {
|
|
125
|
-
"python_type": "string",
|
|
126
|
-
"format": "code_departement",
|
|
127
|
-
"score": 0.5
|
|
128
|
-
},
|
|
129
|
-
"TXCOUVGLO_EPCI_2014": {
|
|
130
|
-
"python_type": "string",
|
|
131
|
-
"format": "string",
|
|
132
|
-
"score": 1.0
|
|
133
|
-
},
|
|
134
|
-
"STRUCTURED_INFO": {
|
|
135
|
-
"python_type": "string",
|
|
136
|
-
"format": "string",
|
|
137
|
-
"score": 1.0
|
|
138
|
-
},
|
|
139
|
-
"GEO_INFO": {
|
|
140
|
-
"python_type": "string",
|
|
141
|
-
"format": "latlon_wgs",
|
|
142
|
-
"score": 0.5
|
|
143
|
-
}
|
|
144
|
-
},
|
|
145
|
-
"columns": {
|
|
146
|
-
"NUMCOM": {
|
|
147
|
-
"python_type": "string",
|
|
148
|
-
"format": "code_commune_insee",
|
|
149
|
-
"score": 1.125
|
|
150
|
-
},
|
|
151
|
-
"NOMCOM": {
|
|
152
|
-
"python_type": "string",
|
|
153
|
-
"format": "commune",
|
|
154
|
-
"score": 1.0
|
|
155
|
-
},
|
|
156
|
-
"NUMDEP": {
|
|
157
|
-
"python_type": "int",
|
|
158
|
-
"format": "int",
|
|
159
|
-
"score": 1.0
|
|
160
|
-
},
|
|
161
|
-
"NOMDEP": {
|
|
162
|
-
"python_type": "string",
|
|
163
|
-
"format": "departement",
|
|
164
|
-
"score": 1.0
|
|
165
|
-
},
|
|
166
|
-
"NUMEPCI": {
|
|
167
|
-
"python_type": "string",
|
|
168
|
-
"format": "siren",
|
|
169
|
-
"score": 1.0
|
|
170
|
-
},
|
|
171
|
-
"NOMEPCI": {
|
|
172
|
-
"python_type": "string",
|
|
173
|
-
"format": "string",
|
|
174
|
-
"score": 1.0
|
|
175
|
-
},
|
|
176
|
-
"TXCOUVGLO_COM_2014": {
|
|
177
|
-
"python_type": "float",
|
|
178
|
-
"format": "float",
|
|
179
|
-
"score": 1.0
|
|
180
|
-
},
|
|
181
|
-
"TXCOUVGLO_DEP_2014": {
|
|
182
|
-
"python_type": "float",
|
|
183
|
-
"format": "float",
|
|
184
|
-
"score": 1.0
|
|
185
|
-
},
|
|
186
|
-
"TXCOUVGLO_EPCI_2014": {
|
|
187
|
-
"python_type": "float",
|
|
188
|
-
"format": "float",
|
|
189
|
-
"score": 1.0
|
|
190
|
-
},
|
|
191
|
-
"STRUCTURED_INFO": {
|
|
192
|
-
"python_type": "json",
|
|
193
|
-
"format": "json",
|
|
194
|
-
"score": 1.0
|
|
195
|
-
},
|
|
196
|
-
"GEO_INFO": {
|
|
197
|
-
"python_type": "json",
|
|
198
|
-
"format": "geojson",
|
|
199
|
-
"score": 1.0
|
|
200
|
-
}
|
|
201
|
-
},
|
|
202
|
-
"formats": {
|
|
203
|
-
"code_commune_insee": [
|
|
204
|
-
"NUMCOM"
|
|
205
|
-
],
|
|
206
|
-
"int": [
|
|
207
|
-
"NUMDEP"
|
|
208
|
-
],
|
|
209
|
-
"commune": [
|
|
210
|
-
"NOMCOM"
|
|
211
|
-
],
|
|
212
|
-
"departement": [
|
|
213
|
-
"NOMDEP"
|
|
214
|
-
],
|
|
215
|
-
"siren": [
|
|
216
|
-
"NUMEPCI"
|
|
217
|
-
],
|
|
218
|
-
"string": [
|
|
219
|
-
"NOMEPCI"
|
|
220
|
-
],
|
|
221
|
-
"float": [
|
|
222
|
-
"TXCOUVGLO_COM_2014",
|
|
223
|
-
"TXCOUVGLO_DEP_2014",
|
|
224
|
-
"TXCOUVGLO_EPCI_2014"
|
|
225
|
-
],
|
|
226
|
-
"json": [
|
|
227
|
-
"STRUCTURED_INFO"
|
|
228
|
-
],
|
|
229
|
-
"geojson": [
|
|
230
|
-
"GEO_INFO"
|
|
231
|
-
]
|
|
232
|
-
},
|
|
233
|
-
"profile": {
|
|
234
|
-
"NUMCOM": {
|
|
235
|
-
"tops": [
|
|
236
|
-
"01170",
|
|
237
|
-
"01169",
|
|
238
|
-
"01167",
|
|
239
|
-
"01166",
|
|
240
|
-
"01165",
|
|
241
|
-
"01163",
|
|
242
|
-
"01162",
|
|
243
|
-
"01297",
|
|
244
|
-
"01304",
|
|
245
|
-
"01303"
|
|
246
|
-
],
|
|
247
|
-
"nb_distinct": 407,
|
|
248
|
-
"nb_missing_values": 0
|
|
249
|
-
},
|
|
250
|
-
"NOMCOM": {
|
|
251
|
-
"tops": [
|
|
252
|
-
"BEARD-GEOVREISSIAT",
|
|
253
|
-
"GENOUILLEUX",
|
|
254
|
-
"GARNERANS",
|
|
255
|
-
"FRANS",
|
|
256
|
-
"FRANCHELEINS",
|
|
257
|
-
"FOISSIAT",
|
|
258
|
-
"FLAXIEU",
|
|
259
|
-
"PIZAY",
|
|
260
|
-
"PONT-D'AIN",
|
|
261
|
-
"PONCIN"
|
|
262
|
-
],
|
|
263
|
-
"nb_distinct": 407,
|
|
264
|
-
"nb_missing_values": 0
|
|
265
|
-
},
|
|
266
|
-
"NUMDEP": {
|
|
267
|
-
"min": 1,
|
|
268
|
-
"max": 6,
|
|
269
|
-
"mean": 1,
|
|
270
|
-
"std": 0,
|
|
271
|
-
"tops": [
|
|
272
|
-
1,
|
|
273
|
-
6
|
|
274
|
-
],
|
|
275
|
-
"nb_distinct": 2,
|
|
276
|
-
"nb_missing_values": 0
|
|
277
|
-
},
|
|
278
|
-
"NOMDEP": {
|
|
279
|
-
"tops": [
|
|
280
|
-
"AIN",
|
|
281
|
-
"ALPES MARITIMES"
|
|
282
|
-
],
|
|
283
|
-
"nb_distinct": 2,
|
|
284
|
-
"nb_missing_values": 0
|
|
285
|
-
},
|
|
286
|
-
"NUMEPCI": {
|
|
287
|
-
"tops": [
|
|
288
|
-
"200040350",
|
|
289
|
-
"200042935",
|
|
290
|
-
"240100883",
|
|
291
|
-
"240100750",
|
|
292
|
-
"200042497",
|
|
293
|
-
"200035210",
|
|
294
|
-
"240100156",
|
|
295
|
-
"240100370",
|
|
296
|
-
"200029999",
|
|
297
|
-
"240100628"
|
|
298
|
-
],
|
|
299
|
-
"nb_distinct": 33,
|
|
300
|
-
"nb_missing_values": 0
|
|
301
|
-
},
|
|
302
|
-
"NOMEPCI": {
|
|
303
|
-
"tops": [
|
|
304
|
-
"CC BUGEY SUD",
|
|
305
|
-
"CC HAUT - BUGEY",
|
|
306
|
-
"CC DE LA PLAINE DE L'AIN",
|
|
307
|
-
"CC DU PAYS DE GEX",
|
|
308
|
-
"CC DOMBES SAONE VALLEE",
|
|
309
|
-
"CC CHALARONNE CENTRE",
|
|
310
|
-
"CC DE MONTREVEL - EN - BRESSE",
|
|
311
|
-
"CC DU VALROMEY",
|
|
312
|
-
"CC RIVES DE L'AIN - PAYS DU CERDON",
|
|
313
|
-
"CA BOURG EN BRESSE AGGLOMERATION"
|
|
314
|
-
],
|
|
315
|
-
"nb_distinct": 33,
|
|
316
|
-
"nb_missing_values": 0
|
|
317
|
-
},
|
|
318
|
-
"TXCOUVGLO_COM_2014": {
|
|
319
|
-
"min": 0.0,
|
|
320
|
-
"max": 200.2,
|
|
321
|
-
"mean": 59.35863746958638,
|
|
322
|
-
"std": 36.453598197621275,
|
|
323
|
-
"tops": [
|
|
324
|
-
0.0,
|
|
325
|
-
68.6,
|
|
326
|
-
30.5,
|
|
327
|
-
54.7,
|
|
328
|
-
82.6,
|
|
329
|
-
78.4,
|
|
330
|
-
64.3,
|
|
331
|
-
78.1,
|
|
332
|
-
24.9,
|
|
333
|
-
null
|
|
334
|
-
],
|
|
335
|
-
"nb_distinct": 297,
|
|
336
|
-
"nb_missing_values": 3
|
|
337
|
-
},
|
|
338
|
-
"TXCOUVGLO_DEP_2014": {
|
|
339
|
-
"min": 47.0,
|
|
340
|
-
"max": 65.2,
|
|
341
|
-
"mean": 65.112077294686,
|
|
342
|
-
"std": 1.263455055322421,
|
|
343
|
-
"tops": [
|
|
344
|
-
65.2,
|
|
345
|
-
47.0
|
|
346
|
-
],
|
|
347
|
-
"nb_distinct": 2,
|
|
348
|
-
"nb_missing_values": 0
|
|
349
|
-
},
|
|
350
|
-
"TXCOUVGLO_EPCI_2014": {
|
|
351
|
-
"min": 28.3,
|
|
352
|
-
"max": 93.9,
|
|
353
|
-
"mean": 64.45772946859903,
|
|
354
|
-
"std": 12.72227368109601,
|
|
355
|
-
"tops": [
|
|
356
|
-
52.4,
|
|
357
|
-
45.3,
|
|
358
|
-
75.2,
|
|
359
|
-
78.4,
|
|
360
|
-
46.9,
|
|
361
|
-
77.8,
|
|
362
|
-
67.9,
|
|
363
|
-
70.0,
|
|
364
|
-
72.9,
|
|
365
|
-
68.7
|
|
366
|
-
],
|
|
367
|
-
"nb_distinct": 30,
|
|
368
|
-
"nb_missing_values": 0
|
|
369
|
-
},
|
|
370
|
-
"STRUCTURED_INFO": {
|
|
371
|
-
"tops": [
|
|
372
|
-
"{\"champ_1\": 154, \"champ_2\": 0.0792}",
|
|
373
|
-
"{\"champ_1\": 153, \"champ_2\": 0.0737}",
|
|
374
|
-
"{\"champ_1\": 152, \"champ_2\": 0.0681}",
|
|
375
|
-
"{\"champ_1\": 151, \"champ_2\": 0.0624}",
|
|
376
|
-
"{\"champ_1\": 150, \"champ_2\": 0.0568}",
|
|
377
|
-
"{\"champ_1\": 149, \"champ_2\": 0.0511}",
|
|
378
|
-
"{\"champ_1\": 148, \"champ_2\": 0.0454}",
|
|
379
|
-
"{\"champ_1\": 268, \"champ_2\": 0.553}",
|
|
380
|
-
"{\"champ_1\": 275, \"champ_2\": 0.5732}",
|
|
381
|
-
"{\"champ_1\": 274, \"champ_2\": 0.5704}"
|
|
382
|
-
],
|
|
383
|
-
"nb_distinct": 407,
|
|
384
|
-
"nb_missing_values": 0
|
|
385
|
-
},
|
|
386
|
-
"GEO_INFO": {
|
|
387
|
-
"tops": [
|
|
388
|
-
"{\"type\": \"Point\", \"coordinates\": [12.5, 2.8]}"
|
|
389
|
-
],
|
|
390
|
-
"nb_distinct": 1,
|
|
391
|
-
"nb_missing_values": 0
|
|
392
|
-
}
|
|
393
|
-
}
|
|
394
|
-
}
|
tests/data/b_test_file.csv
DELETED
tests/data/c_test_file.csv
DELETED
tests/data/csv_file
DELETED
tests/data/file.csv.gz
DELETED
|
Binary file
|
tests/data/file.ods
DELETED
|
Binary file
|
tests/data/file.xls
DELETED
|
Binary file
|
tests/data/file.xlsx
DELETED
|
Binary file
|
tests/data/xlsx_file
DELETED
|
Binary file
|
tests/test_example.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from uuid import UUID
|
|
3
|
-
|
|
4
|
-
from csv_detective.output.example import create_example_csv_file
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def test_example_creation():
|
|
8
|
-
fields = [
|
|
9
|
-
{
|
|
10
|
-
"name": "id_unique",
|
|
11
|
-
"type": "id",
|
|
12
|
-
},
|
|
13
|
-
{
|
|
14
|
-
"name": "nom_modele",
|
|
15
|
-
"type": "str",
|
|
16
|
-
"args": {"length": 20},
|
|
17
|
-
},
|
|
18
|
-
{
|
|
19
|
-
"name": "siret",
|
|
20
|
-
"type": "str",
|
|
21
|
-
"args": {"pattern": "^\\d{14}$"},
|
|
22
|
-
},
|
|
23
|
-
{
|
|
24
|
-
"name": "type_producteur",
|
|
25
|
-
"type": "str",
|
|
26
|
-
"args": {"enum": ["privé", "public", "association"]},
|
|
27
|
-
},
|
|
28
|
-
{
|
|
29
|
-
"name": "date_creation",
|
|
30
|
-
"type": "date",
|
|
31
|
-
"args": {
|
|
32
|
-
"date_range": ["1996-02-13", "2000-01-28"],
|
|
33
|
-
"format": "%Y-%m-%d",
|
|
34
|
-
},
|
|
35
|
-
},
|
|
36
|
-
{
|
|
37
|
-
"name": "url_produit",
|
|
38
|
-
"type": "url",
|
|
39
|
-
},
|
|
40
|
-
{
|
|
41
|
-
"name": "nb_produits",
|
|
42
|
-
"type": "int",
|
|
43
|
-
},
|
|
44
|
-
{"name": "note", "type": "float", "args": {"num_range": [1, 20]}},
|
|
45
|
-
]
|
|
46
|
-
df = create_example_csv_file(
|
|
47
|
-
fields=fields,
|
|
48
|
-
file_length=5,
|
|
49
|
-
output_name=None,
|
|
50
|
-
)
|
|
51
|
-
assert len(df) == 5
|
|
52
|
-
assert all(UUID(_) for _ in df["id_unique"])
|
|
53
|
-
assert all(len(_) == 20 for _ in df["nom_modele"])
|
|
54
|
-
assert all(re.match("^\\d{14}$", _) for _ in df["siret"])
|
|
55
|
-
assert all(_ in ["privé", "public", "association"] for _ in df["type_producteur"])
|
|
56
|
-
assert all(_ >= "1996-02-13" and _ <= "2000-01-28" for _ in df["date_creation"])
|
|
57
|
-
assert all(_.startswith("http") for _ in df["url_produit"])
|
|
58
|
-
assert all(isinstance(_, int) for _ in df["nb_produits"])
|
|
59
|
-
assert all(_ >= 1 and _ <= 20 for _ in df["note"])
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def test_example_from_tableschema():
|
|
63
|
-
df = create_example_csv_file(
|
|
64
|
-
schema_path="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json",
|
|
65
|
-
output_name=None,
|
|
66
|
-
)
|
|
67
|
-
assert len(df) == 10
|
tests/test_fields.py
DELETED
|
@@ -1,175 +0,0 @@
|
|
|
1
|
-
from datetime import date as _date
|
|
2
|
-
from datetime import datetime as _datetime
|
|
3
|
-
from unittest.mock import patch
|
|
4
|
-
|
|
5
|
-
import pandas as pd
|
|
6
|
-
import pytest
|
|
7
|
-
from numpy import random
|
|
8
|
-
|
|
9
|
-
from csv_detective.detection.variables import (
|
|
10
|
-
detect_categorical_variable,
|
|
11
|
-
detect_continuous_variable,
|
|
12
|
-
)
|
|
13
|
-
from csv_detective.format import FormatsManager
|
|
14
|
-
from csv_detective.output.dataframe import cast
|
|
15
|
-
from csv_detective.output.utils import prepare_output_dict
|
|
16
|
-
from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it
|
|
17
|
-
|
|
18
|
-
fmtm = FormatsManager()
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def test_all_format_funcs_return_bool():
|
|
22
|
-
for format in fmtm.formats.values():
|
|
23
|
-
for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
|
|
24
|
-
assert isinstance(format.func(tmp), bool)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# categorical
|
|
28
|
-
def test_detect_categorical_variable():
|
|
29
|
-
categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
|
|
30
|
-
categorical_col2 = [str(k // 20) for k in range(100)]
|
|
31
|
-
not_categorical_col = [i for i in range(100)]
|
|
32
|
-
|
|
33
|
-
df_dict = {
|
|
34
|
-
"cat": categorical_col,
|
|
35
|
-
"cat2": categorical_col2,
|
|
36
|
-
"not_cat": not_categorical_col,
|
|
37
|
-
}
|
|
38
|
-
df = pd.DataFrame(df_dict, dtype=str)
|
|
39
|
-
|
|
40
|
-
res, _ = detect_categorical_variable(df)
|
|
41
|
-
assert len(res) and all(k in res for k in ["cat", "cat2"])
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
# continuous
|
|
45
|
-
def test_detect_continuous_variable():
|
|
46
|
-
continuous_col = random.random(100)
|
|
47
|
-
continuous_col_2 = [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 21, 3] * 10
|
|
48
|
-
not_continuous_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
|
|
49
|
-
|
|
50
|
-
df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
|
|
51
|
-
df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
|
|
52
|
-
|
|
53
|
-
df = pd.DataFrame(df_dict, dtype=str)
|
|
54
|
-
df2 = pd.DataFrame(df_dict_2, dtype=str)
|
|
55
|
-
|
|
56
|
-
res = detect_continuous_variable(df)
|
|
57
|
-
res2 = detect_continuous_variable(df2, continuous_th=0.65)
|
|
58
|
-
assert res.values and res.values[0] == "cont"
|
|
59
|
-
assert res2.values and res2.values[0] == "cont"
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
# we could also have a function here to add all True values of (almost)
|
|
63
|
-
# each field to the False values of all others (to do when parenthood is added)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def test_all_fields_have_tests():
|
|
67
|
-
for format in fmtm.formats.values():
|
|
68
|
-
valid = format._test_values
|
|
69
|
-
# checking structure
|
|
70
|
-
assert all(
|
|
71
|
-
isinstance(key, bool)
|
|
72
|
-
and isinstance(vals, list)
|
|
73
|
-
and all(isinstance(val, str) for val in vals)
|
|
74
|
-
for key, vals in valid.items()
|
|
75
|
-
)
|
|
76
|
-
# checking that we have valid and invalid cases for each
|
|
77
|
-
assert all(b in valid.keys() for b in [True, False])
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
# this is based on the _test_values of each <format>.py file
|
|
81
|
-
@pytest.mark.parametrize(
|
|
82
|
-
"args",
|
|
83
|
-
(
|
|
84
|
-
(format.func, value, valid)
|
|
85
|
-
for valid in [True, False]
|
|
86
|
-
for format in fmtm.formats.values()
|
|
87
|
-
for value in format._test_values[valid]
|
|
88
|
-
),
|
|
89
|
-
)
|
|
90
|
-
def test_fields_with_values(args):
|
|
91
|
-
func, value, valid = args
|
|
92
|
-
assert func(value) is valid
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
@pytest.mark.parametrize(
|
|
96
|
-
"args",
|
|
97
|
-
(
|
|
98
|
-
("Valeur", "string", str),
|
|
99
|
-
("-17", "int", int),
|
|
100
|
-
("1.9", "float", float),
|
|
101
|
-
("oui", "bool", bool),
|
|
102
|
-
("[1, 2]", "json", list),
|
|
103
|
-
('{"a": 1}', "json", dict),
|
|
104
|
-
("2022-08-01", "date", _date),
|
|
105
|
-
("2024-09-23 17:32:07", "datetime", _datetime),
|
|
106
|
-
("2024-09-23 17:32:07+02:00", "datetime", _datetime),
|
|
107
|
-
("N/A", "int", None),
|
|
108
|
-
("nan", "bool", None),
|
|
109
|
-
("", "date", None), # all NaN-like values should be cast as None for all type
|
|
110
|
-
),
|
|
111
|
-
)
|
|
112
|
-
def test_cast(args):
|
|
113
|
-
value, detected_type, cast_type = args
|
|
114
|
-
if cast_type is None:
|
|
115
|
-
assert cast(value, detected_type) is None
|
|
116
|
-
else:
|
|
117
|
-
assert isinstance(cast(value, detected_type), cast_type)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
@pytest.mark.parametrize(
|
|
121
|
-
"args",
|
|
122
|
-
(
|
|
123
|
-
# there is a specific numerical format => specific wins
|
|
124
|
-
({"int": 1, "float": 1, "latitude_wgs": 1}, "latitude_wgs"),
|
|
125
|
-
# scores are equal for related formats => priority wins
|
|
126
|
-
({"int": 1, "float": 1}, "int"),
|
|
127
|
-
# score is lower for priority format => secondary wins
|
|
128
|
-
({"int": 0.5, "float": 1}, "float"),
|
|
129
|
-
# score is lower for priority format, but is 1 => priority wins
|
|
130
|
-
({"int": 1, "float": 1.25}, "int"),
|
|
131
|
-
# two rounds of priority => highest priority wins
|
|
132
|
-
({"latlon_wgs": 1, "lonlat_wgs": 1, "json": 1}, "latlon_wgs"),
|
|
133
|
-
# no detection => default to string
|
|
134
|
-
({}, "string"),
|
|
135
|
-
),
|
|
136
|
-
)
|
|
137
|
-
def test_priority(args):
|
|
138
|
-
detections, expected = args
|
|
139
|
-
col = "col1"
|
|
140
|
-
output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
|
|
141
|
-
assert output[col]["format"] == expected
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
@pytest.mark.parametrize(
|
|
145
|
-
"args",
|
|
146
|
-
(
|
|
147
|
-
("1996-02-13", fmtm.formats["date"]),
|
|
148
|
-
("28/01/2000", fmtm.formats["date"]),
|
|
149
|
-
("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]),
|
|
150
|
-
("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]),
|
|
151
|
-
("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]),
|
|
152
|
-
("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]),
|
|
153
|
-
),
|
|
154
|
-
)
|
|
155
|
-
def test_early_detection(args):
|
|
156
|
-
value, format = args
|
|
157
|
-
with patch("csv_detective.formats.date.date_casting") as mock_func:
|
|
158
|
-
res = format.func(value)
|
|
159
|
-
assert res
|
|
160
|
-
mock_func.assert_not_called()
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def test_all_proportion_1():
|
|
164
|
-
# building a table that uses only correct values for these formats, except on one row
|
|
165
|
-
table = pd.DataFrame(
|
|
166
|
-
{
|
|
167
|
-
name: (format._test_values[True] * 100)[:100] + ["not_suitable"]
|
|
168
|
-
for name, format in fmtm.formats.items()
|
|
169
|
-
if format.proportion == 1
|
|
170
|
-
}
|
|
171
|
-
)
|
|
172
|
-
# testing columns for all formats
|
|
173
|
-
returned_table = col_test(table, fmtm.formats, limited_output=True)
|
|
174
|
-
# the analysis should have found no match on any format
|
|
175
|
-
assert all(returned_table[col].sum() == 0 for col in table.columns)
|