csv-detective 0.10.1.dev2669__py3-none-any.whl → 0.10.2.dev1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (102)
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -0
  6. csv_detective/detection/headers.py +0 -0
  7. csv_detective/detection/rows.py +0 -0
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/format.py +0 -0
  11. csv_detective/formats/__init__.py +0 -0
  12. csv_detective/formats/adresse.py +0 -0
  13. csv_detective/formats/binary.py +0 -0
  14. csv_detective/formats/booleen.py +0 -0
  15. csv_detective/formats/code_commune_insee.py +0 -0
  16. csv_detective/formats/code_csp_insee.py +0 -0
  17. csv_detective/formats/code_departement.py +0 -0
  18. csv_detective/formats/code_fantoir.py +0 -0
  19. csv_detective/formats/code_import.py +0 -0
  20. csv_detective/formats/code_postal.py +0 -0
  21. csv_detective/formats/code_region.py +0 -0
  22. csv_detective/formats/code_rna.py +0 -0
  23. csv_detective/formats/code_waldec.py +0 -0
  24. csv_detective/formats/commune.py +0 -0
  25. csv_detective/formats/csp_insee.py +0 -0
  26. csv_detective/formats/date.py +0 -0
  27. csv_detective/formats/date_fr.py +0 -0
  28. csv_detective/formats/datetime_aware.py +0 -0
  29. csv_detective/formats/datetime_naive.py +0 -0
  30. csv_detective/formats/datetime_rfc822.py +0 -0
  31. csv_detective/formats/departement.py +0 -0
  32. csv_detective/formats/email.py +0 -0
  33. csv_detective/formats/float.py +0 -0
  34. csv_detective/formats/geojson.py +0 -0
  35. csv_detective/formats/insee_ape700.py +0 -0
  36. csv_detective/formats/insee_canton.py +0 -0
  37. csv_detective/formats/int.py +0 -0
  38. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  39. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  40. csv_detective/formats/iso_country_code_numeric.py +0 -0
  41. csv_detective/formats/jour_de_la_semaine.py +0 -0
  42. csv_detective/formats/json.py +0 -0
  43. csv_detective/formats/latitude_l93.py +0 -0
  44. csv_detective/formats/latitude_wgs.py +0 -0
  45. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  46. csv_detective/formats/latlon_wgs.py +0 -0
  47. csv_detective/formats/longitude_l93.py +0 -0
  48. csv_detective/formats/longitude_wgs.py +0 -0
  49. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  50. csv_detective/formats/lonlat_wgs.py +0 -0
  51. csv_detective/formats/mois_de_lannee.py +0 -0
  52. csv_detective/formats/money.py +0 -0
  53. csv_detective/formats/mongo_object_id.py +0 -0
  54. csv_detective/formats/pays.py +0 -0
  55. csv_detective/formats/percent.py +0 -0
  56. csv_detective/formats/region.py +0 -0
  57. csv_detective/formats/sexe.py +0 -0
  58. csv_detective/formats/siren.py +0 -0
  59. csv_detective/formats/siret.py +0 -0
  60. csv_detective/formats/tel_fr.py +0 -0
  61. csv_detective/formats/uai.py +0 -0
  62. csv_detective/formats/url.py +0 -0
  63. csv_detective/formats/username.py +0 -0
  64. csv_detective/formats/uuid.py +0 -0
  65. csv_detective/formats/year.py +0 -0
  66. csv_detective/output/__init__.py +0 -0
  67. csv_detective/output/dataframe.py +0 -0
  68. csv_detective/output/example.py +0 -0
  69. csv_detective/output/profile.py +0 -0
  70. csv_detective/output/schema.py +0 -0
  71. csv_detective/output/utils.py +0 -0
  72. csv_detective/parsing/__init__.py +0 -0
  73. csv_detective/parsing/columns.py +0 -0
  74. csv_detective/parsing/compression.py +0 -0
  75. csv_detective/parsing/csv.py +0 -0
  76. csv_detective/parsing/excel.py +0 -0
  77. csv_detective/parsing/load.py +0 -0
  78. csv_detective/validate.py +0 -0
  79. {csv_detective-0.10.1.dev2669.dist-info → csv_detective-0.10.2.dev1.dist-info}/METADATA +18 -17
  80. {csv_detective-0.10.1.dev2669.dist-info → csv_detective-0.10.2.dev1.dist-info}/RECORD +32 -13
  81. csv_detective-0.10.2.dev1.dist-info/WHEEL +5 -0
  82. {csv_detective-0.10.1.dev2669.dist-info → csv_detective-0.10.2.dev1.dist-info}/entry_points.txt +0 -1
  83. csv_detective-0.10.2.dev1.dist-info/licenses/LICENSE +21 -0
  84. csv_detective-0.10.2.dev1.dist-info/top_level.txt +3 -0
  85. tests/__init__.py +0 -0
  86. tests/data/a_test_file.csv +407 -0
  87. tests/data/a_test_file.json +394 -0
  88. tests/data/b_test_file.csv +7 -0
  89. tests/data/c_test_file.csv +2 -0
  90. tests/data/csv_file +7 -0
  91. tests/data/file.csv.gz +0 -0
  92. tests/data/file.ods +0 -0
  93. tests/data/file.xls +0 -0
  94. tests/data/file.xlsx +0 -0
  95. tests/data/xlsx_file +0 -0
  96. tests/test_example.py +67 -0
  97. tests/test_fields.py +169 -0
  98. tests/test_file.py +448 -0
  99. tests/test_labels.py +26 -0
  100. tests/test_structure.py +45 -0
  101. tests/test_validation.py +108 -0
  102. csv_detective-0.10.1.dev2669.dist-info/WHEEL +0 -4
tests/data/a_test_file.json ADDED
@@ -0,0 +1,394 @@
+ {
+   "encoding": "ASCII",
+   "separator": ";",
+   "header_row_idx": 2,
+   "header": [
+     "NUMCOM",
+     "NOMCOM",
+     "NUMDEP",
+     "NOMDEP",
+     "NUMEPCI",
+     "NOMEPCI",
+     "TXCOUVGLO_COM_2014",
+     "TXCOUVGLO_DEP_2014",
+     "TXCOUVGLO_EPCI_2014",
+     "STRUCTURED_INFO",
+     "GEO_INFO"
+   ],
+   "total_lines": 404,
+   "nb_duplicates": 7,
+   "heading_columns": 0,
+   "trailing_columns": 0,
+   "continuous": [
+     "TXCOUVGLO_EPCI_2014"
+   ],
+   "categorical": [
+     "NUMDEP",
+     "NOMDEP",
+     "TXCOUVGLO_DEP_2014",
+     "GEO_INFO"
+   ],
+   "columns_fields": {
+     "NUMCOM": {
+       "python_type": "string",
+       "format": "code_commune_insee",
+       "score": 1.0
+     },
+     "NOMCOM": {
+       "python_type": "string",
+       "format": "commune",
+       "score": 1.0
+     },
+     "NUMDEP": {
+       "python_type": "int",
+       "format": "int",
+       "score": 1.0
+     },
+     "NOMDEP": {
+       "python_type": "string",
+       "format": "departement",
+       "score": 1.0
+     },
+     "NUMEPCI": {
+       "python_type": "string",
+       "format": "siren",
+       "score": 1.0
+     },
+     "NOMEPCI": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "TXCOUVGLO_COM_2014": {
+       "python_type": "float",
+       "format": "float",
+       "score": 1.0
+     },
+     "TXCOUVGLO_DEP_2014": {
+       "python_type": "float",
+       "format": "latitude_wgs",
+       "score": 0.9951690821256038
+     },
+     "TXCOUVGLO_EPCI_2014": {
+       "python_type": "float",
+       "format": "longitude_wgs",
+       "score": 0.9444444444444444
+     },
+     "STRUCTURED_INFO": {
+       "python_type": "json",
+       "format": "json",
+       "score": 1.0
+     },
+     "GEO_INFO": {
+       "python_type": "json",
+       "format": "geojson",
+       "score": 1.0
+     }
+   },
+   "columns_labels": {
+     "NUMCOM": {
+       "python_type": "string",
+       "format": "code_commune_insee",
+       "score": 0.5
+     },
+     "NOMCOM": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "NUMDEP": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "NOMDEP": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "NUMEPCI": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "NOMEPCI": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "TXCOUVGLO_COM_2014": {
+       "python_type": "string",
+       "format": "code_commune_insee",
+       "score": 0.5
+     },
+     "TXCOUVGLO_DEP_2014": {
+       "python_type": "string",
+       "format": "code_departement",
+       "score": 0.5
+     },
+     "TXCOUVGLO_EPCI_2014": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "STRUCTURED_INFO": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "GEO_INFO": {
+       "python_type": "string",
+       "format": "latlon_wgs",
+       "score": 0.5
+     }
+   },
+   "columns": {
+     "NUMCOM": {
+       "python_type": "string",
+       "format": "code_commune_insee",
+       "score": 1.125
+     },
+     "NOMCOM": {
+       "python_type": "string",
+       "format": "commune",
+       "score": 1.0
+     },
+     "NUMDEP": {
+       "python_type": "int",
+       "format": "int",
+       "score": 1.0
+     },
+     "NOMDEP": {
+       "python_type": "string",
+       "format": "departement",
+       "score": 1.0
+     },
+     "NUMEPCI": {
+       "python_type": "string",
+       "format": "siren",
+       "score": 1.0
+     },
+     "NOMEPCI": {
+       "python_type": "string",
+       "format": "string",
+       "score": 1.0
+     },
+     "TXCOUVGLO_COM_2014": {
+       "python_type": "float",
+       "format": "float",
+       "score": 1.0
+     },
+     "TXCOUVGLO_DEP_2014": {
+       "python_type": "float",
+       "format": "float",
+       "score": 1.0
+     },
+     "TXCOUVGLO_EPCI_2014": {
+       "python_type": "float",
+       "format": "float",
+       "score": 1.0
+     },
+     "STRUCTURED_INFO": {
+       "python_type": "json",
+       "format": "json",
+       "score": 1.0
+     },
+     "GEO_INFO": {
+       "python_type": "json",
+       "format": "geojson",
+       "score": 1.0
+     }
+   },
+   "formats": {
+     "code_commune_insee": [
+       "NUMCOM"
+     ],
+     "int": [
+       "NUMDEP"
+     ],
+     "commune": [
+       "NOMCOM"
+     ],
+     "departement": [
+       "NOMDEP"
+     ],
+     "siren": [
+       "NUMEPCI"
+     ],
+     "string": [
+       "NOMEPCI"
+     ],
+     "float": [
+       "TXCOUVGLO_COM_2014",
+       "TXCOUVGLO_DEP_2014",
+       "TXCOUVGLO_EPCI_2014"
+     ],
+     "json": [
+       "STRUCTURED_INFO"
+     ],
+     "geojson": [
+       "GEO_INFO"
+     ]
+   },
+   "profile": {
+     "NUMCOM": {
+       "tops": [
+         "01170",
+         "01169",
+         "01167",
+         "01166",
+         "01165",
+         "01163",
+         "01162",
+         "01297",
+         "01304",
+         "01303"
+       ],
+       "nb_distinct": 407,
+       "nb_missing_values": 0
+     },
+     "NOMCOM": {
+       "tops": [
+         "BEARD-GEOVREISSIAT",
+         "GENOUILLEUX",
+         "GARNERANS",
+         "FRANS",
+         "FRANCHELEINS",
+         "FOISSIAT",
+         "FLAXIEU",
+         "PIZAY",
+         "PONT-D'AIN",
+         "PONCIN"
+       ],
+       "nb_distinct": 407,
+       "nb_missing_values": 0
+     },
+     "NUMDEP": {
+       "min": 1,
+       "max": 6,
+       "mean": 1,
+       "std": 0,
+       "tops": [
+         1,
+         6
+       ],
+       "nb_distinct": 2,
+       "nb_missing_values": 0
+     },
+     "NOMDEP": {
+       "tops": [
+         "AIN",
+         "ALPES MARITIMES"
+       ],
+       "nb_distinct": 2,
+       "nb_missing_values": 0
+     },
+     "NUMEPCI": {
+       "tops": [
+         "200040350",
+         "200042935",
+         "240100883",
+         "240100750",
+         "200042497",
+         "200035210",
+         "240100156",
+         "240100370",
+         "200029999",
+         "240100628"
+       ],
+       "nb_distinct": 33,
+       "nb_missing_values": 0
+     },
+     "NOMEPCI": {
+       "tops": [
+         "CC BUGEY SUD",
+         "CC HAUT - BUGEY",
+         "CC DE LA PLAINE DE L'AIN",
+         "CC DU PAYS DE GEX",
+         "CC DOMBES SAONE VALLEE",
+         "CC CHALARONNE CENTRE",
+         "CC DE MONTREVEL - EN - BRESSE",
+         "CC DU VALROMEY",
+         "CC RIVES DE L'AIN - PAYS DU CERDON",
+         "CA BOURG EN BRESSE AGGLOMERATION"
+       ],
+       "nb_distinct": 33,
+       "nb_missing_values": 0
+     },
+     "TXCOUVGLO_COM_2014": {
+       "min": 0.0,
+       "max": 200.2,
+       "mean": 59.35863746958638,
+       "std": 36.453598197621275,
+       "tops": [
+         0.0,
+         68.6,
+         30.5,
+         54.7,
+         82.6,
+         78.4,
+         64.3,
+         78.1,
+         24.9,
+         null
+       ],
+       "nb_distinct": 297,
+       "nb_missing_values": 3
+     },
+     "TXCOUVGLO_DEP_2014": {
+       "min": 47.0,
+       "max": 65.2,
+       "mean": 65.112077294686,
+       "std": 1.263455055322421,
+       "tops": [
+         65.2,
+         47.0
+       ],
+       "nb_distinct": 2,
+       "nb_missing_values": 0
+     },
+     "TXCOUVGLO_EPCI_2014": {
+       "min": 28.3,
+       "max": 93.9,
+       "mean": 64.45772946859903,
+       "std": 12.72227368109601,
+       "tops": [
+         52.4,
+         45.3,
+         75.2,
+         78.4,
+         46.9,
+         77.8,
+         67.9,
+         70.0,
+         72.9,
+         68.7
+       ],
+       "nb_distinct": 30,
+       "nb_missing_values": 0
+     },
+     "STRUCTURED_INFO": {
+       "tops": [
+         "{\"champ_1\": 154, \"champ_2\": 0.0792}",
+         "{\"champ_1\": 153, \"champ_2\": 0.0737}",
+         "{\"champ_1\": 152, \"champ_2\": 0.0681}",
+         "{\"champ_1\": 151, \"champ_2\": 0.0624}",
+         "{\"champ_1\": 150, \"champ_2\": 0.0568}",
+         "{\"champ_1\": 149, \"champ_2\": 0.0511}",
+         "{\"champ_1\": 148, \"champ_2\": 0.0454}",
+         "{\"champ_1\": 268, \"champ_2\": 0.553}",
+         "{\"champ_1\": 275, \"champ_2\": 0.5732}",
+         "{\"champ_1\": 274, \"champ_2\": 0.5704}"
+       ],
+       "nb_distinct": 407,
+       "nb_missing_values": 0
+     },
+     "GEO_INFO": {
+       "tops": [
+         "{\"type\": \"Point\", \"coordinates\": [12.5, 2.8]}"
+       ],
+       "nb_distinct": 1,
+       "nb_missing_values": 0
+     }
+   }
+ }
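The report above is the expected analysis output for the a_test_file fixture. As a minimal sketch (standard library only; the file path is taken from this diff), the "formats" block can be read back as a reverse index from detected format to column names:

import json

# load the expected-output fixture added in this release
with open("tests/data/a_test_file.json") as f:
    report = json.load(f)

# "formats" maps each detected format to the columns it was found in,
# inverting the per-column "columns" block
for fmt, cols in report["formats"].items():
    print(fmt, "->", ", ".join(cols))

float_cols = report["formats"].get("float", [])  # the three TXCOUVGLO_* columns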
tests/data/b_test_file.csv ADDED
@@ -0,0 +1,7 @@
+ code_departement,code_region,partly_empty
+ 01,01,10
+ 23,84,100
+ 44,32,76
+ 31,32,
+ 28,84,
+ 59,24,
tests/data/c_test_file.csv ADDED
@@ -0,0 +1,2 @@
+ colmmun A;columnB
+ row A;row B;row C
tests/data/csv_file ADDED
@@ -0,0 +1,7 @@
+ code_departement,code_region
+ 01,01
+ 23,84
+ 44,32
+ 31,32
+ 28,84
+ 59,24
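These small CSV fixtures exercise format detection on columns such as code_departement and code_region. A short sketch of probing a single value by hand, assuming only the FormatsManager API that tests/test_fields.py below already relies on (each entry in fmtm.formats exposes a func predicate returning a bool):

from csv_detective.format import FormatsManager

fmtm = FormatsManager()

# each registered format carries a per-value predicate
print(fmtm.formats["code_departement"].func("01"))   # expected: True
print(fmtm.formats["code_departement"].func("xyz"))  # expected: False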
tests/data/file.csv.gz ADDED
Binary file
tests/data/file.ods ADDED
Binary file
tests/data/file.xls ADDED
Binary file
tests/data/file.xlsx ADDED
Binary file
tests/data/xlsx_file ADDED
Binary file
tests/test_example.py ADDED
@@ -0,0 +1,67 @@
+ import re
+ from uuid import UUID
+
+ from csv_detective.output.example import create_example_csv_file
+
+
+ def test_example_creation():
+     fields = [
+         {
+             "name": "id_unique",
+             "type": "id",
+         },
+         {
+             "name": "nom_modele",
+             "type": "str",
+             "args": {"length": 20},
+         },
+         {
+             "name": "siret",
+             "type": "str",
+             "args": {"pattern": "^\\d{14}$"},
+         },
+         {
+             "name": "type_producteur",
+             "type": "str",
+             "args": {"enum": ["privé", "public", "association"]},
+         },
+         {
+             "name": "date_creation",
+             "type": "date",
+             "args": {
+                 "date_range": ["1996-02-13", "2000-01-28"],
+                 "format": "%Y-%m-%d",
+             },
+         },
+         {
+             "name": "url_produit",
+             "type": "url",
+         },
+         {
+             "name": "nb_produits",
+             "type": "int",
+         },
+         {"name": "note", "type": "float", "args": {"num_range": [1, 20]}},
+     ]
+     df = create_example_csv_file(
+         fields=fields,
+         file_length=5,
+         output_name=None,
+     )
+     assert len(df) == 5
+     assert all(UUID(_) for _ in df["id_unique"])
+     assert all(len(_) == 20 for _ in df["nom_modele"])
+     assert all(re.match("^\\d{14}$", _) for _ in df["siret"])
+     assert all(_ in ["privé", "public", "association"] for _ in df["type_producteur"])
+     assert all(_ >= "1996-02-13" and _ <= "2000-01-28" for _ in df["date_creation"])
+     assert all(_.startswith("http") for _ in df["url_produit"])
+     assert all(isinstance(_, int) for _ in df["nb_produits"])
+     assert all(_ >= 1 and _ <= 20 for _ in df["note"])
+
+
+ def test_example_from_tableschema():
+     df = create_example_csv_file(
+         schema_path="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json",
+         output_name=None,
+     )
+     assert len(df) == 10
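A follow-up sketch of writing the generated example to disk, assuming (as the assertions above suggest) that create_example_csv_file returns a pandas DataFrame when output_name is None:

from csv_detective.output.example import create_example_csv_file

# hypothetical minimal field list, reusing the "note" spec from the test above
fields = [{"name": "note", "type": "float", "args": {"num_range": [1, 20]}}]
df = create_example_csv_file(fields=fields, file_length=3, output_name=None)

# assumption: the return value supports the pandas API, so it can be persisted directly
df.to_csv("example.csv", index=False)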
tests/test_fields.py ADDED
@@ -0,0 +1,169 @@
+ from datetime import date as _date
+ from datetime import datetime as _datetime
+ from unittest.mock import patch
+
+ import pandas as pd
+ import pytest
+ from numpy import random
+
+ from csv_detective.detection.variables import (
+     detect_categorical_variable,
+     detect_continuous_variable,
+ )
+ from csv_detective.format import FormatsManager
+ from csv_detective.output.dataframe import cast
+ from csv_detective.output.utils import prepare_output_dict
+ from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it
+
+ fmtm = FormatsManager()
+
+
+ def test_all_format_funcs_return_bool():
+     for format in fmtm.formats.values():
+         for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
+             assert isinstance(format.func(tmp), bool)
+
+
+ # categorical
+ def test_detect_categorical_variable():
+     categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
+     categorical_col2 = [str(k // 20) for k in range(100)]
+     not_categorical_col = [i for i in range(100)]
+
+     df_dict = {
+         "cat": categorical_col,
+         "cat2": categorical_col2,
+         "not_cat": not_categorical_col,
+     }
+     df = pd.DataFrame(df_dict, dtype=str)
+
+     res, _ = detect_categorical_variable(df)
+     assert len(res) and all(k in res for k in ["cat", "cat2"])
+
+
+ # continuous
+ def test_detect_continuous_variable():
+     continuous_col = random.random(100)
+     continuous_col_2 = [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 21, 3] * 10
+     not_continuous_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
+
+     df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
+     df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
+
+     df = pd.DataFrame(df_dict, dtype=str)
+     df2 = pd.DataFrame(df_dict_2, dtype=str)
+
+     res = detect_continuous_variable(df)
+     res2 = detect_continuous_variable(df2, continuous_th=0.65)
+     assert res.values and res.values[0] == "cont"
+     assert res2.values and res2.values[0] == "cont"
+
+
+ # we could also have a function here to add all True values of (almost)
+ # each field to the False values of all others (to do when parenthood is added)
+
+
+ def test_all_fields_have_tests():
+     for format in fmtm.formats.values():
+         valid = format._test_values
+         # checking structure
+         assert all(
+             isinstance(key, bool)
+             and isinstance(vals, list)
+             and all(isinstance(val, str) for val in vals)
+             for key, vals in valid.items()
+         )
+         # checking that we have valid and invalid cases for each
+         assert all(b in valid.keys() for b in [True, False])
+
+
+ # this is based on the _test_values of each <format>.py file
+ @pytest.mark.parametrize(
+     "args",
+     (
+         (format.func, value, valid)
+         for valid in [True, False]
+         for format in fmtm.formats.values()
+         for value in format._test_values[valid]
+     ),
+ )
+ def test_fields_with_values(args):
+     func, value, valid = args
+     assert func(value) is valid
+
+
+ @pytest.mark.parametrize(
+     "args",
+     (
+         ("Valeur", "string", str),
+         ("-17", "int", int),
+         ("1.9", "float", float),
+         ("oui", "bool", bool),
+         ("[1, 2]", "json", list),
+         ('{"a": 1}', "json", dict),
+         ("2022-08-01", "date", _date),
+         ("2024-09-23 17:32:07", "datetime", _datetime),
+         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
+     ),
+ )
+ def test_cast(args):
+     value, detected_type, cast_type = args
+     assert isinstance(cast(value, detected_type), cast_type)
+
+
+ @pytest.mark.parametrize(
+     "args",
+     (
+         # there is a specific numerical format => specific wins
+         ({"int": 1, "float": 1, "latitude_wgs": 1}, "latitude_wgs"),
+         # scores are equal for related formats => priority wins
+         ({"int": 1, "float": 1}, "int"),
+         # score is lower for priority format => secondary wins
+         ({"int": 0.5, "float": 1}, "float"),
+         # score is lower for priority format, but is 1 => priority wins
+         ({"int": 1, "float": 1.25}, "int"),
+         # two rounds of priority => highest priority wins
+         ({"latlon_wgs": 1, "lonlat_wgs": 1, "json": 1}, "latlon_wgs"),
+         # no detection => default to string
+         ({}, "string"),
+     ),
+ )
+ def test_priority(args):
+     detections, expected = args
+     col = "col1"
+     output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
+     assert output[col]["format"] == expected
+
+
+ @pytest.mark.parametrize(
+     "args",
+     (
+         ("1996-02-13", fmtm.formats["date"]),
+         ("28/01/2000", fmtm.formats["date"]),
+         ("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]),
+         ("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]),
+         ("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]),
+         ("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]),
+     ),
+ )
+ def test_early_detection(args):
+     value, format = args
+     with patch("csv_detective.formats.date.date_casting") as mock_func:
+         res = format.func(value)
+         assert res
+         mock_func.assert_not_called()
+
+
+ def test_all_proportion_1():
+     # building a table that uses only correct values for these formats, except on one row
+     table = pd.DataFrame(
+         {
+             name: (format._test_values[True] * 100)[:100] + ["not_suitable"]
+             for name, format in fmtm.formats.items()
+             if format.proportion == 1
+         }
+     )
+     # testing columns for all formats
+     returned_table = col_test(table, fmtm.formats, limited_output=True)
+     # the analysis should have found no match on any format
+     assert all(returned_table[col].sum() == 0 for col in table.columns)
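Taken together, these tests pin down the per-format contract: a func(value) predicate that always returns a bool, a _test_values dict keyed by True/False with string samples, and a proportion threshold. A hypothetical format object satisfying that contract (the name and pattern are illustrative, not the actual layout of any csv_detective format module):

import re

class DummyFormat:
    # illustrative 2-digit code, loosely modelled on code_departement
    _pattern = re.compile(r"^\d{2}$")

    # must return a bool for any input, including non-strings such as float("nan")
    @staticmethod
    def func(value) -> bool:
        return isinstance(value, str) and bool(DummyFormat._pattern.match(value))

    # valid and invalid samples, keyed by bool, as test_all_fields_have_tests checks
    _test_values = {True: ["01", "59"], False: ["1", "abc"]}

    # proportion == 1: every value in a column must match for the format to be detected
    proportion = 1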