csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.12674__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. csv_detective/detection/__init__.py +0 -0
  2. csv_detective/detection/columns.py +0 -0
  3. csv_detective/detection/encoding.py +0 -0
  4. csv_detective/detection/engine.py +0 -0
  5. csv_detective/detection/formats.py +0 -2
  6. csv_detective/detection/headers.py +14 -12
  7. csv_detective/detection/rows.py +1 -1
  8. csv_detective/detection/separator.py +0 -0
  9. csv_detective/detection/variables.py +0 -0
  10. csv_detective/explore_csv.py +4 -15
  11. csv_detective/format.py +1 -1
  12. csv_detective/formats/__init__.py +0 -0
  13. csv_detective/formats/adresse.py +0 -0
  14. csv_detective/formats/binary.py +0 -0
  15. csv_detective/formats/booleen.py +0 -0
  16. csv_detective/formats/code_commune_insee.py +0 -0
  17. csv_detective/formats/code_csp_insee.py +0 -0
  18. csv_detective/formats/code_departement.py +0 -0
  19. csv_detective/formats/code_fantoir.py +0 -0
  20. csv_detective/formats/code_import.py +0 -0
  21. csv_detective/formats/code_postal.py +0 -0
  22. csv_detective/formats/code_region.py +0 -0
  23. csv_detective/formats/code_rna.py +0 -0
  24. csv_detective/formats/code_waldec.py +0 -0
  25. csv_detective/formats/commune.py +0 -0
  26. csv_detective/formats/csp_insee.py +0 -0
  27. csv_detective/formats/date.py +1 -10
  28. csv_detective/formats/date_fr.py +0 -0
  29. csv_detective/formats/datetime_aware.py +0 -0
  30. csv_detective/formats/datetime_naive.py +0 -0
  31. csv_detective/formats/datetime_rfc822.py +0 -0
  32. csv_detective/formats/departement.py +0 -0
  33. csv_detective/formats/email.py +0 -0
  34. csv_detective/formats/float.py +0 -0
  35. csv_detective/formats/geojson.py +0 -0
  36. csv_detective/formats/insee_ape700.py +0 -0
  37. csv_detective/formats/insee_canton.py +0 -0
  38. csv_detective/formats/int.py +0 -0
  39. csv_detective/formats/iso_country_code_alpha2.py +0 -0
  40. csv_detective/formats/iso_country_code_alpha3.py +0 -0
  41. csv_detective/formats/iso_country_code_numeric.py +0 -0
  42. csv_detective/formats/jour_de_la_semaine.py +0 -0
  43. csv_detective/formats/json.py +0 -0
  44. csv_detective/formats/latitude_l93.py +0 -0
  45. csv_detective/formats/latitude_wgs.py +0 -0
  46. csv_detective/formats/latitude_wgs_fr_metropole.py +0 -0
  47. csv_detective/formats/latlon_wgs.py +0 -0
  48. csv_detective/formats/longitude_l93.py +0 -0
  49. csv_detective/formats/longitude_wgs.py +0 -0
  50. csv_detective/formats/longitude_wgs_fr_metropole.py +0 -0
  51. csv_detective/formats/lonlat_wgs.py +0 -0
  52. csv_detective/formats/mois_de_lannee.py +0 -0
  53. csv_detective/formats/money.py +0 -0
  54. csv_detective/formats/mongo_object_id.py +0 -0
  55. csv_detective/formats/pays.py +0 -0
  56. csv_detective/formats/percent.py +0 -0
  57. csv_detective/formats/region.py +0 -0
  58. csv_detective/formats/sexe.py +0 -0
  59. csv_detective/formats/siren.py +0 -0
  60. csv_detective/formats/siret.py +0 -0
  61. csv_detective/formats/tel_fr.py +0 -0
  62. csv_detective/formats/uai.py +0 -0
  63. csv_detective/formats/url.py +0 -0
  64. csv_detective/formats/username.py +0 -0
  65. csv_detective/formats/uuid.py +0 -0
  66. csv_detective/formats/year.py +0 -0
  67. csv_detective/output/__init__.py +0 -0
  68. csv_detective/output/dataframe.py +2 -2
  69. csv_detective/output/example.py +0 -0
  70. csv_detective/output/profile.py +1 -1
  71. csv_detective/output/schema.py +0 -0
  72. csv_detective/output/utils.py +0 -0
  73. csv_detective/parsing/__init__.py +0 -0
  74. csv_detective/parsing/columns.py +5 -9
  75. csv_detective/parsing/compression.py +0 -0
  76. csv_detective/parsing/csv.py +0 -0
  77. csv_detective/parsing/excel.py +1 -1
  78. csv_detective/parsing/load.py +12 -11
  79. csv_detective/validate.py +36 -71
  80. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +18 -15
  81. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/RECORD +22 -41
  82. csv_detective-0.10.12674.dist-info/WHEEL +4 -0
  83. {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +1 -0
  84. csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
  85. csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
  86. csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
  87. tests/__init__.py +0 -0
  88. tests/data/a_test_file.csv +0 -407
  89. tests/data/a_test_file.json +0 -394
  90. tests/data/b_test_file.csv +0 -7
  91. tests/data/c_test_file.csv +0 -2
  92. tests/data/csv_file +0 -7
  93. tests/data/file.csv.gz +0 -0
  94. tests/data/file.ods +0 -0
  95. tests/data/file.xls +0 -0
  96. tests/data/file.xlsx +0 -0
  97. tests/data/xlsx_file +0 -0
  98. tests/test_example.py +0 -67
  99. tests/test_fields.py +0 -175
  100. tests/test_file.py +0 -469
  101. tests/test_labels.py +0 -26
  102. tests/test_structure.py +0 -45
  103. tests/test_validation.py +0 -163
tests/data/a_test_file.json DELETED
@@ -1,394 +0,0 @@
- {
-   "encoding": "ASCII",
-   "separator": ";",
-   "header_row_idx": 2,
-   "header": [
-     "NUMCOM",
-     "NOMCOM",
-     "NUMDEP",
-     "NOMDEP",
-     "NUMEPCI",
-     "NOMEPCI",
-     "TXCOUVGLO_COM_2014",
-     "TXCOUVGLO_DEP_2014",
-     "TXCOUVGLO_EPCI_2014",
-     "STRUCTURED_INFO",
-     "GEO_INFO"
-   ],
-   "total_lines": 404,
-   "nb_duplicates": 7,
-   "heading_columns": 0,
-   "trailing_columns": 0,
-   "continuous": [
-     "TXCOUVGLO_EPCI_2014"
-   ],
-   "categorical": [
-     "NUMDEP",
-     "NOMDEP",
-     "TXCOUVGLO_DEP_2014",
-     "GEO_INFO"
-   ],
-   "columns_fields": {
-     "NUMCOM": {
-       "python_type": "string",
-       "format": "code_commune_insee",
-       "score": 1.0
-     },
-     "NOMCOM": {
-       "python_type": "string",
-       "format": "commune",
-       "score": 1.0
-     },
-     "NUMDEP": {
-       "python_type": "int",
-       "format": "int",
-       "score": 1.0
-     },
-     "NOMDEP": {
-       "python_type": "string",
-       "format": "departement",
-       "score": 1.0
-     },
-     "NUMEPCI": {
-       "python_type": "string",
-       "format": "siren",
-       "score": 1.0
-     },
-     "NOMEPCI": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "TXCOUVGLO_COM_2014": {
-       "python_type": "float",
-       "format": "float",
-       "score": 1.0
-     },
-     "TXCOUVGLO_DEP_2014": {
-       "python_type": "float",
-       "format": "latitude_wgs",
-       "score": 0.9951690821256038
-     },
-     "TXCOUVGLO_EPCI_2014": {
-       "python_type": "float",
-       "format": "longitude_wgs",
-       "score": 0.9444444444444444
-     },
-     "STRUCTURED_INFO": {
-       "python_type": "json",
-       "format": "json",
-       "score": 1.0
-     },
-     "GEO_INFO": {
-       "python_type": "json",
-       "format": "geojson",
-       "score": 1.0
-     }
-   },
-   "columns_labels": {
-     "NUMCOM": {
-       "python_type": "string",
-       "format": "code_commune_insee",
-       "score": 0.5
-     },
-     "NOMCOM": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "NUMDEP": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "NOMDEP": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "NUMEPCI": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "NOMEPCI": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "TXCOUVGLO_COM_2014": {
-       "python_type": "string",
-       "format": "code_commune_insee",
-       "score": 0.5
-     },
-     "TXCOUVGLO_DEP_2014": {
-       "python_type": "string",
-       "format": "code_departement",
-       "score": 0.5
-     },
-     "TXCOUVGLO_EPCI_2014": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "STRUCTURED_INFO": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "GEO_INFO": {
-       "python_type": "string",
-       "format": "latlon_wgs",
-       "score": 0.5
-     }
-   },
-   "columns": {
-     "NUMCOM": {
-       "python_type": "string",
-       "format": "code_commune_insee",
-       "score": 1.125
-     },
-     "NOMCOM": {
-       "python_type": "string",
-       "format": "commune",
-       "score": 1.0
-     },
-     "NUMDEP": {
-       "python_type": "int",
-       "format": "int",
-       "score": 1.0
-     },
-     "NOMDEP": {
-       "python_type": "string",
-       "format": "departement",
-       "score": 1.0
-     },
-     "NUMEPCI": {
-       "python_type": "string",
-       "format": "siren",
-       "score": 1.0
-     },
-     "NOMEPCI": {
-       "python_type": "string",
-       "format": "string",
-       "score": 1.0
-     },
-     "TXCOUVGLO_COM_2014": {
-       "python_type": "float",
-       "format": "float",
-       "score": 1.0
-     },
-     "TXCOUVGLO_DEP_2014": {
-       "python_type": "float",
-       "format": "float",
-       "score": 1.0
-     },
-     "TXCOUVGLO_EPCI_2014": {
-       "python_type": "float",
-       "format": "float",
-       "score": 1.0
-     },
-     "STRUCTURED_INFO": {
-       "python_type": "json",
-       "format": "json",
-       "score": 1.0
-     },
-     "GEO_INFO": {
-       "python_type": "json",
-       "format": "geojson",
-       "score": 1.0
-     }
-   },
-   "formats": {
-     "code_commune_insee": [
-       "NUMCOM"
-     ],
-     "int": [
-       "NUMDEP"
-     ],
-     "commune": [
-       "NOMCOM"
-     ],
-     "departement": [
-       "NOMDEP"
-     ],
-     "siren": [
-       "NUMEPCI"
-     ],
-     "string": [
-       "NOMEPCI"
-     ],
-     "float": [
-       "TXCOUVGLO_COM_2014",
-       "TXCOUVGLO_DEP_2014",
-       "TXCOUVGLO_EPCI_2014"
-     ],
-     "json": [
-       "STRUCTURED_INFO"
-     ],
-     "geojson": [
-       "GEO_INFO"
-     ]
-   },
-   "profile": {
-     "NUMCOM": {
-       "tops": [
-         "01170",
-         "01169",
-         "01167",
-         "01166",
-         "01165",
-         "01163",
-         "01162",
-         "01297",
-         "01304",
-         "01303"
-       ],
-       "nb_distinct": 407,
-       "nb_missing_values": 0
-     },
-     "NOMCOM": {
-       "tops": [
-         "BEARD-GEOVREISSIAT",
-         "GENOUILLEUX",
-         "GARNERANS",
-         "FRANS",
-         "FRANCHELEINS",
-         "FOISSIAT",
-         "FLAXIEU",
-         "PIZAY",
-         "PONT-D'AIN",
-         "PONCIN"
-       ],
-       "nb_distinct": 407,
-       "nb_missing_values": 0
-     },
-     "NUMDEP": {
-       "min": 1,
-       "max": 6,
-       "mean": 1,
-       "std": 0,
-       "tops": [
-         1,
-         6
-       ],
-       "nb_distinct": 2,
-       "nb_missing_values": 0
-     },
-     "NOMDEP": {
-       "tops": [
-         "AIN",
-         "ALPES MARITIMES"
-       ],
-       "nb_distinct": 2,
-       "nb_missing_values": 0
-     },
-     "NUMEPCI": {
-       "tops": [
-         "200040350",
-         "200042935",
-         "240100883",
-         "240100750",
-         "200042497",
-         "200035210",
-         "240100156",
-         "240100370",
-         "200029999",
-         "240100628"
-       ],
-       "nb_distinct": 33,
-       "nb_missing_values": 0
-     },
-     "NOMEPCI": {
-       "tops": [
-         "CC BUGEY SUD",
-         "CC HAUT - BUGEY",
-         "CC DE LA PLAINE DE L'AIN",
-         "CC DU PAYS DE GEX",
-         "CC DOMBES SAONE VALLEE",
-         "CC CHALARONNE CENTRE",
-         "CC DE MONTREVEL - EN - BRESSE",
-         "CC DU VALROMEY",
-         "CC RIVES DE L'AIN - PAYS DU CERDON",
-         "CA BOURG EN BRESSE AGGLOMERATION"
-       ],
-       "nb_distinct": 33,
-       "nb_missing_values": 0
-     },
-     "TXCOUVGLO_COM_2014": {
-       "min": 0.0,
-       "max": 200.2,
-       "mean": 59.35863746958638,
-       "std": 36.453598197621275,
-       "tops": [
-         0.0,
-         68.6,
-         30.5,
-         54.7,
-         82.6,
-         78.4,
-         64.3,
-         78.1,
-         24.9,
-         null
-       ],
-       "nb_distinct": 297,
-       "nb_missing_values": 3
-     },
-     "TXCOUVGLO_DEP_2014": {
-       "min": 47.0,
-       "max": 65.2,
-       "mean": 65.112077294686,
-       "std": 1.263455055322421,
-       "tops": [
-         65.2,
-         47.0
-       ],
-       "nb_distinct": 2,
-       "nb_missing_values": 0
-     },
-     "TXCOUVGLO_EPCI_2014": {
-       "min": 28.3,
-       "max": 93.9,
-       "mean": 64.45772946859903,
-       "std": 12.72227368109601,
-       "tops": [
-         52.4,
-         45.3,
-         75.2,
-         78.4,
-         46.9,
-         77.8,
-         67.9,
-         70.0,
-         72.9,
-         68.7
-       ],
-       "nb_distinct": 30,
-       "nb_missing_values": 0
-     },
-     "STRUCTURED_INFO": {
-       "tops": [
-         "{\"champ_1\": 154, \"champ_2\": 0.0792}",
-         "{\"champ_1\": 153, \"champ_2\": 0.0737}",
-         "{\"champ_1\": 152, \"champ_2\": 0.0681}",
-         "{\"champ_1\": 151, \"champ_2\": 0.0624}",
-         "{\"champ_1\": 150, \"champ_2\": 0.0568}",
-         "{\"champ_1\": 149, \"champ_2\": 0.0511}",
-         "{\"champ_1\": 148, \"champ_2\": 0.0454}",
-         "{\"champ_1\": 268, \"champ_2\": 0.553}",
-         "{\"champ_1\": 275, \"champ_2\": 0.5732}",
-         "{\"champ_1\": 274, \"champ_2\": 0.5704}"
-       ],
-       "nb_distinct": 407,
-       "nb_missing_values": 0
-     },
-     "GEO_INFO": {
-       "tops": [
-         "{\"type\": \"Point\", \"coordinates\": [12.5, 2.8]}"
-       ],
-       "nb_distinct": 1,
-       "nb_missing_values": 0
-     }
-   }
- }
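
For context, the deleted fixture above is the expected analysis report for the matching CSV fixture: this is the shape of dict that csv_detective's analysis routine returns (encoding and separator detection, column formats with confidence scores, and an optional profile). A minimal sketch of regenerating such a report follows; `routine` is the entry point in csv_detective/explore_csv.py listed above, while the `num_rows` and `output_profile` keyword arguments are assumptions about its signature, not confirmed by this diff:

# Minimal sketch, not verbatim package code: the keyword arguments below
# are assumptions about the signature of `routine`.
import json

from csv_detective.explore_csv import routine

report = routine(
    "tests/data/a_test_file.csv",
    num_rows=-1,          # assumed: -1 analyses every row
    output_profile=True,  # assumed: populates the "profile" section
)

# These keys come straight from the deleted fixture above.
print(report["encoding"], report["separator"], report["header_row_idx"])
print(json.dumps(report["formats"], indent=2, ensure_ascii=False))
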
tests/data/b_test_file.csv DELETED
@@ -1,7 +0,0 @@
- code_departement,code_region,partly_empty
- 01,01,10
- 23,84,100
- 44,32,76
- 31,32,
- 28,84,
- 59,24,
tests/data/c_test_file.csv DELETED
@@ -1,2 +0,0 @@
- colmmun A;columnB
- row A;row B;row C
tests/data/csv_file DELETED
@@ -1,7 +0,0 @@
- code_departement,code_region
- 01,01
- 23,84
- 44,32
- 31,32
- 28,84
- 59,24
tests/data/file.csv.gz DELETED
Binary file
tests/data/file.ods DELETED
Binary file
tests/data/file.xls DELETED
Binary file
tests/data/file.xlsx DELETED
Binary file
tests/data/xlsx_file DELETED
Binary file
tests/test_example.py DELETED
@@ -1,67 +0,0 @@
- import re
- from uuid import UUID
-
- from csv_detective.output.example import create_example_csv_file
-
-
- def test_example_creation():
-     fields = [
-         {
-             "name": "id_unique",
-             "type": "id",
-         },
-         {
-             "name": "nom_modele",
-             "type": "str",
-             "args": {"length": 20},
-         },
-         {
-             "name": "siret",
-             "type": "str",
-             "args": {"pattern": "^\\d{14}$"},
-         },
-         {
-             "name": "type_producteur",
-             "type": "str",
-             "args": {"enum": ["privé", "public", "association"]},
-         },
-         {
-             "name": "date_creation",
-             "type": "date",
-             "args": {
-                 "date_range": ["1996-02-13", "2000-01-28"],
-                 "format": "%Y-%m-%d",
-             },
-         },
-         {
-             "name": "url_produit",
-             "type": "url",
-         },
-         {
-             "name": "nb_produits",
-             "type": "int",
-         },
-         {"name": "note", "type": "float", "args": {"num_range": [1, 20]}},
-     ]
-     df = create_example_csv_file(
-         fields=fields,
-         file_length=5,
-         output_name=None,
-     )
-     assert len(df) == 5
-     assert all(UUID(_) for _ in df["id_unique"])
-     assert all(len(_) == 20 for _ in df["nom_modele"])
-     assert all(re.match("^\\d{14}$", _) for _ in df["siret"])
-     assert all(_ in ["privé", "public", "association"] for _ in df["type_producteur"])
-     assert all(_ >= "1996-02-13" and _ <= "2000-01-28" for _ in df["date_creation"])
-     assert all(_.startswith("http") for _ in df["url_produit"])
-     assert all(isinstance(_, int) for _ in df["nb_produits"])
-     assert all(_ >= 1 and _ <= 20 for _ in df["note"])
-
-
- def test_example_from_tableschema():
-     df = create_example_csv_file(
-         schema_path="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json",
-         output_name=None,
-     )
-     assert len(df) == 10
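
The deleted test above doubles as a compact reference for create_example_csv_file: a list of field specs (name, type, optional args such as pattern, enum, date_range, or num_range) plus a target length yields a DataFrame of synthetic rows, and a Table Schema URL can replace the field list. A minimal standalone sketch, reusing only shapes the test itself exercises (as in the test, output_name=None returns the DataFrame rather than writing a file):

# Minimal sketch: field specs copied from the deleted test above.
from csv_detective.output.example import create_example_csv_file

fields = [
    {"name": "siret", "type": "str", "args": {"pattern": "^\\d{14}$"}},
    {"name": "note", "type": "float", "args": {"num_range": [1, 20]}},
]
# output_name=None returns the DataFrame instead of writing a CSV
df = create_example_csv_file(fields=fields, file_length=3, output_name=None)
print(df)
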
tests/test_fields.py DELETED
@@ -1,175 +0,0 @@
- from datetime import date as _date
- from datetime import datetime as _datetime
- from unittest.mock import patch
-
- import pandas as pd
- import pytest
- from numpy import random
-
- from csv_detective.detection.variables import (
-     detect_categorical_variable,
-     detect_continuous_variable,
- )
- from csv_detective.format import FormatsManager
- from csv_detective.output.dataframe import cast
- from csv_detective.output.utils import prepare_output_dict
- from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it
-
- fmtm = FormatsManager()
-
-
- def test_all_format_funcs_return_bool():
-     for format in fmtm.formats.values():
-         for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
-             assert isinstance(format.func(tmp), bool)
-
-
- # categorical
- def test_detect_categorical_variable():
-     categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
-     categorical_col2 = [str(k // 20) for k in range(100)]
-     not_categorical_col = [i for i in range(100)]
-
-     df_dict = {
-         "cat": categorical_col,
-         "cat2": categorical_col2,
-         "not_cat": not_categorical_col,
-     }
-     df = pd.DataFrame(df_dict, dtype=str)
-
-     res, _ = detect_categorical_variable(df)
-     assert len(res) and all(k in res for k in ["cat", "cat2"])
-
-
- # continuous
- def test_detect_continuous_variable():
-     continuous_col = random.random(100)
-     continuous_col_2 = [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 21, 3] * 10
-     not_continuous_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
-
-     df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
-     df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
-
-     df = pd.DataFrame(df_dict, dtype=str)
-     df2 = pd.DataFrame(df_dict_2, dtype=str)
-
-     res = detect_continuous_variable(df)
-     res2 = detect_continuous_variable(df2, continuous_th=0.65)
-     assert res.values and res.values[0] == "cont"
-     assert res2.values and res2.values[0] == "cont"
-
-
- # we could also have a function here to add all True values of (almost)
- # each field to the False values of all others (to do when parenthood is added)
-
-
- def test_all_fields_have_tests():
-     for format in fmtm.formats.values():
-         valid = format._test_values
-         # checking structure
-         assert all(
-             isinstance(key, bool)
-             and isinstance(vals, list)
-             and all(isinstance(val, str) for val in vals)
-             for key, vals in valid.items()
-         )
-         # checking that we have valid and invalid cases for each
-         assert all(b in valid.keys() for b in [True, False])
-
-
- # this is based on the _test_values of each <format>.py file
- @pytest.mark.parametrize(
-     "args",
-     (
-         (format.func, value, valid)
-         for valid in [True, False]
-         for format in fmtm.formats.values()
-         for value in format._test_values[valid]
-     ),
- )
- def test_fields_with_values(args):
-     func, value, valid = args
-     assert func(value) is valid
-
-
- @pytest.mark.parametrize(
-     "args",
-     (
-         ("Valeur", "string", str),
-         ("-17", "int", int),
-         ("1.9", "float", float),
-         ("oui", "bool", bool),
-         ("[1, 2]", "json", list),
-         ('{"a": 1}', "json", dict),
-         ("2022-08-01", "date", _date),
-         ("2024-09-23 17:32:07", "datetime", _datetime),
-         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
-         ("N/A", "int", None),
-         ("nan", "bool", None),
-         ("", "date", None),  # all NaN-like values should be cast as None for all types
-     ),
- )
- def test_cast(args):
-     value, detected_type, cast_type = args
-     if cast_type is None:
-         assert cast(value, detected_type) is None
-     else:
-         assert isinstance(cast(value, detected_type), cast_type)
-
-
- @pytest.mark.parametrize(
-     "args",
-     (
-         # there is a specific numerical format => specific wins
-         ({"int": 1, "float": 1, "latitude_wgs": 1}, "latitude_wgs"),
-         # scores are equal for related formats => priority wins
-         ({"int": 1, "float": 1}, "int"),
-         # score is lower for priority format => secondary wins
-         ({"int": 0.5, "float": 1}, "float"),
-         # score is lower for priority format, but is 1 => priority wins
-         ({"int": 1, "float": 1.25}, "int"),
-         # two rounds of priority => highest priority wins
-         ({"latlon_wgs": 1, "lonlat_wgs": 1, "json": 1}, "latlon_wgs"),
-         # no detection => default to string
-         ({}, "string"),
-     ),
- )
- def test_priority(args):
-     detections, expected = args
-     col = "col1"
-     output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
-     assert output[col]["format"] == expected
-
-
- @pytest.mark.parametrize(
-     "args",
-     (
-         ("1996-02-13", fmtm.formats["date"]),
-         ("28/01/2000", fmtm.formats["date"]),
-         ("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]),
-         ("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]),
-         ("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]),
-         ("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]),
-     ),
- )
- def test_early_detection(args):
-     value, format = args
-     with patch("csv_detective.formats.date.date_casting") as mock_func:
-         res = format.func(value)
-         assert res
-         mock_func.assert_not_called()
-
-
- def test_all_proportion_1():
-     # building a table that uses only correct values for these formats, except on one row
-     table = pd.DataFrame(
-         {
-             name: (format._test_values[True] * 100)[:100] + ["not_suitable"]
-             for name, format in fmtm.formats.items()
-             if format.proportion == 1
-         }
-     )
-     # testing columns for all formats
-     returned_table = col_test(table, fmtm.formats, limited_output=True)
-     # the analysis should have found no match on any format
-     assert all(returned_table[col].sum() == 0 for col in table.columns)
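
The deleted tests above also document the format-registry API: FormatsManager().formats maps a format name to an object exposing func(value) -> bool and a _test_values dict of valid/invalid sample strings. A minimal sketch of checking one value against one format, assuming registry keys match the formats/*.py module names listed at the top of this diff:

# Minimal sketch built only from the API the deleted tests use.
from csv_detective.format import FormatsManager

fmtm = FormatsManager()

# "siren" is assumed to be keyed like its module, csv_detective/formats/siren.py
siren = fmtm.formats["siren"]
print(siren.func("200040350"))    # 9-digit SIREN from the deleted fixture; expected True
print(siren.func("not a siren"))  # expected False

# Each format ships the valid/invalid samples the parametrized tests consume.
print(siren._test_values[True][:3])
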