csv-detective 0.9.3.dev2241__py3-none-any.whl → 0.9.3.dev2319__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. csv_detective/detection/formats.py +12 -15
  2. csv_detective/detection/headers.py +6 -8
  3. csv_detective/explore_csv.py +28 -9
  4. csv_detective/format.py +67 -0
  5. csv_detective/formats/__init__.py +9 -0
  6. csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
  7. csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
  8. csv_detective/formats/code_commune_insee.py +26 -0
  9. csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
  10. csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
  11. csv_detective/formats/code_fantoir.py +21 -0
  12. csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
  13. csv_detective/formats/code_postal.py +25 -0
  14. csv_detective/formats/code_region.py +22 -0
  15. csv_detective/formats/code_rna.py +29 -0
  16. csv_detective/formats/code_waldec.py +17 -0
  17. csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
  18. csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
  19. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  20. csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
  21. csv_detective/formats/date_fr.py +22 -0
  22. csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
  23. csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
  24. csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
  25. csv_detective/formats/departement.py +37 -0
  26. csv_detective/formats/email.py +28 -0
  27. csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
  28. csv_detective/formats/geojson.py +36 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
  30. csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
  31. csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
  32. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  33. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  34. csv_detective/formats/iso_country_code_numeric.py +31 -0
  35. csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
  36. csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
  37. csv_detective/formats/latitude_l93.py +48 -0
  38. csv_detective/formats/latitude_wgs.py +42 -0
  39. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  40. csv_detective/formats/latlon_wgs.py +53 -0
  41. csv_detective/formats/longitude_l93.py +39 -0
  42. csv_detective/formats/longitude_wgs.py +32 -0
  43. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  44. csv_detective/formats/lonlat_wgs.py +36 -0
  45. csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
  46. csv_detective/formats/money.py +18 -0
  47. csv_detective/formats/mongo_object_id.py +14 -0
  48. csv_detective/formats/pays.py +35 -0
  49. csv_detective/formats/percent.py +16 -0
  50. csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
  51. csv_detective/formats/sexe.py +17 -0
  52. csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
  53. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
  54. csv_detective/formats/tel_fr.py +36 -0
  55. csv_detective/formats/uai.py +36 -0
  56. csv_detective/formats/url.py +45 -0
  57. csv_detective/formats/username.py +14 -0
  58. csv_detective/formats/uuid.py +16 -0
  59. csv_detective/formats/year.py +28 -0
  60. csv_detective/output/__init__.py +3 -4
  61. csv_detective/output/dataframe.py +3 -3
  62. csv_detective/output/profile.py +2 -3
  63. csv_detective/output/schema.py +2 -2
  64. csv_detective/parsing/columns.py +35 -50
  65. csv_detective/parsing/csv.py +2 -2
  66. csv_detective/parsing/load.py +10 -11
  67. csv_detective/validate.py +9 -4
  68. {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/METADATA +6 -5
  69. csv_detective-0.9.3.dev2319.dist-info/RECORD +102 -0
  70. tests/test_fields.py +39 -364
  71. tests/test_file.py +1 -1
  72. tests/test_labels.py +5 -3
  73. tests/test_structure.py +40 -36
  74. csv_detective/detect_fields/FR/__init__.py +0 -0
  75. csv_detective/detect_fields/FR/geo/__init__.py +0 -0
  76. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
  77. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
  78. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
  79. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
  80. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
  81. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
  82. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  83. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
  84. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  85. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
  86. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  87. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  88. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
  89. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
  90. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  91. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
  92. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
  93. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  94. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  95. csv_detective/detect_fields/__init__.py +0 -112
  96. csv_detective/detect_fields/geo/__init__.py +0 -0
  97. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  98. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  99. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  100. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
  101. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  102. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
  103. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  104. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
  105. csv_detective/detect_fields/other/__init__.py +0 -0
  106. csv_detective/detect_fields/other/email/__init__.py +0 -10
  107. csv_detective/detect_fields/other/money/__init__.py +0 -11
  108. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  109. csv_detective/detect_fields/other/percent/__init__.py +0 -9
  110. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  111. csv_detective/detect_fields/other/url/__init__.py +0 -14
  112. csv_detective/detect_fields/other/uuid/__init__.py +0 -10
  113. csv_detective/detect_fields/temp/__init__.py +0 -0
  114. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  115. csv_detective/detect_labels/FR/__init__.py +0 -0
  116. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  117. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
  118. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
  119. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
  120. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
  121. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
  122. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
  123. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
  124. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
  125. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
  126. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
  127. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
  128. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
  129. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
  130. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
  131. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
  132. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  133. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
  134. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
  135. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
  136. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
  137. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
  138. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
  139. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
  140. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
  141. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
  142. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
  143. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
  144. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  145. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
  146. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
  147. csv_detective/detect_labels/__init__.py +0 -94
  148. csv_detective/detect_labels/geo/__init__.py +0 -0
  149. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
  150. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
  151. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
  152. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
  153. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
  154. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
  155. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
  156. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
  157. csv_detective/detect_labels/other/__init__.py +0 -0
  158. csv_detective/detect_labels/other/booleen/__init__.py +0 -8
  159. csv_detective/detect_labels/other/email/__init__.py +0 -20
  160. csv_detective/detect_labels/other/float/__init__.py +0 -8
  161. csv_detective/detect_labels/other/int/__init__.py +0 -8
  162. csv_detective/detect_labels/other/money/__init__.py +0 -8
  163. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
  164. csv_detective/detect_labels/other/twitter/__init__.py +0 -8
  165. csv_detective/detect_labels/other/url/__init__.py +0 -23
  166. csv_detective/detect_labels/other/uuid/__init__.py +0 -8
  167. csv_detective/detect_labels/temp/__init__.py +0 -0
  168. csv_detective/detect_labels/temp/date/__init__.py +0 -28
  169. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
  170. csv_detective/detect_labels/temp/year/__init__.py +0 -19
  171. csv_detective/load_tests.py +0 -59
  172. csv_detective-0.9.3.dev2241.dist-info/RECORD +0 -166
  173. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  174. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  175. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  176. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
  177. {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/WHEEL +0 -0
  178. {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/entry_points.txt +0 -0
  179. {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/licenses/LICENSE +0 -0
  180. {csv_detective-0.9.3.dev2241.dist-info → csv_detective-0.9.3.dev2319.dist-info}/top_level.txt +0 -0
tests/test_fields.py CHANGED
@@ -6,87 +6,22 @@ import pandas as pd
6
6
  import pytest
7
7
  from numpy import random
8
8
 
9
- from csv_detective.detect_fields.FR.geo import (
10
- adresse,
11
- code_commune_insee,
12
- code_departement,
13
- code_fantoir,
14
- code_postal,
15
- code_region,
16
- commune,
17
- departement,
18
- insee_canton,
19
- latitude_l93,
20
- latitude_wgs_fr_metropole,
21
- longitude_l93,
22
- longitude_wgs_fr_metropole,
23
- pays,
24
- region,
25
- )
26
- from csv_detective.detect_fields.FR.other import (
27
- code_csp_insee,
28
- code_import,
29
- code_rna,
30
- code_waldec,
31
- csp_insee,
32
- date_fr,
33
- insee_ape700,
34
- sexe,
35
- siren,
36
- siret,
37
- tel_fr,
38
- uai,
39
- )
40
- from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee
41
- from csv_detective.detect_fields.geo import (
42
- iso_country_code_alpha2,
43
- iso_country_code_alpha3,
44
- iso_country_code_numeric,
45
- json_geojson,
46
- latitude_wgs,
47
- latlon_wgs,
48
- longitude_wgs,
49
- lonlat_wgs,
50
- )
51
- from csv_detective.detect_fields.other import (
52
- booleen,
53
- email,
54
- json,
55
- money,
56
- mongo_object_id,
57
- percent,
58
- twitter,
59
- url,
60
- uuid,
61
- )
62
- from csv_detective.detect_fields.other import (
63
- float as test_float,
64
- )
65
- from csv_detective.detect_fields.other import (
66
- int as test_int,
67
- )
68
- from csv_detective.detect_fields.temp import (
69
- date,
70
- datetime_aware,
71
- datetime_naive,
72
- datetime_rfc822,
73
- year,
74
- )
75
9
  from csv_detective.detection.variables import (
76
10
  detect_categorical_variable,
77
11
  detect_continuous_variable,
78
12
  )
79
- from csv_detective.load_tests import return_all_tests
13
+ from csv_detective.format import FormatsManager
80
14
  from csv_detective.output.dataframe import cast
81
15
  from csv_detective.output.utils import prepare_output_dict
82
16
  from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it
83
17
 
18
+ fmtm = FormatsManager()
19
+
84
20
 
85
- def test_all_tests_return_bool():
86
- all_tests = return_all_tests("ALL", "detect_fields")
87
- for attr in all_tests.values():
21
+ def test_all_format_funcs_return_bool():
22
+ for format in fmtm.formats.values():
88
23
  for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
89
- assert isinstance(attr["func"](tmp), bool)
24
+ assert isinstance(format.func(tmp), bool)
90
25
 
91
26
 
92
27
  # categorical
@@ -124,292 +59,37 @@ def test_detect_continuous_variable():
124
59
  assert res2.values and res2.values[0] == "cont"
125
60
 
126
61
 
127
- fields = {
128
- adresse: {
129
- True: ["rue du martyr"],
130
- False: ["un batiment"],
131
- },
132
- code_commune_insee: {
133
- True: ["91471", "01053"],
134
- False: ["914712", "01000"],
135
- },
136
- code_departement: {
137
- True: ["75", "2A", "2b", "974", "01"],
138
- False: ["00", "96", "101"],
139
- },
140
- code_fantoir: {
141
- True: ["7755A", "B150B", "ZA04C", "ZB03D"],
142
- False: ["7755", "ZA99A"],
143
- },
144
- code_postal: {
145
- True: ["75020", "01000"],
146
- False: ["77777", "018339"],
147
- },
148
- code_region: {
149
- True: ["32"],
150
- False: ["55"],
151
- },
152
- commune: {
153
- True: ["saint denis"],
154
- False: ["new york", "lion"],
155
- },
156
- departement: {
157
- True: ["essonne"],
158
- False: ["alabama", "auvergne"],
159
- },
160
- insee_canton: {
161
- True: ["nantua"],
162
- False: ["california"],
163
- },
164
- latitude_l93: {
165
- True: ["6037008", "7123528.5", "7124528,5"],
166
- False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"],
167
- },
168
- longitude_l93: {
169
- True: ["0", "-154", "1265783,45", "34723.4"],
170
- False: ["1456669.8", "-776225", "346_3214"],
171
- },
172
- latitude_wgs_fr_metropole: {
173
- True: ["42.5"],
174
- False: ["22.5", "62.5"],
175
- },
176
- longitude_wgs_fr_metropole: {
177
- True: ["-2.5"],
178
- False: ["12.8"],
179
- },
180
- pays: {
181
- True: ["france", "italie"],
182
- False: ["amerique", "paris"],
183
- },
184
- region: {
185
- True: ["bretagne", "ile-de-france"],
186
- False: ["baviere", "overgne"],
187
- },
188
- code_csp_insee: {
189
- True: ["121f"],
190
- False: ["121x"],
191
- },
192
- code_rna: {
193
- True: ["W751515517"],
194
- False: [
195
- "W111111111111111111111111111111111111",
196
- "w143788974",
197
- "W12",
198
- "678W23456",
199
- "165789325",
200
- "Wa1#89sf&h",
201
- ],
202
- },
203
- code_import: {
204
- True: ["123S1871092288"],
205
- False: ["AA751PEE00188854", "W123456789"],
206
- },
207
- code_waldec: {
208
- True: ["W123456789", "W2D1234567"],
209
- False: ["AA751PEE00188854"],
210
- },
211
- csp_insee: {
212
- True: ["employes de la poste"],
213
- False: ["super-heros"],
214
- },
215
- sexe: {
216
- True: ["homme"],
217
- False: ["hermaphrodite"],
218
- },
219
- siren: {
220
- True: ["552 100 554", "552100554"],
221
- False: ["42"],
222
- },
223
- siret: {
224
- True: ["13002526500013", "130 025 265 00013"],
225
- False: ["13002526500012"],
226
- },
227
- uai: {
228
- True: ["0422170F"],
229
- False: ["04292E"],
230
- },
231
- date_fr: {
232
- True: ["13 fevrier 1996"],
233
- False: ["44 march 2025"],
234
- },
235
- insee_ape700: {True: ["0116Z"], False: ["0116A"]},
236
- tel_fr: {
237
- True: ["0134643467"],
238
- False: ["6625388263", "01288398"],
239
- },
240
- jour_de_la_semaine: {
241
- True: ["lundi"],
242
- False: ["jour de la biere"],
243
- },
244
- mois_de_annee: {
245
- True: ["juin", "décembre"],
246
- False: ["november"],
247
- },
248
- iso_country_code_alpha2: {
249
- True: ["FR"],
250
- False: ["XX", "A", "FRA"],
251
- },
252
- iso_country_code_alpha3: {
253
- True: ["FRA"],
254
- False: ["XXX", "FR", "A"],
255
- },
256
- iso_country_code_numeric: {
257
- True: ["250"],
258
- False: ["003"],
259
- },
260
- json_geojson: {
261
- True: [
262
- '{"coordinates": [45.783753, 3.049342], "type": "63870"}',
263
- '{"geometry": {"coordinates": [45.783753, 3.049342]}}',
264
- ],
265
- False: ['{"pomme": "fruit", "reponse": 42}'],
266
- },
267
- latitude_wgs: {
268
- True: ["43.2", "-22"],
269
- False: ["100"],
270
- },
271
- latlon_wgs: {
272
- True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
273
- False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
274
- },
275
- longitude_wgs: {
276
- True: ["120", "-20.2"],
277
- False: ["-200"],
278
- },
279
- lonlat_wgs: {
280
- True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
281
- False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
282
- },
283
- booleen: {
284
- True: ["oui", "0", "1", "yes", "false", "True"],
285
- False: ["nein", "ja", "2", "-0"],
286
- },
287
- email: {
288
- True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
289
- False: ["cdo@@gouv.sfd"],
290
- },
291
- json: {
292
- True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
293
- False: ["5", '{"zefib":', '{"a"}'],
294
- },
295
- money: {
296
- True: ["120€", "-20.2$"],
297
- False: ["200", "100 euros"],
298
- },
299
- mongo_object_id: {
300
- True: ["62320e50f981bc2b57bcc044"],
301
- False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
302
- },
303
- percent: {
304
- True: ["120%", "-20.2%"],
305
- False: ["200", "100 pourcents"],
306
- },
307
- twitter: {
308
- True: ["@accueil1"],
309
- False: ["adresse@mail"],
310
- },
311
- url: {
312
- True: [
313
- "www.data.gouv.fr",
314
- "http://data.gouv.fr",
315
- "https://www.youtube.com/@data-gouv-fr",
316
- (
317
- "https://tabular-api.data.gouv.fr/api/resources/"
318
- "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
319
- "?score__greater=0.9&decompte__exact=13"
320
- ),
321
- ],
322
- False: ["tmp@data.gouv.fr"],
323
- },
324
- uuid: {
325
- True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
326
- False: ["0610928327"],
327
- },
328
- test_int: {
329
- True: ["1", "0", "1764", "-24"],
330
- False: ["01053", "1.2", "123_456", "+35"],
331
- },
332
- test_float: {
333
- True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
334
- False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
335
- },
336
- date: {
337
- True: [
338
- "1960-08-07",
339
- "12/02/2007",
340
- "15 jan 1985",
341
- "15 décembre 1985",
342
- "02 05 2003",
343
- "20030502",
344
- "1993-12/02",
345
- ],
346
- False: [
347
- "1993-1993-1993",
348
- "39-10-1993",
349
- "19-15-1993",
350
- "15 tambour 1985",
351
- "12152003",
352
- "20031512",
353
- "02052003",
354
- ],
355
- },
356
- datetime_aware: {
357
- True: [
358
- "2021-06-22 10:20:10-04:00",
359
- "2030-06-22 00:00:00.0028+02:00",
360
- "2000-12-21 10:20:10.1Z",
361
- "2024-12-19T10:53:36.428000+00:00",
362
- "1996/06/22 10:20:10 GMT",
363
- ],
364
- False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
365
- },
366
- datetime_naive: {
367
- True: [
368
- "2021-06-22 10:20:10",
369
- "2030/06-22 00:00:00",
370
- "2030/06/22 00:00:00.0028",
371
- ],
372
- False: [
373
- "2021-06-22T30:20:10",
374
- "Sun, 06 Nov 1994 08:49:37 GMT",
375
- "2021-06-44 10:20:10+02:00",
376
- "1999-12-01T00:00:00Z",
377
- "2021-06-44",
378
- "15 décembre 1985",
379
- ],
380
- },
381
- datetime_rfc822: {
382
- True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
383
- False: ["2021-06-22T10:20:10"],
384
- },
385
- year: {
386
- True: ["2015"],
387
- False: ["20166"],
388
- },
389
- }
390
-
391
62
  # we could also have a function here to add all True values of (almost)
392
- # each field to the False values of all others
63
+ # each field to the False values of all others (to do when parenthood is added)
393
64
 
394
65
 
395
66
  def test_all_fields_have_tests():
396
- all_tests = return_all_tests("ALL", "detect_fields")
397
- for attr in all_tests.values():
398
- assert fields.get(attr["module"])
399
-
400
-
67
+ for format in fmtm.formats.values():
68
+ valid = format._test_values
69
+ # checking structure
70
+ assert all(
71
+ isinstance(key, bool)
72
+ and isinstance(vals, list)
73
+ and all(isinstance(val, str) for val in vals)
74
+ for key, vals in valid.items()
75
+ )
76
+ # checking that we have valid and invalid cases for each
77
+ assert all(b in valid.keys() for b in [True, False])
78
+
79
+
80
+ # this is based on the _test_values of each <format>.py file
401
81
  @pytest.mark.parametrize(
402
82
  "args",
403
83
  (
404
- (field, value, valid)
405
- for field in fields
84
+ (format.func, value, valid)
406
85
  for valid in [True, False]
407
- for value in fields[field][valid]
86
+ for format in fmtm.formats.values()
87
+ for value in format._test_values[valid]
408
88
  ),
409
89
  )
410
90
  def test_fields_with_values(args):
411
- field, value, valid = args
412
- assert field._is(value) is valid
91
+ func, value, valid = args
92
+ assert func(value) is valid
413
93
 
414
94
 
415
95
  @pytest.mark.parametrize(
@@ -456,37 +136,32 @@ def test_priority(args):
456
136
  @pytest.mark.parametrize(
457
137
  "args",
458
138
  (
459
- ("1996-02-13", date),
460
- ("28/01/2000", date),
461
- ("2025-08-20T14:30:00+02:00", datetime_aware),
462
- ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
463
- ("1925_12_20T14:30:00.2763", datetime_naive),
464
- ("1925 12 20 14:30:00Z", datetime_aware),
139
+ ("1996-02-13", fmtm.formats["date"]),
140
+ ("28/01/2000", fmtm.formats["date"]),
141
+ ("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]),
142
+ ("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]),
143
+ ("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]),
144
+ ("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]),
465
145
  ),
466
146
  )
467
147
  def test_early_detection(args):
468
- value, module = args
469
- with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func:
470
- res = module._is(value)
148
+ value, format = args
149
+ with patch("csv_detective.formats.date.date_casting") as mock_func:
150
+ res = format.func(value)
471
151
  assert res
472
152
  mock_func.assert_not_called()
473
153
 
474
154
 
475
155
  def test_all_proportion_1():
476
- all_tests = return_all_tests("ALL", "detect_fields")
477
- prop_1 = {
478
- name: eval(name if name not in ["int", "float"] else "test_" + name)
479
- for name, attr in all_tests.items()
480
- if attr["prop"] == 1
481
- }
482
156
  # building a table that uses only correct values for these formats, except on one row
483
157
  table = pd.DataFrame(
484
158
  {
485
- test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
486
- for test_name, test_module in prop_1.items()
159
+ name: (format._test_values[True] * 100)[:100] + ["not_suitable"]
160
+ for name, format in fmtm.formats.items()
161
+ if format.proportion == 1
487
162
  }
488
163
  )
489
164
  # testing columns for all formats
490
- returned_table = col_test(table, all_tests, limited_output=True)
165
+ returned_table = col_test(table, fmtm.formats, limited_output=True)
491
166
  # the analysis should have found no match on any format
492
167
  assert all(returned_table[col].sum() == 0 for col in table.columns)
tests/test_file.py CHANGED
@@ -49,7 +49,7 @@ def test_columns_output_on_file(chunk_size):
49
49
  assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
50
50
  assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
51
51
  assert output["columns"]["GEO_INFO"]["python_type"] == "json"
52
- assert output["columns"]["GEO_INFO"]["format"] == "json_geojson"
52
+ assert output["columns"]["GEO_INFO"]["format"] == "geojson"
53
53
 
54
54
 
55
55
  def test_profile_output_on_file():
tests/test_labels.py CHANGED
@@ -1,12 +1,14 @@
1
1
  import pytest
2
2
 
3
- from csv_detective.detect_labels import latitude_wgs, money
3
+ from csv_detective.format import FormatsManager
4
+
5
+ fmtm = FormatsManager()
4
6
 
5
7
 
6
8
  # money labels
7
9
  def test_money_labels():
8
10
  header = "Montant total"
9
- assert money._is(header) == 0.5
11
+ assert fmtm.formats["money"].is_valid_label(header) == 0.5
10
12
 
11
13
 
12
14
  @pytest.mark.parametrize(
@@ -21,4 +23,4 @@ def test_money_labels():
21
23
  )
22
24
  def test_latitude(params):
23
25
  header, expected = params
24
- assert expected == latitude_wgs._is(header)
26
+ assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header)
tests/test_structure.py CHANGED
@@ -1,41 +1,45 @@
1
1
  import os
2
2
 
3
- from csv_detective import detect_fields, detect_labels # noqa
4
- from csv_detective.load_tests import return_all_tests
5
-
6
-
7
- def tests_conformity():
8
- """
9
- Check that all tests are properly structured:
10
- - an __init__.py file in the test folder
11
- - an _is function in the __init__.py file
12
- """
13
- for _type in ["fields", "labels"]:
14
- _dir = f"csv_detective/detect_{_type}"
15
- subfolders = []
16
- for dirpath, dirnames, _ in os.walk(_dir):
17
- for dirname in dirnames:
18
- if "__pycache__" not in dirname:
19
- subfolders.append(os.path.join(dirpath, dirname))
20
- final_subfolders = [
21
- sf
22
- for sf in subfolders
23
- if not any(other_sf.startswith(sf) for other_sf in subfolders if sf != other_sf)
24
- ]
25
- for f_sf in final_subfolders:
26
- assert "__init__.py" in os.listdir(f_sf)
27
- _package = eval(
28
- f_sf.replace("csv_detective/", "")
29
- # locally we have "\\", but in CI for instance there is "/"
30
- .replace("\\", ".")
31
- .replace("/", ".")
32
- )
33
- assert "_is" in dir(_package)
3
+ import pytest
4
+
5
+ from csv_detective.format import Format, FormatsManager
6
+
7
+ fmtm = FormatsManager()
34
8
 
35
9
 
36
10
  def test_all_tests_have_unique_name():
37
- names = [
38
- attr["module"].__name__.split(".")[-1]
39
- for attr in return_all_tests("ALL", "detect_fields").values()
40
- ]
41
- assert len(names) == len(set(names))
11
+ formats: list[str] = os.listdir("csv_detective/formats")
12
+ assert "__init__.py" in formats
13
+ assert len(formats) == len(set(formats))
14
+
15
+
16
+ def test_conformity():
17
+ for name, format in fmtm.formats.items():
18
+ assert isinstance(name, str)
19
+ assert isinstance(format, Format)
20
+ assert all(
21
+ getattr(format, attr) is not None
22
+ for attr in [
23
+ "name",
24
+ "func",
25
+ "_test_values",
26
+ "labels",
27
+ "proportion",
28
+ "tags",
29
+ ]
30
+ )
31
+
32
+
33
+ @pytest.mark.parametrize(
34
+ "tags",
35
+ (
36
+ ["type"],
37
+ ["temp", "fr"],
38
+ ),
39
+ )
40
+ def test_get_from_tags(tags):
41
+ fmts = fmtm.get_formats_from_tags(tags)
42
+ assert len(fmts)
43
+ for fmt in fmts.values():
44
+ for tag in tags:
45
+ assert tag in fmt.tags
File without changes
File without changes
@@ -1,9 +0,0 @@
1
- from frformat import CodeCommuneInsee, Millesime
2
-
3
- PROPORTION = 0.75
4
-
5
- _code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
6
-
7
-
8
- def _is(val):
9
- return _code_commune_insee.is_valid(val)
@@ -1,9 +0,0 @@
1
- from frformat import CodeFantoir
2
-
3
- PROPORTION = 1
4
-
5
- _code_fantoir = CodeFantoir()
6
-
7
-
8
- def _is(val):
9
- return isinstance(val, str) and _code_fantoir.is_valid(val)
@@ -1,9 +0,0 @@
1
- from frformat import CodePostal
2
-
3
- PROPORTION = 0.9
4
-
5
- _code_postal = CodePostal()
6
-
7
-
8
- def _is(val):
9
- return _code_postal.is_valid(val)
@@ -1,10 +0,0 @@
1
- from frformat import CodeRegion, Millesime
2
-
3
- PROPORTION = 1
4
-
5
- _code_region = CodeRegion(Millesime.LATEST)
6
-
7
-
8
- def _is(val):
9
- """Renvoie True si val peut être un code_région, False sinon"""
10
- return isinstance(val, str) and _code_region.is_valid(val)
@@ -1,16 +0,0 @@
1
- from frformat import Departement, Millesime, Options
2
-
3
- PROPORTION = 0.9
4
-
5
- _options = Options(
6
- ignore_case=True,
7
- ignore_accents=True,
8
- replace_non_alphanumeric_with_space=True,
9
- ignore_extra_whitespace=True,
10
- )
11
- _departement = Departement(Millesime.LATEST, _options)
12
-
13
-
14
- def _is(val):
15
- """Match avec le nom des departements"""
16
- return isinstance(val, str) and _departement.is_valid(val)
@@ -1,19 +0,0 @@
1
- from frformat import LatitudeL93
2
-
3
- from csv_detective.detect_fields.other.float import _is as is_float
4
- from csv_detective.detect_fields.other.float import float_casting
5
-
6
- PROPORTION = 1
7
-
8
- _latitudel93 = LatitudeL93()
9
-
10
-
11
- def _is(val):
12
- try:
13
- if isinstance(val, str) and is_float(val):
14
- return _latitudel93.is_valid(float_casting(val))
15
-
16
- return False
17
-
18
- except (ValueError, OverflowError):
19
- return False
@@ -1,13 +0,0 @@
1
- from csv_detective.detect_fields.other.float import _is as is_float
2
-
3
- PROPORTION = 1
4
-
5
-
6
- def _is(val):
7
- """Renvoie True si val peut etre une latitude en métropole"""
8
- try:
9
- return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3
10
- except ValueError:
11
- return False
12
- except OverflowError:
13
- return False
@@ -1,19 +0,0 @@
1
- from frformat import LongitudeL93
2
-
3
- from csv_detective.detect_fields.other.float import _is as is_float
4
- from csv_detective.detect_fields.other.float import float_casting
5
-
6
- PROPORTION = 1
7
-
8
- _longitudel93 = LongitudeL93()
9
-
10
-
11
- def _is(val):
12
- try:
13
- if isinstance(val, str) and is_float(val):
14
- return _longitudel93.is_valid(float_casting(val))
15
-
16
- return False
17
-
18
- except (ValueError, OverflowError):
19
- return False