csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -0,0 +1,36 @@
1
+ from csv_detective.formats.latitude_wgs import _is as is_lat
2
+ from csv_detective.formats.latlon_wgs import SHARED_COORDS_LABELS
3
+ from csv_detective.formats.longitude_wgs import _is as is_lon
4
+
5
+ proportion = 1
6
+ tags = ["geo"]
7
+
8
+ specific = [
9
+ "lonlat",
10
+ "lon lat",
11
+ "y x",
12
+ "yx",
13
+ ]
14
+
15
+ # we aim wide to catch exact matches if possible for the highest possible score
16
+ words = (
17
+ SHARED_COORDS_LABELS
18
+ + specific
19
+ + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
20
+ )
21
+
22
+
23
+ def _is(val):
24
+ if not isinstance(val, str) or val.count(",") != 1:
25
+ return False
26
+ lon, lat = val.split(",")
27
+ # handling [lon,lat]
28
+ if lon.startswith("[") and lat.endswith("]"):
29
+ lon, lat = lon[1:], lat[:-1]
30
+ return is_lon(lon) and is_lat(lat.replace(" ", ""))
31
+
32
+
33
+ _test_values = {
34
+ True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
35
+ False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
36
+ }
@@ -0,0 +1,48 @@
1
+ from unidecode import unidecode
2
+
3
+ proportion = 1
4
+ tags = ["fr", "temp"]
5
+ labels = ["mois", "month"]
6
+
7
+ mois = {
8
+ "janvier",
9
+ "fevrier",
10
+ "mars",
11
+ "avril",
12
+ "mai",
13
+ "juin",
14
+ "juillet",
15
+ "aout",
16
+ "septembre",
17
+ "octobre",
18
+ "novembre",
19
+ "decembre",
20
+ "jan",
21
+ "fev",
22
+ "mar",
23
+ "avr",
24
+ "mai",
25
+ "jun",
26
+ "jui",
27
+ "juil",
28
+ "aou",
29
+ "sep",
30
+ "sept",
31
+ "oct",
32
+ "nov",
33
+ "dec",
34
+ }
35
+
36
+
37
+ def _is(val):
38
+ """Renvoie True si les champs peuvent être des mois de l'année"""
39
+ if not isinstance(val, str):
40
+ return False
41
+ val = unidecode(val.lower())
42
+ return val in mois
43
+
44
+
45
+ _test_values = {
46
+ True: ["JUIN", "décembre"],
47
+ False: ["november"],
48
+ }
@@ -0,0 +1,18 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 0.8
4
+ labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
5
+
6
+ currencies = {"€", "$", "£", "¥"}
7
+
8
+
9
+ def _is(val):
10
+ if not isinstance(val, str) or val[-1] not in currencies:
11
+ return False
12
+ return is_float(val[:-1])
13
+
14
+
15
+ _test_values = {
16
+ True: ["120€", "-20.2$"],
17
+ False: ["200", "100 euros"],
18
+ }
@@ -0,0 +1,14 @@
1
+ import re
2
+
3
+ proportion = 0.8
4
+ labels = ["id", "objectid"]
5
+
6
+
7
+ def _is(val):
8
+ return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val))
9
+
10
+
11
+ _test_values = {
12
+ True: ["62320e50f981bc2b57bcc044"],
13
+ False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
14
+ }
@@ -0,0 +1,35 @@
1
+ from frformat import Millesime, Options, Pays
2
+
3
+ proportion = 0.6
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "pays",
7
+ "payslieu",
8
+ "paysorg",
9
+ "country",
10
+ "pays lib",
11
+ "lieupays",
12
+ "pays beneficiaire",
13
+ "nom du pays",
14
+ "journey start country",
15
+ "libelle pays",
16
+ "journey end country",
17
+ ]
18
+
19
+ _options = Options(
20
+ ignore_case=True,
21
+ ignore_accents=True,
22
+ replace_non_alphanumeric_with_space=True,
23
+ ignore_extra_whitespace=True,
24
+ )
25
+ _pays = Pays(Millesime.LATEST, _options)
26
+
27
+
28
+ def _is(val):
29
+ return isinstance(val, str) and _pays.is_valid(val)
30
+
31
+
32
+ _test_values = {
33
+ True: ["france", "italie"],
34
+ False: ["amerique", "paris"],
35
+ }
@@ -0,0 +1,16 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 0.8
4
+ labels = []
5
+
6
+
7
+ def _is(val):
8
+ if not isinstance(val, str) or val[-1] != "%":
9
+ return False
10
+ return is_float(val[:-1])
11
+
12
+
13
+ _test_values = {
14
+ True: ["120%", "-20.2%"],
15
+ False: ["200", "100 pourcents"],
16
+ }
@@ -0,0 +1,70 @@
1
+ from frformat import Millesime, Options, Region
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "region",
7
+ "libelle region",
8
+ "nom region",
9
+ "libelle reg",
10
+ "nom reg",
11
+ "reg libusage",
12
+ "nom de la region",
13
+ "regionorg",
14
+ "regionlieu",
15
+ "reg",
16
+ "nom officiel region",
17
+ ]
18
+
19
+ _extra_valid_values_set = frozenset(
20
+ {
21
+ "alsace",
22
+ "aquitaine",
23
+ "ara",
24
+ "aura",
25
+ "auvergne",
26
+ "auvergne et rhone alpes",
27
+ "basse normandie",
28
+ "bfc",
29
+ "bourgogne",
30
+ "bourgogne et franche comte",
31
+ "centre",
32
+ "champagne ardenne",
33
+ "franche comte",
34
+ "ge",
35
+ "haute normandie",
36
+ "hdf",
37
+ "languedoc roussillon",
38
+ "limousin",
39
+ "lorraine",
40
+ "midi pyrenees",
41
+ "nord pas de calais",
42
+ "npdc",
43
+ "paca",
44
+ "picardie",
45
+ "poitou charentes",
46
+ "reunion",
47
+ "rhone alpes",
48
+ }
49
+ )
50
+
51
+
52
+ _options = Options(
53
+ ignore_case=True,
54
+ ignore_accents=True,
55
+ replace_non_alphanumeric_with_space=True,
56
+ ignore_extra_whitespace=True,
57
+ extra_valid_values=_extra_valid_values_set,
58
+ )
59
+ _region = Region(Millesime.LATEST, _options)
60
+
61
+
62
+ def _is(val):
63
+ """Match avec le nom des regions"""
64
+ return isinstance(val, str) and _region.is_valid(val)
65
+
66
+
67
+ _test_values = {
68
+ True: ["bretagne", "ile-de-france"],
69
+ False: ["baviere", "overgne"],
70
+ }
@@ -0,0 +1,17 @@
1
+ from csv_detective.parsing.text import _process_text
2
+
3
+ proportion = 1
4
+ tags = ["fr"]
5
+ labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
6
+
7
+
8
+ def _is(val):
9
+ if not isinstance(val, str):
10
+ return False
11
+ return _process_text(val) in {"homme", "femme", "h", "f", "m", "masculin", "feminin"}
12
+
13
+
14
+ _test_values = {
15
+ True: ["femme", "H"],
16
+ False: ["adulte"],
17
+ }
@@ -0,0 +1,37 @@
1
+ import re
2
+
3
+ proportion = 0.9
4
+ tags = ["fr"]
5
+ labels = [
6
+ "siren",
7
+ "siren organisme designe",
8
+ "siren organisme designant",
9
+ "n° siren",
10
+ "siren organisme",
11
+ "siren titulaire",
12
+ "numero siren",
13
+ "epci",
14
+ ]
15
+
16
+
17
+ def _is(val):
18
+ """Repere les codes SIREN"""
19
+ if not isinstance(val, str):
20
+ return False
21
+ val = val.replace(" ", "")
22
+ if not bool(re.match(r"^[0-9]{9}$", val)):
23
+ return False
24
+ # Vérification par clé propre aux codes siren
25
+ cle = 0
26
+ pair = False
27
+ for x in val:
28
+ y = int(x) * (1 + pair)
29
+ cle += y // 10 + y % 10
30
+ pair = not pair
31
+ return cle % 10 == 0
32
+
33
+
34
+ _test_values = {
35
+ True: ["552 100 554", "552100554"],
36
+ False: ["42"],
37
+ }
@@ -1,29 +1,47 @@
1
- import re
2
-
3
- PROPORTION = 0.8
4
-
5
-
6
- def _is(val):
7
- '''Détection des identifiants SIRET (SIRENE)'''
8
- val = val.replace(' ', '')
9
- if not bool(re.match(r'^[0-9]{14}$', val)):
10
- return False
11
-
12
- # Vérification par clé de luhn du SIREN
13
- cle = 0
14
- pair = False
15
- for x in val[:9]:
16
- y = int(x) * (1 + pair)
17
- cle += y // 10 + y % 10
18
- pair = not pair
19
- if cle % 10 != 0:
20
- return cle % 10 == 0
21
-
22
- # Vérification par clé de luhn du SIRET
23
- cle = 0
24
- pair = len(val) % 2 == 0
25
- for x in val:
26
- y = int(x) * (1 + pair)
27
- cle += y // 10 + y % 10
28
- pair = not pair
29
- return cle % 10 == 0
1
+ import re
2
+
3
+ proportion = 0.8
4
+ tags = ["fr"]
5
+ labels = [
6
+ "siret",
7
+ "siret d",
8
+ "num siret",
9
+ "siretacheteur",
10
+ "n° siret",
11
+ "coll siret",
12
+ "epci",
13
+ ]
14
+
15
+
16
+ def _is(val):
17
+ """Détection des identifiants SIRET (SIRENE)"""
18
+ if not isinstance(val, str):
19
+ return False
20
+ val = val.replace(" ", "")
21
+ if not bool(re.match(r"^[0-9]{14}$", val)):
22
+ return False
23
+
24
+ # Vérification par clé de luhn du SIREN
25
+ cle = 0
26
+ pair = False
27
+ for x in val[:9]:
28
+ y = int(x) * (1 + pair)
29
+ cle += y // 10 + y % 10
30
+ pair = not pair
31
+ if cle % 10 != 0:
32
+ return cle % 10 == 0
33
+
34
+ # Vérification par clé de luhn du SIRET
35
+ cle = 0
36
+ pair = len(val) % 2 == 0
37
+ for x in val:
38
+ y = int(x) * (1 + pair)
39
+ cle += y // 10 + y % 10
40
+ pair = not pair
41
+ return cle % 10 == 0
42
+
43
+
44
+ _test_values = {
45
+ True: ["13002526500013", "130 025 265 00013"],
46
+ False: ["13002526500012"],
47
+ }
@@ -0,0 +1,36 @@
1
+ import re
2
+
3
+ proportion = 0.7
4
+ tags = ["fr"]
5
+ labels = [
6
+ "telephone",
7
+ "tel",
8
+ "tel1",
9
+ "tel2",
10
+ "phone",
11
+ "num tel",
12
+ "tel mob",
13
+ "telephone sav",
14
+ "telephone1",
15
+ "coordinates.phone",
16
+ "telephone du lieu",
17
+ ]
18
+
19
+
20
+ def _is(val):
21
+ if not isinstance(val, str):
22
+ return False
23
+
24
+ if len(val) < 10:
25
+ return False
26
+
27
+ val = val.replace(".", "").replace("-", "").replace(" ", "")
28
+
29
+ match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val))
30
+ return match_1
31
+
32
+
33
+ _test_values = {
34
+ True: ["0134643467"],
35
+ False: ["6625388263", "01288398"],
36
+ }
@@ -0,0 +1,36 @@
1
+ import re
2
+
3
+ proportion = 0.8
4
+ tags = ["fr"]
5
+ labels = [
6
+ "uai",
7
+ "code etablissement",
8
+ "code uai",
9
+ "uai - identifiant",
10
+ "numero uai",
11
+ "rne",
12
+ "numero de l'etablissement",
13
+ "code rne",
14
+ "codeetab",
15
+ "code uai de l'etablissement",
16
+ "ref uai",
17
+ "cd rne",
18
+ "numerouai",
19
+ "numero d etablissement",
20
+ "code etablissement",
21
+ "numero etablissement",
22
+ ]
23
+
24
+
25
+ def _is(val):
26
+ if not isinstance(val, str) or len(val) != 8:
27
+ return False
28
+ if not bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)):
29
+ return False
30
+ return True
31
+
32
+
33
+ _test_values = {
34
+ True: ["0422170F"],
35
+ False: ["04292E"],
36
+ }
@@ -0,0 +1,46 @@
1
+ import re
2
+
3
+ proportion = 1
4
+ labels = [
5
+ "url",
6
+ "url source",
7
+ "site web",
8
+ "source url",
9
+ "site internet",
10
+ "remote url",
11
+ "web",
12
+ "site",
13
+ "lien",
14
+ "site data",
15
+ "lien url",
16
+ "lien vers le fichier",
17
+ "sitweb",
18
+ "interneturl",
19
+ ]
20
+
21
+ pattern = re.compile(
22
+ r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
23
+ r"(/[A-Za-z\u00C0-\u024F\u1E00-\u1EFF0-9\s._~:/?#[@!$&'()*+,;=%-]*)?$"
24
+ )
25
+
26
+
27
+ def _is(val):
28
+ if not isinstance(val, str):
29
+ return False
30
+ return bool(pattern.match(val))
31
+
32
+
33
+ _test_values = {
34
+ True: [
35
+ "www.data.gouv.fr",
36
+ "http://data.gouv.fr",
37
+ "https://www.youtube.com/@data-gouv-fr",
38
+ (
39
+ "https://tabular-api.data.gouv.fr/api/resources/"
40
+ "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
41
+ "?score__greater=0.9&decompte__exact=13"
42
+ ),
43
+ "https://une-ville.fr/délibérations/2025/Doc avec espaces et àccëñts.pdf",
44
+ ],
45
+ False: ["tmp@data.gouv.fr"],
46
+ }
@@ -0,0 +1,14 @@
1
+ import re
2
+
3
+ proportion = 1
4
+ labels = ["account", "username", "user"]
5
+
6
+
7
+ def _is(val):
8
+ return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val))
9
+
10
+
11
+ _test_values = {
12
+ True: ["@accueil1"],
13
+ False: ["adresse@mail"],
14
+ }
@@ -0,0 +1,16 @@
1
+ import re
2
+
3
+ proportion = 0.8
4
+ labels = ["id", "identifiant"]
5
+
6
+
7
+ def _is(val) -> bool:
8
+ return isinstance(val, str) and bool(
9
+ re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
10
+ )
11
+
12
+
13
+ _test_values = {
14
+ True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
15
+ False: ["0610928327"],
16
+ }
@@ -0,0 +1,28 @@
1
+ proportion = 1
2
+ tags = ["temp"]
3
+ labels = [
4
+ "year",
5
+ "annee",
6
+ "annee depot",
7
+ "an nais",
8
+ "exercice",
9
+ "data year",
10
+ "annee de publication",
11
+ "exercice comptable",
12
+ "annee de naissance",
13
+ "annee ouverture",
14
+ ]
15
+
16
+
17
+ def _is(val):
18
+ try:
19
+ val = int(val)
20
+ except ValueError:
21
+ return False
22
+ return (1800 <= val) and (val <= 2100)
23
+
24
+
25
+ _test_values = {
26
+ True: ["2015"],
27
+ False: ["20166", "123"],
28
+ }
@@ -0,0 +1,65 @@
1
+ import json
2
+ import os
3
+ from typing import Iterator
4
+
5
+ import pandas as pd
6
+
7
+ from csv_detective.output.dataframe import cast_df_chunks
8
+ from csv_detective.output.profile import create_profile
9
+ from csv_detective.output.schema import generate_table_schema
10
+ from csv_detective.utils import is_url
11
+
12
+
13
+ def generate_output(
14
+ table: pd.DataFrame,
15
+ analysis: dict,
16
+ file_path: str,
17
+ num_rows: int = 500,
18
+ limited_output: bool = True,
19
+ save_results: bool | str = True,
20
+ output_profile: bool = False,
21
+ output_schema: bool = False,
22
+ output_df: bool = False,
23
+ cast_json: bool = True,
24
+ verbose: bool = False,
25
+ sheet_name: str | int | None = None,
26
+ _col_values: dict[str, pd.Series] | None = None,
27
+ ) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
28
+ if output_profile:
29
+ analysis["profile"] = create_profile(
30
+ table=table,
31
+ columns=analysis["columns"],
32
+ num_rows=num_rows,
33
+ limited_output=limited_output,
34
+ cast_json=cast_json,
35
+ verbose=verbose,
36
+ _col_values=_col_values,
37
+ )
38
+
39
+ if save_results:
40
+ if isinstance(save_results, str):
41
+ output_path = save_results
42
+ else:
43
+ output_path = os.path.splitext(file_path)[0]
44
+ if is_url(output_path):
45
+ output_path = output_path.split("/")[-1]
46
+ if analysis.get("sheet_name"):
47
+ output_path += "_sheet-" + str(sheet_name)
48
+ output_path += ".json"
49
+ with open(output_path, "w", encoding="utf8") as fp:
50
+ json.dump(
51
+ analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
52
+ )
53
+
54
+ if output_schema:
55
+ analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
56
+
57
+ if output_df:
58
+ return analysis, cast_df_chunks(
59
+ df=table,
60
+ analysis=analysis,
61
+ file_path=file_path,
62
+ cast_json=cast_json,
63
+ verbose=verbose,
64
+ )
65
+ return analysis