csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. csv_detective/detection/formats.py +12 -15
  2. csv_detective/explore_csv.py +28 -9
  3. csv_detective/format.py +67 -0
  4. csv_detective/formats/__init__.py +9 -0
  5. csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
  6. csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
  7. csv_detective/formats/code_commune_insee.py +26 -0
  8. csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
  9. csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
  10. csv_detective/formats/code_fantoir.py +21 -0
  11. csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
  12. csv_detective/formats/code_postal.py +25 -0
  13. csv_detective/formats/code_region.py +22 -0
  14. csv_detective/formats/code_rna.py +29 -0
  15. csv_detective/formats/code_waldec.py +17 -0
  16. csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
  17. csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
  18. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  19. csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
  20. csv_detective/formats/date_fr.py +22 -0
  21. csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
  22. csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
  23. csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
  24. csv_detective/formats/departement.py +37 -0
  25. csv_detective/formats/email.py +28 -0
  26. csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
  27. csv_detective/formats/geojson.py +36 -0
  28. csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
  29. csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
  30. csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
  31. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  32. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  33. csv_detective/formats/iso_country_code_numeric.py +31 -0
  34. csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
  35. csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
  36. csv_detective/formats/latitude_l93.py +48 -0
  37. csv_detective/formats/latitude_wgs.py +42 -0
  38. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  39. csv_detective/formats/latlon_wgs.py +53 -0
  40. csv_detective/formats/longitude_l93.py +39 -0
  41. csv_detective/formats/longitude_wgs.py +32 -0
  42. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  43. csv_detective/formats/lonlat_wgs.py +36 -0
  44. csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
  45. csv_detective/formats/money.py +18 -0
  46. csv_detective/formats/mongo_object_id.py +14 -0
  47. csv_detective/formats/pays.py +35 -0
  48. csv_detective/formats/percent.py +16 -0
  49. csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
  50. csv_detective/formats/sexe.py +17 -0
  51. csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
  52. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
  53. csv_detective/formats/tel_fr.py +36 -0
  54. csv_detective/formats/uai.py +36 -0
  55. csv_detective/formats/url.py +45 -0
  56. csv_detective/formats/username.py +14 -0
  57. csv_detective/formats/uuid.py +16 -0
  58. csv_detective/formats/year.py +28 -0
  59. csv_detective/output/__init__.py +3 -4
  60. csv_detective/output/dataframe.py +3 -3
  61. csv_detective/output/profile.py +2 -3
  62. csv_detective/output/schema.py +2 -2
  63. csv_detective/parsing/columns.py +35 -50
  64. csv_detective/parsing/csv.py +2 -2
  65. csv_detective/parsing/load.py +4 -5
  66. csv_detective/validate.py +9 -4
  67. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/METADATA +6 -5
  68. csv_detective-0.9.3.dev2348.dist-info/RECORD +102 -0
  69. tests/test_fields.py +39 -364
  70. tests/test_file.py +1 -1
  71. tests/test_labels.py +5 -3
  72. tests/test_structure.py +40 -36
  73. csv_detective/detect_fields/FR/__init__.py +0 -0
  74. csv_detective/detect_fields/FR/geo/__init__.py +0 -0
  75. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
  76. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
  77. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
  78. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
  79. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
  80. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
  81. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  82. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
  83. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  84. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
  85. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  86. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  87. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
  88. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
  89. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  90. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
  91. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
  92. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  93. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  94. csv_detective/detect_fields/__init__.py +0 -112
  95. csv_detective/detect_fields/geo/__init__.py +0 -0
  96. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  97. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  98. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  99. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
  100. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  101. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
  102. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  103. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
  104. csv_detective/detect_fields/other/__init__.py +0 -0
  105. csv_detective/detect_fields/other/email/__init__.py +0 -10
  106. csv_detective/detect_fields/other/money/__init__.py +0 -11
  107. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  108. csv_detective/detect_fields/other/percent/__init__.py +0 -9
  109. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  110. csv_detective/detect_fields/other/url/__init__.py +0 -14
  111. csv_detective/detect_fields/other/uuid/__init__.py +0 -10
  112. csv_detective/detect_fields/temp/__init__.py +0 -0
  113. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  114. csv_detective/detect_labels/FR/__init__.py +0 -0
  115. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  116. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
  117. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
  118. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
  119. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
  120. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
  121. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
  122. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
  123. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
  124. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
  125. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
  126. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
  127. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
  128. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
  129. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
  130. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
  131. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  132. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
  133. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
  134. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
  135. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
  136. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
  137. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
  138. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
  139. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
  140. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
  141. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
  142. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
  143. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  144. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
  145. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
  146. csv_detective/detect_labels/__init__.py +0 -94
  147. csv_detective/detect_labels/geo/__init__.py +0 -0
  148. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
  149. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
  150. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
  151. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
  152. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
  153. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
  154. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
  155. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
  156. csv_detective/detect_labels/other/__init__.py +0 -0
  157. csv_detective/detect_labels/other/booleen/__init__.py +0 -8
  158. csv_detective/detect_labels/other/email/__init__.py +0 -20
  159. csv_detective/detect_labels/other/float/__init__.py +0 -8
  160. csv_detective/detect_labels/other/int/__init__.py +0 -8
  161. csv_detective/detect_labels/other/money/__init__.py +0 -8
  162. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
  163. csv_detective/detect_labels/other/twitter/__init__.py +0 -8
  164. csv_detective/detect_labels/other/url/__init__.py +0 -23
  165. csv_detective/detect_labels/other/uuid/__init__.py +0 -8
  166. csv_detective/detect_labels/temp/__init__.py +0 -0
  167. csv_detective/detect_labels/temp/date/__init__.py +0 -28
  168. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
  169. csv_detective/detect_labels/temp/year/__init__.py +0 -19
  170. csv_detective/load_tests.py +0 -59
  171. csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
  172. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  173. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  174. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  175. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
  176. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/WHEEL +0 -0
  177. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/entry_points.txt +0 -0
  178. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/licenses/LICENSE +0 -0
  179. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,36 @@
1
+ from csv_detective.formats.latitude_wgs import _is as is_lat
2
+ from csv_detective.formats.latlon_wgs import SHARED_COORDS_LABELS
3
+ from csv_detective.formats.longitude_wgs import _is as is_lon
4
+
5
+ proportion = 1
6
+ tags = ["geo"]
7
+
8
+ specific = [
9
+ "lonlat",
10
+ "lon lat",
11
+ "y x",
12
+ "yx",
13
+ ]
14
+
15
+ # we aim wide to catch exact matches if possible for the highest possible score
16
+ words = (
17
+ SHARED_COORDS_LABELS
18
+ + specific
19
+ + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
20
+ )
21
+
22
+
23
def _is(val):
    """Tell whether ``val`` is a "lon,lat" pair, optionally wrapped in square brackets.

    The value must be a string holding exactly one comma. The part before the
    comma is checked with ``is_lon``; the part after it, with spaces removed,
    is checked with ``is_lat``.
    """
    if not isinstance(val, str):
        return False
    parts = val.split(",")
    # exactly one comma <=> exactly two parts
    if len(parts) != 2:
        return False
    lon, lat = parts
    # brackets are only dropped when both the opening and closing one are present
    if lon.startswith("[") and lat.endswith("]"):
        lon = lon[1:]
        lat = lat[:-1]
    return is_lon(lon) and is_lat(lat.replace(" ", ""))
31
+
32
+
33
+ _test_values = {
34
+ True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
35
+ False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
36
+ }
@@ -1,39 +1,48 @@
1
- from unidecode import unidecode
2
-
3
- PROPORTION = 1
4
- mois = {
5
- "janvier",
6
- "fevrier",
7
- "mars",
8
- "avril",
9
- "mai",
10
- "juin",
11
- "juillet",
12
- "aout",
13
- "septembre",
14
- "octobre",
15
- "novembre",
16
- "decembre",
17
- "jan",
18
- "fev",
19
- "mar",
20
- "avr",
21
- "mai",
22
- "jun",
23
- "jui",
24
- "juil",
25
- "aou",
26
- "sep",
27
- "sept",
28
- "oct",
29
- "nov",
30
- "dec",
31
- }
32
-
33
-
34
- def _is(val):
35
- """Renvoie True si les champs peuvent être des mois de l'année"""
36
- if not isinstance(val, str):
37
- return False
38
- val = unidecode(val.lower())
39
- return val in mois
1
+ from unidecode import unidecode
2
+
3
+ proportion = 1
4
+ tags = ["fr", "temp"]
5
+ labels = ["mois", "month"]
6
+
7
+ mois = {
8
+ "janvier",
9
+ "fevrier",
10
+ "mars",
11
+ "avril",
12
+ "mai",
13
+ "juin",
14
+ "juillet",
15
+ "aout",
16
+ "septembre",
17
+ "octobre",
18
+ "novembre",
19
+ "decembre",
20
+ "jan",
21
+ "fev",
22
+ "mar",
23
+ "avr",
24
+ "mai",
25
+ "jun",
26
+ "jui",
27
+ "juil",
28
+ "aou",
29
+ "sep",
30
+ "sept",
31
+ "oct",
32
+ "nov",
33
+ "dec",
34
+ }
35
+
36
+
37
def _is(val):
    """Return True if the value can be a (French) month of the year.

    The comparison is made on the lowercased value passed through
    ``unidecode``, against the names and abbreviations listed in ``mois``.
    """
    return isinstance(val, str) and unidecode(val.lower()) in mois
43
+
44
+
45
+ _test_values = {
46
+ True: ["JUIN", "décembre"],
47
+ False: ["november"],
48
+ }
@@ -0,0 +1,18 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 0.8
4
+ labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
5
+
6
+ currencies = {"€", "$", "£", "¥"}
7
+
8
+
9
def _is(val):
    """Tell whether ``val`` is an amount: a float followed by a currency symbol.

    The last character must be one of the symbols in ``currencies`` and
    everything before it must be accepted by ``is_float``.
    Non-string values and the empty string yield False.
    """
    # val[-1:] (a slice) is "" for the empty string, where val[-1] would raise IndexError
    if not isinstance(val, str) or val[-1:] not in currencies:
        return False
    return is_float(val[:-1])
13
+
14
+
15
+ _test_values = {
16
+ True: ["120€", "-20.2$"],
17
+ False: ["200", "100 euros"],
18
+ }
@@ -0,0 +1,14 @@
1
+ import re
2
+
3
+ proportion = 0.8
4
+ labels = ["id", "objectid"]
5
+
6
+
7
def _is(val):
    """Tell whether ``val`` is a string of exactly 24 hexadecimal characters.

    Returns False for non-string values.
    """
    if not isinstance(val, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which would accept a 24-character id followed by "\n"
    return re.fullmatch(r"[0-9a-fA-F]{24}", val) is not None
9
+
10
+
11
+ _test_values = {
12
+ True: ["62320e50f981bc2b57bcc044"],
13
+ False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
14
+ }
@@ -0,0 +1,35 @@
1
+ from frformat import Millesime, Options, Pays
2
+
3
+ proportion = 0.6
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "pays",
7
+ "payslieu",
8
+ "paysorg",
9
+ "country",
10
+ "pays lib",
11
+ "lieupays",
12
+ "pays beneficiaire",
13
+ "nom du pays",
14
+ "journey start country",
15
+ "libelle pays",
16
+ "journey end country",
17
+ ]
18
+
19
+ _options = Options(
20
+ ignore_case=True,
21
+ ignore_accents=True,
22
+ replace_non_alphanumeric_with_space=True,
23
+ ignore_extra_whitespace=True,
24
+ )
25
+ _pays = Pays(Millesime.LATEST, _options)
26
+
27
+
28
def _is(val):
    """Tell whether ``val`` is a string that the ``_pays`` validator accepts."""
    if not isinstance(val, str):
        return False
    return _pays.is_valid(val)
30
+
31
+
32
+ _test_values = {
33
+ True: ["france", "italie"],
34
+ False: ["amerique", "paris"],
35
+ }
@@ -0,0 +1,16 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 0.8
4
+ labels = []
5
+
6
+
7
def _is(val):
    """Tell whether ``val`` is a percentage: a float followed by "%".

    Everything before the trailing "%" must be accepted by ``is_float``.
    Non-string values and the empty string yield False.
    """
    # endswith is safe on the empty string, where val[-1] would raise IndexError
    if not isinstance(val, str) or not val.endswith("%"):
        return False
    return is_float(val[:-1])
11
+
12
+
13
+ _test_values = {
14
+ True: ["120%", "-20.2%"],
15
+ False: ["200", "100 pourcents"],
16
+ }
@@ -1,50 +1,70 @@
1
- from frformat import Millesime, Options, Region
2
-
3
- PROPORTION = 1
4
-
5
- _extra_valid_values_set = frozenset(
6
- {
7
- "alsace",
8
- "aquitaine",
9
- "ara",
10
- "aura",
11
- "auvergne",
12
- "auvergne et rhone alpes",
13
- "basse normandie",
14
- "bfc",
15
- "bourgogne",
16
- "bourgogne et franche comte",
17
- "centre",
18
- "champagne ardenne",
19
- "franche comte",
20
- "ge",
21
- "haute normandie",
22
- "hdf",
23
- "languedoc roussillon",
24
- "limousin",
25
- "lorraine",
26
- "midi pyrenees",
27
- "nord pas de calais",
28
- "npdc",
29
- "paca",
30
- "picardie",
31
- "poitou charentes",
32
- "reunion",
33
- "rhone alpes",
34
- }
35
- )
36
-
37
-
38
- _options = Options(
39
- ignore_case=True,
40
- ignore_accents=True,
41
- replace_non_alphanumeric_with_space=True,
42
- ignore_extra_whitespace=True,
43
- extra_valid_values=_extra_valid_values_set,
44
- )
45
- _region = Region(Millesime.LATEST, _options)
46
-
47
-
48
- def _is(val):
49
- """Match avec le nom des regions"""
50
- return isinstance(val, str) and _region.is_valid(val)
1
+ from frformat import Millesime, Options, Region
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "region",
7
+ "libelle region",
8
+ "nom region",
9
+ "libelle reg",
10
+ "nom reg",
11
+ "reg libusage",
12
+ "nom de la region",
13
+ "regionorg",
14
+ "regionlieu",
15
+ "reg",
16
+ "nom officiel region",
17
+ ]
18
+
19
+ _extra_valid_values_set = frozenset(
20
+ {
21
+ "alsace",
22
+ "aquitaine",
23
+ "ara",
24
+ "aura",
25
+ "auvergne",
26
+ "auvergne et rhone alpes",
27
+ "basse normandie",
28
+ "bfc",
29
+ "bourgogne",
30
+ "bourgogne et franche comte",
31
+ "centre",
32
+ "champagne ardenne",
33
+ "franche comte",
34
+ "ge",
35
+ "haute normandie",
36
+ "hdf",
37
+ "languedoc roussillon",
38
+ "limousin",
39
+ "lorraine",
40
+ "midi pyrenees",
41
+ "nord pas de calais",
42
+ "npdc",
43
+ "paca",
44
+ "picardie",
45
+ "poitou charentes",
46
+ "reunion",
47
+ "rhone alpes",
48
+ }
49
+ )
50
+
51
+
52
+ _options = Options(
53
+ ignore_case=True,
54
+ ignore_accents=True,
55
+ replace_non_alphanumeric_with_space=True,
56
+ ignore_extra_whitespace=True,
57
+ extra_valid_values=_extra_valid_values_set,
58
+ )
59
+ _region = Region(Millesime.LATEST, _options)
60
+
61
+
62
def _is(val):
    """Match against region names: ``val`` must be a string accepted by ``_region``."""
    if not isinstance(val, str):
        return False
    return _region.is_valid(val)
65
+
66
+
67
+ _test_values = {
68
+ True: ["bretagne", "ile-de-france"],
69
+ False: ["baviere", "overgne"],
70
+ }
@@ -0,0 +1,17 @@
1
+ from csv_detective.parsing.text import _process_text
2
+
3
+ proportion = 1
4
+ tags = ["fr"]
5
+ labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
6
+
7
+
8
def _is(val):
    """Tell whether ``val``, once passed through ``_process_text``, is a known sex/gender value."""
    return isinstance(val, str) and _process_text(val) in {
        "homme",
        "femme",
        "h",
        "f",
        "m",
        "masculin",
        "feminin",
    }
12
+
13
+
14
+ _test_values = {
15
+ True: ["femme", "H"],
16
+ False: ["adulte"],
17
+ }
@@ -1,20 +1,37 @@
1
- import re
2
-
3
- PROPORTION = 0.9
4
-
5
-
6
- def _is(val):
7
- """Repere les codes SIREN"""
8
- if not isinstance(val, str):
9
- return False
10
- val = val.replace(" ", "")
11
- if not bool(re.match(r"^[0-9]{9}$", val)):
12
- return False
13
- # Vérification par clé propre aux codes siren
14
- cle = 0
15
- pair = False
16
- for x in val:
17
- y = int(x) * (1 + pair)
18
- cle += y // 10 + y % 10
19
- pair = not pair
20
- return cle % 10 == 0
1
+ import re
2
+
3
+ proportion = 0.9
4
+ tags = ["fr"]
5
+ labels = [
6
+ "siren",
7
+ "siren organisme designe",
8
+ "siren organisme designant",
9
+ "n° siren",
10
+ "siren organisme",
11
+ "siren titulaire",
12
+ "numero siren",
13
+ "epci",
14
+ ]
15
+
16
+
17
def _is(val):
    """Detect SIREN codes.

    A value is accepted when, after removing spaces, it is made of exactly
    9 digits whose checksum (every second digit doubled, digits of the
    products summed) is a multiple of 10. Non-string values yield False.
    """
    if not isinstance(val, str):
        return False
    val = val.replace(" ", "")
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, so "552100554\n" used to pass and then crash on int("\n") below
    if re.fullmatch(r"[0-9]{9}", val) is None:
        return False
    # checksum specific to SIREN codes: digits at odd indexes are doubled
    total = 0
    for index, char in enumerate(val):
        product = int(char) * (2 if index % 2 else 1)
        # product is at most 18, so this adds the sum of its digits
        total += product // 10 + product % 10
    return total % 10 == 0
32
+
33
+
34
+ _test_values = {
35
+ True: ["552 100 554", "552100554"],
36
+ False: ["42"],
37
+ }
@@ -1,31 +1,47 @@
1
- import re
2
-
3
- PROPORTION = 0.8
4
-
5
-
6
- def _is(val):
7
- """Détection des identifiants SIRET (SIRENE)"""
8
- if not isinstance(val, str):
9
- return False
10
- val = val.replace(" ", "")
11
- if not bool(re.match(r"^[0-9]{14}$", val)):
12
- return False
13
-
14
- # Vérification par clé de luhn du SIREN
15
- cle = 0
16
- pair = False
17
- for x in val[:9]:
18
- y = int(x) * (1 + pair)
19
- cle += y // 10 + y % 10
20
- pair = not pair
21
- if cle % 10 != 0:
22
- return cle % 10 == 0
23
-
24
- # Vérification par clé de luhn du SIRET
25
- cle = 0
26
- pair = len(val) % 2 == 0
27
- for x in val:
28
- y = int(x) * (1 + pair)
29
- cle += y // 10 + y % 10
30
- pair = not pair
31
- return cle % 10 == 0
1
+ import re
2
+
3
+ proportion = 0.8
4
+ tags = ["fr"]
5
+ labels = [
6
+ "siret",
7
+ "siret d",
8
+ "num siret",
9
+ "siretacheteur",
10
+ " siret",
11
+ "coll siret",
12
+ "epci",
13
+ ]
14
+
15
+
16
def _luhn_is_valid(digits, double_first):
    """Return True when the Luhn-style sum of ``digits`` is a multiple of 10.

    Every second digit is doubled, starting with the first one when
    ``double_first`` is true and with the second one otherwise; the digits
    of each product are then summed.
    """
    total = 0
    double = double_first
    for char in digits:
        product = int(char) * (2 if double else 1)
        # product is at most 18, so this adds the sum of its digits
        total += product // 10 + product % 10
        double = not double
    return total % 10 == 0


def _is(val):
    """Detect SIRET identifiers (SIRENE).

    A value is accepted when, after removing spaces, it is made of exactly
    14 digits, its first 9 digits (the SIREN part) pass the checksum, and the
    whole 14-digit number passes it too. Non-string values yield False.
    """
    if not isinstance(val, str):
        return False
    val = val.replace(" ", "")
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, so a valid SIRET followed by "\n" used to crash on int("\n")
    if re.fullmatch(r"[0-9]{14}", val) is None:
        return False
    # SIREN part: doubling starts on the second digit
    if not _luhn_is_valid(val[:9], double_first=False):
        return False
    # whole SIRET: the length (14) is even, so doubling starts on the first digit
    return _luhn_is_valid(val, double_first=True)
42
+
43
+
44
+ _test_values = {
45
+ True: ["13002526500013", "130 025 265 00013"],
46
+ False: ["13002526500012"],
47
+ }
@@ -0,0 +1,36 @@
1
+ import re
2
+
3
+ proportion = 0.7
4
+ tags = ["fr"]
5
+ labels = [
6
+ "telephone",
7
+ "tel",
8
+ "tel1",
9
+ "tel2",
10
+ "phone",
11
+ "num tel",
12
+ "tel mob",
13
+ "telephone sav",
14
+ "telephone1",
15
+ "coordinates.phone",
16
+ "telephone du lieu",
17
+ ]
18
+
19
+
20
def _is(val):
    """Tell whether ``val`` looks like a French phone number.

    Dots, dashes and spaces are removed first. What remains must be at least
    10 characters long and consist of an optional "0", "+33" or "0033" prefix
    followed by exactly 9 digits. Non-string values yield False.
    """
    if not isinstance(val, str):
        return False
    val = val.replace(".", "").replace("-", "").replace(" ", "")
    # The length guard runs on the normalised value: when it ran on the raw
    # value, separators could pad a bare 9-digit number ("123 456 789") past
    # the guard while the same number without separators was rejected.
    if len(val) < 10:
        return False
    # fullmatch: "$" with re.match would also accept a trailing newline
    return re.fullmatch(r"(0|\+33|0033)?[0-9]{9}", val) is not None
31
+
32
+
33
+ _test_values = {
34
+ True: ["0134643467"],
35
+ False: ["6625388263", "01288398"],
36
+ }
@@ -0,0 +1,36 @@
1
+ import re
2
+
3
+ proportion = 0.8
4
+ tags = ["fr"]
5
+ labels = [
6
+ "uai",
7
+ "code etablissement",
8
+ "code uai",
9
+ "uai - identifiant",
10
+ "numero uai",
11
+ "rne",
12
+ "numero de l'etablissement",
13
+ "code rne",
14
+ "codeetab",
15
+ "code uai de l'etablissement",
16
+ "ref uai",
17
+ "cd rne",
18
+ "numerouai",
19
+ "numero d etablissement",
20
+ "code etablissement",
21
+ "numero etablissement",
22
+ ]
23
+
24
+
25
def _is(val):
    """Tell whether ``val`` has the shape of a UAI code.

    The value must be an 8-character string: one of the accepted 3-digit
    prefixes, 4 digits, then one uppercase letter.
    """
    return (
        isinstance(val, str)
        and len(val) == 8
        and re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val) is not None
    )
31
+
32
+
33
+ _test_values = {
34
+ True: ["0422170F"],
35
+ False: ["04292E"],
36
+ }
@@ -0,0 +1,45 @@
1
+ import re
2
+
3
+ proportion = 1
4
+ labels = [
5
+ "url",
6
+ "url source",
7
+ "site web",
8
+ "source url",
9
+ "site internet",
10
+ "remote url",
11
+ "web",
12
+ "site",
13
+ "lien",
14
+ "site data",
15
+ "lien url",
16
+ "lien vers le fichier",
17
+ "sitweb",
18
+ "interneturl",
19
+ ]
20
+
21
# scheme (http, https, ftp) or a "www." prefix, a dotted host name ending in a
# 2 to 6 letter suffix, then an optional path / query / fragment
pattern = re.compile(
    r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
    r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$"
)


def _is(val):
    """Tell whether ``val`` is a string that entirely matches the URL ``pattern``.

    Non-string values yield False.
    """
    if not isinstance(val, str):
        return False
    # fullmatch instead of match: the "$" in the pattern also matches just
    # before a trailing newline, which match() would let through
    return pattern.fullmatch(val) is not None
31
+
32
+
33
+ _test_values = {
34
+ True: [
35
+ "www.data.gouv.fr",
36
+ "http://data.gouv.fr",
37
+ "https://www.youtube.com/@data-gouv-fr",
38
+ (
39
+ "https://tabular-api.data.gouv.fr/api/resources/"
40
+ "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/"
41
+ "?score__greater=0.9&decompte__exact=13"
42
+ ),
43
+ ],
44
+ False: ["tmp@data.gouv.fr"],
45
+ }
@@ -0,0 +1,14 @@
1
+ import re
2
+
3
+ proportion = 1
4
+ labels = ["account", "username", "user"]
5
+
6
+
7
def _is(val):
    """Tell whether ``val`` is an "@" followed by letters, digits or underscores.

    Returns False for non-string values.
    """
    if not isinstance(val, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which would accept "@name\n"
    return re.fullmatch(r"@[A-Za-z0-9_]+", val) is not None
9
+
10
+
11
+ _test_values = {
12
+ True: ["@accueil1"],
13
+ False: ["adresse@mail"],
14
+ }
@@ -0,0 +1,16 @@
1
+ import re
2
+
3
+ proportion = 0.8
4
+ labels = ["id", "identifiant"]
5
+
6
+
7
def _is(val) -> bool:
    """Tell whether ``val`` is a UUID written in hexadecimal.

    Accepted shape: 8-4-4-4-12 hexadecimal characters, where each dash is
    optional, with an optional leading "{" and an optional trailing "}".
    Returns False for non-string values.
    """
    if not isinstance(val, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which would accept a UUID followed by "\n"
    return (
        re.fullmatch(r"[{]?[0-9a-fA-F]{8}-?([0-9a-fA-F]{4}-?){3}[0-9a-fA-F]{12}[}]?", val)
        is not None
    )
11
+
12
+
13
+ _test_values = {
14
+ True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
15
+ False: ["0610928327"],
16
+ }
@@ -0,0 +1,28 @@
1
+ proportion = 1
2
+ tags = ["temp"]
3
+ labels = [
4
+ "year",
5
+ "annee",
6
+ "annee depot",
7
+ "an nais",
8
+ "exercice",
9
+ "data year",
10
+ "annee de publication",
11
+ "exercice comptable",
12
+ "annee de naissance",
13
+ "annee ouverture",
14
+ ]
15
+
16
+
17
def _is(val):
    """Tell whether ``val`` converts to an integer between 1800 and 2100 inclusive.

    Anything ``int()`` cannot convert yields False instead of raising.
    """
    try:
        val = int(val)
    # int() raises TypeError for None and other unsupported types, and
    # OverflowError for infinite floats, in addition to ValueError
    except (TypeError, ValueError, OverflowError):
        return False
    return 1800 <= val <= 2100
23
+
24
+
25
+ _test_values = {
26
+ True: ["2015"],
27
+ False: ["20166", "123"],
28
+ }