csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2319__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. csv_detective/detection/formats.py +12 -15
  2. csv_detective/explore_csv.py +28 -9
  3. csv_detective/format.py +67 -0
  4. csv_detective/formats/__init__.py +9 -0
  5. csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
  6. csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
  7. csv_detective/formats/code_commune_insee.py +26 -0
  8. csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
  9. csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
  10. csv_detective/formats/code_fantoir.py +21 -0
  11. csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
  12. csv_detective/formats/code_postal.py +25 -0
  13. csv_detective/formats/code_region.py +22 -0
  14. csv_detective/formats/code_rna.py +29 -0
  15. csv_detective/formats/code_waldec.py +17 -0
  16. csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
  17. csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
  18. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  19. csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
  20. csv_detective/formats/date_fr.py +22 -0
  21. csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
  22. csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
  23. csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
  24. csv_detective/formats/departement.py +37 -0
  25. csv_detective/formats/email.py +28 -0
  26. csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
  27. csv_detective/formats/geojson.py +36 -0
  28. csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
  29. csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
  30. csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
  31. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  32. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  33. csv_detective/formats/iso_country_code_numeric.py +31 -0
  34. csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
  35. csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
  36. csv_detective/formats/latitude_l93.py +48 -0
  37. csv_detective/formats/latitude_wgs.py +42 -0
  38. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  39. csv_detective/formats/latlon_wgs.py +53 -0
  40. csv_detective/formats/longitude_l93.py +39 -0
  41. csv_detective/formats/longitude_wgs.py +32 -0
  42. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  43. csv_detective/formats/lonlat_wgs.py +36 -0
  44. csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
  45. csv_detective/formats/money.py +18 -0
  46. csv_detective/formats/mongo_object_id.py +14 -0
  47. csv_detective/formats/pays.py +35 -0
  48. csv_detective/formats/percent.py +16 -0
  49. csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
  50. csv_detective/formats/sexe.py +17 -0
  51. csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
  52. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
  53. csv_detective/formats/tel_fr.py +36 -0
  54. csv_detective/formats/uai.py +36 -0
  55. csv_detective/formats/url.py +45 -0
  56. csv_detective/formats/username.py +14 -0
  57. csv_detective/formats/uuid.py +16 -0
  58. csv_detective/formats/year.py +28 -0
  59. csv_detective/output/__init__.py +3 -4
  60. csv_detective/output/dataframe.py +3 -3
  61. csv_detective/output/profile.py +2 -3
  62. csv_detective/output/schema.py +2 -2
  63. csv_detective/parsing/columns.py +35 -50
  64. csv_detective/parsing/csv.py +2 -2
  65. csv_detective/parsing/load.py +4 -5
  66. csv_detective/validate.py +9 -4
  67. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/METADATA +6 -5
  68. csv_detective-0.9.3.dev2319.dist-info/RECORD +102 -0
  69. tests/test_fields.py +39 -364
  70. tests/test_file.py +1 -1
  71. tests/test_labels.py +5 -3
  72. tests/test_structure.py +40 -36
  73. csv_detective/detect_fields/FR/__init__.py +0 -0
  74. csv_detective/detect_fields/FR/geo/__init__.py +0 -0
  75. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
  76. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
  77. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
  78. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
  79. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
  80. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
  81. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  82. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
  83. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  84. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
  85. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  86. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  87. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
  88. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
  89. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  90. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
  91. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
  92. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  93. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  94. csv_detective/detect_fields/__init__.py +0 -112
  95. csv_detective/detect_fields/geo/__init__.py +0 -0
  96. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  97. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  98. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  99. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
  100. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  101. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
  102. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  103. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
  104. csv_detective/detect_fields/other/__init__.py +0 -0
  105. csv_detective/detect_fields/other/email/__init__.py +0 -10
  106. csv_detective/detect_fields/other/money/__init__.py +0 -11
  107. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  108. csv_detective/detect_fields/other/percent/__init__.py +0 -9
  109. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  110. csv_detective/detect_fields/other/url/__init__.py +0 -14
  111. csv_detective/detect_fields/other/uuid/__init__.py +0 -10
  112. csv_detective/detect_fields/temp/__init__.py +0 -0
  113. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  114. csv_detective/detect_labels/FR/__init__.py +0 -0
  115. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  116. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
  117. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
  118. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
  119. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
  120. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
  121. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
  122. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
  123. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
  124. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
  125. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
  126. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
  127. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
  128. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
  129. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
  130. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
  131. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  132. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
  133. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
  134. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
  135. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
  136. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
  137. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
  138. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
  139. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
  140. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
  141. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
  142. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
  143. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  144. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
  145. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
  146. csv_detective/detect_labels/__init__.py +0 -94
  147. csv_detective/detect_labels/geo/__init__.py +0 -0
  148. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
  149. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
  150. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
  151. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
  152. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
  153. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
  154. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
  155. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
  156. csv_detective/detect_labels/other/__init__.py +0 -0
  157. csv_detective/detect_labels/other/booleen/__init__.py +0 -8
  158. csv_detective/detect_labels/other/email/__init__.py +0 -20
  159. csv_detective/detect_labels/other/float/__init__.py +0 -8
  160. csv_detective/detect_labels/other/int/__init__.py +0 -8
  161. csv_detective/detect_labels/other/money/__init__.py +0 -8
  162. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
  163. csv_detective/detect_labels/other/twitter/__init__.py +0 -8
  164. csv_detective/detect_labels/other/url/__init__.py +0 -23
  165. csv_detective/detect_labels/other/uuid/__init__.py +0 -8
  166. csv_detective/detect_labels/temp/__init__.py +0 -0
  167. csv_detective/detect_labels/temp/date/__init__.py +0 -28
  168. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
  169. csv_detective/detect_labels/temp/year/__init__.py +0 -19
  170. csv_detective/load_tests.py +0 -59
  171. csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
  172. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  173. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  174. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  175. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
  176. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/WHEEL +0 -0
  177. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/entry_points.txt +0 -0
  178. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/licenses/LICENSE +0 -0
  179. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2319.dist-info}/top_level.txt +0 -0
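The moves listed above collapse the old detect_fields/ and detect_labels/ trees into a single flat csv_detective/formats/ package, with reference data such as the .txt lists relocated under formats/data/. As the hunks below show, each format module now declares its own proportion, tags and labels alongside the _is predicate and a _test_values fixture. As a rough illustration only (the attribute names are taken from the code_postal.py hunk further down; nothing here is an API documented by the package), one such consolidated module could be used directly:

# Illustration, not part of the diff: direct use of one consolidated format module.
# Attribute names (proportion, tags, _is) and expected results come from the
# code_postal.py hunk below; everything else is an assumption.
from csv_detective.formats import code_postal

print(code_postal.proportion)      # 0.9, the detection threshold declared by the module
print(code_postal.tags)            # ["fr", "geo"]
print(code_postal._is("75020"))    # True, per the module's own _test_values
print(code_postal._is("77777"))    # False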

csv_detective/formats/code_fantoir.py
@@ -0,0 +1,21 @@
+ from frformat import CodeFantoir
+
+ proportion = 1
+ tags = ["fr", "geo"]
+ labels = [
+     "cadastre1",
+     "code fantoir",
+     "fantoir",
+ ]
+
+ _code_fantoir = CodeFantoir()
+
+
+ def _is(val):
+     return isinstance(val, str) and _code_fantoir.is_valid(val)
+
+
+ _test_values = {
+     True: ["7755A", "B150B", "ZA04C", "ZB03D"],
+     False: ["7755", "ZA99A"],
+ }

csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py}
@@ -1,9 +1,17 @@
- import re
-
- PROPORTION = 0.9
- regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"
-
-
- def _is(val):
-     """Repere le code Import (ancien RNA)"""
-     return isinstance(val, str) and bool(re.match(regex, val))
+ import re
+
+ proportion = 0.9
+ tags = ["fr"]
+ labels = ["code"]
+
+ regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"
+
+
+ def _is(val):
+     return isinstance(val, str) and bool(re.match(regex, val))
+
+
+ _test_values = {
+     True: ["123S1871092288"],
+     False: ["AA751PEE00188854", "W123456789"],
+ }

csv_detective/formats/code_postal.py
@@ -0,0 +1,25 @@
+ from frformat import CodePostal
+
+ proportion = 0.9
+ tags = ["fr", "geo"]
+ labels = [
+     "code postal",
+     "postal code",
+     "postcode",
+     "post code",
+     "cp",
+     "codes postaux",
+     "location postcode",
+ ]
+
+ _code_postal = CodePostal()
+
+
+ def _is(val):
+     return isinstance(val, str) and _code_postal.is_valid(val)
+
+
+ _test_values = {
+     True: ["75020", "01000"],
+     False: ["77777", "018339"],
+ }

csv_detective/formats/code_region.py
@@ -0,0 +1,22 @@
+ from frformat import CodeRegion, Millesime
+
+ proportion = 1
+ tags = ["fr", "geo"]
+ labels = [
+     "code region",
+     "reg",
+     "code insee region",
+     "region",
+ ]
+
+ _code_region = CodeRegion(Millesime.LATEST)
+
+
+ def _is(val):
+     return isinstance(val, str) and _code_region.is_valid(val)
+
+
+ _test_values = {
+     True: ["32"],
+     False: ["55"],
+ }

csv_detective/formats/code_rna.py
@@ -0,0 +1,29 @@
+ from frformat import CodeRNA
+
+ proportion = 0.9
+ tags = ["fr"]
+ labels = [
+     "code rna",
+     "rna",
+     "n° inscription association",
+     "identifiant association",
+ ]
+
+ _code_rna = CodeRNA()
+
+
+ def _is(val):
+     return isinstance(val, str) and _code_rna.is_valid(val)
+
+
+ _test_values = {
+     True: ["W751515517"],
+     False: [
+         "W111111111111111111111111111111111111",
+         "w143788974",
+         "W12",
+         "678W23456",
+         "165789325",
+         "Wa1#89sf&h",
+     ],
+ }

csv_detective/formats/code_waldec.py
@@ -0,0 +1,17 @@
+ import re
+
+ proportion = 0.9
+ tags = ["fr"]
+ labels = ["code waldec", "waldec"]
+
+ regex = r"^W\d[\dA-Z]\d{7}$"
+
+
+ def _is(val):
+     return isinstance(val, str) and bool(re.match(regex, val))
+
+
+ _test_values = {
+     True: ["W123456789", "W2D1234567"],
+     False: ["AA751PEE00188854"],
+ }

csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py}
@@ -1,16 +1,27 @@
- from frformat import Commune, Millesime, Options
-
- PROPORTION = 0.9
-
- _options = Options(
-     ignore_case=True,
-     ignore_accents=True,
-     replace_non_alphanumeric_with_space=True,
-     ignore_extra_whitespace=True,
- )
- _commune = Commune(Millesime.LATEST, _options)
-
-
- def _is(val):
-     """Match avec le nom des communes"""
-     return isinstance(val, str) and _commune.is_valid(val)
+ from frformat import Commune, Millesime, Options
+
+ proportion = 0.8
+ tags = ["fr", "geo"]
+ labels = [
+     "commune",
+     "ville",
+     "libelle commune",
+ ]
+
+ _options = Options(
+     ignore_case=True,
+     ignore_accents=True,
+     replace_non_alphanumeric_with_space=True,
+     ignore_extra_whitespace=True,
+ )
+ _commune = Commune(Millesime.LATEST, _options)
+
+
+ def _is(val):
+     return isinstance(val, str) and _commune.is_valid(val)
+
+
+ _test_values = {
+     True: ["saint denis"],
+     False: ["new york", "lion"],
+ }

csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py}
@@ -1,19 +1,31 @@
- from os.path import dirname, join
-
- from csv_detective.parsing.text import _process_text
-
- PROPORTION = 1
- f = open(join(dirname(__file__), "csp_insee.txt"), "r")
- codes_insee = f.read().split("\n")
- # removing empty str due to additionnal line in file
- del codes_insee[-1]
- codes_insee = set(codes_insee)
- f.close()
-
-
- def _is(val):
-     """Repère les csp telles que définies par l'INSEE"""
-     if not isinstance(val, str):
-         return False
-     val = _process_text(val)
-     return val in codes_insee
+ from os.path import dirname, join
+
+ from csv_detective.parsing.text import _process_text
+
+ proportion = 1
+ tags = ["fr"]
+ labels = [
+     "csp insee",
+     "csp",
+     "categorie socioprofessionnelle",
+ ]
+
+ f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r")
+ codes_insee = f.read().split("\n")
+ # removing empty str due to additionnal line in file
+ del codes_insee[-1]
+ codes_insee = set(codes_insee)
+ f.close()
+
+
+ def _is(val):
+     if not isinstance(val, str):
+         return False
+     val = _process_text(val)
+     return val in codes_insee
+
+
+ _test_values = {
+     True: ["employes de la poste"],
+     False: ["super-heros"],
+ }

csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py}
@@ -1,62 +1,99 @@
- import re
- from datetime import datetime
-
- from dateparser import parse as date_parser
- from dateutil.parser import ParserError
- from dateutil.parser import parse as dateutil_parser
-
- PROPORTION = 1
- # /!\ this is only for dates, not datetimes which are handled by other utils
-
-
- def date_casting(val: str) -> datetime | None:
-     """For performance reasons, we try first with dateutil and fallback on dateparser"""
-     try:
-         return dateutil_parser(val)
-     except ParserError:
-         return date_parser(val)
-     except Exception:
-         return None
-
-
- seps = r"[\s/\-\*_\|;.,]"
- # matches JJ-MM-AAAA with any of the listed separators
- jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
-     "SEP", seps
- )
- # matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
- aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
-     "SEP", seps + "?"
- )
- # matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
- string_month_pattern = (
-     r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
-     r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
-     r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
-     r"([0-9]{2}$|(19|20)[0-9]{2}$)"
- ).replace("SEP", seps + "?")
-
- threshold = 0.3
-
-
- def _is(val):
-     """Renvoie True si val peut être une date, False sinon"""
-     # early stops, to cut processing time
-     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
-         return False
-     # if it's a usual date pattern
-     if any(
-         # with this syntax, if any of the first value is True, the next ones are not computed
-         [
-             bool(re.match(jjmmaaaa_pattern, val))
-             or bool(re.match(aaaammjj_pattern, val))
-             or bool(re.match(string_month_pattern, val, re.IGNORECASE))
-         ]
-     ):
-         return True
-     if sum([char.isdigit() for char in val]) / len(val) < threshold:
-         return False
-     res = date_casting(val)
-     if not res or res.hour or res.minute or res.second:
-         return False
-     return True
+ import re
+ from datetime import datetime
+
+ from dateparser import parse as date_parser
+ from dateutil.parser import ParserError
+ from dateutil.parser import parse as dateutil_parser
+
+ proportion = 1
+ tags = ["temp", "type"]
+ SHARED_DATE_LABELS = [
+     "date",
+     "mise à jour",
+     "modifie",
+     "maj",
+     "datemaj",
+     "update",
+     "created",
+     "modified",
+ ]
+ labels = SHARED_DATE_LABELS + [
+     "jour",
+     "periode",
+     "dpc",
+     "yyyymmdd",
+     "aaaammjj",
+ ]
+
+
+ def date_casting(val: str) -> datetime | None:
+     """For performance reasons, we try first with dateutil and fallback on dateparser"""
+     try:
+         return dateutil_parser(val)
+     except ParserError:
+         return date_parser(val)
+     except Exception:
+         return None
+
+
+ threshold = 0.3
+ seps = r"[\s/\-\*_\|;.,]"
+ # matches JJ-MM-AAAA with any of the listed separators
+ jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
+     "SEP", seps
+ )
+ # matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
+ aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
+     "SEP", seps + "?"
+ )
+ # matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
+ string_month_pattern = (
+     r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
+     r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
+     r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
+     r"([0-9]{2}$|(19|20)[0-9]{2}$)"
+ ).replace("SEP", seps + "?")
+
+
+ def _is(val):
+     # early stops, to cut processing time
+     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
+         return False
+     # if it's a usual date pattern
+     if any(
+         # with this syntax, if any of the first value is True, the next ones are not computed
+         [
+             bool(re.match(jjmmaaaa_pattern, val))
+             or bool(re.match(aaaammjj_pattern, val))
+             or bool(re.match(string_month_pattern, val, re.IGNORECASE))
+         ]
+     ):
+         return True
+     if sum([char.isdigit() for char in val]) / len(val) < threshold:
+         return False
+     res = date_casting(val)
+     if not res or res.hour or res.minute or res.second:
+         return False
+     return True
+
+
+ _test_values = {
+     True: [
+         "1960-08-07",
+         "12/02/2007",
+         "15 jan 1985",
+         "15 décembre 1985",
+         "02 05 2003",
+         "20030502",
+         "1993-12/02",
+     ],
+     False: [
+         "1993-1993-1993",
+         "39-10-1993",
+         "19-15-1993",
+         "15 tambour 1985",
+         "12152003",
+         "20031512",
+         "02052003",
+     ],
+ }

csv_detective/formats/date_fr.py
@@ -0,0 +1,22 @@
+ import re
+
+ from csv_detective.parsing.text import _process_text
+
+ proportion = 1
+ tags = ["fr", "temp"]
+ labels = ["date"]
+
+ pattern = (
+     r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
+     r"|octobre|novembre|decembre)[ \-/]\d{4}$"
+ )
+
+
+ def _is(val):
+     return isinstance(val, str) and bool(re.match(pattern, _process_text(val)))
+
+
+ _test_values = {
+     True: ["13 février 1996", "15 decembre 2024"],
+     False: ["44 march 2025"],
+ }

csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py}
@@ -1,12 +1,12 @@
  import re
- from typing import Any

- from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
+ from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting

- PROPORTION = 1
- threshold = 0.7
+ proportion = 1
+ tags = ["temp", "type"]
+ labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]

- # matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
+ threshold = 0.7
  pat = (
      aaaammjj_pattern.replace("$", "")
      + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
@@ -14,8 +14,7 @@ pat = (
  )


- def _is(val: Any | None) -> bool:
-     """Detects timezone-aware datetimes only"""
+ def _is(val):
      # early stops, to cut processing time
      # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
      # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
@@ -32,3 +31,15 @@ def _is(val: Any | None) -> bool:
          and bool(res.hour or res.minute or res.second or res.microsecond)
          and bool(res.tzinfo)
      )
+
+
+ _test_values = {
+     True: [
+         "2021-06-22 10:20:10-04:00",
+         "2030-06-22 00:00:00.0028+02:00",
+         "2000-12-21 10:20:10.1Z",
+         "2024-12-19T10:53:36.428000+00:00",
+         "1996/06/22 10:20:10 GMT",
+     ],
+     False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
+ }

csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py}
@@ -1,9 +1,11 @@
  import re
  from typing import Any

- from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
+ from csv_detective.formats.date import aaaammjj_pattern, date_casting
+ from csv_detective.formats.datetime_aware import labels  # noqa

- PROPORTION = 1
+ proportion = 1
+ tags = ["temp", "type"]
  threshold = 0.7

  # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
@@ -27,3 +29,20 @@ def _is(val: Any | None) -> bool:
          return False
      res = date_casting(val)
      return res is not None and not bool(res.tzinfo)
+
+
+ _test_values = {
+     True: [
+         "2021-06-22 10:20:10",
+         "2030/06-22 00:00:00",
+         "2030/06/22 00:00:00.0028",
+     ],
+     False: [
+         "2021-06-22T30:20:10",
+         "Sun, 06 Nov 1994 08:49:37 GMT",
+         "2021-06-44 10:20:10+02:00",
+         "1999-12-01T00:00:00Z",
+         "2021-06-44",
+         "15 décembre 1985",
+     ],
+ }

csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py}
@@ -1,18 +1,24 @@
- import re
-
- PROPORTION = 1
-
-
- def _is(val):
-     """Renvoie True si val peut être une date au format rfc822, False sinon
-     Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
-
-     return isinstance(val, str) and bool(
-         re.match(
-             r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
-             r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
-             r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
-             val.lower(),
-             re.IGNORECASE,
-         )
-     )
+ import re
+
+ from csv_detective.formats.datetime_aware import labels  # noqa
+
+ proportion = 1
+ tags = ["temp", "type"]
+
+
+ def _is(val):
+     return isinstance(val, str) and bool(
+         re.match(
+             r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
+             r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
+             r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
+             val.lower(),
+             re.IGNORECASE,
+         )
+     )
+
+
+ _test_values = {
+     True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
+     False: ["2021-06-22T10:20:10"],
+ }

csv_detective/formats/departement.py
@@ -0,0 +1,37 @@
+ from frformat import Departement, Millesime, Options
+
+ proportion = 0.9
+ tags = ["fr", "geo"]
+ labels = [
+     "departement",
+     "libelle du departement",
+     "deplib",
+     "nom dept",
+     "dept",
+     "libdepartement",
+     "nom departement",
+     "libelle dep",
+     "libelle departement",
+     "lb departements",
+     "dep libusage",
+     "lb departement",
+     "nom dep",
+ ]
+
+ _options = Options(
+     ignore_case=True,
+     ignore_accents=True,
+     replace_non_alphanumeric_with_space=True,
+     ignore_extra_whitespace=True,
+ )
+ _departement = Departement(Millesime.LATEST, _options)
+
+
+ def _is(val):
+     return isinstance(val, str) and _departement.is_valid(val)
+
+
+ _test_values = {
+     True: ["essonne"],
+     False: ["alabama", "auvergne"],
+ }

csv_detective/formats/email.py
@@ -0,0 +1,28 @@
+ import re
+
+ proportion = 0.9
+ labels = [
+     "email",
+     "mail",
+     "courriel",
+     "contact",
+     "mel",
+     "lieucourriel",
+     "coordinates.emailcontact",
+     "e mail",
+     "mo mail",
+     "adresse mail",
+     "adresse email",
+ ]
+
+
+ def _is(val):
+     return isinstance(val, str) and bool(
+         re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
+     )
+
+
+ _test_values = {
+     True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
+     False: ["cdo@@gouv.sfd"],
+ }

csv_detective/{detect_fields/other/float/__init__.py → formats/float.py}
@@ -1,21 +1,29 @@
- PROPORTION = 1
-
-
- def float_casting(val: str) -> float:
-     return float(val.replace(",", "."))
-
-
- def _is(val):
-     """Detects floats, assuming that tables will not have scientific
-     notations (3e6) or "+" in the string. "-" is still accepted."""
-     try:
-         if (
-             not isinstance(val, str)
-             or any([k in val for k in ["_", "+", "e", "E"]])
-             or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
-         ):
-             return False
-         float_casting(val)
-         return True
-     except ValueError:
-         return False
+ proportion = 1
+ tags = ["type"]
+ labels = ["part", "ratio", "taux"]
+
+
+ def float_casting(val: str) -> float:
+     return float(val.replace(",", "."))
+
+
+ def _is(val):
+     """Detects floats, assuming that tables will not have scientific
+     notations (3e6) or "+" in the string. "-" is still accepted."""
+     try:
+         if (
+             not isinstance(val, str)
+             or any([k in val for k in ["_", "+", "e", "E"]])
+             or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
+         ):
+             return False
+         float_casting(val)
+         return True
+     except ValueError:
+         return False
+
+
+ _test_values = {
+     True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
+     False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
+ }