csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -0,0 +1,116 @@
1
+ from csv_detective.parsing.text import _process_text
2
+
3
+ proportion = 0.55
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "adresse",
7
+ "localisation",
8
+ "adresse postale",
9
+ "adresse geographique",
10
+ "adr",
11
+ "adresse complete",
12
+ "adresse station",
13
+ ]
14
+
15
+ voies = {
16
+ "aire ",
17
+ "allee ",
18
+ "avenue ",
19
+ "base ",
20
+ "boulevard ",
21
+ "cami ",
22
+ "carrefour ",
23
+ "chemin ",
24
+ "cheminement ",
25
+ "chaussee ",
26
+ "cite ",
27
+ "clos ",
28
+ "coin ",
29
+ "corniche ",
30
+ "cote ",
31
+ "cour ",
32
+ "cours ",
33
+ "domaine ",
34
+ "descente ",
35
+ "ecart ",
36
+ "esplanade ",
37
+ "faubourg ",
38
+ "gare ",
39
+ "grande rue",
40
+ "hameau ",
41
+ "halle ",
42
+ "ilot ",
43
+ "impasse ",
44
+ "lieu dit",
45
+ "lotissement ",
46
+ "marche ",
47
+ "montee ",
48
+ "parc ",
49
+ "passage ",
50
+ "place ",
51
+ "plan ",
52
+ "plaine ",
53
+ "plateau ",
54
+ "pont ",
55
+ "port ",
56
+ "promenade ",
57
+ "parvis ",
58
+ "quartier ",
59
+ "quai ",
60
+ "residence ",
61
+ "ruelle ",
62
+ "rocade ",
63
+ "rond point",
64
+ "route ",
65
+ "rue ",
66
+ # 'sente - sentier',
67
+ "square ",
68
+ "tour ",
69
+ # 'terre-plein',
70
+ "traverse ",
71
+ "villa ",
72
+ "village ",
73
+ "voie ",
74
+ "zone artisanale",
75
+ "zone d’amenagement concerte",
76
+ "zone d’amenagement differe",
77
+ "zone industrielle",
78
+ "zone ",
79
+ # 'r',
80
+ "av ",
81
+ "pl ",
82
+ "bd ",
83
+ "cami ",
84
+ # 'che',
85
+ "chs ",
86
+ "dom ",
87
+ "ham ",
88
+ "ld ",
89
+ # 'pro',
90
+ # 'rte',
91
+ "vlge ",
92
+ "za ",
93
+ "zac ",
94
+ "zad ",
95
+ "zi ",
96
+ # 'car',
97
+ "fg ",
98
+ # 'lot',
99
+ "imp ",
100
+ # 'qu',
101
+ "mte",
102
+ }
103
+
104
+
105
def _is(val):
    """Detect address-like values.

    Non-string values and strings longer than 150 characters are rejected.
    Otherwise the value is passed through `_process_text` and accepted as
    soon as any entry of `voies` occurs in it as a substring.
    """
    if isinstance(val, str) and len(val) <= 150:
        normalized = _process_text(val)
        for voie in voies:
            if voie in normalized:
                return True
    return False
111
+
112
+
113
+ _test_values = {
114
+ True: ["rue du martyr"],
115
+ False: ["un batiment"],
116
+ }
@@ -0,0 +1,26 @@
1
+ import codecs
2
+
3
+ proportion = 1
4
+ tags = ["type"]
5
+ labels = ["bytes", "binary", "image", "encode", "content"]
6
+
7
+
8
+ def binary_casting(val: str) -> bytes:
9
+ return codecs.escape_decode(val[2:-1])[0]
10
+
11
+
12
+ def _is(val) -> bool:
13
+ if isinstance(val, str) and (
14
+ (val.startswith("b'") and val.endswith("'")) or (val.startswith('b"') and val.endswith('"'))
15
+ ):
16
+ try:
17
+ return isinstance(binary_casting(val), bytes)
18
+ except Exception:
19
+ return False
20
+ return False
21
+
22
+
23
+ _test_values = {
24
+ True: ["b'\x01\x01'", 'b"\x01\x01\x00\x00\x00;\xb7\xd4\xc5_)J\xc0\xcb\x16>\x9e\xd1\xc4\x13@"'],
25
+ False: ["bytes", 'b"ytes'],
26
+ }
@@ -0,0 +1,35 @@
1
+ proportion = 1
2
+ tags = ["type"]
3
+ labels = ["is ", "has ", "est "]
4
+
5
+ bool_mapping = {
6
+ "1": True,
7
+ "0": False,
8
+ "vrai": True,
9
+ "faux": False,
10
+ "true": True,
11
+ "false": False,
12
+ "oui": True,
13
+ "non": False,
14
+ "yes": True,
15
+ "no": False,
16
+ "y": True,
17
+ "n": False,
18
+ "o": True,
19
+ }
20
+
21
+ liste_bool = set(bool_mapping.keys())
22
+
23
+
24
+ def bool_casting(val: str) -> bool:
25
+ return bool_mapping.get(val.lower())
26
+
27
+
28
+ def _is(val):
29
+ return isinstance(val, str) and val.lower() in liste_bool
30
+
31
+
32
+ _test_values = {
33
+ True: ["oui", "0", "1", "yes", "false", "True"],
34
+ False: ["nein", "ja", "2", "-0"],
35
+ }
@@ -0,0 +1,26 @@
1
+ from frformat import CodeCommuneInsee, Millesime
2
+
3
+ proportion = 0.75
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "code commune insee",
7
+ "code insee",
8
+ "codes insee",
9
+ "code commune",
10
+ "code insee commune",
11
+ "insee",
12
+ "code com",
13
+ "com",
14
+ ]
15
+
16
+ _code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
17
+
18
+
19
def _is(val):
    """Return whether `val` is a string accepted by the `_code_commune_insee` validator."""
    if not isinstance(val, str):
        return False
    return _code_commune_insee.is_valid(val)
21
+
22
+
23
+ _test_values = {
24
+ True: ["91471", "01053"],
25
+ False: ["914712", "01000"],
26
+ }
@@ -0,0 +1,36 @@
1
+ import re
2
+
3
+ from csv_detective.parsing.text import _process_text
4
+
5
+ proportion = 1
6
+ tags = ["fr"]
7
+ labels = ["code csp insee", "code csp"]
8
+
9
+
10
def _is(val):
    """Detect INSEE CSP codes.

    After `_process_text`, the value must be exactly 4 characters long and
    either match digit 1-6, two digits and a letter a-l, or be one of the
    listed all-digit codes.
    """
    if not isinstance(val, str):
        return False
    cleaned = _process_text(val)
    if len(cleaned) != 4:
        return False
    if re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", cleaned):
        return True
    # codes that do not follow the "3 digits + letter" layout
    return cleaned in {
        "7100",
        "7200",
        "7400",
        "7500",
        "7700",
        "7800",
        "8100",
        "8300",
        "8400",
        "8500",
        "8600",
    }
31
+
32
+
33
+ _test_values = {
34
+ True: ["121f"],
35
+ False: ["121x"],
36
+ }
@@ -0,0 +1,29 @@
1
+ from frformat import Millesime, NumeroDepartement, Options
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "code departement",
7
+ "code_departement",
8
+ "dep",
9
+ "departement",
10
+ "dept",
11
+ ]
12
+
13
+ _options = Options(
14
+ ignore_case=True,
15
+ ignore_accents=True,
16
+ replace_non_alphanumeric_with_space=True,
17
+ ignore_extra_whitespace=True,
18
+ )
19
+ _numero_departement = NumeroDepartement(Millesime.LATEST, _options)
20
+
21
+
22
def _is(val):
    """Return whether `val` is a string accepted by the `_numero_departement` validator."""
    if not isinstance(val, str):
        return False
    return _numero_departement.is_valid(val)
24
+
25
+
26
+ _test_values = {
27
+ True: ["75", "2A", "2b", "974", "01"],
28
+ False: ["00", "96", "101"],
29
+ }
@@ -0,0 +1,21 @@
1
+ from frformat import CodeFantoir
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "cadastre1",
7
+ "code fantoir",
8
+ "fantoir",
9
+ ]
10
+
11
+ _code_fantoir = CodeFantoir()
12
+
13
+
14
def _is(val):
    """Return whether `val` is a string accepted by the `_code_fantoir` validator."""
    if not isinstance(val, str):
        return False
    return _code_fantoir.is_valid(val)
16
+
17
+
18
+ _test_values = {
19
+ True: ["7755A", "B150B", "ZA04C", "ZB03D"],
20
+ False: ["7755", "ZA99A"],
21
+ }
@@ -0,0 +1,17 @@
1
+ import re
2
+
3
+ proportion = 0.9
4
+ tags = ["fr"]
5
+ labels = ["code"]
6
+
7
+ regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$"
8
+
9
+
10
+ def _is(val):
11
+ return isinstance(val, str) and bool(re.match(regex, val))
12
+
13
+
14
+ _test_values = {
15
+ True: ["123S1871092288"],
16
+ False: ["AA751PEE00188854", "W123456789"],
17
+ }
@@ -0,0 +1,25 @@
1
+ from frformat import CodePostal
2
+
3
+ proportion = 0.9
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "code postal",
7
+ "postal code",
8
+ "postcode",
9
+ "post code",
10
+ "cp",
11
+ "codes postaux",
12
+ "location postcode",
13
+ ]
14
+
15
+ _code_postal = CodePostal()
16
+
17
+
18
def _is(val):
    """Return whether `val` is a string accepted by the `_code_postal` validator."""
    if not isinstance(val, str):
        return False
    return _code_postal.is_valid(val)
20
+
21
+
22
+ _test_values = {
23
+ True: ["75020", "01000"],
24
+ False: ["77777", "018339"],
25
+ }
@@ -0,0 +1,22 @@
1
+ from frformat import CodeRegion, Millesime
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "code region",
7
+ "reg",
8
+ "code insee region",
9
+ "region",
10
+ ]
11
+
12
+ _code_region = CodeRegion(Millesime.LATEST)
13
+
14
+
15
def _is(val):
    """Return whether `val` is a string accepted by the `_code_region` validator."""
    if not isinstance(val, str):
        return False
    return _code_region.is_valid(val)
17
+
18
+
19
+ _test_values = {
20
+ True: ["32"],
21
+ False: ["55"],
22
+ }
@@ -0,0 +1,29 @@
1
+ from frformat import CodeRNA
2
+
3
+ proportion = 0.9
4
+ tags = ["fr"]
5
+ labels = [
6
+ "code rna",
7
+ "rna",
8
+ "n° inscription association",
9
+ "identifiant association",
10
+ ]
11
+
12
+ _code_rna = CodeRNA()
13
+
14
+
15
def _is(val):
    """Return whether `val` is a string accepted by the `_code_rna` validator."""
    if not isinstance(val, str):
        return False
    return _code_rna.is_valid(val)
17
+
18
+
19
+ _test_values = {
20
+ True: ["W751515517"],
21
+ False: [
22
+ "W111111111111111111111111111111111111",
23
+ "w143788974",
24
+ "W12",
25
+ "678W23456",
26
+ "165789325",
27
+ "Wa1#89sf&h",
28
+ ],
29
+ }
@@ -0,0 +1,17 @@
1
+ import re
2
+
3
+ proportion = 0.9
4
+ tags = ["fr"]
5
+ labels = ["code waldec", "waldec"]
6
+
7
+ regex = r"^W\d[\dA-Z]\d{7}$"
8
+
9
+
10
+ def _is(val):
11
+ return isinstance(val, str) and bool(re.match(regex, val))
12
+
13
+
14
+ _test_values = {
15
+ True: ["W123456789", "W2D1234567"],
16
+ False: ["AA751PEE00188854"],
17
+ }
@@ -0,0 +1,27 @@
1
+ from frformat import Commune, Millesime, Options
2
+
3
+ proportion = 0.8
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "commune",
7
+ "ville",
8
+ "libelle commune",
9
+ ]
10
+
11
+ _options = Options(
12
+ ignore_case=True,
13
+ ignore_accents=True,
14
+ replace_non_alphanumeric_with_space=True,
15
+ ignore_extra_whitespace=True,
16
+ )
17
+ _commune = Commune(Millesime.LATEST, _options)
18
+
19
+
20
def _is(val):
    """Return whether `val` is a string accepted by the `_commune` validator."""
    if not isinstance(val, str):
        return False
    return _commune.is_valid(val)
22
+
23
+
24
+ _test_values = {
25
+ True: ["saint denis"],
26
+ False: ["new york", "lion"],
27
+ }
@@ -0,0 +1,31 @@
1
+ from os.path import dirname, join
2
+
3
+ from csv_detective.parsing.text import _process_text
4
+
5
+ proportion = 1
6
+ tags = ["fr"]
7
+ labels = [
8
+ "csp insee",
9
+ "csp",
10
+ "categorie socioprofessionnelle",
11
+ ]
12
+
13
# Load the reference values (one per line) shipped next to this module.
# The context manager guarantees the file is closed even if reading fails.
with open(join(dirname(__file__), "data", "csp_insee.txt"), "r", encoding="utf-8") as f:
    # Drop empty entries (such as the one produced by a trailing newline)
    # instead of blindly deleting the last element, which would discard a
    # real value if the file did not end with a newline.
    codes_insee = {line for line in f.read().split("\n") if line}
19
+
20
+
21
def _is(val):
    """Return True when `val` is a string whose `_process_text` form is in `codes_insee`."""
    return isinstance(val, str) and _process_text(val) in codes_insee
26
+
27
+
28
+ _test_values = {
29
+ True: ["employes de la poste"],
30
+ False: ["super-heros"],
31
+ }
@@ -0,0 +1,99 @@
1
+ import re
2
+ from datetime import datetime
3
+
4
+ from dateparser import parse as date_parser
5
+ from dateutil.parser import ParserError
6
+ from dateutil.parser import parse as dateutil_parser
7
+
8
+ proportion = 1
9
+ tags = ["temp", "type"]
10
+ SHARED_DATE_LABELS = [
11
+ "date",
12
+ "mise à jour",
13
+ "modifie",
14
+ "maj",
15
+ "datemaj",
16
+ "update",
17
+ "created",
18
+ "modified",
19
+ ]
20
+ labels = SHARED_DATE_LABELS + [
21
+ "jour",
22
+ "periode",
23
+ "dpc",
24
+ "yyyymmdd",
25
+ "aaaammjj",
26
+ ]
27
+
28
+
29
def date_casting(val: str) -> datetime | None:
    """Parse `val` into a datetime, or return None when parsing fails.

    For performance reasons dateutil is tried first; dateparser is only used
    as a fallback when dateutil raises ParserError.
    """
    try:
        return dateutil_parser(val)
    except ParserError:
        # dateutil could not parse the value: fall through to dateparser
        pass
    except Exception:
        return None
    # The fallback gets its own try block: an exception raised inside an
    # `except` handler is not caught by a sibling `except` clause, so it
    # would otherwise propagate to the caller.
    try:
        return date_parser(val)
    except Exception:
        return None
37
+
38
+
39
# minimal share of digits a value must contain before a full parse is attempted
threshold = 0.3
# separators accepted between the parts of a date
seps = r"[\s/\-\*_\|;.,]"
# matches JJ-MM-AAAA with any of the listed separators
jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
    "SEP", seps
)
# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
    "SEP", seps + "?"
)
# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
# (only the upper-case "SEP" placeholders are substituted; the lower-case
# "sep" / "septembre" month names are left untouched by str.replace)
string_month_pattern = (
    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
    r"mai|juin|juillet|aout|septembre|octobre|novembre|decembre)SEP"
    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
).replace("SEP", seps + "?")
56
+
57
+
58
def _is(val):
    """Detect dates (values without a time component).

    Returns True when the value matches one of the usual date patterns, or
    when `date_casting` parses it into a datetime whose hour, minute and
    second are all zero.
    """
    # early stops, to cut processing time
    if not isinstance(val, str) or not 8 <= len(val) <= 20:
        return False
    # usual date patterns: `or` short-circuits, so later regexes are only
    # evaluated when the previous ones did not match
    if (
        re.match(jjmmaaaa_pattern, val)
        or re.match(aaaammjj_pattern, val)
        or re.match(string_month_pattern, val, re.IGNORECASE)
    ):
        return True
    # too few digits to be a date: skip the costly parsing
    if sum(char.isdigit() for char in val) / len(val) < threshold:
        return False
    res = date_casting(val)
    # a value carrying a time of day is not a plain date
    if not res or res.hour or res.minute or res.second:
        return False
    return True
78
+
79
+
80
+ _test_values = {
81
+ True: [
82
+ "1960-08-07",
83
+ "12/02/2007",
84
+ "15 jan 1985",
85
+ "15 décembre 1985",
86
+ "02 05 2003",
87
+ "20030502",
88
+ "1993-12/02",
89
+ ],
90
+ False: [
91
+ "1993-1993-1993",
92
+ "39-10-1993",
93
+ "19-15-1993",
94
+ "15 tambour 1985",
95
+ "12152003",
96
+ "20031512",
97
+ "02052003",
98
+ ],
99
+ }
@@ -0,0 +1,22 @@
1
+ import re
2
+
3
+ from csv_detective.parsing.text import _process_text
4
+
5
+ proportion = 1
6
+ tags = ["fr", "temp"]
7
+ labels = ["date"]
8
+
9
+ pattern = (
10
+ r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
11
+ r"|octobre|novembre|decembre)[ \-/]\d{4}$"
12
+ )
13
+
14
+
15
def _is(val):
    """Return True when `val` is a string whose `_process_text` form matches `pattern`."""
    if not isinstance(val, str):
        return False
    return re.match(pattern, _process_text(val)) is not None
17
+
18
+
19
+ _test_values = {
20
+ True: ["13 février 1996", "15 decembre 2024"],
21
+ False: ["44 march 2025"],
22
+ }
@@ -0,0 +1,45 @@
1
+ import re
2
+
3
+ from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting
4
+
5
+ proportion = 1
6
+ tags = ["temp", "type"]
7
+ labels = SHARED_DATE_LABELS + ["datetime", "timestamp"]
8
+
9
+ threshold = 0.7
10
# date part (AAAA-MM-JJ, end anchor removed), then "T" or whitespace,
# HH:MM:SS, optional fractional seconds, and a mandatory UTC offset or "Z"
pat = (
    aaaammjj_pattern.replace("$", "")
    # the decimal separator is matched explicitly: a bare `.` would accept
    # any character before the fractional digits
    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])([.,]\d{1,6})"
    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
)
15
+
16
+
17
def _is(val):
    """Detect timezone-aware datetimes.

    Returns True when the value matches `pat`, or when `date_casting` yields
    a datetime that has both a non-zero time component and a tzinfo.
    """
    # cheap rejections first, to cut processing time:
    # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
    # 32 is the maximal length of an ISO datetime format
    # YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
    if not isinstance(val, str):
        return False
    if not 16 <= len(val) <= 35:
        return False
    # usual format: no need to parse
    if re.match(pat, val):
        return True
    expected_chars = {"-", "/", ":", " "}
    hits = sum(1 for char in val if char.isdigit() or char in expected_chars)
    if hits / len(val) < threshold:
        return False
    parsed = date_casting(val)
    if parsed is None:
        return False
    has_time = bool(parsed.hour or parsed.minute or parsed.second or parsed.microsecond)
    return has_time and bool(parsed.tzinfo)
34
+
35
+
36
+ _test_values = {
37
+ True: [
38
+ "2021-06-22 10:20:10-04:00",
39
+ "2030-06-22 00:00:00.0028+02:00",
40
+ "2000-12-21 10:20:10.1Z",
41
+ "2024-12-19T10:53:36.428000+00:00",
42
+ "1996/06/22 10:20:10 GMT",
43
+ ],
44
+ False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
45
+ }
@@ -0,0 +1,48 @@
1
+ import re
2
+ from typing import Any
3
+
4
+ from csv_detective.formats.date import aaaammjj_pattern, date_casting
5
+ from csv_detective.formats.datetime_aware import labels # noqa
6
+
7
+ proportion = 1
8
+ tags = ["temp", "type"]
9
+ threshold = 0.7
10
+
11
# matches AAAA-MM-JJTHH:MM:SS(.dddddd) with any of the listed separators for the date OR NO SEPARATOR
# (no offset or "Z" suffix is allowed here: the pattern ends right after the time)
pat = (
    aaaammjj_pattern.replace("$", "")
    # the decimal separator is matched explicitly: a bare `.` would accept
    # any character before the fractional digits
    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])([.,]\d{1,6})?$"
)
16
+
17
+
18
def _is(val: Any | None) -> bool:
    """Detect naive datetimes only (no timezone information).

    Returns True when the value matches `pat`, or when `date_casting` yields
    a datetime without tzinfo.
    """
    # cheap rejections first, to cut processing time:
    # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS
    # 26 is the maximal length of an ISO datetime format
    # YYYY-MM-DDTHH:MM:SS.dddddd, keeping some slack
    if not isinstance(val, str):
        return False
    if not 15 <= len(val) <= 30:
        return False
    # usual format: no need to parse
    if re.match(pat, val):
        return True
    expected_chars = {"-", "/", ":", " "}
    hits = sum(1 for char in val if char.isdigit() or char in expected_chars)
    if hits / len(val) < threshold:
        return False
    parsed = date_casting(val)
    if parsed is None:
        return False
    return not bool(parsed.tzinfo)
32
+
33
+
34
+ _test_values = {
35
+ True: [
36
+ "2021-06-22 10:20:10",
37
+ "2030/06-22 00:00:00",
38
+ "2030/06/22 00:00:00.0028",
39
+ ],
40
+ False: [
41
+ "2021-06-22T30:20:10",
42
+ "Sun, 06 Nov 1994 08:49:37 GMT",
43
+ "2021-06-44 10:20:10+02:00",
44
+ "1999-12-01T00:00:00Z",
45
+ "2021-06-44",
46
+ "15 décembre 1985",
47
+ ],
48
+ }