csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -0,0 +1,24 @@
1
+ import re
2
+
3
+ from csv_detective.formats.datetime_aware import labels # noqa
4
+
5
+ proportion = 1
6
+ tags = ["temp", "type"]
7
+
8
+
9
+ def _is(val):
10
+ return isinstance(val, str) and bool(
11
+ re.match(
12
+ r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
13
+ r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
14
+ r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
15
+ val.lower(),
16
+ re.IGNORECASE,
17
+ )
18
+ )
19
+
20
+
21
+ _test_values = {
22
+ True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
23
+ False: ["2021-06-22T10:20:10"],
24
+ }
@@ -0,0 +1,37 @@
1
+ from frformat import Departement, Millesime, Options
2
+
3
+ proportion = 0.9
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "departement",
7
+ "libelle du departement",
8
+ "deplib",
9
+ "nom dept",
10
+ "dept",
11
+ "libdepartement",
12
+ "nom departement",
13
+ "libelle dep",
14
+ "libelle departement",
15
+ "lb departements",
16
+ "dep libusage",
17
+ "lb departement",
18
+ "nom dep",
19
+ ]
20
+
21
+ _options = Options(
22
+ ignore_case=True,
23
+ ignore_accents=True,
24
+ replace_non_alphanumeric_with_space=True,
25
+ ignore_extra_whitespace=True,
26
+ )
27
+ _departement = Departement(Millesime.LATEST, _options)
28
+
29
+
30
def _is(val):
    """Tell whether ``val`` is a string accepted by the ``_departement`` validator.

    Non-string values are rejected without being passed to the validator.
    """
    if not isinstance(val, str):
        return False
    return _departement.is_valid(val)
32
+
33
+
34
+ _test_values = {
35
+ True: ["essonne"],
36
+ False: ["alabama", "auvergne"],
37
+ }
@@ -0,0 +1,28 @@
1
+ import re
2
+
3
+ proportion = 0.9
4
+ labels = [
5
+ "email",
6
+ "mail",
7
+ "courriel",
8
+ "contact",
9
+ "mel",
10
+ "lieucourriel",
11
+ "coordinates.emailcontact",
12
+ "e mail",
13
+ "mo mail",
14
+ "adresse mail",
15
+ "adresse email",
16
+ ]
17
+
18
+
19
+ def _is(val):
20
+ return isinstance(val, str) and bool(
21
+ re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
22
+ )
23
+
24
+
25
+ _test_values = {
26
+ True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
27
+ False: ["cdo@@gouv.sfd"],
28
+ }
@@ -0,0 +1,29 @@
1
+ proportion = 1
2
+ tags = ["type"]
3
+ labels = ["part", "ratio", "taux"]
4
+
5
+
6
+ def float_casting(val: str) -> float:
7
+ return float(val.replace(",", "."))
8
+
9
+
10
+ def _is(val):
11
+ """Detects floats, assuming that tables will not have scientific
12
+ notations (3e6) or "+" in the string. "-" is still accepted."""
13
+ try:
14
+ if (
15
+ not isinstance(val, str)
16
+ or any([k in val for k in ["_", "+", "e", "E"]])
17
+ or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
18
+ ):
19
+ return False
20
+ float_casting(val)
21
+ return True
22
+ except ValueError:
23
+ return False
24
+
25
+
26
+ _test_values = {
27
+ True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
28
+ False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
29
+ }
@@ -0,0 +1,36 @@
1
+ import json
2
+
3
+ proportion = 1
4
+ tags = ["geo"]
5
# Label strings associated with this format; each entry appears only once.
labels = [
    "json geojson",
    "json",
    "geojson",
    "geo shape",
    "geom",
    "geometry",
    "geoshape",
]
15
+
16
+
17
+ def _is(val) -> bool:
18
+ try:
19
+ j = json.loads(val)
20
+ if isinstance(j, dict):
21
+ if "type" in j and "coordinates" in j:
22
+ return True
23
+ if "geometry" in j and "coordinates" in j["geometry"]:
24
+ return True
25
+ except Exception:
26
+ pass
27
+ return False
28
+
29
+
30
+ _test_values = {
31
+ True: [
32
+ '{"coordinates": [45.783753, 3.049342], "type": "63870"}',
33
+ '{"geometry": {"coordinates": [45.783753, 3.049342]}}',
34
+ ],
35
+ False: ['{"pomme": "fruit", "reponse": 42}'],
36
+ }
@@ -0,0 +1,31 @@
1
+ from os.path import dirname, join
2
+
3
+ from csv_detective.parsing.text import _process_text
4
+
5
+ proportion = 0.8
6
+ tags = ["fr"]
7
+ labels = [
8
+ "code ape",
9
+ "code activite (ape)",
10
+ "code naf",
11
+ "code naf organisme designe",
12
+ "code naf organisme designant",
13
+ "base sirene : code ape de l'etablissement siege",
14
+ ]
15
+
16
# Load the reference codes (one per line) from the bundled data file.
# The context manager guarantees the handle is closed even if reading fails.
with open(join(dirname(__file__), "data", "insee_ape700.txt"), "r") as f:
    # Empty entries (e.g. the one produced by a trailing newline) are dropped
    # explicitly rather than assuming the last item is always blank.
    condes_insee_ape = {line for line in f.read().split("\n") if line}
22
+
23
+
24
def _is(val):
    """Tell whether ``val`` is one of the codes loaded into ``condes_insee_ape``.

    The value is passed through ``_process_text`` and upper-cased before the
    lookup; non-string values are rejected.
    """
    if isinstance(val, str):
        normalized = _process_text(val).upper()
        return normalized in condes_insee_ape
    return False
29
+
30
+
31
+ _test_values = {True: ["0116Z"], False: ["0116A"]}
@@ -0,0 +1,28 @@
1
+ from frformat import Canton, Millesime, Options
2
+
3
+ proportion = 0.9
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "insee canton",
7
+ "canton",
8
+ "cant",
9
+ "nom canton",
10
+ ]
11
+
12
+ _options = Options(
13
+ ignore_case=True,
14
+ ignore_accents=True,
15
+ replace_non_alphanumeric_with_space=True,
16
+ ignore_extra_whitespace=True,
17
+ )
18
+ _canton = Canton(Millesime.LATEST, _options)
19
+
20
+
21
def _is(val):
    """Tell whether ``val`` is a string accepted by the ``_canton`` validator.

    Non-string values are rejected without being passed to the validator.
    """
    if not isinstance(val, str):
        return False
    return _canton.is_valid(val)
23
+
24
+
25
+ _test_values = {
26
+ True: ["nantua"],
27
+ False: ["california"],
28
+ }
@@ -0,0 +1,23 @@
1
labels = ["nb", "nombre", "nbre"]
# The other format modules expose this attribute as `tags` (plural).
tags = ["type"]
# Backward-compatible alias for the previous, misspelled attribute name.
tag = tags
3
+
4
+
5
+ def _is(val):
6
+ """Detects integers"""
7
+ if (
8
+ not isinstance(val, str)
9
+ or any([v in val for v in [".", "_", "+"]])
10
+ or (val.startswith("0") and len(val) > 1)
11
+ ):
12
+ return False
13
+ try:
14
+ int(val)
15
+ return True
16
+ except ValueError:
17
+ return False
18
+
19
+
20
+ _test_values = {
21
+ True: ["1", "0", "1764", "-24"],
22
+ False: ["01053", "1.2", "123_456", "+35"],
23
+ }
@@ -0,0 +1,30 @@
1
+ import re
2
+ from os.path import dirname, join
3
+
4
+ proportion = 1
5
+ tags = ["geo"]
6
+ labels = [
7
+ "iso country code",
8
+ "code pays",
9
+ "pays",
10
+ "country",
11
+ "nation",
12
+ "pays code",
13
+ "code pays (iso)",
14
+ ]
15
+
16
+ with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile:
17
+ liste_pays = iofile.read().split("\n")
18
+ liste_pays = set(liste_pays)
19
+
20
+
21
def _is(val):
    """Tell whether ``val`` is a two-letter upper-case code present in ``liste_pays``."""
    # Cheap shape check first, so only plausible candidates reach the lookup
    if isinstance(val, str) and re.match(r"[A-Z]{2}$", val):
        return val in liste_pays
    return False
25
+
26
+
27
+ _test_values = {
28
+ True: ["FR"],
29
+ False: ["XX", "A", "FRA"],
30
+ }
@@ -0,0 +1,30 @@
1
+ import re
2
+ from os.path import dirname, join
3
+
4
+ proportion = 1
5
+ tags = ["geo"]
6
+ labels = [
7
+ "iso country code",
8
+ "code pays",
9
+ "pays",
10
+ "country",
11
+ "nation",
12
+ "pays code",
13
+ "code pays (iso)",
14
+ ]
15
+
16
+ with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile:
17
+ liste_pays = iofile.read().split("\n")
18
+
19
+
20
# Lookup set built once at import time, instead of converting the list on
# every call; `liste_pays` itself is left untouched.
_codes_alpha3 = frozenset(liste_pays)


def _is(val):
    """Return True if ``val`` can be an ISO alpha-3 country code, False otherwise.

    ``val`` must be a string of exactly three upper-case ASCII letters that is
    present in the codes loaded into ``liste_pays``.
    """
    if not isinstance(val, str) or not re.match(r"[A-Z]{3}$", val):
        return False
    return val in _codes_alpha3
25
+
26
+
27
+ _test_values = {
28
+ True: ["FRA"],
29
+ False: ["XXX", "FR", "A"],
30
+ }
@@ -0,0 +1,31 @@
1
+ import re
2
+ from os.path import dirname, join
3
+
4
+ proportion = 1
5
+ tags = ["geo"]
6
+ labels = [
7
+ "iso country code",
8
+ "code pays",
9
+ "pays",
10
+ "country",
11
+ "nation",
12
+ "pays code",
13
+ "code pays (iso)",
14
+ ]
15
+
16
+ with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile:
17
+ liste_pays = iofile.read().split("\n")
18
+ liste_pays = set(liste_pays)
19
+
20
+
21
def _is(val):
    """Return True if ``val`` can be a numeric ISO country code, False otherwise."""
    # Cheap shape check first: three digits, then membership in the loaded codes
    if isinstance(val, str) and re.match(r"[0-9]{3}$", val):
        return val in liste_pays
    return False
26
+
27
+
28
+ _test_values = {
29
+ True: ["250"],
30
+ False: ["003"],
31
+ }
@@ -0,0 +1,41 @@
1
+ proportion = 0.8
2
+ tags = ["fr", "temp"]
3
+ labels = [
4
+ "jour semaine",
5
+ "type jour",
6
+ "jour de la semaine",
7
+ "saufjour",
8
+ "nomjour",
9
+ "jour",
10
+ "jour de fermeture",
11
+ ]
12
+
13
+ jours = {
14
+ "lundi",
15
+ "mardi",
16
+ "mercredi",
17
+ "jeudi",
18
+ "vendredi",
19
+ "samedi",
20
+ "dimanche",
21
+ "lun",
22
+ "mar",
23
+ "mer",
24
+ "jeu",
25
+ "ven",
26
+ "sam",
27
+ "dim",
28
+ }
29
+
30
+
31
+ def _is(val):
32
+ if not isinstance(val, str):
33
+ return False
34
+ val = val.lower()
35
+ return val in jours
36
+
37
+
38
+ _test_values = {
39
+ True: ["lundi"],
40
+ False: ["jour de la biere"],
41
+ }
@@ -0,0 +1,20 @@
1
+ import json
2
+ from json import JSONDecodeError
3
+
4
+ proportion = 1
5
+ tags = ["type"]
6
+
7
+
8
+ def _is(val):
9
+ try:
10
+ loaded = json.loads(val)
11
+ # we don't want to consider integers for instance
12
+ return isinstance(loaded, (list, dict))
13
+ except (JSONDecodeError, TypeError):
14
+ return False
15
+
16
+
17
+ _test_values = {
18
+ True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
19
+ False: ["5", '{"zefib":', '{"a"}'],
20
+ }
@@ -0,0 +1,48 @@
1
+ from frformat import LatitudeL93
2
+
3
+ from csv_detective.formats.float import _is as is_float
4
+ from csv_detective.formats.float import float_casting
5
+
6
+ proportion = 1
7
+ tags = ["fr", "geo"]
8
+ labels = [
9
+ "latitude",
10
+ "lat",
11
+ "y",
12
+ "yf",
13
+ "yd",
14
+ "y l93",
15
+ "coordonnee y",
16
+ "latitude lb93",
17
+ "coord y",
18
+ "ycoord",
19
+ "geocodage y gps",
20
+ "location latitude",
21
+ "ylatitude",
22
+ "ylat",
23
+ "latitude (y)",
24
+ "latitudeorg",
25
+ "coordinates.latitude",
26
+ "googlemap latitude",
27
+ "latitudelieu",
28
+ "latitude googlemap",
29
+ ]
30
+
31
+ _latitudel93 = LatitudeL93()
32
+
33
+
34
def _is(val):
    """Tell whether ``val`` is a float-like string accepted by ``_latitudel93``.

    The string is first screened with ``is_float`` and converted with
    ``float_casting`` before being handed to the validator.
    """
    try:
        if not isinstance(val, str) or not is_float(val):
            return False
        return _latitudel93.is_valid(float_casting(val))
    except (ValueError, OverflowError):
        return False
43
+
44
+
45
+ _test_values = {
46
+ True: ["6037008", "7123528.5", "7124528,5"],
47
+ False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"],
48
+ }
@@ -0,0 +1,42 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 1
4
+ tags = ["geo"]
5
+ labels = [
6
+ "latitude",
7
+ "lat",
8
+ "y",
9
+ "yf",
10
+ "yd",
11
+ "coordonnee y",
12
+ "coord y",
13
+ "ycoord",
14
+ "geocodage y gps",
15
+ "location latitude",
16
+ "ylatitude",
17
+ "ylat",
18
+ "latitude (y)",
19
+ "latitudeorg",
20
+ "coordinates.latitude",
21
+ "googlemap latitude",
22
+ "latitudelieu",
23
+ "latitude googlemap",
24
+ "latitude wgs84",
25
+ "y wgs84",
26
+ "latitude (wgs84)",
27
+ ]
28
+
29
+
30
def _is(val):
    """Tell whether ``val`` is a float-like string within [-90, 90].

    ``is_float`` screens the value first (it rejects non-strings).
    """
    if not is_float(val):
        return False
    try:
        # is_float also accepts "," as decimal mark, which plain float() does
        # not parse; normalise it so such values are not silently rejected.
        latitude = float(val.replace(",", "."))
    except (ValueError, OverflowError):
        return False
    return -90 <= latitude <= 90
37
+
38
+
39
+ _test_values = {
40
+ True: ["43.2", "-22"],
41
+ False: ["100"],
42
+ }
@@ -0,0 +1,42 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "latitude",
7
+ "lat",
8
+ "y",
9
+ "yf",
10
+ "yd",
11
+ "coordonnee y",
12
+ "coord y",
13
+ "ycoord",
14
+ "geocodage y gps",
15
+ "location latitude",
16
+ "ylatitude",
17
+ "ylat",
18
+ "latitude (y)",
19
+ "latitudeorg",
20
+ "coordinates.latitude",
21
+ "googlemap latitude",
22
+ "latitudelieu",
23
+ "latitude googlemap",
24
+ "latitude wgs84",
25
+ "y wgs84",
26
+ "latitude (wgs84)",
27
+ ]
28
+
29
+
30
def _is(val):
    """Tell whether ``val`` is a float-like string within [41.3, 51.3].

    ``is_float`` screens the value first (it rejects non-strings).
    """
    if not is_float(val):
        return False
    try:
        # is_float also accepts "," as decimal mark, which plain float() does
        # not parse; normalise it so such values are not silently rejected.
        latitude = float(val.replace(",", "."))
    except (ValueError, OverflowError):
        return False
    return 41.3 <= latitude <= 51.3
37
+
38
+
39
+ _test_values = {
40
+ True: ["42.5"],
41
+ False: ["22.5", "62.5"],
42
+ }
@@ -0,0 +1,53 @@
1
+ from csv_detective.formats.latitude_wgs import _is as is_lat
2
+ from csv_detective.formats.longitude_wgs import _is as is_lon
3
+
4
+ proportion = 1
5
+ tags = ["geo"]
6
+
7
+ SHARED_COORDS_LABELS = [
8
+ "ban",
9
+ "coordinates",
10
+ "coordonnees",
11
+ "coordonnees insee",
12
+ "geo",
13
+ "geopoint",
14
+ "geoloc",
15
+ "geolocalisation",
16
+ "geom",
17
+ "geometry",
18
+ "gps",
19
+ "localisation",
20
+ "point",
21
+ "position",
22
+ "wgs84",
23
+ ]
24
+
25
+ specific = [
26
+ "latlon",
27
+ "lat lon",
28
+ "x y",
29
+ "xy",
30
+ ]
31
+
32
+ # we aim wide to catch exact matches if possible for the highest possible score
33
+ labels = (
34
+ SHARED_COORDS_LABELS
35
+ + specific
36
+ + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
37
+ )
38
+
39
+
40
def _is(val):
    """Tell whether ``val`` is a "lat,lon" pair, optionally wrapped in brackets.

    The string must contain exactly one comma. Each side is checked with
    ``is_lat`` / ``is_lon``; spaces are removed from the longitude part only.
    """
    if not isinstance(val, str):
        return False
    parts = val.split(",")
    if len(parts) != 2:
        return False
    lat, lon = parts
    # "[lat,lon]": strip the brackets only when both are present
    if lat.startswith("[") and lon.endswith("]"):
        lat = lat[1:]
        lon = lon[:-1]
    return is_lat(lat) and is_lon(lon.replace(" ", ""))
48
+
49
+
50
+ _test_values = {
51
+ True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
52
+ False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
53
+ }
@@ -0,0 +1,39 @@
1
+ from frformat import LongitudeL93
2
+
3
+ from csv_detective.formats.float import _is as is_float
4
+ from csv_detective.formats.float import float_casting
5
+
6
+ proportion = 1
7
+ tags = ["fr", "geo"]
8
+ labels = [
9
+ "longitude",
10
+ "lon",
11
+ "long",
12
+ "geocodage x gps",
13
+ "location longitude",
14
+ "xlongitude",
15
+ "lng",
16
+ "xlong",
17
+ "x",
18
+ "xf",
19
+ "xd",
20
+ ]
21
+
22
+ _longitudel93 = LongitudeL93()
23
+
24
+
25
def _is(val):
    """Tell whether ``val`` is a float-like string accepted by ``_longitudel93``.

    The string is first screened with ``is_float`` and converted with
    ``float_casting`` before being handed to the validator.
    """
    try:
        if not isinstance(val, str) or not is_float(val):
            return False
        return _longitudel93.is_valid(float_casting(val))
    except (ValueError, OverflowError):
        return False
34
+
35
+
36
+ _test_values = {
37
+ True: ["0", "-154", "1265783,45", "34723.4"],
38
+ False: ["1456669.8", "-776225", "346_3214"],
39
+ }
@@ -0,0 +1,32 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 1
4
+ tags = ["geo"]
5
+ labels = [
6
+ "longitude",
7
+ "lon",
8
+ "long",
9
+ "geocodage x gps",
10
+ "location longitude",
11
+ "xlongitude",
12
+ "lng",
13
+ "xlong",
14
+ "x",
15
+ "xf",
16
+ "xd",
17
+ ]
18
+
19
+
20
def _is(val):
    """Tell whether ``val`` is a float-like string within [-180, 180].

    ``is_float`` screens the value first (it rejects non-strings).
    """
    if not is_float(val):
        return False
    try:
        # is_float also accepts "," as decimal mark, which plain float() does
        # not parse; normalise it so such values are not silently rejected.
        longitude = float(val.replace(",", "."))
    except (ValueError, OverflowError):
        return False
    return -180 <= longitude <= 180
27
+
28
+
29
+ _test_values = {
30
+ True: ["120", "-20.2"],
31
+ False: ["-200"],
32
+ }
@@ -0,0 +1,32 @@
1
+ from csv_detective.formats.float import _is as is_float
2
+
3
+ proportion = 1
4
+ tags = ["fr", "geo"]
5
+ labels = [
6
+ "longitude",
7
+ "lon",
8
+ "long",
9
+ "geocodage x gps",
10
+ "location longitude",
11
+ "xlongitude",
12
+ "lng",
13
+ "xlong",
14
+ "x",
15
+ "xf",
16
+ "xd",
17
+ ]
18
+
19
+
20
def _is(val):
    """Tell whether ``val`` is a float-like string within [-5.5, 9.8].

    ``is_float`` screens the value first (it rejects non-strings).
    """
    if not is_float(val):
        return False
    try:
        # is_float also accepts "," as decimal mark, which plain float() does
        # not parse; normalise it so such values are not silently rejected.
        longitude = float(val.replace(",", "."))
    except (ValueError, OverflowError):
        return False
    return -5.5 <= longitude <= 9.8
27
+
28
+
29
+ _test_values = {
30
+ True: ["-2.5"],
31
+ False: ["12.8"],
32
+ }