csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -1,57 +0,0 @@
1
- # flake8: noqa
2
- from .FR.other import (
3
- code_csp_insee,
4
- csp_insee,
5
- sexe,
6
- siren,
7
- tel_fr,
8
- uai,
9
- siret,
10
- insee_ape700,
11
- date_fr,
12
- code_waldec,
13
- code_rna
14
- )
15
-
16
- from .other import (
17
- email,
18
- url,
19
- booleen,
20
- mongo_object_id,
21
- twitter,
22
- float,
23
- int,
24
- uuid,
25
- json
26
- )
27
-
28
- from .FR.geo import (
29
- adresse,
30
- code_commune_insee,
31
- code_postal,
32
- commune,
33
- departement,
34
- pays,
35
- region,
36
- code_departement,
37
- code_fantoir,
38
- longitude_wgs_fr_metropole,
39
- latitude_wgs_fr_metropole,
40
- code_region,
41
- latitude_l93,
42
- longitude_l93,
43
- insee_canton
44
- )
45
-
46
- from .geo import (
47
- iso_country_code_alpha2,
48
- iso_country_code_alpha3,
49
- iso_country_code_numeric,
50
- latitude_wgs,
51
- longitude_wgs,
52
- latlon_wgs,
53
- json_geojson
54
- )
55
-
56
- from .FR.temp import jour_de_la_semaine, mois_de_annee
57
- from .temp import year, date, datetime_iso, datetime_rfc822
File without changes
@@ -1,15 +0,0 @@
1
- from os.path import dirname, join
2
- import re
3
-
4
- PROPORTION = 1
5
-
6
- with open(join(dirname(__file__), 'iso_country_code_alpha2.txt'), 'r') as iofile:
7
- liste_pays = iofile.read().split('\n')
8
- liste_pays = set(liste_pays)
9
-
10
-
11
- def _is(val):
12
- '''Renvoie True si val peut etre un code iso pays alpha-2, False sinon'''
13
- if not bool(re.match(r'[A-Z]{2}$', val)):
14
- return False
15
- return val in liste_pays
@@ -1,14 +0,0 @@
1
- from os.path import dirname, join
2
- import re
3
-
4
- PROPORTION = 1
5
-
6
- with open(join(dirname(__file__), 'iso_country_code_alpha3.txt'), 'r') as iofile:
7
- liste_pays = iofile.read().split('\n')
8
-
9
-
10
- def _is(val):
11
- '''Renvoie True si val peut etre un code iso pays alpha-3, False sinon'''
12
- if not bool(re.match(r'[A-Z]{3}$', val)):
13
- return False
14
- return val in set(liste_pays)
@@ -1,15 +0,0 @@
1
- from os.path import dirname, join
2
- import re
3
-
4
- PROPORTION = 1
5
-
6
- with open(join(dirname(__file__), 'iso_country_code_numeric.txt'), 'r') as iofile:
7
- liste_pays = iofile.read().split('\n')
8
- liste_pays = set(liste_pays)
9
-
10
-
11
- def _is(val):
12
- '''Renvoie True si val peut etre un code iso pays numerique, False sinon'''
13
- if not bool(re.match(r'[0-9]{3}$', val)):
14
- return False
15
- return val in liste_pays
@@ -1,22 +0,0 @@
1
- import json
2
- from json import JSONDecodeError
3
-
4
- PROPORTION = 0.9
5
-
6
-
7
- def _is(val):
8
- '''Renvoie True si val peut etre geojson'''
9
-
10
- try:
11
- j = json.loads(val)
12
- if 'type' in j and 'coordinates' in j:
13
- return True
14
- if 'geometry' in j:
15
- if 'coordinates' in j['geometry']:
16
- return True
17
- except JSONDecodeError:
18
- pass
19
- except TypeError:
20
- pass
21
-
22
- return False
@@ -1,13 +0,0 @@
1
- from csv_detective.detect_fields.other.float import _is as is_float
2
-
3
- PROPORTION = 0.9
4
-
5
-
6
- def _is(val):
7
- '''Renvoie True si val peut etre une latitude'''
8
- try:
9
- return is_float(val) and float(val) >= -90 and float(val) <= 90
10
- except ValueError:
11
- return False
12
- except OverflowError:
13
- return False
@@ -1,15 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 0.9
4
-
5
-
6
- def _is(val):
7
- '''Renvoie True si val peut etre une latitude,longitude'''
8
-
9
- a = bool(
10
- re.match(
11
- r'^\[?[\+\-]?[0-8]?\d\.\d* ?, ?[\+\-]?(1[0-7]\d|\d{1,2})\.\d+\]?$', val
12
- )
13
- )
14
-
15
- return a
@@ -1,13 +0,0 @@
1
- from csv_detective.detect_fields.other.float import _is as is_float
2
-
3
- PROPORTION = 0.9
4
-
5
-
6
- def _is(val):
7
- '''Renvoie True si val peut etre une longitude'''
8
- try:
9
- is_float(val) and float(val) >= -180 and float(val) <= 180
10
- except ValueError:
11
- return False
12
- except OverflowError:
13
- return False
File without changes
@@ -1,21 +0,0 @@
1
- PROPORTION = 1
2
- liste_bool = {
3
- '0',
4
- '1',
5
- 'vrai',
6
- 'faux',
7
- 'true',
8
- 'false',
9
- 'oui',
10
- 'non',
11
- 'yes',
12
- 'no',
13
- 'y',
14
- 'n',
15
- 'o'
16
- }
17
-
18
-
19
- def _is(val):
20
- '''Détection les booléens'''
21
- return val.lower() in liste_bool
@@ -1,8 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 1
4
-
5
-
6
- def _is(val):
7
- '''Detects e-mails'''
8
- return bool(re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$', val))
@@ -1,17 +0,0 @@
1
- PROPORTION = 1
2
-
3
-
4
- def float_casting(str2cast):
5
- return float(str2cast.replace(',', '.'))
6
-
7
-
8
- def _is(val):
9
- '''Detects floats, assuming that tables will not have scientific
10
- notations (3e6) or "+" in the string. "-" is still accepted.'''
11
- try:
12
- if any([k in val for k in ['_', '+', 'e', 'E']]):
13
- return False
14
- float_casting(val)
15
- return True
16
- except ValueError:
17
- return False
@@ -1,12 +0,0 @@
1
- PROPORTION = 1
2
-
3
-
4
- def _is(val):
5
- '''Detects integers'''
6
- if any([v in val for v in ['.', '_', '+']]):
7
- return False
8
- try:
9
- int(val)
10
- return True
11
- except ValueError:
12
- return False
@@ -1,24 +0,0 @@
1
- import json
2
- from json import JSONDecodeError
3
-
4
- PROPORTION = 1
5
-
6
-
7
- def _is(val):
8
- '''Detects json'''
9
- try:
10
- loaded = json.loads(val)
11
- if isinstance(loaded, list) or (
12
- isinstance(loaded, dict) and not (
13
- any(
14
- [
15
- geo in loaded for geo in ['coordinates', 'geometry']
16
- ]
17
- )
18
- )
19
- ):
20
- return True
21
- else:
22
- return False
23
- except JSONDecodeError:
24
- return False
@@ -1,8 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 0.8
4
-
5
-
6
- def _is(val):
7
- '''Detects Mongo ObjectIds'''
8
- return bool(re.match(r'^[0-9a-fA-F]{24}$', val))
@@ -1,8 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 1
4
-
5
-
6
- def _is(val):
7
- '''Detects twitter accounts'''
8
- return bool(re.match(r'^@[A-Za-z0-9_]+$', val))
@@ -1,11 +0,0 @@
1
-
2
- PROPORTION = 1
3
-
4
-
5
- def _is(val):
6
- '''Detects urls'''
7
- a = 'http://' in val
8
- b = 'www.' in val
9
- c = any([x in val for x in ['.fr', '.com', '.org', '.gouv', '.net']])
10
- d = not ('@' in val)
11
- return (a or b or c) and d
@@ -1,11 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 0.8
4
-
5
-
6
- def _is(val):
7
- '''Detects UUIDs'''
8
- return bool(re.match(
9
- r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
10
- val
11
- ))
File without changes
@@ -1,62 +0,0 @@
1
- import re
2
- from dateutil.parser import parse, ParserError
3
- from csv_detective.detect_fields.other.float import _is as is_float
4
- from unidecode import unidecode
5
-
6
- PROPORTION = 1
7
- # /!\ this is only for dates, not datetimes which are handled by other utils
8
-
9
-
10
- def is_dateutil_date(val: str) -> bool:
11
- # we don't want to get datetimes here, so length restriction
12
- # longest date string expected here is DD-septembre-YYYY, so 17 characters
13
- if len(val) > 17:
14
- return False
15
- try:
16
- res = parse(val, fuzzy=False)
17
- if res.hour or res.minute or res.second:
18
- return False
19
- return True
20
- except (ParserError, ValueError, TypeError, OverflowError):
21
- return False
22
-
23
-
24
- def _is(val):
25
- '''Renvoie True si val peut être une date, False sinon
26
- On ne garde que les regex pour les cas où parse() ne convient pas'''
27
-
28
- # matches 02/12 03 and 02_12 2003
29
- a = bool(
30
- re.match(
31
- r'^(0[1-9]|[12][0-9]|3[01])[ -/_](0[1-9]|1[012])[ -/_]'
32
- r'([0-9]{2}|(19|20)[0-9]{2}$)',
33
- val
34
- )
35
- )
36
-
37
- # matches 02052003
38
- b = bool(
39
- re.match(
40
- r'^(0[1-9]|[12][0-9]|3[01])(0[1-9]|1[012])([0-9]{2}|'
41
- r'(19|20){2}$)',
42
- val
43
- )
44
- )
45
-
46
- # matches JJ*MM*AAAA
47
- c = bool(
48
- re.match(
49
- r'^(0[1-9]|[12][0-9]|3[01]).?(0[1-9]|1[012]).?(19|20)?\d\d$', val))
50
-
51
- # matches JJ-mmm-AAAA and matches JJ-mmm...mm-AAAA
52
- d = bool(
53
- re.match(
54
- r'^(0[1-9]|[12][0-9]|3[01])[ -/_;.:,](jan|fev|feb|mar|avr|apr'
55
- r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
56
- r'mai|juin|jullet|aout|septembre|octobre|novembre|decembre)[ -/_;.:,]'
57
- r'([0-9]{2}$|(19|20)[0-9]{2}$)',
58
- unidecode(val)
59
- )
60
- )
61
-
62
- return (is_dateutil_date(val) and not is_float(val)) or a or b or c or d
@@ -1,18 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 1
4
-
5
-
6
- def _is(val):
7
- '''Renvoie True si val peut être une date au format iso, False sinon
8
- Exemple: 2023-01-15T12:30:45.123456Z'''
9
- a = bool(
10
- re.match(
11
- r'^\d{4}-(0[1-9]|1[012])\-(0[1-9]|[12][0-9]|3[01])[Tt]'
12
- r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9])'
13
- r'(\.\d+)?([Zz]|[-+](0[0-9]|1[0-2]):[0-5][0-9])?$',
14
- val
15
- )
16
- )
17
-
18
- return a
@@ -1,21 +0,0 @@
1
- import re
2
-
3
- PROPORTION = 1
4
-
5
-
6
- def _is(val):
7
- '''Renvoie True si val peut être une date au format rfc822, False sinon
8
- Exemple: Tue, 19 Dec 2023 15:30:45 +0000'''
9
-
10
- val = val.lower()
11
- a = bool(
12
- re.match(
13
- r'^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} '
14
- r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) '
15
- r'(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$',
16
- val,
17
- re.IGNORECASE
18
- )
19
- )
20
-
21
- return a
@@ -1,10 +0,0 @@
1
- PROPORTION = 1
2
-
3
-
4
- def _is(val):
5
- '''Returns True if val can be a year'''
6
- try:
7
- val = int(val)
8
- except ValueError:
9
- return False
10
- return (1800 <= val) and (val <= 2100)
File without changes
File without changes
@@ -1,40 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'adresse',
15
- 'adresse postale',
16
- 'adresse geographique',
17
- 'adr',
18
- 'adresse complete',
19
- 'adresse station'
20
- ]
21
- processed_header = _process_text(header)
22
-
23
- header_matches_words_combination = float(
24
- any(
25
- [
26
- words_combination == processed_header for words_combination in words_combinations_list
27
- ]
28
- )
29
- )
30
- words_combination_in_header = 0.5 * float(
31
- any(
32
- [
33
- full_word_strictly_inside_string(
34
- words_combination, processed_header
35
- ) for words_combination in words_combinations_list
36
- ]
37
- )
38
- )
39
-
40
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,42 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'code commune insee',
15
- 'code insee',
16
- 'codes insee',
17
- 'code commune',
18
- 'code insee commune',
19
- 'insee',
20
- 'code com',
21
- 'com'
22
- ]
23
- processed_header = _process_text(header)
24
-
25
- header_matches_words_combination = float(
26
- any(
27
- [
28
- words_combination == processed_header for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
- words_combination_in_header = 0.5 * float(
33
- any(
34
- [
35
- full_word_strictly_inside_string(
36
- words_combination, processed_header
37
- ) for words_combination in words_combinations_list
38
- ]
39
- )
40
- )
41
-
42
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # 'dep': Possible confusion with dep name?
13
- words_combinations_list = ['code departement', 'code_departement', 'dep', 'departement', 'dept']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['cadastre1', 'code fantoir', 'fantoir']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,41 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'code postal',
15
- 'postal code',
16
- 'postcode',
17
- 'post code',
18
- 'cp',
19
- 'codes postaux',
20
- 'location postcode'
21
- ]
22
- processed_header = _process_text(header)
23
-
24
- header_matches_words_combination = float(
25
- any(
26
- [
27
- words_combination == processed_header for words_combination in words_combinations_list
28
- ]
29
- )
30
- )
31
- words_combination_in_header = 0.5 * float(
32
- any(
33
- [
34
- full_word_strictly_inside_string(
35
- words_combination, processed_header
36
- ) for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
-
41
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # 'reg' : possible confusion with region name?
13
- words_combinations_list = ['code region', 'reg', 'code insee region', 'region']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)