csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -1,343 +1,268 @@
1
- from botocore.exceptions import ClientError
2
- from datetime import datetime
3
- import json
4
- import os
5
- import tempfile
6
- from typing import Optional
7
- import logging
8
- from time import time
9
-
10
- from csv_detective.s3_utils import get_s3_client, download_from_minio, upload_to_minio
11
- from csv_detective.utils import display_logs_depending_process_time
12
-
13
-
14
- def get_description(format: str) -> str:
15
- """Returns generic description for specific field"""
16
- format_to_desc = {
17
- "adresse": "Adresse",
18
- "code_commune_insee": "Le code INSEE de la commune",
19
- "code_departement": "Le code INSEE du département",
20
- "code_region": "Le code INSEE de la région",
21
- "code_fantoir": "Le code FANTOIR de la voie ou du lieu-dit",
22
- "code_postal": "Le code postal",
23
- "commune": "Le nom de la commune",
24
- "departement": "Le nom du département",
25
- "insee_canton": "Le nom du canton",
26
- "latitude_l93": "La latitude au format Lambert 93",
27
- "latitude_wgs_fr_metropole": (
28
- "La latitude au format WGS. Ne concerne que des latitudes "
29
- "de la métropole française"
30
- ),
31
- "longitude_l93": "La longitude au format Lambert 93",
32
- "longitude_wgs_fr_metropole": (
33
- "La longitude au format WGS. Ne concerne que des longitudes "
34
- "de la métropole française"
35
- ),
36
- "pays": "Le nom du pays",
37
- "region": "Le nom de la région",
38
- "code_csp_insee": "Le code de Catégorie Socio-professionnel INSEE",
39
- "code_rna": "Le code RNA de l'association",
40
- "code_waldec": "Le code WALDEC de l'association",
41
- "csp_insee": "La catégorie socio-professionnel INSEE",
42
- "date_fr": "Data au format français",
43
- "sexe": "Le sexe",
44
- "siren": "Le numéro SIREN à 9 chiffres de l'entreprise (unité légale)",
45
- "siret": "Le numéro SIRET à 14 chiffres de l'établissement d'une entreprise",
46
- "tel_fr": "Le numéro de téléphone français",
47
- "uai": "Le numéro UAI (Unité Administrative Immatriculée) de l'établissement scolaire",
48
- "jour_de_la_semaine": "Le jour de la semaine",
49
- "mois_de_annee": "Le mois de l'année",
50
- "latitude_wgs": "La latitude au format WGS",
51
- "longitude_wgs": "La longitude au format WGS",
52
- "latlon_wgs": "Les coordonnées XY (latitude et longitude)",
53
- "booleen": "Booléen",
54
- "email": "L'adresse couriel (email)",
55
- "float": "Nombre flottant virgule)",
56
- "int": "Nombre entier",
57
- "json": "Chaîne de caractère json",
58
- "mongo_object_id": "Identifiant de base de donnée Mongo",
59
- "twitter": "Compte Twitter",
60
- "url": "Adresse URL",
61
- "uuid": "Identifiant unique au format UUID",
62
- "date": "Date",
63
- "datetime_iso": "Date au format datetime (ISO)",
64
- "datetime_rfc822": "Date au format datetime (RFC822)",
65
- "year": "Année",
66
- }
67
- return format_to_desc.get(format, "")
68
-
69
-
70
- def get_pattern(format: str) -> str:
71
- """Returns the pattern for a particular format"""
72
- format_to_pattern = {
73
- "siren": r"^\d{9}$",
74
- "siret": r"^\d{14}$",
75
- "code_commune_insee": r"^([013-9]\d|2[AB1-9])\d{3}$",
76
- "code_postal": r"^([013-9]\d|2[AB1-9])\d{3}$",
77
- "code_departement": r"^(([013-9]\d|2[AB1-9])$|9\d{2}$)",
78
- "code_region": r"^\d{2}$",
79
- "code_rna": r"^[wW]\d{9}$",
80
- "code_waldec": (
81
- r"^\d{3}\D\d{1,10}$|^\d\D\d\D\d{10}$|^\d{3}\D{3}\d{1,10}$|^\d{3}\D\d{4}\D\d{1,10}"
82
- r"$|^\d{3}\D\d{2}[-]\d{3}$|^\d\D\d\D\d{2}\D\d{1,8}$"
83
- ),
84
- "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
85
- "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
86
- "twitter": r'^@[A-Za-z0-9_]+$',
87
- "mongo_object_id": r'^[0-9a-fA-F]{24}$',
88
- "uuid": r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
89
- "url": (
90
- r'^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]'
91
- r'{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$'
92
- )
93
- }
94
- if format in format_to_pattern:
95
- return {"pattern": format_to_pattern[format]}
96
- else:
97
- return {}
98
-
99
-
100
- def get_validata_type(format: str) -> str:
101
- """Returns the validata type for a given format"""
102
- metier_to_validata_type = {
103
- "booleen": "boolean",
104
- "int": "integer",
105
- "float": "number",
106
- "string": "string",
107
- "date": "date",
108
- "datetime_iso": "datetime",
109
- "datetime_rfc822": "datetime",
110
- "json_geojson": "geojson",
111
- "latitude": "number",
112
- "latitude_l93": "number",
113
- "latitude_wgs": "number",
114
- "latitude_wgs_fr_metropole": "number",
115
- "latlon_wgs": "geo_point",
116
- "longitude": "number",
117
- "longitude_l93": "number",
118
- "longitude_wgs": "number",
119
- "longitude_wgs_fr_metropole": "number",
120
- "year": "year",
121
- }
122
- return metier_to_validata_type.get(format, "string")
123
-
124
-
125
- def get_example(format: str) -> str:
126
- """Returns the example for a given format"""
127
- format_to_example = {
128
- "booleen": "true",
129
- "int": 42,
130
- "float": 42.42,
131
- "string": "Lorem ipsum dolor sit amet",
132
- "adresse": "28 rue Ledion, 75014 Paris",
133
- "insee_canton": "Pont-d'Ain",
134
- "code_commune_insee": "27501",
135
- "code_csp_insee": "233c",
136
- "code_departement": "2A",
137
- "code_fantoir": "A633",
138
- "code_postal": "75014",
139
- "code_region": "52",
140
- "code_rna": "W123456789",
141
- # 'code_waldec': TODO: add code_waldec
142
- "commune": "Joyeux",
143
- "csp_insee": "anciens agriculteurs exploitants",
144
- "date": "2020-01-01",
145
- "date_fr": "12 janvier 2020",
146
- "datetime_iso": "2020-01-01T00:00:00",
147
- "datetime_rfc822": "Tue, 1 Jan 2020 00:00:00 +0000",
148
- "departement": "Ain",
149
- "email": "example@example.com",
150
- "insee_ape700": "0130Z",
151
- "iso_country_code_alpha2": "FR",
152
- "iso_country_code_alpha3": "FRA",
153
- "iso_country_code_numeric": 250,
154
- "jour_de_la_semaine": "lundi",
155
- "json_geojson": '{"type": "Point", "coordinates": [0, 0]}',
156
- "latitude": 42.42,
157
- "latitude_l93": 6037008,
158
- "latitude_wgs": 42.42,
159
- "latitude_wgs_fr_metropole": 41.3,
160
- "latlon_wgs": "42.42, 0.0",
161
- "longitude": 0.0,
162
- "longitude_l93": -357823,
163
- "longitude_wgs": 0.0,
164
- "longitude_wgs_fr_metropole": 1.2,
165
- "mois_de_annee": "janvier",
166
- "mongo_object_id": "507f191e810c19729de860ea",
167
- "pays": "France",
168
- "region": "nouvelle aquitaine",
169
- "sexe": "h",
170
- "siren": "362521879",
171
- "siret": "56894100056",
172
- "tel_fr": "+33123456789",
173
- "twitter": "@Etalab",
174
- "uai": "0470009E",
175
- "url": "https://www.data.gouv.fr",
176
- "uuid": "123e4567-e89b-12d3-a456-426614174000",
177
- "year": "2020",
178
- }
179
- return format_to_example.get(format, "")
180
-
181
-
182
- def get_constraints(format: str) -> dict:
183
- """Returns the constraints for a given format"""
184
- pattern_constraints = get_pattern(format)
185
- extra_constraints = {}
186
- if format == "latitude_l93":
187
- extra_constraints = {"minimum": 6037008, "maximum": 7230728}
188
- if format == "longitude_l93":
189
- extra_constraints = {"minimum": -357823, "maximum": 7230728}
190
- if format == "latitude_wgs_fr_metropole":
191
- extra_constraints = {"minimum": 41.3, "maximum": 51.3}
192
- if format == "longitude_wgs_fr_metropole":
193
- extra_constraints = {"minimum": -5.5, "maximum": 9.8}
194
-
195
- return {"required": False, **pattern_constraints, **extra_constraints}
196
-
197
-
198
- def generate_table_schema(
199
- analysis_report: dict,
200
- save_file: bool,
201
- netloc: Optional[str] = None,
202
- bucket: Optional[str] = None,
203
- key: Optional[str] = None,
204
- minio_user: Optional[str] = None,
205
- minio_pwd: Optional[str] = None,
206
- verbose: bool = False
207
- ) -> dict:
208
- """Generates a table schema from the analysis report
209
-
210
- Args:
211
- analysis_report (dict): The analysis report from csv_detective
212
- save_file (bool): indicate if schema should be saved into minio or just returned
213
- netloc (str): The netloc of the minio instance to upload the tableschema
214
- bucket (str): The bucket to save the schema in
215
- key (str): The key to save the schema in (without extension as we will append
216
- version number and extension)
217
- minio_user (str): The minio user
218
- minio_pwd (str): The minio password
219
-
220
- Returns:
221
- """
222
- if verbose:
223
- start = time()
224
- logging.info("Creating table schema")
225
- fields = [
226
- {
227
- "name": header,
228
- "description": get_description(field_report["format"]),
229
- "example": get_example(field_report["format"]),
230
- "type": get_validata_type(field_report["format"]),
231
- "formatFR": field_report["format"],
232
- "constraints": get_constraints(field_report["format"])
233
- }
234
- for header, field_report in analysis_report["columns"].items()
235
- ]
236
-
237
- new_version = "0.0.1"
238
-
239
- schema = {
240
- "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
241
- "name": "",
242
- "title": "",
243
- "description": "",
244
- "countryCode": "FR",
245
- "homepage": "",
246
- "path": "https://github.com/etalab/csv-detective",
247
- "resources": [],
248
- "sources": [
249
- {
250
- "title": "Spécification Tableschema",
251
- "path": "https://specs.frictionlessdata.io/table-schema"
252
- },
253
- {
254
- "title": "schema.data.gouv.fr",
255
- "path": "https://schema.data.gouv.fr"
256
- }
257
- ],
258
- "created": datetime.today().strftime("%Y-%m-%d"),
259
- "lastModified": datetime.today().strftime("%Y-%m-%d"),
260
- "version": new_version,
261
- "contributors": [
262
- {
263
- "title": "Table schema bot",
264
- "email": "schema@data.gouv.fr",
265
- "organisation": "data.gouv.fr",
266
- "role": "author",
267
- },
268
- ],
269
- "fields": fields,
270
- "missingValues": [""],
271
- }
272
-
273
- if verbose:
274
- display_logs_depending_process_time(f'Created schema in {round(time() - start, 3)}s', time() - start)
275
-
276
- if not save_file:
277
- return schema
278
-
279
- if save_file:
280
- if not all([netloc, key, bucket, minio_user, minio_pwd]):
281
- raise Exception(
282
- "To save schema into minio, parameters : netloc, key, bucket, "
283
- "minio_user, minio_pwd should be provided"
284
- )
285
-
286
- # Create bucket if does not exist
287
- client = get_s3_client(netloc, minio_user, minio_pwd)
288
- try:
289
- client.head_bucket(Bucket=bucket)
290
- except ClientError:
291
- client.create_bucket(Bucket=bucket)
292
-
293
- tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
294
- if "Contents" in tableschema_objects:
295
- tableschema_keys = [
296
- tableschema["Key"]
297
- for tableschema in client.list_objects(
298
- Bucket=bucket, Prefix=key, Delimiter="/"
299
- )["Contents"]
300
- ]
301
- tableschema_versions = [
302
- os.path.splitext(tableschema_key)[0].split("_")[-1]
303
- for tableschema_key in tableschema_keys
304
- ]
305
- latest_version = max(tableschema_versions)
306
-
307
- with tempfile.NamedTemporaryFile() as latest_schema_file:
308
- with open(latest_schema_file.name, "w") as fp:
309
- download_from_minio(
310
- netloc,
311
- bucket,
312
- f"{key}_{latest_version}.json",
313
- latest_schema_file.name,
314
- minio_user,
315
- minio_pwd,
316
- )
317
- # Check if files are different
318
- with open(latest_schema_file.name, "r") as fp:
319
- latest_schema = json.load(fp)
320
- if latest_schema["fields"] != fields:
321
- latest_version_split = latest_version.split(".")
322
- new_version = (
323
- latest_version_split[0]
324
- + "."
325
- + latest_version_split[1]
326
- + "."
327
- + str(int(latest_version_split[2]) + 1)
328
- )
329
- else:
330
- return None
331
-
332
- schema["version"] = new_version
333
-
334
- tableschema_file = tempfile.NamedTemporaryFile(delete=False)
335
- with open(tableschema_file.name, "w") as fp:
336
- json.dump(schema, fp, indent=4)
337
-
338
- new_version_key = f"{key}_{new_version}.json"
339
- upload_to_minio(
340
- netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
341
- )
342
- os.unlink(tableschema_file.name)
343
- return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
1
+ import json
2
+ import logging
3
+ from datetime import datetime
4
+ from time import time
5
+
6
+ from csv_detective.utils import display_logs_depending_process_time
7
+
8
+
9
+ def get_description(format: str) -> str:
10
+ """Returns generic description for specific field"""
11
+ format_to_desc = {
12
+ "adresse": "Adresse",
13
+ "code_commune_insee": "Le code INSEE de la commune",
14
+ "code_departement": "Le code INSEE du département",
15
+ "code_region": "Le code INSEE de la région",
16
+ "code_fantoir": "Le code FANTOIR de la voie ou du lieu-dit",
17
+ "code_postal": "Le code postal",
18
+ "commune": "Le nom de la commune",
19
+ "departement": "Le nom du département",
20
+ "insee_canton": "Le nom du canton",
21
+ "latitude_l93": "La latitude au format Lambert 93",
22
+ "latitude_wgs_fr_metropole": (
23
+ "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
24
+ ),
25
+ "longitude_l93": "La longitude au format Lambert 93",
26
+ "longitude_wgs_fr_metropole": (
27
+ "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
28
+ ),
29
+ "pays": "Le nom du pays",
30
+ "region": "Le nom de la région",
31
+ "code_csp_insee": "Le code de Catégorie Socio-professionnel INSEE",
32
+ "code_rna": "Le code RNA de l'association",
33
+ "code_waldec": "Le code WALDEC de l'association",
34
+ "csp_insee": "La catégorie socio-professionnel INSEE",
35
+ "date_fr": "Data au format français",
36
+ "sexe": "Le sexe",
37
+ "siren": "Le numéro SIREN à 9 chiffres de l'entreprise (unité légale)",
38
+ "siret": "Le numéro SIRET à 14 chiffres de l'établissement d'une entreprise",
39
+ "tel_fr": "Le numéro de téléphone français",
40
+ "uai": "Le numéro UAI (Unité Administrative Immatriculée) de l'établissement scolaire",
41
+ "jour_de_la_semaine": "Le jour de la semaine",
42
+ "mois_de_annee": "Le mois de l'année",
43
+ "latitude_wgs": "La latitude au format WGS",
44
+ "longitude_wgs": "La longitude au format WGS",
45
+ "latlon_wgs": "Les coordonnées XY (latitude et longitude)",
46
+ "lonlat_wgs": "Les coordonnées XY (longitude et latitude)",
47
+ "booleen": "Booléen",
48
+ "email": "L'adresse couriel (email)",
49
+ "float": "Nombre flottant virgule)",
50
+ "int": "Nombre entier",
51
+ "json": "Chaîne de caractère json",
52
+ "mongo_object_id": "Identifiant de base de donnée Mongo",
53
+ "twitter": "Compte Twitter",
54
+ "url": "Adresse URL",
55
+ "uuid": "Identifiant unique au format UUID",
56
+ "date": "Date",
57
+ "datetime_aware": "Date au format datetime avec fuseau horaire",
58
+ "datetime_naive": "Date au format datetime sans fuseau horaire",
59
+ "datetime_rfc822": "Date au format datetime (RFC822)",
60
+ "year": "Année",
61
+ }
62
+ return format_to_desc.get(format, "")
63
+
64
+
65
+ def get_pattern(format: str) -> str:
66
+ """Returns the pattern for a particular format"""
67
+ format_to_pattern = {
68
+ "siren": r"^\d{9}$",
69
+ "siret": r"^\d{14}$",
70
+ "code_commune_insee": r"^([013-9]\d|2[AB1-9])\d{3}$",
71
+ "code_postal": r"^([013-9]\d|2[AB1-9])\d{3}$",
72
+ "code_departement": r"^(([013-9]\d|2[AB1-9])$|9\d{2}$)",
73
+ "code_region": r"^\d{2}$",
74
+ "code_rna": r"^[wW]\d{9}$",
75
+ "code_waldec": (
76
+ r"^\d{3}\D\d{1,10}$|^\d\D\d\D\d{10}$|^\d{3}\D{3}\d{1,10}$|^\d{3}\D\d{4}\D\d{1,10}"
77
+ r"$|^\d{3}\D\d{2}[-]\d{3}$|^\d\D\d\D\d{2}\D\d{1,8}$"
78
+ ),
79
+ "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
80
+ "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
81
+ "twitter": r"^@[A-Za-z0-9_]+$",
82
+ "mongo_object_id": r"^[0-9a-fA-F]{24}$",
83
+ "uuid": r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$",
84
+ "url": (
85
+ r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
86
+ r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
87
+ ),
88
+ }
89
+ if format in format_to_pattern:
90
+ return {"pattern": format_to_pattern[format]}
91
+ else:
92
+ return {}
93
+
94
+
95
+ def get_validata_type(format: str) -> str:
96
+ """Returns the validata type for a given format"""
97
+ metier_to_validata_type = {
98
+ "booleen": "boolean",
99
+ "int": "integer",
100
+ "float": "number",
101
+ "string": "string",
102
+ "date": "date",
103
+ "datetime_aware": "datetime",
104
+ "datetime_naive": "datetime",
105
+ "datetime_rfc822": "datetime",
106
+ "geojson": "geojson",
107
+ "latitude": "number",
108
+ "latitude_l93": "number",
109
+ "latitude_wgs": "number",
110
+ "latitude_wgs_fr_metropole": "number",
111
+ "latlon_wgs": "geo_point",
112
+ "lonlat_wgs": "geo_point",
113
+ "longitude": "number",
114
+ "longitude_l93": "number",
115
+ "longitude_wgs": "number",
116
+ "longitude_wgs_fr_metropole": "number",
117
+ "year": "year",
118
+ }
119
+ return metier_to_validata_type.get(format, "string")
120
+
121
+
122
+ def get_example(format: str) -> str:
123
+ """Returns the example for a given format"""
124
+ format_to_example = {
125
+ "booleen": "true",
126
+ "int": 42,
127
+ "float": 42.42,
128
+ "string": "Lorem ipsum dolor sit amet",
129
+ "adresse": "28 rue Ledion, 75014 Paris",
130
+ "insee_canton": "Pont-d'Ain",
131
+ "code_commune_insee": "27501",
132
+ "code_csp_insee": "233c",
133
+ "code_departement": "2A",
134
+ "code_fantoir": "A633",
135
+ "code_postal": "75014",
136
+ "code_region": "52",
137
+ "code_rna": "W123456789",
138
+ # 'code_waldec': TODO: add code_waldec
139
+ "commune": "Joyeux",
140
+ "csp_insee": "anciens agriculteurs exploitants",
141
+ "date": "2020-01-01",
142
+ "date_fr": "12 janvier 2020",
143
+ "datetime_aware": "2020-01-01T00:00:00+02:00",
144
+ "datetime_naive": "2020-01-01T00:00:00",
145
+ "datetime_rfc822": "Tue, 1 Jan 2020 00:00:00 +0000",
146
+ "departement": "Ain",
147
+ "email": "example@example.com",
148
+ "insee_ape700": "0130Z",
149
+ "iso_country_code_alpha2": "FR",
150
+ "iso_country_code_alpha3": "FRA",
151
+ "iso_country_code_numeric": 250,
152
+ "jour_de_la_semaine": "lundi",
153
+ "geojson": '{"type": "Point", "coordinates": [0, 0]}',
154
+ "latitude": 42.42,
155
+ "latitude_l93": 6037008,
156
+ "latitude_wgs": 42.42,
157
+ "latitude_wgs_fr_metropole": 41.3,
158
+ "latlon_wgs": "42.42, 0.0",
159
+ "lonlat_wgs": "0.0, 42.42",
160
+ "longitude": 0.0,
161
+ "longitude_l93": -357823,
162
+ "longitude_wgs": 0.0,
163
+ "longitude_wgs_fr_metropole": 1.2,
164
+ "mois_de_annee": "janvier",
165
+ "mongo_object_id": "507f191e810c19729de860ea",
166
+ "pays": "France",
167
+ "region": "nouvelle aquitaine",
168
+ "sexe": "h",
169
+ "siren": "362521879",
170
+ "siret": "56894100056",
171
+ "tel_fr": "+33123456789",
172
+ "twitter": "@Etalab",
173
+ "uai": "0470009E",
174
+ "url": "https://www.data.gouv.fr",
175
+ "uuid": "123e4567-e89b-12d3-a456-426614174000",
176
+ "year": "2020",
177
+ }
178
+ return format_to_example.get(format, "")
179
+
180
+
181
def get_constraints(format: str) -> dict:
    """Return the constraints for a given format.

    Args:
        format: name of the format detected for a column.

    Returns:
        A dict that always contains ``"required": False``, plus whatever
        ``get_pattern`` returns for the format and, for the bounded
        coordinate formats, ``"minimum"`` / ``"maximum"`` entries.
    """
    # Numeric bounds for the coordinate formats restricted to an area.
    # The Lambert-93 easting (longitude_l93) has its own upper bound; it
    # must not reuse the northing (latitude_l93) maximum.
    bounds_by_format = {
        "latitude_l93": {"minimum": 6037008, "maximum": 7230728},
        "longitude_l93": {"minimum": -357823, "maximum": 1313633},
        "latitude_wgs_fr_metropole": {"minimum": 41.3, "maximum": 51.3},
        "longitude_wgs_fr_metropole": {"minimum": -5.5, "maximum": 9.8},
    }
    pattern_constraints = get_pattern(format)
    extra_constraints = bounds_by_format.get(format, {})

    return {"required": False, **pattern_constraints, **extra_constraints}
195
+
196
+
197
def generate_table_schema(
    analysis_report: dict,
    save_results: bool | str = True,
    verbose: bool = False,
) -> dict:
    """Build a table schema out of a csv_detective analysis report.

    Args:
        analysis_report (dict): The analysis report from csv_detective; its
            "columns" entry maps each header to a report with a "format" key
        save_results (bool or str): falsy to skip saving; a string is used
            as the output path, any other truthy value saves to "schema.json"
        verbose (bool): whether to log the creation and the time it took

    Returns:
        dict: the generated schema (also written to disk when requested)
    """
    if verbose:
        start = time()
        logging.info("Creating table schema")

    # One field description per analysed column.
    fields = []
    for header, field_report in analysis_report["columns"].items():
        detected_format = field_report["format"]
        fields.append(
            {
                "name": header,
                "description": get_description(detected_format),
                "example": get_example(detected_format),
                "type": get_validata_type(detected_format),
                "formatFR": detected_format,
                "constraints": get_constraints(detected_format),
            }
        )

    schema_version = "0.0.1"
    sources = [
        {
            "title": "Spécification Tableschema",
            "path": "https://specs.frictionlessdata.io/table-schema",
        },
        {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
    ]
    contributors = [
        {
            "title": "Table schema bot",
            "email": "schema@data.gouv.fr",
            "organisation": "data.gouv.fr",
            "role": "author",
        },
    ]
    created = datetime.today().strftime("%Y-%m-%d")
    last_modified = datetime.today().strftime("%Y-%m-%d")

    schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "name": "",
        "title": "",
        "description": "",
        "countryCode": "FR",
        "homepage": "",
        "path": "https://github.com/etalab/csv-detective",
        "resources": [],
        "sources": sources,
        "created": created,
        "lastModified": last_modified,
        "version": schema_version,
        "contributors": contributors,
        "fields": fields,
        "missingValues": [""],
    }

    if verbose:
        message = f"Created schema in {round(time() - start, 3)}s"
        display_logs_depending_process_time(message, time() - start)

    if save_results:
        # A string designates the destination; any other truthy value
        # falls back to the default file name.
        output_path = "schema.json"
        if isinstance(save_results, str):
            output_path = save_results
        with open(output_path, "w", encoding="utf8") as schema_file:
            json.dump(
                schema,
                schema_file,
                indent=4,
                separators=(",", ": "),
                ensure_ascii=False,
                default=str,
            )

    return schema