csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
- csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detect_labels/other/email/__init__.py +1 -1
- csv_detective/detect_labels/other/float/__init__.py +1 -1
- csv_detective/detect_labels/other/int/__init__.py +1 -1
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detect_labels/other/twitter/__init__.py +1 -1
- csv_detective/detect_labels/other/url/__init__.py +1 -1
- csv_detective/detect_labels/other/uuid/__init__.py +1 -1
- csv_detective/detect_labels/temp/date/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
- csv_detective/detect_labels/temp/year/__init__.py +1 -1
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/explore_csv.py +40 -124
- csv_detective/output/dataframe.py +55 -0
- csv_detective/{create_example.py → output/example.py} +10 -9
- csv_detective/output/profile.py +87 -0
- csv_detective/{schema_generation.py → output/schema.py} +344 -343
- csv_detective/output/utils.py +51 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/utils.py +10 -236
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +84 -70
- tests/test_fields.py +7 -6
- tests/test_file.py +15 -14
- csv_detective/detection.py +0 -633
- /csv_detective/{process_text.py → parsing/text.py} +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
|
@@ -1,343 +1,344 @@
|
|
|
1
|
-
from
|
|
2
|
-
|
|
3
|
-
import
|
|
4
|
-
import os
|
|
5
|
-
import tempfile
|
|
6
|
-
from
|
|
7
|
-
import
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
from csv_detective.
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
"
|
|
27
|
-
"
|
|
28
|
-
|
|
29
|
-
"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
"
|
|
33
|
-
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
"
|
|
38
|
-
"
|
|
39
|
-
"
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
52
|
-
"
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"
|
|
61
|
-
"
|
|
62
|
-
"
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"
|
|
80
|
-
"
|
|
81
|
-
|
|
82
|
-
r"
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
"
|
|
86
|
-
"
|
|
87
|
-
"
|
|
88
|
-
"
|
|
89
|
-
"
|
|
90
|
-
|
|
91
|
-
r'{1,
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
"
|
|
105
|
-
"
|
|
106
|
-
"
|
|
107
|
-
"
|
|
108
|
-
"
|
|
109
|
-
"
|
|
110
|
-
"
|
|
111
|
-
"
|
|
112
|
-
"
|
|
113
|
-
"
|
|
114
|
-
"
|
|
115
|
-
"
|
|
116
|
-
"
|
|
117
|
-
"
|
|
118
|
-
"
|
|
119
|
-
"
|
|
120
|
-
"
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
"
|
|
130
|
-
"
|
|
131
|
-
"
|
|
132
|
-
"
|
|
133
|
-
"
|
|
134
|
-
"
|
|
135
|
-
"
|
|
136
|
-
"
|
|
137
|
-
"
|
|
138
|
-
"
|
|
139
|
-
"
|
|
140
|
-
"
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
"
|
|
144
|
-
"
|
|
145
|
-
"
|
|
146
|
-
"
|
|
147
|
-
"
|
|
148
|
-
"
|
|
149
|
-
"
|
|
150
|
-
"
|
|
151
|
-
"
|
|
152
|
-
"
|
|
153
|
-
"
|
|
154
|
-
"
|
|
155
|
-
"
|
|
156
|
-
"
|
|
157
|
-
"
|
|
158
|
-
"
|
|
159
|
-
"
|
|
160
|
-
"
|
|
161
|
-
"
|
|
162
|
-
"
|
|
163
|
-
"
|
|
164
|
-
"
|
|
165
|
-
"
|
|
166
|
-
"
|
|
167
|
-
"
|
|
168
|
-
"
|
|
169
|
-
"
|
|
170
|
-
"
|
|
171
|
-
"
|
|
172
|
-
"
|
|
173
|
-
"
|
|
174
|
-
"
|
|
175
|
-
"
|
|
176
|
-
"
|
|
177
|
-
"
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
"
|
|
229
|
-
"
|
|
230
|
-
"
|
|
231
|
-
"
|
|
232
|
-
"
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
"
|
|
242
|
-
"
|
|
243
|
-
"
|
|
244
|
-
"
|
|
245
|
-
"
|
|
246
|
-
"
|
|
247
|
-
"
|
|
248
|
-
"
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
"
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
"
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
"
|
|
260
|
-
"
|
|
261
|
-
"
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
"
|
|
265
|
-
"
|
|
266
|
-
"
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
"
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
"
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
+
|
|
326
|
-
+
|
|
327
|
-
+
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from time import time
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from botocore.exceptions import ClientError
|
|
10
|
+
|
|
11
|
+
from csv_detective.s3_utils import get_s3_client, download_from_minio, upload_to_minio
|
|
12
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_description(format: str) -> str:
    """Return the generic (French) description associated with a format.

    Args:
        format: Name of the detected format (e.g. ``"siren"``).

    Returns:
        The description stored for this format, or an empty string when the
        format has no registered description.
    """
    format_to_desc = {
        "adresse": "Adresse",
        "code_commune_insee": "Le code INSEE de la commune",
        "code_departement": "Le code INSEE du département",
        "code_region": "Le code INSEE de la région",
        "code_fantoir": "Le code FANTOIR de la voie ou du lieu-dit",
        "code_postal": "Le code postal",
        "commune": "Le nom de la commune",
        "departement": "Le nom du département",
        "insee_canton": "Le nom du canton",
        "latitude_l93": "La latitude au format Lambert 93",
        "latitude_wgs_fr_metropole": (
            "La latitude au format WGS. Ne concerne que des latitudes "
            "de la métropole française"
        ),
        "longitude_l93": "La longitude au format Lambert 93",
        "longitude_wgs_fr_metropole": (
            "La longitude au format WGS. Ne concerne que des longitudes "
            "de la métropole française"
        ),
        "pays": "Le nom du pays",
        "region": "Le nom de la région",
        "code_csp_insee": "Le code de Catégorie Socio-professionnel INSEE",
        "code_rna": "Le code RNA de l'association",
        "code_waldec": "Le code WALDEC de l'association",
        "csp_insee": "La catégorie socio-professionnel INSEE",
        # Was "Data au format français": typo for "Date".
        "date_fr": "Date au format français",
        "sexe": "Le sexe",
        "siren": "Le numéro SIREN à 9 chiffres de l'entreprise (unité légale)",
        "siret": "Le numéro SIRET à 14 chiffres de l'établissement d'une entreprise",
        "tel_fr": "Le numéro de téléphone français",
        "uai": "Le numéro UAI (Unité Administrative Immatriculée) de l'établissement scolaire",
        "jour_de_la_semaine": "Le jour de la semaine",
        "mois_de_annee": "Le mois de l'année",
        "latitude_wgs": "La latitude au format WGS",
        "longitude_wgs": "La longitude au format WGS",
        "latlon_wgs": "Les coordonnées XY (latitude et longitude)",
        "booleen": "Booléen",
        # Was "couriel": the French word is spelled "courriel".
        "email": "L'adresse courriel (email)",
        "float": "Nombre flottant (à virgule)",
        "int": "Nombre entier",
        "json": "Chaîne de caractère json",
        "mongo_object_id": "Identifiant de base de donnée Mongo",
        "twitter": "Compte Twitter",
        "url": "Adresse URL",
        "uuid": "Identifiant unique au format UUID",
        "date": "Date",
        "datetime_iso": "Date au format datetime (ISO)",
        "datetime_rfc822": "Date au format datetime (RFC822)",
        "year": "Année",
    }
    return format_to_desc.get(format, "")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_pattern(format: str) -> dict:
    """Return the regex constraint associated with a format, if any.

    Args:
        format: Name of the detected format (e.g. ``"siret"``).

    Returns:
        ``{"pattern": <regex>}`` when a pattern is registered for the format,
        otherwise an empty dict, so the result can be merged directly into a
        constraints mapping.
    """
    format_to_pattern = {
        "siren": r"^\d{9}$",
        "siret": r"^\d{14}$",
        "code_commune_insee": r"^([013-9]\d|2[AB1-9])\d{3}$",
        # Postal codes are purely numeric: Corsican postal codes start with
        # "20" (e.g. 20000), whereas "2A"/"2B" only exist in INSEE commune
        # codes. The previous pattern was a copy of the INSEE one and
        # therefore rejected 20xxx while accepting 2Axxx/2Bxxx.
        "code_postal": r"^\d{5}$",
        "code_departement": r"^(([013-9]\d|2[AB1-9])$|9\d{2}$)",
        "code_region": r"^\d{2}$",
        "code_rna": r"^[wW]\d{9}$",
        "code_waldec": (
            r"^\d{3}\D\d{1,10}$|^\d\D\d\D\d{10}$|^\d{3}\D{3}\d{1,10}$|^\d{3}\D\d{4}\D\d{1,10}"
            r"$|^\d{3}\D\d{2}[-]\d{3}$|^\d\D\d\D\d{2}\D\d{1,8}$"
        ),
        "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
        "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
        "twitter": r"^@[A-Za-z0-9_]+$",
        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
        "uuid": r"^[{]?[0-9a-fA-F]{8}-?([0-9a-fA-F]{4}-?){3}[0-9a-fA-F]{12}[}]?$",
        "url": (
            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
        ),
    }
    if format in format_to_pattern:
        return {"pattern": format_to_pattern[format]}
    return {}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def get_validata_type(format: str) -> str:
    """Map a detected format onto its validata / table schema type.

    Args:
        format: Name of the detected format.

    Returns:
        The matching type name; formats without a dedicated mapping fall
        back to ``"string"``.
    """
    # Every coordinate-like format (and plain floats) is exposed as a number.
    numeric_formats = (
        "float",
        "latitude",
        "latitude_l93",
        "latitude_wgs",
        "latitude_wgs_fr_metropole",
        "longitude",
        "longitude_l93",
        "longitude_wgs",
        "longitude_wgs_fr_metropole",
    )
    type_by_format = {name: "number" for name in numeric_formats}
    type_by_format.update(
        {
            "booleen": "boolean",
            "int": "integer",
            "string": "string",
            "date": "date",
            "datetime_iso": "datetime",
            "datetime_rfc822": "datetime",
            "json_geojson": "geojson",
            "latlon_wgs": "geo_point",
            "year": "year",
        }
    )
    try:
        return type_by_format[format]
    except KeyError:
        return "string"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_example(format: str) -> str:
    """Return an example value for a given format.

    Args:
        format: Name of the detected format.

    Returns:
        The example registered for the format, or an empty string when none
        is registered. Note that, despite the annotation, examples of
        numeric formats (``int``, ``float``, coordinates,
        ``iso_country_code_numeric``) are returned as ``int``/``float``.
    """
    format_to_example = {
        "booleen": "true",
        "int": 42,
        "float": 42.42,
        "string": "Lorem ipsum dolor sit amet",
        "adresse": "28 rue Ledion, 75014 Paris",
        "insee_canton": "Pont-d'Ain",
        "code_commune_insee": "27501",
        "code_csp_insee": "233c",
        "code_departement": "2A",
        "code_fantoir": "A633",
        "code_postal": "75014",
        "code_region": "52",
        "code_rna": "W123456789",
        # TODO: add an example for 'code_waldec'
        "commune": "Joyeux",
        "csp_insee": "anciens agriculteurs exploitants",
        "date": "2020-01-01",
        "date_fr": "12 janvier 2020",
        "datetime_iso": "2020-01-01T00:00:00",
        "datetime_rfc822": "Tue, 1 Jan 2020 00:00:00 +0000",
        "departement": "Ain",
        "email": "example@example.com",
        "insee_ape700": "0130Z",
        "iso_country_code_alpha2": "FR",
        "iso_country_code_alpha3": "FRA",
        "iso_country_code_numeric": 250,
        "jour_de_la_semaine": "lundi",
        "json_geojson": '{"type": "Point", "coordinates": [0, 0]}',
        "latitude": 42.42,
        "latitude_l93": 6037008,
        "latitude_wgs": 42.42,
        "latitude_wgs_fr_metropole": 41.3,
        "latlon_wgs": "42.42, 0.0",
        "longitude": 0.0,
        "longitude_l93": -357823,
        "longitude_wgs": 0.0,
        "longitude_wgs_fr_metropole": 1.2,
        "mois_de_annee": "janvier",
        "mongo_object_id": "507f191e810c19729de860ea",
        "pays": "France",
        "region": "nouvelle aquitaine",
        "sexe": "h",
        "siren": "362521879",
        # A SIRET has 14 digits (9-digit SIREN + 5-digit NIC). The previous
        # example only had 11 digits and did not satisfy the "siret" pattern
        # (^\d{14}$) emitted alongside it in the schema. This value has 14
        # digits and passes the Luhn check.
        "siret": "73282932000074",
        "tel_fr": "+33123456789",
        "twitter": "@Etalab",
        "uai": "0470009E",
        "url": "https://www.data.gouv.fr",
        "uuid": "123e4567-e89b-12d3-a456-426614174000",
        "year": "2020",
    }
    return format_to_example.get(format, "")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_constraints(format: str) -> dict:
    """Return the table schema constraints for a given format.

    Args:
        format: Name of the detected format.

    Returns:
        A dict that always contains ``"required": False``, plus a
        ``"pattern"`` entry when ``get_pattern`` provides one, plus
        ``"minimum"``/``"maximum"`` bounds for the bounded coordinate formats.
    """
    range_constraints = {
        "latitude_l93": {"minimum": 6037008, "maximum": 7230728},
        # The upper bound used to be 7230728, i.e. the *latitude* (northing)
        # maximum copied over. Lambert 93 eastings top out around 1313633
        # (EPSG:2154 projected bounds, matching the other rounded values).
        "longitude_l93": {"minimum": -357823, "maximum": 1313633},
        "latitude_wgs_fr_metropole": {"minimum": 41.3, "maximum": 51.3},
        "longitude_wgs_fr_metropole": {"minimum": -5.5, "maximum": 9.8},
    }
    return {
        "required": False,
        **get_pattern(format),
        **range_constraints.get(format, {}),
    }
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _parse_schema_version(version: str) -> Optional[tuple]:
    """Parse a "major.minor.patch" string into a tuple of ints, or None if malformed."""
    parts = version.split(".")
    if len(parts) != 3:
        return None
    try:
        return tuple(int(part) for part in parts)
    except ValueError:
        return None


def generate_table_schema(
    analysis_report: dict,
    save_file: bool,
    netloc: Optional[str] = None,
    bucket: Optional[str] = None,
    key: Optional[str] = None,
    minio_user: Optional[str] = None,
    minio_pwd: Optional[str] = None,
    verbose: bool = False
) -> Optional[dict]:
    """Generate a table schema from the analysis report.

    Args:
        analysis_report (dict): The analysis report from csv_detective; its
            "columns" entry maps each header to a report holding a "format".
        save_file (bool): Indicates whether the schema should be saved into
            minio or just returned.
        netloc (str): The netloc of the minio instance to upload the tableschema.
        bucket (str): The bucket to save the schema in.
        key (str): The key to save the schema in (without extension, as the
            version number and extension are appended).
        minio_user (str): The minio user.
        minio_pwd (str): The minio password.
        verbose (bool): Whether to log the start of the process and its duration.

    Returns:
        - When ``save_file`` is false: the schema itself (dict).
        - When ``save_file`` is true and a schema was uploaded: a dict with the
          "netloc", "bucket" and "key" of the uploaded object.
        - When ``save_file`` is true and the latest stored schema already has
          the same fields: ``None`` (nothing is uploaded).

    Raises:
        ValueError: If ``save_file`` is true and any of the minio parameters
            is missing.
    """
    if verbose:
        start = time()
        logging.info("Creating table schema")
    fields = [
        {
            "name": header,
            "description": get_description(field_report["format"]),
            "example": get_example(field_report["format"]),
            "type": get_validata_type(field_report["format"]),
            "formatFR": field_report["format"],
            "constraints": get_constraints(field_report["format"])
        }
        for header, field_report in analysis_report["columns"].items()
    ]

    new_version = "0.0.1"
    # Computed once so that "created" and "lastModified" can never disagree.
    today = datetime.today().strftime("%Y-%m-%d")

    schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "name": "",
        "title": "",
        "description": "",
        "countryCode": "FR",
        "homepage": "",
        "path": "https://github.com/etalab/csv-detective",
        "resources": [],
        "sources": [
            {
                "title": "Spécification Tableschema",
                "path": "https://specs.frictionlessdata.io/table-schema"
            },
            {
                "title": "schema.data.gouv.fr",
                "path": "https://schema.data.gouv.fr"
            }
        ],
        "created": today,
        "lastModified": today,
        "version": new_version,
        "contributors": [
            {
                "title": "Table schema bot",
                "email": "schema@data.gouv.fr",
                "organisation": "data.gouv.fr",
                "role": "author",
            },
        ],
        "fields": fields,
        "missingValues": [""],
    }

    if verbose:
        elapsed = time() - start
        display_logs_depending_process_time(f'Created schema in {round(elapsed, 3)}s', elapsed)

    if not save_file:
        return schema

    if not all([netloc, key, bucket, minio_user, minio_pwd]):
        # ValueError is an Exception subclass, so existing handlers still work.
        raise ValueError(
            "To save schema into minio, parameters : netloc, key, bucket, "
            "minio_user, minio_pwd should be provided"
        )

    # Create bucket if does not exist
    client = get_s3_client(netloc, minio_user, minio_pwd)
    try:
        client.head_bucket(Bucket=bucket)
    except ClientError:
        client.create_bucket(Bucket=bucket)

    # List existing schemas once and index them by their *numeric* version:
    # comparing the raw strings would rank "0.0.9" above "0.0.10".
    tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
    existing_versions = {}
    for tableschema in tableschema_objects.get("Contents", []):
        raw_version = os.path.splitext(tableschema["Key"])[0].split("_")[-1]
        parsed_version = _parse_schema_version(raw_version)
        # Objects whose suffix is not a "x.y.z" version are ignored.
        if parsed_version is not None:
            existing_versions[parsed_version] = raw_version

    if existing_versions:
        latest_parsed = max(existing_versions)
        latest_version = existing_versions[latest_parsed]

        with tempfile.NamedTemporaryFile() as latest_schema_file:
            # presumably download_from_minio writes the object to the given
            # local path — TODO confirm against s3_utils
            download_from_minio(
                netloc,
                bucket,
                f"{key}_{latest_version}.json",
                latest_schema_file.name,
                minio_user,
                minio_pwd,
            )
            with open(latest_schema_file.name, "r") as fp:
                latest_schema = json.load(fp)

        # Only publish a new version when the fields actually changed.
        if latest_schema["fields"] == fields:
            return None
        major, minor, patch = latest_parsed
        new_version = f"{major}.{minor}.{patch + 1}"

    schema["version"] = new_version
    new_version_key = f"{key}_{new_version}.json"

    tableschema_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
    try:
        # Leaving the "with" block closes (and flushes) the handle before upload.
        with tableschema_file as fp:
            json.dump(schema, fp, indent=4)
        upload_to_minio(
            netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
        )
    finally:
        # Remove the temporary file even if dumping or uploading failed.
        os.unlink(tableschema_file.name)
    return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
|