csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
|
@@ -1,343 +1,268 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
"
|
|
26
|
-
"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
"
|
|
37
|
-
"
|
|
38
|
-
"
|
|
39
|
-
"
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
52
|
-
"
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
"
|
|
80
|
-
"
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
),
|
|
84
|
-
"
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
"
|
|
104
|
-
"
|
|
105
|
-
"
|
|
106
|
-
"
|
|
107
|
-
"
|
|
108
|
-
"
|
|
109
|
-
"
|
|
110
|
-
"
|
|
111
|
-
"
|
|
112
|
-
"
|
|
113
|
-
"
|
|
114
|
-
"
|
|
115
|
-
"
|
|
116
|
-
"
|
|
117
|
-
"
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
"
|
|
129
|
-
"
|
|
130
|
-
"
|
|
131
|
-
"
|
|
132
|
-
"
|
|
133
|
-
"
|
|
134
|
-
"
|
|
135
|
-
"
|
|
136
|
-
"
|
|
137
|
-
"
|
|
138
|
-
|
|
139
|
-
"
|
|
140
|
-
"
|
|
141
|
-
|
|
142
|
-
"
|
|
143
|
-
"
|
|
144
|
-
"
|
|
145
|
-
"
|
|
146
|
-
"
|
|
147
|
-
"
|
|
148
|
-
"
|
|
149
|
-
"
|
|
150
|
-
"
|
|
151
|
-
"
|
|
152
|
-
"
|
|
153
|
-
"
|
|
154
|
-
"
|
|
155
|
-
"
|
|
156
|
-
"
|
|
157
|
-
"
|
|
158
|
-
"
|
|
159
|
-
"
|
|
160
|
-
"
|
|
161
|
-
"
|
|
162
|
-
"
|
|
163
|
-
"
|
|
164
|
-
"
|
|
165
|
-
"
|
|
166
|
-
"
|
|
167
|
-
"
|
|
168
|
-
"
|
|
169
|
-
"
|
|
170
|
-
"
|
|
171
|
-
"
|
|
172
|
-
"
|
|
173
|
-
"
|
|
174
|
-
"
|
|
175
|
-
"
|
|
176
|
-
"
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
"
|
|
244
|
-
"
|
|
245
|
-
"
|
|
246
|
-
"
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
"
|
|
251
|
-
"
|
|
252
|
-
},
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
"fields": fields,
|
|
270
|
-
"missingValues": [""],
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
if verbose:
|
|
274
|
-
display_logs_depending_process_time(f'Created schema in {round(time() - start, 3)}s', time() - start)
|
|
275
|
-
|
|
276
|
-
if not save_file:
|
|
277
|
-
return schema
|
|
278
|
-
|
|
279
|
-
if save_file:
|
|
280
|
-
if not all([netloc, key, bucket, minio_user, minio_pwd]):
|
|
281
|
-
raise Exception(
|
|
282
|
-
"To save schema into minio, parameters : netloc, key, bucket, "
|
|
283
|
-
"minio_user, minio_pwd should be provided"
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
# Create bucket if does not exist
|
|
287
|
-
client = get_s3_client(netloc, minio_user, minio_pwd)
|
|
288
|
-
try:
|
|
289
|
-
client.head_bucket(Bucket=bucket)
|
|
290
|
-
except ClientError:
|
|
291
|
-
client.create_bucket(Bucket=bucket)
|
|
292
|
-
|
|
293
|
-
tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
|
|
294
|
-
if "Contents" in tableschema_objects:
|
|
295
|
-
tableschema_keys = [
|
|
296
|
-
tableschema["Key"]
|
|
297
|
-
for tableschema in client.list_objects(
|
|
298
|
-
Bucket=bucket, Prefix=key, Delimiter="/"
|
|
299
|
-
)["Contents"]
|
|
300
|
-
]
|
|
301
|
-
tableschema_versions = [
|
|
302
|
-
os.path.splitext(tableschema_key)[0].split("_")[-1]
|
|
303
|
-
for tableschema_key in tableschema_keys
|
|
304
|
-
]
|
|
305
|
-
latest_version = max(tableschema_versions)
|
|
306
|
-
|
|
307
|
-
with tempfile.NamedTemporaryFile() as latest_schema_file:
|
|
308
|
-
with open(latest_schema_file.name, "w") as fp:
|
|
309
|
-
download_from_minio(
|
|
310
|
-
netloc,
|
|
311
|
-
bucket,
|
|
312
|
-
f"{key}_{latest_version}.json",
|
|
313
|
-
latest_schema_file.name,
|
|
314
|
-
minio_user,
|
|
315
|
-
minio_pwd,
|
|
316
|
-
)
|
|
317
|
-
# Check if files are different
|
|
318
|
-
with open(latest_schema_file.name, "r") as fp:
|
|
319
|
-
latest_schema = json.load(fp)
|
|
320
|
-
if latest_schema["fields"] != fields:
|
|
321
|
-
latest_version_split = latest_version.split(".")
|
|
322
|
-
new_version = (
|
|
323
|
-
latest_version_split[0]
|
|
324
|
-
+ "."
|
|
325
|
-
+ latest_version_split[1]
|
|
326
|
-
+ "."
|
|
327
|
-
+ str(int(latest_version_split[2]) + 1)
|
|
328
|
-
)
|
|
329
|
-
else:
|
|
330
|
-
return None
|
|
331
|
-
|
|
332
|
-
schema["version"] = new_version
|
|
333
|
-
|
|
334
|
-
tableschema_file = tempfile.NamedTemporaryFile(delete=False)
|
|
335
|
-
with open(tableschema_file.name, "w") as fp:
|
|
336
|
-
json.dump(schema, fp, indent=4)
|
|
337
|
-
|
|
338
|
-
new_version_key = f"{key}_{new_version}.json"
|
|
339
|
-
upload_to_minio(
|
|
340
|
-
netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
|
|
341
|
-
)
|
|
342
|
-
os.unlink(tableschema_file.name)
|
|
343
|
-
return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from time import time
|
|
5
|
+
|
|
6
|
+
from csv_detective.utils import display_logs_depending_process_time
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_description(format: str) -> str:
    """Return the generic (French) description associated with a format.

    Args:
        format: Name of the detected format (e.g. ``"siren"``).

    Returns:
        The description registered for ``format``, or an empty string when
        the format has no entry in the table below.
    """
    # NOTE(review): the package ships formats/mois_de_lannee.py and
    # formats/username.py but no twitter module; the "mois_de_annee" and
    # "twitter" keys may be stale format names - confirm against the
    # format registry.
    format_to_desc = {
        "adresse": "Adresse",
        "code_commune_insee": "Le code INSEE de la commune",
        "code_departement": "Le code INSEE du département",
        "code_region": "Le code INSEE de la région",
        "code_fantoir": "Le code FANTOIR de la voie ou du lieu-dit",
        "code_postal": "Le code postal",
        "commune": "Le nom de la commune",
        "departement": "Le nom du département",
        "insee_canton": "Le nom du canton",
        "latitude_l93": "La latitude au format Lambert 93",
        "latitude_wgs_fr_metropole": (
            "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
        ),
        "longitude_l93": "La longitude au format Lambert 93",
        "longitude_wgs_fr_metropole": (
            "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
        ),
        "pays": "Le nom du pays",
        "region": "Le nom de la région",
        "code_csp_insee": "Le code de Catégorie Socio-professionnel INSEE",
        "code_rna": "Le code RNA de l'association",
        "code_waldec": "Le code WALDEC de l'association",
        "csp_insee": "La catégorie socio-professionnel INSEE",
        # Fixed typo: was "Data au format français".
        "date_fr": "Date au format français",
        "sexe": "Le sexe",
        "siren": "Le numéro SIREN à 9 chiffres de l'entreprise (unité légale)",
        "siret": "Le numéro SIRET à 14 chiffres de l'établissement d'une entreprise",
        "tel_fr": "Le numéro de téléphone français",
        "uai": "Le numéro UAI (Unité Administrative Immatriculée) de l'établissement scolaire",
        "jour_de_la_semaine": "Le jour de la semaine",
        "mois_de_annee": "Le mois de l'année",
        "latitude_wgs": "La latitude au format WGS",
        "longitude_wgs": "La longitude au format WGS",
        "latlon_wgs": "Les coordonnées XY (latitude et longitude)",
        "lonlat_wgs": "Les coordonnées XY (longitude et latitude)",
        "booleen": "Booléen",
        # Fixed typo: was "couriel".
        "email": "L'adresse courriel (email)",
        "float": "Nombre flottant (à virgule)",
        "int": "Nombre entier",
        "json": "Chaîne de caractère json",
        "mongo_object_id": "Identifiant de base de donnée Mongo",
        "twitter": "Compte Twitter",
        "url": "Adresse URL",
        "uuid": "Identifiant unique au format UUID",
        "date": "Date",
        "datetime_aware": "Date au format datetime avec fuseau horaire",
        "datetime_naive": "Date au format datetime sans fuseau horaire",
        "datetime_rfc822": "Date au format datetime (RFC822)",
        "year": "Année",
    }
    return format_to_desc.get(format, "")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_pattern(format: str) -> dict:
    """Return the regex constraint associated with a format, if any.

    Args:
        format: Name of the detected format (e.g. ``"siren"``).

    Returns:
        ``{"pattern": <regex>}`` when a pattern is registered for ``format``,
        otherwise an empty dict, so the result can be unpacked directly into
        a constraints mapping.
    """
    format_to_pattern = {
        "siren": r"^\d{9}$",
        "siret": r"^\d{14}$",
        "code_commune_insee": r"^([013-9]\d|2[AB1-9])\d{3}$",
        # Postal codes are five digits. The previous pattern was a copy of
        # the INSEE commune one: it rejected Corsican postal codes (20xxx)
        # and accepted "2A"/"2B" prefixes, which only exist in INSEE codes.
        "code_postal": r"^\d{5}$",
        "code_departement": r"^(([013-9]\d|2[AB1-9])$|9\d{2}$)",
        "code_region": r"^\d{2}$",
        "code_rna": r"^[wW]\d{9}$",
        "code_waldec": (
            r"^\d{3}\D\d{1,10}$|^\d\D\d\D\d{10}$|^\d{3}\D{3}\d{1,10}$|^\d{3}\D\d{4}\D\d{1,10}"
            r"$|^\d{3}\D\d{2}[-]\d{3}$|^\d\D\d\D\d{2}\D\d{1,8}$"
        ),
        "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
        "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
        "twitter": r"^@[A-Za-z0-9_]+$",
        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
        # Same value as before, written as one literal instead of three
        # concatenated fragments.
        "uuid": r"^[{]?[0-9a-fA-F]{8}-?([0-9a-fA-F]{4}-?){3}[0-9a-fA-F]{12}[}]?$",
        "url": (
            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
        ),
    }
    pattern = format_to_pattern.get(format)
    if pattern is None:
        return {}
    return {"pattern": pattern}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def get_validata_type(format: str) -> str:
    """Return the Table Schema (Validata) field type for a given format.

    Args:
        format: Name of the detected format (e.g. ``"int"``).

    Returns:
        The field type registered for ``format``; ``"string"`` for any
        format without an entry.
    """
    metier_to_validata_type = {
        "booleen": "boolean",
        "int": "integer",
        "float": "number",
        "string": "string",
        "date": "date",
        "datetime_aware": "datetime",
        "datetime_naive": "datetime",
        "datetime_rfc822": "datetime",
        "geojson": "geojson",
        "latitude": "number",
        "latitude_l93": "number",
        "latitude_wgs": "number",
        "latitude_wgs_fr_metropole": "number",
        # The Table Schema type is spelled "geopoint"; the previous
        # "geo_point" is not a type defined by the specification.
        # NOTE(review): the default geopoint format is "lon, lat", so
        # latlon_wgs values may need an explicit ordering - confirm.
        "latlon_wgs": "geopoint",
        "lonlat_wgs": "geopoint",
        "longitude": "number",
        "longitude_l93": "number",
        "longitude_wgs": "number",
        "longitude_wgs_fr_metropole": "number",
        "year": "year",
    }
    return metier_to_validata_type.get(format, "string")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_example(format: str) -> str:
|
|
123
|
+
"""Returns the example for a given format"""
|
|
124
|
+
format_to_example = {
|
|
125
|
+
"booleen": "true",
|
|
126
|
+
"int": 42,
|
|
127
|
+
"float": 42.42,
|
|
128
|
+
"string": "Lorem ipsum dolor sit amet",
|
|
129
|
+
"adresse": "28 rue Ledion, 75014 Paris",
|
|
130
|
+
"insee_canton": "Pont-d'Ain",
|
|
131
|
+
"code_commune_insee": "27501",
|
|
132
|
+
"code_csp_insee": "233c",
|
|
133
|
+
"code_departement": "2A",
|
|
134
|
+
"code_fantoir": "A633",
|
|
135
|
+
"code_postal": "75014",
|
|
136
|
+
"code_region": "52",
|
|
137
|
+
"code_rna": "W123456789",
|
|
138
|
+
# 'code_waldec': TODO: add code_waldec
|
|
139
|
+
"commune": "Joyeux",
|
|
140
|
+
"csp_insee": "anciens agriculteurs exploitants",
|
|
141
|
+
"date": "2020-01-01",
|
|
142
|
+
"date_fr": "12 janvier 2020",
|
|
143
|
+
"datetime_aware": "2020-01-01T00:00:00+02:00",
|
|
144
|
+
"datetime_naive": "2020-01-01T00:00:00",
|
|
145
|
+
"datetime_rfc822": "Tue, 1 Jan 2020 00:00:00 +0000",
|
|
146
|
+
"departement": "Ain",
|
|
147
|
+
"email": "example@example.com",
|
|
148
|
+
"insee_ape700": "0130Z",
|
|
149
|
+
"iso_country_code_alpha2": "FR",
|
|
150
|
+
"iso_country_code_alpha3": "FRA",
|
|
151
|
+
"iso_country_code_numeric": 250,
|
|
152
|
+
"jour_de_la_semaine": "lundi",
|
|
153
|
+
"geojson": '{"type": "Point", "coordinates": [0, 0]}',
|
|
154
|
+
"latitude": 42.42,
|
|
155
|
+
"latitude_l93": 6037008,
|
|
156
|
+
"latitude_wgs": 42.42,
|
|
157
|
+
"latitude_wgs_fr_metropole": 41.3,
|
|
158
|
+
"latlon_wgs": "42.42, 0.0",
|
|
159
|
+
"lonlat_wgs": "0.0, 42.42",
|
|
160
|
+
"longitude": 0.0,
|
|
161
|
+
"longitude_l93": -357823,
|
|
162
|
+
"longitude_wgs": 0.0,
|
|
163
|
+
"longitude_wgs_fr_metropole": 1.2,
|
|
164
|
+
"mois_de_annee": "janvier",
|
|
165
|
+
"mongo_object_id": "507f191e810c19729de860ea",
|
|
166
|
+
"pays": "France",
|
|
167
|
+
"region": "nouvelle aquitaine",
|
|
168
|
+
"sexe": "h",
|
|
169
|
+
"siren": "362521879",
|
|
170
|
+
"siret": "56894100056",
|
|
171
|
+
"tel_fr": "+33123456789",
|
|
172
|
+
"twitter": "@Etalab",
|
|
173
|
+
"uai": "0470009E",
|
|
174
|
+
"url": "https://www.data.gouv.fr",
|
|
175
|
+
"uuid": "123e4567-e89b-12d3-a456-426614174000",
|
|
176
|
+
"year": "2020",
|
|
177
|
+
}
|
|
178
|
+
return format_to_example.get(format, "")
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def get_constraints(format: str) -> dict:
    """Return the Table Schema constraints for a given format.

    Args:
        format: Name of the detected format (e.g. ``"latitude_l93"``).

    Returns:
        A dict that always holds ``"required": False``, plus a ``"pattern"``
        entry when ``get_pattern`` knows one for ``format``, plus
        ``"minimum"``/``"maximum"`` bounds for the coordinate formats
        listed below.
    """
    range_constraints = {
        "latitude_l93": {"minimum": 6037008, "maximum": 7230728},
        # The previous maximum (7230728) was a copy of the latitude_l93
        # upper bound; Lambert-93 eastings over France stay far below it.
        "longitude_l93": {"minimum": -357823, "maximum": 1313633},
        "latitude_wgs_fr_metropole": {"minimum": 41.3, "maximum": 51.3},
        "longitude_wgs_fr_metropole": {"minimum": -5.5, "maximum": 9.8},
    }
    return {
        "required": False,
        **get_pattern(format),
        **range_constraints.get(format, {}),
    }
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def generate_table_schema(
    analysis_report: dict,
    save_results: bool | str = True,
    verbose: bool = False,
) -> dict:
    """Generate a Table Schema from a csv_detective analysis report.

    Args:
        analysis_report: The analysis report from csv_detective. Its
            ``"columns"`` entry must map each header to a dict holding a
            ``"format"`` key.
        save_results: Falsy to skip writing to disk; ``True`` to write the
            schema to ``schema.json`` (relative path); a string to write it
            to that path instead.
        verbose: Whether to log the start of the generation and the time
            it took.

    Returns:
        The generated schema, whether or not it was written to disk.

    Raises:
        KeyError: If ``analysis_report`` has no ``"columns"`` entry or a
            column report has no ``"format"`` entry.
    """
    if verbose:
        start = time()
        logging.info("Creating table schema")

    fields = []
    for header, field_report in analysis_report["columns"].items():
        detected_format = field_report["format"]
        fields.append(
            {
                "name": header,
                "description": get_description(detected_format),
                "example": get_example(detected_format),
                "type": get_validata_type(detected_format),
                "formatFR": detected_format,
                "constraints": get_constraints(detected_format),
            }
        )

    new_version = "0.0.1"
    # Computed once so "created" and "lastModified" cannot disagree when the
    # call straddles midnight.
    today = datetime.today().strftime("%Y-%m-%d")

    schema = {
        "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
        "name": "",
        "title": "",
        "description": "",
        "countryCode": "FR",
        "homepage": "",
        "path": "https://github.com/etalab/csv-detective",
        "resources": [],
        "sources": [
            {
                "title": "Spécification Tableschema",
                "path": "https://specs.frictionlessdata.io/table-schema",
            },
            {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
        ],
        "created": today,
        "lastModified": today,
        "version": new_version,
        "contributors": [
            {
                "title": "Table schema bot",
                "email": "schema@data.gouv.fr",
                "organisation": "data.gouv.fr",
                "role": "author",
            },
        ],
        "fields": fields,
        "missingValues": [""],
    }

    if verbose:
        # Measure once so the logged message and the reported duration match.
        elapsed = time() - start
        display_logs_depending_process_time(f"Created schema in {round(elapsed, 3)}s", elapsed)

    if save_results:
        output_path = save_results if isinstance(save_results, str) else "schema.json"
        with open(output_path, "w", encoding="utf8") as fp:
            json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)

    return schema
|