csv-detective 0.9.3.dev2140__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +2 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -0
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -0
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +22 -29
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +15 -7
- csv_detective/detection/engine.py +1 -1
- csv_detective/detection/formats.py +42 -95
- csv_detective/detection/variables.py +2 -2
- csv_detective/explore_csv.py +5 -7
- csv_detective/load_tests.py +11 -4
- csv_detective/output/__init__.py +8 -4
- csv_detective/output/dataframe.py +37 -0
- csv_detective/output/example.py +3 -1
- csv_detective/output/profile.py +59 -19
- csv_detective/parsing/columns.py +133 -35
- csv_detective/parsing/csv.py +26 -23
- csv_detective/parsing/load.py +21 -8
- csv_detective/validate.py +86 -40
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA +45 -29
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD +28 -28
- tests/test_fields.py +9 -13
- tests/test_file.py +64 -36
- tests/test_structure.py +4 -1
- tests/test_validation.py +9 -4
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt +0 -0
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev2140
+Version: 0.9.3.dev2232
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
@@ -22,16 +22,16 @@ Requires-Dist: python-magic==0.4.27
 Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr==3.2.2
+Requires-Dist: more-itertools>=10.8.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: bumpx>=0.3.10; extra == "dev"
 Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file
 
 # CSV Detective
 
-This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types.
+This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types.
 
 Currently supported file types: csv, xls, xlsx, ods.
 
@@ -51,7 +51,7 @@ pip install csv-detective
 
 Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:
 
-```
+```python
 # Import the csv_detective package
 from csv_detective import routine
 import os # for this example only
@@ -159,13 +159,26 @@ The program creates a `Python` dictionnary with the following information :
 ```
 
 The output slightly differs depending on the file format:
-- csv files have `encoding` and `separator`
+- csv files have `encoding` and `separator` (and `compression` if relevant)
 - xls, xls, ods files have `engine` and `sheet_name`
 
+You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
+- the analysis (as described above)
+- an iteror of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
+```python
+inspection, df_chunks = routine(
+    file_path=file_path,
+    num_rows=-1,
+    output_df=True,
+)
+cast_df = pd.concat(df_chunks, ignore_index=True)
+# if "col1" has been detected as a float, then cast_df["col1"] contains floats
+```
+
 ### What Formats Can Be Detected
 
 Includes :
-
+- types (float, int, dates, datetimes, JSON) and more specific (latitude, longitude, geoJSON...)
 - Communes, Départements, Régions, Pays
 - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
 - Codes CSP, Description CSP, SIREN
@@ -173,6 +186,16 @@ Includes :
 - Years, Dates, Jours de la Semaine FR
 - UUIDs, Mongo ObjectIds
 
+### Validation
+If you have a pre-made analysis of a file, you can check whether an other file conforms to the same analysis:
+```python
+from csv_detective import validate
+is_valid, *_ = validate(
+    file_path,
+    previous_analysis, # exactly as it came out from the routine function
+)
+```
+
 ### Format detection and scoring
 For each column, 3 scores are computed for each format, the higher the score, the more likely the format:
 - the field score based on the values contained in the column (0.0 to 1.0).
@@ -200,7 +223,6 @@ Only the format with highest score is present in the output.
 Related ideas:
 
 - store column names to make a learning model based on column names for (possible pre-screen)
-- normalising data based on column prediction
 - entity resolution (good luck...)
 
 ## Why Could This Be of Any Use ?
@@ -220,32 +242,26 @@ ruff check --fix .
 ruff format .
 ```
 
-
-
-The release process uses `bumpx`.
-
-```shell
-pip install -e .[dev]
-```
+### 🏷️ Release
 
-
+The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
 
-
-
-
-4. CircleCI will pickup this tag, build the package and publish it to pypi
-5. `bumpx` will have everything ready for the next version (version, changelog...)
+```bash
+# Create a new release
+./tag_version.sh <version>
 
-
+# Example
+./tag_version.sh 2.5.0
 
-
-
+# Dry run to see what would happen
+./tag_version.sh 2.5.0 --dry-run
 ```
 
-
-
-This will release a patch version:
+**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
 
-
-
-
+The script automatically:
+- Updates the version in pyproject.toml
+- Extracts commits since the last tag and formats them for CHANGELOG.md
+- Identifies breaking changes (commits with `!:` in the subject)
+- Creates a git tag and pushes it to the remote repository
+- Creates a GitHub release with the changelog content
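The README hunk above demonstrates `output_df=True` consumed with `pd.concat`; the same chunks can also be processed one at a time, which keeps memory flat on large files. A minimal sketch of that loop variant, assuming only the `routine` arguments shown in the hunk (`data.csv` is a placeholder path):

```python
from csv_detective import routine

# with output_df=True, routine returns the analysis plus an iterator of DataFrame chunks
inspection, df_chunks = routine(
    file_path="data.csv",  # hypothetical file standing in for `file_path`
    num_rows=-1,           # analyse the whole file
    output_df=True,
)

total_rows = 0
for chunk in df_chunks:
    # each chunk is a pd.DataFrame whose columns are cast to the detected types
    total_rows += len(chunk)
```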
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/load_tests.py,sha256=
+csv_detective/explore_csv.py,sha256=kuLkORQarelG13swoi0dH4cERu8BoRtRvyQ2SsYYhCY,5653
+csv_detective/load_tests.py,sha256=VzHJq1Q22C666nad17ciPRtcQEonP40YmSERn9zylvQ,2399
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=
+csv_detective/validate.py,sha256=CNTYu_rOiv-Z8iWqCI_Ac_LXvbneRSukiu7NxB9Rcuo,5187
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -97,8 +97,8 @@ csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=AI9nqj3zm6_vyc
 csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=4Crk045ZD_tVovI7C-IqjKFz23Ej5-hrFkhZK4OilqA,258
 csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=N7LzmtNwZERgrwMy3EFHaVBpdiwkt2_9Tt7XVJLff6U,406
 csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=ZWhc8S9L1X2fFh2g5Ja-LuhsfHg_lALKrur6yDnGDPk,238
-csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=
-csv_detective/detect_labels/FR/other/siret/__init__.py,sha256
+csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=cGzc9HPzbWlffkzJgwujUqupLi1Pkm0HWBLZv-_c4to,402
+csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=Av8IsLre6pRnPj-AHtqaU-1C_TMCxgDYAbTGIW0XIdU,339
 csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=4jIZ9cmN73XhP4ayGcEMcB_y0X45oRk1Lq2p_pNfgok,426
 csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=5L6JowK9y6y9uZNg6hWzknMSzh0SurkwQeTINNKTdYY,599
 csv_detective/detect_labels/FR/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -110,9 +110,9 @@ csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP
 csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
 csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
 csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
-csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=
+csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=CegBNN-RR1k-I0OU7ZsdlpVI5UBYDcj5QDX9KaWay-w,701
 csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
-csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=
+csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=ZmBLiCyboJzpsbXa5fsTxvAbO0W-ukRXnRWemN-Z-wc,481
 csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_labels/other/booleen/__init__.py,sha256=zEkarex7L4T3vmYjR5hdhtnhugTVDsvkgG_it6nN0aA,214
 csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
@@ -130,37 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=
-csv_detective/detection/formats.py,sha256=
+csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
+csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
 csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
-csv_detective/detection/variables.py,sha256
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
-csv_detective/output/example.py,sha256=
-csv_detective/output/profile.py,sha256=
+csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
+csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
+csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
+csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
+csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
 csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=
+csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4I30,9838
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=
+csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
 csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
-csv_detective/parsing/load.py,sha256
+csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.3.
+csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
+tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
-tests/test_structure.py,sha256=
-tests/test_validation.py,sha256=
+tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
+tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
 venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
+csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
+csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev2232.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -84,13 +84,13 @@ from csv_detective.parsing.columns import test_col as col_test # to prevent pyt
 
 def test_all_tests_return_bool():
     all_tests = return_all_tests("ALL", "detect_fields")
-    for
+    for attr in all_tests.values():
         for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
-            assert isinstance(
+            assert isinstance(attr["func"](tmp), bool)
 
 
 # categorical
-def test_detetect_categorical_variable():
+def test_detect_categorical_variable():
     categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
     categorical_col2 = [str(k // 20) for k in range(100)]
     not_categorical_col = [i for i in range(100)]
@@ -103,7 +103,7 @@ def test_detetect_categorical_variable():
     df = pd.DataFrame(df_dict, dtype=str)
 
     res, _ = detect_categorical_variable(df)
-    assert len(res
+    assert len(res) and all(k in res for k in ["cat", "cat2"])
 
 
 # continuous
@@ -394,8 +394,8 @@ fields = {
 
 def test_all_fields_have_tests():
     all_tests = return_all_tests("ALL", "detect_fields")
-    for
-        assert fields.get(
+    for attr in all_tests.values():
+        assert fields.get(attr["module"])
 
 
 @pytest.mark.parametrize(
@@ -475,13 +475,9 @@ def test_early_detection(args):
 def test_all_proportion_1():
     all_tests = return_all_tests("ALL", "detect_fields")
     prop_1 = {
-
-
-
-            else "test_" + t.__name__.split(".")[-1]
-        )
-        for t in all_tests
-        if t.PROPORTION == 1
+        name: eval(name if name not in ["int", "float"] else "test_" + name)
+        for name, attr in all_tests.items()
+        if attr["prop"] == 1
     }
     # building a table that uses only correct values for these formats, except on one row
     table = pd.DataFrame(
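The recurring edit in `tests/test_fields.py` is the new shape of `return_all_tests`: the tests now iterate a dict keyed by format name whose values carry `func`, `module` and `prop` entries, instead of raw test modules with a `PROPORTION` attribute. A hedged sketch of that consumption pattern, inferred only from the assertions above (the import path is an assumption based on `csv_detective/load_tests.py` in the RECORD):

```python
# assumed import path (csv_detective/load_tests.py appears in the RECORD above)
from csv_detective.load_tests import return_all_tests

all_tests = return_all_tests("ALL", "detect_fields")
for name, attr in all_tests.items():
    assert callable(attr["func"])  # the detection function for this format
    module_name = attr["module"].__name__.split(".")[-1]
    # "prop" appears to replace the old PROPORTION attribute (1 = every value must match)
    print(name, module_name, attr["prop"])
```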
tests/test_file.py
CHANGED
@@ -1,4 +1,4 @@
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pandas as pd
 import pytest
@@ -6,15 +6,19 @@ import responses
 
 from csv_detective import routine
 from csv_detective.output.profile import create_profile
-from csv_detective.parsing.
+from csv_detective.parsing.csv import CHUNK_SIZE
 
 
 @pytest.mark.parametrize(
-    "
-    (100, int(1e5)),
+    "chunk_size",
+    (100, 404, int(1e5)),
 )
-def test_columns_output_on_file(
-    with
+def test_columns_output_on_file(chunk_size):
+    with (
+        # maybe we should refactor later to avoid having to patch everywhere
+        patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
+        patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
+    ):
         output = routine(
             file_path="tests/data/a_test_file.csv",
             num_rows=-1,
@@ -248,17 +252,23 @@ def mocked_responses():
 def test_urls(mocked_responses, params):
     file_name, checks = params
     url = f"http://example.com/{file_name}"
+    expected_content = open(f"tests/data/{file_name}", "rb").read()
     mocked_responses.get(
         url,
-        body=
+        body=expected_content,
         status=200,
     )
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        _ = routine(
+            file_path=url,
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
     for k, v in checks.items():
         if v is None:
             assert not _.get(k)
@@ -289,13 +299,14 @@ def test_nan_values(expected_type):
 
 
 def test_output_df():
-    output,
+    output, df_chunks = routine(
         file_path="tests/data/b_test_file.csv",
         num_rows=-1,
         output_profile=False,
         save_results=False,
         output_df=True,
     )
+    df = pd.concat(df_chunks, ignore_index=True)
     assert isinstance(output, dict)
     assert isinstance(df, pd.DataFrame)
     assert len(df) == 6
@@ -317,14 +328,20 @@ def test_cast_json(mocked_responses, cast_json):
         body=expected_content,
         status=200,
     )
-
-
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        analysis, df_chunks = routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+            output_df=True,
+            cast_json=cast_json,
+        )
+    df = pd.concat(df_chunks, ignore_index=True)
     assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
     assert isinstance(df["a_simple_dict"][0], expected_type)
 
@@ -337,27 +354,38 @@ def test_almost_uniform_column(mocked_responses):
         body=expected_content,
         status=200,
     )
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        analysis = routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
     assert analysis["columns"][col_name]["format"] == "int"
 
 
 def test_full_nan_column(mocked_responses):
     # we want a file that needs sampling
-    expected_content = "only_nan,second_col\n" + ",1\n" * (
+    expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
     mocked_responses.get(
         "http://example.com/test.csv",
        body=expected_content,
         status=200,
     )
-
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        # Create a mock HTTP response object
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        # just testing it doesn't fail
+        routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
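Four tests above (`test_urls`, `test_cast_json`, `test_almost_uniform_column`, `test_full_nan_column`) now wire up the same `urllib.request.urlopen` mock alongside the `responses` fixture. The pattern, extracted as a standalone sketch with a placeholder URL and body: the detail that matters is `__enter__` returning the mock itself, because `urlopen` is consumed as a context manager.

```python
import urllib.request
from unittest.mock import MagicMock, patch

expected_content = b"col_a,col_b\n1,2\n"  # placeholder body

with patch("urllib.request.urlopen") as mock_urlopen:
    mock_response = MagicMock()
    mock_response.read.return_value = expected_content
    # urlopen is used as a context manager, so __enter__ must hand back
    # the response object itself for `with urlopen(...) as resp:` to work
    mock_response.__enter__.return_value = mock_response
    mock_urlopen.return_value = mock_response

    with urllib.request.urlopen("http://example.com/test.csv") as resp:
        assert resp.read() == expected_content
```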
tests/test_structure.py
CHANGED
@@ -34,5 +34,8 @@ def tests_conformity():
 
 
 def test_all_tests_have_unique_name():
-    names = [
+    names = [
+        attr["module"].__name__.split(".")[-1]
+        for attr in return_all_tests("ALL", "detect_fields").values()
+    ]
     assert len(names) == len(set(names))
tests/test_validation.py
CHANGED
@@ -49,12 +49,9 @@ def test_validation(_params):
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, table, analysis = validate(
+    is_valid, table, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
-        num_rows=-1,
-        sep=previous_analysis.get("separator"),
-        encoding=previous_analysis.get("encoding"),
     )
     assert is_valid == should_be_valid
     if table_type is None:
@@ -65,6 +62,14 @@ def test_validation(_params):
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
+    if should_be_valid:
+        assert isinstance(col_values, dict)
+        assert all(
+            col in table.columns and isinstance(values, pd.Series)
+            for col, values in col_values.items()
+        )
+    else:
+        assert col_values is None
 
 
 @pytest.mark.parametrize(
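The hunk above captures the new `validate` contract: the explicit `num_rows`/`sep`/`encoding` arguments are gone (presumably now read from the analysis itself), and a fourth return value, `col_values`, holds a dict of `pd.Series` keyed by column name when the file is valid, `None` otherwise. A minimal sketch of the new call shape, assuming only what the test and the README hunk assert (file names are placeholders):

```python
from csv_detective import routine, validate

# analyse a reference file once, then check another file against that analysis
previous_analysis = routine(
    file_path="reference.csv",  # hypothetical file
    num_rows=-1,
    output_profile=False,
    save_results=False,
)
is_valid, table, analysis, col_values = validate(
    "new_file.csv",  # hypothetical file
    previous_analysis=previous_analysis,
)
if is_valid:
    for col, values in col_values.items():
        print(col, len(values))  # values is a pd.Series of the cast column
else:
    print(col_values)  # None when validation fails
```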
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt
RENAMED
File without changes
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE
RENAMED
File without changes
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt
RENAMED
File without changes