csv-detective 0.9.3.dev2140__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. csv_detective/__init__.py +2 -1
  2. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -0
  3. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -0
  4. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +22 -29
  5. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +15 -7
  6. csv_detective/detection/engine.py +1 -1
  7. csv_detective/detection/formats.py +42 -95
  8. csv_detective/detection/variables.py +2 -2
  9. csv_detective/explore_csv.py +5 -7
  10. csv_detective/load_tests.py +11 -4
  11. csv_detective/output/__init__.py +8 -4
  12. csv_detective/output/dataframe.py +37 -0
  13. csv_detective/output/example.py +3 -1
  14. csv_detective/output/profile.py +59 -19
  15. csv_detective/parsing/columns.py +133 -35
  16. csv_detective/parsing/csv.py +26 -23
  17. csv_detective/parsing/load.py +21 -8
  18. csv_detective/validate.py +86 -40
  19. {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA +45 -29
  20. {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD +28 -28
  21. tests/test_fields.py +9 -13
  22. tests/test_file.py +64 -36
  23. tests/test_structure.py +4 -1
  24. tests/test_validation.py +9 -4
  25. {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL +0 -0
  26. {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt +0 -0
  27. {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE +0 -0
  28. {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt +0 -0
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: csv-detective
- Version: 0.9.3.dev2140
+ Version: 0.9.3.dev2232
  Summary: Detect tabular files column content
  Author-email: Etalab <opendatateam@data.gouv.fr>
  License: MIT
@@ -22,16 +22,16 @@ Requires-Dist: python-magic==0.4.27
  Requires-Dist: frformat==0.4.0
  Requires-Dist: Faker>=33.0.0
  Requires-Dist: rstr==3.2.2
+ Requires-Dist: more-itertools>=10.8.0
  Provides-Extra: dev
  Requires-Dist: pytest>=8.3.0; extra == "dev"
  Requires-Dist: responses>=0.25.0; extra == "dev"
- Requires-Dist: bumpx>=0.3.10; extra == "dev"
  Requires-Dist: ruff>=0.9.3; extra == "dev"
  Dynamic: license-file

  # CSV Detective

- This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types. This is currently done through regex and string comparison.
+ This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types.

  Currently supported file types: csv, xls, xlsx, ods.

@@ -51,7 +51,7 @@ pip install csv-detective

  Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:

- ```
+ ```python
  # Import the csv_detective package
  from csv_detective import routine
  import os # for this example only
@@ -159,13 +159,26 @@ The program creates a `Python` dictionary with the following information :
  ```

  The output slightly differs depending on the file format:
- - csv files have `encoding` and `separator`
+ - csv files have `encoding` and `separator` (and `compression` if relevant)
  - xls, xlsx, ods files have `engine` and `sheet_name`

+ You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
+ - the analysis (as described above)
+ - an iterator of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
+ ```python
+ inspection, df_chunks = routine(
+     file_path=file_path,
+     num_rows=-1,
+     output_df=True,
+ )
+ cast_df = pd.concat(df_chunks, ignore_index=True)
+ # if "col1" has been detected as a float, then cast_df["col1"] contains floats
+ ```
+
  ### What Formats Can Be Detected

  Includes :
-
+ - types (float, int, dates, datetimes, JSON) and more specific ones (latitude, longitude, geoJSON...)
  - Communes, Départements, Régions, Pays
  - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
  - Codes CSP, Description CSP, SIREN
@@ -173,6 +186,16 @@ Includes :
  - Years, Dates, Jours de la Semaine FR
  - UUIDs, Mongo ObjectIds

+ ### Validation
+ If you have a pre-made analysis of a file, you can check whether another file conforms to the same analysis:
+ ```python
+ from csv_detective import validate
+ is_valid, *_ = validate(
+     file_path,
+     previous_analysis,  # exactly as it came out of the `routine` function
+ )
+ ```
+
  ### Format detection and scoring
  For each column, 3 scores are computed for each format; the higher the score, the more likely the format:
  - the field score based on the values contained in the column (0.0 to 1.0).
@@ -200,7 +223,6 @@ Only the format with highest score is present in the output.
  Related ideas:

  - store column names to make a learning model based on column names (for a possible pre-screen)
- - normalising data based on column prediction
  - entity resolution (good luck...)

  ## Why Could This Be of Any Use ?
@@ -220,32 +242,26 @@ ruff check --fix .
  ruff format .
  ```

- ## Release
-
- The release process uses `bumpx`.
-
- ```shell
- pip install -e .[dev]
- ```
+ ### 🏷️ Release

- ### Process
+ The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.

- 1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
- 2. It will update the CHANGELOG according to the new version being published
- 3. It will push a tag with the given version to github
- 4. CircleCI will pick up this tag, build the package and publish it to pypi
- 5. `bumpx` will have everything ready for the next version (version, changelog...)
+ ```bash
+ # Create a new release
+ ./tag_version.sh <version>

- ### Dry run
+ # Example
+ ./tag_version.sh 2.5.0

- ```shell
- bumpx -d -v
+ # Dry run to see what would happen
+ ./tag_version.sh 2.5.0 --dry-run
  ```

- ### Release
-
- This will release a patch version:
+ **Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.

- ```shell
- bumpx -v
- ```
+ The script automatically:
+ - Updates the version in pyproject.toml
+ - Extracts commits since the last tag and formats them for CHANGELOG.md
+ - Identifies breaking changes (commits with `!:` in the subject)
+ - Creates a git tag and pushes it to the remote repository
+ - Creates a GitHub release with the changelog content
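The README example above materialises every chunk with `pd.concat`, but the same iterator can also be consumed chunk by chunk, as the added text mentions. A minimal sketch of that streaming usage, assuming the `routine` signature shown in the README (the file path is a placeholder):

```python
import pandas as pd

from csv_detective import routine

inspection, df_chunks = routine(
    file_path="data.csv",  # placeholder path
    num_rows=-1,
    output_df=True,
)

# Consume the typed chunks one at a time instead of building a single big
# DataFrame, e.g. when the file is too large to hold in memory at once.
total_rows = 0
for chunk in df_chunks:  # each chunk is a pd.DataFrame with columns already cast
    total_rows += len(chunk)
print(f"{total_rows} rows processed")
```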
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD RENAMED
@@ -1,9 +1,9 @@
- csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
+ csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
- csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
- csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+ csv_detective/explore_csv.py,sha256=kuLkORQarelG13swoi0dH4cERu8BoRtRvyQ2SsYYhCY,5653
+ csv_detective/load_tests.py,sha256=VzHJq1Q22C666nad17ciPRtcQEonP40YmSERn9zylvQ,2399
  csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
- csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
+ csv_detective/validate.py,sha256=CNTYu_rOiv-Z8iWqCI_Ac_LXvbneRSukiu7NxB9Rcuo,5187
  csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -97,8 +97,8 @@ csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=AI9nqj3zm6_vyc
  csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=4Crk045ZD_tVovI7C-IqjKFz23Ej5-hrFkhZK4OilqA,258
  csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=N7LzmtNwZERgrwMy3EFHaVBpdiwkt2_9Tt7XVJLff6U,406
  csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=ZWhc8S9L1X2fFh2g5Ja-LuhsfHg_lALKrur6yDnGDPk,238
- csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=g7Y7IvW9VKO528z1MSPxfFtRB7kQXSiG7QQ-VZRfFEk,386
- csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=-gvdxUnv3LRfje60ljC4F3B2c1LBcWfV3zZbV3VJZ08,323
+ csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=cGzc9HPzbWlffkzJgwujUqupLi1Pkm0HWBLZv-_c4to,402
+ csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=Av8IsLre6pRnPj-AHtqaU-1C_TMCxgDYAbTGIW0XIdU,339
  csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=4jIZ9cmN73XhP4ayGcEMcB_y0X45oRk1Lq2p_pNfgok,426
  csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=5L6JowK9y6y9uZNg6hWzknMSzh0SurkwQeTINNKTdYY,599
  csv_detective/detect_labels/FR/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -110,9 +110,9 @@ csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP
  csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
  csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
  csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=tDndlFyEM7qKS3ATxp0Xs0FsPsOPpRWhDe1ockbWw8s,923
+ csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=CegBNN-RR1k-I0OU7ZsdlpVI5UBYDcj5QDX9KaWay-w,701
  csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=7gbumJFp5xhz4GZ4uTAJQoxw5D53WJZddptyANmdEws,346
+ csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=ZmBLiCyboJzpsbXa5fsTxvAbO0W-ukRXnRWemN-Z-wc,481
  csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_labels/other/booleen/__init__.py,sha256=zEkarex7L4T3vmYjR5hdhtnhugTVDsvkgG_it6nN0aA,214
  csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
@@ -130,37 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
- csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
- csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
+ csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
+ csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
  csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
- csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
- csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
- csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
- csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
- csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
+ csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
+ csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
+ csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
+ csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
+ csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
  csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
+ csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4I30,9838
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
- csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
+ csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
  csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
- csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
+ csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
- csv_detective-0.9.3.dev2140.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+ csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
- tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
- tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
+ tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
+ tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
- tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
- tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
+ tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
+ tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
  venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
- csv_detective-0.9.3.dev2140.dist-info/METADATA,sha256=kAuk6tI5cOB7zLgqjzVki_fDHUhH7lrFtu1fxXra1o4,9736
- csv_detective-0.9.3.dev2140.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- csv_detective-0.9.3.dev2140.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.9.3.dev2140.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
- csv_detective-0.9.3.dev2140.dist-info/RECORD,,
+ csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
+ csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+ csv_detective-0.9.3.dev2232.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -84,13 +84,13 @@ from csv_detective.parsing.columns import test_col as col_test # to prevent pyt

  def test_all_tests_return_bool():
      all_tests = return_all_tests("ALL", "detect_fields")
-     for test in all_tests:
+     for attr in all_tests.values():
          for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
-             assert isinstance(test._is(tmp), bool)
+             assert isinstance(attr["func"](tmp), bool)


  # categorical
- def test_detetect_categorical_variable():
+ def test_detect_categorical_variable():
      categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
      categorical_col2 = [str(k // 20) for k in range(100)]
      not_categorical_col = [i for i in range(100)]
@@ -103,7 +103,7 @@ def test_detetect_categorical_variable():
      df = pd.DataFrame(df_dict, dtype=str)

      res, _ = detect_categorical_variable(df)
-     assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
+     assert len(res) and all(k in res for k in ["cat", "cat2"])


  # continuous
@@ -394,8 +394,8 @@ fields = {

  def test_all_fields_have_tests():
      all_tests = return_all_tests("ALL", "detect_fields")
-     for test in all_tests:
-         assert fields.get(test)
+     for attr in all_tests.values():
+         assert fields.get(attr["module"])


  @pytest.mark.parametrize(
@@ -475,13 +475,9 @@ def test_early_detection(args):
  def test_all_proportion_1():
      all_tests = return_all_tests("ALL", "detect_fields")
      prop_1 = {
-         t.__name__.split(".")[-1]: eval(
-             t.__name__.split(".")[-1]
-             if t.__name__.split(".")[-1] not in ["int", "float"]
-             else "test_" + t.__name__.split(".")[-1]
-         )
-         for t in all_tests
-         if t.PROPORTION == 1
+         name: eval(name if name not in ["int", "float"] else "test_" + name)
+         for name, attr in all_tests.items()
+         if attr["prop"] == 1
      }
      # building a table that uses only correct values for these formats, except on one row
      table = pd.DataFrame(
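Judging from the updated tests above, `return_all_tests` now returns a dict keyed by format name instead of a list of modules, each value carrying at least `func` (the detection callable, formerly the module's `_is`), `module` (the module itself), and `prop` (formerly the module-level `PROPORTION`). This shape is inferred from the tests rather than from the package source; a sketch of consuming it:

```python
# Import path assumed from the package layout (csv_detective/load_tests.py).
from csv_detective.load_tests import return_all_tests

all_tests = return_all_tests("ALL", "detect_fields")
for name, attr in all_tests.items():
    assert callable(attr["func"])  # detection function, e.g. attr["func"]("3.14") -> bool
    print(name, attr["module"].__name__, attr["prop"])
```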
tests/test_file.py CHANGED
@@ -1,4 +1,4 @@
- from unittest.mock import patch
+ from unittest.mock import MagicMock, patch

  import pandas as pd
  import pytest
@@ -6,15 +6,19 @@ import responses

  from csv_detective import routine
  from csv_detective.output.profile import create_profile
- from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
+ from csv_detective.parsing.csv import CHUNK_SIZE


  @pytest.mark.parametrize(
-     "max_rows_analysis",
-     (100, int(1e5)),
+     "chunk_size",
+     (100, 404, int(1e5)),
  )
- def test_columns_output_on_file(max_rows_analysis):
-     with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", max_rows_analysis):
+ def test_columns_output_on_file(chunk_size):
+     with (
+         # maybe we should refactor later to avoid having to patch everywhere
+         patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
+         patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
+     ):
          output = routine(
              file_path="tests/data/a_test_file.csv",
              num_rows=-1,
@@ -248,17 +252,23 @@ def mocked_responses():
  def test_urls(mocked_responses, params):
      file_name, checks = params
      url = f"http://example.com/{file_name}"
+     expected_content = open(f"tests/data/{file_name}", "rb").read()
      mocked_responses.get(
          url,
-         body=open(f"tests/data/{file_name}", "rb").read(),
+         body=expected_content,
          status=200,
      )
-     _ = routine(
-         file_path=url,
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         _ = routine(
+             file_path=url,
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
      for k, v in checks.items():
          if v is None:
              assert not _.get(k)
@@ -289,13 +299,14 @@ def test_nan_values(expected_type):


  def test_output_df():
-     output, df = routine(
+     output, df_chunks = routine(
          file_path="tests/data/b_test_file.csv",
          num_rows=-1,
          output_profile=False,
          save_results=False,
          output_df=True,
      )
+     df = pd.concat(df_chunks, ignore_index=True)
      assert isinstance(output, dict)
      assert isinstance(df, pd.DataFrame)
      assert len(df) == 6
@@ -317,14 +328,20 @@ def test_cast_json(mocked_responses, cast_json):
          body=expected_content,
          status=200,
      )
-     analysis, df = routine(
-         file_path="http://example.com/test.csv",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-         output_df=True,
-         cast_json=cast_json,
-     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         analysis, df_chunks = routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+             output_df=True,
+             cast_json=cast_json,
+         )
+         df = pd.concat(df_chunks, ignore_index=True)
      assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
      assert isinstance(df["a_simple_dict"][0], expected_type)

@@ -337,27 +354,38 @@ def test_almost_uniform_column(mocked_responses):
          body=expected_content,
          status=200,
      )
-     analysis = routine(
-         file_path="http://example.com/test.csv",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         analysis = routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
      assert analysis["columns"][col_name]["format"] == "int"


  def test_full_nan_column(mocked_responses):
      # we want a file that needs sampling
-     expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+     expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
      mocked_responses.get(
          "http://example.com/test.csv",
          body=expected_content,
          status=200,
      )
-     # just testing it doesn't fail
-     routine(
-         file_path="http://example.com/test.csv",
-         num_rows=-1,
-         output_profile=False,
-         save_results=False,
-     )
+     with patch("urllib.request.urlopen") as mock_urlopen:
+         # Create a mock HTTP response object
+         mock_response = MagicMock()
+         mock_response.read.return_value = expected_content.encode("utf-8")
+         mock_response.__enter__.return_value = mock_response
+         mock_urlopen.return_value = mock_response
+         # just testing it doesn't fail
+         routine(
+             file_path="http://example.com/test.csv",
+             num_rows=-1,
+             output_profile=False,
+             save_results=False,
+         )
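The same four-line `urllib.request.urlopen` mock is spelled out in four tests above. If the pattern keeps spreading, it could be hoisted into a pytest fixture; this is a suggested sketch, not code from the package:

```python
from unittest.mock import MagicMock, patch

import pytest


@pytest.fixture
def mock_urlopen():
    """Yield a setter that makes urllib.request.urlopen return canned bytes."""
    with patch("urllib.request.urlopen") as mocked:
        def set_content(content: bytes) -> None:
            response = MagicMock()
            response.read.return_value = content
            response.__enter__.return_value = response  # supports `with urlopen(...) as r:`
            mocked.return_value = response
        yield set_content


# Usage in a test body:
# def test_something(mock_urlopen):
#     mock_urlopen(b"col\n1\n2\n")
#     ...  # code under test that fetches a URL now receives these bytes
```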
tests/test_structure.py CHANGED
@@ -34,5 +34,8 @@ def tests_conformity():


  def test_all_tests_have_unique_name():
-     names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
+     names = [
+         attr["module"].__name__.split(".")[-1]
+         for attr in return_all_tests("ALL", "detect_fields").values()
+     ]
      assert len(names) == len(set(names))
tests/test_validation.py CHANGED
@@ -49,12 +49,9 @@ def test_validation(_params):
      for dotkey in modif_previous_analysis:
          keys = dotkey.split(".")
          set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-     is_valid, table, analysis = validate(
+     is_valid, table, analysis, col_values = validate(
          "tests/data/a_test_file.csv",
          previous_analysis=previous_analysis,
-         num_rows=-1,
-         sep=previous_analysis.get("separator"),
-         encoding=previous_analysis.get("encoding"),
      )
      assert is_valid == should_be_valid
      if table_type is None:
@@ -65,6 +62,14 @@ def test_validation(_params):
          assert analysis is None
      else:
          assert isinstance(analysis, analysis_type)
+     if should_be_valid:
+         assert isinstance(col_values, dict)
+         assert all(
+             col in table.columns and isinstance(values, pd.Series)
+             for col, values in col_values.items()
+         )
+     else:
+         assert col_values is None


  @pytest.mark.parametrize(
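Per the `tests/test_validation.py` changes above, `validate` now derives the separator, encoding and row count on its own and returns a fourth value, `col_values`; the assertions suggest it maps column names to cast `pd.Series` when the file is valid, and is `None` otherwise. A sketch of the updated call, with the return shape inferred from the tests:

```python
import pandas as pd

from csv_detective import routine, validate

# Analyse a reference file once, then check that a (possibly newer) file
# still conforms to that analysis.
previous_analysis = routine(
    file_path="tests/data/a_test_file.csv",  # placeholder file
    num_rows=-1,
    output_profile=False,
    save_results=False,
)
is_valid, table, analysis, col_values = validate(
    "tests/data/a_test_file.csv",
    previous_analysis=previous_analysis,
)
if is_valid:
    for col, values in col_values.items():  # inferred: cast values per column
        assert col in table.columns and isinstance(values, pd.Series)
```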