csv-detective 0.9.3.dev2140__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +2 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -0
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -0
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +22 -29
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +15 -7
- csv_detective/detection/engine.py +1 -1
- csv_detective/detection/formats.py +42 -95
- csv_detective/detection/variables.py +2 -2
- csv_detective/explore_csv.py +5 -7
- csv_detective/load_tests.py +11 -4
- csv_detective/output/__init__.py +8 -4
- csv_detective/output/dataframe.py +37 -0
- csv_detective/output/example.py +3 -1
- csv_detective/output/profile.py +59 -19
- csv_detective/parsing/columns.py +133 -35
- csv_detective/parsing/csv.py +26 -23
- csv_detective/parsing/load.py +21 -8
- csv_detective/validate.py +86 -40
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA +45 -29
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD +28 -28
- tests/test_fields.py +9 -13
- tests/test_file.py +64 -36
- tests/test_structure.py +4 -1
- tests/test_validation.py +9 -4
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt +0 -0
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev2140
+Version: 0.9.3.dev2232
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
@@ -22,16 +22,16 @@ Requires-Dist: python-magic==0.4.27
 Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr==3.2.2
+Requires-Dist: more-itertools>=10.8.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: bumpx>=0.3.10; extra == "dev"
 Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file
 
 # CSV Detective
 
-This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types.
+This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types.
 
 Currently supported file types: csv, xls, xlsx, ods.
 
@@ -51,7 +51,7 @@ pip install csv-detective
 
 Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:
 
-```
+```python
 # Import the csv_detective package
 from csv_detective import routine
 import os # for this example only
@@ -159,13 +159,26 @@ The program creates a `Python` dictionnary with the following information :
 ```
 
 The output slightly differs depending on the file format:
-- csv files have `encoding` and `separator`
+- csv files have `encoding` and `separator` (and `compression` if relevant)
 - xls, xls, ods files have `engine` and `sheet_name`
 
+You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
+- the analysis (as described above)
+- an iteror of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
+```python
+inspection, df_chunks = routine(
+    file_path=file_path,
+    num_rows=-1,
+    output_df=True,
+)
+cast_df = pd.concat(df_chunks, ignore_index=True)
+# if "col1" has been detected as a float, then cast_df["col1"] contains floats
+```
+
 ### What Formats Can Be Detected
 
 Includes :
-
+- types (float, int, dates, datetimes, JSON) and more specific (latitude, longitude, geoJSON...)
 - Communes, Départements, Régions, Pays
 - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
 - Codes CSP, Description CSP, SIREN
@@ -173,6 +186,16 @@ Includes :
 - Years, Dates, Jours de la Semaine FR
 - UUIDs, Mongo ObjectIds
 
+### Validation
+If you have a pre-made analysis of a file, you can check whether an other file conforms to the same analysis:
+```python
+from csv_detective import validate
+is_valid, *_ = validate(
+    file_path,
+    previous_analysis, # exactly as it came out from the routine function
+)
+```
+
 ### Format detection and scoring
 For each column, 3 scores are computed for each format, the higher the score, the more likely the format:
 - the field score based on the values contained in the column (0.0 to 1.0).
@@ -200,7 +223,6 @@ Only the format with highest score is present in the output.
 Related ideas:
 
 - store column names to make a learning model based on column names for (possible pre-screen)
-- normalising data based on column prediction
 - entity resolution (good luck...)
 
 ## Why Could This Be of Any Use ?
@@ -220,32 +242,26 @@ ruff check --fix .
 ruff format .
 ```
 
-
-
-The release process uses `bumpx`.
-
-```shell
-pip install -e .[dev]
-```
+### 🏷️ Release
 
-
+The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
 
-
-
-
-4. CircleCI will pickup this tag, build the package and publish it to pypi
-5. `bumpx` will have everything ready for the next version (version, changelog...)
+```bash
+# Create a new release
+./tag_version.sh <version>
 
-
+# Example
+./tag_version.sh 2.5.0
 
-
-
+# Dry run to see what would happen
+./tag_version.sh 2.5.0 --dry-run
 ```
 
-
-
-This will release a patch version:
+**Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
 
-
-
-
+The script automatically:
+- Updates the version in pyproject.toml
+- Extracts commits since the last tag and formats them for CHANGELOG.md
+- Identifies breaking changes (commits with `!:` in the subject)
+- Creates a git tag and pushes it to the remote repository
+- Creates a GitHub release with the changelog content
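The README hunk above demonstrates `output_df=True` consumed with `pd.concat`; the same chunks can also be processed one at a time, which keeps memory flat on large files. A minimal sketch of that loop variant, assuming only the `routine` arguments shown in the hunk (`data.csv` is a placeholder path):

```python
from csv_detective import routine

# with output_df=True, routine returns the analysis plus an iterator of DataFrame chunks
inspection, df_chunks = routine(
    file_path="data.csv",  # hypothetical file standing in for `file_path`
    num_rows=-1,           # analyse the whole file
    output_df=True,
)

total_rows = 0
for chunk in df_chunks:
    # each chunk is a pd.DataFrame whose columns are cast to the detected types
    total_rows += len(chunk)
```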
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/load_tests.py,sha256=
+csv_detective/explore_csv.py,sha256=kuLkORQarelG13swoi0dH4cERu8BoRtRvyQ2SsYYhCY,5653
+csv_detective/load_tests.py,sha256=VzHJq1Q22C666nad17ciPRtcQEonP40YmSERn9zylvQ,2399
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=
+csv_detective/validate.py,sha256=CNTYu_rOiv-Z8iWqCI_Ac_LXvbneRSukiu7NxB9Rcuo,5187
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -97,8 +97,8 @@ csv_detective/detect_labels/FR/other/csp_insee/__init__.py,sha256=AI9nqj3zm6_vyc
 csv_detective/detect_labels/FR/other/date_fr/__init__.py,sha256=4Crk045ZD_tVovI7C-IqjKFz23Ej5-hrFkhZK4OilqA,258
 csv_detective/detect_labels/FR/other/insee_ape700/__init__.py,sha256=N7LzmtNwZERgrwMy3EFHaVBpdiwkt2_9Tt7XVJLff6U,406
 csv_detective/detect_labels/FR/other/sexe/__init__.py,sha256=ZWhc8S9L1X2fFh2g5Ja-LuhsfHg_lALKrur6yDnGDPk,238
-csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=
-csv_detective/detect_labels/FR/other/siret/__init__.py,sha256
+csv_detective/detect_labels/FR/other/siren/__init__.py,sha256=cGzc9HPzbWlffkzJgwujUqupLi1Pkm0HWBLZv-_c4to,402
+csv_detective/detect_labels/FR/other/siret/__init__.py,sha256=Av8IsLre6pRnPj-AHtqaU-1C_TMCxgDYAbTGIW0XIdU,339
 csv_detective/detect_labels/FR/other/tel_fr/__init__.py,sha256=4jIZ9cmN73XhP4ayGcEMcB_y0X45oRk1Lq2p_pNfgok,426
 csv_detective/detect_labels/FR/other/uai/__init__.py,sha256=5L6JowK9y6y9uZNg6hWzknMSzh0SurkwQeTINNKTdYY,599
 csv_detective/detect_labels/FR/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -110,9 +110,9 @@ csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP
 csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
 csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
 csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
-csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=
+csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=CegBNN-RR1k-I0OU7ZsdlpVI5UBYDcj5QDX9KaWay-w,701
 csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
-csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=
+csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=ZmBLiCyboJzpsbXa5fsTxvAbO0W-ukRXnRWemN-Z-wc,481
 csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_labels/other/booleen/__init__.py,sha256=zEkarex7L4T3vmYjR5hdhtnhugTVDsvkgG_it6nN0aA,214
 csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
@@ -130,37 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=
-csv_detective/detection/formats.py,sha256=
+csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
+csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
 csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
-csv_detective/detection/variables.py,sha256
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
-csv_detective/output/example.py,sha256=
-csv_detective/output/profile.py,sha256=
+csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
+csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
+csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
+csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
+csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
 csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=
+csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4I30,9838
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=
+csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
 csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
-csv_detective/parsing/load.py,sha256
+csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.3.
+csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
+tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
-tests/test_structure.py,sha256=
-tests/test_validation.py,sha256=
+tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
+tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
 venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
-csv_detective-0.9.3.
+csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
+csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev2232.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -84,13 +84,13 @@ from csv_detective.parsing.columns import test_col as col_test # to prevent pyt
 
 def test_all_tests_return_bool():
     all_tests = return_all_tests("ALL", "detect_fields")
-    for
+    for attr in all_tests.values():
         for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
-            assert isinstance(
+            assert isinstance(attr["func"](tmp), bool)
 
 
 # categorical
-def test_detetect_categorical_variable():
+def test_detect_categorical_variable():
     categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
     categorical_col2 = [str(k // 20) for k in range(100)]
     not_categorical_col = [i for i in range(100)]
@@ -103,7 +103,7 @@ def test_detetect_categorical_variable():
     df = pd.DataFrame(df_dict, dtype=str)
 
     res, _ = detect_categorical_variable(df)
-    assert len(res
+    assert len(res) and all(k in res for k in ["cat", "cat2"])
 
 
 # continuous
@@ -394,8 +394,8 @@ fields = {
 
 def test_all_fields_have_tests():
     all_tests = return_all_tests("ALL", "detect_fields")
-    for
-        assert fields.get(
+    for attr in all_tests.values():
+        assert fields.get(attr["module"])
 
 
 @pytest.mark.parametrize(
@@ -475,13 +475,9 @@ def test_early_detection(args):
 def test_all_proportion_1():
     all_tests = return_all_tests("ALL", "detect_fields")
     prop_1 = {
-
-
-
-            else "test_" + t.__name__.split(".")[-1]
-        )
-        for t in all_tests
-        if t.PROPORTION == 1
+        name: eval(name if name not in ["int", "float"] else "test_" + name)
+        for name, attr in all_tests.items()
+        if attr["prop"] == 1
     }
     # building a table that uses only correct values for these formats, except on one row
     table = pd.DataFrame(
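The recurring edit in `tests/test_fields.py` is the new shape of `return_all_tests`: the tests now iterate a dict keyed by format name whose values carry `func`, `module` and `prop` entries, instead of raw test modules with a `PROPORTION` attribute. A hedged sketch of that consumption pattern, inferred only from the assertions above (the import path is an assumption based on `csv_detective/load_tests.py` in the RECORD):

```python
# assumed import path (csv_detective/load_tests.py appears in the RECORD above)
from csv_detective.load_tests import return_all_tests

all_tests = return_all_tests("ALL", "detect_fields")
for name, attr in all_tests.items():
    assert callable(attr["func"])  # the detection function for this format
    module_name = attr["module"].__name__.split(".")[-1]
    # "prop" appears to replace the old PROPORTION attribute (1 = every value must match)
    print(name, module_name, attr["prop"])
```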
tests/test_file.py
CHANGED
@@ -1,4 +1,4 @@
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pandas as pd
 import pytest
@@ -6,15 +6,19 @@ import responses
 
 from csv_detective import routine
 from csv_detective.output.profile import create_profile
-from csv_detective.parsing.
+from csv_detective.parsing.csv import CHUNK_SIZE
 
 
 @pytest.mark.parametrize(
-    "
-    (100, int(1e5)),
+    "chunk_size",
+    (100, 404, int(1e5)),
 )
-def test_columns_output_on_file(
-    with
+def test_columns_output_on_file(chunk_size):
+    with (
+        # maybe we should refactor later to avoid having to patch everywhere
+        patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
+        patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
+    ):
         output = routine(
             file_path="tests/data/a_test_file.csv",
             num_rows=-1,
@@ -248,17 +252,23 @@ def mocked_responses():
 def test_urls(mocked_responses, params):
     file_name, checks = params
     url = f"http://example.com/{file_name}"
+    expected_content = open(f"tests/data/{file_name}", "rb").read()
     mocked_responses.get(
         url,
-        body=
+        body=expected_content,
         status=200,
     )
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        _ = routine(
+            file_path=url,
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
     for k, v in checks.items():
         if v is None:
             assert not _.get(k)
@@ -289,13 +299,14 @@ def test_nan_values(expected_type):
 
 
 def test_output_df():
-    output,
+    output, df_chunks = routine(
         file_path="tests/data/b_test_file.csv",
         num_rows=-1,
         output_profile=False,
         save_results=False,
         output_df=True,
     )
+    df = pd.concat(df_chunks, ignore_index=True)
     assert isinstance(output, dict)
     assert isinstance(df, pd.DataFrame)
     assert len(df) == 6
@@ -317,14 +328,20 @@ def test_cast_json(mocked_responses, cast_json):
         body=expected_content,
         status=200,
     )
-
-
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        analysis, df_chunks = routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+            output_df=True,
+            cast_json=cast_json,
+        )
+    df = pd.concat(df_chunks, ignore_index=True)
     assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
     assert isinstance(df["a_simple_dict"][0], expected_type)
 
@@ -337,27 +354,38 @@ def test_almost_uniform_column(mocked_responses):
         body=expected_content,
         status=200,
     )
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        analysis = routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
     assert analysis["columns"][col_name]["format"] == "int"
 
 
 def test_full_nan_column(mocked_responses):
     # we want a file that needs sampling
-    expected_content = "only_nan,second_col\n" + ",1\n" * (
+    expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
     mocked_responses.get(
         "http://example.com/test.csv",
        body=expected_content,
         status=200,
     )
-
-
-
-
-
-
-
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        # Create a mock HTTP response object
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        # just testing it doesn't fail
+        routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
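Four tests above (`test_urls`, `test_cast_json`, `test_almost_uniform_column`, `test_full_nan_column`) now wire up the same `urllib.request.urlopen` mock alongside the `responses` fixture. The pattern, extracted as a standalone sketch with a placeholder URL and body: the detail that matters is `__enter__` returning the mock itself, because `urlopen` is consumed as a context manager.

```python
import urllib.request
from unittest.mock import MagicMock, patch

expected_content = b"col_a,col_b\n1,2\n"  # placeholder body

with patch("urllib.request.urlopen") as mock_urlopen:
    mock_response = MagicMock()
    mock_response.read.return_value = expected_content
    # urlopen is used as a context manager, so __enter__ must hand back
    # the response object itself for `with urlopen(...) as resp:` to work
    mock_response.__enter__.return_value = mock_response
    mock_urlopen.return_value = mock_response

    with urllib.request.urlopen("http://example.com/test.csv") as resp:
        assert resp.read() == expected_content
```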
tests/test_structure.py
CHANGED
@@ -34,5 +34,8 @@ def tests_conformity():
 
 
 def test_all_tests_have_unique_name():
-    names = [
+    names = [
+        attr["module"].__name__.split(".")[-1]
+        for attr in return_all_tests("ALL", "detect_fields").values()
+    ]
     assert len(names) == len(set(names))
tests/test_validation.py
CHANGED
@@ -49,12 +49,9 @@ def test_validation(_params):
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, table, analysis = validate(
+    is_valid, table, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
-        num_rows=-1,
-        sep=previous_analysis.get("separator"),
-        encoding=previous_analysis.get("encoding"),
     )
     assert is_valid == should_be_valid
     if table_type is None:
@@ -65,6 +62,14 @@ def test_validation(_params):
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
+    if should_be_valid:
+        assert isinstance(col_values, dict)
+        assert all(
+            col in table.columns and isinstance(values, pd.Series)
+            for col, values in col_values.items()
+        )
+    else:
+        assert col_values is None
 
 
 @pytest.mark.parametrize(
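The hunk above captures the new `validate` contract: the explicit `num_rows`/`sep`/`encoding` arguments are gone (presumably now read from the analysis itself), and a fourth return value, `col_values`, holds a dict of `pd.Series` keyed by column name when the file is valid, `None` otherwise. A minimal sketch of the new call shape, assuming only what the test and the README hunk assert (file names are placeholders):

```python
from csv_detective import routine, validate

# analyse a reference file once, then check another file against that analysis
previous_analysis = routine(
    file_path="reference.csv",  # hypothetical file
    num_rows=-1,
    output_profile=False,
    save_results=False,
)
is_valid, table, analysis, col_values = validate(
    "new_file.csv",  # hypothetical file
    previous_analysis=previous_analysis,
)
if is_valid:
    for col, values in col_values.items():
        print(col, len(values))  # values is a pd.Series of the cast column
else:
    print(col_values)  # None when validation fails
```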
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt
RENAMED
File without changes
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE
RENAMED
File without changes
{csv_detective-0.9.3.dev2140.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt
RENAMED
File without changes