PyPI - csv-detective - Versions diffs - 0.8.1.dev1440__py3-none-any.whl → 0.8.1.dev1460__py3-none-any.whl - Mend

csv-detective 0.8.1.dev1440__py3-none-any.whl → 0.8.1.dev1460__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

csv_detective/explore_csv.py CHANGED Viewed

@@ -75,6 +75,7 @@ def routine(
     analysis = detect_formats(
         table=table,
         analysis=analysis,
+        file_path=file_path,
         user_input_tests=user_input_tests,
         limited_output=limited_output,
         skipna=skipna,
@@ -145,6 +146,7 @@ def validate_then_detect(
         analysis = detect_formats(
             table=table,
             analysis=analysis,
+            file_path=file_path,
             user_input_tests=user_input_tests,
             limited_output=limited_output,
             skipna=skipna,

{csv_detective-0.8.1.dev1440.data → csv_detective-0.8.1.dev1460.data}/data/share/csv_detective/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,7 @@
 - Refactor label testing [#119](https://github.com/datagouv/csv-detective/pull/119)
 - Refactor repo metadata and requirements [#120](https://github.com/datagouv/csv-detective/pull/120) [#122](https://github.com/datagouv/csv-detective/pull/122)
 - Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
+- For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124)
 ## 0.8.0 (2025-05-20)

{csv_detective-0.8.1.dev1440.dist-info → csv_detective-0.8.1.dev1460.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv_detective
-Version: 0.8.1.dev1440
+Version: 0.8.1.dev1460
 Summary: Detect tabular files column content
 Home-page: https://github.com/datagouv/csv_detective
 Author: Etalab

{csv_detective-0.8.1.dev1440.dist-info → csv_detective-0.8.1.dev1460.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 csv_detective/__init__.py,sha256=fxctDlEyUexNk_ePriWu6V05xZEeirMV0v_StnEZ8vQ,165
 csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
-csv_detective/explore_csv.py,sha256=IT1-9TbS78p6oeDpQ5T6DQ93xQbobcscyBQb6nh86H4,9082
+csv_detective/explore_csv.py,sha256=YxXgaUqUNdAGsU8bC-cs_TVvSza4wc4aMJQjWRkRT5s,9144
 csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
@@ -127,19 +127,19 @@ csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
 csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
 csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
-csv_detective-0.8.1.dev1440.data/data/share/csv_detective/CHANGELOG.md,sha256=b-F0tSnDQUauOqqPJCg57dvlaLt_xsb6J6O88RiiKwY,8603
-csv_detective-0.8.1.dev1440.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
-csv_detective-0.8.1.dev1440.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
-csv_detective-0.8.1.dev1440.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.8.1.dev1460.data/data/share/csv_detective/CHANGELOG.md,sha256=BsmO9YQAMi31co_c0I8aYRsm2m5Q5--vORWoJArdhOM,8725
+csv_detective-0.8.1.dev1460.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.8.1.dev1460.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
+csv_detective-0.8.1.dev1460.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
 tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
-tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
+tests/test_file.py,sha256=FWVtYHlD5uU7tPeYsqlQg6O4lpU8Ct35vddkbzhvvjA,8508
 tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
 tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
 tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
-csv_detective-0.8.1.dev1440.dist-info/METADATA,sha256=4ECGBhA77ruP1PeRV0QamjdD1lfKOgoJ_RLJ8iiQ3nA,10443
-csv_detective-0.8.1.dev1440.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.8.1.dev1440.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.8.1.dev1440.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
-csv_detective-0.8.1.dev1440.dist-info/RECORD,,
+csv_detective-0.8.1.dev1460.dist-info/METADATA,sha256=Rhi872uRXV2PcYpcI64GJ9vw12TsYIEQJxf8H1srLic,10443
+csv_detective-0.8.1.dev1460.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.8.1.dev1460.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.8.1.dev1460.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.8.1.dev1460.dist-info/RECORD,,

tests/test_file.py CHANGED Viewed

@@ -1,42 +1,49 @@
 import pandas as pd
 import pytest
 import responses
+from unittest.mock import patch
 from csv_detective import routine
-def test_columns_output_on_file():
-    output = routine(
-        file_path="tests/data/a_test_file.csv",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
-    assert isinstance(output, dict)
-    assert output["separator"] == ";"
-    assert output["header_row_idx"] == 2
-    assert output["header"] == [
-        "NUMCOM",
-        "NOMCOM",
-        "NUMDEP",
-        "NOMDEP",
-        "NUMEPCI",
-        "NOMEPCI",
-        "TXCOUVGLO_COM_2014",
-        "TXCOUVGLO_DEP_2014",
-        "TXCOUVGLO_EPCI_2014",
-        "STRUCTURED_INFO",
-        "GEO_INFO",
-    ]
-    assert output["total_lines"] == 404
-    assert output["nb_duplicates"] == 7
-    assert output["columns"]["NOMCOM"]["format"] == "commune"
-    assert output["columns"]["NOMDEP"]["format"] == "departement"
-    assert output["columns"]["NUMEPCI"]["format"] == "siren"
-    assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
-    assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
-    assert output["columns"]["GEO_INFO"]["python_type"] == "json"
-    assert output["columns"]["GEO_INFO"]["format"] == "json_geojson"
+@pytest.mark.parametrize(
+    "reduce_max_rows_analysis",
+    (True, False),
+)
+def test_columns_output_on_file(reduce_max_rows_analysis):
+    patched = 100 if reduce_max_rows_analysis else 1e5
+    with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", patched):
+        output = routine(
+            file_path="tests/data/a_test_file.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
+        assert isinstance(output, dict)
+        assert output["separator"] == ";"
+        assert output["header_row_idx"] == 2
+        assert output["header"] == [
+            "NUMCOM",
+            "NOMCOM",
+            "NUMDEP",
+            "NOMDEP",
+            "NUMEPCI",
+            "NOMEPCI",
+            "TXCOUVGLO_COM_2014",
+            "TXCOUVGLO_DEP_2014",
+            "TXCOUVGLO_EPCI_2014",
+            "STRUCTURED_INFO",
+            "GEO_INFO",
+        ]
+        assert output["total_lines"] == 404
+        assert output["nb_duplicates"] == 7
+        assert output["columns"]["NOMCOM"]["format"] == "commune"
+        assert output["columns"]["NOMDEP"]["format"] == "departement"
+        assert output["columns"]["NUMEPCI"]["format"] == "siren"
+        assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json"
+        assert output["columns"]["STRUCTURED_INFO"]["format"] == "json"
+        assert output["columns"]["GEO_INFO"]["python_type"] == "json"
+        assert output["columns"]["GEO_INFO"]["format"] == "json_geojson"
 def test_profile_output_on_file():

{csv_detective-0.8.1.dev1440.data → csv_detective-0.8.1.dev1460.data}/data/share/csv_detective/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1440.data → csv_detective-0.8.1.dev1460.data}/data/share/csv_detective/README.md RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1440.dist-info → csv_detective-0.8.1.dev1460.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1440.dist-info → csv_detective-0.8.1.dev1460.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1440.dist-info → csv_detective-0.8.1.dev1460.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1440.dist-info → csv_detective-0.8.1.dev1460.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.8.1.dev1440__py3-none-any.whl → 0.8.1.dev1460__py3-none-any.whl

csv-detective 0.8.1.dev1440__py3-none-any.whl → 0.8.1.dev1460__py3-none-any.whl