PyPI - csv-detective - Versions diffs - 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl - Mend

csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

csv_detective/__init__.py +2 -1
csv_detective/detection/engine.py +1 -1
csv_detective/detection/formats.py +39 -95
csv_detective/detection/variables.py +2 -2
csv_detective/explore_csv.py +5 -7
csv_detective/load_tests.py +11 -4
csv_detective/output/__init__.py +8 -4
csv_detective/output/dataframe.py +37 -0
csv_detective/output/example.py +3 -1
csv_detective/output/profile.py +59 -19
csv_detective/parsing/columns.py +133 -35
csv_detective/parsing/csv.py +26 -23
csv_detective/parsing/load.py +21 -8
csv_detective/validate.py +86 -40
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/METADATA +29 -6
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD +24 -24
tests/test_fields.py +9 -13
tests/test_file.py +64 -36
tests/test_structure.py +4 -1
tests/test_validation.py +9 -4
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL +0 -0
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt +0 -0
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE +0 -0
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt +0 -0

{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
+csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
-csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+csv_detective/explore_csv.py,sha256=kuLkORQarelG13swoi0dH4cERu8BoRtRvyQ2SsYYhCY,5653
+csv_detective/load_tests.py,sha256=VzHJq1Q22C666nad17ciPRtcQEonP40YmSERn9zylvQ,2399
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
+csv_detective/validate.py,sha256=CNTYu_rOiv-Z8iWqCI_Ac_LXvbneRSukiu7NxB9Rcuo,5187
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -130,37 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
-csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
-csv_detective/detection/formats.py,sha256=VxLHyQNUb7SrBkS1uV6cTK7cSrCVgrpAd3nd_74s2B0,7775
+csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
+csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
 csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
-csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
-csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
-csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
-csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
+csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
+csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
+csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
+csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
+csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
 csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
+csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4I30,9838
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
+csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
 csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
-csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
+csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.3.dev2215.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
-tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
+tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
+tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
-tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
-tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
+tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
+tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
 venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.3.dev2215.dist-info/METADATA,sha256=9F6hwttFFsBbi0eMv_UChawcvho2C9wHug4H_QEUIsQ,9931
-csv_detective-0.9.3.dev2215.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.3.dev2215.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.3.dev2215.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.3.dev2215.dist-info/RECORD,,
+csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
+csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev2232.dist-info/RECORD,,

tests/test_fields.py CHANGED Viewed

@@ -84,13 +84,13 @@ from csv_detective.parsing.columns import test_col as col_test  # to prevent pyt
 def test_all_tests_return_bool():
     all_tests = return_all_tests("ALL", "detect_fields")
-    for test in all_tests:
+    for attr in all_tests.values():
         for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
-            assert isinstance(test._is(tmp), bool)
+            assert isinstance(attr["func"](tmp), bool)
 # categorical
-def test_detetect_categorical_variable():
+def test_detect_categorical_variable():
     categorical_col = ["type_a"] * 33 + ["type_b"] * 33 + ["type_c"] * 34
     categorical_col2 = [str(k // 20) for k in range(100)]
     not_categorical_col = [i for i in range(100)]
@@ -103,7 +103,7 @@ def test_detetect_categorical_variable():
     df = pd.DataFrame(df_dict, dtype=str)
     res, _ = detect_categorical_variable(df)
-    assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
+    assert len(res) and all(k in res for k in ["cat", "cat2"])
 # continuous
@@ -394,8 +394,8 @@ fields = {
 def test_all_fields_have_tests():
     all_tests = return_all_tests("ALL", "detect_fields")
-    for test in all_tests:
-        assert fields.get(test)
+    for attr in all_tests.values():
+        assert fields.get(attr["module"])
 @pytest.mark.parametrize(
@@ -475,13 +475,9 @@ def test_early_detection(args):
 def test_all_proportion_1():
     all_tests = return_all_tests("ALL", "detect_fields")
     prop_1 = {
-        t.__name__.split(".")[-1]: eval(
-            t.__name__.split(".")[-1]
-            if t.__name__.split(".")[-1] not in ["int", "float"]
-            else "test_" + t.__name__.split(".")[-1]
-        )
-        for t in all_tests
-        if t.PROPORTION == 1
+        name: eval(name if name not in ["int", "float"] else "test_" + name)
+        for name, attr in all_tests.items()
+        if attr["prop"] == 1
     }
     # building a table that uses only correct values for these formats, except on one row
     table = pd.DataFrame(

tests/test_file.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 import pandas as pd
 import pytest
@@ -6,15 +6,19 @@ import responses
 from csv_detective import routine
 from csv_detective.output.profile import create_profile
-from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
+from csv_detective.parsing.csv import CHUNK_SIZE
 @pytest.mark.parametrize(
-    "max_rows_analysis",
-    (100, int(1e5)),
+    "chunk_size",
+    (100, 404, int(1e5)),
 )
-def test_columns_output_on_file(max_rows_analysis):
-    with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", max_rows_analysis):
+def test_columns_output_on_file(chunk_size):
+    with (
+        # maybe we should refactor later to avoid having to patch everywhere
+        patch("csv_detective.parsing.csv.CHUNK_SIZE", chunk_size),
+        patch("csv_detective.parsing.columns.CHUNK_SIZE", chunk_size),
+    ):
         output = routine(
             file_path="tests/data/a_test_file.csv",
             num_rows=-1,
@@ -248,17 +252,23 @@ def mocked_responses():
 def test_urls(mocked_responses, params):
     file_name, checks = params
     url = f"http://example.com/{file_name}"
+    expected_content = open(f"tests/data/{file_name}", "rb").read()
     mocked_responses.get(
         url,
-        body=open(f"tests/data/{file_name}", "rb").read(),
+        body=expected_content,
         status=200,
     )
-    _ = routine(
-        file_path=url,
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        _ = routine(
+            file_path=url,
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
     for k, v in checks.items():
         if v is None:
             assert not _.get(k)
@@ -289,13 +299,14 @@ def test_nan_values(expected_type):
 def test_output_df():
-    output, df = routine(
+    output, df_chunks = routine(
         file_path="tests/data/b_test_file.csv",
         num_rows=-1,
         output_profile=False,
         save_results=False,
         output_df=True,
     )
+    df = pd.concat(df_chunks, ignore_index=True)
     assert isinstance(output, dict)
     assert isinstance(df, pd.DataFrame)
     assert len(df) == 6
@@ -317,14 +328,20 @@ def test_cast_json(mocked_responses, cast_json):
         body=expected_content,
         status=200,
     )
-    analysis, df = routine(
-        file_path="http://example.com/test.csv",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-        output_df=True,
-        cast_json=cast_json,
-    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        analysis, df_chunks = routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+            output_df=True,
+            cast_json=cast_json,
+        )
+    df = pd.concat(df_chunks, ignore_index=True)
     assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
     assert isinstance(df["a_simple_dict"][0], expected_type)
@@ -337,27 +354,38 @@ def test_almost_uniform_column(mocked_responses):
         body=expected_content,
         status=200,
     )
-    analysis = routine(
-        file_path="http://example.com/test.csv",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        analysis = routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )
     assert analysis["columns"][col_name]["format"] == "int"
 def test_full_nan_column(mocked_responses):
     # we want a file that needs sampling
-    expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+    expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
     mocked_responses.get(
         "http://example.com/test.csv",
         body=expected_content,
         status=200,
     )
-    # just testing it doesn't fail
-    routine(
-        file_path="http://example.com/test.csv",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        # Create a mock HTTP response object
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        # just testing it doesn't fail
+        routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=False,
+            save_results=False,
+        )

tests/test_structure.py CHANGED Viewed

@@ -34,5 +34,8 @@ def tests_conformity():
 def test_all_tests_have_unique_name():
-    names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
+    names = [
+        attr["module"].__name__.split(".")[-1]
+        for attr in return_all_tests("ALL", "detect_fields").values()
+    ]
     assert len(names) == len(set(names))

tests/test_validation.py CHANGED Viewed

@@ -49,12 +49,9 @@ def test_validation(_params):
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, table, analysis = validate(
+    is_valid, table, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
-        num_rows=-1,
-        sep=previous_analysis.get("separator"),
-        encoding=previous_analysis.get("encoding"),
     )
     assert is_valid == should_be_valid
     if table_type is None:
@@ -65,6 +62,14 @@ def test_validation(_params):
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
+    if should_be_valid:
+        assert isinstance(col_values, dict)
+        assert all(
+            col in table.columns and isinstance(values, pd.Series)
+            for col, values in col_values.items()
+        )
+    else:
+        assert col_values is None
 @pytest.mark.parametrize(

{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2232.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl

csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl