PyPI - csv-detective - Versions diffs - 0.10.3.dev2__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl - Mend

csv-detective 0.10.3.dev2__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

csv_detective/detection/headers.py CHANGED Viewed

@@ -5,24 +5,22 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
-def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
+def detect_header_position(file: TextIO, verbose: bool = False) -> int:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting headers")
+        logging.info("Detecting header position")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        headers = [c for c in row.replace("\n", "").split(sep) if c]
-        if not any(col == "" for col in headers):
-            next_row = file.readline()
-            file.seek(position)
-            if row != next_row:
-                if verbose:
-                    display_logs_depending_process_time(
-                        f"Detected headers in {round(time() - start, 3)}s",
-                        time() - start,
-                    )
-                return i, headers
+        next_row = file.readline()
+        file.seek(position)
+        if row != next_row:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected header position in {round(time() - start, 3)}s",
+                    time() - start,
+                )
+            return i
     raise ValueError("Could not retrieve headers")

csv_detective/detection/rows.py CHANGED Viewed

@@ -2,7 +2,7 @@ import pandas as pd
 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to detect_headers for csv files, determines how many rows to skip
+    """Analog process to detect_header_position for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):

csv_detective/output/dataframe.py CHANGED Viewed

@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or not value:
-        # None is the current default value in hydra, should we keep this?
+    if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
+        # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
         return None
     match _type:
         case "string":

csv_detective/parsing/load.py CHANGED Viewed

@@ -11,7 +11,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import detect_headers
+from csv_detective.detection.headers import detect_header_position
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv
@@ -83,9 +83,7 @@ def load_file(
             str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None or (isinstance(header, list) and any([h is None for h in header])):
-            raise ValueError("Could not retrieve headers")
+        header_row_idx = detect_header_position(str_file, verbose=verbose)
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(
@@ -102,9 +100,11 @@ def load_file(
         }
         if engine is not None:
             analysis["compression"] = engine
+    if any(col.startswith("Unnamed:") for col in table.columns):
+        raise ValueError("Columns are not properly set")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header": header,
+        "header": list(table.columns),
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.3.dev2
+Version: 0.10.3.dev4
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/RECORD RENAMED Viewed

@@ -9,8 +9,8 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
-csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
-csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
+csv_detective/detection/headers.py,sha256=R-Cuo67laOrj6l9Jq_SCFLkRMj3CEzb2HMo_Sa50g9A,879
+csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
@@ -74,7 +74,7 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
 csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
 csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
-csv_detective/output/dataframe.py,sha256=QX5vplx0AOKgnwwJ6dKvDHWRX9IGPStax-svXEyweJ8,3584
+csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
 csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
@@ -84,13 +84,13 @@ csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcO
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
 csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
-csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
+csv_detective/parsing/load.py,sha256=uWX4r_2K9bf-9qKL6IaiGNkiSPnSSNqng420zdrFkDg,4371
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.10.3.dev4.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=EuD2F1JUR8y88Hm-AYuJ5X7AKkGSyLIQfsGdxYgIWng,5680
-tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
+tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
+tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
 tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.dev2.dist-info/METADATA,sha256=QhAD5N5OZx1L_9ajLLuEjhSYSz6q05eAEwVd6_kDPFc,11082
-csv_detective-0.10.3.dev2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-csv_detective-0.10.3.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.10.3.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
-csv_detective-0.10.3.dev2.dist-info/RECORD,,
+csv_detective-0.10.3.dev4.dist-info/METADATA,sha256=FRE6DkDtXSuva4aVTV1ws8tj53w03fjesToRlgh_78s,11082
+csv_detective-0.10.3.dev4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.3.dev4.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev4.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev4.dist-info/RECORD,,

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.10.1)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

tests/test_fields.py CHANGED Viewed

@@ -104,11 +104,17 @@ def test_fields_with_values(args):
         ("2022-08-01", "date", _date),
         ("2024-09-23 17:32:07", "datetime", _datetime),
         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
+        ("N/A", "int", None),
+        ("nan", "bool", None),
+        ("", "date", None),  # all NaN-like values should be cast as None for all type
     ),
 )
 def test_cast(args):
     value, detected_type, cast_type = args
-    assert isinstance(cast(value, detected_type), cast_type)
+    if cast_type is None:
+        assert cast(value, detected_type) is None
+    else:
+        assert isinstance(cast(value, detected_type), cast_type)
 @pytest.mark.parametrize(

tests/test_file.py CHANGED Viewed

@@ -9,6 +9,12 @@ from csv_detective.output.profile import create_profile
 from csv_detective.parsing.csv import CHUNK_SIZE
+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps
 @pytest.mark.parametrize(
     "chunk_size",
     (100, 404, int(1e5)),
@@ -165,6 +171,26 @@ def test_exception_different_number_of_columns():
         )
+def test_exception_malformed_columns(mocked_responses):
+    """
+    A ValueError should be raised if any column is Unnamed
+    """
+    url = f"http://example.com/bad_cols.csv"
+    expected_content = b"col1,col2,\n1,2,\n3,4,"
+    mocked_responses.get(
+        url,
+        body=expected_content,
+        status=200,
+    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        with pytest.raises(ValueError):
+            routine(file_path=url)
 def test_code_dep_reg_on_file():
     output = routine(
         file_path="tests/data/b_test_file.csv",
@@ -237,12 +263,6 @@ def test_non_csv_files(params):
             assert _[k] == v
-@pytest.fixture
-def mocked_responses():
-    with responses.RequestsMock() as rsps:
-        yield rsps
 @pytest.mark.parametrize(
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.10.3.dev2__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl

csv-detective 0.10.3.dev2__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl