csv-detective 0.10.2.dev4__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/headers.py +12 -14
- csv_detective/detection/rows.py +1 -1
- csv_detective/explore_csv.py +15 -4
- csv_detective/format.py +1 -1
- csv_detective/formats/date.py +10 -1
- csv_detective/output/dataframe.py +2 -2
- csv_detective/output/profile.py +1 -1
- csv_detective/parsing/columns.py +1 -1
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +11 -12
- csv_detective/validate.py +66 -37
- {csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/METADATA +1 -5
- {csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/RECORD +20 -20
- {csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/WHEEL +1 -1
- tests/test_fields.py +7 -1
- tests/test_file.py +26 -6
- tests/test_validation.py +70 -15
- {csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/top_level.txt +0 -0
csv_detective/detection/headers.py
CHANGED

@@ -5,24 +5,22 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time


-def …
+def detect_header_position(file: TextIO, verbose: bool = False) -> int:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting …
+        logging.info("Detecting header position")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        …
-        return i, headers
-    raise ValueError("Could not retrieve headers")
+        next_row = file.readline()
+        file.seek(position)
+        if row != next_row:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected header position in {round(time() - start, 3)}s",
+                    time() - start,
+                )
+            return i
+    raise ValueError("Could not accurately retrieve headers position")
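The new detection loop relies on a peek-then-rewind pattern: it remembers the stream position after reading a line, reads the following line for comparison, then seeks back so the peeked line is consumed again on the next iteration. A minimal standalone sketch of that pattern (the sample text is made up):

    import io

    stream = io.StringIO("first\nsecond\nsecond\n")

    row = stream.readline()       # consume one line
    position = stream.tell()      # remember where the next line starts
    next_row = stream.readline()  # peek at the following line
    stream.seek(position)         # rewind: the peeked line will be read again

    print(repr(row), repr(next_row))  # 'first\n' 'second\n'
    print(repr(stream.readline()))    # 'second\n' again, the peek did not consume it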
csv_detective/detection/rows.py
CHANGED
@@ -2,7 +2,7 @@ import pandas as pd


 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to …
+    """Analog process to detect_header_position for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):
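Both this helper and the new column check in csv_detective/parsing/load.py key off pandas' naming of headerless columns: when a header cell is empty, pandas labels the resulting column "Unnamed: <index>". A quick sketch (the sample CSV content mirrors the new test in tests/test_file.py):

    import io

    import pandas as pd

    # the trailing comma in the header line leaves the third column unnamed
    table = pd.read_csv(io.StringIO("col1,col2,\n1,2,\n3,4,\n"))
    print(list(table.columns))  # ['col1', 'col2', 'Unnamed: 2']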
csv_detective/explore_csv.py
CHANGED
@@ -142,20 +142,19 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")

-    is_valid, …
+    is_valid, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if …
-        # if loading failed in validate, we load it from scratch
+    if not is_valid:
+        # if loading failed in validate, we load it from scratch and initiate an analysis
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
-    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
@@ -165,6 +164,18 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
+    else:
+        # successful validation means we have a correct analysis and col_values
+        # only need to reload the table, and we already know how
+        table, _ = load_file(
+            file_path=file_path,
+            num_rows=num_rows,
+            verbose=verbose,
+            sep=analysis.get("separator"),
+            encoding=analysis.get("encoding"),
+            engine=analysis.get("engine"),
+            sheet_name=analysis.get("sheet_name"),
+        )
     try:
         return generate_output(
             table=table,
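For orientation, a hedged sketch of the calling pattern this branch supports (the file paths and the analysis file are made up; the keyword names are those visible in the hunk above): a previously stored analysis is passed back in, and a full re-detection only runs when that analysis no longer matches the file.

    import json

    from csv_detective.explore_csv import validate_then_detect

    # typically the JSON output of an earlier csv_detective run
    with open("previous_analysis.json") as f:
        previous_analysis = json.load(f)

    output = validate_then_detect(
        file_path="data/my_file.csv",
        previous_analysis=previous_analysis,
        num_rows=-1,   # -1 means the whole file, as elsewhere in the package
        verbose=True,
    )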
csv_detective/format.py
CHANGED
@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable = func
+        self.func: Callable[[Any], bool] = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion
csv_detective/formats/date.py
CHANGED
@@ -57,7 +57,9 @@ string_month_pattern = (


 def _is(val):
-    # early stops, to cut processing time
+    # many early stops, to cut processing time
+    # and avoid the costly use of date_casting as much as possible
+    # /!\ timestamps are considered ints, not dates
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
     # if it's a usual date pattern
@@ -70,8 +72,13 @@ def _is(val):
         ]
     ):
         return True
+    if re.match(r"^-?\d+[\.|,]\d+$", val):
+        # regular floats are excluded
+        return False
+    # not enough digits => not a date (slightly arbitrary)
     if sum([char.isdigit() for char in val]) / len(val) < threshold:
         return False
+    # last resort
     res = date_casting(val)
     if not res or res.hour or res.minute or res.second:
         return False
@@ -86,6 +93,7 @@ _test_values = {
         "15 décembre 1985",
         "02 05 2003",
         "20030502",
+        "2003.05.02",
         "1993-12/02",
     ],
     False: [
@@ -96,5 +104,6 @@ _test_values = {
         "12152003",
         "20031512",
         "02052003",
+        "6.27367393749392839",
     ],
 }
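The float guard can be exercised on its own; a short sketch of what the pattern added above accepts and rejects (the first and last values come from the updated _test_values, the middle one is made up):

    import re

    float_pattern = r"^-?\d+[\.|,]\d+$"

    print(bool(re.match(float_pattern, "6.27367393749392839")))  # True: plain float, rejected as a date
    print(bool(re.match(float_pattern, "-12,5")))                # True: comma decimals are caught too
    print(bool(re.match(float_pattern, "2003.05.02")))           # False: two separators, still a date candidate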
csv_detective/output/dataframe.py
CHANGED

@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time


 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or …
-    # …
+    if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
+        # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
         return None
     match _type:
         case "string":
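The guard reuses pandas' internal set of strings that read_csv already interprets as NaN, so cast() returns None for them instead of attempting a conversion such as int("N/A"). A quick check of the values the updated tests in tests/test_fields.py rely on:

    import pandas as pd

    na_strings = pd._libs.parsers.STR_NA_VALUES
    print("N/A" in na_strings, "nan" in na_strings, "" in na_strings)  # True True True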
csv_detective/output/profile.py
CHANGED
@@ -23,7 +23,7 @@ def create_profile(
         logging.info("Creating profile")

     if num_rows > 0:
-        raise ValueError("To create …
+        raise ValueError("To create profile `num_rows` must be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
csv_detective/parsing/columns.py
CHANGED
csv_detective/parsing/excel.py
CHANGED
@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str | None = None,
+    sheet_name: str | int | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py
CHANGED
@@ -1,3 +1,4 @@
+import codecs
 from io import BytesIO, StringIO

 import pandas as pd
@@ -10,7 +11,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import …
+from csv_detective.detection.headers import detect_header_position
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv
@@ -27,12 +28,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
+    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-    engine …
-    …
-    # file has no extension, we'll investigate how to read it
+    if ("." not in file_name or not file_name.endswith("csv")) and engine is None and sep is None:
+        # file has no extension and we don't have insights from arguments, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)

     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
@@ -45,9 +46,6 @@ def load_file(
         )
         if table.empty:
             raise ValueError("Table seems to be empty")
-        header = table.columns.to_list()
-        if any(col.startswith("Unnamed") for col in header):
-            raise ValueError("Could not retrieve headers")
         analysis = {
             "engine": engine,
             "sheet_name": sheet_name,
@@ -69,21 +67,20 @@ def load_file(
     binary_file.seek(0)
     # decoding and reading file
     if is_url(file_path) or engine in COMPRESSION_ENGINES:
+        decoder = codecs.getincrementaldecoder(encoding)()
         str_file = StringIO()
         while True:
             chunk = binary_file.read(1024**2)
             if not chunk:
                 break
-            str_file.write(…
+            str_file.write(decoder.decode(chunk))
         del binary_file
         str_file.seek(0)
     else:
         str_file = open(file_path, "r", encoding=encoding)
     if sep is None:
         sep = detect_separator(str_file, verbose=verbose)
-    header_row_idx …
-    if header is None or (isinstance(header, list) and any([h is None for h in header])):
-        raise ValueError("Could not retrieve headers")
+    header_row_idx = detect_header_position(str_file, verbose=verbose)
     heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
     trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
     table, total_lines, nb_duplicates = parse_csv(
@@ -100,9 +97,11 @@ def load_file(
     }
     if engine is not None:
         analysis["compression"] = engine
+    if any(not isinstance(col, str) or col.startswith("Unnamed:") for col in table.columns):
+        raise ValueError("Could not accurately detect the file's columns")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header": …
+        "header": list(table.columns),
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines
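The move to codecs.getincrementaldecoder matters because the stream is read in fixed-size binary chunks: a multi-byte character can be split across a chunk boundary, where a plain chunk.decode(encoding) would raise, while the incremental decoder buffers the trailing partial bytes until the next chunk arrives. A standalone sketch with a tiny chunk split (the sample string is made up):

    import codecs

    data = "héhé".encode("utf-8")   # b'h\xc3\xa9h\xc3\xa9'
    chunks = [data[:2], data[2:]]   # the two bytes of the first 'é' are split across chunks

    decoder = codecs.getincrementaldecoder("utf-8")()
    decoded = "".join(decoder.decode(chunk) for chunk in chunks)
    print(decoded)                  # 'héhé'

    # a naive per-chunk decode would fail on the first chunk:
    # data[:2].decode("utf-8")  -> UnicodeDecodeError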
csv_detective/validate.py
CHANGED
@@ -1,10 +1,13 @@
 import logging
+from collections import defaultdict

 import pandas as pd

 from csv_detective.format import FormatsManager
 from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val

+# VALIDATION_CHUNK_SIZE is bigger than (analysis) CHUNK_SIZE because
+# it's faster to validate so we can afford to load more rows
 VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)

@@ -16,9 +19,9 @@ def validate(
     previous_analysis: dict,
     verbose: bool = False,
     skipna: bool = True,
-) -> tuple[bool, …
+) -> tuple[bool, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and …
+    Verify is the given file has the same fields and formats as in the given analysis.

     Args:
         file_path: the path of the file to validate
@@ -26,6 +29,15 @@ def validate(
         verbose: whether the code displays the steps it's going through
         skipna: whether to ignore NaN values in the checks
     """
+    if verbose:
+        logging.info(f"Checking given formats exist")
+    for col_name, detected in previous_analysis["columns"].items():
+        if detected["format"] == "string":
+            continue
+        elif detected["format"] not in formats:
+            if verbose:
+                logging.warning(f"> Unknown format `{detected['format']}` in analysis")
+            return False, None, None
     try:
         if previous_analysis.get("separator"):
             # loading the table in chunks
@@ -58,77 +70,94 @@ def validate(
                 ]
             )
         analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
-        first_chunk = next(chunks)
         analysis.update(
             {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-            return False, None, None
+        return False, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
-        logging.info("- Checking if all columns match")
-    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
-        list(first_chunk.columns)[k] != previous_analysis["header"][k]
-        for k in range(len(previous_analysis["header"]))
-    ):
-        if verbose:
-            logging.warning("> Columns do not match, proceeding with full analysis")
-        return False, None, None, None
-    if verbose:
         logging.info(
             f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
        )

-    # …
-    …
+    # will contain hashes of each row of the file as index and the number of times
+    # each hash was seen as values; used to compute nb_duplicates
+    row_hashes_count = pd.Series()
+    # will contain the number of times each value of each column is seen in the whole file
+    # used for profile to read the file only once
+    # naming it "count" to be iso with how col_values are made in detect_formats
+    col_values: defaultdict[str, pd.Series] = defaultdict(lambda: pd.Series(name="count"))
     analysis["total_lines"] = 0
-    …
+    checked_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
+    valid_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
+    for idx, chunk in enumerate(chunks):
         if verbose:
-            logging.info(f"…
+            logging.info(f"- Testing chunk number {idx}")
+        if idx == 0:
+            if verbose:
+                logging.info("Checking if all columns match")
+            if len(chunk.columns) != len(previous_analysis["header"]) or any(
+                list(chunk.columns)[k] != previous_analysis["header"][k]
+                for k in range(len(previous_analysis["header"]))
+            ):
+                if verbose:
+                    logging.warning("> Columns in the file do not match those of the analysis")
+                return False, None, None
         analysis["total_lines"] += len(chunk)
         row_hashes_count = row_hashes_count.add(
             pd.util.hash_pandas_object(chunk, index=False).value_counts(),
             fill_value=0,
         )
-        for col in chunk.columns:
-            col_values[col] = col_values[col].add(
-                chunk[col].value_counts(dropna=False),
-                fill_value=0,
-            )
         for col_name, detected in previous_analysis["columns"].items():
             if verbose:
                 logging.info(f"- Testing {col_name} for {detected['format']}")
             if detected["format"] == "string":
                 # no test for columns that have not been recognized as a specific format
                 continue
-            …
+            to_check = chunk[col_name].dropna() if skipna else chunk[col_name]
+            chunk_valid_values = sum(to_check.apply(formats[detected["format"]].func))
+            if formats[detected["format"]].proportion == 1 and chunk_valid_values < len(to_check):
+                # we can early stop in this case, not all values are valid while we want 100%
                 if verbose:
                     logging.warning(
-                        f"> …
+                        f"> Test failed for column {col_name} with format {detected['format']}"
                     )
-                return False, …
-                …
+                return False, None, None
+            checked_values[col_name] += len(to_check)
+            valid_values[col_name] += chunk_valid_values
+            col_values[col_name] = (
+                col_values[col_name]
+                .add(
+                    chunk[col_name].value_counts(dropna=False),
+                    fill_value=0,
+                )
+                .rename_axis(col_name)
+            ) # rename_axis because *sometimes* pandas doesn't pass on the column's name ¯\_(ツ)_/¯
+        del chunk
+    # finally we loop through the formats that accept less than 100% valid values to check the proportion
+    for col_name, detected in previous_analysis["columns"].items():
+        if (
+            checked_values[col_name] > 0
+            and valid_values[col_name] / checked_values[col_name]
+            < formats[detected["format"]].proportion
+        ):
+            if verbose:
+                logging.warning(
+                    f"> Test failed for column {col_name} with format {detected['format']}"
+                )
+            return False, None, None
     if verbose:
         logging.info("> All checks successful")
     analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    del row_hashes_count
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
     return (
         True,
-        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
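The duplicate count is assembled the same way across chunks: each chunk's rows are hashed, per-hash counts are summed with fill_value=0 so hashes missing from one chunk do not turn into NaN, and any hash seen more than once overall marks a duplicated row. A small sketch with two toy chunks (column names and values made up):

    import pandas as pd

    chunk_1 = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    chunk_2 = pd.DataFrame({"a": [2, 3], "b": ["y", "z"]})  # the row (2, "y") also appears in chunk_1

    row_hashes_count = pd.Series(dtype="float64")
    for chunk in (chunk_1, chunk_2):
        row_hashes_count = row_hashes_count.add(
            pd.util.hash_pandas_object(chunk, index=False).value_counts(),
            fill_value=0,
        )

    print(int(sum(row_hashes_count > 1)))  # 1: exactly one distinct row occurs more than once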
{csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.2.dev4
+Version: 0.10.3
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT
@@ -23,10 +23,6 @@ Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr>=3.2.2
 Requires-Dist: more-itertools>=10.8.0
-Provides-Extra: dev
-Requires-Dist: pytest>=8.3.0; extra == "dev"
-Requires-Dist: responses>=0.25.0; extra == "dev"
-Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file

 # CSV Detective
{csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/RECORD
CHANGED

@@ -1,16 +1,16 @@
 csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=…
-csv_detective/format.py,sha256=…
+csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
+csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=…
+csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
-csv_detective/detection/headers.py,sha256=…
-csv_detective/detection/rows.py,sha256=…
+csv_detective/detection/headers.py,sha256=lnbWRxkI6rdyoWGtmxSfsPkqNjS0Nlpgw-pVevtmBP0,899
+csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
@@ -28,7 +28,7 @@ csv_detective/formats/code_rna.py,sha256=o6Kptrux6T2bSnWHi7MBCqIfVKbMMeN4dHlxxzk
 csv_detective/formats/code_waldec.py,sha256=j4-xpj_73c7IdgLoZJY_kRVj3HkpB7RFfGPN4NwPmVo,303
 csv_detective/formats/commune.py,sha256=QVscVy5Ij9kdzKJgIG2aFC_v1IRsov5M9Zkj_SHDWgs,541
 csv_detective/formats/csp_insee.py,sha256=y1w9zPQvijQi5v1Cuye0aX87ZVDC4FeFx1YC0dLqqp8,688
-csv_detective/formats/date.py,sha256=…
+csv_detective/formats/date.py,sha256=caMMvcqkbON8Cxp9oDYZsfmkSXuu-PiiJi8YUbypBso,3167
 csv_detective/formats/date_fr.py,sha256=YnNXSgT6QekfTUJoS5yuRX8LeK-fmVDgLgVP9cP0e4M,505
 csv_detective/formats/datetime_aware.py,sha256=izKo6CA-MNIzmmM3Br4-FOESyqCS_YYK8N4V9D6CVEI,1909
 csv_detective/formats/datetime_naive.py,sha256=DZ0apAm3vIy4cdm5DynAeRueI_8rhuHYQtAOZ5yyZ5k,1681
@@ -74,26 +74,26 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
 csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
 csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
-csv_detective/output/dataframe.py,sha256=…
+csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
-csv_detective/output/profile.py,sha256=…
+csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XNNM,4984
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=…
+csv_detective/parsing/columns.py,sha256=H_dKHhSgQMIiOfxibnGl6HwTW9bRwGtIeUcYBN13-3A,9245
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
-csv_detective/parsing/excel.py,sha256=…
-csv_detective/parsing/load.py,sha256=…
+csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
+csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.…
+csv_detective-0.10.3.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=…
-tests/test_file.py,sha256=…
+tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
+tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
-tests/test_validation.py,sha256=…
+tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
 tests/data/a_test_file.csv,sha256=SOHjseGYqZer9yu3Bd3oS12Vw8MFsebo0BzrLZ_R4Cc,68871
 tests/data/a_test_file.json,sha256=fB9bCpAMFPxFw8KxHRFlgRqjYG819QVGrCQWxQvwkvo,10542
 tests/data/b_test_file.csv,sha256=wJGX62KhYjZi62De2XjZWClAzeRFEBsg3ET0IPX1BNU,98
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.…
-csv_detective-0.10.…
-csv_detective-0.10.…
-csv_detective-0.10.…
-csv_detective-0.10.…
+csv_detective-0.10.3.dist-info/METADATA,sha256=L638U_kKVd5jFzjTk76y48hTz3nMldJ5PkfMngGHobg,10920
+csv_detective-0.10.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.3.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -104,11 +104,17 @@ def test_fields_with_values(args):
         ("2022-08-01", "date", _date),
         ("2024-09-23 17:32:07", "datetime", _datetime),
         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
+        ("N/A", "int", None),
+        ("nan", "bool", None),
+        ("", "date", None), # all NaN-like values should be cast as None for all type
     ),
 )
 def test_cast(args):
     value, detected_type, cast_type = args
-    …
+    if cast_type is None:
+        assert cast(value, detected_type) is None
+    else:
+        assert isinstance(cast(value, detected_type), cast_type)


 @pytest.mark.parametrize(
tests/test_file.py
CHANGED
@@ -9,6 +9,12 @@ from csv_detective.output.profile import create_profile
 from csv_detective.parsing.csv import CHUNK_SIZE


+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps
+
+
 @pytest.mark.parametrize(
     "chunk_size",
     (100, 404, int(1e5)),
@@ -165,6 +171,26 @@ def test_exception_different_number_of_columns():
     )


+def test_exception_malformed_columns(mocked_responses):
+    """
+    A ValueError should be raised if any column is Unnamed
+    """
+    url = f"http://example.com/bad_cols.csv"
+    expected_content = b"col1,col2,\n1,2,\n3,4,"
+    mocked_responses.get(
+        url,
+        body=expected_content,
+        status=200,
+    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        with pytest.raises(ValueError):
+            routine(file_path=url)
+
+
 def test_code_dep_reg_on_file():
     output = routine(
         file_path="tests/data/b_test_file.csv",
@@ -237,12 +263,6 @@ def test_non_csv_files(params):
         assert _[k] == v


-@pytest.fixture
-def mocked_responses():
-    with responses.RequestsMock() as rsps:
-        yield rsps
-
-
 @pytest.mark.parametrize(
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
tests/test_validation.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from unittest.mock import MagicMock, patch

 import pandas as pd
 import pytest
@@ -26,12 +27,12 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
 @pytest.mark.parametrize(
     "_params",
     (
-        ((True, …
-        ((False, None…
-        ((False, None…
-        ((False, None…
+        ((True, dict), {}),
+        ((False, None), {"separator": "|"}),
+        ((False, None), {"encoding": "unknown"}),
+        ((False, None), {"header": ["a", "b"]}),
         (
-            (False, …
+            (False, None),
             {
                 "columns.NUMCOM": {
                     "python_type": "int",
@@ -43,35 +44,89 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
         ),
     ),
 )
 def test_validation(_params):
-    (should_be_valid, …
+    (should_be_valid, analysis_type), modif_previous_analysis = _params
     with open("tests/data/a_test_file.json", "r") as f:
         previous_analysis = json.load(f)
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, …
+    is_valid, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
     )
     assert is_valid == should_be_valid
-    if table_type is None:
-        assert table is None
-    else:
-        assert isinstance(table, table_type)
     if analysis_type is None:
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
     if should_be_valid:
         assert isinstance(col_values, dict)
-        assert all(
-            col in table.columns and isinstance(values, pd.Series)
-            for col, values in col_values.items()
-        )
     else:
         assert col_values is None


+@pytest.mark.parametrize(
+    "_params",
+    (
+        # int: proportion = 1, should fail (early)
+        ("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
+        # siren: proportion = 0.9, should fail (later)
+        (
+            "130025265",
+            "A13794BC",
+            {"python_type": "string", "format": "siren", "score": 1.5},
+            False,
+        ),
+        # siret: proportion = 0.8, should succeed
+        (
+            "13002526500013",
+            "A13794BC",
+            {"python_type": "string", "format": "siret", "score": 1.5},
+            True,
+        ),
+    ),
+)
+def test_validation_with_proportions(_params):
+    # testing the behaviour for a file that has 15% invalid values, but all in a single chunk
+    valid_value, invalid_value, detected, should_be_valid = _params
+    url = f"http://example.com/test.csv"
+    expected_content = "col\n"
+    for _ in range(60):
+        # 60 rows of valid values
+        expected_content += f"{valid_value}\n"
+    for _ in range(15):
+        # 15 rows of invalid values
+        expected_content += f"{invalid_value}\n"
+    for _ in range(25):
+        # 25 rows of valid values
+        expected_content += f"{valid_value}\n"
+    previous_analysis = {
+        "encoding": "utf-8",
+        "separator": ",",
+        "header_row_idx": 0,
+        "header": ["col"],
+        "columns": {"col": detected},
+        # just setting these keys when validation is successful, they're not used for the validation itself
+        "categorical": [],
+        "columns_fields": {},
+        "columns_labels": {},
+        "formats": {},
+    }
+    with (
+        patch("urllib.request.urlopen") as mock_urlopen,
+        patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
+    ):
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        is_valid, *_ = validate(
+            file_path=url,
+            previous_analysis=previous_analysis,
+        )
+        assert is_valid == should_be_valid
+
+
 @pytest.mark.parametrize(
     "modif_previous_analysis",
     (
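A quick sanity check of the expected outcomes above: with VALIDATION_CHUNK_SIZE patched to 10, the 100-row file is read in ten chunks and 85 of the 100 values are valid. The required proportions quoted in the test comments (1 for int, 0.9 for siren, 0.8 for siret) then explain the parametrization: int fails as soon as a chunk contains a bad value, siren only fails in the final proportion pass, and siret passes.

    checked, valid = 100, 85
    for fmt, required in (("int", 1.0), ("siren", 0.9), ("siret", 0.8)):
        print(fmt, valid / checked >= required)  # int False, siren False, siret True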
{csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/entry_points.txt
File without changes

{csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/licenses/LICENSE
File without changes

{csv_detective-0.10.2.dev4.dist-info → csv_detective-0.10.3.dist-info}/top_level.txt
File without changes