csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +2 -1
- csv_detective/detection/engine.py +1 -1
- csv_detective/detection/formats.py +39 -95
- csv_detective/detection/variables.py +2 -2
- csv_detective/explore_csv.py +5 -7
- csv_detective/load_tests.py +11 -4
- csv_detective/output/__init__.py +8 -4
- csv_detective/output/dataframe.py +37 -0
- csv_detective/output/example.py +3 -1
- csv_detective/output/profile.py +65 -21
- csv_detective/parsing/columns.py +133 -35
- csv_detective/parsing/csv.py +26 -23
- csv_detective/parsing/load.py +21 -8
- csv_detective/validate.py +86 -40
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA +29 -6
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/RECORD +24 -24
- tests/test_fields.py +9 -13
- tests/test_file.py +85 -35
- tests/test_structure.py +4 -1
- tests/test_validation.py +9 -4
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
@@ -29,7 +29,7 @@ def detect_engine(file_path: str, verbose=False) -> str | None:
     }
     # if none of the above, we move forwards with the csv process
     if is_url(file_path):
-        remote_content = requests.get(file_path).content
+        remote_content = next(requests.get(file_path, stream=True).iter_content(chunk_size=1024))
         engine = mapping.get(magic.from_buffer(remote_content, mime=True))
     else:
         engine = mapping.get(magic.from_file(file_path, mime=True))
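
The new line only reads the first streamed chunk instead of downloading the whole remote file before sniffing its MIME type. A minimal sketch of that pattern, assuming the requests and python-magic packages (the URL and chunk size are illustrative):

    import magic
    import requests

    url = "https://example.com/data.xlsx"  # illustrative URL, not from the package
    with requests.get(url, stream=True) as resp:
        # read only the first ~1 KB; plenty for MIME detection
        first_chunk = next(resp.iter_content(chunk_size=1024))
    print(magic.from_buffer(first_chunk, mime=True))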

csv_detective/detection/formats.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 from collections import defaultdict
 
 import numpy as np
@@ -10,11 +9,12 @@ from csv_detective.detection.variables import (
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import
-
-
-
-
+from csv_detective.parsing.columns import (
+    MAX_NUMBER_CATEGORICAL_VALUES,
+    test_col,
+    test_col_chunks,
+    test_label,
+)
 
 
 def detect_formats(
@@ -25,36 +25,8 @@ def detect_formats(
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
-):
-
-    if on_sample:
-        if verbose:
-            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
-        table = build_sample(table)
-
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(
-            table,
-            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
-            verbose=verbose,
-        )
-        res_categorical = list(res_categorical)
-        # Detect columns that are continuous (we already know the categorical) :
-        # we don't need this for now, cuts processing time
-        # res_continuous = list(
-        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-        # )
-
-    analysis.update(
-        {
-            "categorical": res_categorical,
-            # "continuous": res_continuous,
-        }
-    )
+) -> tuple[dict, dict[str, pd.Series] | None]:
+    in_chunks = analysis.get("total_lines") is None
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -66,16 +38,41 @@ def detect_formats(
 
     # if no testing then return
     if not all_tests_fields and not all_tests_labels:
-        return analysis
+        return analysis, None
 
     # Perform testing on fields
-
-        table
-
+    if not in_chunks:
+        # table is small enough to be tested in one go
+        scores_table_fields = test_col(
+            table=table,
+            all_tests=all_tests_fields,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
+        res_categorical, _ = detect_categorical_variable(
+            table,
+            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
+            verbose=verbose,
+        )
+        analysis["categorical"] = res_categorical
+        col_values = None
+    else:
+        scores_table_fields, analysis, col_values = test_col_chunks(
+            table=table,
+            file_path=file_path,
+            analysis=analysis,
+            all_tests=all_tests_fields,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-    scores_table_labels = test_label(
+    scores_table_labels = test_label(
+        analysis["header"], all_tests_labels, limited_output, verbose=verbose
+    )
     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
@@ -158,57 +155,4 @@ def detect_formats(
     for header, col_metadata in analysis["columns"].items():
         analysis["formats"][col_metadata["format"]].append(header)
 
-
-    if verbose:
-        logging.warning("Validating that analysis on the sample works on the whole file")
-    is_valid, _, _ = validate(
-        file_path=file_path,
-        previous_analysis=analysis,
-        num_rows=-1,
-        encoding=analysis.get("encoding"),
-        sep=analysis.get("separator"),
-        sheet_name=analysis.get("sheet_name"),
-        verbose=verbose,
-        skipna=skipna,
-    )
-    if not is_valid:
-        raise ValueError("Could not infer detected formats on the whole file")
-
-    return analysis
-
-
-def build_sample(table: pd.DataFrame) -> pd.DataFrame:
-    """
-    building a sample of MAX_ROWS_ANALYSIS rows that contains at least one representative of
-    the min and max values of each column, and one case of NaN if the column contains any.
-    """
-    samples = pd.concat(
-        [
-            # one row with the minimum of the column
-            table.loc[table[col] == val].iloc[[0]]
-            for col in table.columns
-            if not pd.isna(val := table[col].dropna().min())
-        ]
-        + [
-            # one row with the maximum of the column
-            table.loc[table[col] == val].iloc[[0]]
-            for col in table.columns
-            if not pd.isna(val := table[col].dropna().max())
-        ]
-        + [
-            # one row with a NaN value if the column has any
-            table.loc[table[col].isna()].iloc[[0]]
-            for col in table.columns
-            if table[col].isna().any()
-        ],
-        ignore_index=True,
-    )
-    return (
-        pd.concat(
-            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
-            ignore_index=True,
-        )
-        # this is very unlikely but we never know
-        if len(samples) <= MAX_ROWS_ANALYSIS
-        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
-    )
+    return analysis, col_values

csv_detective/detection/variables.py
CHANGED
@@ -56,7 +56,7 @@ def detect_categorical_variable(
     threshold_pct_categorical: float = 0.05,
     max_number_categorical_values: int = 25,
     verbose: bool = False,
-):
+) -> tuple[list[str], pd.DataFrame]:
     """
     Heuristically detects whether a table (df) contains categorical values according to
     the number of unique values contained.
@@ -94,4 +94,4 @@ def detect_categorical_variable(
         f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
         time() - start,
     )
-    return res.index[res], res
+    return list(res.index[res]), res
csv_detective/explore_csv.py
CHANGED
@@ -70,7 +70,7 @@ def routine(
             sheet_name=sheet_name,
         )
 
-        analysis = detect_formats(
+        analysis, _col_values = detect_formats(
            table=table,
            analysis=analysis,
            file_path=file_path,
@@ -94,6 +94,7 @@ def routine(
            cast_json=cast_json,
            verbose=verbose,
            sheet_name=sheet_name,
+            _col_values=_col_values,
        )
    finally:
        if verbose:
@@ -121,13 +122,9 @@ def validate_then_detect(
    if is_url(file_path):
        logging.info("Path recognized as a URL")
 
-    is_valid, table, analysis = validate(
+    is_valid, table, analysis, col_values = validate(
        file_path=file_path,
        previous_analysis=previous_analysis,
-        num_rows=num_rows,
-        encoding=previous_analysis.get("encoding"),
-        sep=previous_analysis.get("separator"),
-        sheet_name=previous_analysis.get("sheet_name"),
        verbose=verbose,
        skipna=skipna,
    )
@@ -139,7 +136,7 @@ def validate_then_detect(
            verbose=verbose,
        )
        if not is_valid:
-            analysis = detect_formats(
+            analysis, col_values = detect_formats(
                table=table,
                analysis=analysis,
                file_path=file_path,
@@ -162,6 +159,7 @@ def validate_then_detect(
            cast_json=cast_json,
            verbose=verbose,
            sheet_name=analysis.get("sheet_name"),
+            _col_values=col_values,
        )
    finally:
        if verbose:
csv_detective/load_tests.py
CHANGED
@@ -19,7 +19,7 @@ def get_all_packages(detect_type) -> list:
 def return_all_tests(
     user_input_tests: str | list,
     detect_type: str,
-) ->
+) -> dict[str, dict]:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -40,6 +40,7 @@ def return_all_tests(
     else:
         tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
         tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
+    # removing specified (groups of) tests
     all_tests = [
         # this is why we need to import detect_fields/labels
         eval(x)
@@ -47,6 +48,12 @@ def return_all_tests(
         if any([y == x[: len(y)] for y in tests_to_do])
         and all([y != x[: len(y)] for y in tests_skipped])
     ]
-
-
-
+    return {
+        test.__name__.split(".")[-1]: {
+            "func": test._is,
+            "prop": test.PROPORTION,
+            "module": test,
+        }
+        for test in all_tests
+        if "_is" in dir(test)
+    }
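
return_all_tests now maps each test module's short name to its `_is` predicate, its `PROPORTION` threshold and the module itself. A sketch of how a caller might consume that mapping (the argument values and sample data are illustrative, and it assumes each `_is` predicate takes a single string value, as the diff suggests):

    from csv_detective.load_tests import return_all_tests

    tests = return_all_tests(user_input_tests="ALL", detect_type="detect_fields")
    sample = ["2024-01-01", "foo", "12"]
    for name, spec in tests.items():
        # spec["func"] is the module's _is predicate, spec["prop"] its PROPORTION threshold
        hits = sum(bool(spec["func"](value)) for value in sample)
        print(name, hits / len(sample), spec["prop"])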
csv_detective/output/__init__.py
CHANGED
@@ -1,11 +1,12 @@
 import json
 import os
+from typing import Iterator
 
 import pandas as pd
 
 from csv_detective.utils import is_url
 
-from .dataframe import
+from .dataframe import cast_df_chunks
 from .profile import create_profile
 from .schema import generate_table_schema
 
@@ -23,7 +24,8 @@ def generate_output(
     cast_json: bool = True,
     verbose: bool = False,
     sheet_name: str | int | None = None,
-
+    _col_values: dict[str, pd.Series] | None = None,
+) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -32,6 +34,7 @@ def generate_output(
             limited_output=limited_output,
             cast_json=cast_json,
             verbose=verbose,
+            _col_values=_col_values,
         )
 
     if save_results:
@@ -53,9 +56,10 @@ def generate_output(
         analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
 
     if output_df:
-        return analysis,
+        return analysis, cast_df_chunks(
             df=table,
-
+            analysis=analysis,
+            file_path=file_path,
             cast_json=cast_json,
             verbose=verbose,
         )

csv_detective/output/dataframe.py
CHANGED
@@ -1,12 +1,14 @@
 import json
 from datetime import date, datetime
 from time import time
+from typing import Iterator
 
 import pandas as pd
 
 from csv_detective.detect_fields.other.booleen import bool_casting
 from csv_detective.detect_fields.other.float import float_casting
 from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time
 
 
@@ -52,3 +54,38 @@ def cast_df(
         time() - start,
     )
     return df
+
+
+def cast_df_chunks(
+    df: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    cast_json: bool = True,
+    verbose: bool = False,
+) -> Iterator[pd.DataFrame]:
+    if analysis.get("engine") or analysis["total_lines"] <= CHUNK_SIZE:
+        # the file is loaded in one chunk, so returning the cast df
+        yield cast_df(
+            df=df,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    else:
+        # loading the csv in chunks using the analysis
+        chunks = pd.read_csv(
+            file_path,
+            dtype=str,
+            sep=analysis["separator"],
+            encoding=analysis["encoding"],
+            skiprows=analysis["header_row_idx"],
+            compression=analysis.get("compression"),
+            chunksize=CHUNK_SIZE,
+        )
+        for chunk in chunks:
+            yield cast_df(
+                df=chunk,
+                columns=analysis["columns"],
+                cast_json=cast_json,
+                verbose=verbose,
+            )
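
cast_df_chunks is a generator: small files (and engine-based ones) come back as a single cast DataFrame, while larger CSVs are re-read and cast chunk by chunk. A usage sketch, assuming `table` and `analysis` were produced by the detection step beforehand (the file path is illustrative):

    from csv_detective.output.dataframe import cast_df_chunks

    # `table` (raw string DataFrame) and `analysis` (detection report) are assumed
    # to come from the detection routine; "data.csv" is an illustrative path.
    total_rows = 0
    for cast_chunk in cast_df_chunks(
        df=table,
        analysis=analysis,
        file_path="data.csv",
        cast_json=True,
        verbose=False,
    ):
        total_rows += len(cast_chunk)  # e.g. append each cast chunk to a database instead
    print(f"cast {total_rows} rows")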
csv_detective/output/example.py
CHANGED
@@ -10,6 +10,8 @@ import requests
 import rstr
 from faker import Faker
 
+from csv_detective.utils import is_url
+
 fake = Faker()
 
 
@@ -183,7 +185,7 @@ def create_example_csv_file(
     }
 
     if schema_path:
-        if schema_path
+        if is_url(schema_path):
             schema = requests.get(schema_path).json()
         else:
             with open(schema_path, encoding=encoding) as jsonfile:
csv_detective/output/profile.py
CHANGED
@@ -1,7 +1,9 @@
 import logging
 from collections import defaultdict
 from time import time
+from typing import Optional
 
+import numpy as np
 import pandas as pd
 
 from csv_detective.detect_fields.other.float import float_casting
@@ -15,6 +17,7 @@ def create_profile(
     limited_output: bool = True,
     cast_json: bool = True,
     verbose: bool = False,
+    _col_values: Optional[dict[str, pd.Series]] = None,
 ) -> dict:
     if verbose:
         start = time()
@@ -27,50 +30,91 @@ def create_profile(
         k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
         for k, v in columns.items()
     }
+    # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
+    _count_col = "count"
+    while _count_col in table.columns:
+        _count_col = "_" + _count_col
     profile = defaultdict(dict)
     for c in table.columns:
        # for numerical formats we want min, max, mean, std
        if columns[c]["python_type"] in ["float", "int"]:
-            # we
-
-
-
-
-
-
-
-
-
-
-
+            # if we have read the file in chunks we already have what we need
+            if _col_values is None:
+                # we locally cast the column to perform the operations,
+                # using the same method as in cast_df
+                cast_col = (
+                    table[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                    "max": cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                    "std": cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
+                }
+            else:
+                cast_col = _col_values[c].reset_index()
+                cast_col = cast_col.loc[cast_col[c].notna()]
+                cast_col[c] = (
+                    cast_col[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else cast_col[c].apply(
+                        lambda x: float_casting(x) if isinstance(x, str) else pd.NA
+                    )
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col[c].min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(
+                        (cast_col[c] * cast_col["count"]).sum() / sum(cast_col["count"]),
+                        columns[c]["python_type"],
+                    ),
+                    "max": cast_prevent_nan(cast_col[c].max(), columns[c]["python_type"]),
+                }
+                stats["std"] = cast_prevent_nan(
+                    np.sqrt(
+                        sum(cast_col["count"] * (cast_col[c] - stats["mean"]) ** 2)
+                        / sum(cast_col["count"])
+                    ),
+                    columns[c]["python_type"],
+                )
+            profile[c].update(**stats)
            del cast_col
        # for all formats we want most frequent values, nb unique values and nb missing values
        tops_bruts = (
-            table
-            .
-            .reset_index()
+            (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
+            .reset_index(name=_count_col)
            .iloc[:10]
            .to_dict(orient="records")
        )
        profile[c].update(
            tops=[
                {
-                    "count": tb[
+                    "count": tb[_count_col],
                    "value": tb[c],
                }
                for tb in tops_bruts
            ],
            nb_distinct=(
-
-
-
-
+                (
+                    table[c].nunique()
+                    if columns[c]["python_type"] != "json" or not cast_json
+                    # a column containing cast json is not serializable
+                    else table[c].astype(str).nunique()
+                )
+                if _col_values is None
+                else len(_col_values)
+            ),
+            nb_missing_values=(
+                len(table[c].loc[table[c].isna()])
+                if _col_values is None
+                else (_col_values[c].loc[pd.NA] if pd.NA in _col_values[c].index else 0)
            ),
-            nb_missing_values=len(table[c].loc[table[c].isna()]),
        )
    if verbose:
        display_logs_depending_process_time(
            f"Created profile in {round(time() - start, 3)}s",
            time() - start,
        )
+    del _col_values
    return profile
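
When the file was processed in chunks, the numeric statistics above are rebuilt from per-column value counts rather than from the raw values. A small self-contained sketch of that arithmetic (variable names are illustrative, not taken from the library), mirroring the weighted mean and population standard deviation used in the chunked branch:

    import numpy as np
    import pandas as pd

    # value counts accumulated over chunks: index = distinct values, values = occurrences
    counts = pd.Series({1.0: 3, 2.0: 5, 10.0: 2})

    values = counts.index.to_numpy(dtype=float)
    weights = counts.to_numpy(dtype=float)

    mean = (values * weights).sum() / weights.sum()  # weighted mean
    std = np.sqrt((weights * (values - mean) ** 2).sum() / weights.sum())  # weighted (population) std
    print(round(mean, 3), round(std, 3))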