csv-detective 0.7.5.dev1277__py3-none-any.whl → 0.7.5.dev1286__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/other/float/__init__.py +4 -4
  3. csv_detective/detection/formats.py +145 -0
  4. csv_detective/explore_csv.py +94 -222
  5. csv_detective/load_tests.py +62 -0
  6. csv_detective/output/__init__.py +64 -0
  7. csv_detective/output/dataframe.py +0 -0
  8. csv_detective/output/example.py +0 -0
  9. csv_detective/output/profile.py +0 -0
  10. csv_detective/output/schema.py +0 -0
  11. csv_detective/output/utils.py +0 -0
  12. csv_detective/utils.py +2 -0
  13. csv_detective/validate.py +70 -0
  14. {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md +1 -0
  15. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA +1 -1
  16. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD +21 -16
  17. tests/test_fields.py +1 -1
  18. tests/test_file.py +19 -9
  19. tests/test_structure.py +6 -0
  20. tests/test_validation.py +18 -0
  21. {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  22. {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/README.md +0 -0
  23. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/WHEEL +0 -0
  24. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/entry_points.txt +0 -0
  25. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  26. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from .explore_csv import routine, routine_minio # noqa
+ from .explore_csv import routine, routine_minio, validate_then_detect # noqa
  from .output.example import create_example_csv_file # noqa

  __version__ = '0.7.5.dev'
csv_detective/detect_fields/other/float/__init__.py CHANGED
@@ -2,16 +2,16 @@ PROPORTION = 1


  def float_casting(val: str) -> float:
-     return float(val.replace(',', '.'))
+     return float(val.replace(",", "."))


  def _is(val):
-     '''Detects floats, assuming that tables will not have scientific
-     notations (3e6) or "+" in the string. "-" is still accepted.'''
+     """Detects floats, assuming that tables will not have scientific
+     notations (3e6) or "+" in the string. "-" is still accepted."""
      try:
          if (
              not isinstance(val, str)
-             or any([k in val for k in ['_', '+', 'e', 'E']])
+             or any([k in val for k in ["_", "+", "e", "E"]])
              or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
          ):
              return False
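A quick illustration of the guard above. Only the rejection branch appears in this hunk, so only negative cases are certain; anything that passes the guard is handled by the rest of _is, which is unchanged here.

from csv_detective.detect_fields.other.float import _is

# Each of these inputs hits the guard shown above, so _is returns False:
for val in ["3e6", "1_000", "+5", "007"]:
    assert _is(val) is False

# Values such as "3,14" or "-2.5" pass this guard and are handled by the rest of _is
# (not part of this hunk), which relies on float_casting to accept the comma separator.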
csv_detective/detection/formats.py ADDED
@@ -0,0 +1,145 @@
+ from collections import defaultdict
+ from typing import Union
+
+ import numpy as np
+ import pandas as pd
+ from csv_detective.detection.variables import (
+     detect_categorical_variable,
+     # detect_continuous_variable,
+ )
+ from csv_detective.load_tests import return_all_tests
+ from csv_detective.output.utils import prepare_output_dict
+ from csv_detective.parsing.columns import test_col, test_label
+
+
+ def detect_formats(
+     table: pd.DataFrame,
+     analysis: dict,
+     user_input_tests: Union[str, list[str]] = "ALL",
+     limited_output: bool = True,
+     skipna: bool = True,
+     verbose: bool = False,
+ ):
+
+     if table.empty:
+         res_categorical = []
+         # res_continuous = []
+     else:
+         # Detects columns that are categorical
+         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+         res_categorical = list(res_categorical)
+         # Detect columns that are continuous (we already know the categorical) :
+         # we don't need this for now, cuts processing time
+         # res_continuous = list(
+         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+         # )
+
+     analysis.update({
+         "categorical": res_categorical,
+         # "continuous": res_continuous,
+     })
+
+     # list testing to be performed
+     all_tests_fields = return_all_tests(
+         user_input_tests, detect_type="detect_fields"
+     ) # list all tests for the fields
+     all_tests_labels = return_all_tests(
+         user_input_tests, detect_type="detect_labels"
+     ) # list all tests for the labels
+
+     # if no testing then return
+     if not all_tests_fields and not all_tests_labels:
+         return analysis
+
+     # Perform testing on fields
+     scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+     # Perform testing on labels
+     scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+     # This is because the fields are more important than the labels and yields a max
+     # of 1.5 for the final score.
+     scores_table = scores_table_fields * (
+         1
+         + scores_table_labels.reindex(
+             index=scores_table_fields.index, fill_value=0
+         ).values / 2
+     )
+
+     # To reduce false positives: ensure these formats are detected only if the label yields
+     # a detection (skipping the ones that have been excluded by the users).
+     formats_with_mandatory_label = [
+         f for f in [
+             "code_departement",
+             "code_commune_insee",
+             "code_postal",
+             "latitude_wgs",
+             "longitude_wgs",
+             "latitude_wgs_fr_metropole",
+             "longitude_wgs_fr_metropole",
+             "latitude_l93",
+             "longitude_l93",
+         ] if f in scores_table.index
+     ]
+     scores_table.loc[formats_with_mandatory_label, :] = np.where(
+         scores_table_labels.loc[formats_with_mandatory_label, :],
+         scores_table.loc[formats_with_mandatory_label, :],
+         0,
+     )
+     analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+     metier_to_python_type = {
+         "booleen": "bool",
+         "int": "int",
+         "float": "float",
+         "string": "string",
+         "json": "json",
+         "json_geojson": "json",
+         "datetime": "datetime",
+         "datetime_iso": "datetime",
+         "datetime_rfc822": "datetime",
+         "date": "date",
+         "latitude": "float",
+         "latitude_l93": "float",
+         "latitude_wgs": "float",
+         "latitude_wgs_fr_metropole": "float",
+         "longitude": "float",
+         "longitude_l93": "float",
+         "longitude_wgs": "float",
+         "longitude_wgs_fr_metropole": "float",
+     }
+
+     if not limited_output:
+         for detection_method in ["columns_fields", "columns_labels", "columns"]:
+             analysis[detection_method] = {
+                 col_name: [
+                     {
+                         "python_type": metier_to_python_type.get(
+                             detection["format"], "string"
+                         ),
+                         **detection,
+                     }
+                     for detection in detections
+                 ]
+                 for col_name, detections in analysis[detection_method].items()
+             }
+     else:
+         for detection_method in ["columns_fields", "columns_labels", "columns"]:
+             analysis[detection_method] = {
+                 col_name: {
+                     "python_type": metier_to_python_type.get(
+                         detection["format"], "string"
+                     ),
+                     **detection,
+                 }
+                 for col_name, detection in analysis[detection_method].items()
+             }
+
+     # Add detection with formats as keys
+     analysis["formats"] = defaultdict(list)
+     for header, col_metadata in analysis["columns"].items():
+         analysis["formats"][col_metadata["format"]].append(header)
+     return analysis
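The scoring rule in detect_formats is easiest to see on a toy example. The DataFrames below are made up; only the combination and masking expressions are taken from the code above.

import numpy as np
import pandas as pd

# Toy score matrices: rows are candidate formats, columns are CSV columns (values made up).
fields = pd.DataFrame({"dep": [1.0, 1.0]}, index=["code_departement", "int"])
labels = pd.DataFrame({"dep": [0.0, 0.0]}, index=["code_departement", "int"])

# Field scores are boosted by up to 50% when the label also matches (max final score: 1.5).
scores = fields * (1 + labels.reindex(index=fields.index, fill_value=0).values / 2)

# "code_departement" is in formats_with_mandatory_label: with no label match its score is
# zeroed, so the column falls back to "int" rather than a spurious code format.
scores.loc[["code_departement"], :] = np.where(
    labels.loc[["code_departement"], :], scores.loc[["code_departement"], :], 0
)
print(scores)  # code_departement -> 0.0, int -> 1.0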
csv_detective/explore_csv.py CHANGED
@@ -1,4 +1,3 @@
- from collections import defaultdict
  import json
  import logging
  import os
@@ -6,80 +5,16 @@ import tempfile
  from time import time
  from typing import Optional, Union

- import numpy as np
  import pandas as pd

- # flake8: noqa
- from csv_detective import detect_fields, detect_labels
- from .detection.variables import (
-     detect_categorical_variable,
-     # detect_continuous_variable,
- )
- from .output.dataframe import cast_df
- from .output.profile import create_profile
- from .output.schema import generate_table_schema
- from .output.utils import prepare_output_dict
+ from .detection.formats import detect_formats
+ from .output import generate_output, generate_table_schema
  from .parsing.load import load_file
- from .parsing.columns import test_col, test_label
  from .s3_utils import download_from_minio, upload_to_minio
  from .utils import display_logs_depending_process_time, is_url
+ from .validate import validate

-
- def get_all_packages(detect_type: str) -> list:
-     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
-     modules = []
-     for dirpath, _, filenames in os.walk(root_dir):
-         for filename in filenames:
-             file = os.path.join(dirpath, filename).replace(root_dir, "")
-             if file.endswith("__init__.py"):
-                 module = (
-                     file.replace("__init__.py", "")
-                     .replace("/", ".").replace("\\", ".")[:-1]
-                 )
-                 if module:
-                     modules.append(detect_type + module)
-     return modules
-
-
- def return_all_tests(
-     user_input_tests: Union[str, list],
-     detect_type: str,
- ) -> list:
-     """
-     returns all tests that have a method _is and are listed in the user_input_tests
-     the function can select a sub_package from csv_detective
-     user_input_tests may look like this:
-     - "ALL": all possible tests are made
-     - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
-       this specifc (group of) test(s) only
-     - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
-       specific (groups of) tests by add "-" at the start (e.g "-temp.date")
-     """
-     assert detect_type in ["detect_fields", "detect_labels"]
-     all_packages = get_all_packages(detect_type=detect_type)
-
-     if isinstance(user_input_tests, str):
-         user_input_tests = [user_input_tests]
-     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
-         tests_to_do = [detect_type]
-     else:
-         tests_to_do = [
-             f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-         ]
-     tests_skipped = [
-         f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-     ]
-     all_tests = [
-         # this is why we need to import detect_fields/labels
-         eval(x) for x in all_packages
-         if any([y == x[: len(y)] for y in tests_to_do])
-         and all([y != x[: len(y)] for y in tests_skipped])
-     ]
-     # to remove groups of tests
-     all_tests = [
-         test for test in all_tests if "_is" in dir(test)
-     ]
-     return all_tests
+ logging.basicConfig(level=logging.INFO)


  def routine(
@@ -99,7 +34,7 @@ def routine(
      sheet_name: Optional[Union[str, int]] = None,
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
      """Returns a dict with information about the csv table and possible
-     column contents.
+     column contents, and if requested the DataFrame with columns cast according to analysis.

      Args:
          file_path: local path to CSV file if not using Minio
@@ -112,14 +47,14 @@
          output_schema: whether or not to add the 'schema' field to the output (tableschema)
          output_df: whether or not to return the loaded DataFrame along with the analysis report
          cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
-         verbose: whether or not to print process logs in console
+         verbose: whether or not to print process logs in console
          sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
          skipna: whether to keep NaN (empty cells) for tests

      Returns:
          dict: a dict with information about the csv and possible types for each column
      """
-
+
      if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
          raise ValueError("`save_results` must be a bool or a valid path to a json file.")

@@ -137,168 +72,105 @@
          sheet_name=sheet_name,
      )

-     if table.empty:
-         res_categorical = []
-         # res_continuous = []
-     else:
-         # Detects columns that are categorical
-         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
-         res_categorical = list(res_categorical)
-         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-         # res_continuous = list(
-         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-         # )
-
-     analysis.update({
-         "categorical": res_categorical,
-         # "continuous": res_continuous,
-     })
-
-     # list testing to be performed
-     all_tests_fields = return_all_tests(
-         user_input_tests, detect_type="detect_fields"
-     ) # list all tests for the fields
-     all_tests_labels = return_all_tests(
-         user_input_tests, detect_type="detect_labels"
-     ) # list all tests for the labels
-
-     # if no testing then return
-     if not all_tests_fields and not all_tests_labels:
-         return analysis
-
-     # Perform testing on fields
-     scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-     # Perform testing on labels
-     scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-     # This is because the fields are more important than the labels and yields a max
-     # of 1.5 for the final score.
-     scores_table = scores_table_fields * (
-         1
-         + scores_table_labels.reindex(
-             index=scores_table_fields.index, fill_value=0
-         ).values / 2
-     )
-
-     # To reduce false positives: ensure these formats are detected only if the label yields
-     # a detection (skipping the ones that have been excluded by the users).
-     formats_with_mandatory_label = [
-         f for f in [
-             "code_departement",
-             "code_commune_insee",
-             "code_postal",
-             "latitude_wgs",
-             "longitude_wgs",
-             "latitude_wgs_fr_metropole",
-             "longitude_wgs_fr_metropole",
-             "latitude_l93",
-             "longitude_l93",
-         ] if f in scores_table.index
-     ]
-     scores_table.loc[formats_with_mandatory_label, :] = np.where(
-         scores_table_labels.loc[formats_with_mandatory_label, :],
-         scores_table.loc[formats_with_mandatory_label, :],
-         0,
+     analysis = detect_formats(
+         table=table,
+         analysis=analysis,
+         user_input_tests=user_input_tests,
+         limited_output=limited_output,
+         skipna=skipna,
+         verbose=verbose,
      )
-     analysis["columns"] = prepare_output_dict(scores_table, limited_output)
-
-     metier_to_python_type = {
-         "booleen": "bool",
-         "int": "int",
-         "float": "float",
-         "string": "string",
-         "json": "json",
-         "json_geojson": "json",
-         "datetime": "datetime",
-         "datetime_iso": "datetime",
-         "datetime_rfc822": "datetime",
-         "date": "date",
-         "latitude": "float",
-         "latitude_l93": "float",
-         "latitude_wgs": "float",
-         "latitude_wgs_fr_metropole": "float",
-         "longitude": "float",
-         "longitude_l93": "float",
-         "longitude_wgs": "float",
-         "longitude_wgs_fr_metropole": "float",
-     }
-
-     if not limited_output:
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             analysis[detection_method] = {
-                 col_name: [
-                     {
-                         "python_type": metier_to_python_type.get(
-                             detection["format"], "string"
-                         ),
-                         **detection,
-                     }
-                     for detection in detections
-                 ]
-                 for col_name, detections in analysis[detection_method].items()
-             }
-     else:
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             analysis[detection_method] = {
-                 col_name: {
-                     "python_type": metier_to_python_type.get(
-                         detection["format"], "string"
-                     ),
-                     **detection,
-                 }
-                 for col_name, detection in analysis[detection_method].items()
-             }

-     # Add detection with formats as keys
-     analysis["formats"] = defaultdict(list)
-     for header, col_metadata in analysis["columns"].items():
-         analysis["formats"][col_metadata["format"]].append(header)
-
-     if output_profile:
-         analysis["profile"] = create_profile(
+     try:
+         return generate_output(
              table=table,
-             dict_cols_fields=analysis["columns"],
+             analysis=analysis,
+             file_path=file_path,
              num_rows=num_rows,
              limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
+             cast_json=cast_json,
              verbose=verbose,
+             sheet_name=sheet_name,
          )
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Routine completed in {round(time() - start_routine, 3)}s",
+                 time() - start_routine
+             )

-     if save_results:
-         if isinstance(save_results, str):
-             output_path = save_results
-         else:
-             output_path = os.path.splitext(file_path)[0]
-             if is_url(output_path):
-                 output_path = output_path.split('/')[-1]
-             if analysis.get("sheet_name"):
-                 output_path += "_sheet-" + str(sheet_name)
-             output_path += ".json"
-         with open(output_path, "w", encoding="utf8") as fp:
-             json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

-     if output_schema:
-         analysis["schema"] = generate_table_schema(
-             analysis,
-             save_file=False,
-             verbose=verbose
-         )
+ def validate_then_detect(
+     file_path: str,
+     previous_analysis: dict,
+     num_rows: int = 500,
+     user_input_tests: Union[str, list[str]] = "ALL",
+     limited_output: bool = True,
+     save_results: Union[bool, str] = True,
+     encoding: str = None,
+     sep: str = None,
+     skipna: bool = True,
+     output_profile: bool = False,
+     output_schema: bool = False,
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
+     sheet_name: Union[str, int] = None,
+ ):
+
      if verbose:
-         display_logs_depending_process_time(
-             f'Routine completed in {round(time() - start_routine, 3)}s',
-             time() - start_routine
+         start_routine = time()
+     if is_url(file_path):
+         logging.info("Path recognized as a URL")
+
+     is_valid, table, analysis = validate(
+         file_path=file_path,
+         previous_analysis=previous_analysis,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         skipna=skipna,
+         sheet_name=sheet_name,
+     )
+     if is_valid:
+         # skipping formats detection as the validation is successful
+         analysis = previous_analysis
+         del analysis["profile"]
+     else:
+         analysis = detect_formats(
+             table=table,
+             analysis=analysis,
+             user_input_tests=user_input_tests,
+             limited_output=limited_output,
+             skipna=skipna,
+             verbose=verbose,
          )
-     if output_df:
-         return analysis, cast_df(
-             df=table,
-             columns=analysis["columns"],
+     try:
+         return generate_output(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             num_rows=num_rows,
+             limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
              cast_json=cast_json,
              verbose=verbose,
+             sheet_name=sheet_name,
          )
-     return analysis
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Process completed in {round(time() - start_routine, 3)}s",
+                 time() - start_routine
+             )


  def routine_minio(
@@ -369,8 +241,8 @@ def routine_minio(
          minio_pwd=minio_pwd,
      )

-     analysis = routine(file_path,
-         num_rows,
+     analysis = routine(
+         file_path,
          save_results=True,
          **kwargs,
      )
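A minimal usage sketch of the two entry points after this refactor. The file path is illustrative; the keyword names are taken from the signatures and docstring shown above.

from csv_detective import routine, validate_then_detect

# First pass: full detection on a local CSV, keeping the profile and the cast DataFrame.
analysis, df = routine(
    "data/my_file.csv",
    num_rows=500,
    save_results=False,
    output_profile=True,
    output_df=True,
)

# Later runs: re-check the file against the stored analysis and only fall back to a
# full detection if the columns or their types no longer match.
new_analysis = validate_then_detect(
    "data/my_file.csv",
    previous_analysis=analysis,
    save_results=False,
)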
csv_detective/load_tests.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ from typing import Union
+
+ # flake8: noqa
+ from csv_detective import detect_fields, detect_labels
+
+
+ def get_all_packages(detect_type) -> list:
+     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
+     modules = []
+     for dirpath, _, filenames in os.walk(root_dir):
+         for filename in filenames:
+             file = os.path.join(dirpath, filename).replace(root_dir, "")
+             if file.endswith("__init__.py"):
+                 module = (
+                     file.replace("__init__.py", "")
+                     .replace("/", ".").replace("\\", ".")[:-1]
+                 )
+                 if module:
+                     modules.append(detect_type + module)
+     return modules
+
+
+ def return_all_tests(
+     user_input_tests: Union[str, list],
+     detect_type: str,
+ ) -> list:
+     """
+     returns all tests that have a method _is and are listed in the user_input_tests
+     the function can select a sub_package from csv_detective
+     user_input_tests may look like this:
+     - "ALL": all possible tests are made
+     - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
+       this specifc (group of) test(s) only
+     - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
+       specific (groups of) tests by add "-" at the start (e.g "-temp.date")
+     """
+     assert detect_type in ["detect_fields", "detect_labels"]
+     all_packages = get_all_packages(detect_type=detect_type)
+
+     if isinstance(user_input_tests, str):
+         user_input_tests = [user_input_tests]
+     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
+         tests_to_do = [detect_type]
+     else:
+         tests_to_do = [
+             f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
+         ]
+     tests_skipped = [
+         f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
+     ]
+     all_tests = [
+         # this is why we need to import detect_fields/labels
+         eval(x) for x in all_packages
+         if any([y == x[: len(y)] for y in tests_to_do])
+         and all([y != x[: len(y)] for y in tests_skipped])
+     ]
+     # to remove groups of tests
+     all_tests = [
+         test for test in all_tests if "_is" in dir(test)
+     ]
+     return all_tests
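For example, the selection and skip syntax documented in the docstring can be exercised like this; the chosen groups are the docstring's own examples.

from csv_detective.load_tests import return_all_tests

# All field tests except the date test, skipped with the "-" prefix described above.
field_tests = return_all_tests(["ALL", "-temp.date"], detect_type="detect_fields")

# Only the FR.geo group of field tests.
geo_tests = return_all_tests("FR.geo", detect_type="detect_fields")

print(len(field_tests), [t.__name__ for t in geo_tests][:3])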
csv_detective/output/__init__.py ADDED
@@ -0,0 +1,64 @@
+ import json
+ import os
+ from typing import Union
+
+ import pandas as pd
+
+ from csv_detective.utils import is_url
+ from .dataframe import cast_df
+ from .profile import create_profile
+ from .schema import generate_table_schema
+
+
+ def generate_output(
+     table: pd.DataFrame,
+     analysis: dict,
+     file_path: str,
+     num_rows: int = 500,
+     limited_output: bool = True,
+     save_results: Union[bool, str] = True,
+     output_profile: bool = False,
+     output_schema: bool = False,
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
+     sheet_name: Union[str, int] = None,
+ ) -> Union[dict, tuple[dict, pd.DataFrame]]:
+
+     if output_profile:
+         analysis["profile"] = create_profile(
+             table=table,
+             dict_cols_fields=analysis["columns"],
+             num_rows=num_rows,
+             limited_output=limited_output,
+             verbose=verbose,
+         )
+
+     if save_results:
+         if isinstance(save_results, str):
+             output_path = save_results
+         else:
+             output_path = os.path.splitext(file_path)[0]
+             if is_url(output_path):
+                 output_path = output_path.split('/')[-1]
+             if analysis.get("sheet_name"):
+                 output_path += "_sheet-" + str(sheet_name)
+             output_path += ".json"
+         with open(output_path, "w", encoding="utf8") as fp:
+             json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+
+     if output_schema:
+         analysis["schema"] = generate_table_schema(
+             analysis,
+             save_file=False,
+             verbose=verbose
+         )
+
+     if output_df:
+         return analysis, cast_df(
+             df=table,
+             columns=analysis["columns"],
+             cast_json=cast_json,
+             verbose=verbose,
+         )
+     return analysis
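When save_results is True (rather than an explicit path), the report path is derived from file_path as above. A hypothetical trace with an illustrative URL, ignoring the sheet_name branch:

import os

file_path = "https://example.com/data/myfile.csv"  # illustrative URL
output_path = os.path.splitext(file_path)[0]        # "https://example.com/data/myfile"
output_path = output_path.split('/')[-1]            # "myfile", kept local because the path is a URL
output_path += ".json"                              # the report lands in "./myfile.json"
print(output_path)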
csv_detective/output/dataframe.py: File without changes
csv_detective/output/example.py: File without changes
csv_detective/output/profile.py: File without changes
csv_detective/output/schema.py: File without changes
csv_detective/output/utils.py: File without changes
csv_detective/utils.py CHANGED
@@ -2,6 +2,8 @@ import logging
  import math
  from typing import Optional

+ logging.basicConfig(level=logging.INFO)
+

  def display_logs_depending_process_time(prompt: str, duration: float):
      '''
csv_detective/validate.py ADDED
@@ -0,0 +1,70 @@
+ import logging
+ from typing import Union
+
+ import pandas as pd
+
+ from csv_detective.load_tests import return_all_tests
+ from .parsing.load import load_file
+
+ logging.basicConfig(level=logging.INFO)
+
+ tests = {
+     t.__name__.split(".")[-1]: t._is
+     for t in return_all_tests("ALL", "detect_fields")
+ }
+
+
+ def validate(
+     file_path: str,
+     previous_analysis: dict,
+     num_rows: int = 500,
+     encoding: str = None,
+     sep: str = None,
+     verbose: bool = False,
+     skipna: bool = True,
+     sheet_name: Union[str, int] = None,
+ ) -> tuple[bool, pd.DataFrame, dict]:
+     """
+     Verify is the given file has the same fields and types as in the previous analysis.
+     """
+     table, analysis = load_file(
+         file_path=file_path,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         sheet_name=sheet_name,
+     )
+     if verbose:
+         logging.info("Comparing table with the previous analysis")
+         logging.info("- Checking if all columns match")
+     if (
+         any(col_name not in list(table.columns) for col_name in previous_analysis["columns"])
+         or any(col_name not in list(previous_analysis["columns"].keys()) for col_name in table.columns)
+     ):
+         logging.warning("> Columns do not match, proceeding with full analysis")
+         return False, table, analysis
+     for col_name, args in previous_analysis["columns"].items():
+         if verbose:
+             logging.info(f"- Testing {col_name} for {args['format']}")
+         if args["format"] == "string":
+             # no test for columns that have not been recognized as a specific format
+             continue
+         test_func = tests[args["format"]]
+         col_data = table[col_name]
+         if skipna:
+             col_data = col_data.loc[~col_data.isna()]
+         if not col_data.apply(test_func).all():
+             logging.warning("> Test failed, proceeding with full analysis")
+             return False, table, analysis
+     if verbose:
+         logging.info("> All checks successful")
+     return True, table, analysis | {
+         k: previous_analysis[k] for k in [
+             "categorical",
+             "columns",
+             "columns_fields",
+             "columns_labels",
+             "formats",
+         ]
+     }
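A minimal sketch of the validation flow on its own, assuming a report previously saved by routine(save_results=True) sits next to the file; the paths are illustrative.

import json

from csv_detective.validate import validate

# Illustrative paths: a CSV and the JSON report from a previous routine() run.
with open("data/my_file.json") as f:
    previous_analysis = json.load(f)

is_valid, table, analysis = validate("data/my_file.csv", previous_analysis=previous_analysis)
if is_valid:
    # analysis reuses the previously detected columns and formats
    print("Still valid, detected formats:", list(analysis["formats"]))
else:
    print("Columns or types drifted, a full detection is needed")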
{csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md CHANGED
@@ -13,6 +13,7 @@
  - Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
  - Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
  - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
+ - Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
  - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)

  ## 0.7.4 (2024-11-15)
{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: csv_detective
- Version: 0.7.5.dev1277
+ Version: 0.7.5.dev1286
  Summary: Detect CSV column content
  Home-page: https://github.com/etalab/csv_detective
  Author: Etalab
{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD CHANGED
@@ -1,8 +1,10 @@
- csv_detective/__init__.py,sha256=GCHgu0BhH5ACV7cf-1gDr9nRyvSoeQ1vRw9SjEHeMT4,143
+ csv_detective/__init__.py,sha256=vpK7WMkIQbcJzu6HKOwcn7PpHsNCCaXZ1YLMS5Wq9tM,165
  csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
- csv_detective/explore_csv.py,sha256=FmgJ2h1SxV8b_wOWia4xsswyVJTlCCW66e0nhltz-0s,14511
+ csv_detective/explore_csv.py,sha256=ocWlUEtuwZ-6bjDc6gfhC2-6DljMVhvXhHrfICCXGfQ,8986
+ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
- csv_detective/utils.py,sha256=KAYfSJXnPuAXnSc38Jm57oQ_JP_0kUkmI1OV6gN5_ys,1116
+ csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
+ csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
  csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -53,7 +55,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7g
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
  csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
- csv_detective/detect_fields/other/float/__init__.py,sha256=7bXuPAmBuIhKJEhq7d20B60WVol1AUpqRkWhreQpWfU,578
+ csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
  csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
  csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -126,10 +128,12 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=3U9j8Hux432KdGtIyArq_-v
  csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
  csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
  csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
+ csv_detective/detection/formats.py,sha256=VwFazRAFJN6eaYUK7IauVU88vuUBHccESY4UD8EgGUo,5386
  csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
  csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
  csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
+ csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
  csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
  csv_detective/output/example.py,sha256=i8PkdXxidF7qR_9aK8vh12JpZdJQryhBgyrMS8iy5rk,8642
  csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
@@ -141,18 +145,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
  csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
  csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
- csv_detective-0.7.5.dev1277.data/data/share/csv_detective/CHANGELOG.md,sha256=tgIIm6s4qoP4RGJK1cmqf-Cm5aHmXmBrwi37NVIYedg,7796
- csv_detective-0.7.5.dev1277.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
- csv_detective-0.7.5.dev1277.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
- csv_detective-0.7.5.dev1277.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+ csv_detective-0.7.5.dev1286.data/data/share/csv_detective/CHANGELOG.md,sha256=Gqw7W41bXK_JgIYi80vdOPR6JLY5rgABeNsiDStE4XA,7901
+ csv_detective-0.7.5.dev1286.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+ csv_detective-0.7.5.dev1286.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+ csv_detective-0.7.5.dev1286.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
- tests/test_fields.py,sha256=LPLx09cX5u9XHAh65XvTgIqzKylToiHZxXzKhpV0wsk,11148
- tests/test_file.py,sha256=EleTssys5fCP4N0W1eTZN35uijzoF15e3dIcuIlrMsk,7865
+ tests/test_fields.py,sha256=53kiUQiqGt4_fnyCoxhNLeEsuN1LRDB-7HGT3p_Ed9I,11147
+ tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
  tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
- tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
- csv_detective-0.7.5.dev1277.dist-info/METADATA,sha256=RgcnqpKqQ1us0lmVf6McKYJs38DC1sqvAh10XgnJOY8,1386
- csv_detective-0.7.5.dev1277.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
- csv_detective-0.7.5.dev1277.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.7.5.dev1277.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
- csv_detective-0.7.5.dev1277.dist-info/RECORD,,
+ tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
+ tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
+ csv_detective-0.7.5.dev1286.dist-info/METADATA,sha256=rLptgL-FkLZzfkxPt7_0I-k7EKPKbEHhd3Ei2qt54KI,1386
+ csv_detective-0.7.5.dev1286.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+ csv_detective-0.7.5.dev1286.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.7.5.dev1286.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+ csv_detective-0.7.5.dev1286.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -48,7 +48,7 @@ from csv_detective.detection.variables import (
      detect_continuous_variable,
      detect_categorical_variable,
  )
- from csv_detective.explore_csv import return_all_tests
+ from csv_detective.load_tests import return_all_tests
  from csv_detective.output.dataframe import cast


tests/test_file.py CHANGED
@@ -28,7 +28,7 @@ def test_columns_output_on_file():
          "STRUCTURED_INFO",
          "GEO_INFO",
      ]
-     assert output["total_lines"] == 414
+     assert output["total_lines"] == 404
      assert output["nb_duplicates"] == 7
      assert output["columns"]["NOMCOM"]["format"] == "commune"
      assert output["columns"]["NOMDEP"]["format"] == "departement"
@@ -48,7 +48,7 @@ def test_profile_output_on_file():
      )
      assert all(
          [
-             c in list(output["profile"]["NUMCOM"].keys())
+             c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
              for c in [
                  "min",
                  "max",
@@ -60,12 +60,22 @@
              ]
          ]
      )
-     assert len(output["profile"]["NOMCOM"].keys()) == 3
-     assert output["profile"]["NUMCOM"]["min"] == 1001
-     assert output["profile"]["NUMCOM"]["max"] == 6125
-     assert round(output["profile"]["NUMCOM"]["mean"]) == 1245
-     assert round(output["profile"]["NUMCOM"]["std"]) == 363
-     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 296
+     assert not any(
+         [
+             c in list(output["profile"]["NUMCOM"].keys())
+             for c in [
+                 "min",
+                 "max",
+                 "mean",
+                 "std",
+             ]
+         ]
+     )
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
+     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
+     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
      assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
      assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1

@@ -175,7 +185,7 @@ def mocked_responses():
      "params",
      # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
      # which doesn't support the way we mock the response, TBC
-     params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 414})]
+     params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})]
  )
  def test_urls(mocked_responses, params):
      file_name, checks = params
tests/test_structure.py CHANGED
@@ -1,6 +1,7 @@
  import os
  # flake8: noqa
  from csv_detective import detect_fields, detect_labels
+ from csv_detective.load_tests import return_all_tests


  def tests_conformity():
@@ -29,3 +30,8 @@ def tests_conformity():
                  .replace("/", ".")
              )
          assert "_is" in dir(_package)
+
+
+ def test_all_tests_have_unique_name():
+     names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
+     assert len(names) == len(set(names))
tests/test_validation.py ADDED
@@ -0,0 +1,18 @@
+ import json
+
+ import pandas as pd
+
+ from csv_detective.validate import validate
+
+
+ def test_validation():
+     with open("tests/data/a_test_file.json", "r") as f:
+         previous_analysis = json.load(f)
+     is_valid, table, analysis = validate(
+         "tests/data/a_test_file.csv",
+         previous_analysis=previous_analysis,
+         num_rows=-1,
+     )
+     assert is_valid is True
+     assert isinstance(table, pd.DataFrame)
+     assert isinstance(analysis, dict)