csv-detective 0.7.5.dev1277__py3-none-any.whl → 0.7.5.dev1298__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries, and is provided for informational purposes only.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/__init__.py +6 -4
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +7 -7
- csv_detective/detect_fields/other/float/__init__.py +4 -4
- csv_detective/detect_fields/other/money/__init__.py +11 -0
- csv_detective/detect_fields/other/percent/__init__.py +9 -0
- csv_detective/detection/formats.py +145 -0
- csv_detective/explore_csv.py +94 -222
- csv_detective/load_tests.py +62 -0
- csv_detective/output/__init__.py +64 -0
- csv_detective/output/dataframe.py +0 -0
- csv_detective/output/example.py +77 -77
- csv_detective/output/profile.py +0 -0
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/utils.py +2 -0
- csv_detective/validate.py +70 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/RECORD +27 -20
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/WHEEL +1 -1
- tests/test_example.py +10 -10
- tests/test_fields.py +270 -415
- tests/test_file.py +19 -9
- tests/test_structure.py +6 -0
- tests/test_validation.py +18 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/top_level.txt +0 -0
csv_detective/detect_fields/__init__.py
CHANGED
```diff
@@ -10,19 +10,21 @@ from .FR.other import (
     insee_ape700,
     date_fr,
     code_waldec,
-    code_rna
+    code_rna,
 )
 
 from .other import (
     email,
     url,
     booleen,
+    money,
     mongo_object_id,
+    percent,
     twitter,
     float,
     int,
     uuid,
-    json
+    json,
 )
 
 from .FR.geo import (
@@ -40,7 +42,7 @@ from .FR.geo import (
     code_region,
     latitude_l93,
     longitude_l93,
-    insee_canton
+    insee_canton,
 )
 
 from .geo import (
@@ -50,7 +52,7 @@ from .geo import (
     latitude_wgs,
     longitude_wgs,
     latlon_wgs,
-    json_geojson
+    json_geojson,
 )
 
 from .FR.temp import jour_de_la_semaine, mois_de_annee
```
csv_detective/detect_fields/geo/latlon_wgs/__init__.py
CHANGED

```diff
@@ -1,13 +1,13 @@
-import …
+from ..latitude_wgs import _is as is_lat
+from ..longitude_wgs import _is as is_lon
 
-PROPORTION = …
+PROPORTION = 1
 
 
 def _is(val):
     '''Renvoie True si val peut etre une latitude,longitude'''
 
-
-
-
-
-    )
+    if not isinstance(val, str) or val.count(",") != 1:
+        return False
+    lat, lon = val.split(",")
+    return is_lat(lat) and is_lon(lon.replace(" ", ""))
```
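For context, the rewritten `_is` splits on the comma and delegates to the latitude/longitude detectors instead of doing its own parsing. A minimal standalone sketch of the behaviour, with simplified stand-ins for `is_lat`/`is_lon` (assumption: the real checks in `latitude_wgs`/`longitude_wgs` validate WGS84 decimal-degree strings):

```python
# Simplified stand-ins for the package's latitude_wgs/longitude_wgs checks
# (assumption: they validate WGS84 decimal-degree ranges).
def is_lat(val: str) -> bool:
    try:
        return -90 <= float(val) <= 90
    except ValueError:
        return False


def is_lon(val: str) -> bool:
    try:
        return -180 <= float(val) <= 180
    except ValueError:
        return False


def _is(val):
    '''Renvoie True si val peut etre une latitude,longitude'''
    if not isinstance(val, str) or val.count(",") != 1:
        return False
    lat, lon = val.split(",")
    return is_lat(lat) and is_lon(lon.replace(" ", ""))


assert _is("48.8534, 2.3488")        # "lat, lon" with exactly one comma
assert not _is("48.8534")            # no comma at all
assert not _is("48.8534, 2.3, 1.0")  # more than one comma
assert not _is("120.0, 2.3488")      # 120 is out of latitude range
```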
csv_detective/detect_fields/other/float/__init__.py
CHANGED

```diff
@@ -2,16 +2,16 @@ PROPORTION = 1
 
 
 def float_casting(val: str) -> float:
-    return float(val.replace(…
+    return float(val.replace(",", "."))
 
 
 def _is(val):
-
-    notations (3e6) or "+" in the string. "-" is still accepted.
+    """Detects floats, assuming that tables will not have scientific
+    notations (3e6) or "+" in the string. "-" is still accepted."""
     try:
         if (
             not isinstance(val, str)
-            or any([k in val for k in […
+            or any([k in val for k in ["_", "+", "e", "E"]])
             or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
         ):
             return False
```
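The detector now spells out its exclusion list: underscores, explicit plus signs, and scientific notation are rejected, as are zero-padded integers. A standalone approximation for illustration (the closing cast-and-return is an assumption, since the hunk is truncated after `return False`):

```python
def float_casting(val: str) -> float:
    return float(val.replace(",", "."))


def _is(val):
    """Standalone approximation of the new float detector."""
    try:
        if (
            not isinstance(val, str)
            or any(k in val for k in ["_", "+", "e", "E"])
            or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
        ):
            return False
        float_casting(val)  # assumed tail of the function: cast, then accept
        return True
    except ValueError:
        return False


assert _is("3.14") and _is("3,14") and _is("-12.5")
assert _is("0.5")          # leading zero is fine before a decimal separator
assert not _is("3e6")      # scientific notation rejected
assert not _is("+3.1")     # explicit "+" rejected
assert not _is("007")      # zero-padded integers rejected
```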
csv_detective/detection/formats.py
ADDED

```diff
@@ -0,0 +1,145 @@
+from collections import defaultdict
+from typing import Union
+
+import numpy as np
+import pandas as pd
+from csv_detective.detection.variables import (
+    detect_categorical_variable,
+    # detect_continuous_variable,
+)
+from csv_detective.load_tests import return_all_tests
+from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col, test_label
+
+
+def detect_formats(
+    table: pd.DataFrame,
+    analysis: dict,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+
+    if table.empty:
+        res_categorical = []
+        # res_continuous = []
+    else:
+        # Detects columns that are categorical
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+        res_categorical = list(res_categorical)
+        # Detect columns that are continuous (we already know the categorical) :
+        # we don't need this for now, cuts processing time
+        # res_continuous = list(
+        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+        # )
+
+    analysis.update({
+        "categorical": res_categorical,
+        # "continuous": res_continuous,
+    })
+
+    # list testing to be performed
+    all_tests_fields = return_all_tests(
+        user_input_tests, detect_type="detect_fields"
+    )  # list all tests for the fields
+    all_tests_labels = return_all_tests(
+        user_input_tests, detect_type="detect_labels"
+    )  # list all tests for the labels
+
+    # if no testing then return
+    if not all_tests_fields and not all_tests_labels:
+        return analysis
+
+    # Perform testing on fields
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+    # Perform testing on labels
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+    # This is because the fields are more important than the labels and yields a max
+    # of 1.5 for the final score.
+    scores_table = scores_table_fields * (
+        1
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
+    )
+
+    # To reduce false positives: ensure these formats are detected only if the label yields
+    # a detection (skipping the ones that have been excluded by the users).
+    formats_with_mandatory_label = [
+        f for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+        ] if f in scores_table.index
+    ]
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
+        0,
+    )
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "json_geojson": "json",
+        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude": "float",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+    }
+
+    if not limited_output:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: [
+                    {
+                        "python_type": metier_to_python_type.get(
+                            detection["format"], "string"
+                        ),
+                        **detection,
+                    }
+                    for detection in detections
+                ]
+                for col_name, detections in analysis[detection_method].items()
+            }
+    else:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: {
+                    "python_type": metier_to_python_type.get(
+                        detection["format"], "string"
+                    ),
+                    **detection,
+                }
+                for col_name, detection in analysis[detection_method].items()
+            }
+
+    # Add detection with formats as keys
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
+    return analysis
```
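The core of `detect_formats` is the field/label score combination: field scores are scaled by `1 + label_score / 2` (so a perfect field plus label match peaks at 1.5), and formats listed in `formats_with_mandatory_label` are zeroed wherever the label test found nothing. A small pandas sketch of that arithmetic with made-up scores:

```python
import numpy as np
import pandas as pd

# Made-up field and label scores for a single column, indexed by format.
fields = pd.DataFrame({"col": [0.9, 0.8]}, index=["code_postal", "int"])
labels = pd.DataFrame({"col": [0.0, 0.0]}, index=["code_postal", "int"])

# Field scores weighted by labels: a perfect match on both would reach 1.5.
scores = fields * (1 + labels.reindex(index=fields.index, fill_value=0).values / 2)

# "code_postal" is label-mandatory and its label score is 0, so it is zeroed.
mandatory = ["code_postal"]
scores.loc[mandatory, :] = np.where(
    labels.loc[mandatory, :], scores.loc[mandatory, :], 0
)
print(scores)  # code_postal -> 0.0, int -> 0.8
```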
csv_detective/explore_csv.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-from collections import defaultdict
 import json
 import logging
 import os
@@ -6,80 +5,16 @@ import tempfile
 from time import time
 from typing import Optional, Union
 
-import numpy as np
 import pandas as pd
 
-
-from …
-from .detection.variables import (
-    detect_categorical_variable,
-    # detect_continuous_variable,
-)
-from .output.dataframe import cast_df
-from .output.profile import create_profile
-from .output.schema import generate_table_schema
-from .output.utils import prepare_output_dict
+from .detection.formats import detect_formats
+from .output import generate_output, generate_table_schema
 from .parsing.load import load_file
-from .parsing.columns import test_col, test_label
 from .s3_utils import download_from_minio, upload_to_minio
 from .utils import display_logs_depending_process_time, is_url
+from .validate import validate
 
-
-def get_all_packages(detect_type: str) -> list:
-    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
-    modules = []
-    for dirpath, _, filenames in os.walk(root_dir):
-        for filename in filenames:
-            file = os.path.join(dirpath, filename).replace(root_dir, "")
-            if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
-                if module:
-                    modules.append(detect_type + module)
-    return modules
-
-
-def return_all_tests(
-    user_input_tests: Union[str, list],
-    detect_type: str,
-) -> list:
-    """
-    returns all tests that have a method _is and are listed in the user_input_tests
-    the function can select a sub_package from csv_detective
-    user_input_tests may look like this:
-    - "ALL": all possible tests are made
-    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
-      this specifc (group of) test(s) only
-    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
-      specific (groups of) tests by add "-" at the start (e.g "-temp.date")
-    """
-    assert detect_type in ["detect_fields", "detect_labels"]
-    all_packages = get_all_packages(detect_type=detect_type)
-
-    if isinstance(user_input_tests, str):
-        user_input_tests = [user_input_tests]
-    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
-        tests_to_do = [detect_type]
-    else:
-        tests_to_do = [
-            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-        ]
-    tests_skipped = [
-        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-    ]
-    all_tests = [
-        # this is why we need to import detect_fields/labels
-        eval(x) for x in all_packages
-        if any([y == x[: len(y)] for y in tests_to_do])
-        and all([y != x[: len(y)] for y in tests_skipped])
-    ]
-    # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
-    return all_tests
+logging.basicConfig(level=logging.INFO)
 
 
 def routine(
```
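`get_all_packages` and `return_all_tests` did not disappear: they moved into the new `csv_detective/load_tests.py` (+62 lines), which `detection/formats.py` imports. The selection grammar from the removed docstring still applies; for instance:

```python
from csv_detective.load_tests import return_all_tests

# All field tests:
all_fields = return_all_tests("ALL", detect_type="detect_fields")

# Only the FR.geo group plus one specific test:
some_tests = return_all_tests(["FR.geo", "FR.other.siren"], detect_type="detect_fields")

# Everything except date tests (a leading "-" skips a group):
no_dates = return_all_tests(["-temp.date"], detect_type="detect_fields")
```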
```diff
@@ -99,7 +34,7 @@ def routine(
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
     """Returns a dict with information about the csv table and possible
-    column contents.
+    column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
         file_path: local path to CSV file if not using Minio
@@ -112,14 +47,14 @@ def routine(
         output_schema: whether or not to add the 'schema' field to the output (tableschema)
         output_df: whether or not to return the loaded DataFrame along with the analysis report
         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
-        verbose: whether or not to print process logs in console
+        verbose: whether or not to print process logs in console
         sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
         skipna: whether to keep NaN (empty cells) for tests
 
     Returns:
         dict: a dict with information about the csv and possible types for each column
     """
-
+
     if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")
 
```
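Per the updated docstring and return annotation, `routine` returns either the analysis dict alone or an `(analysis, DataFrame)` tuple when `output_df=True`. A hedged usage sketch (hypothetical file path):

```python
from csv_detective.explore_csv import routine

# Analysis only (hypothetical CSV path):
analysis = routine("data/my_table.csv", save_results=False)
print(analysis["columns"])

# Analysis plus the DataFrame cast according to the detected formats:
analysis, df = routine(
    "data/my_table.csv",
    save_results=False,
    output_df=True,
    cast_json=True,
)
```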
```diff
@@ -137,168 +72,105 @@ def routine(
         sheet_name=sheet_name,
     )
 
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
-        res_categorical = list(res_categorical)
-    # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-    # res_continuous = list(
-    #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-    # )
-
-    analysis.update({
-        "categorical": res_categorical,
-        # "continuous": res_continuous,
-    })
-
-    # list testing to be performed
-    all_tests_fields = return_all_tests(
-        user_input_tests, detect_type="detect_fields"
-    )  # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
-
-    # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
-        return analysis
-
-    # Perform testing on fields
-    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-    # Perform testing on labels
-    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-    # This is because the fields are more important than the labels and yields a max
-    # of 1.5 for the final score.
-    scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
-    )
-
-    # To reduce false positives: ensure these formats are detected only if the label yields
-    # a detection (skipping the ones that have been excluded by the users).
-    formats_with_mandatory_label = [
-        f for f in [
-            "code_departement",
-            "code_commune_insee",
-            "code_postal",
-            "latitude_wgs",
-            "longitude_wgs",
-            "latitude_wgs_fr_metropole",
-            "longitude_wgs_fr_metropole",
-            "latitude_l93",
-            "longitude_l93",
-        ] if f in scores_table.index
-    ]
-    scores_table.loc[formats_with_mandatory_label, :] = np.where(
-        scores_table_labels.loc[formats_with_mandatory_label, :],
-        scores_table.loc[formats_with_mandatory_label, :],
-        0,
+    analysis = detect_formats(
+        table=table,
+        analysis=analysis,
+        user_input_tests=user_input_tests,
+        limited_output=limited_output,
+        skipna=skipna,
+        verbose=verbose,
     )
-    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
-
-    metier_to_python_type = {
-        "booleen": "bool",
-        "int": "int",
-        "float": "float",
-        "string": "string",
-        "json": "json",
-        "json_geojson": "json",
-        "datetime": "datetime",
-        "datetime_iso": "datetime",
-        "datetime_rfc822": "datetime",
-        "date": "date",
-        "latitude": "float",
-        "latitude_l93": "float",
-        "latitude_wgs": "float",
-        "latitude_wgs_fr_metropole": "float",
-        "longitude": "float",
-        "longitude_l93": "float",
-        "longitude_wgs": "float",
-        "longitude_wgs_fr_metropole": "float",
-    }
-
-    if not limited_output:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: [
-                    {
-                        "python_type": metier_to_python_type.get(
-                            detection["format"], "string"
-                        ),
-                        **detection,
-                    }
-                    for detection in detections
-                ]
-                for col_name, detections in analysis[detection_method].items()
-            }
-    else:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
-                    **detection,
-                }
-                for col_name, detection in analysis[detection_method].items()
-            }
 
-    # Add detection with formats as keys
-    analysis["formats"] = defaultdict(list)
-    for header, col_metadata in analysis["columns"].items():
-        analysis["formats"][col_metadata["format"]].append(header)
-
-    if output_profile:
-        analysis["profile"] = create_profile(
+    try:
+        return generate_output(
             table=table,
-
+            analysis=analysis,
+            file_path=file_path,
             num_rows=num_rows,
             limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
+            cast_json=cast_json,
             verbose=verbose,
+            sheet_name=sheet_name,
         )
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Routine completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )
 
-    if save_results:
-        if isinstance(save_results, str):
-            output_path = save_results
-        else:
-            output_path = os.path.splitext(file_path)[0]
-            if is_url(output_path):
-                output_path = output_path.split('/')[-1]
-            if analysis.get("sheet_name"):
-                output_path += "_sheet-" + str(sheet_name)
-            output_path += ".json"
-        with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
 
-
-
-
-
-
+def validate_then_detect(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    encoding: str = None,
+    sep: str = None,
+    skipna: bool = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+):
+
     if verbose:
-        display_logs_depending_process_time(
-            f"Routine completed in {round(time() - start_routine, 3)}s",
-            time() - start_routine
+        start_routine = time()
+        if is_url(file_path):
+            logging.info("Path recognized as a URL")
+
+    is_valid, table, analysis = validate(
+        file_path=file_path,
+        previous_analysis=previous_analysis,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        skipna=skipna,
+        sheet_name=sheet_name,
+    )
+    if is_valid:
+        # skipping formats detection as the validation is successful
+        analysis = previous_analysis
+        del analysis["profile"]
+    else:
+        analysis = detect_formats(
+            table=table,
+            analysis=analysis,
+            user_input_tests=user_input_tests,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
         )
-
-    return …
-
+    try:
+        return generate_output(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            num_rows=num_rows,
+            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
             cast_json=cast_json,
             verbose=verbose,
+            sheet_name=sheet_name,
         )
-
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Process completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )
 
 
 def routine_minio(
```
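`validate_then_detect` is the other new entry point: it first runs the new `validate` (see `csv_detective/validate.py`, +70 lines) against a previous analysis, reuses that analysis when the file still conforms, and falls back to `detect_formats` otherwise. A hedged usage sketch (hypothetical paths; `output_profile=True` on the first run so the stored analysis carries the `profile` key this function deletes):

```python
from csv_detective.explore_csv import routine, validate_then_detect

# First pass: full detection, keeping the profile in the stored analysis.
previous_analysis = routine(
    "data/my_table.csv", save_results=False, output_profile=True
)

# Later pass: if the new file still matches previous_analysis, format
# detection is skipped and the previous analysis is reused.
analysis = validate_then_detect(
    file_path="data/my_table_new.csv",
    previous_analysis=previous_analysis,
    save_results=False,
)
```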
```diff
@@ -369,8 +241,8 @@ def routine_minio(
         minio_pwd=minio_pwd,
     )
 
-    analysis = routine(
-        …
+    analysis = routine(
+        file_path,
         save_results=True,
         **kwargs,
     )
```