csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
- csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detect_labels/other/email/__init__.py +1 -1
- csv_detective/detect_labels/other/float/__init__.py +1 -1
- csv_detective/detect_labels/other/int/__init__.py +1 -1
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detect_labels/other/twitter/__init__.py +1 -1
- csv_detective/detect_labels/other/url/__init__.py +1 -1
- csv_detective/detect_labels/other/uuid/__init__.py +1 -1
- csv_detective/detect_labels/temp/date/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
- csv_detective/detect_labels/temp/year/__init__.py +1 -1
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/explore_csv.py +40 -124
- csv_detective/output/dataframe.py +55 -0
- csv_detective/{create_example.py → output/example.py} +10 -9
- csv_detective/output/profile.py +87 -0
- csv_detective/{schema_generation.py → output/schema.py} +344 -343
- csv_detective/output/utils.py +51 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/utils.py +10 -236
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +84 -70
- tests/test_fields.py +7 -6
- tests/test_file.py +15 -14
- csv_detective/detection.py +0 -633
- /csv_detective/{process_text.py → parsing/text.py} +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/detection/rows.py
ADDED
@@ -0,0 +1,18 @@
+import pandas as pd
+
+
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
+    """Analog process to detect_headers for csv files, determines how many rows to skip
+    to end up with the header at the right place"""
+    idx = 0
+    if all([str(c).startswith('Unnamed:') for c in table.columns]):
+        # there is on offset between the index in the file (idx here)
+        # and the index in the dataframe, because of the header
+        idx = 1
+        while table.iloc[idx - 1].isna().all():
+            idx += 1
+        cols = table.iloc[idx - 1]
+        table = table.iloc[idx:]
+        table.columns = cols.to_list()
+    # +1 here because the headers should count as a row
+    return table, idx
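For illustration, a minimal usage sketch of this new helper (module path taken from the file list above; the sample frame mimics how pandas names columns when a sheet starts with empty rows, and the data is made up):

import pandas as pd

from csv_detective.detection.rows import remove_empty_first_rows

# Mimics what pandas returns when the first sheet row is empty: every column
# comes back as "Unnamed: N" and the real header is buried in the data rows.
raw = pd.DataFrame(
    [[None, None], ["city", "population"], ["Paris", 2145906]],
    columns=["Unnamed: 0", "Unnamed: 1"],
)
table, skipped = remove_empty_first_rows(raw)
print(skipped)              # 2: one empty row plus the header row
print(list(table.columns))  # ['city', 'population']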
csv_detective/detection/separator.py
ADDED
@@ -0,0 +1,44 @@
+import csv
+import logging
+from time import time
+from typing import TextIO
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
+    """Detects csv separator"""
+    # TODO: add a robust detection:
+    # si on a un point virgule comme texte et \t comme séparateur, on renvoie
+    # pour l'instant un point virgule
+    if verbose:
+        start = time()
+        logging.info("Detecting separator")
+    file.seek(0)
+    header = file.readline()
+    possible_separators = [";", ",", "|", "\t"]
+    sep_count = dict()
+    for sep in possible_separators:
+        sep_count[sep] = header.count(sep)
+    sep = max(sep_count, key=sep_count.get)
+    # testing that the first 10 (arbitrary) rows all have the same number of fields
+    # as the header. Prevents downstream unwanted behaviour where pandas can load
+    # the file (in a weird way) but the process is irrelevant.
+    file.seek(0)
+    reader = csv.reader(file, delimiter=sep)
+    rows_lengths = set()
+    for idx, row in enumerate(reader):
+        if idx > 10:
+            break
+        rows_lengths.add(len(row))
+    if len(rows_lengths) > 1:
+        raise ValueError(
+            f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+        )
+
+    if verbose:
+        display_logs_depending_process_time(
+            f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return sep
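For illustration, a short sketch of the separator detection on an in-memory sample (module path from the file list above; the text stands in for an open file handle):

import io

from csv_detective.detection.separator import detect_separator

sample = "name;age;city\nAda;36;London\nLin;29;Paris\n"
# Counts candidate separators in the header line, then checks that the first
# rows keep a constant field count with the winning separator.
print(detect_separator(io.StringIO(sample)))  # ';'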
csv_detective/detection/variables.py
ADDED
@@ -0,0 +1,98 @@
+from ast import literal_eval
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+    """
+    Detects whether a column contains continuous variables. We consider a continuous column
+    one that contains a considerable amount of float values.
+    We removed the integers as we then end up with postal codes, insee codes, and all sort
+    of codes and types.
+    This is not optimal but it will do for now.
+    """
+    # if we need this again in the future, could be first based on columns detected as int/float to cut time
+
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
+        count = serie.value_counts().to_dict()
+        total_nb = len(serie)
+        if float in count:
+            nb_floats = count[float]
+        else:
+            return False
+        if nb_floats / total_nb >= continuous_th:
+            return True
+        else:
+            return False
+
+    def parses_to_integer(value: str):
+        try:
+            value = value.replace(",", ".")
+            value = literal_eval(value)
+            return type(value)
+        # flake8: noqa
+        except:
+            return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting continuous columns")
+    res = table.apply(
+        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
+    )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res]
+
+
+def detect_categorical_variable(
+    table: pd.DataFrame,
+    threshold_pct_categorical: float = 0.05,
+    max_number_categorical_values: int = 25,
+    verbose: bool = False,
+):
+    """
+    Heuristically detects whether a table (df) contains categorical values according to
+    the number of unique values contained.
+    As the idea of detecting categorical values is to then try to learn models to predict
+    them, we limit categorical values to at most 25 different modes or at most 5% disparity.
+    Postal code, insee code, code region and so on, may be thus not considered categorical values.
+    :param table:
+    :param threshold_pct_categorical:
+    :param max_number_categorical_values:
+    :return:
+    """
+
+    def abs_number_different_values(column_values: pd.Series):
+        return column_values.nunique()
+
+    def rel_number_different_values(column_values: pd.Series):
+        return column_values.nunique() / len(column_values)
+
+    def detect_categorical(column_values: pd.Series):
+        abs_unique_values = abs_number_different_values(column_values)
+        rel_unique_values = rel_number_different_values(column_values)
+        if (
+            abs_unique_values <= max_number_categorical_values
+            or rel_unique_values <= threshold_pct_categorical
+        ):
+            return True
+        return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting categorical columns")
+    res = table.apply(lambda serie: detect_categorical(serie))
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res], res
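For illustration, a sketch of the categorical detection with its default thresholds (at most 25 distinct values, or at most 5% of the rows); column names and values are invented:

import pandas as pd

from csv_detective.detection.variables import detect_categorical_variable

df = pd.DataFrame({
    "code_region": ["11", "24", "27", "28"] * 25,   # 4 distinct values over 100 rows
    "montant": [str(i / 2) for i in range(100)],     # all values distinct
})
categorical_cols, mask = detect_categorical_variable(df)
print(list(categorical_cols))  # ['code_region']
print(mask.tolist())           # [True, False]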
csv_detective/explore_csv.py
CHANGED
@@ -1,52 +1,28 @@
-"""
-Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
-contenu possible des champs
-"""
-
-from typing import Dict, List, Union
 from collections import defaultdict
 import json
-import
+import logging
 import os
 import tempfile
-import logging
 from time import time
-import
-
+from typing import Union
+
+import numpy as np
 import pandas as pd

 # flake8: noqa
 from csv_detective import detect_fields, detect_labels
-from
-
-from csv_detective.utils import (
-    cast_df,
-    display_logs_depending_process_time,
-    prepare_output_dict,
-    test_col,
-    test_label,
-)
-from .detection import (
-    detect_engine,
-    detect_separator,
-    detect_encoding,
-    detect_headers,
-    detect_heading_columns,
-    detect_trailing_columns,
-    parse_table,
-    parse_excel,
-    create_profile,
-    detetect_categorical_variable,
+from .detection.variables import (
+    detect_categorical_variable,
     # detect_continuous_variable,
-    is_url,
-    unzip,
-    XLS_LIKE_EXT,
-    EXCEL_ENGINES,
-    COMPRESSION_ENGINES,
 )
-
-
-
+from .output.dataframe import cast_df
+from .output.profile import create_profile
+from .output.schema import generate_table_schema
+from .output.utils import prepare_output_dict
+from .parsing.load import load_file
+from .parsing.columns import test_col, test_label
+from .s3_utils import download_from_minio, upload_to_minio
+from .utils import display_logs_depending_process_time, is_url


 def get_all_packages(detect_type) -> list:
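In short, this import hunk swaps the monolithic csv_detective.detection and csv_detective.utils imports for the new sub-packages; downstream code would now import from the new locations, for example:

from csv_detective.detection.variables import detect_categorical_variable
from csv_detective.output.dataframe import cast_df
from csv_detective.output.profile import create_profile
from csv_detective.parsing.load import load_file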
@@ -107,9 +83,9 @@ def return_all_tests(


 def routine(
-
+    file_path: str,
     num_rows: int = 500,
-    user_input_tests: Union[str,
+    user_input_tests: Union[str, list[str]] = "ALL",
     limited_output: bool = True,
     save_results: Union[bool, str] = True,
     encoding: str = None,
@@ -126,7 +102,7 @@ def routine(
     column contents.

     Args:
-
+        file_path: local path to CSV file if not using Minio
         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
             of the whole file
         user_input_tests: tests to run on the file
@@ -143,100 +119,40 @@ def routine(
     Returns:
         dict: a dict with information about the csv and possible types for each column
     """
-    if not csv_file_path:
-        raise ValueError("csv_file_path is required.")

     if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

     if verbose:
         start_routine = time()
-    if is_url(
+    if is_url(file_path):
         logging.info("Path recognized as a URL")

-
-
-
-
-
-
-
-
-        is_xls_like = True
-        encoding, sep, heading_columns, trailing_columns = None, None, None, None
-        table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
-            csv_file_path=csv_file_path,
-            num_rows=num_rows,
-            engine=engine,
-            sheet_name=sheet_name,
-            verbose=verbose,
-        )
-        header = table.columns.to_list()
-    else:
-        # fetching or reading file as binary
-        if is_url(csv_file_path):
-            r = requests.get(csv_file_path, allow_redirects=True)
-            r.raise_for_status()
-            binary_file = BytesIO(r.content)
-        else:
-            binary_file = open(csv_file_path, "rb")
-        # handling compression
-        if engine in COMPRESSION_ENGINES:
-            binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
-        # detecting encoding if not specified
-        if encoding is None:
-            encoding: str = detect_encoding(binary_file, verbose=verbose)
-            binary_file.seek(0)
-        # decoding and reading file
-        if is_url(csv_file_path) or engine in COMPRESSION_ENGINES:
-            str_file = StringIO(binary_file.read().decode(encoding=encoding))
-        else:
-            str_file = open(csv_file_path, "r", encoding=encoding)
-        if sep is None:
-            sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-            return {"error": True}
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return {"error": True}
-        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-        table, total_lines, nb_duplicates = parse_table(
-            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-        )
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
+    )

     if table.empty:
         res_categorical = []
         # res_continuous = []
     else:
         # Detects columns that are categorical
-        res_categorical, categorical_mask =
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
         res_categorical = list(res_categorical)
         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
         # res_continuous = list(
         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
         # )

-
-    analysis = {
-        "header_row_idx": header_row_idx,
-        "header": header,
-        "total_lines": total_lines,
-        "nb_duplicates": nb_duplicates,
-        "heading_columns": heading_columns,
-        "trailing_columns": trailing_columns,
+    analysis.update({
         "categorical": res_categorical,
         # "continuous": res_continuous,
-    }
-    # this is only relevant for xls-like
-    if is_xls_like:
-        analysis["engine"] = engine
-        analysis["sheet_name"] = sheet_name
-    # this is only relevant for csv
-    else:
-        analysis["encoding"] = encoding
-        analysis["separator"] = sep
+    })

     # list testing to be performed
     all_tests_fields = return_all_tests(
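The whole CSV/Excel branching removed above now sits behind a single call. A sketch of the new entry point as used in this hunk (the file name is hypothetical; the keys of the returned analysis dict are assumed to match what the removed inline code used to build):

from csv_detective.parsing.load import load_file

table, analysis = load_file(
    file_path="data/my_table.csv",  # hypothetical local file
    num_rows=500,
    encoding=None,   # presumably auto-detected when None, as in the removed code
    sep=None,        # likewise auto-detected
    verbose=False,
    sheet_name=None,
)
print(type(table), sorted(analysis))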
@@ -355,10 +271,10 @@ def routine(
         if isinstance(save_results, str):
             output_path = save_results
         else:
-            output_path = os.path.splitext(
+            output_path = os.path.splitext(file_path)[0]
         if is_url(output_path):
             output_path = output_path.split('/')[-1]
-        if
+        if analysis.get("sheet_name"):
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
         with open(output_path, "w", encoding="utf8") as fp:
@@ -386,13 +302,13 @@ def routine(


 def routine_minio(
-    csv_minio_location:
-    output_minio_location:
-    tableschema_minio_location:
+    csv_minio_location: dict[str, str],
+    output_minio_location: dict[str, str],
+    tableschema_minio_location: dict[str, str],
     minio_user: str,
     minio_pwd: str,
     num_rows: int = 500,
-    user_input_tests: Union[str,
+    user_input_tests: Union[str, list[str]] = "ALL",
     encoding: str = None,
     sep: str = None,
 ):
@@ -450,18 +366,18 @@ def routine_minio(
     ):
         raise ValueError("Minio location dict must contain url, bucket and key")

-
+    file_path = tempfile.NamedTemporaryFile(delete=False).name
     download_from_minio(
         netloc=csv_minio_location["netloc"],
         bucket=csv_minio_location["bucket"],
         key=csv_minio_location["key"],
-        filepath=
+        filepath=file_path,
         minio_user=minio_user,
         minio_pwd=minio_pwd,
     )

     analysis = routine(
-
+        file_path,
         num_rows,
         user_input_tests,
         output_mode="LIMITED",
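The location arguments annotated earlier as dict[str, str] are plain mappings; a sketch of the expected shape based on the keys read by download_from_minio in this hunk (values are placeholders):

csv_minio_location = {
    "netloc": "object-store.example.org",  # hypothetical Minio endpoint
    "bucket": "datasets",
    "key": "exports/my_table.csv",
}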
@@ -471,7 +387,7 @@ def routine_minio(
     )

     # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(
+    output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
         json.dump(analysis, fp, indent=4, separators=(",", ": "))

@@ -485,7 +401,7 @@ def routine_minio(
     )

     os.remove(output_path_to_store_minio_file)
-    os.remove(
+    os.remove(file_path)

     generate_table_schema(
         analysis,
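After this refactor the public entry point takes file_path instead of csv_file_path. A minimal call sketch using only arguments visible in the hunks above (the path is hypothetical):

from csv_detective.explore_csv import routine

report = routine(
    file_path="data/my_table.csv",  # hypothetical local file
    num_rows=500,
    save_results=False,
)
print(report.get("categorical"))  # filled by the analysis.update({...}) above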
csv_detective/output/dataframe.py
ADDED
@@ -0,0 +1,55 @@
+from datetime import date, datetime
+import json
+from typing import Optional, Union
+from time import time
+
+import pandas as pd
+
+from csv_detective.detect_fields.other.booleen import bool_casting
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    if _type == "float":
+        return float_casting(value)
+    if _type == "bool":
+        return bool_casting(value)
+    if _type == "json":
+        # in hydra json are given to postgres as strings, conversion is done by postgres
+        return json.loads(value)
+    if _type == "date":
+        _date = date_casting(value)
+        return _date.date() if _date else None
+    if _type == "datetime":
+        return date_casting(value)
+    raise ValueError(f"Unknown type `{_type}`")
+
+
+def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+    if verbose:
+        start = time()
+    output_df = pd.DataFrame()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+            # no change if detected type is string
+            output_df[col_name] = df[col_name].copy()
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+        else:
+            output_df[col_name] = df[col_name].apply(
+                lambda col: cast(col, _type=detection["python_type"])
+            )
+        # to save RAM
+        del df[col_name]
+    if verbose:
+        display_logs_depending_process_time(
+            f'Casting columns completed in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return output_df
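For illustration, a sketch of how cast_df consumes the per-column detections; the columns dict below only carries the "python_type" key that cast_df actually reads, while the real detection output has more fields:

import pandas as pd

from csv_detective.output.dataframe import cast_df

df = pd.DataFrame({
    "montant": ["3.14", "", "2.5"],
    "commune": ["Paris", "Lyon", "Brest"],
})
columns = {
    "montant": {"python_type": "float"},   # cast via float_casting, '' becomes None
    "commune": {"python_type": "string"},  # copied as-is
}
typed = cast_df(df, columns)
print(typed.dtypes)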
csv_detective/{create_example.py → output/example.py}
RENAMED
@@ -1,13 +1,14 @@
+from datetime import datetime
+import json
 import random
-import uuid
 import string
-from
+from typing import Union, Optional, Any, Type
+import uuid
+
+from faker import Faker
 import pandas as pd
-from typing import List, Union, Optional, Any, Type
-import json
 import requests
 import rstr
-from faker import Faker

 fake = Faker()

@@ -69,7 +70,7 @@ def create_example_csv_file(
         return str(uuid.uuid4())

     def _date(
-        date_range: Union[None,
+        date_range: Union[None, list[str]] = None,
         format: str = '%Y-%m-%d',
         required: bool = True,
     ) -> str:
@@ -98,7 +99,7 @@ def create_example_csv_file(
         return fake.time(format)

     def _datetime(
-        datetime_range: Optional[
+        datetime_range: Optional[list[str]] = None,
         format: str = '%Y-%m-%d %H-%M-%S',
         required: bool = True,
     ) -> str:
@@ -123,7 +124,7 @@ def create_example_csv_file(

     def _number(
         num_type: Type[Union[int, float]] = int,
-        num_range: Optional[
+        num_range: Optional[list[float]] = None,
         enum: Optional[list] = None,
         required: bool = True,
     ) -> Union[int, float]:
@@ -144,7 +145,7 @@ def create_example_csv_file(
             return ''
         return random.randint(0, 1) == 0

-    def _array(enum:
+    def _array(enum: list[Any], required: bool = True) -> str:
         if potential_skip(required):
             return ''
         return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
csv_detective/output/profile.py
ADDED
@@ -0,0 +1,87 @@
+from collections import defaultdict
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+
+
+def create_profile(
+    table: pd.DataFrame,
+    dict_cols_fields: dict,
+    num_rows: int,
+    limited_output: bool = True,
+    verbose: bool = False,
+) -> dict:
+    if verbose:
+        start = time()
+        logging.info("Creating profile")
+    map_python_types = {
+        "string": str,
+        "int": float,
+        "float": float,
+    }
+
+    if num_rows > 0:
+        raise ValueError("To create profiles num_rows has to be set to -1")
+    safe_table = table.copy()
+    if not limited_output:
+        dict_cols_fields = {
+            k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+            for k, v in dict_cols_fields.items()
+        }
+    dtypes = {
+        k: map_python_types.get(v["python_type"], str)
+        for k, v in dict_cols_fields.items()
+    }
+    for c in safe_table.columns:
+        if dtypes[c] == float:
+            safe_table[c] = safe_table[c].apply(
+                lambda s: float_casting(s) if isinstance(s, str) else s
+            )
+    profile = defaultdict(dict)
+    for c in safe_table.columns:
+        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
+            float,
+            int,
+        ]:
+            profile[c].update(
+                min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].min()
+                )),
+                max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].max()
+                )),
+                mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].mean()
+                )),
+                std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].std()
+                )),
+            )
+        tops_bruts = (
+            safe_table[safe_table[c].notna()][c]
+            .value_counts(dropna=True)
+            .reset_index()
+            .iloc[:10]
+            .to_dict(orient="records")
+        )
+        tops = []
+        for tb in tops_bruts:
+            tops.append({
+                "count": tb["count"],
+                "value": tb[c],
+            })
+        profile[c].update(
+            tops=tops,
+            nb_distinct=safe_table[c].nunique(),
+            nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+        )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Created profile in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return profile
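For illustration, a sketch of the profile output on a single float column; the detections dict mirrors the per-column shape read above (at least a "python_type" key), the data is invented, and num_rows must be -1 because only full-file analyses can be profiled:

import pandas as pd

from csv_detective.output.profile import create_profile

df = pd.DataFrame({"montant": ["1.5", "2.5", None, "4.0"]})
detections = {"montant": {"python_type": "float", "format": "float", "score": 1.0}}

profile = create_profile(df, detections, num_rows=-1)
print(profile["montant"]["min"], profile["montant"]["max"])  # e.g. 1.5 4.0
print(profile["montant"]["nb_missing_values"])               # 1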