csv-detective 0.7.5.dev1069__py3-none-any.whl → 0.7.5.dev1113__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +22 -16
- csv_detective/detect_fields/other/float/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +21 -37
- csv_detective/detect_fields/temp/datetime/__init__.py +19 -0
- csv_detective/detect_labels/temp/date/__init__.py +3 -1
- csv_detective/detection.py +47 -53
- csv_detective/explore_csv.py +57 -47
- csv_detective/utils.py +53 -2
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/METADATA +2 -1
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/RECORD +20 -19
- tests/test_fields.py +19 -0
- tests/test_file.py +28 -0
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/top_level.txt +0 -0
csv_detective/detect_fields/other/booleen/__init__.py
CHANGED
@@ -1,21 +1,27 @@
 PROPORTION = 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
 }
 
+liste_bool = set(bool_mapping.keys())
 
-
-
+
+def bool_casting(val: str) -> bool:
+    return bool_mapping.get(val)
+
+
+def _is(val: str) -> bool:
+    '''Détecte les booléens'''
     return isinstance(val, str) and val.lower() in liste_bool
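Note: the boolean detector now centralises its accepted values in bool_mapping. A minimal sketch of how the two helpers behave, assuming the module is imported exactly as in the diff (sample values are illustrative only):

    from csv_detective.detect_fields.other.booleen import _is, bool_casting

    # _is() lowercases its input, so detection is case-insensitive
    assert _is("Oui") and _is("FALSE") and _is("n")
    assert not _is("maybe") and not _is(1)

    # bool_casting() looks the value up as-is in bool_mapping
    assert bool_casting("oui") is True
    assert bool_casting("non") is False
    assert bool_casting("Oui") is None  # no lowercasing here, unmapped keys return None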
csv_detective/detect_fields/temp/date/__init__.py
CHANGED
@@ -1,46 +1,30 @@
-import
-from
-
-from
+from datetime import datetime
+from typing import Optional
+
+from dateparser import parse as date_parser
+from dateutil.parser import parse as dateutil_parser, ParserError
 
 PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
 
 
-def
-
-    # longest date string expected here is DD-septembre-YYYY, so 17 characters
-    if len(val) > 17:
-        return False
+def date_casting(val: str) -> Optional[datetime]:
+    """For performance reasons, we try first with dateutil and fallback on dateparser"""
     try:
-
-
-
-        return True
-    except (ParserError, ValueError, TypeError, OverflowError):
-        return False
-
-
-seps = r'[\s/\-\*_\|;.,]'
-# matches JJ-MM-AAAA with any of the listed separators
-pat = r'^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$'.replace('SEP', seps)
-# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
-tap = r'^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$'.replace('SEP', seps + '?')
-# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
-letters = (
-    r'^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr'
-    r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
-    r'mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP'
-    r'(\d{2}|\d{4})$'
-).replace('SEP', seps + '?')
+        return dateutil_parser(val)
+    except ParserError:
+        return date_parser(val)
 
 
 def _is(val):
-    '''Renvoie True si val peut être une date, False sinon
-
-
-
-
-
-
-    )
+    '''Renvoie True si val peut être une date, False sinon'''
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
+        return False
+    threshold = 0.3
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if not res or res.hour or res.minute or res.second:
+        return False
+    return True
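The regex-based date check is replaced by real parsing behind cheap early stops (length and digit ratio). A hedged sketch of the resulting behaviour, assuming dateutil and dateparser are installed as the new imports require (sample values are illustrative only):

    from csv_detective.detect_fields.temp.date import _is, date_casting

    assert _is("2024-09-23")                # parses and has no time component
    assert not _is("2024-09-23 17:32:07")   # carries a time part, left to the datetime detectors
    assert not _is("hello world!")          # rejected by the digit-ratio early stop
    # dateutil does not know French month names and raises ParserError, so dateparser takes over
    assert date_casting("23 septembre 2024") is not None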
csv_detective/detect_fields/temp/datetime/__init__.py
ADDED
@@ -0,0 +1,19 @@
+from typing import Any, Optional
+
+from csv_detective.detect_fields.temp.date import date_casting
+
+PROPORTION = 1
+
+
+def _is(val: Optional[Any]) -> bool:
+    '''Renvoie True si val peut être un datetime, False sinon'''
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
+        return False
+    threshold = 0.7
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if res and (res.hour or res.minute or res.second):
+        return True
+    return False
csv_detective/detection.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import TextIO, Optional
+from typing import TextIO, Optional, Union
+from collections import defaultdict
 import pandas as pd
 import math
 import csv
@@ -27,7 +28,7 @@ engine_to_file = {
 }
 
 
-def is_url(csv_file_path: str):
+def is_url(csv_file_path: str) -> bool:
     # could be more sophisticated if needed
     return csv_file_path.startswith('http')
 
@@ -35,17 +36,14 @@ def is_url(csv_file_path: str):
 def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
-    one that contains
-    a considerable amount of float values.
+    one that contains a considerable amount of float values.
     We removed the integers as we then end up with postal codes, insee codes, and all sort
     of codes and types.
     This is not optimal but it will do for now.
-    :param table:
-    :return:
     """
     # if we need this again in the future, could be first based on columns detected as int/float to cut time
 
-    def check_threshold(serie: pd.Series, continuous_th: float):
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
         count = serie.value_counts().to_dict()
         total_nb = len(serie)
         if float in count:
@@ -75,7 +73,7 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return res.index[res]
 
@@ -121,12 +119,12 @@ def detetect_categorical_variable(
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return res.index[res], res
 
 
-def detect_engine(csv_file_path: str, verbose=False):
+def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         start = time()
     mapping = {
@@ -145,12 +143,12 @@ def detect_engine(csv_file_path: str, verbose=False):
     if verbose:
         display_logs_depending_process_time(
             f'File has no extension, detected {engine_to_file.get(engine, "csv")}',
-            time() - start
+            time() - start,
         )
     return engine
 
 
-def detect_separator(file: TextIO, verbose: bool = False):
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
     """Detects csv separator"""
     # TODO: add a robust detection:
     # si on a un point virgule comme texte et \t comme séparateur, on renvoie
@@ -181,12 +179,12 @@ def detect_separator(file: TextIO, verbose: bool = False):
     if verbose:
         display_logs_depending_process_time(
             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return sep
 
 
-def detect_encoding(csv_file_path: str, verbose: bool = False):
+def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
     """
     Detects file encoding using faust-cchardet (forked from the original cchardet)
     """
@@ -205,7 +203,7 @@ def detect_encoding(csv_file_path: str, verbose: bool = False):
         message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
         display_logs_depending_process_time(
             message,
-            time() - start
+            time() - start,
         )
     return encoding_dict['encoding']
 
@@ -218,8 +216,7 @@ def parse_table(
     skiprows: int,
     random_state: int = 42,
     verbose : bool = False,
-):
-    # Takes care of some problems
+) -> tuple[pd.DataFrame, int, int]:
     if verbose:
         start = time()
         logging.info("Parsing table")
@@ -230,7 +227,6 @@ def parse_table(
 
     total_lines = None
     for encoding in [encoding, "ISO-8859-1", "utf-8"]:
-        # TODO : modification systematique
         if encoding is None:
             continue
 
@@ -251,17 +247,16 @@
             print("Trying encoding : {encoding}".format(encoding=encoding))
 
     if table is None:
-
-        return table, "NA", "NA"
+        raise ValueError("Could not load file")
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates
 
 
-def remove_empty_first_rows(table: pd.DataFrame):
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
@@ -274,7 +269,7 @@ def remove_empty_first_rows(table: pd.DataFrame):
     cols = table.iloc[idx - 1]
     table = table.iloc[idx:]
     table.columns = cols.to_list()
-    # +1 here because the
+    # +1 here because the headers should count as a row
     return table, idx
 
 
@@ -285,7 +280,7 @@ def parse_excel(
     sheet_name: Optional[str] = None,
     random_state: int = 42,
     verbose : bool = False,
-):
+) -> tuple[pd.DataFrame, int, int, str, str, int]:
     """"Excel-like parsing is really slow, could be a good improvement for future development"""
     if verbose:
         start = time()
@@ -309,7 +304,7 @@ def parse_excel(
         if verbose:
             display_logs_depending_process_time(
                 f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                time() - start
+                time() - start,
             )
         try:
             if engine == "openpyxl":
@@ -341,7 +336,7 @@ def parse_excel(
             if verbose:
                 display_logs_depending_process_time(
                     'Could not read file with classic xls reader, trying with ODS',
-                    time() - start
+                    time() - start,
                 )
             engine = "odf"
 
@@ -354,33 +349,33 @@ def parse_excel(
            if verbose:
                display_logs_depending_process_time(
                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
-                    time() - start
+                    time() - start,
                )
            tables = pd.read_excel(
                csv_file_path,
                engine="odf",
                sheet_name=None,
-                dtype="unicode"
+                dtype="unicode",
            )
            sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
            sheet_name = max(sizes, key=sizes.get)
            if verbose:
                display_logs_depending_process_time(
                    f'Going forwards with sheet "{sheet_name}"',
-                    time() - start
+                    time() - start,
                )
            table = tables[sheet_name]
        else:
            if verbose:
                display_logs_depending_process_time(
                    f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                    time() - start
+                    time() - start,
                )
            table = pd.read_excel(
                csv_file_path,
                engine="odf",
                sheet_name=sheet_name,
-                dtype="unicode"
+                dtype="unicode",
            )
            table, header_row_idx = remove_empty_first_rows(table)
            total_lines = len(table)
@@ -391,7 +386,7 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
 
@@ -400,18 +395,18 @@ def parse_excel(
        if no_sheet_specified:
            display_logs_depending_process_time(
                f'Going forwards with sheet "{sheet_name}"',
-                time() - start
+                time() - start,
            )
        else:
            display_logs_depending_process_time(
                f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
-                time() - start
+                time() - start,
            )
    table = pd.read_excel(
        csv_file_path,
        engine=engine,
        sheet_name=sheet_name,
-        dtype="unicode"
+        dtype="unicode",
    )
    table, header_row_idx = remove_empty_first_rows(table)
    total_lines = len(table)
@@ -422,12 +417,12 @@ def parse_excel(
     if verbose:
         display_logs_depending_process_time(
             f'Table parsed successfully in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
         )
     return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
 
 
-def prevent_nan(value: float):
+def prevent_nan(value: float) -> Optional[float]:
     if math.isnan(value):
         return None
     return value
@@ -439,7 +434,7 @@ def create_profile(
     num_rows: int,
     limited_output: bool = True,
     verbose: bool = False,
-):
+) -> dict:
     if verbose:
         start = time()
         logging.info("Creating profile")
@@ -466,9 +461,8 @@ def create_profile(
        safe_table[c] = safe_table[c].apply(
            lambda s: float_casting(s) if isinstance(s, str) else s
        )
-    profile =
+    profile = defaultdict(dict)
    for c in safe_table.columns:
-        profile[c] = {}
        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
            float,
            int,
@@ -494,10 +488,10 @@ def create_profile(
            .to_dict(orient="records")
        tops = []
        for tb in tops_bruts:
-
-
-
-
+            tops.append({
+                "count": tb["count"],
+                "value": tb[c],
+            })
        profile[c].update(
            tops=tops,
            nb_distinct=safe_table[c].nunique(),
@@ -506,7 +500,7 @@ def create_profile(
     if verbose:
         display_logs_depending_process_time(
             f"Created profile in {round(time() - start, 3)}s",
-            time() - start
+            time() - start,
         )
     return profile
 
@@ -540,7 +534,7 @@ def detect_extra_columns(file: TextIO, sep: str):
     return nb_useless_col, retour
 
 
-def detect_headers(file: TextIO, sep: str, verbose: bool = False):
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
     """Tests 10 first rows for possible header (header not in 1st line)"""
     if verbose:
         start = time()
@@ -559,7 +553,7 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False):
        if verbose:
            display_logs_depending_process_time(
                f'Detected headers in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return i, chaine
    if verbose:
@@ -567,7 +561,7 @@
     return 0, None
 
 
-def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
+def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False) -> int:
     """Tests first 10 lines to see if there are empty heading columns"""
     if verbose:
         start = time()
@@ -581,18 +575,18 @@ def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
        if verbose:
            display_logs_depending_process_time(
                f'No heading column detected in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
        )
    return return_int
 
 
-def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False):
+def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False) -> int:
     """Tests first 10 lines to see if there are empty trailing columns"""
     if verbose:
         start = time()
@@ -611,12 +605,12 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
        if verbose:
            display_logs_depending_process_time(
                f'No trailing column detected in {round(time() - start, 3)}s',
-                time() - start
+                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
-            time() - start
+            time() - start,
        )
    return return_int
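Most of the detection.py changes above are mechanical (trailing commas, return-type hints); the behavioural ones are that parse_table now raises ValueError("Could not load file") instead of returning (None, "NA", "NA"), and that create_profile builds its output from a defaultdict. As a small sanity check, a sketch exercising the two helpers whose full bodies are visible in this diff (values are illustrative only):

    from csv_detective.detection import is_url, prevent_nan

    assert is_url("https://example.com/data.csv")   # simple startswith("http") check
    assert not is_url("data/local_file.csv")
    assert prevent_nan(float("nan")) is None         # NaN becomes None, safer for JSON output
    assert prevent_nan(0.5) == 0.5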
csv_detective/explore_csv.py
CHANGED
@@ -4,6 +4,7 @@ contenu possible des champs
 """
 
 from typing import Dict, List, Union
+from collections import defaultdict
 import json
 import numpy as np
 import os
@@ -18,7 +19,13 @@ import pandas as pd
 from csv_detective import detect_fields, detect_labels
 from csv_detective.s3_utils import download_from_minio, upload_to_minio
 from csv_detective.schema_generation import generate_table_schema
-from csv_detective.utils import
+from csv_detective.utils import (
+    cast_df,
+    display_logs_depending_process_time,
+    prepare_output_dict,
+    test_col,
+    test_label,
+)
 from .detection import (
     detect_engine,
     detect_separator,
@@ -39,7 +46,7 @@ from .detection import (
 logging.basicConfig(level=logging.INFO)
 
 
-def get_all_packages(detect_type):
+def get_all_packages(detect_type) -> list:
     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
     modules = []
     for dirpath, _, filenames in os.walk(root_dir):
@@ -58,7 +65,7 @@ def get_all_packages(detect_type):
 def return_all_tests(
     user_input_tests: Union[str, list],
     detect_type: str,
-):
+) -> list:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -110,6 +117,7 @@ def routine(
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
+    cast_json: bool = True,
     verbose: bool = False,
     sheet_name: Union[str, int] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
@@ -126,6 +134,7 @@ def routine(
     output_profile: whether or not to add the 'profile' field to the output
     output_schema: whether or not to add the 'schema' field to the output (tableschema)
     output_df: whether or not to return the loaded DataFrame along with the analysis report
+    cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
     verbose: whether or not to print process logs in console
     sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
     skipna: whether to keep NaN (empty cells) for tests
@@ -175,12 +184,10 @@ def routine(
        sep = detect_separator(str_file, verbose=verbose)
        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
        if header is None:
-
-            return return_dict
+            return {"error": True}
        elif isinstance(header, list):
            if any([x is None for x in header]):
-
-                return return_dict
+                return {"error": True}
        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
        table, total_lines, nb_duplicates = parse_table(
@@ -200,7 +207,7 @@ def routine(
     # )
 
     # Creating return dictionary
-
+    analysis = {
         "header_row_idx": header_row_idx,
         "header": header,
         "total_lines": total_lines,
@@ -212,12 +219,12 @@ def routine(
     }
     # this is only relevant for xls-like
     if is_xls_like:
-
-
+        analysis["engine"] = engine
+        analysis["sheet_name"] = sheet_name
     # this is only relevant for csv
     else:
-
-
+        analysis["encoding"] = encoding
+        analysis["separator"] = sep
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -229,25 +236,24 @@ def routine(
 
     # if no testing then return
     if not all_tests_fields and not all_tests_labels:
-        return
+        return analysis
 
     # Perform testing on fields
-
-
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-
-
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
-
+    scores_table = scores_table_fields * (
         1
-        +
-        index=
-        ).values
-        / 2
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
     )
 
     # To reduce false positives: ensure these formats are detected only if the label yields
@@ -263,12 +269,12 @@ def routine(
        "latitude_l93",
        "longitude_l93",
    ]
-
-
-
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
        0,
    )
-
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
 
    metier_to_python_type = {
        "booleen": "bool",
@@ -278,6 +284,8 @@ def routine(
        "json": "json",
        "json_geojson": "json",
        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
        "date": "date",
        "latitude": "float",
        "latitude_l93": "float",
@@ -291,7 +299,7 @@ def routine(
 
     if not limited_output:
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-
+            analysis[detection_method] = {
                 col_name: [
                     {
                         "python_type": metier_to_python_type.get(
@@ -301,32 +309,29 @@ def routine(
                    }
                    for detection in detections
                ]
-                for col_name, detections in
+                for col_name, detections in analysis[detection_method].items()
            }
    else:
        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-
+            analysis[detection_method] = {
                col_name: {
                    "python_type": metier_to_python_type.get(
                        detection["format"], "string"
                    ),
                    **detection,
                }
-                for col_name, detection in
+                for col_name, detection in analysis[detection_method].items()
            }
 
    # Add detection with formats as keys
-
-
-
-    }
-    for header, col_metadata in return_dict["columns"].items():
-        return_dict["formats"][col_metadata["format"]].append(header)
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
 
    if output_profile:
-
+        analysis["profile"] = create_profile(
            table=table,
-            dict_cols_fields=
+            dict_cols_fields=analysis["columns"],
            num_rows=num_rows,
            limited_output=limited_output,
            verbose=verbose,
@@ -343,11 +348,11 @@ def routine(
            output_path += "_sheet-" + str(sheet_name)
        output_path += ".json"
        with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(
+            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
 
    if output_schema:
-
-
+        analysis["schema"] = generate_table_schema(
+            analysis,
            save_file=False,
            verbose=verbose
        )
@@ -357,8 +362,13 @@ def routine(
            time() - start_routine
        )
    if output_df:
-        return
-
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    return analysis
 
 
 def routine_minio(
@@ -436,7 +446,7 @@ def routine_minio(
        minio_pwd=minio_pwd,
    )
 
-
+    analysis = routine(
        csv_file_path,
        num_rows,
        user_input_tests,
@@ -449,7 +459,7 @@ def routine_minio(
    # Write report JSON file.
    output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
    with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-        json.dump(
+        json.dump(analysis, fp, indent=4, separators=(",", ": "))
 
    upload_to_minio(
        netloc=output_minio_location["netloc"],
@@ -464,7 +474,7 @@ def routine_minio(
    os.remove(csv_file_path)
 
    generate_table_schema(
-
+        analysis,
        True,
        netloc=tableschema_minio_location["netloc"],
        bucket=tableschema_minio_location["bucket"],
@@ -473,4 +483,4 @@ def routine_minio(
        minio_pwd=minio_pwd,
    )
 
-    return
+    return analysis
csv_detective/utils.py
CHANGED
@@ -1,7 +1,13 @@
-from typing import Callable
+from typing import Callable, Optional, Union
+import json
 import pandas as pd
 import logging
 from time import time
+from datetime import date, datetime
+
+from csv_detective.detect_fields.other.booleen import bool_casting
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.detect_fields.temp.date import date_casting
 
 logging.basicConfig(level=logging.INFO)
 
@@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
 
 def full_word_strictly_inside_string(word: str, string: str):
     return (
-
+        word == string
+        or (" " + word + " " in string)
         or (string.startswith(word + " "))
         or (string.endswith(" " + word))
     )
+
+
+def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    if _type == "float":
+        return float_casting(value)
+    if _type == "bool":
+        return bool_casting(value)
+    if _type == "json":
+        # in hydra json are given to postgres as strings, conversion is done by postgres
+        return json.loads(value)
+    if _type == "date":
+        _date = date_casting(value)
+        return _date.date() if _date else None
+    if _type == "datetime":
+        return date_casting(value)
+    raise ValueError(f"Unknown type `{_type}`")
+
+
+def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+    if verbose:
+        start = time()
+    output_df = pd.DataFrame()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+            # no change if detected type is string
+            output_df[col_name] = df[col_name].copy()
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+        else:
+            output_df[col_name] = df[col_name].apply(
+                lambda col: cast(col, _type=detection["python_type"])
+            )
+        # to save RAM
+        del df[col_name]
+    if verbose:
+        display_logs_depending_process_time(
+            f'Casting columns completed in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return output_df
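A short sketch of the new casting helpers; the column metadata dict below is shaped like the "columns" section of the analysis output, and float_casting's exact behaviour is assumed from its name since its body is not shown in this diff:

    import pandas as pd
    from csv_detective.utils import cast, cast_df

    assert cast("1.9", "float") == 1.9
    assert cast("oui", "bool") is True
    assert cast("", "date") is None   # empty or non-string values fall back to None

    df = pd.DataFrame({"code": ["01", "02"], "value": ["1.5", ""]})
    columns = {
        "code": {"python_type": "string", "format": "string"},
        "value": {"python_type": "float", "format": "float"},
    }
    typed = cast_df(df, columns)
    assert typed["value"].iloc[0] == 1.5   # cast to float, empty cells become NaN
    assert typed["code"].iloc[0] == "01"   # string columns are left untouched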
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/CHANGELOG.md
RENAMED
@@ -4,6 +4,8 @@
 
 - New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
 - Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
+- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
+- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
 
 ## 0.7.4 (2024-11-15)
 
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: csv_detective
-Version: 0.7.5.dev1069
+Version: 0.7.5.dev1113
 Summary: Detect CSV column content
 Home-page: https://github.com/etalab/csv_detective
 Author: Etalab
@@ -15,6 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Description-Content-Type: text/markdown
 License-File: LICENSE.AGPL.txt
 Requires-Dist: boto3==1.34.0
+Requires-Dist: dateparser==1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas==2.2.0
 Requires-Dist: pytest==8.3.0
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/RECORD
RENAMED
@@ -1,13 +1,13 @@
 csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
 csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
 csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
-csv_detective/detection.py,sha256=
-csv_detective/explore_csv.py,sha256
+csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225
+csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
 csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
-csv_detective/utils.py,sha256=
-csv_detective/detect_fields/__init__.py,sha256=
+csv_detective/utils.py,sha256=yO9INaLh-QX-FFL2A153AlMqftE04wb0hpN6HJvsKGg,10581
+csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
@@ -55,9 +55,9 @@ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSq
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
 csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/other/booleen/__init__.py,sha256=
+csv_detective/detect_fields/other/booleen/__init__.py,sha256=1qIEI681iEaPVb9XxmH2ewxDdfmYhHe4-s3MZ6L1A9Q,489
 csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
-csv_detective/detect_fields/other/float/__init__.py,sha256=
+csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -65,7 +65,8 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
 csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
+csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
 csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
 csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
@@ -122,22 +123,22 @@ csv_detective/detect_labels/other/twitter/__init__.py,sha256=D8G4vGsFL9a99OJz-03
 csv_detective/detect_labels/other/url/__init__.py,sha256=vqUQvn5o6JZU8iRsSG3AYqggjlhzagozVYWwpuSReV8,1202
 csv_detective/detect_labels/other/uuid/__init__.py,sha256=OdMUxqvqMdGaY5nph7CbIF_Q0LSxljxE72kCMT4m-Zk,931
 csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_labels/temp/date/__init__.py,sha256
+csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6ks77DP1kw2XMBYSLrzXE,1322
 csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
 csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1113.data/data/share/csv_detective/CHANGELOG.md,sha256=S9f0BlHhNQhrJ8bbw7bThthn2AG-gP5n8eg4Eep05IA,7063
+csv_detective-0.7.5.dev1113.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1113.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
+tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
 tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1113.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1113.dist-info/METADATA,sha256=7kqAw_UnjMjoBSfLqk59j7OYdY9PB0bPC35p9QxXbFY,1178
+csv_detective-0.7.5.dev1113.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+csv_detective-0.7.5.dev1113.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1113.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1113.dist-info/RECORD,,
tests/test_fields.py
CHANGED
@@ -1,5 +1,7 @@
 import pandas as pd
 from numpy import random
+import pytest
+from datetime import date as _date, datetime as _datetime
 
 from csv_detective.detect_fields.FR.geo import (
     adresse,
@@ -46,6 +48,7 @@ from csv_detective.detection import (
     detetect_categorical_variable,
 )
 from csv_detective.explore_csv import return_all_tests
+from csv_detective.utils import cast
 
 
 def test_all_tests_return_bool():
@@ -504,3 +507,19 @@ def test_match_float():
 def test_not_match_float():
     for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
         assert not test_float._is(val)
+
+
+@pytest.mark.parametrize(
+    "args",
+    (
+        ("1.9", "float", float),
+        ("oui", "bool", bool),
+        ("[1, 2]", "json", list),
+        ('{"a": 1}', "json", dict),
+        ("2022-08-01", "date", _date),
+        ("2024-09-23 17:32:07", "datetime", _datetime),
+    ),
+)
+def test_cast(args):
+    value, detected_type, cast_type = args
+    assert isinstance(cast(value, detected_type), cast_type)
tests/test_file.py
CHANGED
@@ -232,3 +232,31 @@ def test_output_df():
     assert isinstance(output, dict)
     assert isinstance(df, pd.DataFrame)
     assert len(df) == 6
+    assert df["partly_empty"].dtype == pd.Int64Dtype()
+
+
+@pytest.mark.parametrize(
+    "cast_json",
+    (
+        (True, dict),
+        (False, str),
+    ),
+)
+def test_cast_json(mocked_responses, cast_json):
+    cast_json, expected_type = cast_json
+    expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
+    mocked_responses.get(
+        'http://example.com/test.csv',
+        body=expected_content,
+        status=200,
+    )
+    analysis, df = routine(
+        csv_file_path='http://example.com/test.csv',
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+        output_df=True,
+        cast_json=cast_json,
+    )
+    assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
+    assert isinstance(df["a_simple_dict"][0], expected_type)
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/LICENSE.AGPL.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.data → csv_detective-0.7.5.dev1113.data}/data/share/csv_detective/README.md
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/LICENSE.AGPL.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/entry_points.txt
RENAMED
File without changes
{csv_detective-0.7.5.dev1069.dist-info → csv_detective-0.7.5.dev1113.dist-info}/top_level.txt
RENAMED
File without changes