PyPI - csv-detective - Versions diffs - 0.7.5.dev1078__py3-none-any.whl → 0.7.5.dev1139__py3-none-any.whl - Mend

csv-detective 0.7.5.dev1078__py3-none-any.whl → 0.7.5.dev1139__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

csv_detective/detect_fields/__init__.py CHANGED Viewed

@@ -54,4 +54,4 @@ from .geo import (
 )
 from .FR.temp import jour_de_la_semaine, mois_de_annee
-from .temp import year, date, datetime_iso, datetime_rfc822
+from .temp import year, date, datetime, datetime_iso, datetime_rfc822

csv_detective/detect_fields/other/booleen/__init__.py CHANGED Viewed

@@ -1,21 +1,27 @@
 PROPORTION = 1
-liste_bool = {
-    '0',
-    '1',
-    'vrai',
-    'faux',
-    'true',
-    'false',
-    'oui',
-    'non',
-    'yes',
-    'no',
-    'y',
-    'n',
-    'o'
+bool_mapping = {
+    "1": True,
+    "0": False,
+    "vrai": True,
+    "faux": False,
+    "true": True,
+    "false": False,
+    "oui": True,
+    "non": False,
+    "yes": True,
+    "no": False,
+    "y": True,
+    "n": False,
+    "o": True,
 }
+liste_bool = set(bool_mapping.keys())
-def _is(val):
-    '''Détection les booléens'''
+def bool_casting(val: str) -> bool:
+    return bool_mapping.get(val)
+def _is(val: str) -> bool:
+    '''Détecte les booléens'''
     return isinstance(val, str) and val.lower() in liste_bool

csv_detective/detect_fields/other/float/__init__.py CHANGED Viewed

@@ -1,8 +1,8 @@
 PROPORTION = 1
-def float_casting(str2cast):
-    return float(str2cast.replace(',', '.'))
+def float_casting(val: str) -> float:
+    return float(val.replace(',', '.'))
 def _is(val):

csv_detective/detect_fields/temp/date/__init__.py CHANGED Viewed

@@ -1,46 +1,30 @@
-import re
-from dateutil.parser import parse, ParserError
-from csv_detective.detect_fields.other.float import _is as is_float
-from unidecode import unidecode
+from datetime import datetime
+from typing import Optional
+from dateparser import parse as date_parser
+from dateutil.parser import parse as dateutil_parser, ParserError
 PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
-def is_dateutil_date(val: str) -> bool:
-    # we don't want to get datetimes here, so length restriction
-    # longest date string expected here is DD-septembre-YYYY, so 17 characters
-    if len(val) > 17:
-        return False
+def date_casting(val: str) -> Optional[datetime]:
+    """For performance reasons, we try first with dateutil and fallback on dateparser"""
     try:
-        res = parse(val, fuzzy=False)
-        if res.hour or res.minute or res.second:
-            return False
-        return True
-    except (ParserError, ValueError, TypeError, OverflowError):
-        return False
-seps = r'[\s/\-\*_\|;.,]'
-# matches JJ-MM-AAAA with any of the listed separators
-pat = r'^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$'.replace('SEP', seps)
-# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
-tap = r'^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$'.replace('SEP', seps + '?')
-# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
-letters = (
-    r'^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr'
-    r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
-    r'mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP'
-    r'(\d{2}|\d{4})$'
-).replace('SEP', seps + '?')
+        return dateutil_parser(val)
+    except ParserError:
+        return date_parser(val)
 def _is(val):
-    '''Renvoie True si val peut être une date, False sinon
-    On ne garde que les regex pour les cas où parse() ne convient pas'''
-    return isinstance(val, str) and (
-        (is_dateutil_date(val) and not is_float(val))
-        or bool(re.match(letters, unidecode(val)))
-        or bool(re.match(pat, val))
-        or bool(re.match(tap, val))
-    )
+    '''Renvoie True si val peut être une date, False sinon'''
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
+        return False
+    threshold = 0.3
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if not res or res.hour or res.minute or res.second:
+        return False
+    return True

csv_detective/detect_fields/temp/datetime/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+from typing import Any, Optional
+from csv_detective.detect_fields.temp.date import date_casting
+PROPORTION = 1
+def _is(val: Optional[Any]) -> bool:
+    '''Renvoie True si val peut être un datetime, False sinon'''
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
+        return False
+    threshold = 0.7
+    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    if res and (res.hour or res.minute or res.second):
+        return True
+    return False

csv_detective/detect_labels/temp/date/__init__.py CHANGED Viewed

@@ -27,7 +27,9 @@ def _is(header):
         'dateouv',
         'date der maj',
         'dmaj',
-        'jour'
+        'jour',
+        'yyyymmdd',
+        'aaaammjj',
     ]
     processed_header = _process_text(header)

csv_detective/detection.py CHANGED Viewed

@@ -198,6 +198,8 @@ def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
     else:
         binary_file = open(csv_file_path, mode="rb")
     encoding_dict = detect(binary_file.read())
+    if not encoding_dict["encoding"]:
+        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
     if verbose:
         message = f'Detected encoding: "{encoding_dict["encoding"]}"'
         message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'

csv_detective/explore_csv.py CHANGED Viewed

@@ -19,7 +19,13 @@ import pandas as pd
 from csv_detective import detect_fields, detect_labels
 from csv_detective.s3_utils import download_from_minio, upload_to_minio
 from csv_detective.schema_generation import generate_table_schema
-from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
+from csv_detective.utils import (
+    cast_df,
+    display_logs_depending_process_time,
+    prepare_output_dict,
+    test_col,
+    test_label,
+)
 from .detection import (
     detect_engine,
     detect_separator,
@@ -111,6 +117,7 @@ def routine(
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
+    cast_json: bool = True,
     verbose: bool = False,
     sheet_name: Union[str, int] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
@@ -127,6 +134,7 @@ def routine(
         output_profile: whether or not to add the 'profile' field to the output
         output_schema: whether or not to add the 'schema' field to the output (tableschema)
         output_df: whether or not to return the loaded DataFrame along with the analysis report
+        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
         verbose: whether or not to print process logs in console
         sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
         skipna: whether to keep NaN (empty cells) for tests
@@ -276,6 +284,8 @@ def routine(
         "json": "json",
         "json_geojson": "json",
         "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
         "date": "date",
         "latitude": "float",
         "latitude_l93": "float",
@@ -352,7 +362,12 @@ def routine(
             time() - start_routine
         )
     if output_df:
-        return analysis, table
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
     return analysis

csv_detective/utils.py CHANGED Viewed

@@ -1,7 +1,13 @@
-from typing import Callable
+from typing import Callable, Optional, Union
+import json
 import pandas as pd
 import logging
 from time import time
+from datetime import date, datetime
+from csv_detective.detect_fields.other.booleen import bool_casting
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.detect_fields.temp.date import date_casting
 logging.basicConfig(level=logging.INFO)
@@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
 def full_word_strictly_inside_string(word: str, string: str):
     return (
-        (" " + word + " " in string)
+        word == string
+        or (" " + word + " " in string)
         or (string.startswith(word + " "))
         or (string.endswith(" " + word))
     )
+def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    if _type == "float":
+        return float_casting(value)
+    if _type == "bool":
+        return bool_casting(value)
+    if _type == "json":
+        # in hydra json are given to postgres as strings, conversion is done by postgres
+        return json.loads(value)
+    if _type == "date":
+        _date = date_casting(value)
+        return _date.date() if _date else None
+    if _type == "datetime":
+        return date_casting(value)
+    raise ValueError(f"Unknown type `{_type}`")
+def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+    if verbose:
+        start = time()
+    output_df = pd.DataFrame()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+            # no change if detected type is string
+            output_df[col_name] = df[col_name].copy()
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+        else:
+            output_df[col_name] = df[col_name].apply(
+                lambda col: cast(col, _type=detection["python_type"])
+            )
+        # to save RAM
+        del df[col_name]
+    if verbose:
+        display_logs_depending_process_time(
+            f'Casting columns completed in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return output_df

{csv_detective-0.7.5.dev1078.data → csv_detective-0.7.5.dev1139.data}/data/share/csv_detective/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,8 @@
 - New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
 - Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
 - Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
+- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
+- Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
 ## 0.7.4 (2024-11-15)

{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: csv_detective
-Version: 0.7.5.dev1078
+Version: 0.7.5.dev1139
 Summary: Detect CSV column content
 Home-page: https://github.com/etalab/csv_detective
 Author: Etalab
@@ -15,6 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Description-Content-Type: text/markdown
 License-File: LICENSE.AGPL.txt
 Requires-Dist: boto3==1.34.0
+Requires-Dist: dateparser==1.2.0
 Requires-Dist: faust-cchardet==2.1.19
 Requires-Dist: pandas==2.2.0
 Requires-Dist: pytest==8.3.0
@@ -29,3 +30,12 @@ Requires-Dist: python-magic==0.4.27
 Requires-Dist: frformat==0.4.0
 Requires-Dist: faker==33.0.0
 Requires-Dist: rstr==3.2.2
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: summary

{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
 csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
 csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
-csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225
-csv_detective/explore_csv.py,sha256=i1m1JmnMSILlGnPhXlXsUbDVcgXaJ1E2nKE7_6D2xEE,16996
+csv_detective/detection.py,sha256=zrP8qvLDvhVXTHi7Ty8G_ga4zfZPjBhuyApqFQkPq2Y,22373
+csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
 csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
-csv_detective/utils.py,sha256=3nzHNjMaNtAhwhQv_leVuBFXEYgPVFmWy1KzNCybblw,8556
-csv_detective/detect_fields/__init__.py,sha256=CchNbi1vrgIGh_uBexXZTzfjBETDY0kQLjI-PAquU8M,921
+csv_detective/utils.py,sha256=yO9INaLh-QX-FFL2A153AlMqftE04wb0hpN6HJvsKGg,10581
+csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
@@ -55,9 +55,9 @@ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSq
 csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
 csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/other/booleen/__init__.py,sha256=rM__y88CGoLkMXoRkonC4YxJT2E-HfjAXocKFjIqoxU,281
+csv_detective/detect_fields/other/booleen/__init__.py,sha256=1qIEI681iEaPVb9XxmH2ewxDdfmYhHe4-s3MZ6L1A9Q,489
 csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
-csv_detective/detect_fields/other/float/__init__.py,sha256=tdHBimi668qpJhVc87w-msUfGGUcKY_tex31u5W_VQs,545
+csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -65,7 +65,8 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
 csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=9-XhY3sMYRFQliEbprwKhfXCNz4_imgweZs_4Mbno9M,1784
+csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
+csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
 csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
 csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
@@ -122,22 +123,22 @@ csv_detective/detect_labels/other/twitter/__init__.py,sha256=D8G4vGsFL9a99OJz-03
 csv_detective/detect_labels/other/url/__init__.py,sha256=vqUQvn5o6JZU8iRsSG3AYqggjlhzagozVYWwpuSReV8,1202
 csv_detective/detect_labels/other/uuid/__init__.py,sha256=OdMUxqvqMdGaY5nph7CbIF_Q0LSxljxE72kCMT4m-Zk,931
 csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKUZxkZlVKhpgk41FxkM1VI,1281
+csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6ks77DP1kw2XMBYSLrzXE,1322
 csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
 csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
-csv_detective-0.7.5.dev1078.data/data/share/csv_detective/CHANGELOG.md,sha256=5M95hTftsY9Ic2q_jexDNp-MgAFAXuPZyWGyFABi3l4,6927
-csv_detective-0.7.5.dev1078.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
-csv_detective-0.7.5.dev1078.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1139.data/data/share/csv_detective/CHANGELOG.md,sha256=jgcSpxGkuEdOQtQ08tRFflkCNuHstGFHN_29Tv6M1dE,7176
+csv_detective-0.7.5.dev1139.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1139.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
-tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
-tests/test_file.py,sha256=oQITvAxdcrqDby2wWSh_X9TCwFqdFaP34XNy92ibXyg,6725
+tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
+tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
 tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
-csv_detective-0.7.5.dev1078.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
-csv_detective-0.7.5.dev1078.dist-info/METADATA,sha256=NSxmqCJpApiSavZ59QEMfRuzeB_pmOk2Wm_zTy-o2eQ,1145
-csv_detective-0.7.5.dev1078.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-csv_detective-0.7.5.dev1078.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.7.5.dev1078.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
-csv_detective-0.7.5.dev1078.dist-info/RECORD,,
+csv_detective-0.7.5.dev1139.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1139.dist-info/METADATA,sha256=L-WEqkw3fze2JUmOVwfqV1Ttgsf5lhi_WqEULi0AKkA,1364
+csv_detective-0.7.5.dev1139.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+csv_detective-0.7.5.dev1139.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1139.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1139.dist-info/RECORD,,

{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

tests/test_fields.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import pandas as pd
 from numpy import random
+import pytest
+from datetime import date as _date, datetime as _datetime
 from csv_detective.detect_fields.FR.geo import (
     adresse,
@@ -46,6 +48,7 @@ from csv_detective.detection import (
     detetect_categorical_variable,
 )
 from csv_detective.explore_csv import return_all_tests
+from csv_detective.utils import cast
 def test_all_tests_return_bool():
@@ -504,3 +507,19 @@ def test_match_float():
 def test_not_match_float():
     for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
         assert not test_float._is(val)
+@pytest.mark.parametrize(
+    "args",
+    (
+        ("1.9", "float", float),
+        ("oui", "bool", bool),
+        ("[1, 2]", "json", list),
+        ('{"a": 1}', "json", dict),
+        ("2022-08-01", "date", _date),
+        ("2024-09-23 17:32:07", "datetime", _datetime),
+    ),
+)
+def test_cast(args):
+    value, detected_type, cast_type = args
+    assert isinstance(cast(value, detected_type), cast_type)

tests/test_file.py CHANGED Viewed

@@ -232,3 +232,31 @@ def test_output_df():
     assert isinstance(output, dict)
     assert isinstance(df, pd.DataFrame)
     assert len(df) == 6
+    assert df["partly_empty"].dtype == pd.Int64Dtype()
+@pytest.mark.parametrize(
+    "cast_json",
+    (
+        (True, dict),
+        (False, str),
+    ),
+)
+def test_cast_json(mocked_responses, cast_json):
+    cast_json, expected_type = cast_json
+    expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
+    mocked_responses.get(
+        'http://example.com/test.csv',
+        body=expected_content,
+        status=200,
+    )
+    analysis, df = routine(
+        csv_file_path='http://example.com/test.csv',
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+        output_df=True,
+        cast_json=cast_json,
+    )
+    assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
+    assert isinstance(df["a_simple_dict"][0], expected_type)

{csv_detective-0.7.5.dev1078.data → csv_detective-0.7.5.dev1139.data}/data/share/csv_detective/LICENSE.AGPL.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1078.data → csv_detective-0.7.5.dev1139.data}/data/share/csv_detective/README.md RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/LICENSE.AGPL.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.7.5.dev1078__py3-none-any.whl → 0.7.5.dev1139__py3-none-any.whl

csv-detective 0.7.5.dev1078__py3-none-any.whl → 0.7.5.dev1139__py3-none-any.whl