PyPI - csv-detective - Versions diffs - 0.8.1.dev1500__py3-none-any.whl → 0.8.1.dev1526__py3-none-any.whl - Mend

csv-detective 0.8.1.dev1500__py3-none-any.whl → 0.8.1.dev1526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

csv_detective/detect_fields/__init__.py CHANGED Viewed

@@ -57,4 +57,4 @@ from .geo import (
 )
 from .FR.temp import jour_de_la_semaine, mois_de_annee
-from .temp import year, date, datetime, datetime_iso, datetime_rfc822
+from .temp import year, date, datetime_aware, datetime_iso, datetime_naive, datetime_rfc822

csv_detective/detect_fields/temp/date/__init__.py CHANGED Viewed

@@ -14,7 +14,7 @@ def date_casting(val: str) -> Optional[datetime]:
         return dateutil_parser(val)
     except ParserError:
         return date_parser(val)
-    except OverflowError:
+    except Exception:
         return None

csv_detective/detect_fields/temp/datetime_aware/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+from typing import Any, Optional
+from csv_detective.detect_fields.temp.date import date_casting
+PROPORTION = 1
+def _is(val: Optional[Any]) -> bool:
+    """Detects timezone-aware datetimes only"""
+    # early stops, to cut processing time
+    if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
+        return False
+    threshold = 0.7
+    if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
+        return False
+    res = date_casting(val)
+    return (
+        res is not None
+        and bool(res.hour or res.minute or res.second or res.microsecond)
+        and bool(res.tzinfo)
+    )

csv_detective/detect_fields/temp/{datetime → datetime_naive}/__init__.py RENAMED Viewed

@@ -6,14 +6,16 @@ PROPORTION = 1
 def _is(val: Optional[Any]) -> bool:
-    '''Renvoie True si val peut être un datetime, False sinon'''
+    """Detects naive datetimes only"""
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
         return False
     threshold = 0.7
-    if sum([char.isdigit() for char in val]) / len(val) < threshold:
+    if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)
-    if res and (res.hour or res.minute or res.second):
-        return True
-    return False
+    return (
+        res is not None
+        and bool(res.hour or res.minute or res.second or res.microsecond)
+        and not bool(res.tzinfo)
+    )

csv_detective/detection/formats.py CHANGED Viewed

@@ -106,8 +106,9 @@ def detect_formats(
         "string": "string",
         "json": "json",
         "json_geojson": "json",
-        "datetime": "datetime",
+        "datetime_aware": "datetime",
         "datetime_iso": "datetime",
+        "datetime_naive": "datetime",
         "datetime_rfc822": "datetime",
         "date": "date",
         "latitude": "float",

csv_detective/output/schema.py CHANGED Viewed

@@ -106,7 +106,9 @@ def get_validata_type(format: str) -> str:
         "float": "number",
         "string": "string",
         "date": "date",
+        "datetime_aware": "datetime",
         "datetime_iso": "datetime",
+        "datetime_naive": "datetime",
         "datetime_rfc822": "datetime",
         "json_geojson": "geojson",
         "latitude": "number",

csv_detective/output/utils.py CHANGED Viewed

@@ -34,6 +34,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
             formats_to_remove.add("longitude_l93")
         if "code_region" in formats_detected:
             formats_to_remove.add("code_departement")
+        if "datetime_iso" in formats_detected:
+            formats_to_remove.add("datetime_naive")
+        if "datetime_rfc822" in formats_detected:
+            formats_to_remove.add("datetime_aware")
         formats_to_keep = formats_detected - formats_to_remove

csv_detective/parsing/columns.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 from csv_detective.utils import display_logs_depending_process_time
-MAX_ROWS_ANALYSIS = 1e5
+MAX_ROWS_ANALYSIS = int(1e5)
 def test_col_val(

{csv_detective-0.8.1.dev1500.data → csv_detective-0.8.1.dev1526.data}/data/share/csv_detective/CHANGELOG.md RENAMED Viewed

@@ -5,8 +5,9 @@
 - Refactor label testing [#119](https://github.com/datagouv/csv-detective/pull/119)
 - Refactor repo metadata and requirements [#120](https://github.com/datagouv/csv-detective/pull/120) [#122](https://github.com/datagouv/csv-detective/pull/122)
 - Better URL detection [#121](https://github.com/datagouv/csv-detective/pull/121)
-- For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124)
+- For big files, analyse on sample then validate on whole file [#124](https://github.com/datagouv/csv-detective/pull/124) [#129](https://github.com/datagouv/csv-detective/pull/129)
 - Fix imports [#125](https://github.com/datagouv/csv-detective/pull/125) [#126](https://github.com/datagouv/csv-detective/pull/126) [#127](https://github.com/datagouv/csv-detective/pull/127) [#128](https://github.com/datagouv/csv-detective/pull/128)
+- Split aware and naive datetimes for hydra to cast them separately [#130](https://github.com/datagouv/csv-detective/pull/130)
 ## 0.8.0 (2025-05-20)

{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv_detective
-Version: 0.8.1.dev1500
+Version: 0.8.1.dev1526
 Summary: Detect tabular files column content
 Home-page: https://github.com/datagouv/csv_detective
 Author: Etalab

{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/RECORD RENAMED Viewed

@@ -5,7 +5,7 @@ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
 csv_detective/validate.py,sha256=d_4Phmjk6Y0Z0YYVw4vpoZy8E79K370reGgkpzx1mcQ,2644
-csv_detective/detect_fields/__init__.py,sha256=7Tz0Niaz0BboA3YVsp_6WPA6ywciwDN4-lOy_Ie_0Y8,976
+csv_detective/detect_fields/__init__.py,sha256=HYSy0P_aH6R8Z8Hvd8aMaBAQaZ1QwcsWHT0YPm0iYs0,998
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
@@ -66,9 +66,10 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=1a_Ra9fmT4wgGMrcknXP7eN7A2QiaMF0Yjy0-BMihtA,987
-csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
+csv_detective/detect_fields/temp/date/__init__.py,sha256=VC4_C5lQbjqTweC4T2p9GZAIO64zERhAuf53CPfXgw4,983
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=Xi3fWiqm_S09AaMeHVrgx6bSieX1gEdjjM7GYsKqEx8,667
 csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=q5Ow1yH9nCz8aY4uOHIKv8CCYIEPLUZlHzg8Nr59kBo,662
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
 csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
 csv_detective/detect_labels/__init__.py,sha256=BJjWlwTnnDe9nomABDUreu9EMu6IFG3T47d7YCJZbRc,878
@@ -131,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
 csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
 csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
 csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
-csv_detective/detection/formats.py,sha256=5ZW7gmhyQt6BB7xLcVVhui17oGn1udAWI9w22EAOHy4,6337
+csv_detective/detection/formats.py,sha256=LDrstnAJccDeOEvGbWA5Ppx4gdlJrKbqd7qqWRG2tHI,6382
 csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
 csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -140,28 +141,28 @@ csv_detective/output/__init__.py,sha256=5KTevPfp_4MRxByJyOntQjToNfeG7dPQn-_13wSq
 csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
 csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5fA,8649
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
-csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
-csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
+csv_detective/output/schema.py,sha256=WxgajFuLfUTQQtmEdlO8ve2ULDzw2BYfz8QFwUsdDh0,13558
+csv_detective/output/utils.py,sha256=qFYhxJmkKrTUefdH7Owh-liZijswomCafic4cXYSyCg,2506
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=e0xVmeXNvSC3su5HTFSNClgkz8PlFkoHmNwRYdS57mk,5670
+csv_detective/parsing/columns.py,sha256=VzgG9Nwph5C_fLW_TuQC5BZVlPmOyjrH7Plvm_c8kWc,5675
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
 csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
 csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
 csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
-csv_detective-0.8.1.dev1500.data/data/share/csv_detective/CHANGELOG.md,sha256=oia4cDcjux7TfUIfy5uRUTtTy48s_XxtmKiCgETseJI,8975
-csv_detective-0.8.1.dev1500.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
-csv_detective-0.8.1.dev1500.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
-csv_detective-0.8.1.dev1500.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.8.1.dev1526.data/data/share/csv_detective/CHANGELOG.md,sha256=QBkuYfCNZtm-waJYz1YEITwR8kCMDKKZH6-ef7oj8tQ,9161
+csv_detective-0.8.1.dev1526.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.8.1.dev1526.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
+csv_detective-0.8.1.dev1526.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
-tests/test_fields.py,sha256=d2tNvjtal6ZbO646x1GDbp_CGgp-EIcdg2SgMG72J6E,10270
-tests/test_file.py,sha256=FWVtYHlD5uU7tPeYsqlQg6O4lpU8Ct35vddkbzhvvjA,8508
+tests/test_fields.py,sha256=zeEQbHs0ougLzydmZLZs1l2UdrhKBEtdCCK64B4dhSU,10700
+tests/test_file.py,sha256=0bHV9wx9mSRoav_DVF19g694yohb1p0bw7rtcBeKG-8,8451
 tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
 tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
 tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
-csv_detective-0.8.1.dev1500.dist-info/METADATA,sha256=jHSw0ajgp-2BhtGeneNUCWHPEDXcdvRrvdGYiOy4v7k,10443
-csv_detective-0.8.1.dev1500.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.8.1.dev1500.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.8.1.dev1500.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
-csv_detective-0.8.1.dev1500.dist-info/RECORD,,
+csv_detective-0.8.1.dev1526.dist-info/METADATA,sha256=6w8386meaPhTcYjmslsOqjkqvpLPZme5ikCsx7zJizo,10443
+csv_detective-0.8.1.dev1526.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.8.1.dev1526.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.8.1.dev1526.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.8.1.dev1526.dist-info/RECORD,,

tests/test_fields.py CHANGED Viewed

@@ -58,7 +58,14 @@ from csv_detective.detect_fields.other import (
     int as test_int,
     float as test_float,
 )
-from csv_detective.detect_fields.temp import date, datetime, datetime_iso, datetime_rfc822, year
+from csv_detective.detect_fields.temp import (
+    date,
+    datetime_aware,
+    datetime_iso,
+    datetime_naive,
+    datetime_rfc822,
+    year,
+)
 from csv_detective.detection.variables import (
     detect_continuous_variable,
     detect_categorical_variable,
@@ -70,7 +77,7 @@ from csv_detective.output.dataframe import cast
 def test_all_tests_return_bool():
     all_tests = return_all_tests("ALL", "detect_fields")
     for test in all_tests:
-        for tmp in ["a", "9", "3.14", "[]", float("nan")]:
+        for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]:
             assert isinstance(test._is(tmp), bool)
@@ -337,9 +344,13 @@ fields = {
             "02052003",
         ],
     },
-    datetime: {
-        True: ["2021-06-22T10:20:10"],
-        False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
+    datetime_aware: {
+        True: ["2021-06-22 10:20:10-04:00", "2030-06-22 00:00:00.0028+02:00", "1996/06/22 10:20:10 GMT"],
+        False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
+    },
+    datetime_naive: {
+        True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
+        False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10+02:00"],
     },
     datetime_iso: {
         True: ["2021-06-22T10:20:10"],
@@ -388,6 +399,7 @@ def test_fields_with_values(args):
         ('{"a": 1}', "json", dict),
         ("2022-08-01", "date", _date),
         ("2024-09-23 17:32:07", "datetime", _datetime),
+        ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
     ),
 )
 def test_cast(args):

tests/test_file.py CHANGED Viewed

@@ -7,12 +7,11 @@ from csv_detective import routine
 @pytest.mark.parametrize(
-    "reduce_max_rows_analysis",
-    (True, False),
+    "max_rows_analysis",
+    (100, int(1e5)),
 )
-def test_columns_output_on_file(reduce_max_rows_analysis):
-    patched = 100 if reduce_max_rows_analysis else 1e5
-    with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", patched):
+def test_columns_output_on_file(max_rows_analysis):
+    with patch("csv_detective.detection.formats.MAX_ROWS_ANALYSIS", max_rows_analysis):
         output = routine(
             file_path="tests/data/a_test_file.csv",
             num_rows=-1,

{csv_detective-0.8.1.dev1500.data → csv_detective-0.8.1.dev1526.data}/data/share/csv_detective/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1500.data → csv_detective-0.8.1.dev1526.data}/data/share/csv_detective/README.md RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.8.1.dev1500.dist-info → csv_detective-0.8.1.dev1526.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.8.1.dev1500__py3-none-any.whl → 0.8.1.dev1526__py3-none-any.whl

csv-detective 0.8.1.dev1500__py3-none-any.whl → 0.8.1.dev1526__py3-none-any.whl