PyPI - csv-detective - Versions diffs - 0.9.1.dev1801__py3-none-any.whl → 0.9.1.dev1847__py3-none-any.whl - Mend

csv-detective 0.9.1.dev1801-py3-none-any.whl → 0.9.1.dev1847-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

csv_detective/detect_fields/temp/date/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import re
 from datetime import datetime
 from typing import Optional
@@ -19,6 +20,23 @@ def date_casting(val: str) -> Optional[datetime]:
         return None
+seps = r"[\s/\-\*_\|;.,]"
+# matches DD-MM-YYYY (day-month-year) with any of the listed separators
+jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
+    "SEP", seps
+)
+# matches YYYY-MM-DD (year-month-day) with any of the listed separators OR NO SEPARATOR
+aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
+    "SEP", seps + "?"
+)
+# matches DD-mon-YYYY and DD-month-YYYY (abbreviated or full month name, 2- or 4-digit year) with any of the listed separators OR NO SEPARATOR
+string_month_pattern = (
+    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
+    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
+    r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
+    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
+).replace("SEP", seps + "?")
 threshold = 0.3
@@ -27,6 +45,16 @@ def _is(val):
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
+    # if it's a usual date pattern
+    if any(
+        # the checks are chained with "or", so as soon as one pattern matches, the next ones are not evaluated
+        [
+            bool(re.match(jjmmaaaa_pattern, val))
+            or bool(re.match(aaaammjj_pattern, val))
+            or bool(re.match(string_month_pattern, val, re.IGNORECASE))
+        ]
+    ):
+        return True
     if sum([char.isdigit() for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)

csv_detective/detect_fields/temp/datetime_aware/__init__.py CHANGED Viewed

@@ -1,8 +1,16 @@
+import re
 from typing import Any, Optional
-from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 PROPORTION = 1
+threshold = 0.7
+# matches YYYY-MM-DDTHH:MM:SS(.dddddd)±HH:MM (a whitespace is accepted in place of "T") with any of the listed separators for the date OR NO SEPARATOR
+pat = (
+    aaaammjj_pattern.replace("$", "")
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
+)
 def _is(val: Optional[Any]) -> bool:
@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
     # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
     if not isinstance(val, str) or len(val) > 35 or len(val) < 21:
         return False
-    threshold = 0.7
+    # if usual format, no need to parse
+    if bool(re.match(pat, val)):
+        return True
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)

csv_detective/detect_fields/temp/datetime_naive/__init__.py CHANGED Viewed

@@ -1,8 +1,16 @@
+import re
 from typing import Any, Optional
-from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 PROPORTION = 1
+threshold = 0.7
+# matches YYYY-MM-DDTHH:MM:SS(.dddddd)Z (a whitespace is accepted in place of "T") with any of the listed separators for the date OR NO SEPARATOR
+pat = (
+    aaaammjj_pattern.replace("$", "")
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
+)
 def _is(val: Optional[Any]) -> bool:
@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
     # 26 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd, keeping some slack
     if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
         return False
-    threshold = 0.7
+    # if usual format, no need to parse
+    if bool(re.match(pat, val)):
+        return True
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)

csv_detective/output/dataframe.py CHANGED Viewed

@@ -33,27 +33,23 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
 def cast_df(
     df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
 ) -> pd.DataFrame:
+    # for efficiency this modifies the dataframe in place as we don't need it anymore afterwards
     if verbose:
         start = time()
-    output_df = pd.DataFrame()
     for col_name, detection in columns.items():
         if detection["python_type"] == "string" or (
             detection["python_type"] == "json" and not cast_json
         ):
             # no change if detected type is string
-            output_df[col_name] = df[col_name].copy()
+            continue
         elif detection["python_type"] == "int":
             # to allow having ints and NaN in the same column
-            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+            df[col_name] = df[col_name].astype(pd.Int64Dtype())
         else:
-            output_df[col_name] = df[col_name].apply(
-                lambda col: cast(col, _type=detection["python_type"])
-            )
-        # to save RAM
-        del df[col_name]
+            df[col_name] = df[col_name].apply(lambda col: cast(col, _type=detection["python_type"]))
     if verbose:
         display_logs_depending_process_time(
             f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
-    return output_df
+    return df

csv_detective/parsing/csv.py CHANGED Viewed

@@ -32,9 +32,7 @@ def parse_csv(
         if "ISO-8859" in encoding:
             encoding = "ISO-8859-1"
         try:
-            table = pd.read_csv(
-                the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
-            )
+            table = pd.read_csv(the_file, sep=sep, dtype=str, encoding=encoding, skiprows=skiprows)
             total_lines = len(table)
             nb_duplicates = len(table.loc[table.duplicated()])
             if num_rows > 0:

csv_detective/parsing/excel.py CHANGED Viewed

@@ -101,7 +101,7 @@ def parse_excel(
                 file_path,
                 engine="odf",
                 sheet_name=None,
-                dtype="unicode",
+                dtype=str,
             )
             sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
             sheet_name = max(sizes, key=sizes.get)
@@ -121,7 +121,7 @@ def parse_excel(
                 file_path,
                 engine="odf",
                 sheet_name=sheet_name,
-                dtype="unicode",
+                dtype=str,
             )
         table, header_row_idx = remove_empty_first_rows(table)
         total_lines = len(table)
@@ -152,7 +152,7 @@ def parse_excel(
         file_path,
         engine=engine,
         sheet_name=sheet_name,
-        dtype="unicode",
+        dtype=str,
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)

{csv_detective-0.9.1.dev1801.dist-info → csv_detective-0.9.1.dev1847.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.1.dev1801
+Version: 0.9.1.dev1847
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.9.1.dev1801.dist-info → csv_detective-0.9.1.dev1847.dist-info}/RECORD RENAMED Viewed

@@ -67,9 +67,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=uVOszufihKqiQmS0wz7nUuQ2Dz-Tq9fSk1nf3S00mg4,1010
-csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=bEfWvXx_GNCRUxMGJYqfOK4wRDr3WMaGVAmIa_C2pXE,853
-csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=GtQo55SrrXfoT-L7ZXW63jrlAYvNT5m56wMfhuY3pyI,836
+csv_detective/detect_fields/temp/date/__init__.py,sha256=JtWaK8hkzBaIUc-fu0G7lIFpWqCfraRh6l0Mo65U3b0,2155
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=ZDNUcbU0ZJzaxUt0Utc1Y9dRrq4HHW9uCbcnOuz5Sfk,1247
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=QoVOA98lT_GVSGO_mQwKtAy2o-REs8C9d6JB9d_L_B4,1189
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
 csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
 csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -138,7 +138,7 @@ csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2V
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
 csv_detective/output/__init__.py,sha256=f-UFv_iULpVF_Fy39H4sfACEnrthjK4N3mCAVPkjnKw,1860
-csv_detective/output/dataframe.py,sha256=UpLuSxx_SFbKpem1n-xY7jF16MXGpKQYEWjaSMIiB4s,2215
+csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
 csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
 csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77lcA,3130
 csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
@@ -146,14 +146,14 @@ csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/parsing/columns.py,sha256=fbvQMu12gAmz4TnNCL7pLnMFB-mWN_O-zEoj8jEGj0A,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=qZFLOT3YCPoHF0svfVfQBnS8eHtucjDZ7dFITAPgLhc,1626
-csv_detective/parsing/excel.py,sha256=ULUDw76z6hs1Xm2yL9KBM0EOvIsfBLkxwqTZfDEx6aE,7045
+csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
+csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
 csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.1.dev1801.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.1.dev1847.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=Y2mBfV9ZdxTHYwHnkzGbpo1k_qJRLC8nU-zzAUxFmAE,11964
+tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
 tests/test_file.py,sha256=YuVbSfeo_ASPiLT8CyxXqJENcDpj4wAFXzLwu_GzsOA,8437
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
@@ -161,8 +161,8 @@ tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
 venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
 venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.1.dev1801.dist-info/METADATA,sha256=v6wVh2pCJfMUKK3tKjDm23UXJ1tKMAfnaLSrHFUMrKI,9767
-csv_detective-0.9.1.dev1801.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.1.dev1801.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.1.dev1801.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.1.dev1801.dist-info/RECORD,,
+csv_detective-0.9.1.dev1847.dist-info/METADATA,sha256=4GPrJUwsDAkxwVV9fnFv4pVHmelYX1C1H4QCh_zG8wc,9767
+csv_detective-0.9.1.dev1847.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.1.dev1847.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.1.dev1847.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.1.dev1847.dist-info/RECORD,,

tests/test_fields.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from datetime import date as _date
 from datetime import datetime as _datetime
+from unittest.mock import patch
 import pandas as pd
 import pytest
@@ -98,7 +99,7 @@ def test_detetect_categorical_variable():
         "cat2": categorical_col2,
         "not_cat": not_categorical_col,
     }
-    df = pd.DataFrame(df_dict, dtype="unicode")
+    df = pd.DataFrame(df_dict, dtype=str)
     res, _ = detect_categorical_variable(df)
     assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
@@ -113,8 +114,8 @@ def test_detect_continuous_variable():
     df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
     df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
-    df = pd.DataFrame(df_dict, dtype="unicode")
-    df2 = pd.DataFrame(df_dict_2, dtype="unicode")
+    df = pd.DataFrame(df_dict, dtype=str)
+    df2 = pd.DataFrame(df_dict_2, dtype=str)
     res = detect_continuous_variable(df)
     res2 = detect_continuous_variable(df2, continuous_th=0.65)
@@ -441,3 +442,22 @@ def test_priority(args):
     col = "col1"
     output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
     assert output[col]["format"] == expected
+@pytest.mark.parametrize(
+    "args",
+    (
+        ("1996-02-13", date),
+        ("28/01/2000", date),
+        ("2025-08-20T14:30:00+02:00", datetime_aware),
+        ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
+        ("1925_12_20T14:30:00.2763Z", datetime_naive),
+        ("1925 12 20 14:30:00Z", datetime_naive),
+    ),
+)
+def test_early_detection(args):
+    value, module = args
+    with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func:
+        res = module._is(value)
+        assert res
+        mock_func.assert_not_called()

{csv_detective-0.9.1.dev1801.dist-info → csv_detective-0.9.1.dev1847.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1801.dist-info → csv_detective-0.9.1.dev1847.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1801.dist-info → csv_detective-0.9.1.dev1847.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1801.dist-info → csv_detective-0.9.1.dev1847.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.1.dev1801__py3-none-any.whl → 0.9.1.dev1847__py3-none-any.whl

csv-detective 0.9.1.dev1801-py3-none-any.whl → 0.9.1.dev1847-py3-none-any.whl