PyPI - csv-detective - Versions diffs - 0.9.1.dev1869__py3-none-any.whl → 0.9.2__py3-none-any.whl - Mend

csv-detective 0.9.1.dev1869__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

csv_detective/output/__init__.py CHANGED Viewed

@@ -25,14 +25,7 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-    if output_profile or output_df:
-        # to create the profile we have to cast columns, so using the dedicated function
-        table = cast_df(
-            df=table,
-            columns=analysis["columns"],
-            cast_json=cast_json,
-            verbose=verbose,
-        )
+    if output_profile:
         analysis["profile"] = create_profile(
             table=table,
             columns=analysis["columns"],
@@ -61,5 +54,10 @@ def generate_output(
         analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
     if output_df:
-        return analysis, table
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
     return analysis

csv_detective/output/profile.py CHANGED Viewed

@@ -4,7 +4,8 @@ from time import time
 import pandas as pd
-from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
 def create_profile(
@@ -18,11 +19,6 @@ def create_profile(
     if verbose:
         start = time()
         logging.info("Creating profile")
-    map_python_types = {
-        "string": str,
-        "int": float,
-        "float": float,
-    }
     if num_rows > 0:
         raise ValueError("To create profiles num_rows has to be set to -1")
@@ -35,12 +31,19 @@ def create_profile(
     for c in table.columns:
         # for numerical formats we want min, max, mean, std
         if columns[c]["python_type"] in ["float", "int"]:
+            # we locally cast the column to perform the operations, using the same method as in cast_df
+            cast_col = (
+                table[c].astype(pd.Int64Dtype())
+                if columns[c]["python_type"] == "int"
+                else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+            )
             profile[c].update(
-                min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
-                max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
-                mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
-                std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
+                min=cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                max=cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                mean=cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                std=cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
             )
+            del cast_col
         # for all formats we want most frequent values, nb unique values and nb missing values
         tops_bruts = (
             table.loc[table[c].notna(), c]

csv_detective/utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import Optional
+from typing import Optional, Union
 import pandas as pd
@@ -31,5 +31,7 @@ def is_url(file_path: str) -> bool:
     return file_path.startswith("http")
-def prevent_nan(value: float) -> Optional[float]:
-    return None if pd.isna(value) else value
+def cast_prevent_nan(value: float, _type: str) -> Optional[Union[float, int]]:
+    if _type not in {"int", "float"}:
+        raise ValueError(f"Invalid type was passed: {_type}")
+    return None if pd.isna(value) else eval(_type)(value)

{csv_detective-0.9.1.dev1869.dist-info → csv_detective-0.9.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.1.dev1869
+Version: 0.9.2
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.9.1.dev1869.dist-info → csv_detective-0.9.2.dist-info}/RECORD RENAMED Viewed

@@ -3,7 +3,7 @@ csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
 csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
 csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
 csv_detective/s3_utils.py,sha256=z1KTVVkdurMv21o-rZu7_aluMJnSi-d5uxnQbqT2NoI,1407
-csv_detective/utils.py,sha256=u9I1tsyMfVr2eIYiGCD7Iu30d55H3za44-N3cV2nj8M,1013
+csv_detective/utils.py,sha256=xiIO7ZDqkTm9Rnhnq6RaDdnrPIfoG0JV9AsmaOG6plA,1162
 csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
 csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -137,10 +137,10 @@ csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPp
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
+csv_detective/output/__init__.py,sha256=bMsLp-XCVf4sNymIof_kdMdqFIY7GocOas-lPNekfQg,1930
 csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
 csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
-csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
+csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
 csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
 csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
 csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.1.dev1869.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
-tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
+tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
 venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
 venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.1.dev1869.dist-info/METADATA,sha256=3gGiQT_yLk3thJkrLt5l90W8ylzk_MVYN0_F3wGv5qE,9767
-csv_detective-0.9.1.dev1869.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.1.dev1869.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.1.dev1869.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.1.dev1869.dist-info/RECORD,,
+csv_detective-0.9.2.dist-info/METADATA,sha256=Yval8NfM6FC2eiIz8bybr9vbjJXOgS81VHzDJiBiPGI,9759
+csv_detective-0.9.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.2.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.2.dist-info/RECORD,,

tests/test_file.py CHANGED Viewed

@@ -5,6 +5,7 @@ import pytest
 import responses
 from csv_detective import routine
+from csv_detective.output.profile import create_profile
 @pytest.mark.parametrize(
@@ -97,6 +98,55 @@ def test_profile_with_num_rows():
         )
+@pytest.mark.parametrize(
+    "params",
+    (
+        (
+            True,
+            {
+                "int_with_nan": {"format": "int", "python_type": "int"},
+                "date": {"format": "date", "python_type": "date"},
+            },
+        ),
+        (
+            False,
+            {
+                "int_with_nan": [{"format": "int", "python_type": "int"}],
+                "date": [{"format": "date", "python_type": "date"}],
+            },
+        ),
+    ),
+)
+def test_profile_specific_cases(params):
+    limited_output, columns = params
+    table = pd.DataFrame(
+        {
+            "int_with_nan": ["1", pd.NA, pd.NA],
+            "date": ["1996-01-02", "1996-01-02", "2024-11-12"],
+        }
+    )
+    profile = create_profile(
+        table=table,
+        columns=columns,
+        limited_output=limited_output,
+        num_rows=-1,
+    )
+    assert profile["int_with_nan"] == {
+        "min": 1,
+        "max": 1,
+        "mean": 1,
+        "std": None,
+        "tops": [{"count": 1, "value": "1"}],
+        "nb_distinct": 1,
+        "nb_missing_values": 2,
+    }
+    assert profile["date"] == {
+        "tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
+        "nb_distinct": 2,
+        "nb_missing_values": 0,
+    }
 def test_exception_different_number_of_columns():
     """
     A ValueError should be raised if the number of columns differs between the first rows

{csv_detective-0.9.1.dev1869.dist-info → csv_detective-0.9.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1869.dist-info → csv_detective-0.9.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1869.dist-info → csv_detective-0.9.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1869.dist-info → csv_detective-0.9.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.1.dev1869__py3-none-any.whl → 0.9.2__py3-none-any.whl

csv-detective 0.9.1.dev1869__py3-none-any.whl → 0.9.2__py3-none-any.whl