PyPI - csv-detective - Versions diffs - 0.9.1.dev1860__py3-none-any.whl → 0.9.1.dev1869__py3-none-any.whl - Mend

csv-detective 0.9.1.dev1860__py3-none-any.whl → 0.9.1.dev1869__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

csv_detective/output/__init__.py CHANGED Viewed

@@ -25,12 +25,20 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-    if output_profile:
+    if output_profile or output_df:
+        # to create the profile we have to cast columns, so using the dedicated function
+        table = cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
         analysis["profile"] = create_profile(
             table=table,
-            dict_cols_fields=analysis["columns"],
+            columns=analysis["columns"],
             num_rows=num_rows,
             limited_output=limited_output,
+            cast_json=cast_json,
             verbose=verbose,
         )
@@ -45,16 +53,13 @@ def generate_output(
                 output_path += "_sheet-" + str(sheet_name)
             output_path += ".json"
         with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+            json.dump(
+                analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
+            )
     if output_schema:
         analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
     if output_df:
-        return analysis, cast_df(
-            df=table,
-            columns=analysis["columns"],
-            cast_json=cast_json,
-            verbose=verbose,
-        )
+        return analysis, table
     return analysis

csv_detective/output/profile.py CHANGED Viewed

@@ -4,15 +4,15 @@ from time import time
 import pandas as pd
-from csv_detective.detect_fields.other.float import float_casting
 from csv_detective.utils import display_logs_depending_process_time, prevent_nan
 def create_profile(
     table: pd.DataFrame,
-    dict_cols_fields: dict,
+    columns: dict,
     num_rows: int,
     limited_output: bool = True,
+    cast_json: bool = True,
     verbose: bool = False,
 ) -> dict:
     if verbose:
@@ -26,65 +26,44 @@ def create_profile(
     if num_rows > 0:
         raise ValueError("To create profiles num_rows has to be set to -1")
-    safe_table = table.copy()
     if not limited_output:
-        dict_cols_fields = {
+        columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
-            for k, v in dict_cols_fields.items()
+            for k, v in columns.items()
         }
-    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
-    for c in safe_table.columns:
-        if dtypes[c] is float:
-            safe_table[c] = safe_table[c].apply(
-                lambda s: float_casting(s) if isinstance(s, str) else s
-            )
     profile = defaultdict(dict)
-    for c in safe_table.columns:
-        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
-            float,
-            int,
-        ]:
+    for c in table.columns:
+        # for numerical formats we want min, max, mean, std
+        if columns[c]["python_type"] in ["float", "int"]:
             profile[c].update(
-                min=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].min()
-                    )
-                ),
-                max=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].max()
-                    )
-                ),
-                mean=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].mean()
-                    )
-                ),
-                std=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].std()
-                    )
-                ),
+                min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
+                max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
+                mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
+                std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
             )
+        # for all formats we want most frequent values, nb unique values and nb missing values
         tops_bruts = (
-            safe_table[safe_table[c].notna()][c]
-            .value_counts(dropna=True)
+            table.loc[table[c].notna(), c]
+            .value_counts()
             .reset_index()
             .iloc[:10]
             .to_dict(orient="records")
         )
-        tops = []
-        for tb in tops_bruts:
-            tops.append(
+        profile[c].update(
+            tops=[
                 {
                     "count": tb["count"],
                     "value": tb[c],
                 }
-            )
-        profile[c].update(
-            tops=tops,
-            nb_distinct=safe_table[c].nunique(),
-            nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+                for tb in tops_bruts
+            ],
+            nb_distinct=(
+                table[c].nunique()
+                if columns[c]["python_type"] != "json" or not cast_json
+                # a column containing cast json is not serializable
+                else table[c].astype(str).nunique()
+            ),
+            nb_missing_values=len(table[c].loc[table[c].isna()]),
         )
     if verbose:
         display_logs_depending_process_time(

{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.1.dev1860
+Version: 0.9.1.dev1869
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/RECORD RENAMED Viewed

@@ -137,10 +137,10 @@ csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPp
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=f-UFv_iULpVF_Fy39H4sfACEnrthjK4N3mCAVPkjnKw,1860
+csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
 csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
 csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
-csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77lcA,3130
+csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
 csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -150,7 +150,7 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
 csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
 csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.1.dev1860.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.1.dev1869.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
@@ -161,8 +161,8 @@ tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
 venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
 venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.1.dev1860.dist-info/METADATA,sha256=v8z2NQcMQznhH_35NtggEtjF-H9UGUycexq3Y8dNtp8,9767
-csv_detective-0.9.1.dev1860.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.1.dev1860.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.1.dev1860.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.1.dev1860.dist-info/RECORD,,
+csv_detective-0.9.1.dev1869.dist-info/METADATA,sha256=3gGiQT_yLk3thJkrLt5l90W8ylzk_MVYN0_F3wGv5qE,9767
+csv_detective-0.9.1.dev1869.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.1.dev1869.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.1.dev1869.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.1.dev1869.dist-info/RECORD,,

{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.1.dev1860__py3-none-any.whl → 0.9.1.dev1869__py3-none-any.whl

csv-detective 0.9.1.dev1860__py3-none-any.whl → 0.9.1.dev1869__py3-none-any.whl