PyPI - csv-detective - Versions diffs - 0.9.1.dev1830__py3-none-any.whl → 0.9.1.dev1860__py3-none-any.whl - Mend

csv-detective 0.9.1.dev1830__py3-none-any.whl → 0.9.1.dev1860__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

csv_detective/detection/formats.py CHANGED Viewed

@@ -14,6 +14,9 @@ from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
 from csv_detective.validate import validate
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25
 def detect_formats(
     table: pd.DataFrame,
@@ -28,14 +31,18 @@ def detect_formats(
     if on_sample:
         if verbose:
             logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
-        table = table.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
+        table = build_sample(table)
     if table.empty:
         res_categorical = []
         # res_continuous = []
     else:
         # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+        res_categorical, categorical_mask = detect_categorical_variable(
+            table,
+            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
+            verbose=verbose,
+        )
         res_categorical = list(res_categorical)
         # Detect columns that are continuous (we already know the categorical) :
         # we don't need this for now, cuts processing time
@@ -166,3 +173,33 @@ def detect_formats(
             raise ValueError("Could not infer detected formats on the whole file")
     return analysis
+def build_sample(table: pd.DataFrame) -> pd.DataFrame:
+    """
+    building a sample of MAX_ROWS_ANALYSIS rows that contains at least one representative of
+    the min and max values of each column, and one case of NaN if the column contains any.
+    """
+    samples = pd.concat(
+        [
+            # one row with the minimum of the column
+            table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
+            for col in table.columns
+        ]
+        + [
+            # one row with the maximum of the column
+            table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
+            for col in table.columns
+        ]
+        + [
+            # one row with a NaN value if the column has any
+            table.loc[table[col].isna()].iloc[[0]]
+            for col in table.columns
+            if table[col].isna().any()
+        ],
+        ignore_index=True,
+    )
+    return pd.concat(
+        [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+        ignore_index=True,
+    )

csv_detective/parsing/columns.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 from csv_detective.utils import display_logs_depending_process_time
-MAX_ROWS_ANALYSIS = int(1e5)
+MAX_ROWS_ANALYSIS = int(1e4)
 def test_col_val(

csv_detective/parsing/csv.py CHANGED Viewed

@@ -32,9 +32,7 @@ def parse_csv(
         if "ISO-8859" in encoding:
             encoding = "ISO-8859-1"
         try:
-            table = pd.read_csv(
-                the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
-            )
+            table = pd.read_csv(the_file, sep=sep, dtype=str, encoding=encoding, skiprows=skiprows)
             total_lines = len(table)
             nb_duplicates = len(table.loc[table.duplicated()])
             if num_rows > 0:

csv_detective/parsing/excel.py CHANGED Viewed

@@ -101,7 +101,7 @@ def parse_excel(
                 file_path,
                 engine="odf",
                 sheet_name=None,
-                dtype="unicode",
+                dtype=str,
             )
             sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
             sheet_name = max(sizes, key=sizes.get)
@@ -121,7 +121,7 @@ def parse_excel(
                 file_path,
                 engine="odf",
                 sheet_name=sheet_name,
-                dtype="unicode",
+                dtype=str,
             )
         table, header_row_idx = remove_empty_first_rows(table)
         total_lines = len(table)
@@ -152,7 +152,7 @@ def parse_excel(
         file_path,
         engine=engine,
         sheet_name=sheet_name,
-        dtype="unicode",
+        dtype=str,
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)

{csv_detective-0.9.1.dev1830.dist-info → csv_detective-0.9.1.dev1860.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.1.dev1830
+Version: 0.9.1.dev1860
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.9.1.dev1830.dist-info → csv_detective-0.9.1.dev1860.dist-info}/RECORD RENAMED Viewed

@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
-csv_detective/detection/formats.py,sha256=dzJPdi2rP2jTHZBk9UHpJL3c5N-PSohCymHs-OZt45c,6211
+csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
 csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -144,25 +144,25 @@ csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77l
 csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=fbvQMu12gAmz4TnNCL7pLnMFB-mWN_O-zEoj8jEGj0A,5696
+csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=qZFLOT3YCPoHF0svfVfQBnS8eHtucjDZ7dFITAPgLhc,1626
-csv_detective/parsing/excel.py,sha256=ULUDw76z6hs1Xm2yL9KBM0EOvIsfBLkxwqTZfDEx6aE,7045
+csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
+csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
 csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.1.dev1830.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.1.dev1860.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=VhhQny2Jqy_Z6SplpnN_qAXqBRQCuA42IgSNu37R2cc,12560
-tests/test_file.py,sha256=YuVbSfeo_ASPiLT8CyxXqJENcDpj4wAFXzLwu_GzsOA,8437
+tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
+tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
 venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
 venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.1.dev1830.dist-info/METADATA,sha256=eYNe6QPycRGL5VnIyx_kj0e79azipmi7qu5jh766OD0,9767
-csv_detective-0.9.1.dev1830.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.1.dev1830.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.1.dev1830.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.1.dev1830.dist-info/RECORD,,
+csv_detective-0.9.1.dev1860.dist-info/METADATA,sha256=v8z2NQcMQznhH_35NtggEtjF-H9UGUycexq3Y8dNtp8,9767
+csv_detective-0.9.1.dev1860.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.1.dev1860.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.1.dev1860.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.1.dev1860.dist-info/RECORD,,

tests/test_fields.py CHANGED Viewed

@@ -99,7 +99,7 @@ def test_detetect_categorical_variable():
         "cat2": categorical_col2,
         "not_cat": not_categorical_col,
     }
-    df = pd.DataFrame(df_dict, dtype="unicode")
+    df = pd.DataFrame(df_dict, dtype=str)
     res, _ = detect_categorical_variable(df)
     assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
@@ -114,8 +114,8 @@ def test_detect_continuous_variable():
     df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
     df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
-    df = pd.DataFrame(df_dict, dtype="unicode")
-    df2 = pd.DataFrame(df_dict_2, dtype="unicode")
+    df = pd.DataFrame(df_dict, dtype=str)
+    df2 = pd.DataFrame(df_dict_2, dtype=str)
     res = detect_continuous_variable(df)
     res2 = detect_continuous_variable(df2, continuous_th=0.65)

tests/test_file.py CHANGED Viewed

@@ -276,3 +276,20 @@ def test_cast_json(mocked_responses, cast_json):
     )
     assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
     assert isinstance(df["a_simple_dict"][0], expected_type)
+def test_almost_uniform_column(mocked_responses):
+    col_name = "int_not_bool"
+    expected_content = f"{col_name}\n" + "9\n" + "1\n" * int(1e7)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    analysis = routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )
+    assert analysis["columns"][col_name]["format"] == "int"

{csv_detective-0.9.1.dev1830.dist-info → csv_detective-0.9.1.dev1860.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1830.dist-info → csv_detective-0.9.1.dev1860.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1830.dist-info → csv_detective-0.9.1.dev1860.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.1.dev1830.dist-info → csv_detective-0.9.1.dev1860.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.1.dev1830__py3-none-any.whl → 0.9.1.dev1860__py3-none-any.whl

csv-detective 0.9.1.dev1830__py3-none-any.whl → 0.9.1.dev1860__py3-none-any.whl