PyPI - csv-detective - Versions diffs - 0.10.3.dev7__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl - Mend

csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

csv_detective/detection/formats.py CHANGED Viewed

@@ -11,6 +11,7 @@ from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
+    handle_empty_columns,
     test_col,
     test_col_chunks,
     test_label,
@@ -49,6 +50,7 @@ def detect_formats(
             skipna=skipna,
             verbose=verbose,
         )
+        handle_empty_columns(scores_table_fields)
         res_categorical, _ = detect_categorical_variable(
             table,
             max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,

csv_detective/explore_csv.py CHANGED Viewed

@@ -36,7 +36,7 @@ def routine(
         file_path: local path or URL to file
         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
         tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
-        limited_output: whether or not to return all possible types or only the most likely one for each column
+        limited_output: whether or not to return all possible formats or only the most likely one for each column
         save_results: whether or not to save the results in a json file, or the path where to dump the output
         output_profile: whether or not to add the 'profile' field to the output
         output_schema: whether or not to add the 'schema' field to the output (tableschema)
@@ -47,7 +47,7 @@ def routine(
         skipna: whether to keep NaN (empty cells) for tests
     Returns:
-        dict: a dict with information about the csv and possible types for each column
+        dict: a dict with information about the csv and possible formats for each column
     """
     if not (

csv_detective/format.py CHANGED Viewed

@@ -25,6 +25,8 @@ class Format:
             labels: the dict of hint headers and their credibility for the header score (NB: credibility is relative within a single format, should be used to rank the valid labels)
             proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
             tags: to allow users to submit a file to only a subset of formats
+            mandatory_label: whether the format can only be considered if the column passes both field and label tests
+            python_type: the python type related to the format (less specific, used for downstream casting)
         """
         self.name: str = name
         self.func: Callable[[Any], bool] = func

csv_detective/parsing/columns.py CHANGED Viewed

@@ -13,6 +13,13 @@ from csv_detective.utils import display_logs_depending_process_time
 MAX_NUMBER_CATEGORICAL_VALUES = 25
+def handle_empty_columns(return_table: pd.DataFrame):
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
 def test_col_val(
     serie: pd.Series,
     format: Format,
@@ -87,7 +94,7 @@ def test_col(
             )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
+                f'\t> Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
                 time() - start_type,
             )
     if verbose:
@@ -102,7 +109,7 @@ def test_label(
 ):
     if verbose:
         start = time()
-        logging.info("Testing labels to get types")
+        logging.info("Testing labels to get formats")
     return_table = pd.DataFrame(columns=columns)
     for idx, (label, format) in enumerate(formats.items()):
@@ -111,7 +118,7 @@ def test_label(
         return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
+                f'\t- Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
                 time() - start_type,
             )
     if verbose:
@@ -222,10 +229,7 @@ def test_col_chunks(
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
-    # handling that empty columns score 1 everywhere
-    for col in return_table.columns:
-        if sum(return_table[col]) == len(return_table):
-            return_table[col] = 0
+    handle_empty_columns(return_table)
     if verbose:
         display_logs_depending_process_time(
             f"Done testing chunks in {round(time() - start, 3)}s", time() - start

csv_detective/parsing/csv.py CHANGED Viewed

@@ -39,7 +39,7 @@ def parse_csv(
         # branch between small and big files starts here
         if total_lines == CHUNK_SIZE:
             if verbose:
-                logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
+                logging.warning(f"File is too long, loading in chunks of {CHUNK_SIZE} rows")
             total_lines, nb_duplicates = None, None
         else:
             nb_duplicates = table.duplicated().sum()

{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.3.dev7
+Version: 0.10.4.dev2
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,14 @@
 csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
-csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
+csv_detective/explore_csv.py,sha256=YjkQihSm1vgZbEfXHxJ-_bVJrtCUT3Ut8x8FX60ZK3k,7643
+csv_detective/format.py,sha256=vYz4h-WDUZ3pZIfxUc1toXRbUiNnWqHHiUXBbZtnaUw,3140
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
 csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
-csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
+csv_detective/detection/formats.py,sha256=cgECpxRaygwnedPhOteG1P_697qCoceeDrKK9G_O-u8,4812
 csv_detective/detection/headers.py,sha256=lnbWRxkI6rdyoWGtmxSfsPkqNjS0Nlpgw-pVevtmBP0,899
 csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -80,17 +80,17 @@ csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XN
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=H_dKHhSgQMIiOfxibnGl6HwTW9bRwGtIeUcYBN13-3A,9245
+csv_detective/parsing/columns.py,sha256=ExabNAzQB-IvQfjmVLClBfj1GMkbAQxUUnournFpdy8,9350
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
+csv_detective/parsing/csv.py,sha256=vfAHkpgzLkzeUXWVqrWlttZ1y-Hql0GNFSZZrA366mk,1714
 csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
 csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.dev7.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.10.4.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
-tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
+tests/test_file.py,sha256=Ov9NGvZQxeoehxTpfcsnwEybebM0tnbmcRsFwe46cjg,15277
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
 tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.dev7.dist-info/METADATA,sha256=Kj-Nd01oXyqZpNwaVNJ9CwRcYYX80YCKtIhsijoO89A,10925
-csv_detective-0.10.3.dev7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-csv_detective-0.10.3.dev7.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.10.3.dev7.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
-csv_detective-0.10.3.dev7.dist-info/RECORD,,
+csv_detective-0.10.4.dev2.dist-info/METADATA,sha256=52ZirEUs9m49EuivGtEmtr_p2h_wVsSRbyQTs0SfoWE,10925
+csv_detective-0.10.4.dev2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.4.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.4.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.4.dev2.dist-info/RECORD,,

tests/test_file.py CHANGED Viewed

@@ -388,10 +388,11 @@ def test_almost_uniform_column(mocked_responses):
     assert analysis["columns"][col_name]["format"] == "int"
-def test_full_nan_column(mocked_responses):
+@pytest.mark.parametrize("nb_rows", (CHUNK_SIZE // 10, CHUNK_SIZE + 1))
+def test_full_nan_column(mocked_responses, nb_rows):
     # we want a file that needs sampling
     col_name = "only_nan"
-    expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
+    expected_content = f"{col_name},second_col\n" + ",1\n" * nb_rows
     mocked_responses.get(
         "http://example.com/test.csv",
         body=expected_content,

{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl

csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl