PyPI - csv-detective - Versions diffs - 0.7.5.dev1171__py3-none-any.whl → 0.7.5.dev1197__py3-none-any.whl - Mend

csv-detective 0.7.5.dev1171__py3-none-any.whl → 0.7.5.dev1197__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

csv_detective/detect_fields/temp/date/__init__.py CHANGED Viewed

@@ -14,6 +14,11 @@ def date_casting(val: str) -> Optional[datetime]:
         return dateutil_parser(val)
     except ParserError:
         return date_parser(val)
+    except OverflowError:
+        return None
+threshold = 0.3
 def _is(val):
@@ -21,7 +26,6 @@ def _is(val):
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
-    threshold = 0.3
     if sum([char.isdigit() for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)

csv_detective/detection.py CHANGED Viewed

@@ -1,10 +1,11 @@
-from typing import TextIO, Optional, Union
+from typing import TextIO, Optional
 from collections import defaultdict
 import pandas as pd
 import math
 import csv
 from cchardet import detect
 from ast import literal_eval
+import gzip
 import logging
 from time import time
 import openpyxl
@@ -21,10 +22,13 @@ NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
 OLD_EXCEL_EXT = [".xls"]
 OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
 XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
+EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
+COMPRESSION_ENGINES = ["gzip"]
 engine_to_file = {
     "openpyxl": "Excel",
     "xlrd": "old Excel",
-    "odf": "OpenOffice"
+    "odf": "OpenOffice",
+    "gzip": "csv.gz",
 }
@@ -128,6 +132,8 @@ def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         start = time()
     mapping = {
+        "application/gzip": "gzip",
+        "application/x-gzip": "gzip",
         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
         'application/vnd.ms-excel': 'xlrd',
         'application/vnd.oasis.opendocument.spreadsheet': 'odf',
@@ -141,8 +147,12 @@ def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
     else:
         engine = mapping.get(magic.from_file(csv_file_path, mime=True))
     if verbose:
+        message = (
+            f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
+            if engine else "Processing the file as a csv"
+        )
         display_logs_depending_process_time(
-            f'File has no extension, detected {engine_to_file.get(engine, "csv")}',
+            message,
             time() - start,
         )
     return engine
@@ -174,7 +184,9 @@ def detect_separator(file: TextIO, verbose: bool = False) -> str:
             break
         rows_lengths.add(len(row))
     if len(rows_lengths) > 1:
-        raise ValueError('Number of columns is not even across the first 10 rows.')
+        raise ValueError(
+            f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+        )
     if verbose:
         display_logs_depending_process_time(
@@ -184,19 +196,22 @@ def detect_separator(file: TextIO, verbose: bool = False) -> str:
     return sep
-def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
+def unzip(binary_file: BytesIO, engine: str) -> BytesIO:
+    if engine == "gzip":
+        with gzip.open(binary_file, mode="rb") as binary_file:
+            file_content = binary_file.read()
+    else:
+        raise NotImplementedError(f"{engine} is not yet supported")
+    return BytesIO(file_content)
+def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
     """
     Detects file encoding using faust-cchardet (forked from the original cchardet)
     """
     if verbose:
         start = time()
         logging.info("Detecting encoding")
-    if is_url(csv_file_path):
-        r = requests.get(csv_file_path)
-        r.raise_for_status()
-        binary_file = BytesIO(r.content)
-    else:
-        binary_file = open(csv_file_path, mode="rb")
     encoding_dict = detect(binary_file.read())
     if not encoding_dict["encoding"]:
         raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")

csv_detective/explore_csv.py CHANGED Viewed

@@ -12,7 +12,7 @@ import tempfile
 import logging
 from time import time
 import requests
-from io import StringIO
+from io import BytesIO, StringIO
 import pandas as pd
 # flake8: noqa
@@ -39,7 +39,10 @@ from .detection import (
     detetect_categorical_variable,
     # detect_continuous_variable,
     is_url,
+    unzip,
     XLS_LIKE_EXT,
+    EXCEL_ENGINES,
+    COMPRESSION_ENGINES,
 )
@@ -81,16 +84,14 @@ def return_all_tests(
     if isinstance(user_input_tests, str):
         user_input_tests = [user_input_tests]
-    if "ALL" in user_input_tests:
+    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
         tests_to_do = [detect_type]
     else:
-        # can't require to only skip tests
-        assert not all(x[0] == "-" for x in user_input_tests)
         tests_to_do = [
-            detect_type + "." + x for x in user_input_tests if x[0] != "-"
+            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
         ]
     tests_skipped = [
-        detect_type + "." + x[1:] for x in user_input_tests if x[0] == "-"
+        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
     ]
     all_tests = [
         # this is why we need to import detect_fields/labels
@@ -155,12 +156,12 @@ def routine(
     file_name = csv_file_path.split('/')[-1]
     engine = None
-    if '.' not in file_name:
+    if '.' not in file_name or not file_name.endswith("csv"):
         # file has no extension, we'll investigate how to read it
         engine = detect_engine(csv_file_path, verbose=verbose)
     is_xls_like = False
-    if engine or any([csv_file_path.endswith(k) for k in XLS_LIKE_EXT]):
+    if engine in EXCEL_ENGINES or any([csv_file_path.endswith(k) for k in XLS_LIKE_EXT]):
         is_xls_like = True
         encoding, sep, heading_columns, trailing_columns = None, None, None, None
         table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
@@ -172,12 +173,23 @@ def routine(
         )
         header = table.columns.to_list()
     else:
-        if encoding is None:
-            encoding = detect_encoding(csv_file_path, verbose=verbose)
+        # fetching or reading file as binary
         if is_url(csv_file_path):
             r = requests.get(csv_file_path, allow_redirects=True)
             r.raise_for_status()
-            str_file = StringIO(r.content.decode(encoding=encoding))
+            binary_file = BytesIO(r.content)
+        else:
+            binary_file = open(csv_file_path, "rb")
+        # handling compression
+        if engine in COMPRESSION_ENGINES:
+            binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
+        # detecting encoding if not specified
+        if encoding is None:
+            encoding: str = detect_encoding(binary_file, verbose=verbose)
+            binary_file.seek(0)
+        # decoding and reading file
+        if is_url(csv_file_path) or engine in COMPRESSION_ENGINES:
+            str_file = StringIO(binary_file.read().decode(encoding=encoding))
         else:
             str_file = open(csv_file_path, "r", encoding=encoding)
         if sep is None:
@@ -257,17 +269,19 @@ def routine(
     )
     # To reduce false positives: ensure these formats are detected only if the label yields
-    # a detection.
+    # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        "code_departement",
-        "code_commune_insee",
-        "code_postal",
-        "latitude_wgs",
-        "longitude_wgs",
-        "latitude_wgs_fr_metropole",
-        "longitude_wgs_fr_metropole",
-        "latitude_l93",
-        "longitude_l93",
+        f for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+        ] if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],

{csv_detective-0.7.5.dev1171.data → csv_detective-0.7.5.dev1197.data}/data/share/csv_detective/CHANGELOG.md RENAMED Viewed

@@ -7,7 +7,10 @@
 - Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
 - The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
 - Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
+- Allow to only specify tests to skip ("all but...") [#108](https://github.com/datagouv/csv-detective/pull/108)
 - Fix bool casting [#109](https://github.com/datagouv/csv-detective/pull/109)
+- Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
+- Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
 ## 0.7.4 (2024-11-15)

{csv_detective-0.7.5.dev1171.dist-info → csv_detective-0.7.5.dev1197.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: csv_detective
-Version: 0.7.5.dev1171
+Version: 0.7.5.dev1197
 Summary: Detect CSV column content
 Home-page: https://github.com/etalab/csv_detective
 Author: Etalab
@@ -37,5 +37,6 @@ Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: keywords
 Dynamic: license
+Dynamic: license-file
 Dynamic: requires-dist
 Dynamic: summary

{csv_detective-0.7.5.dev1171.dist-info → csv_detective-0.7.5.dev1197.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
 csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
 csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
-csv_detective/detection.py,sha256=zrP8qvLDvhVXTHi7Ty8G_ga4zfZPjBhuyApqFQkPq2Y,22373
-csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
+csv_detective/detection.py,sha256=dqjAKR-h7QC2pbl7FEUleS15bvGHBiTleu9CtVKp_Vo,22806
+csv_detective/explore_csv.py,sha256=HM4RlNV2eWfP9wTDvhrow-_yDMbGuE3JDvFCfmMNWyY,18087
 csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
 csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
@@ -65,7 +65,7 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
 csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
+csv_detective/detect_fields/temp/date/__init__.py,sha256=1a_Ra9fmT4wgGMrcknXP7eN7A2QiaMF0Yjy0-BMihtA,987
 csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
 csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
@@ -127,18 +127,18 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6
 csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
 csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
-csv_detective-0.7.5.dev1171.data/data/share/csv_detective/CHANGELOG.md,sha256=MU0DrzId6qDxIPeAp9nAazYlEYwh1A8mlqnvkyFK55c,7254
-csv_detective-0.7.5.dev1171.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
-csv_detective-0.7.5.dev1171.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1197.data/data/share/csv_detective/CHANGELOG.md,sha256=YlXiPqHlJv23g6HfqEXzic6y14IfPWUoz5ADOis0YeY,7528
+csv_detective-0.7.5.dev1197.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1197.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1197.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
 tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
-tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
+tests/test_file.py,sha256=w-nKXnm8A5l5_MAtA6E99ouTefOkU38B6jMJVyBHr50,7858
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
 tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
-csv_detective-0.7.5.dev1171.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
-csv_detective-0.7.5.dev1171.dist-info/METADATA,sha256=MhOtq7Bv7pJMRUoavz8f0VcKXpY1z-n5NLSbAJkyRLg,1364
-csv_detective-0.7.5.dev1171.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-csv_detective-0.7.5.dev1171.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.7.5.dev1171.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
-csv_detective-0.7.5.dev1171.dist-info/RECORD,,
+csv_detective-0.7.5.dev1197.dist-info/METADATA,sha256=2B1bE17lCw02QHnXyk_2Rt9M-fcN8J_RlUlkKzNL4tM,1386
+csv_detective-0.7.5.dev1197.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+csv_detective-0.7.5.dev1197.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1197.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1197.dist-info/RECORD,,

{csv_detective-0.7.5.dev1171.dist-info → csv_detective-0.7.5.dev1197.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.2)
+Generator: setuptools (78.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

tests/test_file.py CHANGED Viewed

@@ -69,7 +69,7 @@ def test_profile_output_on_file():
     assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
-def test_exception():
+def test_profile_with_num_rows():
     with pytest.raises(ValueError):
         routine(
             csv_file_path="tests/a_test_file.csv",
@@ -131,52 +131,37 @@ def test_schema_on_file():
     assert is_column_reg
-def test_non_csv_files():
-    _ = routine(
-        csv_file_path="tests/file.ods",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
-    assert _['engine'] == 'odf'
+params_csv = [
+    ("csv_file", {"engine": None, "sheet_name": None}),
+    ("file.csv.gz", {"engine": None, "sheet_name": None, "separator": ",", "columns.len": 3}),
+]
+params_others = [
+    ("file.ods", {"engine": "odf"}),
     # this is a "tricked" xls file that is actually read as odf
-    _ = routine(
-        csv_file_path="tests/file.xls",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
-    assert _['engine'] == 'odf'
+    ("file.xls", {"engine": "odf"}),
+    # this file has an empty first row; check if the sheet we consider is the largest
+    ("file.xlsx", {"engine": "openpyxl", "header_row_idx": 1, "sheet_name": "REI_1987"}),
+    ("xlsx_file", {"engine": "openpyxl"}),
+]
-    _ = routine(
-        csv_file_path="tests/file.xlsx",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
-    assert _['engine'] == 'openpyxl'
-    # this file has an empty first row
-    assert _['header_row_idx'] == 1
-    # check if the sheet we consider is the largest
-    assert _['sheet_name'] == 'REI_1987'
+@pytest.mark.parametrize("params", params_csv + params_others)
+def test_non_csv_files(params):
+    file_name, checks = params
     _ = routine(
-        csv_file_path="tests/csv_file",
+        csv_file_path=f"tests/{file_name}",
         num_rows=-1,
         output_profile=False,
         save_results=False,
     )
-    assert not _.get('engine')
-    assert not _.get('sheet_name')
-    _ = routine(
-        csv_file_path="tests/xlsx_file",
-        num_rows=-1,
-        output_profile=False,
-        save_results=False,
-    )
-    assert _['engine'] == 'openpyxl'
+    for k, v in checks.items():
+        if v is None:
+            assert not _.get(k)
+        elif "." in k:
+            key, func = k.split(".")
+            assert eval(func)(_[key]) == v
+        else:
+            assert _[k] == v
 @pytest.fixture
@@ -185,21 +170,34 @@ def mocked_responses():
         yield rsps
-def test_urls(mocked_responses):
-    url = 'http://example.com/test.csv'
-    expected_content = 'id,name,first_name\n1,John,Smith\n2,Jane,Doe\n3,Bob,Johnson'
+@pytest.mark.parametrize(
+    "params",
+    # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
+    # which doesn't support the way we mock the response, TBC
+    params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 414})]
+)
+def test_urls(mocked_responses, params):
+    file_name, checks = params
+    url = f"http://example.com/{file_name}"
     mocked_responses.get(
         url,
-        body=expected_content,
+        body=open(f"tests/{file_name}", "rb").read(),
         status=200,
     )
-    output = routine(
+    _ = routine(
         csv_file_path=url,
         num_rows=-1,
         output_profile=False,
         save_results=False,
     )
-    assert output['header'] == ["id", "name", "first_name"]
+    for k, v in checks.items():
+        if v is None:
+            assert not _.get(k)
+        elif "." in k:
+            key, func = k.split(".")
+            assert eval(func)(_[key]) == v
+        else:
+            assert _[k] == v
 @pytest.mark.parametrize(

{csv_detective-0.7.5.dev1171.data → csv_detective-0.7.5.dev1197.data}/data/share/csv_detective/LICENSE.AGPL.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1171.data → csv_detective-0.7.5.dev1197.data}/data/share/csv_detective/README.md RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1171.dist-info → csv_detective-0.7.5.dev1197.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1171.dist-info → csv_detective-0.7.5.dev1197.dist-info/licenses}/LICENSE.AGPL.txt RENAMED Viewed

File without changes

{csv_detective-0.7.5.dev1171.dist-info → csv_detective-0.7.5.dev1197.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.7.5.dev1171__py3-none-any.whl → 0.7.5.dev1197__py3-none-any.whl

csv-detective 0.7.5.dev1171__py3-none-any.whl → 0.7.5.dev1197__py3-none-any.whl