PyPI - csv-detective - Versions diffs - 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2258__py3-none-any.whl - Mend

csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2258__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

csv_detective/detection/headers.py CHANGED Viewed

@@ -12,19 +12,17 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
         logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
-        header = file.readline()
+        row = file.readline()
         position = file.tell()
-        chaine = [c for c in header.replace("\n", "").split(sep) if c]
-        if chaine[-1] not in ["", "\n"] and all([mot not in ["", "\n"] for mot in chaine[1:-1]]):
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
             next_row = file.readline()
             file.seek(position)
-            if header != next_row:
+            if row != next_row:
                 if verbose:
                     display_logs_depending_process_time(
                         f"Detected headers in {round(time() - start, 3)}s",
                         time() - start,
                     )
-                return i, chaine
-    if verbose:
-        logging.info("No header detected")
-    return 0, None
+                return i, headers
+    raise ValueError("Could not retrieve headers")

csv_detective/output/profile.py CHANGED Viewed

@@ -30,6 +30,10 @@ def create_profile(
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
             for k, v in columns.items()
         }
+    # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
+    _count_col = "count"
+    while _count_col in table.columns:
+        _count_col = "_" + _count_col
     profile = defaultdict(dict)
     for c in table.columns:
         # for numerical formats we want min, max, mean, std
@@ -79,14 +83,14 @@ def create_profile(
         # for all formats we want most frequent values, nb unique values and nb missing values
         tops_bruts = (
             (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
-            .reset_index()
+            .reset_index(name=_count_col)
             .iloc[:10]
             .to_dict(orient="records")
         )
         profile[c].update(
             tops=[
                 {
-                    "count": tb["count"],
+                    "count": tb[_count_col],
                     "value": tb[c],
                 }
                 for tb in tops_bruts

csv_detective/parsing/load.py CHANGED Viewed

@@ -47,6 +47,8 @@ def load_file(
         if table.empty:
             raise ValueError("Table seems to be empty")
         header = table.columns.to_list()
+        if any(col.startswith("Unnamed") for col in header):
+            raise ValueError("Could not retrieve headers")
         analysis = {
             "engine": engine,
             "sheet_name": sheet_name,
@@ -99,12 +101,10 @@ def load_file(
         }
         if engine is not None:
             analysis["compression"] = engine
-    analysis.update(
-        {
-            "header_row_idx": header_row_idx,
-            "header": header,
-        }
-    )
+    analysis |= {
+        "header_row_idx": header_row_idx,
+        "header": header,
+    }
     if total_lines is not None:
         analysis["total_lines"] = total_lines
     if nb_duplicates is not None:

{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev2232
+Version: 0.9.3.dev2258
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/RECORD RENAMED Viewed

@@ -132,14 +132,14 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
-csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
+csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
 csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
-csv_detective/output/profile.py,sha256=oWIuHchiZ72VzGLB9q3mW-hhWq1VxiU1Z09VWjAU-JM,4696
+csv_detective/output/profile.py,sha256=ZGKMSeVfmQerAfVhViWXVU9j4jbCrv5K484SQNep7Xw,4920
 csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -147,20 +147,20 @@ csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
 csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
-csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
+csv_detective/parsing/load.py,sha256=EHVWQqV9TmWOiVNLCyHr9V8x4PI_53O0iTVluzIqw78,4256
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.3.dev2232.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.3.dev2258.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
-tests/test_file.py,sha256=bYP-NzPoGEXPwNZLD1EjJlviT9a_27IY6cb0shdiR4U,12329
+tests/test_file.py,sha256=EKFW08W96VA5nVwNPvN1v7zXDL0qEEuGWnUqfJJdMh4,13130
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
 tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
 venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.3.dev2232.dist-info/METADATA,sha256=q8o2SRFri-iFmUgOp3tL5jGlIsuXB-TDyUj7BOaCPhg,10845
-csv_detective-0.9.3.dev2232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.3.dev2232.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.3.dev2232.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.3.dev2232.dist-info/RECORD,,
+csv_detective-0.9.3.dev2258.dist-info/METADATA,sha256=gGV63RWaQSVUSaJl24mR9Ynk2BOV1tK6LWyNsq-AYkA,10845
+csv_detective-0.9.3.dev2258.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev2258.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev2258.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev2258.dist-info/RECORD,,

tests/test_file.py CHANGED Viewed

@@ -370,22 +370,44 @@ def test_almost_uniform_column(mocked_responses):
 def test_full_nan_column(mocked_responses):
     # we want a file that needs sampling
-    expected_content = "only_nan,second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
+    col_name = "only_nan"
+    expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
     mocked_responses.get(
         "http://example.com/test.csv",
         body=expected_content,
         status=200,
     )
     with patch("urllib.request.urlopen") as mock_urlopen:
-        # Create a mock HTTP response object
         mock_response = MagicMock()
         mock_response.read.return_value = expected_content.encode("utf-8")
         mock_response.__enter__.return_value = mock_response
         mock_urlopen.return_value = mock_response
-        # just testing it doesn't fail
-        routine(
+        # only NaNs should return "string"
+        analysis = routine(
             file_path="http://example.com/test.csv",
             num_rows=-1,
             output_profile=False,
             save_results=False,
         )
+        assert analysis["columns"][col_name]["format"] == "string"
+def test_count_column(mocked_responses):
+    expected_content = "count,_count\n" + "a,1\n" * 100
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        # only testing it doesn't fail with output_profile=True
+        routine(
+            file_path="http://example.com/test.csv",
+            num_rows=-1,
+            output_profile=True,
+            save_results=False,
+        )

{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2258__py3-none-any.whl

csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2258__py3-none-any.whl