PyPI - csv-detective - Versions diffs - 0.9.3.dev1901__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl - Mend

csv-detective 0.9.3.dev1901__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

csv_detective/detect_fields/other/email/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import re
-PROPORTION = 1
+PROPORTION = 0.9
 def _is(val):
     """Detects e-mails"""
     return isinstance(val, str) and bool(
-        re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
+        re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
     )

csv_detective/detection/formats.py CHANGED Viewed

@@ -30,7 +30,7 @@ def detect_formats(
     on_sample = len(table) > MAX_ROWS_ANALYSIS
     if on_sample:
         if verbose:
-            logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
+            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
         table = build_sample(table)
     if table.empty:
@@ -183,13 +183,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
     samples = pd.concat(
         [
             # one row with the minimum of the column
-            table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().min())
         ]
         + [
             # one row with the maximum of the column
-            table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
+            table.loc[table[col] == val].iloc[[0]]
             for col in table.columns
+            if not pd.isna(val := table[col].dropna().max())
         ]
         + [
             # one row with a NaN value if the column has any
@@ -199,7 +201,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
         ],
         ignore_index=True,
     )
-    return pd.concat(
-        [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
-        ignore_index=True,
+    return (
+        pd.concat(
+            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+            ignore_index=True,
+        )
+        # this is very unlikely but we never know
+        if len(samples) <= MAX_ROWS_ANALYSIS
+        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
     )

{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev1901
+Version: 0.9.3.dev1948
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT

{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/RECORD RENAMED Viewed

@@ -56,7 +56,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=gPnNTe-L9xjBVE-
 csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
-csv_detective/detect_fields/other/email/__init__.py,sha256=p235wILf0fR9TeSEuyuPgoysAv9zg23a4vzdy3YJlxE,192
+csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
 csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
-csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
+csv_detective/detection/formats.py,sha256=aP6boV9fz0xH-u_uMAwwo2GKO_jkUBWi8orxRcZQVGE,7734
 csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
 csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
 csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.9.3.dev1901.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.9.3.dev1948.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
-tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
+tests/test_fields.py,sha256=-6wwuqNmGUIxpNn4u9_OmgqgS95uKWBtahDGy3iw3NI,12566
+tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
 venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
 venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.9.3.dev1901.dist-info/METADATA,sha256=zlYwJcrxQIjXmPEUaQuAIsIyl2hQsa_ORGAwO5SKfAw,9767
-csv_detective-0.9.3.dev1901.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-csv_detective-0.9.3.dev1901.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.9.3.dev1901.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
-csv_detective-0.9.3.dev1901.dist-info/RECORD,,
+csv_detective-0.9.3.dev1948.dist-info/METADATA,sha256=gl7Ss-DfsY0OU7kn0cdoe4PInQ1WpXed4GRru0np4rU,9767
+csv_detective-0.9.3.dev1948.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.3.dev1948.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.3.dev1948.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.3.dev1948.dist-info/RECORD,,

tests/test_fields.py CHANGED Viewed

@@ -284,7 +284,7 @@ fields = {
         False: ["nein", "ja", "2", "-0"],
     },
     email: {
-        True: ["cdo_intern@data.gouv.fr"],
+        True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
         False: ["cdo@@gouv.sfd"],
     },
     json: {

tests/test_file.py CHANGED Viewed

@@ -6,6 +6,7 @@ import responses
 from csv_detective import routine
 from csv_detective.output.profile import create_profile
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
 @pytest.mark.parametrize(
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
         save_results=False,
     )
     assert analysis["columns"][col_name]["format"] == "int"
+def test_full_nan_column(mocked_responses):
+    # we want a file that needs sampling
+    expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    # just testing it doesn't fail
+    routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )

{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/WHEEL RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/top_level.txt RENAMED Viewed

File without changes

csv-detective 0.9.3.dev1901__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl

csv-detective 0.9.3.dev1901__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl