csv-detective 0.9.3.dev1901__py3-none-any.whl → 0.9.3.dev1948__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/other/email/__init__.py +2 -2
- csv_detective/detection/formats.py +13 -6
- {csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/METADATA +1 -1
- {csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/RECORD +10 -10
- tests/test_fields.py +1 -1
- tests/test_file.py +18 -0
- {csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import re
|
|
2
2
|
|
|
3
|
-
PROPORTION =
|
|
3
|
+
PROPORTION = 0.9
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def _is(val):
|
|
7
7
|
"""Detects e-mails"""
|
|
8
8
|
return isinstance(val, str) and bool(
|
|
9
|
-
re.match(r"^[a-
|
|
9
|
+
re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
|
|
10
10
|
)
|
|
@@ -30,7 +30,7 @@ def detect_formats(
|
|
|
30
30
|
on_sample = len(table) > MAX_ROWS_ANALYSIS
|
|
31
31
|
if on_sample:
|
|
32
32
|
if verbose:
|
|
33
|
-
logging.warning(f"File is too long, analysing
|
|
33
|
+
logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
|
|
34
34
|
table = build_sample(table)
|
|
35
35
|
|
|
36
36
|
if table.empty:
|
|
@@ -183,13 +183,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
|
|
|
183
183
|
samples = pd.concat(
|
|
184
184
|
[
|
|
185
185
|
# one row with the minimum of the column
|
|
186
|
-
table.loc[table[col] ==
|
|
186
|
+
table.loc[table[col] == val].iloc[[0]]
|
|
187
187
|
for col in table.columns
|
|
188
|
+
if not pd.isna(val := table[col].dropna().min())
|
|
188
189
|
]
|
|
189
190
|
+ [
|
|
190
191
|
# one row with the maximum of the column
|
|
191
|
-
table.loc[table[col] ==
|
|
192
|
+
table.loc[table[col] == val].iloc[[0]]
|
|
192
193
|
for col in table.columns
|
|
194
|
+
if not pd.isna(val := table[col].dropna().max())
|
|
193
195
|
]
|
|
194
196
|
+ [
|
|
195
197
|
# one row with a NaN value if the column has any
|
|
@@ -199,7 +201,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
|
|
|
199
201
|
],
|
|
200
202
|
ignore_index=True,
|
|
201
203
|
)
|
|
202
|
-
return
|
|
203
|
-
|
|
204
|
-
|
|
204
|
+
return (
|
|
205
|
+
pd.concat(
|
|
206
|
+
[samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
|
|
207
|
+
ignore_index=True,
|
|
208
|
+
)
|
|
209
|
+
# this is very unlikely but we never know
|
|
210
|
+
if len(samples) <= MAX_ROWS_ANALYSIS
|
|
211
|
+
else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
|
|
205
212
|
)
|
|
@@ -56,7 +56,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=gPnNTe-L9xjBVE-
|
|
|
56
56
|
csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
|
|
57
57
|
csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
|
|
59
|
-
csv_detective/detect_fields/other/email/__init__.py,sha256=
|
|
59
|
+
csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
|
|
60
60
|
csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
|
|
61
61
|
csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
|
|
62
62
|
csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
|
|
@@ -132,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
|
132
132
|
csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
|
|
133
133
|
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
134
134
|
csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
|
|
135
|
-
csv_detective/detection/formats.py,sha256=
|
|
135
|
+
csv_detective/detection/formats.py,sha256=aP6boV9fz0xH-u_uMAwwo2GKO_jkUBWi8orxRcZQVGE,7734
|
|
136
136
|
csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
|
|
137
137
|
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
138
138
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
|
|
|
150
150
|
csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
|
|
151
151
|
csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
|
|
152
152
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
153
|
-
csv_detective-0.9.3.
|
|
153
|
+
csv_detective-0.9.3.dev1948.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
154
154
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
155
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
156
|
-
tests/test_fields.py,sha256
|
|
157
|
-
tests/test_file.py,sha256=
|
|
156
|
+
tests/test_fields.py,sha256=-6wwuqNmGUIxpNn4u9_OmgqgS95uKWBtahDGy3iw3NI,12566
|
|
157
|
+
tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
|
|
158
158
|
tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
|
|
159
159
|
tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
|
|
160
160
|
tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
|
|
161
161
|
venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
|
|
162
162
|
venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
|
|
163
163
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
164
|
-
csv_detective-0.9.3.
|
|
165
|
-
csv_detective-0.9.3.
|
|
166
|
-
csv_detective-0.9.3.
|
|
167
|
-
csv_detective-0.9.3.
|
|
168
|
-
csv_detective-0.9.3.
|
|
164
|
+
csv_detective-0.9.3.dev1948.dist-info/METADATA,sha256=gl7Ss-DfsY0OU7kn0cdoe4PInQ1WpXed4GRru0np4rU,9767
|
|
165
|
+
csv_detective-0.9.3.dev1948.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
csv_detective-0.9.3.dev1948.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
167
|
+
csv_detective-0.9.3.dev1948.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
168
|
+
csv_detective-0.9.3.dev1948.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
tests/test_file.py
CHANGED
|
@@ -6,6 +6,7 @@ import responses
|
|
|
6
6
|
|
|
7
7
|
from csv_detective import routine
|
|
8
8
|
from csv_detective.output.profile import create_profile
|
|
9
|
+
from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
@pytest.mark.parametrize(
|
|
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
|
|
|
343
344
|
save_results=False,
|
|
344
345
|
)
|
|
345
346
|
assert analysis["columns"][col_name]["format"] == "int"
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def test_full_nan_column(mocked_responses):
|
|
350
|
+
# we want a file that needs sampling
|
|
351
|
+
expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
|
|
352
|
+
mocked_responses.get(
|
|
353
|
+
"http://example.com/test.csv",
|
|
354
|
+
body=expected_content,
|
|
355
|
+
status=200,
|
|
356
|
+
)
|
|
357
|
+
# just testing it doesn't fail
|
|
358
|
+
routine(
|
|
359
|
+
file_path="http://example.com/test.csv",
|
|
360
|
+
num_rows=-1,
|
|
361
|
+
output_profile=False,
|
|
362
|
+
save_results=False,
|
|
363
|
+
)
|
|
File without changes
|
{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev1901.dist-info → csv_detective-0.9.3.dev1948.dist-info}/top_level.txt
RENAMED
|
File without changes
|