csv-detective 0.10.3.dev2__py3-none-any.whl → 0.10.3.dev4__py3-none-any.whl
This diff shows the content of publicly released package versions and reflects the changes between them as they appear in their respective public registries. It is provided for informational purposes only.
- csv_detective/detection/headers.py +11 -13
- csv_detective/detection/rows.py +1 -1
- csv_detective/output/dataframe.py +2 -2
- csv_detective/parsing/load.py +5 -5
- {csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/METADATA +1 -1
- {csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/RECORD +12 -12
- {csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/WHEEL +1 -1
- tests/test_fields.py +7 -1
- tests/test_file.py +26 -6
- {csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/top_level.txt +0 -0

csv_detective/detection/headers.py
CHANGED

@@ -5,24 +5,22 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time


-def
+def detect_header_position(file: TextIO, verbose: bool = False) -> int:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting
+        logging.info("Detecting header position")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-
-
-
-
-
-
-
-
-
-            )
-        return i, headers
+        next_row = file.readline()
+        file.seek(position)
+        if row != next_row:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected header position in {round(time() - start, 3)}s",
+                    time() - start,
+                )
+            return i
     raise ValueError("Could not retrieve headers")
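For orientation, the new detect_header_position scans the first ten rows and returns the index of the first row that differs from the row immediately after it; if all ten candidates repeat, it raises ValueError. A minimal sketch of that behaviour on in-memory files (the sample data and assertions are illustrative assumptions, not taken from the package's tests):

    import io

    from csv_detective.detection.headers import detect_header_position

    # The first row is already the header: it differs from the first data row,
    # so the detected position is 0.
    buf = io.StringIO("col_a;col_b\n1;2\n3;4\n")
    assert detect_header_position(buf) == 0

    # More than ten identical leading rows: no candidate header stands out,
    # so the function raises ValueError("Could not retrieve headers").
    repeated = io.StringIO(";;\n" * 12)
    try:
        detect_header_position(repeated)
    except ValueError:
        pass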
csv_detective/detection/rows.py
CHANGED

@@ -2,7 +2,7 @@ import pandas as pd


 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to
+    """Analog process to detect_header_position for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):

csv_detective/output/dataframe.py
CHANGED

@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time


 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or
-    #
+    if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
+        # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
         return None
     match _type:
         case "string":
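The guard added to cast leans on pandas' default set of NA-like strings. A small illustrative check, assuming cast is importable from the module shown above; the expected results mirror the new parameters in tests/test_fields.py:

    import pandas as pd

    from csv_detective.output.dataframe import cast

    # pandas' default NA markers include "N/A", "nan" and the empty string.
    assert "N/A" in pd._libs.parsers.STR_NA_VALUES

    # cast() now short-circuits on these values instead of attempting a
    # conversion such as int("N/A"), which would raise.
    assert cast("N/A", "int") is None
    assert cast("", "date") is None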
csv_detective/parsing/load.py
CHANGED

@@ -11,7 +11,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import
+from csv_detective.detection.headers import detect_header_position
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv

@@ -83,9 +83,7 @@ def load_file(
     str_file = open(file_path, "r", encoding=encoding)
     if sep is None:
         sep = detect_separator(str_file, verbose=verbose)
-    header_row_idx
-    if header is None or (isinstance(header, list) and any([h is None for h in header])):
-        raise ValueError("Could not retrieve headers")
+    header_row_idx = detect_header_position(str_file, verbose=verbose)
     heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
     trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
     table, total_lines, nb_duplicates = parse_csv(

@@ -102,9 +100,11 @@ def load_file(
     }
     if engine is not None:
         analysis["compression"] = engine
+    if any(col.startswith("Unnamed:") for col in table.columns):
+        raise ValueError("Columns are not properly set")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header":
+        "header": list(table.columns),
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines
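The new ValueError is driven by pandas' "Unnamed: N" placeholder names, which appear whenever a header cell is empty, for instance when the header row ends with a trailing comma. A quick illustration of the condition, reusing the same malformed content as the new test further down (illustrative only, not part of the package):

    import io

    import pandas as pd

    # The trailing comma leaves an empty header cell, which pandas names
    # "Unnamed: 2", so load_file now rejects the resulting table.
    table = pd.read_csv(io.StringIO("col1,col2,\n1,2,\n3,4,"))
    assert any(col.startswith("Unnamed:") for col in table.columns)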

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/RECORD
RENAMED

@@ -9,8 +9,8 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
-csv_detective/detection/headers.py,sha256=
-csv_detective/detection/rows.py,sha256=
+csv_detective/detection/headers.py,sha256=R-Cuo67laOrj6l9Jq_SCFLkRMj3CEzb2HMo_Sa50g9A,879
+csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318

@@ -74,7 +74,7 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
 csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
 csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
-csv_detective/output/dataframe.py,sha256=
+csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
 csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443

@@ -84,13 +84,13 @@ csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcO
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
 csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
-csv_detective/parsing/load.py,sha256=
+csv_detective/parsing/load.py,sha256=uWX4r_2K9bf-9qKL6IaiGNkiSPnSSNqng420zdrFkDg,4371
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.
+csv_detective-0.10.3.dev4.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
+tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
 tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463

@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.
-csv_detective-0.10.3.
-csv_detective-0.10.3.
-csv_detective-0.10.3.
-csv_detective-0.10.3.
+csv_detective-0.10.3.dev4.dist-info/METADATA,sha256=FRE6DkDtXSuva4aVTV1ws8tj53w03fjesToRlgh_78s,11082
+csv_detective-0.10.3.dev4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.3.dev4.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev4.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev4.dist-info/RECORD,,
tests/test_fields.py
CHANGED

@@ -104,11 +104,17 @@ def test_fields_with_values(args):
         ("2022-08-01", "date", _date),
         ("2024-09-23 17:32:07", "datetime", _datetime),
         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
+        ("N/A", "int", None),
+        ("nan", "bool", None),
+        ("", "date", None),  # all NaN-like values should be cast as None for all type
     ),
 )
 def test_cast(args):
     value, detected_type, cast_type = args
-
+    if cast_type is None:
+        assert cast(value, detected_type) is None
+    else:
+        assert isinstance(cast(value, detected_type), cast_type)


 @pytest.mark.parametrize(
tests/test_file.py
CHANGED

@@ -9,6 +9,12 @@ from csv_detective.output.profile import create_profile
 from csv_detective.parsing.csv import CHUNK_SIZE


+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps
+
+
 @pytest.mark.parametrize(
     "chunk_size",
     (100, 404, int(1e5)),

@@ -165,6 +171,26 @@ def test_exception_different_number_of_columns():
     )


+def test_exception_malformed_columns(mocked_responses):
+    """
+    A ValueError should be raised if any column is Unnamed
+    """
+    url = f"http://example.com/bad_cols.csv"
+    expected_content = b"col1,col2,\n1,2,\n3,4,"
+    mocked_responses.get(
+        url,
+        body=expected_content,
+        status=200,
+    )
+    with patch("urllib.request.urlopen") as mock_urlopen:
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        with pytest.raises(ValueError):
+            routine(file_path=url)
+
+
 def test_code_dep_reg_on_file():
     output = routine(
         file_path="tests/data/b_test_file.csv",

@@ -237,12 +263,6 @@ def test_non_csv_files(params):
         assert _[k] == v


-@pytest.fixture
-def mocked_responses():
-    with responses.RequestsMock() as rsps:
-        yield rsps
-
-
 @pytest.mark.parametrize(
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
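For context on the mocked_responses fixture that moved to the top of the file: responses.RequestsMock only intercepts HTTP calls made through the requests library, which is presumably why the new test also patches urllib.request.urlopen separately. A standalone sketch of the fixture's pattern (the URL and payload are assumptions mirroring the test above, not additional test code):

    import requests
    import responses

    # Inside a RequestsMock context, requests.get() never reaches the network:
    # the registered mock answers instead.
    with responses.RequestsMock() as rsps:
        rsps.get(
            "http://example.com/bad_cols.csv",  # assumed URL, as in the test above
            body=b"col1,col2,\n1,2,\n3,4,",
            status=200,
        )
        resp = requests.get("http://example.com/bad_cols.csv")
        assert resp.status_code == 200
        assert resp.content == b"col1,col2,\n1,2,\n3,4,"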
{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/entry_points.txt
RENAMED

File without changes

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/licenses/LICENSE
RENAMED

File without changes

{csv_detective-0.10.3.dev2.dist-info → csv_detective-0.10.3.dev4.dist-info}/top_level.txt
RENAMED

File without changes