csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2258__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- csv_detective/detection/headers.py +6 -8
- csv_detective/output/profile.py +6 -2
- csv_detective/parsing/load.py +6 -6
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/METADATA +1 -1
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/RECORD +10 -10
- tests/test_file.py +26 -4
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/top_level.txt +0 -0
|
csv_detective/detection/headers.py
CHANGED
|
@@ -12,19 +12,17 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
|
|
|
12
12
|
logging.info("Detecting headers")
|
|
13
13
|
file.seek(0)
|
|
14
14
|
for i in range(10):
|
|
15
|
-
|
|
15
|
+
row = file.readline()
|
|
16
16
|
position = file.tell()
|
|
17
|
-
|
|
18
|
-
if
|
|
17
|
+
headers = [c for c in row.replace("\n", "").split(sep) if c]
|
|
18
|
+
if not any(col == "" for col in headers):
|
|
19
19
|
next_row = file.readline()
|
|
20
20
|
file.seek(position)
|
|
21
|
-
if
|
|
21
|
+
if row != next_row:
|
|
22
22
|
if verbose:
|
|
23
23
|
display_logs_depending_process_time(
|
|
24
24
|
f"Detected headers in {round(time() - start, 3)}s",
|
|
25
25
|
time() - start,
|
|
26
26
|
)
|
|
27
|
-
return i,
|
|
28
|
-
|
|
29
|
-
logging.info("No header detected")
|
|
30
|
-
return 0, None
|
|
27
|
+
return i, headers
|
|
28
|
+
raise ValueError("Could not retrieve headers")
|
csv_detective/output/profile.py
CHANGED
|
@@ -30,6 +30,10 @@ def create_profile(
|
|
|
30
30
|
k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
|
|
31
31
|
for k, v in columns.items()
|
|
32
32
|
}
|
|
33
|
+
# value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
|
|
34
|
+
_count_col = "count"
|
|
35
|
+
while _count_col in table.columns:
|
|
36
|
+
_count_col = "_" + _count_col
|
|
33
37
|
profile = defaultdict(dict)
|
|
34
38
|
for c in table.columns:
|
|
35
39
|
# for numerical formats we want min, max, mean, std
|
|
@@ -79,14 +83,14 @@ def create_profile(
|
|
|
79
83
|
# for all formats we want most frequent values, nb unique values and nb missing values
|
|
80
84
|
tops_bruts = (
|
|
81
85
|
(table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
|
|
82
|
-
.reset_index()
|
|
86
|
+
.reset_index(name=_count_col)
|
|
83
87
|
.iloc[:10]
|
|
84
88
|
.to_dict(orient="records")
|
|
85
89
|
)
|
|
86
90
|
profile[c].update(
|
|
87
91
|
tops=[
|
|
88
92
|
{
|
|
89
|
-
"count": tb["count"],
|
|
93
|
+
"count": tb[_count_col],
|
|
90
94
|
"value": tb[c],
|
|
91
95
|
}
|
|
92
96
|
for tb in tops_bruts
|
csv_detective/parsing/load.py
CHANGED
|
@@ -47,6 +47,8 @@ def load_file(
|
|
|
47
47
|
if table.empty:
|
|
48
48
|
raise ValueError("Table seems to be empty")
|
|
49
49
|
header = table.columns.to_list()
|
|
50
|
+
if any(col.startswith("Unnamed") for col in header):
|
|
51
|
+
raise ValueError("Could not retrieve headers")
|
|
50
52
|
analysis = {
|
|
51
53
|
"engine": engine,
|
|
52
54
|
"sheet_name": sheet_name,
|
|
@@ -99,12 +101,10 @@ def load_file(
|
|
|
99
101
|
}
|
|
100
102
|
if engine is not None:
|
|
101
103
|
analysis["compression"] = engine
|
|
102
|
-
analysis.update(
|
|
103
|
-
{
|
|
104
|
-
"header_row_idx": header_row_idx,
|
|
105
|
-
"header": header,
|
|
106
|
-
}
|
|
107
|
-
)
|
|
104
|
+
analysis |= {
|
|
105
|
+
"header_row_idx": header_row_idx,
|
|
106
|
+
"header": header,
|
|
107
|
+
}
|
|
108
108
|
if total_lines is not None:
|
|
109
109
|
analysis["total_lines"] = total_lines
|
|
110
110
|
if nb_duplicates is not None:
|
|
@@ -132,14 +132,14 @@ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvca
|
|
|
132
132
|
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
133
133
|
csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
|
|
134
134
|
csv_detective/detection/formats.py,sha256=92ZZafoJFZHDSxSUR6rDCFjLGpD4f4IZorqqVgxwFY8,5595
|
|
135
|
-
csv_detective/detection/headers.py,sha256=
|
|
135
|
+
csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
|
|
136
136
|
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
137
137
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
138
138
|
csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
|
|
139
139
|
csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
|
|
140
140
|
csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
|
|
141
141
|
csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
|
|
142
|
-
csv_detective/output/profile.py,sha256=
|
|
142
|
+
csv_detective/output/profile.py,sha256=ZGKMSeVfmQerAfVhViWXVU9j4jbCrv5K484SQNep7Xw,4920
|
|
143
143
|
csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
|
|
144
144
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
145
145
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -147,20 +147,20 @@ csv_detective/parsing/columns.py,sha256=X5v3_1zgZXadnxjUG3__xLjOIvNU4n9LOiWZbix4
|
|
|
147
147
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
148
148
|
csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,1742
|
|
149
149
|
csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
|
|
150
|
-
csv_detective/parsing/load.py,sha256=
|
|
150
|
+
csv_detective/parsing/load.py,sha256=EHVWQqV9TmWOiVNLCyHr9V8x4PI_53O0iTVluzIqw78,4256
|
|
151
151
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
152
|
-
csv_detective-0.9.3.
|
|
152
|
+
csv_detective-0.9.3.dev2258.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
153
153
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
154
154
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
155
155
|
tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
|
|
156
|
-
tests/test_file.py,sha256=
|
|
156
|
+
tests/test_file.py,sha256=EKFW08W96VA5nVwNPvN1v7zXDL0qEEuGWnUqfJJdMh4,13130
|
|
157
157
|
tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
|
|
158
158
|
tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
|
|
159
159
|
tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
|
|
160
160
|
venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
|
|
161
161
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
162
|
-
csv_detective-0.9.3.
|
|
163
|
-
csv_detective-0.9.3.
|
|
164
|
-
csv_detective-0.9.3.
|
|
165
|
-
csv_detective-0.9.3.
|
|
166
|
-
csv_detective-0.9.3.
|
|
162
|
+
csv_detective-0.9.3.dev2258.dist-info/METADATA,sha256=gGV63RWaQSVUSaJl24mR9Ynk2BOV1tK6LWyNsq-AYkA,10845
|
|
163
|
+
csv_detective-0.9.3.dev2258.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
164
|
+
csv_detective-0.9.3.dev2258.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
165
|
+
csv_detective-0.9.3.dev2258.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
166
|
+
csv_detective-0.9.3.dev2258.dist-info/RECORD,,
|
tests/test_file.py
CHANGED
|
@@ -370,22 +370,44 @@ def test_almost_uniform_column(mocked_responses):
|
|
|
370
370
|
|
|
371
371
|
def test_full_nan_column(mocked_responses):
|
|
372
372
|
# we want a file that needs sampling
|
|
373
|
-
|
|
373
|
+
col_name = "only_nan"
|
|
374
|
+
expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
|
|
374
375
|
mocked_responses.get(
|
|
375
376
|
"http://example.com/test.csv",
|
|
376
377
|
body=expected_content,
|
|
377
378
|
status=200,
|
|
378
379
|
)
|
|
379
380
|
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
380
|
-
# Create a mock HTTP response object
|
|
381
381
|
mock_response = MagicMock()
|
|
382
382
|
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
383
383
|
mock_response.__enter__.return_value = mock_response
|
|
384
384
|
mock_urlopen.return_value = mock_response
|
|
385
|
-
#
|
|
386
|
-
routine(
|
|
385
|
+
# only NaNs should return "string"
|
|
386
|
+
analysis = routine(
|
|
387
387
|
file_path="http://example.com/test.csv",
|
|
388
388
|
num_rows=-1,
|
|
389
389
|
output_profile=False,
|
|
390
390
|
save_results=False,
|
|
391
391
|
)
|
|
392
|
+
assert analysis["columns"][col_name]["format"] == "string"
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def test_count_column(mocked_responses):
|
|
396
|
+
expected_content = "count,_count\n" + "a,1\n" * 100
|
|
397
|
+
mocked_responses.get(
|
|
398
|
+
"http://example.com/test.csv",
|
|
399
|
+
body=expected_content,
|
|
400
|
+
status=200,
|
|
401
|
+
)
|
|
402
|
+
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
403
|
+
mock_response = MagicMock()
|
|
404
|
+
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
405
|
+
mock_response.__enter__.return_value = mock_response
|
|
406
|
+
mock_urlopen.return_value = mock_response
|
|
407
|
+
# only testing it doesn't fail with output_profile=True
|
|
408
|
+
routine(
|
|
409
|
+
file_path="http://example.com/test.csv",
|
|
410
|
+
num_rows=-1,
|
|
411
|
+
output_profile=True,
|
|
412
|
+
save_results=False,
|
|
413
|
+
)
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/WHEEL
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2258.dist-info}/top_level.txt
RENAMED
|
File without changes
|