csv-detective 0.9.3.dev2232__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/output/profile.py +6 -2
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA +1 -1
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/RECORD +8 -8
- tests/test_file.py +26 -4
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/top_level.txt +0 -0
csv_detective/output/profile.py
CHANGED
|
@@ -30,6 +30,10 @@ def create_profile(
|
|
|
30
30
|
k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
|
|
31
31
|
for k, v in columns.items()
|
|
32
32
|
}
|
|
33
|
+
# value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
|
|
34
|
+
_count_col = "count"
|
|
35
|
+
while _count_col in table.columns:
|
|
36
|
+
_count_col = "_" + _count_col
|
|
33
37
|
profile = defaultdict(dict)
|
|
34
38
|
for c in table.columns:
|
|
35
39
|
# for numerical formats we want min, max, mean, std
|
|
@@ -79,14 +83,14 @@ def create_profile(
|
|
|
79
83
|
# for all formats we want most frequent values, nb unique values and nb missing values
|
|
80
84
|
tops_bruts = (
|
|
81
85
|
(table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
|
|
82
|
-
.reset_index()
|
|
86
|
+
.reset_index(name=_count_col)
|
|
83
87
|
.iloc[:10]
|
|
84
88
|
.to_dict(orient="records")
|
|
85
89
|
)
|
|
86
90
|
profile[c].update(
|
|
87
91
|
tops=[
|
|
88
92
|
{
|
|
89
|
-
"count": tb["count"],
|
|
93
|
+
"count": tb[_count_col],
|
|
90
94
|
"value": tb[c],
|
|
91
95
|
}
|
|
92
96
|
for tb in tops_bruts
|
|
@@ -139,7 +139,7 @@ csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0Jze
|
|
|
139
139
|
csv_detective/output/__init__.py,sha256=B0RRaXEUAEduLFOoHll4Hl6x35b55Kwko-tQv5WmPt0,2045
|
|
140
140
|
csv_detective/output/dataframe.py,sha256=J_617q8j1_INQOYl668IJt8M0Mi5zWYWAwtzdV4sJSo,3254
|
|
141
141
|
csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
|
|
142
|
-
csv_detective/output/profile.py,sha256=
|
|
142
|
+
csv_detective/output/profile.py,sha256=ZGKMSeVfmQerAfVhViWXVU9j4jbCrv5K484SQNep7Xw,4920
|
|
143
143
|
csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
|
|
144
144
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
145
145
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -149,18 +149,18 @@ csv_detective/parsing/csv.py,sha256=BJ_fqoCCCCSJ61uHyiEpDmXlBdrqWLY-UKtKwkYw65c,
|
|
|
149
149
|
csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
|
|
150
150
|
csv_detective/parsing/load.py,sha256=Ks1S92H_GErvd2Uy0_EuShMzZSkiuWdTmVQFJ_XX5lg,4167
|
|
151
151
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
152
|
-
csv_detective-0.9.3.
|
|
152
|
+
csv_detective-0.9.3.dev2241.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
153
153
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
154
154
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
155
155
|
tests/test_fields.py,sha256=QoMsVR-ZhH5F9DFqYDvzP6vQCZcoalEi8JBb_fxWR44,13665
|
|
156
|
-
tests/test_file.py,sha256=
|
|
156
|
+
tests/test_file.py,sha256=EKFW08W96VA5nVwNPvN1v7zXDL0qEEuGWnUqfJJdMh4,13130
|
|
157
157
|
tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
|
|
158
158
|
tests/test_structure.py,sha256=KGpw45weVK3iEWAg3OVHHEbj7RYALFicnZ59z7rCFuU,1450
|
|
159
159
|
tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
|
|
160
160
|
venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
|
|
161
161
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
162
|
-
csv_detective-0.9.3.
|
|
163
|
-
csv_detective-0.9.3.
|
|
164
|
-
csv_detective-0.9.3.
|
|
165
|
-
csv_detective-0.9.3.
|
|
166
|
-
csv_detective-0.9.3.
|
|
162
|
+
csv_detective-0.9.3.dev2241.dist-info/METADATA,sha256=Cy0R4v1C7Lg-KRgD1_pBP4uO_huMU0158VorSXw8b2w,10845
|
|
163
|
+
csv_detective-0.9.3.dev2241.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
164
|
+
csv_detective-0.9.3.dev2241.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
165
|
+
csv_detective-0.9.3.dev2241.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
166
|
+
csv_detective-0.9.3.dev2241.dist-info/RECORD,,
|
tests/test_file.py
CHANGED
|
@@ -370,22 +370,44 @@ def test_almost_uniform_column(mocked_responses):
|
|
|
370
370
|
|
|
371
371
|
def test_full_nan_column(mocked_responses):
|
|
372
372
|
# we want a file that needs sampling
|
|
373
|
-
|
|
373
|
+
col_name = "only_nan"
|
|
374
|
+
expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
|
|
374
375
|
mocked_responses.get(
|
|
375
376
|
"http://example.com/test.csv",
|
|
376
377
|
body=expected_content,
|
|
377
378
|
status=200,
|
|
378
379
|
)
|
|
379
380
|
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
380
|
-
# Create a mock HTTP response object
|
|
381
381
|
mock_response = MagicMock()
|
|
382
382
|
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
383
383
|
mock_response.__enter__.return_value = mock_response
|
|
384
384
|
mock_urlopen.return_value = mock_response
|
|
385
|
-
#
|
|
386
|
-
routine(
|
|
385
|
+
# only NaNs should return "string"
|
|
386
|
+
analysis = routine(
|
|
387
387
|
file_path="http://example.com/test.csv",
|
|
388
388
|
num_rows=-1,
|
|
389
389
|
output_profile=False,
|
|
390
390
|
save_results=False,
|
|
391
391
|
)
|
|
392
|
+
assert analysis["columns"][col_name]["format"] == "string"
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def test_count_column(mocked_responses):
|
|
396
|
+
expected_content = "count,_count\n" + "a,1\n" * 100
|
|
397
|
+
mocked_responses.get(
|
|
398
|
+
"http://example.com/test.csv",
|
|
399
|
+
body=expected_content,
|
|
400
|
+
status=200,
|
|
401
|
+
)
|
|
402
|
+
with patch("urllib.request.urlopen") as mock_urlopen:
|
|
403
|
+
mock_response = MagicMock()
|
|
404
|
+
mock_response.read.return_value = expected_content.encode("utf-8")
|
|
405
|
+
mock_response.__enter__.return_value = mock_response
|
|
406
|
+
mock_urlopen.return_value = mock_response
|
|
407
|
+
# only testing it doesn't fail with output_profile=True
|
|
408
|
+
routine(
|
|
409
|
+
file_path="http://example.com/test.csv",
|
|
410
|
+
num_rows=-1,
|
|
411
|
+
output_profile=True,
|
|
412
|
+
save_results=False,
|
|
413
|
+
)
|
|
File without changes
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.3.dev2232.dist-info → csv_detective-0.9.3.dev2241.dist-info}/top_level.txt
RENAMED
|
File without changes
|