csv-detective 0.10.3.dev6__py3-none-any.whl → 0.10.4.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +2 -0
- csv_detective/parsing/columns.py +8 -4
- {csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/METADATA +1 -5
- {csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/RECORD +9 -9
- tests/test_file.py +3 -2
- {csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/WHEEL +0 -0
- {csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/top_level.txt +0 -0
|
@@ -11,6 +11,7 @@ from csv_detective.format import Format, FormatsManager
|
|
|
11
11
|
from csv_detective.output.utils import prepare_output_dict
|
|
12
12
|
from csv_detective.parsing.columns import (
|
|
13
13
|
MAX_NUMBER_CATEGORICAL_VALUES,
|
|
14
|
+
handle_empty_columns,
|
|
14
15
|
test_col,
|
|
15
16
|
test_col_chunks,
|
|
16
17
|
test_label,
|
|
@@ -49,6 +50,7 @@ def detect_formats(
|
|
|
49
50
|
skipna=skipna,
|
|
50
51
|
verbose=verbose,
|
|
51
52
|
)
|
|
53
|
+
handle_empty_columns(scores_table_fields)
|
|
52
54
|
res_categorical, _ = detect_categorical_variable(
|
|
53
55
|
table,
|
|
54
56
|
max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -13,6 +13,13 @@ from csv_detective.utils import display_logs_depending_process_time
|
|
|
13
13
|
MAX_NUMBER_CATEGORICAL_VALUES = 25
|
|
14
14
|
|
|
15
15
|
|
|
16
|
+
def handle_empty_columns(return_table: pd.DataFrame):
|
|
17
|
+
# handling that empty columns score 1 everywhere
|
|
18
|
+
for col in return_table.columns:
|
|
19
|
+
if sum(return_table[col]) == len(return_table):
|
|
20
|
+
return_table[col] = 0
|
|
21
|
+
|
|
22
|
+
|
|
16
23
|
def test_col_val(
|
|
17
24
|
serie: pd.Series,
|
|
18
25
|
format: Format,
|
|
@@ -222,10 +229,7 @@ def test_col_chunks(
|
|
|
222
229
|
analysis["categorical"] = [
|
|
223
230
|
col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
|
|
224
231
|
]
|
|
225
|
-
|
|
226
|
-
for col in return_table.columns:
|
|
227
|
-
if sum(return_table[col]) == len(return_table):
|
|
228
|
-
return_table[col] = 0
|
|
232
|
+
handle_empty_columns(return_table)
|
|
229
233
|
if verbose:
|
|
230
234
|
display_logs_depending_process_time(
|
|
231
235
|
f"Done testing chunks in {round(time() - start, 3)}s", time() - start
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csv-detective
|
|
3
|
-
Version: 0.10.3.dev6
|
|
3
|
+
Version: 0.10.4.dev1
|
|
4
4
|
Summary: Detect tabular files column content
|
|
5
5
|
Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
|
|
6
6
|
License: MIT
|
|
@@ -23,10 +23,6 @@ Requires-Dist: frformat==0.4.0
|
|
|
23
23
|
Requires-Dist: Faker>=33.0.0
|
|
24
24
|
Requires-Dist: rstr>=3.2.2
|
|
25
25
|
Requires-Dist: more-itertools>=10.8.0
|
|
26
|
-
Provides-Extra: dev
|
|
27
|
-
Requires-Dist: pytest>=8.3.0; extra == "dev"
|
|
28
|
-
Requires-Dist: responses>=0.25.0; extra == "dev"
|
|
29
|
-
Requires-Dist: ruff>=0.9.3; extra == "dev"
|
|
30
26
|
Dynamic: license-file
|
|
31
27
|
|
|
32
28
|
# CSV Detective
|
|
@@ -8,7 +8,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
|
8
8
|
csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
|
|
9
9
|
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
10
10
|
csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
|
|
11
|
-
csv_detective/detection/formats.py,sha256=
|
|
11
|
+
csv_detective/detection/formats.py,sha256=cgECpxRaygwnedPhOteG1P_697qCoceeDrKK9G_O-u8,4812
|
|
12
12
|
csv_detective/detection/headers.py,sha256=lnbWRxkI6rdyoWGtmxSfsPkqNjS0Nlpgw-pVevtmBP0,899
|
|
13
13
|
csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
|
|
14
14
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
@@ -80,17 +80,17 @@ csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XN
|
|
|
80
80
|
csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
|
|
81
81
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
82
82
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
|
-
csv_detective/parsing/columns.py,sha256=
|
|
83
|
+
csv_detective/parsing/columns.py,sha256=MFtEJFLsFdlKdM5AXtgXbf5p6HRW6DuOC4XnxhFMpIY,9344
|
|
84
84
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
85
85
|
csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
|
|
86
86
|
csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
|
|
87
87
|
csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
|
|
88
88
|
csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
|
|
89
|
-
csv_detective-0.10.3.dev6.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
89
|
+
csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
90
90
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
91
91
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
92
92
|
tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
|
|
93
|
-
tests/test_file.py,sha256=
|
|
93
|
+
tests/test_file.py,sha256=Ov9NGvZQxeoehxTpfcsnwEybebM0tnbmcRsFwe46cjg,15277
|
|
94
94
|
tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
|
|
95
95
|
tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
|
|
96
96
|
tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
|
|
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
|
|
|
104
104
|
tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
|
|
105
105
|
tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
|
|
106
106
|
tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
|
|
107
|
-
csv_detective-0.10.
|
|
108
|
-
csv_detective-0.10.
|
|
109
|
-
csv_detective-0.10.
|
|
110
|
-
csv_detective-0.10.
|
|
111
|
-
csv_detective-0.10.
|
|
107
|
+
csv_detective-0.10.4.dev1.dist-info/METADATA,sha256=le1Rn1JIh8MoIf_RTc3Fi9DOOlvug4eR-Mwpw4AK0To,10925
|
|
108
|
+
csv_detective-0.10.4.dev1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
109
|
+
csv_detective-0.10.4.dev1.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
110
|
+
csv_detective-0.10.4.dev1.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
|
|
111
|
+
csv_detective-0.10.4.dev1.dist-info/RECORD,,
|
tests/test_file.py
CHANGED
|
@@ -388,10 +388,11 @@ def test_almost_uniform_column(mocked_responses):
|
|
|
388
388
|
assert analysis["columns"][col_name]["format"] == "int"
|
|
389
389
|
|
|
390
390
|
|
|
391
|
-
|
|
391
|
+
@pytest.mark.parametrize("nb_rows", (CHUNK_SIZE // 10, CHUNK_SIZE + 1))
|
|
392
|
+
def test_full_nan_column(mocked_responses, nb_rows):
|
|
392
393
|
# we want a file that needs sampling
|
|
393
394
|
col_name = "only_nan"
|
|
394
|
-
expected_content = f"{col_name},second_col\n" + ",1\n" *
|
|
395
|
+
expected_content = f"{col_name},second_col\n" + ",1\n" * nb_rows
|
|
395
396
|
mocked_responses.get(
|
|
396
397
|
"http://example.com/test.csv",
|
|
397
398
|
body=expected_content,
|
|
File without changes
|
{csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.10.3.dev6.dist-info → csv_detective-0.10.4.dev1.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|