csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +2 -0
- csv_detective/explore_csv.py +2 -2
- csv_detective/format.py +2 -0
- csv_detective/parsing/columns.py +11 -7
- csv_detective/parsing/csv.py +1 -1
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/METADATA +1 -1
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/RECORD +12 -12
- tests/test_file.py +3 -2
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/WHEEL +0 -0
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/top_level.txt +0 -0
|
@@ -11,6 +11,7 @@ from csv_detective.format import Format, FormatsManager
|
|
|
11
11
|
from csv_detective.output.utils import prepare_output_dict
|
|
12
12
|
from csv_detective.parsing.columns import (
|
|
13
13
|
MAX_NUMBER_CATEGORICAL_VALUES,
|
|
14
|
+
handle_empty_columns,
|
|
14
15
|
test_col,
|
|
15
16
|
test_col_chunks,
|
|
16
17
|
test_label,
|
|
@@ -49,6 +50,7 @@ def detect_formats(
|
|
|
49
50
|
skipna=skipna,
|
|
50
51
|
verbose=verbose,
|
|
51
52
|
)
|
|
53
|
+
handle_empty_columns(scores_table_fields)
|
|
52
54
|
res_categorical, _ = detect_categorical_variable(
|
|
53
55
|
table,
|
|
54
56
|
max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
|
csv_detective/explore_csv.py
CHANGED
|
@@ -36,7 +36,7 @@ def routine(
|
|
|
36
36
|
file_path: local path or URL to file
|
|
37
37
|
num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
|
|
38
38
|
tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
|
|
39
|
-
limited_output: whether or not to return all possible
|
|
39
|
+
limited_output: whether or not to return all possible formats or only the most likely one for each column
|
|
40
40
|
save_results: whether or not to save the results in a json file, or the path where to dump the output
|
|
41
41
|
output_profile: whether or not to add the 'profile' field to the output
|
|
42
42
|
output_schema: whether or not to add the 'schema' field to the output (tableschema)
|
|
@@ -47,7 +47,7 @@ def routine(
|
|
|
47
47
|
skipna: whether to keep NaN (empty cells) for tests
|
|
48
48
|
|
|
49
49
|
Returns:
|
|
50
|
-
dict: a dict with information about the csv and possible
|
|
50
|
+
dict: a dict with information about the csv and possible formats for each column
|
|
51
51
|
"""
|
|
52
52
|
|
|
53
53
|
if not (
|
csv_detective/format.py
CHANGED
|
@@ -25,6 +25,8 @@ class Format:
|
|
|
25
25
|
labels: the dict of hint headers and their credibility for the header score (NB: credibility is relative within a single format, should be used to rank the valid labels)
|
|
26
26
|
proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
|
|
27
27
|
tags: to allow users to submit a file to only a subset of formats
|
|
28
|
+
mandatory_label: whether the format can only be considered if the column passes both field and label tests
|
|
29
|
+
python_type: the python type related to the format (less specific, used for downstream casting)
|
|
28
30
|
"""
|
|
29
31
|
self.name: str = name
|
|
30
32
|
self.func: Callable[[Any], bool] = func
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -13,6 +13,13 @@ from csv_detective.utils import display_logs_depending_process_time
|
|
|
13
13
|
MAX_NUMBER_CATEGORICAL_VALUES = 25
|
|
14
14
|
|
|
15
15
|
|
|
16
|
+
def handle_empty_columns(return_table: pd.DataFrame):
|
|
17
|
+
# handling that empty columns score 1 everywhere
|
|
18
|
+
for col in return_table.columns:
|
|
19
|
+
if sum(return_table[col]) == len(return_table):
|
|
20
|
+
return_table[col] = 0
|
|
21
|
+
|
|
22
|
+
|
|
16
23
|
def test_col_val(
|
|
17
24
|
serie: pd.Series,
|
|
18
25
|
format: Format,
|
|
@@ -87,7 +94,7 @@ def test_col(
|
|
|
87
94
|
)
|
|
88
95
|
if verbose:
|
|
89
96
|
display_logs_depending_process_time(
|
|
90
|
-
f'\t> Done with
|
|
97
|
+
f'\t> Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
|
|
91
98
|
time() - start_type,
|
|
92
99
|
)
|
|
93
100
|
if verbose:
|
|
@@ -102,7 +109,7 @@ def test_label(
|
|
|
102
109
|
):
|
|
103
110
|
if verbose:
|
|
104
111
|
start = time()
|
|
105
|
-
logging.info("Testing labels to get
|
|
112
|
+
logging.info("Testing labels to get formats")
|
|
106
113
|
|
|
107
114
|
return_table = pd.DataFrame(columns=columns)
|
|
108
115
|
for idx, (label, format) in enumerate(formats.items()):
|
|
@@ -111,7 +118,7 @@ def test_label(
|
|
|
111
118
|
return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
|
|
112
119
|
if verbose:
|
|
113
120
|
display_logs_depending_process_time(
|
|
114
|
-
f'\t- Done with
|
|
121
|
+
f'\t- Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
|
|
115
122
|
time() - start_type,
|
|
116
123
|
)
|
|
117
124
|
if verbose:
|
|
@@ -222,10 +229,7 @@ def test_col_chunks(
|
|
|
222
229
|
analysis["categorical"] = [
|
|
223
230
|
col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
|
|
224
231
|
]
|
|
225
|
-
|
|
226
|
-
for col in return_table.columns:
|
|
227
|
-
if sum(return_table[col]) == len(return_table):
|
|
228
|
-
return_table[col] = 0
|
|
232
|
+
handle_empty_columns(return_table)
|
|
229
233
|
if verbose:
|
|
230
234
|
display_logs_depending_process_time(
|
|
231
235
|
f"Done testing chunks in {round(time() - start, 3)}s", time() - start
|
csv_detective/parsing/csv.py
CHANGED
|
@@ -39,7 +39,7 @@ def parse_csv(
|
|
|
39
39
|
# branch between small and big files starts here
|
|
40
40
|
if total_lines == CHUNK_SIZE:
|
|
41
41
|
if verbose:
|
|
42
|
-
logging.warning(f"File is too long,
|
|
42
|
+
logging.warning(f"File is too long, loading in chunks of {CHUNK_SIZE} rows")
|
|
43
43
|
total_lines, nb_duplicates = None, None
|
|
44
44
|
else:
|
|
45
45
|
nb_duplicates = table.duplicated().sum()
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
|
|
2
2
|
csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
|
|
3
|
-
csv_detective/explore_csv.py,sha256=
|
|
4
|
-
csv_detective/format.py,sha256=
|
|
3
|
+
csv_detective/explore_csv.py,sha256=YjkQihSm1vgZbEfXHxJ-_bVJrtCUT3Ut8x8FX60ZK3k,7643
|
|
4
|
+
csv_detective/format.py,sha256=vYz4h-WDUZ3pZIfxUc1toXRbUiNnWqHHiUXBbZtnaUw,3140
|
|
5
5
|
csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
|
|
6
6
|
csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
|
|
7
7
|
csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
|
|
9
9
|
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
10
10
|
csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
|
|
11
|
-
csv_detective/detection/formats.py,sha256=
|
|
11
|
+
csv_detective/detection/formats.py,sha256=cgECpxRaygwnedPhOteG1P_697qCoceeDrKK9G_O-u8,4812
|
|
12
12
|
csv_detective/detection/headers.py,sha256=lnbWRxkI6rdyoWGtmxSfsPkqNjS0Nlpgw-pVevtmBP0,899
|
|
13
13
|
csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
|
|
14
14
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
@@ -80,17 +80,17 @@ csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XN
|
|
|
80
80
|
csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
|
|
81
81
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
82
82
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
|
-
csv_detective/parsing/columns.py,sha256=
|
|
83
|
+
csv_detective/parsing/columns.py,sha256=ExabNAzQB-IvQfjmVLClBfj1GMkbAQxUUnournFpdy8,9350
|
|
84
84
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
85
|
-
csv_detective/parsing/csv.py,sha256=
|
|
85
|
+
csv_detective/parsing/csv.py,sha256=vfAHkpgzLkzeUXWVqrWlttZ1y-Hql0GNFSZZrA366mk,1714
|
|
86
86
|
csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
|
|
87
87
|
csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
|
|
88
88
|
csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
|
|
89
|
-
csv_detective-0.10.
|
|
89
|
+
csv_detective-0.10.4.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
90
90
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
91
91
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
92
92
|
tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
|
|
93
|
-
tests/test_file.py,sha256=
|
|
93
|
+
tests/test_file.py,sha256=Ov9NGvZQxeoehxTpfcsnwEybebM0tnbmcRsFwe46cjg,15277
|
|
94
94
|
tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
|
|
95
95
|
tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
|
|
96
96
|
tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
|
|
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
|
|
|
104
104
|
tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
|
|
105
105
|
tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
|
|
106
106
|
tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
|
|
107
|
-
csv_detective-0.10.
|
|
108
|
-
csv_detective-0.10.
|
|
109
|
-
csv_detective-0.10.
|
|
110
|
-
csv_detective-0.10.
|
|
111
|
-
csv_detective-0.10.
|
|
107
|
+
csv_detective-0.10.4.dev2.dist-info/METADATA,sha256=52ZirEUs9m49EuivGtEmtr_p2h_wVsSRbyQTs0SfoWE,10925
|
|
108
|
+
csv_detective-0.10.4.dev2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
109
|
+
csv_detective-0.10.4.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
110
|
+
csv_detective-0.10.4.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
|
|
111
|
+
csv_detective-0.10.4.dev2.dist-info/RECORD,,
|
tests/test_file.py
CHANGED
|
@@ -388,10 +388,11 @@ def test_almost_uniform_column(mocked_responses):
|
|
|
388
388
|
assert analysis["columns"][col_name]["format"] == "int"
|
|
389
389
|
|
|
390
390
|
|
|
391
|
-
|
|
391
|
+
@pytest.mark.parametrize("nb_rows", (CHUNK_SIZE // 10, CHUNK_SIZE + 1))
|
|
392
|
+
def test_full_nan_column(mocked_responses, nb_rows):
|
|
392
393
|
# we want a file that needs sampling
|
|
393
394
|
col_name = "only_nan"
|
|
394
|
-
expected_content = f"{col_name},second_col\n" + ",1\n" *
|
|
395
|
+
expected_content = f"{col_name},second_col\n" + ",1\n" * nb_rows
|
|
395
396
|
mocked_responses.get(
|
|
396
397
|
"http://example.com/test.csv",
|
|
397
398
|
body=expected_content,
|
|
File without changes
|
{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.10.3.dev7.dist-info → csv_detective-0.10.4.dev2.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|