csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +2 -1
- csv_detective/detection/engine.py +1 -1
- csv_detective/detection/formats.py +39 -95
- csv_detective/detection/variables.py +2 -2
- csv_detective/explore_csv.py +5 -7
- csv_detective/load_tests.py +11 -4
- csv_detective/output/__init__.py +8 -4
- csv_detective/output/dataframe.py +37 -0
- csv_detective/output/example.py +3 -1
- csv_detective/output/profile.py +65 -21
- csv_detective/parsing/columns.py +133 -35
- csv_detective/parsing/csv.py +26 -23
- csv_detective/parsing/load.py +21 -8
- csv_detective/validate.py +86 -40
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA +29 -6
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/RECORD +24 -24
- tests/test_fields.py +9 -13
- tests/test_file.py +85 -35
- tests/test_structure.py +4 -1
- tests/test_validation.py +9 -4
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/top_level.txt +0 -0
csv_detective/parsing/columns.py
CHANGED
@@ -3,10 +3,13 @@ from time import time
 from typing import Callable

 import pandas as pd
+from more_itertools import peekable

+from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time

-
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25


 def test_col_val(
@@ -34,28 +37,24 @@ def test_col_val(
     serie = serie[serie.notnull()]
     ser_len = len(serie)
     if ser_len == 0:
-
+        # being here means the whole column is NaN, so if skipna it's a pass
+        return 1.0 if skipna else 0.0
     if not limited_output:
         result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
         return result if result >= proportion else 0.0
     else:
-        if proportion == 1:
+        if proportion == 1:
+            # early stops (1 then 5 rows) to not waste time if directly unsuccessful
             for _range in [
                 min(1, ser_len),
                 min(5, ser_len),
                 ser_len,
-            ]:
-
-                if all(apply_test_func(serie, test_func, _range)):
-                    pass
-                else:
+            ]:
+                if not all(apply_test_func(serie, test_func, _range)):
                     return 0.0
             return 1.0
         else:
-
-            # (arbitrary number) and get a significant result
-            to_analyse = min(ser_len, MAX_ROWS_ANALYSIS)
-            result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
+            result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
             return result if result >= proportion else 0.0
     finally:
         if verbose and time() - start > 3:
@@ -81,7 +80,7 @@ def test_col_label(

 def test_col(
     table: pd.DataFrame,
-    all_tests:
+    all_tests: dict[str, dict],
     limited_output: bool,
     skipna: bool = True,
     verbose: bool = False,
@@ -89,25 +88,18 @@ def test_col(
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
-    test_funcs = {
-        test.__name__.split(".")[-1]: {
-            "func": test._is,
-            "prop": test.PROPORTION,
-        }
-        for test in all_tests
-    }
     return_table = pd.DataFrame(columns=table.columns)
-    for idx, (
+    for idx, (name, attributes) in enumerate(all_tests.items()):
         if verbose:
             start_type = time()
-            logging.info(f"\t- Starting with type '{
+            logging.info(f"\t- Starting with type '{name}'")
         # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
         # => the following needs to change, "apply" means all columns are tested for one type at once
-        return_table.loc[
+        return_table.loc[name] = table.apply(
             lambda serie: test_col_val(
                 serie,
-
-
+                attributes["func"],
+                attributes["prop"],
                 skipna=skipna,
                 limited_output=limited_output,
                 verbose=verbose,
@@ -115,7 +107,7 @@ def test_col(
         )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{
+                f'\t> Done with type "{name}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})',
                 time() - start_type,
             )
     if verbose:
@@ -125,26 +117,24 @@ def test_col(
     return return_table


-def test_label(
+def test_label(
+    columns: list[str], all_tests: dict[str, dict], limited_output: bool, verbose: bool = False
+):
     if verbose:
         start = time()
         logging.info("Testing labels to get types")
-    test_funcs = dict()
-    for test in all_tests:
-        name = test.__name__.split(".")[-1]
-        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}

-    return_table = pd.DataFrame(columns=
-    for idx, (key, value) in enumerate(
+    return_table = pd.DataFrame(columns=columns)
+    for idx, (key, value) in enumerate(all_tests.items()):
         if verbose:
             start_type = time()
         return_table.loc[key] = [
             test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
-            for col_name in
+            for col_name in columns
         ]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})',
                 time() - start_type,
             )
     if verbose:
@@ -152,3 +142,111 @@ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbo
             f"Done testing labels in {round(time() - start, 3)}s", time() - start
         )
     return return_table
+
+
+def test_col_chunks(
+    table: pd.DataFrame,
+    file_path: str,
+    analysis: dict,
+    all_tests: list,
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]:
+    def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]:
+        return {
+            col: [test for test in return_table.index if return_table.loc[test, col] > 0]
+            for col in return_table.columns
+        }
+
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get types on chunks")
+
+    # analysing the sample to get a first guess
+    return_table = test_col(table, all_tests, limited_output, skipna=skipna, verbose=verbose)
+    remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = table.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}
+
+    # only csv files can end up here, can't chunk excel
+    chunks = pd.read_csv(
+        file_path,
+        dtype=str,
+        encoding=analysis["encoding"],
+        sep=analysis["separator"],
+        skiprows=analysis["header_row_idx"],
+        compression=analysis.get("compression"),
+        chunksize=CHUNK_SIZE,
+    )
+    analysis["total_lines"] = CHUNK_SIZE
+    batch, batch_number = [], 1
+    iterator = peekable(enumerate(chunks))
+    while iterator:
+        idx, chunk = next(iterator)
+        if idx == 0:
+            # we have read and analysed the first chunk already
+            continue
+        if len(batch) < 10:
+            # it's too slow to process chunks directly, but we want to keep the first analysis
+            # on a "small" chunk, so partial analyses are done on batches of chunks
+            batch.append(chunk)
+            # we don't know when the chunks end, and doing one additionnal step
+            # for the final batch is ugly
+            try:
+                iterator.peek()
+                continue
+            except StopIteration:
+                pass
+        if verbose:
+            logging.info(f"> Testing batch number {batch_number}")
+        batch = pd.concat(batch, ignore_index=True)
+        analysis["total_lines"] += len(batch)
+        row_hashes_count = row_hashes_count.add(
+            batch.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
+        )
+        for col in batch.columns:
+            col_values[col] = col_values[col].add(
+                batch[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()):
+            # no more potential tests to do on any column, early stop
+            break
+        for col, tests in remaining_tests_per_col.items():
+            # testing each column with the tests that are still competing
+            # after previous batchs analyses
+            for test in tests:
+                batch_col_test = test_col_val(
+                    batch[col],
+                    all_tests[test]["func"],
+                    all_tests[test]["prop"],
+                    limited_output=limited_output,
+                    skipna=skipna,
+                )
+                return_table.loc[test, col] = (
+                    # if this batch's column tested 0 then test fails overall
+                    0
+                    if batch_col_test == 0
+                    # otherwise updating the score with weighted average
+                    else ((return_table.loc[test, col] * idx + batch_col_test) / (idx + 1))
+                )
+        remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+        batch, batch_number = [], batch_number + 1
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing chunks in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table, analysis, col_values
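For readers of the new `test_col_chunks` above: per-column scores are refined batch by batch, and a single zero-scoring batch eliminates a format for that column. Below is a minimal standalone sketch of that running-average update; `update_score` and the sample scores are made up for illustration and are not part of the package.

```python
def update_score(current: float, batch_score: float, idx: int) -> float:
    # a batch scoring 0 eliminates the format for that column altogether
    if batch_score == 0:
        return 0.0
    # otherwise fold the batch into a running weighted average, as in
    # (return_table.loc[test, col] * idx + batch_col_test) / (idx + 1)
    return (current * idx + batch_score) / (idx + 1)

score = 1.0  # hypothetical score obtained on the first chunk
for idx, batch_score in enumerate([0.9, 1.0, 0.8], start=1):
    score = update_score(score, batch_score, idx)
    print(f"after batch {idx}: {score:.3f}")
print(update_score(score, 0.0, 4))  # a failing batch drops the format: 0.0
```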
csv_detective/parsing/csv.py
CHANGED
@@ -1,11 +1,14 @@
 import logging
 from time import time
-from typing import TextIO
+from typing import Optional, TextIO

 import pandas as pd

 from csv_detective.utils import display_logs_depending_process_time

+# the number of rows for the first analysis, and the number of rows per chunk of the df iterator
+CHUNK_SIZE = int(1e4)
+

 def parse_csv(
     the_file: TextIO,
@@ -15,36 +18,36 @@ def parse_csv(
     skiprows: int,
     random_state: int = 42,
     verbose: bool = False,
-) -> tuple[pd.DataFrame, int, int]:
+) -> tuple[pd.DataFrame, Optional[int], Optional[int]]:
     if verbose:
         start = time()
         logging.info("Parsing table")
-    table = None

     if not isinstance(the_file, str):
         the_file.seek(0)

-
-
-
-
-
-
-
-
-
-
+    try:
+        table = pd.read_csv(
+            the_file,
+            sep=sep,
+            dtype=str,
+            encoding=encoding,
+            skiprows=skiprows,
+            nrows=CHUNK_SIZE,
+        )
+        total_lines = len(table)
+        # branch between small and big files starts here
+        if total_lines == CHUNK_SIZE:
+            if verbose:
+                logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
+            total_lines, nb_duplicates = None, None
+        else:
             nb_duplicates = len(table.loc[table.duplicated()])
-
-
-
-
-
-        except TypeError:
-            print("Trying encoding : {encoding}".format(encoding=encoding))
-
-    if table is None:
-        raise ValueError("Could not load file")
+        if num_rows > 0:
+            num_rows = min(num_rows, total_lines or len(table))
+            table = table.sample(num_rows, random_state=random_state)
+    except Exception as e:
+        raise ValueError("Could not load file") from e
     if verbose:
         display_logs_depending_process_time(
             f"Table parsed successfully in {round(time() - start, 3)}s",
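The reworked `parse_csv` reads at most `CHUNK_SIZE` rows and returns `None` for `total_lines` and `nb_duplicates` when that cap is hit, which signals the chunked analysis path downstream. A self-contained sketch of that contract follows; the `parse_sample` helper and the inline CSV are illustrative, not the package's code.

```python
from io import StringIO

import pandas as pd

CHUNK_SIZE = int(1e4)  # mirrors csv_detective.parsing.csv.CHUNK_SIZE

def parse_sample(file_like, sep: str = ","):
    # read at most CHUNK_SIZE rows; hitting the cap means the file may be bigger
    table = pd.read_csv(file_like, sep=sep, dtype=str, nrows=CHUNK_SIZE)
    if len(table) == CHUNK_SIZE:
        return table, None, None  # totals unknown: switch to chunked analysis
    return table, len(table), len(table.loc[table.duplicated()])

table, total_lines, nb_duplicates = parse_sample(StringIO("a;b\n1;x\n2;y\n2;y\n"), sep=";")
print(total_lines, nb_duplicates)  # 3 1
```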
csv_detective/parsing/load.py
CHANGED
@@ -44,6 +44,8 @@ def load_file(
             sheet_name=sheet_name,
             verbose=verbose,
         )
+        if table.empty:
+            raise ValueError("Table seems to be empty")
         header = table.columns.to_list()
         analysis = {
             "engine": engine,
@@ -66,34 +68,45 @@ def load_file(
         binary_file.seek(0)
         # decoding and reading file
         if is_url(file_path) or engine in COMPRESSION_ENGINES:
-            str_file = StringIO(
+            str_file = StringIO()
+            while True:
+                chunk = binary_file.read(1024**2)
+                if not chunk:
+                    break
+                str_file.write(chunk.decode(encoding=encoding))
+            del binary_file
+            str_file.seek(0)
         else:
             str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return {"error": True}
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(
             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
         )
+        del str_file
+        if table.empty:
+            raise ValueError("Table seems to be empty")
         analysis = {
             "encoding": encoding,
             "separator": sep,
             "heading_columns": heading_columns,
             "trailing_columns": trailing_columns,
         }
+        if engine is not None:
+            analysis["compression"] = engine
         analysis.update(
             {
                 "header_row_idx": header_row_idx,
                 "header": header,
-                "total_lines": total_lines,
-                "nb_duplicates": nb_duplicates,
             }
         )
+        if total_lines is not None:
+            analysis["total_lines"] = total_lines
+        if nb_duplicates is not None:
+            analysis["nb_duplicates"] = nb_duplicates
     return table, analysis
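The `StringIO` change above replaces a one-shot decode of the whole binary file with an incremental one. Here is a standalone sketch of the same pattern, using an in-memory `BytesIO` in place of a downloaded or decompressed file:

```python
from io import BytesIO, StringIO

binary_file = BytesIO("col1;col2\n1;foo\n2;bar\n".encode("utf-8"))

str_file = StringIO()
while True:
    chunk = binary_file.read(1024**2)  # 1 MiB of raw bytes per iteration
    if not chunk:
        break
    str_file.write(chunk.decode(encoding="utf-8"))
del binary_file  # free the raw buffer as early as possible
str_file.seek(0)  # rewind so separator/header detection can re-read the text

print(str_file.readline().strip())  # col1;col2
```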
csv_detective/validate.py
CHANGED
@@ -3,76 +3,121 @@ import logging
 import pandas as pd

 from csv_detective.load_tests import return_all_tests
-from csv_detective.parsing.columns import test_col_val
-from csv_detective.parsing.load import load_file
+from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val

+VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)

-tests =
-    t.__name__.split(".")[-1]: {
-        "func": t._is,
-        "prop": t.PROPORTION,
-    }
-    for t in return_all_tests("ALL", "detect_fields")
-}
+tests = return_all_tests("ALL", "detect_fields")


 def validate(
     file_path: str,
     previous_analysis: dict,
-    num_rows: int = 500,
-    encoding: str | None = None,
-    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-
-) -> tuple[bool, pd.DataFrame | None, dict | None]:
+) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and types as in the
+    Verify is the given file has the same fields and types as in the given analysis.
     """
     try:
-
-
-
-
-
-
-
+        if previous_analysis.get("separator"):
+            # loading the table in chunks
+            chunks = pd.read_csv(
+                file_path,
+                dtype=str,
+                sep=previous_analysis["separator"],
+                encoding=previous_analysis["encoding"],
+                skiprows=previous_analysis["header_row_idx"],
+                compression=previous_analysis.get("compression"),
+                chunksize=VALIDATION_CHUNK_SIZE,
+            )
+            analysis = {
+                k: v
+                for k, v in previous_analysis.items()
+                if k
+                in ["encoding", "separator", "compression", "heading_columns", "trailing_columns"]
+                and v is not None
+            }
+        else:
+            # or chunks-like if not chunkable
+            chunks = iter(
+                [
+                    pd.read_excel(
+                        file_path,
+                        dtype=str,
+                        engine=previous_analysis["engine"],
+                        sheet_name=previous_analysis["sheet_name"],
+                    )
+                ]
+            )
+            analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+        first_chunk = next(chunks)
+        analysis.update(
+            {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-        return False, None, None
+        return False, None, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
         logging.info("- Checking if all columns match")
-    if
-
+    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+        list(first_chunk.columns)[k] != previous_analysis["header"][k]
+        for k in range(len(previous_analysis["header"]))
     ):
         if verbose:
             logging.warning("> Columns do not match, proceeding with full analysis")
-        return False, None, None
-
+        return False, None, None, None
+    if verbose:
+        logging.info(
+            f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
+        )
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = first_chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
+    analysis["total_lines"] = 0
+    for idx, chunk in enumerate([first_chunk, *chunks]):
         if verbose:
-            logging.info(f"
-
-
-
-
-            serie=table[col_name],
-            test_func=tests[args["format"]]["func"],
-            proportion=tests[args["format"]]["prop"],
-            skipna=skipna,
+            logging.info(f"> Testing chunk number {idx}")
+        analysis["total_lines"] += len(chunk)
+        row_hashes_count = row_hashes_count.add(
+            chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
         )
-
+        for col in chunk.columns:
+            col_values[col] = col_values[col].add(
+                chunk[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        for col_name, args in previous_analysis["columns"].items():
             if verbose:
-                logging.
-
+                logging.info(f"- Testing {col_name} for {args['format']}")
+            if args["format"] == "string":
+                # no test for columns that have not been recognized as a specific format
+                continue
+            test_result: float = test_col_val(
+                serie=chunk[col_name],
+                test_func=tests[args["format"]]["func"],
+                proportion=tests[args["format"]]["prop"],
+                skipna=skipna,
+            )
+            if not bool(test_result):
+                if verbose:
+                    logging.warning("> Test failed, proceeding with full analysis")
+                return False, first_chunk, analysis, None
     if verbose:
         logging.info("> All checks successful")
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
     return (
         True,
-
+        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
@@ -84,4 +129,5 @@ def validate(
                 "formats",
             ]
         },
+        col_values,
     )
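`validate` now returns four elements instead of three. A hedged usage sketch follows; the fallback to a full `routine` run is one possible way to handle an invalid result, not something the package mandates, and `check_against_previous` is an illustrative helper.

```python
from csv_detective import routine, validate

def check_against_previous(file_path: str, previous_analysis: dict) -> dict:
    # validity flag, first chunk of the table, refreshed analysis, per-column value counts
    is_valid, first_chunk, analysis, col_values = validate(file_path, previous_analysis)
    if is_valid:
        # analysis now also carries total_lines, nb_duplicates and categorical columns
        return analysis
    # columns or formats no longer match: run the full detection again
    return routine(file_path=file_path, num_rows=-1)
```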
{csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev2215
+Version: 0.9.3.dev2241
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
@@ -22,6 +22,7 @@ Requires-Dist: python-magic==0.4.27
 Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr==3.2.2
+Requires-Dist: more-itertools>=10.8.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
@@ -30,7 +31,7 @@ Dynamic: license-file

 # CSV Detective

-This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types.
+This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types.

 Currently supported file types: csv, xls, xlsx, ods.

@@ -50,7 +51,7 @@ pip install csv-detective

 Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:

-```
+```python
 # Import the csv_detective package
 from csv_detective import routine
 import os # for this example only
@@ -158,13 +159,26 @@ The program creates a `Python` dictionnary with the following information :
 ```

 The output slightly differs depending on the file format:
-- csv files have `encoding` and `separator`
+- csv files have `encoding` and `separator` (and `compression` if relevant)
 - xls, xls, ods files have `engine` and `sheet_name`

+You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
+- the analysis (as described above)
+- an iteror of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
+```python
+inspection, df_chunks = routine(
+    file_path=file_path,
+    num_rows=-1,
+    output_df=True,
+)
+cast_df = pd.concat(df_chunks, ignore_index=True)
+# if "col1" has been detected as a float, then cast_df["col1"] contains floats
+```
+
 ### What Formats Can Be Detected

 Includes :
-
+- types (float, int, dates, datetimes, JSON) and more specific (latitude, longitude, geoJSON...)
 - Communes, Départements, Régions, Pays
 - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
 - Codes CSP, Description CSP, SIREN
@@ -172,6 +186,16 @@ Includes :
 - Years, Dates, Jours de la Semaine FR
 - UUIDs, Mongo ObjectIds

+### Validation
+If you have a pre-made analysis of a file, you can check whether an other file conforms to the same analysis:
+```python
+from csv_detective import validate
+is_valid, *_ = validate(
+    file_path,
+    previous_analysis, # exactly as it came out from the routine function
+)
+```
+
 ### Format detection and scoring
 For each column, 3 scores are computed for each format, the higher the score, the more likely the format:
 - the field score based on the values contained in the column (0.0 to 1.0).
@@ -199,7 +223,6 @@ Only the format with highest score is present in the output.
 Related ideas:

 - store column names to make a learning model based on column names for (possible pre-screen)
-- normalising data based on column prediction
 - entity resolution (good luck...)

 ## Why Could This Be of Any Use ?
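The README addition above shows the `pd.concat` way of consuming `df_chunks` and mentions processing the chunks in a loop. A small sketch of that loop variant; the `count_rows_per_chunk` helper is illustrative only.

```python
from csv_detective import routine

def count_rows_per_chunk(file_path: str) -> list[int]:
    inspection, df_chunks = routine(file_path=file_path, num_rows=-1, output_df=True)
    sizes = []
    for chunk in df_chunks:
        # each chunk is a pd.DataFrame whose columns are already cast to the detected types
        sizes.append(len(chunk))
    return sizes
```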