csv-detective 0.10.3.dev4__py3-none-any.whl → 0.10.3.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/explore_csv.py +15 -4
- csv_detective/format.py +1 -1
- csv_detective/output/profile.py +1 -1
- csv_detective/parsing/columns.py +1 -1
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +3 -3
- csv_detective/validate.py +66 -37
- {csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/METADATA +1 -1
- {csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/RECORD +14 -14
- tests/test_validation.py +70 -15
- {csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/WHEEL +0 -0
- {csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/top_level.txt +0 -0
csv_detective/explore_csv.py
CHANGED
@@ -142,20 +142,19 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")
 
-    is_valid,
+    is_valid, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if
-    # if loading failed in validate, we load it from scratch
+    if not is_valid:
+        # if loading failed in validate, we load it from scratch and initiate an analysis
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
-    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
@@ -165,6 +164,18 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
+    else:
+        # successful validation means we have a correct analysis and col_values
+        # only need to reload the table, and we already know how
+        table, _ = load_file(
+            file_path=file_path,
+            num_rows=num_rows,
+            verbose=verbose,
+            sep=analysis.get("separator"),
+            encoding=analysis.get("encoding"),
+            engine=analysis.get("engine"),
+            sheet_name=analysis.get("sheet_name"),
+        )
     try:
         return generate_output(
             table=table,
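The net effect: `validate_then_detect` now trusts a successful validation and only re-reads the table with the parameters already stored in the analysis, instead of re-running detection. As a rough illustration, a minimal sketch of that reload step (not the package's `load_file`: it assumes a plain CSV and only reuses the `separator`/`encoding` keys shown above):

import pandas as pd

def reload_validated_csv(file_path: str, analysis: dict) -> pd.DataFrame:
    # After a successful validation, the analysis dict already holds the parsing
    # parameters, so re-reading the table needs no further detection.
    return pd.read_csv(
        file_path,
        sep=analysis.get("separator"),
        encoding=analysis.get("encoding"),
    )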
csv_detective/format.py
CHANGED
@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable = func
+        self.func: Callable[[Any], bool] = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion
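The tightened annotation documents that a `Format`'s `func` is a per-value predicate: it takes one cell value and returns whether it matches the format. A minimal sketch of such a predicate (purely illustrative, not one of the library's real checkers):

from typing import Any, Callable

# A format's `func` answers "does this single value look like the format?"
def looks_like_four_digit_year(value: Any) -> bool:
    return isinstance(value, str) and len(value) == 4 and value.isdigit()

year_check: Callable[[Any], bool] = looks_like_four_digit_year
print(year_check("2024"))   # True
print(year_check("20-24"))  # False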
csv_detective/output/profile.py
CHANGED
@@ -23,7 +23,7 @@ def create_profile(
     logging.info("Creating profile")
 
     if num_rows > 0:
-        raise ValueError("To create
+        raise ValueError("To create profile `num_rows` must be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
csv_detective/parsing/columns.py
CHANGED
csv_detective/parsing/excel.py
CHANGED
@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str | None = None,
+    sheet_name: str | int | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py
CHANGED
@@ -28,12 +28,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
+    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-    engine
-
-    # file has no extension, we'll investigate how to read it
+    if ("." not in file_name or not file_name.endswith("csv")) and engine is None and sep is None:
+        # file has no extension and we don't have insights from arguments, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)
 
     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
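The reworked guard only falls back to engine detection when neither the file name nor the caller provides a hint. The condition can be read in isolation as below (an illustrative sketch; the wrapper name is made up, and in the real code `detect_engine` is only called when this would return True):

def needs_engine_detection(file_name: str, engine: str | None, sep: str | None) -> bool:
    # Only investigate the file when there is no usable "csv" extension and the
    # caller supplied neither an engine nor a separator.
    return (
        ("." not in file_name or not file_name.endswith("csv"))
        and engine is None
        and sep is None
    )

print(needs_engine_detection("xlsx_file", None, None))  # True: no extension, no hints
print(needs_engine_detection("data.csv", None, None))   # False: csv extension is enough
print(needs_engine_detection("dump", None, ";"))        # False: separator was passed in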
csv_detective/validate.py
CHANGED
@@ -1,10 +1,13 @@
 import logging
+from collections import defaultdict
 
 import pandas as pd
 
 from csv_detective.format import FormatsManager
 from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
 
+# VALIDATION_CHUNK_SIZE is bigger than (analysis) CHUNK_SIZE because
+# it's faster to validate so we can afford to load more rows
 VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)
 
@@ -16,9 +19,9 @@ def validate(
     previous_analysis: dict,
     verbose: bool = False,
     skipna: bool = True,
-) -> tuple[bool,
+) -> tuple[bool, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and
+    Verify is the given file has the same fields and formats as in the given analysis.
 
     Args:
         file_path: the path of the file to validate
@@ -26,6 +29,15 @@ def validate(
         verbose: whether the code displays the steps it's going through
         skipna: whether to ignore NaN values in the checks
     """
+    if verbose:
+        logging.info(f"Checking given formats exist")
+    for col_name, detected in previous_analysis["columns"].items():
+        if detected["format"] == "string":
+            continue
+        elif detected["format"] not in formats:
+            if verbose:
+                logging.warning(f"> Unknown format `{detected['format']}` in analysis")
+            return False, None, None
     try:
         if previous_analysis.get("separator"):
             # loading the table in chunks
@@ -58,77 +70,94 @@ def validate(
             ]
         )
         analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
-        first_chunk = next(chunks)
         analysis.update(
             {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-            return False, None, None
+        return False, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
-        logging.info("- Checking if all columns match")
-    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
-        list(first_chunk.columns)[k] != previous_analysis["header"][k]
-        for k in range(len(previous_analysis["header"]))
-    ):
-        if verbose:
-            logging.warning("> Columns do not match, proceeding with full analysis")
-        return False, None, None, None
-    if verbose:
         logging.info(
             f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
         )
 
-    #
-
-
-
+    # will contain hashes of each row of the file as index and the number of times
+    # each hash was seen as values; used to compute nb_duplicates
+    row_hashes_count = pd.Series()
+    # will contain the number of times each value of each column is seen in the whole file
+    # used for profile to read the file only once
+    # naming it "count" to be iso with how col_values are made in detect_formats
+    col_values: defaultdict[str, pd.Series] = defaultdict(lambda: pd.Series(name="count"))
     analysis["total_lines"] = 0
-
+    checked_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
+    valid_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
+    for idx, chunk in enumerate(chunks):
         if verbose:
-            logging.info(f"
+            logging.info(f"- Testing chunk number {idx}")
+        if idx == 0:
+            if verbose:
+                logging.info("Checking if all columns match")
+            if len(chunk.columns) != len(previous_analysis["header"]) or any(
+                list(chunk.columns)[k] != previous_analysis["header"][k]
+                for k in range(len(previous_analysis["header"]))
+            ):
+                if verbose:
+                    logging.warning("> Columns in the file do not match those of the analysis")
+                return False, None, None
        analysis["total_lines"] += len(chunk)
         row_hashes_count = row_hashes_count.add(
             pd.util.hash_pandas_object(chunk, index=False).value_counts(),
             fill_value=0,
         )
-        for col in chunk.columns:
-            col_values[col] = col_values[col].add(
-                chunk[col].value_counts(dropna=False),
-                fill_value=0,
-            )
         for col_name, detected in previous_analysis["columns"].items():
             if verbose:
                 logging.info(f"- Testing {col_name} for {detected['format']}")
             if detected["format"] == "string":
                 # no test for columns that have not been recognized as a specific format
                 continue
-
+            to_check = chunk[col_name].dropna() if skipna else chunk[col_name]
+            chunk_valid_values = sum(to_check.apply(formats[detected["format"]].func))
+            if formats[detected["format"]].proportion == 1 and chunk_valid_values < len(to_check):
+                # we can early stop in this case, not all values are valid while we want 100%
                 if verbose:
                     logging.warning(
-                        f">
+                        f"> Test failed for column {col_name} with format {detected['format']}"
                     )
-                return False,
-
-
-
-
-
-
-
-
-
+                return False, None, None
+            checked_values[col_name] += len(to_check)
+            valid_values[col_name] += chunk_valid_values
+            col_values[col_name] = (
+                col_values[col_name]
+                .add(
+                    chunk[col_name].value_counts(dropna=False),
+                    fill_value=0,
+                )
+                .rename_axis(col_name)
+            )  # rename_axis because *sometimes* pandas doesn't pass on the column's name ¯\_(ツ)_/¯
+        del chunk
+    # finally we loop through the formats that accept less than 100% valid values to check the proportion
+    for col_name, detected in previous_analysis["columns"].items():
+        if (
+            checked_values[col_name] > 0
+            and valid_values[col_name] / checked_values[col_name]
+            < formats[detected["format"]].proportion
+        ):
+            if verbose:
+                logging.warning(
+                    f"> Test failed for column {col_name} with format {detected['format']}"
+                )
+            return False, None, None
     if verbose:
         logging.info("> All checks successful")
     analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    del row_hashes_count
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
     return (
         True,
-        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
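In short, the rewritten loop keeps two counters per column (values checked, values that passed the format's predicate), stops early when a 100%-proportion format sees any invalid value, and otherwise only compares the overall ratio to the format's required proportion once every chunk has been read. A self-contained sketch of that bookkeeping (simplified to one column and plain pandas, not the module's actual code):

import pandas as pd

def column_passes(chunks, predicate, required_proportion: float, skipna: bool = True) -> bool:
    checked, valid = 0, 0
    for chunk in chunks:
        to_check = chunk["col"].dropna() if skipna else chunk["col"]
        chunk_valid = int(to_check.apply(predicate).sum())
        if required_proportion == 1 and chunk_valid < len(to_check):
            # early stop: one bad value is enough to fail a format that requires 100%
            return False
        checked += len(to_check)
        valid += chunk_valid
    # formats that tolerate some invalid values are only judged on the whole file
    return checked == 0 or valid / checked >= required_proportion

chunks = [pd.DataFrame({"col": ["1"] * 85 + ["x"] * 15})]  # 85% valid values
print(column_passes(chunks, str.isdigit, 0.8))  # True
print(column_passes(chunks, str.isdigit, 0.9))  # False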
{csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
 csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/format.py,sha256=
+csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
+csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=
+csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
@@ -76,24 +76,24 @@ csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7p
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
 csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
-csv_detective/output/profile.py,sha256=
+csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XNNM,4984
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=
+csv_detective/parsing/columns.py,sha256=H_dKHhSgQMIiOfxibnGl6HwTW9bRwGtIeUcYBN13-3A,9245
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
-csv_detective/parsing/excel.py,sha256=
-csv_detective/parsing/load.py,sha256=
+csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
+csv_detective/parsing/load.py,sha256=pZ9ub47s0GO39F5-0D7KZhWQRAjMg8L8ljqDIRDjWg8,4463
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.
+csv_detective-0.10.3.dev5.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
 tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
-tests/test_validation.py,sha256=
+tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
 tests/data/a_test_file.csv,sha256=SOHjseGYqZer9yu3Bd3oS12Vw8MFsebo0BzrLZ_R4Cc,68871
 tests/data/a_test_file.json,sha256=fB9bCpAMFPxFw8KxHRFlgRqjYG819QVGrCQWxQvwkvo,10542
 tests/data/b_test_file.csv,sha256=wJGX62KhYjZi62De2XjZWClAzeRFEBsg3ET0IPX1BNU,98
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.
-csv_detective-0.10.3.
-csv_detective-0.10.3.
-csv_detective-0.10.3.
-csv_detective-0.10.3.
+csv_detective-0.10.3.dev5.dist-info/METADATA,sha256=TxQwNe_bPUxntht3aTRh0ct8I8J8NJOLv2ysXjpPLxA,11082
+csv_detective-0.10.3.dev5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.3.dev5.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev5.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev5.dist-info/RECORD,,
tests/test_validation.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from unittest.mock import MagicMock, patch
 
 import pandas as pd
 import pytest
@@ -26,12 +27,12 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
 @pytest.mark.parametrize(
     "_params",
     (
-        ((True,
-        ((False, None
-        ((False, None
-        ((False, None
+        ((True, dict), {}),
+        ((False, None), {"separator": "|"}),
+        ((False, None), {"encoding": "unknown"}),
+        ((False, None), {"header": ["a", "b"]}),
         (
-            (False,
+            (False, None),
             {
                 "columns.NUMCOM": {
                     "python_type": "int",
@@ -43,35 +44,89 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
     ),
 )
 def test_validation(_params):
-    (should_be_valid,
+    (should_be_valid, analysis_type), modif_previous_analysis = _params
     with open("tests/data/a_test_file.json", "r") as f:
         previous_analysis = json.load(f)
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid,
+    is_valid, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
     )
     assert is_valid == should_be_valid
-    if table_type is None:
-        assert table is None
-    else:
-        assert isinstance(table, table_type)
     if analysis_type is None:
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
     if should_be_valid:
         assert isinstance(col_values, dict)
-        assert all(
-            col in table.columns and isinstance(values, pd.Series)
-            for col, values in col_values.items()
-        )
     else:
         assert col_values is None
 
 
+@pytest.mark.parametrize(
+    "_params",
+    (
+        # int: proportion = 1, should fail (early)
+        ("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
+        # siren: proportion = 0.9, should fail (later)
+        (
+            "130025265",
+            "A13794BC",
+            {"python_type": "string", "format": "siren", "score": 1.5},
+            False,
+        ),
+        # siret: proportion = 0.8, should succeed
+        (
+            "13002526500013",
+            "A13794BC",
+            {"python_type": "string", "format": "siret", "score": 1.5},
+            True,
+        ),
+    ),
+)
+def test_validation_with_proportions(_params):
+    # testing the behaviour for a file that has 15% invalid values, but all in a single chunk
+    valid_value, invalid_value, detected, should_be_valid = _params
+    url = f"http://example.com/test.csv"
+    expected_content = "col\n"
+    for _ in range(60):
+        # 60 rows of valid values
+        expected_content += f"{valid_value}\n"
+    for _ in range(15):
+        # 15 rows of invalid values
+        expected_content += f"{invalid_value}\n"
+    for _ in range(25):
+        # 25 rows of valid values
+        expected_content += f"{valid_value}\n"
+    previous_analysis = {
+        "encoding": "utf-8",
+        "separator": ",",
+        "header_row_idx": 0,
+        "header": ["col"],
+        "columns": {"col": detected},
+        # just setting these keys when validation is successful, they're not used for the validation itself
+        "categorical": [],
+        "columns_fields": {},
+        "columns_labels": {},
+        "formats": {},
+    }
+    with (
+        patch("urllib.request.urlopen") as mock_urlopen,
+        patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
+    ):
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        is_valid, *_ = validate(
+            file_path=url,
+            previous_analysis=previous_analysis,
+        )
+        assert is_valid == should_be_valid
+
+
 @pytest.mark.parametrize(
     "modif_previous_analysis",
     (
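The new test builds a 100-row file (60 valid, 15 invalid, then 25 valid rows), serves it through a mocked `urllib.request.urlopen`, and shrinks `VALIDATION_CHUNK_SIZE` to 10 so the file is read in several chunks with the invalid rows grouped together. The expected outcomes follow from the proportions quoted in the test comments (illustrative arithmetic only):

valid, total = 60 + 25, 60 + 15 + 25
ratio = valid / total      # 0.85
assert ratio >= 0.8        # siret tolerates 20% invalid values -> validation succeeds
assert ratio < 0.9         # siren tolerates only 10% -> validation fails once all chunks are read
# int requires 100% valid values, so the first chunk containing an invalid value fails it (early stop)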
{csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/WHEEL
RENAMED
File without changes
{csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/entry_points.txt
RENAMED
File without changes
{csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/licenses/LICENSE
RENAMED
File without changes
{csv_detective-0.10.3.dev4.dist-info → csv_detective-0.10.3.dev5.dist-info}/top_level.txt
RENAMED
File without changes