csv-detective 0.10.3__py3-none-any.whl → 0.10.3.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/headers.py +14 -12
- csv_detective/detection/rows.py +1 -1
- csv_detective/explore_csv.py +4 -15
- csv_detective/format.py +1 -1
- csv_detective/output/dataframe.py +2 -2
- csv_detective/output/profile.py +1 -1
- csv_detective/parsing/columns.py +1 -1
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +11 -8
- csv_detective/validate.py +37 -66
- {csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/METADATA +5 -1
- {csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/RECORD +19 -19
- {csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/WHEEL +1 -1
- tests/test_fields.py +1 -7
- tests/test_file.py +6 -26
- tests/test_validation.py +15 -70
- {csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/top_level.txt +0 -0

csv_detective/detection/headers.py
CHANGED

@@ -5,22 +5,24 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting
+        logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-
-
-
-
-
-
-
-
-
-
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
+            next_row = file.readline()
+            file.seek(position)
+            if row != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f"Detected headers in {round(time() - start, 3)}s",
+                        time() - start,
+                    )
+                return i, headers
+    raise ValueError("Could not retrieve headers")
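For orientation, a minimal sketch of how the rebuilt detect_headers behaves, based only on the signature and logic in the hunk above (the import path is taken from the import added in csv_detective/parsing/load.py further down; the sample data is made up):

    import io

    from csv_detective.detection.headers import detect_headers

    # two distinct lines, and the first one has no empty cell: it is taken as the header
    buf = io.StringIO("id;name\n1;a\n2;b\n")
    row_idx, headers = detect_headers(buf, sep=";")
    assert row_idx == 0 and headers == ["id", "name"]
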
csv_detective/detection/rows.py
CHANGED

@@ -2,7 +2,7 @@ import pandas as pd
 
 
 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to
+    """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):

csv_detective/explore_csv.py
CHANGED

@@ -142,19 +142,20 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")
 
-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if
-        # if loading failed in validate, we load it from scratch
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
+    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,

@@ -164,18 +165,6 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
-    else:
-        # successful validation means we have a correct analysis and col_values
-        # only need to reload the table, and we already know how
-        table, _ = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            verbose=verbose,
-            sep=analysis.get("separator"),
-            encoding=analysis.get("encoding"),
-            engine=analysis.get("engine"),
-            sheet_name=analysis.get("sheet_name"),
-        )
     try:
         return generate_output(
             table=table,
csv_detective/format.py
CHANGED

@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable
+        self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion

csv_detective/output/dataframe.py
CHANGED

@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
 
 
 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or value
-    #
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
         return None
     match _type:
         case "string":
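A small illustration of the corrected guard in cast; the expected outcomes follow the guard above and the parametrized cases kept in tests/test_fields.py further down (sketch only, not package code):

    from datetime import date

    from csv_detective.output.dataframe import cast

    assert isinstance(cast("2022-08-01", "date"), date)  # regular values are still converted
    assert cast("", "date") is None    # empty strings now short-circuit to None
    assert cast(None, "int") is None   # non-string input is also mapped to None
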
csv_detective/output/profile.py
CHANGED

@@ -23,7 +23,7 @@ def create_profile(
     logging.info("Creating profile")
 
     if num_rows > 0:
-        raise ValueError("To create
+        raise ValueError("To create profiles num_rows has to be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}

csv_detective/parsing/columns.py
CHANGED
csv_detective/parsing/excel.py
CHANGED

@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str |
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:

csv_detective/parsing/load.py
CHANGED

@@ -11,7 +11,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import
+from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv

@@ -28,12 +28,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
-    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-
-
+    engine = None
+    if "." not in file_name or not file_name.endswith("csv"):
+        # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)
 
     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):

@@ -46,6 +46,9 @@ def load_file(
         )
         if table.empty:
             raise ValueError("Table seems to be empty")
+        header = table.columns.to_list()
+        if any(col.startswith("Unnamed") for col in header):
+            raise ValueError("Could not retrieve headers")
         analysis = {
             "engine": engine,
             "sheet_name": sheet_name,

@@ -80,7 +83,9 @@ def load_file(
         str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx =
+        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(

@@ -97,11 +102,9 @@ def load_file(
         }
         if engine is not None:
             analysis["compression"] = engine
-    if any(not isinstance(col, str) or col.startswith("Unnamed:") for col in table.columns):
-        raise ValueError("Could not accurately detect the file's columns")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header":
+        "header": header,
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines
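A hedged sketch of how the stricter header handling surfaces to callers of load_file (keyword arguments per the signature in the hunks above; the file path is a placeholder):

    from csv_detective.parsing.load import load_file

    try:
        table, analysis = load_file(file_path="data/my_file.csv", num_rows=-1, verbose=True)
    except ValueError as exc:
        # raised when the table is empty or when headers cannot be retrieved, e.g. an
        # "Unnamed" excel column or a None header coming out of detect_headers
        print(f"could not load file: {exc}")
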
csv_detective/validate.py
CHANGED

@@ -1,13 +1,10 @@
 import logging
-from collections import defaultdict
 
 import pandas as pd
 
 from csv_detective.format import FormatsManager
 from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
 
-# VALIDATION_CHUNK_SIZE is bigger than (analysis) CHUNK_SIZE because
-# it's faster to validate so we can afford to load more rows
 VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)
 

@@ -19,9 +16,9 @@ def validate(
     previous_analysis: dict,
     verbose: bool = False,
     skipna: bool = True,
-) -> tuple[bool, dict | None, dict[str, pd.Series] | None]:
+) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and
+    Verify is the given file has the same fields and types as in the given analysis.
 
     Args:
         file_path: the path of the file to validate

@@ -29,15 +26,6 @@ def validate(
         verbose: whether the code displays the steps it's going through
         skipna: whether to ignore NaN values in the checks
     """
-    if verbose:
-        logging.info(f"Checking given formats exist")
-    for col_name, detected in previous_analysis["columns"].items():
-        if detected["format"] == "string":
-            continue
-        elif detected["format"] not in formats:
-            if verbose:
-                logging.warning(f"> Unknown format `{detected['format']}` in analysis")
-            return False, None, None
     try:
         if previous_analysis.get("separator"):
             # loading the table in chunks

@@ -70,94 +58,77 @@ def validate(
             ]
         )
         analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+        first_chunk = next(chunks)
         analysis.update(
             {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-        return False, None, None
+        return False, None, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
+        logging.info("- Checking if all columns match")
+    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+        list(first_chunk.columns)[k] != previous_analysis["header"][k]
+        for k in range(len(previous_analysis["header"]))
+    ):
+        if verbose:
+            logging.warning("> Columns do not match, proceeding with full analysis")
+        return False, None, None, None
+    if verbose:
         logging.info(
             f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
         )
 
-    #
-
-
-
-    # used for profile to read the file only once
-    # naming it "count" to be iso with how col_values are made in detect_formats
-    col_values: defaultdict[str, pd.Series] = defaultdict(lambda: pd.Series(name="count"))
+    # hashing rows to get nb_duplicates
+    row_hashes_count = pd.util.hash_pandas_object(first_chunk, index=False).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
    analysis["total_lines"] = 0
-
-    valid_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
-    for idx, chunk in enumerate(chunks):
+    for idx, chunk in enumerate([first_chunk, *chunks]):
        if verbose:
-            logging.info(f"
-        if idx == 0:
-            if verbose:
-                logging.info("Checking if all columns match")
-            if len(chunk.columns) != len(previous_analysis["header"]) or any(
-                list(chunk.columns)[k] != previous_analysis["header"][k]
-                for k in range(len(previous_analysis["header"]))
-            ):
-                if verbose:
-                    logging.warning("> Columns in the file do not match those of the analysis")
-                return False, None, None
+            logging.info(f"> Testing chunk number {idx}")
        analysis["total_lines"] += len(chunk)
        row_hashes_count = row_hashes_count.add(
            pd.util.hash_pandas_object(chunk, index=False).value_counts(),
            fill_value=0,
        )
+        for col in chunk.columns:
+            col_values[col] = col_values[col].add(
+                chunk[col].value_counts(dropna=False),
+                fill_value=0,
+            )
        for col_name, detected in previous_analysis["columns"].items():
            if verbose:
                logging.info(f"- Testing {col_name} for {detected['format']}")
            if detected["format"] == "string":
                # no test for columns that have not been recognized as a specific format
                continue
-
-            chunk_valid_values = sum(to_check.apply(formats[detected["format"]].func))
-            if formats[detected["format"]].proportion == 1 and chunk_valid_values < len(to_check):
-                # we can early stop in this case, not all values are valid while we want 100%
+            if detected["format"] not in formats:
                if verbose:
                    logging.warning(
-                        f">
+                        f"> Unknown format `{detected['format']}`, proceeding with full analysis"
                    )
-                return False,
-
-
-
-
-
-
-
-
-
-            ) # rename_axis because *sometimes* pandas doesn't pass on the column's name ¯\_(ツ)_/¯
-        del chunk
-    # finally we loop through the formats that accept less than 100% valid values to check the proportion
-    for col_name, detected in previous_analysis["columns"].items():
-        if (
-            checked_values[col_name] > 0
-            and valid_values[col_name] / checked_values[col_name]
-            < formats[detected["format"]].proportion
-        ):
-            if verbose:
-                logging.warning(
-                    f"> Test failed for column {col_name} with format {detected['format']}"
-                )
-            return False, None, None
+                return False, first_chunk, analysis, None
+            test_result: float = test_col_val(
+                serie=chunk[col_name],
+                format=formats[detected["format"]],
+                skipna=skipna,
+            )
+            if not bool(test_result):
+                if verbose:
+                    logging.warning("> Test failed, proceeding with full analysis")
+                return False, first_chunk, analysis, None
    if verbose:
        logging.info("> All checks successful")
    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
-    del row_hashes_count
    analysis["categorical"] = [
        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
    ]
    return (
        True,
+        first_chunk,
        analysis
        | {
            k: previous_analysis[k]
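To make the new contract concrete, an illustrative call (return shapes follow the signature above and the expectations encoded in tests/test_validation.py below; file paths are placeholders):

    import json

    from csv_detective.validate import validate

    with open("previous_analysis.json") as f:
        previous_analysis = json.load(f)

    is_valid, table, analysis, col_values = validate(
        "data/my_file.csv",
        previous_analysis=previous_analysis,
    )
    if is_valid:
        # table is the first validated chunk, analysis the refreshed metadata,
        # col_values the per-column value counts reused when building the profile
        pass
    elif analysis is None:
        # the file could not be loaded with the previous analysis values,
        # or its columns do not match the stored header
        pass
    else:
        # columns matched but a format check failed: a full detection is needed
        pass
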
{csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.3
+Version: 0.10.3.dev2
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT

@@ -23,6 +23,10 @@ Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr>=3.2.2
 Requires-Dist: more-itertools>=10.8.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.3.0; extra == "dev"
+Requires-Dist: responses>=0.25.0; extra == "dev"
+Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file
 
 # CSV Detective
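The dev extra declared above can presumably be installed the usual way, e.g. pip install "csv-detective[dev]", pulling in pytest, responses and ruff for development.
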
{csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/RECORD
CHANGED

@@ -1,16 +1,16 @@
 csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=
-csv_detective/format.py,sha256=
+csv_detective/explore_csv.py,sha256=qSf6N3tbp43BUMJF5wiXz3aYKaTez6ro-75KL2Arci4,7174
+csv_detective/format.py,sha256=VglcxWBmjTvWNMhwSUZDfMdJcK9lAUum64Jxvm70AJ4,2898
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=
+csv_detective/validate.py,sha256=QBJhwHP0U0Ux7ODGV6foqNGm-DlbECIo6jUsBFOdDr0,5739
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
-csv_detective/detection/headers.py,sha256=
-csv_detective/detection/rows.py,sha256=
+csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
+csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318

@@ -74,26 +74,26 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
 csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
 csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
-csv_detective/output/dataframe.py,sha256=
+csv_detective/output/dataframe.py,sha256=QX5vplx0AOKgnwwJ6dKvDHWRX9IGPStax-svXEyweJ8,3584
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
-csv_detective/output/profile.py,sha256=
+csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=
+csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcOl8I,9257
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
-csv_detective/parsing/excel.py,sha256=
-csv_detective/parsing/load.py,sha256=
+csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
+csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.10.3.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=EuD2F1JUR8y88Hm-AYuJ5X7AKkGSyLIQfsGdxYgIWng,5680
+tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
-tests/test_validation.py,sha256=
+tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
 tests/data/a_test_file.csv,sha256=SOHjseGYqZer9yu3Bd3oS12Vw8MFsebo0BzrLZ_R4Cc,68871
 tests/data/a_test_file.json,sha256=fB9bCpAMFPxFw8KxHRFlgRqjYG819QVGrCQWxQvwkvo,10542
 tests/data/b_test_file.csv,sha256=wJGX62KhYjZi62De2XjZWClAzeRFEBsg3ET0IPX1BNU,98

@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.dist-info/METADATA,sha256=
-csv_detective-0.10.3.dist-info/WHEEL,sha256=
-csv_detective-0.10.3.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.10.3.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
-csv_detective-0.10.3.dist-info/RECORD,,
+csv_detective-0.10.3.dev2.dist-info/METADATA,sha256=QhAD5N5OZx1L_9ajLLuEjhSYSz6q05eAEwVd6_kDPFc,11082
+csv_detective-0.10.3.dev2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+csv_detective-0.10.3.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev2.dist-info/RECORD,,

tests/test_fields.py
CHANGED

@@ -104,17 +104,11 @@ def test_fields_with_values(args):
         ("2022-08-01", "date", _date),
         ("2024-09-23 17:32:07", "datetime", _datetime),
         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
-        ("N/A", "int", None),
-        ("nan", "bool", None),
-        ("", "date", None), # all NaN-like values should be cast as None for all type
     ),
 )
 def test_cast(args):
     value, detected_type, cast_type = args
-
-        assert cast(value, detected_type) is None
-    else:
-        assert isinstance(cast(value, detected_type), cast_type)
+    assert isinstance(cast(value, detected_type), cast_type)
 
 
 @pytest.mark.parametrize(

tests/test_file.py
CHANGED

@@ -9,12 +9,6 @@ from csv_detective.output.profile import create_profile
 from csv_detective.parsing.csv import CHUNK_SIZE
 
 
-@pytest.fixture
-def mocked_responses():
-    with responses.RequestsMock() as rsps:
-        yield rsps
-
-
 @pytest.mark.parametrize(
     "chunk_size",
     (100, 404, int(1e5)),

@@ -171,26 +165,6 @@ def test_exception_different_number_of_columns():
     )
 
 
-def test_exception_malformed_columns(mocked_responses):
-    """
-    A ValueError should be raised if any column is Unnamed
-    """
-    url = f"http://example.com/bad_cols.csv"
-    expected_content = b"col1,col2,\n1,2,\n3,4,"
-    mocked_responses.get(
-        url,
-        body=expected_content,
-        status=200,
-    )
-    with patch("urllib.request.urlopen") as mock_urlopen:
-        mock_response = MagicMock()
-        mock_response.read.return_value = expected_content
-        mock_response.__enter__.return_value = mock_response
-        mock_urlopen.return_value = mock_response
-        with pytest.raises(ValueError):
-            routine(file_path=url)
-
-
 def test_code_dep_reg_on_file():
     output = routine(
         file_path="tests/data/b_test_file.csv",

@@ -263,6 +237,12 @@ def test_non_csv_files(params):
         assert _[k] == v
 
 
+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps
+
+
 @pytest.mark.parametrize(
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib

tests/test_validation.py
CHANGED

@@ -1,5 +1,4 @@
 import json
-from unittest.mock import MagicMock, patch
 
 import pandas as pd
 import pytest

@@ -27,12 +26,12 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
 @pytest.mark.parametrize(
     "_params",
     (
-        ((True, dict), {}),
-        ((False, None), {"separator": "|"}),
-        ((False, None), {"encoding": "unknown"}),
-        ((False, None), {"header": ["a", "b"]}),
+        ((True, pd.DataFrame, dict), {}),
+        ((False, None, None), {"separator": "|"}),
+        ((False, None, None), {"encoding": "unknown"}),
+        ((False, None, None), {"header": ["a", "b"]}),
         (
-            (False,
+            (False, pd.DataFrame, dict),
             {
                 "columns.NUMCOM": {
                     "python_type": "int",

@@ -44,89 +43,35 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
     ),
 )
 def test_validation(_params):
-    (should_be_valid, analysis_type), modif_previous_analysis = _params
+    (should_be_valid, table_type, analysis_type), modif_previous_analysis = _params
     with open("tests/data/a_test_file.json", "r") as f:
         previous_analysis = json.load(f)
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
     )
     assert is_valid == should_be_valid
+    if table_type is None:
+        assert table is None
+    else:
+        assert isinstance(table, table_type)
     if analysis_type is None:
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
     if should_be_valid:
         assert isinstance(col_values, dict)
+        assert all(
+            col in table.columns and isinstance(values, pd.Series)
+            for col, values in col_values.items()
+        )
     else:
         assert col_values is None
 
 
-@pytest.mark.parametrize(
-    "_params",
-    (
-        # int: proportion = 1, should fail (early)
-        ("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
-        # siren: proportion = 0.9, should fail (later)
-        (
-            "130025265",
-            "A13794BC",
-            {"python_type": "string", "format": "siren", "score": 1.5},
-            False,
-        ),
-        # siret: proportion = 0.8, should succeed
-        (
-            "13002526500013",
-            "A13794BC",
-            {"python_type": "string", "format": "siret", "score": 1.5},
-            True,
-        ),
-    ),
-)
-def test_validation_with_proportions(_params):
-    # testing the behaviour for a file that has 15% invalid values, but all in a single chunk
-    valid_value, invalid_value, detected, should_be_valid = _params
-    url = f"http://example.com/test.csv"
-    expected_content = "col\n"
-    for _ in range(60):
-        # 60 rows of valid values
-        expected_content += f"{valid_value}\n"
-    for _ in range(15):
-        # 15 rows of invalid values
-        expected_content += f"{invalid_value}\n"
-    for _ in range(25):
-        # 25 rows of valid values
-        expected_content += f"{valid_value}\n"
-    previous_analysis = {
-        "encoding": "utf-8",
-        "separator": ",",
-        "header_row_idx": 0,
-        "header": ["col"],
-        "columns": {"col": detected},
-        # just setting these keys when validation is successful, they're not used for the validation itself
-        "categorical": [],
-        "columns_fields": {},
-        "columns_labels": {},
-        "formats": {},
-    }
-    with (
-        patch("urllib.request.urlopen") as mock_urlopen,
-        patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
-    ):
-        mock_response = MagicMock()
-        mock_response.read.return_value = expected_content.encode("utf-8")
-        mock_response.__enter__.return_value = mock_response
-        mock_urlopen.return_value = mock_response
-        is_valid, *_ = validate(
-            file_path=url,
-            previous_analysis=previous_analysis,
-        )
-        assert is_valid == should_be_valid
-
-
 @pytest.mark.parametrize(
     "modif_previous_analysis",
     (

{csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/entry_points.txt
File without changes

{csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/licenses/LICENSE
File without changes

{csv_detective-0.10.3.dist-info → csv_detective-0.10.3.dev2.dist-info}/top_level.txt
File without changes