csv-detective 0.7.5.dev1277__py3-none-any.whl → 0.7.5.dev1298__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/__init__.py +6 -4
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +7 -7
- csv_detective/detect_fields/other/float/__init__.py +4 -4
- csv_detective/detect_fields/other/money/__init__.py +11 -0
- csv_detective/detect_fields/other/percent/__init__.py +9 -0
- csv_detective/detection/formats.py +145 -0
- csv_detective/explore_csv.py +94 -222
- csv_detective/load_tests.py +62 -0
- csv_detective/output/__init__.py +64 -0
- csv_detective/output/dataframe.py +0 -0
- csv_detective/output/example.py +77 -77
- csv_detective/output/profile.py +0 -0
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/utils.py +2 -0
- csv_detective/validate.py +70 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/RECORD +27 -20
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/WHEEL +1 -1
- tests/test_example.py +10 -10
- tests/test_fields.py +270 -415
- tests/test_file.py +19 -9
- tests/test_structure.py +6 -0
- tests/test_validation.py +18 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/top_level.txt +0 -0

csv_detective/load_tests.py ADDED
@@ -0,0 +1,62 @@
+import os
+from typing import Union
+
+# flake8: noqa
+from csv_detective import detect_fields, detect_labels
+
+
+def get_all_packages(detect_type) -> list:
+    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
+    modules = []
+    for dirpath, _, filenames in os.walk(root_dir):
+        for filename in filenames:
+            file = os.path.join(dirpath, filename).replace(root_dir, "")
+            if file.endswith("__init__.py"):
+                module = (
+                    file.replace("__init__.py", "")
+                    .replace("/", ".").replace("\\", ".")[:-1]
+                )
+                if module:
+                    modules.append(detect_type + module)
+    return modules
+
+
+def return_all_tests(
+    user_input_tests: Union[str, list],
+    detect_type: str,
+) -> list:
+    """
+    returns all tests that have a method _is and are listed in the user_input_tests
+    the function can select a sub_package from csv_detective
+    user_input_tests may look like this:
+    - "ALL": all possible tests are made
+    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
+      this specifc (group of) test(s) only
+    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
+      specific (groups of) tests by add "-" at the start (e.g "-temp.date")
+    """
+    assert detect_type in ["detect_fields", "detect_labels"]
+    all_packages = get_all_packages(detect_type=detect_type)
+
+    if isinstance(user_input_tests, str):
+        user_input_tests = [user_input_tests]
+    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
+        tests_to_do = [detect_type]
+    else:
+        tests_to_do = [
+            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
+        ]
+    tests_skipped = [
+        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
+    ]
+    all_tests = [
+        # this is why we need to import detect_fields/labels
+        eval(x) for x in all_packages
+        if any([y == x[: len(y)] for y in tests_to_do])
+        and all([y != x[: len(y)] for y in tests_skipped])
+    ]
+    # to remove groups of tests
+    all_tests = [
+        test for test in all_tests if "_is" in dir(test)
+    ]
+    return all_tests
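For orientation: return_all_tests walks the chosen detection package, turns each __init__.py into a dotted module path, filters against the user selection (a "-" prefix skips a test or group), and keeps only modules that expose an _is method. A minimal usage sketch, assuming the package layout shown above (the sample value passed to _is is illustrative):

    from csv_detective.load_tests import return_all_tests

    # every field test except the temp.date group
    selected = return_all_tests(["ALL", "-temp.date"], detect_type="detect_fields")
    for module in selected:
        # each selected module exposes an _is(value) predicate
        print(module.__name__, module._is("42,5"))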
csv_detective/output/__init__.py ADDED
@@ -0,0 +1,64 @@
+import json
+import os
+from typing import Union
+
+import pandas as pd
+
+from csv_detective.utils import is_url
+from .dataframe import cast_df
+from .profile import create_profile
+from .schema import generate_table_schema
+
+
+def generate_output(
+    table: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    num_rows: int = 500,
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+) -> Union[dict, tuple[dict, pd.DataFrame]]:
+
+    if output_profile:
+        analysis["profile"] = create_profile(
+            table=table,
+            dict_cols_fields=analysis["columns"],
+            num_rows=num_rows,
+            limited_output=limited_output,
+            verbose=verbose,
+        )
+
+    if save_results:
+        if isinstance(save_results, str):
+            output_path = save_results
+        else:
+            output_path = os.path.splitext(file_path)[0]
+        if is_url(output_path):
+            output_path = output_path.split('/')[-1]
+        if analysis.get("sheet_name"):
+            output_path += "_sheet-" + str(sheet_name)
+        output_path += ".json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+
+    if output_schema:
+        analysis["schema"] = generate_table_schema(
+            analysis,
+            save_file=False,
+            verbose=verbose
+        )
+
+    if output_df:
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    return analysis
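In short, generate_output optionally attaches a profile and a table schema to the analysis, dumps it as JSON next to the input file (or at the path given via save_results, with a "_sheet-<name>" suffix for spreadsheet tabs), and can also return the typed dataframe. A minimal sketch, assuming an analysis dict shaped like the one the detection step produces (the "columns" entry here is a hypothetical minimal form):

    import pandas as pd

    from csv_detective.output import generate_output

    table = pd.DataFrame({"score": ["0.5", "0.7"]})
    analysis = {"columns": {"score": {"format": "float"}}}  # hypothetical minimal shape

    analysis, typed_df = generate_output(
        table,
        analysis,
        file_path="data.csv",
        save_results=False,  # skip writing data.json next to the input
        output_df=True,      # also return the dataframe casted by cast_df
    )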
csv_detective/output/dataframe.py CHANGED
File without changes

csv_detective/output/example.py CHANGED
@@ -17,12 +17,12 @@ def create_example_csv_file(
     fields: Optional[dict] = None,
     schema_path: Optional[str] = None,
     file_length: int = 10,
-    output_name: str =
-    output_sep: str =
-    encoding: str =
+    output_name: Optional[str] = "example_file.csv",
+    output_sep: str = ";",
+    encoding: str = "utf-8",
     ignore_required: bool = False,
 ) -> pd.DataFrame:
-
+    """
     Create an example file based on a list of dicts like follows:
     fields = [
         {
@@ -33,7 +33,7 @@ def create_example_csv_file(
         ...
     ]
     Or from a TableSchema
-
+    """
     # need to make a CLI command
 
     if not (fields or schema_path):
@@ -53,65 +53,65 @@ def create_example_csv_file(
         enum: Optional[str] = None,
     ) -> str:
         if potential_skip(required):
-            return
+            return ""
         if pattern is not None:
             return rstr.xeger(pattern)
         elif enum is not None:
             return random.choice(enum)
         else:
             letters = string.ascii_lowercase
-            return
+            return "".join(random.choice(letters) for i in range(length))
 
     def _id(
         required: bool = True,
     ) -> str:
         if potential_skip(required):
-            return
+            return ""
         return str(uuid.uuid4())
 
     def _date(
         date_range: Union[None, list[str]] = None,
-        format: str =
+        format: str = "%Y-%m-%d",
         required: bool = True,
     ) -> str:
         # the bounds specified in date_range are expected in the same format as the desired output format
-        assert all([k in format for k in [
+        assert all([k in format for k in ["%d", "%m", "%Y"]])
         if potential_skip(required):
-            return
+            return ""
         if date_range is None:
             return fake.date(format)
         else:
             if len(date_range) != 2:
-                raise ValueError('
+                raise ValueError("'date_range' must have exactly two elements.")
             return fake.date_between_dates(
                 datetime.strptime(date_range[0], format),
                 datetime.strptime(date_range[1], format),
             ).strftime(format)
 
     def _time(
-        format: str =
+        format: str = "%H:%M:%S",
         required: bool = True,
     ) -> str:
-        assert all([k in format for k in [
+        assert all([k in format for k in ["%H", "%M", "%S"]])
         if potential_skip(required):
-            return
+            return ""
         # maybe add a time_range argument?
         return fake.time(format)
 
     def _datetime(
         datetime_range: Optional[list[str]] = None,
-        format: str =
+        format: str = "%Y-%m-%d %H-%M-%S",
         required: bool = True,
     ) -> str:
         # the bounds specified in datetime_range are expected in the same format as the desired output format
-        assert all([k in format for k in [
+        assert all([k in format for k in ["%d", "%m", "%Y", "%H", "%M", "%S"]])
         if potential_skip(required):
-            return
+            return ""
         if datetime_range is None:
             return fake.date_time().strftime(format)
         else:
             if len(datetime_range) != 2:
-                raise ValueError('
+                raise ValueError("'date_range' must have exactly two elements.")
             return fake.date_time_between(
                 datetime.strptime(datetime_range[0], format),
                 datetime.strptime(datetime_range[1], format),
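potential_skip itself is defined outside these hunks, so its body is not part of this diff; from the call sites it decides whether a non-required field should be left blank for a given row. A hypothetical stand-in consistent with how it is called:

    import random

    def potential_skip(required: bool, skip_rate: float = 0.2) -> bool:
        # hypothetical: never skip a required field; blank an optional one
        # with some probability so example files contain gaps
        return not required and random.random() < skip_rate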
@@ -119,8 +119,8 @@ def create_example_csv_file(
 
     def _url(required: bool = True) -> str:
         if potential_skip(required):
-            return
-        return f
+            return ""
+        return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
 
     def _number(
         num_type: Type[Union[int, float]] = int,
@@ -130,7 +130,7 @@ def create_example_csv_file(
     ) -> Union[int, float]:
         assert num_range is None or len(num_range) == 2
         if potential_skip(required):
-            return
+            return ""
         if enum:
             return random.choice(enum)
         if num_range is None:
@@ -142,100 +142,100 @@ def create_example_csv_file(
 
     def _bool(required: bool = True) -> bool:
         if potential_skip(required):
-            return
+            return ""
         return random.randint(0, 1) == 0
 
     def _array(enum: list[Any], required: bool = True) -> str:
         if potential_skip(required):
-            return
+            return ""
         return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
 
     def build_args_from_constraints(constraints: dict) -> dict:
         args = {}
-        args[
-        for _ in [
+        args["required"] = constraints.get("required", False)
+        for _ in ["pattern", "enum", "format"]:
             if _ in constraints:
                 args[_] = constraints[_]
-        if
-            args[
+        if "minimum" in constraints and "maximum" in constraints:
+            args["num_range"] = [constraints["minimum"], constraints["maximum"]]
         # maybe there are better values than these?
-        elif
-            args[
-        elif
-            args[
-        if
-            args[
-        if
-            args[
+        elif "minimum" in constraints:
+            args["num_range"] = [constraints["minimum"], 10 + constraints["minimum"]]
+        elif "maximum" in constraints:
+            args["num_range"] = [constraints["maximum"] - 10, constraints["maximum"]]
+        if "minLength" in constraints:
+            args["length"] = constraints["minLength"]
+        if "maxLength" in constraints:
+            args["length"] = constraints["maxLength"]
         return args
 
     schema_types_to_python = {
-
-
-
-
-
-
-
-
-
-
+        "number": "float",
+        "integer": "int",
+        "string": "str",
+        "year": "year",
+        "boolean": "bool",
+        "date": "date",
+        "yearmonth": "date",
+        "time": "time",
+        "datetime": "datetime",
+        "array": "array"
     }
 
     if schema_path:
-        if schema_path.startswith(
+        if schema_path.startswith("http"):
             schema = requests.get(schema_path).json()
         else:
            with open(schema_path, encoding=encoding) as jsonfile:
                 schema = json.load(jsonfile)
-        if not (
-            raise ValueError(
+        if not ("fields" in schema.keys()):
+            raise ValueError("The schema must have a 'fields' key.")
         else:
             fields = [
                 {
-
-
+                    "name": f["name"],
+                    "type": schema_types_to_python.get(f["type"], "str"),
                     # when frformat is supported in TableSchema, we can build args for French standards
                     # linked to https://github.com/datagouv/fr-format/issues/26
-
-                    build_args_from_constraints(f[
-                    else build_args_from_constraints(f[
-                    if
+                    "args": (
+                        build_args_from_constraints(f["constraints"]) if "constraints" in f.keys()
+                        else build_args_from_constraints(f["arrayItem"]["constraints"])
+                        if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                         else {}
                     )
-                } for f in schema[
+                } for f in schema["fields"]
             ]
 
     for k in range(len(fields)):
-        if
-            fields[k][
-        if fields[k][
-            fields[k][
-        elif fields[k][
-            fields[k][
-        elif fields[k][
-            fields[k][
-            fields[k][
+        if "args" not in fields[k]:
+            fields[k]["args"] = {}
+        if fields[k]["type"] == "float":
+            fields[k]["args"]["num_type"] = float
+        elif fields[k]["type"] == "int":
+            fields[k]["args"]["num_type"] = int
+        elif fields[k]["type"] == "year":
+            fields[k]["args"]["num_type"] = int
+            fields[k]["args"]["num_range"] = [1990, 2050]
 
     types_to_func = {
-
-
-
-
-
-
-
-
-
-
-
+        "int": _number,
+        "float": _number,
+        "date": _date,
+        "time": _time,
+        "str": _string,
+        "url": _url,
+        "id": _id,
+        "year": _number,
+        "bool": _bool,
+        "datetime": _datetime,
+        "array": _array,
     }
 
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
             [
-                types_to_func.get(f[
+                types_to_func.get(f["type"], "str")(**f["args"])
                 for f in fields
             ] for _ in range(file_length)
         ],
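Putting the pieces together, the generator consumes field dicts with "name", "type" and "args" keys, or derives them from a TableSchema via build_args_from_constraints (e.g. constraints {"required": True, "minimum": 0, "maximum": 100} come back as {"required": True, "num_range": [0, 100]}). A sketch of a direct call, with illustrative field specs:

    from csv_detective.output.example import create_example_csv_file

    df = create_example_csv_file(
        fields=[
            {"name": "id", "type": "id", "args": {}},
            {"name": "created", "type": "date",
             "args": {"date_range": ["2020-01-01", "2024-12-31"]}},
            {"name": "score", "type": "float", "args": {"num_range": [0, 1]}},
        ],
        file_length=5,
    )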
csv_detective/output/profile.py CHANGED
File without changes

csv_detective/output/schema.py CHANGED
File without changes

csv_detective/output/utils.py CHANGED
File without changes

csv_detective/utils.py CHANGED

csv_detective/validate.py ADDED
@@ -0,0 +1,70 @@
+import logging
+from typing import Union
+
+import pandas as pd
+
+from csv_detective.load_tests import return_all_tests
+from .parsing.load import load_file
+
+logging.basicConfig(level=logging.INFO)
+
+tests = {
+    t.__name__.split(".")[-1]: t._is
+    for t in return_all_tests("ALL", "detect_fields")
+}
+
+
+def validate(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    encoding: str = None,
+    sep: str = None,
+    verbose: bool = False,
+    skipna: bool = True,
+    sheet_name: Union[str, int] = None,
+) -> tuple[bool, pd.DataFrame, dict]:
+    """
+    Verify is the given file has the same fields and types as in the previous analysis.
+    """
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
+    )
+    if verbose:
+        logging.info("Comparing table with the previous analysis")
+        logging.info("- Checking if all columns match")
+    if (
+        any(col_name not in list(table.columns) for col_name in previous_analysis["columns"])
+        or any(col_name not in list(previous_analysis["columns"].keys()) for col_name in table.columns)
+    ):
+        logging.warning("> Columns do not match, proceeding with full analysis")
+        return False, table, analysis
+    for col_name, args in previous_analysis["columns"].items():
+        if verbose:
+            logging.info(f"- Testing {col_name} for {args['format']}")
+        if args["format"] == "string":
+            # no test for columns that have not been recognized as a specific format
+            continue
+        test_func = tests[args["format"]]
+        col_data = table[col_name]
+        if skipna:
+            col_data = col_data.loc[~col_data.isna()]
+        if not col_data.apply(test_func).all():
+            logging.warning("> Test failed, proceeding with full analysis")
+            return False, table, analysis
+    if verbose:
+        logging.info("> All checks successful")
+    return True, table, analysis | {
+        k: previous_analysis[k] for k in [
+            "categorical",
+            "columns",
+            "columns_fields",
+            "columns_labels",
+            "formats",
+        ]
+    }
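The intended flow is: analyze a file once in full, store the analysis, then re-check later versions of the file cheaply with validate; only when columns or formats drift is a full analysis needed. A sketch, assuming stored_analysis is the dict saved from an earlier run:

    from csv_detective.validate import validate

    ok, table, analysis = validate(
        "data.csv",
        previous_analysis=stored_analysis,
        verbose=True,
    )
    if not ok:
        # columns or formats drifted: analysis here only holds loading
        # metadata, so re-run the full detection routine on table
        ...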
{csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/CHANGELOG.md CHANGED
@@ -13,7 +13,9 @@
 - Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
 - Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
 - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
+- Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
 - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)
+- Refactor fields tests [#114](https://github.com/datagouv/csv-detective/pull/114)
 
 ## 0.7.4 (2024-11-15)
 
{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/RECORD CHANGED
@@ -1,9 +1,11 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=vpK7WMkIQbcJzu6HKOwcn7PpHsNCCaXZ1YLMS5Wq9tM,165
 csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
-csv_detective/explore_csv.py,sha256=
+csv_detective/explore_csv.py,sha256=ocWlUEtuwZ-6bjDc6gfhC2-6DljMVhvXhHrfICCXGfQ,8986
+csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
-csv_detective/utils.py,sha256=
-csv_detective/
+csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
+csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
+csv_detective/detect_fields/__init__.py,sha256=qkwT_o_S7qvLEsRssICpoGmCc3h5y2MVy1XI56LFcV0,959
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
@@ -48,15 +50,17 @@ csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=wJAy
 csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
 csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=FPHOfTrfXJs62-NgeOcNGOvwPd7I1fEVp8lTdMNfj3w,433
 csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSqNDZhXZz1TwzdiwdV8ovRYTOacpg,327
-csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=
+csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=7_mnO9uC_kI7e2WR8xIer7Kqw8zi-v-JKaAD4zcoGbE,342
 csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
 csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
-csv_detective/detect_fields/other/float/__init__.py,sha256=
+csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
+csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5WqLiE8qQCZHtoI9eJvl_9M,232
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
+csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
 csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeEZ5Hkf5Wwi3ZKclLER_V0YO3g,154
 csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
@@ -126,12 +130,14 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=3U9j8Hux432KdGtIyArq_-v
 csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
 csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
 csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
+csv_detective/detection/formats.py,sha256=VwFazRAFJN6eaYUK7IauVU88vuUBHccESY4UD8EgGUo,5386
 csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
 csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
+csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
 csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
-csv_detective/output/example.py,sha256=
+csv_detective/output/example.py,sha256=26rY7XNXK47e9xJMl-Js8jJwFIuv7V7B7e256VecKuk,8652
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
 csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
 csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
@@ -141,18 +147,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
 csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
 csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
 csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1298.data/data/share/csv_detective/CHANGELOG.md,sha256=Y8aL18x5EGGvA9AqukEi4tn78se_Lzisa2J32kOSer8,7984
+csv_detective-0.7.5.dev1298.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1298.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1298.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_example.py,sha256=
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
+tests/test_fields.py,sha256=0hce2XtDHY9dTLCYhrm2s4I41OeKsQbbaKmDZ4XctUw,9824
+tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
-tests/test_structure.py,sha256=
-
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
+tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
+csv_detective-0.7.5.dev1298.dist-info/METADATA,sha256=cy8kKhsbQVd8DQ2UMJe7z1nyxoGEvmFnQfsTdCTwXXc,1386
+csv_detective-0.7.5.dev1298.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+csv_detective-0.7.5.dev1298.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1298.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1298.dist-info/RECORD,,