csv-detective 0.7.5.dev1277__py3-none-any.whl → 0.7.5.dev1286__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/other/float/__init__.py +4 -4
- csv_detective/detection/formats.py +145 -0
- csv_detective/explore_csv.py +94 -222
- csv_detective/load_tests.py +62 -0
- csv_detective/output/__init__.py +64 -0
- csv_detective/output/dataframe.py +0 -0
- csv_detective/output/example.py +0 -0
- csv_detective/output/profile.py +0 -0
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/utils.py +2 -0
- csv_detective/validate.py +70 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD +21 -16
- tests/test_fields.py +1 -1
- tests/test_file.py +19 -9
- tests/test_structure.py +6 -0
- tests/test_validation.py +18 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED

csv_detective/detect_fields/other/float/__init__.py
CHANGED

@@ -2,16 +2,16 @@ PROPORTION = 1


 def float_casting(val: str) -> float:
-    return float(val.replace(
+    return float(val.replace(",", "."))


 def _is(val):
-
-    notations (3e6) or "+" in the string. "-" is still accepted.
+    """Detects floats, assuming that tables will not have scientific
+    notations (3e6) or "+" in the string. "-" is still accepted."""
     try:
         if (
             not isinstance(val, str)
-            or any([k in val for k in [
+            or any([k in val for k in ["_", "+", "e", "E"]])
             or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
         ):
             return False
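For orientation, a minimal self-contained sketch (not part of the package; the helper name is made up) of the behaviour the revised detector encodes: comma decimal separators are cast to floats, while scientific notation, explicit "+", underscores and padded leading zeros are rejected.

# Hypothetical helper (not the package's _is) reproducing the rules shown above.
def is_float_like(val) -> bool:
    if (
        not isinstance(val, str)
        or any(k in val for k in ["_", "+", "e", "E"])
        or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
    ):
        return False
    try:
        float(val.replace(",", "."))  # same casting as float_casting above
        return True
    except ValueError:
        return False

assert is_float_like("3,14") and is_float_like("-12.5")
assert not any(is_float_like(v) for v in ["3e6", "+1.0", "1_000", "007", "abc"])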
csv_detective/detection/formats.py
ADDED

@@ -0,0 +1,145 @@
+from collections import defaultdict
+from typing import Union
+
+import numpy as np
+import pandas as pd
+from csv_detective.detection.variables import (
+    detect_categorical_variable,
+    # detect_continuous_variable,
+)
+from csv_detective.load_tests import return_all_tests
+from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col, test_label
+
+
+def detect_formats(
+    table: pd.DataFrame,
+    analysis: dict,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+
+    if table.empty:
+        res_categorical = []
+        # res_continuous = []
+    else:
+        # Detects columns that are categorical
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+        res_categorical = list(res_categorical)
+        # Detect columns that are continuous (we already know the categorical) :
+        # we don't need this for now, cuts processing time
+        # res_continuous = list(
+        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+        # )
+
+    analysis.update({
+        "categorical": res_categorical,
+        # "continuous": res_continuous,
+    })
+
+    # list testing to be performed
+    all_tests_fields = return_all_tests(
+        user_input_tests, detect_type="detect_fields"
+    )  # list all tests for the fields
+    all_tests_labels = return_all_tests(
+        user_input_tests, detect_type="detect_labels"
+    )  # list all tests for the labels
+
+    # if no testing then return
+    if not all_tests_fields and not all_tests_labels:
+        return analysis
+
+    # Perform testing on fields
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+    # Perform testing on labels
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+    # This is because the fields are more important than the labels and yields a max
+    # of 1.5 for the final score.
+    scores_table = scores_table_fields * (
+        1
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
+    )
+
+    # To reduce false positives: ensure these formats are detected only if the label yields
+    # a detection (skipping the ones that have been excluded by the users).
+    formats_with_mandatory_label = [
+        f for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+        ] if f in scores_table.index
+    ]
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
+        0,
+    )
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "json_geojson": "json",
+        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude": "float",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+    }
+
+    if not limited_output:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: [
+                    {
+                        "python_type": metier_to_python_type.get(
+                            detection["format"], "string"
+                        ),
+                        **detection,
+                    }
+                    for detection in detections
+                ]
+                for col_name, detections in analysis[detection_method].items()
+            }
+    else:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: {
+                    "python_type": metier_to_python_type.get(
+                        detection["format"], "string"
+                    ),
+                    **detection,
+                }
+                for col_name, detection in analysis[detection_method].items()
+            }
+
+    # Add detection with formats as keys
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
+    return analysis
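The score combination in detect_formats multiplies each field score by (1 + label score / 2), so a column whose values and header both match a format can reach at most 1.5. A small illustrative computation (numbers are made up, not taken from the package's tests):

import pandas as pd

# Illustrative only: a field score of 0.9 for "code_postal" combined with a
# label score of 1.0 becomes 0.9 * (1 + 1.0 / 2) = 1.35 (1.5 being the maximum);
# with no label hit it stays at 0.9 * (1 + 0 / 2) = 0.9.
scores_fields = pd.DataFrame({"col": [0.9]}, index=["code_postal"])
scores_labels = pd.DataFrame({"col": [1.0]}, index=["code_postal"])
combined = scores_fields * (
    1 + scores_labels.reindex(index=scores_fields.index, fill_value=0).values / 2
)
print(round(float(combined.loc["code_postal", "col"]), 2))  # 1.35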
csv_detective/explore_csv.py
CHANGED

@@ -1,4 +1,3 @@
-from collections import defaultdict
 import json
 import logging
 import os
@@ -6,80 +5,16 @@ import tempfile
 from time import time
 from typing import Optional, Union

-import numpy as np
 import pandas as pd

-
-from
-from .detection.variables import (
-    detect_categorical_variable,
-    # detect_continuous_variable,
-)
-from .output.dataframe import cast_df
-from .output.profile import create_profile
-from .output.schema import generate_table_schema
-from .output.utils import prepare_output_dict
+from .detection.formats import detect_formats
+from .output import generate_output, generate_table_schema
 from .parsing.load import load_file
-from .parsing.columns import test_col, test_label
 from .s3_utils import download_from_minio, upload_to_minio
 from .utils import display_logs_depending_process_time, is_url
+from .validate import validate

-
-def get_all_packages(detect_type: str) -> list:
-    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
-    modules = []
-    for dirpath, _, filenames in os.walk(root_dir):
-        for filename in filenames:
-            file = os.path.join(dirpath, filename).replace(root_dir, "")
-            if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
-                if module:
-                    modules.append(detect_type + module)
-    return modules
-
-
-def return_all_tests(
-    user_input_tests: Union[str, list],
-    detect_type: str,
-) -> list:
-    """
-    returns all tests that have a method _is and are listed in the user_input_tests
-    the function can select a sub_package from csv_detective
-    user_input_tests may look like this:
-    - "ALL": all possible tests are made
-    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
-        this specifc (group of) test(s) only
-    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
-        specific (groups of) tests by add "-" at the start (e.g "-temp.date")
-    """
-    assert detect_type in ["detect_fields", "detect_labels"]
-    all_packages = get_all_packages(detect_type=detect_type)
-
-    if isinstance(user_input_tests, str):
-        user_input_tests = [user_input_tests]
-    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
-        tests_to_do = [detect_type]
-    else:
-        tests_to_do = [
-            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-        ]
-    tests_skipped = [
-        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-    ]
-    all_tests = [
-        # this is why we need to import detect_fields/labels
-        eval(x) for x in all_packages
-        if any([y == x[: len(y)] for y in tests_to_do])
-        and all([y != x[: len(y)] for y in tests_skipped])
-    ]
-    # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
-    return all_tests
+logging.basicConfig(level=logging.INFO)


 def routine(
@@ -99,7 +34,7 @@ def routine(
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
     """Returns a dict with information about the csv table and possible
-    column contents.
+    column contents, and if requested the DataFrame with columns cast according to analysis.

     Args:
         file_path: local path to CSV file if not using Minio
@@ -112,14 +47,14 @@ def routine(
         output_schema: whether or not to add the 'schema' field to the output (tableschema)
         output_df: whether or not to return the loaded DataFrame along with the analysis report
         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
-        verbose: whether or not to print process logs in console
+        verbose: whether or not to print process logs in console
         sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
         skipna: whether to keep NaN (empty cells) for tests

     Returns:
         dict: a dict with information about the csv and possible types for each column
     """
-
+
     if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

@@ -137,168 +72,105 @@ def routine(
         sheet_name=sheet_name,
     )

-
-
-
-
-
-
-
-    # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-    # res_continuous = list(
-    #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-    # )
-
-    analysis.update({
-        "categorical": res_categorical,
-        # "continuous": res_continuous,
-    })
-
-    # list testing to be performed
-    all_tests_fields = return_all_tests(
-        user_input_tests, detect_type="detect_fields"
-    )  # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
-
-    # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
-        return analysis
-
-    # Perform testing on fields
-    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-    # Perform testing on labels
-    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-    # This is because the fields are more important than the labels and yields a max
-    # of 1.5 for the final score.
-    scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
-    )
-
-    # To reduce false positives: ensure these formats are detected only if the label yields
-    # a detection (skipping the ones that have been excluded by the users).
-    formats_with_mandatory_label = [
-        f for f in [
-            "code_departement",
-            "code_commune_insee",
-            "code_postal",
-            "latitude_wgs",
-            "longitude_wgs",
-            "latitude_wgs_fr_metropole",
-            "longitude_wgs_fr_metropole",
-            "latitude_l93",
-            "longitude_l93",
-        ] if f in scores_table.index
-    ]
-    scores_table.loc[formats_with_mandatory_label, :] = np.where(
-        scores_table_labels.loc[formats_with_mandatory_label, :],
-        scores_table.loc[formats_with_mandatory_label, :],
-        0,
+    analysis = detect_formats(
+        table=table,
+        analysis=analysis,
+        user_input_tests=user_input_tests,
+        limited_output=limited_output,
+        skipna=skipna,
+        verbose=verbose,
     )
-    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
-
-    metier_to_python_type = {
-        "booleen": "bool",
-        "int": "int",
-        "float": "float",
-        "string": "string",
-        "json": "json",
-        "json_geojson": "json",
-        "datetime": "datetime",
-        "datetime_iso": "datetime",
-        "datetime_rfc822": "datetime",
-        "date": "date",
-        "latitude": "float",
-        "latitude_l93": "float",
-        "latitude_wgs": "float",
-        "latitude_wgs_fr_metropole": "float",
-        "longitude": "float",
-        "longitude_l93": "float",
-        "longitude_wgs": "float",
-        "longitude_wgs_fr_metropole": "float",
-    }
-
-    if not limited_output:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: [
-                    {
-                        "python_type": metier_to_python_type.get(
-                            detection["format"], "string"
-                        ),
-                        **detection,
-                    }
-                    for detection in detections
-                ]
-                for col_name, detections in analysis[detection_method].items()
-            }
-    else:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
-                    **detection,
-                }
-                for col_name, detection in analysis[detection_method].items()
-            }

-
-
-    for header, col_metadata in analysis["columns"].items():
-        analysis["formats"][col_metadata["format"]].append(header)
-
-    if output_profile:
-        analysis["profile"] = create_profile(
+    try:
+        return generate_output(
             table=table,
-
+            analysis=analysis,
+            file_path=file_path,
             num_rows=num_rows,
             limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
+            cast_json=cast_json,
             verbose=verbose,
+            sheet_name=sheet_name,
         )
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Routine completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )

-    if save_results:
-        if isinstance(save_results, str):
-            output_path = save_results
-        else:
-            output_path = os.path.splitext(file_path)[0]
-            if is_url(output_path):
-                output_path = output_path.split('/')[-1]
-            if analysis.get("sheet_name"):
-                output_path += "_sheet-" + str(sheet_name)
-            output_path += ".json"
-        with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

-
-
-
-
-
-
+def validate_then_detect(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    encoding: str = None,
+    sep: str = None,
+    skipna: bool = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+):
+
     if verbose:
-
-
-
+        start_routine = time()
+        if is_url(file_path):
+            logging.info("Path recognized as a URL")
+
+    is_valid, table, analysis = validate(
+        file_path=file_path,
+        previous_analysis=previous_analysis,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        skipna=skipna,
+        sheet_name=sheet_name,
+    )
+    if is_valid:
+        # skipping formats detection as the validation is successful
+        analysis = previous_analysis
+        del analysis["profile"]
+    else:
+        analysis = detect_formats(
+            table=table,
+            analysis=analysis,
+            user_input_tests=user_input_tests,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
         )
-
-    return
-
-
+    try:
+        return generate_output(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            num_rows=num_rows,
+            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
             cast_json=cast_json,
             verbose=verbose,
+            sheet_name=sheet_name,
         )
-
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Process completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )


 def routine_minio(
@@ -369,8 +241,8 @@ def routine_minio(
         minio_pwd=minio_pwd,
     )

-    analysis = routine(
-
+    analysis = routine(
+        file_path,
         save_results=True,
         **kwargs,
     )
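Taken together, routine and the new validate_then_detect cover the two flows. A hedged usage sketch follows (file paths are invented; only the signatures visible in this diff are assumed):

import json

from csv_detective.explore_csv import routine, validate_then_detect

# First run: full detection, persisting the report next to the file.
report = routine("data/my_file.csv", save_results="data/my_file.json")

# Later runs: revalidate against the stored report; detect_formats only
# reruns if the columns or their formats no longer match.
with open("data/my_file.json") as f:
    previous = json.load(f)
new_report = validate_then_detect("data/my_file.csv", previous_analysis=previous)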
csv_detective/load_tests.py
ADDED

@@ -0,0 +1,62 @@
+import os
+from typing import Union
+
+# flake8: noqa
+from csv_detective import detect_fields, detect_labels
+
+
+def get_all_packages(detect_type) -> list:
+    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
+    modules = []
+    for dirpath, _, filenames in os.walk(root_dir):
+        for filename in filenames:
+            file = os.path.join(dirpath, filename).replace(root_dir, "")
+            if file.endswith("__init__.py"):
+                module = (
+                    file.replace("__init__.py", "")
+                    .replace("/", ".").replace("\\", ".")[:-1]
+                )
+                if module:
+                    modules.append(detect_type + module)
+    return modules
+
+
+def return_all_tests(
+    user_input_tests: Union[str, list],
+    detect_type: str,
+) -> list:
+    """
+    returns all tests that have a method _is and are listed in the user_input_tests
+    the function can select a sub_package from csv_detective
+    user_input_tests may look like this:
+    - "ALL": all possible tests are made
+    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
+        this specifc (group of) test(s) only
+    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
+        specific (groups of) tests by add "-" at the start (e.g "-temp.date")
+    """
+    assert detect_type in ["detect_fields", "detect_labels"]
+    all_packages = get_all_packages(detect_type=detect_type)
+
+    if isinstance(user_input_tests, str):
+        user_input_tests = [user_input_tests]
+    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
+        tests_to_do = [detect_type]
+    else:
+        tests_to_do = [
+            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
+        ]
+    tests_skipped = [
+        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
+    ]
+    all_tests = [
+        # this is why we need to import detect_fields/labels
+        eval(x) for x in all_packages
+        if any([y == x[: len(y)] for y in tests_to_do])
+        and all([y != x[: len(y)] for y in tests_skipped])
+    ]
+    # to remove groups of tests
+    all_tests = [
+        test for test in all_tests if "_is" in dir(test)
+    ]
+    return all_tests
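The docstring above describes the selection syntax; a short usage sketch (groups follow the detect_fields package layout listed in RECORD below):

from csv_detective.load_tests import return_all_tests

# Every field test module that exposes an `_is` function.
all_field_tests = return_all_tests("ALL", detect_type="detect_fields")

# Only the French geo tests.
fr_geo_tests = return_all_tests(["FR.geo"], detect_type="detect_fields")

# Everything except the temporal date group ("-" prefix skips a (group of) test(s)).
non_date_tests = return_all_tests(["-temp.date"], detect_type="detect_fields")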
csv_detective/output/__init__.py
ADDED

@@ -0,0 +1,64 @@
+import json
+import os
+from typing import Union
+
+import pandas as pd
+
+from csv_detective.utils import is_url
+from .dataframe import cast_df
+from .profile import create_profile
+from .schema import generate_table_schema
+
+
+def generate_output(
+    table: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    num_rows: int = 500,
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+) -> Union[dict, tuple[dict, pd.DataFrame]]:
+
+    if output_profile:
+        analysis["profile"] = create_profile(
+            table=table,
+            dict_cols_fields=analysis["columns"],
+            num_rows=num_rows,
+            limited_output=limited_output,
+            verbose=verbose,
+        )
+
+    if save_results:
+        if isinstance(save_results, str):
+            output_path = save_results
+        else:
+            output_path = os.path.splitext(file_path)[0]
+            if is_url(output_path):
+                output_path = output_path.split('/')[-1]
+            if analysis.get("sheet_name"):
+                output_path += "_sheet-" + str(sheet_name)
+            output_path += ".json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+
+    if output_schema:
+        analysis["schema"] = generate_table_schema(
+            analysis,
+            save_file=False,
+            verbose=verbose
+        )
+
+    if output_df:
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    return analysis
csv_detective/output/dataframe.py
CHANGED
File without changes

csv_detective/output/example.py
CHANGED
File without changes

csv_detective/output/profile.py
CHANGED
File without changes

csv_detective/output/schema.py
CHANGED
File without changes

csv_detective/output/utils.py
CHANGED
File without changes
csv_detective/utils.py
CHANGED

csv_detective/validate.py
ADDED

@@ -0,0 +1,70 @@
+import logging
+from typing import Union
+
+import pandas as pd
+
+from csv_detective.load_tests import return_all_tests
+from .parsing.load import load_file
+
+logging.basicConfig(level=logging.INFO)
+
+tests = {
+    t.__name__.split(".")[-1]: t._is
+    for t in return_all_tests("ALL", "detect_fields")
+}
+
+
+def validate(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    encoding: str = None,
+    sep: str = None,
+    verbose: bool = False,
+    skipna: bool = True,
+    sheet_name: Union[str, int] = None,
+) -> tuple[bool, pd.DataFrame, dict]:
+    """
+    Verify is the given file has the same fields and types as in the previous analysis.
+    """
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
+    )
+    if verbose:
+        logging.info("Comparing table with the previous analysis")
+        logging.info("- Checking if all columns match")
+    if (
+        any(col_name not in list(table.columns) for col_name in previous_analysis["columns"])
+        or any(col_name not in list(previous_analysis["columns"].keys()) for col_name in table.columns)
+    ):
+        logging.warning("> Columns do not match, proceeding with full analysis")
+        return False, table, analysis
+    for col_name, args in previous_analysis["columns"].items():
+        if verbose:
+            logging.info(f"- Testing {col_name} for {args['format']}")
+        if args["format"] == "string":
+            # no test for columns that have not been recognized as a specific format
+            continue
+        test_func = tests[args["format"]]
+        col_data = table[col_name]
+        if skipna:
+            col_data = col_data.loc[~col_data.isna()]
+        if not col_data.apply(test_func).all():
+            logging.warning("> Test failed, proceeding with full analysis")
+            return False, table, analysis
+    if verbose:
+        logging.info("> All checks successful")
+    return True, table, analysis | {
+        k: previous_analysis[k] for k in [
+            "categorical",
+            "columns",
+            "columns_fields",
+            "columns_labels",
+            "formats",
+        ]
+    }
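A hedged sketch of calling validate directly, mirroring the new test added further down; on success the returned analysis is merged with the keys kept from previous_analysis, as in the return statement above:

import json

from csv_detective.validate import validate

with open("tests/data/a_test_file.json") as f:
    previous_analysis = json.load(f)

is_valid, table, analysis = validate(
    "tests/data/a_test_file.csv",
    previous_analysis=previous_analysis,
    num_rows=-1,  # same value as used in the new test_validation.py below
)
if is_valid:
    # analysis now also carries "columns", "formats", etc. from previous_analysis
    print(analysis["formats"])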
{csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md
CHANGED

@@ -13,6 +13,7 @@
 - Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
 - Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
 - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
+- Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
 - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)

 ## 0.7.4 (2024-11-15)
{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD
CHANGED

@@ -1,8 +1,10 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=vpK7WMkIQbcJzu6HKOwcn7PpHsNCCaXZ1YLMS5Wq9tM,165
 csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
-csv_detective/explore_csv.py,sha256=
+csv_detective/explore_csv.py,sha256=ocWlUEtuwZ-6bjDc6gfhC2-6DljMVhvXhHrfICCXGfQ,8986
+csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
-csv_detective/utils.py,sha256=
+csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
+csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
 csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -53,7 +55,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7g
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
 csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
-csv_detective/detect_fields/other/float/__init__.py,sha256=
+csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -126,10 +128,12 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=3U9j8Hux432KdGtIyArq_-v
 csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
 csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
 csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
+csv_detective/detection/formats.py,sha256=VwFazRAFJN6eaYUK7IauVU88vuUBHccESY4UD8EgGUo,5386
 csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
 csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
+csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
 csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
 csv_detective/output/example.py,sha256=i8PkdXxidF7qR_9aK8vh12JpZdJQryhBgyrMS8iy5rk,8642
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
@@ -141,18 +145,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
 csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
 csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
 csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+csv_detective-0.7.5.dev1286.data/data/share/csv_detective/CHANGELOG.md,sha256=Gqw7W41bXK_JgIYi80vdOPR6JLY5rgABeNsiDStE4XA,7901
+csv_detective-0.7.5.dev1286.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1286.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1286.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_fields.py,sha256=53kiUQiqGt4_fnyCoxhNLeEsuN1LRDB-7HGT3p_Ed9I,11147
+tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
-tests/test_structure.py,sha256=
-
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
-csv_detective-0.7.5.
+tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
+tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
+csv_detective-0.7.5.dev1286.dist-info/METADATA,sha256=rLptgL-FkLZzfkxPt7_0I-k7EKPKbEHhd3Ei2qt54KI,1386
+csv_detective-0.7.5.dev1286.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+csv_detective-0.7.5.dev1286.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1286.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1286.dist-info/RECORD,,
tests/test_fields.py
CHANGED

@@ -48,7 +48,7 @@ from csv_detective.detection.variables import (
     detect_continuous_variable,
     detect_categorical_variable,
 )
-from csv_detective.
+from csv_detective.load_tests import return_all_tests
 from csv_detective.output.dataframe import cast


tests/test_file.py
CHANGED

@@ -28,7 +28,7 @@ def test_columns_output_on_file():
         "STRUCTURED_INFO",
         "GEO_INFO",
     ]
-    assert output["total_lines"] ==
+    assert output["total_lines"] == 404
     assert output["nb_duplicates"] == 7
     assert output["columns"]["NOMCOM"]["format"] == "commune"
     assert output["columns"]["NOMDEP"]["format"] == "departement"
@@ -48,7 +48,7 @@ def test_profile_output_on_file():
     )
     assert all(
         [
-            c in list(output["profile"]["
+            c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
             for c in [
                 "min",
                 "max",
@@ -60,12 +60,22 @@ def test_profile_output_on_file():
             ]
         ]
     )
-    assert
-
-
-
-
-
+    assert not any(
+        [
+            c in list(output["profile"]["NUMCOM"].keys())
+            for c in [
+                "min",
+                "max",
+                "mean",
+                "std",
+            ]
+        ]
+    )
+    assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
+    assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
+    assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
+    assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
+    assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
     assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1

@@ -175,7 +185,7 @@ def mocked_responses():
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
     # which doesn't support the way we mock the response, TBC
-    params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines":
+    params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})]
 )
 def test_urls(mocked_responses, params):
     file_name, checks = params
tests/test_structure.py
CHANGED

@@ -1,6 +1,7 @@
 import os
 # flake8: noqa
 from csv_detective import detect_fields, detect_labels
+from csv_detective.load_tests import return_all_tests


 def tests_conformity():
@@ -29,3 +30,8 @@ def tests_conformity():
             .replace("/", ".")
         )
         assert "_is" in dir(_package)
+
+
+def test_all_tests_have_unique_name():
+    names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
+    assert len(names) == len(set(names))
tests/test_validation.py
ADDED

@@ -0,0 +1,18 @@
+import json
+
+import pandas as pd
+
+from csv_detective.validate import validate
+
+
+def test_validation():
+    with open("tests/data/a_test_file.json", "r") as f:
+        previous_analysis = json.load(f)
+    is_valid, table, analysis = validate(
+        "tests/data/a_test_file.csv",
+        previous_analysis=previous_analysis,
+        num_rows=-1,
+    )
+    assert is_valid is True
+    assert isinstance(table, pd.DataFrame)
+    assert isinstance(analysis, dict)
{csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/LICENSE.AGPL.txt
RENAMED
File without changes

{csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/README.md
RENAMED
File without changes

{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/WHEEL
RENAMED
File without changes

{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/entry_points.txt
RENAMED
File without changes

{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/licenses/LICENSE.AGPL.txt
RENAMED
File without changes

{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/top_level.txt
RENAMED
File without changes
|