csv-detective 0.7.5.dev1277__py3-none-any.whl → 0.7.5.dev1298__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries, and is provided for informational purposes only.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/__init__.py +6 -4
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +7 -7
- csv_detective/detect_fields/other/float/__init__.py +4 -4
- csv_detective/detect_fields/other/money/__init__.py +11 -0
- csv_detective/detect_fields/other/percent/__init__.py +9 -0
- csv_detective/detection/formats.py +145 -0
- csv_detective/explore_csv.py +94 -222
- csv_detective/load_tests.py +62 -0
- csv_detective/output/__init__.py +64 -0
- csv_detective/output/dataframe.py +0 -0
- csv_detective/output/example.py +77 -77
- csv_detective/output/profile.py +0 -0
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/utils.py +2 -0
- csv_detective/validate.py +70 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/RECORD +27 -20
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/WHEEL +1 -1
- tests/test_example.py +10 -10
- tests/test_fields.py +270 -415
- tests/test_file.py +19 -9
- tests/test_structure.py +6 -0
- tests/test_validation.py +18 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1298.dist-info}/top_level.txt +0 -0
csv_detective/detect_fields/__init__.py
CHANGED
```diff
@@ -10,19 +10,21 @@ from .FR.other import (
     insee_ape700,
     date_fr,
     code_waldec,
-    code_rna
+    code_rna,
 )
 
 from .other import (
     email,
     url,
     booleen,
+    money,
     mongo_object_id,
+    percent,
     twitter,
     float,
     int,
     uuid,
-    json
+    json,
 )
 
 from .FR.geo import (
@@ -40,7 +42,7 @@ from .FR.geo import (
     code_region,
     latitude_l93,
     longitude_l93,
-    insee_canton
+    insee_canton,
 )
 
 from .geo import (
@@ -50,7 +52,7 @@ from .geo import (
     latitude_wgs,
     longitude_wgs,
     latlon_wgs,
-    json_geojson
+    json_geojson,
 )
 
 from .FR.temp import jour_de_la_semaine, mois_de_annee
```
csv_detective/detect_fields/geo/latlon_wgs/__init__.py
CHANGED

```diff
@@ -1,13 +1,13 @@
-import …
+from ..latitude_wgs import _is as is_lat
+from ..longitude_wgs import _is as is_lon
 
-PROPORTION = …
+PROPORTION = 1
 
 
 def _is(val):
     '''Renvoie True si val peut etre une latitude,longitude'''
 
-
-
-
-
-    )
+    if not isinstance(val, str) or val.count(",") != 1:
+        return False
+    lat, lon = val.split(",")
+    return is_lat(lat) and is_lon(lon.replace(" ", ""))
```
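For context, the rewritten `_is` splits on the comma and delegates to the latitude/longitude detectors instead of doing its own parsing. A minimal standalone sketch of the behaviour, with simplified stand-ins for `is_lat`/`is_lon` (assumption: the real checks in `latitude_wgs`/`longitude_wgs` validate WGS84 decimal-degree strings):

```python
# Simplified stand-ins for the package's latitude_wgs/longitude_wgs checks
# (assumption: they validate WGS84 decimal-degree ranges).
def is_lat(val: str) -> bool:
    try:
        return -90 <= float(val) <= 90
    except ValueError:
        return False


def is_lon(val: str) -> bool:
    try:
        return -180 <= float(val) <= 180
    except ValueError:
        return False


def _is(val):
    '''Renvoie True si val peut etre une latitude,longitude'''
    if not isinstance(val, str) or val.count(",") != 1:
        return False
    lat, lon = val.split(",")
    return is_lat(lat) and is_lon(lon.replace(" ", ""))


assert _is("48.8534, 2.3488")        # "lat, lon" with exactly one comma
assert not _is("48.8534")            # no comma at all
assert not _is("48.8534, 2.3, 1.0")  # more than one comma
assert not _is("120.0, 2.3488")      # 120 is out of latitude range
```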
csv_detective/detect_fields/other/float/__init__.py
CHANGED

```diff
@@ -2,16 +2,16 @@ PROPORTION = 1
 
 
 def float_casting(val: str) -> float:
-    return float(val.replace(…
+    return float(val.replace(",", "."))
 
 
 def _is(val):
-
-    notations (3e6) or "+" in the string. "-" is still accepted.
+    """Detects floats, assuming that tables will not have scientific
+    notations (3e6) or "+" in the string. "-" is still accepted."""
     try:
         if (
             not isinstance(val, str)
-            or any([k in val for k in […
+            or any([k in val for k in ["_", "+", "e", "E"]])
             or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
         ):
             return False
```
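The detector now spells out its exclusion list: underscores, explicit plus signs, and scientific notation are rejected, as are zero-padded integers. A standalone approximation for illustration (the closing cast-and-return is an assumption, since the hunk is truncated after `return False`):

```python
def float_casting(val: str) -> float:
    return float(val.replace(",", "."))


def _is(val):
    """Standalone approximation of the new float detector."""
    try:
        if (
            not isinstance(val, str)
            or any(k in val for k in ["_", "+", "e", "E"])
            or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
        ):
            return False
        float_casting(val)  # assumed tail of the function: cast, then accept
        return True
    except ValueError:
        return False


assert _is("3.14") and _is("3,14") and _is("-12.5")
assert _is("0.5")          # leading zero is fine before a decimal separator
assert not _is("3e6")      # scientific notation rejected
assert not _is("+3.1")     # explicit "+" rejected
assert not _is("007")      # zero-padded integers rejected
```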
csv_detective/detection/formats.py
ADDED

```diff
@@ -0,0 +1,145 @@
+from collections import defaultdict
+from typing import Union
+
+import numpy as np
+import pandas as pd
+from csv_detective.detection.variables import (
+    detect_categorical_variable,
+    # detect_continuous_variable,
+)
+from csv_detective.load_tests import return_all_tests
+from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col, test_label
+
+
+def detect_formats(
+    table: pd.DataFrame,
+    analysis: dict,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+
+    if table.empty:
+        res_categorical = []
+        # res_continuous = []
+    else:
+        # Detects columns that are categorical
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+        res_categorical = list(res_categorical)
+        # Detect columns that are continuous (we already know the categorical) :
+        # we don't need this for now, cuts processing time
+        # res_continuous = list(
+        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+        # )
+
+    analysis.update({
+        "categorical": res_categorical,
+        # "continuous": res_continuous,
+    })
+
+    # list testing to be performed
+    all_tests_fields = return_all_tests(
+        user_input_tests, detect_type="detect_fields"
+    )  # list all tests for the fields
+    all_tests_labels = return_all_tests(
+        user_input_tests, detect_type="detect_labels"
+    )  # list all tests for the labels
+
+    # if no testing then return
+    if not all_tests_fields and not all_tests_labels:
+        return analysis
+
+    # Perform testing on fields
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+    # Perform testing on labels
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+    # This is because the fields are more important than the labels and yields a max
+    # of 1.5 for the final score.
+    scores_table = scores_table_fields * (
+        1
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
+    )
+
+    # To reduce false positives: ensure these formats are detected only if the label yields
+    # a detection (skipping the ones that have been excluded by the users).
+    formats_with_mandatory_label = [
+        f for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+        ] if f in scores_table.index
+    ]
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
+        0,
+    )
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "json_geojson": "json",
+        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude": "float",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+    }
+
+    if not limited_output:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: [
+                    {
+                        "python_type": metier_to_python_type.get(
+                            detection["format"], "string"
+                        ),
+                        **detection,
+                    }
+                    for detection in detections
+                ]
+                for col_name, detections in analysis[detection_method].items()
+            }
+    else:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: {
+                    "python_type": metier_to_python_type.get(
+                        detection["format"], "string"
+                    ),
+                    **detection,
+                }
+                for col_name, detection in analysis[detection_method].items()
+            }
+
+    # Add detection with formats as keys
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
+    return analysis
```
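The core of `detect_formats` is the field/label score combination: field scores are scaled by `1 + label_score / 2` (so a perfect field plus label match peaks at 1.5), and formats listed in `formats_with_mandatory_label` are zeroed wherever the label test found nothing. A small pandas sketch of that arithmetic with made-up scores:

```python
import numpy as np
import pandas as pd

# Made-up field and label scores for a single column, indexed by format.
fields = pd.DataFrame({"col": [0.9, 0.8]}, index=["code_postal", "int"])
labels = pd.DataFrame({"col": [0.0, 0.0]}, index=["code_postal", "int"])

# Field scores weighted by labels: a perfect match on both would reach 1.5.
scores = fields * (1 + labels.reindex(index=fields.index, fill_value=0).values / 2)

# "code_postal" is label-mandatory and its label score is 0, so it is zeroed.
mandatory = ["code_postal"]
scores.loc[mandatory, :] = np.where(
    labels.loc[mandatory, :], scores.loc[mandatory, :], 0
)
print(scores)  # code_postal -> 0.0, int -> 0.8
```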
csv_detective/explore_csv.py
CHANGED
```diff
@@ -1,4 +1,3 @@
-from collections import defaultdict
 import json
 import logging
 import os
@@ -6,80 +5,16 @@ import tempfile
 from time import time
 from typing import Optional, Union
 
-import numpy as np
 import pandas as pd
 
-
-from …
-from .detection.variables import (
-    detect_categorical_variable,
-    # detect_continuous_variable,
-)
-from .output.dataframe import cast_df
-from .output.profile import create_profile
-from .output.schema import generate_table_schema
-from .output.utils import prepare_output_dict
+from .detection.formats import detect_formats
+from .output import generate_output, generate_table_schema
 from .parsing.load import load_file
-from .parsing.columns import test_col, test_label
 from .s3_utils import download_from_minio, upload_to_minio
 from .utils import display_logs_depending_process_time, is_url
+from .validate import validate
 
-
-def get_all_packages(detect_type: str) -> list:
-    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
-    modules = []
-    for dirpath, _, filenames in os.walk(root_dir):
-        for filename in filenames:
-            file = os.path.join(dirpath, filename).replace(root_dir, "")
-            if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
-                if module:
-                    modules.append(detect_type + module)
-    return modules
-
-
-def return_all_tests(
-    user_input_tests: Union[str, list],
-    detect_type: str,
-) -> list:
-    """
-    returns all tests that have a method _is and are listed in the user_input_tests
-    the function can select a sub_package from csv_detective
-    user_input_tests may look like this:
-    - "ALL": all possible tests are made
-    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
-      this specifc (group of) test(s) only
-    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
-      specific (groups of) tests by add "-" at the start (e.g "-temp.date")
-    """
-    assert detect_type in ["detect_fields", "detect_labels"]
-    all_packages = get_all_packages(detect_type=detect_type)
-
-    if isinstance(user_input_tests, str):
-        user_input_tests = [user_input_tests]
-    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
-        tests_to_do = [detect_type]
-    else:
-        tests_to_do = [
-            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-        ]
-    tests_skipped = [
-        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-    ]
-    all_tests = [
-        # this is why we need to import detect_fields/labels
-        eval(x) for x in all_packages
-        if any([y == x[: len(y)] for y in tests_to_do])
-        and all([y != x[: len(y)] for y in tests_skipped])
-    ]
-    # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
-    return all_tests
+logging.basicConfig(level=logging.INFO)
 
 
 def routine(
```
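`get_all_packages` and `return_all_tests` did not disappear: they moved into the new `csv_detective/load_tests.py` (+62 lines), which `detection/formats.py` imports. The selection grammar from the removed docstring still applies; for instance:

```python
from csv_detective.load_tests import return_all_tests

# All field tests:
all_fields = return_all_tests("ALL", detect_type="detect_fields")

# Only the FR.geo group plus one specific test:
some_tests = return_all_tests(["FR.geo", "FR.other.siren"], detect_type="detect_fields")

# Everything except date tests (a leading "-" skips a group):
no_dates = return_all_tests(["-temp.date"], detect_type="detect_fields")
```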
```diff
@@ -99,7 +34,7 @@ def routine(
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
     """Returns a dict with information about the csv table and possible
-    column contents.
+    column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
         file_path: local path to CSV file if not using Minio
@@ -112,14 +47,14 @@ def routine(
         output_schema: whether or not to add the 'schema' field to the output (tableschema)
         output_df: whether or not to return the loaded DataFrame along with the analysis report
         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
-        verbose: whether or not to print process logs in console
+        verbose: whether or not to print process logs in console
         sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
         skipna: whether to keep NaN (empty cells) for tests
 
     Returns:
         dict: a dict with information about the csv and possible types for each column
     """
-
+
     if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")
 
```
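Per the updated docstring and return annotation, `routine` returns either the analysis dict alone or an `(analysis, DataFrame)` tuple when `output_df=True`. A hedged usage sketch (hypothetical file path):

```python
from csv_detective.explore_csv import routine

# Analysis only (hypothetical CSV path):
analysis = routine("data/my_table.csv", save_results=False)
print(analysis["columns"])

# Analysis plus the DataFrame cast according to the detected formats:
analysis, df = routine(
    "data/my_table.csv",
    save_results=False,
    output_df=True,
    cast_json=True,
)
```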
```diff
@@ -137,168 +72,105 @@ def routine(
         sheet_name=sheet_name,
     )
 
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
-        res_categorical = list(res_categorical)
-    # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-    # res_continuous = list(
-    #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-    # )
-
-    analysis.update({
-        "categorical": res_categorical,
-        # "continuous": res_continuous,
-    })
-
-    # list testing to be performed
-    all_tests_fields = return_all_tests(
-        user_input_tests, detect_type="detect_fields"
-    )  # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    )  # list all tests for the labels
-
-    # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
-        return analysis
-
-    # Perform testing on fields
-    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-    # Perform testing on labels
-    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-    # This is because the fields are more important than the labels and yields a max
-    # of 1.5 for the final score.
-    scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
-    )
-
-    # To reduce false positives: ensure these formats are detected only if the label yields
-    # a detection (skipping the ones that have been excluded by the users).
-    formats_with_mandatory_label = [
-        f for f in [
-            "code_departement",
-            "code_commune_insee",
-            "code_postal",
-            "latitude_wgs",
-            "longitude_wgs",
-            "latitude_wgs_fr_metropole",
-            "longitude_wgs_fr_metropole",
-            "latitude_l93",
-            "longitude_l93",
-        ] if f in scores_table.index
-    ]
-    scores_table.loc[formats_with_mandatory_label, :] = np.where(
-        scores_table_labels.loc[formats_with_mandatory_label, :],
-        scores_table.loc[formats_with_mandatory_label, :],
-        0,
+    analysis = detect_formats(
+        table=table,
+        analysis=analysis,
+        user_input_tests=user_input_tests,
+        limited_output=limited_output,
+        skipna=skipna,
+        verbose=verbose,
     )
-    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
-
-    metier_to_python_type = {
-        "booleen": "bool",
-        "int": "int",
-        "float": "float",
-        "string": "string",
-        "json": "json",
-        "json_geojson": "json",
-        "datetime": "datetime",
-        "datetime_iso": "datetime",
-        "datetime_rfc822": "datetime",
-        "date": "date",
-        "latitude": "float",
-        "latitude_l93": "float",
-        "latitude_wgs": "float",
-        "latitude_wgs_fr_metropole": "float",
-        "longitude": "float",
-        "longitude_l93": "float",
-        "longitude_wgs": "float",
-        "longitude_wgs_fr_metropole": "float",
-    }
-
-    if not limited_output:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: [
-                    {
-                        "python_type": metier_to_python_type.get(
-                            detection["format"], "string"
-                        ),
-                        **detection,
-                    }
-                    for detection in detections
-                ]
-                for col_name, detections in analysis[detection_method].items()
-            }
-    else:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
-                    **detection,
-                }
-                for col_name, detection in analysis[detection_method].items()
-            }
 
-    # Add detection with formats as keys
-    analysis["formats"] = defaultdict(list)
-    for header, col_metadata in analysis["columns"].items():
-        analysis["formats"][col_metadata["format"]].append(header)
-
-    if output_profile:
-        analysis["profile"] = create_profile(
+    try:
+        return generate_output(
             table=table,
-
+            analysis=analysis,
+            file_path=file_path,
             num_rows=num_rows,
             limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
+            cast_json=cast_json,
             verbose=verbose,
+            sheet_name=sheet_name,
         )
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Routine completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )
 
-    if save_results:
-        if isinstance(save_results, str):
-            output_path = save_results
-        else:
-            output_path = os.path.splitext(file_path)[0]
-            if is_url(output_path):
-                output_path = output_path.split('/')[-1]
-            if analysis.get("sheet_name"):
-                output_path += "_sheet-" + str(sheet_name)
-            output_path += ".json"
-        with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
 
-
-
-
-
-
+def validate_then_detect(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    encoding: str = None,
+    sep: str = None,
+    skipna: bool = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+):
+
     if verbose:
-        display_logs_depending_process_time(
-            f"Routine completed in {round(time() - start_routine, 3)}s",
-            time() - start_routine
+        start_routine = time()
+        if is_url(file_path):
+            logging.info("Path recognized as a URL")
+
+    is_valid, table, analysis = validate(
+        file_path=file_path,
+        previous_analysis=previous_analysis,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        skipna=skipna,
+        sheet_name=sheet_name,
+    )
+    if is_valid:
+        # skipping formats detection as the validation is successful
+        analysis = previous_analysis
+        del analysis["profile"]
+    else:
+        analysis = detect_formats(
+            table=table,
+            analysis=analysis,
+            user_input_tests=user_input_tests,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
         )
-
-    return …
-
+    try:
+        return generate_output(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            num_rows=num_rows,
+            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
             cast_json=cast_json,
             verbose=verbose,
+            sheet_name=sheet_name,
         )
-
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Process completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )
 
 
 def routine_minio(
```
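`validate_then_detect` is the other new entry point: it first runs the new `validate` (see `csv_detective/validate.py`, +70 lines) against a previous analysis, reuses that analysis when the file still conforms, and falls back to `detect_formats` otherwise. A hedged usage sketch (hypothetical paths; `output_profile=True` on the first run so the stored analysis carries the `profile` key this function deletes):

```python
from csv_detective.explore_csv import routine, validate_then_detect

# First pass: full detection, keeping the profile in the stored analysis.
previous_analysis = routine(
    "data/my_table.csv", save_results=False, output_profile=True
)

# Later pass: if the new file still matches previous_analysis, format
# detection is skipped and the previous analysis is reused.
analysis = validate_then_detect(
    file_path="data/my_table_new.csv",
    previous_analysis=previous_analysis,
    save_results=False,
)
```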
```diff
@@ -369,8 +241,8 @@ def routine_minio(
         minio_pwd=minio_pwd,
     )
 
-    analysis = routine(
-        …
+    analysis = routine(
+        file_path,
         save_results=True,
         **kwargs,
     )
```