csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/__init__.py +94 -43
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/__init__.py +51 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +6 -7
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +7 -16
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/METADATA +12 -2
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/RECORD +79 -79
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +4 -3
- tests/test_validation.py +9 -6
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt +0 -0
csv_detective/detection/formats.py CHANGED

@@ -1,16 +1,17 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from typing import Union

 import numpy as np
 import pandas as pd
+
 from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import test_col, test_label
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
 from csv_detective.validate import validate


@@ -42,10 +43,12 @@ def detect_formats(
     # detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
     # )

-    analysis.update(
-
-
-
+    analysis.update(
+        {
+            "categorical": res_categorical,
+            # "continuous": res_continuous,
+        }
+    )

     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -60,7 +63,9 @@ def detect_formats(
         return analysis

     # Perform testing on fields
-    scores_table_fields = test_col(
+    scores_table_fields = test_col(
+        table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose
+    )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)

     # Perform testing on labels
@@ -71,16 +76,14 @@ def detect_formats(
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
     scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
+        1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
     )

     # To reduce false positives: ensure these formats are detected only if the label yields
     # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        f
+        f
+        for f in [
             "code_departement",
             "code_commune_insee",
             "code_postal",
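The reformatting above does not change the scoring logic: label scores are reindexed onto the field-score index (missing labels fill with 0) and can boost a field score by at most 50%. A minimal sketch of that mechanism, with toy data rather than the package's internals:

import pandas as pd

# toy score matrices indexed by format name, one column per csv column
scores_table_fields = pd.DataFrame({"col_a": [0.9, 0.4]}, index=["siren", "code_postal"])
scores_table_labels = pd.DataFrame({"col_a": [1.0]}, index=["code_postal"])

# formats with no label score get 0, so their field score keeps factor 1;
# a perfect label score of 1 multiplies the field score by 1.5 at most
combined = scores_table_fields * (
    1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
)
print(combined)  # siren stays at 0.9, code_postal goes from 0.4 to 0.6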
@@ -90,7 +93,8 @@ def detect_formats(
             "longitude_wgs_fr_metropole",
             "latitude_l93",
             "longitude_l93",
-        ]
+        ]
+        if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],
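For the formats listed above, a field-only match is not trusted: the label matrix acts as a mask, and the new `if f in scores_table.index` guard skips formats that the user excluded from testing. A rough illustration of the masking idea with toy scores (not the package's exact call, whose remaining arguments are not visible in this hunk):

import numpy as np
import pandas as pd

scores = pd.DataFrame({"col_a": [0.8], "col_b": [0.7]}, index=["code_postal"])
label_scores = pd.DataFrame({"col_a": [1.0], "col_b": [0.0]}, index=["code_postal"])

# keep the score where the label also matched, zero it otherwise
scores.loc[["code_postal"], :] = np.where(label_scores.loc[["code_postal"], :], scores, 0)
print(scores)  # col_a keeps 0.8, col_b drops to 0.0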
@@ -123,9 +127,7 @@ def detect_formats(
         analysis[detection_method] = {
             col_name: [
                 {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                     **detection,
                 }
                 for detection in detections
@@ -136,9 +138,7 @@ def detect_formats(
     for detection_method in ["columns_fields", "columns_labels", "columns"]:
         analysis[detection_method] = {
             col_name: {
-                "python_type": metier_to_python_type.get(
-                    detection["format"], "string"
-                ),
+                "python_type": metier_to_python_type.get(detection["format"], "string"),
                 **detection,
             }
             for col_name, detection in analysis[detection_method].items()
csv_detective/detection/headers.py CHANGED

@@ -15,18 +15,16 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
     header = file.readline()
     position = file.tell()
     chaine = [c for c in header.replace("\n", "").split(sep) if c]
-    if chaine[-1] not in ["", "\n"] and all(
-        [mot not in ["", "\n"] for mot in chaine[1:-1]]
-    ):
+    if chaine[-1] not in ["", "\n"] and all([mot not in ["", "\n"] for mot in chaine[1:-1]]):
         next_row = file.readline()
         file.seek(position)
         if header != next_row:
             if verbose:
                 display_logs_depending_process_time(
-                    f
+                    f"Detected headers in {round(time() - start, 3)}s",
                     time() - start,
                 )
             return i, chaine
     if verbose:
-        logging.info(
+        logging.info("No header detected")
     return 0, None
csv_detective/detection/rows.py CHANGED

@@ -5,7 +5,7 @@ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
-    if all([str(c).startswith(
+    if all([str(c).startswith("Unnamed:") for c in table.columns]):
         # there is on offset between the index in the file (idx here)
         # and the index in the dataframe, because of the header
         idx = 1
csv_detective/detection/variables.py CHANGED

@@ -1,5 +1,5 @@
-from ast import literal_eval
 import logging
+from ast import literal_eval
 from time import time

 import pandas as pd

@@ -7,7 +7,9 @@ import pandas as pd
 from csv_detective.utils import display_logs_depending_process_time


-def detect_continuous_variable(
+def detect_continuous_variable(
+    table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False
+):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
     one that contains a considerable amount of float values.

@@ -34,16 +36,13 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
             value = value.replace(",", ".")
             value = literal_eval(value)
             return type(value)
-
-        except:
+        except Exception:
             return False

     if verbose:
         start = time()
         logging.info("Detecting continuous columns")
-    res = table.apply(
-        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
-    )
+    res = table.apply(lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th))
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
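Putting the pieces of this hunk together, a column is flagged as continuous when enough of its cells parse to a number. A hedged, self-contained sketch of that idea; `check_threshold` is not shown in the diff, so its body below is an assumption:

from ast import literal_eval

import pandas as pd


def parses_to_integer(value: str):
    try:
        value = value.replace(",", ".")  # accept French decimal commas
        value = literal_eval(value)
        return type(value)
    except Exception:  # narrower than the bare `except:` removed by this change
        return False


def check_threshold(parsed: pd.Series, threshold: float) -> bool:
    # assumption: enough cells parse to a number for the column to count as continuous
    return parsed.astype(bool).mean() >= threshold


table = pd.DataFrame({"a": ["1,5", "2.0", "x"], "b": ["foo", "bar", "baz"]})
res = table.apply(lambda serie: check_threshold(serie.apply(parses_to_integer), 0.6))
print(res.to_dict())  # {'a': True, 'b': False}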
csv_detective/explore_csv.py CHANGED

@@ -55,7 +55,10 @@ def routine(
         dict: a dict with information about the csv and possible types for each column
     """

-    if not (
+    if not (
+        isinstance(save_results, bool)
+        or (isinstance(save_results, str) and save_results.endswith(".json"))
+    ):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

     if verbose:

@@ -100,8 +103,7 @@ def routine(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Routine completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )


@@ -119,7 +121,6 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
-
     if verbose:
         start_routine = time()
     if is_url(file_path):

@@ -170,8 +171,7 @@ def validate_then_detect(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Process completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )


@@ -226,8 +226,7 @@ def routine_minio(
     if location_dict is not None:
         if any(
             [
-                (location_key not in location_dict)
-                or (location_dict[location_key] is None)
+                (location_key not in location_dict) or (location_dict[location_key] is None)
                 for location_key in ["netloc", "bucket", "key"]
             ]
         ):
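The expanded condition in the first hunk accepts either a boolean or a `.json` path for `save_results`. A quick check of which values pass it (the condition is copied from the hunk, the rest is illustrative):

def _valid_save_results(save_results) -> bool:
    return isinstance(save_results, bool) or (
        isinstance(save_results, str) and save_results.endswith(".json")
    )

for value in [True, False, "analysis.json", "analysis.csv", 42]:
    print(value, _valid_save_results(value))
# True, False and "analysis.json" pass; "analysis.csv" and 42 make routine() raise ValueError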
csv_detective/load_tests.py CHANGED

@@ -1,8 +1,7 @@
 import os
 from typing import Union

-#
-from csv_detective import detect_fields, detect_labels
+from csv_detective import detect_fields, detect_labels  # noqa


 def get_all_packages(detect_type) -> list:

@@ -12,10 +11,7 @@ def get_all_packages(detect_type) -> list:
         for filename in filenames:
             file = os.path.join(dirpath, filename).replace(root_dir, "")
             if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
+                module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1]
                 if module:
                     modules.append(detect_type + module)
     return modules

@@ -43,20 +39,15 @@ def return_all_tests(
     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
         tests_to_do = [detect_type]
     else:
-        tests_to_do = [
-
-        ]
-        tests_skipped = [
-            f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-        ]
+        tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
+        tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
     all_tests = [
         # this is why we need to import detect_fields/labels
-        eval(x)
+        eval(x)
+        for x in all_packages
         if any([y == x[: len(y)] for y in tests_to_do])
         and all([y != x[: len(y)] for y in tests_skipped])
     ]
     # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
+    all_tests = [test for test in all_tests if "_is" in dir(test)]
     return all_tests
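The rewritten list comprehensions make the include/exclude logic easier to follow: entries keep their `detect_type` prefix, a leading `-` marks a test to skip, and matching is done on module-path prefixes. A toy run of that filter (module names taken from the file list above):

detect_type = "detect_fields"
user_input_tests = ["FR.geo", "-FR.geo.adresse"]

tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]

all_packages = [
    "detect_fields.FR.geo.adresse",
    "detect_fields.FR.geo.code_postal",
    "detect_fields.other.email",
]
kept = [
    x
    for x in all_packages
    if any(y == x[: len(y)] for y in tests_to_do)
    and all(y != x[: len(y)] for y in tests_skipped)
]
print(kept)  # ['detect_fields.FR.geo.code_postal']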
csv_detective/output/__init__.py CHANGED

@@ -5,6 +5,7 @@ from typing import Optional, Union
 import pandas as pd

 from csv_detective.utils import is_url
+
 from .dataframe import cast_df
 from .profile import create_profile
 from .schema import generate_table_schema

@@ -24,7 +25,6 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,

@@ -40,7 +40,7 @@ def generate_output(
     else:
         output_path = os.path.splitext(file_path)[0]
         if is_url(output_path):
-            output_path = output_path.split(
+            output_path = output_path.split("/")[-1]
         if analysis.get("sheet_name"):
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"

@@ -48,11 +48,7 @@ def generate_output(
         json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

     if output_schema:
-        analysis["schema"] = generate_table_schema(
-            analysis,
-            save_file=False,
-            verbose=verbose
-        )
+        analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)

     if output_df:
         return analysis, cast_df(
csv_detective/output/dataframe.py CHANGED

@@ -1,7 +1,7 @@
-from datetime import date, datetime
 import json
-from
+from datetime import date, datetime
 from time import time
+from typing import Optional, Union

 import pandas as pd


@@ -30,12 +30,16 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
     raise ValueError(f"Unknown type `{_type}`")


-def cast_df(
+def cast_df(
+    df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+) -> pd.DataFrame:
     if verbose:
         start = time()
     output_df = pd.DataFrame()
     for col_name, detection in columns.items():
-        if detection["python_type"] == "string" or (
+        if detection["python_type"] == "string" or (
+            detection["python_type"] == "json" and not cast_json
+        ):
             # no change if detected type is string
             output_df[col_name] = df[col_name].copy()
         elif detection["python_type"] == "int":

@@ -49,7 +53,7 @@ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bo
         del df[col_name]
     if verbose:
         display_logs_depending_process_time(
-            f
+            f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
     return output_df
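With the signature now spelled out, `cast_df` takes the raw dataframe plus the per-column detections and returns a typed copy. A hypothetical call shape (the column detections below are invented for illustration):

import pandas as pd

from csv_detective.output.dataframe import cast_df

df = pd.DataFrame({"code": ["01", "02"], "population": ["118", "765"]})
columns = {
    "code": {"python_type": "string", "format": "code_departement"},
    "population": {"python_type": "int", "format": "int"},
}
typed = cast_df(df, columns, cast_json=True)
print(typed.dtypes)  # expected: population cast to integers, code left as strings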
csv_detective/output/example.py CHANGED

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import random
 import string
-from typing import Union, Optional, Any, Type
 import uuid
+from datetime import datetime
+from typing import Any, Optional, Type, Union

-from faker import Faker
 import pandas as pd
 import requests
 import rstr
+from faker import Faker

 fake = Faker()


@@ -135,7 +135,7 @@ def create_example_csv_file(
         return random.choice(enum)
     if num_range is None:
         num_range = [0, 1000]
-    if num_type
+    if num_type is int:
         return random.randint(num_range[0], num_range[1])
     else:
         return round(random.uniform(num_range[0], num_range[1]), 1)

@@ -179,7 +179,7 @@ def create_example_csv_file(
         "yearmonth": "date",
         "time": "time",
         "datetime": "datetime",
-        "array": "array"
+        "array": "array",
     }

     if schema_path:

@@ -188,7 +188,7 @@ def create_example_csv_file(
     else:
         with open(schema_path, encoding=encoding) as jsonfile:
             schema = json.load(jsonfile)
-        if
+        if "fields" not in schema.keys():
             raise ValueError("The schema must have a 'fields' key.")
         else:
             fields = [

@@ -198,12 +198,14 @@ def create_example_csv_file(
                     # when frformat is supported in TableSchema, we can build args for French standards
                     # linked to https://github.com/datagouv/fr-format/issues/26
                     "args": (
-                        build_args_from_constraints(f["constraints"])
+                        build_args_from_constraints(f["constraints"])
+                        if "constraints" in f.keys()
                         else build_args_from_constraints(f["arrayItem"]["constraints"])
                         if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                         else {}
-                    )
-                }
+                    ),
+                }
+                for f in schema["fields"]
             ]

             for k in range(len(fields)):

@@ -234,10 +236,8 @@ def create_example_csv_file(
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
-            [
-
-            for f in fields
-            ] for _ in range(file_length)
+            [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+            for _ in range(file_length)
         ],
         columns=[f["name"] for f in fields],
     )
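The final hunk restores the row-building comprehension: one generator per TableSchema type, called once per field and per row. A toy version of that pattern, with invented field specs and generators:

import random

import pandas as pd

types_to_func = {"integer": lambda **kw: random.randint(0, 1000), "string": lambda **kw: "abc"}
fields = [
    {"name": "id", "type": "integer", "args": {}},
    {"name": "label", "type": "string", "args": {}},
]
file_length = 3
output = pd.DataFrame(
    [[types_to_func[f["type"]](**f["args"]) for f in fields] for _ in range(file_length)],
    columns=[f["name"] for f in fields],
)
print(output)  # 3 rows, one random value per field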
csv_detective/output/profile.py CHANGED

@@ -1,5 +1,5 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from time import time

 import pandas as pd

@@ -29,15 +29,12 @@ def create_profile(
     safe_table = table.copy()
     if not limited_output:
         dict_cols_fields = {
-            k: v[0] if v else {
+            k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
             for k, v in dict_cols_fields.items()
         }
-    dtypes = {
-        k: map_python_types.get(v["python_type"], str)
-        for k, v in dict_cols_fields.items()
-    }
+    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
     for c in safe_table.columns:
-        if dtypes[c]
+        if dtypes[c] is float:
             safe_table[c] = safe_table[c].apply(
                 lambda s: float_casting(s) if isinstance(s, str) else s
             )

@@ -48,18 +45,26 @@ def create_profile(
             int,
         ]:
             profile[c].update(
-                min=prevent_nan(
-
-
-
-
-
-
-
-
-
-
-
+                min=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].min()
+                    )
+                ),
+                max=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].max()
+                    )
+                ),
+                mean=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].mean()
+                    )
+                ),
+                std=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].std()
+                    )
+                ),
             )
             tops_bruts = (
                 safe_table[safe_table[c].notna()][c]

@@ -70,10 +75,12 @@ def create_profile(
             )
             tops = []
             for tb in tops_bruts:
-                tops.append(
-
-
-
+                tops.append(
+                    {
+                        "count": tb["count"],
+                        "value": tb[c],
+                    }
+                )
             profile[c].update(
                 tops=tops,
                 nb_distinct=safe_table[c].nunique(),
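Each numeric column's min/max/mean/std is cast back to the detected python type and guarded against NaN. A hedged sketch of that pattern; `prevent_nan` and `map_python_types` are not shown in the diff, so their definitions below are assumptions:

import math

import pandas as pd

map_python_types = {"int": int, "float": float}  # assumption


def prevent_nan(x):
    # assumption: NaN aggregates become None so they serialize cleanly to JSON
    return None if isinstance(x, float) and math.isnan(x) else x


col = pd.Series([1.0, 2.0, float("nan")])
python_type = "float"
stats = {
    "min": prevent_nan(map_python_types.get(python_type, str)(col.min())),
    "max": prevent_nan(map_python_types.get(python_type, str)(col.max())),
    "mean": prevent_nan(map_python_types.get(python_type, str)(col.mean())),
    "std": prevent_nan(map_python_types.get(python_type, str)(col.std())),
}
print(stats)  # each aggregate cast back to the detected python type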
csv_detective/output/schema.py CHANGED

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import logging
 import os
 import tempfile
+from datetime import datetime
 from time import time
 from typing import Optional

 from botocore.exceptions import ClientError

-from csv_detective.s3_utils import
+from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time


@@ -26,13 +26,11 @@ def get_description(format: str) -> str:
         "insee_canton": "Le nom du canton",
         "latitude_l93": "La latitude au format Lambert 93",
         "latitude_wgs_fr_metropole": (
-            "La latitude au format WGS. Ne concerne que des latitudes "
-            "de la métropole française"
+            "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
         ),
         "longitude_l93": "La longitude au format Lambert 93",
         "longitude_wgs_fr_metropole": (
-            "La longitude au format WGS. Ne concerne que des longitudes "
-            "de la métropole française"
+            "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
         ),
         "pays": "Le nom du pays",
         "region": "Le nom de la région",

@@ -86,13 +84,13 @@ def get_pattern(format: str) -> str:
         ),
         "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
         "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
-        "twitter": r
-        "mongo_object_id": r
-        "uuid": r
+        "twitter": r"^@[A-Za-z0-9_]+$",
+        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
+        "uuid": r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$",
         "url": (
-            r
-            r
-        )
+            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
+            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
+        ),
     }
     if format in format_to_pattern:
         return {"pattern": format_to_pattern[format]}

@@ -210,7 +208,7 @@ def generate_table_schema(
     key: Optional[str] = None,
     minio_user: Optional[str] = None,
     minio_pwd: Optional[str] = None,
-    verbose: bool = False
+    verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report


@@ -236,7 +234,7 @@ def generate_table_schema(
             "example": get_example(field_report["format"]),
             "type": get_validata_type(field_report["format"]),
             "formatFR": field_report["format"],
-            "constraints": get_constraints(field_report["format"])
+            "constraints": get_constraints(field_report["format"]),
         }
         for header, field_report in analysis_report["columns"].items()
     ]

@@ -255,12 +253,9 @@ def generate_table_schema(
         "sources": [
             {
                 "title": "Spécification Tableschema",
-                "path": "https://specs.frictionlessdata.io/table-schema"
+                "path": "https://specs.frictionlessdata.io/table-schema",
             },
-            {
-                "title": "schema.data.gouv.fr",
-                "path": "https://schema.data.gouv.fr"
-            }
+            {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
         ],
         "created": datetime.today().strftime("%Y-%m-%d"),
         "lastModified": datetime.today().strftime("%Y-%m-%d"),

@@ -278,7 +273,9 @@ def generate_table_schema(
     }

     if verbose:
-        display_logs_depending_process_time(
+        display_logs_depending_process_time(
+            f"Created schema in {round(time() - start, 3)}s", time() - start
+        )

     if not save_file:
         return schema

@@ -301,9 +298,9 @@ def generate_table_schema(
     if "Contents" in tableschema_objects:
         tableschema_keys = [
             tableschema["Key"]
-            for tableschema in client.list_objects(
-
-
+            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
+                "Contents"
+            ]
         ]
         tableschema_versions = [
             os.path.splitext(tableschema_key)[0].split("_")[-1]
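The regex patterns are now written inline rather than split across continuation lines; for example the twitter handle pattern from this hunk:

import re

pattern = r"^@[A-Za-z0-9_]+$"
print(bool(re.match(pattern, "@data_gouv")))  # True
print(bool(re.match(pattern, "data_gouv")))   # False, missing the leading @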
csv_detective/output/utils.py CHANGED

@@ -19,14 +19,17 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
         # no need to specify int and float everywhere, they are deprioritized anyway
         ("int", ("float",)),
         # bool over everything
-        (
-        "
-
-
-
-
-
-
+        (
+            "booleen",
+            (
+                "latitude_l93",
+                "latitude_wgs",
+                "latitude_wgs_fr_metropole",
+                "longitude_l93",
+                "longitude_wgs",
+                "longitude_wgs_fr_metropole",
+            ),
+        ),
         ("geojson", ("json",)),
         # latlon over lonlat if no longitude allows to discriminate
         ("latlon_wgs", ("json", "lonlat_wgs")),

@@ -49,13 +52,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
     for prio_format, secondary_formats in priorities:
         if prio_format in detected_formats:
             for secondary in secondary_formats:
-                if (
-
-
-
-                    >= return_dict_cols[column_name][secondary]
-                    or return_dict_cols[column_name][prio_format] >= 1
-                )
+                if secondary in detected_formats and (
+                    return_dict_cols[column_name][prio_format]
+                    >= return_dict_cols[column_name][secondary]
+                    or return_dict_cols[column_name][prio_format] >= 1
                 ):
                     formats_to_remove.add(secondary)

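The new `secondary in detected_formats` guard avoids touching formats that were never detected for the column; the rest of the priority rule is unchanged. A toy walk-through of the removal loop (scores invented):

priorities = [("booleen", ("latitude_wgs", "longitude_wgs"))]
column_scores = {"booleen": 1.0, "latitude_wgs": 0.7}  # stands in for return_dict_cols[column_name]

detected_formats = set(column_scores)
formats_to_remove = set()
for prio_format, secondary_formats in priorities:
    if prio_format in detected_formats:
        for secondary in secondary_formats:
            if secondary in detected_formats and (
                column_scores[prio_format] >= column_scores[secondary]
                or column_scores[prio_format] >= 1
            ):
                formats_to_remove.add(secondary)
print(formats_to_remove)  # {'latitude_wgs'}; longitude_wgs is skipped since it was never detected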