csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
- csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detect_labels/other/email/__init__.py +1 -1
- csv_detective/detect_labels/other/float/__init__.py +1 -1
- csv_detective/detect_labels/other/int/__init__.py +1 -1
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detect_labels/other/twitter/__init__.py +1 -1
- csv_detective/detect_labels/other/url/__init__.py +1 -1
- csv_detective/detect_labels/other/uuid/__init__.py +1 -1
- csv_detective/detect_labels/temp/date/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
- csv_detective/detect_labels/temp/year/__init__.py +1 -1
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/explore_csv.py +40 -124
- csv_detective/output/dataframe.py +55 -0
- csv_detective/{create_example.py → output/example.py} +10 -9
- csv_detective/output/profile.py +87 -0
- csv_detective/{schema_generation.py → output/schema.py} +344 -343
- csv_detective/output/utils.py +51 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/utils.py +10 -236
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +84 -70
- tests/test_fields.py +7 -6
- tests/test_file.py +15 -14
- csv_detective/detection.py +0 -633
- /csv_detective/{process_text.py → parsing/text.py} +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/detection/rows.py
ADDED
@@ -0,0 +1,18 @@
+import pandas as pd
+
+
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
+    """Analog process to detect_headers for csv files, determines how many rows to skip
+    to end up with the header at the right place"""
+    idx = 0
+    if all([str(c).startswith('Unnamed:') for c in table.columns]):
+        # there is on offset between the index in the file (idx here)
+        # and the index in the dataframe, because of the header
+        idx = 1
+        while table.iloc[idx - 1].isna().all():
+            idx += 1
+        cols = table.iloc[idx - 1]
+        table = table.iloc[idx:]
+        table.columns = cols.to_list()
+    # +1 here because the headers should count as a row
+    return table, idx
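For illustration, a minimal usage sketch of this new helper (module path taken from the file list above; the sample frame mimics how pandas names columns when a sheet starts with empty rows, and the data is made up):

import pandas as pd

from csv_detective.detection.rows import remove_empty_first_rows

# Mimics what pandas returns when the first sheet row is empty: every column
# comes back as "Unnamed: N" and the real header is buried in the data rows.
raw = pd.DataFrame(
    [[None, None], ["city", "population"], ["Paris", 2145906]],
    columns=["Unnamed: 0", "Unnamed: 1"],
)
table, skipped = remove_empty_first_rows(raw)
print(skipped)              # 2: one empty row plus the header row
print(list(table.columns))  # ['city', 'population']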
csv_detective/detection/separator.py
ADDED
@@ -0,0 +1,44 @@
+import csv
+import logging
+from time import time
+from typing import TextIO
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
+    """Detects csv separator"""
+    # TODO: add a robust detection:
+    # si on a un point virgule comme texte et \t comme séparateur, on renvoie
+    # pour l'instant un point virgule
+    if verbose:
+        start = time()
+        logging.info("Detecting separator")
+    file.seek(0)
+    header = file.readline()
+    possible_separators = [";", ",", "|", "\t"]
+    sep_count = dict()
+    for sep in possible_separators:
+        sep_count[sep] = header.count(sep)
+    sep = max(sep_count, key=sep_count.get)
+    # testing that the first 10 (arbitrary) rows all have the same number of fields
+    # as the header. Prevents downstream unwanted behaviour where pandas can load
+    # the file (in a weird way) but the process is irrelevant.
+    file.seek(0)
+    reader = csv.reader(file, delimiter=sep)
+    rows_lengths = set()
+    for idx, row in enumerate(reader):
+        if idx > 10:
+            break
+        rows_lengths.add(len(row))
+    if len(rows_lengths) > 1:
+        raise ValueError(
+            f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+        )
+
+    if verbose:
+        display_logs_depending_process_time(
+            f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return sep
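For illustration, a short sketch of the separator detection on an in-memory sample (module path from the file list above; the text stands in for an open file handle):

import io

from csv_detective.detection.separator import detect_separator

sample = "name;age;city\nAda;36;London\nLin;29;Paris\n"
# Counts candidate separators in the header line, then checks that the first
# rows keep a constant field count with the winning separator.
print(detect_separator(io.StringIO(sample)))  # ';'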
csv_detective/detection/variables.py
ADDED
@@ -0,0 +1,98 @@
+from ast import literal_eval
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+    """
+    Detects whether a column contains continuous variables. We consider a continuous column
+    one that contains a considerable amount of float values.
+    We removed the integers as we then end up with postal codes, insee codes, and all sort
+    of codes and types.
+    This is not optimal but it will do for now.
+    """
+    # if we need this again in the future, could be first based on columns detected as int/float to cut time
+
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
+        count = serie.value_counts().to_dict()
+        total_nb = len(serie)
+        if float in count:
+            nb_floats = count[float]
+        else:
+            return False
+        if nb_floats / total_nb >= continuous_th:
+            return True
+        else:
+            return False
+
+    def parses_to_integer(value: str):
+        try:
+            value = value.replace(",", ".")
+            value = literal_eval(value)
+            return type(value)
+        # flake8: noqa
+        except:
+            return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting continuous columns")
+    res = table.apply(
+        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
+    )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res]
+
+
+def detect_categorical_variable(
+    table: pd.DataFrame,
+    threshold_pct_categorical: float = 0.05,
+    max_number_categorical_values: int = 25,
+    verbose: bool = False,
+):
+    """
+    Heuristically detects whether a table (df) contains categorical values according to
+    the number of unique values contained.
+    As the idea of detecting categorical values is to then try to learn models to predict
+    them, we limit categorical values to at most 25 different modes or at most 5% disparity.
+    Postal code, insee code, code region and so on, may be thus not considered categorical values.
+    :param table:
+    :param threshold_pct_categorical:
+    :param max_number_categorical_values:
+    :return:
+    """
+
+    def abs_number_different_values(column_values: pd.Series):
+        return column_values.nunique()
+
+    def rel_number_different_values(column_values: pd.Series):
+        return column_values.nunique() / len(column_values)
+
+    def detect_categorical(column_values: pd.Series):
+        abs_unique_values = abs_number_different_values(column_values)
+        rel_unique_values = rel_number_different_values(column_values)
+        if (
+            abs_unique_values <= max_number_categorical_values
+            or rel_unique_values <= threshold_pct_categorical
+        ):
+            return True
+        return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting categorical columns")
+    res = table.apply(lambda serie: detect_categorical(serie))
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res], res
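For illustration, a sketch of the categorical detection with its default thresholds (at most 25 distinct values, or at most 5% of the rows); column names and values are invented:

import pandas as pd

from csv_detective.detection.variables import detect_categorical_variable

df = pd.DataFrame({
    "code_region": ["11", "24", "27", "28"] * 25,   # 4 distinct values over 100 rows
    "montant": [str(i / 2) for i in range(100)],     # all values distinct
})
categorical_cols, mask = detect_categorical_variable(df)
print(list(categorical_cols))  # ['code_region']
print(mask.tolist())           # [True, False]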
csv_detective/explore_csv.py
CHANGED
@@ -1,52 +1,28 @@
-"""
-Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
-contenu possible des champs
-"""
-
-from typing import Dict, List, Union
 from collections import defaultdict
 import json
-import
+import logging
 import os
 import tempfile
-import logging
 from time import time
-import
-
+from typing import Union
+
+import numpy as np
 import pandas as pd

 # flake8: noqa
 from csv_detective import detect_fields, detect_labels
-from
-
-from csv_detective.utils import (
-    cast_df,
-    display_logs_depending_process_time,
-    prepare_output_dict,
-    test_col,
-    test_label,
-)
-from .detection import (
-    detect_engine,
-    detect_separator,
-    detect_encoding,
-    detect_headers,
-    detect_heading_columns,
-    detect_trailing_columns,
-    parse_table,
-    parse_excel,
-    create_profile,
-    detetect_categorical_variable,
+from .detection.variables import (
+    detect_categorical_variable,
     # detect_continuous_variable,
-    is_url,
-    unzip,
-    XLS_LIKE_EXT,
-    EXCEL_ENGINES,
-    COMPRESSION_ENGINES,
 )
-
-
-
+from .output.dataframe import cast_df
+from .output.profile import create_profile
+from .output.schema import generate_table_schema
+from .output.utils import prepare_output_dict
+from .parsing.load import load_file
+from .parsing.columns import test_col, test_label
+from .s3_utils import download_from_minio, upload_to_minio
+from .utils import display_logs_depending_process_time, is_url


 def get_all_packages(detect_type) -> list:
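In short, this import hunk swaps the monolithic csv_detective.detection and csv_detective.utils imports for the new sub-packages; downstream code would now import from the new locations, for example:

from csv_detective.detection.variables import detect_categorical_variable
from csv_detective.output.dataframe import cast_df
from csv_detective.output.profile import create_profile
from csv_detective.parsing.load import load_file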
@@ -107,9 +83,9 @@ def return_all_tests(


 def routine(
-
+    file_path: str,
     num_rows: int = 500,
-    user_input_tests: Union[str,
+    user_input_tests: Union[str, list[str]] = "ALL",
     limited_output: bool = True,
     save_results: Union[bool, str] = True,
     encoding: str = None,
@@ -126,7 +102,7 @@ def routine(
     column contents.

     Args:
-
+        file_path: local path to CSV file if not using Minio
         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
             of the whole file
         user_input_tests: tests to run on the file
@@ -143,100 +119,40 @@ def routine(
     Returns:
         dict: a dict with information about the csv and possible types for each column
     """
-    if not csv_file_path:
-        raise ValueError("csv_file_path is required.")

     if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

     if verbose:
         start_routine = time()
-    if is_url(
+    if is_url(file_path):
         logging.info("Path recognized as a URL")

-
-
-
-
-
-
-
-
-        is_xls_like = True
-        encoding, sep, heading_columns, trailing_columns = None, None, None, None
-        table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
-            csv_file_path=csv_file_path,
-            num_rows=num_rows,
-            engine=engine,
-            sheet_name=sheet_name,
-            verbose=verbose,
-        )
-        header = table.columns.to_list()
-    else:
-        # fetching or reading file as binary
-        if is_url(csv_file_path):
-            r = requests.get(csv_file_path, allow_redirects=True)
-            r.raise_for_status()
-            binary_file = BytesIO(r.content)
-        else:
-            binary_file = open(csv_file_path, "rb")
-        # handling compression
-        if engine in COMPRESSION_ENGINES:
-            binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
-        # detecting encoding if not specified
-        if encoding is None:
-            encoding: str = detect_encoding(binary_file, verbose=verbose)
-            binary_file.seek(0)
-        # decoding and reading file
-        if is_url(csv_file_path) or engine in COMPRESSION_ENGINES:
-            str_file = StringIO(binary_file.read().decode(encoding=encoding))
-        else:
-            str_file = open(csv_file_path, "r", encoding=encoding)
-        if sep is None:
-            sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-            return {"error": True}
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return {"error": True}
-        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-        table, total_lines, nb_duplicates = parse_table(
-            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-        )
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
+    )

     if table.empty:
         res_categorical = []
         # res_continuous = []
     else:
         # Detects columns that are categorical
-        res_categorical, categorical_mask =
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
         res_categorical = list(res_categorical)
         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
         # res_continuous = list(
         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
         # )

-
-    analysis = {
-        "header_row_idx": header_row_idx,
-        "header": header,
-        "total_lines": total_lines,
-        "nb_duplicates": nb_duplicates,
-        "heading_columns": heading_columns,
-        "trailing_columns": trailing_columns,
+    analysis.update({
         "categorical": res_categorical,
         # "continuous": res_continuous,
-    }
-    # this is only relevant for xls-like
-    if is_xls_like:
-        analysis["engine"] = engine
-        analysis["sheet_name"] = sheet_name
-    # this is only relevant for csv
-    else:
-        analysis["encoding"] = encoding
-        analysis["separator"] = sep
+    })

     # list testing to be performed
     all_tests_fields = return_all_tests(
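The whole CSV/Excel branching removed above now sits behind a single call. A sketch of the new entry point as used in this hunk (the file name is hypothetical; the keys of the returned analysis dict are assumed to match what the removed inline code used to build):

from csv_detective.parsing.load import load_file

table, analysis = load_file(
    file_path="data/my_table.csv",  # hypothetical local file
    num_rows=500,
    encoding=None,   # presumably auto-detected when None, as in the removed code
    sep=None,        # likewise auto-detected
    verbose=False,
    sheet_name=None,
)
print(type(table), sorted(analysis))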
@@ -355,10 +271,10 @@ def routine(
         if isinstance(save_results, str):
             output_path = save_results
         else:
-            output_path = os.path.splitext(
+            output_path = os.path.splitext(file_path)[0]
         if is_url(output_path):
             output_path = output_path.split('/')[-1]
-        if
+        if analysis.get("sheet_name"):
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
         with open(output_path, "w", encoding="utf8") as fp:
@@ -386,13 +302,13 @@ def routine(


 def routine_minio(
-    csv_minio_location:
-    output_minio_location:
-    tableschema_minio_location:
+    csv_minio_location: dict[str, str],
+    output_minio_location: dict[str, str],
+    tableschema_minio_location: dict[str, str],
     minio_user: str,
     minio_pwd: str,
     num_rows: int = 500,
-    user_input_tests: Union[str,
+    user_input_tests: Union[str, list[str]] = "ALL",
     encoding: str = None,
     sep: str = None,
 ):
@@ -450,18 +366,18 @@ def routine_minio(
     ):
         raise ValueError("Minio location dict must contain url, bucket and key")

-
+    file_path = tempfile.NamedTemporaryFile(delete=False).name
     download_from_minio(
         netloc=csv_minio_location["netloc"],
         bucket=csv_minio_location["bucket"],
         key=csv_minio_location["key"],
-        filepath=
+        filepath=file_path,
         minio_user=minio_user,
         minio_pwd=minio_pwd,
     )

     analysis = routine(
-
+        file_path,
         num_rows,
         user_input_tests,
         output_mode="LIMITED",
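The location arguments annotated earlier as dict[str, str] are plain mappings; a sketch of the expected shape based on the keys read by download_from_minio in this hunk (values are placeholders):

csv_minio_location = {
    "netloc": "object-store.example.org",  # hypothetical Minio endpoint
    "bucket": "datasets",
    "key": "exports/my_table.csv",
}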
@@ -471,7 +387,7 @@ def routine_minio(
     )

     # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(
+    output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
         json.dump(analysis, fp, indent=4, separators=(",", ": "))

@@ -485,7 +401,7 @@ def routine_minio(
     )

     os.remove(output_path_to_store_minio_file)
-    os.remove(
+    os.remove(file_path)

     generate_table_schema(
         analysis,
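After this refactor the public entry point takes file_path instead of csv_file_path. A minimal call sketch using only arguments visible in the hunks above (the path is hypothetical):

from csv_detective.explore_csv import routine

report = routine(
    file_path="data/my_table.csv",  # hypothetical local file
    num_rows=500,
    save_results=False,
)
print(report.get("categorical"))  # filled by the analysis.update({...}) above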
csv_detective/output/dataframe.py
ADDED
@@ -0,0 +1,55 @@
+from datetime import date, datetime
+import json
+from typing import Optional, Union
+from time import time
+
+import pandas as pd
+
+from csv_detective.detect_fields.other.booleen import bool_casting
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    if _type == "float":
+        return float_casting(value)
+    if _type == "bool":
+        return bool_casting(value)
+    if _type == "json":
+        # in hydra json are given to postgres as strings, conversion is done by postgres
+        return json.loads(value)
+    if _type == "date":
+        _date = date_casting(value)
+        return _date.date() if _date else None
+    if _type == "datetime":
+        return date_casting(value)
+    raise ValueError(f"Unknown type `{_type}`")
+
+
+def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+    if verbose:
+        start = time()
+    output_df = pd.DataFrame()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+            # no change if detected type is string
+            output_df[col_name] = df[col_name].copy()
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+        else:
+            output_df[col_name] = df[col_name].apply(
+                lambda col: cast(col, _type=detection["python_type"])
+            )
+        # to save RAM
+        del df[col_name]
+    if verbose:
+        display_logs_depending_process_time(
+            f'Casting columns completed in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return output_df
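For illustration, a sketch of how cast_df consumes the per-column detections; the columns dict below only carries the "python_type" key that cast_df actually reads, while the real detection output has more fields:

import pandas as pd

from csv_detective.output.dataframe import cast_df

df = pd.DataFrame({
    "montant": ["3.14", "", "2.5"],
    "commune": ["Paris", "Lyon", "Brest"],
})
columns = {
    "montant": {"python_type": "float"},   # cast via float_casting, '' becomes None
    "commune": {"python_type": "string"},  # copied as-is
}
typed = cast_df(df, columns)
print(typed.dtypes)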
csv_detective/{create_example.py → output/example.py}
RENAMED
@@ -1,13 +1,14 @@
+from datetime import datetime
+import json
 import random
-import uuid
 import string
-from
+from typing import Union, Optional, Any, Type
+import uuid
+
+from faker import Faker
 import pandas as pd
-from typing import List, Union, Optional, Any, Type
-import json
 import requests
 import rstr
-from faker import Faker

 fake = Faker()

@@ -69,7 +70,7 @@ def create_example_csv_file(
         return str(uuid.uuid4())

     def _date(
-        date_range: Union[None,
+        date_range: Union[None, list[str]] = None,
         format: str = '%Y-%m-%d',
         required: bool = True,
     ) -> str:
@@ -98,7 +99,7 @@ def create_example_csv_file(
         return fake.time(format)

     def _datetime(
-        datetime_range: Optional[
+        datetime_range: Optional[list[str]] = None,
         format: str = '%Y-%m-%d %H-%M-%S',
         required: bool = True,
     ) -> str:
@@ -123,7 +124,7 @@ def create_example_csv_file(

     def _number(
         num_type: Type[Union[int, float]] = int,
-        num_range: Optional[
+        num_range: Optional[list[float]] = None,
         enum: Optional[list] = None,
         required: bool = True,
     ) -> Union[int, float]:
@@ -144,7 +145,7 @@ def create_example_csv_file(
             return ''
         return random.randint(0, 1) == 0

-    def _array(enum:
+    def _array(enum: list[Any], required: bool = True) -> str:
         if potential_skip(required):
             return ''
         return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
csv_detective/output/profile.py
ADDED
@@ -0,0 +1,87 @@
+from collections import defaultdict
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+
+
+def create_profile(
+    table: pd.DataFrame,
+    dict_cols_fields: dict,
+    num_rows: int,
+    limited_output: bool = True,
+    verbose: bool = False,
+) -> dict:
+    if verbose:
+        start = time()
+        logging.info("Creating profile")
+    map_python_types = {
+        "string": str,
+        "int": float,
+        "float": float,
+    }
+
+    if num_rows > 0:
+        raise ValueError("To create profiles num_rows has to be set to -1")
+    safe_table = table.copy()
+    if not limited_output:
+        dict_cols_fields = {
+            k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+            for k, v in dict_cols_fields.items()
+        }
+    dtypes = {
+        k: map_python_types.get(v["python_type"], str)
+        for k, v in dict_cols_fields.items()
+    }
+    for c in safe_table.columns:
+        if dtypes[c] == float:
+            safe_table[c] = safe_table[c].apply(
+                lambda s: float_casting(s) if isinstance(s, str) else s
+            )
+    profile = defaultdict(dict)
+    for c in safe_table.columns:
+        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
+            float,
+            int,
+        ]:
+            profile[c].update(
+                min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].min()
+                )),
+                max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].max()
+                )),
+                mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].mean()
+                )),
+                std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].std()
+                )),
+            )
+        tops_bruts = (
+            safe_table[safe_table[c].notna()][c]
+            .value_counts(dropna=True)
+            .reset_index()
+            .iloc[:10]
+            .to_dict(orient="records")
+        )
+        tops = []
+        for tb in tops_bruts:
+            tops.append({
+                "count": tb["count"],
+                "value": tb[c],
+            })
+        profile[c].update(
+            tops=tops,
+            nb_distinct=safe_table[c].nunique(),
+            nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+        )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Created profile in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return profile
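For illustration, a sketch of the profile output on a single float column; the detections dict mirrors the per-column shape read above (at least a "python_type" key), the data is invented, and num_rows must be -1 because only full-file analyses can be profiled:

import pandas as pd

from csv_detective.output.profile import create_profile

df = pd.DataFrame({"montant": ["1.5", "2.5", None, "4.0"]})
detections = {"montant": {"python_type": "float", "format": "float", "score": 1.0}}

profile = create_profile(df, detections, num_rows=-1)
print(profile["montant"]["min"], profile["montant"]["max"])  # e.g. 1.5 4.0
print(profile["montant"]["nb_missing_values"])               # 1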