csv-detective 0.7.5.dev1180__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- csv_detective/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_fields/temp/date/__init__.py +5 -1
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
- csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
- csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detect_labels/other/email/__init__.py +1 -1
- csv_detective/detect_labels/other/float/__init__.py +1 -1
- csv_detective/detect_labels/other/int/__init__.py +1 -1
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detect_labels/other/twitter/__init__.py +1 -1
- csv_detective/detect_labels/other/url/__init__.py +1 -1
- csv_detective/detect_labels/other/uuid/__init__.py +1 -1
- csv_detective/detect_labels/temp/date/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
- csv_detective/detect_labels/temp/year/__init__.py +1 -1
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +27 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/headers.py +32 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +98 -0
- csv_detective/explore_csv.py +40 -110
- csv_detective/output/dataframe.py +55 -0
- csv_detective/{create_example.py → output/example.py} +10 -9
- csv_detective/output/profile.py +87 -0
- csv_detective/{schema_generation.py → output/schema.py} +344 -343
- csv_detective/output/utils.py +51 -0
- csv_detective/parsing/columns.py +141 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +55 -0
- csv_detective/parsing/excel.py +169 -0
- csv_detective/parsing/load.py +97 -0
- csv_detective/utils.py +10 -236
- {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +3 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +3 -2
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +85 -71
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +1 -1
- tests/test_fields.py +7 -6
- tests/test_file.py +56 -57
- csv_detective/detection.py +0 -618
- /csv_detective/{process_text.py → parsing/text.py} +0 -0
- {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info/licenses}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
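The bulk of this release is a refactor: the monolithic csv_detective/detection.py (removed, 618 lines) is split into detection/, parsing/ and output/ subpackages. For orientation, a short sketch of the new import paths, taken from the explore_csv.py hunks further down (written here as absolute forms of the relative imports shown there):

from csv_detective.detection.variables import detect_categorical_variable
from csv_detective.output.dataframe import cast_df
from csv_detective.output.profile import create_profile
from csv_detective.output.schema import generate_table_schema
from csv_detective.output.utils import prepare_output_dict
from csv_detective.parsing.load import load_file
from csv_detective.parsing.columns import test_col, test_label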
csv_detective/detection/headers.py
ADDED
@@ -0,0 +1,32 @@
+import logging
+from time import time
+from typing import Optional, TextIO
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
+    """Tests 10 first rows for possible header (in case header is not 1st row)"""
+    if verbose:
+        start = time()
+        logging.info("Detecting headers")
+    file.seek(0)
+    for i in range(10):
+        header = file.readline()
+        position = file.tell()
+        chaine = [c for c in header.replace("\n", "").split(sep) if c]
+        if chaine[-1] not in ["", "\n"] and all(
+            [mot not in ["", "\n"] for mot in chaine[1:-1]]
+        ):
+            next_row = file.readline()
+            file.seek(position)
+            if header != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f'Detected headers in {round(time() - start, 3)}s',
+                        time() - start,
+                    )
+                return i, chaine
+    if verbose:
+        logging.info('No header detected')
+    return 0, None
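Note (not part of the diff): a minimal sketch of calling the new helper on an already-open text handle; the file name is hypothetical and the separator is assumed known or detected beforehand.

from csv_detective.detection.headers import detect_headers

# hypothetical local file; detect_headers expects an open text handle and the separator
with open("example.csv", "r", encoding="utf-8") as f:
    header_row_idx, header = detect_headers(f, sep=";", verbose=True)
print(header_row_idx, header)  # index of the detected header row and the column names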
csv_detective/detection/rows.py
ADDED
@@ -0,0 +1,18 @@
+import pandas as pd
+
+
+def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
+    """Analog process to detect_headers for csv files, determines how many rows to skip
+    to end up with the header at the right place"""
+    idx = 0
+    if all([str(c).startswith('Unnamed:') for c in table.columns]):
+        # there is on offset between the index in the file (idx here)
+        # and the index in the dataframe, because of the header
+        idx = 1
+        while table.iloc[idx - 1].isna().all():
+            idx += 1
+        cols = table.iloc[idx - 1]
+        table = table.iloc[idx:]
+        table.columns = cols.to_list()
+    # +1 here because the headers should count as a row
+    return table, idx
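Note (not part of the diff): a made-up illustration of the behaviour, assuming a sheet whose real header sits a few rows down so pandas produces "Unnamed:" columns.

import pandas as pd

from csv_detective.detection.rows import remove_empty_first_rows

# made-up frame mimicking an Excel sheet read with two empty rows above the real header
raw = pd.DataFrame(
    [[None, None], [None, None], ["name", "age"], ["Ada", 36]],
    columns=["Unnamed: 0", "Unnamed: 1"],
)
table, skipped = remove_empty_first_rows(raw)
print(skipped)              # 3: two empty rows plus the header row itself
print(list(table.columns))  # ['name', 'age']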
csv_detective/detection/separator.py
ADDED
@@ -0,0 +1,44 @@
+import csv
+import logging
+from time import time
+from typing import TextIO
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_separator(file: TextIO, verbose: bool = False) -> str:
+    """Detects csv separator"""
+    # TODO: add a robust detection:
+    # si on a un point virgule comme texte et \t comme séparateur, on renvoie
+    # pour l'instant un point virgule
+    if verbose:
+        start = time()
+        logging.info("Detecting separator")
+    file.seek(0)
+    header = file.readline()
+    possible_separators = [";", ",", "|", "\t"]
+    sep_count = dict()
+    for sep in possible_separators:
+        sep_count[sep] = header.count(sep)
+    sep = max(sep_count, key=sep_count.get)
+    # testing that the first 10 (arbitrary) rows all have the same number of fields
+    # as the header. Prevents downstream unwanted behaviour where pandas can load
+    # the file (in a weird way) but the process is irrelevant.
+    file.seek(0)
+    reader = csv.reader(file, delimiter=sep)
+    rows_lengths = set()
+    for idx, row in enumerate(reader):
+        if idx > 10:
+            break
+        rows_lengths.add(len(row))
+    if len(rows_lengths) > 1:
+        raise ValueError(
+            f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+        )
+
+    if verbose:
+        display_logs_depending_process_time(
+            f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return sep
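Note (not part of the diff): a minimal sketch with made-up CSV content, using an in-memory text handle.

import io

from csv_detective.detection.separator import detect_separator

# made-up CSV content: semicolon-separated, same number of fields on every row
sample = "id;name;amount\n1;foo;3.5\n2;bar;4.2\n"
print(detect_separator(io.StringIO(sample), verbose=True))  # ";"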
csv_detective/detection/variables.py
ADDED
@@ -0,0 +1,98 @@
+from ast import literal_eval
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+    """
+    Detects whether a column contains continuous variables. We consider a continuous column
+    one that contains a considerable amount of float values.
+    We removed the integers as we then end up with postal codes, insee codes, and all sort
+    of codes and types.
+    This is not optimal but it will do for now.
+    """
+    # if we need this again in the future, could be first based on columns detected as int/float to cut time
+
+    def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
+        count = serie.value_counts().to_dict()
+        total_nb = len(serie)
+        if float in count:
+            nb_floats = count[float]
+        else:
+            return False
+        if nb_floats / total_nb >= continuous_th:
+            return True
+        else:
+            return False
+
+    def parses_to_integer(value: str):
+        try:
+            value = value.replace(",", ".")
+            value = literal_eval(value)
+            return type(value)
+        # flake8: noqa
+        except:
+            return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting continuous columns")
+    res = table.apply(
+        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
+    )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res]
+
+
+def detect_categorical_variable(
+    table: pd.DataFrame,
+    threshold_pct_categorical: float = 0.05,
+    max_number_categorical_values: int = 25,
+    verbose: bool = False,
+):
+    """
+    Heuristically detects whether a table (df) contains categorical values according to
+    the number of unique values contained.
+    As the idea of detecting categorical values is to then try to learn models to predict
+    them, we limit categorical values to at most 25 different modes or at most 5% disparity.
+    Postal code, insee code, code region and so on, may be thus not considered categorical values.
+    :param table:
+    :param threshold_pct_categorical:
+    :param max_number_categorical_values:
+    :return:
+    """
+
+    def abs_number_different_values(column_values: pd.Series):
+        return column_values.nunique()
+
+    def rel_number_different_values(column_values: pd.Series):
+        return column_values.nunique() / len(column_values)
+
+    def detect_categorical(column_values: pd.Series):
+        abs_unique_values = abs_number_different_values(column_values)
+        rel_unique_values = rel_number_different_values(column_values)
+        if (
+            abs_unique_values <= max_number_categorical_values
+            or rel_unique_values <= threshold_pct_categorical
+        ):
+            return True
+        return False
+
+    if verbose:
+        start = time()
+        logging.info("Detecting categorical columns")
+    res = table.apply(lambda serie: detect_categorical(serie))
+    if verbose:
+        display_logs_depending_process_time(
+            f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return res.index[res], res
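Note (not part of the diff): a made-up example of the categorical detection; the table contents are invented.

import pandas as pd

from csv_detective.detection.variables import detect_categorical_variable

# made-up table: "status" has 2 distinct values, "comment" is unique on every row
df = pd.DataFrame({
    "status": ["open", "closed", "open", "open", "closed"] * 20,
    "comment": [f"free text {i}" for i in range(100)],
})
categorical_cols, mask = detect_categorical_variable(df, verbose=True)
print(list(categorical_cols))  # ['status']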
csv_detective/explore_csv.py
CHANGED
@@ -1,49 +1,28 @@
-"""
-Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
-contenu possible des champs
-"""
-
-from typing import Dict, List, Union
 from collections import defaultdict
 import json
-import
+import logging
 import os
 import tempfile
-import logging
 from time import time
-import
-
+from typing import Union
+
+import numpy as np
 import pandas as pd

 # flake8: noqa
 from csv_detective import detect_fields, detect_labels
-from
-
-from csv_detective.utils import (
-    cast_df,
-    display_logs_depending_process_time,
-    prepare_output_dict,
-    test_col,
-    test_label,
-)
-from .detection import (
-    detect_engine,
-    detect_separator,
-    detect_encoding,
-    detect_headers,
-    detect_heading_columns,
-    detect_trailing_columns,
-    parse_table,
-    parse_excel,
-    create_profile,
-    detetect_categorical_variable,
+from .detection.variables import (
+    detect_categorical_variable,
     # detect_continuous_variable,
-    is_url,
-    XLS_LIKE_EXT,
 )
-
-
-
+from .output.dataframe import cast_df
+from .output.profile import create_profile
+from .output.schema import generate_table_schema
+from .output.utils import prepare_output_dict
+from .parsing.load import load_file
+from .parsing.columns import test_col, test_label
+from .s3_utils import download_from_minio, upload_to_minio
+from .utils import display_logs_depending_process_time, is_url


 def get_all_packages(detect_type) -> list:
@@ -104,9 +83,9 @@ def return_all_tests(


 def routine(
-
+    file_path: str,
     num_rows: int = 500,
-    user_input_tests: Union[str,
+    user_input_tests: Union[str, list[str]] = "ALL",
     limited_output: bool = True,
     save_results: Union[bool, str] = True,
     encoding: str = None,
@@ -123,7 +102,7 @@ def routine(
     column contents.

     Args:
-
+        file_path: local path to CSV file if not using Minio
         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
             of the whole file
         user_input_tests: tests to run on the file
@@ -140,89 +119,40 @@ def routine(
     Returns:
         dict: a dict with information about the csv and possible types for each column
     """
-    if not csv_file_path:
-        raise ValueError("csv_file_path is required.")

     if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

     if verbose:
         start_routine = time()
-    if is_url(
+    if is_url(file_path):
         logging.info("Path recognized as a URL")

-
-
-
-
-
-
-
-
-        is_xls_like = True
-        encoding, sep, heading_columns, trailing_columns = None, None, None, None
-        table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
-            csv_file_path=csv_file_path,
-            num_rows=num_rows,
-            engine=engine,
-            sheet_name=sheet_name,
-            verbose=verbose,
-        )
-        header = table.columns.to_list()
-    else:
-        if encoding is None:
-            encoding = detect_encoding(csv_file_path, verbose=verbose)
-        if is_url(csv_file_path):
-            r = requests.get(csv_file_path, allow_redirects=True)
-            r.raise_for_status()
-            str_file = StringIO(r.content.decode(encoding=encoding))
-        else:
-            str_file = open(csv_file_path, "r", encoding=encoding)
-        if sep is None:
-            sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-            return {"error": True}
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return {"error": True}
-        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-        table, total_lines, nb_duplicates = parse_table(
-            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-        )
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
+    )

     if table.empty:
         res_categorical = []
         # res_continuous = []
     else:
         # Detects columns that are categorical
-        res_categorical, categorical_mask =
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
         res_categorical = list(res_categorical)
         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
         # res_continuous = list(
         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
         # )

-
-    analysis = {
-        "header_row_idx": header_row_idx,
-        "header": header,
-        "total_lines": total_lines,
-        "nb_duplicates": nb_duplicates,
-        "heading_columns": heading_columns,
-        "trailing_columns": trailing_columns,
+    analysis.update({
         "categorical": res_categorical,
         # "continuous": res_continuous,
-    }
-    # this is only relevant for xls-like
-    if is_xls_like:
-        analysis["engine"] = engine
-        analysis["sheet_name"] = sheet_name
-    # this is only relevant for csv
-    else:
-        analysis["encoding"] = encoding
-        analysis["separator"] = sep
+    })

     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -341,10 +271,10 @@ def routine(
     if isinstance(save_results, str):
         output_path = save_results
     else:
-        output_path = os.path.splitext(
+        output_path = os.path.splitext(file_path)[0]
     if is_url(output_path):
         output_path = output_path.split('/')[-1]
-    if
+    if analysis.get("sheet_name"):
         output_path += "_sheet-" + str(sheet_name)
     output_path += ".json"
     with open(output_path, "w", encoding="utf8") as fp:
@@ -372,13 +302,13 @@ def routine(


 def routine_minio(
-    csv_minio_location:
-    output_minio_location:
-    tableschema_minio_location:
+    csv_minio_location: dict[str, str],
+    output_minio_location: dict[str, str],
+    tableschema_minio_location: dict[str, str],
     minio_user: str,
     minio_pwd: str,
     num_rows: int = 500,
-    user_input_tests: Union[str,
+    user_input_tests: Union[str, list[str]] = "ALL",
     encoding: str = None,
     sep: str = None,
 ):
@@ -436,18 +366,18 @@ def routine_minio(
     ):
         raise ValueError("Minio location dict must contain url, bucket and key")

-
+    file_path = tempfile.NamedTemporaryFile(delete=False).name
     download_from_minio(
         netloc=csv_minio_location["netloc"],
         bucket=csv_minio_location["bucket"],
         key=csv_minio_location["key"],
-        filepath=
+        filepath=file_path,
         minio_user=minio_user,
         minio_pwd=minio_pwd,
     )

     analysis = routine(
-
+        file_path,
         num_rows,
         user_input_tests,
         output_mode="LIMITED",
@@ -457,7 +387,7 @@ def routine_minio(
     )

     # Write report JSON file.
-    output_path_to_store_minio_file = os.path.splitext(
+    output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
         json.dump(analysis, fp, indent=4, separators=(",", ": "))

@@ -471,7 +401,7 @@ def routine_minio(
     )

     os.remove(output_path_to_store_minio_file)
-    os.remove(
+    os.remove(file_path)

     generate_table_schema(
         analysis,
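Note (not part of the diff): with this refactor the entry point takes file_path instead of csv_file_path. A minimal call sketch; the file name is made up, and keyword arguments other than those visible in the hunks above are assumed unchanged.

from csv_detective.explore_csv import routine

analysis = routine(
    file_path="data.csv",   # hypothetical local CSV
    num_rows=500,
    save_results=False,
    verbose=True,
)
print(sorted(analysis.keys()))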
csv_detective/output/dataframe.py
ADDED
@@ -0,0 +1,55 @@
+from datetime import date, datetime
+import json
+from typing import Optional, Union
+from time import time
+
+import pandas as pd
+
+from csv_detective.detect_fields.other.booleen import bool_casting
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.utils import display_logs_depending_process_time
+
+
+def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
+        return None
+    if _type == "float":
+        return float_casting(value)
+    if _type == "bool":
+        return bool_casting(value)
+    if _type == "json":
+        # in hydra json are given to postgres as strings, conversion is done by postgres
+        return json.loads(value)
+    if _type == "date":
+        _date = date_casting(value)
+        return _date.date() if _date else None
+    if _type == "datetime":
+        return date_casting(value)
+    raise ValueError(f"Unknown type `{_type}`")
+
+
+def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+    if verbose:
+        start = time()
+    output_df = pd.DataFrame()
+    for col_name, detection in columns.items():
+        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+            # no change if detected type is string
+            output_df[col_name] = df[col_name].copy()
+        elif detection["python_type"] == "int":
+            # to allow having ints and NaN in the same column
+            output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+        else:
+            output_df[col_name] = df[col_name].apply(
+                lambda col: cast(col, _type=detection["python_type"])
+            )
+        # to save RAM
+        del df[col_name]
+    if verbose:
+        display_logs_depending_process_time(
+            f'Casting columns completed in {round(time() - start, 3)}s',
+            time() - start,
+        )
+    return output_df
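Note (not part of the diff): a hedged sketch of calling the new casting helper directly; the frame and detection mapping are made up, and the exact parsing rules live in the field-level helpers (float_casting, bool_casting, date_casting).

import pandas as pd

from csv_detective.output.dataframe import cast_df

# made-up frame of raw strings plus a detection mapping with one "python_type" per column
df = pd.DataFrame({
    "price": ["1,5", "2.75", ""],
    "active": ["true", "false", "true"],
})
columns = {
    "price": {"python_type": "float"},
    "active": {"python_type": "bool"},
}
typed = cast_df(df, columns, verbose=True)
print(typed.dtypes)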
csv_detective/create_example.py → csv_detective/output/example.py
RENAMED
@@ -1,13 +1,14 @@
+from datetime import datetime
+import json
 import random
-import uuid
 import string
-from
+from typing import Union, Optional, Any, Type
+import uuid
+
+from faker import Faker
 import pandas as pd
-from typing import List, Union, Optional, Any, Type
-import json
 import requests
 import rstr
-from faker import Faker

 fake = Faker()

@@ -69,7 +70,7 @@ def create_example_csv_file(
         return str(uuid.uuid4())

     def _date(
-        date_range: Union[None,
+        date_range: Union[None, list[str]] = None,
         format: str = '%Y-%m-%d',
         required: bool = True,
     ) -> str:
@@ -98,7 +99,7 @@ def create_example_csv_file(
         return fake.time(format)

     def _datetime(
-        datetime_range: Optional[
+        datetime_range: Optional[list[str]] = None,
         format: str = '%Y-%m-%d %H-%M-%S',
         required: bool = True,
     ) -> str:
@@ -123,7 +124,7 @@ def create_example_csv_file(

     def _number(
         num_type: Type[Union[int, float]] = int,
-        num_range: Optional[
+        num_range: Optional[list[float]] = None,
         enum: Optional[list] = None,
         required: bool = True,
     ) -> Union[int, float]:
@@ -144,7 +145,7 @@ def create_example_csv_file(
             return ''
         return random.randint(0, 1) == 0

-    def _array(enum:
+    def _array(enum: list[Any], required: bool = True) -> str:
         if potential_skip(required):
             return ''
         return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
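Note (not part of the diff): these signature changes replace typing.List-style annotations with the built-in generics (list[str], list[float], list[Any]), which require Python 3.9+ (PEP 585). A small, hypothetical illustration of the two spellings:

# Built-in generics (PEP 585), valid on Python 3.9 and later
def head(values: list[str], count: int) -> list[str]:
    return values[:count]

# Pre-3.9 spelling using the typing module
from typing import List

def head_legacy(values: List[str], count: int) -> List[str]:
    return values[:count]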
csv_detective/output/profile.py
ADDED
@@ -0,0 +1,87 @@
+from collections import defaultdict
+import logging
+from time import time
+
+import pandas as pd
+
+from csv_detective.detect_fields.other.float import float_casting
+from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+
+
+def create_profile(
+    table: pd.DataFrame,
+    dict_cols_fields: dict,
+    num_rows: int,
+    limited_output: bool = True,
+    verbose: bool = False,
+) -> dict:
+    if verbose:
+        start = time()
+        logging.info("Creating profile")
+    map_python_types = {
+        "string": str,
+        "int": float,
+        "float": float,
+    }
+
+    if num_rows > 0:
+        raise ValueError("To create profiles num_rows has to be set to -1")
+    safe_table = table.copy()
+    if not limited_output:
+        dict_cols_fields = {
+            k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+            for k, v in dict_cols_fields.items()
+        }
+    dtypes = {
+        k: map_python_types.get(v["python_type"], str)
+        for k, v in dict_cols_fields.items()
+    }
+    for c in safe_table.columns:
+        if dtypes[c] == float:
+            safe_table[c] = safe_table[c].apply(
+                lambda s: float_casting(s) if isinstance(s, str) else s
+            )
+    profile = defaultdict(dict)
+    for c in safe_table.columns:
+        if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
+            float,
+            int,
+        ]:
+            profile[c].update(
+                min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].min()
+                )),
+                max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].max()
+                )),
+                mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].mean()
+                )),
+                std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                    safe_table[c].std()
+                )),
+            )
+        tops_bruts = (
+            safe_table[safe_table[c].notna()][c]
+            .value_counts(dropna=True)
+            .reset_index()
+            .iloc[:10]
+            .to_dict(orient="records")
+        )
+        tops = []
+        for tb in tops_bruts:
+            tops.append({
+                "count": tb["count"],
+                "value": tb[c],
+            })
+        profile[c].update(
+            tops=tops,
+            nb_distinct=safe_table[c].nunique(),
+            nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+        )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Created profile in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return profile
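Note (not part of the diff): a hedged usage sketch of create_profile with made-up inputs; dict_cols_fields mirrors the per-column detection output (with limited_output=True each column maps to a single dict carrying at least "python_type").

import pandas as pd

from csv_detective.output.profile import create_profile

table = pd.DataFrame({"amount": ["1.5", "2.0", "2.0", "3.25"]})  # made-up data
dict_cols_fields = {"amount": {"python_type": "float", "format": "float", "score": 1.0}}

# num_rows must be -1: profiles only make sense on the full file
profile = create_profile(table, dict_cols_fields, num_rows=-1, verbose=True)
print(profile["amount"])  # min/max/mean/std, top values, nb_distinct, nb_missing_values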