csv-detective 0.8.1.dev1729__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- csv_detective/__init__.py +7 -2
- csv_detective/detect_fields/temp/date/__init__.py +28 -0
- csv_detective/detect_fields/temp/datetime_aware/__init__.py +12 -2
- csv_detective/detect_fields/temp/datetime_naive/__init__.py +12 -2
- csv_detective/detection/formats.py +39 -2
- csv_detective/output/__init__.py +14 -9
- csv_detective/output/dataframe.py +5 -9
- csv_detective/output/profile.py +25 -46
- csv_detective/parsing/columns.py +1 -1
- csv_detective/parsing/csv.py +1 -3
- csv_detective/parsing/excel.py +3 -3
- {csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/RECORD +20 -20
- tests/test_example.py +1 -1
- tests/test_fields.py +23 -3
- tests/test_file.py +17 -0
- {csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED

@@ -1,2 +1,7 @@
-from csv_detective.explore_csv import routine, routine_minio, validate_then_detect
-
+from csv_detective.explore_csv import routine, routine_minio, validate_then_detect
+
+__all__ = [
+    "routine",
+    "routine_minio",
+    "validate_then_detect",
+]
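
Note: the new `__all__` makes the package's public surface explicit, so a star import now pulls in exactly the three listed names. A minimal usage sketch, assuming the 0.9.1 wheel is installed:

    from csv_detective import *  # limited to __all__: routine, routine_minio, validate_then_detect

    print(routine, routine_minio, validate_then_detect)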
csv_detective/detect_fields/temp/date/__init__.py
CHANGED

@@ -1,3 +1,4 @@
+import re
 from datetime import datetime
 from typing import Optional
 
@@ -19,6 +20,23 @@ def date_casting(val: str) -> Optional[datetime]:
     return None
 
 
+seps = r"[\s/\-\*_\|;.,]"
+# matches JJ-MM-AAAA with any of the listed separators
+jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
+    "SEP", seps
+)
+# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
+aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
+    "SEP", seps + "?"
+)
+# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
+string_month_pattern = (
+    r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
+    r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
+    r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
+    r"([0-9]{2}$|(19|20)[0-9]{2}$)"
+).replace("SEP", seps + "?")
+
 threshold = 0.3
 
 
@@ -27,6 +45,16 @@ def _is(val):
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
+    # if it's a usual date pattern
+    if any(
+        # with this syntax, if any of the first value is True, the next ones are not computed
+        [
+            bool(re.match(jjmmaaaa_pattern, val))
+            or bool(re.match(aaaammjj_pattern, val))
+            or bool(re.match(string_month_pattern, val, re.IGNORECASE))
+        ]
+    ):
+        return True
    if sum([char.isdigit() for char in val]) / len(val) < threshold:
        return False
    res = date_casting(val)
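
Note: the two numeric patterns above can be exercised on their own; a standalone sketch with the patterns copied verbatim from the hunk (example values are illustrative):

    import re

    # "SEP" is substituted with the separator class before use, as in the module
    seps = r"[\s/\-\*_\|;.,]"
    jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
        "SEP", seps
    )
    aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
        "SEP", seps + "?"
    )

    print(bool(re.match(jjmmaaaa_pattern, "28/01/2000")))  # True: JJ-MM-AAAA with "/"
    print(bool(re.match(aaaammjj_pattern, "19960213")))    # True: the "?" makes separators optional
    print(bool(re.match(jjmmaaaa_pattern, "32/01/2000")))  # False: day out of range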
csv_detective/detect_fields/temp/datetime_aware/__init__.py
CHANGED

@@ -1,8 +1,16 @@
+import re
 from typing import Any, Optional
 
-from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 
 PROPORTION = 1
+threshold = 0.7
+
+# matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+pat = (
+    aaaammjj_pattern.replace("$", "")
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
+)
 
 
 def _is(val: Optional[Any]) -> bool:
@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
     # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
     if not isinstance(val, str) or len(val) > 35 or len(val) < 21:
         return False
-
+    # if usual format, no need to parse
+    if bool(re.match(pat, val)):
+        return True
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)
csv_detective/detect_fields/temp/datetime_naive/__init__.py
CHANGED

@@ -1,8 +1,16 @@
+import re
 from typing import Any, Optional
 
-from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 
 PROPORTION = 1
+threshold = 0.7
+
+# matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
+pat = (
+    aaaammjj_pattern.replace("$", "")
+    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
+)
 
 
 def _is(val: Optional[Any]) -> bool:
@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
     # 26 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd, keeping some slack
     if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
         return False
-
+    # if usual format, no need to parse
+    if bool(re.match(pat, val)):
+        return True
     if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
         return False
     res = date_casting(val)
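
Note: both datetime modules build their fast-path pattern by stripping the `$` anchor from `aaaammjj_pattern` and appending a time part; a standalone sketch of the two variants, with the patterns copied from the hunks above:

    import re

    seps = r"[\s/\-\*_\|;.,]"
    aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
        "SEP", seps + "?"
    )
    # aware: date prefix + time + numeric UTC offset
    aware_pat = (
        aaaammjj_pattern.replace("$", "")
        + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
    )
    # naive: same date and time, but a literal trailing Z
    naive_pat = (
        aaaammjj_pattern.replace("$", "")
        + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
    )

    print(bool(re.match(aware_pat, "2025-08-20T14:30:00+02:00")))  # True
    print(bool(re.match(naive_pat, "1925_12_20T14:30:00.2763Z")))  # True
    print(bool(re.match(aware_pat, "2025-08-20T14:30:00Z")))       # False: no numeric offset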
csv_detective/detection/formats.py
CHANGED

@@ -14,6 +14,9 @@ from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
 from csv_detective.validate import validate
 
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25
+
 
 def detect_formats(
     table: pd.DataFrame,
@@ -28,14 +31,18 @@ def detect_formats(
     if on_sample:
         if verbose:
             logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
-        table = table
+        table = build_sample(table)
 
     if table.empty:
         res_categorical = []
         # res_continuous = []
     else:
         # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(
+        res_categorical, categorical_mask = detect_categorical_variable(
+            table,
+            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
+            verbose=verbose,
+        )
         res_categorical = list(res_categorical)
         # Detect columns that are continuous (we already know the categorical) :
         # we don't need this for now, cuts processing time
@@ -166,3 +173,33 @@ def detect_formats(
         raise ValueError("Could not infer detected formats on the whole file")
 
     return analysis
+
+
+def build_sample(table: pd.DataFrame) -> pd.DataFrame:
+    """
+    building a sample of MAX_ROWS_ANALYSIS rows that contains at least one representative of
+    the min and max values of each column, and one case of NaN if the column contains any.
+    """
+    samples = pd.concat(
+        [
+            # one row with the minimum of the column
+            table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
+            for col in table.columns
+        ]
+        + [
+            # one row with the maximum of the column
+            table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
+            for col in table.columns
+        ]
+        + [
+            # one row with a NaN value if the column has any
+            table.loc[table[col].isna()].iloc[[0]]
+            for col in table.columns
+            if table[col].isna().any()
+        ],
+        ignore_index=True,
+    )
+    return pd.concat(
+        [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+        ignore_index=True,
+    )
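
Note: to see what `build_sample` guarantees, here is a minimal sketch that inlines the same logic on a toy frame; `MAX_ROWS_ANALYSIS` is shrunk to 6 here (the real constant is imported from `csv_detective.parsing.columns`):

    import pandas as pd

    MAX_ROWS_ANALYSIS = 6  # shrunk for the example

    def build_sample(table: pd.DataFrame) -> pd.DataFrame:
        # same logic as the function added above
        samples = pd.concat(
            [table.loc[table[col] == table[col].dropna().min()].iloc[[0]] for col in table.columns]
            + [table.loc[table[col] == table[col].dropna().max()].iloc[[0]] for col in table.columns]
            + [
                table.loc[table[col].isna()].iloc[[0]]
                for col in table.columns
                if table[col].isna().any()
            ],
            ignore_index=True,
        )
        return pd.concat(
            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
            ignore_index=True,
        )

    df = pd.DataFrame({"a": [3.0, None, 1.0, 2.0, 5.0, 4.0], "b": list("uvwxyz")})
    # 6 rows: min/max representatives of "a" and "b", the NaN row of "a", plus one random row
    print(build_sample(df))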
csv_detective/output/__init__.py
CHANGED

@@ -25,12 +25,20 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-    if output_profile:
+    if output_profile or output_df:
+        # to create the profile we have to cast columns, so using the dedicated function
+        table = cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
         analysis["profile"] = create_profile(
             table=table,
-
+            columns=analysis["columns"],
             num_rows=num_rows,
             limited_output=limited_output,
+            cast_json=cast_json,
             verbose=verbose,
         )
 
@@ -45,16 +53,13 @@ def generate_output(
         output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
         with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(
+            json.dump(
+                analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
+            )
 
     if output_schema:
         analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
 
     if output_df:
-        return analysis,
-            df=table,
-            columns=analysis["columns"],
-            cast_json=cast_json,
-            verbose=verbose,
-        )
+        return analysis, table
     return analysis
csv_detective/output/dataframe.py
CHANGED

@@ -33,27 +33,23 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
 def cast_df(
     df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
 ) -> pd.DataFrame:
+    # for efficiency this modifies the dataframe in place as we don't need it anymore afterwards
     if verbose:
         start = time()
-    output_df = pd.DataFrame()
     for col_name, detection in columns.items():
         if detection["python_type"] == "string" or (
             detection["python_type"] == "json" and not cast_json
         ):
             # no change if detected type is string
-
+            continue
         elif detection["python_type"] == "int":
             # to allow having ints and NaN in the same column
-
+            df[col_name] = df[col_name].astype(pd.Int64Dtype())
         else:
-
-                lambda col: cast(col, _type=detection["python_type"])
-            )
-            # to save RAM
-            del df[col_name]
+            df[col_name] = df[col_name].apply(lambda col: cast(col, _type=detection["python_type"]))
     if verbose:
         display_logs_depending_process_time(
             f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
-    return
+    return df
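
Note: the `pd.Int64Dtype()` cast in `cast_df` is pandas' nullable integer type, which is what lets a column hold both ints and missing values; a minimal sketch, independent of csv_detective:

    import pandas as pd

    s = pd.Series([1, 2, None])       # inferred as float64, the None becomes NaN
    out = s.astype(pd.Int64Dtype())   # nullable extension dtype
    print(out.dtype)     # Int64
    print(out.tolist())  # [1, 2, <NA>]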
csv_detective/output/profile.py
CHANGED

@@ -4,15 +4,15 @@ from time import time
 
 import pandas as pd
 
-from csv_detective.detect_fields.other.float import float_casting
 from csv_detective.utils import display_logs_depending_process_time, prevent_nan
 
 
 def create_profile(
     table: pd.DataFrame,
-
+    columns: dict,
     num_rows: int,
     limited_output: bool = True,
+    cast_json: bool = True,
     verbose: bool = False,
 ) -> dict:
     if verbose:
@@ -26,65 +26,44 @@ def create_profile(
 
     if num_rows > 0:
         raise ValueError("To create profiles num_rows has to be set to -1")
-    safe_table = table.copy()
     if not limited_output:
-
+        columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
-            for k, v in
+            for k, v in columns.items()
         }
-    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
-    for c in safe_table.columns:
-        if dtypes[c] is float:
-            safe_table[c] = safe_table[c].apply(
-                lambda s: float_casting(s) if isinstance(s, str) else s
-            )
     profile = defaultdict(dict)
-    for c in
-
-
-            int,
-        ]:
+    for c in table.columns:
+        # for numerical formats we want min, max, mean, std
+        if columns[c]["python_type"] in ["float", "int"]:
             profile[c].update(
-                min=prevent_nan(
-
-
-
-                ),
-                max=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].max()
-                    )
-                ),
-                mean=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].mean()
-                    )
-                ),
-                std=prevent_nan(
-                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                        safe_table[c].std()
-                    )
-                ),
+                min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
+                max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
+                mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
+                std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
             )
+        # for all formats we want most frequent values, nb unique values and nb missing values
         tops_bruts = (
-
-            .value_counts(
+            table.loc[table[c].notna(), c]
+            .value_counts()
             .reset_index()
             .iloc[:10]
             .to_dict(orient="records")
         )
-
-
-        tops.append(
+        profile[c].update(
+            tops=[
                 {
                     "count": tb["count"],
                     "value": tb[c],
                 }
-
-
-
-
-
+                for tb in tops_bruts
+            ],
+            nb_distinct=(
+                table[c].nunique()
+                if columns[c]["python_type"] != "json" or not cast_json
+                # a column containing cast json is not serializable
+                else table[c].astype(str).nunique()
+            ),
+            nb_missing_values=len(table[c].loc[table[c].isna()]),
        )
    if verbose:
        display_logs_depending_process_time(
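
Note: the new `tops` computation is a plain `value_counts` round-trip; a standalone sketch (with pandas ≥ 2.0, where `reset_index` names the frequency column "count", which is what the `tb["count"]` lookup above relies on):

    import pandas as pd

    table = pd.DataFrame({"c": ["a", "b", "a", None, "a", "b"]})
    c = "c"
    tops_bruts = (
        table.loc[table[c].notna(), c]
        .value_counts()
        .reset_index()
        .iloc[:10]
        .to_dict(orient="records")
    )
    print([{"count": tb["count"], "value": tb[c]} for tb in tops_bruts])
    # [{'count': 3, 'value': 'a'}, {'count': 2, 'value': 'b'}]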
csv_detective/parsing/columns.py
CHANGED
csv_detective/parsing/csv.py
CHANGED

@@ -32,9 +32,7 @@ def parse_csv(
     if "ISO-8859" in encoding:
         encoding = "ISO-8859-1"
     try:
-        table = pd.read_csv(
-            the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
-        )
+        table = pd.read_csv(the_file, sep=sep, dtype=str, encoding=encoding, skiprows=skiprows)
         total_lines = len(table)
         nb_duplicates = len(table.loc[table.duplicated()])
         if num_rows > 0:
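
Note: `dtype=str` (like the `dtype="unicode"` it replaces) keeps every cell as raw text, so type detection runs on the original strings rather than on pandas-inferred types; a minimal sketch:

    import io

    import pandas as pd

    table = pd.read_csv(io.StringIO("a,b\n1,2020-01-01\n2,2020-01-02\n"), dtype=str)
    print(table.dtypes)          # both columns are object (strings)
    print(type(table["a"][0]))   # <class 'str'>, not numpy.int64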
csv_detective/parsing/excel.py
CHANGED

@@ -101,7 +101,7 @@ def parse_excel(
         file_path,
         engine="odf",
         sheet_name=None,
-        dtype=
+        dtype=str,
     )
     sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
     sheet_name = max(sizes, key=sizes.get)
@@ -121,7 +121,7 @@ def parse_excel(
         file_path,
         engine="odf",
         sheet_name=sheet_name,
-        dtype=
+        dtype=str,
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
@@ -152,7 +152,7 @@ def parse_excel(
         file_path,
         engine=engine,
         sheet_name=sheet_name,
-        dtype=
+        dtype=str,
     )
     table, header_row_idx = remove_empty_first_rows(table)
     total_lines = len(table)
{csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/METADATA
CHANGED

{csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-csv_detective/__init__.py,sha256=
+csv_detective/__init__.py,sha256=FsL6q5F-gKLMnWy05-1CJpa4cz9tquheZ2LS1tjkVgI,162
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
 csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
 csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
@@ -67,9 +67,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
 csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
 csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
 csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/detect_fields/temp/date/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=
-csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=
+csv_detective/detect_fields/temp/date/__init__.py,sha256=JtWaK8hkzBaIUc-fu0G7lIFpWqCfraRh6l0Mo65U3b0,2155
+csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=ZDNUcbU0ZJzaxUt0Utc1Y9dRrq4HHW9uCbcnOuz5Sfk,1247
+csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=QoVOA98lT_GVSGO_mQwKtAy2o-REs8C9d6JB9d_L_B4,1189
 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
 csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
 csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -132,37 +132,37 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
-csv_detective/detection/formats.py,sha256=
+csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
 csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
 csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
-csv_detective/output/__init__.py,sha256=
-csv_detective/output/dataframe.py,sha256=
+csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
+csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
 csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
-csv_detective/output/profile.py,sha256=
+csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
 csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=
+csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
-csv_detective/parsing/csv.py,sha256=
-csv_detective/parsing/excel.py,sha256=
+csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
+csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
 csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
 csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
-csv_detective-0.
+csv_detective-0.9.1.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_example.py,sha256=
-tests/test_fields.py,sha256=
-tests/test_file.py,sha256=
+tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
+tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
+tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
 tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
 tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
 tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
 venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
 venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
 venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
-csv_detective-0.
-csv_detective-0.
-csv_detective-0.
-csv_detective-0.
-csv_detective-0.
+csv_detective-0.9.1.dist-info/METADATA,sha256=AXtW7yGuAY6Y0XOdIXMTrDmnw9EMDtJbOB3Vl4oai6w,9759
+csv_detective-0.9.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+csv_detective-0.9.1.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.9.1.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+csv_detective-0.9.1.dist-info/RECORD,,
tests/test_example.py
CHANGED
tests/test_fields.py
CHANGED

@@ -1,5 +1,6 @@
 from datetime import date as _date
 from datetime import datetime as _datetime
+from unittest.mock import patch
 
 import pandas as pd
 import pytest
@@ -98,7 +99,7 @@ def test_detetect_categorical_variable():
         "cat2": categorical_col2,
         "not_cat": not_categorical_col,
     }
-    df = pd.DataFrame(df_dict, dtype=
+    df = pd.DataFrame(df_dict, dtype=str)
 
     res, _ = detect_categorical_variable(df)
     assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
@@ -113,8 +114,8 @@ def test_detect_continuous_variable():
     df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
     df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}
 
-    df = pd.DataFrame(df_dict, dtype=
-    df2 = pd.DataFrame(df_dict_2, dtype=
+    df = pd.DataFrame(df_dict, dtype=str)
+    df2 = pd.DataFrame(df_dict_2, dtype=str)
 
     res = detect_continuous_variable(df)
     res2 = detect_continuous_variable(df2, continuous_th=0.65)
@@ -441,3 +442,22 @@ def test_priority(args):
     col = "col1"
     output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
     assert output[col]["format"] == expected
+
+
+@pytest.mark.parametrize(
+    "args",
+    (
+        ("1996-02-13", date),
+        ("28/01/2000", date),
+        ("2025-08-20T14:30:00+02:00", datetime_aware),
+        ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
+        ("1925_12_20T14:30:00.2763Z", datetime_naive),
+        ("1925 12 20 14:30:00Z", datetime_naive),
+    ),
+)
+def test_early_detection(args):
+    value, module = args
+    with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func:
+        res = module._is(value)
+        assert res
+        mock_func.assert_not_called()
tests/test_file.py
CHANGED

@@ -276,3 +276,20 @@ def test_cast_json(mocked_responses, cast_json):
     )
     assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
     assert isinstance(df["a_simple_dict"][0], expected_type)
+
+
+def test_almost_uniform_column(mocked_responses):
+    col_name = "int_not_bool"
+    expected_content = f"{col_name}\n" + "9\n" + "1\n" * int(1e7)
+    mocked_responses.get(
+        "http://example.com/test.csv",
+        body=expected_content,
+        status=200,
+    )
+    analysis = routine(
+        file_path="http://example.com/test.csv",
+        num_rows=-1,
+        output_profile=False,
+        save_results=False,
+    )
+    assert analysis["columns"][col_name]["format"] == "int"
{csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/WHEEL
File without changes

{csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/entry_points.txt
File without changes

{csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/licenses/LICENSE
File without changes

{csv_detective-0.8.1.dev1729.dist-info → csv_detective-0.9.1.dist-info}/top_level.txt
File without changes