csv-detective 0.9.1.dev1860__py3-none-any.whl → 0.9.1.dev1869__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/output/__init__.py +14 -9
- csv_detective/output/profile.py +25 -46
- {csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/METADATA +1 -1
- {csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/RECORD +8 -8
- {csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/top_level.txt +0 -0
csv_detective/output/__init__.py
CHANGED
|
@@ -25,12 +25,20 @@ def generate_output(
|
|
|
25
25
|
verbose: bool = False,
|
|
26
26
|
sheet_name: Optional[Union[str, int]] = None,
|
|
27
27
|
) -> Union[dict, tuple[dict, pd.DataFrame]]:
|
|
28
|
-
if output_profile:
|
|
28
|
+
if output_profile or output_df:
|
|
29
|
+
# to create the profile we have to cast columns, so using the dedicated function
|
|
30
|
+
table = cast_df(
|
|
31
|
+
df=table,
|
|
32
|
+
columns=analysis["columns"],
|
|
33
|
+
cast_json=cast_json,
|
|
34
|
+
verbose=verbose,
|
|
35
|
+
)
|
|
29
36
|
analysis["profile"] = create_profile(
|
|
30
37
|
table=table,
|
|
31
|
-
|
|
38
|
+
columns=analysis["columns"],
|
|
32
39
|
num_rows=num_rows,
|
|
33
40
|
limited_output=limited_output,
|
|
41
|
+
cast_json=cast_json,
|
|
34
42
|
verbose=verbose,
|
|
35
43
|
)
|
|
36
44
|
|
|
@@ -45,16 +53,13 @@ def generate_output(
|
|
|
45
53
|
output_path += "_sheet-" + str(sheet_name)
|
|
46
54
|
output_path += ".json"
|
|
47
55
|
with open(output_path, "w", encoding="utf8") as fp:
|
|
48
|
-
json.dump(
|
|
56
|
+
json.dump(
|
|
57
|
+
analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
|
|
58
|
+
)
|
|
49
59
|
|
|
50
60
|
if output_schema:
|
|
51
61
|
analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
|
|
52
62
|
|
|
53
63
|
if output_df:
|
|
54
|
-
return analysis,
|
|
55
|
-
df=table,
|
|
56
|
-
columns=analysis["columns"],
|
|
57
|
-
cast_json=cast_json,
|
|
58
|
-
verbose=verbose,
|
|
59
|
-
)
|
|
64
|
+
return analysis, table
|
|
60
65
|
return analysis
|
csv_detective/output/profile.py
CHANGED
|
@@ -4,15 +4,15 @@ from time import time
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
7
|
-
from csv_detective.detect_fields.other.float import float_casting
|
|
8
7
|
from csv_detective.utils import display_logs_depending_process_time, prevent_nan
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def create_profile(
|
|
12
11
|
table: pd.DataFrame,
|
|
13
|
-
|
|
12
|
+
columns: dict,
|
|
14
13
|
num_rows: int,
|
|
15
14
|
limited_output: bool = True,
|
|
15
|
+
cast_json: bool = True,
|
|
16
16
|
verbose: bool = False,
|
|
17
17
|
) -> dict:
|
|
18
18
|
if verbose:
|
|
@@ -26,65 +26,44 @@ def create_profile(
|
|
|
26
26
|
|
|
27
27
|
if num_rows > 0:
|
|
28
28
|
raise ValueError("To create profiles num_rows has to be set to -1")
|
|
29
|
-
safe_table = table.copy()
|
|
30
29
|
if not limited_output:
|
|
31
|
-
|
|
30
|
+
columns = {
|
|
32
31
|
k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
|
|
33
|
-
for k, v in
|
|
32
|
+
for k, v in columns.items()
|
|
34
33
|
}
|
|
35
|
-
dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
|
|
36
|
-
for c in safe_table.columns:
|
|
37
|
-
if dtypes[c] is float:
|
|
38
|
-
safe_table[c] = safe_table[c].apply(
|
|
39
|
-
lambda s: float_casting(s) if isinstance(s, str) else s
|
|
40
|
-
)
|
|
41
34
|
profile = defaultdict(dict)
|
|
42
|
-
for c in
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
int,
|
|
46
|
-
]:
|
|
35
|
+
for c in table.columns:
|
|
36
|
+
# for numerical formats we want min, max, mean, std
|
|
37
|
+
if columns[c]["python_type"] in ["float", "int"]:
|
|
47
38
|
profile[c].update(
|
|
48
|
-
min=prevent_nan(
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
),
|
|
53
|
-
max=prevent_nan(
|
|
54
|
-
map_python_types.get(dict_cols_fields[c]["python_type"], str)(
|
|
55
|
-
safe_table[c].max()
|
|
56
|
-
)
|
|
57
|
-
),
|
|
58
|
-
mean=prevent_nan(
|
|
59
|
-
map_python_types.get(dict_cols_fields[c]["python_type"], str)(
|
|
60
|
-
safe_table[c].mean()
|
|
61
|
-
)
|
|
62
|
-
),
|
|
63
|
-
std=prevent_nan(
|
|
64
|
-
map_python_types.get(dict_cols_fields[c]["python_type"], str)(
|
|
65
|
-
safe_table[c].std()
|
|
66
|
-
)
|
|
67
|
-
),
|
|
39
|
+
min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
|
|
40
|
+
max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
|
|
41
|
+
mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
|
|
42
|
+
std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
|
|
68
43
|
)
|
|
44
|
+
# for all formats we want most frequent values, nb unique values and nb missing values
|
|
69
45
|
tops_bruts = (
|
|
70
|
-
|
|
71
|
-
.value_counts(
|
|
46
|
+
table.loc[table[c].notna(), c]
|
|
47
|
+
.value_counts()
|
|
72
48
|
.reset_index()
|
|
73
49
|
.iloc[:10]
|
|
74
50
|
.to_dict(orient="records")
|
|
75
51
|
)
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
tops.append(
|
|
52
|
+
profile[c].update(
|
|
53
|
+
tops=[
|
|
79
54
|
{
|
|
80
55
|
"count": tb["count"],
|
|
81
56
|
"value": tb[c],
|
|
82
57
|
}
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
58
|
+
for tb in tops_bruts
|
|
59
|
+
],
|
|
60
|
+
nb_distinct=(
|
|
61
|
+
table[c].nunique()
|
|
62
|
+
if columns[c]["python_type"] != "json" or not cast_json
|
|
63
|
+
# a column containing cast json is not serializable
|
|
64
|
+
else table[c].astype(str).nunique()
|
|
65
|
+
),
|
|
66
|
+
nb_missing_values=len(table[c].loc[table[c].isna()]),
|
|
88
67
|
)
|
|
89
68
|
if verbose:
|
|
90
69
|
display_logs_depending_process_time(
|
|
@@ -137,10 +137,10 @@ csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPp
|
|
|
137
137
|
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
138
138
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
139
139
|
csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
|
|
140
|
-
csv_detective/output/__init__.py,sha256=
|
|
140
|
+
csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
|
|
141
141
|
csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
|
|
142
142
|
csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
|
|
143
|
-
csv_detective/output/profile.py,sha256=
|
|
143
|
+
csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
|
|
144
144
|
csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
|
|
145
145
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
146
146
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -150,7 +150,7 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
|
|
|
150
150
|
csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
|
|
151
151
|
csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
|
|
152
152
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
153
|
-
csv_detective-0.9.1.
|
|
153
|
+
csv_detective-0.9.1.dev1869.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
154
154
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
155
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
156
156
|
tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
|
|
@@ -161,8 +161,8 @@ tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
|
|
|
161
161
|
venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
|
|
162
162
|
venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
|
|
163
163
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
164
|
-
csv_detective-0.9.1.
|
|
165
|
-
csv_detective-0.9.1.
|
|
166
|
-
csv_detective-0.9.1.
|
|
167
|
-
csv_detective-0.9.1.
|
|
168
|
-
csv_detective-0.9.1.
|
|
164
|
+
csv_detective-0.9.1.dev1869.dist-info/METADATA,sha256=3gGiQT_yLk3thJkrLt5l90W8ylzk_MVYN0_F3wGv5qE,9767
|
|
165
|
+
csv_detective-0.9.1.dev1869.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
csv_detective-0.9.1.dev1869.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
167
|
+
csv_detective-0.9.1.dev1869.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
168
|
+
csv_detective-0.9.1.dev1869.dist-info/RECORD,,
|
|
File without changes
|
{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.1.dev1860.dist-info → csv_detective-0.9.1.dev1869.dist-info}/top_level.txt
RENAMED
|
File without changes
|