csv-detective 0.9.2.dev1874__py3-none-any.whl → 0.9.2.dev1896__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/output/__init__.py +7 -9
- csv_detective/output/profile.py +13 -10
- csv_detective/utils.py +5 -3
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/METADATA +1 -1
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/RECORD +10 -10
- tests/test_file.py +50 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/top_level.txt +0 -0
csv_detective/output/__init__.py
CHANGED
|
@@ -25,14 +25,7 @@ def generate_output(
|
|
|
25
25
|
verbose: bool = False,
|
|
26
26
|
sheet_name: Optional[Union[str, int]] = None,
|
|
27
27
|
) -> Union[dict, tuple[dict, pd.DataFrame]]:
|
|
28
|
-
if output_profile
|
|
29
|
-
# to create the profile we have to cast columns, so using the dedicated function
|
|
30
|
-
table = cast_df(
|
|
31
|
-
df=table,
|
|
32
|
-
columns=analysis["columns"],
|
|
33
|
-
cast_json=cast_json,
|
|
34
|
-
verbose=verbose,
|
|
35
|
-
)
|
|
28
|
+
if output_profile:
|
|
36
29
|
analysis["profile"] = create_profile(
|
|
37
30
|
table=table,
|
|
38
31
|
columns=analysis["columns"],
|
|
@@ -61,5 +54,10 @@ def generate_output(
|
|
|
61
54
|
analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
|
|
62
55
|
|
|
63
56
|
if output_df:
|
|
64
|
-
return analysis,
|
|
57
|
+
return analysis, cast_df(
|
|
58
|
+
df=table,
|
|
59
|
+
columns=analysis["columns"],
|
|
60
|
+
cast_json=cast_json,
|
|
61
|
+
verbose=verbose,
|
|
62
|
+
)
|
|
65
63
|
return analysis
|
csv_detective/output/profile.py
CHANGED
|
@@ -4,7 +4,8 @@ from time import time
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
7
|
-
from csv_detective.
|
|
7
|
+
from csv_detective.detect_fields.other.float import float_casting
|
|
8
|
+
from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def create_profile(
|
|
@@ -18,11 +19,6 @@ def create_profile(
|
|
|
18
19
|
if verbose:
|
|
19
20
|
start = time()
|
|
20
21
|
logging.info("Creating profile")
|
|
21
|
-
map_python_types = {
|
|
22
|
-
"string": str,
|
|
23
|
-
"int": float,
|
|
24
|
-
"float": float,
|
|
25
|
-
}
|
|
26
22
|
|
|
27
23
|
if num_rows > 0:
|
|
28
24
|
raise ValueError("To create profiles num_rows has to be set to -1")
|
|
@@ -35,12 +31,19 @@ def create_profile(
|
|
|
35
31
|
for c in table.columns:
|
|
36
32
|
# for numerical formats we want min, max, mean, std
|
|
37
33
|
if columns[c]["python_type"] in ["float", "int"]:
|
|
34
|
+
# we locally cast the column to perform the operations, using the same method as in cast_df
|
|
35
|
+
cast_col = (
|
|
36
|
+
table[c].astype(pd.Int64Dtype())
|
|
37
|
+
if columns[c]["python_type"] == "int"
|
|
38
|
+
else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
|
|
39
|
+
)
|
|
38
40
|
profile[c].update(
|
|
39
|
-
min=
|
|
40
|
-
max=
|
|
41
|
-
mean=
|
|
42
|
-
std=
|
|
41
|
+
min=cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
|
|
42
|
+
max=cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
|
|
43
|
+
mean=cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
|
|
44
|
+
std=cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
|
|
43
45
|
)
|
|
46
|
+
del cast_col
|
|
44
47
|
# for all formats we want most frequent values, nb unique values and nb missing values
|
|
45
48
|
tops_bruts = (
|
|
46
49
|
table.loc[table[c].notna(), c]
|
csv_detective/utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Optional
|
|
2
|
+
from typing import Optional, Union
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
@@ -31,5 +31,7 @@ def is_url(file_path: str) -> bool:
|
|
|
31
31
|
return file_path.startswith("http")
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def
|
|
35
|
-
|
|
34
|
+
def cast_prevent_nan(value: float, _type: str) -> Optional[Union[float, int]]:
|
|
35
|
+
if _type not in {"int", "float"}:
|
|
36
|
+
raise ValueError(f"Invalid type was passed: {_type}")
|
|
37
|
+
return None if pd.isna(value) else eval(_type)(value)
|
|
@@ -3,7 +3,7 @@ csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
|
|
|
3
3
|
csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
|
|
4
4
|
csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
|
|
5
5
|
csv_detective/s3_utils.py,sha256=z1KTVVkdurMv21o-rZu7_aluMJnSi-d5uxnQbqT2NoI,1407
|
|
6
|
-
csv_detective/utils.py,sha256=
|
|
6
|
+
csv_detective/utils.py,sha256=xiIO7ZDqkTm9Rnhnq6RaDdnrPIfoG0JV9AsmaOG6plA,1162
|
|
7
7
|
csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
|
|
8
8
|
csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
|
|
9
9
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -137,10 +137,10 @@ csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPp
|
|
|
137
137
|
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
138
138
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
139
139
|
csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
|
|
140
|
-
csv_detective/output/__init__.py,sha256=
|
|
140
|
+
csv_detective/output/__init__.py,sha256=bMsLp-XCVf4sNymIof_kdMdqFIY7GocOas-lPNekfQg,1930
|
|
141
141
|
csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
|
|
142
142
|
csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
|
|
143
|
-
csv_detective/output/profile.py,sha256=
|
|
143
|
+
csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
|
|
144
144
|
csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
|
|
145
145
|
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
146
146
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -150,19 +150,19 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
|
|
|
150
150
|
csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
|
|
151
151
|
csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
|
|
152
152
|
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
153
|
-
csv_detective-0.9.2.
|
|
153
|
+
csv_detective-0.9.2.dev1896.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
154
154
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
155
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
156
156
|
tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
|
|
157
|
-
tests/test_file.py,sha256=
|
|
157
|
+
tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
|
|
158
158
|
tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
|
|
159
159
|
tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
|
|
160
160
|
tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
|
|
161
161
|
venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
|
|
162
162
|
venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
|
|
163
163
|
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
164
|
-
csv_detective-0.9.2.
|
|
165
|
-
csv_detective-0.9.2.
|
|
166
|
-
csv_detective-0.9.2.
|
|
167
|
-
csv_detective-0.9.2.
|
|
168
|
-
csv_detective-0.9.2.
|
|
164
|
+
csv_detective-0.9.2.dev1896.dist-info/METADATA,sha256=2ZrcsJkSf2uY3pxlmwvui5uFbicmYpa8nDnxmkp4-xM,9767
|
|
165
|
+
csv_detective-0.9.2.dev1896.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
csv_detective-0.9.2.dev1896.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
167
|
+
csv_detective-0.9.2.dev1896.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
168
|
+
csv_detective-0.9.2.dev1896.dist-info/RECORD,,
|
tests/test_file.py
CHANGED
|
@@ -5,6 +5,7 @@ import pytest
|
|
|
5
5
|
import responses
|
|
6
6
|
|
|
7
7
|
from csv_detective import routine
|
|
8
|
+
from csv_detective.output.profile import create_profile
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
@pytest.mark.parametrize(
|
|
@@ -97,6 +98,55 @@ def test_profile_with_num_rows():
|
|
|
97
98
|
)
|
|
98
99
|
|
|
99
100
|
|
|
101
|
+
@pytest.mark.parametrize(
|
|
102
|
+
"params",
|
|
103
|
+
(
|
|
104
|
+
(
|
|
105
|
+
True,
|
|
106
|
+
{
|
|
107
|
+
"int_with_nan": {"format": "int", "python_type": "int"},
|
|
108
|
+
"date": {"format": "date", "python_type": "date"},
|
|
109
|
+
},
|
|
110
|
+
),
|
|
111
|
+
(
|
|
112
|
+
False,
|
|
113
|
+
{
|
|
114
|
+
"int_with_nan": [{"format": "int", "python_type": "int"}],
|
|
115
|
+
"date": [{"format": "date", "python_type": "date"}],
|
|
116
|
+
},
|
|
117
|
+
),
|
|
118
|
+
),
|
|
119
|
+
)
|
|
120
|
+
def test_profile_specific_cases(params):
|
|
121
|
+
limited_output, columns = params
|
|
122
|
+
table = pd.DataFrame(
|
|
123
|
+
{
|
|
124
|
+
"int_with_nan": ["1", pd.NA, pd.NA],
|
|
125
|
+
"date": ["1996-01-02", "1996-01-02", "2024-11-12"],
|
|
126
|
+
}
|
|
127
|
+
)
|
|
128
|
+
profile = create_profile(
|
|
129
|
+
table=table,
|
|
130
|
+
columns=columns,
|
|
131
|
+
limited_output=limited_output,
|
|
132
|
+
num_rows=-1,
|
|
133
|
+
)
|
|
134
|
+
assert profile["int_with_nan"] == {
|
|
135
|
+
"min": 1,
|
|
136
|
+
"max": 1,
|
|
137
|
+
"mean": 1,
|
|
138
|
+
"std": None,
|
|
139
|
+
"tops": [{"count": 1, "value": "1"}],
|
|
140
|
+
"nb_distinct": 1,
|
|
141
|
+
"nb_missing_values": 2,
|
|
142
|
+
}
|
|
143
|
+
assert profile["date"] == {
|
|
144
|
+
"tops": [{"count": 2, "value": "1996-01-02"}, {"count": 1, "value": "2024-11-12"}],
|
|
145
|
+
"nb_distinct": 2,
|
|
146
|
+
"nb_missing_values": 0,
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
|
|
100
150
|
def test_exception_different_number_of_columns():
|
|
101
151
|
"""
|
|
102
152
|
A ValueError should be raised if the number of columns differs between the first rows
|
|
File without changes
|
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.2.dev1896.dist-info}/top_level.txt
RENAMED
|
File without changes
|