csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +2 -1
- csv_detective/detection/engine.py +1 -1
- csv_detective/detection/formats.py +39 -95
- csv_detective/detection/variables.py +2 -2
- csv_detective/explore_csv.py +5 -7
- csv_detective/load_tests.py +11 -4
- csv_detective/output/__init__.py +8 -4
- csv_detective/output/dataframe.py +37 -0
- csv_detective/output/example.py +3 -1
- csv_detective/output/profile.py +65 -21
- csv_detective/parsing/columns.py +133 -35
- csv_detective/parsing/csv.py +26 -23
- csv_detective/parsing/load.py +21 -8
- csv_detective/validate.py +86 -40
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/METADATA +29 -6
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/RECORD +24 -24
- tests/test_fields.py +9 -13
- tests/test_file.py +85 -35
- tests/test_structure.py +4 -1
- tests/test_validation.py +9 -4
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2215.dist-info → csv_detective-0.9.3.dev2241.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
@@ -29,7 +29,7 @@ def detect_engine(file_path: str, verbose=False) -> str | None:
     }
     # if none of the above, we move forwards with the csv process
     if is_url(file_path):
-        remote_content = requests.get(file_path).content
+        remote_content = next(requests.get(file_path, stream=True).iter_content(chunk_size=1024))
         engine = mapping.get(magic.from_buffer(remote_content, mime=True))
     else:
         engine = mapping.get(magic.from_file(file_path, mime=True))
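
The new line only reads the first streamed chunk instead of downloading the whole remote file before sniffing its MIME type. A minimal sketch of that pattern, assuming the requests and python-magic packages (the URL and chunk size are illustrative):

    import magic
    import requests

    url = "https://example.com/data.xlsx"  # illustrative URL, not from the package
    with requests.get(url, stream=True) as resp:
        # read only the first ~1 KB; plenty for MIME detection
        first_chunk = next(resp.iter_content(chunk_size=1024))
    print(magic.from_buffer(first_chunk, mime=True))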

csv_detective/detection/formats.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 from collections import defaultdict
 
 import numpy as np
@@ -10,11 +9,12 @@ from csv_detective.detection.variables import (
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import
-
-
-
-
+from csv_detective.parsing.columns import (
+    MAX_NUMBER_CATEGORICAL_VALUES,
+    test_col,
+    test_col_chunks,
+    test_label,
+)
 
 
 def detect_formats(
@@ -25,36 +25,8 @@ def detect_formats(
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
-):
-
-    if on_sample:
-        if verbose:
-            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
-        table = build_sample(table)
-
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(
-            table,
-            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
-            verbose=verbose,
-        )
-        res_categorical = list(res_categorical)
-        # Detect columns that are continuous (we already know the categorical) :
-        # we don't need this for now, cuts processing time
-        # res_continuous = list(
-        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-        # )
-
-    analysis.update(
-        {
-            "categorical": res_categorical,
-            # "continuous": res_continuous,
-        }
-    )
+) -> tuple[dict, dict[str, pd.Series] | None]:
+    in_chunks = analysis.get("total_lines") is None
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -66,16 +38,41 @@ def detect_formats(
 
     # if no testing then return
     if not all_tests_fields and not all_tests_labels:
-        return analysis
+        return analysis, None
 
     # Perform testing on fields
-
-        table
-
+    if not in_chunks:
+        # table is small enough to be tested in one go
+        scores_table_fields = test_col(
+            table=table,
+            all_tests=all_tests_fields,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
+        res_categorical, _ = detect_categorical_variable(
+            table,
+            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
+            verbose=verbose,
+        )
+        analysis["categorical"] = res_categorical
+        col_values = None
+    else:
+        scores_table_fields, analysis, col_values = test_col_chunks(
+            table=table,
+            file_path=file_path,
+            analysis=analysis,
+            all_tests=all_tests_fields,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-    scores_table_labels = test_label(
+    scores_table_labels = test_label(
+        analysis["header"], all_tests_labels, limited_output, verbose=verbose
+    )
     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
@@ -158,57 +155,4 @@ def detect_formats(
     for header, col_metadata in analysis["columns"].items():
         analysis["formats"][col_metadata["format"]].append(header)
 
-
-    if verbose:
-        logging.warning("Validating that analysis on the sample works on the whole file")
-    is_valid, _, _ = validate(
-        file_path=file_path,
-        previous_analysis=analysis,
-        num_rows=-1,
-        encoding=analysis.get("encoding"),
-        sep=analysis.get("separator"),
-        sheet_name=analysis.get("sheet_name"),
-        verbose=verbose,
-        skipna=skipna,
-    )
-    if not is_valid:
-        raise ValueError("Could not infer detected formats on the whole file")
-
-    return analysis
-
-
-def build_sample(table: pd.DataFrame) -> pd.DataFrame:
-    """
-    building a sample of MAX_ROWS_ANALYSIS rows that contains at least one representative of
-    the min and max values of each column, and one case of NaN if the column contains any.
-    """
-    samples = pd.concat(
-        [
-            # one row with the minimum of the column
-            table.loc[table[col] == val].iloc[[0]]
-            for col in table.columns
-            if not pd.isna(val := table[col].dropna().min())
-        ]
-        + [
-            # one row with the maximum of the column
-            table.loc[table[col] == val].iloc[[0]]
-            for col in table.columns
-            if not pd.isna(val := table[col].dropna().max())
-        ]
-        + [
-            # one row with a NaN value if the column has any
-            table.loc[table[col].isna()].iloc[[0]]
-            for col in table.columns
-            if table[col].isna().any()
-        ],
-        ignore_index=True,
-    )
-    return (
-        pd.concat(
-            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
-            ignore_index=True,
-        )
-        # this is very unlikely but we never know
-        if len(samples) <= MAX_ROWS_ANALYSIS
-        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
-    )
+    return analysis, col_values

csv_detective/detection/variables.py
CHANGED
@@ -56,7 +56,7 @@ def detect_categorical_variable(
     threshold_pct_categorical: float = 0.05,
     max_number_categorical_values: int = 25,
     verbose: bool = False,
-):
+) -> tuple[list[str], pd.DataFrame]:
     """
     Heuristically detects whether a table (df) contains categorical values according to
     the number of unique values contained.
@@ -94,4 +94,4 @@ def detect_categorical_variable(
         f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
         time() - start,
     )
-    return res.index[res], res
+    return list(res.index[res]), res
csv_detective/explore_csv.py
CHANGED
@@ -70,7 +70,7 @@ def routine(
             sheet_name=sheet_name,
         )
 
-        analysis = detect_formats(
+        analysis, _col_values = detect_formats(
            table=table,
            analysis=analysis,
            file_path=file_path,
@@ -94,6 +94,7 @@ def routine(
            cast_json=cast_json,
            verbose=verbose,
            sheet_name=sheet_name,
+            _col_values=_col_values,
        )
    finally:
        if verbose:
@@ -121,13 +122,9 @@ def validate_then_detect(
    if is_url(file_path):
        logging.info("Path recognized as a URL")
 
-    is_valid, table, analysis = validate(
+    is_valid, table, analysis, col_values = validate(
        file_path=file_path,
        previous_analysis=previous_analysis,
-        num_rows=num_rows,
-        encoding=previous_analysis.get("encoding"),
-        sep=previous_analysis.get("separator"),
-        sheet_name=previous_analysis.get("sheet_name"),
        verbose=verbose,
        skipna=skipna,
    )
@@ -139,7 +136,7 @@ def validate_then_detect(
            verbose=verbose,
        )
        if not is_valid:
-            analysis = detect_formats(
+            analysis, col_values = detect_formats(
                table=table,
                analysis=analysis,
                file_path=file_path,
@@ -162,6 +159,7 @@ def validate_then_detect(
            cast_json=cast_json,
            verbose=verbose,
            sheet_name=analysis.get("sheet_name"),
+            _col_values=col_values,
        )
    finally:
        if verbose:
csv_detective/load_tests.py
CHANGED
@@ -19,7 +19,7 @@ def get_all_packages(detect_type) -> list:
 def return_all_tests(
     user_input_tests: str | list,
     detect_type: str,
-) ->
+) -> dict[str, dict]:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -40,6 +40,7 @@ def return_all_tests(
     else:
         tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
         tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
+    # removing specified (groups of) tests
     all_tests = [
         # this is why we need to import detect_fields/labels
         eval(x)
@@ -47,6 +48,12 @@ def return_all_tests(
         if any([y == x[: len(y)] for y in tests_to_do])
         and all([y != x[: len(y)] for y in tests_skipped])
     ]
-
-
-
+    return {
+        test.__name__.split(".")[-1]: {
+            "func": test._is,
+            "prop": test.PROPORTION,
+            "module": test,
+        }
+        for test in all_tests
+        if "_is" in dir(test)
+    }
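
return_all_tests now maps each test module's short name to its `_is` predicate, its `PROPORTION` threshold and the module itself. A sketch of how a caller might consume that mapping (the argument values and sample data are illustrative, and it assumes each `_is` predicate takes a single string value, as the diff suggests):

    from csv_detective.load_tests import return_all_tests

    tests = return_all_tests(user_input_tests="ALL", detect_type="detect_fields")
    sample = ["2024-01-01", "foo", "12"]
    for name, spec in tests.items():
        # spec["func"] is the module's _is predicate, spec["prop"] its PROPORTION threshold
        hits = sum(bool(spec["func"](value)) for value in sample)
        print(name, hits / len(sample), spec["prop"])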
csv_detective/output/__init__.py
CHANGED
@@ -1,11 +1,12 @@
 import json
 import os
+from typing import Iterator
 
 import pandas as pd
 
 from csv_detective.utils import is_url
 
-from .dataframe import
+from .dataframe import cast_df_chunks
 from .profile import create_profile
 from .schema import generate_table_schema
 
@@ -23,7 +24,8 @@ def generate_output(
     cast_json: bool = True,
     verbose: bool = False,
     sheet_name: str | int | None = None,
-
+    _col_values: dict[str, pd.Series] | None = None,
+) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -32,6 +34,7 @@ def generate_output(
             limited_output=limited_output,
             cast_json=cast_json,
             verbose=verbose,
+            _col_values=_col_values,
         )
 
     if save_results:
@@ -53,9 +56,10 @@ def generate_output(
         analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
 
     if output_df:
-        return analysis,
+        return analysis, cast_df_chunks(
             df=table,
-
+            analysis=analysis,
+            file_path=file_path,
             cast_json=cast_json,
             verbose=verbose,
         )

csv_detective/output/dataframe.py
CHANGED
@@ -1,12 +1,14 @@
 import json
 from datetime import date, datetime
 from time import time
+from typing import Iterator
 
 import pandas as pd
 
 from csv_detective.detect_fields.other.booleen import bool_casting
 from csv_detective.detect_fields.other.float import float_casting
 from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time
 
 
@@ -52,3 +54,38 @@ def cast_df(
         time() - start,
     )
     return df
+
+
+def cast_df_chunks(
+    df: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    cast_json: bool = True,
+    verbose: bool = False,
+) -> Iterator[pd.DataFrame]:
+    if analysis.get("engine") or analysis["total_lines"] <= CHUNK_SIZE:
+        # the file is loaded in one chunk, so returning the cast df
+        yield cast_df(
+            df=df,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    else:
+        # loading the csv in chunks using the analysis
+        chunks = pd.read_csv(
+            file_path,
+            dtype=str,
+            sep=analysis["separator"],
+            encoding=analysis["encoding"],
+            skiprows=analysis["header_row_idx"],
+            compression=analysis.get("compression"),
+            chunksize=CHUNK_SIZE,
+        )
+        for chunk in chunks:
+            yield cast_df(
+                df=chunk,
+                columns=analysis["columns"],
+                cast_json=cast_json,
+                verbose=verbose,
+            )
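
cast_df_chunks is a generator: small files (and engine-based ones) come back as a single cast DataFrame, while larger CSVs are re-read and cast chunk by chunk. A usage sketch, assuming `table` and `analysis` were produced by the detection step beforehand (the file path is illustrative):

    from csv_detective.output.dataframe import cast_df_chunks

    # `table` (raw string DataFrame) and `analysis` (detection report) are assumed
    # to come from the detection routine; "data.csv" is an illustrative path.
    total_rows = 0
    for cast_chunk in cast_df_chunks(
        df=table,
        analysis=analysis,
        file_path="data.csv",
        cast_json=True,
        verbose=False,
    ):
        total_rows += len(cast_chunk)  # e.g. append each cast chunk to a database instead
    print(f"cast {total_rows} rows")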
csv_detective/output/example.py
CHANGED
@@ -10,6 +10,8 @@ import requests
 import rstr
 from faker import Faker
 
+from csv_detective.utils import is_url
+
 fake = Faker()
 
 
@@ -183,7 +185,7 @@ def create_example_csv_file(
     }
 
     if schema_path:
-        if schema_path
+        if is_url(schema_path):
             schema = requests.get(schema_path).json()
         else:
             with open(schema_path, encoding=encoding) as jsonfile:
csv_detective/output/profile.py
CHANGED
@@ -1,7 +1,9 @@
 import logging
 from collections import defaultdict
 from time import time
+from typing import Optional
 
+import numpy as np
 import pandas as pd
 
 from csv_detective.detect_fields.other.float import float_casting
@@ -15,6 +17,7 @@ def create_profile(
     limited_output: bool = True,
     cast_json: bool = True,
     verbose: bool = False,
+    _col_values: Optional[dict[str, pd.Series]] = None,
 ) -> dict:
     if verbose:
         start = time()
@@ -27,50 +30,91 @@ def create_profile(
         k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
         for k, v in columns.items()
     }
+    # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
+    _count_col = "count"
+    while _count_col in table.columns:
+        _count_col = "_" + _count_col
     profile = defaultdict(dict)
     for c in table.columns:
        # for numerical formats we want min, max, mean, std
        if columns[c]["python_type"] in ["float", "int"]:
-            # we
-
-
-
-
-
-
-
-
-
-
-
+            # if we have read the file in chunks we already have what we need
+            if _col_values is None:
+                # we locally cast the column to perform the operations,
+                # using the same method as in cast_df
+                cast_col = (
+                    table[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                    "max": cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                    "std": cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
+                }
+            else:
+                cast_col = _col_values[c].reset_index()
+                cast_col = cast_col.loc[cast_col[c].notna()]
+                cast_col[c] = (
+                    cast_col[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else cast_col[c].apply(
+                        lambda x: float_casting(x) if isinstance(x, str) else pd.NA
+                    )
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col[c].min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(
+                        (cast_col[c] * cast_col["count"]).sum() / sum(cast_col["count"]),
+                        columns[c]["python_type"],
+                    ),
+                    "max": cast_prevent_nan(cast_col[c].max(), columns[c]["python_type"]),
+                }
+                stats["std"] = cast_prevent_nan(
+                    np.sqrt(
+                        sum(cast_col["count"] * (cast_col[c] - stats["mean"]) ** 2)
+                        / sum(cast_col["count"])
+                    ),
+                    columns[c]["python_type"],
+                )
+            profile[c].update(**stats)
            del cast_col
        # for all formats we want most frequent values, nb unique values and nb missing values
        tops_bruts = (
-            table
-            .
-            .reset_index()
+            (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
+            .reset_index(name=_count_col)
            .iloc[:10]
            .to_dict(orient="records")
        )
        profile[c].update(
            tops=[
                {
-                    "count": tb[
+                    "count": tb[_count_col],
                    "value": tb[c],
                }
                for tb in tops_bruts
            ],
            nb_distinct=(
-
-
-
-
+                (
+                    table[c].nunique()
+                    if columns[c]["python_type"] != "json" or not cast_json
+                    # a column containing cast json is not serializable
+                    else table[c].astype(str).nunique()
+                )
+                if _col_values is None
+                else len(_col_values)
+            ),
+            nb_missing_values=(
+                len(table[c].loc[table[c].isna()])
+                if _col_values is None
+                else (_col_values[c].loc[pd.NA] if pd.NA in _col_values[c].index else 0)
            ),
-            nb_missing_values=len(table[c].loc[table[c].isna()]),
        )
    if verbose:
        display_logs_depending_process_time(
            f"Created profile in {round(time() - start, 3)}s",
            time() - start,
        )
+    del _col_values
    return profile
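
When the file was processed in chunks, the numeric statistics above are rebuilt from per-column value counts rather than from the raw values. A small self-contained sketch of that arithmetic (variable names are illustrative, not taken from the library), mirroring the weighted mean and population standard deviation used in the chunked branch:

    import numpy as np
    import pandas as pd

    # value counts accumulated over chunks: index = distinct values, values = occurrences
    counts = pd.Series({1.0: 3, 2.0: 5, 10.0: 2})

    values = counts.index.to_numpy(dtype=float)
    weights = counts.to_numpy(dtype=float)

    mean = (values * weights).sum() / weights.sum()  # weighted mean
    std = np.sqrt((weights * (values - mean) ** 2).sum() / weights.sum())  # weighted (population) std
    print(round(mean, 3), round(std, 3))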