py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +297 -229
- py2ls/ml2ls.py +996 -155
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +15 -11
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -4,6 +4,8 @@ import sys, os
 from IPython.display import display
 from typing import List, Optional, Union

+from regex import X
+
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -16,15 +18,17 @@ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)


-def run_once_within(duration=60): # default 60s
+def run_once_within(duration=60,reverse=False): # default 60s
     import time

     """
+    If reverse is True, it does not run on the first call, but does run on the second call.
     usage:
     if run_once_within():
         print("This code runs once per minute.")
     else:
         print("The code has already been run in the last minute.")
+
     """
     if not hasattr(run_once_within, "time_last"):
         run_once_within.time_last = None
@@ -34,9 +38,9 @@ def run_once_within(duration=60): # default 60s
         time_curr - run_once_within.time_last >= duration
     ):
         run_once_within.time_last = time_curr  # Update the last execution time
-        return True
+        return False if reverse else True
     else:
-        return False
+        return True if reverse else False


 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
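
Note on the new reverse flag: with the default reverse=False, run_once_within() returns True only for the first call in each duration window; reverse=True inverts the result, so the gated branch is skipped on the first call and taken on repeat calls inside the window. A minimal usage sketch (assuming run_once_within is imported from py2ls.ips):

    from py2ls.ips import run_once_within

    if run_once_within(60):  # True on the first call per 60-second window
        print("runs at most once per minute")

    if run_once_within(60, reverse=True):  # False on the first call, True on repeats
        print("skipped on the first pass, runs on later calls within the window")

Both calls share the same time_last attribute on the function, so mixing the two styles in one program gates against a single shared timer.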
@@ -1828,16 +1832,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     # Check data types
     data_types = df.dtypes
     # messages.append(f"Data types of columns:\n{data_types}")
-
-    # Check for constant values across any column
-    constant_columns = df.columns[df.nunique() == 1].tolist()
-    if constant_columns:
-        messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
-        is_abnormal = True
-        if verbose:
-            print(f"df.columns[df.nunique() == 1].tolist()")
-    if verbose:
-        print("5", is_abnormal)
+
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1989,30 +1984,29 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError

-        engine = kwargs.pop("engine", "pyarrow")
-        sep = kwargs.pop("sep",
-        index_col = kwargs.pop("index_col", None)
-        memory_map = kwargs.pop("memory_map", False)
-        skipinitialspace = kwargs.pop("skipinitialspace", False)
-        encoding = kwargs.pop("encoding", "utf-8")
-        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
-        comment = kwargs.pop("comment", None)
-        fmt = kwargs.pop("fmt", False)
-        chunksize = kwargs.pop("chunksize", None)
+        engine = kwargs.pop("engine", "pyarrow")# default: None
+        sep = kwargs.pop("sep", None)# default: ','
+        index_col = kwargs.pop("index_col", None)# default: None
+        memory_map = kwargs.pop("memory_map", False)# default: False
+        skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
+        encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
+        comment = kwargs.pop("comment", None)# default: None
+        fmt = kwargs.pop("fmt", False)# default:
+        chunksize = kwargs.pop("chunksize", None)# default: None
         engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
-        low_memory = kwargs.pop("low_memory", True)
+        low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
-        )  # when chunksize, recommend low_memory=False
+        )  # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv", verbose=verbose)

-        if comment is None:
+        if comment is None:# default: None
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
         try:
             df = pd.read_csv(
                 fpath,
@@ -2107,8 +2101,8 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 sep2show = sep if sep != "\t" else "\\t"
-
-
+                if verbose:
+                    print(f'trying with: engine=pyarrow, sep="{sep2show}"')
                 try:
                     df = pd.read_csv(
                         fpath,
@@ -2137,8 +2131,9 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 try:
-
-
+                    sep2show = sep if sep != "\t" else "\\t"
+                    if verbose:
+                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                     # print(".")
                     df = pd.read_csv(
                         fpath,
@@ -2171,8 +2166,9 @@ def fload(fpath, kind=None, **kwargs):
                     continue
                 else:
                     pass
-
-
+            print(kwargs)
+            # if is_df_abnormal(df,verbose=verbose):
+            #     df=pd.read_csv(fpath,**kwargs)
             display(df.head(2))
             print(f"shape: {df.shape}")
             return df
@@ -2386,7 +2382,7 @@ def fload(fpath, kind=None, **kwargs):
     elif kind == "xml":
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
-        verbose = kwargs.pop("verbose", False)
+        # verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
@@ -3503,12 +3499,8 @@ def figsave(*args, dpi=300):
         )
     else:
         plt.savefig(
-            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
-        )
-    # elif ftype.lower() == "png":
-    #     plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
-    # elif ftype.lower() in ["tiff", "tif"]:
-    #     plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
+            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
+        )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
@@ -5236,15 +5228,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data

+def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
+    """
+    Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
+    Usage:
+        data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
+        # df_cycle month cyclically
+        data = df_cycle(data, 'month', 12)
+    """
+    if columns is None:
+        columns = list(data.select_dtypes(include=np.number).columns) # If no columns specified, use all columns
+    if max_val is None:
+        max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
+    if isinstance(columns, str):
+        columns = [columns] # If a single column name is provided as a string, convert it to a list
+
+    # Check if inplace is True, so we modify the original dataframe
+    if inplace:
+        # Modify the data in place, no return statement needed
+        for col in columns:
+            data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
+            data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
+    else:
+        # If inplace is False, return the modified dataframe
+        new_data = data.copy()
+        for col in columns:
+            new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
+            new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
+        return new_data
+

 # ! DataFrame
 def df_astype(
     data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
-    astype: str = "datetime",
+    astype: str = None,#"datetime",
     skip_row: Union[str, list] = None,
     fmt: Optional[str] = None,
-    inplace: bool =
+    inplace: bool = False,
     errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
     **kwargs,
 ) -> Optional[pd.DataFrame]:
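
The new df_cycle helper is the standard sin/cos encoding for cyclic features. A self-contained sketch of the transform it applies, using plain pandas/numpy rather than the py2ls API:

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
    max_val = 12
    # Map each month onto the unit circle so December (12) and January (1) end up adjacent.
    data["month_sin"] = np.sin(2 * np.pi * data["month"] / max_val)
    data["month_cos"] = np.cos(2 * np.pi * data["month"] / max_val)
    print(data.round(2))

A plain ordinal encoding puts 12 and 1 at opposite ends of the scale; the two-column circle representation preserves the wrap-around distance, which is why it helps distance-based models and neural networks.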
@@ -5304,6 +5325,7 @@ def df_astype(
         "day",
         "month",
         "year",
+        "circular"
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
@@ -5398,10 +5420,22 @@ def df_astype(
                 kwargs.pop("errors", None)
                 data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
+            elif astype == "circular":
+                max_val = kwargs.get('max_val',None)
+                data[column]=df_cycle(data=data,columns=column,max_val=max_val)
             else:
                 # Convert to other types (e.g., float, int)
-
+                if astype=='int':
+                    data[column] = data[column].astype('float').astype('int')
+                else:
+                    data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
+            # format
+            try:
+                if fmt is not None:
+                    data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
+            except Exception as e:
+                print(f"Error while applying the format: {e}")
         except Exception as e:
             print(f"Error converting '{column}' to {astype}: {e}")
         try:
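
Why the new astype == 'int' branch casts through float first: pandas' astype("int") on strings that contain a decimal point raises, while the two-step cast succeeds. A small sketch:

    import pandas as pd

    s = pd.Series(["3.0", "7.5", "10"])
    # s.astype("int") raises ValueError: invalid literal for int() with base 10: '3.0'
    print(s.astype("float").astype("int").tolist())  # [3, 7, 10] (fractions truncated)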
@@ -5874,11 +5908,13 @@ def df_encoder(

 def df_scaler(
     data: pd.DataFrame, # should be numeric dtype
+    scaler=None,
     method="standard",
     columns=None, # default, select all numeric col/row
     inplace=False,
     verbose=False, # show usage
     axis=0, # defalut column-wise
+    return_scaler:bool=False,# True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -5896,31 +5932,49 @@ def df_scaler(
     """
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if scaler is None:
+        methods = ["standard", "minmax", "robust","maxabs"]
+        method = strcmp(method, methods)[0]
+        if method == "standard":
+            from sklearn.preprocessing import StandardScaler
+            if verbose:
+                print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
+                print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
+            scaler = StandardScaler(**kwargs)
+        elif method == "minmax":
+            from sklearn.preprocessing import MinMaxScaler
+            if verbose:
+                print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
+                print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
+                print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
+            scaler = MinMaxScaler(**kwargs)
+        elif method == "robust":
+            from sklearn.preprocessing import RobustScaler
+            if verbose:
+                print("scales the data based on the median and interquartile range, which is robust to outliers.")
+                print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
+            scaler = RobustScaler(**kwargs)
+        elif method=="maxabs":
+            from sklearn.preprocessing import MaxAbsScaler
+            if verbose:
+                print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
+                print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
+            scaler = MaxAbsScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+    if verbose:
+        print(scaler)
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
             columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)

-        scaled_data = scaler.fit_transform(data[columns])
+        # scaled_data = scaler.fit_transform(data[columns])
+        if scaler is None or not hasattr(scaler, 'mean_'):
+            scaled_data = scaler.fit_transform(data[columns])
+        else:
+            scaled_data = scaler.transform(data[columns])

         if inplace:
             data[columns] = scaled_data
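
The hasattr(scaler, 'mean_') test distinguishes an unfitted scaler (fit it on this data) from an already-fitted one (reuse its parameters), which together with return_scaler=True supports the usual fit-on-train, transform-on-test workflow. A sketch with scikit-learn directly; note that mean_ is the fitted attribute of StandardScaler specifically (MinMaxScaler and RobustScaler expose min_ and center_ instead):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    train = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
    test = pd.DataFrame({"x": [4.0, 5.0]})

    scaler = StandardScaler()
    print(hasattr(scaler, "mean_"))       # False: not fitted yet -> fit_transform
    train_scaled = scaler.fit_transform(train)
    print(hasattr(scaler, "mean_"))       # True: fitted -> transform only
    test_scaled = scaler.transform(test)  # test is scaled with the training mean/std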
@@ -5934,7 +5988,10 @@ def df_scaler(
                 axis=1,
             )
             scaled_df = scaled_df[data.columns]  # Maintain column order
-            return scaled_df
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df


     elif axis == 1:
         # Row-wise scaling
@@ -5946,9 +6003,10 @@ def df_scaler(

             print(f"Scaling rows")

-        scaled_data = scaler.fit_transform(
-            numeric_rows.T
-        ).T  # Transpose for scaling and then back
+        # scaled_data = scaler.fit_transform(
+        #     numeric_rows.T
+        # ).T  # Transpose for scaling and then back
+        scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T

         if inplace:
             data.loc[numeric_rows.index] = scaled_data
@@ -5956,7 +6014,10 @@ def df_scaler(
         else:
             scaled_df = data.copy()
             scaled_df.loc[numeric_rows.index] = scaled_data
-            return scaled_df
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df


 def df_special_characters_cleaner(
@@ -6325,6 +6386,7 @@ def df_reducer(
     random_state=1,
     ax=None,
     figsize=None,
+    verbose=True,
     **kwargs,
 ) -> pd.DataFrame:
     dict_methods = {
@@ -6364,7 +6426,8 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
-    print(f"\nprocessing with using {dict_methods[method]}:")
+    if verbose:
+        print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
     if columns is None:
         columns = data.select_dtypes(include="number").columns.tolist()
|
|
6863
6926
|
hue=hue,
|
6864
6927
|
s=size,
|
6865
6928
|
edgecolor=edgecolor,
|
6866
|
-
|
6929
|
+
kind_="scater",
|
6867
6930
|
figsets=dict(
|
6868
6931
|
legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
|
6869
6932
|
xlabel=xlabel if xlabel else None,
|
@@ -7334,10 +7397,13 @@ def evaluate_cluster(
 def df_qc(
     data: pd.DataFrame,
     columns=None,
-
+    skim=False,
     plot_=True,
     max_cols=20, # only for plots
+    hue=None,
     output=False,
+    verbose=True,
+    dir_save=None
 ):
     """
     Usage example:
@@ -7345,22 +7411,24 @@ def df_qc(
     """
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
-
-
+
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
-
-
-
-
-
+    if skim:
+        try:
+            import skimpy
+            skimpy.skim(data)
+        except:
+            numerical_data = data.select_dtypes(include=[np.number])
+            skimpy.skim(numerical_data)
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     res_qc = {}
+    print(f"data.shape:{data.shape}")

     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
@@ -7403,7 +7471,7 @@ def df_qc(
     numeric_df = data.select_dtypes(include=[np.number]).dropna()
     vif_data = pd.DataFrame()
     res_qc["vif"]=vif_data
-    if numeric_df.shape[1] > 1:
+    if numeric_df.shape[1] > 1 and not numeric_df.empty:
         vif_data["feature"] = numeric_df.columns
         vif_data["VIF"] = [
             variance_inflation_factor(numeric_df.values, i)
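
For reference, variance_inflation_factor(exog, i) regresses column i on the remaining columns and reports 1/(1 - R^2), so nearly collinear features get large values. A minimal sketch of the computation this hunk guards against empty input:

    import numpy as np
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    rng = np.random.default_rng(0)
    x1 = rng.normal(size=100)
    df = pd.DataFrame({
        "x1": x1,
        "x2": 2 * x1 + rng.normal(scale=0.1, size=100),  # nearly collinear with x1
        "x3": rng.normal(size=100),
    })
    vif = pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
    })
    print(vif)  # x1 and x2 show large VIFs; x3 stays near 1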
@@ -7495,72 +7563,70 @@ def df_qc(
     # Report generation
     if verbose:
         print("=== QC Report Summary ===")
-        print("\
-
-        print(
-
-
-
-
-
-
-
-
-
-
-
-
+        print("\n⤵ Summary Statistics:")
+        display(res_qc["summary_statistics"])
+        print("\n⤵ Data Types:")
+        display(res_qc["data_types"])
+        if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
+            print(" ⤵ Missing Values Counts:")
+            display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+        print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
+
+        if any(res_qc["outlier_num"]):
+            print("\n⤵ Outlier Report:")
+            display(res_qc["outlier_num"])
+        if any(res_qc["unique_values"]):
+            print("\n⤵ Unique Values per Column:")
+            display(res_qc["unique_values"])
+
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None

         if res_qc["empty_columns"]:
-            print("\
-
-        print("\nOutlier Report:")
-        print(res_qc["outlier_num"])
-        print("\nPercentage of Values Replaced per Column:")
-        print(res_qc["outlier_percentage"])
+            print("\n⤵ Empty Columns:", res_qc["empty_columns"])

-
-
-
+        if any(res_qc["high_correlations"]):
+            print("\n⤵ High Correlations (>|0.9|):")
+            for col1, col2 in res_qc["high_correlations"]:
+                print(f"  {col1} and {col2}")

         if "vif" in res_qc:
-            print("\
+            print("\n⤵ Features with High VIF (>|5|):")
             print(res_qc["vif"])

-
-
-
-
-
-
-
-
-
-
-
-
-                    f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
-                )
-
-        print("\nSummary Statistics:")
-        print(res_qc["summary_statistics"])
+        if any(res_qc["high_cardinality_categoricals"]):
+            print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
+            print(res_qc["high_cardinality_categoricals"])
+        if any(res_qc["inconsistent_types"]):
+            print("\n⤵ Inconsistent Data Types:")
+            display(res_qc["inconsistent_types"])
+        if any(res_qc["text_length_analysis"]):
+            print("\n⤵ Text Length Analysis:")
+            for col, stats in res_qc["text_length_analysis"].items():
+                print(
+                    f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
+                )

     if res_qc["warnings"]:
         print("\nWarnings:")
         for warning in res_qc["warnings"]:
             print(" -", warning)
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=
-    if output:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
+    if output or not plot_:
         return res_qc
     return None


-def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
+    from datetime import datetime
+    now_ = datetime.now().strftime("%y%m%d_%H%M%S")

     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
@@ -7574,91 +7640,65 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
     )
     if len(missing_data) > max_cols:
         missing_data = missing_data[:max_cols]
-
-
-
-        hue=missing_data.index,
-        palette=get_color(len(missing_data), cmap="
+    ax_missing_data=sns.barplot(
+        y=missing_data.index,
+        x=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(
-
-    ax2 = ax.twinx()
-    # Plot missing value percentages
-    missing_percentage = res_qc["missing_percentage"][
-        res_qc["missing_percentage"] > 0
-    ].sort_values(ascending=False)
-    sns.barplot(
-        x=missing_percentage.index,
-        y=missing_percentage.values,
-        hue=missing_percentage.index,
-        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
-        ax=ax2,#nexttile(),
-    )
-    figsets(xangle=45, ylabel="%",ax=ax2)
-    ax2.tick_params(axis="y", color='r',labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)

     outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
     if len(outlier_num) > max_cols:
         outlier_num = outlier_num[:max_cols]
     ax_outlier_num=sns.barplot(
-
-
+        y=outlier_num.index,
+        x=outlier_num.values,
         hue=outlier_num.index,
         palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(
-
-
-
-
-
-
-
-
-
-
-    )
-    figsets(
-        xangle=45,
-        ylabel="%",
-        xlabel=None,
-        ylim=[0, outlier_percentage.max() + 2],
-        ax=ax_outlier_percentage
-    )
-    ax2.tick_params(axis="y", color='r',labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
+
+    #!
+    try:
+        for col in data.select_dtypes(include='category').columns:
+            sns.countplot(y=data[col],
+                palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
+                ax=nexttile())
+            figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
+    except Exception as e:
+        pass

     # Skewness and Kurtosis Plots
     skewness = res_qc["skewness"].sort_values(ascending=False)
     kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
     if not skewness.empty:
         ax_skewness=sns.barplot(
-
-
+            y=skewness.index,
+            x=skewness.values,
             hue=skewness.index,
             palette=get_color(len(skewness), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Skewed Numeric Columns (Skewness > 1)",
-
+            xlabel="Skewness",ylabel=None,ax=ax_skewness,
+            fontsize=8 if len(skewness)<=20 else 6
         )
     if not kurtosis.empty:
         ax_kurtosis=sns.barplot(
-
-
+            y=kurtosis.index,
+            x=kurtosis.values,
             hue=kurtosis.index,
             palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
-
+            xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
+            fontsize=8 if len(kurtosis)<=20 else 6
         )

     # Entropy for Categorical Variables
@@ -7666,56 +7706,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
         ascending=False
     )
     ax_entropy_data=sns.barplot(
-
+        y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
+        palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
+        ax=nexttile()
     )
     figsets(
-
-        xlabel="Categorical Columns",
+        ylabel="Categorical Columns",
         title="Entropy of Categorical Variables",
-
-        ax=ax_entropy_data
-
-
-
-        data=data[res_qc["distribution_analysis"].index],
-        orient="v",
-        palette="Set3",
-        ax=nexttile(),
-    )
-    figsets(
-        xangle=45,
-        title="Range for Numeric Columns",
-        ylabel="#",
-        ax=ax_iqr
-    )
+        xlabel="Entropy (bits)",
+        ax=ax_entropy_data,
+        fontsize=8 if len(entropy_data)<=20 else 6
+    )
+
     # unique counts
     unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
     ax_unique_counts_=sns.barplot(
-
-
+        y=unique_counts.index,
+        x=unique_counts.values,
         hue=unique_counts.index,
-        palette=get_color(len(unique_counts)
+        palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
         ax=nexttile())
     figsets(
-        xangle=45,
         title="Unique Counts",
-
-
-        ax=ax_unique_counts_
+        ylabel=None,
+        xlabel="#",
+        ax=ax_unique_counts_,
+        fontsize=8 if len(unique_counts)<=20 else 6
     )
     # Binary Checking
-    ax_unique_counts=sns.barplot(
-
-        hue=unique_counts[unique_counts<
-        palette=get_color(len(unique_counts[unique_counts<
+    ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
+        x=unique_counts[unique_counts<8].values,
+        hue=unique_counts[unique_counts<8].index,
+        palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
        ax=nexttile())
-    plt.
+    plt.axvline(x=2, color="r", linestyle="--", lw=2)
     figsets(
-
-        xlabel=None,
+        ylabel=None,
        title="Binary Checking",
-
-        ax=ax_unique_counts
+        xlabel="#",
+        ax=ax_unique_counts,
+        fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
     )

     # dtypes counts
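
The entropy panel now labels its axis in bits. One common definition consistent with that label is Shannon entropy in base 2 over each column's value frequencies; the exact computation inside df_qc is not shown in this diff, but a sketch of the quantity would be:

    import pandas as pd
    from scipy.stats import entropy

    s = pd.Series(["a", "a", "b", "b", "c", "c", "c", "c"])
    p = s.value_counts(normalize=True)  # c: 0.5, a: 0.25, b: 0.25
    print(entropy(p, base=2))           # 1.5 bits: how evenly the categories are used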
@@ -7751,14 +7781,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
             ha="center",
             va="top",
             c="k",
-            fontsize=8,
+            fontsize=8 if len(dtype_counts.index)<=20 else 6,
             rotation=0,
         )
     figsets(
         xlabel=None,
         title="Dtypes",
         ylabel="#",
-        ax=ax_dtype_counts
+        ax=ax_dtype_counts,
+        fontsize=8 if len(dtype_counts.index)<=20 else 6,
     )

     # High cardinality: Show top categorical columns by unique value count
@@ -7772,24 +7803,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)

     if high_cardinality:
         ax_high_cardinality=sns.barplot(
-
-
+            y=list(high_cardinality.keys()),
+            x=list(high_cardinality.values()),
             hue=list(high_cardinality.keys()),
-            palette=
+            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
+            ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="High Cardinality Categorical Columns",
-
-            ax=ax_high_cardinality
+            xlabel="Unique Value Count",
+            ax=ax_high_cardinality,
+            fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
         )
     if res_qc["low_variance_features"]:
         low_variance_data = data[res_qc["low_variance_features"]].copy()
         for col in low_variance_data.columns:
-            sns.histplot(
+            ax_low_variance_features=sns.histplot(
                 low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
             )
-
+            figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
+                fontsize=8 if len(low_variance_data[col])<=20 else 6)

     # VIF plot for multicollinearity detection
     if "vif" in res_qc and not res_qc["vif"].empty:
@@ -7800,23 +7833,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
             x="VIF",
             y="feature",
             hue="VIF",
-            palette=get_color(len(vif_data)
+            palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
             ax=nexttile())
         figsets(
-            xangle=45,
             title="Variance Inflation Factor(VIF)",
-            xlabel="
+            xlabel="VIF",
             ylabel="Features",
             legend=None,
-            ax=ax_vif
+            ax=ax_vif,
+            fontsize=8 if len(vif_data)<=20 else 6
         )

     # Correlation heatmap for numeric columns with high correlation pairs
     if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
-        corr = data.select_dtypes(include=[np.number]).
+        corr = data.select_dtypes(include=[np.number]).corr()
         if corr.shape[1]<=33:
             mask = np.triu(np.ones_like(corr, dtype=bool))
-            # Dynamically scale fontsize based on the number of columns
             num_columns = corr.shape[1]
             fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # Scale between 8 and 12
@@ -7826,7 +7858,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 annot=True,
                 cmap="coolwarm",
                 center=0,
-                fmt=".
+                fmt=".1f",
                 linewidths=0.5,
                 vmin=-1, vmax=1,
                 ax=nexttile(2, 2),
@@ -7839,7 +7871,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 title="Correlation Heatmap",
                 ax=ax_heatmap
             )
+    # save figure
+    if dir_save:
+        figsave(dir_save,f"qc_plot_{now_}.pdf")

+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+    #! check distribution
+    data_num = data.select_dtypes(include=np.number)
+    if len(data_num) > max_cols:
+        data_num = data_num.iloc[:,:max_cols]
+
+    data_num = df_scaler(data=data_num, method='standard')
+
+    import scipy.stats as stats
+    for column in data_num.columns:
+        #* Shapiro-Wilk test for normality
+        stat, p_value = stats.shapiro(data_num[column])
+        normality = "norm" if p_value > 0.05 else "not_norm"
+        #* Plot histogram
+        ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
+        x_min, x_max = ax_hist.get_xlim()
+        y_min, y_max = ax_hist.get_ylim()
+        ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
+            f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
+            ha='center', va='top')
+        figsets(title=column,ax=ax_hist)
+        ax_twin=ax_hist.twinx()
+        #* Q-Q plot
+        stats.probplot(data_num[column], dist="norm", plot=ax_twin)
+        figsets(ylabel=f'Q-Q Plot:{column}',title=None)
+    # save figure
+    if dir_save:
+        figsave(dir_save,f"qq_plot_{now_}.pdf")
 def use_pd(
     func_name="excel",
     verbose=True,
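
The distribution check added above pairs a Shapiro-Wilk p-value with a Q-Q plot for each numeric column. A standalone sketch of that combination using only scipy and matplotlib:

    import numpy as np
    import scipy.stats as stats
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(1)
    x = rng.normal(size=200)

    stat, p_value = stats.shapiro(x)  # H0: the sample is normally distributed
    print("norm" if p_value > 0.05 else "not_norm", f"(p={p_value:.3f})")

    fig, ax = plt.subplots()
    stats.probplot(x, dist="norm", plot=ax)  # points near the line -> plausibly normal
    plt.show()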