py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ips.py +213 -195
- py2ls/ml2ls.py +774 -66
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/RECORD +12 -9
- {py2ls-0.2.4.22.dist-info → py2ls-0.2.4.24.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -4,6 +4,8 @@ import sys, os
|
|
4
4
|
from IPython.display import display
|
5
5
|
from typing import List, Optional, Union
|
6
6
|
|
7
|
+
from regex import X
|
8
|
+
|
7
9
|
try:
|
8
10
|
get_ipython().run_line_magic("load_ext", "autoreload")
|
9
11
|
get_ipython().run_line_magic("autoreload", "2")
|
@@ -1828,16 +1830,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
|
|
1828
1830
|
# Check data types
|
1829
1831
|
data_types = df.dtypes
|
1830
1832
|
# messages.append(f"Data types of columns:\n{data_types}")
|
1831
|
-
|
1832
|
-
# Check for constant values across any column
|
1833
|
-
constant_columns = df.columns[df.nunique() == 1].tolist()
|
1834
|
-
if constant_columns:
|
1835
|
-
messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
|
1836
|
-
is_abnormal = True
|
1837
|
-
if verbose:
|
1838
|
-
print(f"df.columns[df.nunique() == 1].tolist()")
|
1839
|
-
if verbose:
|
1840
|
-
print("5", is_abnormal)
|
1833
|
+
|
1841
1834
|
# Check for an unreasonable number of rows or columns
|
1842
1835
|
if actual_shape[0] < 2 or actual_shape[1] < 2:
|
1843
1836
|
messages.append(
|
@@ -1989,30 +1982,29 @@ def fload(fpath, kind=None, **kwargs):
|
|
1989
1982
|
def load_csv(fpath, **kwargs):
|
1990
1983
|
from pandas.errors import EmptyDataError
|
1991
1984
|
|
1992
|
-
engine = kwargs.pop("engine", "pyarrow")
|
1993
|
-
sep = kwargs.pop("sep",
|
1994
|
-
index_col = kwargs.pop("index_col", None)
|
1995
|
-
memory_map = kwargs.pop("memory_map", False)
|
1996
|
-
skipinitialspace = kwargs.pop("skipinitialspace", False)
|
1997
|
-
encoding = kwargs.pop("encoding", "utf-8")
|
1998
|
-
on_bad_lines = kwargs.pop("on_bad_lines", "skip")
|
1999
|
-
comment = kwargs.pop("comment", None)
|
2000
|
-
fmt = kwargs.pop("fmt", False)
|
2001
|
-
chunksize = kwargs.pop("chunksize", None)
|
1985
|
+
engine = kwargs.pop("engine", "pyarrow")# default: None
|
1986
|
+
sep = kwargs.pop("sep", None)# default: ','
|
1987
|
+
index_col = kwargs.pop("index_col", None)# default: None
|
1988
|
+
memory_map = kwargs.pop("memory_map", False)# default: False
|
1989
|
+
skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
|
1990
|
+
encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
|
1991
|
+
on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
|
1992
|
+
comment = kwargs.pop("comment", None)# default: None
|
1993
|
+
fmt = kwargs.pop("fmt", False)# default:
|
1994
|
+
chunksize = kwargs.pop("chunksize", None)# default: None
|
2002
1995
|
engine = "c" if chunksize else engine # when chunksize, recommend 'c'
|
2003
|
-
low_memory = kwargs.pop("low_memory", True)
|
1996
|
+
low_memory = kwargs.pop("low_memory", True)# default: True
|
2004
1997
|
low_memory = (
|
2005
1998
|
False if chunksize else True
|
2006
|
-
) # when chunksize, recommend low_memory=False
|
1999
|
+
) # when chunksize, recommend low_memory=False # default:
|
2007
2000
|
verbose = kwargs.pop("verbose", False)
|
2008
2001
|
if run_once_within():
|
2009
2002
|
use_pd("read_csv", verbose=verbose)
|
2010
2003
|
|
2011
|
-
if comment is None:
|
2004
|
+
if comment is None:# default: None
|
2012
2005
|
comment = get_comment(
|
2013
2006
|
fpath, comment=None, encoding="utf-8", lines_to_check=5
|
2014
2007
|
)
|
2015
|
-
|
2016
2008
|
try:
|
2017
2009
|
df = pd.read_csv(
|
2018
2010
|
fpath,
|
@@ -2107,8 +2099,8 @@ def fload(fpath, kind=None, **kwargs):
|
|
2107
2099
|
separators = [",", "\t", ";", "|", " "]
|
2108
2100
|
for sep in separators:
|
2109
2101
|
sep2show = sep if sep != "\t" else "\\t"
|
2110
|
-
|
2111
|
-
|
2102
|
+
if verbose:
|
2103
|
+
print(f'trying with: engine=pyarrow, sep="{sep2show}"')
|
2112
2104
|
try:
|
2113
2105
|
df = pd.read_csv(
|
2114
2106
|
fpath,
|
@@ -2137,8 +2129,9 @@ def fload(fpath, kind=None, **kwargs):
|
|
2137
2129
|
separators = [",", "\t", ";", "|", " "]
|
2138
2130
|
for sep in separators:
|
2139
2131
|
try:
|
2140
|
-
|
2141
|
-
|
2132
|
+
sep2show = sep if sep != "\t" else "\\t"
|
2133
|
+
if verbose:
|
2134
|
+
print(f"trying with: engine={engine}, sep='{sep2show}'")
|
2142
2135
|
# print(".")
|
2143
2136
|
df = pd.read_csv(
|
2144
2137
|
fpath,
|
@@ -2171,8 +2164,9 @@ def fload(fpath, kind=None, **kwargs):
|
|
2171
2164
|
continue
|
2172
2165
|
else:
|
2173
2166
|
pass
|
2174
|
-
|
2175
|
-
|
2167
|
+
print(kwargs)
|
2168
|
+
# if is_df_abnormal(df,verbose=verbose):
|
2169
|
+
# df=pd.read_csv(fpath,**kwargs)
|
2176
2170
|
display(df.head(2))
|
2177
2171
|
print(f"shape: {df.shape}")
|
2178
2172
|
return df
|
@@ -2386,7 +2380,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
2386
2380
|
elif kind == "xml":
|
2387
2381
|
return load_xml(fpath)
|
2388
2382
|
elif kind in ["csv", "tsv"]:
|
2389
|
-
verbose = kwargs.pop("verbose", False)
|
2383
|
+
# verbose = kwargs.pop("verbose", False)
|
2390
2384
|
if run_once_within():
|
2391
2385
|
use_pd("read_csv")
|
2392
2386
|
content = load_csv(fpath, **kwargs)
|
@@ -5236,15 +5230,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
|
|
5236
5230
|
data = data.explode(column, ignore_index=True)
|
5237
5231
|
return data
|
5238
5232
|
|
5233
|
+
def df_circular(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
|
5234
|
+
"""
|
5235
|
+
Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
|
5236
|
+
Usage:
|
5237
|
+
data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
|
5238
|
+
# df_circular month cyclically
|
5239
|
+
data = df_circular(data, 'month', 12)
|
5240
|
+
"""
|
5241
|
+
if columns is None:
|
5242
|
+
columns = list(data.columns) # If no columns specified, use all columns
|
5243
|
+
if max_val is None:
|
5244
|
+
max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
|
5245
|
+
if isinstance(columns, str):
|
5246
|
+
columns = [columns] # If a single column name is provided as a string, convert it to a list
|
5247
|
+
|
5248
|
+
# Check if inplace is True, so we modify the original dataframe
|
5249
|
+
if inplace:
|
5250
|
+
# Modify the data in place, no return statement needed
|
5251
|
+
for col in columns:
|
5252
|
+
data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
|
5253
|
+
data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
|
5254
|
+
else:
|
5255
|
+
# If inplace is False, return the modified dataframe
|
5256
|
+
new_data = data.copy()
|
5257
|
+
for col in columns:
|
5258
|
+
new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
|
5259
|
+
new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
|
5260
|
+
return new_data
|
5261
|
+
|
5239
5262
|
|
5240
5263
|
# ! DataFrame
|
5241
5264
|
def df_astype(
|
5242
5265
|
data: pd.DataFrame,
|
5243
5266
|
columns: Optional[Union[str, List[str]]] = None,
|
5244
|
-
astype: str = "datetime",
|
5267
|
+
astype: str = None,#"datetime",
|
5245
5268
|
skip_row: Union[str, list] = None,
|
5246
5269
|
fmt: Optional[str] = None,
|
5247
|
-
inplace: bool =
|
5270
|
+
inplace: bool = False,
|
5248
5271
|
errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
|
5249
5272
|
**kwargs,
|
5250
5273
|
) -> Optional[pd.DataFrame]:
|
@@ -5304,6 +5327,7 @@ def df_astype(
|
|
5304
5327
|
"day",
|
5305
5328
|
"month",
|
5306
5329
|
"year",
|
5330
|
+
"circular"
|
5307
5331
|
]
|
5308
5332
|
# If inplace is False, make a copy of the DataFrame
|
5309
5333
|
if not inplace:
|
@@ -5398,10 +5422,22 @@ def df_astype(
|
|
5398
5422
|
kwargs.pop("errors", None)
|
5399
5423
|
data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
|
5400
5424
|
# print(f"Successfully converted '{column}' to timedelta.")
|
5425
|
+
elif astype == "circular":
|
5426
|
+
max_val = kwargs.get('max_val',None)
|
5427
|
+
data[column]=df_circular(data=data,columns=column,max_val=max_val)
|
5401
5428
|
else:
|
5402
5429
|
# Convert to other types (e.g., float, int)
|
5403
|
-
|
5430
|
+
if astype=='int':
|
5431
|
+
data[column] = data[column].astype('float').astype('int')
|
5432
|
+
else:
|
5433
|
+
data[column] = data[column].astype(astype)
|
5404
5434
|
# print(f"Successfully converted '{column}' to {astype}.")
|
5435
|
+
# format
|
5436
|
+
try:
|
5437
|
+
if fmt is not None:
|
5438
|
+
data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
|
5439
|
+
except Exception as e:
|
5440
|
+
print(f"设置格式的时候有误: {e}")
|
5405
5441
|
except Exception as e:
|
5406
5442
|
print(f"Error converting '{column}' to {astype}: {e}")
|
5407
5443
|
try:
|
@@ -6325,6 +6361,7 @@ def df_reducer(
|
|
6325
6361
|
random_state=1,
|
6326
6362
|
ax=None,
|
6327
6363
|
figsize=None,
|
6364
|
+
verbose=True,
|
6328
6365
|
**kwargs,
|
6329
6366
|
) -> pd.DataFrame:
|
6330
6367
|
dict_methods = {
|
@@ -6364,7 +6401,8 @@ def df_reducer(
|
|
6364
6401
|
# "autoencoder","nmf",
|
6365
6402
|
]
|
6366
6403
|
method = strcmp(method, methods)[0]
|
6367
|
-
|
6404
|
+
if verbose:
|
6405
|
+
print(f"\nprocessing with using {dict_methods[method]}:")
|
6368
6406
|
xlabel, ylabel = None, None
|
6369
6407
|
if columns is None:
|
6370
6408
|
columns = data.select_dtypes(include="number").columns.tolist()
|
@@ -6863,7 +6901,7 @@ def df_reducer(
|
|
6863
6901
|
hue=hue,
|
6864
6902
|
s=size,
|
6865
6903
|
edgecolor=edgecolor,
|
6866
|
-
|
6904
|
+
kind_="scater",
|
6867
6905
|
figsets=dict(
|
6868
6906
|
legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
|
6869
6907
|
xlabel=xlabel if xlabel else None,
|
@@ -7334,10 +7372,13 @@ def evaluate_cluster(
|
|
7334
7372
|
def df_qc(
|
7335
7373
|
data: pd.DataFrame,
|
7336
7374
|
columns=None,
|
7337
|
-
|
7375
|
+
skim=False,
|
7338
7376
|
plot_=True,
|
7339
7377
|
max_cols=20, # only for plots
|
7378
|
+
hue=None,
|
7340
7379
|
output=False,
|
7380
|
+
verbose=True,
|
7381
|
+
dir_save=None
|
7341
7382
|
):
|
7342
7383
|
"""
|
7343
7384
|
Usage example:
|
@@ -7345,22 +7386,24 @@ def df_qc(
|
|
7345
7386
|
"""
|
7346
7387
|
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
7347
7388
|
from scipy.stats import skew, kurtosis, entropy
|
7348
|
-
|
7349
|
-
|
7389
|
+
|
7350
7390
|
#! display(data.select_dtypes(include=[np.number]).describe())
|
7351
7391
|
#!skim
|
7352
7392
|
if columns is not None:
|
7353
7393
|
if isinstance(columns, (list,pd.core.indexes.base.Index)):
|
7354
7394
|
data=data[columns]
|
7355
|
-
|
7356
|
-
|
7357
|
-
|
7358
|
-
|
7359
|
-
|
7395
|
+
if skim:
|
7396
|
+
try:
|
7397
|
+
import skimpy
|
7398
|
+
skimpy.skim(data)
|
7399
|
+
except:
|
7400
|
+
numerical_data = data.select_dtypes(include=[np.number])
|
7401
|
+
skimpy.skim(numerical_data)
|
7360
7402
|
# Fill completely NaN columns with a default value (e.g., 0)
|
7361
7403
|
data = data.copy()
|
7362
7404
|
data.loc[:, data.isna().all()] = 0
|
7363
7405
|
res_qc = {}
|
7406
|
+
print(f"data.shape:{data.shape}")
|
7364
7407
|
|
7365
7408
|
# Missing values
|
7366
7409
|
res_qc["missing_values"] = data.isnull().sum()
|
@@ -7403,7 +7446,7 @@ def df_qc(
|
|
7403
7446
|
numeric_df = data.select_dtypes(include=[np.number]).dropna()
|
7404
7447
|
vif_data = pd.DataFrame()
|
7405
7448
|
res_qc["vif"]=vif_data
|
7406
|
-
if numeric_df.shape[1] > 1:
|
7449
|
+
if numeric_df.shape[1] > 1 and not numeric_df.empty:
|
7407
7450
|
vif_data["feature"] = numeric_df.columns
|
7408
7451
|
vif_data["VIF"] = [
|
7409
7452
|
variance_inflation_factor(numeric_df.values, i)
|
@@ -7495,69 +7538,70 @@ def df_qc(
|
|
7495
7538
|
# Report generation
|
7496
7539
|
if verbose:
|
7497
7540
|
print("=== QC Report Summary ===")
|
7498
|
-
print("\
|
7499
|
-
|
7500
|
-
print(
|
7501
|
-
|
7502
|
-
|
7503
|
-
|
7504
|
-
|
7505
|
-
|
7506
|
-
|
7507
|
-
|
7508
|
-
|
7509
|
-
|
7510
|
-
|
7511
|
-
|
7512
|
-
|
7513
|
-
|
7541
|
+
print("\n⤵ Summary Statistics:")
|
7542
|
+
display(res_qc["summary_statistics"])
|
7543
|
+
print("\n⤵ Data Types:")
|
7544
|
+
display(res_qc["data_types"])
|
7545
|
+
if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
|
7546
|
+
print(" ⤵ Missing Values Counts:")
|
7547
|
+
display(res_qc["missing_values"][res_qc["missing_values"] > 0])
|
7548
|
+
# print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
|
7549
|
+
print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
|
7550
|
+
|
7551
|
+
if any(res_qc["outlier_num"]):
|
7552
|
+
print("\n⤵ Outlier Report:")
|
7553
|
+
display(res_qc["outlier_num"])
|
7554
|
+
if any(res_qc["unique_values"]):
|
7555
|
+
print("\n⤵ Unique Values per Column:")
|
7556
|
+
display(res_qc["unique_values"])
|
7557
|
+
|
7558
|
+
print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
|
7559
|
+
|
7560
|
+
print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
|
7561
|
+
print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
|
7514
7562
|
|
7515
7563
|
if res_qc["empty_columns"]:
|
7516
|
-
print("\
|
7517
|
-
|
7518
|
-
print("\nOutlier Report:")
|
7519
|
-
print(res_qc["outlier_num"])
|
7520
|
-
print("\nPercentage of Values Replaced per Column:")
|
7521
|
-
print(res_qc["outlier_percentage"])
|
7564
|
+
print("\n⤵ Empty Columns:", res_qc["empty_columns"])
|
7522
7565
|
|
7523
|
-
|
7524
|
-
|
7525
|
-
|
7566
|
+
if any(res_qc["high_correlations"]):
|
7567
|
+
print("\n⤵ High Correlations (>|0.9|):")
|
7568
|
+
for col1, col2 in res_qc["high_correlations"]:
|
7569
|
+
print(f" {col1} and {col2}")
|
7526
7570
|
|
7527
7571
|
if "vif" in res_qc:
|
7528
|
-
print("\
|
7572
|
+
print("\n⤵ Features with High VIF (>|5|):")
|
7529
7573
|
print(res_qc["vif"])
|
7530
7574
|
|
7531
|
-
|
7532
|
-
|
7533
|
-
|
7534
|
-
|
7535
|
-
|
7536
|
-
|
7537
|
-
|
7538
|
-
|
7539
|
-
|
7540
|
-
|
7541
|
-
|
7542
|
-
|
7543
|
-
f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
|
7544
|
-
)
|
7545
|
-
|
7546
|
-
print("\nSummary Statistics:")
|
7547
|
-
print(res_qc["summary_statistics"])
|
7575
|
+
if any(res_qc["high_cardinality_categoricals"]):
|
7576
|
+
print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
|
7577
|
+
print(res_qc["high_cardinality_categoricals"])
|
7578
|
+
if any(res_qc["inconsistent_types"]):
|
7579
|
+
print("\n⤵ Inconsistent Data Types:")
|
7580
|
+
display(res_qc["inconsistent_types"])
|
7581
|
+
if any(res_qc["text_length_analysis"]):
|
7582
|
+
print("\n⤵ Text Length Analysis:")
|
7583
|
+
for col, stats in res_qc["text_length_analysis"].items():
|
7584
|
+
print(
|
7585
|
+
f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
|
7586
|
+
)
|
7548
7587
|
|
7549
7588
|
if res_qc["warnings"]:
|
7550
7589
|
print("\nWarnings:")
|
7551
7590
|
for warning in res_qc["warnings"]:
|
7552
7591
|
print(" -", warning)
|
7553
7592
|
if plot_:
|
7554
|
-
df_qc_plots(data=data, res_qc=res_qc, max_cols=
|
7593
|
+
df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
|
7594
|
+
if dir_save:
|
7595
|
+
try:
|
7596
|
+
figsave(dir_save)
|
7597
|
+
except Exception as e:
|
7598
|
+
print(f"⚠️: {e}")
|
7555
7599
|
if output:
|
7556
7600
|
return res_qc
|
7557
7601
|
return None
|
7558
7602
|
|
7559
7603
|
|
7560
|
-
def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
|
7604
|
+
def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
|
7561
7605
|
import matplotlib.pyplot as plt
|
7562
7606
|
import seaborn as sns
|
7563
7607
|
from .plot import subplot, figsets, get_color
|
@@ -7574,91 +7618,73 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7574
7618
|
)
|
7575
7619
|
if len(missing_data) > max_cols:
|
7576
7620
|
missing_data = missing_data[:max_cols]
|
7577
|
-
|
7578
|
-
|
7579
|
-
|
7580
|
-
hue=missing_data.index,
|
7581
|
-
palette=get_color(len(missing_data), cmap="
|
7621
|
+
ax_missing_data=sns.barplot(
|
7622
|
+
y=missing_data.index,
|
7623
|
+
x=missing_data.values,
|
7624
|
+
hue=missing_data.index,
|
7625
|
+
palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
|
7582
7626
|
ax=nexttile(),
|
7583
7627
|
)
|
7584
|
-
figsets(
|
7585
|
-
|
7586
|
-
ax2 = ax.twinx()
|
7587
|
-
# Plot missing value percentages
|
7588
|
-
missing_percentage = res_qc["missing_percentage"][
|
7589
|
-
res_qc["missing_percentage"] > 0
|
7590
|
-
].sort_values(ascending=False)
|
7591
|
-
sns.barplot(
|
7592
|
-
x=missing_percentage.index,
|
7593
|
-
y=missing_percentage.values,
|
7594
|
-
hue=missing_percentage.index,
|
7595
|
-
palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
|
7596
|
-
ax=ax2,#nexttile(),
|
7597
|
-
)
|
7598
|
-
figsets(xangle=45, ylabel="%",ax=ax2)
|
7599
|
-
ax2.tick_params(axis="y", color='r',labelcolor='r')
|
7600
|
-
ax2.yaxis.label.set_color('r')
|
7628
|
+
figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)
|
7601
7629
|
|
7602
7630
|
outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
|
7603
7631
|
if len(outlier_num) > max_cols:
|
7604
7632
|
outlier_num = outlier_num[:max_cols]
|
7605
7633
|
ax_outlier_num=sns.barplot(
|
7606
|
-
|
7607
|
-
|
7634
|
+
y=outlier_num.index,
|
7635
|
+
x=outlier_num.values,
|
7608
7636
|
hue=outlier_num.index,
|
7609
7637
|
palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
|
7610
7638
|
ax=nexttile(),
|
7611
7639
|
)
|
7612
|
-
figsets(
|
7613
|
-
|
7614
|
-
|
7615
|
-
|
7616
|
-
|
7617
|
-
|
7618
|
-
|
7619
|
-
|
7620
|
-
|
7621
|
-
|
7622
|
-
|
7623
|
-
|
7624
|
-
|
7625
|
-
|
7626
|
-
|
7627
|
-
|
7628
|
-
|
7629
|
-
|
7630
|
-
|
7631
|
-
ax2.tick_params(axis="y", color='r',labelcolor='r')
|
7632
|
-
ax2.yaxis.label.set_color('r')
|
7640
|
+
figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
|
7641
|
+
|
7642
|
+
#!
|
7643
|
+
try:
|
7644
|
+
if data.select_dtypes(include=np.number).shape[1]<=10:
|
7645
|
+
for col in data.select_dtypes(include=np.number).columns:
|
7646
|
+
sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
|
7647
|
+
figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
|
7648
|
+
except:
|
7649
|
+
pass
|
7650
|
+
#!
|
7651
|
+
try:
|
7652
|
+
for col in data.select_dtypes(include='category').columns:
|
7653
|
+
sns.countplot(y=data[col],
|
7654
|
+
palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
|
7655
|
+
ax=nexttile())
|
7656
|
+
figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
|
7657
|
+
except Exception as e:
|
7658
|
+
pass
|
7633
7659
|
|
7634
7660
|
# Skewness and Kurtosis Plots
|
7635
7661
|
skewness = res_qc["skewness"].sort_values(ascending=False)
|
7636
7662
|
kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
|
7637
7663
|
if not skewness.empty:
|
7638
7664
|
ax_skewness=sns.barplot(
|
7639
|
-
|
7640
|
-
|
7665
|
+
y=skewness.index,
|
7666
|
+
x=skewness.values,
|
7641
7667
|
hue=skewness.index,
|
7642
7668
|
palette=get_color(len(skewness), cmap="coolwarm")[::-1],
|
7643
7669
|
ax=nexttile(),
|
7644
7670
|
)
|
7645
7671
|
figsets(
|
7646
|
-
xangle=45,
|
7647
7672
|
title="Highly Skewed Numeric Columns (Skewness > 1)",
|
7648
|
-
|
7673
|
+
xlabel="Skewness",ylabel=None,ax=ax_skewness,
|
7674
|
+
fontsize=8 if len(skewness)<=20 else 6
|
7649
7675
|
)
|
7650
7676
|
if not kurtosis.empty:
|
7651
7677
|
ax_kurtosis=sns.barplot(
|
7652
|
-
|
7653
|
-
|
7678
|
+
y=kurtosis.index,
|
7679
|
+
x=kurtosis.values,
|
7654
7680
|
hue=kurtosis.index,
|
7655
7681
|
palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
|
7656
7682
|
ax=nexttile(),
|
7657
7683
|
)
|
7658
7684
|
figsets(
|
7659
|
-
xangle=45,
|
7660
7685
|
title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
|
7661
|
-
|
7686
|
+
xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
|
7687
|
+
fontsize=8 if len(kurtosis)<=20 else 6
|
7662
7688
|
)
|
7663
7689
|
|
7664
7690
|
# Entropy for Categorical Variables
|
@@ -7666,56 +7692,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7666
7692
|
ascending=False
|
7667
7693
|
)
|
7668
7694
|
ax_entropy_data=sns.barplot(
|
7669
|
-
|
7695
|
+
y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
|
7696
|
+
palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
|
7697
|
+
ax=nexttile()
|
7670
7698
|
)
|
7671
7699
|
figsets(
|
7672
|
-
|
7673
|
-
xlabel="Categorical Columns",
|
7700
|
+
ylabel="Categorical Columns",
|
7674
7701
|
title="Entropy of Categorical Variables",
|
7675
|
-
|
7676
|
-
ax=ax_entropy_data
|
7677
|
-
|
7678
|
-
|
7679
|
-
|
7680
|
-
data=data[res_qc["distribution_analysis"].index],
|
7681
|
-
orient="v",
|
7682
|
-
palette="Set3",
|
7683
|
-
ax=nexttile(),
|
7684
|
-
)
|
7685
|
-
figsets(
|
7686
|
-
xangle=45,
|
7687
|
-
title="Range for Numeric Columns",
|
7688
|
-
ylabel="#",
|
7689
|
-
ax=ax_iqr
|
7690
|
-
)
|
7702
|
+
xlabel="Entropy (bits)",
|
7703
|
+
ax=ax_entropy_data,
|
7704
|
+
fontsize=8 if len(entropy_data)<=20 else 6
|
7705
|
+
)
|
7706
|
+
|
7691
7707
|
# unique counts
|
7692
7708
|
unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
|
7693
7709
|
ax_unique_counts_=sns.barplot(
|
7694
|
-
|
7695
|
-
|
7710
|
+
y=unique_counts.index,
|
7711
|
+
x=unique_counts.values,
|
7696
7712
|
hue=unique_counts.index,
|
7697
|
-
palette=get_color(len(unique_counts)
|
7713
|
+
palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
|
7698
7714
|
ax=nexttile())
|
7699
7715
|
figsets(
|
7700
|
-
xangle=45,
|
7701
7716
|
title="Unique Counts",
|
7702
|
-
|
7703
|
-
|
7704
|
-
ax=ax_unique_counts_
|
7717
|
+
ylabel=None,
|
7718
|
+
xlabel="#",
|
7719
|
+
ax=ax_unique_counts_,
|
7720
|
+
fontsize=8 if len(unique_counts)<=20 else 6
|
7705
7721
|
)
|
7706
7722
|
# Binary Checking
|
7707
|
-
ax_unique_counts=sns.barplot(
|
7708
|
-
|
7709
|
-
hue=unique_counts[unique_counts<
|
7710
|
-
palette=get_color(len(unique_counts[unique_counts<
|
7723
|
+
ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
|
7724
|
+
x=unique_counts[unique_counts<8].values,
|
7725
|
+
hue=unique_counts[unique_counts<8].index,
|
7726
|
+
palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
|
7711
7727
|
ax=nexttile())
|
7712
|
-
plt.
|
7728
|
+
plt.axvline(x=2, color="r", linestyle="--", lw=2)
|
7713
7729
|
figsets(
|
7714
|
-
|
7715
|
-
xlabel=None,
|
7730
|
+
ylabel=None,
|
7716
7731
|
title="Binary Checking",
|
7717
|
-
|
7718
|
-
ax=ax_unique_counts
|
7732
|
+
xlabel="#",
|
7733
|
+
ax=ax_unique_counts,
|
7734
|
+
fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
|
7719
7735
|
)
|
7720
7736
|
|
7721
7737
|
# dtypes counts
|
@@ -7751,14 +7767,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7751
7767
|
ha="center",
|
7752
7768
|
va="top",
|
7753
7769
|
c="k",
|
7754
|
-
fontsize=8,
|
7770
|
+
fontsize=8 if len(dtype_counts.index)<=20 else 6,
|
7755
7771
|
rotation=0,
|
7756
7772
|
)
|
7757
7773
|
figsets(
|
7758
7774
|
xlabel=None,
|
7759
7775
|
title="Dtypes",
|
7760
7776
|
ylabel="#",
|
7761
|
-
ax=ax_dtype_counts
|
7777
|
+
ax=ax_dtype_counts,
|
7778
|
+
fontsize=8 if len(dtype_counts.index)<=20 else 6,
|
7762
7779
|
)
|
7763
7780
|
|
7764
7781
|
# High cardinality: Show top categorical columns by unique value count
|
@@ -7772,24 +7789,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7772
7789
|
|
7773
7790
|
if high_cardinality:
|
7774
7791
|
ax_high_cardinality=sns.barplot(
|
7775
|
-
|
7776
|
-
|
7792
|
+
y=list(high_cardinality.keys()),
|
7793
|
+
x=list(high_cardinality.values()),
|
7777
7794
|
hue=list(high_cardinality.keys()),
|
7778
|
-
palette=
|
7795
|
+
palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
|
7796
|
+
ax=nexttile(),
|
7779
7797
|
)
|
7780
7798
|
figsets(
|
7781
|
-
xangle=45,
|
7782
7799
|
title="High Cardinality Categorical Columns",
|
7783
|
-
|
7784
|
-
ax=ax_high_cardinality
|
7800
|
+
xlabel="Unique Value Count",
|
7801
|
+
ax=ax_high_cardinality,
|
7802
|
+
fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
|
7785
7803
|
)
|
7786
7804
|
if res_qc["low_variance_features"]:
|
7787
7805
|
low_variance_data = data[res_qc["low_variance_features"]].copy()
|
7788
7806
|
for col in low_variance_data.columns:
|
7789
|
-
sns.histplot(
|
7807
|
+
ax_low_variance_features=sns.histplot(
|
7790
7808
|
low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
|
7791
7809
|
)
|
7792
|
-
|
7810
|
+
figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
|
7811
|
+
fontsize=8 if len(low_variance_data[col])<=20 else 6)
|
7793
7812
|
|
7794
7813
|
# VIF plot for multicollinearity detection
|
7795
7814
|
if "vif" in res_qc and not res_qc["vif"].empty:
|
@@ -7800,23 +7819,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7800
7819
|
x="VIF",
|
7801
7820
|
y="feature",
|
7802
7821
|
hue="VIF",
|
7803
|
-
palette=get_color(len(vif_data)
|
7822
|
+
palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
|
7804
7823
|
ax=nexttile())
|
7805
7824
|
figsets(
|
7806
|
-
xangle=45,
|
7807
7825
|
title="Variance Inflation Factor(VIF)",
|
7808
|
-
xlabel="
|
7826
|
+
xlabel="VIF",
|
7809
7827
|
ylabel="Features",
|
7810
7828
|
legend=None,
|
7811
|
-
ax=ax_vif
|
7829
|
+
ax=ax_vif,
|
7830
|
+
fontsize=8 if len(vif_data)<=20 else 6
|
7812
7831
|
)
|
7813
7832
|
|
7814
7833
|
# Correlation heatmap for numeric columns with high correlation pairs
|
7815
7834
|
if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
|
7816
|
-
corr = data.select_dtypes(include=[np.number]).
|
7835
|
+
corr = data.select_dtypes(include=[np.number]).corr()
|
7817
7836
|
if corr.shape[1]<=33:
|
7818
7837
|
mask = np.triu(np.ones_like(corr, dtype=bool))
|
7819
|
-
# Dynamically scale fontsize based on the number of columns
|
7820
7838
|
num_columns = corr.shape[1]
|
7821
7839
|
fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
|
7822
7840
|
|
@@ -7826,7 +7844,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7826
7844
|
annot=True,
|
7827
7845
|
cmap="coolwarm",
|
7828
7846
|
center=0,
|
7829
|
-
fmt=".
|
7847
|
+
fmt=".1f",
|
7830
7848
|
linewidths=0.5,
|
7831
7849
|
vmin=-1, vmax=1,
|
7832
7850
|
ax=nexttile(2, 2),
|