py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ips.py +213 -195
- py2ls/ml2ls.py +768 -61
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/RECORD +12 -9
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -4,6 +4,8 @@ import sys, os
|
|
4
4
|
from IPython.display import display
|
5
5
|
from typing import List, Optional, Union
|
6
6
|
|
7
|
+
from regex import X
|
8
|
+
|
7
9
|
try:
|
8
10
|
get_ipython().run_line_magic("load_ext", "autoreload")
|
9
11
|
get_ipython().run_line_magic("autoreload", "2")
|
@@ -1828,16 +1830,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
|
|
1828
1830
|
# Check data types
|
1829
1831
|
data_types = df.dtypes
|
1830
1832
|
# messages.append(f"Data types of columns:\n{data_types}")
|
1831
|
-
|
1832
|
-
# Check for constant values across any column
|
1833
|
-
constant_columns = df.columns[df.nunique() == 1].tolist()
|
1834
|
-
if constant_columns:
|
1835
|
-
messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
|
1836
|
-
is_abnormal = True
|
1837
|
-
if verbose:
|
1838
|
-
print(f"df.columns[df.nunique() == 1].tolist()")
|
1839
|
-
if verbose:
|
1840
|
-
print("5", is_abnormal)
|
1833
|
+
|
1841
1834
|
# Check for an unreasonable number of rows or columns
|
1842
1835
|
if actual_shape[0] < 2 or actual_shape[1] < 2:
|
1843
1836
|
messages.append(
|
@@ -1989,30 +1982,29 @@ def fload(fpath, kind=None, **kwargs):
|
|
1989
1982
|
def load_csv(fpath, **kwargs):
|
1990
1983
|
from pandas.errors import EmptyDataError
|
1991
1984
|
|
1992
|
-
engine = kwargs.pop("engine", "pyarrow")
|
1993
|
-
sep = kwargs.pop("sep",
|
1994
|
-
index_col = kwargs.pop("index_col", None)
|
1995
|
-
memory_map = kwargs.pop("memory_map", False)
|
1996
|
-
skipinitialspace = kwargs.pop("skipinitialspace", False)
|
1997
|
-
encoding = kwargs.pop("encoding", "utf-8")
|
1998
|
-
on_bad_lines = kwargs.pop("on_bad_lines", "skip")
|
1999
|
-
comment = kwargs.pop("comment", None)
|
2000
|
-
fmt = kwargs.pop("fmt", False)
|
2001
|
-
chunksize = kwargs.pop("chunksize", None)
|
1985
|
+
engine = kwargs.pop("engine", "pyarrow")# default: None
|
1986
|
+
sep = kwargs.pop("sep", None)# default: ','
|
1987
|
+
index_col = kwargs.pop("index_col", None)# default: None
|
1988
|
+
memory_map = kwargs.pop("memory_map", False)# default: False
|
1989
|
+
skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
|
1990
|
+
encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
|
1991
|
+
on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
|
1992
|
+
comment = kwargs.pop("comment", None)# default: None
|
1993
|
+
fmt = kwargs.pop("fmt", False)# default:
|
1994
|
+
chunksize = kwargs.pop("chunksize", None)# default: None
|
2002
1995
|
engine = "c" if chunksize else engine # when chunksize, recommend 'c'
|
2003
|
-
low_memory = kwargs.pop("low_memory", True)
|
1996
|
+
low_memory = kwargs.pop("low_memory", True)# default: True
|
2004
1997
|
low_memory = (
|
2005
1998
|
False if chunksize else True
|
2006
|
-
) # when chunksize, recommend low_memory=False
|
1999
|
+
) # when chunksize, recommend low_memory=False # default:
|
2007
2000
|
verbose = kwargs.pop("verbose", False)
|
2008
2001
|
if run_once_within():
|
2009
2002
|
use_pd("read_csv", verbose=verbose)
|
2010
2003
|
|
2011
|
-
if comment is None:
|
2004
|
+
if comment is None:# default: None
|
2012
2005
|
comment = get_comment(
|
2013
2006
|
fpath, comment=None, encoding="utf-8", lines_to_check=5
|
2014
2007
|
)
|
2015
|
-
|
2016
2008
|
try:
|
2017
2009
|
df = pd.read_csv(
|
2018
2010
|
fpath,
|
@@ -2107,8 +2099,8 @@ def fload(fpath, kind=None, **kwargs):
|
|
2107
2099
|
separators = [",", "\t", ";", "|", " "]
|
2108
2100
|
for sep in separators:
|
2109
2101
|
sep2show = sep if sep != "\t" else "\\t"
|
2110
|
-
|
2111
|
-
|
2102
|
+
if verbose:
|
2103
|
+
print(f'trying with: engine=pyarrow, sep="{sep2show}"')
|
2112
2104
|
try:
|
2113
2105
|
df = pd.read_csv(
|
2114
2106
|
fpath,
|
@@ -2137,8 +2129,9 @@ def fload(fpath, kind=None, **kwargs):
|
|
2137
2129
|
separators = [",", "\t", ";", "|", " "]
|
2138
2130
|
for sep in separators:
|
2139
2131
|
try:
|
2140
|
-
|
2141
|
-
|
2132
|
+
sep2show = sep if sep != "\t" else "\\t"
|
2133
|
+
if verbose:
|
2134
|
+
print(f"trying with: engine={engine}, sep='{sep2show}'")
|
2142
2135
|
# print(".")
|
2143
2136
|
df = pd.read_csv(
|
2144
2137
|
fpath,
|
@@ -2171,8 +2164,9 @@ def fload(fpath, kind=None, **kwargs):
|
|
2171
2164
|
continue
|
2172
2165
|
else:
|
2173
2166
|
pass
|
2174
|
-
|
2175
|
-
|
2167
|
+
print(kwargs)
|
2168
|
+
# if is_df_abnormal(df,verbose=verbose):
|
2169
|
+
# df=pd.read_csv(fpath,**kwargs)
|
2176
2170
|
display(df.head(2))
|
2177
2171
|
print(f"shape: {df.shape}")
|
2178
2172
|
return df
|
@@ -2386,7 +2380,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
2386
2380
|
elif kind == "xml":
|
2387
2381
|
return load_xml(fpath)
|
2388
2382
|
elif kind in ["csv", "tsv"]:
|
2389
|
-
verbose = kwargs.pop("verbose", False)
|
2383
|
+
# verbose = kwargs.pop("verbose", False)
|
2390
2384
|
if run_once_within():
|
2391
2385
|
use_pd("read_csv")
|
2392
2386
|
content = load_csv(fpath, **kwargs)
|
@@ -5236,15 +5230,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
|
|
5236
5230
|
data = data.explode(column, ignore_index=True)
|
5237
5231
|
return data
|
5238
5232
|
|
5233
|
+
def df_circular(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
|
5234
|
+
"""
|
5235
|
+
Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
|
5236
|
+
Usage:
|
5237
|
+
data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
|
5238
|
+
# df_circular month cyclically
|
5239
|
+
data = df_circular(data, 'month', 12)
|
5240
|
+
"""
|
5241
|
+
if columns is None:
|
5242
|
+
columns = list(data.columns) # If no columns specified, use all columns
|
5243
|
+
if max_val is None:
|
5244
|
+
max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
|
5245
|
+
if isinstance(columns, str):
|
5246
|
+
columns = [columns] # If a single column name is provided as a string, convert it to a list
|
5247
|
+
|
5248
|
+
# Check if inplace is True, so we modify the original dataframe
|
5249
|
+
if inplace:
|
5250
|
+
# Modify the data in place, no return statement needed
|
5251
|
+
for col in columns:
|
5252
|
+
data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
|
5253
|
+
data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
|
5254
|
+
else:
|
5255
|
+
# If inplace is False, return the modified dataframe
|
5256
|
+
new_data = data.copy()
|
5257
|
+
for col in columns:
|
5258
|
+
new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
|
5259
|
+
new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
|
5260
|
+
return new_data
|
5261
|
+
|
5239
5262
|
|
5240
5263
|
# ! DataFrame
|
5241
5264
|
def df_astype(
|
5242
5265
|
data: pd.DataFrame,
|
5243
5266
|
columns: Optional[Union[str, List[str]]] = None,
|
5244
|
-
astype: str = "datetime",
|
5267
|
+
astype: str = None,#"datetime",
|
5245
5268
|
skip_row: Union[str, list] = None,
|
5246
5269
|
fmt: Optional[str] = None,
|
5247
|
-
inplace: bool =
|
5270
|
+
inplace: bool = False,
|
5248
5271
|
errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
|
5249
5272
|
**kwargs,
|
5250
5273
|
) -> Optional[pd.DataFrame]:
|
@@ -5304,6 +5327,7 @@ def df_astype(
|
|
5304
5327
|
"day",
|
5305
5328
|
"month",
|
5306
5329
|
"year",
|
5330
|
+
"circular"
|
5307
5331
|
]
|
5308
5332
|
# If inplace is False, make a copy of the DataFrame
|
5309
5333
|
if not inplace:
|
@@ -5398,10 +5422,22 @@ def df_astype(
|
|
5398
5422
|
kwargs.pop("errors", None)
|
5399
5423
|
data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
|
5400
5424
|
# print(f"Successfully converted '{column}' to timedelta.")
|
5425
|
+
elif astype == "circular":
|
5426
|
+
max_val = kwargs.get('max_val',None)
|
5427
|
+
data[column]=df_circular(data=data,columns=column,max_val=max_val)
|
5401
5428
|
else:
|
5402
5429
|
# Convert to other types (e.g., float, int)
|
5403
|
-
|
5430
|
+
if astype=='int':
|
5431
|
+
data[column] = data[column].astype('float').astype('int')
|
5432
|
+
else:
|
5433
|
+
data[column] = data[column].astype(astype)
|
5404
5434
|
# print(f"Successfully converted '{column}' to {astype}.")
|
5435
|
+
# format
|
5436
|
+
try:
|
5437
|
+
if fmt is not None:
|
5438
|
+
data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
|
5439
|
+
except Exception as e:
|
5440
|
+
print(f"设置格式的时候有误: {e}")
|
5405
5441
|
except Exception as e:
|
5406
5442
|
print(f"Error converting '{column}' to {astype}: {e}")
|
5407
5443
|
try:
|
@@ -6325,6 +6361,7 @@ def df_reducer(
|
|
6325
6361
|
random_state=1,
|
6326
6362
|
ax=None,
|
6327
6363
|
figsize=None,
|
6364
|
+
verbose=True,
|
6328
6365
|
**kwargs,
|
6329
6366
|
) -> pd.DataFrame:
|
6330
6367
|
dict_methods = {
|
@@ -6364,7 +6401,8 @@ def df_reducer(
|
|
6364
6401
|
# "autoencoder","nmf",
|
6365
6402
|
]
|
6366
6403
|
method = strcmp(method, methods)[0]
|
6367
|
-
|
6404
|
+
if verbose:
|
6405
|
+
print(f"\nprocessing with using {dict_methods[method]}:")
|
6368
6406
|
xlabel, ylabel = None, None
|
6369
6407
|
if columns is None:
|
6370
6408
|
columns = data.select_dtypes(include="number").columns.tolist()
|
@@ -6863,7 +6901,7 @@ def df_reducer(
|
|
6863
6901
|
hue=hue,
|
6864
6902
|
s=size,
|
6865
6903
|
edgecolor=edgecolor,
|
6866
|
-
|
6904
|
+
kind_="scater",
|
6867
6905
|
figsets=dict(
|
6868
6906
|
legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
|
6869
6907
|
xlabel=xlabel if xlabel else None,
|
@@ -7334,10 +7372,13 @@ def evaluate_cluster(
|
|
7334
7372
|
def df_qc(
|
7335
7373
|
data: pd.DataFrame,
|
7336
7374
|
columns=None,
|
7337
|
-
|
7375
|
+
skim=False,
|
7338
7376
|
plot_=True,
|
7339
7377
|
max_cols=20, # only for plots
|
7378
|
+
hue=None,
|
7340
7379
|
output=False,
|
7380
|
+
verbose=True,
|
7381
|
+
dir_save=None
|
7341
7382
|
):
|
7342
7383
|
"""
|
7343
7384
|
Usage example:
|
@@ -7345,22 +7386,24 @@ def df_qc(
|
|
7345
7386
|
"""
|
7346
7387
|
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
7347
7388
|
from scipy.stats import skew, kurtosis, entropy
|
7348
|
-
|
7349
|
-
|
7389
|
+
|
7350
7390
|
#! display(data.select_dtypes(include=[np.number]).describe())
|
7351
7391
|
#!skim
|
7352
7392
|
if columns is not None:
|
7353
7393
|
if isinstance(columns, (list,pd.core.indexes.base.Index)):
|
7354
7394
|
data=data[columns]
|
7355
|
-
|
7356
|
-
|
7357
|
-
|
7358
|
-
|
7359
|
-
|
7395
|
+
if skim:
|
7396
|
+
try:
|
7397
|
+
import skimpy
|
7398
|
+
skimpy.skim(data)
|
7399
|
+
except:
|
7400
|
+
numerical_data = data.select_dtypes(include=[np.number])
|
7401
|
+
skimpy.skim(numerical_data)
|
7360
7402
|
# Fill completely NaN columns with a default value (e.g., 0)
|
7361
7403
|
data = data.copy()
|
7362
7404
|
data.loc[:, data.isna().all()] = 0
|
7363
7405
|
res_qc = {}
|
7406
|
+
print(f"data.shape:{data.shape}")
|
7364
7407
|
|
7365
7408
|
# Missing values
|
7366
7409
|
res_qc["missing_values"] = data.isnull().sum()
|
@@ -7403,7 +7446,7 @@ def df_qc(
|
|
7403
7446
|
numeric_df = data.select_dtypes(include=[np.number]).dropna()
|
7404
7447
|
vif_data = pd.DataFrame()
|
7405
7448
|
res_qc["vif"]=vif_data
|
7406
|
-
if numeric_df.shape[1] > 1:
|
7449
|
+
if numeric_df.shape[1] > 1 and not numeric_df.empty:
|
7407
7450
|
vif_data["feature"] = numeric_df.columns
|
7408
7451
|
vif_data["VIF"] = [
|
7409
7452
|
variance_inflation_factor(numeric_df.values, i)
|
@@ -7495,69 +7538,70 @@ def df_qc(
|
|
7495
7538
|
# Report generation
|
7496
7539
|
if verbose:
|
7497
7540
|
print("=== QC Report Summary ===")
|
7498
|
-
print("\
|
7499
|
-
|
7500
|
-
print(
|
7501
|
-
|
7502
|
-
|
7503
|
-
|
7504
|
-
|
7505
|
-
|
7506
|
-
|
7507
|
-
|
7508
|
-
|
7509
|
-
|
7510
|
-
|
7511
|
-
|
7512
|
-
|
7513
|
-
|
7541
|
+
print("\n⤵ Summary Statistics:")
|
7542
|
+
display(res_qc["summary_statistics"])
|
7543
|
+
print("\n⤵ Data Types:")
|
7544
|
+
display(res_qc["data_types"])
|
7545
|
+
if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
|
7546
|
+
print(" ⤵ Missing Values Counts:")
|
7547
|
+
display(res_qc["missing_values"][res_qc["missing_values"] > 0])
|
7548
|
+
# print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
|
7549
|
+
print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
|
7550
|
+
|
7551
|
+
if any(res_qc["outlier_num"]):
|
7552
|
+
print("\n⤵ Outlier Report:")
|
7553
|
+
display(res_qc["outlier_num"])
|
7554
|
+
if any(res_qc["unique_values"]):
|
7555
|
+
print("\n⤵ Unique Values per Column:")
|
7556
|
+
display(res_qc["unique_values"])
|
7557
|
+
|
7558
|
+
print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
|
7559
|
+
|
7560
|
+
print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
|
7561
|
+
print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
|
7514
7562
|
|
7515
7563
|
if res_qc["empty_columns"]:
|
7516
|
-
print("\
|
7517
|
-
|
7518
|
-
print("\nOutlier Report:")
|
7519
|
-
print(res_qc["outlier_num"])
|
7520
|
-
print("\nPercentage of Values Replaced per Column:")
|
7521
|
-
print(res_qc["outlier_percentage"])
|
7564
|
+
print("\n⤵ Empty Columns:", res_qc["empty_columns"])
|
7522
7565
|
|
7523
|
-
|
7524
|
-
|
7525
|
-
|
7566
|
+
if any(res_qc["high_correlations"]):
|
7567
|
+
print("\n⤵ High Correlations (>|0.9|):")
|
7568
|
+
for col1, col2 in res_qc["high_correlations"]:
|
7569
|
+
print(f" {col1} and {col2}")
|
7526
7570
|
|
7527
7571
|
if "vif" in res_qc:
|
7528
|
-
print("\
|
7572
|
+
print("\n⤵ Features with High VIF (>|5|):")
|
7529
7573
|
print(res_qc["vif"])
|
7530
7574
|
|
7531
|
-
|
7532
|
-
|
7533
|
-
|
7534
|
-
|
7535
|
-
|
7536
|
-
|
7537
|
-
|
7538
|
-
|
7539
|
-
|
7540
|
-
|
7541
|
-
|
7542
|
-
|
7543
|
-
f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
|
7544
|
-
)
|
7545
|
-
|
7546
|
-
print("\nSummary Statistics:")
|
7547
|
-
print(res_qc["summary_statistics"])
|
7575
|
+
if any(res_qc["high_cardinality_categoricals"]):
|
7576
|
+
print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
|
7577
|
+
print(res_qc["high_cardinality_categoricals"])
|
7578
|
+
if any(res_qc["inconsistent_types"]):
|
7579
|
+
print("\n⤵ Inconsistent Data Types:")
|
7580
|
+
display(res_qc["inconsistent_types"])
|
7581
|
+
if any(res_qc["text_length_analysis"]):
|
7582
|
+
print("\n⤵ Text Length Analysis:")
|
7583
|
+
for col, stats in res_qc["text_length_analysis"].items():
|
7584
|
+
print(
|
7585
|
+
f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
|
7586
|
+
)
|
7548
7587
|
|
7549
7588
|
if res_qc["warnings"]:
|
7550
7589
|
print("\nWarnings:")
|
7551
7590
|
for warning in res_qc["warnings"]:
|
7552
7591
|
print(" -", warning)
|
7553
7592
|
if plot_:
|
7554
|
-
df_qc_plots(data=data, res_qc=res_qc, max_cols=
|
7593
|
+
df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
|
7594
|
+
if dir_save:
|
7595
|
+
try:
|
7596
|
+
figsave(dir_save)
|
7597
|
+
except Exception as e:
|
7598
|
+
print(f"⚠️: {e}")
|
7555
7599
|
if output:
|
7556
7600
|
return res_qc
|
7557
7601
|
return None
|
7558
7602
|
|
7559
7603
|
|
7560
|
-
def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
|
7604
|
+
def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
|
7561
7605
|
import matplotlib.pyplot as plt
|
7562
7606
|
import seaborn as sns
|
7563
7607
|
from .plot import subplot, figsets, get_color
|
@@ -7574,91 +7618,73 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7574
7618
|
)
|
7575
7619
|
if len(missing_data) > max_cols:
|
7576
7620
|
missing_data = missing_data[:max_cols]
|
7577
|
-
|
7578
|
-
|
7579
|
-
|
7580
|
-
hue=missing_data.index,
|
7581
|
-
palette=get_color(len(missing_data), cmap="
|
7621
|
+
ax_missing_data=sns.barplot(
|
7622
|
+
y=missing_data.index,
|
7623
|
+
x=missing_data.values,
|
7624
|
+
hue=missing_data.index,
|
7625
|
+
palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
|
7582
7626
|
ax=nexttile(),
|
7583
7627
|
)
|
7584
|
-
figsets(
|
7585
|
-
|
7586
|
-
ax2 = ax.twinx()
|
7587
|
-
# Plot missing value percentages
|
7588
|
-
missing_percentage = res_qc["missing_percentage"][
|
7589
|
-
res_qc["missing_percentage"] > 0
|
7590
|
-
].sort_values(ascending=False)
|
7591
|
-
sns.barplot(
|
7592
|
-
x=missing_percentage.index,
|
7593
|
-
y=missing_percentage.values,
|
7594
|
-
hue=missing_percentage.index,
|
7595
|
-
palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
|
7596
|
-
ax=ax2,#nexttile(),
|
7597
|
-
)
|
7598
|
-
figsets(xangle=45, ylabel="%",ax=ax2)
|
7599
|
-
ax2.tick_params(axis="y", color='r',labelcolor='r')
|
7600
|
-
ax2.yaxis.label.set_color('r')
|
7628
|
+
figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)
|
7601
7629
|
|
7602
7630
|
outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
|
7603
7631
|
if len(outlier_num) > max_cols:
|
7604
7632
|
outlier_num = outlier_num[:max_cols]
|
7605
7633
|
ax_outlier_num=sns.barplot(
|
7606
|
-
|
7607
|
-
|
7634
|
+
y=outlier_num.index,
|
7635
|
+
x=outlier_num.values,
|
7608
7636
|
hue=outlier_num.index,
|
7609
7637
|
palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
|
7610
7638
|
ax=nexttile(),
|
7611
7639
|
)
|
7612
|
-
figsets(
|
7613
|
-
|
7614
|
-
|
7615
|
-
|
7616
|
-
|
7617
|
-
|
7618
|
-
|
7619
|
-
|
7620
|
-
|
7621
|
-
|
7622
|
-
|
7623
|
-
|
7624
|
-
|
7625
|
-
|
7626
|
-
|
7627
|
-
|
7628
|
-
|
7629
|
-
|
7630
|
-
|
7631
|
-
ax2.tick_params(axis="y", color='r',labelcolor='r')
|
7632
|
-
ax2.yaxis.label.set_color('r')
|
7640
|
+
figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
|
7641
|
+
|
7642
|
+
#!
|
7643
|
+
try:
|
7644
|
+
if data.select_dtypes(include=np.number).shape[1]<=10:
|
7645
|
+
for col in data.select_dtypes(include=np.number).columns:
|
7646
|
+
sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
|
7647
|
+
figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
|
7648
|
+
except:
|
7649
|
+
pass
|
7650
|
+
#!
|
7651
|
+
try:
|
7652
|
+
for col in data.select_dtypes(include='category').columns:
|
7653
|
+
sns.countplot(y=data[col],
|
7654
|
+
palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
|
7655
|
+
ax=nexttile())
|
7656
|
+
figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
|
7657
|
+
except Exception as e:
|
7658
|
+
pass
|
7633
7659
|
|
7634
7660
|
# Skewness and Kurtosis Plots
|
7635
7661
|
skewness = res_qc["skewness"].sort_values(ascending=False)
|
7636
7662
|
kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
|
7637
7663
|
if not skewness.empty:
|
7638
7664
|
ax_skewness=sns.barplot(
|
7639
|
-
|
7640
|
-
|
7665
|
+
y=skewness.index,
|
7666
|
+
x=skewness.values,
|
7641
7667
|
hue=skewness.index,
|
7642
7668
|
palette=get_color(len(skewness), cmap="coolwarm")[::-1],
|
7643
7669
|
ax=nexttile(),
|
7644
7670
|
)
|
7645
7671
|
figsets(
|
7646
|
-
xangle=45,
|
7647
7672
|
title="Highly Skewed Numeric Columns (Skewness > 1)",
|
7648
|
-
|
7673
|
+
xlabel="Skewness",ylabel=None,ax=ax_skewness,
|
7674
|
+
fontsize=8 if len(skewness)<=20 else 6
|
7649
7675
|
)
|
7650
7676
|
if not kurtosis.empty:
|
7651
7677
|
ax_kurtosis=sns.barplot(
|
7652
|
-
|
7653
|
-
|
7678
|
+
y=kurtosis.index,
|
7679
|
+
x=kurtosis.values,
|
7654
7680
|
hue=kurtosis.index,
|
7655
7681
|
palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
|
7656
7682
|
ax=nexttile(),
|
7657
7683
|
)
|
7658
7684
|
figsets(
|
7659
|
-
xangle=45,
|
7660
7685
|
title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
|
7661
|
-
|
7686
|
+
xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
|
7687
|
+
fontsize=8 if len(kurtosis)<=20 else 6
|
7662
7688
|
)
|
7663
7689
|
|
7664
7690
|
# Entropy for Categorical Variables
|
@@ -7666,56 +7692,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7666
7692
|
ascending=False
|
7667
7693
|
)
|
7668
7694
|
ax_entropy_data=sns.barplot(
|
7669
|
-
|
7695
|
+
y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
|
7696
|
+
palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
|
7697
|
+
ax=nexttile()
|
7670
7698
|
)
|
7671
7699
|
figsets(
|
7672
|
-
|
7673
|
-
xlabel="Categorical Columns",
|
7700
|
+
ylabel="Categorical Columns",
|
7674
7701
|
title="Entropy of Categorical Variables",
|
7675
|
-
|
7676
|
-
ax=ax_entropy_data
|
7677
|
-
|
7678
|
-
|
7679
|
-
|
7680
|
-
data=data[res_qc["distribution_analysis"].index],
|
7681
|
-
orient="v",
|
7682
|
-
palette="Set3",
|
7683
|
-
ax=nexttile(),
|
7684
|
-
)
|
7685
|
-
figsets(
|
7686
|
-
xangle=45,
|
7687
|
-
title="Range for Numeric Columns",
|
7688
|
-
ylabel="#",
|
7689
|
-
ax=ax_iqr
|
7690
|
-
)
|
7702
|
+
xlabel="Entropy (bits)",
|
7703
|
+
ax=ax_entropy_data,
|
7704
|
+
fontsize=8 if len(entropy_data)<=20 else 6
|
7705
|
+
)
|
7706
|
+
|
7691
7707
|
# unique counts
|
7692
7708
|
unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
|
7693
7709
|
ax_unique_counts_=sns.barplot(
|
7694
|
-
|
7695
|
-
|
7710
|
+
y=unique_counts.index,
|
7711
|
+
x=unique_counts.values,
|
7696
7712
|
hue=unique_counts.index,
|
7697
|
-
palette=get_color(len(unique_counts)
|
7713
|
+
palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
|
7698
7714
|
ax=nexttile())
|
7699
7715
|
figsets(
|
7700
|
-
xangle=45,
|
7701
7716
|
title="Unique Counts",
|
7702
|
-
|
7703
|
-
|
7704
|
-
ax=ax_unique_counts_
|
7717
|
+
ylabel=None,
|
7718
|
+
xlabel="#",
|
7719
|
+
ax=ax_unique_counts_,
|
7720
|
+
fontsize=8 if len(unique_counts)<=20 else 6
|
7705
7721
|
)
|
7706
7722
|
# Binary Checking
|
7707
|
-
ax_unique_counts=sns.barplot(
|
7708
|
-
|
7709
|
-
hue=unique_counts[unique_counts<
|
7710
|
-
palette=get_color(len(unique_counts[unique_counts<
|
7723
|
+
ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
|
7724
|
+
x=unique_counts[unique_counts<8].values,
|
7725
|
+
hue=unique_counts[unique_counts<8].index,
|
7726
|
+
palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
|
7711
7727
|
ax=nexttile())
|
7712
|
-
plt.
|
7728
|
+
plt.axvline(x=2, color="r", linestyle="--", lw=2)
|
7713
7729
|
figsets(
|
7714
|
-
|
7715
|
-
xlabel=None,
|
7730
|
+
ylabel=None,
|
7716
7731
|
title="Binary Checking",
|
7717
|
-
|
7718
|
-
ax=ax_unique_counts
|
7732
|
+
xlabel="#",
|
7733
|
+
ax=ax_unique_counts,
|
7734
|
+
fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
|
7719
7735
|
)
|
7720
7736
|
|
7721
7737
|
# dtypes counts
|
@@ -7751,14 +7767,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7751
7767
|
ha="center",
|
7752
7768
|
va="top",
|
7753
7769
|
c="k",
|
7754
|
-
fontsize=8,
|
7770
|
+
fontsize=8 if len(dtype_counts.index)<=20 else 6,
|
7755
7771
|
rotation=0,
|
7756
7772
|
)
|
7757
7773
|
figsets(
|
7758
7774
|
xlabel=None,
|
7759
7775
|
title="Dtypes",
|
7760
7776
|
ylabel="#",
|
7761
|
-
ax=ax_dtype_counts
|
7777
|
+
ax=ax_dtype_counts,
|
7778
|
+
fontsize=8 if len(dtype_counts.index)<=20 else 6,
|
7762
7779
|
)
|
7763
7780
|
|
7764
7781
|
# High cardinality: Show top categorical columns by unique value count
|
@@ -7772,24 +7789,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7772
7789
|
|
7773
7790
|
if high_cardinality:
|
7774
7791
|
ax_high_cardinality=sns.barplot(
|
7775
|
-
|
7776
|
-
|
7792
|
+
y=list(high_cardinality.keys()),
|
7793
|
+
x=list(high_cardinality.values()),
|
7777
7794
|
hue=list(high_cardinality.keys()),
|
7778
|
-
palette=
|
7795
|
+
palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
|
7796
|
+
ax=nexttile(),
|
7779
7797
|
)
|
7780
7798
|
figsets(
|
7781
|
-
xangle=45,
|
7782
7799
|
title="High Cardinality Categorical Columns",
|
7783
|
-
|
7784
|
-
ax=ax_high_cardinality
|
7800
|
+
xlabel="Unique Value Count",
|
7801
|
+
ax=ax_high_cardinality,
|
7802
|
+
fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
|
7785
7803
|
)
|
7786
7804
|
if res_qc["low_variance_features"]:
|
7787
7805
|
low_variance_data = data[res_qc["low_variance_features"]].copy()
|
7788
7806
|
for col in low_variance_data.columns:
|
7789
|
-
sns.histplot(
|
7807
|
+
ax_low_variance_features=sns.histplot(
|
7790
7808
|
low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
|
7791
7809
|
)
|
7792
|
-
|
7810
|
+
figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
|
7811
|
+
fontsize=8 if len(low_variance_data[col])<=20 else 6)
|
7793
7812
|
|
7794
7813
|
# VIF plot for multicollinearity detection
|
7795
7814
|
if "vif" in res_qc and not res_qc["vif"].empty:
|
@@ -7800,23 +7819,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7800
7819
|
x="VIF",
|
7801
7820
|
y="feature",
|
7802
7821
|
hue="VIF",
|
7803
|
-
palette=get_color(len(vif_data)
|
7822
|
+
palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
|
7804
7823
|
ax=nexttile())
|
7805
7824
|
figsets(
|
7806
|
-
xangle=45,
|
7807
7825
|
title="Variance Inflation Factor(VIF)",
|
7808
|
-
xlabel="
|
7826
|
+
xlabel="VIF",
|
7809
7827
|
ylabel="Features",
|
7810
7828
|
legend=None,
|
7811
|
-
ax=ax_vif
|
7829
|
+
ax=ax_vif,
|
7830
|
+
fontsize=8 if len(vif_data)<=20 else 6
|
7812
7831
|
)
|
7813
7832
|
|
7814
7833
|
# Correlation heatmap for numeric columns with high correlation pairs
|
7815
7834
|
if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
|
7816
|
-
corr = data.select_dtypes(include=[np.number]).
|
7835
|
+
corr = data.select_dtypes(include=[np.number]).corr()
|
7817
7836
|
if corr.shape[1]<=33:
|
7818
7837
|
mask = np.triu(np.ones_like(corr, dtype=bool))
|
7819
|
-
# Dynamically scale fontsize based on the number of columns
|
7820
7838
|
num_columns = corr.shape[1]
|
7821
7839
|
fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
|
7822
7840
|
|
@@ -7826,7 +7844,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
|
|
7826
7844
|
annot=True,
|
7827
7845
|
cmap="coolwarm",
|
7828
7846
|
center=0,
|
7829
|
-
fmt=".
|
7847
|
+
fmt=".1f",
|
7830
7848
|
linewidths=0.5,
|
7831
7849
|
vmin=-1, vmax=1,
|
7832
7850
|
ax=nexttile(2, 2),
|