py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -4,6 +4,8 @@ import sys, os
 from IPython.display import display
 from typing import List, Optional, Union
 
+from regex import X
+
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -1828,16 +1830,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     # Check data types
     data_types = df.dtypes
     # messages.append(f"Data types of columns:\n{data_types}")
-
-    # Check for constant values across any column
-    constant_columns = df.columns[df.nunique() == 1].tolist()
-    if constant_columns:
-        messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
-        is_abnormal = True
-        if verbose:
-            print(f"df.columns[df.nunique() == 1].tolist()")
-    if verbose:
-        print("5", is_abnormal)
+
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
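
The constant-column check removed above may matter to anyone relying on it downstream; a minimal standalone sketch of the same `df.nunique()` idiom (variable names illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 1], "b": [1, 2, 3]})
# nunique() counts distinct non-NaN values per column; == 1 flags constant columns
constant_columns = df.columns[df.nunique() == 1].tolist()
print(constant_columns)  # ['a']
```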
@@ -1989,30 +1982,29 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
 
-        engine = kwargs.pop("engine", "pyarrow")
-        sep = kwargs.pop("sep", "\t")
-        index_col = kwargs.pop("index_col", None)
-        memory_map = kwargs.pop("memory_map", False)
-        skipinitialspace = kwargs.pop("skipinitialspace", False)
-        encoding = kwargs.pop("encoding", "utf-8")
-        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
-        comment = kwargs.pop("comment", None)
-        fmt = kwargs.pop("fmt", False)
-        chunksize = kwargs.pop("chunksize", None)
+        engine = kwargs.pop("engine", "pyarrow")  # default: None
+        sep = kwargs.pop("sep", None)  # default: ','
+        index_col = kwargs.pop("index_col", None)  # default: None
+        memory_map = kwargs.pop("memory_map", False)  # default: False
+        skipinitialspace = kwargs.pop("skipinitialspace", False)  # default: False
+        encoding = kwargs.pop("encoding", "utf-8")  # default: "utf-8"
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")  # default: 'error'
+        comment = kwargs.pop("comment", None)  # default: None
+        fmt = kwargs.pop("fmt", False)
+        chunksize = kwargs.pop("chunksize", None)  # default: None
         engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
-        low_memory = kwargs.pop("low_memory", True)
+        low_memory = kwargs.pop("low_memory", True)  # default: True
         low_memory = (
             False if chunksize else True
-        )  # when chunksize, recommend low_memory=False
+        )  # when chunksize, recommend low_memory=False
         verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv", verbose=verbose)
 
-        if comment is None:
+        if comment is None:  # default: None
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
         try:
             df = pd.read_csv(
                 fpath,
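
For context, the `kwargs.pop(key, default)` chain above lets callers override any of these `pd.read_csv` options while untouched ones keep the values chosen here; whatever remains in `kwargs` passes straight through. A minimal sketch of the same pattern (the function name is illustrative, not part of py2ls):

```python
import pandas as pd

def load_csv_sketch(fpath, **kwargs):
    # pop the options we want to default differently from pandas
    engine = kwargs.pop("engine", "pyarrow")
    encoding = kwargs.pop("encoding", "utf-8")
    # everything else falls through to pd.read_csv untouched
    return pd.read_csv(fpath, engine=engine, encoding=encoding, **kwargs)

# a caller overrides one default and inherits the rest:
# df = load_csv_sketch("data.csv", encoding="latin-1")
```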
@@ -2107,8 +2099,8 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 sep2show = sep if sep != "\t" else "\\t"
-                # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
-                # print(".")
+                if verbose:
+                    print(f'trying with: engine=pyarrow, sep="{sep2show}"')
                 try:
                     df = pd.read_csv(
                         fpath,
@@ -2137,8 +2129,9 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 try:
-                    # sep2show = sep if sep != "\t" else "\\t"
-                    # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                    sep2show = sep if sep != "\t" else "\\t"
+                    if verbose:
+                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                     # print(".")
                     df = pd.read_csv(
                         fpath,
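
The loop above retries `pd.read_csv` with each candidate separator until one parses; a condensed standalone sketch of the idea, without py2ls' engine fallbacks and abnormality checks:

```python
import pandas as pd

def sniff_separator(fpath, separators=(",", "\t", ";", "|", " ")):
    # return the first separator that yields more than one column
    for sep in separators:
        try:
            df = pd.read_csv(fpath, sep=sep, nrows=50)
            if df.shape[1] > 1:
                return sep
        except Exception:
            continue
    return None
```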
@@ -2171,8 +2164,9 @@ def fload(fpath, kind=None, **kwargs):
                     continue
             else:
                 pass
-        if is_df_abnormal(df, verbose=verbose):
-            df = pd.read_csv(fpath, **kwargs)
+        print(kwargs)
+        # if is_df_abnormal(df, verbose=verbose):
+        #     df = pd.read_csv(fpath, **kwargs)
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
@@ -2386,7 +2380,7 @@ def fload(fpath, kind=None, **kwargs):
     elif kind == "xml":
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
-        verbose = kwargs.pop("verbose", False)
+        # verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
@@ -5236,15 +5230,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
 
+def df_circular(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
+    """
+    Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
+    Usage:
+        data = pd.DataFrame({'month': [1, 4, 7, 10, 12]})  # just months as an example
+        # encode 'month' cyclically
+        data = df_circular(data, 'month', 12)
+    """
+    if columns is None:
+        columns = list(data.columns)  # if no columns specified, use all columns
+    if max_val is None:
+        max_val = np.max(data[columns])  # if no max_val specified, use the maximum value across all columns
+    if isinstance(columns, str):
+        columns = [columns]  # if a single column name is provided as a string, convert it to a list
+
+    # Check if inplace is True, so we modify the original dataframe
+    if inplace:
+        # Modify the data in place, no return statement needed
+        for col in columns:
+            data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
+            data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
+    else:
+        # If inplace is False, return the modified dataframe
+        new_data = data.copy()
+        for col in columns:
+            new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
+            new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
+        return new_data
+
 
 # ! DataFrame
 def df_astype(
     data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
-    astype: str = "datetime",
+    astype: str = None,  # "datetime"
     skip_row: Union[str, list] = None,
     fmt: Optional[str] = None,
-    inplace: bool = True,
+    inplace: bool = False,
     errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
     **kwargs,
 ) -> Optional[pd.DataFrame]:
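
The point of the sin/cos pair in `df_circular`: month 12 and month 1 are adjacent on the calendar, so they should be close in feature space, which a raw integer encoding misses. A worked example using the same formulas:

```python
import numpy as np
import pandas as pd

data = pd.DataFrame({"month": [1, 6, 12]})
max_val = 12
data["month_sin"] = np.sin(2 * np.pi * data["month"] / max_val)
data["month_cos"] = np.cos(2 * np.pi * data["month"] / max_val)
print(data.round(2))
# month 12 maps to (sin=0.0, cos=1.0), right next to month 1 at (0.5, 0.87),
# while month 6 sits on the opposite side of the unit circle at (0.0, -1.0)
```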
@@ -5304,6 +5327,7 @@ def df_astype(
         "day",
         "month",
         "year",
+        "circular"
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
@@ -5398,10 +5422,22 @@ def df_astype(
                 kwargs.pop("errors", None)
                 data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
+            elif astype == "circular":
+                max_val = kwargs.get('max_val', None)
+                data[column] = df_circular(data=data, columns=column, max_val=max_val)
             else:
                 # Convert to other types (e.g., float, int)
-                data[column] = data[column].astype(astype)
+                if astype == 'int':
+                    data[column] = data[column].astype('float').astype('int')
+                else:
+                    data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
+            # format
+            try:
+                if fmt is not None:
+                    data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
+            except Exception as e:
+                print(f"Error while setting the format: {e}")
         except Exception as e:
             print(f"Error converting '{column}' to {astype}: {e}")
             try:
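
Based on the new branch above, the circular encoding should be reachable through `df_astype`, with `max_val` picked up from `**kwargs`; a hedged usage sketch inferred from this diff, not from py2ls documentation (note the branch assigns `df_circular`'s full return value to a single column, so the resulting shape may need checking):

```python
# assumes py2ls is installed and this release's df_astype is importable
import pandas as pd
from py2ls.ips import df_astype

data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
# astype="circular" routes to df_circular; max_val travels via kwargs.get("max_val")
encoded = df_astype(data, columns="month", astype="circular", max_val=12)
```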
@@ -6325,6 +6361,7 @@ def df_reducer(
     random_state=1,
     ax=None,
     figsize=None,
+    verbose=True,
     **kwargs,
 ) -> pd.DataFrame:
     dict_methods = {
@@ -6364,7 +6401,8 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
-    print(f"\nprocessing with using {dict_methods[method]}:")
+    if verbose:
+        print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
     if columns is None:
         columns = data.select_dtypes(include="number").columns.tolist()
@@ -6863,7 +6901,7 @@ def df_reducer(
             hue=hue,
             s=size,
             edgecolor=edgecolor,
-            kind="scater",
+            kind_="scater",
             figsets=dict(
                 legend=dict(loc=legend_loc, markerscale=markerscale, bbox_to_anchor=bbox_to_anchor, ncols=ncols, fontsize=8),
                 xlabel=xlabel if xlabel else None,
@@ -7334,10 +7372,13 @@ def evaluate_cluster(
 def df_qc(
     data: pd.DataFrame,
     columns=None,
-    verbose=False,
+    skim=False,
     plot_=True,
     max_cols=20,  # only for plots
+    hue=None,
     output=False,
+    verbose=True,
+    dir_save=None
 ):
     """
     Usage example:
@@ -7345,22 +7386,24 @@ def df_qc(
     """
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
-    import skimpy
-
+
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
         if isinstance(columns, (list, pd.core.indexes.base.Index)):
             data = data[columns]
-    try:
-        skimpy.skim(data)
-    except:
-        numerical_data = data.select_dtypes(include=[np.number])
-        skimpy.skim(numerical_data)
+    if skim:
+        try:
+            import skimpy
+            skimpy.skim(data)
+        except:
+            numerical_data = data.select_dtypes(include=[np.number])
+            skimpy.skim(numerical_data)
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     res_qc = {}
+    print(f"data.shape: {data.shape}")
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
@@ -7403,7 +7446,7 @@ def df_qc(
     numeric_df = data.select_dtypes(include=[np.number]).dropna()
     vif_data = pd.DataFrame()
     res_qc["vif"] = vif_data
-    if numeric_df.shape[1] > 1:
+    if numeric_df.shape[1] > 1 and not numeric_df.empty:
         vif_data["feature"] = numeric_df.columns
         vif_data["VIF"] = [
             variance_inflation_factor(numeric_df.values, i)
@@ -7495,69 +7538,70 @@ def df_qc(
     # Report generation
     if verbose:
         print("=== QC Report Summary ===")
-        print("\nMissing Values (Total and %):")
-        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
-        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
-
-        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
-
-        print("\nData Types:")
-        print(res_qc["data_types"])
-
-        print("\nUnique Values per Column:")
-        print(res_qc["unique_values"])
-
-        print("\nConstant Columns:", res_qc["constant_columns"])
-
-        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
-        print("Duplicate Columns:", res_qc["duplicate_columns"])
+        print("\n⤵ Summary Statistics:")
+        display(res_qc["summary_statistics"])
+        print("\n⤵ Data Types:")
+        display(res_qc["data_types"])
+        if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
+            print(" ⤵ Missing Values Counts:")
+            display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+        print("\n⤵ Rows with Missing Values:", res_qc["rows_with_missing"])
+
+        if any(res_qc["outlier_num"]):
+            print("\n⤵ Outlier Report:")
+            display(res_qc["outlier_num"])
+        if any(res_qc["unique_values"]):
+            print("\n⤵ Unique Values per Column:")
+            display(res_qc["unique_values"])
+
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
         if res_qc["empty_columns"]:
-            print("\nEmpty Columns:", res_qc["empty_columns"])
-
-        print("\nOutlier Report:")
-        print(res_qc["outlier_num"])
-        print("\nPercentage of Values Replaced per Column:")
-        print(res_qc["outlier_percentage"])
+            print("\n⤵ Empty Columns:", res_qc["empty_columns"])
 
-        print("\nHigh Correlations (>|0.9|):")
-        for col1, col2 in res_qc["high_correlations"]:
-            print(f"  {col1} and {col2}")
+        if any(res_qc["high_correlations"]):
+            print("\n⤵ High Correlations (>|0.9|):")
+            for col1, col2 in res_qc["high_correlations"]:
+                print(f"  {col1} and {col2}")
 
         if "vif" in res_qc:
-            print("\nFeatures with High VIF (>|5|):")
+            print("\n⤵ Features with High VIF (>|5|):")
             print(res_qc["vif"])
 
-        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
-        print(res_qc["high_cardinality_categoricals"])
-
-        print("\nInconsistent Data Types:")
-        print(res_qc["inconsistent_types"])
-
-        print("\nRange Checks for Numeric Columns:")
-        print(res_qc["range_checks"])
-
-        print("\nText Length Analysis:")
-        for col, stats in res_qc["text_length_analysis"].items():
-            print(
-                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
-            )
-
-        print("\nSummary Statistics:")
-        print(res_qc["summary_statistics"])
+        if any(res_qc["high_cardinality_categoricals"]):
+            print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
+            print(res_qc["high_cardinality_categoricals"])
+        if any(res_qc["inconsistent_types"]):
+            print("\n⤵ Inconsistent Data Types:")
+            display(res_qc["inconsistent_types"])
+        if any(res_qc["text_length_analysis"]):
+            print("\n⤵ Text Length Analysis:")
+            for col, stats in res_qc["text_length_analysis"].items():
+                print(
+                    f"{col}: Avg Length={round(stats['avg_length'], 1)}, Length Variance={round(stats['length_variance'], 1)}"
+                )
 
         if res_qc["warnings"]:
             print("\nWarnings:")
             for warning in res_qc["warnings"]:
                 print("  -", warning)
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols, hue=hue)
+        if dir_save:
+            try:
+                figsave(dir_save)
+            except Exception as e:
+                print(f"⚠️: {e}")
     if output:
         return res_qc
     return None
 
 
-def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict = None, max_cols=20):
+def df_qc_plots(data: pd.DataFrame, columns=None, res_qc: dict = None, max_cols=20, hue=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
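
Pulling the reworked signature together, a hedged usage sketch of the new `df_qc` (parameter behavior inferred from this diff: `skim` toggles the optional skimpy summary, `verbose` the printed report, `plot_`/`hue` the plots, `dir_save` a figsave call):

```python
# assumes py2ls is installed; behavior inferred from the diff above
import pandas as pd
from py2ls.ips import df_qc

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, None], "y": ["a", "a", "b", "b"]})
res = df_qc(
    df,
    skim=False,    # skip the optional skimpy summary
    verbose=True,  # print the "QC Report Summary"
    plot_=False,   # set True (optionally with hue=...) to draw QC plots
    output=True,   # return the res_qc dict instead of None
)
```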
@@ -7574,91 +7618,73 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
     )
     if len(missing_data) > max_cols:
         missing_data = missing_data[:max_cols]
-    ax = sns.barplot(
-        x=missing_data.index,
-        y=missing_data.values,
-        hue=missing_data.index,
-        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+    ax_missing_data = sns.barplot(
+        y=missing_data.index,
+        x=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(xangle=45, title="Missing (#)", ylabel="#", ax=ax)
-
-    ax2 = ax.twinx()
-    # Plot missing value percentages
-    missing_percentage = res_qc["missing_percentage"][
-        res_qc["missing_percentage"] > 0
-    ].sort_values(ascending=False)
-    sns.barplot(
-        x=missing_percentage.index,
-        y=missing_percentage.values,
-        hue=missing_percentage.index,
-        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
-        ax=ax2,  # nexttile(),
-    )
-    figsets(xangle=45, ylabel="%", ax=ax2)
-    ax2.tick_params(axis="y", color='r', labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(title="Missing (#)", xlabel="#", ax=ax_missing_data, ylabel=None, fontsize=8 if len(missing_data) <= 20 else 6)
 
     outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
     if len(outlier_num) > max_cols:
         outlier_num = outlier_num[:max_cols]
     ax_outlier_num = sns.barplot(
-        x=outlier_num.index,
-        y=outlier_num.values,
+        y=outlier_num.index,
+        x=outlier_num.values,
         hue=outlier_num.index,
         palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(xangle=45, title="Outliers (#)", ylabel="#", xlabel=None)
-    ax_outlier_percentage = ax_outlier_num.twinx()
-    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
-    if len(outlier_percentage) > max_cols:
-        outlier_percentage = outlier_percentage[:max_cols]
-    ax_outlier_percentage = sns.barplot(
-        x=outlier_percentage.index,
-        y=outlier_percentage.values,
-        hue=outlier_percentage.index,
-        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
-        ax=ax2  # nexttile(),
-    )
-    figsets(
-        xangle=45,
-        ylabel="%",
-        xlabel=None,
-        ylim=[0, outlier_percentage.max() + 2],
-        ax=ax_outlier_percentage
-    )
-    ax2.tick_params(axis="y", color='r', labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(ax=ax_outlier_num, title="Outliers (#)", xlabel="#", ylabel=None, fontsize=8 if len(outlier_num) <= 20 else 6)
+
+    #!
+    try:
+        if data.select_dtypes(include=np.number).shape[1] <= 10:
+            for col in data.select_dtypes(include=np.number).columns:
+                sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
+                figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
+    except:
+        pass
+    #!
+    try:
+        for col in data.select_dtypes(include='category').columns:
+            sns.countplot(y=data[col],
+                          palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
+                          ax=nexttile())
+            figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
+    except Exception as e:
+        pass
 
     # Skewness and Kurtosis Plots
     skewness = res_qc["skewness"].sort_values(ascending=False)
     kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
     if not skewness.empty:
         ax_skewness = sns.barplot(
-            x=skewness.index,
-            y=skewness.values,
+            y=skewness.index,
+            x=skewness.values,
             hue=skewness.index,
             palette=get_color(len(skewness), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Skewed Numeric Columns (Skewness > 1)",
-            ylabel="Skewness", xlabel=None, ax=ax_skewness
+            xlabel="Skewness", ylabel=None, ax=ax_skewness,
+            fontsize=8 if len(skewness) <= 20 else 6
         )
     if not kurtosis.empty:
         ax_kurtosis = sns.barplot(
-            x=kurtosis.index,
-            y=kurtosis.values,
+            y=kurtosis.index,
+            x=kurtosis.values,
             hue=kurtosis.index,
             palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
-            ylabel="Kurtosis", xlabel=None, ax=ax_kurtosis
+            xlabel="Kurtosis", ylabel=None, ax=ax_kurtosis,
+            fontsize=8 if len(kurtosis) <= 20 else 6
         )
 
     # Entropy for Categorical Variables
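
A recurring edit in this hunk and the ones below: the QC bar charts swap `x=` and `y=` so category labels sit on the y-axis instead of being rotated 45°. The core seaborn pattern, isolated:

```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

counts = pd.Series({"a_rather_long_column_name": 5, "another_long_name": 3})
# horizontal bars: categories on y, values on x, so long labels stay readable
sns.barplot(y=counts.index, x=counts.values, hue=counts.index, palette="coolwarm")
plt.xlabel("#")
plt.show()
```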
@@ -7666,56 +7692,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
         ascending=False
     )
     ax_entropy_data = sns.barplot(
-        x=entropy_data.index, y=entropy_data.values, hue=entropy_data.index, palette="viridis", ax=nexttile()
+        y=entropy_data.index, x=entropy_data.values, hue=entropy_data.index,
+        palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
+        ax=nexttile()
     )
     figsets(
-        xangle=45,
-        xlabel="Categorical Columns",
+        ylabel="Categorical Columns",
         title="Entropy of Categorical Variables",
-        ylabel="Entropy (bits)",
-        ax=ax_entropy_data
-    )
-    # Distribution Analysis: Boxplot for IQR
-    ax_iqr = sns.boxplot(
-        data=data[res_qc["distribution_analysis"].index],
-        orient="v",
-        palette="Set3",
-        ax=nexttile(),
-    )
-    figsets(
-        xangle=45,
-        title="Range for Numeric Columns",
-        ylabel="#",
-        ax=ax_iqr
-    )
+        xlabel="Entropy (bits)",
+        ax=ax_entropy_data,
+        fontsize=8 if len(entropy_data) <= 20 else 6
+    )
+
     # unique counts
     unique_counts = res_qc["unique_counts"].sort_values(ascending=False)
     ax_unique_counts_ = sns.barplot(
-        x=unique_counts.index,
-        y=unique_counts.values,
+        y=unique_counts.index,
+        x=unique_counts.values,
         hue=unique_counts.index,
-        palette=get_color(len(unique_counts) + 10, cmap="Blues")[::-1],
+        palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
         ax=nexttile())
     figsets(
-        xangle=45,
         title="Unique Counts",
-        xlabel=None,
-        ylabel="#",
-        ax=ax_unique_counts_
+        ylabel=None,
+        xlabel="#",
+        ax=ax_unique_counts_,
+        fontsize=8 if len(unique_counts) <= 20 else 6
     )
     # Binary Checking
-    ax_unique_counts = sns.barplot(x=unique_counts[unique_counts < 10].index,
-        y=unique_counts[unique_counts < 10].values,
-        hue=unique_counts[unique_counts < 10].index,
-        palette=get_color(len(unique_counts[unique_counts < 10]) + 10, cmap="Blues")[::-1],
+    ax_unique_counts = sns.barplot(y=unique_counts[unique_counts < 8].index,
+        x=unique_counts[unique_counts < 8].values,
+        hue=unique_counts[unique_counts < 8].index,
+        palette=get_color(len(unique_counts[unique_counts < 8].index), cmap="coolwarm")[::-1],
         ax=nexttile())
-    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    plt.axvline(x=2, color="r", linestyle="--", lw=2)
     figsets(
-        xangle=45,
-        xlabel=None,
+        ylabel=None,
         title="Binary Checking",
-        ylabel="#",
-        ax=ax_unique_counts
+        xlabel="#",
+        ax=ax_unique_counts,
+        fontsize=8 if len(unique_counts[unique_counts < 10].index) <= 20 else 6
     )
 
     # dtypes counts
@@ -7751,14 +7767,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
             ha="center",
             va="top",
             c="k",
-            fontsize=8,
+            fontsize=8 if len(dtype_counts.index) <= 20 else 6,
             rotation=0,
         )
     figsets(
         xlabel=None,
         title="Dtypes",
         ylabel="#",
-        ax=ax_dtype_counts
+        ax=ax_dtype_counts,
+        fontsize=8 if len(dtype_counts.index) <= 20 else 6,
     )
 
     # High cardinality: Show top categorical columns by unique value count
@@ -7772,24 +7789,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
 
     if high_cardinality:
         ax_high_cardinality = sns.barplot(
-            x=list(high_cardinality.keys()),
-            y=list(high_cardinality.values()),
+            y=list(high_cardinality.keys()),
+            x=list(high_cardinality.values()),
             hue=list(high_cardinality.keys()),
-            palette="Oranges", ax=nexttile()
+            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
+            ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="High Cardinality Categorical Columns",
-            ylabel="Unique Value Count",
-            ax=ax_high_cardinality
+            xlabel="Unique Value Count",
+            ax=ax_high_cardinality,
+            fontsize=8 if len(list(high_cardinality.keys())) <= 20 else 6
         )
     if res_qc["low_variance_features"]:
         low_variance_data = data[res_qc["low_variance_features"]].copy()
         for col in low_variance_data.columns:
-            sns.histplot(
+            ax_low_variance_features = sns.histplot(
                 low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
             )
-            plt.title(f"Low Variance Feature: {col}")
+            figsets(title=f"Low Variance Feature: {col}", ax=ax_low_variance_features,
+                    fontsize=8 if len(low_variance_data[col]) <= 20 else 6)
 
     # VIF plot for multicollinearity detection
     if "vif" in res_qc and not res_qc["vif"].empty:
@@ -7800,23 +7819,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
             x="VIF",
             y="feature",
             hue="VIF",
-            palette=get_color(len(vif_data) + 10, cmap="Blues")[::-1],
+            palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
             ax=nexttile())
         figsets(
-            xangle=45,
             title="Variance Inflation Factor(VIF)",
-            xlabel="Variance Inflation Factor(VIF)",
+            xlabel="VIF",
             ylabel="Features",
             legend=None,
-            ax=ax_vif
+            ax=ax_vif,
+            fontsize=8 if len(vif_data) <= 20 else 6
         )
 
     # Correlation heatmap for numeric columns with high correlation pairs
     if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
-        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        corr = data.select_dtypes(include=[np.number]).corr()
         if corr.shape[1] <= 33:
             mask = np.triu(np.ones_like(corr, dtype=bool))
-            # Dynamically scale fontsize based on the number of columns
             num_columns = corr.shape[1]
             fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # Scale between 8 and 12
@@ -7826,7 +7844,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 annot=True,
                 cmap="coolwarm",
                 center=0,
-                fmt=".2f",
+                fmt=".1f",
                 linewidths=0.5,
                 vmin=-1, vmax=1,
                 ax=nexttile(2, 2),
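
For reference, the heatmap settings above combine an upper-triangle mask with one-decimal annotations; a standalone sketch of the same seaborn call:

```python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.rand(50, 4), columns=list("abcd"))
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))  # hide the redundant upper triangle
sns.heatmap(corr, mask=mask, annot=True, fmt=".1f", cmap="coolwarm",
            center=0, vmin=-1, vmax=1, linewidths=0.5)
plt.show()
```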