py2ls 0.2.4.22__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -4,6 +4,8 @@ import sys, os
4
4
  from IPython.display import display
5
5
  from typing import List, Optional, Union
6
6
 
7
+ from regex import X
8
+
7
9
  try:
8
10
  get_ipython().run_line_magic("load_ext", "autoreload")
9
11
  get_ipython().run_line_magic("autoreload", "2")
@@ -1828,16 +1830,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1828
1830
  # Check data types
1829
1831
  data_types = df.dtypes
1830
1832
  # messages.append(f"Data types of columns:\n{data_types}")
1831
-
1832
- # Check for constant values across any column
1833
- constant_columns = df.columns[df.nunique() == 1].tolist()
1834
- if constant_columns:
1835
- messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
1836
- is_abnormal = True
1837
- if verbose:
1838
- print(f"df.columns[df.nunique() == 1].tolist()")
1839
- if verbose:
1840
- print("5", is_abnormal)
1833
+
1841
1834
  # Check for an unreasonable number of rows or columns
1842
1835
  if actual_shape[0] < 2 or actual_shape[1] < 2:
1843
1836
  messages.append(
@@ -1989,30 +1982,29 @@ def fload(fpath, kind=None, **kwargs):
1989
1982
  def load_csv(fpath, **kwargs):
1990
1983
  from pandas.errors import EmptyDataError
1991
1984
 
1992
- engine = kwargs.pop("engine", "pyarrow")
1993
- sep = kwargs.pop("sep", "\t")
1994
- index_col = kwargs.pop("index_col", None)
1995
- memory_map = kwargs.pop("memory_map", False)
1996
- skipinitialspace = kwargs.pop("skipinitialspace", False)
1997
- encoding = kwargs.pop("encoding", "utf-8")
1998
- on_bad_lines = kwargs.pop("on_bad_lines", "skip")
1999
- comment = kwargs.pop("comment", None)
2000
- fmt = kwargs.pop("fmt", False)
2001
- chunksize = kwargs.pop("chunksize", None)
1985
+ engine = kwargs.pop("engine", "pyarrow")# default: None
1986
+ sep = kwargs.pop("sep", None)# default: ','
1987
+ index_col = kwargs.pop("index_col", None)# default: None
1988
+ memory_map = kwargs.pop("memory_map", False)# default: False
1989
+ skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
1990
+ encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
1991
+ on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
1992
+ comment = kwargs.pop("comment", None)# default: None
1993
+ fmt = kwargs.pop("fmt", False)# default:
1994
+ chunksize = kwargs.pop("chunksize", None)# default: None
2002
1995
  engine = "c" if chunksize else engine # when chunksize, recommend 'c'
2003
- low_memory = kwargs.pop("low_memory", True)
1996
+ low_memory = kwargs.pop("low_memory", True)# default: True
2004
1997
  low_memory = (
2005
1998
  False if chunksize else True
2006
- ) # when chunksize, recommend low_memory=False
1999
+ ) # when chunksize, recommend low_memory=False # default:
2007
2000
  verbose = kwargs.pop("verbose", False)
2008
2001
  if run_once_within():
2009
2002
  use_pd("read_csv", verbose=verbose)
2010
2003
 
2011
- if comment is None:
2004
+ if comment is None:# default: None
2012
2005
  comment = get_comment(
2013
2006
  fpath, comment=None, encoding="utf-8", lines_to_check=5
2014
2007
  )
2015
-
2016
2008
  try:
2017
2009
  df = pd.read_csv(
2018
2010
  fpath,
@@ -2107,8 +2099,8 @@ def fload(fpath, kind=None, **kwargs):
2107
2099
  separators = [",", "\t", ";", "|", " "]
2108
2100
  for sep in separators:
2109
2101
  sep2show = sep if sep != "\t" else "\\t"
2110
- # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
2111
- # print(".")
2102
+ if verbose:
2103
+ print(f'trying with: engine=pyarrow, sep="{sep2show}"')
2112
2104
  try:
2113
2105
  df = pd.read_csv(
2114
2106
  fpath,
@@ -2137,8 +2129,9 @@ def fload(fpath, kind=None, **kwargs):
2137
2129
  separators = [",", "\t", ";", "|", " "]
2138
2130
  for sep in separators:
2139
2131
  try:
2140
- # sep2show = sep if sep != "\t" else "\\t"
2141
- # print(f"trying with: engine={engine}, sep='{sep2show}'")
2132
+ sep2show = sep if sep != "\t" else "\\t"
2133
+ if verbose:
2134
+ print(f"trying with: engine={engine}, sep='{sep2show}'")
2142
2135
  # print(".")
2143
2136
  df = pd.read_csv(
2144
2137
  fpath,
@@ -2171,8 +2164,9 @@ def fload(fpath, kind=None, **kwargs):
2171
2164
  continue
2172
2165
  else:
2173
2166
  pass
2174
- if is_df_abnormal(df,verbose=verbose):
2175
- df=pd.read_csv(fpath,**kwargs)
2167
+ print(kwargs)
2168
+ # if is_df_abnormal(df,verbose=verbose):
2169
+ # df=pd.read_csv(fpath,**kwargs)
2176
2170
  display(df.head(2))
2177
2171
  print(f"shape: {df.shape}")
2178
2172
  return df
@@ -2386,7 +2380,7 @@ def fload(fpath, kind=None, **kwargs):
2386
2380
  elif kind == "xml":
2387
2381
  return load_xml(fpath)
2388
2382
  elif kind in ["csv", "tsv"]:
2389
- verbose = kwargs.pop("verbose", False)
2383
+ # verbose = kwargs.pop("verbose", False)
2390
2384
  if run_once_within():
2391
2385
  use_pd("read_csv")
2392
2386
  content = load_csv(fpath, **kwargs)
@@ -5236,15 +5230,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
5236
5230
  data = data.explode(column, ignore_index=True)
5237
5231
  return data
5238
5232
 
5233
def df_circular(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
    """
    Add sine/cosine (cyclic) encodings for periodic numeric columns.

    For every selected column ``c`` two new columns are created:
    ``c_sin = sin(2*pi*c / period)`` and ``c_cos = cos(2*pi*c / period)``.
    This keeps values at the two ends of a cycle (e.g. month 12 and month 1)
    close to each other, which is useful for machine-learning models.

    Args:
        data: Input DataFrame.
        columns: A column name, a list of column names, or None to encode
            every column of ``data``.
        max_val: Period of the cycle. Either a single number applied to all
            selected columns, a dict mapping column name -> period, or None.
            When None (or when a column is missing from the dict) the
            column's own maximum is used as its period.
            NOTE: the column maximum is only a correct period for 1-based
            cycles that actually reach their largest value (e.g. months
            1..12). For 0-based cycles such as hours 0..23 pass the period
            explicitly (``max_val=24``).
        inplace: If True, add the new columns to ``data`` and return None.
            If False, work on a copy and return it.

    Returns:
        The encoded copy when ``inplace`` is False, otherwise None.

    Raises:
        ValueError: if the period resolved for a column is zero.

    Usage:
        data = pd.DataFrame({'month': [1, 4, 7, 10, 12]})
        data = df_circular(data, 'month', 12)
    """
    # Normalise the column selection first so that the default period below
    # is always resolved one column at a time.
    if columns is None:
        columns = list(data.columns)
    elif isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    target = data if inplace else data.copy()

    # Resolve and validate every period before touching the frame, so a bad
    # column cannot leave an in-place call half applied.
    periods = {}
    for col in columns:
        period = max_val.get(col) if isinstance(max_val, dict) else max_val
        if period is None:
            # Per-column default: taking the maximum over several columns at
            # once would either misalign on division or force one shared
            # period onto unrelated columns.
            period = target[col].max()
        if period == 0:
            raise ValueError(
                f"Cannot encode column '{col}' cyclically: period (max_val) is 0"
            )
        periods[col] = period

    for col in columns:
        angle = 2 * np.pi * target[col] / periods[col]
        target[f"{col}_sin"] = np.sin(angle)
        target[f"{col}_cos"] = np.cos(angle)

    return None if inplace else target
5261
+
5239
5262
 
5240
5263
  # ! DataFrame
5241
5264
  def df_astype(
5242
5265
  data: pd.DataFrame,
5243
5266
  columns: Optional[Union[str, List[str]]] = None,
5244
- astype: str = "datetime",
5267
+ astype: str = None,#"datetime",
5245
5268
  skip_row: Union[str, list] = None,
5246
5269
  fmt: Optional[str] = None,
5247
- inplace: bool = True,
5270
+ inplace: bool = False,
5248
5271
  errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
5249
5272
  **kwargs,
5250
5273
  ) -> Optional[pd.DataFrame]:
@@ -5304,6 +5327,7 @@ def df_astype(
5304
5327
  "day",
5305
5328
  "month",
5306
5329
  "year",
5330
+ "circular"
5307
5331
  ]
5308
5332
  # If inplace is False, make a copy of the DataFrame
5309
5333
  if not inplace:
@@ -5398,10 +5422,22 @@ def df_astype(
5398
5422
  kwargs.pop("errors", None)
5399
5423
  data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
5400
5424
  # print(f"Successfully converted '{column}' to timedelta.")
5425
+ elif astype == "circular":
5426
+ max_val = kwargs.get('max_val',None)
5427
+ data[column]=df_circular(data=data,columns=column,max_val=max_val)
5401
5428
  else:
5402
5429
  # Convert to other types (e.g., float, int)
5403
- data[column] = data[column].astype(astype)
5430
+ if astype=='int':
5431
+ data[column] = data[column].astype('float').astype('int')
5432
+ else:
5433
+ data[column] = data[column].astype(astype)
5404
5434
  # print(f"Successfully converted '{column}' to {astype}.")
5435
+ # format
5436
+ try:
5437
+ if fmt is not None:
5438
+ data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
5439
+ except Exception as e:
5440
+ print(f"设置格式的时候有误: {e}")
5405
5441
  except Exception as e:
5406
5442
  print(f"Error converting '{column}' to {astype}: {e}")
5407
5443
  try:
@@ -6325,6 +6361,7 @@ def df_reducer(
6325
6361
  random_state=1,
6326
6362
  ax=None,
6327
6363
  figsize=None,
6364
+ verbose=True,
6328
6365
  **kwargs,
6329
6366
  ) -> pd.DataFrame:
6330
6367
  dict_methods = {
@@ -6364,7 +6401,8 @@ def df_reducer(
6364
6401
  # "autoencoder","nmf",
6365
6402
  ]
6366
6403
  method = strcmp(method, methods)[0]
6367
- print(f"\nprocessing with using {dict_methods[method]}:")
6404
+ if verbose:
6405
+ print(f"\nprocessing with using {dict_methods[method]}:")
6368
6406
  xlabel, ylabel = None, None
6369
6407
  if columns is None:
6370
6408
  columns = data.select_dtypes(include="number").columns.tolist()
@@ -6863,7 +6901,7 @@ def df_reducer(
6863
6901
  hue=hue,
6864
6902
  s=size,
6865
6903
  edgecolor=edgecolor,
6866
- kind="scater",
6904
+ kind_="scater",
6867
6905
  figsets=dict(
6868
6906
  legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
6869
6907
  xlabel=xlabel if xlabel else None,
@@ -7334,10 +7372,13 @@ def evaluate_cluster(
7334
7372
  def df_qc(
7335
7373
  data: pd.DataFrame,
7336
7374
  columns=None,
7337
- verbose=False,
7375
+ skim=False,
7338
7376
  plot_=True,
7339
7377
  max_cols=20, # only for plots
7378
+ hue=None,
7340
7379
  output=False,
7380
+ verbose=True,
7381
+ dir_save=None
7341
7382
  ):
7342
7383
  """
7343
7384
  Usage example:
@@ -7345,22 +7386,24 @@ def df_qc(
7345
7386
  """
7346
7387
  from statsmodels.stats.outliers_influence import variance_inflation_factor
7347
7388
  from scipy.stats import skew, kurtosis, entropy
7348
- import skimpy
7349
-
7389
+
7350
7390
  #! display(data.select_dtypes(include=[np.number]).describe())
7351
7391
  #!skim
7352
7392
  if columns is not None:
7353
7393
  if isinstance(columns, (list,pd.core.indexes.base.Index)):
7354
7394
  data=data[columns]
7355
- try:
7356
- skimpy.skim(data)
7357
- except:
7358
- numerical_data = data.select_dtypes(include=[np.number])
7359
- skimpy.skim(numerical_data)
7395
+ if skim:
7396
+ try:
7397
+ import skimpy
7398
+ skimpy.skim(data)
7399
+ except:
7400
+ numerical_data = data.select_dtypes(include=[np.number])
7401
+ skimpy.skim(numerical_data)
7360
7402
  # Fill completely NaN columns with a default value (e.g., 0)
7361
7403
  data = data.copy()
7362
7404
  data.loc[:, data.isna().all()] = 0
7363
7405
  res_qc = {}
7406
+ print(f"data.shape:{data.shape}")
7364
7407
 
7365
7408
  # Missing values
7366
7409
  res_qc["missing_values"] = data.isnull().sum()
@@ -7403,7 +7446,7 @@ def df_qc(
7403
7446
  numeric_df = data.select_dtypes(include=[np.number]).dropna()
7404
7447
  vif_data = pd.DataFrame()
7405
7448
  res_qc["vif"]=vif_data
7406
- if numeric_df.shape[1] > 1:
7449
+ if numeric_df.shape[1] > 1 and not numeric_df.empty:
7407
7450
  vif_data["feature"] = numeric_df.columns
7408
7451
  vif_data["VIF"] = [
7409
7452
  variance_inflation_factor(numeric_df.values, i)
@@ -7495,69 +7538,70 @@ def df_qc(
7495
7538
  # Report generation
7496
7539
  if verbose:
7497
7540
  print("=== QC Report Summary ===")
7498
- print("\nMissing Values (Total and %):")
7499
- print(res_qc["missing_values"][res_qc["missing_values"] > 0])
7500
- print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
7501
-
7502
- print("\nRows with Missing Values:", res_qc["rows_with_missing"])
7503
-
7504
- print("\nData Types:")
7505
- print(res_qc["data_types"])
7506
-
7507
- print("\nUnique Values per Column:")
7508
- print(res_qc["unique_values"])
7509
-
7510
- print("\nConstant Columns:", res_qc["constant_columns"])
7511
-
7512
- print("\nDuplicate Rows:", res_qc["duplicate_rows"])
7513
- print("Duplicate Columns:", res_qc["duplicate_columns"])
7541
+ print("\n⤵ Summary Statistics:")
7542
+ display(res_qc["summary_statistics"])
7543
+ print("\n⤵ Data Types:")
7544
+ display(res_qc["data_types"])
7545
+ if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
7546
+ print(" ⤵ Missing Values Counts:")
7547
+ display(res_qc["missing_values"][res_qc["missing_values"] > 0])
7548
+ # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
7549
+ print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
7550
+
7551
+ if any(res_qc["outlier_num"]):
7552
+ print("\n⤵ Outlier Report:")
7553
+ display(res_qc["outlier_num"])
7554
+ if any(res_qc["unique_values"]):
7555
+ print("\n⤵ Unique Values per Column:")
7556
+ display(res_qc["unique_values"])
7557
+
7558
+ print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
7559
+
7560
+ print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
7561
+ print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
7514
7562
 
7515
7563
  if res_qc["empty_columns"]:
7516
- print("\nEmpty Columns:", res_qc["empty_columns"])
7517
-
7518
- print("\nOutlier Report:")
7519
- print(res_qc["outlier_num"])
7520
- print("\nPercentage of Values Replaced per Column:")
7521
- print(res_qc["outlier_percentage"])
7564
+ print("\n⤵ Empty Columns:", res_qc["empty_columns"])
7522
7565
 
7523
- print("\nHigh Correlations (>|0.9|):")
7524
- for col1, col2 in res_qc["high_correlations"]:
7525
- print(f" {col1} and {col2}")
7566
+ if any(res_qc["high_correlations"]):
7567
+ print("\n⤵ High Correlations (>|0.9|):")
7568
+ for col1, col2 in res_qc["high_correlations"]:
7569
+ print(f" {col1} and {col2}")
7526
7570
 
7527
7571
  if "vif" in res_qc:
7528
- print("\nFeatures with High VIF (>|5|):")
7572
+ print("\n⤵ Features with High VIF (>|5|):")
7529
7573
  print(res_qc["vif"])
7530
7574
 
7531
- print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
7532
- print(res_qc["high_cardinality_categoricals"])
7533
-
7534
- print("\nInconsistent Data Types:")
7535
- print(res_qc["inconsistent_types"])
7536
-
7537
- print("\nRange Checks for Numeric Columns:")
7538
- print(res_qc["range_checks"])
7539
-
7540
- print("\nText Length Analysis:")
7541
- for col, stats in res_qc["text_length_analysis"].items():
7542
- print(
7543
- f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
7544
- )
7545
-
7546
- print("\nSummary Statistics:")
7547
- print(res_qc["summary_statistics"])
7575
+ if any(res_qc["high_cardinality_categoricals"]):
7576
+ print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
7577
+ print(res_qc["high_cardinality_categoricals"])
7578
+ if any(res_qc["inconsistent_types"]):
7579
+ print("\n⤵ Inconsistent Data Types:")
7580
+ display(res_qc["inconsistent_types"])
7581
+ if any(res_qc["text_length_analysis"]):
7582
+ print("\n⤵ Text Length Analysis:")
7583
+ for col, stats in res_qc["text_length_analysis"].items():
7584
+ print(
7585
+ f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
7586
+ )
7548
7587
 
7549
7588
  if res_qc["warnings"]:
7550
7589
  print("\nWarnings:")
7551
7590
  for warning in res_qc["warnings"]:
7552
7591
  print(" -", warning)
7553
7592
  if plot_:
7554
- df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
7593
+ df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
7594
+ if dir_save:
7595
+ try:
7596
+ figsave(dir_save)
7597
+ except Exception as e:
7598
+ print(f"⚠️: {e}")
7555
7599
  if output:
7556
7600
  return res_qc
7557
7601
  return None
7558
7602
 
7559
7603
 
7560
- def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
7604
+ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
7561
7605
  import matplotlib.pyplot as plt
7562
7606
  import seaborn as sns
7563
7607
  from .plot import subplot, figsets, get_color
@@ -7574,91 +7618,73 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7574
7618
  )
7575
7619
  if len(missing_data) > max_cols:
7576
7620
  missing_data = missing_data[:max_cols]
7577
- ax=sns.barplot(
7578
- x=missing_data.index,
7579
- y=missing_data.values,
7580
- hue=missing_data.index,
7581
- palette=get_color(len(missing_data), cmap="Blues")[::-1],
7621
+ ax_missing_data=sns.barplot(
7622
+ y=missing_data.index,
7623
+ x=missing_data.values,
7624
+ hue=missing_data.index,
7625
+ palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
7582
7626
  ax=nexttile(),
7583
7627
  )
7584
- figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
7585
-
7586
- ax2 = ax.twinx()
7587
- # Plot missing value percentages
7588
- missing_percentage = res_qc["missing_percentage"][
7589
- res_qc["missing_percentage"] > 0
7590
- ].sort_values(ascending=False)
7591
- sns.barplot(
7592
- x=missing_percentage.index,
7593
- y=missing_percentage.values,
7594
- hue=missing_percentage.index,
7595
- palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
7596
- ax=ax2,#nexttile(),
7597
- )
7598
- figsets(xangle=45, ylabel="%",ax=ax2)
7599
- ax2.tick_params(axis="y", color='r',labelcolor='r')
7600
- ax2.yaxis.label.set_color('r')
7628
+ figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)
7601
7629
 
7602
7630
  outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
7603
7631
  if len(outlier_num) > max_cols:
7604
7632
  outlier_num = outlier_num[:max_cols]
7605
7633
  ax_outlier_num=sns.barplot(
7606
- x=outlier_num.index,
7607
- y=outlier_num.values,
7634
+ y=outlier_num.index,
7635
+ x=outlier_num.values,
7608
7636
  hue=outlier_num.index,
7609
7637
  palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
7610
7638
  ax=nexttile(),
7611
7639
  )
7612
- figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
7613
- ax_outlier_percentage = ax_outlier_num.twinx()
7614
- outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
7615
- if len(outlier_percentage) > max_cols:
7616
- outlier_percentage = outlier_percentage[:max_cols]
7617
- ax_outlier_percentage=sns.barplot(
7618
- x=outlier_percentage.index,
7619
- y=outlier_percentage.values,
7620
- hue=outlier_percentage.index,
7621
- palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
7622
- ax=ax2 #nexttile(),
7623
- )
7624
- figsets(
7625
- xangle=45,
7626
- ylabel="%",
7627
- xlabel=None,
7628
- ylim=[0, outlier_percentage.max() + 2],
7629
- ax=ax_outlier_percentage
7630
- )
7631
- ax2.tick_params(axis="y", color='r',labelcolor='r')
7632
- ax2.yaxis.label.set_color('r')
7640
+ figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
7641
+
7642
+ #!
7643
+ try:
7644
+ if data.select_dtypes(include=np.number).shape[1]<=10:
7645
+ for col in data.select_dtypes(include=np.number).columns:
7646
+ sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
7647
+ figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
7648
+ except:
7649
+ pass
7650
+ #!
7651
+ try:
7652
+ for col in data.select_dtypes(include='category').columns:
7653
+ sns.countplot(y=data[col],
7654
+ palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
7655
+ ax=nexttile())
7656
+ figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
7657
+ except Exception as e:
7658
+ pass
7633
7659
 
7634
7660
  # Skewness and Kurtosis Plots
7635
7661
  skewness = res_qc["skewness"].sort_values(ascending=False)
7636
7662
  kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
7637
7663
  if not skewness.empty:
7638
7664
  ax_skewness=sns.barplot(
7639
- x=skewness.index,
7640
- y=skewness.values,
7665
+ y=skewness.index,
7666
+ x=skewness.values,
7641
7667
  hue=skewness.index,
7642
7668
  palette=get_color(len(skewness), cmap="coolwarm")[::-1],
7643
7669
  ax=nexttile(),
7644
7670
  )
7645
7671
  figsets(
7646
- xangle=45,
7647
7672
  title="Highly Skewed Numeric Columns (Skewness > 1)",
7648
- ylabel="Skewness",xlabel=None,ax=ax_skewness
7673
+ xlabel="Skewness",ylabel=None,ax=ax_skewness,
7674
+ fontsize=8 if len(skewness)<=20 else 6
7649
7675
  )
7650
7676
  if not kurtosis.empty:
7651
7677
  ax_kurtosis=sns.barplot(
7652
- x=kurtosis.index,
7653
- y=kurtosis.values,
7678
+ y=kurtosis.index,
7679
+ x=kurtosis.values,
7654
7680
  hue=kurtosis.index,
7655
7681
  palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
7656
7682
  ax=nexttile(),
7657
7683
  )
7658
7684
  figsets(
7659
- xangle=45,
7660
7685
  title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
7661
- ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
7686
+ xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
7687
+ fontsize=8 if len(kurtosis)<=20 else 6
7662
7688
  )
7663
7689
 
7664
7690
  # Entropy for Categorical Variables
@@ -7666,56 +7692,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7666
7692
  ascending=False
7667
7693
  )
7668
7694
  ax_entropy_data=sns.barplot(
7669
- x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
7695
+ y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
7696
+ palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
7697
+ ax=nexttile()
7670
7698
  )
7671
7699
  figsets(
7672
- xangle=45,
7673
- xlabel="Categorical Columns",
7700
+ ylabel="Categorical Columns",
7674
7701
  title="Entropy of Categorical Variables",
7675
- ylabel="Entropy (bits)",
7676
- ax=ax_entropy_data
7677
- )
7678
- # Distribution Analysis: Boxplot for IQR
7679
- ax_iqr=sns.boxplot(
7680
- data=data[res_qc["distribution_analysis"].index],
7681
- orient="v",
7682
- palette="Set3",
7683
- ax=nexttile(),
7684
- )
7685
- figsets(
7686
- xangle=45,
7687
- title="Range for Numeric Columns",
7688
- ylabel="#",
7689
- ax=ax_iqr
7690
- )
7702
+ xlabel="Entropy (bits)",
7703
+ ax=ax_entropy_data,
7704
+ fontsize=8 if len(entropy_data)<=20 else 6
7705
+ )
7706
+
7691
7707
  # unique counts
7692
7708
  unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
7693
7709
  ax_unique_counts_=sns.barplot(
7694
- x=unique_counts.index,
7695
- y=unique_counts.values,
7710
+ y=unique_counts.index,
7711
+ x=unique_counts.values,
7696
7712
  hue=unique_counts.index,
7697
- palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
7713
+ palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
7698
7714
  ax=nexttile())
7699
7715
  figsets(
7700
- xangle=45,
7701
7716
  title="Unique Counts",
7702
- xlabel=None,
7703
- ylabel="#",
7704
- ax=ax_unique_counts_
7717
+ ylabel=None,
7718
+ xlabel="#",
7719
+ ax=ax_unique_counts_,
7720
+ fontsize=8 if len(unique_counts)<=20 else 6
7705
7721
  )
7706
7722
  # Binary Checking
7707
- ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
7708
- y=unique_counts[unique_counts<10].values,
7709
- hue=unique_counts[unique_counts<10].index,
7710
- palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
7723
+ ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
7724
+ x=unique_counts[unique_counts<8].values,
7725
+ hue=unique_counts[unique_counts<8].index,
7726
+ palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
7711
7727
  ax=nexttile())
7712
- plt.axhline(y=2, color="r", linestyle="--", lw=2)
7728
+ plt.axvline(x=2, color="r", linestyle="--", lw=2)
7713
7729
  figsets(
7714
- xangle=45,
7715
- xlabel=None,
7730
+ ylabel=None,
7716
7731
  title="Binary Checking",
7717
- ylabel="#",
7718
- ax=ax_unique_counts
7732
+ xlabel="#",
7733
+ ax=ax_unique_counts,
7734
+ fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
7719
7735
  )
7720
7736
 
7721
7737
  # dtypes counts
@@ -7751,14 +7767,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7751
7767
  ha="center",
7752
7768
  va="top",
7753
7769
  c="k",
7754
- fontsize=8,
7770
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
7755
7771
  rotation=0,
7756
7772
  )
7757
7773
  figsets(
7758
7774
  xlabel=None,
7759
7775
  title="Dtypes",
7760
7776
  ylabel="#",
7761
- ax=ax_dtype_counts
7777
+ ax=ax_dtype_counts,
7778
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
7762
7779
  )
7763
7780
 
7764
7781
  # High cardinality: Show top categorical columns by unique value count
@@ -7772,24 +7789,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7772
7789
 
7773
7790
  if high_cardinality:
7774
7791
  ax_high_cardinality=sns.barplot(
7775
- x=list(high_cardinality.keys()),
7776
- y=list(high_cardinality.values()),
7792
+ y=list(high_cardinality.keys()),
7793
+ x=list(high_cardinality.values()),
7777
7794
  hue=list(high_cardinality.keys()),
7778
- palette="Oranges", ax=nexttile()
7795
+ palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
7796
+ ax=nexttile(),
7779
7797
  )
7780
7798
  figsets(
7781
- xangle=45,
7782
7799
  title="High Cardinality Categorical Columns",
7783
- ylabel="Unique Value Count",
7784
- ax=ax_high_cardinality
7800
+ xlabel="Unique Value Count",
7801
+ ax=ax_high_cardinality,
7802
+ fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
7785
7803
  )
7786
7804
  if res_qc["low_variance_features"]:
7787
7805
  low_variance_data = data[res_qc["low_variance_features"]].copy()
7788
7806
  for col in low_variance_data.columns:
7789
- sns.histplot(
7807
+ ax_low_variance_features=sns.histplot(
7790
7808
  low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
7791
7809
  )
7792
- plt.title(f"Low Variance Feature: {col}")
7810
+ figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
7811
+ fontsize=8 if len(low_variance_data[col])<=20 else 6)
7793
7812
 
7794
7813
  # VIF plot for multicollinearity detection
7795
7814
  if "vif" in res_qc and not res_qc["vif"].empty:
@@ -7800,23 +7819,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7800
7819
  x="VIF",
7801
7820
  y="feature",
7802
7821
  hue="VIF",
7803
- palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
7822
+ palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
7804
7823
  ax=nexttile())
7805
7824
  figsets(
7806
- xangle=45,
7807
7825
  title="Variance Inflation Factor(VIF)",
7808
- xlabel="Variance Inflation Factor(VIF)",
7826
+ xlabel="VIF",
7809
7827
  ylabel="Features",
7810
7828
  legend=None,
7811
- ax=ax_vif
7829
+ ax=ax_vif,
7830
+ fontsize=8 if len(vif_data)<=20 else 6
7812
7831
  )
7813
7832
 
7814
7833
  # Correlation heatmap for numeric columns with high correlation pairs
7815
7834
  if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
7816
- corr = data.select_dtypes(include=[np.number]).dropna().corr()
7835
+ corr = data.select_dtypes(include=[np.number]).corr()
7817
7836
  if corr.shape[1]<=33:
7818
7837
  mask = np.triu(np.ones_like(corr, dtype=bool))
7819
- # Dynamically scale fontsize based on the number of columns
7820
7838
  num_columns = corr.shape[1]
7821
7839
  fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
7822
7840
 
@@ -7826,7 +7844,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7826
7844
  annot=True,
7827
7845
  cmap="coolwarm",
7828
7846
  center=0,
7829
- fmt=".2f",
7847
+ fmt=".1f",
7830
7848
  linewidths=0.5,
7831
7849
  vmin=-1, vmax=1,
7832
7850
  ax=nexttile(2, 2),