py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -4,6 +4,8 @@ import sys, os
4
4
  from IPython.display import display
5
5
  from typing import List, Optional, Union
6
6
 
7
+ from regex import X
8
+
7
9
  try:
8
10
  get_ipython().run_line_magic("load_ext", "autoreload")
9
11
  get_ipython().run_line_magic("autoreload", "2")
@@ -16,15 +18,17 @@ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
16
18
  warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
17
19
 
18
20
 
19
- def run_once_within(duration=60): # default 60s
21
+ def run_once_within(duration=60,reverse=False): # default 60s
20
22
  import time
21
23
 
22
24
  """
25
+ If reverse is True, the code does not run on the first call, but it does run on the second call.
23
26
  usage:
24
27
  if run_once_within():
25
28
  print("This code runs once per minute.")
26
29
  else:
27
30
  print("The code has already been run in the last minute.")
31
+
28
32
  """
29
33
  if not hasattr(run_once_within, "time_last"):
30
34
  run_once_within.time_last = None
@@ -34,9 +38,9 @@ def run_once_within(duration=60): # default 60s
34
38
  time_curr - run_once_within.time_last >= duration
35
39
  ):
36
40
  run_once_within.time_last = time_curr # Update the last execution time
37
- return True
41
+ return False if reverse else True
38
42
  else:
39
- return False
43
+ return True if reverse else False
40
44
 
41
45
 
42
46
  def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
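Editor's note: the two hunks above add a `reverse` flag that simply flips the return value of the once-per-window gate. A minimal standalone sketch of the intended behaviour (a re-implementation for illustration, not the packaged function):

```python
import time

def run_once_within(duration=60, reverse=False):
    """Return True at most once per `duration` seconds, False otherwise;
    with reverse=True the two outcomes are swapped."""
    now = time.time()
    last = getattr(run_once_within, "time_last", None)
    if last is None or now - last >= duration:
        run_once_within.time_last = now
        return not reverse   # normally True on the "fresh" call
    return reverse           # normally False inside the window

if run_once_within():
    print("This code runs once per minute.")
else:
    print("The code has already been run in the last minute.")
```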
@@ -1828,16 +1832,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1828
1832
  # Check data types
1829
1833
  data_types = df.dtypes
1830
1834
  # messages.append(f"Data types of columns:\n{data_types}")
1831
-
1832
- # Check for constant values across any column
1833
- constant_columns = df.columns[df.nunique() == 1].tolist()
1834
- if constant_columns:
1835
- messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
1836
- is_abnormal = True
1837
- if verbose:
1838
- print(f"df.columns[df.nunique() == 1].tolist()")
1839
- if verbose:
1840
- print("5", is_abnormal)
1835
+
1841
1836
  # Check for an unreasonable number of rows or columns
1842
1837
  if actual_shape[0] < 2 or actual_shape[1] < 2:
1843
1838
  messages.append(
@@ -1989,30 +1984,29 @@ def fload(fpath, kind=None, **kwargs):
1989
1984
  def load_csv(fpath, **kwargs):
1990
1985
  from pandas.errors import EmptyDataError
1991
1986
 
1992
- engine = kwargs.pop("engine", "pyarrow")
1993
- sep = kwargs.pop("sep", "\t")
1994
- index_col = kwargs.pop("index_col", None)
1995
- memory_map = kwargs.pop("memory_map", False)
1996
- skipinitialspace = kwargs.pop("skipinitialspace", False)
1997
- encoding = kwargs.pop("encoding", "utf-8")
1998
- on_bad_lines = kwargs.pop("on_bad_lines", "skip")
1999
- comment = kwargs.pop("comment", None)
2000
- fmt = kwargs.pop("fmt", False)
2001
- chunksize = kwargs.pop("chunksize", None)
1987
+ engine = kwargs.pop("engine", "pyarrow")# default: None
1988
+ sep = kwargs.pop("sep", None)# default: ','
1989
+ index_col = kwargs.pop("index_col", None)# default: None
1990
+ memory_map = kwargs.pop("memory_map", False)# default: False
1991
+ skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
1992
+ encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
1993
+ on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
1994
+ comment = kwargs.pop("comment", None)# default: None
1995
+ fmt = kwargs.pop("fmt", False)# default:
1996
+ chunksize = kwargs.pop("chunksize", None)# default: None
2002
1997
  engine = "c" if chunksize else engine # when chunksize, recommend 'c'
2003
- low_memory = kwargs.pop("low_memory", True)
1998
+ low_memory = kwargs.pop("low_memory", True)# default: True
2004
1999
  low_memory = (
2005
2000
  False if chunksize else True
2006
- ) # when chunksize, recommend low_memory=False
2001
+ ) # when chunksize, recommend low_memory=False # default:
2007
2002
  verbose = kwargs.pop("verbose", False)
2008
2003
  if run_once_within():
2009
2004
  use_pd("read_csv", verbose=verbose)
2010
2005
 
2011
- if comment is None:
2006
+ if comment is None:# default: None
2012
2007
  comment = get_comment(
2013
2008
  fpath, comment=None, encoding="utf-8", lines_to_check=5
2014
2009
  )
2015
-
2016
2010
  try:
2017
2011
  df = pd.read_csv(
2018
2012
  fpath,
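Editor's note: the new inline comments record pandas' own defaults next to the wrapper's overrides; the mechanism is the usual `kwargs.pop` pattern, where anything the caller passes explicitly wins over the wrapper default. A minimal sketch of that pattern (hypothetical `my_read` wrapper, not the package function):

```python
import pandas as pd

def my_read(fpath, **kwargs):
    # Pop wrapper-level defaults; anything the caller supplies takes precedence.
    sep = kwargs.pop("sep", ",")                        # pandas default: ','
    encoding = kwargs.pop("encoding", "utf-8")          # pandas default: 'utf-8'
    on_bad_lines = kwargs.pop("on_bad_lines", "skip")   # pandas default: 'error'
    # Remaining kwargs are forwarded untouched.
    return pd.read_csv(fpath, sep=sep, encoding=encoding,
                       on_bad_lines=on_bad_lines, **kwargs)

# df = my_read("data.csv", sep="\t")   # caller override wins over the wrapper default
```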
@@ -2107,8 +2101,8 @@ def fload(fpath, kind=None, **kwargs):
2107
2101
  separators = [",", "\t", ";", "|", " "]
2108
2102
  for sep in separators:
2109
2103
  sep2show = sep if sep != "\t" else "\\t"
2110
- # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
2111
- # print(".")
2104
+ if verbose:
2105
+ print(f'trying with: engine=pyarrow, sep="{sep2show}"')
2112
2106
  try:
2113
2107
  df = pd.read_csv(
2114
2108
  fpath,
@@ -2137,8 +2131,9 @@ def fload(fpath, kind=None, **kwargs):
2137
2131
  separators = [",", "\t", ";", "|", " "]
2138
2132
  for sep in separators:
2139
2133
  try:
2140
- # sep2show = sep if sep != "\t" else "\\t"
2141
- # print(f"trying with: engine={engine}, sep='{sep2show}'")
2134
+ sep2show = sep if sep != "\t" else "\\t"
2135
+ if verbose:
2136
+ print(f"trying with: engine={engine}, sep='{sep2show}'")
2142
2137
  # print(".")
2143
2138
  df = pd.read_csv(
2144
2139
  fpath,
@@ -2171,8 +2166,9 @@ def fload(fpath, kind=None, **kwargs):
2171
2166
  continue
2172
2167
  else:
2173
2168
  pass
2174
- if is_df_abnormal(df,verbose=verbose):
2175
- df=pd.read_csv(fpath,**kwargs)
2169
+ print(kwargs)
2170
+ # if is_df_abnormal(df,verbose=verbose):
2171
+ # df=pd.read_csv(fpath,**kwargs)
2176
2172
  display(df.head(2))
2177
2173
  print(f"shape: {df.shape}")
2178
2174
  return df
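Editor's note: this hunk and the two above it cycle through candidate separators until a read succeeds; the change only makes each attempt visible when `verbose=True`. A self-contained sketch of the same fallback idea, assuming a local file path:

```python
import pandas as pd

def read_with_sep_fallback(fpath, verbose=False):
    for sep in [",", "\t", ";", "|", " "]:
        sep2show = sep if sep != "\t" else "\\t"
        if verbose:
            print(f'trying sep="{sep2show}"')
        try:
            df = pd.read_csv(fpath, sep=sep, engine="python", on_bad_lines="skip")
            if df.shape[1] > 1:   # more than one column -> separator is probably right
                return df
        except Exception:
            continue
    raise ValueError("none of the candidate separators produced a usable table")
```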
@@ -2386,7 +2382,7 @@ def fload(fpath, kind=None, **kwargs):
2386
2382
  elif kind == "xml":
2387
2383
  return load_xml(fpath)
2388
2384
  elif kind in ["csv", "tsv"]:
2389
- verbose = kwargs.pop("verbose", False)
2385
+ # verbose = kwargs.pop("verbose", False)
2390
2386
  if run_once_within():
2391
2387
  use_pd("read_csv")
2392
2388
  content = load_csv(fpath, **kwargs)
@@ -3503,12 +3499,8 @@ def figsave(*args, dpi=300):
3503
3499
  )
3504
3500
  else:
3505
3501
  plt.savefig(
3506
- fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
3507
- )
3508
- # elif ftype.lower() == "png":
3509
- # plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
3510
- # elif ftype.lower() in ["tiff", "tif"]:
3511
- # plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
3502
+ fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
3503
+ )
3512
3504
  elif ftype.lower() == "emf":
3513
3505
  plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
3514
3506
  elif ftype.lower() == "fig":
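Editor's note: the change folds `transparent=True` into the generic `plt.savefig` branch, so exports from this branch keep a transparent background. The matplotlib call itself, for reference (hypothetical figure and filename):

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
# Same keyword combination figsave() now uses in its generic branch
fig.savefig("example.pdf", format="pdf", dpi=300,
            bbox_inches="tight", transparent=True, pad_inches=0)
```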
@@ -5236,15 +5228,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
5236
5228
  data = data.explode(column, ignore_index=True)
5237
5229
  return data
5238
5230
 
5231
+ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
5232
+ """
5233
+ Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
5234
+ Usage:
5235
+ data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
5236
+ # df_cycle month cyclically
5237
+ data = df_cycle(data, 'month', 12)
5238
+ """
5239
+ if columns is None:
5240
+ columns = list(data.select_dtypes(include=np.number).columns) # If no columns specified, use all columns
5241
+ if max_val is None:
5242
+ max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
5243
+ if isinstance(columns, str):
5244
+ columns = [columns] # If a single column name is provided as a string, convert it to a list
5245
+
5246
+ # Check if inplace is True, so we modify the original dataframe
5247
+ if inplace:
5248
+ # Modify the data in place, no return statement needed
5249
+ for col in columns:
5250
+ data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
5251
+ data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
5252
+ else:
5253
+ # If inplace is False, return the modified dataframe
5254
+ new_data = data.copy()
5255
+ for col in columns:
5256
+ new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
5257
+ new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
5258
+ return new_data
5259
+
5239
5260
 
5240
5261
  # ! DataFrame
5241
5262
  def df_astype(
5242
5263
  data: pd.DataFrame,
5243
5264
  columns: Optional[Union[str, List[str]]] = None,
5244
- astype: str = "datetime",
5265
+ astype: str = None,#"datetime",
5245
5266
  skip_row: Union[str, list] = None,
5246
5267
  fmt: Optional[str] = None,
5247
- inplace: bool = True,
5268
+ inplace: bool = False,
5248
5269
  errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
5249
5270
  **kwargs,
5250
5271
  ) -> Optional[pd.DataFrame]:
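Editor's note: the new `df_cycle` helper maps a periodic feature onto the unit circle, so a model sees month 12 and month 1 as neighbours rather than 11 units apart. The underlying transform is just sin/cos of the scaled angle; a self-contained illustration in plain pandas/numpy (not the packaged helper):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
max_val = 12  # period of the feature
df["month_sin"] = np.sin(2 * np.pi * df["month"] / max_val)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / max_val)
print(df.round(2))  # months 12 and 1 now sit next to each other on the circle
```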
@@ -5304,6 +5325,7 @@ def df_astype(
5304
5325
  "day",
5305
5326
  "month",
5306
5327
  "year",
5328
+ "circular"
5307
5329
  ]
5308
5330
  # If inplace is False, make a copy of the DataFrame
5309
5331
  if not inplace:
@@ -5398,10 +5420,22 @@ def df_astype(
5398
5420
  kwargs.pop("errors", None)
5399
5421
  data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
5400
5422
  # print(f"Successfully converted '{column}' to timedelta.")
5423
+ elif astype == "circular":
5424
+ max_val = kwargs.get('max_val',None)
5425
+ data[column]=df_cycle(data=data,columns=column,max_val=max_val)
5401
5426
  else:
5402
5427
  # Convert to other types (e.g., float, int)
5403
- data[column] = data[column].astype(astype)
5428
+ if astype=='int':
5429
+ data[column] = data[column].astype('float').astype('int')
5430
+ else:
5431
+ data[column] = data[column].astype(astype)
5404
5432
  # print(f"Successfully converted '{column}' to {astype}.")
5433
+ # format
5434
+ try:
5435
+ if fmt is not None:
5436
+ data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
5437
+ except Exception as e:
5438
+ print(f"设置格式的时候有误: {e}")
5405
5439
  except Exception as e:
5406
5440
  print(f"Error converting '{column}' to {astype}: {e}")
5407
5441
  try:
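Editor's note: two small behaviour changes in this hunk: integer casts now hop through `float` first, and an optional `fmt` string is applied afterwards. The float hop matters because strings such as "1.0" cannot be cast to int directly; a quick pandas illustration:

```python
import pandas as pd

s = pd.Series(["1.0", "2.5", "3"])
# s.astype("int") would raise: invalid literal for int() with base 10: '1.0'
print(s.astype("float").astype("int"))   # 1, 2, 3  (2.5 is truncated to 2)

# The new fmt hook is just str.format applied element-wise, e.g. fmt=".2f":
print(pd.Series([1, 2.345]).apply(lambda x: f"{x:.2f}"))
```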
@@ -5874,11 +5908,13 @@ def df_encoder(
5874
5908
 
5875
5909
  def df_scaler(
5876
5910
  data: pd.DataFrame, # should be numeric dtype
5911
+ scaler=None,
5877
5912
  method="standard",
5878
5913
  columns=None, # default, select all numeric col/row
5879
5914
  inplace=False,
5880
5915
  verbose=False, # show usage
5881
5916
  axis=0, # defalut column-wise
5917
+ return_scaler:bool=False,# True: return both: return df, scaler
5882
5918
  **kwargs,
5883
5919
  ):
5884
5920
  """
@@ -5896,31 +5932,49 @@ def df_scaler(
5896
5932
  """
5897
5933
  if verbose:
5898
5934
  print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
5899
-
5900
- methods = ["standard", "minmax", "robust"]
5901
- method = strcmp(method, methods)[0]
5902
- if method == "standard":
5903
- from sklearn.preprocessing import StandardScaler
5904
-
5905
- scaler = StandardScaler(**kwargs)
5906
- elif method == "minmax":
5907
- from sklearn.preprocessing import MinMaxScaler
5908
-
5909
- scaler = MinMaxScaler(**kwargs)
5910
- elif method == "robust":
5911
- from sklearn.preprocessing import RobustScaler
5912
-
5913
- scaler = RobustScaler(**kwargs)
5914
- if axis not in [0, 1]:
5915
- raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
5916
-
5935
+ if scaler is None:
5936
+ methods = ["standard", "minmax", "robust","maxabs"]
5937
+ method = strcmp(method, methods)[0]
5938
+ if method == "standard":
5939
+ from sklearn.preprocessing import StandardScaler
5940
+ if verbose:
5941
+ print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
5942
+ print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
5943
+ scaler = StandardScaler(**kwargs)
5944
+ elif method == "minmax":
5945
+ from sklearn.preprocessing import MinMaxScaler
5946
+ if verbose:
5947
+ print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
5948
+ print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
5949
+ print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
5950
+ scaler = MinMaxScaler(**kwargs)
5951
+ elif method == "robust":
5952
+ from sklearn.preprocessing import RobustScaler
5953
+ if verbose:
5954
+ print("scales the data based on the median and interquartile range, which is robust to outliers.")
5955
+ print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
5956
+ scaler = RobustScaler(**kwargs)
5957
+ elif method=="maxabs":
5958
+ from sklearn.preprocessing import MaxAbsScaler
5959
+ if verbose:
5960
+ print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
5961
+ print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
5962
+ scaler = MaxAbsScaler(**kwargs)
5963
+ if axis not in [0, 1]:
5964
+ raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
5965
+ if verbose:
5966
+ print(scaler)
5917
5967
  if axis == 0:
5918
5968
  # Column-wise scaling (default)
5919
5969
  if columns is None:
5920
5970
  columns = data.select_dtypes(include=np.number).columns.tolist()
5921
5971
  non_numeric_columns = data.columns.difference(columns)
5922
5972
 
5923
- scaled_data = scaler.fit_transform(data[columns])
5973
+ # scaled_data = scaler.fit_transform(data[columns])
5974
+ if scaler is None or not hasattr(scaler, 'mean_'):
5975
+ scaled_data = scaler.fit_transform(data[columns])
5976
+ else:
5977
+ scaled_data = scaler.transform(data[columns])
5924
5978
 
5925
5979
  if inplace:
5926
5980
  data[columns] = scaled_data
@@ -5934,7 +5988,10 @@ def df_scaler(
5934
5988
  axis=1,
5935
5989
  )
5936
5990
  scaled_df = scaled_df[data.columns] # Maintain column order
5937
- return scaled_df
5991
+ if return_scaler:
5992
+ return scaled_df,scaler
5993
+ else:
5994
+ return scaled_df
5938
5995
 
5939
5996
  elif axis == 1:
5940
5997
  # Row-wise scaling
@@ -5946,9 +6003,10 @@ def df_scaler(
5946
6003
 
5947
6004
  print(f"Scaling rows")
5948
6005
 
5949
- scaled_data = scaler.fit_transform(
5950
- numeric_rows.T
5951
- ).T # Transpose for scaling and then back
6006
+ # scaled_data = scaler.fit_transform(
6007
+ # numeric_rows.T
6008
+ # ).T # Transpose for scaling and then back
6009
+ scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T
5952
6010
 
5953
6011
  if inplace:
5954
6012
  data.loc[numeric_rows.index] = scaled_data
@@ -5956,7 +6014,10 @@ def df_scaler(
5956
6014
  else:
5957
6015
  scaled_df = data.copy()
5958
6016
  scaled_df.loc[numeric_rows.index] = scaled_data
5959
- return scaled_df
6017
+ if return_scaler:
6018
+ return scaled_df,scaler
6019
+ else:
6020
+ return scaled_df
5960
6021
 
5961
6022
 
5962
6023
  def df_special_characters_cleaner(
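Editor's note: `df_scaler` can now accept an already-fitted scaler and, with `return_scaler=True`, hand the fitted object back so the same transform can be reapplied to new data (e.g. a test split). The scikit-learn pattern the change wraps, sketched with hypothetical train/test frames:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

train = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
test = pd.DataFrame({"a": [1.5, 2.5], "b": [15.0, 25.0]})

scaler = StandardScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=train.columns)
# hasattr(scaler, "mean_") is now True, so the wrapper would call transform() only:
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns)
```

One caveat worth noting: the `hasattr(scaler, 'mean_')` test in the diff appears to recognize only a fitted StandardScaler; fitted MinMax, Robust, and MaxAbs scalers expose different attributes (e.g. `scale_` or `center_`), so passing one of those in would trigger a re-fit rather than a plain transform.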
@@ -6325,6 +6386,7 @@ def df_reducer(
6325
6386
  random_state=1,
6326
6387
  ax=None,
6327
6388
  figsize=None,
6389
+ verbose=True,
6328
6390
  **kwargs,
6329
6391
  ) -> pd.DataFrame:
6330
6392
  dict_methods = {
@@ -6364,7 +6426,8 @@ def df_reducer(
6364
6426
  # "autoencoder","nmf",
6365
6427
  ]
6366
6428
  method = strcmp(method, methods)[0]
6367
- print(f"\nprocessing with using {dict_methods[method]}:")
6429
+ if verbose:
6430
+ print(f"\nprocessing with using {dict_methods[method]}:")
6368
6431
  xlabel, ylabel = None, None
6369
6432
  if columns is None:
6370
6433
  columns = data.select_dtypes(include="number").columns.tolist()
@@ -6863,7 +6926,7 @@ def df_reducer(
6863
6926
  hue=hue,
6864
6927
  s=size,
6865
6928
  edgecolor=edgecolor,
6866
- kind="scater",
6929
+ kind_="scater",
6867
6930
  figsets=dict(
6868
6931
  legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
6869
6932
  xlabel=xlabel if xlabel else None,
@@ -7334,10 +7397,13 @@ def evaluate_cluster(
7334
7397
  def df_qc(
7335
7398
  data: pd.DataFrame,
7336
7399
  columns=None,
7337
- verbose=False,
7400
+ skim=False,
7338
7401
  plot_=True,
7339
7402
  max_cols=20, # only for plots
7403
+ hue=None,
7340
7404
  output=False,
7405
+ verbose=True,
7406
+ dir_save=None
7341
7407
  ):
7342
7408
  """
7343
7409
  Usage example:
@@ -7345,22 +7411,24 @@ def df_qc(
7345
7411
  """
7346
7412
  from statsmodels.stats.outliers_influence import variance_inflation_factor
7347
7413
  from scipy.stats import skew, kurtosis, entropy
7348
- import skimpy
7349
-
7414
+
7350
7415
  #! display(data.select_dtypes(include=[np.number]).describe())
7351
7416
  #!skim
7352
7417
  if columns is not None:
7353
7418
  if isinstance(columns, (list,pd.core.indexes.base.Index)):
7354
7419
  data=data[columns]
7355
- try:
7356
- skimpy.skim(data)
7357
- except:
7358
- numerical_data = data.select_dtypes(include=[np.number])
7359
- skimpy.skim(numerical_data)
7420
+ if skim:
7421
+ try:
7422
+ import skimpy
7423
+ skimpy.skim(data)
7424
+ except:
7425
+ numerical_data = data.select_dtypes(include=[np.number])
7426
+ skimpy.skim(numerical_data)
7360
7427
  # Fill completely NaN columns with a default value (e.g., 0)
7361
7428
  data = data.copy()
7362
7429
  data.loc[:, data.isna().all()] = 0
7363
7430
  res_qc = {}
7431
+ print(f"data.shape:{data.shape}")
7364
7432
 
7365
7433
  # Missing values
7366
7434
  res_qc["missing_values"] = data.isnull().sum()
@@ -7403,7 +7471,7 @@ def df_qc(
7403
7471
  numeric_df = data.select_dtypes(include=[np.number]).dropna()
7404
7472
  vif_data = pd.DataFrame()
7405
7473
  res_qc["vif"]=vif_data
7406
- if numeric_df.shape[1] > 1:
7474
+ if numeric_df.shape[1] > 1 and not numeric_df.empty:
7407
7475
  vif_data["feature"] = numeric_df.columns
7408
7476
  vif_data["VIF"] = [
7409
7477
  variance_inflation_factor(numeric_df.values, i)
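Editor's note: for context, the newly guarded VIF computation follows the usual statsmodels recipe, one VIF per column of the complete-case numeric matrix. A standalone sketch with toy data:

```python
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
x1 = rng.normal(size=100)
num = pd.DataFrame({
    "x1": x1,
    "x2": 2 * x1 + rng.normal(scale=0.1, size=100),  # nearly collinear with x1
    "x3": rng.normal(size=100),
}).dropna()

vif = pd.DataFrame({
    "feature": num.columns,
    "VIF": [variance_inflation_factor(num.values, i) for i in range(num.shape[1])],
})
print(vif)   # x1 and x2 get large VIF values; x3 stays near 1
```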
@@ -7495,72 +7563,70 @@ def df_qc(
7495
7563
  # Report generation
7496
7564
  if verbose:
7497
7565
  print("=== QC Report Summary ===")
7498
- print("\nMissing Values (Total and %):")
7499
- print(res_qc["missing_values"][res_qc["missing_values"] > 0])
7500
- print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
7501
-
7502
- print("\nRows with Missing Values:", res_qc["rows_with_missing"])
7503
-
7504
- print("\nData Types:")
7505
- print(res_qc["data_types"])
7506
-
7507
- print("\nUnique Values per Column:")
7508
- print(res_qc["unique_values"])
7509
-
7510
- print("\nConstant Columns:", res_qc["constant_columns"])
7511
-
7512
- print("\nDuplicate Rows:", res_qc["duplicate_rows"])
7513
- print("Duplicate Columns:", res_qc["duplicate_columns"])
7566
+ print("\n⤵ Summary Statistics:")
7567
+ display(res_qc["summary_statistics"])
7568
+ print("\n⤵ Data Types:")
7569
+ display(res_qc["data_types"])
7570
+ if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
7571
+ print(" ⤵ Missing Values Counts:")
7572
+ display(res_qc["missing_values"][res_qc["missing_values"] > 0])
7573
+ # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
7574
+ print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
7575
+
7576
+ if any(res_qc["outlier_num"]):
7577
+ print("\n⤵ Outlier Report:")
7578
+ display(res_qc["outlier_num"])
7579
+ if any(res_qc["unique_values"]):
7580
+ print("\n⤵ Unique Values per Column:")
7581
+ display(res_qc["unique_values"])
7582
+
7583
+ print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
7584
+
7585
+ print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
7586
+ print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
7514
7587
 
7515
7588
  if res_qc["empty_columns"]:
7516
- print("\nEmpty Columns:", res_qc["empty_columns"])
7517
-
7518
- print("\nOutlier Report:")
7519
- print(res_qc["outlier_num"])
7520
- print("\nPercentage of Values Replaced per Column:")
7521
- print(res_qc["outlier_percentage"])
7589
+ print("\n⤵ Empty Columns:", res_qc["empty_columns"])
7522
7590
 
7523
- print("\nHigh Correlations (>|0.9|):")
7524
- for col1, col2 in res_qc["high_correlations"]:
7525
- print(f" {col1} and {col2}")
7591
+ if any(res_qc["high_correlations"]):
7592
+ print("\n⤵ High Correlations (>|0.9|):")
7593
+ for col1, col2 in res_qc["high_correlations"]:
7594
+ print(f" {col1} and {col2}")
7526
7595
 
7527
7596
  if "vif" in res_qc:
7528
- print("\nFeatures with High VIF (>|5|):")
7597
+ print("\n⤵ Features with High VIF (>|5|):")
7529
7598
  print(res_qc["vif"])
7530
7599
 
7531
- print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
7532
- print(res_qc["high_cardinality_categoricals"])
7533
-
7534
- print("\nInconsistent Data Types:")
7535
- print(res_qc["inconsistent_types"])
7536
-
7537
- print("\nRange Checks for Numeric Columns:")
7538
- print(res_qc["range_checks"])
7539
-
7540
- print("\nText Length Analysis:")
7541
- for col, stats in res_qc["text_length_analysis"].items():
7542
- print(
7543
- f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
7544
- )
7545
-
7546
- print("\nSummary Statistics:")
7547
- print(res_qc["summary_statistics"])
7600
+ if any(res_qc["high_cardinality_categoricals"]):
7601
+ print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
7602
+ print(res_qc["high_cardinality_categoricals"])
7603
+ if any(res_qc["inconsistent_types"]):
7604
+ print("\n⤵ Inconsistent Data Types:")
7605
+ display(res_qc["inconsistent_types"])
7606
+ if any(res_qc["text_length_analysis"]):
7607
+ print("\n⤵ Text Length Analysis:")
7608
+ for col, stats in res_qc["text_length_analysis"].items():
7609
+ print(
7610
+ f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
7611
+ )
7548
7612
 
7549
7613
  if res_qc["warnings"]:
7550
7614
  print("\nWarnings:")
7551
7615
  for warning in res_qc["warnings"]:
7552
7616
  print(" -", warning)
7553
7617
  if plot_:
7554
- df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
7555
- if output:
7618
+ df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
7619
+ if output or not plot_:
7556
7620
  return res_qc
7557
7621
  return None
7558
7622
 
7559
7623
 
7560
- def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
7624
+ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
7561
7625
  import matplotlib.pyplot as plt
7562
7626
  import seaborn as sns
7563
7627
  from .plot import subplot, figsets, get_color
7628
+ from datetime import datetime
7629
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
7564
7630
 
7565
7631
  if columns is not None:
7566
7632
  if isinstance(columns, (list,pd.core.indexes.base.Index)):
@@ -7574,91 +7640,65 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7574
7640
  )
7575
7641
  if len(missing_data) > max_cols:
7576
7642
  missing_data = missing_data[:max_cols]
7577
- ax=sns.barplot(
7578
- x=missing_data.index,
7579
- y=missing_data.values,
7580
- hue=missing_data.index,
7581
- palette=get_color(len(missing_data), cmap="Blues")[::-1],
7643
+ ax_missing_data=sns.barplot(
7644
+ y=missing_data.index,
7645
+ x=missing_data.values,
7646
+ hue=missing_data.index,
7647
+ palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
7582
7648
  ax=nexttile(),
7583
7649
  )
7584
- figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
7585
-
7586
- ax2 = ax.twinx()
7587
- # Plot missing value percentages
7588
- missing_percentage = res_qc["missing_percentage"][
7589
- res_qc["missing_percentage"] > 0
7590
- ].sort_values(ascending=False)
7591
- sns.barplot(
7592
- x=missing_percentage.index,
7593
- y=missing_percentage.values,
7594
- hue=missing_percentage.index,
7595
- palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
7596
- ax=ax2,#nexttile(),
7597
- )
7598
- figsets(xangle=45, ylabel="%",ax=ax2)
7599
- ax2.tick_params(axis="y", color='r',labelcolor='r')
7600
- ax2.yaxis.label.set_color('r')
7650
+ figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)
7601
7651
 
7602
7652
  outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
7603
7653
  if len(outlier_num) > max_cols:
7604
7654
  outlier_num = outlier_num[:max_cols]
7605
7655
  ax_outlier_num=sns.barplot(
7606
- x=outlier_num.index,
7607
- y=outlier_num.values,
7656
+ y=outlier_num.index,
7657
+ x=outlier_num.values,
7608
7658
  hue=outlier_num.index,
7609
7659
  palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
7610
7660
  ax=nexttile(),
7611
7661
  )
7612
- figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
7613
- ax_outlier_percentage = ax_outlier_num.twinx()
7614
- outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
7615
- if len(outlier_percentage) > max_cols:
7616
- outlier_percentage = outlier_percentage[:max_cols]
7617
- ax_outlier_percentage=sns.barplot(
7618
- x=outlier_percentage.index,
7619
- y=outlier_percentage.values,
7620
- hue=outlier_percentage.index,
7621
- palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
7622
- ax=ax2 #nexttile(),
7623
- )
7624
- figsets(
7625
- xangle=45,
7626
- ylabel="%",
7627
- xlabel=None,
7628
- ylim=[0, outlier_percentage.max() + 2],
7629
- ax=ax_outlier_percentage
7630
- )
7631
- ax2.tick_params(axis="y", color='r',labelcolor='r')
7632
- ax2.yaxis.label.set_color('r')
7662
+ figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
7663
+
7664
+ #!
7665
+ try:
7666
+ for col in data.select_dtypes(include='category').columns:
7667
+ sns.countplot(y=data[col],
7668
+ palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
7669
+ ax=nexttile())
7670
+ figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
7671
+ except Exception as e:
7672
+ pass
7633
7673
 
7634
7674
  # Skewness and Kurtosis Plots
7635
7675
  skewness = res_qc["skewness"].sort_values(ascending=False)
7636
7676
  kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
7637
7677
  if not skewness.empty:
7638
7678
  ax_skewness=sns.barplot(
7639
- x=skewness.index,
7640
- y=skewness.values,
7679
+ y=skewness.index,
7680
+ x=skewness.values,
7641
7681
  hue=skewness.index,
7642
7682
  palette=get_color(len(skewness), cmap="coolwarm")[::-1],
7643
7683
  ax=nexttile(),
7644
7684
  )
7645
7685
  figsets(
7646
- xangle=45,
7647
7686
  title="Highly Skewed Numeric Columns (Skewness > 1)",
7648
- ylabel="Skewness",xlabel=None,ax=ax_skewness
7687
+ xlabel="Skewness",ylabel=None,ax=ax_skewness,
7688
+ fontsize=8 if len(skewness)<=20 else 6
7649
7689
  )
7650
7690
  if not kurtosis.empty:
7651
7691
  ax_kurtosis=sns.barplot(
7652
- x=kurtosis.index,
7653
- y=kurtosis.values,
7692
+ y=kurtosis.index,
7693
+ x=kurtosis.values,
7654
7694
  hue=kurtosis.index,
7655
7695
  palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
7656
7696
  ax=nexttile(),
7657
7697
  )
7658
7698
  figsets(
7659
- xangle=45,
7660
7699
  title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
7661
- ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
7700
+ xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
7701
+ fontsize=8 if len(kurtosis)<=20 else 6
7662
7702
  )
7663
7703
 
7664
7704
  # Entropy for Categorical Variables
@@ -7666,56 +7706,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7666
7706
  ascending=False
7667
7707
  )
7668
7708
  ax_entropy_data=sns.barplot(
7669
- x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
7709
+ y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
7710
+ palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
7711
+ ax=nexttile()
7670
7712
  )
7671
7713
  figsets(
7672
- xangle=45,
7673
- xlabel="Categorical Columns",
7714
+ ylabel="Categorical Columns",
7674
7715
  title="Entropy of Categorical Variables",
7675
- ylabel="Entropy (bits)",
7676
- ax=ax_entropy_data
7677
- )
7678
- # Distribution Analysis: Boxplot for IQR
7679
- ax_iqr=sns.boxplot(
7680
- data=data[res_qc["distribution_analysis"].index],
7681
- orient="v",
7682
- palette="Set3",
7683
- ax=nexttile(),
7684
- )
7685
- figsets(
7686
- xangle=45,
7687
- title="Range for Numeric Columns",
7688
- ylabel="#",
7689
- ax=ax_iqr
7690
- )
7716
+ xlabel="Entropy (bits)",
7717
+ ax=ax_entropy_data,
7718
+ fontsize=8 if len(entropy_data)<=20 else 6
7719
+ )
7720
+
7691
7721
  # unique counts
7692
7722
  unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
7693
7723
  ax_unique_counts_=sns.barplot(
7694
- x=unique_counts.index,
7695
- y=unique_counts.values,
7724
+ y=unique_counts.index,
7725
+ x=unique_counts.values,
7696
7726
  hue=unique_counts.index,
7697
- palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
7727
+ palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
7698
7728
  ax=nexttile())
7699
7729
  figsets(
7700
- xangle=45,
7701
7730
  title="Unique Counts",
7702
- xlabel=None,
7703
- ylabel="#",
7704
- ax=ax_unique_counts_
7731
+ ylabel=None,
7732
+ xlabel="#",
7733
+ ax=ax_unique_counts_,
7734
+ fontsize=8 if len(unique_counts)<=20 else 6
7705
7735
  )
7706
7736
  # Binary Checking
7707
- ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
7708
- y=unique_counts[unique_counts<10].values,
7709
- hue=unique_counts[unique_counts<10].index,
7710
- palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
7737
+ ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
7738
+ x=unique_counts[unique_counts<8].values,
7739
+ hue=unique_counts[unique_counts<8].index,
7740
+ palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
7711
7741
  ax=nexttile())
7712
- plt.axhline(y=2, color="r", linestyle="--", lw=2)
7742
+ plt.axvline(x=2, color="r", linestyle="--", lw=2)
7713
7743
  figsets(
7714
- xangle=45,
7715
- xlabel=None,
7744
+ ylabel=None,
7716
7745
  title="Binary Checking",
7717
- ylabel="#",
7718
- ax=ax_unique_counts
7746
+ xlabel="#",
7747
+ ax=ax_unique_counts,
7748
+ fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
7719
7749
  )
7720
7750
 
7721
7751
  # dtypes counts
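Editor's note: most of the QC plots in these hunks are rotated from vertical to horizontal bars (x and y swapped, labels moved to the y axis), which keeps long column names readable without `xangle=45`. The seaborn idiom, sketched on a small Series:

```python
import pandas as pd
import seaborn as sns

counts = pd.Series({"a_rather_long_column_name": 12, "b": 5, "c": 3}).sort_values(ascending=False)
ax = sns.barplot(y=counts.index, x=counts.values, color="steelblue")  # horizontal bars
ax.set_xlabel("#")
ax.set_ylabel("")
```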
@@ -7751,14 +7781,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7751
7781
  ha="center",
7752
7782
  va="top",
7753
7783
  c="k",
7754
- fontsize=8,
7784
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
7755
7785
  rotation=0,
7756
7786
  )
7757
7787
  figsets(
7758
7788
  xlabel=None,
7759
7789
  title="Dtypes",
7760
7790
  ylabel="#",
7761
- ax=ax_dtype_counts
7791
+ ax=ax_dtype_counts,
7792
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
7762
7793
  )
7763
7794
 
7764
7795
  # High cardinality: Show top categorical columns by unique value count
@@ -7772,24 +7803,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7772
7803
 
7773
7804
  if high_cardinality:
7774
7805
  ax_high_cardinality=sns.barplot(
7775
- x=list(high_cardinality.keys()),
7776
- y=list(high_cardinality.values()),
7806
+ y=list(high_cardinality.keys()),
7807
+ x=list(high_cardinality.values()),
7777
7808
  hue=list(high_cardinality.keys()),
7778
- palette="Oranges", ax=nexttile()
7809
+ palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
7810
+ ax=nexttile(),
7779
7811
  )
7780
7812
  figsets(
7781
- xangle=45,
7782
7813
  title="High Cardinality Categorical Columns",
7783
- ylabel="Unique Value Count",
7784
- ax=ax_high_cardinality
7814
+ xlabel="Unique Value Count",
7815
+ ax=ax_high_cardinality,
7816
+ fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
7785
7817
  )
7786
7818
  if res_qc["low_variance_features"]:
7787
7819
  low_variance_data = data[res_qc["low_variance_features"]].copy()
7788
7820
  for col in low_variance_data.columns:
7789
- sns.histplot(
7821
+ ax_low_variance_features=sns.histplot(
7790
7822
  low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
7791
7823
  )
7792
- plt.title(f"Low Variance Feature: {col}")
7824
+ figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
7825
+ fontsize=8 if len(low_variance_data[col])<=20 else 6)
7793
7826
 
7794
7827
  # VIF plot for multicollinearity detection
7795
7828
  if "vif" in res_qc and not res_qc["vif"].empty:
@@ -7800,23 +7833,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7800
7833
  x="VIF",
7801
7834
  y="feature",
7802
7835
  hue="VIF",
7803
- palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
7836
+ palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
7804
7837
  ax=nexttile())
7805
7838
  figsets(
7806
- xangle=45,
7807
7839
  title="Variance Inflation Factor(VIF)",
7808
- xlabel="Variance Inflation Factor(VIF)",
7840
+ xlabel="VIF",
7809
7841
  ylabel="Features",
7810
7842
  legend=None,
7811
- ax=ax_vif
7843
+ ax=ax_vif,
7844
+ fontsize=8 if len(vif_data)<=20 else 6
7812
7845
  )
7813
7846
 
7814
7847
  # Correlation heatmap for numeric columns with high correlation pairs
7815
7848
  if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
7816
- corr = data.select_dtypes(include=[np.number]).dropna().corr()
7849
+ corr = data.select_dtypes(include=[np.number]).corr()
7817
7850
  if corr.shape[1]<=33:
7818
7851
  mask = np.triu(np.ones_like(corr, dtype=bool))
7819
- # Dynamically scale fontsize based on the number of columns
7820
7852
  num_columns = corr.shape[1]
7821
7853
  fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
7822
7854
 
@@ -7826,7 +7858,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7826
7858
  annot=True,
7827
7859
  cmap="coolwarm",
7828
7860
  center=0,
7829
- fmt=".2f",
7861
+ fmt=".1f",
7830
7862
  linewidths=0.5,
7831
7863
  vmin=-1, vmax=1,
7832
7864
  ax=nexttile(2, 2),
@@ -7839,7 +7871,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
7839
7871
  title="Correlation Heatmap",
7840
7872
  ax=ax_heatmap
7841
7873
  )
7874
+ # save figure
7875
+ if dir_save:
7876
+ figsave(dir_save,f"qc_plot_{now_}.pdf")
7842
7877
 
7878
+ if columns is not None:
7879
+ if isinstance(columns, (list,pd.core.indexes.base.Index)):
7880
+ data=data[columns]
7881
+ len_total = len(res_qc)
7882
+ n_row, n_col = int((len_total + 10) / 3), 3
7883
+ nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
7884
+ #! check distribution
7885
+ data_num = data.select_dtypes(include=np.number)
7886
+ if len(data_num) > max_cols:
7887
+ data_num = data_num.iloc[:,:max_cols]
7888
+
7889
+ data_num = df_scaler(data=data_num, method='standard')
7890
+
7891
+ import scipy.stats as stats
7892
+ for column in data_num.columns:
7893
+ #* Shapiro-Wilk test for normality
7894
+ stat, p_value = stats.shapiro(data_num[column])
7895
+ normality = "norm" if p_value > 0.05 else "not_norm"
7896
+ #* Plot histogram
7897
+ ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
7898
+ x_min, x_max = ax_hist.get_xlim()
7899
+ y_min, y_max = ax_hist.get_ylim()
7900
+ ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
7901
+ f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
7902
+ ha='center', va='top')
7903
+ figsets(title=column,ax=ax_hist)
7904
+ ax_twin=ax_hist.twinx()
7905
+ #* Q-Q plot
7906
+ stats.probplot(data_num[column], dist="norm", plot=ax_twin)
7907
+ figsets(ylabel=f'Q-Q Plot:{column}',title=None)
7908
+ # save figure
7909
+ if dir_save:
7910
+ figsave(dir_save,f"qq_plot_{now_}.pdf")
7843
7911
  def use_pd(
7844
7912
  func_name="excel",
7845
7913
  verbose=True,
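Editor's note: the new distribution panel pairs a histogram with a Shapiro-Wilk p-value and a Q-Q plot for each numeric column. The scipy calls it relies on, shown in isolation on toy data (hypothetical column):

```python
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
col = rng.normal(size=200)          # stand-in for one standardized numeric column

stat, p_value = stats.shapiro(col)  # H0: the sample is normally distributed
print(f"p(Shapiro-Wilk)={p_value:.3f}", "norm" if p_value > 0.05 else "not_norm")

fig, ax = plt.subplots()
stats.probplot(col, dist="norm", plot=ax)  # Q-Q plot against the normal distribution
ax.set_title("Q-Q plot")
```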