py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -16,6 +16,7 @@ import warnings
 
  warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
  warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+ warnings.filterwarnings("ignore")
 
 
  def run_once_within(duration=60,reverse=False): # default 60s
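Note: the added blanket warnings.filterwarnings("ignore") silences every warning process-wide once this module is imported, not only the two pandas categories filtered above. A minimal sketch (standard library only; the py2ls.ips import path is an assumption) of how a caller could restore normal warning behaviour afterwards:

import warnings
import py2ls.ips  # assumption: importing the module installs the blanket "ignore" filter

warnings.resetwarnings()          # drop all installed filters, including the blanket "ignore"
warnings.simplefilter("default")  # show each warning once per call location again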
@@ -541,8 +542,7 @@ def is_text(s):
 
  from typing import Any, Union
 
- 
- def shared(*args, strict=True, n_shared=2, verbose=True):
+ def share(*args, strict=True, n_shared=2, verbose=True):
  """
  check the shared elelements in two list.
  usage:
@@ -587,12 +587,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
  elements2show = (
  shared_elements if len(shared_elements) < 10 else shared_elements[:5]
  )
+ tail = '' if len(shared_elements) < 10 else '......'
+ elements2show.append(tail)
  print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
  print("********* checking shared elements *********")
  return shared_elements
 
+ def shared(*args, n_shared=None, verbose=True,**kwargs):
+ """
+ check the shared elelements in two list.
+ usage:
+ list1 = [1, 2, 3, 4, 5]
+ list2 = [4, 5, 6, 7, 8]
+ list3 = [5, 6, 9, 10]
+ a = shared(list1, list2,list3)
+ """
+ if verbose:
+ print("\n********* checking shared elements *********")
+
+ if len(args) == 1 and isinstance(args[0], list):
+ lists = args[0] # Unpack the single list
+ else:
+ lists = args # Use the provided arguments as lists
+ flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+ if n_shared is None:
+ n_shared = len(flattened_lists)
+ strict = True
+ else:
+ strict = False
+ # Ensure all arguments are lists
+ if any(not isinstance(lst, list) for lst in flattened_lists):
+ print(f"{' ' * 2}All inputs must be lists.")
+ return []
+ first_list = flattened_lists[0]
+ shared_elements = [
+ item for item in first_list if all(item in lst for lst in flattened_lists)
+ ]
+ if strict:
+ # Strict mode: require elements to be in all lists
+ shared_elements = set(flattened_lists[0])
+ for lst in flattened_lists[1:]:
+ shared_elements.intersection_update(lst)
+ else:
+ from collections import Counter
 
- def not_shared(*args, strict=True, n_shared=2, verbose=False):
+ all_elements = [item for sublist in flattened_lists for item in sublist]
+ element_count = Counter(all_elements)
+ # Get elements that appear in at least n_shared lists
+ shared_elements = [
+ item for item, count in element_count.items() if count >= n_shared
+ ]
+
+ shared_elements = flatten(shared_elements, verbose=verbose)
+ if verbose:
+ elements2show = (
+ shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+ )
+ print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+ print("********* checking shared elements *********")
+ return shared_elements
+
+ def share_not(*args, n_shared=None, verbose=False):
  """
  To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
  usage:
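Going by the docstring and the n_shared handling added above, the reworked shared() is intended to be called as below (a sketch, not output captured from the package):

from py2ls.ips import shared  # assumption: the helper is importable from py2ls.ips

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

shared(list1, list2, list3)              # n_shared defaults to the number of lists: strict intersection, roughly [5]
shared(list1, list2, list3, n_shared=2)  # elements appearing in at least two of the lists, roughly [4, 5, 6]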
@@ -600,7 +656,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
  list2 = [4, 5, 6, 7, 8]
  not_shared(list1,list2)# output [1,3]
  """
- _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
+ list1 = flatten(args[0], verbose=verbose)
+ _not_shared = [item for item in list1 if item not in _common]
+ return _not_shared
+ def not_shared(*args, n_shared=None, verbose=False):
+ """
+ To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+ usage:
+ list1 = [1, 8, 3, 3, 4, 5]
+ list2 = [4, 5, 6, 7, 8]
+ not_shared(list1,list2)# output [1,3]
+ """
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
  list1 = flatten(args[0], verbose=verbose)
  _not_shared = [item for item in list1 if item not in _common]
  return _not_shared
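share_not() and not_shared() share the same body: both call shared() and then filter the first list. One detail worth noting is that the list comprehension keeps duplicates, so the docstring's expected [1, 3] would actually come back as [1, 3, 3] for the example input. A sketch:

list1 = [1, 8, 3, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]

not_shared(list1, list2)  # items of list1 missing from list2; duplicates preserved -> [1, 3, 3]
share_not(list1, list2)   # identical logic under the new name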
@@ -1983,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
 
  def load_csv(fpath, **kwargs):
  from pandas.errors import EmptyDataError
- 
  engine = kwargs.pop("engine", "pyarrow")# default: None
  sep = kwargs.pop("sep", None)# default: ','
  index_col = kwargs.pop("index_col", None)# default: None
@@ -1994,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
  comment = kwargs.pop("comment", None)# default: None
  fmt = kwargs.pop("fmt", False)# default:
  chunksize = kwargs.pop("chunksize", None)# default: None
+
+ #check filesize
+ f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
+ if f_size>=50: #50 MB
+ if chunksize is None:
+ chunksize = 5000
+ print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
  engine = "c" if chunksize else engine # when chunksize, recommend 'c'
  low_memory = kwargs.pop("low_memory", True)# default: True
  low_memory = (
  False if chunksize else True
  ) # when chunksize, recommend low_memory=False # default:
  verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_csv", verbose=verbose)
 
  if comment is None:# default: None
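The new block turns on chunked reading automatically for files of 50 MB or more: chunksize falls back to 5000, which further down also forces engine="c" and low_memory=False. A rough plain-pandas equivalent of what load_csv now does (the file name is hypothetical):

import os
import pandas as pd

fpath = "big_table.csv"  # hypothetical file
f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)  # size in MB, mirroring load_csv
if f_size >= 50:
    reader = pd.read_csv(fpath, chunksize=5000, engine="c", low_memory=False)
    df = pd.concat(reader, ignore_index=True)  # the chunks still have to be iterated or concatenated
else:
    df = pd.read_csv(fpath)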
@@ -2176,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
  def load_excel(fpath, **kwargs):
  engine = kwargs.get("engine", "openpyxl")
  verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_excel", verbose=verbose)
  df = pd.read_excel(fpath, engine=engine, **kwargs)
  try:
@@ -2206,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
  engine = kwargs.get("engine", "pyarrow")
  verbose = kwargs.pop("verbose", False)
 
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_parquet", verbose=verbose)
  try:
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
  return load_xml(fpath)
  elif kind in ["csv", "tsv"]:
  # verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_csv")
  content = load_csv(fpath, **kwargs)
  return content
  elif kind == "pkl":
  verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_pickle")
  return pd.read_pickle(fpath, **kwargs)
  elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
  return load_ipynb(fpath, **kwargs)
  elif kind in ["parquet", "snappy"]:
  verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_parquet")
  return load_parquet(fpath, **kwargs)
  elif kind == "feather":
  verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("read_feather")
  content = pd.read_feather(fpath, **kwargs)
  return content
@@ -2684,7 +2758,7 @@ def fsave(
  # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
  verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("to_csv", verbose=verbose)
  kwargs_csv = dict(
  path_or_buf=None,
@@ -2716,7 +2790,7 @@ def fsave(
  def save_xlsx(fpath, data, **kwargs):
  verbose = kwargs.pop("verbose", False)
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
- if run_once_within():
+ if run_once_within(reverse=True):
  use_pd("to_excel", verbose=verbose)
  if any(kwargs):
  format_excel(df=data, filename=fpath, **kwargs)
@@ -5911,6 +5985,9 @@ def df_scaler(
  scaler=None,
  method="standard",
  columns=None, # default, select all numeric col/row
+ feature_range=None,# specific for 'minmax'
+ vmin=0,
+ vmax=1,
  inplace=False,
  verbose=False, # show usage
  axis=0, # defalut column-wise
@@ -5943,11 +6020,13 @@ def df_scaler(
  scaler = StandardScaler(**kwargs)
  elif method == "minmax":
  from sklearn.preprocessing import MinMaxScaler
+ if feature_range is None:
+ feature_range=(vmin,vmax)
  if verbose:
  print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
  print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
  print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
- scaler = MinMaxScaler(**kwargs)
+ scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
  elif method == "robust":
  from sklearn.preprocessing import RobustScaler
  if verbose:
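With these additions, minmax scaling can be configured either through the new vmin/vmax arguments or by passing feature_range directly; vmin and vmax are only consulted when feature_range is left at None. A call sketch (the df variable and the data= keyword are assumptions based on the surrounding signature):

df_scaled = df_scaler(data=df, method="minmax", vmin=-1, vmax=1)        # becomes feature_range=(-1, 1)
df_scaled = df_scaler(data=df, method="minmax", feature_range=(-1, 1))  # equivalent, passing the range explicitly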
@@ -6035,15 +6114,20 @@ def df_special_characters_cleaner(
 
  # 1. Clean column names by replacing special characters with underscores
  if "column" in where_:
- data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+ try:
+ data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+ except Exception as e:
+ print(e)
 
  # 2. Clean only object-type columns (text columns)
- if "content" in where_:
- for col in data.select_dtypes(include=["object"]).columns:
- data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
- if data.index.dtype == "object" and index in where_:
- data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
-
+ try:
+ if "content" in where_:
+ for col in data.select_dtypes(include=["object"]).columns:
+ data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+ if data.index.dtype == "object" and index in where_:
+ data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+ except:
+ pass
  return data
 
 
@@ -6426,6 +6510,9 @@ def df_reducer(
  # "autoencoder","nmf",
  ]
  method = strcmp(method, methods)[0]
+ if run_once_within(reverse=True):
+ print(f"support methods:{methods}")
+
  if verbose:
  print(f"\nprocessing with using {dict_methods[method]}:")
  xlabel, ylabel = None, None
@@ -6433,16 +6520,20 @@ def df_reducer(
  columns = data.select_dtypes(include="number").columns.tolist()
  if hue is None:
  hue = data.select_dtypes(exclude="number").columns.tolist()
+ print(f"auto select the non-number as 'hue':{hue}")
  if isinstance(hue, list):
  print("Warning: hue is a list, only select the 1st one")
  hue = hue[0]
- if not hue:
+ if not any(hue):
  # Select columns if specified, else use all columns
  X = data[columns].values if columns else data.values
  else:
  # Select columns to reduce and hue for LDA
- X = data[columns].values if columns else data.drop(columns=[hue]).values
- y = data[hue].values
+ try:
+ X = data[columns].values if columns else data.drop(columns=[hue]).values
+ y = data[hue].values
+ except:
+ pass
  print(X.shape)
  # Handle missing values
  if fill_missing:
@@ -6909,33 +7000,49 @@ def df_reducer(
  colname_met = "SVD_"
  # Quick plots
  if plot_ and (not method in ["isolation_forest"]):
- from .plot import plotxy
- if ax is None:
- if figsize is None:
- _, ax = plt.subplots(figsize=cm2inch(8, 8))
- else:
- _, ax = plt.subplots(figsize=figsize)
- else:
- ax = ax.cla()
+ from .plot import plotxy,figsets,get_color
+ # if ax is None:
+ # if figsize is None:
+ # _, ax = plt.subplots(figsize=cm2inch(8, 8))
+ # else:
+ # _, ax = plt.subplots(figsize=figsize)
+ # else:
+ # ax = ax.cla()
  xlabel = f"{colname_met}1" if xlabel is None else xlabel
  ylabel = f"{colname_met}2" if ylabel is None else ylabel
+ palette=get_color(len(flatten(data[hue],verbose=0)))
+
+ reduced_df=reduced_df.sort_values(by=hue)
+ print(flatten(reduced_df[hue]))
  ax = plotxy(
  data=reduced_df,
  x=colname_met + "1",
  y=colname_met + "2",
  hue=hue,
- s=size,
+ palette=palette,
+ # size=size,
  edgecolor=edgecolor,
- kind_="scater",
- figsets=dict(
- legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
- xlabel=xlabel if xlabel else None,
- ylabel=ylabel if ylabel else None,
- ),
- ax=ax,
+ kind_=["joint",
+ # "kde",
+ "ell",
+ ],
+ kws_kde=dict(
+ hue=hue,
+ levels=2,
+ common_norm=False,
+ fill=True,
+ alpha=0.05,
+ ),
+ kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
+ kws_ellipse=dict(alpha=0.1,lw=1,label=None),
  verbose=False,
  **kwargs,
  )
+ figsets(
+ legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
+ xlabel=xlabel if xlabel else None,
+ ylabel=ylabel if ylabel else None,
+ )
 
  if inplace:
  # If inplace=True, add components back into the original data
@@ -7412,6 +7519,7 @@ def df_qc(
  from statsmodels.stats.outliers_influence import variance_inflation_factor
  from scipy.stats import skew, kurtosis, entropy
 
+ pd.options.display.max_seq_items = 10
  #! display(data.select_dtypes(include=[np.number]).describe())
  #!skim
  if columns is not None:
@@ -7428,16 +7536,18 @@ def df_qc(
  data = data.copy()
  data.loc[:, data.isna().all()] = 0
  res_qc = {}
- print(f"data.shape:{data.shape}")
+ print(f"data.shape:{data.shape}\n⤵ data.sample(10):")
+ display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
  # Missing values
  res_qc["missing_values"] = data.isnull().sum()
- res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+ res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
  res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
  # Data types and unique values
  res_qc["data_types"] = data.dtypes
- res_qc["unique_values"] = data.nunique()
+ res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+ res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
  res_qc["constant_columns"] = [
  col for col in data.columns if data[col].nunique() <= 1
  ]
@@ -7453,33 +7563,42 @@ def df_qc(
  data_outliers = df_outlier(data)
  outlier_num = data_outliers.isna().sum() - data.isnull().sum()
  res_qc["outlier_num"] = outlier_num[outlier_num > 0]
- outlier_percentage=(outlier_num / len(data_outliers)) * 100
+ outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
  res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
- # Correlation and multicollinearity (VIF)
- if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
- numeric_df = data.select_dtypes(include=[np.number]).dropna()
- corr_matrix = numeric_df.corr()
- high_corr_pairs = [
- (col1, col2)
- for col1 in corr_matrix.columns
- for col2 in corr_matrix.columns
- if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
- ]
- res_qc["high_correlations"] = high_corr_pairs
-
- # VIF for multicollinearity check
- numeric_df = data.select_dtypes(include=[np.number]).dropna()
- vif_data = pd.DataFrame()
- res_qc["vif"]=vif_data
- if numeric_df.shape[1] > 1 and not numeric_df.empty:
- vif_data["feature"] = numeric_df.columns
- vif_data["VIF"] = [
- variance_inflation_factor(numeric_df.values, i)
- for i in range(numeric_df.shape[1])
+ try:
+ # Correlation and multicollinearity (VIF)
+ if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+ numeric_df = data.select_dtypes(include=[np.number]).dropna()
+ corr_matrix = numeric_df.corr()
+ high_corr_pairs = [
+ (col1, col2)
+ for col1 in corr_matrix.columns
+ for col2 in corr_matrix.columns
+ if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
  ]
- res_qc["vif"] = vif_data[
- vif_data["VIF"] > 5
- ] # Typically VIF > 5 indicates multicollinearity
+ res_qc["high_correlations"] = high_corr_pairs
+
+ # VIF for multicollinearity check
+ numeric_df = data.select_dtypes(include=[np.number]).dropna()
+ if isinstance(numeric_df.columns, pd.MultiIndex):
+ numeric_df.columns = [
+ "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+ ]
+
+
+ vif_data = pd.DataFrame()
+ res_qc["vif"]=vif_data
+ if numeric_df.shape[1] > 1 and not numeric_df.empty:
+ vif_data["feature"] = numeric_df.columns.tolist()
+ vif_data["VIF"] = [
+ round(variance_inflation_factor(numeric_df.values, i),2)
+ for i in range(numeric_df.shape[1])
+ ]
+ res_qc["vif"] = vif_data[
+ vif_data["VIF"] > 5
+ ] # Typically VIF > 5 indicates multicollinearity
+ except Exception as e:
+ print(e)
  # Skewness and Kurtosis
  skewness = data.skew(numeric_only=True)
  kurtosis_vals = data.kurt(numeric_only=True)
@@ -7492,8 +7611,7 @@ def df_qc(
  col: entropy(data[col].value_counts(normalize=True), base=2)
  for col in categorical_cols
  }
- # number of unique
- res_qc["unique_counts"] = data.nunique()
+
  # dtypes counts
  res_qc['dtype_counts']=data.dtypes.value_counts()
 
@@ -7540,7 +7658,7 @@ def df_qc(
  res_qc["text_length_analysis"] = text_lengths
 
  # Summary statistics
- res_qc["summary_statistics"] = data.describe().T
+ res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
  # Automated warnings
  warnings = []
@@ -7562,28 +7680,45 @@ def df_qc(
 
  # Report generation
  if verbose:
- print("=== QC Report Summary ===")
  print("\n⤵ Summary Statistics:")
  display(res_qc["summary_statistics"])
  print("\n⤵ Data Types:")
  display(res_qc["data_types"])
  if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
  print(" ⤵ Missing Values Counts:")
- display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+ display(pd.DataFrame(
+ {
+ "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+ "missing_percent(%)": res_qc["missing_percentage"][
+ res_qc["missing_percentage"] > 0
+ ],
+ }
+ ).style.background_gradient(cmap="coolwarm", axis=0)
+ )
  # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
  print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
 
+ print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+ print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+ print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
  if any(res_qc["outlier_num"]):
  print("\n⤵ Outlier Report:")
- display(res_qc["outlier_num"])
- if any(res_qc["unique_values"]):
+ display(pd.DataFrame(
+ {
+ "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+ "outlier_percentage(%)": res_qc["outlier_percentage"][
+ res_qc["outlier_percentage"] > 0
+ ],
+ }
+ ).style.background_gradient(cmap="coolwarm", axis=0)
+ )
+
+ if any(res_qc["unique_counts"]):
  print("\n⤵ Unique Values per Column:")
- display(res_qc["unique_values"])
+ display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
+ "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
- print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-
- print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
- print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
  if res_qc["empty_columns"]:
  print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7595,7 +7730,7 @@ def df_qc(
 
  if "vif" in res_qc:
  print("\n⤵ Features with High VIF (>|5|):")
- print(res_qc["vif"])
+ display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
  if any(res_qc["high_cardinality_categoricals"]):
  print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +7749,8 @@ def df_qc(
  print("\nWarnings:")
  for warning in res_qc["warnings"]:
  print(" -", warning)
+
+ pd.reset_option("display.max_seq_items")
  if plot_:
  df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
  if output or not plot_:
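Taken together, the df_qc changes are mostly about reporting: percentages are rounded to two decimals, the unique-value statistics are restricted to non-numeric columns, the correlation/VIF block is wrapped in try/except, and the tables are rendered with a background gradient. A hedged usage sketch (the first positional argument and the return value are assumptions; only verbose, plot_, hue, max_cols and dir_save appear in the hunks above):

res_qc = df_qc(df, verbose=True, plot_=False)  # printed report plus, presumably, the res_qc dict
res_qc["missing_percentage"]                   # now rounded to 2 decimals
res_qc["unique_counts"]                        # non-numeric columns only, sorted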
@@ -7632,7 +7769,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
  if isinstance(columns, (list,pd.core.indexes.base.Index)):
  data=data[columns]
  len_total = len(res_qc)
- n_row, n_col = int((len_total + 10) / 3), 3
+ n_row, n_col = int((len_total + 10)), 3
  nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 
  missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
  title="Dtypes",
  ylabel="#",
  ax=ax_dtype_counts,
- fontsize=8 if len(dtype_counts.index)<=20 else 6,
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
  )
+ # from .plot import pie
+ # pie()
 
  # High cardinality: Show top categorical columns by unique value count
  high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8010,17 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
  title="Correlation Heatmap",
  ax=ax_heatmap
  )
- # save figure
- if dir_save:
- figsave(dir_save,f"qc_plot_{now_}.pdf")
+ # # save figure
+ # if dir_save:
+ # figsave(dir_save,f"qc_plot_{now_}.pdf")
 
  if columns is not None:
  if isinstance(columns, (list,pd.core.indexes.base.Index)):
  data=data[columns]
- len_total = len(res_qc)
- n_row, n_col = int((len_total + 10) / 3), 3
- nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+ # len_total = len(res_qc)
+ # n_row, n_col = int((len_total + 10) / 3), 3
+ # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
  #! check distribution
  data_num = data.select_dtypes(include=np.number)
  if len(data_num) > max_cols:
@@ -7907,7 +8047,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
  figsets(ylabel=f'Q-Q Plot:{column}',title=None)
  # save figure
  if dir_save:
- figsave(dir_save,f"qq_plot_{now_}.pdf")
+ figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+ def df_corr(df: pd.DataFrame, method="pearson"):
+ """
+ Compute correlation coefficients and p-values for a DataFrame.
+
+ Parameters:
+ - df (pd.DataFrame): Input DataFrame with numeric data.
+ - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+ Returns:
+ - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+ - pval_matrix (pd.DataFrame): P-value matrix.
+ """
+ from scipy.stats import pearsonr, spearmanr, kendalltau
+
+ methods = ["pearson", "spearman", "kendall"]
+ method = strcmp(method, methods)[0]
+ methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+ cols = df.columns
+ corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+ pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+ correlation_func = methods_dict[method]
+
+ for col1 in cols:
+ for col2 in cols:
+ if col1 == col2:
+ corr_matrix.loc[col1, col2] = 1.0
+ pval_matrix.loc[col1, col2] = 0.0
+ else:
+ corr, pval = correlation_func(df[col1], df[col2])
+ corr_matrix.loc[col1, col2] = corr
+ pval_matrix.loc[col1, col2] = pval
+
+ return corr_matrix, pval_matrix
+
  def use_pd(
  func_name="excel",
  verbose=True,
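df_corr is new in this release; unlike DataFrame.corr() it returns a matching matrix of p-values alongside the coefficients. A usage sketch (the example frame is hypothetical):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(30, 3), columns=["a", "b", "c"])  # hypothetical numeric data
corr_matrix, pval_matrix = df_corr(df, method="spearman")  # method is fuzzy-matched through strcmp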
@@ -7927,3 +8103,135 @@ def use_pd(
  except Exception as e:
  if verbose:
  print(e)
+
+ def get_phone(phone_number: str, region: str = None,verbose=True):
+ """
+ usage:
+ info = get_phone(15237654321, "DE")
+ preview(info)
+
+ Extremely advanced phone number analysis function.
+
+ Args:
+ phone_number (str): The phone number to analyze.
+ region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+ Returns:
+ dict: Comprehensive information about the phone number.
+ """
+ import phonenumbers
+ from phonenumbers import geocoder, carrier, timezone, number_type
+ from datetime import datetime
+ import pytz
+ from tzlocal import get_localzone
+
+ if not isinstance(phone_number, str):
+ phone_number = str(phone_number)
+ if isinstance(region, str):
+ region = region.upper()
+
+ try:
+ # Parse the phone number
+ parsed_number = phonenumbers.parse(phone_number, region)
+
+ # Validate the phone number
+ valid = phonenumbers.is_valid_number(parsed_number)
+ possible = phonenumbers.is_possible_number(parsed_number)
+
+ if not valid:
+ suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+ return {
+ "valid": False,
+ "error": "Invalid phone number",
+ "suggested_fix": suggested_fix,
+ }
+
+ # Basic details
+ formatted_international = phonenumbers.format_number(
+ parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+ )
+ formatted_national = phonenumbers.format_number(
+ parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+ )
+ formatted_e164 = phonenumbers.format_number(
+ parsed_number, phonenumbers.PhoneNumberFormat.E164
+ )
+ country_code = parsed_number.country_code
+ region_code = geocoder.region_code_for_number(parsed_number)
+ country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+ location = geocoder.description_for_number(parsed_number, "en")
+ carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+ time_zones = timezone.time_zones_for_number(parsed_number)[0]
+ current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+ "%Y-%m-%d %H:%M:%S %Z"
+ )
+ number_type_str = {
+ phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+ phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+ phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+ phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+ phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+ phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+ phonenumbers.PhoneNumberType.VOIP: "VOIP",
+ phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+ phonenumbers.PhoneNumberType.PAGER: "Pager",
+ phonenumbers.PhoneNumberType.UAN: "UAN",
+ phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+ }.get(number_type(parsed_number), "Unknown")
+
+ # Advanced Features
+ is_toll_free = (
+ number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+ )
+ is_premium_rate = (
+ number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+ )
+
+ # Dialing Information
+ dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+ # Advanced Timezone Handling
+ gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
+ # Get the local timezone (current computer's time)
+ local_timezone = get_localzone()
+ #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+ local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+ offset_diff = local_offset - gmt_offsets
+ head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+ res= {
+ "valid": True,
+ "possible": possible,
+ "formatted": {
+ "international": formatted_international,
+ "national": formatted_national,
+ "e164": formatted_e164,
+ },
+ "country_code": country_code,
+ "country_name": country_name,
+ "region_code": region_code,
+ "location": location if location else "Unknown",
+ "carrier": carrier_name,
+ "time_zone": time_zones,
+ "current_times": current_times,
+ "local_offset":f"{local_offset} utcoffset",
+ "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+ "number_type": number_type_str,
+ "is_toll_free": is_toll_free,
+ "is_premium_rate": is_premium_rate,
+ "dialing_instructions": dialing_instructions,
+ "suggested_fix": None, # Use phonenumbers.example_number if invalid
+ "logs": {
+ "number_analysis_completed": datetime.now().strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ "raw_input": phone_number,
+ "parsed_number": str(parsed_number),
+ },
+ }
+
+ except phonenumbers.NumberParseException as e:
+ res= {"valid": False, "error": str(e)}
+ if verbose:
+ preview(res)
+ return res
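get_phone is also new; it wraps the phonenumbers package (plus pytz and tzlocal) and returns a dict instead of raising on bad input. A sketch based on the docstring and the keys assembled above:

info = get_phone(15237654321, "DE")  # the region is upper-cased and the number coerced to str internally
if info["valid"]:
    print(info["formatted"]["e164"], info["carrier"], info["time_zone"])
else:
    print(info["error"])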