py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -16,6 +16,7 @@ import warnings
 
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+ warnings.filterwarnings("ignore")
 
 
 def run_once_within(duration=60,reverse=False): # default 60s
@@ -541,8 +542,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
- def shared(*args, strict=True, n_shared=2, verbose=True):
+ def share(*args, strict=True, n_shared=2, verbose=True):
 """
 check the shared elelements in two list.
 usage:
@@ -587,12 +587,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
 elements2show = (
 shared_elements if len(shared_elements) < 10 else shared_elements[:5]
 )
+ tail = '' if len(shared_elements) < 10 else '......'
+ elements2show.append(tail)
 print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
 print("********* checking shared elements *********")
 return shared_elements
 
+ def shared(*args, n_shared=None, verbose=True,**kwargs):
+ """
+ check the shared elelements in two list.
+ usage:
+ list1 = [1, 2, 3, 4, 5]
+ list2 = [4, 5, 6, 7, 8]
+ list3 = [5, 6, 9, 10]
+ a = shared(list1, list2,list3)
+ """
+ if verbose:
+ print("\n********* checking shared elements *********")
+
+ if len(args) == 1 and isinstance(args[0], list):
+ lists = args[0] # Unpack the single list
+ else:
+ lists = args # Use the provided arguments as lists
+ flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+ if n_shared is None:
+ n_shared = len(flattened_lists)
+ strict = True
+ else:
+ strict = False
+ # Ensure all arguments are lists
+ if any(not isinstance(lst, list) for lst in flattened_lists):
+ print(f"{' ' * 2}All inputs must be lists.")
+ return []
+ first_list = flattened_lists[0]
+ shared_elements = [
+ item for item in first_list if all(item in lst for lst in flattened_lists)
+ ]
+ if strict:
+ # Strict mode: require elements to be in all lists
+ shared_elements = set(flattened_lists[0])
+ for lst in flattened_lists[1:]:
+ shared_elements.intersection_update(lst)
+ else:
+ from collections import Counter
 
- def not_shared(*args, strict=True, n_shared=2, verbose=False):
+ all_elements = [item for sublist in flattened_lists for item in sublist]
+ element_count = Counter(all_elements)
+ # Get elements that appear in at least n_shared lists
+ shared_elements = [
+ item for item, count in element_count.items() if count >= n_shared
+ ]
+
+ shared_elements = flatten(shared_elements, verbose=verbose)
+ if verbose:
+ elements2show = (
+ shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+ )
+ print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+ print("********* checking shared elements *********")
+ return shared_elements
+
+ def share_not(*args, n_shared=None, verbose=False):
 """
 To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
 usage:
@@ -600,7 +656,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
 list2 = [4, 5, 6, 7, 8]
 not_shared(list1,list2)# output [1,3]
 """
- _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
+ list1 = flatten(args[0], verbose=verbose)
+ _not_shared = [item for item in list1 if item not in _common]
+ return _not_shared
+ def not_shared(*args, n_shared=None, verbose=False):
+ """
+ To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+ usage:
+ list1 = [1, 8, 3, 3, 4, 5]
+ list2 = [4, 5, 6, 7, 8]
+ not_shared(list1,list2)# output [1,3]
+ """
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
 list1 = flatten(args[0], verbose=verbose)
 _not_shared = [item for item in list1 if item not in _common]
 return _not_shared
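
A quick usage sketch of the reworked helpers above, following the docstrings in this hunk (the import path py2ls.ips and the exact ordering of the returned list are assumptions, not verified against the release):

from py2ls.ips import shared, not_shared  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]
shared(list1, list2, list3)              # strict mode (n_shared defaults to the number of lists): expected [5]
shared(list1, list2, list3, n_shared=2)  # elements appearing in at least 2 of the lists: expected 4, 5, 6
not_shared(list1, list2)                 # elements of list1 absent from the shared set: expected [1, 2, 3]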
@@ -1983,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
 
 def load_csv(fpath, **kwargs):
 from pandas.errors import EmptyDataError
-
 engine = kwargs.pop("engine", "pyarrow")# default: None
 sep = kwargs.pop("sep", None)# default: ','
 index_col = kwargs.pop("index_col", None)# default: None
@@ -1994,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
 comment = kwargs.pop("comment", None)# default: None
 fmt = kwargs.pop("fmt", False)# default:
 chunksize = kwargs.pop("chunksize", None)# default: None
+
+ #check filesize
+ f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
+ if f_size>=50: #50 MB
+ if chunksize is None:
+ chunksize = 5000
+ print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
 engine = "c" if chunksize else engine # when chunksize, recommend 'c'
 low_memory = kwargs.pop("low_memory", True)# default: True
 low_memory = (
 False if chunksize else True
 ) # when chunksize, recommend low_memory=False # default:
 verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_csv", verbose=verbose)
 
 if comment is None:# default: None
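
For context, the new size check only picks a chunksize; the chunked read itself is plain pandas behaviour. A minimal sketch (not the py2ls wrapper): with a chunksize, pd.read_csv returns an iterator of DataFrames that can be concatenated back into one frame:

import pandas as pd

# "big_file.csv" is a placeholder path
reader = pd.read_csv("big_file.csv", chunksize=5000, engine="c", low_memory=False)
df = pd.concat(reader, ignore_index=True)  # reassemble the 5000-row chunks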
@@ -2176,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
 def load_excel(fpath, **kwargs):
 engine = kwargs.get("engine", "openpyxl")
 verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_excel", verbose=verbose)
 df = pd.read_excel(fpath, engine=engine, **kwargs)
 try:
@@ -2206,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
 engine = kwargs.get("engine", "pyarrow")
 verbose = kwargs.pop("verbose", False)
 
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_parquet", verbose=verbose)
 try:
 df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
 return load_xml(fpath)
 elif kind in ["csv", "tsv"]:
 # verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_csv")
 content = load_csv(fpath, **kwargs)
 return content
 elif kind == "pkl":
 verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_pickle")
 return pd.read_pickle(fpath, **kwargs)
 elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
 return load_ipynb(fpath, **kwargs)
 elif kind in ["parquet", "snappy"]:
 verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_parquet")
 return load_parquet(fpath, **kwargs)
 elif kind == "feather":
 verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("read_feather")
 content = pd.read_feather(fpath, **kwargs)
 return content
@@ -2684,7 +2758,7 @@ def fsave(
 # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
 verbose = kwargs.pop("verbose", False)
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("to_csv", verbose=verbose)
 kwargs_csv = dict(
 path_or_buf=None,
@@ -2716,7 +2790,7 @@ def fsave(
 def save_xlsx(fpath, data, **kwargs):
 verbose = kwargs.pop("verbose", False)
 sheet_name = kwargs.pop("sheet_name", "Sheet1")
- if run_once_within():
+ if run_once_within(reverse=True):
 use_pd("to_excel", verbose=verbose)
 if any(kwargs):
 format_excel(df=data, filename=fpath, **kwargs)
@@ -5911,6 +5985,9 @@ def df_scaler(
 scaler=None,
 method="standard",
 columns=None, # default, select all numeric col/row
+ feature_range=None,# specific for 'minmax'
+ vmin=0,
+ vmax=1,
 inplace=False,
 verbose=False, # show usage
 axis=0, # defalut column-wise
@@ -5943,11 +6020,13 @@ def df_scaler(
 scaler = StandardScaler(**kwargs)
 elif method == "minmax":
 from sklearn.preprocessing import MinMaxScaler
+ if feature_range is None:
+ feature_range=(vmin,vmax)
 if verbose:
 print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
 print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
 print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
- scaler = MinMaxScaler(**kwargs)
+ scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
 elif method == "robust":
 from sklearn.preprocessing import RobustScaler
 if verbose:
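
The new vmin/vmax arguments are simply folded into scikit-learn's feature_range when none is given explicitly. A minimal standalone sketch of the equivalent call (plain scikit-learn, not the df_scaler wrapper itself):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0], [5.0], [10.0]])
scaler = MinMaxScaler(feature_range=(-1, 1))  # i.e. vmin=-1, vmax=1
X_scaled = scaler.fit_transform(X)            # column rescaled onto the [-1, 1] interval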
@@ -6035,15 +6114,20 @@ def df_special_characters_cleaner(
 
 # 1. Clean column names by replacing special characters with underscores
 if "column" in where_:
- data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+ try:
+ data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+ except Exception as e:
+ print(e)
 
 # 2. Clean only object-type columns (text columns)
- if "content" in where_:
- for col in data.select_dtypes(include=["object"]).columns:
- data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
- if data.index.dtype == "object" and index in where_:
- data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
-
+ try:
+ if "content" in where_:
+ for col in data.select_dtypes(include=["object"]).columns:
+ data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+ if data.index.dtype == "object" and index in where_:
+ data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+ except:
+ pass
 return data
 
 
@@ -6426,6 +6510,9 @@ def df_reducer(
 # "autoencoder","nmf",
 ]
 method = strcmp(method, methods)[0]
+ if run_once_within(reverse=True):
+ print(f"support methods:{methods}")
+
 if verbose:
 print(f"\nprocessing with using {dict_methods[method]}:")
 xlabel, ylabel = None, None
@@ -6433,16 +6520,20 @@ def df_reducer(
 columns = data.select_dtypes(include="number").columns.tolist()
 if hue is None:
 hue = data.select_dtypes(exclude="number").columns.tolist()
+ print(f"auto select the non-number as 'hue':{hue}")
 if isinstance(hue, list):
 print("Warning: hue is a list, only select the 1st one")
 hue = hue[0]
- if not hue:
+ if not any(hue):
 # Select columns if specified, else use all columns
 X = data[columns].values if columns else data.values
 else:
 # Select columns to reduce and hue for LDA
- X = data[columns].values if columns else data.drop(columns=[hue]).values
- y = data[hue].values
+ try:
+ X = data[columns].values if columns else data.drop(columns=[hue]).values
+ y = data[hue].values
+ except:
+ pass
 print(X.shape)
 # Handle missing values
 if fill_missing:
@@ -6909,33 +7000,49 @@ def df_reducer(
 colname_met = "SVD_"
 # Quick plots
 if plot_ and (not method in ["isolation_forest"]):
- from .plot import plotxy
- if ax is None:
- if figsize is None:
- _, ax = plt.subplots(figsize=cm2inch(8, 8))
- else:
- _, ax = plt.subplots(figsize=figsize)
- else:
- ax = ax.cla()
+ from .plot import plotxy,figsets,get_color
+ # if ax is None:
+ # if figsize is None:
+ # _, ax = plt.subplots(figsize=cm2inch(8, 8))
+ # else:
+ # _, ax = plt.subplots(figsize=figsize)
+ # else:
+ # ax = ax.cla()
 xlabel = f"{colname_met}1" if xlabel is None else xlabel
 ylabel = f"{colname_met}2" if ylabel is None else ylabel
+ palette=get_color(len(flatten(data[hue],verbose=0)))
+
+ reduced_df=reduced_df.sort_values(by=hue)
+ print(flatten(reduced_df[hue]))
 ax = plotxy(
 data=reduced_df,
 x=colname_met + "1",
 y=colname_met + "2",
 hue=hue,
- s=size,
+ palette=palette,
+ # size=size,
 edgecolor=edgecolor,
- kind_="scater",
- figsets=dict(
- legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
- xlabel=xlabel if xlabel else None,
- ylabel=ylabel if ylabel else None,
- ),
- ax=ax,
+ kind_=["joint",
+ # "kde",
+ "ell",
+ ],
+ kws_kde=dict(
+ hue=hue,
+ levels=2,
+ common_norm=False,
+ fill=True,
+ alpha=0.05,
+ ),
+ kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
+ kws_ellipse=dict(alpha=0.1,lw=1,label=None),
 verbose=False,
 **kwargs,
 )
+ figsets(
+ legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
+ xlabel=xlabel if xlabel else None,
+ ylabel=ylabel if ylabel else None,
+ )
 
 if inplace:
 # If inplace=True, add components back into the original data
@@ -7412,6 +7519,7 @@ def df_qc(
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from scipy.stats import skew, kurtosis, entropy
 
+ pd.options.display.max_seq_items = 10
 #! display(data.select_dtypes(include=[np.number]).describe())
 #!skim
 if columns is not None:
@@ -7428,16 +7536,18 @@ def df_qc(
 data = data.copy()
 data.loc[:, data.isna().all()] = 0
 res_qc = {}
- print(f"data.shape:{data.shape}")
+ print(f"data.shape:{data.shape}\n⤵ data.sample(10):")
+ display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
 # Missing values
 res_qc["missing_values"] = data.isnull().sum()
- res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+ res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
 res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
 # Data types and unique values
 res_qc["data_types"] = data.dtypes
- res_qc["unique_values"] = data.nunique()
+ res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+ res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
 res_qc["constant_columns"] = [
 col for col in data.columns if data[col].nunique() <= 1
 ]
@@ -7453,33 +7563,42 @@ def df_qc(
 data_outliers = df_outlier(data)
 outlier_num = data_outliers.isna().sum() - data.isnull().sum()
 res_qc["outlier_num"] = outlier_num[outlier_num > 0]
- outlier_percentage=(outlier_num / len(data_outliers)) * 100
+ outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
 res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
- # Correlation and multicollinearity (VIF)
- if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
- numeric_df = data.select_dtypes(include=[np.number]).dropna()
- corr_matrix = numeric_df.corr()
- high_corr_pairs = [
- (col1, col2)
- for col1 in corr_matrix.columns
- for col2 in corr_matrix.columns
- if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
- ]
- res_qc["high_correlations"] = high_corr_pairs
-
- # VIF for multicollinearity check
- numeric_df = data.select_dtypes(include=[np.number]).dropna()
- vif_data = pd.DataFrame()
- res_qc["vif"]=vif_data
- if numeric_df.shape[1] > 1 and not numeric_df.empty:
- vif_data["feature"] = numeric_df.columns
- vif_data["VIF"] = [
- variance_inflation_factor(numeric_df.values, i)
- for i in range(numeric_df.shape[1])
+ try:
+ # Correlation and multicollinearity (VIF)
+ if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+ numeric_df = data.select_dtypes(include=[np.number]).dropna()
+ corr_matrix = numeric_df.corr()
+ high_corr_pairs = [
+ (col1, col2)
+ for col1 in corr_matrix.columns
+ for col2 in corr_matrix.columns
+ if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
 ]
- res_qc["vif"] = vif_data[
- vif_data["VIF"] > 5
- ] # Typically VIF > 5 indicates multicollinearity
+ res_qc["high_correlations"] = high_corr_pairs
+
+ # VIF for multicollinearity check
+ numeric_df = data.select_dtypes(include=[np.number]).dropna()
+ if isinstance(numeric_df.columns, pd.MultiIndex):
+ numeric_df.columns = [
+ "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+ ]
+
+
+ vif_data = pd.DataFrame()
+ res_qc["vif"]=vif_data
+ if numeric_df.shape[1] > 1 and not numeric_df.empty:
+ vif_data["feature"] = numeric_df.columns.tolist()
+ vif_data["VIF"] = [
+ round(variance_inflation_factor(numeric_df.values, i),2)
+ for i in range(numeric_df.shape[1])
+ ]
+ res_qc["vif"] = vif_data[
+ vif_data["VIF"] > 5
+ ] # Typically VIF > 5 indicates multicollinearity
+ except Exception as e:
+ print(e)
 # Skewness and Kurtosis
 skewness = data.skew(numeric_only=True)
 kurtosis_vals = data.kurt(numeric_only=True)
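
The rewritten block wraps the same statsmodels computation in a try/except, flattens MultiIndex columns, and rounds the result. Stripped of the py2ls bookkeeping, the core VIF calculation looks like this (a sketch assuming a numeric, NaN-free DataFrame):

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_table(numeric_df: pd.DataFrame) -> pd.DataFrame:
    # one VIF value per column; values above ~5 are the usual multicollinearity flag
    return pd.DataFrame({
        "feature": numeric_df.columns.tolist(),
        "VIF": [round(variance_inflation_factor(numeric_df.values, i), 2)
                for i in range(numeric_df.shape[1])],
    })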
@@ -7492,8 +7611,7 @@ def df_qc(
 col: entropy(data[col].value_counts(normalize=True), base=2)
 for col in categorical_cols
 }
- # number of unique
- res_qc["unique_counts"] = data.nunique()
+
 # dtypes counts
 res_qc['dtype_counts']=data.dtypes.value_counts()
 
@@ -7540,7 +7658,7 @@ def df_qc(
 res_qc["text_length_analysis"] = text_lengths
 
 # Summary statistics
- res_qc["summary_statistics"] = data.describe().T
+ res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
 # Automated warnings
 warnings = []
@@ -7562,28 +7680,45 @@ def df_qc(
 
 # Report generation
 if verbose:
- print("=== QC Report Summary ===")
 print("\n⤵ Summary Statistics:")
 display(res_qc["summary_statistics"])
 print("\n⤵ Data Types:")
 display(res_qc["data_types"])
 if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
 print(" ⤵ Missing Values Counts:")
- display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+ display(pd.DataFrame(
+ {
+ "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+ "missing_percent(%)": res_qc["missing_percentage"][
+ res_qc["missing_percentage"] > 0
+ ],
+ }
+ ).style.background_gradient(cmap="coolwarm", axis=0)
+ )
 # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
 print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
 
+ print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+ print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+ print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
 if any(res_qc["outlier_num"]):
 print("\n⤵ Outlier Report:")
- display(res_qc["outlier_num"])
- if any(res_qc["unique_values"]):
+ display(pd.DataFrame(
+ {
+ "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+ "outlier_percentage(%)": res_qc["outlier_percentage"][
+ res_qc["outlier_percentage"] > 0
+ ],
+ }
+ ).style.background_gradient(cmap="coolwarm", axis=0)
+ )
+
+ if any(res_qc["unique_counts"]):
 print("\n⤵ Unique Values per Column:")
- display(res_qc["unique_values"])
+ display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
+ "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
- print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-
- print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
- print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
 if res_qc["empty_columns"]:
 print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7595,7 +7730,7 @@ def df_qc(
 
 if "vif" in res_qc:
 print("\n⤵ Features with High VIF (>|5|):")
- print(res_qc["vif"])
+ display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
 if any(res_qc["high_cardinality_categoricals"]):
 print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +7749,8 @@ def df_qc(
 print("\nWarnings:")
 for warning in res_qc["warnings"]:
 print(" -", warning)
+
+ pd.reset_option("display.max_seq_items")
 if plot_:
 df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
 if output or not plot_:
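
df_qc now caps pd.options.display.max_seq_items at 10 near the top and restores it with pd.reset_option at the end. A self-cleaning alternative (a design note, not what the diff does) is pandas' option context manager, which restores the setting even if the body raises:

import pandas as pd

with pd.option_context("display.max_seq_items", 10):
    ...  # any print/display of long indexes inside this block is truncated to 10 items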
@@ -7632,7 +7769,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
 if isinstance(columns, (list,pd.core.indexes.base.Index)):
 data=data[columns]
 len_total = len(res_qc)
- n_row, n_col = int((len_total + 10) / 3), 3
+ n_row, n_col = int((len_total + 10)), 3
 nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 
 missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
 title="Dtypes",
 ylabel="#",
 ax=ax_dtype_counts,
- fontsize=8 if len(dtype_counts.index)<=20 else 6,
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
 )
+ # from .plot import pie
+ # pie()
 
 # High cardinality: Show top categorical columns by unique value count
 high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8010,17 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
 title="Correlation Heatmap",
 ax=ax_heatmap
 )
- # save figure
- if dir_save:
- figsave(dir_save,f"qc_plot_{now_}.pdf")
+ # # save figure
+ # if dir_save:
+ # figsave(dir_save,f"qc_plot_{now_}.pdf")
 
 if columns is not None:
 if isinstance(columns, (list,pd.core.indexes.base.Index)):
 data=data[columns]
- len_total = len(res_qc)
- n_row, n_col = int((len_total + 10) / 3), 3
- nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+ # len_total = len(res_qc)
+ # n_row, n_col = int((len_total + 10) / 3), 3
+ # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 #! check distribution
 data_num = data.select_dtypes(include=np.number)
 if len(data_num) > max_cols:
@@ -7907,7 +8047,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
 figsets(ylabel=f'Q-Q Plot:{column}',title=None)
 # save figure
 if dir_save:
- figsave(dir_save,f"qq_plot_{now_}.pdf")
+ figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+ def df_corr(df: pd.DataFrame, method="pearson"):
+ """
+ Compute correlation coefficients and p-values for a DataFrame.
+
+ Parameters:
+ - df (pd.DataFrame): Input DataFrame with numeric data.
+ - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+ Returns:
+ - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+ - pval_matrix (pd.DataFrame): P-value matrix.
+ """
+ from scipy.stats import pearsonr, spearmanr, kendalltau
+
+ methods = ["pearson", "spearman", "kendall"]
+ method = strcmp(method, methods)[0]
+ methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+ cols = df.columns
+ corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+ pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+ correlation_func = methods_dict[method]
+
+ for col1 in cols:
+ for col2 in cols:
+ if col1 == col2:
+ corr_matrix.loc[col1, col2] = 1.0
+ pval_matrix.loc[col1, col2] = 0.0
+ else:
+ corr, pval = correlation_func(df[col1], df[col2])
+ corr_matrix.loc[col1, col2] = corr
+ pval_matrix.loc[col1, col2] = pval
+
+ return corr_matrix, pval_matrix
+
 def use_pd(
 func_name="excel",
 verbose=True,
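
A short usage sketch for the new df_corr helper (assuming it is importable from py2ls.ips and that the input columns are numeric without NaNs; strcmp is py2ls's fuzzy name matcher, so an abbreviated method name like "spear" should also resolve):

import pandas as pd
from py2ls.ips import df_corr  # assumed import path

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 9], "c": [4, 3, 2, 1]})
corr_matrix, pval_matrix = df_corr(df, method="spearman")
print(corr_matrix.round(2))   # pairwise coefficients, 1.0 on the diagonal
print(pval_matrix.round(3))   # matching p-values, 0.0 on the diagonal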
@@ -7927,3 +8103,135 @@ def use_pd(
 except Exception as e:
 if verbose:
 print(e)
+
+ def get_phone(phone_number: str, region: str = None,verbose=True):
+ """
+ usage:
+ info = get_phone(15237654321, "DE")
+ preview(info)
+
+ Extremely advanced phone number analysis function.
+
+ Args:
+ phone_number (str): The phone number to analyze.
+ region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+ Returns:
+ dict: Comprehensive information about the phone number.
+ """
+ import phonenumbers
+ from phonenumbers import geocoder, carrier, timezone, number_type
+ from datetime import datetime
+ import pytz
+ from tzlocal import get_localzone
+
+ if not isinstance(phone_number, str):
+ phone_number = str(phone_number)
+ if isinstance(region, str):
+ region = region.upper()
+
+ try:
+ # Parse the phone number
+ parsed_number = phonenumbers.parse(phone_number, region)
+
+ # Validate the phone number
+ valid = phonenumbers.is_valid_number(parsed_number)
+ possible = phonenumbers.is_possible_number(parsed_number)
+
+ if not valid:
+ suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+ return {
+ "valid": False,
+ "error": "Invalid phone number",
+ "suggested_fix": suggested_fix,
+ }
+
+ # Basic details
+ formatted_international = phonenumbers.format_number(
+ parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+ )
+ formatted_national = phonenumbers.format_number(
+ parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+ )
+ formatted_e164 = phonenumbers.format_number(
+ parsed_number, phonenumbers.PhoneNumberFormat.E164
+ )
+ country_code = parsed_number.country_code
+ region_code = geocoder.region_code_for_number(parsed_number)
+ country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+ location = geocoder.description_for_number(parsed_number, "en")
+ carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+ time_zones = timezone.time_zones_for_number(parsed_number)[0]
+ current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+ "%Y-%m-%d %H:%M:%S %Z"
+ )
+ number_type_str = {
+ phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+ phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+ phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+ phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+ phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+ phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+ phonenumbers.PhoneNumberType.VOIP: "VOIP",
+ phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+ phonenumbers.PhoneNumberType.PAGER: "Pager",
+ phonenumbers.PhoneNumberType.UAN: "UAN",
+ phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+ }.get(number_type(parsed_number), "Unknown")
+
+ # Advanced Features
+ is_toll_free = (
+ number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+ )
+ is_premium_rate = (
+ number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+ )
+
+ # Dialing Information
+ dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+ # Advanced Timezone Handling
+ gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
+ # Get the local timezone (current computer's time)
+ local_timezone = get_localzone()
+ #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+ local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+ offset_diff = local_offset - gmt_offsets
+ head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+ res= {
+ "valid": True,
+ "possible": possible,
+ "formatted": {
+ "international": formatted_international,
+ "national": formatted_national,
+ "e164": formatted_e164,
+ },
+ "country_code": country_code,
+ "country_name": country_name,
+ "region_code": region_code,
+ "location": location if location else "Unknown",
+ "carrier": carrier_name,
+ "time_zone": time_zones,
+ "current_times": current_times,
+ "local_offset":f"{local_offset} utcoffset",
+ "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+ "number_type": number_type_str,
+ "is_toll_free": is_toll_free,
+ "is_premium_rate": is_premium_rate,
+ "dialing_instructions": dialing_instructions,
+ "suggested_fix": None, # Use phonenumbers.example_number if invalid
+ "logs": {
+ "number_analysis_completed": datetime.now().strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ),
+ "raw_input": phone_number,
+ "parsed_number": str(parsed_number),
+ },
+ }
+
+ except phonenumbers.NumberParseException as e:
+ res= {"valid": False, "error": str(e)}
+ if verbose:
+ preview(res)
+ return res
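
A final usage sketch mirroring the docstring of get_phone (the import path is an assumption; phonenumbers, pytz, and tzlocal must be installed):

from py2ls.ips import get_phone  # assumed import path

info = get_phone(15237654321, region="DE")     # national number plus explicit region
info2 = get_phone("+49 152 37654321")           # international format needs no region
print(info["formatted"]["e164"], info["number_type"], info["time_zone"])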