py2ls 0.2.4.2__py3-none-any.whl → 0.2.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
51
51
 
52
52
  from . import netfinder
53
53
 
54
- # from .plot import get_color
55
-
56
54
  try:
57
55
  get_ipython().run_line_magic("load_ext", "autoreload")
58
56
  get_ipython().run_line_magic("autoreload", "2")
@@ -108,6 +106,8 @@ def unique(lst, ascending=None):
108
106
  返回:
109
107
  list: 一个列表,其中的元素是唯一的,顺序根据参数 `ascending` 进行排序。
110
108
  """
109
+ if not lst:
110
+ return []
111
111
  if ascending is not None:
112
112
  # 移除重复项
113
113
  unique_items = list(set(lst))
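
A quick check of the new empty-input guard in unique() (a minimal sketch; assumes the py2ls.ips import path and that the rest of the function is unchanged):

from py2ls.ips import unique  # assumed import path

print(unique([]))                            # [] -- the new guard returns early on empty input
print(unique([3, 1, 2, 3], ascending=True))  # duplicates removed, sorted ascending per the docstring
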
@@ -518,6 +518,77 @@ def is_text(s):
518
518
  return has_alpha and has_non_alpha
519
519
 
520
520
 
521
+ from typing import Any, Union
522
+
523
+ def shared(*args, strict=True, n_shared=2, verbose=True):
524
+ """
525
+ check the shared elements across two or more lists.
526
+ usage:
527
+ list1 = [1, 2, 3, 4, 5]
528
+ list2 = [4, 5, 6, 7, 8]
529
+ list3 = [5, 6, 9, 10]
530
+ a = shared(list1, list2,list3)
531
+ """
532
+ if verbose:
533
+ print("\n********* checking shared elements *********")
534
+
535
+ if len(args) == 1 and isinstance(args[0], list):
536
+ lists = args[0] # Unpack the single list
537
+ else:
538
+ lists = args # Use the provided arguments as lists
539
+ flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
540
+ # Ensure all arguments are lists
541
+ if any(not isinstance(lst, list) for lst in flattened_lists):
542
+ print(f"{' ' * 2}All inputs must be lists.")
543
+ return []
544
+ first_list = flattened_lists[0]
545
+ shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
546
+ if strict:
547
+ # Strict mode: require elements to be in all lists
548
+ shared_elements = set(flattened_lists[0])
549
+ for lst in flattened_lists[1:]:
550
+ shared_elements.intersection_update(lst)
551
+ else:
552
+ all_elements = [item for sublist in flattened_lists for item in sublist]
553
+ element_count = Counter(all_elements)
554
+ # Get elements that appear in at least n_shared lists
555
+ shared_elements = [item for item, count in element_count.items() if count >= n_shared]
556
+
557
+ shared_elements = flatten(shared_elements)
558
+ if verbose:
559
+ elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
560
+ print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
561
+ print("********* checking shared elements *********")
562
+ return shared_elements
563
+
564
+ def flatten(nested: Any, unique_list=True,verbose=True):
565
+ """
566
+ Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
567
+ Parameters:
568
+ nested : Any, Can be a list, tuple, dictionary, or set.
569
+ Returns: list, A flattened list.
570
+ """
571
+ flattened_list = []
572
+ stack = [nested]
573
+ while stack:
574
+ current = stack.pop()
575
+ if isinstance(current, dict):
576
+ stack.extend(current.values())
577
+ elif isinstance(current, (list, tuple, set)):
578
+ stack.extend(current)
579
+ elif isinstance(current, pd.Series):
580
+ stack.extend(current)
581
+ elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
582
+ stack.extend(current.tolist())
583
+ else:
584
+ flattened_list.append(current)
585
+ if verbose:
586
+ print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
587
+ if unique_list:
588
+ return unique(flattened_list)[::-1]
589
+ else:
590
+ return flattened_list
591
+
521
592
  def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
522
593
  """
523
594
  Compares a search term with a list of candidate strings and finds the best match based on similarity score.
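
A usage sketch for the shared() and flatten() helpers added in the hunk above (hedged; assumes both are exposed at module level in py2ls.ips):

from py2ls.ips import shared, flatten  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

# strict=True (default): keep only elements present in every list
print(shared(list1, list2, list3))

# strict=False: keep elements appearing in at least n_shared of the lists
print(shared(list1, list2, list3, strict=False, n_shared=2))

# flatten() collapses nested lists/tuples/sets/dicts (dict values) into one list;
# unique_list=True (default) also de-duplicates
print(flatten([[1, 2], (3, 4), {"k": 5}], unique_list=False))
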
@@ -548,7 +619,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
548
619
  similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
549
620
  elif "W" in scorer.lower():
550
621
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
551
- elif "ratio" in scorer.lower():#Ratio (Strictest)
622
+ elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
552
623
  similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
553
624
  else:
554
625
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
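
With the widened condition above, any scorer string containing "stri" (e.g. "strict") now selects the plain fuzz.ratio scorer alongside "ratio"; a hedged usage sketch (the indexed return follows how strcmp is used elsewhere in this file):

from py2ls.ips import strcmp  # assumed import path

candidates = ["read_csv", "read_excel", "to_csv", "to_excel"]
best_strict = strcmp("read csv", candidates, scorer="strict")[0]  # strictest scorer: fuzz.ratio
best_wr = strcmp("read csv", candidates, scorer="WR")[0]          # default WRatio scorer
print(best_strict, best_wr)
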
@@ -1567,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1567
1638
  """
1568
1639
  Usage
1569
1640
  is_abnormal = is_df_abnormal(df, verbose=1)
1641
+ True: abnormal
1642
+ False: normal
1570
1643
 
1571
1644
  """
1572
1645
  # Initialize a list to hold messages about abnormalities
@@ -1594,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1594
1667
  if len(column_names) == 1 and delimiter_counts["\t"] > 1:
1595
1668
  messages.append("Abnormal: Column names are not split correctly.")
1596
1669
  is_abnormal = True
1670
+ if verbose:
1671
+ print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
1597
1672
 
1598
1673
  if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
1599
1674
  messages.append("Abnormal: Too many delimiters in column names.")
1600
1675
  is_abnormal = True
1676
+ if verbose:
1677
+ print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
1601
1678
 
1602
1679
  if delimiter_counts[""] > 3:
1603
1680
  messages.append("Abnormal: There are empty column names.")
1604
1681
  is_abnormal = True
1682
+ if verbose:
1683
+ print(f'delimiter_counts[""] > 3')
1605
1684
 
1606
1685
  if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
1607
1686
  messages.append("Abnormal: Some column names contain unexpected characters.")
1608
1687
  is_abnormal = True
1688
+ if verbose:
1689
+ print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
1609
1690
 
1610
- # Check for missing values
1611
- missing_values = df.isnull().sum()
1612
- if missing_values.any():
1613
- messages.append("Missing values in columns:")
1614
- messages.append(missing_values[missing_values > 0].to_string())
1615
- is_abnormal = True
1691
+ # # Check for missing values
1692
+ # missing_values = df.isnull().sum()
1693
+ # if missing_values.any():
1694
+ # messages.append("Missing values in columns:")
1695
+ # messages.append(missing_values[missing_values > 0].to_string())
1696
+ # is_abnormal = True
1697
+ # print(f'missing_values.any()')
1616
1698
 
1617
1699
  # Check data types
1618
1700
  data_types = df.dtypes
@@ -1623,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1623
1705
  if constant_columns:
1624
1706
  messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
1625
1707
  is_abnormal = True
1708
+ if verbose:
1709
+ print(f'df.columns[df.nunique() == 1].tolist()')
1626
1710
 
1627
1711
  # Check for an unreasonable number of rows or columns
1628
1712
  if actual_shape[0] < 2 or actual_shape[1] < 2:
@@ -1630,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1630
1714
  "Abnormal: DataFrame is too small (less than 2 rows or columns)."
1631
1715
  )
1632
1716
  is_abnormal = True
1717
+ if verbose:
1718
+ print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
1633
1719
 
1634
1720
  # Compile results
1635
1721
  if verbose:
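
With the verbose branches added above, is_df_abnormal() now also prints which check fired; a hedged usage sketch (True means abnormal):

import pandas as pd
from py2ls.ips import is_df_abnormal  # assumed import path

# a frame whose single column name still contains raw tab delimiters
df_bad = pd.DataFrame({"a\tb\tc\td\te": ["1\t2\t3\t4\t5"]})
print(is_df_abnormal(df_bad, verbose=True))   # expected True (delimiters left in the column name)

df_ok = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
print(is_df_abnormal(df_ok))                  # expected False
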
@@ -1672,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs):
1672
1758
  content = yaml.safe_load(file)
1673
1759
  return content
1674
1760
 
1675
- def load_xml(fpath):
1676
- tree = etree.parse(fpath)
1677
- root = tree.getroot()
1678
- return etree.tostring(root, pretty_print=True).decode()
1761
+
1762
+ def load_xml(fpath, fsize_thr: int = 100):
1763
+ def load_small_xml(fpath):
1764
+ tree = etree.parse(fpath)
1765
+ root = tree.getroot()
1766
+ return etree.tostring(root, pretty_print=True).decode()
1767
+
1768
+ def load_large_xml(fpath):
1769
+ xml_parts = []
1770
+ context = etree.iterparse(
1771
+ fpath, events=("start", "end"), recover=True, huge_tree=True
1772
+ )
1773
+
1774
+ for event, elem in context:
1775
+ if event == "end":
1776
+ xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
1777
+ elem.clear()
1778
+ while elem.getprevious() is not None:
1779
+ del elem.getparent()[0]
1780
+ del context
1781
+ return "".join(xml_parts)
1782
+
1783
+ file_size = os.path.getsize(fpath) / 1024 / 1024 # in MB
1784
+
1785
+ if file_size > fsize_thr:
1786
+ print(f"reading a small file:{file_size} Mb")
1787
+ return load_large_xml(fpath)
1788
+ else:
1789
+ print(f"reading a big file:{file_size} Mb")
1790
+ return load_small_xml(fpath)
1679
1791
 
1680
1792
  def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
1681
1793
  """
@@ -1721,7 +1833,7 @@ def fload(fpath, kind=None, **kwargs):
1721
1833
  fmt=kwargs.pop("fmt",False)
1722
1834
  verbose=kwargs.pop("verbose",False)
1723
1835
  if verbose:
1724
- print_pd_usage("read_csv", verbose=verbose)
1836
+ use_pd("read_csv", verbose=verbose)
1725
1837
  return
1726
1838
 
1727
1839
  if comment is None:
@@ -1742,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs):
1742
1854
  on_bad_lines=on_bad_lines,
1743
1855
  **kwargs,
1744
1856
  )
1857
+ if is_df_abnormal(df, verbose=0):
1858
+ raise ValueError("the df is abnormal")
1745
1859
  except:
1746
1860
  try:
1747
1861
  try:
@@ -1769,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs):
1769
1883
  comment=comment,
1770
1884
  **kwargs,
1771
1885
  )
1772
-
1773
1886
  if is_df_abnormal(df, verbose=0):
1774
1887
  raise ValueError("the df is abnormal")
1775
1888
  except (UnicodeDecodeError, ValueError):
@@ -1805,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs):
1805
1918
  separators = [",", "\t", ";", "|", " "]
1806
1919
  for sep in separators:
1807
1920
  sep2show = sep if sep != "\t" else "\\t"
1808
- print(f'trying with: engine=pyarrow, sep="{sep2show}"')
1921
+ # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
1922
+ # print(".")
1809
1923
  try:
1810
1924
  df = pd.read_csv(
1811
1925
  fpath,
@@ -1817,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs):
1817
1931
  **kwargs,
1818
1932
  )
1819
1933
  if not is_df_abnormal(df, verbose=0): # normal
1820
- break
1821
- else:
1822
- if is_df_abnormal(df, verbose=0):
1823
- pass
1934
+ display(df.head(2))
1935
+ print(f"shape: {df.shape}")
1936
+ return df
1824
1937
  except:
1825
1938
  pass
1826
1939
  else:
@@ -1829,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs):
1829
1942
  separators = [",", "\t", ";", "|", " "]
1830
1943
  for sep in separators:
1831
1944
  try:
1832
- sep2show = sep if sep != "\t" else "\\t"
1833
- print(f"trying with: engine={engine}, sep='{sep2show}'")
1945
+ # sep2show = sep if sep != "\t" else "\\t"
1946
+ # print(f"trying with: engine={engine}, sep='{sep2show}'")
1947
+ # print(".")
1834
1948
  df = pd.read_csv(
1835
1949
  fpath,
1836
1950
  engine=engine,
@@ -1839,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs):
1839
1953
  comment=comment,
1840
1954
  **kwargs,
1841
1955
  )
1956
+ # display(df.head(2))
1957
+ # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
1842
1958
  if not is_df_abnormal(df, verbose=0):
1843
- break
1959
+ display(df.head(2))
1960
+ print(f"shape: {df.shape}")
1961
+ return df
1844
1962
  except EmptyDataError as e:
1845
1963
  continue
1846
1964
  else:
@@ -1853,7 +1971,7 @@ def fload(fpath, kind=None, **kwargs):
1853
1971
  engine = kwargs.get("engine", "openpyxl")
1854
1972
  verbose=kwargs.pop("verbose",False)
1855
1973
  if verbose:
1856
- print_pd_usage("read_excel", verbose=verbose)
1974
+ use_pd("read_excel", verbose=verbose)
1857
1975
  df = pd.read_excel(fpath, engine=engine, **kwargs)
1858
1976
  try:
1859
1977
  meata=pd.ExcelFile(fpath)
@@ -2263,7 +2381,7 @@ def fsave(
2263
2381
 
2264
2382
  verbose=kwargs.pop("verbose",False)
2265
2383
  if verbose:
2266
- print_pd_usage("to_csv", verbose=verbose)
2384
+ use_pd("to_csv", verbose=verbose)
2267
2385
  kwargs_csv = dict(
2268
2386
  path_or_buf=None,
2269
2387
  sep=",",
@@ -2295,7 +2413,7 @@ def fsave(
2295
2413
  verbose=kwargs.pop("verbose",False)
2296
2414
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2297
2415
  if verbose:
2298
- print_pd_usage("to_excel", verbose=verbose)
2416
+ use_pd("to_excel", verbose=verbose)
2299
2417
  if any(kwargs):
2300
2418
  format_excel(df=data, filename=fpath, **kwargs)
2301
2419
  else:
@@ -2342,15 +2460,20 @@ def fsave(
2342
2460
  # json.dump(data, file, **kwargs)
2343
2461
 
2344
2462
  def save_json(fpath_fname, var_dict_or_df):
2463
+ def _convert_js(data):
2464
+ if isinstance(data, pd.DataFrame):
2465
+ return data.to_dict(orient="list")
2466
+ elif isinstance(data, np.ndarray):
2467
+ return data.tolist()
2468
+ elif isinstance(data, dict):
2469
+ return {key: _convert_js(value) for key, value in data.items()}
2470
+ return data
2471
+
2472
+ serializable_data = _convert_js(var_dict_or_df)
2473
+
2474
+ # Save the serializable data to the JSON file
2345
2475
  with open(fpath_fname, "w") as f_json:
2346
- if isinstance(var_dict_or_df, pd.DataFrame):
2347
- var_dict_or_df = var_dict_or_df.to_dict(orient="dict")
2348
- if isinstance(var_dict_or_df, dict):
2349
- for key, value in var_dict_or_df.items():
2350
- if isinstance(value, np.ndarray):
2351
- var_dict_or_df[key] = value.tolist()
2352
- # Save the dictionary or list of dictionaries to a JSON file
2353
- json.dump(var_dict_or_df, f_json, indent=4)
2476
+ json.dump(serializable_data, f_json, indent=4)
2354
2477
 
2355
2478
  # # Example usage:
2356
2479
  # sets = {"title": "mse_path_ MSE"}
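
The refactored save_json() above now converts nested structures recursively before dumping; a minimal sketch of the same conversion (hedged; it mirrors the _convert_js helper rather than calling fsave itself):

import json
import numpy as np
import pandas as pd

def _convert_js(data):
    # DataFrame -> dict of column lists, ndarray -> list, dicts handled recursively
    if isinstance(data, pd.DataFrame):
        return data.to_dict(orient="list")
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, dict):
        return {key: _convert_js(value) for key, value in data.items()}
    return data

payload = {"scores": np.array([1.5, 2.5]),
           "table": pd.DataFrame({"a": [1, 2], "b": [3, 4]})}
print(json.dumps(_convert_js(payload), indent=4))
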
@@ -2594,7 +2717,7 @@ def listdir(
2594
2717
  print(ls)
2595
2718
  df_all = pd.DataFrame(
2596
2719
  {
2597
- "fname": all_files,
2720
+ "fname": ls,
2598
2721
  "fpath": [os.path.join(rootdir, i) for i in ls],
2599
2722
  }
2600
2723
  )
@@ -4444,7 +4567,42 @@ def preview(var):
4444
4567
  # preview("# This is a Markdown header")
4445
4568
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4446
4569
  # preview({"key": "value", "numbers": [1, 2, 3]})
4447
-
4570
+ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
4571
+ """
4572
+ Extend a DataFrame by splitting list-like elements in the specified column.
4573
+
4574
+ Parameters:
4575
+ ----------
4576
+ data : pd.DataFrame
4577
+ The input DataFrame to be extended.
4578
+
4579
+ column : str
4580
+ The name of the column to be split.
4581
+
4582
+ axis : int, optional
4583
+ The axis along which to expand the DataFrame.
4584
+ - 0 (default): Expand the specified column into multiple rows.
4585
+ - 1: Expand the specified column into multiple columns.
4586
+
4587
+ sep : str, optional
4588
+ The separator used to split the values in the specified column.
4589
+ Must be provided for the function to work correctly.
4590
+ """
4591
+
4592
+ data = data.copy()
4593
+ mask = data[column].str.contains(sep, na=False)
4594
+ data = data.copy()
4595
+ if mask.any():
4596
+ data[column] = (
4597
+ data[column]
4598
+ .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
4599
+ )
4600
+
4601
+ # Strip spaces from each item in the lists
4602
+ data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
4603
+
4604
+ data = data.explode(column, ignore_index=True)
4605
+ return data
4448
4606
  # ! DataFrame
4449
4607
  def df_astype(
4450
4608
  data: pd.DataFrame,
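
A usage sketch for the new df_extend() helper (hedged; assumes it is exposed from py2ls.ips, and that sep is always passed since the function splits on it unconditionally):

import pandas as pd
from py2ls.ips import df_extend  # assumed import path

df = pd.DataFrame({"gene": ["g1", "g2"],
                   "go_terms": ["GO:0001; GO:0002", "GO:0003"]})
# axis=0 (default): each ';'-separated value becomes its own row, whitespace stripped
long_df = df_extend(df, column="go_terms", sep=";")
print(long_df)   # expected 3 rows: g1/GO:0001, g1/GO:0002, g2/GO:0003
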
@@ -4703,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
4703
4861
  def df_merge(
4704
4862
  df1: pd.DataFrame,
4705
4863
  df2: pd.DataFrame,
4706
- use_index: bool = True,
4864
+ use_index: bool = False,
4707
4865
  columns: list = ["col_left", "col_right"],
4708
4866
  how: str = "left",
4709
4867
  ) -> pd.DataFrame:
@@ -4731,7 +4889,7 @@ def df_merge(
4731
4889
  """
4732
4890
 
4733
4891
  # 1. Check if indices are comparable (same length and types)
4734
- if use_index or df1.index.equals(df2.index):
4892
+ if use_index:
4735
4893
  print(f"Merging based on index using '{how}' join...")
4736
4894
  df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
4737
4895
  return df_merged
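
With use_index now defaulting to False, df_merge() falls back to column-based merging unless index merging is requested explicitly; a hedged sketch (the assumption that columns names the left and right key columns comes from the default ["col_left", "col_right"], not from code shown in this hunk):

import pandas as pd
from py2ls.ips import df_merge  # assumed import path

df1 = pd.DataFrame({"col_left": ["a", "b", "c"], "x": [1, 2, 3]})
df2 = pd.DataFrame({"col_right": ["a", "b", "d"], "y": [10, 20, 30]})

merged_cols = df_merge(df1, df2, columns=["col_left", "col_right"], how="left")
merged_idx = df_merge(df1, df2, use_index=True, how="left")  # index merge must now be opted into
print(merged_cols)
print(merged_idx)
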
@@ -4762,12 +4920,53 @@ def df_merge(
4762
4920
  )
4763
4921
  return df_merged
4764
4922
 
4923
+ def df_drop_duplicates(
4924
+ data: pd.DataFrame,
4925
+ by: Union[
4926
+ str, List[str]
4927
+ ] = "index", # Options: 'index', or column name(s) for 'rows'
4928
+ keep="first", # Options: 'first', 'last', or False (drop all duplicates)
4929
+ ignore_index=True,
4930
+ inplace: bool = False,
4931
+ verbose=True
4932
+ ):
4933
+ """
4934
+ data (pd.DataFrame): DataFrame to drop duplicates from.
4935
+ by (str or list): how duplicates are identified:
4936
+ - 'index': Drop duplicates based on the DataFrame index.
4937
+ - Column name(s) for row-wise duplicate checking.
4938
+ keep (str): Which duplicates to keep:
4939
+ 'first',
4940
+ 'last',
4941
+ False (drop all duplicates).
4942
+ inplace (bool): Whether to modify the original DataFrame in place.
4943
+ """
4944
+ original_shape = data.shape
4945
+ if by == "index":
4946
+ # Drop duplicates in the index
4947
+ result = data[~data.index.duplicated(keep=keep)]
4948
+ else:
4949
+ # Drop duplicates row-wise based on column(s)
4950
+ result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
4951
+ if original_shape!=result.shape or verbose:
4952
+ print(f"\nshape:{original_shape} (before drop_duplicates)")
4953
+ print(f"shape:{result.shape} (after drop_duplicates)")
4954
+ if inplace:
4955
+ # Modify the original DataFrame in place
4956
+ data.drop(data.index, inplace=True) # Drop all rows first
4957
+ data[data.columns] = result # Refill the DataFrame
4958
+ return None
4959
+ else:
4960
+ return result
4765
4961
  def df_fillna(
4766
4962
  data: pd.DataFrame,
4767
- method: str = "mean",
4963
+ method: str = "knn",
4768
4964
  axis: int = 0,# column-wise
4769
4965
  constant: float = None,
4966
+ n_neighbors: int = 5, # KNN-specific
4967
+ max_iter: int = 10, # Iterative methods specific
4770
4968
  inplace: bool = True,
4969
+ random_state:int = None
4771
4970
  ) -> pd.DataFrame:
4772
4971
  """
4773
4972
  Fill missing values in a DataFrame using specified imputation method.
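
A usage sketch for the new df_drop_duplicates() helper (hedged; assumes the py2ls.ips import path):

import pandas as pd
from py2ls.ips import df_drop_duplicates  # assumed import path

df = pd.DataFrame({"id": [1, 1, 2], "val": ["a", "a", "b"]}, index=[0, 0, 1])

# by="index" (default): drop rows whose index label repeats
print(df_drop_duplicates(df, by="index", keep="first"))

# by=<column(s)>: row-wise duplicates judged on those columns
print(df_drop_duplicates(df, by=["id", "val"], keep="last"))
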
@@ -4779,8 +4978,15 @@ def df_fillna(
4779
4978
  - 'median': Replace missing values with the median of the column.
4780
4979
  - 'most_frequent': Replace missing values with the most frequent value in the column.
4781
4980
  - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
4782
- - 'knn': Use K-Nearest Neighbors imputation.
4783
- - 'iterative': Use Iterative imputation.
4981
+ - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors
4982
+ - 'iterative': Use Iterative imputation; models each feature with missing values as a function of the other features and estimates them iteratively
4983
+ - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
4984
+ # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
4985
+ # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
4986
+ # reconstructing the data matrix using low-rank approximation
4987
+ # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
4988
+ # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
4989
+
4784
4990
  axis (int): The axis along which to impute:
4785
4991
  - 0: Impute column-wise (default).
4786
4992
  - 1: Impute row-wise.
@@ -4793,7 +4999,8 @@ def df_fillna(
4793
4999
  raise ValueError("Input DataFrame is empty.")
4794
5000
 
4795
5001
  # Validate method
4796
- methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
5002
+ methods = ["mean", "median", "most_frequent",
5003
+ "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
4797
5004
  method = strcmp(method, methods)[0]
4798
5005
 
4799
5006
  # If using constant method, ask for a constant value
@@ -4806,18 +5013,27 @@ def df_fillna(
4806
5013
 
4807
5014
  # Initialize SimpleImputer with the chosen method
4808
5015
  if method == "constant":
5016
+ from sklearn.impute import SimpleImputer
4809
5017
  imputer = SimpleImputer(strategy=method, fill_value=constant)
4810
5018
  elif method == "knn":
4811
5019
  from sklearn.impute import KNNImputer
4812
-
4813
5020
  imputer = KNNImputer(n_neighbors=n_neighbors)
4814
- elif method == "iterative":
5021
+ elif method == "iterative" or method == "mice":
5022
+ from sklearn.experimental import enable_iterative_imputer
4815
5023
  from sklearn.impute import IterativeImputer
4816
5024
 
4817
- imputer = IterativeImputer(max_iter=max_iter)
4818
- else:
5025
+ imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5026
+ # elif method == "missforest":
5027
+ # from missingpy import MissForest
5028
+ # imputer = MissForest(max_iter=max_iter, random_state=random_state)
5029
+ # elif method == "softimpute":
5030
+ # from fancyimpute import SoftImpute
5031
+ # imputer = SoftImpute()
5032
+ # elif method == "svd":
5033
+ # from fancyimpute import IterativeSVD
5034
+ # imputer = IterativeSVD(max_iters=max_iter)
5035
+ else: # mean, median, most_frequent
4819
5036
  from sklearn.impute import SimpleImputer
4820
-
4821
5037
  imputer = SimpleImputer(strategy=method)
4822
5038
 
4823
5039
  # Fit and transform the data
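
A usage sketch for df_fillna() with the new defaults and parameters (hedged; scikit-learn is required since 'knn' and 'iterative'/'mice' use KNNImputer and IterativeImputer):

import numpy as np
import pandas as pd
from py2ls.ips import df_fillna  # assumed import path

df = pd.DataFrame({"A": [1, 2, np.nan, 4, 5],
                   "B": [np.nan, 2, 3, 4, np.nan]})

# new default method: KNN imputation; n_neighbors is the new KNN-specific knob
print(df_fillna(df, method="knn", n_neighbors=3, inplace=False))

# iterative (MICE-style) imputation; max_iter and random_state are passed through
print(df_fillna(df, method="iterative", max_iter=10, random_state=1, inplace=False))
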
@@ -4843,8 +5059,38 @@ def df_fillna(
4843
5059
  return None # replace original
4844
5060
  else:
4845
5061
  return df_filled
5062
+ # # example
5063
+ # data = {
5064
+ # "A": [1, 2, np.nan, 4, 5],
5065
+ # "B": [np.nan, 2, 3, 4, np.nan],
5066
+ # "C": [1, np.nan, 3, 4, 5],
5067
+ # "D": [1, 2, 3, 4, np.nan],
5068
+ # }
5069
+
5070
+ # # Define a function to test each imputation method
5071
+ # methods = [
5072
+ # "mean",
5073
+ # "median",
5074
+ # "most_frequent",
5075
+ # "constant",
5076
+ # "knn",
5077
+ # "iterative",
5078
+ # # "missforest",
5079
+ # # "softimpute",
5080
+ # # "svd",
5081
+ # ]
5082
+
5083
+ # # Create a dictionary to hold results
5084
+ # results = {}
5085
+
5086
+ # for method_name in methods:
5087
+ # print(method_name)
5088
+ # display(df)
5089
+ # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
5090
+
5091
+
4846
5092
  def df_scaler(
4847
- data: pd.DataFrame,
5093
+ data: pd.DataFrame, # should be numeric dtype
4848
5094
  method="standard",
4849
5095
  columns=None, # default, select all numeric col/row
4850
5096
  inplace=False,
@@ -4984,7 +5230,7 @@ def df_cluster(
4984
5230
  X = scaler.fit_transform(X)
4985
5231
 
4986
5232
  for n_cluster in range_n_clusters:
4987
- kmeans = KMeans(n_clusters=n_cluster, random_state=42)
5233
+ kmeans = KMeans(n_clusters=n_cluster, random_state=1)
4988
5234
  cluster_labels = kmeans.fit_predict(X)
4989
5235
 
4990
5236
  silhouette_avg = silhouette_score(X, cluster_labels)
@@ -5000,7 +5246,7 @@ def df_cluster(
5000
5246
  print(f"n_clusters = {n_clusters}")
5001
5247
 
5002
5248
  # Apply K-Means Clustering with Optimal Number of Clusters
5003
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
5249
+ kmeans = KMeans(n_clusters=n_clusters, random_state=1)
5004
5250
  cluster_labels = kmeans.fit_predict(X)
5005
5251
 
5006
5252
  if plot:
@@ -5101,7 +5347,7 @@ def df_cluster(
5101
5347
  # n_clusters = (
5102
5348
  # np.argmax(silhouette_avg_scores) + 2
5103
5349
  # ) # Optimal clusters based on max silhouette score
5104
- # kmeans = KMeans(n_clusters=n_clusters, random_state=42)
5350
+ # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
5105
5351
  # cluster_labels = kmeans.fit_predict(X)
5106
5352
  silhouette_vals = silhouette_samples(X, cluster_labels)
5107
5353
 
@@ -5252,12 +5498,14 @@ def df_reducer(
5252
5498
  columns: Optional[List[str]] = None,
5253
5499
  method: str = "umap", # 'pca', 'umap'
5254
5500
  n_components: int = 2, # Default for umap, but 50 for PCA
5255
- umap_neighbors: int = 15, # Default
5256
- umap_min_dist: float = 0.1, # Default
5501
+ umap_neighbors: int = 15, # UMAP-specific
5502
+ umap_min_dist: float = 0.1, # UMAP-specific
5503
+ tsne_perplexity: int = 30, # t-SNE-specific
5257
5504
  scale: bool = True,
5258
5505
  fill_missing: bool = True,
5259
5506
  debug: bool = False,
5260
5507
  inplace: bool = True, # replace the oringinal data
5508
+ plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
5261
5509
  ) -> pd.DataFrame:
5262
5510
  """
5263
5511
  Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5293,14 +5541,40 @@ def df_reducer(
5293
5541
  reduced_df : pd.DataFrame
5294
5542
  DataFrame with the reduced dimensions.
5295
5543
  """
5296
- from sklearn.decomposition import PCA
5544
+
5545
+ """
5546
+ PCA: explained_variance:
5547
+ indicates the proportion of the dataset's total variance that each principal
5548
+ component (PC) explains. It gives you a sense of how much information
5549
+ (or variance) is captured by each PC
5550
+ Interpretation:
5551
+ - Higher values indicate that the corresponding PC captures more variance.
5552
+ - The sum of the explained variances for all PCs equals 1 (or 100%).
5553
+ - If the first few components explain a high percentage (e.g., 90%),
5554
+ it means you can reduce the dimensionality of the data significantly without losing much information.
5555
+ Use case:
5556
+ You may plot a scree plot, which shows the explained variance for each PC, to help decide
5557
+ how many components to keep for analysis.
5558
+
5559
+ PCA: Singular values:
5560
+ represent the magnitude of variance along each principal component. Mathematically,
5561
+ they are the square roots of the eigenvalues of the covariance matrix.
5562
+ Interpretation:
5563
+ Larger singular values indicate that the associated PC captures more variance.
5564
+ Singular values are related to the scale of the data. If the data are scaled
5565
+ before PCA (e.g., standardized), then the singular values will provide a measure
5566
+ of the spread of data along each PC.
5567
+ Use case:
5568
+ Singular values help quantify the contribution of each principal component in a
5569
+ similar way to the explained variance. They are useful in understanding the overall
5570
+ structure of the data.
5571
+ """
5297
5572
  from sklearn.preprocessing import StandardScaler
5298
- import umap
5299
5573
  from sklearn.impute import SimpleImputer
5300
5574
 
5301
5575
  # Select columns if specified, else use all columns
5302
5576
  X = data[columns].values if columns else data.values
5303
-
5577
+ print(X.shape,type(X))
5304
5578
  # Handle missing values
5305
5579
  if fill_missing:
5306
5580
  imputer = SimpleImputer(strategy="mean")
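
A small standalone sketch of the two quantities described in the docstring above, using scikit-learn directly (hedged; not tied to df_reducer itself):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
X[:, 1] = 2 * X[:, 0] + rng.normal(scale=0.1, size=200)   # two strongly correlated columns

X_std = StandardScaler().fit_transform(X)
pca = PCA().fit(X_std)

print(pca.explained_variance_ratio_)             # proportions of total variance, summing to 1
print(np.cumsum(pca.explained_variance_ratio_))  # cumulative curve used for a 95% threshold rule
print(pca.singular_values_)
# relation to the eigenvalues of the covariance matrix of the centered data:
print(pca.singular_values_ ** 2 / (X_std.shape[0] - 1))   # equals pca.explained_variance_
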
@@ -5312,76 +5586,215 @@ def df_reducer(
5312
5586
  X = scaler.fit_transform(X)
5313
5587
 
5314
5588
  # Check valid method input
5315
- if method not in ["pca", "umap"]:
5316
- raise ValueError(f"Invalid method '{method}'. Choose 'pca' or 'umap'.")
5317
-
5589
+ methods=["pca", "umap","tsne","factor","isolation_forest"]
5590
+ method=strcmp(method, methods)[0]
5318
5591
  # Apply PCA if selected
5319
- if method == "pca":
5320
- if n_components is None:
5321
- # to get the n_components with threshold method:
5322
- pca = PCA()
5323
- pca_result = pca.fit_transform(X)
5324
-
5325
- # Calculate explained variance
5326
- explained_variance = pca.explained_variance_ratio_
5327
- # Cumulative explained variance
5328
- cumulative_variance = np.cumsum(explained_variance)
5329
- # Set a threshold for cumulative variance
5330
- threshold = 0.95 # Example threshold
5331
- n_components = (
5332
- np.argmax(cumulative_variance >= threshold) + 1
5333
- ) # Number of components to retain
5334
- if debug:
5335
- # debug:
5336
- # Plot the cumulative explained variance
5337
- plt.figure(figsize=(8, 5))
5338
- plt.plot(
5339
- range(1, len(cumulative_variance) + 1),
5340
- cumulative_variance,
5341
- marker="o",
5342
- linestyle="-",
5343
- )
5344
- plt.title("Cumulative Explained Variance by Principal Components")
5345
- plt.xlabel("Number of Principal Components")
5346
- plt.ylabel("Cumulative Explained Variance")
5347
- plt.xticks(range(1, len(cumulative_variance) + 1))
5348
- # Add horizontal line for the threshold
5349
- plt.axhline(
5350
- y=threshold, color="r", linestyle="--", label="Threshold (95%)"
5351
- )
5352
- # Add vertical line for n_components
5353
- plt.axvline(
5354
- x=n_components,
5355
- color="g",
5356
- linestyle="--",
5357
- label=f"n_components = {n_components}",
5358
- )
5359
- plt.legend()
5360
- plt.grid()
5592
+ if method == "pca":
5593
+ from sklearn.decomposition import PCA
5361
5594
  pca = PCA(n_components=n_components)
5362
5595
  X_reduced = pca.fit_transform(X)
5363
- print(f"PCA completed: Reduced to {n_components} components.")
5596
+
5597
+ # Additional PCA information
5598
+ explained_variance = pca.explained_variance_ratio_
5599
+ singular_values = pca.singular_values_
5600
+ loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
5601
+
5602
+ if debug:
5603
+ print(f"PCA completed: Reduced to {n_components} components.")
5604
+ print(f"Explained Variance: {explained_variance}")
5605
+ print(f"Singular Values: {singular_values}")
5606
+
5607
+ # Plot explained variance if debug=True
5608
+ if debug:
5609
+ # Plot explained variance
5610
+ cumulative_variance = np.cumsum(explained_variance)
5611
+ plt.figure(figsize=(8, 5))
5612
+ plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
5613
+ plt.title("Cumulative Explained Variance by Principal Components")
5614
+ plt.xlabel("Number of Principal Components")
5615
+ plt.ylabel("Cumulative Explained Variance")
5616
+ plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
5617
+ plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
5618
+ plt.legend()
5619
+ plt.grid()
5620
+ plt.show()
5621
+
5622
+ # Prepare reduced DataFrame with additional PCA info
5623
+ pca_df = pd.DataFrame(
5624
+ X_reduced, index=data.index,
5625
+ columns=[f"PC_{i+1}" for i in range(n_components)]
5626
+ )
5627
+ # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
5628
+ # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
5629
+ # Expand explained variance to multiple columns if needed
5630
+ for i in range(n_components):
5631
+ pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
5632
+ for i in range(n_components):
5633
+ pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
5364
5634
 
5365
5635
  # Apply UMAP if selected
5366
5636
  elif method == "umap":
5637
+ import umap
5367
5638
  umap_reducer = umap.UMAP(
5368
5639
  n_neighbors=umap_neighbors,
5369
5640
  min_dist=umap_min_dist,
5370
- n_components=n_components,
5641
+ n_components=n_components
5371
5642
  )
5372
5643
  X_reduced = umap_reducer.fit_transform(X)
5373
- print(f"UMAP completed: Reduced to {n_components} components.")
5374
5644
 
5375
- # Return reduced data as a new DataFrame with the same index
5376
- reduced_df = pd.DataFrame(X_reduced, index=data.index)
5645
+ # Additional UMAP information
5646
+ embedding = umap_reducer.embedding_
5647
+ trustworthiness = umap_reducer._raw_data[:, :n_components]
5648
+
5649
+ if debug:
5650
+ print(f"UMAP completed: Reduced to {n_components} components.")
5651
+ print(f"Embedding Shape: {embedding.shape}")
5652
+ print(f"Trustworthiness: {trustworthiness}")
5653
+
5654
+ # Prepare reduced DataFrame with additional UMAP info
5655
+ umap_df = pd.DataFrame(
5656
+ X_reduced, index=data.index,
5657
+ columns=[f"UMAP_{i+1}" for i in range(n_components)]
5658
+ )
5659
+ umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
5660
+ umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
5661
+ elif method == "tsne":
5662
+ from sklearn.manifold import TSNE
5663
+ tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
5664
+ X_reduced = tsne.fit_transform(X)
5665
+
5666
+ # Prepare reduced DataFrame with additional t-SNE info
5667
+ tsne_df = pd.DataFrame(
5668
+ X_reduced, index=data.index,
5669
+ columns=[f"tSNE_{i+1}" for i in range(n_components)]
5670
+ )
5671
+ tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
5672
+
5673
+ # Apply Factor Analysis if selected
5674
+ elif method == "factor":
5675
+ from sklearn.decomposition import FactorAnalysis
5676
+ factor = FactorAnalysis(n_components=n_components, random_state=1)
5677
+ X_reduced = factor.fit_transform(X)
5678
+ # Factor Analysis does not directly provide explained variance, but we can approximate it
5679
+ fa_variance = factor.noise_variance_
5680
+ # Prepare reduced DataFrame with additional Factor Analysis info
5681
+ factor_df = pd.DataFrame(
5682
+ X_reduced, index=data.index,
5683
+ columns=[f"Factor_{i+1}" for i in range(n_components)]
5684
+ )
5685
+ factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
5686
+
5687
+ # Apply Isolation Forest for outlier detection if selected
5688
+ elif method == "isolation_forest":
5689
+ from sklearn.decomposition import PCA
5690
+ from sklearn.ensemble import IsolationForest
5691
+ # Step 1: Apply PCA for dimensionality reduction to n_components
5692
+ pca = PCA(n_components=n_components)
5693
+ X_pca = pca.fit_transform(X)
5694
+
5695
+ explained_variance = pca.explained_variance_ratio_
5696
+ singular_values = pca.singular_values_
5697
+
5698
+ # Prepare reduced DataFrame with additional PCA info
5699
+ iso_forest_df = pd.DataFrame(
5700
+ X_pca, index=data.index,
5701
+ columns=[f"PC_{i+1}" for i in range(n_components)]
5702
+ )
5703
+
5704
+ isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
5705
+ isolation_forest.fit(X)
5706
+ anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
5707
+ # Predict labels: 1 (normal), -1 (anomaly)
5708
+ anomaly_labels = isolation_forest.fit_predict(X)
5709
+ # Add anomaly scores and labels to the DataFrame
5710
+ iso_forest_df["Anomaly Score"] = anomaly_scores
5711
+ iso_forest_df["Anomaly Label"] = anomaly_labels
5712
+ # add info from pca
5713
+ for i in range(n_components):
5714
+ iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
5715
+ for i in range(n_components):
5716
+ iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
5717
+
5718
+ # Return reduced data and info as a new DataFrame with the same index
5719
+ if method == "pca":
5720
+ reduced_df = pca_df
5721
+ colname_met = "PC_"
5722
+ if plot_:
5723
+ sns.scatterplot(
5724
+ data=pca_df,
5725
+ x="PC_1",
5726
+ y="PC_2",
5727
+ # hue="condition",
5728
+ )
5729
+ elif method == "umap":
5730
+ reduced_df = umap_df
5731
+ colname_met = "UMAP_"
5732
+ if plot_:
5733
+ sns.scatterplot(
5734
+ data=umap_df,
5735
+ x="UMAP_1",
5736
+ y="UMAP_2",
5737
+ # hue="condition",
5738
+ )
5739
+ elif method == "tsne":
5740
+ reduced_df = tsne_df
5741
+ colname_met = "t-SNE_"
5742
+ if plot_:
5743
+ sns.scatterplot(
5744
+ data=tsne_df,
5745
+ x="tSNE_1",
5746
+ y="tSNE_2",
5747
+ # hue="batch",
5748
+ )
5749
+ elif method == "factor":
5750
+ reduced_df = factor_df
5751
+ colname_met = "Factor_"
5752
+ if plot_:
5753
+ sns.scatterplot(
5754
+ data=factor_df,
5755
+ x="Factor_1",
5756
+ y="Factor_2",
5757
+ # hue="batch",
5758
+ )
5759
+ elif method == "isolation_forest":
5760
+ reduced_df = iso_forest_df # Already a DataFrame for outliers
5761
+ colname_met = "PC_"
5762
+ if plot_:
5763
+ ax = sns.scatterplot(
5764
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
5765
+ x="PC_1",
5766
+ y="PC_2",
5767
+ label="normal", c="b",
5768
+ )
5769
+ ax = sns.scatterplot(
5770
+ ax=ax,
5771
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
5772
+ x="PC_1",
5773
+ y="PC_2",
5774
+ c="r",
5775
+ label="outlier", marker="+", s=30,
5776
+ )
5777
+
5377
5778
 
5378
5779
  if inplace:
5379
- # Replace or add new columns based on n_components
5780
+ # If inplace=True, add components back into the original data
5380
5781
  for col_idx in range(n_components):
5381
- data[f"Component_{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5782
+ data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5783
+ # Add extra info for PCA/UMAP
5784
+ if method == "pca":
5785
+ for i in range(n_components):
5786
+ data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
5787
+ for i in range(n_components):
5788
+ data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
5789
+ elif method == "umap":
5790
+ for i in range(n_components):
5791
+ data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
5792
+ data["Embedding"] = reduced_df["Embedding"]
5793
+ data["Trustworthiness"] = reduced_df["Trustworthiness"]
5382
5794
  return None # No return when inplace=True
5795
+
5383
5796
 
5384
- return reduced_df
5797
+ return reduced_df
5385
5798
 
5386
5799
 
5387
5800
  # example:
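
A hedged usage sketch for the extended df_reducer() (assumes the py2ls.ips import path; umap-learn is only needed for method="umap"):

import numpy as np
import pandas as pd
from py2ls.ips import df_reducer  # assumed import path

df = pd.DataFrame(np.random.default_rng(1).normal(size=(60, 6)),
                  columns=[f"feat_{i}" for i in range(6)])

# PCA: returns PC_1..PC_n plus per-component explained-variance / singular-value columns
pca_out = df_reducer(df.copy(), method="pca", n_components=2, inplace=False)

# t-SNE: perplexity is exposed via the new tsne_perplexity parameter
tsne_out = df_reducer(df.copy(), method="tsne", n_components=2, tsne_perplexity=20, inplace=False)

# Isolation Forest: PCA coordinates plus "Anomaly Score" / "Anomaly Label" columns
iso_out = df_reducer(df.copy(), method="isolation_forest", n_components=2, inplace=False)
print(pca_out.head(2), tsne_out.head(2), iso_out.head(2), sep="\n")
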
@@ -5636,7 +6049,7 @@ def evaluate_cluster(
5636
6049
  return metrics
5637
6050
 
5638
6051
 
5639
- def print_pd_usage(
6052
+ def use_pd(
5640
6053
  func_name="excel",
5641
6054
  verbose=True,
5642
6055
  dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",