py2ls 0.2.4.3__py3-none-any.whl → 0.2.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -106,6 +106,8 @@ def unique(lst, ascending=None):
     Returns:
     list: a list whose elements are unique, ordered according to the `ascending` parameter.
     """
+    if not lst:
+        return []
     if ascending is not None:
         # remove duplicates
         unique_items = list(set(lst))
@@ -518,7 +520,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-def shared(lst1:Any, lst2:Any,*args, verbose=True):
+def shared(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elements in the given lists.
     usage:
@@ -529,14 +531,30 @@ def shared(lst1:Any, lst2:Any,*args, verbose=True):
     """
     if verbose:
         print("\n********* checking shared elements *********")
-    if any([not isinstance(lst1,list),not isinstance(lst1,list)]):
-        print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")
-    shared_elements=set(flatten(lst1,verbose=verbose)).intersection(flatten(lst2,verbose=verbose))
-    # support more lists
-    if args:
-        for arg in args:
-            shared_elements=shared_elements.intersection(set(flatten(arg,verbose=verbose)))
-    shared_elements = list(shared_elements)
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [item for item, count in element_count.items() if count >= n_shared]
+
+    shared_elements = flatten(shared_elements)
     if verbose:
         elements2show = shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
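
The reworked shared() drops the fixed lst1/lst2 parameters: it now accepts any number of lists (or a single list of lists) and either strictly intersects them or, with strict=False, keeps items occurring at least n_shared times across the inputs. A minimal standalone sketch of that logic using only the standard library (shared_sketch is an illustrative stand-in, not the py2ls function, and it omits flatten() and the verbose printing):

    from collections import Counter

    def shared_sketch(*lists, strict=True, n_shared=2):
        # Mirror the new signature: accept shared_sketch(a, b, c) or shared_sketch([a, b, c])
        if len(lists) == 1 and isinstance(lists[0], (list, tuple)):
            lists = lists[0]
        lists = [list(lst) for lst in lists]
        if strict:
            # strict=True: keep only elements present in every list
            result = set(lists[0])
            for lst in lists[1:]:
                result.intersection_update(lst)
            return list(result)
        # strict=False: keep elements appearing at least n_shared times across all lists
        counts = Counter(item for lst in lists for item in lst)
        return [item for item, count in counts.items() if count >= n_shared]

    print(shared_sketch([1, 2, 3], [2, 3, 4], [3, 4, 5]))                # [3]
    print(shared_sketch([1, 2, 3], [2, 3, 4], [3, 4, 5], strict=False))  # [2, 3, 4]
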
@@ -555,17 +573,19 @@ def flatten(nested: Any, unique_list=True,verbose=True):
     while stack:
         current = stack.pop()
         if isinstance(current, dict):
-            stack.extend(current.values())
+            stack.extend(current.values())
         elif isinstance(current, (list, tuple, set)):
             stack.extend(current)
         elif isinstance(current, pd.Series):
             stack.extend(current)
+        elif isinstance(current, (pd.Index, np.ndarray)):  # df.columns df.index are object of type pd.Index
+            stack.extend(current.tolist())
         else:
             flattened_list.append(current)
     if verbose:
         print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
     if unique_list:
-        return unique(flattened_list)
+        return unique(flattened_list)[::-1]
     else:
         return flattened_list
 
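
With the new pd.Index / np.ndarray branch, flatten() now walks df.columns, df.index and NumPy arrays element by element instead of appending them as single objects, and the unique() result is reversed before being returned. A stripped-down sketch of the traversal only (flatten_sketch is illustrative and skips the unique/reverse and verbose steps):

    import numpy as np
    import pandas as pd

    def flatten_sketch(nested):
        # Iterative flattening mirroring the diff: dicts contribute their values,
        # containers/Series/Index/ndarray are expanded, scalars are collected.
        flattened, stack = [], [nested]
        while stack:
            current = stack.pop()
            if isinstance(current, dict):
                stack.extend(current.values())
            elif isinstance(current, (list, tuple, set)):
                stack.extend(current)
            elif isinstance(current, pd.Series):
                stack.extend(current)
            elif isinstance(current, (pd.Index, np.ndarray)):
                stack.extend(current.tolist())
            else:
                flattened.append(current)
        return flattened

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    print(flatten_sketch([df.columns, np.array([5, 6])]))  # [6, 5, 'b', 'a'] (LIFO stack order)
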
@@ -1618,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     """
     Usage
     is_abnormal = is_df_abnormal(df, verbose=1)
+    True: abnormal
+    False: normal
 
     """
     # Initialize a list to hold messages about abnormalities
@@ -1645,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if len(column_names) == 1 and delimiter_counts["\t"] > 1:
         messages.append("Abnormal: Column names are not split correctly.")
         is_abnormal = True
+        if verbose:
+            print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
 
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
 
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
+        if verbose:
+            print(f'delimiter_counts[""] > 3')
 
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
 
-    # Check for missing values
-    missing_values = df.isnull().sum()
-    if missing_values.any():
-        messages.append("Missing values in columns:")
-        messages.append(missing_values[missing_values > 0].to_string())
-        is_abnormal = True
+    # # Check for missing values
+    # missing_values = df.isnull().sum()
+    # if missing_values.any():
+    #     messages.append("Missing values in columns:")
+    #     messages.append(missing_values[missing_values > 0].to_string())
+    #     is_abnormal = True
+    #     print(f'missing_values.any()')
 
     # Check data types
     data_types = df.dtypes
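
The new verbose branches print which delimiter-count rule fired. The underlying idea is that a CSV read with the wrong separator leaves the separator characters embedded in the column names, which the function detects by counting them. A rough standalone illustration of that idea (count_delims is a hypothetical helper, not part of py2ls):

    import pandas as pd
    from io import StringIO

    def count_delims(columns, delimiters=("\t", ",", ";", "|", " ")):
        # Count how often each candidate delimiter appears inside the column names;
        # many hits suggest the header was not split on the right separator.
        return {d: sum(str(col).count(d) for col in columns) for d in delimiters}

    # Tab-separated data read with the default comma separator -> one mangled column name
    raw = "name\tage\tcity\nalice\t30\tberlin\n"
    df_bad = pd.read_csv(StringIO(raw))          # sep="," by default
    print(df_bad.columns.tolist())               # ['name\tage\tcity']
    print(count_delims(df_bad.columns))          # {'\t': 2, ...} -> looks abnormal

    df_ok = pd.read_csv(StringIO(raw), sep="\t")
    print(count_delims(df_ok.columns))           # all zeros -> looks normal
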
@@ -1674,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if constant_columns:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
+        if verbose:
+            print(f'df.columns[df.nunique() == 1].tolist()')
 
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
@@ -1681,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
             "Abnormal: DataFrame is too small (less than 2 rows or columns)."
         )
         is_abnormal = True
+        if verbose:
+            print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
 
     # Compile results
     if verbose:
@@ -1723,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs):
             content = yaml.safe_load(file)
         return content
 
-    def load_xml(fpath):
-        tree = etree.parse(fpath)
-        root = tree.getroot()
-        return etree.tostring(root, pretty_print=True).decode()
+
+    def load_xml(fpath, fsize_thr: int = 100):
+        def load_small_xml(fpath):
+            tree = etree.parse(fpath)
+            root = tree.getroot()
+            return etree.tostring(root, pretty_print=True).decode()
+
+        def load_large_xml(fpath):
+            xml_parts = []
+            context = etree.iterparse(
+                fpath, events=("start", "end"), recover=True, huge_tree=True
+            )
+
+            for event, elem in context:
+                if event == "end":
+                    xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
+                    elem.clear()
+                    while elem.getprevious() is not None:
+                        del elem.getparent()[0]
+            del context
+            return "".join(xml_parts)
+
+        file_size = os.path.getsize(fpath) / 1024 / 1024  # in MB
+
+        if file_size > fsize_thr:
+            print(f"reading a big file:{file_size} Mb")
+            return load_large_xml(fpath)
+        else:
+            print(f"reading a small file:{file_size} Mb")
+            return load_small_xml(fpath)
 
     def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
         """
@@ -1793,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs):
                 on_bad_lines=on_bad_lines,
                 **kwargs,
             )
+            if is_df_abnormal(df, verbose=0):
+                raise ValueError("the df is abnormal")
         except:
             try:
                 try:
@@ -1820,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs):
                         comment=comment,
                         **kwargs,
                     )
-
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
                 except (UnicodeDecodeError, ValueError):
@@ -1856,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     sep2show = sep if sep != "\t" else "\\t"
-                    print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                    # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                    # print(".")
                     try:
                         df = pd.read_csv(
                             fpath,
@@ -1868,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs):
                             **kwargs,
                         )
                         if not is_df_abnormal(df, verbose=0):  # normal
-                            break
-                        else:
-                            if is_df_abnormal(df, verbose=0):
-                                pass
+                            display(df.head(2))
+                            print(f"shape: {df.shape}")
+                            return df
                     except:
                         pass
                 else:
@@ -1880,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     try:
-                        sep2show = sep if sep != "\t" else "\\t"
-                        print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        # sep2show = sep if sep != "\t" else "\\t"
+                        # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        # print(".")
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1890,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs):
                             comment=comment,
                             **kwargs,
                         )
+                        # display(df.head(2))
+                        # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
                         if not is_df_abnormal(df, verbose=0):
-                            break
+                            display(df.head(2))
+                            print(f"shape: {df.shape}")
+                            return df
                     except EmptyDataError as e:
                         continue
                 else:
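
Both separator-retry loops now return the first DataFrame that passes the is_df_abnormal check (after displaying its head and shape) instead of breaking out and falling through. The pattern, sketched independently of fload with a crude stand-in for the abnormality check ("data.txt" is a placeholder path):

    import pandas as pd

    def looks_normal(df):
        # Crude stand-in for is_df_abnormal: header split into several columns,
        # and no leftover delimiters inside the column names.
        return df.shape[1] > 1 and not any("\t" in str(c) or "," in str(c) for c in df.columns)

    def read_with_separator_retry(fpath, separators=(",", "\t", ";", "|", " ")):
        # Try each candidate separator and return the first parse that looks sane.
        for sep in separators:
            try:
                df = pd.read_csv(fpath, sep=sep, engine="python")
            except Exception:
                continue
            if looks_normal(df):
                return df
        raise ValueError(f"could not parse {fpath} with any of {separators}")

    # df = read_with_separator_retry("data.txt")  # placeholder path
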
@@ -2393,15 +2460,20 @@ def fsave(
     # json.dump(data, file, **kwargs)
 
     def save_json(fpath_fname, var_dict_or_df):
+        def _convert_js(data):
+            if isinstance(data, pd.DataFrame):
+                return data.to_dict(orient="list")
+            elif isinstance(data, np.ndarray):
+                return data.tolist()
+            elif isinstance(data, dict):
+                return {key: _convert_js(value) for key, value in data.items()}
+            return data
+
+        serializable_data = _convert_js(var_dict_or_df)
+
+        # Save the serializable data to the JSON file
         with open(fpath_fname, "w") as f_json:
-            if isinstance(var_dict_or_df, pd.DataFrame):
-                var_dict_or_df = var_dict_or_df.to_dict(orient="dict")
-            if isinstance(var_dict_or_df, dict):
-                for key, value in var_dict_or_df.items():
-                    if isinstance(value, np.ndarray):
-                        var_dict_or_df[key] = value.tolist()
-            # Save the dictionary or list of dictionaries to a JSON file
-            json.dump(var_dict_or_df, f_json, indent=4)
+            json.dump(serializable_data, f_json, indent=4)
 
     # # Example usage:
     # sets = {"title": "mse_path_ MSE"}
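
save_json now routes everything through the recursive _convert_js helper, so nested DataFrames become column-oriented dicts and NumPy arrays become lists before json.dump runs. A self-contained sketch of the same idea (to_serializable is an illustrative name, not the library helper):

    import json
    import numpy as np
    import pandas as pd

    def to_serializable(data):
        # Recursively rewrite pandas/NumPy containers into plain Python objects
        # that the json module can encode.
        if isinstance(data, pd.DataFrame):
            return data.to_dict(orient="list")
        if isinstance(data, np.ndarray):
            return data.tolist()
        if isinstance(data, dict):
            return {key: to_serializable(value) for key, value in data.items()}
        return data

    payload = {"df": pd.DataFrame({"a": [1, 2]}), "arr": np.arange(3), "note": "ok"}
    print(json.dumps(to_serializable(payload), indent=2))
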
@@ -2645,7 +2717,7 @@ def listdir(
         print(ls)
     df_all = pd.DataFrame(
         {
-            "fname": all_files,
+            "fname": ls,
             "fpath": [os.path.join(rootdir, i) for i in ls],
         }
     )
@@ -4789,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 def df_merge(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
-    use_index: bool = True,
+    use_index: bool = False,
     columns: list = ["col_left", "col_right"],
     how: str = "left",
 ) -> pd.DataFrame:
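
Flipping the use_index default to False means df_merge now joins on the named key columns unless index-based merging is requested explicitly. For comparison, the two underlying pandas calls look roughly like this (col_left/col_right mirror the placeholder column names in the signature):

    import pandas as pd

    left = pd.DataFrame({"col_left": ["a", "b"], "x": [1, 2]})
    right = pd.DataFrame({"col_right": ["a", "b"], "y": [3, 4]})

    # use_index=False (new default): merge on the given key columns
    by_columns = pd.merge(left, right, left_on="col_left", right_on="col_right", how="left")

    # use_index=True (old default): merge on the row index instead
    by_index = pd.merge(left, right, left_index=True, right_index=True, how="left")

    print(by_columns)
    print(by_index)
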
@@ -4848,12 +4920,53 @@ def df_merge(
     )
     return df_merged
 
+def df_drop_duplicates(
+    data: pd.DataFrame,
+    by: Union[
+        str, List[str]
+    ] = "index",  # Options: 'index', or column name(s) for 'rows'
+    keep="first",  # Options: 'first', 'last', or False (drop all duplicates)
+    ignore_index=True,
+    inplace: bool = False,
+    verbose=True
+):
+    """
+    data (pd.DataFrame): DataFrame to drop duplicates from.
+    by (str): Specify by to drop duplicates:
+        - 'index': Drop duplicates based on the DataFrame index.
+        - Column name(s) for row-wise duplicate checking.
+    keep (str): Which duplicates to keep:
+        'first',
+        'last',
+        False (drop all duplicates).
+    inplace (bool): Whether to modify the original DataFrame in place.
+    """
+    original_shape = data.shape
+    if by == "index":
+        # Drop duplicates in the index
+        result = data[~data.index.duplicated(keep=keep)]
+    else:
+        # Drop duplicates row-wise based on column(s)
+        result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
+    if original_shape != result.shape or verbose:
+        print(f"\nshape:{original_shape} (before drop_duplicates)")
+        print(f"shape:{result.shape} (after drop_duplicates)")
+    if inplace:
+        # Modify the original DataFrame in place
+        data.drop(data.index, inplace=True)  # Drop all rows first
+        data[data.columns] = result  # Refill the DataFrame
+        return None
+    else:
+        return result
 def df_fillna(
     data: pd.DataFrame,
-    method: str = "mean",
+    method: str = "knn",
     axis: int = 0,  # column-wise
     constant: float = None,
+    n_neighbors: int = 5,  # KNN-specific
+    max_iter: int = 10,  # Iterative methods specific
     inplace: bool = True,
+    random_state: int = None
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
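
df_drop_duplicates is new in this release: by='index' filters duplicated index labels, while a column name (or list of names) defers to DataFrame.drop_duplicates; inplace=True rewrites the caller's frame and returns None. A hedged usage sketch against a toy frame (the import path is assumed from the diffed file being py2ls/ips.py):

    import pandas as pd
    from py2ls.ips import df_drop_duplicates  # assumed import path

    df = pd.DataFrame(
        {"name": ["a", "a", "b"], "score": [1, 2, 3]},
        index=["r1", "r1", "r2"],
    )

    # Keep the first row for each duplicated index label
    print(df_drop_duplicates(df, by="index", keep="first", inplace=False))

    # Drop rows that repeat a value in the "name" column, keeping the last one
    print(df_drop_duplicates(df, by="name", keep="last", inplace=False))
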
@@ -4865,8 +4978,15 @@ def df_fillna(
         - 'median': Replace missing values with the median of the column.
         - 'most_frequent': Replace missing values with the most frequent value in the column.
         - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
-        - 'knn': Use K-Nearest Neighbors imputation.
-        - 'iterative': Use Iterative imputation.
+        - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors.
+        - 'iterative': Use Iterative imputation; models each feature with missing values as a function of the other features and estimates them iteratively.
+        - 'mice' (Multivariate Imputation by Chained Equations): a special case of iterative imputation.
+        # - 'missforest': A random forest-based imputation method; uses a random forest model to predict and fill missing values.
+        # - 'softimpute': Matrix factorization imputation; missing values are imputed by
+        #   reconstructing the data matrix using a low-rank approximation.
+        # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
+        # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
+
     axis (int): The axis along which to impute:
         - 0: Impute column-wise (default).
         - 1: Impute row-wise.
@@ -4879,7 +4999,8 @@ def df_fillna(
         raise ValueError("Input DataFrame is empty.")
 
     # Validate method
-    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    methods = ["mean", "median", "most_frequent",
+               "constant", "knn", "iterative"]  # ,"missforest","softimpute","svd"]
     method = strcmp(method, methods)[0]
 
     # If using constant method, ask for a constant value
@@ -4892,18 +5013,27 @@ def df_fillna(
 
     # Initialize SimpleImputer with the chosen method
     if method == "constant":
+        from sklearn.impute import SimpleImputer
         imputer = SimpleImputer(strategy=method, fill_value=constant)
     elif method == "knn":
         from sklearn.impute import KNNImputer
-
         imputer = KNNImputer(n_neighbors=n_neighbors)
-    elif method == "iterative":
+    elif method == "iterative" or method == "mice":
+        from sklearn.experimental import enable_iterative_imputer
         from sklearn.impute import IterativeImputer
 
-        imputer = IterativeImputer(max_iter=max_iter)
-    else:
+        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
+    # elif method == "missforest":
+    #     from missingpy import MissForest
+    #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
+    # elif method == "softimpute":
+    #     from fancyimpute import SoftImpute
+    #     imputer = SoftImpute()
+    # elif method == "svd":
+    #     from fancyimpute import IterativeSVD
+    #     imputer = IterativeSVD(max_iters=max_iter)
+    else:  # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
-
         imputer = SimpleImputer(strategy=method)
 
     # Fit and transform the data
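
With the default now method='knn' and the new 'mice'/'iterative' branch, the imputation itself is delegated to scikit-learn. A minimal sketch of the two estimators those branches construct (IterativeImputer still requires the experimental enable_iterative_imputer import, exactly as the diff adds it):

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    df = pd.DataFrame({"A": [1.0, 2.0, np.nan, 4.0], "B": [np.nan, 2.0, 3.0, 4.0]})

    # method="knn": fill each gap from the n_neighbors closest rows
    knn_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)

    # method="iterative"/"mice": model each column from the others, iterating up to max_iter times
    mice_filled = pd.DataFrame(
        IterativeImputer(max_iter=10, random_state=0).fit_transform(df), columns=df.columns
    )

    print(knn_filled)
    print(mice_filled)
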
@@ -4929,8 +5059,38 @@ def df_fillna(
         return None  # replace original
     else:
         return df_filled
+    # # example
+    # data = {
+    #     "A": [1, 2, np.nan, 4, 5],
+    #     "B": [np.nan, 2, 3, 4, np.nan],
+    #     "C": [1, np.nan, 3, 4, 5],
+    #     "D": [1, 2, 3, 4, np.nan],
+    # }
+
+    # # Define a function to test each imputation method
+    # methods = [
+    #     "mean",
+    #     "median",
+    #     "most_frequent",
+    #     "constant",
+    #     "knn",
+    #     "iterative",
+    #     # "missforest",
+    #     # "softimpute",
+    #     # "svd",
+    # ]
+
+    # # Create a dictionary to hold results
+    # results = {}
+
+    # for method_name in methods:
+    #     print(method_name)
+    #     display(df)
+    #     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
+
+
 def df_scaler(
-    data: pd.DataFrame,
+    data: pd.DataFrame,  # should be numeric dtype
     method="standard",
     columns=None,  # default, select all numeric col/row
     inplace=False,
@@ -5414,7 +5574,7 @@ def df_reducer(
 
     # Select columns if specified, else use all columns
     X = data[columns].values if columns else data.values
-
+    print(X.shape, type(X))
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
@@ -5620,15 +5780,19 @@ def df_reducer(
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
             data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
-
         # Add extra info for PCA/UMAP
         if method == "pca":
-            data["Explained Variance"] = reduced_df["Explained Variance"]
-            data["Singular Values"] = reduced_df["Singular Values"]
-        elif method == "umap":
+            for i in range(n_components):
+                data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
+            for i in range(n_components):
+                data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
+        elif method == "umap":
+            for i in range(n_components):
+                data[f"UMAP_{i+1}"] = reduced_df[f"UMAP_{i+1}"]
             data["Embedding"] = reduced_df["Embedding"]
             data["Trustworthiness"] = reduced_df["Trustworthiness"]
         return None  # No return when inplace=True
+
 
     return reduced_df
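
The inplace branch of df_reducer now writes one 'Explained Variance PC_i' and 'Singular Values PC_i' column per component (and per-component UMAP_i columns for UMAP) instead of a single shared column. For reference, a sketch of where such per-component numbers come from with scikit-learn's PCA on random data; this is an illustration, not py2ls's df_reducer, and it uses explained_variance_ratio_ as the "explained variance" quantity:

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    data = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))

    n_components = 2
    pca = PCA(n_components=n_components)
    scores = pca.fit_transform(data)

    reduced = pd.DataFrame(
        scores, columns=[f"PCA_{i+1}" for i in range(n_components)], index=data.index
    )
    for i in range(n_components):
        # One scalar per component, broadcast into its own column as the diff does
        reduced[f"Explained Variance PC_{i+1}"] = pca.explained_variance_ratio_[i]
        reduced[f"Singular Values PC_{i+1}"] = pca.singular_values_[i]

    print(reduced.head())
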