py2ls 0.2.4.3__py3-none-any.whl → 0.2.4.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/bio.py +955 -18
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/ips.py +219 -55
- py2ls/ml2ls.py +1094 -0
- py2ls/netfinder.py +12 -1
- py2ls/plot.py +266 -71
- {py2ls-0.2.4.3.dist-info → py2ls-0.2.4.4.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.3.dist-info → py2ls-0.2.4.4.dist-info}/RECORD +11 -9
- {py2ls-0.2.4.3.dist-info → py2ls-0.2.4.4.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -106,6 +106,8 @@ def unique(lst, ascending=None):
     Returns:
         list: a list whose elements are unique, ordered according to the `ascending` parameter.
     """
+    if not lst:
+        return []
     if ascending is not None:
         # remove duplicates
         unique_items = list(set(lst))
@@ -518,7 +520,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-def shared(lst1:Any, lst2:Any,*args, verbose=True):
+def shared(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
@@ -529,14 +531,30 @@ def shared(lst1:Any, lst2:Any,*args, verbose=True):
     """
     if verbose:
         print("\n********* checking shared elements *********")
-
-
-
-
-
-
-
-
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0] # Unpack the single list
+    else:
+        lists = args # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [item for item, count in element_count.items() if count >= n_shared]
+
+    shared_elements = flatten(shared_elements)
     if verbose:
         elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
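The reworked shared() now accepts any number of lists: with strict=True it keeps only elements present in every list, otherwise it keeps elements found in at least n_shared of them. A minimal usage sketch of the new signature, inferred from this hunk (the import path simply mirrors py2ls/ips.py and is an assumption):

    # Hypothetical usage of the new shared() signature shown in the hunk above.
    from py2ls.ips import shared  # assumed import path for py2ls/ips.py

    a = ["gene1", "gene2", "gene3"]
    b = ["gene2", "gene3", "gene4"]
    c = ["gene3", "gene5"]

    # strict=True: only elements present in every list
    in_all = shared(a, b, c, strict=True, verbose=False)

    # strict=False: elements present in at least n_shared of the lists
    in_two = shared(a, b, c, strict=False, n_shared=2, verbose=False)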
@@ -555,17 +573,19 @@ def flatten(nested: Any, unique_list=True,verbose=True):
     while stack:
         current = stack.pop()
         if isinstance(current, dict):
-            stack.extend(current.values())
+            stack.extend(current.values())
         elif isinstance(current, (list, tuple, set)):
             stack.extend(current)
         elif isinstance(current, pd.Series):
             stack.extend(current)
+        elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
+            stack.extend(current.tolist())
         else:
             flattened_list.append(current)
     if verbose:
         print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
     if unique_list:
-        return unique(flattened_list)
+        return unique(flattened_list)[::-1]
     else:
         return flattened_list
 
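The new elif branch lets flatten() walk pd.Index and np.ndarray values (e.g. df.columns) by converting them to lists first, and the unique_list=True path now reverses the deduplicated result. A self-contained sketch of that traversal logic using plain pandas/numpy:

    # Standalone sketch of the stack-based traversal with the added Index/ndarray branch.
    import numpy as np
    import pandas as pd

    nested = {"cols": pd.DataFrame({"A": [1, 2], "B": [3, 4]}).columns, "vals": np.array([5, 6])}

    flattened = []
    stack = [nested]
    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            stack.extend(current.values())
        elif isinstance(current, (pd.Index, np.ndarray)):  # df.columns / df.index are pd.Index
            stack.extend(current.tolist())
        elif isinstance(current, (list, tuple, set)):
            stack.extend(current)
        else:
            flattened.append(current)
    # flattened now holds 'A', 'B', 5, 6 (order depends on stack traversal)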
@@ -1618,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     """
     Usage
     is_abnormal = is_df_abnormal(df, verbose=1)
+    True: abnormal
+    False: normal
 
     """
     # Initialize a list to hold messages about abnormalities
@@ -1645,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if len(column_names) == 1 and delimiter_counts["\t"] > 1:
         messages.append("Abnormal: Column names are not split correctly.")
         is_abnormal = True
+        if verbose:
+            print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
 
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
 
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
+        if verbose:
+            print(f'delimiter_counts[""] > 3')
 
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
 
-    # Check for missing values
-    missing_values = df.isnull().sum()
-    if missing_values.any():
-
-
-
+    # # Check for missing values
+    # missing_values = df.isnull().sum()
+    # if missing_values.any():
+    #     messages.append("Missing values in columns:")
+    #     messages.append(missing_values[missing_values > 0].to_string())
+    #     is_abnormal = True
+    #     print(f'missing_values.any()')
 
     # Check data types
     data_types = df.dtypes
@@ -1674,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if constant_columns:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
+        if verbose:
+            print(f'df.columns[df.nunique() == 1].tolist()')
 
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
@@ -1681,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
             "Abnormal: DataFrame is too small (less than 2 rows or columns)."
         )
         is_abnormal = True
+        if verbose:
+            print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
 
     # Compile results
     if verbose:
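These is_df_abnormal hunks only add verbose tracing: each heuristic that marks the frame abnormal now also prints the condition that fired. A hypothetical check-and-retry call, assuming is_df_abnormal is importable from py2ls.ips (the retry idea mirrors the fload hunks further below):

    import pandas as pd
    from py2ls.ips import is_df_abnormal  # assumed import path (py2ls/ips.py)

    df = pd.read_csv("data.tsv", sep=",")       # hypothetical file read with the wrong separator
    if is_df_abnormal(df, verbose=1):           # True means "abnormal"; verbose prints the failed heuristic
        df = pd.read_csv("data.tsv", sep="\t")  # retry with another separator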
@@ -1723,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs):
         content = yaml.safe_load(file)
         return content
 
-
-
-
-
+
+    def load_xml(fpath, fsize_thr: int = 100):
+        def load_small_xml(fpath):
+            tree = etree.parse(fpath)
+            root = tree.getroot()
+            return etree.tostring(root, pretty_print=True).decode()
+
+        def load_large_xml(fpath):
+            xml_parts = []
+            context = etree.iterparse(
+                fpath, events=("start", "end"), recover=True, huge_tree=True
+            )
+
+            for event, elem in context:
+                if event == "end":
+                    xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
+                    elem.clear()
+                    while elem.getprevious() is not None:
+                        del elem.getparent()[0]
+            del context
+            return "".join(xml_parts)
+
+        file_size = os.path.getsize(fpath) / 1024 / 1024 # in MB
+
+        if file_size > fsize_thr:
+            print(f"reading a small file:{file_size} Mb")
+            return load_large_xml(fpath)
+        else:
+            print(f"reading a big file:{file_size} Mb")
+            return load_small_xml(fpath)
 
     def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
         """
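fload gains a load_xml helper that dispatches on file size: small files go through a one-shot etree.parse, larger ones through a streaming etree.iterparse that clears elements as it serializes them. A standalone sketch of the streaming branch (only lxml calls that appear in the hunk are used):

    # Streaming XML read mirroring the load_large_xml branch above (requires lxml).
    from lxml import etree

    def stream_xml(fpath):
        parts = []
        context = etree.iterparse(fpath, events=("start", "end"), recover=True, huge_tree=True)
        for event, elem in context:
            if event == "end":
                parts.append(etree.tostring(elem, pretty_print=True).decode())
                elem.clear()  # release the element's children once serialized
                while elem.getprevious() is not None:
                    del elem.getparent()[0]  # drop already-processed siblings
        del context
        return "".join(parts)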
@@ -1793,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs):
                 on_bad_lines=on_bad_lines,
                 **kwargs,
             )
+            if is_df_abnormal(df, verbose=0):
+                raise ValueError("the df is abnormal")
         except:
             try:
                 try:
@@ -1820,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs):
                         comment=comment,
                         **kwargs,
                     )
-
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
                 except (UnicodeDecodeError, ValueError):
@@ -1856,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     sep2show = sep if sep != "\t" else "\\t"
-                    print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                    # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                    # print(".")
                     try:
                         df = pd.read_csv(
                             fpath,
@@ -1868,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs):
                             **kwargs,
                         )
                         if not is_df_abnormal(df, verbose=0):  # normal
-
-
-
-                            pass
+                            display(df.head(2))
+                            print(f"shape: {df.shape}")
+                            return df
                     except:
                         pass
             else:
@@ -1880,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     try:
-                        sep2show = sep if sep != "\t" else "\\t"
-                        print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        # sep2show = sep if sep != "\t" else "\\t"
+                        # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                        # print(".")
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1890,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs):
                             comment=comment,
                             **kwargs,
                         )
+                        # display(df.head(2))
+                        # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
                         if not is_df_abnormal(df, verbose=0):
-
+                            display(df.head(2))
+                            print(f"shape: {df.shape}")
+                            return df
                     except EmptyDataError as e:
                         continue
             else:
@@ -2393,15 +2460,20 @@ def fsave(
         # json.dump(data, file, **kwargs)
 
     def save_json(fpath_fname, var_dict_or_df):
+        def _convert_js(data):
+            if isinstance(data, pd.DataFrame):
+                return data.to_dict(orient="list")
+            elif isinstance(data, np.ndarray):
+                return data.tolist()
+            elif isinstance(data, dict):
+                return {key: _convert_js(value) for key, value in data.items()}
+            return data
+
+        serializable_data = _convert_js(var_dict_or_df)
+
+        # Save the serializable data to the JSON file
         with open(fpath_fname, "w") as f_json:
-
-            var_dict_or_df = var_dict_or_df.to_dict(orient="dict")
-            if isinstance(var_dict_or_df, dict):
-                for key, value in var_dict_or_df.items():
-                    if isinstance(value, np.ndarray):
-                        var_dict_or_df[key] = value.tolist()
-            # Save the dictionary or list of dictionaries to a JSON file
-            json.dump(var_dict_or_df, f_json, indent=4)
+            json.dump(serializable_data, f_json, indent=4)
 
     # # Example usage:
     # sets = {"title": "mse_path_ MSE"}
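save_json now funnels everything through a recursive _convert_js that turns DataFrames into column-oriented dicts, ndarrays into lists, and walks nested dicts, replacing the old DataFrame-only special case. The same idea as a standalone sketch (output path is hypothetical):

    # Standalone sketch of the recursive conversion used by save_json above.
    import json
    import numpy as np
    import pandas as pd

    def to_serializable(data):
        if isinstance(data, pd.DataFrame):
            return data.to_dict(orient="list")
        if isinstance(data, np.ndarray):
            return data.tolist()
        if isinstance(data, dict):
            return {k: to_serializable(v) for k, v in data.items()}
        return data

    payload = {"df": pd.DataFrame({"x": [1, 2]}), "arr": np.arange(3)}
    with open("example.json", "w") as f:  # hypothetical output file
        json.dump(to_serializable(payload), f, indent=4)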
@@ -2645,7 +2717,7 @@ def listdir(
         print(ls)
     df_all = pd.DataFrame(
         {
-            "fname":
+            "fname": ls,
             "fpath": [os.path.join(rootdir, i) for i in ls],
         }
     )
@@ -4789,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 def df_merge(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
-    use_index: bool =
+    use_index: bool = False,
     columns: list = ["col_left", "col_right"],
     how: str = "left",
 ) -> pd.DataFrame:
@@ -4848,12 +4920,53 @@ def df_merge(
     )
     return df_merged
 
+def df_drop_duplicates(
+    data: pd.DataFrame,
+    by: Union[
+        str, List[str]
+    ] = "index", # Options: 'index', or column name(s) for 'rows'
+    keep="first", # Options: 'first', 'last', or False (drop all duplicates)
+    ignore_index=True,
+    inplace: bool = False,
+    verbose=True
+):
+    """
+    data (pd.DataFrame): DataFrame to drop duplicates from.
+    by (str): Specify by to drop duplicates:
+        - 'index': Drop duplicates based on the DataFrame index.
+        - Column name(s) for row-wise duplicate checking.
+    keep (str): Which duplicates to keep:
+        'first',
+        'last',
+        False (drop all duplicates).
+    inplace (bool): Whether to modify the original DataFrame in place.
+    """
+    original_shape = data.shape
+    if by == "index":
+        # Drop duplicates in the index
+        result = data[~data.index.duplicated(keep=keep)]
+    else:
+        # Drop duplicates row-wise based on column(s)
+        result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
+    if original_shape!=result.shape or verbose:
+        print(f"\nshape:{original_shape} (before drop_duplicates)")
+        print(f"shape:{result.shape} (after drop_duplicates)")
+    if inplace:
+        # Modify the original DataFrame in place
+        data.drop(data.index, inplace=True)  # Drop all rows first
+        data[data.columns] = result  # Refill the DataFrame
+        return None
+    else:
+        return result
 def df_fillna(
     data: pd.DataFrame,
-    method: str = "
+    method: str = "knn",
     axis: int = 0,# column-wise
     constant: float = None,
+    n_neighbors: int = 5, # KNN-specific
+    max_iter: int = 10, # Iterative methods specific
     inplace: bool = True,
+    random_state:int = None
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
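df_drop_duplicates is new in this hunk: by="index" deduplicates on the index via index.duplicated, any other value is passed to DataFrame.drop_duplicates(subset=...), and inplace=True rewrites the caller's frame. A hypothetical call, assuming the helper is importable from py2ls.ips:

    # Hypothetical usage of the df_drop_duplicates helper added above.
    import pandas as pd
    from py2ls.ips import df_drop_duplicates  # assumed import path

    df = pd.DataFrame({"id": [1, 1, 2], "val": ["a", "a", "b"]}, index=[0, 0, 1])

    by_rows = df_drop_duplicates(df, by=["id", "val"], keep="first", verbose=False)
    by_index = df_drop_duplicates(df, by="index", keep="last", verbose=False)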
@@ -4865,8 +4978,15 @@ def df_fillna(
         - 'median': Replace missing values with the median of the column.
         - 'most_frequent': Replace missing values with the most frequent value in the column.
         - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
-        - 'knn': Use K-Nearest Neighbors imputation
-        - 'iterative': Use Iterative imputation
+        - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors
+        - 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
+        - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
+        # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
+        # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
+        #   reconstructing the data matrix using low-rank approximation
+        # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
+        # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
+
     axis (int): The axis along which to impute:
         - 0: Impute column-wise (default).
         - 1: Impute row-wise.
@@ -4879,7 +4999,8 @@ def df_fillna(
        raise ValueError("Input DataFrame is empty.")
 
    # Validate method
-    methods = ["mean", "median", "most_frequent",
+    methods = ["mean", "median", "most_frequent",
+               "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
    method = strcmp(method, methods)[0]
 
    # If using constant method, ask for a constant value
@@ -4892,18 +5013,27 @@ def df_fillna(
 
     # Initialize SimpleImputer with the chosen method
     if method == "constant":
+        from sklearn.impute import SimpleImputer
         imputer = SimpleImputer(strategy=method, fill_value=constant)
     elif method == "knn":
         from sklearn.impute import KNNImputer
-
         imputer = KNNImputer(n_neighbors=n_neighbors)
-    elif method == "iterative":
+    elif method == "iterative" or method == "mice":
+        from sklearn.experimental import enable_iterative_imputer
         from sklearn.impute import IterativeImputer
 
-        imputer = IterativeImputer(max_iter=max_iter)
-
+        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
+    # elif method == "missforest":
+    #     from missingpy import MissForest
+    #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
+    # elif method == "softimpute":
+    #     from fancyimpute import SoftImpute
+    #     imputer = SoftImpute()
+    # elif method == "svd":
+    #     from fancyimpute import IterativeSVD
+    #     imputer = IterativeSVD(max_iters=max_iter)
+    else: # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
-
         imputer = SimpleImputer(strategy=method)
 
     # Fit and transform the data
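The dispatch above maps method onto scikit-learn imputers ('mice' is now an alias for the iterative path, and IterativeImputer still requires the enable_iterative_imputer experimental import). For reference, the underlying scikit-learn calls used directly:

    # The scikit-learn imputers behind the "knn" and "iterative"/"mice" branches above.
    import numpy as np
    import pandas as pd
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must precede IterativeImputer)
    from sklearn.impute import IterativeImputer, KNNImputer

    df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [4.0, 5.0, np.nan]})

    knn_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)
    iter_filled = pd.DataFrame(
        IterativeImputer(max_iter=10, random_state=0).fit_transform(df), columns=df.columns
    )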
@@ -4929,8 +5059,38 @@ def df_fillna(
         return None # replace original
     else:
         return df_filled
+# # example
+# data = {
+#     "A": [1, 2, np.nan, 4, 5],
+#     "B": [np.nan, 2, 3, 4, np.nan],
+#     "C": [1, np.nan, 3, 4, 5],
+#     "D": [1, 2, 3, 4, np.nan],
+# }
+
+# # Define a function to test each imputation method
+# methods = [
+#     "mean",
+#     "median",
+#     "most_frequent",
+#     "constant",
+#     "knn",
+#     "iterative",
+#     # "missforest",
+#     # "softimpute",
+#     # "svd",
+# ]
+
+# # Create a dictionary to hold results
+# results = {}
+
+# for method_name in methods:
+#     print(method_name)
+#     display(df)
+#     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
+
+
 def df_scaler(
-    data: pd.DataFrame,
+    data: pd.DataFrame, # should be numeric dtype
     method="standard",
     columns=None, # default, select all numeric col/row
     inplace=False,
|
|
5414
5574
|
|
5415
5575
|
# Select columns if specified, else use all columns
|
5416
5576
|
X = data[columns].values if columns else data.values
|
5417
|
-
|
5577
|
+
print(X.shape,type(X))
|
5418
5578
|
# Handle missing values
|
5419
5579
|
if fill_missing:
|
5420
5580
|
imputer = SimpleImputer(strategy="mean")
|
@@ -5620,15 +5780,19 @@ def df_reducer(
|
|
5620
5780
|
# If inplace=True, add components back into the original data
|
5621
5781
|
for col_idx in range(n_components):
|
5622
5782
|
data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
|
5623
|
-
|
5624
5783
|
# Add extra info for PCA/UMAP
|
5625
5784
|
if method == "pca":
|
5626
|
-
|
5627
|
-
|
5628
|
-
|
5785
|
+
for i in range(n_components):
|
5786
|
+
data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
|
5787
|
+
for i in range(n_components):
|
5788
|
+
data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
|
5789
|
+
elif method == "umap":
|
5790
|
+
for i in range(n_components):
|
5791
|
+
data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
|
5629
5792
|
data["Embedding"] = reduced_df["Embedding"]
|
5630
5793
|
data["Trustworthiness"] = reduced_df["Trustworthiness"]
|
5631
5794
|
return None # No return when inplace=True
|
5795
|
+
|
5632
5796
|
|
5633
5797
|
return reduced_df
|
5634
5798
|
|
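With inplace=True, df_reducer now also writes per-component "Explained Variance PC_i" and "Singular Values PC_i" columns for PCA, and the UMAP coordinates plus embedding/trustworthiness info for UMAP, back into data. For reference, a sketch of where such PCA quantities typically come from in scikit-learn (an assumption about the upstream computation; the hunk itself only copies columns from reduced_df):

    # Likely source of the per-component PCA extras referenced above (scikit-learn PCA).
    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.default_rng(0).normal(size=(100, 5))
    pca = PCA(n_components=2).fit(X)

    scores = pca.transform(X)                  # the PC_1, PC_2 columns
    explained = pca.explained_variance_ratio_  # one "Explained Variance" value per component
    singular = pca.singular_values_            # one "Singular Values" value per component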