py2ls 0.2.4.2__py3-none-any.whl → 0.2.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/bio.py +1225 -47
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/ips.py +523 -110
- py2ls/ml2ls.py +1094 -0
- py2ls/netfinder.py +12 -1
- py2ls/plot.py +290 -75
- {py2ls-0.2.4.2.dist-info → py2ls-0.2.4.4.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.2.dist-info → py2ls-0.2.4.4.dist-info}/RECORD +11 -9
- {py2ls-0.2.4.2.dist-info → py2ls-0.2.4.4.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
 
 from . import netfinder
 
-# from .plot import get_color
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -108,6 +106,8 @@ def unique(lst, ascending=None):
     Returns:
         list: a list with unique elements, ordered according to the `ascending` parameter.
     """
+    if not lst:
+        return []
     if ascending is not None:
         # remove duplicates
         unique_items = list(set(lst))
@@ -518,6 +518,77 @@ def is_text(s):
     return has_alpha and has_non_alpha
 
 
+from typing import Any, Union
+
+def shared(*args, strict=True, n_shared=2, verbose=True):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0] # Unpack the single list
+    else:
+        lists = args # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [item for item, count in element_count.items() if count >= n_shared]
+
+    shared_elements = flatten(shared_elements)
+    if verbose:
+        elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def flatten(nested: Any, unique_list=True,verbose=True):
+    """
+    Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
+    Parameters:
+        nested : Any, Can be a list, tuple, dictionary, or set.
+    Returns: list, A flattened list.
+    """
+    flattened_list = []
+    stack = [nested]
+    while stack:
+        current = stack.pop()
+        if isinstance(current, dict):
+            stack.extend(current.values())
+        elif isinstance(current, (list, tuple, set)):
+            stack.extend(current)
+        elif isinstance(current, pd.Series):
+            stack.extend(current)
+        elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
+            stack.extend(current.tolist())
+        else:
+            flattened_list.append(current)
+    if verbose:
+        print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
+    if unique_list:
+        return unique(flattened_list)[::-1]
+    else:
+        return flattened_list
+
 def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
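The two helpers added here flatten arbitrarily nested containers and then intersect them. A minimal usage sketch, assuming both functions are importable from py2ls.ips and that strict mode keeps only elements present in every input:

    from py2ls.ips import flatten, shared

    nested = {"a": [1, 2, (3, 4)], "b": {5, 6}}
    flat = flatten(nested, verbose=False)          # deduplicated flat list of 1..6
    common = shared([1, 2, 3], [2, 3, 4], [3, 5])  # strict: only 3 appears in all lists
    loose = shared([1, 2, 3], [2, 3, 4], [3, 5], strict=False, n_shared=2)  # elements in >= 2 lists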
@@ -548,7 +619,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower():#Ratio (Strictest)
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
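With this change the `scorer` argument also accepts values containing "stri" (e.g. "strict") to select the plain fuzz.ratio scorer. An illustrative call, assuming strcmp is importable from py2ls.ips and returns the best-matching candidate first, as the later strcmp(method, methods)[0] calls in this diff suggest:

    from py2ls.ips import strcmp

    candidates = ["read_csv", "read_excel", "to_csv"]
    best = strcmp("read csv", candidates, scorer="strict")[0]  # matched via fuzz.ratio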
@@ -1567,6 +1638,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     """
     Usage
     is_abnormal = is_df_abnormal(df, verbose=1)
+    True: abnormal
+    False: normal
 
     """
     # Initialize a list to hold messages about abnormalities
@@ -1594,25 +1667,34 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if len(column_names) == 1 and delimiter_counts["\t"] > 1:
         messages.append("Abnormal: Column names are not split correctly.")
         is_abnormal = True
+        if verbose:
+            print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
 
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
 
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
+        if verbose:
+            print(f'delimiter_counts[""] > 3')
 
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
+        if verbose:
+            print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
 
-    # Check for missing values
-    missing_values = df.isnull().sum()
-    if missing_values.any():
-
-
-
+    # # Check for missing values
+    # missing_values = df.isnull().sum()
+    # if missing_values.any():
+    #     messages.append("Missing values in columns:")
+    #     messages.append(missing_values[missing_values > 0].to_string())
+    #     is_abnormal = True
+    #     print(f'missing_values.any()')
 
     # Check data types
     data_types = df.dtypes
@@ -1623,6 +1705,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     if constant_columns:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
+        if verbose:
+            print(f'df.columns[df.nunique() == 1].tolist()')
 
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
@@ -1630,6 +1714,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
             "Abnormal: DataFrame is too small (less than 2 rows or columns)."
         )
         is_abnormal = True
+        if verbose:
+            print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
 
     # Compile results
     if verbose:
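Per the docstring added above, is_df_abnormal returns True for a malformed frame and False for a normal one. A small sketch of the intended use inside a loading routine, assuming the function is importable from py2ls.ips (the file name is a placeholder):

    import pandas as pd
    from py2ls.ips import is_df_abnormal

    df = pd.read_csv("data.csv", sep=",")
    if is_df_abnormal(df, verbose=True):
        # e.g. retry with another separator or engine, as fload does below
        df = pd.read_csv("data.csv", sep="\t")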
@@ -1672,10 +1758,36 @@ def fload(fpath, kind=None, **kwargs):
             content = yaml.safe_load(file)
         return content
 
-
-
-
-
+
+    def load_xml(fpath, fsize_thr: int = 100):
+        def load_small_xml(fpath):
+            tree = etree.parse(fpath)
+            root = tree.getroot()
+            return etree.tostring(root, pretty_print=True).decode()
+
+        def load_large_xml(fpath):
+            xml_parts = []
+            context = etree.iterparse(
+                fpath, events=("start", "end"), recover=True, huge_tree=True
+            )
+
+            for event, elem in context:
+                if event == "end":
+                    xml_parts.append(etree.tostring(elem, pretty_print=True).decode())
+                    elem.clear()
+                    while elem.getprevious() is not None:
+                        del elem.getparent()[0]
+            del context
+            return "".join(xml_parts)
+
+        file_size = os.path.getsize(fpath) / 1024 / 1024 # in MB
+
+        if file_size > fsize_thr:
+            print(f"reading a small file:{file_size} Mb")
+            return load_large_xml(fpath)
+        else:
+            print(f"reading a big file:{file_size} Mb")
+            return load_small_xml(fpath)
 
     def get_comment(fpath, comment=None, encoding="utf-8", lines_to_check=5):
         """
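The nested load_large_xml above streams elements with lxml's iterparse and frees them as it goes, while small files are parsed in one pass. A minimal standalone sketch of that size-based dispatch (the function name and the 100 MB threshold are illustrative):

    import os
    from lxml import etree

    def read_xml(fpath, fsize_thr_mb=100):
        # Parse small files in one go; stream large ones element by element.
        size_mb = os.path.getsize(fpath) / 1024 / 1024
        if size_mb <= fsize_thr_mb:
            return etree.tostring(etree.parse(fpath).getroot(), pretty_print=True).decode()
        parts = []
        for _, elem in etree.iterparse(fpath, events=("end",), recover=True, huge_tree=True):
            parts.append(etree.tostring(elem, pretty_print=True).decode())
            elem.clear()  # release memory for already-processed elements
        return "".join(parts)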
@@ -1721,7 +1833,7 @@ def fload(fpath, kind=None, **kwargs):
         fmt=kwargs.pop("fmt",False)
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_csv", verbose=verbose)
             return
 
         if comment is None:
@@ -1742,6 +1854,8 @@ def fload(fpath, kind=None, **kwargs):
                 on_bad_lines=on_bad_lines,
                 **kwargs,
             )
+            if is_df_abnormal(df, verbose=0):
+                raise ValueError("the df is abnormal")
         except:
             try:
                 try:
@@ -1769,7 +1883,6 @@ def fload(fpath, kind=None, **kwargs):
                         comment=comment,
                         **kwargs,
                     )
-
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
                 except (UnicodeDecodeError, ValueError):
@@ -1805,7 +1918,8 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 sep2show = sep if sep != "\t" else "\\t"
-                print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
+                # print(".")
                 try:
                     df = pd.read_csv(
                         fpath,
@@ -1817,10 +1931,9 @@ def fload(fpath, kind=None, **kwargs):
                         **kwargs,
                     )
                     if not is_df_abnormal(df, verbose=0): # normal
-
-
-
-                        pass
+                        display(df.head(2))
+                        print(f"shape: {df.shape}")
+                        return df
                 except:
                     pass
             else:
@@ -1829,8 +1942,9 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 try:
-                    sep2show = sep if sep != "\t" else "\\t"
-                    print(f"trying with: engine={engine}, sep='{sep2show}'")
+                    # sep2show = sep if sep != "\t" else "\\t"
+                    # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                    # print(".")
                     df = pd.read_csv(
                         fpath,
                         engine=engine,
@@ -1839,8 +1953,12 @@ def fload(fpath, kind=None, **kwargs):
                         comment=comment,
                         **kwargs,
                     )
+                    # display(df.head(2))
+                    # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
                     if not is_df_abnormal(df, verbose=0):
-
+                        display(df.head(2))
+                        print(f"shape: {df.shape}")
+                        return df
                 except EmptyDataError as e:
                     continue
             else:
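The retry loops above cycle through candidate separators and keep the first frame that is_df_abnormal accepts. A minimal standalone sketch of that idea (the helper name and the crude column-count check are illustrative, not the package's fload):

    import pandas as pd

    def read_with_separator_fallback(fpath, separators=(",", "\t", ";", "|", " ")):
        # Try each separator and return the first DataFrame that parses into more than one column.
        for sep in separators:
            try:
                df = pd.read_csv(fpath, sep=sep)
            except Exception:
                continue
            if df.shape[1] > 1:  # crude stand-in for is_df_abnormal()
                return df
        raise ValueError("no separator produced a usable DataFrame")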
@@ -1853,7 +1971,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
             meata=pd.ExcelFile(fpath)
@@ -2263,7 +2381,7 @@ def fsave(
 
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
             sep=",",
@@ -2295,7 +2413,7 @@ def fsave(
         verbose=kwargs.pop("verbose",False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
         if verbose:
-
+            use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
         else:
@@ -2342,15 +2460,20 @@ def fsave(
         # json.dump(data, file, **kwargs)
 
     def save_json(fpath_fname, var_dict_or_df):
+        def _convert_js(data):
+            if isinstance(data, pd.DataFrame):
+                return data.to_dict(orient="list")
+            elif isinstance(data, np.ndarray):
+                return data.tolist()
+            elif isinstance(data, dict):
+                return {key: _convert_js(value) for key, value in data.items()}
+            return data
+
+        serializable_data = _convert_js(var_dict_or_df)
+
+        # Save the serializable data to the JSON file
         with open(fpath_fname, "w") as f_json:
-
-            var_dict_or_df = var_dict_or_df.to_dict(orient="dict")
-            if isinstance(var_dict_or_df, dict):
-                for key, value in var_dict_or_df.items():
-                    if isinstance(value, np.ndarray):
-                        var_dict_or_df[key] = value.tolist()
-            # Save the dictionary or list of dictionaries to a JSON file
-            json.dump(var_dict_or_df, f_json, indent=4)
+            json.dump(serializable_data, f_json, indent=4)
 
     # # Example usage:
     # sets = {"title": "mse_path_ MSE"}
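The new _convert_js recursively converts DataFrames, ndarrays, and nested dicts into JSON-serializable types before dumping. The same conversion logic shown standalone (names here are illustrative, not the package API):

    import json
    import numpy as np
    import pandas as pd

    def to_jsonable(data):
        # Recursively convert DataFrames and ndarrays into JSON-friendly types.
        if isinstance(data, pd.DataFrame):
            return data.to_dict(orient="list")
        if isinstance(data, np.ndarray):
            return data.tolist()
        if isinstance(data, dict):
            return {k: to_jsonable(v) for k, v in data.items()}
        return data

    payload = {"scores": np.arange(3), "table": pd.DataFrame({"a": [1, 2]})}
    print(json.dumps(to_jsonable(payload), indent=2))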
@@ -2594,7 +2717,7 @@ def listdir(
         print(ls)
     df_all = pd.DataFrame(
         {
-            "fname":
+            "fname": ls,
             "fpath": [os.path.join(rootdir, i) for i in ls],
         }
     )
@@ -4444,7 +4567,42 @@ def preview(var):
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
-
+def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
+    """
+    Extend a DataFrame by the list elecments in the column.
+
+    Parameters:
+    ----------
+    data : pd.DataFrame
+        The input DataFrame to be extended.
+
+    column : str
+        The name of the column to be split.
+
+    axis : int, optional
+        The axis along which to expand the DataFrame.
+        - 0 (default): Expand the specified column into multiple rows.
+        - 1: Expand the specified column into multiple columns.
+
+    sep : str, optional
+        The separator used to split the values in the specified column.
+        Must be provided for the function to work correctly.
+    """
+
+    data = data.copy()
+    mask = data[column].str.contains(sep, na=False)
+    data = data.copy()
+    if mask.any():
+        data[column] = (
+            data[column]
+            .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
+        )
+
+        # Strip spaces from each item in the lists
+        data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
+
+        data = data.explode(column, ignore_index=True)
+    return data
 # ! DataFrame
 def df_astype(
     data: pd.DataFrame,
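A usage sketch for the new df_extend, assuming it is exported by py2ls.ips; the column values and separator are made up:

    import pandas as pd
    from py2ls.ips import df_extend

    df = pd.DataFrame({"gene": ["TP53; BRCA1", "EGFR"], "score": [1, 2]})
    # Split 'gene' on ';' and explode each entry onto its own row (the axis=0 behaviour).
    extended = df_extend(df, column="gene", sep=";")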
@@ -4703,7 +4861,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 def df_merge(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
-    use_index: bool =
+    use_index: bool = False,
     columns: list = ["col_left", "col_right"],
     how: str = "left",
 ) -> pd.DataFrame:
@@ -4731,7 +4889,7 @@ def df_merge(
     """
 
     # 1. Check if indices are comparable (same length and types)
-    if use_index
+    if use_index:
         print(f"Merging based on index using '{how}' join...")
         df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
         return df_merged
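An illustrative call of the corrected index-based merge, assuming df_merge is exported by py2ls.ips; the toy frames are made up:

    import pandas as pd
    from py2ls.ips import df_merge

    left = pd.DataFrame({"value_a": [1, 2]}, index=["s1", "s2"])
    right = pd.DataFrame({"value_b": [3, 4]}, index=["s1", "s2"])
    merged = df_merge(left, right, use_index=True, how="left")  # aligned on the shared index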
@@ -4762,12 +4920,53 @@ def df_merge(
     )
     return df_merged
 
+def df_drop_duplicates(
+    data: pd.DataFrame,
+    by: Union[
+        str, List[str]
+    ] = "index", # Options: 'index', or column name(s) for 'rows'
+    keep="first", # Options: 'first', 'last', or False (drop all duplicates)
+    ignore_index=True,
+    inplace: bool = False,
+    verbose=True
+):
+    """
+    data (pd.DataFrame): DataFrame to drop duplicates from.
+    by (str): Specify by to drop duplicates:
+        - 'index': Drop duplicates based on the DataFrame index.
+        - Column name(s) for row-wise duplicate checking.
+    keep (str): Which duplicates to keep:
+        'first',
+        'last',
+        False (drop all duplicates).
+    inplace (bool): Whether to modify the original DataFrame in place.
+    """
+    original_shape = data.shape
+    if by == "index":
+        # Drop duplicates in the index
+        result = data[~data.index.duplicated(keep=keep)]
+    else:
+        # Drop duplicates row-wise based on column(s)
+        result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
+    if original_shape!=result.shape or verbose:
+        print(f"\nshape:{original_shape} (before drop_duplicates)")
+        print(f"shape:{result.shape} (after drop_duplicates)")
+    if inplace:
+        # Modify the original DataFrame in place
+        data.drop(data.index, inplace=True) # Drop all rows first
+        data[data.columns] = result # Refill the DataFrame
+        return None
+    else:
+        return result
 def df_fillna(
     data: pd.DataFrame,
-    method: str = "
+    method: str = "knn",
     axis: int = 0,# column-wise
     constant: float = None,
+    n_neighbors: int = 5, # KNN-specific
+    max_iter: int = 10, # Iterative methods specific
     inplace: bool = True,
+    random_state:int = None
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
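A usage sketch for the new df_drop_duplicates, assuming it is exported by py2ls.ips; the example frame is made up:

    import pandas as pd
    from py2ls.ips import df_drop_duplicates

    df = pd.DataFrame({"id": [1, 1, 2], "val": ["a", "a", "b"]}, index=["r1", "r1", "r2"])
    dedup_by_index = df_drop_duplicates(df, by="index", keep="first", inplace=False)
    dedup_by_cols = df_drop_duplicates(df, by=["id", "val"], keep="first", inplace=False)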
@@ -4779,8 +4978,15 @@ def df_fillna(
         - 'median': Replace missing values with the median of the column.
         - 'most_frequent': Replace missing values with the most frequent value in the column.
         - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
-        - 'knn': Use K-Nearest Neighbors imputation
-        - 'iterative': Use Iterative imputation
+        - 'knn': Use K-Nearest Neighbors imputation; replaces missing values based on the values of the nearest neighbors
+        - 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
+        - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
+        # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
+        # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
+        #   reconstructing the data matrix using low-rank approximation
+        # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
+        # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
+
     axis (int): The axis along which to impute:
         - 0: Impute column-wise (default).
         - 1: Impute row-wise.
@@ -4793,7 +4999,8 @@ def df_fillna(
         raise ValueError("Input DataFrame is empty.")
 
     # Validate method
-    methods = ["mean", "median", "most_frequent",
+    methods = ["mean", "median", "most_frequent",
+               "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
     method = strcmp(method, methods)[0]
 
     # If using constant method, ask for a constant value
@@ -4806,18 +5013,27 @@ def df_fillna(
 
     # Initialize SimpleImputer with the chosen method
     if method == "constant":
+        from sklearn.impute import SimpleImputer
         imputer = SimpleImputer(strategy=method, fill_value=constant)
     elif method == "knn":
         from sklearn.impute import KNNImputer
-
         imputer = KNNImputer(n_neighbors=n_neighbors)
-    elif method == "iterative":
+    elif method == "iterative" or method == "mice":
+        from sklearn.experimental import enable_iterative_imputer
         from sklearn.impute import IterativeImputer
 
-        imputer = IterativeImputer(max_iter=max_iter)
-
+        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
+    # elif method == "missforest":
+    #     from missingpy import MissForest
+    #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
+    # elif method == "softimpute":
+    #     from fancyimpute import SoftImpute
+    #     imputer = SoftImpute()
+    # elif method == "svd":
+    #     from fancyimpute import IterativeSVD
+    #     imputer = IterativeSVD(max_iters=max_iter)
+    else: # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
-
         imputer = SimpleImputer(strategy=method)
 
     # Fit and transform the data
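The dispatch above maps method names onto scikit-learn imputers. A minimal standalone sketch of the 'knn' and 'iterative'/'mice' branches on toy data (scikit-learn only; the frame and parameters are illustrative):

    import numpy as np
    import pandas as pd
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer, KNNImputer

    df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [4.0, 5.0, np.nan]})
    # KNN imputation, as selected by method="knn"
    knn_filled = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)
    # Iterative ("mice"-style) imputation, as selected by method="iterative" or "mice"
    mice_filled = pd.DataFrame(
        IterativeImputer(max_iter=10, random_state=0).fit_transform(df), columns=df.columns
    )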
@@ -4843,8 +5059,38 @@ def df_fillna(
             return None # replace original
         else:
             return df_filled
+# # example
+# data = {
+#     "A": [1, 2, np.nan, 4, 5],
+#     "B": [np.nan, 2, 3, 4, np.nan],
+#     "C": [1, np.nan, 3, 4, 5],
+#     "D": [1, 2, 3, 4, np.nan],
+# }
+
+# # Define a function to test each imputation method
+# methods = [
+#     "mean",
+#     "median",
+#     "most_frequent",
+#     "constant",
+#     "knn",
+#     "iterative",
+#     # "missforest",
+#     # "softimpute",
+#     # "svd",
+# ]
+
+# # Create a dictionary to hold results
+# results = {}
+
+# for method_name in methods:
+#     print(method_name)
+#     display(df)
+#     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
+
+
 def df_scaler(
-    data: pd.DataFrame,
+    data: pd.DataFrame, # should be numeric dtype
     method="standard",
     columns=None, # default, select all numeric col/row
     inplace=False,
@@ -4984,7 +5230,7 @@ def df_cluster(
     X = scaler.fit_transform(X)
 
     for n_cluster in range_n_clusters:
-        kmeans = KMeans(n_clusters=n_cluster, random_state=
+        kmeans = KMeans(n_clusters=n_cluster, random_state=1)
         cluster_labels = kmeans.fit_predict(X)
 
         silhouette_avg = silhouette_score(X, cluster_labels)
@@ -5000,7 +5246,7 @@ def df_cluster(
         print(f"n_clusters = {n_clusters}")
 
     # Apply K-Means Clustering with Optimal Number of Clusters
-    kmeans = KMeans(n_clusters=n_clusters, random_state=
+    kmeans = KMeans(n_clusters=n_clusters, random_state=1)
     cluster_labels = kmeans.fit_predict(X)
 
     if plot:
@@ -5101,7 +5347,7 @@ def df_cluster(
         # n_clusters = (
         #     np.argmax(silhouette_avg_scores) + 2
         # ) # Optimal clusters based on max silhouette score
-        # kmeans = KMeans(n_clusters=n_clusters, random_state=
+        # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
         # cluster_labels = kmeans.fit_predict(X)
         silhouette_vals = silhouette_samples(X, cluster_labels)
 
@@ -5252,12 +5498,14 @@ def df_reducer(
     columns: Optional[List[str]] = None,
     method: str = "umap", # 'pca', 'umap'
     n_components: int = 2, # Default for umap, but 50 for PCA
-    umap_neighbors: int = 15, #
-    umap_min_dist: float = 0.1, #
+    umap_neighbors: int = 15, # UMAP-specific
+    umap_min_dist: float = 0.1, # UMAP-specific
+    tsne_perplexity: int = 30, # t-SNE-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True, # replace the oringinal data
+    plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
 ) -> pd.DataFrame:
     """
     Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5293,14 +5541,40 @@ def df_reducer(
     reduced_df : pd.DataFrame
         DataFrame with the reduced dimensions.
     """
-
+
+    """
+    PCA: explained_variance:
+        indicates the proportion of the dataset's total variance that each principal
+        component (PC) explains. It gives you a sense of how much information
+        (or variance) is captured by each PC
+        Interpretation:
+            - Higher values indicate that the corresponding PC captures more variance.
+            - The sum of the explained variances for all PCs equals 1 (or 100%).
+            - If the first few components explain a high percentage (e.g., 90%),
+              it means you can reduce the dimensionality of the data significantly without losing much information.
+        Use case:
+            You may plot a scree plot, which shows the explained variance for each PC, to help decide
+            how many components to keep for analysis.
+
+    PCA: Singular values:
+        represent the magnitude of variance along each principal component. Mathematically,
+        they are the square roots of the eigenvalues of the covariance matrix.
+        Interpretation:
+            Larger singular values indicate that the associated PC captures more variance.
+            Singular values are related to the scale of the data. If the data are scaled
+            before PCA (e.g., standardized), then the singular values will provide a measure
+            of the spread of data along each PC.
+        Use case:
+            Singular values help quantify the contribution of each principal component in a
+            similar way to the explained variance. They are useful in understanding the overall
+            structure of the data.
+    """
     from sklearn.preprocessing import StandardScaler
-    import umap
     from sklearn.impute import SimpleImputer
 
     # Select columns if specified, else use all columns
     X = data[columns].values if columns else data.values
-
+    print(X.shape,type(X))
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
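The notes added above on explained variance and singular values can be reproduced directly with scikit-learn; a short sketch on random data (the shapes and seed are arbitrary):

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 5))
    pca = PCA(n_components=2).fit(X)
    print(pca.explained_variance_ratio_)  # proportion of total variance captured by each PC
    print(pca.singular_values_)           # magnitude of variance along each PC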
@@ -5312,76 +5586,215 @@ def df_reducer(
     X = scaler.fit_transform(X)
 
     # Check valid method input
-
-
-
+    methods=["pca", "umap","tsne","factor","isolation_forest"]
+    method=strcmp(method, methods)[0]
     # Apply PCA if selected
-    if method == "pca":
-
-        # to get the n_components with threshold method:
-        pca = PCA()
-        pca_result = pca.fit_transform(X)
-
-        # Calculate explained variance
-        explained_variance = pca.explained_variance_ratio_
-        # Cumulative explained variance
-        cumulative_variance = np.cumsum(explained_variance)
-        # Set a threshold for cumulative variance
-        threshold = 0.95 # Example threshold
-        n_components = (
-            np.argmax(cumulative_variance >= threshold) + 1
-        ) # Number of components to retain
-        if debug:
-            # debug:
-            # Plot the cumulative explained variance
-            plt.figure(figsize=(8, 5))
-            plt.plot(
-                range(1, len(cumulative_variance) + 1),
-                cumulative_variance,
-                marker="o",
-                linestyle="-",
-            )
-            plt.title("Cumulative Explained Variance by Principal Components")
-            plt.xlabel("Number of Principal Components")
-            plt.ylabel("Cumulative Explained Variance")
-            plt.xticks(range(1, len(cumulative_variance) + 1))
-            # Add horizontal line for the threshold
-            plt.axhline(
-                y=threshold, color="r", linestyle="--", label="Threshold (95%)"
-            )
-            # Add vertical line for n_components
-            plt.axvline(
-                x=n_components,
-                color="g",
-                linestyle="--",
-                label=f"n_components = {n_components}",
-            )
-            plt.legend()
-            plt.grid()
+    if method == "pca":
+        from sklearn.decomposition import PCA
         pca = PCA(n_components=n_components)
         X_reduced = pca.fit_transform(X)
-
+
+        # Additional PCA information
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+        loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
+
+        if debug:
+            print(f"PCA completed: Reduced to {n_components} components.")
+            print(f"Explained Variance: {explained_variance}")
+            print(f"Singular Values: {singular_values}")
+
+        # Plot explained variance if debug=True
+        if debug:
+            # Plot explained variance
+            cumulative_variance = np.cumsum(explained_variance)
+            plt.figure(figsize=(8, 5))
+            plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
+            plt.title("Cumulative Explained Variance by Principal Components")
+            plt.xlabel("Number of Principal Components")
+            plt.ylabel("Cumulative Explained Variance")
+            plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
+            plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
+            plt.legend()
+            plt.grid()
+            plt.show()
+
+        # Prepare reduced DataFrame with additional PCA info
+        pca_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+        # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
+        # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
+        # Expand explained variance to multiple columns if needed
+        for i in range(n_components):
+            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
+        for i in range(n_components):
+            pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
 
     # Apply UMAP if selected
     elif method == "umap":
+        import umap
         umap_reducer = umap.UMAP(
             n_neighbors=umap_neighbors,
             min_dist=umap_min_dist,
-            n_components=n_components
+            n_components=n_components
         )
         X_reduced = umap_reducer.fit_transform(X)
-        print(f"UMAP completed: Reduced to {n_components} components.")
 
-
-
+        # Additional UMAP information
+        embedding = umap_reducer.embedding_
+        trustworthiness = umap_reducer._raw_data[:, :n_components]
+
+        if debug:
+            print(f"UMAP completed: Reduced to {n_components} components.")
+            print(f"Embedding Shape: {embedding.shape}")
+            print(f"Trustworthiness: {trustworthiness}")
+
+        # Prepare reduced DataFrame with additional UMAP info
+        umap_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"UMAP_{i+1}" for i in range(n_components)]
+        )
+        umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
+        umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
+    elif method == "tsne":
+        from sklearn.manifold import TSNE
+        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
+        X_reduced = tsne.fit_transform(X)
+
+        # Prepare reduced DataFrame with additional t-SNE info
+        tsne_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"tSNE_{i+1}" for i in range(n_components)]
+        )
+        tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
+
+    # Apply Factor Analysis if selected
+    elif method == "factor":
+        from sklearn.decomposition import FactorAnalysis
+        factor = FactorAnalysis(n_components=n_components, random_state=1)
+        X_reduced = factor.fit_transform(X)
+        # Factor Analysis does not directly provide explained variance, but we can approximate it
+        fa_variance = factor.noise_variance_
+        # Prepare reduced DataFrame with additional Factor Analysis info
+        factor_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"Factor_{i+1}" for i in range(n_components)]
+        )
+        factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
+
+    # Apply Isolation Forest for outlier detection if selected
+    elif method == "isolation_forest":
+        from sklearn.decomposition import PCA
+        from sklearn.ensemble import IsolationForest
+        # Step 1: Apply PCA for dimensionality reduction to 2 components
+        pca = PCA(n_components=n_components)
+        X_pca = pca.fit_transform(X)
+
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+
+        # Prepare reduced DataFrame with additional PCA info
+        iso_forest_df = pd.DataFrame(
+            X_pca, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+
+        isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
+        isolation_forest.fit(X)
+        anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
+        # Predict labels: 1 (normal), -1 (anomaly)
+        anomaly_labels = isolation_forest.fit_predict(X)
+        # Add anomaly scores and labels to the DataFrame
+        iso_forest_df["Anomaly Score"] = anomaly_scores
+        iso_forest_df["Anomaly Label"] = anomaly_labels
+        # add info from pca
+        for i in range(n_components):
+            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
+        for i in range(n_components):
+            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+
+    # Return reduced data and info as a new DataFrame with the same index
+    if method == "pca":
+        reduced_df = pca_df
+        colname_met = "PC_"
+        if plot_:
+            sns.scatterplot(
+                data=pca_df,
+                x="PC_1",
+                y="PC_2",
+                # hue="condition",
+            )
+    elif method == "umap":
+        reduced_df = umap_df
+        colname_met = "UMAP_"
+        if plot_:
+            sns.scatterplot(
+                data=umap_df,
+                x="UMAP_1",
+                y="UMAP_2",
+                # hue="condition",
+            )
+    elif method == "tsne":
+        reduced_df = tsne_df
+        colname_met = "t-SNE_"
+        if plot_:
+            sns.scatterplot(
+                data=tsne_df,
+                x="tSNE_1",
+                y="tSNE_2",
+                # hue="batch",
+            )
+    elif method == "factor":
+        reduced_df = factor_df
+        colname_met = "Factor_"
+        if plot_:
+            sns.scatterplot(
+                data=factor_df,
+                x="Factor_1",
+                y="Factor_2",
+                # hue="batch",
+            )
+    elif method == "isolation_forest":
+        reduced_df = iso_forest_df # Already a DataFrame for outliers
+        colname_met = "PC_"
+        if plot_:
+            ax = sns.scatterplot(
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
+                x="PC_1",
+                y="PC_2",
+                label="normal", c="b",
+            )
+            ax = sns.scatterplot(
+                ax=ax,
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
+                x="PC_1",
+                y="PC_2",
+                c="r",
+                label="outlier", marker="+", s=30,
+            )
+
 
     if inplace:
-        #
+        # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"
+            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+        # Add extra info for PCA/UMAP
+        if method == "pca":
+            for i in range(n_components):
+                data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
+            for i in range(n_components):
+                data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
+        elif method == "umap":
+            for i in range(n_components):
+                data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
+            data["Embedding"] = reduced_df["Embedding"]
+            data["Trustworthiness"] = reduced_df["Trustworthiness"]
         return None # No return when inplace=True
+
 
-    return reduced_df
+    return reduced_df
 
 
 # example:
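An illustrative call of the extended df_reducer, assuming it is exported by py2ls.ips; with inplace=False the hunk above returns the reduced frame plus the per-component metadata columns (random toy data):

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_reducer

    df = pd.DataFrame(np.random.rand(50, 4), columns=list("abcd"))
    pcs = df_reducer(df, method="pca", n_components=2, inplace=False)   # PC_1, PC_2 plus variance info
    emb = df_reducer(df, method="tsne", n_components=2, inplace=False)  # t-SNE path added in this version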
@@ -5636,7 +6049,7 @@ def evaluate_cluster(
     return metrics
 
 
-def
+def use_pd(
     func_name="excel",
     verbose=True,
     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",