py2ls 0.2.4.1__py3-none-any.whl → 0.2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/bio.py +513 -0
- py2ls/data/usages_pd copy.json +1105 -0
- py2ls/data/usages_pd.json +1413 -52
- py2ls/fetch_update.py +45 -27
- py2ls/ips.py +680 -168
- py2ls/plot.py +104 -77
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.3.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.3.dist-info}/RECORD +9 -7
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.3.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
 
 from . import netfinder
 
-# from .plot import get_color
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -61,19 +59,31 @@ except NameError:
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
-    Add the Chinese font to the font manager
+    Add the Chinese (default) font to the font manager
     Args:
         dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
+    slashtype = "/" if 'mac' in get_os() else "\\"
+    if slashtype in dir_font:
+        font_manager.fontManager.addfont(dir_font)
+        fontname = os.path.basename(dir_font).split(".")[0]
+    else:
+        if "cn" in dir_font.lower() or "ch" in dir_font.lower():
+            fontname = "Hiragino Sans GB"  # default Chinese font
+        else:
+            fontname = dir_font
 
-
-
-
-    plt.rcParams["font.sans-serif"] = [fontname_chinese]
-    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams["font.sans-serif"] = [fontname]
+    # plt.rcParams["font.family"] = "sans-serif"
     plt.rcParams["axes.unicode_minus"] = False
+    fonts_in_system = font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
+    fontname_in_system = [os.path.basename(i).split(".")[0] for i in fonts_in_system]
+    if fontname not in fontname_in_system:
+        print(f"Font '{fontname}' not found. Falling back to default.")
+        plt.rcParams["font.sans-serif"] = ["Arial"]
+    return fontname
 
 # set 'dir_save'
 if "dar" in sys.platform:
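The reworked plt_font() now resolves either a font file path or a short name hint and returns the font name it registered. A minimal usage sketch (assumes py2ls is installed and plt_font is imported from py2ls.ips; the font path below is the function's own default):

from py2ls.ips import plt_font

# Passing a font file path registers the font and returns its basename.
fontname = plt_font("/System/Library/Fonts/Hiragino Sans GB.ttc")

# Passing a "cn"/"ch" hint instead of a path falls back to "Hiragino Sans GB".
fontname = plt_font("cn")
print(fontname)  # the name now set in plt.rcParams["font.sans-serif"]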
@@ -506,6 +516,59 @@ def is_text(s):
     return has_alpha and has_non_alpha
 
 
+from typing import Any, Union
+
+def shared(lst1:Any, lst2:Any,*args, verbose=True):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+    if any([not isinstance(lst1,list),not isinstance(lst1,list)]):
+        print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")
+    shared_elements=set(flatten(lst1,verbose=verbose)).intersection(flatten(lst2,verbose=verbose))
+    # support more lists
+    if args:
+        for arg in args:
+            shared_elements=shared_elements.intersection(set(flatten(arg,verbose=verbose)))
+    shared_elements = list(shared_elements)
+    if verbose:
+        elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def flatten(nested: Any, unique_list=True,verbose=True):
+    """
+    Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
+    Parameters:
+        nested : Any, Can be a list, tuple, dictionary, or set.
+    Returns: list, A flattened list.
+    """
+    flattened_list = []
+    stack = [nested]
+    while stack:
+        current = stack.pop()
+        if isinstance(current, dict):
+            stack.extend(current.values())
+        elif isinstance(current, (list, tuple, set)):
+            stack.extend(current)
+        elif isinstance(current, pd.Series):
+            stack.extend(current)
+        else:
+            flattened_list.append(current)
+    if verbose:
+        print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
+    if unique_list:
+        return unique(flattened_list)
+    else:
+        return flattened_list
+
 def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
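The new shared() and flatten() helpers are plain utilities, and the docstring's own usage carries over directly. A short sketch, assuming py2ls is installed:

from py2ls.ips import shared, flatten

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

# Elements present in every input list (inputs are flattened first): [5]
common = shared(list1, list2, list3, verbose=False)

# flatten() walks lists, tuples, sets, dicts and pd.Series iteratively.
flat = flatten({"a": [1, 2, (3, 4)], "b": {5}}, unique_list=True, verbose=False)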
@@ -526,6 +589,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
+            s=[str(i) for i in s]# convert all to str
             return [elem.lower() for elem in s]
         return s
 
@@ -535,7 +599,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower():#Ratio (Strictest)
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
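With this change the scorer argument also accepts a "strict"-style spelling for the plain ratio scorer. A small sketch (the return value is indexed the same way the package itself does elsewhere in this diff):

from py2ls.ips import strcmp

# "strict" now selects fuzz.ratio, the strictest scorer, just like "ratio".
best = strcmp("read_xlsx", ["read_excel", "read_csv", "to_excel"], scorer="strict")[0]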
@@ -1697,26 +1761,18 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
 
-        engine = kwargs.
-        kwargs.pop("
-
-        kwargs.pop("
-
-        kwargs.pop("
-
-        kwargs.pop("
-        skipinitialspace = kwargs.get("skipinitialspace", True)
-        kwargs.pop("skipinitialspace", None)
-        encoding = kwargs.get("encoding", "utf-8")
-        kwargs.pop("encoding", None)
-        on_bad_lines = kwargs.get("on_bad_lines", "skip")
-        kwargs.pop("on_bad_lines", None)
-        comment = kwargs.get("comment", None)
-        kwargs.pop("comment", None)
-
+        engine = kwargs.pop("engine", "pyarrow")
+        sep = kwargs.pop("sep", "\t")
+        index_col = kwargs.pop("index_col", None)
+        memory_map = kwargs.pop("memory_map", False)
+        skipinitialspace = kwargs.pop("skipinitialspace", False)
+        encoding = kwargs.pop("encoding", "utf-8")
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
+        comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_csv", verbose=verbose)
             return
 
         if comment is None:
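The load_csv refactor collapses each get()/pop() pair into a single kwargs.pop(key, default), so every option is read and removed in one step before the remaining kwargs are forwarded to pandas. A standalone sketch of the pattern (simplified, not the full loader):

import pandas as pd

def load_csv_sketch(fpath, **kwargs):
    # pop() reads each option with a default and removes it from kwargs,
    # so the leftover kwargs can be forwarded without duplicate-argument errors.
    sep = kwargs.pop("sep", ",")
    encoding = kwargs.pop("encoding", "utf-8")
    comment = kwargs.pop("comment", None)
    return pd.read_csv(fpath, sep=sep, encoding=encoding, comment=comment, **kwargs)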
@@ -1800,7 +1856,7 @@ def fload(fpath, kind=None, **kwargs):
         separators = [",", "\t", ";", "|", " "]
         for sep in separators:
             sep2show = sep if sep != "\t" else "\\t"
-
+            print(f'trying with: engine=pyarrow, sep="{sep2show}"')
             try:
                 df = pd.read_csv(
                     fpath,
|
             except:
                 pass
     else:
-        engines = ["c", "python"]
+        engines = [None,"c", "python"]
         for engine in engines:
-
+            separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 try:
                     sep2show = sep if sep != "\t" else "\\t"
-
+                    print(f"trying with: engine={engine}, sep='{sep2show}'")
                     df = pd.read_csv(
                         fpath,
                         engine=engine,
|
         engine = kwargs.get("engine", "openpyxl")
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
             meata=pd.ExcelFile(fpath)
|
     elif kind.lower() in img_types:
         print(f'Image ".{kind}" is loaded.')
         return load_img(fpath)
+    elif kind=="gz" and fpath.endswith(".soft.gz"):
+        import GEOparse
+        return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
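fload() now short-circuits GEO SOFT archives to GEOparse before the generic gz handling. A hedged sketch (the file name is hypothetical; GEOparse must be installed):

from py2ls.ips import fload

# Files ending in ".soft.gz" are handed to GEOparse.get_GEO(filepath=...).
gse = fload("./GSE00000_family.soft.gz")  # hypothetical GEO series file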
@@ -2105,30 +2164,51 @@ def fload(fpath, kind=None, **kwargs):
 # docx_content = fload('sample.docx')
 
 
-def fupdate(fpath, content=None):
+def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
+    If the file is a JSON file, merge the new content with the old content.
+
     Parameters
     ----------
     fpath : str
         The file path where the content should be updated.
-    content : str, optional
-        The new content to add at the top of the file
+    content : str or dict, optional
+        The new content to add at the top of the file (for text) or merge (for JSON).
+        If not provided, the function will not add any new content.
+
     Notes
     -----
     - If the file at `fpath` does not exist, it will be created.
-    -
+    - For text files, the new content will be added at the top, followed by the old content.
+    - For JSON files, the new content will be merged with the existing JSON content.
     """
     content = content or ""
-
-
-
+    file_ext = os.path.splitext(fpath)[1]
+    how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+    how = strcmp(how, how_s)[0]
+    print(how)
+    add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+    if "json" in file_ext.lower():
+        old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
+        updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
+        fsave(fpath,updated_content)
     else:
-
+        # Handle text file
+        if os.path.exists(fpath):
+            with open(fpath, "r") as file:
+                old_content = file.read()
+        else:
+            old_content = ""
 
-
-
-
+        # Write new content at the top followed by old content
+        with open(fpath, "w") as file:
+            if add_where=="head":
+                file.write(content + "\n")
+                file.write(old_content)
+            else:
+                file.write(old_content)
+                file.write(content + "\n")
 
 
 def fappend(fpath, content=None):
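fupdate() now fuzzy-matches a how= hint via strcmp() and merges dictionaries when the target is a JSON file. A minimal sketch (file names are hypothetical):

from py2ls.ips import fupdate

# Text file: new content goes before ("head") or after ("tail") the old text.
fupdate("notes.txt", content="## update", how="head")

# JSON file: a dict is merged with the existing content instead of prepended.
fupdate("config.json", content={"version": "0.2.4.3"}, how="tail")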
@@ -2234,7 +2314,7 @@ def fsave(
 
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
             sep=",",
|
         verbose=kwargs.pop("verbose",False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
         if verbose:
-
+            use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
         else:
|
     Returns:
     - str: The path of the created directory.
     """
-
-    if os.path.isdir(fpath):
-        return fpath
+
 
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
+    if os.path.isdir(fpath):
+        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+        print(fpath)
+        return fpath
     dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
@@ -2744,34 +2826,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
-    # Convert string to list
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
-    # Subfoldername should be unique
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str): # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
-    # Get the slash type: "/" or "\"
-    stype = "/" if "/" in pardir else "\\"
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
-    # Check if the parent directory exists and is a directory path
+
     if os.path.isdir(pardir):
         os.chdir(pardir) # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-
-            for folder in chdir:
-                # Check if the subfolder already exists
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
|
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
+    rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+    print(rootdir)
     return rootdir
 
 
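Both mkdir_nest() and mkdir() now return the directory path with a trailing OS separator, so a file name can be appended by plain concatenation. A hedged sketch (path is hypothetical):

from py2ls.ips import mkdir

dir_save = mkdir("./output/figures/")  # returned path ends with the OS separator
fpath = dir_save + "result.csv"        # safe to concatenate directly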
@@ -2805,22 +2882,25 @@ def figsave(*args, dpi=300):
     dir_save = None
     fname = None
     img = None
+    f_slash = "/" if "mac" in get_os().lower() else "\\"
     for arg in args:
         if isinstance(arg, str):
-            if
+            if f_slash in arg:
                 dir_save = arg
-
+            else:
                 fname = arg
         elif isinstance(arg, (Image.Image, np.ndarray)):
             img = arg # Store the PIL image if provided
 
-    f_slash = "/" if "mac" in get_os().lower() else "\\"
     if dir_save is None:
         dir_save="./"
+    print(dir_save)
+    # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
+    print(dir_par)
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
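figsave() now derives the path separator before scanning its arguments, so the directory (any string containing the separator) and the bare file name can be passed in either order. A sketch, assuming figsave falls back to the currently active matplotlib figure when no image is passed (paths hypothetical):

import matplotlib.pyplot as plt
from py2ls.ips import figsave

plt.plot([0, 1], [0, 1])
# The argument containing the separator is taken as dir_save, the other as fname.
figsave("./output/figures/", "line.pdf")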
@@ -4415,12 +4495,48 @@ def preview(var):
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
-
+def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
+    """
+    Extend a DataFrame by the list elecments in the column.
+
+    Parameters:
+    ----------
+    data : pd.DataFrame
+        The input DataFrame to be extended.
+
+    column : str
+        The name of the column to be split.
+
+    axis : int, optional
+        The axis along which to expand the DataFrame.
+        - 0 (default): Expand the specified column into multiple rows.
+        - 1: Expand the specified column into multiple columns.
+
+    sep : str, optional
+        The separator used to split the values in the specified column.
+        Must be provided for the function to work correctly.
+    """
+
+    data = data.copy()
+    mask = data[column].str.contains(sep, na=False)
+    data = data.copy()
+    if mask.any():
+        data[column] = (
+            data[column]
+            .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
+        )
+
+        # Strip spaces from each item in the lists
+        data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
+
+    data = data.explode(column, ignore_index=True)
+    return data
 # ! DataFrame
 def df_astype(
-
+    data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
     astype: str = "datetime",
+    skip_row:Union[str,list]=None,
     fmt: Optional[str] = None,
     inplace: bool = True,
     errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
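The new df_extend() splits a delimiter-joined column and explodes it into one row per item. A sketch with made-up data:

import pandas as pd
from py2ls.ips import df_extend

df = pd.DataFrame({"gene": ["TP53; EGFR", "BRCA1"], "score": [1, 2]})
# Each ";"-separated entry becomes its own row; other columns are repeated.
df_long = df_extend(df, column="gene", sep="; ")
# df_long["gene"] -> TP53, EGFR, BRCA1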
@@ -4484,22 +4600,24 @@ def df_astype(
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
-
+        data = data.copy()
+    if skip_row is not None:
+        data = data.drop(index=skip_row, errors='ignore')
+    # If columns is None, apply to all columns
+    if columns is None:
+        columns = data.columns.tolist()
     # correct the astype input
     if isinstance(astype,str):
         astype = strcmp(astype, astypes)[0]
-        print(f"converting
+        print(f"converting as type: {astype}")
     elif isinstance(astype,dict):
         for col, dtype in astype.items():
             dtype='date' if dtype=="day" else dtype
-
-        return
-    # If columns is None, apply to all columns
-    if columns is None:
-        columns = df.columns
+            data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+        return data if not inplace else None
 
     # Ensure columns is a list
-    if isinstance(columns,
+    if isinstance(columns, str):
         columns = [columns]
 
     # Convert specified columns
|
             kwargs.pop("errors", None)
             # convert it as type: datetime
             if isinstance(column, int):
-
-
+                data.iloc[:, column] = pd.to_datetime(
+                    data.iloc[:, column], format=fmt, errors=errors, **kwargs
                 )
                 # further convert:
                 if astype == "time":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.time
                 elif astype == "month":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.month
                 elif astype == "year":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.year
                 elif astype == "date" or astype == "day":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.date
                 elif astype == "hour":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.hour
                 elif astype == "minute":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.minute
                 elif astype == "second":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.second
                 elif astype == "week":
-
+                    data.iloc[:, column] = data.iloc[:, column].dt.day_name()
             else:
-
+                data[column] = (
                     pd.to_datetime(
-
+                        data[column], format=fmt, errors=errors, **kwargs
                     )
                     if fmt
-                    else pd.to_datetime(
+                    else pd.to_datetime(data[column], errors=errors, **kwargs)
                 )
                 # further convert:
                 if astype == "time":
-
+                    data[column] = data[column].dt.time
                 elif astype == "month":
-
+                    data[column] = data[column].dt.month
                 elif astype == "year":
-
+                    data[column] = data[column].dt.year
                 elif astype == "date":
-
+                    data[column] = data[column].dt.date
                 elif astype == "hour":
-
+                    data[column] = data[column].dt.hour
                 elif astype == "minute":
-
+                    data[column] = data[column].dt.minute
                 elif astype == "second":
-
+                    data[column] = data[column].dt.second
                 elif astype == "week":
-
+                    data[column] = data[column].dt.day_name()
 
         elif astype == "numeric":
             kwargs.pop("errors", None)
-
+            data[column] = pd.to_numeric(data[column], errors=errors, **kwargs)
             # print(f"Successfully converted '{column}' to numeric.")
         elif astype == "timedelta":
             kwargs.pop("errors", None)
-
+            data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
            # print(f"Successfully converted '{column}' to timedelta.")
         else:
             # Convert to other types (e.g., float, int)
-
+            data[column] = data[column].astype(astype)
             # print(f"Successfully converted '{column}' to {astype}.")
     except Exception as e:
         print(f"Error converting '{column}' to {astype}: {e}")
-
-
-
+    try:
+        display(data.info()[:10])
+    except:
+        pass
+    return data
 
 
-# ! DataFrame
+# ! DataFrame
 def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     """
     Sort a DataFrame by a specified column based on a custom order or by count.
|
     Returns:
     - Sorted DataFrame if inplace is False, otherwise None.
     """
-    if column not in
+    if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
     if isinstance(by, str) and 'count' in by.lower():
|
 
     try:
         if inplace: # replace the original
-
+            data.sort_values(column, ascending=ascending, inplace=True, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}'")
             return None
         else:
-            sorted_df =
+            sorted_df = data.sort_values(column, ascending=ascending, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}' using custom order.")
             return sorted_df
     except Exception as e:
|
         return df
 
 
-
 # # Example usage:
 # # Sample DataFrame
 # data = {
@@ -4667,6 +4786,236 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 # display(df_month)
 
 
+def df_merge(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    use_index: bool = True,
+    columns: list = ["col_left", "col_right"],
+    how: str = "left",
+) -> pd.DataFrame:
+    """
+    Merges two DataFrames based on either the index or shared columns with matching data types.
+    usage:
+        #(1) if the index are the same
+        df_merged = df_merge(df1, df2, use_index=True(defalut), how='outer')
+        #(2) if there are shaed columns, then based on shared columns
+        df_merged = df_merge(df1, df2, how='outer')
+        #(3) if columns: then based on the specific columns
+        df_merged = df_merge(df1, df2, columns=["col_left", "col_right"],how='outer')
+    Parameters:
+    - df1 (pd.DataFrame): The first DataFrame.
+    - df2 (pd.DataFrame): The second DataFrame.
+    - use_index (bool): If True, first try to merge by index if they are comparable; otherwise, fall back to column-based merge.
+    - how (str): Type of merge to perform: 'inner', 'outer', 'left', or 'right'. Default is 'inner'.
+        'inner': only the rows that have matching values in both DataFrames (intersection)
+        'outer': keeps all rows from both DataFrames and fills in missing values with NaN
+        'left': keeps all rows from the left DataFrame and matches rows from the right DataFrame
+        'right': keeps all rows from the right DataFrame and matches rows from the left DataFrame, filling with NaN if there is no match.
+
+    Returns:
+    - pd.DataFrame: The merged DataFrame.
+    """
+
+    # 1. Check if indices are comparable (same length and types)
+    if use_index:
+        print(f"Merging based on index using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
+        return df_merged
+
+    # 2. Find common columns with the same dtype
+    common_columns = df1.columns.intersection(df2.columns)
+    shared_columns = []
+    for col in common_columns:
+        if df1[col].dtype == df2[col].dtype:
+            shared_columns.append(col)
+    if not isinstance(columns, list):
+        columns = [columns]
+    if len(columns) != 2:
+        raise ValueError(
+            "'columns':list shoule be a list: columns=['col_left','col_right']"
+        )
+    if all(columns):
+        print(f"Merging based on columns: {columns} using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_on=columns[0], right_on=columns[1], how=how)
+    elif shared_columns:
+        print(
+            f"Merging based on shared columns: {shared_columns} using '{how}' join..."
+        )
+        df_merged = pd.merge(df1, df2, on=shared_columns, how=how)
+    else:
+        raise ValueError(
+            "No common columns with matching data types to merge on, and indices are not comparable."
+        )
+    return df_merged
+
+def df_fillna(
+    data: pd.DataFrame,
+    method: str = "mean",
+    axis: int = 0,# column-wise
+    constant: float = None,
+    inplace: bool = True,
+) -> pd.DataFrame:
+    """
+    Fill missing values in a DataFrame using specified imputation method.
+
+    Parameters:
+    data (pd.DataFrame): The DataFrame to fill missing values.
+    method (str): The imputation method to use. Options are:
+        - 'mean': Replace missing values with the mean of the column.
+        - 'median': Replace missing values with the median of the column.
+        - 'most_frequent': Replace missing values with the most frequent value in the column.
+        - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
+        - 'knn': Use K-Nearest Neighbors imputation.
+        - 'iterative': Use Iterative imputation.
+    axis (int): The axis along which to impute:
+        - 0: Impute column-wise (default).
+        - 1: Impute row-wise.
+    constant (float, optional): Constant value to use for filling NaNs if method is 'constant'.
+    inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
+
+    """
+
+    if data.empty:
+        raise ValueError("Input DataFrame is empty.")
+
+    # Validate method
+    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    method = strcmp(method, methods)[0]
+
+    # If using constant method, ask for a constant value
+    if constant is not None:
+        method = "constant"
+        try:
+            constant = float(constant)
+        except ValueError:
+            raise ValueError("Constant value must be a number.")
+
+    # Initialize SimpleImputer with the chosen method
+    if method == "constant":
+        imputer = SimpleImputer(strategy=method, fill_value=constant)
+    elif method == "knn":
+        from sklearn.impute import KNNImputer
+
+        imputer = KNNImputer(n_neighbors=n_neighbors)
+    elif method == "iterative":
+        from sklearn.impute import IterativeImputer
+
+        imputer = IterativeImputer(max_iter=max_iter)
+    else:
+        from sklearn.impute import SimpleImputer
+
+        imputer = SimpleImputer(strategy=method)
+
+    # Fit and transform the data
+    if axis == 0:
+        # Impute column-wise
+        imputed_data = imputer.fit_transform(data)
+        imputed_data.shape
+    elif axis == 1:
+        # Impute row-wise
+        imputed_data = imputer.fit_transform(data.T)
+        imputed_data.shape
+    else:
+        raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
+
+    df_filled = pd.DataFrame(
+        imputed_data if axis == 0 else imputed_data.T,
+        index=data.index,# if axis == 0 else data.columns,
+        columns=data.columns,# if axis == 0 else data.index,
+    )
+
+    if inplace:
+        data.update(df_filled)
+        return None # replace original
+    else:
+        return df_filled
+def df_scaler(
+    data: pd.DataFrame,
+    method="standard",
+    columns=None, # default, select all numeric col/row
+    inplace=False,
+    verbose=False, # show usage
+    axis=0, # defalut column-wise
+    **kwargs,
+):
+    """
+    df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)
+
+    Parameters:
+    - data: pandas DataFrame to be scaled.
+    - method: Scaler type ('standard', 'minmax', 'robust'). Default is 'standard'.
+    - columns: List of columns (for axis=0) or rows (for axis=1) to scale.
+      If None, all numeric columns/rows will be scaled.
+    - inplace: If True, modify the DataFrame in place. Otherwise, return a new DataFrame.
+    - axis: Axis along which to scale. 0 for column-wise, 1 for row-wise. Default is 0.
+    - verbose: If True, prints logs of the process.
+    - kwargs: Additional arguments to be passed to the scaler.
+    """
+    if verbose:
+        print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
+
+    methods = ["standard", "minmax", "robust"]
+    method = strcmp(method, methods)[0]
+    if method == "standard":
+        from sklearn.preprocessing import StandardScaler
+
+        scaler = StandardScaler(**kwargs)
+    elif method == "minmax":
+        from sklearn.preprocessing import MinMaxScaler
+
+        scaler = MinMaxScaler(**kwargs)
+    elif method == "robust":
+        from sklearn.preprocessing import RobustScaler
+
+        scaler = RobustScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+
+    if axis == 0:
+        # Column-wise scaling (default)
+        if columns is None:
+            columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
+        non_numeric_columns = data.columns.difference(columns)
+        print(f"Scaling columns")
+
+        scaled_data = scaler.fit_transform(data[columns])
+
+        if inplace:
+            data[columns] = scaled_data
+            print("Original DataFrame modified in place (column-wise).")
+        else:
+            scaled_df = pd.concat(
+                [
+                    pd.DataFrame(scaled_data, columns=columns, index=data.index),
+                    data[non_numeric_columns],
+                ],
+                axis=1,
+            )
+            scaled_df = scaled_df[data.columns] # Maintain column order
+            return scaled_df
+
+    elif axis == 1:
+        # Row-wise scaling
+        if columns is None:
+            columns = data.index.tolist()
+        numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
+        if numeric_rows.empty:
+            raise ValueError("No numeric rows to scale.")
+
+        print(f"Scaling rows")
+
+        scaled_data = scaler.fit_transform(
+            numeric_rows.T
+        ).T # Transpose for scaling and then back
+
+        if inplace:
+            data.loc[numeric_rows.index] = scaled_data
+            print("Original DataFrame modified in place (row-wise).")
+        else:
+            scaled_df = data.copy()
+            scaled_df.loc[numeric_rows.index] = scaled_data
+            return scaled_df
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
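The three new frame helpers chain naturally; a combined sketch with made-up frames (each call follows the docstrings above):

import numpy as np
import pandas as pd
from py2ls.ips import df_merge, df_fillna, df_scaler

df1 = pd.DataFrame({"a": [1.0, np.nan, 3.0]}, index=["s1", "s2", "s3"])
df2 = pd.DataFrame({"b": [10.0, 20.0, 30.0]}, index=["s1", "s2", "s3"])

merged = df_merge(df1, df2, use_index=True, how="outer")   # index-based join
filled = df_fillna(merged, method="mean", inplace=False)   # impute column means
scaled = df_scaler(filled, method="standard")              # z-score numeric columns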
@@ -4721,7 +5070,7 @@ def df_cluster(
         X = scaler.fit_transform(X)
 
     for n_cluster in range_n_clusters:
-        kmeans = KMeans(n_clusters=n_cluster, random_state=
+        kmeans = KMeans(n_clusters=n_cluster, random_state=1)
         cluster_labels = kmeans.fit_predict(X)
 
         silhouette_avg = silhouette_score(X, cluster_labels)
|
         print(f"n_clusters = {n_clusters}")
 
     # Apply K-Means Clustering with Optimal Number of Clusters
-    kmeans = KMeans(n_clusters=n_clusters, random_state=
+    kmeans = KMeans(n_clusters=n_clusters, random_state=1)
     cluster_labels = kmeans.fit_predict(X)
 
     if plot:
|
     # n_clusters = (
     #     np.argmax(silhouette_avg_scores) + 2
     # ) # Optimal clusters based on max silhouette score
-    # kmeans = KMeans(n_clusters=n_clusters, random_state=
+    # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
     # cluster_labels = kmeans.fit_predict(X)
     silhouette_vals = silhouette_samples(X, cluster_labels)
 
@@ -4989,12 +5338,14 @@ def df_reducer(
     columns: Optional[List[str]] = None,
     method: str = "umap", # 'pca', 'umap'
     n_components: int = 2, # Default for umap, but 50 for PCA
-    umap_neighbors: int = 15, #
-    umap_min_dist: float = 0.1, #
+    umap_neighbors: int = 15, # UMAP-specific
+    umap_min_dist: float = 0.1, # UMAP-specific
+    tsne_perplexity: int = 30, # t-SNE-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True, # replace the oringinal data
+    plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
 ) -> pd.DataFrame:
     """
     Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5030,9 +5381,35 @@ def df_reducer(
     reduced_df : pd.DataFrame
         DataFrame with the reduced dimensions.
     """
-
+
+    """
+    PCA: explained_variance:
+        indicates the proportion of the dataset's total variance that each principal
+        component (PC) explains. It gives you a sense of how much information
+        (or variance) is captured by each PC
+        Interpretation:
+            - Higher values indicate that the corresponding PC captures more variance.
+            - The sum of the explained variances for all PCs equals 1 (or 100%).
+            - If the first few components explain a high percentage (e.g., 90%),
+              it means you can reduce the dimensionality of the data significantly without losing much information.
+        Use case:
+            You may plot a scree plot, which shows the explained variance for each PC, to help decide
+            how many components to keep for analysis.
+
+    PCA: Singular values:
+        represent the magnitude of variance along each principal component. Mathematically,
+        they are the square roots of the eigenvalues of the covariance matrix.
+        Interpretation:
+            Larger singular values indicate that the associated PC captures more variance.
+            Singular values are related to the scale of the data. If the data are scaled
+            before PCA (e.g., standardized), then the singular values will provide a measure
+            of the spread of data along each PC.
+        Use case:
+            Singular values help quantify the contribution of each principal component in a
+            similar way to the explained variance. They are useful in understanding the overall
+            structure of the data.
+    """
     from sklearn.preprocessing import StandardScaler
-    import umap
     from sklearn.impute import SimpleImputer
 
     # Select columns if specified, else use all columns
|
     X = scaler.fit_transform(X)
 
     # Check valid method input
-
-
-
+    methods=["pca", "umap","tsne","factor","isolation_forest"]
+    method=strcmp(method, methods)[0]
     # Apply PCA if selected
-    if method == "pca":
-
-        # to get the n_components with threshold method:
-        pca = PCA()
-        pca_result = pca.fit_transform(X)
-
-        # Calculate explained variance
-        explained_variance = pca.explained_variance_ratio_
-        # Cumulative explained variance
-        cumulative_variance = np.cumsum(explained_variance)
-        # Set a threshold for cumulative variance
-        threshold = 0.95 # Example threshold
-        n_components = (
-            np.argmax(cumulative_variance >= threshold) + 1
-        ) # Number of components to retain
-        if debug:
-            # debug:
-            # Plot the cumulative explained variance
-            plt.figure(figsize=(8, 5))
-            plt.plot(
-                range(1, len(cumulative_variance) + 1),
-                cumulative_variance,
-                marker="o",
-                linestyle="-",
-            )
-            plt.title("Cumulative Explained Variance by Principal Components")
-            plt.xlabel("Number of Principal Components")
-            plt.ylabel("Cumulative Explained Variance")
-            plt.xticks(range(1, len(cumulative_variance) + 1))
-            # Add horizontal line for the threshold
-            plt.axhline(
-                y=threshold, color="r", linestyle="--", label="Threshold (95%)"
-            )
-            # Add vertical line for n_components
-            plt.axvline(
-                x=n_components,
-                color="g",
-                linestyle="--",
-                label=f"n_components = {n_components}",
-            )
-            plt.legend()
-            plt.grid()
+    if method == "pca":
+        from sklearn.decomposition import PCA
         pca = PCA(n_components=n_components)
         X_reduced = pca.fit_transform(X)
-
+
+        # Additional PCA information
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+        loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
+
+        if debug:
+            print(f"PCA completed: Reduced to {n_components} components.")
+            print(f"Explained Variance: {explained_variance}")
+            print(f"Singular Values: {singular_values}")
+
+        # Plot explained variance if debug=True
+        if debug:
+            # Plot explained variance
+            cumulative_variance = np.cumsum(explained_variance)
+            plt.figure(figsize=(8, 5))
+            plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
+            plt.title("Cumulative Explained Variance by Principal Components")
+            plt.xlabel("Number of Principal Components")
+            plt.ylabel("Cumulative Explained Variance")
+            plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
+            plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
+            plt.legend()
+            plt.grid()
+            plt.show()
+
+        # Prepare reduced DataFrame with additional PCA info
+        pca_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+        # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
+        # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
+        # Expand explained variance to multiple columns if needed
+        for i in range(n_components):
+            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
+        for i in range(n_components):
+            pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
 
     # Apply UMAP if selected
     elif method == "umap":
+        import umap
         umap_reducer = umap.UMAP(
             n_neighbors=umap_neighbors,
             min_dist=umap_min_dist,
-            n_components=n_components
+            n_components=n_components
         )
         X_reduced = umap_reducer.fit_transform(X)
-        print(f"UMAP completed: Reduced to {n_components} components.")
 
-
-
+        # Additional UMAP information
+        embedding = umap_reducer.embedding_
+        trustworthiness = umap_reducer._raw_data[:, :n_components]
+
+        if debug:
+            print(f"UMAP completed: Reduced to {n_components} components.")
+            print(f"Embedding Shape: {embedding.shape}")
+            print(f"Trustworthiness: {trustworthiness}")
+
+        # Prepare reduced DataFrame with additional UMAP info
+        umap_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"UMAP_{i+1}" for i in range(n_components)]
+        )
+        umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
+        umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
+    elif method == "tsne":
+        from sklearn.manifold import TSNE
+        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
+        X_reduced = tsne.fit_transform(X)
+
+        # Prepare reduced DataFrame with additional t-SNE info
+        tsne_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"tSNE_{i+1}" for i in range(n_components)]
+        )
+        tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
+
+    # Apply Factor Analysis if selected
+    elif method == "factor":
+        from sklearn.decomposition import FactorAnalysis
+        factor = FactorAnalysis(n_components=n_components, random_state=1)
+        X_reduced = factor.fit_transform(X)
+        # Factor Analysis does not directly provide explained variance, but we can approximate it
+        fa_variance = factor.noise_variance_
+        # Prepare reduced DataFrame with additional Factor Analysis info
+        factor_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"Factor_{i+1}" for i in range(n_components)]
+        )
+        factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
+
+    # Apply Isolation Forest for outlier detection if selected
+    elif method == "isolation_forest":
+        from sklearn.decomposition import PCA
+        from sklearn.ensemble import IsolationForest
+        # Step 1: Apply PCA for dimensionality reduction to 2 components
+        pca = PCA(n_components=n_components)
+        X_pca = pca.fit_transform(X)
+
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+
+        # Prepare reduced DataFrame with additional PCA info
+        iso_forest_df = pd.DataFrame(
+            X_pca, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+
+        isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
+        isolation_forest.fit(X)
+        anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
+        # Predict labels: 1 (normal), -1 (anomaly)
+        anomaly_labels = isolation_forest.fit_predict(X)
+        # Add anomaly scores and labels to the DataFrame
+        iso_forest_df["Anomaly Score"] = anomaly_scores
+        iso_forest_df["Anomaly Label"] = anomaly_labels
+        # add info from pca
+        for i in range(n_components):
+            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
+        for i in range(n_components):
+            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+
+    # Return reduced data and info as a new DataFrame with the same index
+    if method == "pca":
+        reduced_df = pca_df
+        colname_met = "PC_"
+        if plot_:
+            sns.scatterplot(
+                data=pca_df,
+                x="PC_1",
+                y="PC_2",
+                # hue="condition",
+            )
+    elif method == "umap":
+        reduced_df = umap_df
+        colname_met = "UMAP_"
+        if plot_:
+            sns.scatterplot(
+                data=umap_df,
+                x="UMAP_1",
+                y="UMAP_2",
+                # hue="condition",
+            )
+    elif method == "tsne":
+        reduced_df = tsne_df
+        colname_met = "t-SNE_"
+        if plot_:
+            sns.scatterplot(
+                data=tsne_df,
+                x="tSNE_1",
+                y="tSNE_2",
+                # hue="batch",
+            )
+    elif method == "factor":
+        reduced_df = factor_df
+        colname_met = "Factor_"
+        if plot_:
+            sns.scatterplot(
+                data=factor_df,
+                x="Factor_1",
+                y="Factor_2",
+                # hue="batch",
+            )
+    elif method == "isolation_forest":
+        reduced_df = iso_forest_df # Already a DataFrame for outliers
+        colname_met = "PC_"
+        if plot_:
+            ax = sns.scatterplot(
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
+                x="PC_1",
+                y="PC_2",
+                label="normal", c="b",
+            )
+            ax = sns.scatterplot(
+                ax=ax,
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
+                x="PC_1",
+                y="PC_2",
+                c="r",
+                label="outlier", marker="+", s=30,
+            )
+
 
     if inplace:
-        #
+        # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"
+            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+
+        # Add extra info for PCA/UMAP
+        if method == "pca":
+            data["Explained Variance"] = reduced_df["Explained Variance"]
+            data["Singular Values"] = reduced_df["Singular Values"]
+        elif method == "umap":
+            data["Embedding"] = reduced_df["Embedding"]
+            data["Trustworthiness"] = reduced_df["Trustworthiness"]
         return None # No return when inplace=True
 
-    return reduced_df
+    return reduced_df
 
 
 # example:
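df_reducer() now fuzzy-matches the method name and adds t-SNE, factor analysis, and an Isolation-Forest outlier view on top of PCA and UMAP. A sketch with random data (inplace=False so the reduced frame is returned):

import numpy as np
import pandas as pd
from py2ls.ips import df_reducer

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(100, 10)), columns=[f"f{i}" for i in range(10)])

# PCA: PC_1/PC_2 plus per-component explained-variance and singular-value columns.
pcs = df_reducer(df, method="pca", n_components=2, inplace=False)

# Isolation Forest: PCA coordinates plus "Anomaly Score" and "Anomaly Label".
outliers = df_reducer(df, method="isolation_forest", n_components=2, inplace=False)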
@@ -5373,7 +5885,7 @@ def evaluate_cluster(
     return metrics
 
 
-def
+def use_pd(
     func_name="excel",
     verbose=True,
     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
@@ -5387,4 +5899,4 @@ def print_pd_usage(
             i_ = i_.replace("=", "\t= ") + ","
             print(i_) if i == 0 else print("\t", i_)
     else:
-        print(usage)
+        print(usage)