py2ls 0.2.4.1__py3-none-any.whl → 0.2.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/bio.py +272 -0
- py2ls/data/usages_pd copy.json +1105 -0
- py2ls/data/usages_pd.json +1413 -52
- py2ls/fetch_update.py +45 -27
- py2ls/ips.py +362 -99
- py2ls/plot.py +71 -64
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.2.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.2.dist-info}/RECORD +9 -7
- {py2ls-0.2.4.1.dist-info → py2ls-0.2.4.2.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -61,19 +61,31 @@ except NameError:
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
-    Add the Chinese font to the font manager
+    Add the Chinese (default) font to the font manager
     Args:
        dir_font (str, optional): _description_. Defaults to "/System/Library/Fonts/Hiragino Sans GB.ttc".
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
+    slashtype = "/" if 'mac' in get_os() else "\\"
+    if slashtype in dir_font:
+        font_manager.fontManager.addfont(dir_font)
+        fontname = os.path.basename(dir_font).split(".")[0]
+    else:
+        if "cn" in dir_font.lower() or "ch" in dir_font.lower():
+            fontname = "Hiragino Sans GB" # default Chinese font
+        else:
+            fontname = dir_font
 
-
-
-
-    plt.rcParams["font.sans-serif"] = [fontname_chinese]
-    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams["font.sans-serif"] = [fontname]
+    # plt.rcParams["font.family"] = "sans-serif"
     plt.rcParams["axes.unicode_minus"] = False
+    fonts_in_system = font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
+    fontname_in_system = [os.path.basename(i).split(".")[0] for i in fonts_in_system]
+    if fontname not in fontname_in_system:
+        print(f"Font '{fontname}' not found. Falling back to default.")
+        plt.rcParams["font.sans-serif"] = ["Arial"]
+    return fontname
 
 # set 'dir_save'
 if "dar" in sys.platform:
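Note: plt_font now registers a font file when given a path, falls back to "Hiragino Sans GB" for Chinese-looking names, checks the resolved name against the installed system fonts, and returns it. A minimal usage sketch (the font path and the plotting call are assumptions, not part of the diff):

    import matplotlib.pyplot as plt
    from py2ls.ips import plt_font

    fontname = plt_font("/System/Library/Fonts/Hiragino Sans GB.ttc")  # registers the font, returns its name
    plt.title("中文标题", fontname=fontname)  # the returned name can be reused explicitly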
@@ -526,6 +538,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
+            s=[str(i) for i in s]# convert all to str
             return [elem.lower() for elem in s]
         return s
 
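Note: the added line matters because the lowercasing pass calls elem.lower() on every candidate, so a candidate list containing non-strings previously raised AttributeError. A sketch of what now works (assuming strcmp returns the best match first):

    from py2ls.ips import strcmp
    best = strcmp("42", [42, "41", "abc"])[0]  # int candidates are stringified instead of crashing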
@@ -1697,24 +1710,16 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
 
-        engine = kwargs.get("engine", "pyarrow")
-        kwargs.pop("engine", None)
-        sep = kwargs.get("sep", "\t")
-        kwargs.pop("sep", None)
-        index_col = kwargs.get("index_col", None)
-        kwargs.pop("index_col", None)
-        memory_map = kwargs.get("memory_map", False)
-        kwargs.pop("memory_map", None)
-        skipinitialspace = kwargs.get("skipinitialspace", True)
-        kwargs.pop("skipinitialspace", None)
-        encoding = kwargs.get("encoding", "utf-8")
-        kwargs.pop("encoding", None)
-        on_bad_lines = kwargs.get("on_bad_lines", "skip")
-        kwargs.pop("on_bad_lines", None)
-        comment = kwargs.get("comment", None)
-        kwargs.pop("comment", None)
-
+        engine = kwargs.pop("engine", "pyarrow")
+        sep = kwargs.pop("sep", "\t")
+        index_col = kwargs.pop("index_col", None)
+        memory_map = kwargs.pop("memory_map", False)
+        skipinitialspace = kwargs.pop("skipinitialspace", False)
+        encoding = kwargs.pop("encoding", "utf-8")
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
+        comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        verbose=kwargs.pop("verbose",False)
         if verbose:
             print_pd_usage("read_csv", verbose=verbose)
             return
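Note: each kwargs.get(...) plus kwargs.pop(...) pair collapses into a single kwargs.pop(key, default), which returns the stored value (or the default) and removes the key in one step; skipinitialspace also flips its default from True to False, and verbose is now consumed from kwargs explicitly. The equivalence of the two patterns, as a standalone sketch:

    kwargs = {"encoding": "latin-1"}
    # old two-step pattern
    encoding = kwargs.get("encoding", "utf-8")
    kwargs.pop("encoding", None)

    # new one-step pattern, same result
    kwargs = {"encoding": "latin-1"}
    encoding = kwargs.pop("encoding", "utf-8")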
@@ -1800,7 +1805,7 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 sep2show = sep if sep != "\t" else "\\t"
-
+                print(f'trying with: engine=pyarrow, sep="{sep2show}"')
                 try:
                     df = pd.read_csv(
                         fpath,
@@ -1819,13 +1824,13 @@ def fload(fpath, kind=None, **kwargs):
                 except:
                     pass
         else:
-            engines = ["c", "python"]
+            engines = [None,"c", "python"]
             for engine in engines:
-
+                separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     try:
                         sep2show = sep if sep != "\t" else "\\t"
-
+                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
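Note: putting None first in the engine list lets pandas choose its own parser before the explicit "c" and "python" engines are tried, and the separator list is now rebuilt inside the engine loop. A condensed sketch of the retry strategy (the function name and path are hypothetical):

    import pandas as pd

    def try_read(fpath):
        for engine in [None, "c", "python"]:        # None = let pandas pick the engine
            for sep in [",", "\t", ";", "|", " "]:
                try:
                    return pd.read_csv(fpath, engine=engine, sep=sep)
                except Exception:
                    pass                            # try the next combination
        raise ValueError("no engine/separator combination worked")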
@@ -2031,6 +2036,9 @@ def fload(fpath, kind=None, **kwargs):
     elif kind.lower() in img_types:
         print(f'Image ".{kind}" is loaded.')
         return load_img(fpath)
+    elif kind=="gz" and fpath.endswith(".soft.gz"):
+        import GEOparse
+        return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
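Note: GEOparse becomes a lazily imported, optional dependency; GEOparse.get_GEO(filepath=...) parses a local GEO SOFT archive into a GEO object. A usage sketch (the accession file name is hypothetical):

    import GEOparse

    gse = GEOparse.get_GEO(filepath="./GSE12345_family.soft.gz")
    print(gse.metadata.get("title"))  # series-level metadata parsed from the SOFT file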
@@ -2105,30 +2113,51 @@ def fload(fpath, kind=None, **kwargs):
 # docx_content = fload('sample.docx')
 
 
-def fupdate(fpath, content=None):
+def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
+    If the file is a JSON file, merge the new content with the old content.
+
     Parameters
     ----------
     fpath : str
         The file path where the content should be updated.
-    content : str, optional
-        The new content to add at the top of the file
+    content : str or dict, optional
+        The new content to add at the top of the file (for text) or merge (for JSON).
+        If not provided, the function will not add any new content.
+
     Notes
     -----
     - If the file at `fpath` does not exist, it will be created.
-    -
+    - For text files, the new content will be added at the top, followed by the old content.
+    - For JSON files, the new content will be merged with the existing JSON content.
     """
     content = content or ""
-    if os.path.exists(fpath):
-        with open(fpath, "r") as file:
-            old_content = file.read()
+    file_ext = os.path.splitext(fpath)[1]
+    how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+    how = strcmp(how, how_s)[0]
+    print(how)
+    add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+    if "json" in file_ext.lower():
+        old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
+        updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
+        fsave(fpath,updated_content)
     else:
-        old_content = ""
+        # Handle text file
+        if os.path.exists(fpath):
+            with open(fpath, "r") as file:
+                old_content = file.read()
+        else:
+            old_content = ""
 
-    with open(fpath, "w") as file:
-        file.write(content + "\n")
-        file.write(old_content)
+        # Write new content at the top followed by old content
+        with open(fpath, "w") as file:
+            if add_where=="head":
+                file.write(content + "\n")
+                file.write(old_content)
+            else:
+                file.write(old_content)
+                file.write(content + "\n")
 
 
 def fappend(fpath, content=None):
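Note: the new how argument is fuzzy-matched through strcmp, so "head", "start", "beginning", and "before" prepend while the other aliases append; .json files are dict-merged instead, with the existing keys winning when writing at the head. A usage sketch (file names hypothetical):

    from py2ls.ips import fupdate

    fupdate("log.txt", "newest entry", how="head")       # prepend to a text file
    fupdate("log.txt", "oldest entry", how="tail")       # append instead
    fupdate("config.json", {"retries": 3}, how="head")   # merge into the existing JSON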
@@ -2710,12 +2739,14 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-
-    if os.path.isdir(fpath):
-        return fpath
+
 
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
+    if os.path.isdir(fpath):
+        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+        print(fpath)
+        return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
     # Start creating directories from the root to the desired path
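Note: instead of returning an existing directory path unchanged, mkdir_nest now appends the OS separator first, so the return value can be concatenated with a file name directly; mkdir gains the same guarantee further below. A sketch of the contract (assuming a POSIX system):

    d = mkdir_nest("/tmp/exp1")    # returns "/tmp/exp1/" once the directory exists
    fpath = d + "result.csv"       # no manual slash handling needed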
@@ -2744,34 +2775,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
-    # Convert string to list
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
-    # Subfoldername should be unique
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str):  # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
-        # Get the slash type: "/" or "\"
-        stype = "/" if "/" in pardir else "\\"
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
-    # Check if the parent directory exists and is a directory path
+
     if os.path.isdir(pardir):
         os.chdir(pardir)  # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-
-            for folder in chdir:
-                # Check if the subfolder already exists
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
@@ -2791,6 +2815,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
+        rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+        print(rootdir)
     return rootdir
 
 
@@ -2805,22 +2831,25 @@ def figsave(*args, dpi=300):
     dir_save = None
     fname = None
     img = None
+    f_slash = "/" if "mac" in get_os().lower() else "\\"
     for arg in args:
         if isinstance(arg, str):
-            if
+            if f_slash in arg:
                 dir_save = arg
-
+            else:
                 fname = arg
         elif isinstance(arg, (Image.Image, np.ndarray)):
             img = arg  # Store the PIL image if provided
 
-    f_slash = "/" if "mac" in get_os().lower() else "\\"
     if dir_save is None:
         dir_save="./"
+    print(dir_save)
+    # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
+    print(dir_par)
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
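Note: figsave now computes the OS separator up front and uses it to classify string arguments: a string containing the separator is taken as the save directory, any other string as the file name. A sketch of the calling convention (paths hypothetical):

    from py2ls.ips import figsave

    figsave("/tmp/plots/", "fig1.pdf", dpi=300)  # the first string holds a slash, so it is the directory; the second is the file name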
@@ -4418,9 +4447,10 @@ def preview(var):
 
 # ! DataFrame
 def df_astype(
-    df: pd.DataFrame,
+    data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
     astype: str = "datetime",
+    skip_row:Union[str,list]=None,
     fmt: Optional[str] = None,
     inplace: bool = True,
     errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
@@ -4484,22 +4514,24 @@ def df_astype(
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
-        df = df.copy()
+        data = data.copy()
+    if skip_row is not None:
+        data = data.drop(index=skip_row, errors='ignore')
+    # If columns is None, apply to all columns
+    if columns is None:
+        columns = data.columns.tolist()
     # correct the astype input
     if isinstance(astype,str):
         astype = strcmp(astype, astypes)[0]
-        print(f"converting
+        print(f"converting as type: {astype}")
     elif isinstance(astype,dict):
         for col, dtype in astype.items():
             dtype='date' if dtype=="day" else dtype
-
-        return
-    # If columns is None, apply to all columns
-    if columns is None:
-        columns = df.columns
+            data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+        return data if not inplace else None
 
     # Ensure columns is a list
-    if isinstance(columns,
+    if isinstance(columns, str):
         columns = [columns]
 
     # Convert specified columns
@@ -4519,72 +4551,74 @@ def df_astype(
                 kwargs.pop("errors", None)
                 # convert it as type: datetime
                 if isinstance(column, int):
-                    df.iloc[:, column] = pd.to_datetime(
-                        df.iloc[:, column], format=fmt, errors=errors, **kwargs
+                    data.iloc[:, column] = pd.to_datetime(
+                        data.iloc[:, column], format=fmt, errors=errors, **kwargs
                     )
                     # further convert:
                     if astype == "time":
-                        df.iloc[:, column] = df.iloc[:, column].dt.time
+                        data.iloc[:, column] = data.iloc[:, column].dt.time
                     elif astype == "month":
-                        df.iloc[:, column] = df.iloc[:, column].dt.month
+                        data.iloc[:, column] = data.iloc[:, column].dt.month
                     elif astype == "year":
-                        df.iloc[:, column] = df.iloc[:, column].dt.year
+                        data.iloc[:, column] = data.iloc[:, column].dt.year
                     elif astype == "date" or astype == "day":
-                        df.iloc[:, column] = df.iloc[:, column].dt.date
+                        data.iloc[:, column] = data.iloc[:, column].dt.date
                     elif astype == "hour":
-                        df.iloc[:, column] = df.iloc[:, column].dt.hour
+                        data.iloc[:, column] = data.iloc[:, column].dt.hour
                     elif astype == "minute":
-                        df.iloc[:, column] = df.iloc[:, column].dt.minute
+                        data.iloc[:, column] = data.iloc[:, column].dt.minute
                     elif astype == "second":
-                        df.iloc[:, column] = df.iloc[:, column].dt.second
+                        data.iloc[:, column] = data.iloc[:, column].dt.second
                     elif astype == "week":
-                        df.iloc[:, column] = df.iloc[:, column].dt.day_name()
+                        data.iloc[:, column] = data.iloc[:, column].dt.day_name()
                 else:
-                    df[column] = (
+                    data[column] = (
                         pd.to_datetime(
-                            df[column], format=fmt, errors=errors, **kwargs
+                            data[column], format=fmt, errors=errors, **kwargs
                         )
                         if fmt
-                        else pd.to_datetime(df[column], errors=errors, **kwargs)
+                        else pd.to_datetime(data[column], errors=errors, **kwargs)
                     )
                     # further convert:
                     if astype == "time":
-                        df[column] = df[column].dt.time
+                        data[column] = data[column].dt.time
                     elif astype == "month":
-                        df[column] = df[column].dt.month
+                        data[column] = data[column].dt.month
                     elif astype == "year":
-                        df[column] = df[column].dt.year
+                        data[column] = data[column].dt.year
                     elif astype == "date":
-                        df[column] = df[column].dt.date
+                        data[column] = data[column].dt.date
                     elif astype == "hour":
-                        df[column] = df[column].dt.hour
+                        data[column] = data[column].dt.hour
                     elif astype == "minute":
-                        df[column] = df[column].dt.minute
+                        data[column] = data[column].dt.minute
                    elif astype == "second":
-                        df[column] = df[column].dt.second
+                        data[column] = data[column].dt.second
                     elif astype == "week":
-                        df[column] = df[column].dt.day_name()
+                        data[column] = data[column].dt.day_name()
 
             elif astype == "numeric":
                 kwargs.pop("errors", None)
-                df[column] = pd.to_numeric(df[column], errors=errors, **kwargs)
+                data[column] = pd.to_numeric(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to numeric.")
             elif astype == "timedelta":
                 kwargs.pop("errors", None)
-                df[column] = pd.to_timedelta(df[column], errors=errors, **kwargs)
+                data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
             else:
                 # Convert to other types (e.g., float, int)
-                df[column] = df[column].astype(astype)
+                data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
         except Exception as e:
            print(f"Error converting '{column}' to {astype}: {e}")
-
-
-
+    try:
+        display(data.info()[:10])
+    except:
+        pass
+    return data
 
 
-# ! DataFrame
+# ! DataFrame
 def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     """
     Sort a DataFrame by a specified column based on a custom order or by count.
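Note: df_astype is reworked around a data parameter, gains skip_row (rows dropped before conversion), applies to all columns when columns is None, and now returns the converted frame. A usage sketch (column names hypothetical):

    import pandas as pd
    from py2ls.ips import df_astype

    df = pd.DataFrame({"when": ["2024-01-02", "2024-03-04"], "n": ["1", "2"]})
    df_astype(df, columns="when", astype="date")    # converts in place by default
    df_astype(df, columns="n", astype="numeric")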
@@ -4601,7 +4635,7 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     Returns:
     - Sorted DataFrame if inplace is False, otherwise None.
     """
-    if column not in df.columns:
+    if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
     if isinstance(by, str) and 'count' in by.lower():
@@ -4624,11 +4658,11 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 
     try:
         if inplace:  # replace the original
-            df.sort_values(column, ascending=ascending, inplace=True, **kwargs)
+            data.sort_values(column, ascending=ascending, inplace=True, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}'")
             return None
         else:
-            sorted_df = df.sort_values(column, ascending=ascending, **kwargs)
+            sorted_df = data.sort_values(column, ascending=ascending, **kwargs)
             print(f"Successfully sorted DataFrame by '{column}' using custom order.")
             return sorted_df
     except Exception as e:
@@ -4636,7 +4670,6 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
         return df
 
 
-
 # # Example usage:
 # # Sample DataFrame
 # data = {
@@ -4667,6 +4700,236 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
 # display(df_month)
 
 
+def df_merge(
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    use_index: bool = True,
+    columns: list = ["col_left", "col_right"],
+    how: str = "left",
+) -> pd.DataFrame:
+    """
+    Merges two DataFrames based on either the index or shared columns with matching data types.
+    usage:
+    #(1) if the index are the same
+    df_merged = df_merge(df1, df2, use_index=True(defalut), how='outer')
+    #(2) if there are shaed columns, then based on shared columns
+    df_merged = df_merge(df1, df2, how='outer')
+    #(3) if columns: then based on the specific columns
+    df_merged = df_merge(df1, df2, columns=["col_left", "col_right"],how='outer')
+    Parameters:
+    - df1 (pd.DataFrame): The first DataFrame.
+    - df2 (pd.DataFrame): The second DataFrame.
+    - use_index (bool): If True, first try to merge by index if they are comparable; otherwise, fall back to column-based merge.
+    - how (str): Type of merge to perform: 'inner', 'outer', 'left', or 'right'. Default is 'inner'.
+        'inner': only the rows that have matching values in both DataFrames (intersection)
+        'outer': keeps all rows from both DataFrames and fills in missing values with NaN
+        'left': keeps all rows from the left DataFrame and matches rows from the right DataFrame
+        'right': keeps all rows from the right DataFrame and matches rows from the left DataFrame, filling with NaN if there is no match.
+
+    Returns:
+    - pd.DataFrame: The merged DataFrame.
+    """
+
+    # 1. Check if indices are comparable (same length and types)
+    if use_index or df1.index.equals(df2.index):
+        print(f"Merging based on index using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
+        return df_merged
+
+    # 2. Find common columns with the same dtype
+    common_columns = df1.columns.intersection(df2.columns)
+    shared_columns = []
+    for col in common_columns:
+        if df1[col].dtype == df2[col].dtype:
+            shared_columns.append(col)
+    if not isinstance(columns, list):
+        columns = [columns]
+    if len(columns) != 2:
+        raise ValueError(
+            "'columns':list shoule be a list: columns=['col_left','col_right']"
+        )
+    if all(columns):
+        print(f"Merging based on columns: {columns} using '{how}' join...")
+        df_merged = pd.merge(df1, df2, left_on=columns[0], right_on=columns[1], how=how)
+    elif shared_columns:
+        print(
+            f"Merging based on shared columns: {shared_columns} using '{how}' join..."
+        )
+        df_merged = pd.merge(df1, df2, on=shared_columns, how=how)
+    else:
+        raise ValueError(
+            "No common columns with matching data types to merge on, and indices are not comparable."
+        )
+    return df_merged
+
+def df_fillna(
+    data: pd.DataFrame,
+    method: str = "mean",
+    axis: int = 0,# column-wise
+    constant: float = None,
+    inplace: bool = True,
+) -> pd.DataFrame:
+    """
+    Fill missing values in a DataFrame using specified imputation method.
+
+    Parameters:
+    data (pd.DataFrame): The DataFrame to fill missing values.
+    method (str): The imputation method to use. Options are:
+        - 'mean': Replace missing values with the mean of the column.
+        - 'median': Replace missing values with the median of the column.
+        - 'most_frequent': Replace missing values with the most frequent value in the column.
+        - 'constant': Replace missing values with a constant value provided by the `constant` parameter.
+        - 'knn': Use K-Nearest Neighbors imputation.
+        - 'iterative': Use Iterative imputation.
+    axis (int): The axis along which to impute:
+        - 0: Impute column-wise (default).
+        - 1: Impute row-wise.
+    constant (float, optional): Constant value to use for filling NaNs if method is 'constant'.
+    inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
+
+    """
+
+    if data.empty:
+        raise ValueError("Input DataFrame is empty.")
+
+    # Validate method
+    methods = ["mean", "median", "most_frequent", "constant", "knn", "iterative"]
+    method = strcmp(method, methods)[0]
+
+    # If using constant method, ask for a constant value
+    if constant is not None:
+        method = "constant"
+        try:
+            constant = float(constant)
+        except ValueError:
+            raise ValueError("Constant value must be a number.")
+
+    # Initialize SimpleImputer with the chosen method
+    if method == "constant":
+        imputer = SimpleImputer(strategy=method, fill_value=constant)
+    elif method == "knn":
+        from sklearn.impute import KNNImputer
+
+        imputer = KNNImputer(n_neighbors=n_neighbors)
+    elif method == "iterative":
+        from sklearn.impute import IterativeImputer
+
+        imputer = IterativeImputer(max_iter=max_iter)
+    else:
+        from sklearn.impute import SimpleImputer
+
+        imputer = SimpleImputer(strategy=method)
+
+    # Fit and transform the data
+    if axis == 0:
+        # Impute column-wise
+        imputed_data = imputer.fit_transform(data)
+        imputed_data.shape
+    elif axis == 1:
+        # Impute row-wise
+        imputed_data = imputer.fit_transform(data.T)
+        imputed_data.shape
+    else:
+        raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
+
+    df_filled = pd.DataFrame(
+        imputed_data if axis == 0 else imputed_data.T,
+        index=data.index,# if axis == 0 else data.columns,
+        columns=data.columns,# if axis == 0 else data.index,
+    )
+
+    if inplace:
+        data.update(df_filled)
+        return None # replace original
+    else:
+        return df_filled
+def df_scaler(
+    data: pd.DataFrame,
+    method="standard",
+    columns=None,  # default, select all numeric col/row
+    inplace=False,
+    verbose=False,  # show usage
+    axis=0,  # defalut column-wise
+    **kwargs,
+):
+    """
+    df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)
+
+    Parameters:
+    - data: pandas DataFrame to be scaled.
+    - method: Scaler type ('standard', 'minmax', 'robust'). Default is 'standard'.
+    - columns: List of columns (for axis=0) or rows (for axis=1) to scale.
+        If None, all numeric columns/rows will be scaled.
+    - inplace: If True, modify the DataFrame in place. Otherwise, return a new DataFrame.
+    - axis: Axis along which to scale. 0 for column-wise, 1 for row-wise. Default is 0.
+    - verbose: If True, prints logs of the process.
+    - kwargs: Additional arguments to be passed to the scaler.
+    """
+    if verbose:
+        print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
+
+    methods = ["standard", "minmax", "robust"]
+    method = strcmp(method, methods)[0]
+    if method == "standard":
+        from sklearn.preprocessing import StandardScaler
+
+        scaler = StandardScaler(**kwargs)
+    elif method == "minmax":
+        from sklearn.preprocessing import MinMaxScaler
+
+        scaler = MinMaxScaler(**kwargs)
+    elif method == "robust":
+        from sklearn.preprocessing import RobustScaler
+
+        scaler = RobustScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+
+    if axis == 0:
+        # Column-wise scaling (default)
+        if columns is None:
+            columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
+        non_numeric_columns = data.columns.difference(columns)
+        print(f"Scaling columns")
+
+        scaled_data = scaler.fit_transform(data[columns])
+
+        if inplace:
+            data[columns] = scaled_data
+            print("Original DataFrame modified in place (column-wise).")
+        else:
+            scaled_df = pd.concat(
+                [
+                    pd.DataFrame(scaled_data, columns=columns, index=data.index),
+                    data[non_numeric_columns],
+                ],
+                axis=1,
+            )
+            scaled_df = scaled_df[data.columns]  # Maintain column order
+            return scaled_df
+
+    elif axis == 1:
+        # Row-wise scaling
+        if columns is None:
+            columns = data.index.tolist()
+        numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
+        if numeric_rows.empty:
+            raise ValueError("No numeric rows to scale.")
+
+        print(f"Scaling rows")
+
+        scaled_data = scaler.fit_transform(
+            numeric_rows.T
+        ).T  # Transpose for scaling and then back
+
+        if inplace:
+            data.loc[numeric_rows.index] = scaled_data
+            print("Original DataFrame modified in place (row-wise).")
+        else:
+            scaled_df = data.copy()
+            scaled_df.loc[numeric_rows.index] = scaled_data
+            return scaled_df
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
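Note: df_merge, df_fillna, and df_scaler are the bulk of this release. A combined sketch of how they compose (sklearn is required for the imputation and scaling paths; the frames are hypothetical):

    import pandas as pd
    from py2ls.ips import df_merge, df_fillna, df_scaler

    df1 = pd.DataFrame({"a": [1.0, None, 3.0]})
    df2 = pd.DataFrame({"b": [10.0, 20.0, 30.0]})

    merged = df_merge(df1, df2, use_index=True, how="outer")   # index-based join
    filled = df_fillna(merged, method="mean", inplace=False)   # column means replace NaN
    scaled = df_scaler(filled, method="minmax")                # returns a new min-max-scaled frame

As written in the diff, df_fillna's "knn" and "iterative" branches reference n_neighbors and max_iter that are not parameters, and the "constant" branch uses SimpleImputer before it is imported, so only the default strategies run as-is.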
@@ -5387,4 +5650,4 @@ def print_pd_usage(
             i_ = i_.replace("=", "\t= ") + ","
             print(i_) if i == 0 else print("\t", i_)
     else:
-        print(usage)
+        print(usage)