py2ls 0.2.5.12__py3-none-any.whl → 0.2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/__init__.py +1 -5
- py2ls/ich2ls.py +1955 -296
- py2ls/ips.py +2782 -736
- py2ls/netfinder.py +12 -5
- py2ls/plot.py +13 -7
- py2ls/stats.py +1 -144
- {py2ls-0.2.5.12.dist-info → py2ls-0.2.5.15.dist-info}/METADATA +89 -232
- {py2ls-0.2.5.12.dist-info → py2ls-0.2.5.15.dist-info}/RECORD +11 -12
- {py2ls-0.2.5.12.dist-info → py2ls-0.2.5.15.dist-info}/WHEEL +1 -1
- py2ls/ips_lab.py +0 -17172
py2ls/ips.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1
1
|
from tkinter import FALSE
|
2
2
|
import numpy as np
|
3
3
|
import pandas as pd
|
4
|
-
import sys
|
5
|
-
import os
|
4
|
+
import sys # built-in
|
5
|
+
import os # built-in
|
6
6
|
from IPython.display import display
|
7
7
|
import shutil
|
8
8
|
import logging
|
9
9
|
from pathlib import Path
|
10
10
|
from datetime import datetime, date, time
|
11
|
-
import re
|
11
|
+
import re # built-in
|
12
12
|
import stat
|
13
13
|
import platform
|
14
14
|
|
15
|
-
from typing import Dict, List, Optional, Union, Any,Tuple
|
16
|
-
|
15
|
+
from typing import Dict, List, Optional, Union, Any, Tuple, Literal,Callable
|
17
16
|
from regex import X
|
18
17
|
|
19
18
|
try:
|
@@ -27,7 +26,218 @@ import warnings
|
|
27
26
|
warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
|
28
27
|
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
|
29
28
|
warnings.filterwarnings("ignore")
|
29
|
+
try:
|
30
|
+
import pkg_resources
|
31
|
+
except ImportError:
|
32
|
+
pkg_resources = None
|
33
|
+
import glob # built-in
|
34
|
+
import pkg_resources # built-in
|
35
|
+
class PkgManager:
|
36
|
+
"""
|
37
|
+
PkgManager.uninstall("py2ls")
|
38
|
+
PkgManager.uninstall("py2ls", mode="startswith")
|
39
|
+
PkgManager.uninstall("py2ls", mode="endswith")
|
40
|
+
PkgManager.uninstall("py2ls", mode="contains")
|
41
|
+
PkgManager.uninstall("py2ls", mode="regex")
|
42
|
+
|
43
|
+
PkgManager.timemachine()
|
44
|
+
"""
|
45
|
+
|
46
|
+
@staticmethod
|
47
|
+
def uninstall(
|
48
|
+
kw: Union[str, List[str]],
|
49
|
+
mode: str = "exact",
|
50
|
+
dry_run: bool = False,
|
51
|
+
make_backup: bool = True,
|
52
|
+
make_log: bool = True,
|
53
|
+
station: Optional[str] = None,
|
54
|
+
) -> None:
|
55
|
+
if station is None:
|
56
|
+
station = os.path.dirname(os.path.dirname(sys.executable))
|
57
|
+
os.makedirs(station, exist_ok=True)
|
58
|
+
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
59
|
+
|
60
|
+
if isinstance(kw, str):
|
61
|
+
kw = [kw]
|
62
|
+
kw = [k.lower() for k in kw] if mode != "regex" else kw
|
63
|
+
mode = mode.lower()
|
64
|
+
valid_modes = {"exact", "startswith", "endswith", "contains", "regex"}
|
65
|
+
if mode not in valid_modes:
|
66
|
+
raise ValueError(f"Mode must be one of {valid_modes}")
|
67
|
+
|
68
|
+
installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
|
69
|
+
matched: Set[str] = set()
|
70
|
+
|
71
|
+
for name in installed_packages:
|
72
|
+
for key in kw:
|
73
|
+
if (
|
74
|
+
(mode == "exact" and name == key)
|
75
|
+
or (mode == "startswith" and name.startswith(key))
|
76
|
+
or (mode == "endswith" and name.endswith(key))
|
77
|
+
or (mode == "contains" and key in name)
|
78
|
+
or (mode == "regex" and re.search(key, name))
|
79
|
+
):
|
80
|
+
matched.add(name)
|
81
|
+
break
|
82
|
+
|
83
|
+
if not matched:
|
84
|
+
print("No packages matched the criteria.")
|
85
|
+
return
|
86
|
+
|
87
|
+
if make_backup and not dry_run:
|
88
|
+
backup_path = os.path.join(station, f"requirements_backup_{timestamp}.txt")
|
89
|
+
with open(backup_path, "w") as f:
|
90
|
+
subprocess.run(["pip", "freeze"], stdout=f, check=True)
|
91
|
+
print(f"Backup created at: '{backup_path}'")
|
92
|
+
|
93
|
+
if dry_run:
|
94
|
+
print("[DRY RUN] The following packages would be uninstalled:")
|
95
|
+
for pkg in sorted(matched):
|
96
|
+
print(f" - {pkg}=={installed_packages[pkg]}")
|
97
|
+
return
|
98
|
+
|
99
|
+
print(f"[UNINSTALLING] {len(matched)} packages:")
|
100
|
+
for pkg in sorted(matched):
|
101
|
+
print(f" - {pkg}=={installed_packages[pkg]}")
|
102
|
+
subprocess.run(["pip", "uninstall", "-y", pkg], check=True)
|
103
|
+
|
104
|
+
if make_log:
|
105
|
+
log_path = os.path.join(station, f"uninstall_{timestamp}.txt")
|
106
|
+
with open(log_path, "w") as f:
|
107
|
+
f.write(f"# Uninstallation log created at {timestamp}\n")
|
108
|
+
f.write(f"# Mode: {mode}, Keywords: {kw}\n\n")
|
109
|
+
for pkg in sorted(matched):
|
110
|
+
f.write(f"{pkg}=={installed_packages[pkg]}\n")
|
111
|
+
print(f"Log written to '{log_path}'")
|
112
|
+
|
113
|
+
@staticmethod
|
114
|
+
def list_backups(station: Optional[str] = None) -> List[str]:
|
115
|
+
if station is None:
|
116
|
+
station = os.path.dirname(sys.executable)
|
117
|
+
if os.name == "nt":
|
118
|
+
station = os.path.dirname(station)
|
119
|
+
return sorted(glob.glob(os.path.join(station, "requirements_backup_*.txt")))
|
120
|
+
|
121
|
+
@staticmethod
|
122
|
+
def list_logs(station: Optional[str] = None) -> List[str]:
|
123
|
+
if station is None:
|
124
|
+
station = os.path.dirname(sys.executable)
|
125
|
+
if os.name == "nt":
|
126
|
+
station = os.path.dirname(station)
|
127
|
+
return sorted(glob.glob(os.path.join(station, "uninstall_*.txt")))
|
128
|
+
|
129
|
+
@staticmethod
|
130
|
+
def restore(
|
131
|
+
timestamp: Optional[str] = None,
|
132
|
+
station: Optional[str] = None,
|
133
|
+
dry_run: bool = False,
|
134
|
+
) -> None:
|
135
|
+
if station is None:
|
136
|
+
station = os.path.dirname(sys.executable)
|
137
|
+
if os.name == "nt":
|
138
|
+
station = os.path.dirname(station)
|
139
|
+
|
140
|
+
backups = PkgManager.list_backups(station)
|
141
|
+
logs = PkgManager.list_logs(station)
|
142
|
+
|
143
|
+
if not timestamp:
|
144
|
+
print("Available restore points:\n\nBackups:")
|
145
|
+
for i, backup in enumerate(backups, 1):
|
146
|
+
ts = os.path.basename(backup)[18:-4]
|
147
|
+
print(f" {i}. {ts} (backup)")
|
148
|
+
print("\nUninstall logs:")
|
149
|
+
for i, log in enumerate(logs, len(backups) + 1):
|
150
|
+
ts = os.path.basename(log)[10:-4]
|
151
|
+
print(f" {i}. {ts} (log)")
|
152
|
+
print("\nSpecify timestamp or selection number to restore.")
|
153
|
+
return
|
154
|
+
|
155
|
+
try:
|
156
|
+
selection = int(timestamp)
|
157
|
+
all_files = backups + logs
|
158
|
+
if 1 <= selection <= len(all_files):
|
159
|
+
file_path = all_files[selection - 1]
|
160
|
+
is_log = selection > len(backups)
|
161
|
+
else:
|
162
|
+
raise ValueError("Invalid selection number")
|
163
|
+
except ValueError:
|
164
|
+
backup_pattern = os.path.join(
|
165
|
+
station, f"requirements_backup_{timestamp}.txt"
|
166
|
+
)
|
167
|
+
log_pattern = os.path.join(station, f"uninstall_{timestamp}.txt")
|
168
|
+
matching_backups = glob.glob(backup_pattern)
|
169
|
+
matching_logs = glob.glob(log_pattern)
|
170
|
+
|
171
|
+
if matching_backups:
|
172
|
+
file_path = matching_backups[0]
|
173
|
+
is_log = False
|
174
|
+
elif matching_logs:
|
175
|
+
file_path = matching_logs[0]
|
176
|
+
is_log = True
|
177
|
+
else:
|
178
|
+
print(f"No backup or log found for timestamp: {timestamp}")
|
179
|
+
return
|
180
|
+
|
181
|
+
with open(file_path, "r") as f:
|
182
|
+
packages = [
|
183
|
+
line.strip() for line in f if line.strip() and not line.startswith("#")
|
184
|
+
]
|
185
|
+
|
186
|
+
if dry_run:
|
187
|
+
print(
|
188
|
+
f"[DRY RUN] Would restore {len(packages)} packages from:\n {file_path}"
|
189
|
+
)
|
190
|
+
for pkg in packages:
|
191
|
+
print(f" - {pkg}")
|
192
|
+
return
|
193
|
+
|
194
|
+
print(f"[RESTORING] {len(packages)} packages from:\n {file_path}")
|
195
|
+
for pkg in packages:
|
196
|
+
print(f" - Installing {pkg}")
|
197
|
+
subprocess.run(["pip", "install", pkg], check=True)
|
30
198
|
|
199
|
+
@staticmethod
|
200
|
+
def timemachine(station: Optional[str] = None) -> None:
|
201
|
+
if station is None:
|
202
|
+
station = os.path.dirname(sys.executable)
|
203
|
+
if os.name == "nt":
|
204
|
+
station = os.path.dirname(station)
|
205
|
+
|
206
|
+
backups = PkgManager.list_backups(station)
|
207
|
+
logs = PkgManager.list_logs(station)
|
208
|
+
|
209
|
+
if not backups and not logs:
|
210
|
+
print("No backup or log files found.")
|
211
|
+
return
|
212
|
+
|
213
|
+
print("\nTime Machine - Available Restore Points:")
|
214
|
+
print("--------------------------------------")
|
215
|
+
print("\nBackups (complete environment snapshots):")
|
216
|
+
for i, backup in enumerate(backups, 1):
|
217
|
+
ts = os.path.basename(backup)[18:-4]
|
218
|
+
print(f" {i}. {ts}")
|
219
|
+
print("\nUninstall Logs (specific package lists):")
|
220
|
+
for i, log in enumerate(logs, len(backups) + 1):
|
221
|
+
ts = os.path.basename(log)[10:-4]
|
222
|
+
print(f" {i}. {ts}")
|
223
|
+
print("\n0. Exit Time Machine")
|
224
|
+
|
225
|
+
while True:
|
226
|
+
try:
|
227
|
+
choice = input("\nSelect a restore point (number) or '0' to exit: ")
|
228
|
+
if choice == "0":
|
229
|
+
return
|
230
|
+
selection = int(choice)
|
231
|
+
all_files = backups + logs
|
232
|
+
if 1 <= selection <= len(all_files):
|
233
|
+
file_path = all_files[selection - 1]
|
234
|
+
timestamp = os.path.basename(file_path).split("_")[-1][:-4]
|
235
|
+
PkgManager.restore(timestamp, station)
|
236
|
+
return
|
237
|
+
else:
|
238
|
+
print("Invalid selection. Please try again.")
|
239
|
+
except ValueError:
|
240
|
+
print("Please enter a valid number.")
|
31
241
|
|
32
242
|
def _yaoshi_fernet(mima="mimashigudingde",yan=b"mimashigudingde",verbose=True):
|
33
243
|
import base64
|
@@ -1663,76 +1873,465 @@ def flatten(nested: Any, unique_list=True, verbose=False):
|
|
1663
1873
|
return flattened_list
|
1664
1874
|
|
1665
1875
|
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1876
|
+
#! ===========extract_text===========
|
1877
|
+
def extract_text(
|
1878
|
+
text: Union[str, List[str]],
|
1879
|
+
patterns: Union[str, List[str]],
|
1880
|
+
*,
|
1881
|
+
mode: Literal["between", "split", "extract"] = "between",
|
1882
|
+
keep: Literal["none", "left", "right", "both", "markers"] = "none",
|
1883
|
+
case: Literal["sensitive", "insensitive"] = "insensitive",
|
1884
|
+
all_matches: bool = False,
|
1885
|
+
positions: bool = False,
|
1886
|
+
regex: bool = False,
|
1887
|
+
delimiter: Optional[str] = None,
|
1888
|
+
trim: bool = True,
|
1889
|
+
as_dict: bool = False,
|
1890
|
+
verbose: bool = False,
|
1891
|
+
**kwargs,
|
1892
|
+
) -> Union[List[str], Tuple[int, str], Dict[str, Any], List[Dict[str, Any]], None]:
|
1893
|
+
"""
|
1894
|
+
Ultimate text extraction tool with enhanced reliability and features.
|
1677
1895
|
|
1678
|
-
|
1679
|
-
|
1680
|
-
|
1681
|
-
|
1682
|
-
|
1896
|
+
Key improvements:
|
1897
|
+
- Robust split mode with proper delimiter handling
|
1898
|
+
- Consistent return types across all modes
|
1899
|
+
- Improved pattern matching logic
|
1900
|
+
- Better edge case handling
|
1683
1901
|
|
1684
|
-
|
1685
|
-
|
1686
|
-
#
|
1687
|
-
|
1688
|
-
|
1689
|
-
#
|
1690
|
-
|
1691
|
-
|
1692
|
-
#
|
1693
|
-
|
1694
|
-
|
1695
|
-
#
|
1696
|
-
|
1697
|
-
|
1698
|
-
|
1699
|
-
|
1700
|
-
|
1701
|
-
|
1702
|
-
|
1703
|
-
|
1704
|
-
|
1705
|
-
|
1706
|
-
|
1707
|
-
|
1708
|
-
|
1709
|
-
|
1710
|
-
|
1711
|
-
|
1712
|
-
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1716
|
-
#
|
1717
|
-
|
1718
|
-
|
1719
|
-
|
1720
|
-
|
1721
|
-
#
|
1722
|
-
|
1723
|
-
|
1724
|
-
|
1725
|
-
|
1726
|
-
#
|
1727
|
-
|
1728
|
-
|
1729
|
-
|
1730
|
-
|
1731
|
-
|
1732
|
-
|
1733
|
-
|
1902
|
+
|
1903
|
+
print(extract_text("A,B,C", ",", mode="split", keep="none", all_matches=True))
|
1904
|
+
# Correctly returns: ['A', 'B', 'C']
|
1905
|
+
|
1906
|
+
print(extract_text("A,B,C", ",", mode="split", keep="left"))
|
1907
|
+
# Returns: ['A,', 'B,', 'C']
|
1908
|
+
|
1909
|
+
print(extract_text("A,B,C", ",", mode="split", keep="right"))
|
1910
|
+
# Returns: [',B', ',C']
|
1911
|
+
|
1912
|
+
print(extract_text("A,B,C", ",", mode="split", keep="both"))
|
1913
|
+
# Returns: ['A', ',', 'B', ',', 'C']
|
1914
|
+
"""
|
1915
|
+
if verbose:
|
1916
|
+
print("""
|
1917
|
+
extract_text(
|
1918
|
+
text: Union[str, List[str]],
|
1919
|
+
patterns: Union[str, List[str]],
|
1920
|
+
*,
|
1921
|
+
mode: Literal["between", "split", "extract"] = "between",
|
1922
|
+
keep: Literal["none", "left", "right", "both", "markers"] = "none",
|
1923
|
+
case: Literal["sensitive", "insensitive"] = "insensitive",
|
1924
|
+
all_matches: bool = False,
|
1925
|
+
positions: bool = False,
|
1926
|
+
regex: bool = False,
|
1927
|
+
delimiter: Optional[str] = None,
|
1928
|
+
trim: bool = True,
|
1929
|
+
as_dict: bool = False,
|
1930
|
+
verbose: bool = False,
|
1931
|
+
**kwargs,
|
1932
|
+
)
|
1933
|
+
""")
|
1934
|
+
# Normalization and validation
|
1935
|
+
text = _normalize_text(text, delimiter)
|
1936
|
+
patterns = _validate_patterns(patterns)
|
1937
|
+
flags = re.IGNORECASE if case == "insensitive" else 0
|
1938
|
+
|
1939
|
+
# Find all matches with enhanced validation
|
1940
|
+
matches = _find_matches(text, patterns, regex, flags)
|
1941
|
+
if not matches:
|
1942
|
+
return None
|
1943
|
+
|
1944
|
+
# Mode-specific processing
|
1945
|
+
if mode == "extract":
|
1946
|
+
return _handle_extract(matches, all_matches, as_dict, positions, trim)
|
1947
|
+
elif mode == "split":
|
1948
|
+
return _handle_split(text, matches, keep, all_matches, as_dict, positions, trim)
|
1949
|
+
elif mode == "between":
|
1950
|
+
return _handle_between(text, matches, patterns, keep, as_dict, positions, trim)
|
1951
|
+
else:
|
1952
|
+
raise ValueError(f"Invalid mode: {mode}")
|
1953
|
+
|
1954
|
+
|
1955
|
+
def _normalize_text(text: Union[str, List[str]], delimiter: Optional[str]) -> str:
|
1956
|
+
"""Normalize text input to single string"""
|
1957
|
+
if isinstance(text, list):
|
1958
|
+
return delimiter.join(text) if delimiter else " ".join(text)
|
1959
|
+
return text
|
1960
|
+
|
1961
|
+
|
1962
|
+
def _validate_patterns(patterns: Union[str, List[str]]) -> List[str]:
|
1963
|
+
"""Validate and normalize patterns"""
|
1964
|
+
if isinstance(patterns, str):
|
1965
|
+
return [patterns]
|
1966
|
+
if not patterns:
|
1967
|
+
raise ValueError("At least one pattern required")
|
1968
|
+
return patterns
|
1969
|
+
|
1970
|
+
|
1971
|
+
def _find_matches(
|
1972
|
+
text: str, patterns: List[str], regex: bool, flags: int
|
1973
|
+
) -> List[dict]:
|
1974
|
+
"""Find all pattern matches with enhanced regex handling"""
|
1975
|
+
matches = []
|
1976
|
+
for pattern in patterns:
|
1977
|
+
try:
|
1978
|
+
search_pattern = pattern if regex else re.escape(pattern)
|
1979
|
+
for match in re.finditer(search_pattern, text, flags=flags):
|
1980
|
+
matches.append(
|
1981
|
+
{
|
1982
|
+
"text": match.group(),
|
1983
|
+
"start": match.start(),
|
1984
|
+
"end": match.end(),
|
1985
|
+
"pattern": pattern,
|
1986
|
+
"full_match": match,
|
1987
|
+
}
|
1988
|
+
)
|
1989
|
+
except re.error as e:
|
1990
|
+
raise ValueError(f"Invalid pattern '{pattern}': {e}")
|
1991
|
+
return sorted(matches, key=lambda x: x["start"])
|
1992
|
+
|
1993
|
+
|
1994
|
+
def _handle_extract(
|
1995
|
+
matches: List[dict], all_matches: bool, as_dict: bool, positions: bool, trim: bool
|
1996
|
+
) -> Union[List, dict]:
|
1997
|
+
"""Handle text extraction of matched patterns"""
|
1998
|
+
results = []
|
1999
|
+
for match in matches if all_matches else [matches[0]]:
|
2000
|
+
content = match["text"].strip() if trim else match["text"]
|
2001
|
+
result = (
|
2002
|
+
{
|
2003
|
+
"text": content,
|
2004
|
+
"start": match["start"],
|
2005
|
+
"end": match["end"],
|
2006
|
+
"pattern": match["pattern"],
|
2007
|
+
}
|
2008
|
+
if as_dict
|
2009
|
+
else content
|
2010
|
+
)
|
2011
|
+
if positions and as_dict:
|
2012
|
+
result["positions"] = [(match["start"], match["end"])]
|
2013
|
+
results.append(result)
|
2014
|
+
|
2015
|
+
return results[0] if not all_matches else results
|
2016
|
+
|
2017
|
+
|
2018
|
+
def _create_part(
|
2019
|
+
content: str,
|
2020
|
+
start: int,
|
2021
|
+
end: int,
|
2022
|
+
match: Optional[dict],
|
2023
|
+
as_dict: bool,
|
2024
|
+
positions: bool,
|
2025
|
+
trim: bool,
|
2026
|
+
) -> Union[str, dict]:
|
2027
|
+
"""Create a standardized result part"""
|
2028
|
+
content = content.strip() if trim else content
|
2029
|
+
if not as_dict:
|
2030
|
+
return content
|
2031
|
+
|
2032
|
+
part = {
|
2033
|
+
"text": content,
|
2034
|
+
"start": start,
|
2035
|
+
"end": end,
|
2036
|
+
"pattern": match["pattern"] if match else None,
|
2037
|
+
}
|
2038
|
+
if positions and match:
|
2039
|
+
part["positions"] = [(match["start"], match["end"])]
|
2040
|
+
return part
|
2041
|
+
|
2042
|
+
|
2043
|
+
def _handle_between(
|
2044
|
+
text: str,
|
2045
|
+
matches: List[dict],
|
2046
|
+
patterns: List[str],
|
2047
|
+
keep: str,
|
2048
|
+
as_dict: bool,
|
2049
|
+
positions: bool,
|
2050
|
+
trim: bool,
|
2051
|
+
) -> Union[Tuple, dict]:
|
2052
|
+
"""Reliable between-mode implementation with boundary checks"""
|
2053
|
+
first_pattern, last_pattern = patterns[0], patterns[-1]
|
2054
|
+
first_matches = [m for m in matches if m["pattern"] == first_pattern]
|
2055
|
+
last_matches = [m for m in matches if m["pattern"] == last_pattern]
|
2056
|
+
|
2057
|
+
if not first_matches or not last_matches:
|
2058
|
+
return None
|
2059
|
+
|
2060
|
+
first = first_matches[0]
|
2061
|
+
last = last_matches[-1]
|
2062
|
+
|
2063
|
+
if first["start"] > last["start"]:
|
2064
|
+
return None
|
2065
|
+
|
2066
|
+
# Calculate extraction window
|
2067
|
+
start, end = first["start"], last["end"]
|
2068
|
+
if keep == "none":
|
2069
|
+
start, end = first["end"], last["start"]
|
2070
|
+
elif keep == "left":
|
2071
|
+
end = last["start"]
|
2072
|
+
elif keep == "right":
|
2073
|
+
start = first["end"]
|
2074
|
+
|
2075
|
+
extracted = text[start:end].strip() if trim else text[start:end]
|
2076
|
+
|
2077
|
+
if as_dict:
|
2078
|
+
result = {
|
2079
|
+
"text": extracted,
|
2080
|
+
"start": start,
|
2081
|
+
"end": end,
|
2082
|
+
"patterns": patterns,
|
2083
|
+
"match_positions": [(m["start"], m["end"]) for m in matches],
|
2084
|
+
}
|
2085
|
+
return result
|
2086
|
+
|
2087
|
+
return (
|
2088
|
+
(start, extracted)
|
2089
|
+
if not positions
|
2090
|
+
else (start, extracted, [(m["start"], m["end"]) for m in matches])
|
2091
|
+
)
|
2092
|
+
|
2093
|
+
|
2094
|
+
def _handle_split(
|
2095
|
+
text: str,
|
2096
|
+
matches: List[dict],
|
2097
|
+
keep: str,
|
2098
|
+
all_matches: bool,
|
2099
|
+
as_dict: bool,
|
2100
|
+
positions: bool,
|
2101
|
+
trim: bool,
|
2102
|
+
) -> Union[List, dict]:
|
2103
|
+
"""Split text with proper handling of keep='both' to include delimiters on both sides"""
|
2104
|
+
if not matches:
|
2105
|
+
return (
|
2106
|
+
[text]
|
2107
|
+
if not as_dict
|
2108
|
+
else [{"text": text, "start": 0, "end": len(text), "pattern": None}]
|
2109
|
+
)
|
2110
|
+
|
2111
|
+
parts = []
|
2112
|
+
prev_end = 0
|
2113
|
+
process_matches = matches if all_matches else [matches[0]]
|
2114
|
+
|
2115
|
+
# Special handling for keep="both"
|
2116
|
+
if keep == "both":
|
2117
|
+
for i, match in enumerate(process_matches):
|
2118
|
+
start, end = match["start"], match["end"]
|
2119
|
+
matched_text = text[start:end]
|
2120
|
+
|
2121
|
+
# First segment (text before first delimiter + first delimiter)
|
2122
|
+
if i == 0:
|
2123
|
+
segment = text[prev_end:end] # From start to end of first delimiter
|
2124
|
+
if trim:
|
2125
|
+
segment = segment.strip()
|
2126
|
+
if segment or not trim:
|
2127
|
+
if as_dict:
|
2128
|
+
parts.append(
|
2129
|
+
{
|
2130
|
+
"text": segment,
|
2131
|
+
"start": prev_end,
|
2132
|
+
"end": end,
|
2133
|
+
"pattern": match["pattern"],
|
2134
|
+
**({"positions": [(start, end)]} if positions else {}),
|
2135
|
+
}
|
2136
|
+
)
|
2137
|
+
else:
|
2138
|
+
parts.append(segment)
|
2139
|
+
prev_end = end
|
2140
|
+
|
2141
|
+
# Middle segments (delimiter + text + next delimiter)
|
2142
|
+
if i > 0 and i < len(process_matches):
|
2143
|
+
next_match = process_matches[i]
|
2144
|
+
next_start, next_end = next_match["start"], next_match["end"]
|
2145
|
+
segment = text[
|
2146
|
+
prev_end:next_end
|
2147
|
+
] # From prev_end to end of next delimiter
|
2148
|
+
if trim:
|
2149
|
+
segment = segment.strip()
|
2150
|
+
if segment or not trim:
|
2151
|
+
if as_dict:
|
2152
|
+
parts.append(
|
2153
|
+
{
|
2154
|
+
"text": segment,
|
2155
|
+
"start": prev_end,
|
2156
|
+
"end": next_end,
|
2157
|
+
"pattern": next_match["pattern"],
|
2158
|
+
**(
|
2159
|
+
{"positions": [(next_start, next_end)]}
|
2160
|
+
if positions
|
2161
|
+
else {}
|
2162
|
+
),
|
2163
|
+
}
|
2164
|
+
)
|
2165
|
+
else:
|
2166
|
+
parts.append(segment)
|
2167
|
+
prev_end = next_end
|
2168
|
+
|
2169
|
+
# Last segment (last delimiter + remaining text)
|
2170
|
+
if process_matches and prev_end < len(text):
|
2171
|
+
last_match = process_matches[-1]
|
2172
|
+
segment = text[
|
2173
|
+
last_match["start"] : len(text)
|
2174
|
+
] # From last delimiter to end
|
2175
|
+
if trim:
|
2176
|
+
segment = segment.strip()
|
2177
|
+
if segment or not trim:
|
2178
|
+
if as_dict:
|
2179
|
+
parts.append(
|
2180
|
+
{
|
2181
|
+
"text": segment,
|
2182
|
+
"start": last_match["start"],
|
2183
|
+
"end": len(text),
|
2184
|
+
"pattern": last_match["pattern"],
|
2185
|
+
**(
|
2186
|
+
{
|
2187
|
+
"positions": [
|
2188
|
+
(last_match["start"], last_match["end"])
|
2189
|
+
]
|
2190
|
+
}
|
2191
|
+
if positions
|
2192
|
+
else {}
|
2193
|
+
),
|
2194
|
+
}
|
2195
|
+
)
|
2196
|
+
else:
|
2197
|
+
parts.append(segment)
|
2198
|
+
|
2199
|
+
return parts
|
2200
|
+
|
2201
|
+
# Original handling for other keep modes
|
2202
|
+
for i, match in enumerate(process_matches):
|
2203
|
+
start, end = match["start"], match["end"]
|
2204
|
+
matched_text = text[start:end]
|
2205
|
+
|
2206
|
+
# Handle text before the match
|
2207
|
+
if prev_end < start:
|
2208
|
+
before = text[prev_end:start]
|
2209
|
+
if trim:
|
2210
|
+
before = before.strip()
|
2211
|
+
if before or not trim:
|
2212
|
+
if as_dict:
|
2213
|
+
parts.append(
|
2214
|
+
{
|
2215
|
+
"text": before,
|
2216
|
+
"start": prev_end,
|
2217
|
+
"end": start,
|
2218
|
+
"pattern": None,
|
2219
|
+
**({"positions": []} if positions else {}),
|
2220
|
+
}
|
2221
|
+
)
|
2222
|
+
else:
|
2223
|
+
parts.append(before)
|
2224
|
+
|
2225
|
+
# Handle the match based on keep mode
|
2226
|
+
if keep == "none":
|
2227
|
+
pass # Skip the delimiter
|
2228
|
+
elif keep == "left":
|
2229
|
+
if parts:
|
2230
|
+
if as_dict:
|
2231
|
+
parts[-1]["text"] += matched_text
|
2232
|
+
parts[-1]["end"] = end
|
2233
|
+
else:
|
2234
|
+
parts[-1] += matched_text
|
2235
|
+
else:
|
2236
|
+
if as_dict:
|
2237
|
+
parts.append(
|
2238
|
+
{
|
2239
|
+
"text": matched_text,
|
2240
|
+
"start": start,
|
2241
|
+
"end": end,
|
2242
|
+
"pattern": match["pattern"],
|
2243
|
+
**({"positions": [(start, end)]} if positions else {}),
|
2244
|
+
}
|
2245
|
+
)
|
2246
|
+
else:
|
2247
|
+
parts.append(matched_text)
|
2248
|
+
elif keep == "right":
|
2249
|
+
if i < len(process_matches) - 1:
|
2250
|
+
next_start = process_matches[i + 1]["start"]
|
2251
|
+
if end < next_start:
|
2252
|
+
between = text[end:next_start]
|
2253
|
+
if as_dict:
|
2254
|
+
parts.append(
|
2255
|
+
{
|
2256
|
+
"text": matched_text + between,
|
2257
|
+
"start": start,
|
2258
|
+
"end": next_start,
|
2259
|
+
"pattern": match["pattern"],
|
2260
|
+
**({"positions": [(start, end)]} if positions else {}),
|
2261
|
+
}
|
2262
|
+
)
|
2263
|
+
else:
|
2264
|
+
parts.append(matched_text + between)
|
2265
|
+
prev_end = next_start
|
2266
|
+
continue
|
2267
|
+
|
2268
|
+
prev_end = end
|
2269
|
+
|
2270
|
+
# Handle remaining text after last match
|
2271
|
+
if prev_end < len(text):
|
2272
|
+
remaining = text[prev_end:]
|
2273
|
+
if trim:
|
2274
|
+
remaining = remaining.strip()
|
2275
|
+
if remaining or not trim:
|
2276
|
+
if keep == "right" and parts and process_matches:
|
2277
|
+
last_match = process_matches[-1]
|
2278
|
+
matched_text = text[last_match["start"] : last_match["end"]]
|
2279
|
+
if as_dict:
|
2280
|
+
parts.append(
|
2281
|
+
{
|
2282
|
+
"text": matched_text + remaining,
|
2283
|
+
"start": last_match["start"],
|
2284
|
+
"end": len(text),
|
2285
|
+
"pattern": last_match["pattern"],
|
2286
|
+
**(
|
2287
|
+
{
|
2288
|
+
"positions": [
|
2289
|
+
(last_match["start"], last_match["end"])
|
2290
|
+
]
|
2291
|
+
}
|
2292
|
+
if positions
|
2293
|
+
else {}
|
2294
|
+
),
|
2295
|
+
}
|
2296
|
+
)
|
2297
|
+
else:
|
2298
|
+
parts.append(matched_text + remaining)
|
2299
|
+
else:
|
2300
|
+
if as_dict:
|
2301
|
+
parts.append(
|
2302
|
+
{
|
2303
|
+
"text": remaining,
|
2304
|
+
"start": prev_end,
|
2305
|
+
"end": len(text),
|
2306
|
+
"pattern": None,
|
2307
|
+
**({"positions": []} if positions else {}),
|
2308
|
+
}
|
2309
|
+
)
|
2310
|
+
else:
|
2311
|
+
parts.append(remaining)
|
2312
|
+
|
2313
|
+
# Filter empty parts if trimming
|
2314
|
+
if trim:
|
2315
|
+
parts = [p for p in parts if (p["text"].strip() if as_dict else p.strip())]
|
2316
|
+
|
2317
|
+
return parts
|
1734
2318
|
|
1735
2319
|
|
2320
|
+
def _merge_parts(
|
2321
|
+
parts: List[Union[str, dict]], text: str, as_dict: bool, trim: bool
|
2322
|
+
) -> Union[str, dict]:
|
2323
|
+
"""Merge adjacent parts for keep=left mode"""
|
2324
|
+
if as_dict:
|
2325
|
+
merged_text = "".join(p["text"] for p in parts)
|
2326
|
+
return {
|
2327
|
+
"text": merged_text.strip() if trim else merged_text,
|
2328
|
+
"start": parts[0]["start"],
|
2329
|
+
"end": parts[-1]["end"],
|
2330
|
+
"patterns": list(set(p["pattern"] for p in parts if p["pattern"])),
|
2331
|
+
}
|
2332
|
+
return "".join(parts).strip() if trim else "".join(parts)
|
2333
|
+
#! ===========extract_text===========
|
2334
|
+
|
1736
2335
|
def strcmp(
|
1737
2336
|
search_term: str,
|
1738
2337
|
candidates: List[str],
|
@@ -2794,73 +3393,6 @@ def text2audio(
|
|
2794
3393
|
|
2795
3394
|
# from datetime import datetime
|
2796
3395
|
from dateutil import parser
|
2797
|
-
# import re
|
2798
|
-
# from typing import Union, Optional, Dict, Any
|
2799
|
-
# def str2time(time_str, fmt="24"):
|
2800
|
-
# """
|
2801
|
-
# Convert a time string into the specified format.
|
2802
|
-
# Parameters:
|
2803
|
-
# - time_str (str): The time string to be converted.
|
2804
|
-
# - fmt (str): The format to convert the time to. Defaults to '%H:%M:%S'.
|
2805
|
-
# Returns:
|
2806
|
-
# %I represents the hour in 12-hour format.
|
2807
|
-
# %H represents the hour in 24-hour format (00 through 23).
|
2808
|
-
# %M represents the minute.
|
2809
|
-
# %S represents the second.
|
2810
|
-
# %p represents AM or PM.
|
2811
|
-
# - str: The converted time string.
|
2812
|
-
# """
|
2813
|
-
# from datetime import datetime
|
2814
|
-
|
2815
|
-
# def time_len_corr(time_str):
|
2816
|
-
# time_str_ = (
|
2817
|
-
# ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
|
2818
|
-
# )
|
2819
|
-
# time_str_split = []
|
2820
|
-
# [time_str_split.append(i) for i in time_str_ if is_num(i)]
|
2821
|
-
# if time_str_split:
|
2822
|
-
# if len(time_str_split) == 2:
|
2823
|
-
# H, M = time_str_split
|
2824
|
-
# time_str_full = H + ":" + M + ":00"
|
2825
|
-
# elif len(time_str_split) == 3:
|
2826
|
-
# H, M, S = time_str_split
|
2827
|
-
# time_str_full = H + ":" + M + ":" + S
|
2828
|
-
# else:
|
2829
|
-
# time_str_full = time_str_
|
2830
|
-
# if "am" in time_str.lower():
|
2831
|
-
# time_str_full += " AM"
|
2832
|
-
# elif "pm" in time_str.lower():
|
2833
|
-
# time_str_full += " PM"
|
2834
|
-
# return time_str_full
|
2835
|
-
|
2836
|
-
# if "12" in fmt:
|
2837
|
-
# fmt = "%I:%M:%S %p"
|
2838
|
-
# elif "24" in fmt:
|
2839
|
-
# fmt = "%H:%M:%S"
|
2840
|
-
|
2841
|
-
# try:
|
2842
|
-
# # Try to parse the time string assuming it could be in 24-hour or 12-hour format
|
2843
|
-
# time_obj = datetime.strptime(time_len_corr(time_str), "%H:%M:%S")
|
2844
|
-
# except ValueError:
|
2845
|
-
# try:
|
2846
|
-
# time_obj = datetime.strptime(time_len_corr(time_str), "%I:%M:%S %p")
|
2847
|
-
# except ValueError as e:
|
2848
|
-
# raise ValueError(f"Unable to parse time string: {time_str}. Error: {e}")
|
2849
|
-
|
2850
|
-
# # Format the time object to the desired output format
|
2851
|
-
# formatted_time = time_obj.strftime(fmt)
|
2852
|
-
# return formatted_time
|
2853
|
-
|
2854
|
-
|
2855
|
-
# # # Example usage:
|
2856
|
-
# # time_str1 = "14:30:45"
|
2857
|
-
# # time_str2 = "02:30:45 PM"
|
2858
|
-
|
2859
|
-
# # formatted_time1 = str2time(time_str1, fmt='12') # Convert to 12-hour format
|
2860
|
-
# # formatted_time2 = str2time(time_str2, fmt='24') # Convert to 24-hour format
|
2861
|
-
|
2862
|
-
# # print(formatted_time1) # Output: 02:30:45 PM
|
2863
|
-
# # print(formatted_time2) # Output: 14:30:45
|
2864
3396
|
def str2time(
|
2865
3397
|
time_str: str,
|
2866
3398
|
fmt: str = "24",
|
@@ -2964,57 +3496,6 @@ def str2time(
|
|
2964
3496
|
raise ValueError(f"Unable to parse time string: '{time_str}'. Error: {e}")
|
2965
3497
|
return default
|
2966
3498
|
|
2967
|
-
|
2968
|
-
# def str2date(date_str, original_fmt=None, fmt="%Y-%m-%d"):
|
2969
|
-
# """
|
2970
|
-
# Convert a date string to the desired format and extract components if needed.
|
2971
|
-
# Usage:
|
2972
|
-
# str2date(x, fmt="%d.%m.%y",original_fmt="%d.%m.%y")
|
2973
|
-
# Parameters:
|
2974
|
-
# - date_str (str): The input date string.
|
2975
|
-
# - original_fmt (str, optional): The original format of the date string. If not provided, it will be auto-detected.
|
2976
|
-
# - fmt (str): The desired format for the output date string. Defaults to '%Y-%m-%d'.
|
2977
|
-
|
2978
|
-
# Returns:
|
2979
|
-
# - dict: A dictionary containing the converted date string and its components (year, month, day).
|
2980
|
-
|
2981
|
-
# Raises:
|
2982
|
-
# - ValueError: If the date cannot be parsed.
|
2983
|
-
# """
|
2984
|
-
# from dateutil import parser
|
2985
|
-
# try:
|
2986
|
-
# if not isinstance(date_str,str):
|
2987
|
-
# date_str=str(date_str)
|
2988
|
-
# # Parse the date using the provided original format or auto-detect
|
2989
|
-
# if original_fmt:
|
2990
|
-
# try:
|
2991
|
-
# date_obj = datetime.strptime(date_str, original_fmt)
|
2992
|
-
# except Exception as e:
|
2993
|
-
# print(e)
|
2994
|
-
# date_obj=None
|
2995
|
-
# else:
|
2996
|
-
# try:
|
2997
|
-
# date_obj = parser.parse(date_str)
|
2998
|
-
# except Exception as e:
|
2999
|
-
# print(e)
|
3000
|
-
# date_obj=None
|
3001
|
-
# # Return formatted string if `fmt` is specified, otherwise return the datetime object
|
3002
|
-
# if date_obj is not None:
|
3003
|
-
# if fmt:
|
3004
|
-
# date_obj=date_obj.strftime(fmt)
|
3005
|
-
# else:
|
3006
|
-
# date_obj=date_str
|
3007
|
-
# return date_obj
|
3008
|
-
|
3009
|
-
# except (ValueError, TypeError) as e:
|
3010
|
-
# raise ValueError(f"Unable to process date string: '{date_str}'. Error: {e}")
|
3011
|
-
|
3012
|
-
|
3013
|
-
# # str1=str2date(num2str(20240625),fmt="%a %d-%B-%Y")
|
3014
|
-
# # print(str1)
|
3015
|
-
# # str2=str2num(str2date(str1,fmt='%a %Y%m%d'))
|
3016
|
-
# # print(str2)
|
3017
|
-
|
3018
3499
|
def str2date(
|
3019
3500
|
date_str: Union[str, int, float],
|
3020
3501
|
fmt: Optional[str] = "%Y-%m-%d",
|
@@ -4054,8 +4535,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
|
|
4054
4535
|
|
4055
4536
|
|
4056
4537
|
def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_results: bool = True, **kws):
|
4057
|
-
"""
|
4058
|
-
# Determines the splitting strategy:
|
4538
|
+
"""# Determines the splitting strategy:
|
4059
4539
|
# - "space", "whitespace", "sp": split by whitespace (default)
|
4060
4540
|
# - "word": split into words using NLTK's word_tokenize
|
4061
4541
|
# - "sentence", "sent": split into sentences using NLTK's sent_tokenize
|
@@ -4172,13 +4652,6 @@ def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_re
|
|
4172
4652
|
|
4173
4653
|
def split_by_regex_end(text, pattern):
|
4174
4654
|
return re.split(f"(?={pattern})", text)
|
4175
|
-
|
4176
|
-
# def split_by_sentence_endings(text):
|
4177
|
-
# return re.split(r"(?<=[.!?])", text)
|
4178
|
-
# def split_non_ascii(text):
|
4179
|
-
# # return re.split(r"([^\x00-\x7F\w\s,.!?:\"'()\-]+)", text)
|
4180
|
-
# # return re.split(r"[^\x00-\x7F]+", text)
|
4181
|
-
# return re.split(r"([^\x00-\x7F]+)", text)
|
4182
4655
|
def split_non_ascii(text, keep_delimiters=False):
|
4183
4656
|
"""
|
4184
4657
|
Split text at non-ASCII characters.
|
@@ -4903,145 +5376,6 @@ def _backup_validations(sheet, verbose=False):
|
|
4903
5376
|
|
4904
5377
|
return backup
|
4905
5378
|
|
4906
|
-
# def _backup_validations(sheet):
|
4907
|
-
# """
|
4908
|
-
# Complete validation backup with XML-level cross-sheet detection
|
4909
|
-
# """
|
4910
|
-
# from openpyxl.utils import get_column_letter
|
4911
|
-
# import re
|
4912
|
-
# from openpyxl.worksheet.datavalidation import DataValidation
|
4913
|
-
# from openpyxl.xml.functions import fromstring
|
4914
|
-
|
4915
|
-
# backup = {
|
4916
|
-
# "validations": [],
|
4917
|
-
# "conditional_formatting": [],
|
4918
|
-
# "merged_cells": [str(mr) for mr in sheet.merged_cells.ranges],
|
4919
|
-
# "_metadata": {
|
4920
|
-
# "validated_cells": set(),
|
4921
|
-
# "validated_columns": set(),
|
4922
|
-
# "validation_types": set(),
|
4923
|
-
# "cross_sheet_validations": set()
|
4924
|
-
# }
|
4925
|
-
# }
|
4926
|
-
|
4927
|
-
# # METHOD 1: Primary validation backup (standard method)
|
4928
|
-
# for dv in sheet.data_validations:
|
4929
|
-
# # ... (existing standard validation backup code) ...
|
4930
|
-
|
4931
|
-
# # METHOD 2: XML-based cross-sheet validation detection
|
4932
|
-
# print("Performing deep XML scan for cross-sheet validations...")
|
4933
|
-
|
4934
|
-
# # Access the worksheet XML directly
|
4935
|
-
# xml_source = sheet._worksheet.xml
|
4936
|
-
# if not xml_source:
|
4937
|
-
# print("Warning: Could not access worksheet XML source")
|
4938
|
-
# return backup
|
4939
|
-
|
4940
|
-
# try:
|
4941
|
-
# # Parse the XML
|
4942
|
-
# root = fromstring(xml_source)
|
4943
|
-
# ns = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
4944
|
-
|
4945
|
-
# # Find all dataValidation elements
|
4946
|
-
# for dv_xml in root.findall('.//ns:dataValidation', ns):
|
4947
|
-
# try:
|
4948
|
-
# # Extract validation attributes
|
4949
|
-
# dv_type = dv_xml.get('type', 'none')
|
4950
|
-
# formula1 = dv_xml.find('.//ns:formula1', ns)
|
4951
|
-
# formula_text = formula1.text if formula1 is not None else None
|
4952
|
-
|
4953
|
-
# # Skip if not a list type or no formula
|
4954
|
-
# if dv_type != 'list' or not formula_text:
|
4955
|
-
# continue
|
4956
|
-
|
4957
|
-
# # Clean the formula
|
4958
|
-
# clean_formula = formula_text.strip('"\'')
|
4959
|
-
|
4960
|
-
# # Check for cross-sheet patterns
|
4961
|
-
# cross_sheet_patterns = [
|
4962
|
-
# (r'^[\w\s]+!\$?[A-Za-z]+\$?\d+(?::\$?[A-Za-z]+\$?\d+)?$', "direct sheet reference"),
|
4963
|
-
# (r'INDIRECT\(["\'][\w\s]+![A-Za-z]+\d+(?::[A-Za-z]+\d+)?["\']\)', "INDIRECT sheet reference"),
|
4964
|
-
# (r'^[^\s!]+$', "potential named range"),
|
4965
|
-
# ]
|
4966
|
-
|
4967
|
-
# # Determine if this is a cross-sheet reference
|
4968
|
-
# is_cross_sheet = False
|
4969
|
-
# detection_method = ""
|
4970
|
-
|
4971
|
-
# for pattern, description in cross_sheet_patterns:
|
4972
|
-
# if re.match(pattern, clean_formula, re.IGNORECASE):
|
4973
|
-
# is_cross_sheet = True
|
4974
|
-
# detection_method = description
|
4975
|
-
# break
|
4976
|
-
|
4977
|
-
# if not is_cross_sheet:
|
4978
|
-
# continue
|
4979
|
-
|
4980
|
-
# # Process the ranges
|
4981
|
-
# ranges = []
|
4982
|
-
# sqref = dv_xml.get('sqref', '')
|
4983
|
-
# for range_str in sqref.split():
|
4984
|
-
# try:
|
4985
|
-
# # Convert range to coordinates
|
4986
|
-
# if ':' in range_str:
|
4987
|
-
# start, end = range_str.split(':')
|
4988
|
-
# col_start = int(''.join(filter(str.isdigit, start)))
|
4989
|
-
# col_end = int(''.join(filter(str.isdigit, end)))
|
4990
|
-
# row_start = int(''.join(filter(str.isalpha, start)))
|
4991
|
-
# row_end = int(''.join(filter(str.isalpha, end)))
|
4992
|
-
# ranges.append({
|
4993
|
-
# 'range': range_str,
|
4994
|
-
# 'cells': [f"{get_column_letter(col)}{row}"
|
4995
|
-
# for col in range(col_start, col_end+1)
|
4996
|
-
# for row in range(row_start, row_end+1)]
|
4997
|
-
# })
|
4998
|
-
# else:
|
4999
|
-
# col = int(''.join(filter(str.isdigit, range_str)))
|
5000
|
-
# row = int(''.join(filter(str.isalpha, range_str)))
|
5001
|
-
# ranges.append({
|
5002
|
-
# 'range': range_str,
|
5003
|
-
# 'cells': [f"{get_column_letter(col)}{row}"]
|
5004
|
-
# })
|
5005
|
-
# except Exception as e:
|
5006
|
-
# print(f"Error parsing range {range_str}: {e}")
|
5007
|
-
|
5008
|
-
# # Create validation record
|
5009
|
-
# validation_data = {
|
5010
|
-
# 'type': 'list',
|
5011
|
-
# 'formula1': formula_text,
|
5012
|
-
# 'formula2': None,
|
5013
|
-
# 'allow_blank': dv_xml.get('allowBlank', '1') == '1',
|
5014
|
-
# 'showDropDown': dv_xml.get('showDropDown', '1') == '1',
|
5015
|
-
# 'showInputMessage': dv_xml.get('showInputMessage', '1') == '1',
|
5016
|
-
# 'showErrorMessage': dv_xml.get('showErrorMessage', '0') == '1',
|
5017
|
-
# 'errorTitle': dv_xml.get('errorTitle', ''),
|
5018
|
-
# 'error': dv_xml.get('error', ''),
|
5019
|
-
# 'promptTitle': dv_xml.get('promptTitle', ''),
|
5020
|
-
# 'prompt': dv_xml.get('prompt', ''),
|
5021
|
-
# 'ranges': ranges,
|
5022
|
-
# '_source': 'xml_validation',
|
5023
|
-
# '_detection_method': detection_method,
|
5024
|
-
# '_is_cross_sheet': True,
|
5025
|
-
# '_formula_clean': clean_formula
|
5026
|
-
# }
|
5027
|
-
|
5028
|
-
# # Add to backup
|
5029
|
-
# backup['validations'].append(validation_data)
|
5030
|
-
# for rng in ranges:
|
5031
|
-
# for cell_ref in rng['cells']:
|
5032
|
-
# backup['_metadata']['validated_cells'].add(cell_ref)
|
5033
|
-
# backup['_metadata']['validated_columns'].add(''.join(filter(str.isalpha, cell_ref)))
|
5034
|
-
# backup['_metadata']['validation_types'].add('list')
|
5035
|
-
# backup['_metadata']['cross_sheet_validations'].add(clean_formula.split('!')[0])
|
5036
|
-
|
5037
|
-
# except Exception as e:
|
5038
|
-
# print(f"Error processing XML validation: {e}")
|
5039
|
-
|
5040
|
-
# except Exception as e:
|
5041
|
-
# print(f"Error parsing worksheet XML: {e}")
|
5042
|
-
|
5043
|
-
# return backup
|
5044
|
-
|
5045
5379
|
def _restore_validations(sheet, backup,verbose=False):
|
5046
5380
|
"""
|
5047
5381
|
恢复数据验证和条件格式规则到工作表
|
@@ -5247,11 +5581,6 @@ def fload(fpath, kind=None, **kwargs):
|
|
5247
5581
|
with open(fpath, "r") as file:
|
5248
5582
|
content = file.read()
|
5249
5583
|
return content
|
5250
|
-
|
5251
|
-
# def load_html(fpath):
|
5252
|
-
# with open(fpath, "r") as file:
|
5253
|
-
# content = file.read()
|
5254
|
-
# return content
|
5255
5584
|
def load_html(fpath, **kwargs):
|
5256
5585
|
return pd.read_html(fpath, **kwargs)
|
5257
5586
|
|
@@ -5570,7 +5899,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
5570
5899
|
if output in ["dataframe", "df"]:
|
5571
5900
|
if verbose:
|
5572
5901
|
print("loading data as a DataFrame")
|
5573
|
-
if not password:
|
5902
|
+
if not bool(password):
|
5574
5903
|
if verbose:
|
5575
5904
|
print("Reading Excel without password protection...")
|
5576
5905
|
df = pd.read_excel(fpath, engine=engine, sheet_name=sheet_name, **kwargs)
|
@@ -6518,27 +6847,6 @@ def fsave(
|
|
6518
6847
|
print(
|
6519
6848
|
f"Error:\n{kind} is not in the supported list ['docx', 'txt', 'md', 'html', 'pdf', 'csv', 'xlsx', 'json', 'xml', 'yaml']"
|
6520
6849
|
)
|
6521
|
-
|
6522
|
-
|
6523
|
-
# # Example usage
|
6524
|
-
# text_content = ["Hello, this is a sample text file.", "This is the second paragraph."]
|
6525
|
-
# tabular_content = {"Name": ["Alice", "Bob"], "Age": [24, 30]}
|
6526
|
-
# json_content = {"name": "Alice", "age": 24}
|
6527
|
-
# yaml_content = {"Name": "Alice", "Age": 24}
|
6528
|
-
# xml_content = {"Name": "Alice", "Age": 24}
|
6529
|
-
# dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
|
6530
|
-
# fsave(dir_save + "sample.txt", text_content)
|
6531
|
-
# fsave(dir_save + "sample.md", text_content)
|
6532
|
-
# fsave(dir_save + "sample.html", text_content)
|
6533
|
-
# fsave(dir_save + "sample.pdf", text_content)
|
6534
|
-
# fsave(dir_save + "sample.docx", text_content)
|
6535
|
-
# fsave(dir_save + "sample.csv", tabular_content, index=False)
|
6536
|
-
# fsave(dir_save + "sample.xlsx", tabular_content, sheet_name="Sheet1", index=False)
|
6537
|
-
# fsave(dir_save + "sample.json", json_content, indent=4)
|
6538
|
-
# fsave(dir_save + "sample.yaml", yaml_content)
|
6539
|
-
# fsave(dir_save + "sample.xml", xml_content)
|
6540
|
-
|
6541
|
-
|
6542
6850
|
def addpath(fpath):
|
6543
6851
|
sys.path.insert(0, dir)
|
6544
6852
|
|
@@ -7118,7 +7426,7 @@ def listdir(
|
|
7118
7426
|
hidden=False, # Include hidden files/folders
|
7119
7427
|
orient="list",
|
7120
7428
|
output="df", # "df", 'list','dict','records','index','series'
|
7121
|
-
verbose=
|
7429
|
+
verbose=False,
|
7122
7430
|
):
|
7123
7431
|
def is_hidden(filepath):
|
7124
7432
|
"""Check if a file or folder is hidden."""
|
@@ -7348,7 +7656,7 @@ def listdir(
|
|
7348
7656
|
if "se" in orient.lower(): # records
|
7349
7657
|
return Box(f.to_dict(orient="series"))
|
7350
7658
|
|
7351
|
-
|
7659
|
+
|
7352
7660
|
def listpkg(where="env", verbose=False):
|
7353
7661
|
"""list all pacakages"""
|
7354
7662
|
|
@@ -7829,87 +8137,7 @@ def split_path(fpath):
|
|
7829
8137
|
dir_par = f_slash.join(fpath.split(f_slash)[:-1])
|
7830
8138
|
dir_ch = "".join(fpath.split(f_slash)[-1:])
|
7831
8139
|
return dir_par, dir_ch
|
7832
|
-
|
7833
|
-
|
7834
|
-
def figsave(*args, dpi=300, **kwargs):
|
7835
|
-
import matplotlib.pyplot as plt
|
7836
|
-
from PIL import Image
|
7837
|
-
bbox_inches = kwargs.pop("bbox_inches", "tight")
|
7838
|
-
pad_inches = kwargs.pop("pad_inches", 0)
|
7839
|
-
facecolor = kwargs.pop("facecolor", "white")
|
7840
|
-
edgecolor = kwargs.pop("edgecolor", "auto")
|
7841
|
-
|
7842
|
-
dir_save = None
|
7843
|
-
fname = None
|
7844
|
-
img = None
|
7845
|
-
|
7846
|
-
for arg in args:
|
7847
|
-
if isinstance(arg, str):
|
7848
|
-
path = Path(arg)
|
7849
|
-
if path.suffix: # Has file extension
|
7850
|
-
fname = path.name
|
7851
|
-
dir_save = path.parent
|
7852
|
-
else:
|
7853
|
-
dir_save = path
|
7854
|
-
elif isinstance(arg, (Image.Image, np.ndarray)):
|
7855
|
-
img = arg # Store PIL image or numpy array
|
7856
|
-
|
7857
|
-
# Set default save directory
|
7858
|
-
dir_save = Path(dir_save) if dir_save else Path(".")
|
7859
|
-
dir_save.mkdir(parents=True, exist_ok=True)
|
7860
|
-
|
7861
|
-
# Handle filename and extension
|
7862
|
-
if fname is None:
|
7863
|
-
fname = "figure"
|
7864
|
-
fname = dir_save / fname
|
7865
|
-
if fname.suffix == "":
|
7866
|
-
fname = fname.with_suffix(".pdf") # Default format
|
7867
|
-
|
7868
|
-
ftype = fname.suffix.lstrip(".").lower()
|
7869
|
-
|
7870
|
-
# Save figure based on file type
|
7871
|
-
if ftype == "eps":
|
7872
|
-
plt.savefig(fname, format="eps", bbox_inches=bbox_inches)
|
7873
|
-
plt.savefig(fname.with_suffix(".pdf"), format="pdf", dpi=dpi,
|
7874
|
-
pad_inches=pad_inches, bbox_inches=bbox_inches,
|
7875
|
-
facecolor=facecolor, edgecolor=edgecolor)
|
7876
|
-
elif ftype == "pdf":
|
7877
|
-
plt.savefig(fname, format="pdf", dpi=dpi, pad_inches=pad_inches,
|
7878
|
-
bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
|
7879
|
-
elif ftype in ["jpg", "jpeg", "png", "tiff", "tif"]:
|
7880
|
-
if img is not None: # If an image is provided
|
7881
|
-
if isinstance(img, Image.Image):
|
7882
|
-
img = img.convert("RGB") if img.mode == "RGBA" else img
|
7883
|
-
img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
|
7884
|
-
elif isinstance(img, np.ndarray):
|
7885
|
-
import cv2
|
7886
|
-
if img.ndim == 2:
|
7887
|
-
Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
|
7888
|
-
elif img.ndim == 3:
|
7889
|
-
if img.shape[2] == 3:
|
7890
|
-
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
7891
|
-
elif img.shape[2] == 4:
|
7892
|
-
img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
|
7893
|
-
Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
|
7894
|
-
else:
|
7895
|
-
raise ValueError("Unexpected image dimensions.")
|
7896
|
-
else:
|
7897
|
-
plt.savefig(fname, format=ftype, dpi=dpi, pad_inches=pad_inches,
|
7898
|
-
bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
|
7899
|
-
elif ftype == "ico":
|
7900
|
-
if img is None:
|
7901
|
-
plt.savefig(fname, dpi=dpi, pad_inches=pad_inches,
|
7902
|
-
bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
|
7903
|
-
img = Image.open(fname)
|
7904
|
-
img = img.convert("RGBA")
|
7905
|
-
icon_sizes = [(32, 32), (64, 64), (128, 128), (256, 256)]
|
7906
|
-
img.save(fname, format="ICO", sizes=icon_sizes)
|
7907
|
-
print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
|
7908
|
-
else:
|
7909
|
-
raise ValueError(f"Unsupported file format: {ftype}")
|
7910
|
-
|
7911
|
-
print(f"\nSaved @ {fname} (dpi={dpi})")
|
7912
|
-
|
8140
|
+
|
7913
8141
|
def figsave(*args, dpi=300, **kwargs):
|
7914
8142
|
"""
|
7915
8143
|
Save a Matplotlib figure or image file in various formats.
|
@@ -8038,7 +8266,7 @@ def figsave(*args, dpi=300, **kwargs):
|
|
8038
8266
|
img = img.convert("RGBA")
|
8039
8267
|
img.save(fname, format="ICO", sizes=icon_sizes)
|
8040
8268
|
print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
|
8041
|
-
print(f"\
|
8269
|
+
print(f"\nSaved @: dpi={dpi}\n{fname}")
|
8042
8270
|
|
8043
8271
|
|
8044
8272
|
def is_str_color(s):
|
@@ -8806,7 +9034,8 @@ def detect_angle(image, by="median", template=None):
|
|
8806
9034
|
|
8807
9035
|
# Use Hough transform to detect lines
|
8808
9036
|
lines = transform.probabilistic_hough_line(edges)
|
8809
|
-
|
9037
|
+
if isinstance(by, bool):
|
9038
|
+
by="mean" if by else 0
|
8810
9039
|
if not lines and any(["me" in by, "pca" in by]):
|
8811
9040
|
print("No lines detected. Adjust the edge detection parameters.")
|
8812
9041
|
return 0
|
@@ -9180,7 +9409,7 @@ def imgsets(
|
|
9180
9409
|
elif "cro" in k.lower() or "cut" in k.lower():
|
9181
9410
|
img_update = img_update.crop(value)
|
9182
9411
|
elif "rota" in k.lower():
|
9183
|
-
if isinstance(value, str):
|
9412
|
+
if isinstance(value, (str,bool)):
|
9184
9413
|
value = detect_angle(img_update, by=value)
|
9185
9414
|
print(f"rotated by {value}°")
|
9186
9415
|
img_update = img_update.rotate(value)
|
@@ -9371,12 +9600,6 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
|
|
9371
9600
|
else:
|
9372
9601
|
figsave(dirname(dir_save), fname)
|
9373
9602
|
|
9374
|
-
|
9375
|
-
# usage:
|
9376
|
-
# fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
|
9377
|
-
# thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
|
9378
|
-
|
9379
|
-
|
9380
9603
|
# search and fine the director of the libary, which installed at local
|
9381
9604
|
def dir_lib(lib_oi):
|
9382
9605
|
"""
|
@@ -9524,35 +9747,276 @@ def finfo(fpath, output='json', verbose=False):
|
|
9524
9747
|
extra_info=extra_info,
|
9525
9748
|
)
|
9526
9749
|
|
9527
|
-
|
9528
|
-
|
9529
|
-
|
9530
|
-
|
9531
|
-
|
9532
|
-
def hex2argb(color):
|
9750
|
+
def color2rgb(
|
9751
|
+
color_input: str | tuple | list | None,
|
9752
|
+
alpha: float | None = None
|
9753
|
+
) -> tuple | None:
|
9533
9754
|
"""
|
9534
|
-
|
9535
|
-
|
9536
|
-
:
|
9537
|
-
|
9538
|
-
|
9539
|
-
|
9540
|
-
|
9541
|
-
|
9542
|
-
|
9543
|
-
|
9755
|
+
Ultimate color conversion utility with support for multiple formats and transparency.
|
9756
|
+
|
9757
|
+
Parameters:
|
9758
|
+
-----------
|
9759
|
+
color_input : str | tuple | list | None
|
9760
|
+
Supported formats:
|
9761
|
+
- Hex strings ("#RRGGBB", "#RGB")
|
9762
|
+
- Named colors ("red", "blue")
|
9763
|
+
- RGB tuples ((0.2, 0.4, 0.6))
|
9764
|
+
- RGBA tuples ((0.2, 0.4, 0.6, 0.8))
|
9765
|
+
- HTML/CSS colors ("cornflowerblue")
|
9766
|
+
- CSS formats:
|
9767
|
+
- rgb(100,200,50)
|
9768
|
+
- rgba(100,200,50,0.8)
|
9769
|
+
- hsl(120,60%,70%)
|
9770
|
+
- hsla(120,60%,70%,0.8)
|
9771
|
+
alpha : float | None, optional
|
9772
|
+
Opacity value (0.0-1.0). If provided, adds/overrides alpha channel.
|
9773
|
+
|
9774
|
+
Returns:
|
9775
|
+
--------
|
9776
|
+
tuple | None
|
9777
|
+
(R, G, B) or (R, G, B, A) tuple in 0-1 range, or None if invalid
|
9544
9778
|
"""
|
9545
|
-
import
|
9779
|
+
from matplotlib import colors as mcolors
|
9546
9780
|
import re
|
9547
|
-
|
9548
|
-
|
9781
|
+
|
9782
|
+
if color_input is None:
|
9783
|
+
return None
|
9784
|
+
|
9785
|
+
# Case 1: Already in RGB/RGBA tuple format
|
9786
|
+
if isinstance(color_input, (tuple, list)):
|
9787
|
+
if 3 <= len(color_input) <= 4:
|
9788
|
+
if all(0 <= x <= 1 for x in color_input):
|
9789
|
+
if alpha is not None and len(color_input) == 3:
|
9790
|
+
return (*color_input, alpha)
|
9791
|
+
return tuple(color_input)
|
9792
|
+
|
9793
|
+
# Case 2: String input
|
9794
|
+
if isinstance(color_input, str):
|
9795
|
+
# Remove whitespace and make lowercase
|
9796
|
+
color_str = color_input.strip().lower()
|
9797
|
+
|
9798
|
+
# Handle CSS rgb/rgba format
|
9799
|
+
if color_str.startswith(('rgb(', 'rgba(')):
|
9800
|
+
try:
|
9801
|
+
nums = list(map(float, re.findall(r"[\d.]+", color_str)))
|
9802
|
+
if 3 <= len(nums) <= 4:
|
9803
|
+
rgb = tuple(x/255 if i < 3 else x for i, x in enumerate(nums))
|
9804
|
+
if alpha is not None:
|
9805
|
+
return (*rgb[:3], alpha)
|
9806
|
+
return rgb[:4] if len(rgb) == 4 else rgb[:3]
|
9807
|
+
except:
|
9808
|
+
pass
|
9809
|
+
|
9810
|
+
# Handle CSS hsl/hsla format
|
9811
|
+
elif color_str.startswith(('hsl(', 'hsla(')):
|
9812
|
+
try:
|
9813
|
+
nums = list(map(float, re.findall(r"[\d.]+", color_str)))
|
9814
|
+
if 3 <= len(nums) <= 4:
|
9815
|
+
h, s, l = nums[0]/360, nums[1]/100, nums[2]/100
|
9816
|
+
rgb = mcolors.hsv_to_rgb((h, s, l))
|
9817
|
+
if len(nums) == 4:
|
9818
|
+
rgb += (nums[3],)
|
9819
|
+
if alpha is not None:
|
9820
|
+
return (*rgb[:3], alpha)
|
9821
|
+
return rgb[:4] if len(rgb) == 4 else rgb[:3]
|
9822
|
+
except:
|
9823
|
+
pass
|
9824
|
+
|
9825
|
+
# Standard hex/named color processing
|
9826
|
+
try:
|
9827
|
+
rgb = mcolors.to_rgba(color_str)
|
9828
|
+
if alpha is not None:
|
9829
|
+
return (*rgb[:3], alpha)
|
9830
|
+
return rgb if len(rgb) == 4 and rgb[3] != 1 else rgb[:3]
|
9831
|
+
except ValueError:
|
9832
|
+
pass
|
9833
|
+
|
9834
|
+
# Fallback for invalid colors
|
9835
|
+
print(f"Warning: Invalid color format '{color_input}'")
|
9836
|
+
return None
|
9837
|
+
|
9838
|
+
def color2hex(
|
9839
|
+
color_input: str | tuple | list | dict | int | None,
|
9840
|
+
keep_alpha: bool = False,
|
9841
|
+
force_long: bool = False,
|
9842
|
+
uppercase: bool = False,
|
9843
|
+
prefix: str = "#",
|
9844
|
+
allow_short: bool = True
|
9845
|
+
) -> str | None:
|
9846
|
+
"""
|
9847
|
+
Ultimate color to hex converter with comprehensive format support.
|
9848
|
+
|
9849
|
+
Parameters:
|
9850
|
+
-----------
|
9851
|
+
color_input : str | tuple | list | dict | int | None
|
9852
|
+
Input color in any of these formats:
|
9853
|
+
- Hex strings ("#RRGGBB", "#RGB", "RRGGBB", "RGB")
|
9854
|
+
- Named colors ("red", "blue", "transparent")
|
9855
|
+
- RGB/RGBA tuples ((0.2, 0.4, 0.6), (255, 0, 0), (100, 100, 100, 0.5))
|
9856
|
+
- CSS formats:
|
9857
|
+
- rgb(100,200,50)
|
9858
|
+
- rgba(100,200,50,0.8)
|
9859
|
+
- hsl(120,60%,70%)
|
9860
|
+
- hsla(120,60%,70%,0.8)
|
9861
|
+
- Integer RGB (0xFF0000 for red)
|
9862
|
+
- Dictionary {"r": 255, "g": 0, "b": 0} or {"h": 0, "s": 100, "l": 50}
|
9863
|
+
keep_alpha : bool, optional
|
9864
|
+
Whether to include alpha channel in hex format (#RRGGBBAA)
|
9865
|
+
force_long : bool, optional
|
9866
|
+
Force 6/8-digit hex even when 3/4-digit would be possible
|
9867
|
+
uppercase : bool, optional
|
9868
|
+
Use uppercase hex characters (False for lowercase)
|
9869
|
+
prefix : str, optional
|
9870
|
+
Prefix for hex string ("#" for CSS, "0x" for programming, "" for raw)
|
9871
|
+
allow_short : bool, optional
|
9872
|
+
Allow shortened 3/4-digit hex when possible
|
9873
|
+
|
9874
|
+
Returns:
|
9875
|
+
--------
|
9876
|
+
str | None
|
9877
|
+
Hex color string or None if invalid
|
9878
|
+
|
9879
|
+
Examples:
|
9880
|
+
---------
|
9881
|
+
>>> color2hex((0.5, 0.2, 0.8)) → "#7f33cc"
|
9882
|
+
>>> color2hex("rgb(127, 51, 204)") → "#7f33cc"
|
9883
|
+
>>> color2hex((0.2, 0.4, 0.6, 0.8), True) → "#336699cc"
|
9884
|
+
>>> color2hex(0xFF0000, uppercase=True) → "#FF0000"
|
9885
|
+
>>> color2hex({"r": 255, "g": 165, "b": 0}, prefix="") → "ffa500"
|
9886
|
+
>>> color2hex("hsl(120, 100%, 50%)") → "#00ff00"
|
9887
|
+
"""
|
9888
|
+
from matplotlib import colors as mcolors
|
9889
|
+
import re
|
9890
|
+
|
9891
|
+
def to_rgba(color) -> tuple | None:
|
9892
|
+
"""Internal conversion to RGBA tuple"""
|
9893
|
+
# Handle None
|
9894
|
+
if color is None:
|
9895
|
+
return None
|
9896
|
+
|
9897
|
+
# Handle integer RGB
|
9898
|
+
if isinstance(color, int):
|
9899
|
+
if color < 0:
|
9900
|
+
return None
|
9901
|
+
return (
|
9902
|
+
(color >> 16) & 0xFF,
|
9903
|
+
(color >> 8) & 0xFF,
|
9904
|
+
color & 0xFF,
|
9905
|
+
255
|
9906
|
+
)
|
9907
|
+
|
9908
|
+
# Handle dictionary formats
|
9909
|
+
if isinstance(color, dict):
|
9910
|
+
keys = set(color.keys())
|
9911
|
+
if {'r','g','b'}.issubset(keys):
|
9912
|
+
return (
|
9913
|
+
color['r'] / 255 if color['r'] > 1 else color['r'],
|
9914
|
+
color['g'] / 255 if color['g'] > 1 else color['g'],
|
9915
|
+
color['b'] / 255 if color['b'] > 1 else color['b'],
|
9916
|
+
color.get('a', 1.0)
|
9917
|
+
)
|
9918
|
+
elif {'h','s','l'}.issubset(keys):
|
9919
|
+
return mcolors.hsv_to_rgb((
|
9920
|
+
color['h'] / 360,
|
9921
|
+
color['s'] / 100,
|
9922
|
+
color['l'] / 100
|
9923
|
+
)) + (color.get('a', 1.0),)
|
9924
|
+
return None
|
9925
|
+
|
9926
|
+
# Handle string formats
|
9927
|
+
if isinstance(color, str):
|
9928
|
+
color = color.strip().lower()
|
9929
|
+
|
9930
|
+
# Handle hex without prefix
|
9931
|
+
if re.match(r'^[0-9a-f]{3,8}$', color):
|
9932
|
+
return mcolors.to_rgba(f"#{color}")
|
9933
|
+
|
9934
|
+
# Handle CSS functions
|
9935
|
+
if color.startswith(('rgb(', 'rgba(', 'hsl(', 'hsla(')):
|
9936
|
+
try:
|
9937
|
+
return mcolors.to_rgba(color)
|
9938
|
+
except ValueError:
|
9939
|
+
return None
|
9940
|
+
|
9941
|
+
# Handle named colors (including 'transparent')
|
9942
|
+
try:
|
9943
|
+
return mcolors.to_rgba(color)
|
9944
|
+
except ValueError:
|
9945
|
+
return None
|
9946
|
+
|
9947
|
+
# Handle tuple/list formats
|
9948
|
+
if isinstance(color, (tuple, list)):
|
9949
|
+
if len(color) in (3, 4):
|
9950
|
+
# Normalize values
|
9951
|
+
normalized = []
|
9952
|
+
for i, v in enumerate(color):
|
9953
|
+
if i < 3: # RGB channels
|
9954
|
+
if isinstance(v, int):
|
9955
|
+
normalized.append(v / 255 if v > 1 else v)
|
9956
|
+
else:
|
9957
|
+
normalized.append(float(v))
|
9958
|
+
else: # Alpha channel
|
9959
|
+
normalized.append(float(v))
|
9960
|
+
return tuple(normalized)
|
9961
|
+
|
9962
|
+
return None
|
9963
|
+
|
9964
|
+
# Convert input to RGBA
|
9965
|
+
rgba = to_rgba(color_input)
|
9966
|
+
if rgba is None:
|
9967
|
+
return None
|
9968
|
+
|
9969
|
+
# Extract components
|
9970
|
+
components = []
|
9971
|
+
for i, c in enumerate(rgba):
|
9972
|
+
if i == 3 and not keep_alpha:
|
9973
|
+
break
|
9974
|
+
components.append(round(c * 255 if c <= 1 else c))
|
9975
|
+
|
9976
|
+
# Determine if we can use short format
|
9977
|
+
use_short = (allow_short and
|
9978
|
+
not force_long and
|
9979
|
+
len(components) in (3, 4) and
|
9980
|
+
all((x % 17 == 0) for x in components[:3]))
|
9981
|
+
|
9982
|
+
# Format the hex string
|
9983
|
+
if use_short:
|
9984
|
+
short_components = [x//17 for x in components[:3]] + components[3:]
|
9985
|
+
hex_str = "".join(f"{x:1x}" for x in short_components)
|
9986
|
+
else:
|
9987
|
+
hex_str = "".join(f"{x:02x}" for x in components)
|
9988
|
+
|
9989
|
+
# Apply case and prefix
|
9990
|
+
if uppercase:
|
9991
|
+
hex_str = hex_str.upper()
|
9992
|
+
|
9993
|
+
return f"{prefix}{hex_str}"
|
9994
|
+
# ! format excel file
|
9995
|
+
|
9996
|
+
def hex2argb(color):
|
9997
|
+
"""
|
9998
|
+
Convert a color name or hex code to the ARGB format required by openpyxl.
|
9999
|
+
|
10000
|
+
:param color: A color in the format: 'blue', '#RRGGBB', 'RRGGBB', 'AARRGGBB'
|
10001
|
+
:return: A hex color code in the format AARRGGBB.
|
10002
|
+
|
10003
|
+
Example:
|
10004
|
+
print(hex2argb("blue")) # Output: FF0000FF
|
10005
|
+
print(hex2argb("FFFF00")) # Output: FFFFFF00
|
10006
|
+
print(hex2argb("#DF4245")) # Output: FFDf4245
|
10007
|
+
print(hex2argb("FF00FF00")) # Output: FF00FF00 (already in aARGB format)
|
10008
|
+
"""
|
10009
|
+
import matplotlib.colors as mcolors
|
10010
|
+
import re
|
10011
|
+
color = color.lower().replace(" ", "")  # e.g., 'light blue' -> 'lightblue'
|
10012
|
+
# Convert color name (e.g., "blue") to hex
|
9549
10013
|
if color.lower() in mcolors.CSS4_COLORS:
|
9550
10014
|
color = mcolors.CSS4_COLORS[color.lower()].lstrip("#")
|
9551
10015
|
color = color.lstrip("#").upper()# Remove '#' if present
|
9552
10016
|
|
9553
10017
|
# Validate hex format
|
9554
10018
|
if not re.fullmatch(r"[A-F0-9]{6,8}", color):
|
9555
|
-
raise ValueError(f"
|
10019
|
+
raise ValueError(f"格式错误: {color}, 应该使用 RRGGBB, #RRGGBB, or aARRGGBB format.")
|
9556
10020
|
|
9557
10021
|
# If already in AARRGGBB format (8 chars), return as is
|
9558
10022
|
if len(color) == 8:
|
@@ -9752,6 +10216,624 @@ def copy_format(
|
|
9752
10216
|
wb_source.close()
|
9753
10217
|
if "wb_target" in locals():
|
9754
10218
|
wb_target.close()
|
10219
|
+
# ! =========(below) interact with workbook and DataFrame===========
|
10220
|
+
import pandas as pd
|
10221
|
+
from openpyxl import load_workbook
|
10222
|
+
from openpyxl.workbook.workbook import Workbook
|
10223
|
+
from openpyxl.utils import get_column_letter
|
10224
|
+
|
10225
|
+
class DataFrameAlignExcel:
|
10226
|
+
"""
|
10227
|
+
A powerful tool for updating Excel files with data from DataFrames with various matching strategies.
|
10228
|
+
|
10229
|
+
Features:
|
10230
|
+
- Accepts either file path or open Workbook object
|
10231
|
+
- Multiple matching strategies (exact, contains, starts_with, ends_with, regex)
|
10232
|
+
- Multiple value update strategies (overwrite, add, subtract, multiply, divide, append)
|
10233
|
+
- Support for multiple worksheets
|
10234
|
+
- Automatic column creation
|
10235
|
+
- Value normalization options
|
10236
|
+
- Detailed logging and dry-run mode
|
10237
|
+
- Progress reporting
|
10238
|
+
- Data validation
|
10239
|
+
- Backup functionality
|
10240
|
+
"""
|
10241
|
+
|
10242
|
+
def __init__(self, fpath: Union[str, Workbook], df: pd.DataFrame = None):
|
10243
|
+
"""
|
10244
|
+
Initialize the DataFrameAlignExcel.
|
10245
|
+
|
10246
|
+
Args:
|
10247
|
+
fpath: Path to the Excel file (str) or open Workbook object
|
10248
|
+
df: Optional DataFrame to use for updates
|
10249
|
+
"""
|
10250
|
+
self.fpath_or_wb = fpath
|
10251
|
+
self.df = df
|
10252
|
+
self.wb = None
|
10253
|
+
self.backup_path = None
|
10254
|
+
self.log = []
|
10255
|
+
self.owns_workbook = (
|
10256
|
+
False # Track whether we created the workbook or it was passed in
|
10257
|
+
)
|
10258
|
+
|
10259
|
+
def load_workbook(self) -> None:
|
10260
|
+
"""Load the Excel workbook if a path was provided."""
|
10261
|
+
if isinstance(self.fpath_or_wb, str):
|
10262
|
+
if not os.path.exists(self.fpath_or_wb):
|
10263
|
+
raise FileNotFoundError(f"Excel file not found: {self.fpath_or_wb}")
|
10264
|
+
self.wb = load_workbook(self.fpath_or_wb)
|
10265
|
+
self.owns_workbook = True
|
10266
|
+
elif isinstance(self.fpath_or_wb, Workbook):
|
10267
|
+
self.wb = self.fpath_or_wb
|
10268
|
+
self.owns_workbook = False
|
10269
|
+
else:
|
10270
|
+
raise TypeError(
|
10271
|
+
"fpath must be either a string path or an openpyxl Workbook object"
|
10272
|
+
)
|
10273
|
+
|
10274
|
+
def create_make_backup(self) -> None:
|
10275
|
+
"""Create a make_backup of the original Excel file (only if we loaded from a file)."""
|
10276
|
+
if not isinstance(self.fpath_or_wb, str):
|
10277
|
+
self.log.append(
|
10278
|
+
"Skipping make_backup - working with Workbook object directly"
|
10279
|
+
)
|
10280
|
+
return
|
10281
|
+
|
10282
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
10283
|
+
self.backup_path = os.path.join(
|
10284
|
+
os.path.dirname(self.fpath_or_wb),
|
10285
|
+
f"backup_{timestamp}_{os.path.basename(self.fpath_or_wb)}",
|
10286
|
+
)
|
10287
|
+
self.wb.save(self.backup_path)
|
10288
|
+
self.log.append(f"Created make_backup at: {self.backup_path}")
|
10289
|
+
|
10290
|
+
def save_workbook(self, dir_save: str = None) -> None:
|
10291
|
+
"""
|
10292
|
+
Save the workbook to a file.
|
10293
|
+
|
10294
|
+
Args:
|
10295
|
+
dir_save: Optional path to save to. If None and we loaded from a file,
|
10296
|
+
saves to the original path.
|
10297
|
+
"""
|
10298
|
+
if self.wb is None:
|
10299
|
+
raise ValueError("No workbook loaded")
|
10300
|
+
|
10301
|
+
if dir_save is None:
|
10302
|
+
if isinstance(self.fpath_or_wb, str):
|
10303
|
+
dir_save = self.fpath_or_wb
|
10304
|
+
else:
|
10305
|
+
dir_save = datetime.now().strftime("%Y%m%d_%H%M%S") + ".xlsx"
|
10306
|
+
print(
|
10307
|
+
f"No save path provided and original input was a Workbook object, so used : {dir_save}"
|
10308
|
+
)
|
10309
|
+
self.wb.save(dir_save)
|
10310
|
+
self.log.append(f"Saved workbook to: {dir_save}")
|
10311
|
+
|
10312
|
+
def normalize_value(self, value, clean_keys: str = "strip_split_first") -> str:
|
10313
|
+
"""
|
10314
|
+
Normalize a value based on the specified method.
|
10315
|
+
|
10316
|
+
Args:
|
10317
|
+
value: Value to normalize
|
10318
|
+
clean_keys: One of:
|
10319
|
+
- 'strip': just strip whitespace
|
10320
|
+
- 'strip_lower': strip and lowercase
|
10321
|
+
- 'strip_split_first': strip and take first part before comma
|
10322
|
+
- 'strip_split_last': strip and take last part after comma
|
10323
|
+
- None: no normalization
|
10324
|
+
|
10325
|
+
Returns:
|
10326
|
+
Normalized value
|
10327
|
+
"""
|
10328
|
+
if value is None:
|
10329
|
+
return None
|
10330
|
+
|
10331
|
+
value = str(value)
|
10332
|
+
|
10333
|
+
if clean_keys is None:
|
10334
|
+
return value
|
10335
|
+
|
10336
|
+
if clean_keys == "strip":
|
10337
|
+
return value.strip()
|
10338
|
+
elif clean_keys == "strip_lower":
|
10339
|
+
return value.strip().lower()
|
10340
|
+
elif clean_keys == "strip_split_first":
|
10341
|
+
return value.strip().split(",")[0].strip()
|
10342
|
+
elif clean_keys == "strip_split_last":
|
10343
|
+
parts = value.strip().split(",")
|
10344
|
+
return parts[-1].strip() if len(parts) > 1 else value.strip()
|
10345
|
+
else:
|
10346
|
+
warnings.warn(f"Unknown clean_keys: {clean_keys}. Using 'strip'.")
|
10347
|
+
return value.strip()
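For reference, the four documented clean_keys modes behave as follows on a sample key (a sketch; the sample value and the `updater` instance are made up for illustration):

# Expected results of normalize_value() for each documented clean_keys mode,
# assuming `updater` is a DataFrameAlignExcel instance (sample value invented).
value = " S-001 , batch7 "
# updater.normalize_value(value, "strip")             -> "S-001 , batch7"
# updater.normalize_value(value, "strip_lower")       -> "s-001 , batch7"
# updater.normalize_value(value, "strip_split_first") -> "S-001"
# updater.normalize_value(value, "strip_split_last")  -> "batch7"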
|
10348
|
+
|
10349
|
+
def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
|
10350
|
+
"""
|
10351
|
+
Efficiently find the column index (1-based) for a given column name,
|
10352
|
+
considering only non-empty cells and limiting search range.
|
10353
|
+
|
10354
|
+
Args:
|
10355
|
+
ws: Worksheet object
|
10356
|
+
header_row: Row number containing headers (1-based)
|
10357
|
+
column_name: Column name to find
|
10358
|
+
max_search_columns: Max number of columns to search (to prevent infinite loops)
|
10359
|
+
|
10360
|
+
Returns:
|
10361
|
+
Column index (1-based), or -1 if not found
|
10362
|
+
"""
|
10363
|
+
row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
|
10364
|
+
for row in row_iter:
|
10365
|
+
for cell in row:
|
10366
|
+
if cell.value and str(cell.value).strip().lower() == column_name.lower():
|
10367
|
+
return cell.column
|
10368
|
+
break # Only process the header row
|
10369
|
+
return -1
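A quick way to see the matching behaviour, using an in-memory worksheet (a sketch; the headers and the `updater` instance are assumptions for illustration):

# Hypothetical check of find_column_index with an in-memory workbook.
from openpyxl import Workbook

wb_demo = Workbook()
ws_demo = wb_demo.active
ws_demo.append(["ID", "Qty", "Status"])          # header row 1
# updater.find_column_index(ws_demo, 1, "qty")   -> 2   (comparison is case-insensitive)
# updater.find_column_index(ws_demo, 1, "Lot")   -> -1  (header not present)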
|
10370
|
+
# def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
|
10371
|
+
# """
|
10372
|
+
# Find the column index (1-based) for a given column name.
|
10373
|
+
# If not found, return the last non-empty header column index.
|
10374
|
+
|
10375
|
+
# Args:
|
10376
|
+
# ws: Worksheet object
|
10377
|
+
# header_row: Row number containing headers (1-based)
|
10378
|
+
# column_name: Column name to find
|
10379
|
+
# max_search_columns: Max number of columns to search
|
10380
|
+
|
10381
|
+
# Returns:
|
10382
|
+
# Column index (1-based)
|
10383
|
+
# """
|
10384
|
+
# row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
|
10385
|
+
# last_non_empty_col = -1
|
10386
|
+
|
10387
|
+
# for row in row_iter:
|
10388
|
+
# for cell in row:
|
10389
|
+
# if cell.value and str(cell.value).strip():
|
10390
|
+
# last_non_empty_col = cell.column
|
10391
|
+
# if str(cell.value).strip().lower() == column_name.lower():
|
10392
|
+
# return cell.column
|
10393
|
+
# break # Only one row being read
|
10394
|
+
|
10395
|
+
# return last_non_empty_col
|
10396
|
+
|
10397
|
+
def update_values(
|
10398
|
+
self,
|
10399
|
+
df: pd.DataFrame = None,
|
10400
|
+
sheet_name: Union[str, int, List[Union[str, int]]] = 0,
|
10401
|
+
header_row: int = 1,
|
10402
|
+
column_match: Union[Dict[str, str], List[Tuple[str, str]]] = None,
|
10403
|
+
column_mapping: Union[Dict[str, str], List[Tuple[str, str]]] = None,
|
10404
|
+
clean_keys: str = "strip_split_first",
|
10405
|
+
match_method: str = "exact",
|
10406
|
+
update_strategy: str = "overwrite",
|
10407
|
+
create_missing_columns: bool = True,
|
10408
|
+
preview_only: bool = False,
|
10409
|
+
show_progress: bool = True,
|
10410
|
+
skip_no_match: bool = True,
|
10411
|
+
make_backup: bool = True,
|
10412
|
+
dir_save: str = None,
|
10413
|
+
row_max=500
|
10414
|
+
) -> Dict[str, int]:
|
10415
|
+
"""
|
10416
|
+
Update Excel with values from DataFrame.
|
10417
|
+
|
10418
|
+
Args:
|
10419
|
+
df: DataFrame containing update data (if None, uses self.df)
|
10420
|
+
sheet_name: Sheet name(s) to update (str, int, or list of these)
|
10421
|
+
header_row: Row number containing headers (1-based)
|
10422
|
+
column_match: Dict or list of tuples mapping DataFrame columns to Excel columns for matching
|
10423
|
+
e.g., {'SampleID': 'ID'} or [('SampleID', 'ID'), ('Batch', 'Lot')]
|
10424
|
+
column_mapping: Dict or list of tuples mapping DataFrame columns to Excel columns to update
|
10425
|
+
e.g., {'Vials': 'Qty'} or [('Vials', 'Qty'), ('Status', 'State')]
|
10426
|
+
clean_keys: How to normalize matching values (see normalize_value())
|
10427
|
+
match_method: How to match values ('exact', 'contains', 'starts_with', 'ends_with', 'regex')
|
10428
|
+
update_strategy: How to update values ('overwrite', 'add', 'subtract', 'multiply', 'divide', 'append')
|
10429
|
+
create_missing_columns: Whether to create columns that don't exist
|
10430
|
+
preview_only: If True, don't actually update the Excel file
|
10431
|
+
show_progress: If True, print progress updates
|
10432
|
+
skip_no_match: If True, skip rows where match columns don't match
|
10433
|
+
make_backup: If True, create a backup before updating (only applies when working with a file path)
|
10434
|
+
dir_save: Optional path to save to. If None and we loaded from a file,
|
10435
|
+
saves to the original path. Ignored if preview_only=True.
|
10436
|
+
|
10437
|
+
Returns:
|
10438
|
+
Dictionary with update statistics
|
10439
|
+
"""
|
10440
|
+
# Initialize
|
10441
|
+
start_time = datetime.now()
|
10442
|
+
if df is None:
|
10443
|
+
df = self.df
|
10444
|
+
if df is None:
|
10445
|
+
raise ValueError("No DataFrame provided")
|
10446
|
+
|
10447
|
+
if not isinstance(column_match, (dict, list)) or not column_match:
|
10448
|
+
raise ValueError(
|
10449
|
+
"column_match must be a non-empty dict or list of tuples"
|
10450
|
+
)
|
10451
|
+
|
10452
|
+
if not isinstance(column_mapping, (dict, list)) or not column_mapping:
|
10453
|
+
raise ValueError("column_mapping must be a non-empty dict or list of tuples")
|
10454
|
+
|
10455
|
+
# Convert match/update columns to consistent format
|
10456
|
+
if isinstance(column_match, dict):
|
10457
|
+
column_match = list(column_match.items())
|
10458
|
+
if isinstance(column_mapping, dict):
|
10459
|
+
column_mapping = list(column_mapping.items())
|
10460
|
+
|
10461
|
+
# Load workbook if not already loaded
|
10462
|
+
if self.wb is None:
|
10463
|
+
self.load_workbook()
|
10464
|
+
|
10465
|
+
# Create a backup (only if we're working with a file path)
|
10466
|
+
if not preview_only:
|
10467
|
+
self.create_make_backup()
|
10468
|
+
|
10469
|
+
# Prepare statistics
|
10470
|
+
stats = {
|
10471
|
+
"processed_sheet_names":[],
|
10472
|
+
"processed_sheets": 0,
|
10473
|
+
"total_updates": 0,
|
10474
|
+
"skipped_rows": 0,
|
10475
|
+
"created_columns": 0,
|
10476
|
+
}
|
10477
|
+
|
10478
|
+
# Normalize sheet names
|
10479
|
+
if not isinstance(sheet_name, list):
|
10480
|
+
sheet_name = [sheet_name]
|
10481
|
+
|
10482
|
+
# Process each sheet
|
10483
|
+
for sheet in sheet_name:
|
10484
|
+
try:
|
10485
|
+
if isinstance(sheet, str):
|
10486
|
+
ws = self.wb[sheet]
|
10487
|
+
elif isinstance(sheet, int):
|
10488
|
+
ws = self.wb.worksheets[sheet]
|
10489
|
+
else:
|
10490
|
+
ws = self.wb.active
|
10491
|
+
|
10492
|
+
sheet_title = ws.title  # avoid shadowing the sheet_name argument
|
10493
|
+
self.log.append(f"\nProcessing sheet: {sheet_name}")
|
10494
|
+
|
10495
|
+
# Prepare matching data
|
10496
|
+
match_dict = {}
|
10497
|
+
for df_col, excel_col in column_match:
|
10498
|
+
if clean_keys:
|
10499
|
+
match_dict[excel_col] = dict(
|
10500
|
+
zip(
|
10501
|
+
df[df_col].apply(
|
10502
|
+
lambda x: self.normalize_value(x, clean_keys)
|
10503
|
+
),
|
10504
|
+
df.index,
|
10505
|
+
)
|
10506
|
+
)
|
10507
|
+
else:
|
10508
|
+
match_dict[excel_col] = dict(zip(df[df_col], df.index))
|
10509
|
+
|
10510
|
+
# Find or create update columns
|
10511
|
+
update_col_indices = {}
|
10512
|
+
for df_col, excel_col in column_mapping:
|
10513
|
+
col_idx = self.find_column_index(ws, header_row, excel_col)
|
10514
|
+
if col_idx == -1:
|
10515
|
+
if create_missing_columns:
|
10516
|
+
# Find last column
|
10517
|
+
last_col = max(
|
10518
|
+
[cell.column for cell in ws[header_row] if cell.value is not None], default=0
|
10519
|
+
)
|
10520
|
+
col_idx = last_col + 1
|
10521
|
+
ws.cell(row=header_row, column=col_idx, value=excel_col)
|
10522
|
+
update_col_indices[excel_col] = col_idx
|
10523
|
+
stats["created_columns"] += 1
|
10524
|
+
self.log.append(
|
10525
|
+
f"Created new column '{excel_col}' at position {col_idx}"
|
10526
|
+
)
|
10527
|
+
else:
|
10528
|
+
raise ValueError(
|
10529
|
+
f"Column '{excel_col}' not found and create_missing_columns=False"
|
10530
|
+
)
|
10531
|
+
else:
|
10532
|
+
update_col_indices[excel_col] = col_idx
|
10533
|
+
|
10534
|
+
# Process rows
|
10535
|
+
for row in ws.iter_rows(min_row=header_row + 1):
|
10536
|
+
match_values = {}
|
10537
|
+
match_failed = False
|
10538
|
+
|
10539
|
+
for excel_col in match_dict.keys():
|
10540
|
+
col_idx = self.find_column_index(ws, header_row, excel_col)
|
10541
|
+
if col_idx == -1:
|
10542
|
+
if skip_no_match:
|
10543
|
+
match_failed = True
|
10544
|
+
break
|
10545
|
+
else:
|
10546
|
+
raise ValueError(
|
10547
|
+
f"Match column '{excel_col}' not found in sheet"
|
10548
|
+
)
|
10549
|
+
|
10550
|
+
cell_value = row[
|
10551
|
+
col_idx - 1
|
10552
|
+
].value # -1 because iter_rows returns 0-based list
|
10553
|
+
if clean_keys:
|
10554
|
+
cell_value = self.normalize_value(cell_value, clean_keys)
|
10555
|
+
|
10556
|
+
match_values[excel_col] = cell_value
|
10557
|
+
|
10558
|
+
if match_failed:
|
10559
|
+
stats["skipped_rows"] += 1
|
10560
|
+
continue
|
10561
|
+
|
10562
|
+
# Find matching DataFrame row
|
10563
|
+
df_index = None
|
10564
|
+
for excel_col, value in match_values.items():
|
10565
|
+
if value in match_dict[excel_col]:
|
10566
|
+
if df_index is None:
|
10567
|
+
df_index = match_dict[excel_col][value]
|
10568
|
+
elif df_index != match_dict[excel_col][value]:
|
10569
|
+
# Multiple match columns point to different rows - skip
|
10570
|
+
df_index = None
|
10571
|
+
break
|
10572
|
+
|
10573
|
+
if df_index is None:
|
10574
|
+
stats["skipped_rows"] += 1
|
10575
|
+
continue
|
10576
|
+
|
10577
|
+
# Update cells
|
10578
|
+
for df_col, excel_col in column_mapping:
|
10579
|
+
col_idx = update_col_indices[excel_col]
|
10580
|
+
cell = row[
|
10581
|
+
col_idx - 1
|
10582
|
+
] # -1 because iter_rows returns 0-based list
|
10583
|
+
new_value = df.at[df_index, df_col]
|
10584
|
+
|
10585
|
+
# Apply update strategy
|
10586
|
+
if update_strategy == "overwrite":
|
10587
|
+
cell.value = new_value
|
10588
|
+
elif update_strategy in (
|
10589
|
+
"add",
|
10590
|
+
"subtract",
|
10591
|
+
"multiply",
|
10592
|
+
"divide",
|
10593
|
+
):
|
10594
|
+
try:
|
10595
|
+
old_value = (
|
10596
|
+
float(cell.value) if cell.value is not None else 0
|
10597
|
+
)
|
10598
|
+
new_value = (
|
10599
|
+
float(new_value) if new_value is not None else 0
|
10600
|
+
)
|
10601
|
+
if update_strategy == "add":
|
10602
|
+
cell.value = old_value + new_value
|
10603
|
+
elif update_strategy == "subtract":
|
10604
|
+
cell.value = old_value - new_value
|
10605
|
+
elif update_strategy == "multiply":
|
10606
|
+
cell.value = old_value * new_value
|
10607
|
+
elif update_strategy == "divide":
|
10608
|
+
cell.value = (
|
10609
|
+
old_value / new_value
|
10610
|
+
if new_value != 0
|
10611
|
+
else old_value
|
10612
|
+
)
|
10613
|
+
except (ValueError, TypeError):
|
10614
|
+
if skip_no_match:
|
10615
|
+
continue
|
10616
|
+
raise ValueError(
|
10617
|
+
f"Could not perform {update_strategy} operation on non-numeric values"
|
10618
|
+
)
|
10619
|
+
elif update_strategy == "append":
|
10620
|
+
separator = ", " if cell.value else ""
|
10621
|
+
cell.value = (
|
10622
|
+
f"{cell.value}{separator}{new_value}"
|
10623
|
+
if cell.value
|
10624
|
+
else new_value
|
10625
|
+
)
|
10626
|
+
else:
|
10627
|
+
raise ValueError(
|
10628
|
+
f"Unknown update_strategy: {update_strategy}"
|
10629
|
+
)
|
10630
|
+
|
10631
|
+
stats["total_updates"] += 1
|
10632
|
+
|
10633
|
+
stats["processed_sheets"] += 1
|
10634
|
+
stats["processed_sheet_names"].append(sheet_name)
|
10635
|
+
except Exception as e:
|
10636
|
+
self.log.append(f"Error processing sheet {sheet}: {str(e)}")
|
10637
|
+
if (
|
10638
|
+
not preview_only
|
10639
|
+
and self.backup_path
|
10640
|
+
and isinstance(self.fpath_or_wb, str)
|
10641
|
+
):
|
10642
|
+
self.log.append("Restoring from make_backup due to error")
|
10643
|
+
self.wb = load_workbook(self.backup_path)
|
10644
|
+
raise
|
10645
|
+
|
10646
|
+
# Save changes if not dry run
|
10647
|
+
if not preview_only:
|
10648
|
+
self.save_workbook(dir_save)
|
10649
|
+
if not make_backup:
|
10650
|
+
if self.backup_path and os.path.exists(self.backup_path):
|
10651
|
+
os.remove(self.backup_path)
|
10652
|
+
else:
|
10653
|
+
self.log.append("\nDry run complete - no changes saved")
|
10654
|
+
|
10655
|
+
# Print summary
|
10656
|
+
summary = (
|
10657
|
+
f"\nUpdate Summary:\n"
|
10658
|
+
f"\tProcessed {stats["processed_sheets"]} sheetnames: {stats['processed_sheet_names']}\n"
|
10659
|
+
f"\tTotal updates: {stats['total_updates']}\n"
|
10660
|
+
f"\tSkipped rows: {stats['skipped_rows']}\n"
|
10661
|
+
)
|
10662
|
+
self.log.append(summary)
|
10663
|
+
|
10664
|
+
if show_progress:
|
10665
|
+
print(summary)
|
10666
|
+
|
10667
|
+
return stats
|
10668
|
+
|
10669
|
+
def get_log(self) -> str:
|
10670
|
+
"""Get the operation log as a string."""
|
10671
|
+
return "\n".join(self.log)
|
10672
|
+
|
10673
|
+
def close(self) -> None:
|
10674
|
+
"""Close the workbook if we own it."""
|
10675
|
+
if self.wb is not None and self.owns_workbook:
|
10676
|
+
self.wb.close()
|
10677
|
+
self.wb = None
|
10678
|
+
|
10679
|
+
|
10680
|
+
DFToExcelMapping = Union[Dict[str, str], List[Tuple[str, str]]]
|
10681
|
+
def df_align(
|
10682
|
+
fpath: Union[str, Workbook],
|
10683
|
+
df: pd.DataFrame,
|
10684
|
+
sheet_name: Union[str, int, List[Union[str, int]]] = 0,
|
10685
|
+
header_row: int = 1,
|
10686
|
+
column_match: DFToExcelMapping = None,
|
10687
|
+
column_mapping: DFToExcelMapping = None,
|
10688
|
+
clean_keys: str = "strip_split_first",
|
10689
|
+
match_method: str = "exact",
|
10690
|
+
update_strategy: str = "overwrite",
|
10691
|
+
create_missing_columns: bool = True,
|
10692
|
+
preview_only: bool = False,
|
10693
|
+
show_progress: bool = True,
|
10694
|
+
skip_no_match: bool = True,
|
10695
|
+
make_backup: bool = True,
|
10696
|
+
dir_save: str = None,
|
10697
|
+
) -> Dict[str, int]:
|
10698
|
+
"""
|
10699
|
+
wb = fload(
|
10700
|
+
dir_aml,
|
10701
|
+
password="XBuzwVk4xsC2361cHzyi9JFgfJHaTSerjBOQ0JAJU24=",
|
10702
|
+
sheet_name=0,
|
10703
|
+
header=1,
|
10704
|
+
output="bit",
|
10705
|
+
)
|
10706
|
+
ws = wb[wb.sheetnames[0]]
|
10707
|
+
df_align(
|
10708
|
+
fpath=wb,
|
10709
|
+
df=df_,
|
10710
|
+
sheet_name=None,
|
10711
|
+
header_row=2,
|
10712
|
+
column_match={"SampleID": "SampleID"},# key是 df中的列名, value是 excel中,
|
10713
|
+
column_mapping={"Vials": "Vials", "Vials_": "Total Vials"}, # key是 df中的列名, value是 excel中,
|
10714
|
+
)
|
10715
|
+
"""
|
10716
|
+
updater = DataFrameAlignExcel(fpath, df)
|
10717
|
+
try:
|
10718
|
+
result = updater.update_values(
|
10719
|
+
sheet_name=sheet_name,
|
10720
|
+
header_row=header_row,
|
10721
|
+
column_match=column_match,
|
10722
|
+
column_mapping=column_mapping,
|
10723
|
+
clean_keys=clean_keys,
|
10724
|
+
match_method=match_method,
|
10725
|
+
update_strategy=update_strategy,
|
10726
|
+
create_missing_columns=create_missing_columns,
|
10727
|
+
preview_only=preview_only,
|
10728
|
+
show_progress=show_progress,
|
10729
|
+
skip_no_match=skip_no_match,
|
10730
|
+
make_backup=make_backup,
|
10731
|
+
dir_save=dir_save,
|
10732
|
+
)
|
10733
|
+
return result
|
10734
|
+
finally:
|
10735
|
+
updater.close()
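A minimal end-to-end sketch of df_align (the file name, header row, and column names below are assumptions for illustration, not taken from the package):

# Hypothetical usage: sync vial counts from a DataFrame into an existing workbook
# whose key column is "SampleID" and whose headers sit in row 1.
import pandas as pd

df_updates = pd.DataFrame({"SampleID": ["S-001", "S-002"], "Vials": [3, 5]})
stats = df_align(
    fpath="inventory.xlsx",                 # assumed existing workbook
    df=df_updates,
    sheet_name=0,                           # first sheet
    header_row=1,
    column_match={"SampleID": "SampleID"},  # DataFrame column -> Excel key column
    column_mapping={"Vials": "Vials"},      # DataFrame column -> Excel column to update
    preview_only=True,                      # dry run: report matches without saving
)
print(stats)  # e.g. {'processed_sheet_names': [...], 'total_updates': ..., ...}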
|
10736
|
+
|
10737
|
+
|
10738
|
+
# ! =========(Above) interact with workbook and DataFrame===========
|
10739
|
+
def set_sheet_visible(
|
10740
|
+
fpath: str,
|
10741
|
+
sheet_name: Union[int, str, None,list] = 1,
|
10742
|
+
show: Union[bool, str] = True,
|
10743
|
+
exclude: Union[List[str], None,list,int] = None,
|
10744
|
+
verbose: bool = False
|
10745
|
+
) -> None:
|
10746
|
+
"""
|
10747
|
+
Modify sheet visibility in an Excel workbook.
|
10748
|
+
set_sheet_visible(fpath=dir_data_collection,sheet_name=None,show=1,verbose=1)
|
10749
|
+
Args:
|
10750
|
+
fpath (str): Path to the Excel workbook.
|
10751
|
+
sheet_name (int | str | None): Index or name of the sheet to apply visibility to.
|
10752
|
+
If None, all sheets are considered.
|
10753
|
+
show (bool | str): Visibility mode. Can be:
|
10754
|
+
- True -> visible
|
10755
|
+
- False -> veryHidden
|
10756
|
+
- 'visible', 'hidden', 'veryHidden' as str
|
10757
|
+
exclude (list[str] | None): List of sheet names to exclude from changes.
|
10758
|
+
verbose (bool): If True, logs actions.
|
10759
|
+
"""
|
10760
|
+
|
10761
|
+
try:
|
10762
|
+
wb = fload(fpath, output="bit", get_validations=1)
|
10763
|
+
except Exception as e:
|
10764
|
+
raise FileNotFoundError(f"Unable to load workbook: {e}")
|
10765
|
+
|
10766
|
+
sheet_names = wb.sheetnames
|
10767
|
+
if verbose:
|
10768
|
+
print("Workbook loaded with sheets:")
|
10769
|
+
for i, name in enumerate(sheet_names):
|
10770
|
+
print(f" [{i}] {name}")
|
10771
|
+
|
10772
|
+
excludes=[]
|
10773
|
+
if exclude is None:
|
10774
|
+
exclude=[]
|
10775
|
+
if not isinstance(exclude, list):
|
10776
|
+
exclude = [exclude]
|
10777
|
+
for exclude_ in exclude:
|
10778
|
+
if isinstance(exclude_, str):
|
10779
|
+
excludes.append(strcmp(exclude_, sheet_names)[0])
|
10780
|
+
elif isinstance(exclude_, int):
|
10781
|
+
if 0 <= exclude_ < len(sheet_names):
|
10782
|
+
excludes.append(sheet_names[exclude_])
|
10783
|
+
else:
|
10784
|
+
raise IndexError(f"sheet_name index {exclude_} is out of range:0~{len(sheet_names)-1}.")
|
10785
|
+
|
10786
|
+
# Resolve the sheet_name target
|
10787
|
+
target_indices = []
|
10788
|
+
if not isinstance(sheet_name,list):
|
10789
|
+
sheet_name=[sheet_name]
|
10790
|
+
for sheet_name_ in sheet_name:
|
10791
|
+
if sheet_name_ is None:
|
10792
|
+
target_indices = list(range(len(sheet_names)))
|
10793
|
+
break
|
10794
|
+
elif isinstance(sheet_name_, int):
|
10795
|
+
if 0 <= sheet_name_ < len(sheet_names):
|
10796
|
+
target_indices.append(sheet_name_)
|
10797
|
+
else:
|
10798
|
+
raise IndexError(f"sheet_name index {sheet_name_} is out of range :0~{len(sheet_names)-1}.")
|
10799
|
+
elif isinstance(sheet_name_, str):
|
10800
|
+
idx = strcmp(sheet_name_, sheet_names)[1]
|
10801
|
+
if idx == -1:
|
10802
|
+
raise ValueError(f"Sheet '{sheet_name_}' not found.")
|
10803
|
+
target_indices.append(idx)
|
10804
|
+
|
10805
|
+
# Map show argument to valid state
|
10806
|
+
valid_states = ["veryHidden", "visible", "hidden"]
|
10807
|
+
if isinstance(show, str):
|
10808
|
+
if show not in valid_states:
|
10809
|
+
raise ValueError(f"Invalid show value '{show}'. Must be one of {valid_states}")
|
10810
|
+
state = show
|
10811
|
+
else:
|
10812
|
+
state = "visible" if show else "veryHidden"
|
10813
|
+
# Modify sheet visibility
|
10814
|
+
for idx in target_indices:
|
10815
|
+
ws= wb[sheet_names[idx]]
|
10816
|
+
if ws.title in excludes:
|
10817
|
+
if verbose:
|
10818
|
+
print(f"Skipping excluded sheet: '{ws.title}'")
|
10819
|
+
continue
|
10820
|
+
ws.sheet_state = state
|
10821
|
+
# Ensure at least one sheet is visible
|
10822
|
+
visible_sheets = [s for s in wb.worksheets if s.sheet_state == "visible"]
|
10823
|
+
not_visible_sheets = [s for s in wb.worksheets if s.sheet_state != "visible"]
|
10824
|
+
if not visible_sheets:
|
10825
|
+
fallback_sheet = wb.worksheets[0]
|
10826
|
+
fallback_sheet.sheet_state = "visible"
|
10827
|
+
if verbose:
|
10828
|
+
print(f"No visible sheets found. Setting '{fallback_sheet.title}' to visible.")
|
10829
|
+
if verbose:
|
10830
|
+
print(f"visible sheets:{[s.title for s in visible_sheets]}")
|
10831
|
+
|
10832
|
+
try:
|
10833
|
+
wb.save(fpath)
|
10834
|
+
except Exception as e:
|
10835
|
+
raise IOError(f"Error saving workbook: {e}")
|
10836
|
+
|
9755
10837
|
|
9756
10838
|
def format_excel(
|
9757
10839
|
df: pd.DataFrame=None,
|
@@ -9780,7 +10862,7 @@ def format_excel(
|
|
9780
10862
|
number_format:dict=None, # dict: e.g., {1:"0.00", 2:"#,##0",3:"0%",4:"$#,##0.00"}
|
9781
10863
|
data_validation=None, # dict
|
9782
10864
|
template:dict={},# e.g., template=dict(path="xx.xlsx",sheet_name=['sheet_name1',"sheet_name2"])
|
9783
|
-
apply_filter:bool=
|
10865
|
+
apply_filter:bool=False, # add filter
|
9784
10866
|
freeze :str= False,#"A2",
|
9785
10867
|
conditional_format:dict=None, # dict
|
9786
10868
|
verbose:bool=False,
|
@@ -9942,6 +11024,67 @@ def format_excel(
|
|
9942
11024
|
if end_col_letter
|
9943
11025
|
else f"{start_col_letter}{start_row}"
|
9944
11026
|
)
|
11027
|
+
|
11028
|
+
|
11029
|
+
def is_merged_cell(ws, cell):
|
11030
|
+
"""Check if a cell is part of any merged range."""
|
11031
|
+
for merged_range in ws.merged_cells.ranges:
|
11032
|
+
if cell.coordinate in merged_range:
|
11033
|
+
return True
|
11034
|
+
return False
|
11035
|
+
|
11036
|
+
def apply_auto_width(ws, width_factor=1.2, width_padding=2, width_max=50):
|
11037
|
+
"""
|
11038
|
+
Automatically adjust column widths based on content length,
|
11039
|
+
with complete protection against merged cell errors.
|
11040
|
+
|
11041
|
+
Args:
|
11042
|
+
ws: Worksheet object
|
11043
|
+
width_factor: Multiplier for content length (default 1.2)
|
11044
|
+
width_padding: Additional padding (default 2)
|
11045
|
+
width_max: Maximum column width (default 50)
|
11046
|
+
"""
|
11047
|
+
# First build a set of all merged cell coordinates
|
11048
|
+
merged_coords = set()
|
11049
|
+
for merged_range in ws.merged_cells.ranges:
|
11050
|
+
for row in ws.iter_rows(min_row=merged_range.min_row,
|
11051
|
+
max_row=merged_range.max_row,
|
11052
|
+
min_col=merged_range.min_col,
|
11053
|
+
max_col=merged_range.max_col):
|
11054
|
+
for cell in row:
|
11055
|
+
merged_coords.add(cell.coordinate)
|
11056
|
+
|
11057
|
+
for col in ws.columns:
|
11058
|
+
if not col:
|
11059
|
+
continue
|
11060
|
+
|
11061
|
+
col_letter = get_column_letter(col[0].column)
|
11062
|
+
max_length = 0
|
11063
|
+
|
11064
|
+
for cell in col:
|
11065
|
+
# Skip merged cells entirely
|
11066
|
+
if cell.coordinate in merged_coords:
|
11067
|
+
continue
|
11068
|
+
|
11069
|
+
try:
|
11070
|
+
if cell.value is not None:
|
11071
|
+
# Handle both single-line and multi-line content
|
11072
|
+
cell_value = str(cell.value)
|
11073
|
+
lines = cell_value.split('\n')
|
11074
|
+
current_max = max(len(line) for line in lines)
|
11075
|
+
max_length = max(max_length, current_max)
|
11076
|
+
except Exception as e:
|
11077
|
+
print(f"Skipping cell {cell.coordinate} due to error: {e}")
|
11078
|
+
continue
|
11079
|
+
|
11080
|
+
# Calculate width with constraints
|
11081
|
+
adjusted_width = min(
|
11082
|
+
max(1, (max_length * width_factor) + width_padding),
|
11083
|
+
width_max if width_max is not None else float('inf')
|
11084
|
+
)
|
11085
|
+
|
11086
|
+
ws.column_dimensions[col_letter].width = adjusted_width
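A worked instance of the width formula above (numbers invented):

# 20-character content with the default factor/padding, capped at width_max:
max_length, width_factor, width_padding, width_max = 20, 1.2, 2, 50
adjusted = min(max(1, max_length * width_factor + width_padding), width_max)
print(adjusted)  # 26.0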
|
11087
|
+
|
9945
11088
|
def apply_color_to_worksheet(ws=None, sheet_name=None, conditions=None, cell_idx=None,where="text"):
|
9946
11089
|
"""
|
9947
11090
|
Apply text color formatting to a specific cell range in an openpyxl workbook based on conditions.
|
@@ -10047,6 +11190,11 @@ def format_excel(
|
|
10047
11190
|
|
10048
11191
|
def apply_format(ws, cell, cell_range):
|
10049
11192
|
"""Apply cell formatting to a specified range."""
|
11193
|
+
# Get all merged cell coordinates first
|
11194
|
+
merged_cells = set()
|
11195
|
+
for merged_range in ws.merged_cells.ranges:
|
11196
|
+
for coord in merged_range.cells:
|
11197
|
+
merged_cells.add(coord)
|
10050
11198
|
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
10051
11199
|
kws_cell = ["font", "fill", "alignment", "border"]
|
10052
11200
|
for K, _ in cell.items():
|
@@ -10244,6 +11392,7 @@ def format_excel(
|
|
10244
11392
|
)
|
10245
11393
|
# get colors config
|
10246
11394
|
for k, v in cell.get(K, {}).items():
|
11395
|
+
print(k, v,strcmp(k, kws_border)[0])
|
10247
11396
|
if strcmp(k, kws_border)[0] in ["color"]:
|
10248
11397
|
border_color_all = hex2argb(v)
|
10249
11398
|
# If 'color' is set, apply the same color to all border sides
|
@@ -10374,6 +11523,8 @@ def format_excel(
|
|
10374
11523
|
#! final apply configs
|
10375
11524
|
for row in ws[cell_range]:
|
10376
11525
|
for cell_ in row:
|
11526
|
+
if cell_.coordinate in merged_cells:
|
11527
|
+
continue # Skip merged cells
|
10377
11528
|
if cell_font:
|
10378
11529
|
cell_.font = cell_font
|
10379
11530
|
if cell_fill:
|
@@ -10451,11 +11602,9 @@ def format_excel(
|
|
10451
11602
|
if not os.path.exists(filename) or mode=="w":
|
10452
11603
|
# ws=wb.active
|
10453
11604
|
# ws.title = sheet_name
|
10454
|
-
ws = wb.create_sheet(title=sheet_name)
|
10455
|
-
print(1)
|
11605
|
+
ws = wb.create_sheet(title=sheet_name)
|
10456
11606
|
else:# file exists
|
10457
|
-
wb = load_workbook(filename)
|
10458
|
-
print(2)
|
11607
|
+
wb = load_workbook(filename)
|
10459
11608
|
# with pd.ExcelWriter(filename, mode="a", engine=engine, if_sheet_exists=if_sheet_exists) as writer:
|
10460
11609
|
# for ws in wb.worksheets: # Iterate through worksheets in the input workbook
|
10461
11610
|
# ws_df = pd.DataFrame(ws.values)
|
@@ -10782,44 +11931,62 @@ def format_excel(
|
|
10782
11931
|
if freeze:
|
10783
11932
|
ws.freeze_panes = freeze # Freeze everything above and to the left of A2
|
10784
11933
|
# !widths
|
10785
|
-
if isinstance(width,bool):
|
11934
|
+
if isinstance(width, bool):
|
10786
11935
|
width=None if width else False
|
10787
11936
|
if isinstance(height,bool):
|
10788
11937
|
height=None if height else False
|
10789
|
-
if width is None or width=={}: # automatic adust width
|
10790
|
-
for col in ws.columns:
|
10791
|
-
max_length = 0
|
10792
|
-
"""column = col[0].column_letter # Get the column letter"""
|
10793
|
-
# Check the first cell in the column to get the column letter
|
10794
|
-
cell_first = col[0]
|
10795
11938
|
|
10796
|
-
|
10797
|
-
|
10798
|
-
|
10799
|
-
|
10800
|
-
|
11939
|
+
merged_cells = set()
|
11940
|
+
for merged_range in ws.merged_cells.ranges:
|
11941
|
+
for row in ws.iter_rows(min_row=merged_range.min_row,
|
11942
|
+
max_row=merged_range.max_row,
|
11943
|
+
min_col=merged_range.min_col,
|
11944
|
+
max_col=merged_range.max_col):
|
11945
|
+
for cell in row:
|
11946
|
+
merged_cells.add(cell.coordinate)
|
11947
|
+
if width is None or width == {}: # automatically adjust width
|
11948
|
+
print("auto-width")
|
11949
|
+
for col in ws.columns:
|
11950
|
+
if not col:
|
10801
11951
|
continue
|
10802
|
-
|
10803
|
-
|
10804
|
-
|
10805
|
-
|
10806
|
-
|
10807
|
-
|
10808
|
-
|
10809
|
-
|
10810
|
-
|
10811
|
-
|
10812
|
-
|
11952
|
+
try:
|
11953
|
+
col_letter = get_column_letter(col[0].column)
|
11954
|
+
|
11955
|
+
# Skip entire column if any cell is merged
|
11956
|
+
if any(cell.coordinate in merged_cells for cell in col):
|
11957
|
+
continue
|
11958
|
+
|
11959
|
+
max_length = 0
|
11960
|
+
for cell in col:
|
11961
|
+
try:
|
11962
|
+
if cell.value:
|
11963
|
+
cell_value = str(cell.value)
|
11964
|
+
if '\n' in cell_value:
|
11965
|
+
max_line_length = max(len(line) for line in cell_value.split('\n'))
|
11966
|
+
max_length = max(max_length, max_line_length)
|
11967
|
+
else:
|
11968
|
+
max_length = max(max_length, len(cell_value))
|
11969
|
+
except:
|
11970
|
+
pass
|
11971
|
+
|
11972
|
+
adjusted_width = (max_length * width_factor) + width_padding
|
11973
|
+
if width_max is not None:
|
11974
|
+
adjusted_width = min(adjusted_width, width_max)
|
11975
|
+
ws.column_dimensions[col_letter].width = max(5, adjusted_width)
|
11976
|
+
|
11977
|
+
except Exception as e:
|
11978
|
+
print(f"Error adjusting width for column: {e}")
|
11979
|
+
continue
|
11980
|
+
elif isinstance(width, (int, float)): # set all columns to this value
|
11981
|
+
print("set to fixed width {}".format(width))
|
10813
11982
|
for col in ws.columns:
|
10814
|
-
column=get_column_letter(col[0].column)
|
10815
|
-
ws.column_dimensions[column].width=width*width_factor+width_padding
|
10816
|
-
elif isinstance(width,
|
10817
|
-
pass
|
10818
|
-
else:
|
11983
|
+
column = get_column_letter(col[0].column)
|
11984
|
+
ws.column_dimensions[column].width = width * width_factor + width_padding
|
11985
|
+
elif isinstance(width, dict): # custom widths per column
|
10819
11986
|
for col_idx, width_ in width.items():
|
10820
11987
|
col_letter = get_column_letter(col_idx)
|
10821
11988
|
ws.column_dimensions[col_letter].width = width_
|
10822
|
-
|
11989
|
+
|
10823
11990
|
# !heights
|
10824
11991
|
if height is None or height=={}: # automatically adjust height
|
10825
11992
|
for row in ws.iter_rows(min_row=1, max_row=ws.max_row):
|
@@ -11276,9 +12443,28 @@ def format_excel(
|
|
11276
12443
|
|
11277
12444
|
# ungroup sheets
|
11278
12445
|
for sheet in wb.worksheets:
|
11279
|
-
sheet.sheet_view.tabSelected = False
|
12446
|
+
sheet.sheet_view.tabSelected = False
|
11280
12447
|
# !Save the workbook
|
11281
|
-
|
12448
|
+
try:
|
12449
|
+
wb.save(filename)
|
12450
|
+
except Exception as e:
|
12451
|
+
print(f"Error saving workbook: {str(e)}")
|
12452
|
+
# Replace your final save operation with this:
|
12453
|
+
# try:
|
12454
|
+
# # Create a temporary file for safer saving
|
12455
|
+
# temp_filename = filename + '.tmp'
|
12456
|
+
# wb.save(temp_filename)
|
12457
|
+
|
12458
|
+
# # If save succeeds, replace original file
|
12459
|
+
# if os.path.exists(filename):
|
12460
|
+
# os.remove(filename)
|
12461
|
+
# os.rename(temp_filename, filename)
|
12462
|
+
|
12463
|
+
# except Exception as e:
|
12464
|
+
# print(f"Error saving workbook: {str(e)}")
|
12465
|
+
# if os.path.exists(temp_filename):
|
12466
|
+
# os.remove(temp_filename)
|
12467
|
+
# raise
|
11282
12468
|
|
11283
12469
|
|
11284
12470
|
def preview(var):
|
@@ -13282,61 +14468,630 @@ def df_fillna(
|
|
13282
14468
|
strategy="constant", fill_value=constant
|
13283
14469
|
)
|
13284
14470
|
else:
|
13285
|
-
non_numeric_imputer = SimpleImputer(strategy="most_frequent")
|
14471
|
+
non_numeric_imputer = SimpleImputer(strategy="most_frequent")
|
14472
|
+
|
14473
|
+
# Impute non-numeric columns column-wise (axis=0)
|
14474
|
+
imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
|
14475
|
+
|
14476
|
+
# Convert imputed non-numeric array back to DataFrame with original index and column names
|
14477
|
+
imputed_non_numeric_df = pd.DataFrame(
|
14478
|
+
imputed_non_numeric,
|
14479
|
+
index=non_numeric_data.index,
|
14480
|
+
columns=non_numeric_data.columns,
|
14481
|
+
)
|
14482
|
+
else:
|
14483
|
+
imputed_non_numeric_df = pd.DataFrame(index=data.index)
|
14484
|
+
|
14485
|
+
imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
|
14486
|
+
columns=data.columns
|
14487
|
+
)
|
14488
|
+
|
14489
|
+
if inplace:
|
14490
|
+
# Modify the original DataFrame
|
14491
|
+
data[:] = imputed_data[col_names_org]
|
14492
|
+
return None
|
14493
|
+
else:
|
14494
|
+
# Return the modified DataFrame
|
14495
|
+
return imputed_data[col_names_org]
|
14496
|
+
|
14497
|
+
|
14498
|
+
# # example
|
14499
|
+
# data = {
|
14500
|
+
# "A": [1, 2, np.nan, 4, 5],
|
14501
|
+
# "B": [np.nan, 2, 3, 4, np.nan],
|
14502
|
+
# "C": [1, np.nan, 3, 4, 5],
|
14503
|
+
# "D": [1, 2, 3, 4, np.nan],
|
14504
|
+
# }
|
14505
|
+
|
14506
|
+
# # Define a function to test each imputation method
|
14507
|
+
# methods = [
|
14508
|
+
# "mean",
|
14509
|
+
# "median",
|
14510
|
+
# "most_frequent",
|
14511
|
+
# "constant",
|
14512
|
+
# "knn",
|
14513
|
+
# "iterative",
|
14514
|
+
# # "missforest",
|
14515
|
+
# # "softimpute",
|
14516
|
+
# # "svd",
|
14517
|
+
# ]
|
14518
|
+
|
14519
|
+
# # Create a dictionary to hold results
|
14520
|
+
# results = {}
|
14521
|
+
|
14522
|
+
# for method_name in methods:
|
14523
|
+
# print(method_name)
|
14524
|
+
# display(df)
|
14525
|
+
# display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
|
14526
|
+
def df_cut(
|
14527
|
+
df: pd.DataFrame,
|
14528
|
+
column: str,
|
14529
|
+
*,
|
14530
|
+
new_col_name: Optional[str] = None,
|
14531
|
+
bins: Optional[
|
14532
|
+
Union[int, List[float], Dict[str, Union[float, str, pd.Timestamp]]]
|
14533
|
+
] = None,
|
14534
|
+
range_start: Optional[Union[float, str, pd.Timestamp]] = None,
|
14535
|
+
range_end: Optional[Union[float, str, pd.Timestamp]] = None,
|
14536
|
+
step: Optional[Union[float, str, pd.Timedelta]] = None,
|
14537
|
+
labels: Optional[List[str]] = None,
|
14538
|
+
label_format: Optional[Union[str, Callable[[float, float], str]]] = None,
|
14539
|
+
include_overflow: bool = True,
|
14540
|
+
include_underflow: bool = False,
|
14541
|
+
right: bool = False,
|
14542
|
+
drop_original: bool = False,
|
14543
|
+
precision: int = 2,
|
14544
|
+
show_count: bool = False,
|
14545
|
+
symbol_count: str = "n=",
|
14546
|
+
show_percentage: bool = False,
|
14547
|
+
symbol_percentage: str = "%",
|
14548
|
+
show_total_count: bool = False,
|
14549
|
+
symbol_total_count: str = "∑n=",
|
14550
|
+
sep_between: str = " | ",
|
14551
|
+
sort_labels: bool = True,
|
14552
|
+
na_action: str = "keep",
|
14553
|
+
na_fill_value: Optional[str] = None,
|
14554
|
+
dtype: Optional[Union[str, pd.CategoricalDtype]] = None,
|
14555
|
+
ordered: bool = True,
|
14556
|
+
inplace: bool = False,
|
14557
|
+
datetime_format: str = "%Y-%m-%d",
|
14558
|
+
categorical_agg: str = "count",
|
14559
|
+
) -> Optional[pd.DataFrame]:
|
14560
|
+
"""
|
14561
|
+
Enhanced binning function that works with numeric, datetime, and categorical columns.
|
14562
|
+
|
14563
|
+
Features:
|
14564
|
+
- Automatic type detection (numeric, datetime, categorical)
|
14565
|
+
- Flexible bin specification (number of bins, explicit edges, or range+step)
|
14566
|
+
- Customizable labels with formatting
|
14567
|
+
- Count and percentage display options
|
14568
|
+
- NA value handling
|
14569
|
+
Square brackets denote inclusive bounds;
|
14570
|
+
parentheses denote exclusive bounds.
|
14571
|
+
Parameters:
|
14572
|
+
-----------
|
14573
|
+
df : pd.DataFrame
|
14574
|
+
Input DataFrame containing the column to bin
|
14575
|
+
column : str
|
14576
|
+
Name of column to bin
|
14577
|
+
new_col_name : str, optional
|
14578
|
+
Name for binned column (default: f"{column}_binned")
|
14579
|
+
bins : int, list, or dict, optional
|
14580
|
+
- int: Number of equal-width bins
|
14581
|
+
- list: Explicit bin edges
|
14582
|
+
- dict: {'start': x, 'end': y, 'step': z} for range specification
|
14583
|
+
range_start : float or datetime-like, optional
|
14584
|
+
Start value for bin range (required if bins is None or dict)
|
14585
|
+
range_end : float or datetime-like, optional
|
14586
|
+
End value for bin range (default: max of column)
|
14587
|
+
step : float or timedelta-like, optional
|
14588
|
+
Step size for bin creation (required if bins is None or dict)
|
14589
|
+
labels : list of str, optional
|
14590
|
+
Custom labels for bins (must match number of bins)
|
14591
|
+
label_format : str or callable, optional
|
14592
|
+
Format string or function for bin labels
|
14593
|
+
include_overflow : bool, default True
|
14594
|
+
Include catch-all bin for values above range_end
|
14595
|
+
include_underflow : bool, default False
|
14596
|
+
Include catch-all bin for values below range_start
|
14597
|
+
right : bool, default False
|
14598
|
+
Whether bins include the right edge
|
14599
|
+
drop_original : bool, default False
|
14600
|
+
Drop original column after binning
|
14601
|
+
precision : int, default 2
|
14602
|
+
Decimal precision for numeric bin labels
|
14603
|
+
show_count : bool, default False
|
14604
|
+
Show count of items in each bin
|
14605
|
+
show_percentage : bool, default False
|
14606
|
+
Show percentage of items in each bin
|
14607
|
+
show_total_count : bool, default False
|
14608
|
+
Show total count in labels
|
14609
|
+
na_action : str, default 'keep'
|
14610
|
+
How to handle NA values ('keep', 'drop', or 'fill')
|
14611
|
+
na_fill_value : str, optional
|
14612
|
+
Value to fill NAs with if na_action='fill'
|
14613
|
+
dtype : dtype or CategoricalDtype, optional
|
14614
|
+
Output dtype for binned column
|
14615
|
+
ordered : bool, default True
|
14616
|
+
Whether bins are ordered
|
14617
|
+
inplace : bool, default False
|
14618
|
+
Modify DataFrame in place
|
14619
|
+
datetime_format : str, default "%Y-%m-%d"
|
14620
|
+
Format string for datetime labels
|
14621
|
+
categorical_agg : str, default 'count'
|
14622
|
+
For categorical data: 'count' or 'ratio'
|
14623
|
+
|
14624
|
+
Returns:
|
14625
|
+
--------
|
14626
|
+
pd.DataFrame or None
|
14627
|
+
Returns modified DataFrame unless inplace=True
|
14628
|
+
|
14629
|
+
Examples:
|
14630
|
+
--------
|
14631
|
+
# Numeric binning
|
14632
|
+
df_cut(df, 'age', bins=5)
|
14633
|
+
df_cut(df, 'price', range_start=0, range_end=1000, step=100)
|
14634
|
+
|
14635
|
+
# Datetime binning
|
14636
|
+
df_cut(df, 'date', bins={'start': '2023-01-01', 'end': '2023-12-31', 'step': '1M'})
|
14637
|
+
|
14638
|
+
# Categorical binning
|
14639
|
+
df_cut(df, 'category', bins=5, categorical_agg='ratio')
|
14640
|
+
|
14641
|
+
# Sample datetime data
|
14642
|
+
dates = pd.date_range("2020-01-01", "2023-12-31", freq="D")
|
14643
|
+
df = pd.DataFrame(
|
14644
|
+
{
|
14645
|
+
"order_date": np.random.choice(dates, 500),
|
14646
|
+
"delivery_time": np.random.randint(1, 72, 500), # hours
|
14647
|
+
}
|
14648
|
+
)
|
14649
|
+
# Example 1: Yearly bins
|
14650
|
+
# Yearly binning with exact year boundaries (step="1Y")
|
14651
|
+
df_cut(
|
14652
|
+
df,
|
14653
|
+
"order_date",
|
14654
|
+
bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1Y"},
|
14655
|
+
datetime_format="%Y-%m-%d",
|
14656
|
+
label_format="%m-%d",
|
14657
|
+
show_count=True,
|
14658
|
+
show_percentage=True,
|
14659
|
+
show_total_count=True,
|
14660
|
+
)
|
14661
|
+
# Weekly binning
|
14662
|
+
df_cut(
|
14663
|
+
df,
|
14664
|
+
"order_date",
|
14665
|
+
bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1W"},
|
14666
|
+
label_format="%Y-%m-%d",
|
14667
|
+
datetime_format="%Y-%m-%d",
|
14668
|
+
show_count=True,
|
14669
|
+
show_percentage=True,
|
14670
|
+
show_total_count=True,
|
14671
|
+
)
|
14672
|
+
|
14673
|
+
|
14674
|
+
# Sample numeric data
|
14675
|
+
df = pd.DataFrame(
|
14676
|
+
{"price": np.random.uniform(10, 1000, 1000), "age": np.random.randint(18, 80, 1000)}
|
14677
|
+
)
|
14678
|
+
|
14679
|
+
# Example 1: Equal-width bins
|
14680
|
+
df_cut(df, "price", bins=5, show_count=True)
|
14681
|
+
|
14682
|
+
# Example 2: Custom range with step
|
14683
|
+
df_cut(
|
14684
|
+
df,
|
14685
|
+
"price",
|
14686
|
+
range_start=0,
|
14687
|
+
range_end=1000,
|
14688
|
+
step=200,
|
14689
|
+
label_format="${left:.0f}-${right:.0f}",
|
14690
|
+
show_percentage=True,
|
14691
|
+
)
|
14692
|
+
df_cut(
|
14693
|
+
df,
|
14694
|
+
"price",
|
14695
|
+
bins={"start": 0, "end": 1000, "step": 200},
|
14696
|
+
# label_format="${left:.0f}-${right:.0f}",
|
14697
|
+
show_percentage=True,
|
14698
|
+
)
|
14699
|
+
"""
|
14700
|
+
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
|
14701
|
+
|
14702
|
+
def _process_time_step(step: Union[str, int, float, pd.Timedelta]) -> str:
|
14703
|
+
"""Convert step to pandas frequency string."""
|
14704
|
+
if isinstance(step, pd.Timedelta):
|
14705
|
+
return step.freqstr if step.freqstr else str(step)
|
14706
|
+
|
14707
|
+
if isinstance(step, (int, float)):
|
14708
|
+
return f"{step}S" # Interpret numbers as seconds
|
14709
|
+
|
14710
|
+
if isinstance(step, str):
|
14711
|
+
step = step.strip().lower()
|
14712
|
+
match = re.match(r"(\d*\.?\d+)?\s*([a-z]+)", step)
|
14713
|
+
if not match:
|
14714
|
+
raise ValueError(f"Invalid time step format: {step}")
|
14715
|
+
|
14716
|
+
num_part, unit_part = match.groups()
|
14717
|
+
num = float(num_part) if num_part else 1.0
|
14718
|
+
|
14719
|
+
unit_map = {
|
14720
|
+
"y": "Y",
|
14721
|
+
"yr": "Y",
|
14722
|
+
"yrs": "Y",
|
14723
|
+
"year": "Y",
|
14724
|
+
"years": "Y",
|
14725
|
+
"m": "M",
|
14726
|
+
"mo": "M",
|
14727
|
+
"mon": "M",
|
14728
|
+
"month": "M",
|
14729
|
+
"months": "M",
|
14730
|
+
"w": "W",
|
14731
|
+
"wk": "W",
|
14732
|
+
"wks": "W",
|
14733
|
+
"week": "W",
|
14734
|
+
"weeks": "W",
|
14735
|
+
"d": "D",
|
14736
|
+
"day": "D",
|
14737
|
+
"days": "D",
|
14738
|
+
"h": "H",
|
14739
|
+
"hr": "H",
|
14740
|
+
"hrs": "H",
|
14741
|
+
"hour": "H",
|
14742
|
+
"hours": "H",
|
14743
|
+
"min": "T",
|
14744
|
+
"mins": "T",
|
14745
|
+
"minute": "T",
|
14746
|
+
"minutes": "T",
|
14747
|
+
"s": "S",
|
14748
|
+
"sec": "S",
|
14749
|
+
"secs": "S",
|
14750
|
+
"second": "S",
|
14751
|
+
"seconds": "S",
|
14752
|
+
}
|
14753
|
+
|
14754
|
+
if unit_part not in unit_map:
|
14755
|
+
raise ValueError(f"Unknown time unit: {unit_part}")
|
14756
|
+
|
14757
|
+
freq = unit_map[unit_part]
|
14758
|
+
if num.is_integer():
|
14759
|
+
num = int(num)
|
14760
|
+
return f"{num}{freq}"
|
14761
|
+
|
14762
|
+
raise TypeError(f"Unsupported step type: {type(step)}")
|
14763
|
+
|
14764
|
+
|
14765
|
+
def _process_datetime_column(
|
14766
|
+
col: pd.Series,
|
14767
|
+
bins: Optional[Union[int, List[pd.Timestamp]]],
|
14768
|
+
range_start: Optional[Union[str, pd.Timestamp]],
|
14769
|
+
range_end: Optional[Union[str, pd.Timestamp]],
|
14770
|
+
step: Optional[Union[str, pd.Timedelta]],
|
14771
|
+
labels: Optional[List[str]],
|
14772
|
+
label_format: Optional[Union[str, Callable]],
|
14773
|
+
datetime_format: str,
|
14774
|
+
right: bool,
|
14775
|
+
include_underflow: bool,
|
14776
|
+
include_overflow: bool,
|
14777
|
+
) -> Tuple[pd.Categorical, List[str]]:
|
14778
|
+
"""Process datetime column with accurate counting."""
|
14779
|
+
col = pd.to_datetime(col)
|
14780
|
+
|
14781
|
+
# Handle bin edges
|
14782
|
+
if bins is None:
|
14783
|
+
if step is None:
|
14784
|
+
raise ValueError("Step must be provided for datetime binning")
|
14785
|
+
|
14786
|
+
# Convert step to pandas frequency string
|
14787
|
+
step_freq = _process_time_step(step)
|
14788
|
+
|
14789
|
+
# Set default range if needed
|
14790
|
+
range_start = (
|
14791
|
+
pd.to_datetime(range_start) if range_start is not None else col.min()
|
14792
|
+
)
|
14793
|
+
range_end = pd.to_datetime(range_end) if range_end is not None else col.max()
|
14794
|
+
|
14795
|
+
# Generate bins
|
14796
|
+
try:
|
14797
|
+
bin_edges = pd.date_range(start=range_start, end=range_end, freq=step_freq)
|
14798
|
+
if len(bin_edges) == 0:
|
14799
|
+
bin_edges = pd.date_range(start=range_start, end=range_end, periods=2)
|
14800
|
+
elif bin_edges[-1] < range_end:
|
14801
|
+
bin_edges = bin_edges.append(pd.DatetimeIndex([range_end]))
|
14802
|
+
except ValueError as e:
|
14803
|
+
raise ValueError(f"Invalid frequency specification: {step_freq}") from e
|
14804
|
+
elif isinstance(bins, int):
|
14805
|
+
bin_edges = pd.date_range(start=col.min(), end=col.max(), periods=bins + 1)
|
14806
|
+
else:
|
14807
|
+
bin_edges = pd.to_datetime(bins)
|
14808
|
+
|
14809
|
+
# Add overflow/underflow bins
|
14810
|
+
if include_underflow:
|
14811
|
+
bin_edges = bin_edges.insert(0, pd.Timestamp.min)
|
14812
|
+
if include_overflow:
|
14813
|
+
bin_edges = bin_edges.append(pd.DatetimeIndex([pd.Timestamp.max]))
|
14814
|
+
|
14815
|
+
# Perform the cut - this is where we ensure proper binning
|
14816
|
+
binned = pd.cut(
|
14817
|
+
col.astype("int64"), # Convert to nanoseconds for precise binning
|
14818
|
+
bins=bin_edges.astype("int64"),
|
14819
|
+
right=right,
|
14820
|
+
include_lowest=True,
|
14821
|
+
)
|
14822
|
+
|
14823
|
+
# Generate labels if not provided
|
14824
|
+
if labels is None:
|
14825
|
+
labels = []
|
14826
|
+
for i in range(len(bin_edges) - 1):
|
14827
|
+
left = bin_edges[i]
|
14828
|
+
right_ = bin_edges[i + 1]
|
14829
|
+
|
14830
|
+
# Handle special cases
|
14831
|
+
if left == pd.Timestamp.min:
|
14832
|
+
left_str = "<"
|
14833
|
+
else:
|
14834
|
+
left_str = left.strftime(datetime_format)
|
14835
|
+
|
14836
|
+
if right_ == pd.Timestamp.max:
|
14837
|
+
right_str = ">"
|
14838
|
+
else:
|
14839
|
+
right_str = right_.strftime(datetime_format)
|
14840
|
+
|
14841
|
+
# Apply label formatting
|
14842
|
+
if callable(label_format):
|
14843
|
+
label = label_format(left, right_)
|
14844
|
+
elif isinstance(label_format, str):
|
14845
|
+
try:
|
14846
|
+
if left != pd.Timestamp.min and right_ != pd.Timestamp.max:
|
14847
|
+
label = f"{left.strftime(label_format)}-{right_.strftime(label_format)}"
|
14848
|
+
else:
|
14849
|
+
label = f"{left_str}-{right_str}"
|
14850
|
+
except (ValueError, AttributeError):
|
14851
|
+
label = f"{left_str}-{right_str}"
|
14852
|
+
else:
|
14853
|
+
label = f"{left_str}-{right_str}"
|
14854
|
+
|
14855
|
+
labels.append(label)
|
14856
|
+
|
14857
|
+
return binned, labels
|
14858
|
+
|
14859
|
+
|
14860
|
+
def _process_categorical_column(
|
14861
|
+
col: pd.Series,
|
14862
|
+
bins: Optional[Union[int, List[str]]],
|
14863
|
+
labels: Optional[List[str]],
|
14864
|
+
categorical_agg: str,
|
14865
|
+
) -> Tuple[pd.Categorical, List[str]]:
|
14866
|
+
value_counts = col.value_counts(normalize=(categorical_agg == "ratio"))
|
14867
|
+
|
14868
|
+
if bins is not None and isinstance(bins, int):
|
14869
|
+
top_categories = value_counts.head(bins).index
|
14870
|
+
binned = col.where(col.isin(top_categories), "Other")
|
14871
|
+
elif isinstance(bins, list):
|
14872
|
+
binned = col.where(col.isin(bins), "Other")
|
14873
|
+
else:
|
14874
|
+
binned = col
|
14875
|
+
|
14876
|
+
binned = binned.astype("category")
|
14877
|
+
|
14878
|
+
if labels is not None:
|
14879
|
+
binned = binned.cat.rename_categories(dict(zip(binned.cat.categories, labels)))
|
14880
|
+
|
14881
|
+
return binned, list(binned.cat.categories)
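The top-N grouping used here can be reproduced in isolation; a brief standalone sketch (data invented):

# Keep the N most frequent categories and collapse the rest into "Other",
# mirroring the value_counts/where logic of _process_categorical_column.
import pandas as pd

s = pd.Series(["a", "a", "b", "b", "b", "c", "d"])
top = s.value_counts().head(2).index                    # two most frequent labels
binned = s.where(s.isin(top), "Other").astype("category")
print(binned.value_counts())                            # roughly: b 3, a 2, Other 2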
|
14882
|
+
|
14883
|
+
|
14884
|
+
def _process_numeric_column(
|
14885
|
+
col: pd.Series,
|
14886
|
+
bins: Optional[Union[int, List[float]]],
|
14887
|
+
range_start: Optional[float],
|
14888
|
+
range_end: Optional[float],
|
14889
|
+
step: Optional[float],
|
14890
|
+
labels: Optional[List[str]],
|
14891
|
+
label_format: Optional[Union[str, Callable]],
|
14892
|
+
precision: int,
|
14893
|
+
right: bool,
|
14894
|
+
include_underflow: bool,
|
14895
|
+
include_overflow: bool,
|
14896
|
+
) -> Tuple[pd.Categorical, List[str]]:
|
14897
|
+
if bins is None:
|
14898
|
+
if range_start is None or step is None:
|
14899
|
+
raise ValueError("If bins not provided, must set range_start and step")
|
14900
|
+
if range_end is None:
|
14901
|
+
range_end = col.max()
|
14902
|
+
|
14903
|
+
bin_edges = list(np.arange(range_start, range_end + step, step))
|
14904
|
+
elif isinstance(bins, int):
|
14905
|
+
bin_edges = np.linspace(col.min(), col.max(), bins + 1).tolist()
|
14906
|
+
else:
|
14907
|
+
bin_edges = list(bins)
|
14908
|
+
|
14909
|
+
# Add overflow/underflow bins if needed
|
14910
|
+
if include_underflow and not np.isinf(bin_edges[0]):
|
14911
|
+
bin_edges.insert(0, float("-inf"))
|
14912
|
+
if include_overflow and not np.isinf(bin_edges[-1]):
|
14913
|
+
bin_edges.append(float("inf"))
|
14914
|
+
|
14915
|
+
# Generate labels if not provided
|
14916
|
+
if labels is None:
|
14917
|
+
labels = []
|
14918
|
+
for i in range(len(bin_edges) - 1):
|
14919
|
+
left = round(bin_edges[i], precision)
|
14920
|
+
right_ = round(bin_edges[i + 1], precision)
|
14921
|
+
|
14922
|
+
if label_format:
|
14923
|
+
label = (
|
14924
|
+
label_format(left, right_)
|
14925
|
+
if callable(label_format)
|
14926
|
+
else label_format.format(left=left, right=right_)
|
14927
|
+
)
|
14928
|
+
else:
|
14929
|
+
if np.isinf(left) and left < 0:
|
14930
|
+
label = f"<{right_}"
|
14931
|
+
elif np.isinf(right_):
|
14932
|
+
label = f">{left}"
|
14933
|
+
else:
|
14934
|
+
label = f"[{left}, {right_}{']' if right else ')'}"
|
13286
14935
|
|
13287
|
-
|
13288
|
-
imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
|
14936
|
+
labels.append(label)
|
13289
14937
|
|
13290
|
-
|
13291
|
-
|
13292
|
-
imputed_non_numeric,
|
13293
|
-
index=non_numeric_data.index,
|
13294
|
-
columns=non_numeric_data.columns,
|
14938
|
+
binned = pd.cut(
|
14939
|
+
col, bins=bin_edges, labels=labels, right=right, include_lowest=True
|
13295
14940
|
)
|
13296
|
-
|
13297
|
-
|
14941
|
+
return binned, labels
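With default settings and integer edges of width 200 on [0, 1000], include_overflow=True and right=False, the generated labels come out roughly as "[0, 200)", "[200, 400)", ..., "[800, 1000)", ">1000". A minimal reconstruction of the two label shapes (values invented):

# Interior bin with right=False, and the overflow bin up to +inf:
left, right_ = 200, 400
print(f"[{left}, {right_})")   # -> "[200, 400)"
print(f">{1000}")              # -> ">1000"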
|
14942
|
+
|
14943
|
+
|
14944
|
+
def _handle_na_values(
|
14945
|
+
col: pd.Series, na_action: str, na_fill_value: Optional[str]
|
14946
|
+
) -> pd.Series:
|
14947
|
+
if na_action == "drop":
|
14948
|
+
return col.dropna()
|
14949
|
+
elif na_action == "fill" and na_fill_value is not None:
|
14950
|
+
return col.fillna(na_fill_value)
|
14951
|
+
return col
|
14952
|
+
|
14953
|
+
|
14954
|
+
def _add_statistical_labels(
|
14955
|
+
binned: pd.Categorical,
|
14956
|
+
labels: List[str],
|
14957
|
+
show_count: bool,
|
14958
|
+
show_percentage: bool,
|
14959
|
+
show_total_count: bool,
|
14960
|
+
symbol_count: str,
|
14961
|
+
symbol_percentage: str,
|
14962
|
+
symbol_total_count: str,
|
14963
|
+
sep_between: str,
|
14964
|
+
) -> List[str]:
|
14965
|
+
"""Add statistical information with accurate counts."""
|
14966
|
+
# Get counts by matching the exact bin intervals
|
14967
|
+
value_counts = binned.value_counts()
|
14968
|
+
total = len(binned.dropna())
|
14969
|
+
|
14970
|
+
new_labels = []
|
14971
|
+
for i, (label, category) in enumerate(zip(labels, binned.cat.categories)):
|
14972
|
+
count = value_counts.get(category, 0)
|
14973
|
+
parts = [label]
|
14974
|
+
|
14975
|
+
if show_count:
|
14976
|
+
parts.append(f"{symbol_count}{count}")
|
14977
|
+
if show_percentage:
|
14978
|
+
percentage = (count / total * 100) if total > 0 else 0
|
14979
|
+
parts.append(f"{percentage:.1f}{symbol_percentage}")
|
14980
|
+
if show_total_count:
|
14981
|
+
parts.append(f"{symbol_total_count}{total}")
|
14982
|
+
|
14983
|
+
# Ensure unique labels
|
14984
|
+
new_label = sep_between.join(parts)
|
14985
|
+
if new_label in new_labels:
|
14986
|
+
new_label = f"{new_label}_{i}"
|
14987
|
+
new_labels.append(new_label)
|
14988
|
+
|
14989
|
+
return new_labels
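With the default symbols, a label augmented by this helper reads as below (a minimal reconstruction; the counts are invented):

# How the parts are joined with the defaults symbol_count="n=", symbol_percentage="%",
# symbol_total_count="∑n=" and sep_between=" | ":
label, count, total = "[0, 200)", 42, 200
parts = [label, f"n={count}", f"{count / total * 100:.1f}%", f"∑n={total}"]
print(" | ".join(parts))  # -> "[0, 200) | n=42 | 21.0% | ∑n=200"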
+
+
+    def _sort_bin_labels(binned: pd.Categorical, labels: List[str]) -> pd.Categorical:
+        try:
+            # Attempt to sort by the underlying intervals
+            sorted_categories = sorted(binned.cat.categories)
+            binned = binned.cat.reorder_categories(sorted_categories, ordered=True)
+        except Exception:
+            # If sorting fails (e.g., string labels), fallback to given label order
+            binned = binned.cat.set_categories(labels, ordered=True)
+        return binned
+
+    # Input validation
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' not found in DataFrame")

-        )
+    if not inplace:
+        df = df.copy()

+    col_data = df[column]
+
+    # Determine column type
+    if is_datetime64_any_dtype(col_data):
+        col_type = "datetime"
+        col_data = pd.to_datetime(col_data)
+    elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == "object":
+        col_type = "categorical"
+    elif is_numeric_dtype(col_data):
+        col_type = "numeric"
     else:
+        raise TypeError(f"Unsupported column type: {col_data.dtype}")
+
+    # Handle dictionary bin specification
+    if isinstance(bins, dict):
+        range_start = bins.get("start", range_start)
+        range_end = bins.get("end", range_end)
+        step = bins.get("step", step)
+        bins = None
+
+    # Process based on column type
+    if col_type == "datetime":
+        binned, bin_labels = _process_datetime_column(
+            col_data,
+            bins,
+            range_start,
+            range_end,
+            step,
+            labels,
+            label_format,
+            datetime_format,
+            right,
+            include_underflow,
+            include_overflow,
+        )
+    elif col_type == "categorical":
+        binned, bin_labels = _process_categorical_column(
+            col_data, bins, labels, categorical_agg
+        )
+    else:
+        binned, bin_labels = _process_numeric_column(
+            col_data,
+            bins,
+            range_start,
+            range_end,
+            step,
+            labels,
+            label_format,
+            precision,
+            right,
+            include_underflow,
+            include_overflow,
+        )

+    # Handle NA values
+    binned = _handle_na_values(binned, na_action, na_fill_value)
+
+    # Add statistical information to labels if requested
+    if show_count or show_percentage or show_total_count:
+        bin_labels = _add_statistical_labels(
+            binned,
+            bin_labels,
+            show_count,
+            show_percentage,
+            show_total_count,
+            symbol_count,
+            symbol_percentage,
+            symbol_total_count,
+            sep_between,
+        )
+        binned = binned.cat.rename_categories(
+            dict(zip(binned.cat.categories, bin_labels))
+        )

-    # "B": [np.nan, 2, 3, 4, np.nan],
-    # "C": [1, np.nan, 3, 4, 5],
-    # "D": [1, 2, 3, 4, np.nan],
-    # }
+    # Sort labels if requested
+    if sort_labels and not right and len(bin_labels) > 1:
+        binned = _sort_bin_labels(binned, bin_labels)

-    # "median",
-    # "most_frequent",
-    # "constant",
-    # "knn",
-    # "iterative",
-    # # "missforest",
-    # # "softimpute",
-    # # "svd",
-    # ]
+    # Create final output column
+    new_col = new_col_name or f"{column}_binned"
+    df[new_col] = binned.astype(dtype) if dtype else binned

+    if drop_original:
+        df.drop(columns=[column], inplace=True)
+
+    return None if inplace else df
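The body above validates the target column, dispatches on its dtype, optionally reads the bin specification from a dict ({"start", "end", "step"}), decorates the labels, and writes the result to a new `<column>_binned` column (or back into the frame when inplace=True). The enclosing function's name is not visible in this hunk, so the call below is purely hypothetical, with df_bin standing in for it:

    # hypothetical sketch -- "df_bin" is only a placeholder name for the enclosing function
    df = pd.DataFrame({"age": [3, 17, 25, 42, 67]})
    out = df_bin(df, column="age", bins={"start": 0, "end": 80, "step": 20},
                 show_count=True, inplace=False)
    # expected: a copy of df with an extra "age_binned" column of interval labels plus counts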

-    # for method_name in methods:
-    #     print(method_name)
-    #     display(df)
-    #     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))


 def df_encoder(
@@ -14580,209 +16335,213 @@ def df_reducer(

 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)
+


-def get_df_format(data, threshold_unique=0.5, verbose=False):
+def get_df_format(data, threshold_unique=0.5, verbose=False, sample_size=1000):
     """
+    Detect whether a DataFrame is in long or wide format with optimized performance and accuracy.
+
     Parameters:
-    - data (pd.DataFrame): DataFrame to
-    - threshold_unique (float):
+    - data (pd.DataFrame): DataFrame to analyze
+    - threshold_unique (float): Threshold for categorical column detection (0-1)
+    - verbose (bool): Whether to print diagnostic messages
+    - sample_size (int): Maximum number of rows/columns to sample for large datasets
+
     Returns:
-    - "long" if detected as long format
+    - "long" if detected as long format
     - "wide" if detected as wide format
-    - "uncertain" if ambiguous
+    - "uncertain" if format is ambiguous
     """
+    import pandas as pd
+    import numpy as np
     from scipy.stats import entropy
     from sklearn.cluster import AgglomerativeClustering
     from sklearn.preprocessing import StandardScaler
-    # -----
-    if n_rows > fs:
-        if verbose:
-            print(f"Sampling {fs} rows from {n_rows} rows.")
-        data = data.sample(n=fs, random_state=1)
-    if n_cols > fs:
-        if verbose:
-            print(f"Using first {fs} columns out of {n_cols} columns.")
-        data = data.iloc[:, :fs]
+    from sklearn.metrics import pairwise_distances
+    from collections import Counter
+    import re
+    # ----- Initial Setup and Sampling -----
     n_rows, n_cols = data.shape
+    if verbose:
+        print(f"Initial shape: {n_rows} rows, {n_cols} columns")

+    # Sample data if too large
+    if n_rows > sample_size:
+        data = data.sample(n=sample_size, random_state=42)
+        n_rows = sample_size
+    if n_cols > sample_size:
+        data = data.iloc[:, :sample_size]
+        n_cols = sample_size
+
+    # Early exit for tiny datasets
+    if n_rows < 3 or n_cols < 3:
+        return "uncertain"
+
+    long_score = 0
+    wide_score = 0
+
+    # ----- Feature Extraction -----
+    # Basic statistics
+    row_col_ratio = n_rows / n_cols if n_cols != 0 else float('inf')
+
+    # Column types
+    numeric_cols = data.select_dtypes(include=np.number).columns
+    cat_cols = data.select_dtypes(include=['object', 'category']).columns
+    other_cols = [col for col in data.columns if col not in numeric_cols and col not in cat_cols]
+
+    # Unique value analysis
+    unique_counts = data.nunique(dropna=False)
     duplicate_ratio = 1 - unique_counts / n_rows
-    if verbose:
-        print(
-            "Lower duplicate ratio suggests long format (higher row variability)."
-        )
-    # Calculate entropy for categorical columns
-    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
-    if len(categorical_cols) > 0:
-        for col in categorical_cols:
-            counts = data[col].value_counts(normalize=True)
-            col_entropy = entropy(counts)
-            if col_entropy < 1.5:
-                long_score += 1
-                if verbose:
-                    print(
-                        f"Column '{col}' entropy suggests categorical, supporting long format."
-                    )
-            else:
-                wide_score += 1
-                if verbose:
-                    print(f"Column '{col}' entropy is higher, supporting wide format.")
-    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+
+    # Missing values
+    missing_per_row = data.isna().sum(axis=1)
+    missing_per_col = data.isna().sum()
+
+    # Column name patterns
     col_names = data.columns.astype(str)
+    has_suffix = sum(bool(re.search(r'(_\d+|\d+_?$)', col)) for col in col_names)
+    has_time = sum(bool(re.search(r'(^time|^date|^year|^month|^day|^t\d+)', col.lower())) for col in col_names)
+
+    # ----- Scoring Rules -----
+
+    # 1. Row-Column Ratio (weighted)
+    if row_col_ratio > 5:
+        long_score += 3
+        if verbose: print(f"High row/col ratio ({row_col_ratio:.1f}) → long +3")
+    elif row_col_ratio < 0.2:
+        wide_score += 3
+        if verbose: print(f"Low row/col ratio ({row_col_ratio:.1f}) → wide +3")
+    elif row_col_ratio > 2:
+        long_score += 1
+        if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → long +1")
+    elif row_col_ratio < 0.5:
+        wide_score += 1
+        if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → wide +1")
+
+    # 2. Duplication Patterns
+    high_dupe_cols = sum(duplicate_ratio > 0.3)
+    if high_dupe_cols > 0.6 * n_cols:
         wide_score += 2
+        if verbose: print(f"Many columns ({high_dupe_cols}/{n_cols}) with duplicates → wide +2")
+    elif high_dupe_cols < 0.2 * n_cols:
+        long_score += 1
+        if verbose: print(f"Few columns ({high_dupe_cols}/{n_cols}) with duplicates → long +1")
+
+    # 3. Categorical Column Analysis
+    if len(cat_cols) > 0:
+        # Entropy analysis
+        cat_entropies = []
+        for col in cat_cols:
+            counts = data[col].value_counts(normalize=True, dropna=False)
+            cat_entropies.append(entropy(counts))
+
+        avg_cat_entropy = np.mean(cat_entropies) if cat_entropies else 0
+        if avg_cat_entropy < 1.2:
             long_score += 2
+            if verbose: print(f"Low categorical entropy ({avg_cat_entropy:.2f}) → long +2")
+        elif avg_cat_entropy > 2:
+            wide_score += 1
+            if verbose: print(f"High categorical entropy ({avg_cat_entropy:.2f}) → wide +1")
+
+        # Entity identifier detection
+        if len(cat_cols) >= 2 and n_rows > 10:
+            dup_rows = data.duplicated(subset=cat_cols.tolist()[:2], keep=False).sum()
+            if dup_rows > 0.3 * n_rows:
+                long_score += 2
+                if verbose: print(f"Duplicate rows in categorical cols ({dup_rows}/{n_rows}) → long +2")
+
+    # 4. Column Name Patterns
+    if has_suffix > 0.4 * n_cols:
+        wide_score += 2
+        if verbose: print(f"Many suffix patterns ({has_suffix}/{n_cols}) → wide +2")
+    if has_time > 0.3 * n_cols:
+        wide_score += 1
+        if verbose: print(f"Time-like columns ({has_time}/{n_cols}) → wide +1")
+
+    # 5. Numeric Column Analysis (only if enough numeric columns)
+    if len(numeric_cols) > 2:
+        # Correlation analysis
         corr_matrix = data[numeric_cols].corr().abs()
-        avg_corr = (
-        if avg_corr > 0.6:
+        avg_corr = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)].mean()
+
+        if avg_corr > 0.5:
             wide_score += 2
+            if verbose: print(f"High numeric correlation ({avg_corr:.2f}) → wide +2")
+        elif avg_corr < 0.2:
+            long_score += 1
+            if verbose: print(f"Low numeric correlation ({avg_corr:.2f}) → long +1")
+
+        # Entropy analysis
+        try:
+            numeric_data = data[numeric_cols].dropna()
+            if len(numeric_data) > 10:
+                numeric_entropy = numeric_data.apply(lambda x: entropy(pd.cut(x, bins=min(10, len(x.unique()))).value_counts(normalize=True)))
+                if numeric_entropy.mean() < 1.5:
+                    wide_score += 1
+                    if verbose: print(f"Low numeric entropy ({numeric_entropy.mean():.2f}) → wide +1")
+        except Exception as e:
+            if verbose: print(f"Numeric entropy failed: {str(e)}")
+
+    # 6. Missing Value Patterns
+    missing_row_std = missing_per_row.std()
+    if missing_row_std < 1 and missing_per_row.mean() > 0.1 * n_cols:
         wide_score += 1
-    elif missing_patterns.mean() < 1:
+        if verbose: print(f"Uniform missing pattern (std={missing_row_std:.2f}) → wide +1")
+    elif missing_per_row.mean() < 0.05 * n_cols:
         long_score += 1
-    if len(numeric_cols) > 1 and n_rows > 5:
+        if verbose: print(f"Few missing values → long +1")
+
+    # 7. Advanced Clustering (only for medium/large datasets)
+    if len(numeric_cols) > 3 and n_rows > 10 and n_cols > 5:
         try:
+            # Efficient clustering with sampling
+            sample_data = data[numeric_cols].sample(n=min(100, n_rows), random_state=42)
+            scaled_data = StandardScaler().fit_transform(sample_data.dropna())
+
+            if scaled_data.shape[0] > 5:
+                # Column clustering
+                col_dist = pairwise_distances(scaled_data.T)
+                col_clusters = AgglomerativeClustering(n_clusters=2,
+                                                       affinity='precomputed',
+                                                       linkage='complete').fit(col_dist)
+                cluster_counts = Counter(col_clusters.labels_)
+                if max(cluster_counts.values()) > 0.7 * len(numeric_cols):
+                    wide_score += 2
+                    if verbose: print(f"Column clustering shows dominant group → wide +2")
+
+                # Row clustering
+                row_clusters = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+                row_cluster_counts = Counter(row_clusters.labels_)
+                if max(row_cluster_counts.values()) > 0.8 * scaled_data.shape[0]:
+                    wide_score += 1
+                    if verbose: print(f"Row clustering shows homogeneity → wide +1")
         except Exception as e:
-    if wide_score == long_score:
-        if n_cols > n_rows:
-            wide_score += 1
-            if verbose:
-                print(
-                    "Tie-breaking based on column-major structure, favoring wide format."
-                )
-        elif n_rows > n_cols:
-            long_score += 1
-            if verbose:
-                print(
-                    "Tie-breaking based on row-major structure, favoring long format."
-                )
-        else:
-            if verbose:
-                print("Tie-breaking inconclusive; returning 'uncertain'.")
-            return "uncertain"
-
-    # Final decision
-    if wide_score > long_score:
-        if verbose:
-            print("Final decision: Wide format.")
-        return "wide"
-    elif long_score > wide_score:
-        if verbose:
-            print("Final decision: Long format.")
-        return "long"
+            if verbose: print(f"Clustering skipped: {str(e)}")
+
+    # ----- Decision Logic -----
+    score_diff = long_score - wide_score
+    abs_diff = abs(score_diff)
+
+    if verbose:
+        print(f"\nFinal scores - Long: {long_score}, Wide: {wide_score}")
+
+    if abs_diff >= 3:
+        return "long" if score_diff > 0 else "wide"
+    elif abs_diff >= 1:
+        # Additional tie-breakers
+        if score_diff == 0:
+            if row_col_ratio > 1.5:
+                return "long"
+            elif row_col_ratio < 0.67:
+                return "wide"
+            elif len(cat_cols) > len(numeric_cols):
+                return "long"
+            else:
+                return "wide"
+        return "long" if score_diff > 0 else "wide"
     else:
-        if verbose:
-            print("Final decision: Uncertain format.")
         return "uncertain"
 #! ========== workbook, worksheet, wb,ws =============

 import openpyxl
@@ -15917,7 +17676,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):
 def use_pd(
     func_name="excel",
     verbose=True,
-    dir_json="
+    dir_json="./data/usages_pd.json",
 ):
     try:
         default_settings = fload(dir_json, output="json")
@@ -17221,3 +18980,290 @@ def set_theme(
         color_codes=color_codes,
         rc=rc_params,
     )
+
+
+def df_wide_long(df):
+    rows, columns = df.shape
+    if columns > rows:
+        return "Wide"
+    elif rows > columns:
+        return "Long"
+
+def df2array(data: pd.DataFrame, x=None, y=None, hue=None, sort=False):
+
+    def sort_rows_move_nan(arr, sort=False):
+        # Handle edge cases where all values are NaN
+        if np.all(np.isnan(arr)):
+            return arr  # Return unchanged if the entire array is NaN
+
+        if sort:
+            # Replace NaNs with a temporary large value for sorting
+            temp_value = (
+                np.nanmax(arr[np.isfinite(arr)]) + 1 if np.any(np.isfinite(arr)) else np.inf
+            )
+            arr_no_nan = np.where(np.isnan(arr), temp_value, arr)
+
+            # Sort each row
+            sorted_arr = np.sort(arr_no_nan, axis=1)
+
+            # Move NaNs to the end
+            result_arr = np.where(sorted_arr == temp_value, np.nan, sorted_arr)
+        else:
+            result_rows = []
+            for row in arr:
+                # Separate non-NaN and NaN values
+                non_nan_values = row[~np.isnan(row)]
+                nan_count = np.isnan(row).sum()
+                # Create a new row with non-NaN values followed by NaNs
+                new_row = np.concatenate([non_nan_values, [np.nan] * nan_count])
+                result_rows.append(new_row)
+            # Convert the list of rows back into a 2D NumPy array
+            result_arr = np.array(result_rows)
+
+        # Remove rows/columns that contain only NaNs
+        clean_arr = result_arr[~np.isnan(result_arr).all(axis=1)]
+        clean_arr_ = clean_arr[:, ~np.isnan(clean_arr).all(axis=0)]
+
+        return clean_arr_
+    # data = data.copy()
+    # data[y] = pd.to_numeric(data[y], errors="coerce")
+    # data = data.dropna(subset=[y])
+    if hue is None:
+        a = []
+        if sort:
+            cat_x = np.sort(data[x].unique().tolist()).tolist()
+        else:
+            cat_x = data[x].unique().tolist()
+        for i, x_ in enumerate(cat_x):
+            new_ = data.loc[data[x] == x_, y].to_list()
+            a = padcat(a, new_, axis=0)
+        return sort_rows_move_nan(a).T
+    else:
+        a = []
+        if sort:
+            cat_x = np.sort(data[x].unique().tolist()).tolist()
+            cat_hue = np.sort(data[hue].unique().tolist()).tolist()
+        else:
+            cat_x = data[x].unique().tolist()
+            cat_hue = data[hue].unique().tolist()
+        for i, x_ in enumerate(cat_x):
+            for j, hue_ in enumerate(cat_hue):
+                new_ = data.loc[(data[x] == x_) & (data[hue] == hue_), y].to_list()
+                a = padcat(a, new_, axis=0)
+        return sort_rows_move_nan(a).T
+
+
+def array2df(data: np.ndarray):
+    df = pd.DataFrame()
+    df["group"] = (
+        np.tile(
+            ["group" + str(i) for i in range(1, data.shape[1] + 1)], [data.shape[0], 1]
+        )
+        .reshape(-1, 1, order="F")[:, 0]
+        .tolist()
+    )
+    df["value"] = data.reshape(-1, 1, order="F")
+    return df
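A small round trip through the two converters above (df2array relies on padcat, which is defined just below in this diff; toy data):

    import numpy as np
    import pandas as pd

    long_df = pd.DataFrame({"group": ["a", "a", "b", "b", "b"],
                            "value": [1.0, 2.0, 3.0, 4.0, 5.0]})
    arr = df2array(data=long_df, x="group", y="value")
    # one column per level of "group", padded with NaN where group sizes differ:
    # [[ 1.,  3.],
    #  [ 2.,  4.],
    #  [nan,  5.]]
    back = array2df(arr)
    # back: expected to be a long DataFrame with columns "group" ("group1", "group2") and "value"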
+
+
+def padcat(*args, fill_value=np.nan, axis=1, order="row"):
+    """
+    Concatenate vectors with padding.
+
+    Parameters:
+    *args : variable number of list or 1D arrays
+        Input arrays to concatenate.
+    fill_value : scalar, optional
+        The value to use for padding the shorter lists (default is np.nan).
+    axis : int, optional
+        The axis along which to concatenate (0 for rows, 1 for columns, default is 1).
+    order : str, optional
+        The order for flattening when required: "row" or "column" (default is "row").
+
+    Returns:
+    np.ndarray
+        A 2D array with the input arrays concatenated along the specified axis,
+        padded with fill_value where necessary.
+
+    # Example usage:
+    a = [1, np.nan]
+    b = [1, 3, 4, np.nan, 2, np.nan]
+    c = [1, 2, 3, 4, 5, 6, 7, 8, 10]
+    d = padcat(a, b)
+    result1 = padcat(d, c)
+    result2 = padcat(a, b, c)
+    print("Result of padcat(d, c):\n", result1)
+    print("Result of padcat(a, b, c):\n", result2)
+    """
+    # Set the order for processing
+    if "ro" in order.lower():
+        order = "C"  # row-major order
+    else:
+        order = "F"  # column-major order
+
+    # Process input arrays based on their dimensions
+    processed_arrays = []
+    for arg in args:
+        arr = np.asarray(arg)
+        if arr.ndim == 1:
+            processed_arrays.append(arr)  # Keep 1D arrays as is
+        elif arr.ndim == 2:
+            if axis == 0:
+                # If concatenating along rows, split 2D arrays into 1D arrays row-wise
+                processed_arrays.extend(arr)
+            elif axis == 1:
+                # If concatenating along columns, split 2D arrays into 1D arrays column-wise
+                processed_arrays.extend(arr.T)
+            else:
+                raise ValueError("axis must be 0 or 1")
+        else:
+            raise ValueError("Input arrays must be 1D or 2D")
+
+    if axis == 0:
+        # Concatenate along rows
+        max_len = max(arr.size for arr in processed_arrays)
+        result = np.full((len(processed_arrays), max_len), fill_value)
+        for i, arr in enumerate(processed_arrays):
+            result[i, : arr.size] = arr
+    elif axis == 1:
+        # Concatenate along columns
+        max_len = max(arr.size for arr in processed_arrays)
+        result = np.full((max_len, len(processed_arrays)), fill_value)
+        for i, arr in enumerate(processed_arrays):
+            result[: arr.size, i] = arr
+    else:
+        raise ValueError("axis must be 0 or 1")
+
+    return result
+
+
+# ========== memory cleaner ==========
+import gc
+import os
+import sys
+import psutil
+import platform
+import ctypes
+import subprocess
+import warnings
+import time
+
+class MemoryOptimizer:
+    def __init__(self, verbose: bool = True, aggressive_mode: bool = True):
+        self.verbose = verbose
+        self.aggressive_mode = aggressive_mode
+        self.system = platform.system()
+        self.process = psutil.Process(os.getpid())
+        self.start_time = time.time()
+        self.memory_history = []
+
+    def log(self, msg: str, level: str = "INFO"):
+        if self.verbose:
+            rss = self.process.memory_info().rss / (1024 ** 2)
+            elapsed = time.time() - self.start_time
+            print(f"[{level}][{elapsed:.2f}s][{rss:.1f}MB] {msg}")
+
+    def collect_garbage(self):
+        self.log("Performing deep garbage collection...")
+        stats = {}
+        before_mem = self.process.memory_info().rss
+        for gen in reversed(range(3)):
+            collected = gc.collect(gen)
+            self.log(f"GC Gen {gen}: Collected {collected}")
+        gc.garbage.clear()
+        after_mem = self.process.memory_info().rss
+        stats['freed_mb'] = (before_mem - after_mem) / (1024 ** 2)
+        return stats
+
+    def clear_frameworks(self):
+        result = {}
+        try:
+            import torch
+            if torch.cuda.is_available():
+                self.log("Clearing PyTorch cache...")
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+                result['pytorch'] = 'cleared'
+        except Exception as e:
+            self.log(f"PyTorch skipped: {e}", "WARNING")
+
+        try:
+            import tensorflow as tf
+            self.log("Clearing TensorFlow session...")
+            tf.keras.backend.clear_session()
+            result['tensorflow'] = 'cleared'
+        except Exception as e:
+            self.log(f"TensorFlow skipped: {e}", "WARNING")
+
+        try:
+            import cv2
+            self.log("Closing OpenCV windows...")
+            cv2.destroyAllWindows()
+            result['opencv'] = 'cleared'
+        except Exception:
+            pass
+
+        try:
+            import matplotlib.pyplot as plt
+            self.log("Closing matplotlib figures...")
+            plt.close('all')
+            result['matplotlib'] = 'cleared'
+        except Exception:
+            pass
+
+        return result
+
+    def clear_system_caches(self):
+        result = {}
+        self.log("Attempting full system cache clearance...")
+        try:
+            if self.system == "Linux":
+                subprocess.run(["sync"], check=True)
+                subprocess.run(["sudo", "sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"], check=True)
+                result['linux'] = 'caches dropped'
+            elif self.system == "Darwin":
+                subprocess.run(["sudo", "purge"], check=True)
+                result['macos'] = 'purge run'
+            elif self.system == "Windows":
+                ctypes.windll.psapi.EmptyWorkingSet(-1)
+                if self.aggressive_mode:
+                    ctypes.windll.kernel32.SetProcessWorkingSetSizeEx(
+                        -1, ctypes.c_size_t(-1), ctypes.c_size_t(-1), ctypes.c_uint(0x1)
+                    )
+                result['windows'] = 'working set emptied'
+        except Exception as e:
+            self.log(f"System cache clearing failed: {e}", "ERROR")
+        return result
+
+    def profile(self) -> Dict[str, Any]:
+        mem = self.process.memory_info()
+        vm = psutil.virtual_memory()
+        profile = {
+            'rss_mb': mem.rss / (1024 ** 2),
+            'vms_mb': mem.vms / (1024 ** 2),
+            'used_gb': vm.used / (1024 ** 3),
+            'available_gb': vm.available / (1024 ** 3),
+            'percent': vm.percent,
+        }
+        self.memory_history.append(profile)
+        return profile
+
+    def optimize(self) -> Dict[str, Any]:
+        result = {}
+        result['before'] = self.profile()
+        result['gc'] = self.collect_garbage()
+        result['frameworks'] = self.clear_frameworks()
+        result['system'] = self.clear_system_caches()
+        result['after'] = self.profile()
+        saved = result['before']['rss_mb'] - result['after']['rss_mb']
+        result['saved_mb'] = saved
+        result['saved_percent'] = (saved / result['before']['rss_mb']) * 100 if result['before']['rss_mb'] else 0
+        self.log(f"Optimization complete: Saved {saved:.2f} MB ({result['saved_percent']:.1f}%)", "SUCCESS")
+        return result
+
+
+def cleaner(verbose: bool = True, aggressive: bool = True) -> Dict[str, Any]:
+    optimizer = MemoryOptimizer(verbose=verbose, aggressive_mode=aggressive)
+    return optimizer.optimize()
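Typical use of the cleaner() entry point added above (requires psutil; the system-cache branch may prompt for sudo on Linux/macOS, and framework caches are only touched when those libraries are installed):

    report = cleaner(verbose=True, aggressive=False)
    print(f"gc freed: {report['gc'].get('freed_mb', 0):.1f} MB")
    print(f"RSS change: {report['saved_mb']:.1f} MB ({report['saved_percent']:.1f}%)")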