py2ls 0.2.5.12__py3-none-any.whl → 0.2.5.15__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
py2ls/ips.py CHANGED
@@ -1,19 +1,18 @@
1
1
  from tkinter import FALSE
2
2
  import numpy as np
3
3
  import pandas as pd
4
- import sys
5
- import os
4
+ import sys # built-in
5
+ import os # built-in
6
6
  from IPython.display import display
7
7
  import shutil
8
8
  import logging
9
9
  from pathlib import Path
10
10
  from datetime import datetime, date, time
11
- import re
11
+ import re # built-in
12
12
  import stat
13
13
  import platform
14
14
 
15
- from typing import Dict, List, Optional, Union, Any,Tuple
16
-
15
+ from typing import Dict, List, Optional, Union, Any, Tuple, Literal, Callable, Set
17
16
  from regex import X
18
17
 
19
18
  try:
@@ -27,7 +26,218 @@ import warnings
27
26
  warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
28
27
  warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
29
28
  warnings.filterwarnings("ignore")
29
+ try:
30
+ import pkg_resources
31
+ except ImportError:
32
+ pkg_resources = None
33
+ import glob # built-in
34
+ import pkg_resources # from setuptools (guarded import above), not built-in
35
+ class PkgManager:
36
+ """
37
+ PkgManager.uninstall("py2ls")
38
+ PkgManager.uninstall("py2ls", mode="startswith")
39
+ PkgManager.uninstall("py2ls", mode="endswith")
40
+ PkgManager.uninstall("py2ls", mode="contains")
41
+ PkgManager.uninstall("py2ls", mode="regex")
42
+
43
+ PkgManager.timemachine()
44
+ """
45
+
46
+ @staticmethod
47
+ def uninstall(
48
+ kw: Union[str, List[str]],
49
+ mode: str = "exact",
50
+ dry_run: bool = False,
51
+ make_backup: bool = True,
52
+ make_log: bool = True,
53
+ station: Optional[str] = None,
54
+ ) -> None:
55
+ if station is None:
56
+ station = os.path.dirname(os.path.dirname(sys.executable))
57
+ os.makedirs(station, exist_ok=True)
58
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
59
+
60
+ if isinstance(kw, str):
61
+ kw = [kw]
62
+ kw = [k.lower() for k in kw] if mode != "regex" else kw
63
+ mode = mode.lower()
64
+ valid_modes = {"exact", "startswith", "endswith", "contains", "regex"}
65
+ if mode not in valid_modes:
66
+ raise ValueError(f"Mode must be one of {valid_modes}")
67
+
68
+ installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
69
+ matched: Set[str] = set()
70
+
71
+ for name in installed_packages:
72
+ for key in kw:
73
+ if (
74
+ (mode == "exact" and name == key)
75
+ or (mode == "startswith" and name.startswith(key))
76
+ or (mode == "endswith" and name.endswith(key))
77
+ or (mode == "contains" and key in name)
78
+ or (mode == "regex" and re.search(key, name))
79
+ ):
80
+ matched.add(name)
81
+ break
82
+
83
+ if not matched:
84
+ print("No packages matched the criteria.")
85
+ return
86
+
87
+ if make_backup and not dry_run:
88
+ backup_path = os.path.join(station, f"requirements_backup_{timestamp}.txt")
89
+ with open(backup_path, "w") as f:
90
+ subprocess.run(["pip", "freeze"], stdout=f, check=True)
91
+ print(f"Backup created at: '{backup_path}'")
92
+
93
+ if dry_run:
94
+ print("[DRY RUN] The following packages would be uninstalled:")
95
+ for pkg in sorted(matched):
96
+ print(f" - {pkg}=={installed_packages[pkg]}")
97
+ return
98
+
99
+ print(f"[UNINSTALLING] {len(matched)} packages:")
100
+ for pkg in sorted(matched):
101
+ print(f" - {pkg}=={installed_packages[pkg]}")
102
+ subprocess.run(["pip", "uninstall", "-y", pkg], check=True)
103
+
104
+ if make_log:
105
+ log_path = os.path.join(station, f"uninstall_{timestamp}.txt")
106
+ with open(log_path, "w") as f:
107
+ f.write(f"# Uninstallation log created at {timestamp}\n")
108
+ f.write(f"# Mode: {mode}, Keywords: {kw}\n\n")
109
+ for pkg in sorted(matched):
110
+ f.write(f"{pkg}=={installed_packages[pkg]}\n")
111
+ print(f"Log written to '{log_path}'")
112
+
113
+ @staticmethod
114
+ def list_backups(station: Optional[str] = None) -> List[str]:
115
+ if station is None:
116
+ station = os.path.dirname(sys.executable)
117
+ if os.name == "nt":
118
+ station = os.path.dirname(station)
119
+ return sorted(glob.glob(os.path.join(station, "requirements_backup_*.txt")))
120
+
121
+ @staticmethod
122
+ def list_logs(station: Optional[str] = None) -> List[str]:
123
+ if station is None:
124
+ station = os.path.dirname(sys.executable)
125
+ if os.name == "nt":
126
+ station = os.path.dirname(station)
127
+ return sorted(glob.glob(os.path.join(station, "uninstall_*.txt")))
128
+
129
+ @staticmethod
130
+ def restore(
131
+ timestamp: Optional[str] = None,
132
+ station: Optional[str] = None,
133
+ dry_run: bool = False,
134
+ ) -> None:
135
+ if station is None:
136
+ station = os.path.dirname(sys.executable)
137
+ if os.name == "nt":
138
+ station = os.path.dirname(station)
139
+
140
+ backups = PkgManager.list_backups(station)
141
+ logs = PkgManager.list_logs(station)
142
+
143
+ if not timestamp:
144
+ print("Available restore points:\n\nBackups:")
145
+ for i, backup in enumerate(backups, 1):
146
+ ts = os.path.basename(backup)[20:-4]  # strip "requirements_backup_" and ".txt"
147
+ print(f" {i}. {ts} (backup)")
148
+ print("\nUninstall logs:")
149
+ for i, log in enumerate(logs, len(backups) + 1):
150
+ ts = os.path.basename(log)[10:-4]
151
+ print(f" {i}. {ts} (log)")
152
+ print("\nSpecify timestamp or selection number to restore.")
153
+ return
154
+
155
+ try:
156
+ selection = int(timestamp)
157
+ all_files = backups + logs
158
+ if 1 <= selection <= len(all_files):
159
+ file_path = all_files[selection - 1]
160
+ is_log = selection > len(backups)
161
+ else:
162
+ raise ValueError("Invalid selection number")
163
+ except ValueError:
164
+ backup_pattern = os.path.join(
165
+ station, f"requirements_backup_{timestamp}.txt"
166
+ )
167
+ log_pattern = os.path.join(station, f"uninstall_{timestamp}.txt")
168
+ matching_backups = glob.glob(backup_pattern)
169
+ matching_logs = glob.glob(log_pattern)
170
+
171
+ if matching_backups:
172
+ file_path = matching_backups[0]
173
+ is_log = False
174
+ elif matching_logs:
175
+ file_path = matching_logs[0]
176
+ is_log = True
177
+ else:
178
+ print(f"No backup or log found for timestamp: {timestamp}")
179
+ return
180
+
181
+ with open(file_path, "r") as f:
182
+ packages = [
183
+ line.strip() for line in f if line.strip() and not line.startswith("#")
184
+ ]
185
+
186
+ if dry_run:
187
+ print(
188
+ f"[DRY RUN] Would restore {len(packages)} packages from:\n {file_path}"
189
+ )
190
+ for pkg in packages:
191
+ print(f" - {pkg}")
192
+ return
193
+
194
+ print(f"[RESTORING] {len(packages)} packages from:\n {file_path}")
195
+ for pkg in packages:
196
+ print(f" - Installing {pkg}")
197
+ subprocess.run(["pip", "install", pkg], check=True)
30
198
 
199
+ @staticmethod
200
+ def timemachine(station: Optional[str] = None) -> None:
201
+ if station is None:
202
+ station = os.path.dirname(sys.executable)
203
+ if os.name == "nt":
204
+ station = os.path.dirname(station)
205
+
206
+ backups = PkgManager.list_backups(station)
207
+ logs = PkgManager.list_logs(station)
208
+
209
+ if not backups and not logs:
210
+ print("No backup or log files found.")
211
+ return
212
+
213
+ print("\nTime Machine - Available Restore Points:")
214
+ print("--------------------------------------")
215
+ print("\nBackups (complete environment snapshots):")
216
+ for i, backup in enumerate(backups, 1):
217
+ ts = os.path.basename(backup)[20:-4]  # strip "requirements_backup_" and ".txt"
218
+ print(f" {i}. {ts}")
219
+ print("\nUninstall Logs (specific package lists):")
220
+ for i, log in enumerate(logs, len(backups) + 1):
221
+ ts = os.path.basename(log)[10:-4]
222
+ print(f" {i}. {ts}")
223
+ print("\n0. Exit Time Machine")
224
+
225
+ while True:
226
+ try:
227
+ choice = input("\nSelect a restore point (number) or '0' to exit: ")
228
+ if choice == "0":
229
+ return
230
+ selection = int(choice)
231
+ all_files = backups + logs
232
+ if 1 <= selection <= len(all_files):
233
+ file_path = all_files[selection - 1]
234
+ timestamp = os.path.basename(file_path).split("_")[-1][:-4]
235
+ PkgManager.restore(timestamp, station)
236
+ return
237
+ else:
238
+ print("Invalid selection. Please try again.")
239
+ except ValueError:
240
+ print("Please enter a valid number.")
31
241
 
32
242
  def _yaoshi_fernet(mima="mimashigudingde",yan=b"mimashigudingde",verbose=True):
33
243
  import base64
@@ -1663,76 +1873,465 @@ def flatten(nested: Any, unique_list=True, verbose=False):
1663
1873
  return flattened_list
1664
1874
 
1665
1875
 
1666
- # def strcmp(
1667
- # search_term,
1668
- # candidates,
1669
- # ignore_case=True,
1670
- # get_rank=False,
1671
- # verbose=False,
1672
- # scorer="WR",
1673
- # method=None,
1674
- # ):
1675
- # """
1676
- # Compares a search term with a list of candidate strings and finds the best match based on similarity score.
1876
+ #! ===========extract_text===========
1877
+ def extract_text(
1878
+ text: Union[str, List[str]],
1879
+ patterns: Union[str, List[str]],
1880
+ *,
1881
+ mode: Literal["between", "split", "extract"] = "between",
1882
+ keep: Literal["none", "left", "right", "both", "markers"] = "none",
1883
+ case: Literal["sensitive", "insensitive"] = "insensitive",
1884
+ all_matches: bool = False,
1885
+ positions: bool = False,
1886
+ regex: bool = False,
1887
+ delimiter: Optional[str] = None,
1888
+ trim: bool = True,
1889
+ as_dict: bool = False,
1890
+ verbose: bool = False,
1891
+ **kwargs,
1892
+ ) -> Union[List[str], Tuple[int, str], Dict[str, Any], List[Dict[str, Any]], None]:
1893
+ """
1894
+ Ultimate text extraction tool with enhanced reliability and features.
1677
1895
 
1678
- # Parameters:
1679
- # search_term (str): The term to be searched for.
1680
- # candidates (list of str): A list of candidate strings to compare against the search term.
1681
- # ignore_case (bool): If True, the comparison ignores case differences.
1682
- # verbose (bool): If True, prints the similarity score and the best match.
1896
+ Key improvements:
1897
+ - Robust split mode with proper delimiter handling
1898
+ - Consistent return types across all modes
1899
+ - Improved pattern matching logic
1900
+ - Better edge case handling
1683
1901
 
1684
- # Returns:
1685
- # tuple: A tuple containing the best match and its index in the candidates list.
1686
- # """
1687
- # from fuzzywuzzy import fuzz, process
1688
-
1689
- # def to_lower(s, ignore_case=True):
1690
- # # Converts a string or list of strings to lowercase if ignore_case is True.
1691
- # if ignore_case:
1692
- # if isinstance(s, str):
1693
- # return s.lower()
1694
- # elif isinstance(s, list):
1695
- # s = [str(i) for i in s] # convert all to str
1696
- # return [elem.lower() for elem in s]
1697
- # return s
1698
- # scorer = str(method).lower() if method is not None else scorer
1699
- # str1_, str2_ = to_lower(search_term, ignore_case), to_lower(candidates, ignore_case)
1700
- # if isinstance(str2_, list):
1701
- # if "part" in scorer.lower():
1702
- # similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
1703
- # elif "w" in scorer.lower():
1704
- # similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
1705
- # elif "ratio" in scorer.lower() or "stri" in scorer.lower(): # Ratio (Strictest)
1706
- # similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
1707
- # else:
1708
- # similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
1709
- # if get_rank:
1710
- # idx = [
1711
- # similarity_scores.index(i)
1712
- # for i in sorted(similarity_scores, reverse=True)
1713
- # ]
1714
- # if verbose:
1715
- # display([candidates[ii] for ii in idx])
1716
- # return [candidates[ii] for ii in idx]
1717
- # best_match_index = similarity_scores.index(max(similarity_scores))
1718
- # best_match_score = similarity_scores[best_match_index]
1719
- # else:
1720
- # best_match_index = 0
1721
- # if "part" in scorer.lower():
1722
- # best_match_score = fuzz.partial_ratio(str1_, str2_)
1723
- # elif "w" in scorer.lower():
1724
- # best_match_score = fuzz.WRatio(str1_, str2_)
1725
- # elif "Ratio" in scorer.lower():
1726
- # best_match_score = fuzz.ratio(str1_, str2_)
1727
- # else:
1728
- # best_match_score = fuzz.WRatio(str1_, str2_)
1729
- # if verbose:
1730
- # print(f"\nbest_match is: {candidates[best_match_index],best_match_score}")
1731
- # best_match = process.extract(search_term, candidates)
1732
- # print(f"建议: {best_match}")
1733
- # return candidates[best_match_index], best_match_index
1902
+
1903
+ print(extract_text("A,B,C", ",", mode="split", keep="none", all_matches=True))
1904
+ # Correctly returns: ['A', 'B', 'C']
1905
+
1906
+ print(extract_text("A,B,C", ",", mode="split", keep="left"))
1907
+ # Returns: ['A,', 'B,', 'C']
1908
+
1909
+ print(extract_text("A,B,C", ",", mode="split", keep="right"))
1910
+ # Returns: [',B', ',C']
1911
+
1912
+ print(extract_text("A,B,C", ",", mode="split", keep="both"))
1913
+ # Returns: ['A', ',', 'B', ',', 'C']
1914
+ """
1915
+ if verbose:
1916
+ print("""
1917
+ extract_text(
1918
+ text: Union[str, List[str]],
1919
+ patterns: Union[str, List[str]],
1920
+ *,
1921
+ mode: Literal["between", "split", "extract"] = "between",
1922
+ keep: Literal["none", "left", "right", "both", "markers"] = "none",
1923
+ case: Literal["sensitive", "insensitive"] = "insensitive",
1924
+ all_matches: bool = False,
1925
+ positions: bool = False,
1926
+ regex: bool = False,
1927
+ delimiter: Optional[str] = None,
1928
+ trim: bool = True,
1929
+ as_dict: bool = False,
1930
+ verbose: bool = False,
1931
+ **kwargs,
1932
+ )
1933
+ """)
1934
+ # Normalization and validation
1935
+ text = _normalize_text(text, delimiter)
1936
+ patterns = _validate_patterns(patterns)
1937
+ flags = re.IGNORECASE if case == "insensitive" else 0
1938
+
1939
+ # Find all matches with enhanced validation
1940
+ matches = _find_matches(text, patterns, regex, flags)
1941
+ if not matches:
1942
+ return None
1943
+
1944
+ # Mode-specific processing
1945
+ if mode == "extract":
1946
+ return _handle_extract(matches, all_matches, as_dict, positions, trim)
1947
+ elif mode == "split":
1948
+ return _handle_split(text, matches, keep, all_matches, as_dict, positions, trim)
1949
+ elif mode == "between":
1950
+ return _handle_between(text, matches, patterns, keep, as_dict, positions, trim)
1951
+ else:
1952
+ raise ValueError(f"Invalid mode: {mode}")
1953
+
1954
+
1955
+ def _normalize_text(text: Union[str, List[str]], delimiter: Optional[str]) -> str:
1956
+ """Normalize text input to single string"""
1957
+ if isinstance(text, list):
1958
+ return delimiter.join(text) if delimiter else " ".join(text)
1959
+ return text
1960
+
1961
+
1962
+ def _validate_patterns(patterns: Union[str, List[str]]) -> List[str]:
1963
+ """Validate and normalize patterns"""
1964
+ if isinstance(patterns, str):
1965
+ return [patterns]
1966
+ if not patterns:
1967
+ raise ValueError("At least one pattern required")
1968
+ return patterns
1969
+
1970
+
1971
+ def _find_matches(
1972
+ text: str, patterns: List[str], regex: bool, flags: int
1973
+ ) -> List[dict]:
1974
+ """Find all pattern matches with enhanced regex handling"""
1975
+ matches = []
1976
+ for pattern in patterns:
1977
+ try:
1978
+ search_pattern = pattern if regex else re.escape(pattern)
1979
+ for match in re.finditer(search_pattern, text, flags=flags):
1980
+ matches.append(
1981
+ {
1982
+ "text": match.group(),
1983
+ "start": match.start(),
1984
+ "end": match.end(),
1985
+ "pattern": pattern,
1986
+ "full_match": match,
1987
+ }
1988
+ )
1989
+ except re.error as e:
1990
+ raise ValueError(f"Invalid pattern '{pattern}': {e}")
1991
+ return sorted(matches, key=lambda x: x["start"])
1992
+
1993
+
1994
+ def _handle_extract(
1995
+ matches: List[dict], all_matches: bool, as_dict: bool, positions: bool, trim: bool
1996
+ ) -> Union[List, dict]:
1997
+ """Handle text extraction of matched patterns"""
1998
+ results = []
1999
+ for match in matches if all_matches else [matches[0]]:
2000
+ content = match["text"].strip() if trim else match["text"]
2001
+ result = (
2002
+ {
2003
+ "text": content,
2004
+ "start": match["start"],
2005
+ "end": match["end"],
2006
+ "pattern": match["pattern"],
2007
+ }
2008
+ if as_dict
2009
+ else content
2010
+ )
2011
+ if positions and as_dict:
2012
+ result["positions"] = [(match["start"], match["end"])]
2013
+ results.append(result)
2014
+
2015
+ return results[0] if not all_matches else results
2016
+
2017
+
2018
+ def _create_part(
2019
+ content: str,
2020
+ start: int,
2021
+ end: int,
2022
+ match: Optional[dict],
2023
+ as_dict: bool,
2024
+ positions: bool,
2025
+ trim: bool,
2026
+ ) -> Union[str, dict]:
2027
+ """Create a standardized result part"""
2028
+ content = content.strip() if trim else content
2029
+ if not as_dict:
2030
+ return content
2031
+
2032
+ part = {
2033
+ "text": content,
2034
+ "start": start,
2035
+ "end": end,
2036
+ "pattern": match["pattern"] if match else None,
2037
+ }
2038
+ if positions and match:
2039
+ part["positions"] = [(match["start"], match["end"])]
2040
+ return part
2041
+
2042
+
2043
+ def _handle_between(
2044
+ text: str,
2045
+ matches: List[dict],
2046
+ patterns: List[str],
2047
+ keep: str,
2048
+ as_dict: bool,
2049
+ positions: bool,
2050
+ trim: bool,
2051
+ ) -> Union[Tuple, dict]:
2052
+ """Reliable between-mode implementation with boundary checks"""
2053
+ first_pattern, last_pattern = patterns[0], patterns[-1]
2054
+ first_matches = [m for m in matches if m["pattern"] == first_pattern]
2055
+ last_matches = [m for m in matches if m["pattern"] == last_pattern]
2056
+
2057
+ if not first_matches or not last_matches:
2058
+ return None
2059
+
2060
+ first = first_matches[0]
2061
+ last = last_matches[-1]
2062
+
2063
+ if first["start"] > last["start"]:
2064
+ return None
2065
+
2066
+ # Calculate extraction window
2067
+ start, end = first["start"], last["end"]
2068
+ if keep == "none":
2069
+ start, end = first["end"], last["start"]
2070
+ elif keep == "left":
2071
+ end = last["start"]
2072
+ elif keep == "right":
2073
+ start = first["end"]
2074
+
2075
+ extracted = text[start:end].strip() if trim else text[start:end]
2076
+
2077
+ if as_dict:
2078
+ result = {
2079
+ "text": extracted,
2080
+ "start": start,
2081
+ "end": end,
2082
+ "patterns": patterns,
2083
+ "match_positions": [(m["start"], m["end"]) for m in matches],
2084
+ }
2085
+ return result
2086
+
2087
+ return (
2088
+ (start, extracted)
2089
+ if not positions
2090
+ else (start, extracted, [(m["start"], m["end"]) for m in matches])
2091
+ )
2092
+
2093
+
2094
+ def _handle_split(
2095
+ text: str,
2096
+ matches: List[dict],
2097
+ keep: str,
2098
+ all_matches: bool,
2099
+ as_dict: bool,
2100
+ positions: bool,
2101
+ trim: bool,
2102
+ ) -> Union[List, dict]:
2103
+ """Split text with proper handling of keep='both' to include delimiters on both sides"""
2104
+ if not matches:
2105
+ return (
2106
+ [text]
2107
+ if not as_dict
2108
+ else [{"text": text, "start": 0, "end": len(text), "pattern": None}]
2109
+ )
2110
+
2111
+ parts = []
2112
+ prev_end = 0
2113
+ process_matches = matches if all_matches else [matches[0]]
2114
+
2115
+ # Special handling for keep="both"
2116
+ if keep == "both":
2117
+ for i, match in enumerate(process_matches):
2118
+ start, end = match["start"], match["end"]
2119
+ matched_text = text[start:end]
2120
+
2121
+ # First segment (text before first delimiter + first delimiter)
2122
+ if i == 0:
2123
+ segment = text[prev_end:end] # From start to end of first delimiter
2124
+ if trim:
2125
+ segment = segment.strip()
2126
+ if segment or not trim:
2127
+ if as_dict:
2128
+ parts.append(
2129
+ {
2130
+ "text": segment,
2131
+ "start": prev_end,
2132
+ "end": end,
2133
+ "pattern": match["pattern"],
2134
+ **({"positions": [(start, end)]} if positions else {}),
2135
+ }
2136
+ )
2137
+ else:
2138
+ parts.append(segment)
2139
+ prev_end = end
2140
+
2141
+ # Middle segments (delimiter + text + next delimiter)
2142
+ if i > 0 and i < len(process_matches):
2143
+ next_match = process_matches[i]
2144
+ next_start, next_end = next_match["start"], next_match["end"]
2145
+ segment = text[
2146
+ prev_end:next_end
2147
+ ] # From prev_end to end of next delimiter
2148
+ if trim:
2149
+ segment = segment.strip()
2150
+ if segment or not trim:
2151
+ if as_dict:
2152
+ parts.append(
2153
+ {
2154
+ "text": segment,
2155
+ "start": prev_end,
2156
+ "end": next_end,
2157
+ "pattern": next_match["pattern"],
2158
+ **(
2159
+ {"positions": [(next_start, next_end)]}
2160
+ if positions
2161
+ else {}
2162
+ ),
2163
+ }
2164
+ )
2165
+ else:
2166
+ parts.append(segment)
2167
+ prev_end = next_end
2168
+
2169
+ # Last segment (last delimiter + remaining text)
2170
+ if process_matches and prev_end < len(text):
2171
+ last_match = process_matches[-1]
2172
+ segment = text[
2173
+ last_match["start"] : len(text)
2174
+ ] # From last delimiter to end
2175
+ if trim:
2176
+ segment = segment.strip()
2177
+ if segment or not trim:
2178
+ if as_dict:
2179
+ parts.append(
2180
+ {
2181
+ "text": segment,
2182
+ "start": last_match["start"],
2183
+ "end": len(text),
2184
+ "pattern": last_match["pattern"],
2185
+ **(
2186
+ {
2187
+ "positions": [
2188
+ (last_match["start"], last_match["end"])
2189
+ ]
2190
+ }
2191
+ if positions
2192
+ else {}
2193
+ ),
2194
+ }
2195
+ )
2196
+ else:
2197
+ parts.append(segment)
2198
+
2199
+ return parts
2200
+
2201
+ # Original handling for other keep modes
2202
+ for i, match in enumerate(process_matches):
2203
+ start, end = match["start"], match["end"]
2204
+ matched_text = text[start:end]
2205
+
2206
+ # Handle text before the match
2207
+ if prev_end < start:
2208
+ before = text[prev_end:start]
2209
+ if trim:
2210
+ before = before.strip()
2211
+ if before or not trim:
2212
+ if as_dict:
2213
+ parts.append(
2214
+ {
2215
+ "text": before,
2216
+ "start": prev_end,
2217
+ "end": start,
2218
+ "pattern": None,
2219
+ **({"positions": []} if positions else {}),
2220
+ }
2221
+ )
2222
+ else:
2223
+ parts.append(before)
2224
+
2225
+ # Handle the match based on keep mode
2226
+ if keep == "none":
2227
+ pass # Skip the delimiter
2228
+ elif keep == "left":
2229
+ if parts:
2230
+ if as_dict:
2231
+ parts[-1]["text"] += matched_text
2232
+ parts[-1]["end"] = end
2233
+ else:
2234
+ parts[-1] += matched_text
2235
+ else:
2236
+ if as_dict:
2237
+ parts.append(
2238
+ {
2239
+ "text": matched_text,
2240
+ "start": start,
2241
+ "end": end,
2242
+ "pattern": match["pattern"],
2243
+ **({"positions": [(start, end)]} if positions else {}),
2244
+ }
2245
+ )
2246
+ else:
2247
+ parts.append(matched_text)
2248
+ elif keep == "right":
2249
+ if i < len(process_matches) - 1:
2250
+ next_start = process_matches[i + 1]["start"]
2251
+ if end < next_start:
2252
+ between = text[end:next_start]
2253
+ if as_dict:
2254
+ parts.append(
2255
+ {
2256
+ "text": matched_text + between,
2257
+ "start": start,
2258
+ "end": next_start,
2259
+ "pattern": match["pattern"],
2260
+ **({"positions": [(start, end)]} if positions else {}),
2261
+ }
2262
+ )
2263
+ else:
2264
+ parts.append(matched_text + between)
2265
+ prev_end = next_start
2266
+ continue
2267
+
2268
+ prev_end = end
2269
+
2270
+ # Handle remaining text after last match
2271
+ if prev_end < len(text):
2272
+ remaining = text[prev_end:]
2273
+ if trim:
2274
+ remaining = remaining.strip()
2275
+ if remaining or not trim:
2276
+ if keep == "right" and parts and process_matches:
2277
+ last_match = process_matches[-1]
2278
+ matched_text = text[last_match["start"] : last_match["end"]]
2279
+ if as_dict:
2280
+ parts.append(
2281
+ {
2282
+ "text": matched_text + remaining,
2283
+ "start": last_match["start"],
2284
+ "end": len(text),
2285
+ "pattern": last_match["pattern"],
2286
+ **(
2287
+ {
2288
+ "positions": [
2289
+ (last_match["start"], last_match["end"])
2290
+ ]
2291
+ }
2292
+ if positions
2293
+ else {}
2294
+ ),
2295
+ }
2296
+ )
2297
+ else:
2298
+ parts.append(matched_text + remaining)
2299
+ else:
2300
+ if as_dict:
2301
+ parts.append(
2302
+ {
2303
+ "text": remaining,
2304
+ "start": prev_end,
2305
+ "end": len(text),
2306
+ "pattern": None,
2307
+ **({"positions": []} if positions else {}),
2308
+ }
2309
+ )
2310
+ else:
2311
+ parts.append(remaining)
2312
+
2313
+ # Filter empty parts if trimming
2314
+ if trim:
2315
+ parts = [p for p in parts if (p["text"].strip() if as_dict else p.strip())]
2316
+
2317
+ return parts
1734
2318
 
1735
2319
 
2320
+ def _merge_parts(
2321
+ parts: List[Union[str, dict]], text: str, as_dict: bool, trim: bool
2322
+ ) -> Union[str, dict]:
2323
+ """Merge adjacent parts for keep=left mode"""
2324
+ if as_dict:
2325
+ merged_text = "".join(p["text"] for p in parts)
2326
+ return {
2327
+ "text": merged_text.strip() if trim else merged_text,
2328
+ "start": parts[0]["start"],
2329
+ "end": parts[-1]["end"],
2330
+ "patterns": list(set(p["pattern"] for p in parts if p["pattern"])),
2331
+ }
2332
+ return "".join(parts).strip() if trim else "".join(parts)
2333
+ #! ===========extract_text===========
2334
+
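A short usage sketch for the new extract_text helper, based on the docstring above (the calls assume the helpers defined above are in scope; results are described rather than asserted):

    sample = "Results: [start] alpha, beta [end] tail"

    # between: returns (start_index, text between the first and last marker);
    # the markers themselves are dropped with the default keep="none"
    print(extract_text(sample, ["[start]", "[end]"], mode="between"))

    # extract: every regex match in the text, as a list of strings
    print(extract_text(sample, r"[a-z]+", mode="extract", regex=True, all_matches=True))

    # split: like str.split, but the delimiters can be kept on either side
    print(extract_text("A,B,C", ",", mode="split", keep="both", all_matches=True))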
1736
2335
  def strcmp(
1737
2336
  search_term: str,
1738
2337
  candidates: List[str],
@@ -2794,73 +3393,6 @@ def text2audio(
2794
3393
 
2795
3394
  # from datetime import datetime
2796
3395
  from dateutil import parser
2797
- # import re
2798
- # from typing import Union, Optional, Dict, Any
2799
- # def str2time(time_str, fmt="24"):
2800
- # """
2801
- # Convert a time string into the specified format.
2802
- # Parameters:
2803
- # - time_str (str): The time string to be converted.
2804
- # - fmt (str): The format to convert the time to. Defaults to '%H:%M:%S'.
2805
- # Returns:
2806
- # %I represents the hour in 12-hour format.
2807
- # %H represents the hour in 24-hour format (00 through 23).
2808
- # %M represents the minute.
2809
- # %S represents the second.
2810
- # %p represents AM or PM.
2811
- # - str: The converted time string.
2812
- # """
2813
- # from datetime import datetime
2814
-
2815
- # def time_len_corr(time_str):
2816
- # time_str_ = (
2817
- # ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
2818
- # )
2819
- # time_str_split = []
2820
- # [time_str_split.append(i) for i in time_str_ if is_num(i)]
2821
- # if time_str_split:
2822
- # if len(time_str_split) == 2:
2823
- # H, M = time_str_split
2824
- # time_str_full = H + ":" + M + ":00"
2825
- # elif len(time_str_split) == 3:
2826
- # H, M, S = time_str_split
2827
- # time_str_full = H + ":" + M + ":" + S
2828
- # else:
2829
- # time_str_full = time_str_
2830
- # if "am" in time_str.lower():
2831
- # time_str_full += " AM"
2832
- # elif "pm" in time_str.lower():
2833
- # time_str_full += " PM"
2834
- # return time_str_full
2835
-
2836
- # if "12" in fmt:
2837
- # fmt = "%I:%M:%S %p"
2838
- # elif "24" in fmt:
2839
- # fmt = "%H:%M:%S"
2840
-
2841
- # try:
2842
- # # Try to parse the time string assuming it could be in 24-hour or 12-hour format
2843
- # time_obj = datetime.strptime(time_len_corr(time_str), "%H:%M:%S")
2844
- # except ValueError:
2845
- # try:
2846
- # time_obj = datetime.strptime(time_len_corr(time_str), "%I:%M:%S %p")
2847
- # except ValueError as e:
2848
- # raise ValueError(f"Unable to parse time string: {time_str}. Error: {e}")
2849
-
2850
- # # Format the time object to the desired output format
2851
- # formatted_time = time_obj.strftime(fmt)
2852
- # return formatted_time
2853
-
2854
-
2855
- # # # Example usage:
2856
- # # time_str1 = "14:30:45"
2857
- # # time_str2 = "02:30:45 PM"
2858
-
2859
- # # formatted_time1 = str2time(time_str1, fmt='12') # Convert to 12-hour format
2860
- # # formatted_time2 = str2time(time_str2, fmt='24') # Convert to 24-hour format
2861
-
2862
- # # print(formatted_time1) # Output: 02:30:45 PM
2863
- # # print(formatted_time2) # Output: 14:30:45
2864
3396
  def str2time(
2865
3397
  time_str: str,
2866
3398
  fmt: str = "24",
@@ -2964,57 +3496,6 @@ def str2time(
2964
3496
  raise ValueError(f"Unable to parse time string: '{time_str}'. Error: {e}")
2965
3497
  return default
2966
3498
 
2967
-
2968
- # def str2date(date_str, original_fmt=None, fmt="%Y-%m-%d"):
2969
- # """
2970
- # Convert a date string to the desired format and extract components if needed.
2971
- # Usage:
2972
- # str2date(x, fmt="%d.%m.%y",original_fmt="%d.%m.%y")
2973
- # Parameters:
2974
- # - date_str (str): The input date string.
2975
- # - original_fmt (str, optional): The original format of the date string. If not provided, it will be auto-detected.
2976
- # - fmt (str): The desired format for the output date string. Defaults to '%Y-%m-%d'.
2977
-
2978
- # Returns:
2979
- # - dict: A dictionary containing the converted date string and its components (year, month, day).
2980
-
2981
- # Raises:
2982
- # - ValueError: If the date cannot be parsed.
2983
- # """
2984
- # from dateutil import parser
2985
- # try:
2986
- # if not isinstance(date_str,str):
2987
- # date_str=str(date_str)
2988
- # # Parse the date using the provided original format or auto-detect
2989
- # if original_fmt:
2990
- # try:
2991
- # date_obj = datetime.strptime(date_str, original_fmt)
2992
- # except Exception as e:
2993
- # print(e)
2994
- # date_obj=None
2995
- # else:
2996
- # try:
2997
- # date_obj = parser.parse(date_str)
2998
- # except Exception as e:
2999
- # print(e)
3000
- # date_obj=None
3001
- # # Return formatted string if `fmt` is specified, otherwise return the datetime object
3002
- # if date_obj is not None:
3003
- # if fmt:
3004
- # date_obj=date_obj.strftime(fmt)
3005
- # else:
3006
- # date_obj=date_str
3007
- # return date_obj
3008
-
3009
- # except (ValueError, TypeError) as e:
3010
- # raise ValueError(f"Unable to process date string: '{date_str}'. Error: {e}")
3011
-
3012
-
3013
- # # str1=str2date(num2str(20240625),fmt="%a %d-%B-%Y")
3014
- # # print(str1)
3015
- # # str2=str2num(str2date(str1,fmt='%a %Y%m%d'))
3016
- # # print(str2)
3017
-
3018
3499
  def str2date(
3019
3500
  date_str: Union[str, int, float],
3020
3501
  fmt: Optional[str] = "%Y-%m-%d",
@@ -4054,8 +4535,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
4054
4535
 
4055
4536
 
4056
4537
  def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_results: bool = True, **kws):
4057
- """
4058
- # Determines the splitting strategy:
4538
+ """# Determines the splitting strategy:
4059
4539
  # - "space", "whitespace", "sp": split by whitespace (default)
4060
4540
  # - "word": split into words using NLTK's word_tokenize
4061
4541
  # - "sentence", "sent": split into sentences using NLTK's sent_tokenize
@@ -4172,13 +4652,6 @@ def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_re
4172
4652
 
4173
4653
  def split_by_regex_end(text, pattern):
4174
4654
  return re.split(f"(?={pattern})", text)
4175
-
4176
- # def split_by_sentence_endings(text):
4177
- # return re.split(r"(?<=[.!?])", text)
4178
- # def split_non_ascii(text):
4179
- # # return re.split(r"([^\x00-\x7F\w\s,.!?:\"'()\-]+)", text)
4180
- # # return re.split(r"[^\x00-\x7F]+", text)
4181
- # return re.split(r"([^\x00-\x7F]+)", text)
4182
4655
  def split_non_ascii(text, keep_delimiters=False):
4183
4656
  """
4184
4657
  Split text at non-ASCII characters.
@@ -4903,145 +5376,6 @@ def _backup_validations(sheet, verbose=False):
4903
5376
 
4904
5377
  return backup
4905
5378
 
4906
- # def _backup_validations(sheet):
4907
- # """
4908
- # Complete validation backup with XML-level cross-sheet detection
4909
- # """
4910
- # from openpyxl.utils import get_column_letter
4911
- # import re
4912
- # from openpyxl.worksheet.datavalidation import DataValidation
4913
- # from openpyxl.xml.functions import fromstring
4914
-
4915
- # backup = {
4916
- # "validations": [],
4917
- # "conditional_formatting": [],
4918
- # "merged_cells": [str(mr) for mr in sheet.merged_cells.ranges],
4919
- # "_metadata": {
4920
- # "validated_cells": set(),
4921
- # "validated_columns": set(),
4922
- # "validation_types": set(),
4923
- # "cross_sheet_validations": set()
4924
- # }
4925
- # }
4926
-
4927
- # # METHOD 1: Primary validation backup (standard method)
4928
- # for dv in sheet.data_validations:
4929
- # # ... (existing standard validation backup code) ...
4930
-
4931
- # # METHOD 2: XML-based cross-sheet validation detection
4932
- # print("Performing deep XML scan for cross-sheet validations...")
4933
-
4934
- # # Access the worksheet XML directly
4935
- # xml_source = sheet._worksheet.xml
4936
- # if not xml_source:
4937
- # print("Warning: Could not access worksheet XML source")
4938
- # return backup
4939
-
4940
- # try:
4941
- # # Parse the XML
4942
- # root = fromstring(xml_source)
4943
- # ns = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
4944
-
4945
- # # Find all dataValidation elements
4946
- # for dv_xml in root.findall('.//ns:dataValidation', ns):
4947
- # try:
4948
- # # Extract validation attributes
4949
- # dv_type = dv_xml.get('type', 'none')
4950
- # formula1 = dv_xml.find('.//ns:formula1', ns)
4951
- # formula_text = formula1.text if formula1 is not None else None
4952
-
4953
- # # Skip if not a list type or no formula
4954
- # if dv_type != 'list' or not formula_text:
4955
- # continue
4956
-
4957
- # # Clean the formula
4958
- # clean_formula = formula_text.strip('"\'')
4959
-
4960
- # # Check for cross-sheet patterns
4961
- # cross_sheet_patterns = [
4962
- # (r'^[\w\s]+!\$?[A-Za-z]+\$?\d+(?::\$?[A-Za-z]+\$?\d+)?$', "direct sheet reference"),
4963
- # (r'INDIRECT\(["\'][\w\s]+![A-Za-z]+\d+(?::[A-Za-z]+\d+)?["\']\)', "INDIRECT sheet reference"),
4964
- # (r'^[^\s!]+$', "potential named range"),
4965
- # ]
4966
-
4967
- # # Determine if this is a cross-sheet reference
4968
- # is_cross_sheet = False
4969
- # detection_method = ""
4970
-
4971
- # for pattern, description in cross_sheet_patterns:
4972
- # if re.match(pattern, clean_formula, re.IGNORECASE):
4973
- # is_cross_sheet = True
4974
- # detection_method = description
4975
- # break
4976
-
4977
- # if not is_cross_sheet:
4978
- # continue
4979
-
4980
- # # Process the ranges
4981
- # ranges = []
4982
- # sqref = dv_xml.get('sqref', '')
4983
- # for range_str in sqref.split():
4984
- # try:
4985
- # # Convert range to coordinates
4986
- # if ':' in range_str:
4987
- # start, end = range_str.split(':')
4988
- # col_start = int(''.join(filter(str.isdigit, start)))
4989
- # col_end = int(''.join(filter(str.isdigit, end)))
4990
- # row_start = int(''.join(filter(str.isalpha, start)))
4991
- # row_end = int(''.join(filter(str.isalpha, end)))
4992
- # ranges.append({
4993
- # 'range': range_str,
4994
- # 'cells': [f"{get_column_letter(col)}{row}"
4995
- # for col in range(col_start, col_end+1)
4996
- # for row in range(row_start, row_end+1)]
4997
- # })
4998
- # else:
4999
- # col = int(''.join(filter(str.isdigit, range_str)))
5000
- # row = int(''.join(filter(str.isalpha, range_str)))
5001
- # ranges.append({
5002
- # 'range': range_str,
5003
- # 'cells': [f"{get_column_letter(col)}{row}"]
5004
- # })
5005
- # except Exception as e:
5006
- # print(f"Error parsing range {range_str}: {e}")
5007
-
5008
- # # Create validation record
5009
- # validation_data = {
5010
- # 'type': 'list',
5011
- # 'formula1': formula_text,
5012
- # 'formula2': None,
5013
- # 'allow_blank': dv_xml.get('allowBlank', '1') == '1',
5014
- # 'showDropDown': dv_xml.get('showDropDown', '1') == '1',
5015
- # 'showInputMessage': dv_xml.get('showInputMessage', '1') == '1',
5016
- # 'showErrorMessage': dv_xml.get('showErrorMessage', '0') == '1',
5017
- # 'errorTitle': dv_xml.get('errorTitle', ''),
5018
- # 'error': dv_xml.get('error', ''),
5019
- # 'promptTitle': dv_xml.get('promptTitle', ''),
5020
- # 'prompt': dv_xml.get('prompt', ''),
5021
- # 'ranges': ranges,
5022
- # '_source': 'xml_validation',
5023
- # '_detection_method': detection_method,
5024
- # '_is_cross_sheet': True,
5025
- # '_formula_clean': clean_formula
5026
- # }
5027
-
5028
- # # Add to backup
5029
- # backup['validations'].append(validation_data)
5030
- # for rng in ranges:
5031
- # for cell_ref in rng['cells']:
5032
- # backup['_metadata']['validated_cells'].add(cell_ref)
5033
- # backup['_metadata']['validated_columns'].add(''.join(filter(str.isalpha, cell_ref)))
5034
- # backup['_metadata']['validation_types'].add('list')
5035
- # backup['_metadata']['cross_sheet_validations'].add(clean_formula.split('!')[0])
5036
-
5037
- # except Exception as e:
5038
- # print(f"Error processing XML validation: {e}")
5039
-
5040
- # except Exception as e:
5041
- # print(f"Error parsing worksheet XML: {e}")
5042
-
5043
- # return backup
5044
-
5045
5379
  def _restore_validations(sheet, backup,verbose=False):
5046
5380
  """
5047
5381
  Restore data validation and conditional formatting rules to the worksheet
@@ -5247,11 +5581,6 @@ def fload(fpath, kind=None, **kwargs):
5247
5581
  with open(fpath, "r") as file:
5248
5582
  content = file.read()
5249
5583
  return content
5250
-
5251
- # def load_html(fpath):
5252
- # with open(fpath, "r") as file:
5253
- # content = file.read()
5254
- # return content
5255
5584
  def load_html(fpath, **kwargs):
5256
5585
  return pd.read_html(fpath, **kwargs)
5257
5586
 
@@ -5570,7 +5899,7 @@ def fload(fpath, kind=None, **kwargs):
5570
5899
  if output in ["dataframe", "df"]:
5571
5900
  if verbose:
5572
5901
  print("loading data as a DataFrame")
5573
- if not password:
5902
+ if not bool(password):
5574
5903
  if verbose:
5575
5904
  print("Reading Excel without password protection...")
5576
5905
  df = pd.read_excel(fpath, engine=engine, sheet_name=sheet_name, **kwargs)
@@ -6518,27 +6847,6 @@ def fsave(
6518
6847
  print(
6519
6848
  f"Error:\n{kind} is not in the supported list ['docx', 'txt', 'md', 'html', 'pdf', 'csv', 'xlsx', 'json', 'xml', 'yaml']"
6520
6849
  )
6521
-
6522
-
6523
- # # Example usage
6524
- # text_content = ["Hello, this is a sample text file.", "This is the second paragraph."]
6525
- # tabular_content = {"Name": ["Alice", "Bob"], "Age": [24, 30]}
6526
- # json_content = {"name": "Alice", "age": 24}
6527
- # yaml_content = {"Name": "Alice", "Age": 24}
6528
- # xml_content = {"Name": "Alice", "Age": 24}
6529
- # dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
6530
- # fsave(dir_save + "sample.txt", text_content)
6531
- # fsave(dir_save + "sample.md", text_content)
6532
- # fsave(dir_save + "sample.html", text_content)
6533
- # fsave(dir_save + "sample.pdf", text_content)
6534
- # fsave(dir_save + "sample.docx", text_content)
6535
- # fsave(dir_save + "sample.csv", tabular_content, index=False)
6536
- # fsave(dir_save + "sample.xlsx", tabular_content, sheet_name="Sheet1", index=False)
6537
- # fsave(dir_save + "sample.json", json_content, indent=4)
6538
- # fsave(dir_save + "sample.yaml", yaml_content)
6539
- # fsave(dir_save + "sample.xml", xml_content)
6540
-
6541
-
6542
6850
  def addpath(fpath):
6543
6851
  sys.path.insert(0, dir)
6544
6852
 
@@ -7118,7 +7426,7 @@ def listdir(
7118
7426
  hidden=False, # Include hidden files/folders
7119
7427
  orient="list",
7120
7428
  output="df", # "df", 'list','dict','records','index','series'
7121
- verbose=True,
7429
+ verbose=False,
7122
7430
  ):
7123
7431
  def is_hidden(filepath):
7124
7432
  """Check if a file or folder is hidden."""
@@ -7348,7 +7656,7 @@ def listdir(
7348
7656
  if "se" in orient.lower(): # records
7349
7657
  return Box(f.to_dict(orient="series"))
7350
7658
 
7351
-
7659
+
7352
7660
  def listpkg(where="env", verbose=False):
7353
7661
  """list all pacakages"""
7354
7662
 
@@ -7829,87 +8137,7 @@ def split_path(fpath):
7829
8137
  dir_par = f_slash.join(fpath.split(f_slash)[:-1])
7830
8138
  dir_ch = "".join(fpath.split(f_slash)[-1:])
7831
8139
  return dir_par, dir_ch
7832
-
7833
-
7834
- def figsave(*args, dpi=300, **kwargs):
7835
- import matplotlib.pyplot as plt
7836
- from PIL import Image
7837
- bbox_inches = kwargs.pop("bbox_inches", "tight")
7838
- pad_inches = kwargs.pop("pad_inches", 0)
7839
- facecolor = kwargs.pop("facecolor", "white")
7840
- edgecolor = kwargs.pop("edgecolor", "auto")
7841
-
7842
- dir_save = None
7843
- fname = None
7844
- img = None
7845
-
7846
- for arg in args:
7847
- if isinstance(arg, str):
7848
- path = Path(arg)
7849
- if path.suffix: # Has file extension
7850
- fname = path.name
7851
- dir_save = path.parent
7852
- else:
7853
- dir_save = path
7854
- elif isinstance(arg, (Image.Image, np.ndarray)):
7855
- img = arg # Store PIL image or numpy array
7856
-
7857
- # Set default save directory
7858
- dir_save = Path(dir_save) if dir_save else Path(".")
7859
- dir_save.mkdir(parents=True, exist_ok=True)
7860
-
7861
- # Handle filename and extension
7862
- if fname is None:
7863
- fname = "figure"
7864
- fname = dir_save / fname
7865
- if fname.suffix == "":
7866
- fname = fname.with_suffix(".pdf") # Default format
7867
-
7868
- ftype = fname.suffix.lstrip(".").lower()
7869
-
7870
- # Save figure based on file type
7871
- if ftype == "eps":
7872
- plt.savefig(fname, format="eps", bbox_inches=bbox_inches)
7873
- plt.savefig(fname.with_suffix(".pdf"), format="pdf", dpi=dpi,
7874
- pad_inches=pad_inches, bbox_inches=bbox_inches,
7875
- facecolor=facecolor, edgecolor=edgecolor)
7876
- elif ftype == "pdf":
7877
- plt.savefig(fname, format="pdf", dpi=dpi, pad_inches=pad_inches,
7878
- bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
7879
- elif ftype in ["jpg", "jpeg", "png", "tiff", "tif"]:
7880
- if img is not None: # If an image is provided
7881
- if isinstance(img, Image.Image):
7882
- img = img.convert("RGB") if img.mode == "RGBA" else img
7883
- img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
7884
- elif isinstance(img, np.ndarray):
7885
- import cv2
7886
- if img.ndim == 2:
7887
- Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
7888
- elif img.ndim == 3:
7889
- if img.shape[2] == 3:
7890
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
7891
- elif img.shape[2] == 4:
7892
- img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
7893
- Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
7894
- else:
7895
- raise ValueError("Unexpected image dimensions.")
7896
- else:
7897
- plt.savefig(fname, format=ftype, dpi=dpi, pad_inches=pad_inches,
7898
- bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
7899
- elif ftype == "ico":
7900
- if img is None:
7901
- plt.savefig(fname, dpi=dpi, pad_inches=pad_inches,
7902
- bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
7903
- img = Image.open(fname)
7904
- img = img.convert("RGBA")
7905
- icon_sizes = [(32, 32), (64, 64), (128, 128), (256, 256)]
7906
- img.save(fname, format="ICO", sizes=icon_sizes)
7907
- print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
7908
- else:
7909
- raise ValueError(f"Unsupported file format: {ftype}")
7910
-
7911
- print(f"\nSaved @ {fname} (dpi={dpi})")
7912
-
8140
+
7913
8141
  def figsave(*args, dpi=300, **kwargs):
7914
8142
  """
7915
8143
  Save a Matplotlib figure or image file in various formats.
@@ -8038,7 +8266,7 @@ def figsave(*args, dpi=300, **kwargs):
8038
8266
  img = img.convert("RGBA")
8039
8267
  img.save(fname, format="ICO", sizes=icon_sizes)
8040
8268
  print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
8041
- print(f"\n✅ Saved @: dpi={dpi}\n{fname}")
8269
+ print(f"\nSaved @: dpi={dpi}\n{fname}")
8042
8270
 
8043
8271
 
8044
8272
  def is_str_color(s):
@@ -8806,7 +9034,8 @@ def detect_angle(image, by="median", template=None):
8806
9034
 
8807
9035
  # Use Hough transform to detect lines
8808
9036
  lines = transform.probabilistic_hough_line(edges)
8809
-
9037
+ if isinstance(by, bool):
9038
+ by="mean" if by else 0
8810
9039
  if not lines and any(["me" in by, "pca" in by]):
8811
9040
  print("No lines detected. Adjust the edge detection parameters.")
8812
9041
  return 0
@@ -9180,7 +9409,7 @@ def imgsets(
9180
9409
  elif "cro" in k.lower() or "cut" in k.lower():
9181
9410
  img_update = img_update.crop(value)
9182
9411
  elif "rota" in k.lower():
9183
- if isinstance(value, str):
9412
+ if isinstance(value, (str,bool)):
9184
9413
  value = detect_angle(img_update, by=value)
9185
9414
  print(f"rotated by {value}°")
9186
9415
  img_update = img_update.rotate(value)
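The two changes above let a boolean flow through to the angle detection: passing True now maps to by="mean" in detect_angle, while strings keep their old meaning. A hypothetical call, assuming imgsets takes the image first and treats keyword names like "rotate" as the operations iterated over above:

    from PIL import Image

    img = Image.open("scan.png")           # hypothetical input file
    fixed = imgsets(img, rotate=True)      # angle auto-detected (by="mean")
    fixed = imgsets(img, rotate="median")  # the existing string-based option still works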
@@ -9371,12 +9600,6 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
9371
9600
  else:
9372
9601
  figsave(dirname(dir_save), fname)
9373
9602
 
9374
-
9375
- # usage:
9376
- # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
9377
- # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
9378
-
9379
-
9380
9603
  # search and fine the director of the libary, which installed at local
9381
9604
  def dir_lib(lib_oi):
9382
9605
  """
@@ -9524,35 +9747,276 @@ def finfo(fpath, output='json', verbose=False):
9524
9747
  extra_info=extra_info,
9525
9748
  )
9526
9749
 
9527
-
9528
-
9529
- # ! format excel file
9530
-
9531
-
9532
- def hex2argb(color):
9750
+ def color2rgb(
9751
+ color_input: str | tuple | list | None,
9752
+ alpha: float | None = None
9753
+ ) -> tuple | None:
9533
9754
  """
9534
- Convert a color name or hex code to aARGB format required by openpyxl.
9535
-
9536
- :param color: A color in the format: 'blue', '#RRGGBB', 'RRGGBB', 'aARRGGBB'
9537
- :return: A hex color code in the format aARRGGBB.
9538
-
9539
- Example:
9540
- print(hex2argb("blue")) # Output: FF0000FF
9541
- print(hex2argb("FFFF00")) # Output: FFFFFF00
9542
- print(hex2argb("#DF4245")) # Output: FFDf4245
9543
- print(hex2argb("FF00FF00")) # Output: FF00FF00 (already in aARGB format)
9755
+ Ultimate color conversion utility with support for multiple formats and transparency.
9756
+
9757
+ Parameters:
9758
+ -----------
9759
+ color_input : str | tuple | list | None
9760
+ Supported formats:
9761
+ - Hex strings ("#RRGGBB", "#RGB")
9762
+ - Named colors ("red", "blue")
9763
+ - RGB tuples ((0.2, 0.4, 0.6))
9764
+ - RGBA tuples ((0.2, 0.4, 0.6, 0.8))
9765
+ - HTML/CSS colors ("cornflowerblue")
9766
+ - CSS formats:
9767
+ - rgb(100,200,50)
9768
+ - rgba(100,200,50,0.8)
9769
+ - hsl(120,60%,70%)
9770
+ - hsla(120,60%,70%,0.8)
9771
+ alpha : float | None, optional
9772
+ Opacity value (0.0-1.0). If provided, adds/overrides alpha channel.
9773
+
9774
+ Returns:
9775
+ --------
9776
+ tuple | None
9777
+ (R, G, B) or (R, G, B, A) tuple in 0-1 range, or None if invalid
9544
9778
  """
9545
- import matplotlib.colors as mcolors
9779
+ from matplotlib import colors as mcolors
9546
9780
  import re
9547
- color = color.lower().replace(" ", "") # 'light blue'
9548
- # Convert color name (e.g., "blue") to hex
9781
+
9782
+ if color_input is None:
9783
+ return None
9784
+
9785
+ # Case 1: Already in RGB/RGBA tuple format
9786
+ if isinstance(color_input, (tuple, list)):
9787
+ if 3 <= len(color_input) <= 4:
9788
+ if all(0 <= x <= 1 for x in color_input):
9789
+ if alpha is not None and len(color_input) == 3:
9790
+ return (*color_input, alpha)
9791
+ return tuple(color_input)
9792
+
9793
+ # Case 2: String input
9794
+ if isinstance(color_input, str):
9795
+ # Remove whitespace and make lowercase
9796
+ color_str = color_input.strip().lower()
9797
+
9798
+ # Handle CSS rgb/rgba format
9799
+ if color_str.startswith(('rgb(', 'rgba(')):
9800
+ try:
9801
+ nums = list(map(float, re.findall(r"[\d.]+", color_str)))
9802
+ if 3 <= len(nums) <= 4:
9803
+ rgb = tuple(x/255 if i < 3 else x for i, x in enumerate(nums))
9804
+ if alpha is not None:
9805
+ return (*rgb[:3], alpha)
9806
+ return rgb[:4] if len(rgb) == 4 else rgb[:3]
9807
+ except:
9808
+ pass
9809
+
9810
+ # Handle CSS hsl/hsla format
9811
+ elif color_str.startswith(('hsl(', 'hsla(')):
9812
+ try:
9813
+ nums = list(map(float, re.findall(r"[\d.]+", color_str)))
9814
+ if 3 <= len(nums) <= 4:
9815
+ h, s, l = nums[0]/360, nums[1]/100, nums[2]/100
9816
+ rgb = mcolors.hsv_to_rgb((h, s, l))
9817
+ if len(nums) == 4:
9818
+ rgb += (nums[3],)
9819
+ if alpha is not None:
9820
+ return (*rgb[:3], alpha)
9821
+ return rgb[:4] if len(rgb) == 4 else rgb[:3]
9822
+ except:
9823
+ pass
9824
+
9825
+ # Standard hex/named color processing
9826
+ try:
9827
+ rgb = mcolors.to_rgba(color_str)
9828
+ if alpha is not None:
9829
+ return (*rgb[:3], alpha)
9830
+ return rgb if len(rgb) == 4 and rgb[3] != 1 else rgb[:3]
9831
+ except ValueError:
9832
+ pass
9833
+
9834
+ # Fallback for invalid colors
9835
+ print(f"Warning: Invalid color format '{color_input}'")
9836
+ return None
9837
+
9838
+ def color2hex(
9839
+ color_input: str | tuple | list | dict | int | None,
9840
+ keep_alpha: bool = False,
9841
+ force_long: bool = False,
9842
+ uppercase: bool = False,
9843
+ prefix: str = "#",
9844
+ allow_short: bool = True
9845
+ ) -> str | None:
9846
+ """
9847
+ Ultimate color to hex converter with comprehensive format support.
9848
+
9849
+ Parameters:
9850
+ -----------
9851
+ color_input : str | tuple | list | dict | int | None
9852
+ Input color in any of these formats:
9853
+ - Hex strings ("#RRGGBB", "#RGB", "RRGGBB", "RGB")
9854
+ - Named colors ("red", "blue", "transparent")
9855
+ - RGB/RGBA tuples ((0.2, 0.4, 0.6), (255, 0, 0), (100, 100, 100, 0.5))
9856
+ - CSS formats:
9857
+ - rgb(100,200,50)
9858
+ - rgba(100,200,50,0.8)
9859
+ - hsl(120,60%,70%)
9860
+ - hsla(120,60%,70%,0.8)
9861
+ - Integer RGB (0xFF0000 for red)
9862
+ - Dictionary {"r": 255, "g": 0, "b": 0} or {"h": 0, "s": 100, "l": 50}
9863
+ keep_alpha : bool, optional
9864
+ Whether to include alpha channel in hex format (#RRGGBBAA)
9865
+ force_long : bool, optional
9866
+ Force 6/8-digit hex even when 3/4-digit would be possible
9867
+ uppercase : bool, optional
9868
+ Use uppercase hex characters (False for lowercase)
9869
+ prefix : str, optional
9870
+ Prefix for hex string ("#" for CSS, "0x" for programming, "" for raw)
9871
+ allow_short : bool, optional
9872
+ Allow shortened 3/4-digit hex when possible
9873
+
9874
+ Returns:
9875
+ --------
9876
+ str | None
9877
+ Hex color string or None if invalid
9878
+
9879
+ Examples:
9880
+ ---------
9881
+ >>> color2hex((0.5, 0.2, 0.8)) → "#7f33cc"
9882
+ >>> color2hex("rgb(127, 51, 204)") → "#7f33cc"
9883
+ >>> color2hex((0.2, 0.4, 0.6, 0.8), True) → "#336699cc"
9884
+ >>> color2hex(0xFF0000, uppercase=True) → "#FF0000"
9885
+ >>> color2hex({"r": 255, "g": 165, "b": 0}, prefix="") → "ffa500"
9886
+ >>> color2hex("hsl(120, 100%, 50%)") → "#00ff00"
9887
+ """
9888
+ from matplotlib import colors as mcolors
9889
+ import re
9890
+
9891
+ def to_rgba(color) -> tuple | None:
9892
+ """Internal conversion to RGBA tuple"""
9893
+ # Handle None
9894
+ if color is None:
9895
+ return None
9896
+
9897
+ # Handle integer RGB
9898
+ if isinstance(color, int):
9899
+ if color < 0:
9900
+ return None
9901
+ return (
9902
+ (color >> 16) & 0xFF,
9903
+ (color >> 8) & 0xFF,
9904
+ color & 0xFF,
9905
+ 255
9906
+ )
9907
+
9908
+ # Handle dictionary formats
9909
+ if isinstance(color, dict):
9910
+ keys = set(color.keys())
9911
+ if {'r','g','b'}.issubset(keys):
9912
+ return (
9913
+ color['r'] / 255 if color['r'] > 1 else color['r'],
9914
+ color['g'] / 255 if color['g'] > 1 else color['g'],
9915
+ color['b'] / 255 if color['b'] > 1 else color['b'],
9916
+ color.get('a', 1.0)
9917
+ )
9918
+ elif {'h','s','l'}.issubset(keys):
9919
+ return mcolors.hsv_to_rgb((
9920
+ color['h'] / 360,
9921
+ color['s'] / 100,
9922
+ color['l'] / 100
9923
+ )) + (color.get('a', 1.0),)
9924
+ return None
9925
+
9926
+ # Handle string formats
9927
+ if isinstance(color, str):
9928
+ color = color.strip().lower()
9929
+
9930
+ # Handle hex without prefix
9931
+ if re.match(r'^[0-9a-f]{3,8}$', color):
9932
+ return mcolors.to_rgba(f"#{color}")
9933
+
9934
+ # Handle CSS functions
9935
+ if color.startswith(('rgb(', 'rgba(', 'hsl(', 'hsla(')):
9936
+ try:
9937
+ return mcolors.to_rgba(color)
9938
+ except ValueError:
9939
+ return None
9940
+
9941
+ # Handle named colors (including 'transparent')
9942
+ try:
9943
+ return mcolors.to_rgba(color)
9944
+ except ValueError:
9945
+ return None
9946
+
9947
+ # Handle tuple/list formats
9948
+ if isinstance(color, (tuple, list)):
9949
+ if len(color) in (3, 4):
9950
+ # Normalize values
9951
+ normalized = []
9952
+ for i, v in enumerate(color):
9953
+ if i < 3: # RGB channels
9954
+ if isinstance(v, int):
9955
+ normalized.append(v / 255 if v > 1 else v)
9956
+ else:
9957
+ normalized.append(float(v))
9958
+ else: # Alpha channel
9959
+ normalized.append(float(v))
9960
+ return tuple(normalized)
9961
+
9962
+ return None
9963
+
9964
+ # Convert input to RGBA
9965
+ rgba = to_rgba(color_input)
9966
+ if rgba is None:
9967
+ return None
9968
+
9969
+ # Extract components
9970
+ components = []
9971
+ for i, c in enumerate(rgba):
9972
+ if i == 3 and not keep_alpha:
9973
+ break
9974
+ components.append(round(c * 255 if c <= 1 else c))
9975
+
9976
+ # Determine if we can use short format
9977
+ use_short = (allow_short and
9978
+ not force_long and
9979
+ len(components) in (3, 4) and
9980
+ all((x % 17 == 0) for x in components[:3]))
9981
+
9982
+ # Format the hex string
9983
+ if use_short:
9984
+ short_components = [x//17 for x in components[:3]] + components[3:]
9985
+ hex_str = "".join(f"{x:1x}" for x in short_components)
9986
+ else:
9987
+ hex_str = "".join(f"{x:02x}" for x in components)
9988
+
9989
+ # Apply case and prefix
9990
+ if uppercase:
9991
+ hex_str = hex_str.upper()
9992
+
9993
+ return f"{prefix}{hex_str}"
9994
+ # ! format excel file
9995
+
9996
+ def hex2argb(color):
9997
+ """
9998
+ Convert a color name or hex code to aARGB format required by openpyxl.
9999
+
10000
+ :param color: A color in the format: 'blue', '#RRGGBB', 'RRGGBB', 'aARRGGBB'
10001
+ :return: A hex color code in the format aARRGGBB.
10002
+
10003
+ Example:
10004
+ print(hex2argb("blue")) # Output: FF0000FF
10005
+ print(hex2argb("FFFF00")) # Output: FFFFFF00
10006
+ print(hex2argb("#DF4245")) # Output: FFDf4245
10007
+ print(hex2argb("FF00FF00")) # Output: FF00FF00 (already in aARGB format)
10008
+ """
10009
+ import matplotlib.colors as mcolors
10010
+ import re
10011
+ color = color.lower().replace(" ", "") # 'light blue'
10012
+ # Convert color name (e.g., "blue") to hex
9549
10013
  if color.lower() in mcolors.CSS4_COLORS:
9550
10014
  color = mcolors.CSS4_COLORS[color.lower()].lstrip("#")
9551
10015
  color = color.lstrip("#").upper()# Remove '#' if present
9552
10016
 
9553
10017
  # Validate hex format
9554
10018
  if not re.fullmatch(r"[A-F0-9]{6,8}", color):
9555
- raise ValueError(f"格式错误❌: {color}, 应该使用 RRGGBB, #RRGGBB, or aARRGGBB format.")
10019
+ raise ValueError(f"格式错误: {color}, 应该使用 RRGGBB, #RRGGBB, or aARRGGBB format.")
9556
10020
 
9557
10021
  # If already in aARRGGBB format (8 chars), return as is
9558
10022
  if len(color) == 8:
@@ -9752,6 +10216,624 @@ def copy_format(
9752
10216
  wb_source.close()
9753
10217
  if "wb_target" in locals():
9754
10218
  wb_target.close()
10219
+ # ! =========(below) interact with worrkbook and DataFrame===========
10220
+ import pandas as pd
10221
+ from openpyxl import load_workbook
10222
+ from openpyxl.workbook.workbook import Workbook
10223
+ from openpyxl.utils import get_column_letter
10224
+
10225
+ class DataFrameAlignExcel:
10226
+ """
10227
+ A tool for updating Excel files with data from DataFrames, using configurable matching strategies.
10228
+
10229
+ Features:
10230
+ - Accepts either file path or open Workbook object
10231
+ - Multiple matching strategies (exact, contains, starts_with, ends_with, regex)
10232
+ - Multiple value update strategies (overwrite, add, subtract, multiply, divide, append)
10233
+ - Support for multiple worksheets
10234
+ - Automatic column creation
10235
+ - Value normalization options
10236
+ - Detailed logging and dry-run mode
10237
+ - Progress reporting
10238
+ - Data validation
10239
+ - Backup functionality
10240
+ """
10241
+
10242
+ def __init__(self, fpath: Union[str, Workbook], df: pd.DataFrame = None):
10243
+ """
10244
+ Initialize the DataFrameAlignExcel.
10245
+
10246
+ Args:
10247
+ fpath: Path to the Excel file (str) or open Workbook object
10248
+ df: Optional DataFrame to use for updates
10249
+ """
10250
+ self.fpath_or_wb = fpath
10251
+ self.df = df
10252
+ self.wb = None
10253
+ self.backup_path = None
10254
+ self.log = []
10255
+ self.owns_workbook = (
10256
+ False # Track whether we created the workbook or it was passed in
10257
+ )
10258
+
10259
+ def load_workbook(self) -> None:
10260
+ """Load the Excel workbook if a path was provided."""
10261
+ if isinstance(self.fpath_or_wb, str):
10262
+ if not os.path.exists(self.fpath_or_wb):
10263
+ raise FileNotFoundError(f"Excel file not found: {self.fpath_or_wb}")
10264
+ self.wb = load_workbook(self.fpath_or_wb)
10265
+ self.owns_workbook = True
10266
+ elif isinstance(self.fpath_or_wb, Workbook):
10267
+ self.wb = self.fpath_or_wb
10268
+ self.owns_workbook = False
10269
+ else:
10270
+ raise TypeError(
10271
+ "fpath must be either a string path or an openpyxl Workbook object"
10272
+ )
10273
+
10274
+ def create_make_backup(self) -> None:
10275
+ """Create a make_backup of the original Excel file (only if we loaded from a file)."""
10276
+ if not isinstance(self.fpath_or_wb, str):
10277
+ self.log.append(
10278
+ "Skipping make_backup - working with Workbook object directly"
10279
+ )
10280
+ return
10281
+
10282
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
10283
+ self.backup_path = os.path.join(
10284
+ os.path.dirname(self.fpath_or_wb),
10285
+ f"backup_{timestamp}_{os.path.basename(self.fpath_or_wb)}",
10286
+ )
10287
+ self.wb.save(self.backup_path)
10288
+ self.log.append(f"Created make_backup at: {self.backup_path}")
10289
+
10290
+ def save_workbook(self, dir_save: str = None) -> None:
10291
+ """
10292
+ Save the workbook to a file.
10293
+
10294
+ Args:
10295
+ dir_save: Optional path to save to. If None and we loaded from a file,
10296
+ saves to the original path.
10297
+ """
10298
+ if self.wb is None:
10299
+ raise ValueError("No workbook loaded")
10300
+
10301
+ if dir_save is None:
10302
+ if isinstance(self.fpath_or_wb, str):
10303
+ dir_save = self.fpath_or_wb
10304
+ else:
10305
+ dir_save = datetime.now().strftime("%Y%m%d_%H%M%S") + ".xlsx"
10306
+ print(
10307
+ f"No save path provided and original input was a Workbook object, so used : {dir_save}"
10308
+ )
10309
+ self.wb.save(dir_save)
10310
+ self.log.append(f"Saved workbook to: {dir_save}")
10311
+
10312
+ def normalize_value(self, value, clean_keys: str = "strip_split_first") -> str:
10313
+ """
10314
+ Normalize a value based on the specified method.
10315
+
10316
+ Args:
10317
+ value: Value to normalize
10318
+ clean_keys: One of:
10319
+ - 'strip': just strip whitespace
10320
+ - 'strip_lower': strip and lowercase
10321
+ - 'strip_split_first': strip and take first part before comma
10322
+ - 'strip_split_last': strip and take last part after comma
10323
+ - None: no normalization
10324
+
10325
+ Returns:
10326
+ Normalized value
10327
+ """
10328
+ if value is None:
10329
+ return None
10330
+
10331
+ value = str(value)
10332
+
10333
+ if clean_keys is None:
10334
+ return value
10335
+
10336
+ if clean_keys == "strip":
10337
+ return value.strip()
10338
+ elif clean_keys == "strip_lower":
10339
+ return value.strip().lower()
10340
+ elif clean_keys == "strip_split_first":
10341
+ return value.strip().split(",")[0].strip()
10342
+ elif clean_keys == "strip_split_last":
10343
+ parts = value.strip().split(",")
10344
+ return parts[-1].strip() if len(parts) > 1 else value.strip()
10345
+ else:
10346
+ warnings.warn(f"Unknown clean_keys: {clean_keys}. Using 'strip'.")
10347
+ return value.strip()
10348
+
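+ # A minimal sketch of the clean_keys modes (example values are illustrative):
+ # self.normalize_value("AB-12, batch 3", "strip_split_first") -> "AB-12"
+ # self.normalize_value("AB-12, batch 3", "strip_split_last")  -> "batch 3"
+ # self.normalize_value(" AB-12 ", "strip_lower")              -> "ab-12"
+ # self.normalize_value(" AB-12 ", "strip")                    -> "AB-12"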
10349
+ def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
10350
+ """
10351
+ Efficiently find the column index (1-based) for a given column name,
10352
+ considering only non-empty cells and limiting search range.
10353
+
10354
+ Args:
10355
+ ws: Worksheet object
10356
+ header_row: Row number containing headers (1-based)
10357
+ column_name: Column name to find
10358
+ max_search_columns: Max number of columns to search (to prevent infinite loops)
10359
+
10360
+ Returns:
10361
+ Column index (1-based), or -1 if not found
10362
+ """
10363
+ row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
10364
+ for row in row_iter:
10365
+ for cell in row:
10366
+ if cell.value and str(cell.value).strip().lower() == column_name.lower():
10367
+ return cell.column
10368
+ break # Only process the header row
10369
+ return -1
10370
+ # def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
10371
+ # """
10372
+ # Find the column index (1-based) for a given column name.
10373
+ # If not found, return the last non-empty header column index.
10374
+
10375
+ # Args:
10376
+ # ws: Worksheet object
10377
+ # header_row: Row number containing headers (1-based)
10378
+ # column_name: Column name to find
10379
+ # max_search_columns: Max number of columns to search
10380
+
10381
+ # Returns:
10382
+ # Column index (1-based)
10383
+ # """
10384
+ # row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
10385
+ # last_non_empty_col = -1
10386
+
10387
+ # for row in row_iter:
10388
+ # for cell in row:
10389
+ # if cell.value and str(cell.value).strip():
10390
+ # last_non_empty_col = cell.column
10391
+ # if str(cell.value).strip().lower() == column_name.lower():
10392
+ # return cell.column
10393
+ # break # Only one row being read
10394
+
10395
+ # return last_non_empty_col
10396
+
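+ # Usage sketch (assumes the worksheet's header row contains a "SampleID" column):
+ # col_idx = self.find_column_index(ws, header_row=1, column_name="SampleID")
+ # if col_idx == -1:
+ #     ...  # column missing; the caller decides whether to create it or raise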
10397
+ def update_values(
10398
+ self,
10399
+ df: pd.DataFrame = None,
10400
+ sheet_name: Union[str, int, List[Union[str, int]]] = 0,
10401
+ header_row: int = 1,
10402
+ column_match: Union[Dict[str, str], List[Tuple[str, str]]] = None,
10403
+ column_mapping: Union[Dict[str, str], List[Tuple[str, str]]] = None,
10404
+ clean_keys: str = "strip_split_first",
10405
+ match_method: str = "exact",
10406
+ update_strategy: str = "overwrite",
10407
+ create_missing_columns: bool = True,
10408
+ preview_only: bool = False,
10409
+ show_progress: bool = True,
10410
+ skip_no_match: bool = True,
10411
+ make_backup: bool = True,
10412
+ dir_save: str = None,
10413
+ row_max=500
10414
+ ) -> Dict[str, int]:
10415
+ """
10416
+ Update Excel with values from DataFrame.
10417
+
10418
+ Args:
10419
+ df: DataFrame containing update data (if None, uses self.df)
10420
+ sheet_name: Sheet name(s) to update (str, int, or list of these)
10421
+ header_row: Row number containing headers (1-based)
10422
+ column_match: Dict or list of tuples mapping DataFrame columns to Excel columns for matching
10423
+ e.g., {'SampleID': 'ID'} or [('SampleID', 'ID'), ('Batch', 'Lot')]
10424
+ column_mapping: Dict or list of tuples mapping DataFrame columns to Excel columns to update
10425
+ e.g., {'Vials': 'Qty'} or [('Vials', 'Qty'), ('Status', 'State')]
10426
+ clean_keys: How to normalize matching values (see normalize_value())
10427
+ match_method: How to match values ('exact', 'contains', 'starts_with', 'ends_with', 'regex')
10428
+ update_strategy: How to update values ('overwrite', 'add', 'subtract', 'multiply', 'divide', 'append')
10429
+ create_missing_columns: Whether to create columns that don't exist
10430
+ preview_only: If True, don't actually update the Excel file
10431
+ show_progress: If True, print progress updates
10432
+ skip_no_match: If True, skip rows where match columns don't match
10433
+ make_backup: If True, create a backup before updating (only if working with a file path)
10434
+ dir_save: Optional path to save to. If None and we loaded from a file,
10435
+ saves to the original path. Ignored if preview_only=True.
10436
+
10437
+ Returns:
10438
+ Dictionary with update statistics
10439
+ """
10440
+ # Initialize
10441
+ start_time = datetime.now()
10442
+ if df is None:
10443
+ df = self.df
10444
+ if df is None:
10445
+ raise ValueError("No DataFrame provided")
10446
+
10447
+ if not isinstance(column_match, (dict, list)) or not column_match:
10448
+ raise ValueError(
10449
+ "column_match must be a non-empty dict or list of tuples"
10450
+ )
10451
+
10452
+ if not isinstance(column_mapping, (dict, list)) or not column_mapping:
10453
+ raise ValueError("column_mapping must be a non-empty dict or list of tuples")
10454
+
10455
+ # Convert match/update columns to consistent format
10456
+ if isinstance(column_match, dict):
10457
+ column_match = list(column_match.items())
10458
+ if isinstance(column_mapping, dict):
10459
+ column_mapping = list(column_mapping.items())
10460
+
10461
+ # Load workbook if not already loaded
10462
+ if self.wb is None:
10463
+ self.load_workbook()
10464
+
10465
+ # Create a backup (only if we're working with a file path)
10466
+ if not preview_only:
10467
+ self.create_make_backup()
10468
+
10469
+ # Prepare statistics
10470
+ stats = {
10471
+ "processed_sheet_names":[],
10472
+ "processed_sheets": 0,
10473
+ "total_updates": 0,
10474
+ "skipped_rows": 0,
10475
+ "created_columns": 0,
10476
+ }
10477
+
10478
+ # Normalize sheet names
10479
+ if not isinstance(sheet_name, list):
10480
+ sheet_name = [sheet_name]
10481
+
10482
+ # Process each sheet
10483
+ for sheet in sheet_name:
10484
+ try:
10485
+ if isinstance(sheet, str):
10486
+ ws = self.wb[sheet]
10487
+ elif isinstance(sheet, int):
10488
+ ws = self.wb.worksheets[sheet]
10489
+ else:
10490
+ ws = self.wb.active
10491
+
10492
+ current_sheet_name = ws.title
10493
+ self.log.append(f"\nProcessing sheet: {sheet_name}")
10494
+
10495
+ # Prepare matching data
10496
+ match_dict = {}
10497
+ for df_col, excel_col in column_match:
10498
+ if clean_keys:
10499
+ match_dict[excel_col] = dict(
10500
+ zip(
10501
+ df[df_col].apply(
10502
+ lambda x: self.normalize_value(x, clean_keys)
10503
+ ),
10504
+ df.index,
10505
+ )
10506
+ )
10507
+ else:
10508
+ match_dict[excel_col] = dict(zip(df[df_col], df.index))
10509
+
10510
+ # Find or create update columns
10511
+ update_col_indices = {}
10512
+ for df_col, excel_col in column_mapping:
10513
+ col_idx = self.find_column_index(ws, header_row, excel_col)
10514
+ if col_idx == -1:
10515
+ if create_missing_columns:
10516
+ # Find last column
10517
+ last_col = max(
10518
+ [cell.column for cell in ws[header_row] if cell.value is not None], default=0
10519
+ )
10520
+ col_idx = last_col + 1
10521
+ ws.cell(row=header_row, column=col_idx, value=excel_col)
10522
+ update_col_indices[excel_col] = col_idx
10523
+ stats["created_columns"] += 1
10524
+ self.log.append(
10525
+ f"Created new column '{excel_col}' at position {col_idx}"
10526
+ )
10527
+ else:
10528
+ raise ValueError(
10529
+ f"Column '{excel_col}' not found and create_missing_columns=False"
10530
+ )
10531
+ else:
10532
+ update_col_indices[excel_col] = col_idx
10533
+
10534
+ # Process rows
10535
+ for row in ws.iter_rows(min_row=header_row + 1):
10536
+ match_values = {}
10537
+ match_failed = False
10538
+
10539
+ for excel_col in match_dict.keys():
10540
+ col_idx = self.find_column_index(ws, header_row, excel_col)
10541
+ if col_idx == -1:
10542
+ if skip_no_match:
10543
+ match_failed = True
10544
+ break
10545
+ else:
10546
+ raise ValueError(
10547
+ f"Match column '{excel_col}' not found in sheet"
10548
+ )
10549
+
10550
+ cell_value = row[
10551
+ col_idx - 1
10552
+ ].value # -1 because iter_rows returns 0-based list
10553
+ if clean_keys:
10554
+ cell_value = self.normalize_value(cell_value, clean_keys)
10555
+
10556
+ match_values[excel_col] = cell_value
10557
+
10558
+ if match_failed:
10559
+ stats["skipped_rows"] += 1
10560
+ continue
10561
+
10562
+ # Find matching DataFrame row
10563
+ df_index = None
10564
+ for excel_col, value in match_values.items():
10565
+ if value in match_dict[excel_col]:
10566
+ if df_index is None:
10567
+ df_index = match_dict[excel_col][value]
10568
+ elif df_index != match_dict[excel_col][value]:
10569
+ # Multiple match columns point to different rows - skip
10570
+ df_index = None
10571
+ break
10572
+
10573
+ if df_index is None:
10574
+ stats["skipped_rows"] += 1
10575
+ continue
10576
+
10577
+ # Update cells
10578
+ for df_col, excel_col in column_mapping:
10579
+ col_idx = update_col_indices[excel_col]
10580
+ cell = row[
10581
+ col_idx - 1
10582
+ ] # -1 because iter_rows returns 0-based list
10583
+ new_value = df.at[df_index, df_col]
10584
+
10585
+ # Apply update strategy
10586
+ if update_strategy == "overwrite":
10587
+ cell.value = new_value
10588
+ elif update_strategy in (
10589
+ "add",
10590
+ "subtract",
10591
+ "multiply",
10592
+ "divide",
10593
+ ):
10594
+ try:
10595
+ old_value = (
10596
+ float(cell.value) if cell.value is not None else 0
10597
+ )
10598
+ new_value = (
10599
+ float(new_value) if new_value is not None else 0
10600
+ )
10601
+ if update_strategy == "add":
10602
+ cell.value = old_value + new_value
10603
+ elif update_strategy == "subtract":
10604
+ cell.value = old_value - new_value
10605
+ elif update_strategy == "multiply":
10606
+ cell.value = old_value * new_value
10607
+ elif update_strategy == "divide":
10608
+ cell.value = (
10609
+ old_value / new_value
10610
+ if new_value != 0
10611
+ else old_value
10612
+ )
10613
+ except (ValueError, TypeError):
10614
+ if skip_no_match:
10615
+ continue
10616
+ raise ValueError(
10617
+ f"Could not perform {update_strategy} operation on non-numeric values"
10618
+ )
10619
+ elif update_strategy == "append":
10620
+ separator = ", " if cell.value else ""
10621
+ cell.value = (
10622
+ f"{cell.value}{separator}{new_value}"
10623
+ if cell.value
10624
+ else new_value
10625
+ )
10626
+ else:
10627
+ raise ValueError(
10628
+ f"Unknown update_strategy: {update_strategy}"
10629
+ )
10630
+
10631
+ stats["total_updates"] += 1
10632
+
10633
+ stats["processed_sheets"] += 1
10634
+ stats["processed_sheet_names"].append(sheet_name)
10635
+ except Exception as e:
10636
+ self.log.append(f"Error processing sheet {sheet}: {str(e)}")
10637
+ if (
10638
+ not preview_only
10639
+ and self.backup_path
10640
+ and isinstance(self.fpath_or_wb, str)
10641
+ ):
10642
+ self.log.append("Restoring from make_backup due to error")
10643
+ self.wb = load_workbook(self.backup_path)
10644
+ raise
10645
+
10646
+ # Save changes if not dry run
10647
+ if not preview_only:
10648
+ self.save_workbook(dir_save)
10649
+ if not make_backup:
10650
+ if self.backup_path and os.path.exists(self.backup_path):
10651
+ os.remove(self.backup_path)
10652
+ else:
10653
+ self.log.append("\nDry run complete - no changes saved")
10654
+
10655
+ # Print summary
10656
+ summary = (
10657
+ f"\nUpdate Summary:\n"
10658
+ f"\tProcessed {stats["processed_sheets"]} sheetnames: {stats['processed_sheet_names']}\n"
10659
+ f"\tTotal updates: {stats['total_updates']}\n"
10660
+ f"\tSkipped rows: {stats['skipped_rows']}\n"
10661
+ )
10662
+ self.log.append(summary)
10663
+
10664
+ if show_progress:
10665
+ print(summary)
10666
+
10667
+ return stats
10668
+
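+ # Hedged usage sketch for the update strategies (file and column names are hypothetical):
+ # updater = DataFrameAlignExcel("inventory.xlsx", df)
+ # updater.update_values(
+ #     column_match={"SampleID": "SampleID"},
+ #     column_mapping={"Vials": "Vials"},
+ #     update_strategy="add",  # add DataFrame values onto the existing cell values
+ #     preview_only=True,      # report what would change without saving
+ # )
+ # updater.close()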
10669
+ def get_log(self) -> str:
10670
+ """Get the operation log as a string."""
10671
+ return "\n".join(self.log)
10672
+
10673
+ def close(self) -> None:
10674
+ """Close the workbook if we own it."""
10675
+ if self.wb is not None and self.owns_workbook:
10676
+ self.wb.close()
10677
+ self.wb = None
10678
+
10679
+
10680
+ DFToExcelMapping = Union[Dict[str, str], List[Tuple[str, str]]]
10681
+ def df_align(
10682
+ fpath: Union[str, Workbook],
10683
+ df: pd.DataFrame,
10684
+ sheet_name: Union[str, int, List[Union[str, int]]] = 0,
10685
+ header_row: int = 1,
10686
+ column_match: DFToExcelMapping = None,
10687
+ column_mapping: DFToExcelMapping = None,
10688
+ clean_keys: str = "strip_split_first",
10689
+ match_method: str = "exact",
10690
+ update_strategy: str = "overwrite",
10691
+ create_missing_columns: bool = True,
10692
+ preview_only: bool = False,
10693
+ show_progress: bool = True,
10694
+ skip_no_match: bool = True,
10695
+ make_backup: bool = True,
10696
+ dir_save: str = None,
10697
+ ) -> Dict[str, int]:
10698
+ """
10699
+ wb = fload(
10700
+ dir_aml,
10701
+ password="XBuzwVk4xsC2361cHzyi9JFgfJHaTSerjBOQ0JAJU24=",
10702
+ sheet_name=0,
10703
+ header=1,
10704
+ output="bit",
10705
+ )
10706
+ ws = wb[wb.sheetnames[0]]
10707
+ df_align(
10708
+ fpath=wb,
10709
+ df=df_,
10710
+ sheet_name=None,
10711
+ header_row=2,
10712
+ column_match={"SampleID": "SampleID"},# key是 df中的列名, value是 excel中,
10713
+ column_mapping={"Vials": "Vials", "Vials_": "Total Vials"}, # key是 df中的列名, value是 excel中,
10714
+ )
10715
+ """
10716
+ updater = DataFrameAlignExcel(fpath, df)
10717
+ try:
10718
+ result = updater.update_values(
10719
+ sheet_name=sheet_name,
10720
+ header_row=header_row,
10721
+ column_match=column_match,
10722
+ column_mapping=column_mapping,
10723
+ clean_keys=clean_keys,
10724
+ match_method=match_method,
10725
+ update_strategy=update_strategy,
10726
+ create_missing_columns=create_missing_columns,
10727
+ preview_only=preview_only,
10728
+ show_progress=show_progress,
10729
+ skip_no_match=skip_no_match,
10730
+ make_backup=make_backup,
10731
+ dir_save=dir_save,
10732
+ )
10733
+ return result
10734
+ finally:
10735
+ updater.close()
10736
+
10737
+
10738
+ # ! =========(Above) interact with workbook and DataFrame===========
10739
+ def set_sheet_visible(
10740
+ fpath: str,
10741
+ sheet_name: Union[int, str, None,list] = 1,
10742
+ show: Union[bool, str] = True,
10743
+ exclude: Union[List[str], None,list,int] = None,
10744
+ verbose: bool = False
10745
+ ) -> None:
10746
+ """
10747
+ Modify sheet visibility in an Excel workbook.
10748
+ set_sheet_visible(fpath=dir_data_collection,sheet_name=None,show=1,verbose=1)
10749
+ Args:
10750
+ fpath (str): Path to the Excel workbook.
10751
+ sheet_name (int | str | None): Index or name of the sheet to apply visibility to.
10752
+ If None, all sheets are considered.
10753
+ show (bool | str): Visibility mode. Can be:
10754
+ - True -> visible
10755
+ - False -> veryHidden
10756
+ - 'visible', 'hidden', 'veryHidden' as str
10757
+ exclude (list[str] | None): List of sheet names to exclude from changes.
10758
+ verbose (bool): If True, logs actions.
10759
+ """
10760
+
10761
+ try:
10762
+ wb = fload(fpath, output="bit", get_validations=1)
10763
+ except Exception as e:
10764
+ raise FileNotFoundError(f"Unable to load workbook: {e}")
10765
+
10766
+ sheet_names = wb.sheetnames
10767
+ if verbose:
10768
+ print("Workbook loaded with sheets:")
10769
+ for i, name in enumerate(sheet_names):
10770
+ print(f" [{i}] {name}")
10771
+
10772
+ excludes=[]
10773
+ if exclude is None:
10774
+ exclude=[]
10775
+ if not isinstance(exclude, list):
10776
+ exclude = [exclude]
10777
+ for exclude_ in exclude:
10778
+ if isinstance(exclude_, str):
10779
+ excludes.append(strcmp(exclude_, sheet_names)[0])
10780
+ elif isinstance(exclude_, int):
10781
+ if 0 <= exclude_ < len(sheet_names):
10782
+ excludes.append(sheet_names[exclude_])
10783
+ else:
10784
+ raise IndexError(f"sheet_name index {exclude_} is out of range:0~{len(sheet_names)-1}.")
10785
+
10786
+ # Resolve the sheet_name target
10787
+ target_indices = []
10788
+ if not isinstance(sheet_name,list):
10789
+ sheet_name=[sheet_name]
10790
+ for sheet_name_ in sheet_name:
10791
+ if sheet_name_ is None:
10792
+ target_indices = list(range(len(sheet_names)))
10793
+ break
10794
+ elif isinstance(sheet_name_, int):
10795
+ if 0 <= sheet_name_ < len(sheet_names):
10796
+ target_indices.append(sheet_name_)
10797
+ else:
10798
+ raise IndexError(f"sheet_name index {sheet_name_} is out of range :0~{len(sheet_names)-1}.")
10799
+ elif isinstance(sheet_name_, str):
10800
+ idx = strcmp(sheet_name_, sheet_names)[1]
10801
+ if idx == -1:
10802
+ raise ValueError(f"Sheet '{sheet_name_}' not found.")
10803
+ target_indices.append(idx)
10804
+
10805
+ # Map show argument to valid state
10806
+ valid_states = ["veryHidden", "visible", "hidden"]
10807
+ if isinstance(show, str):
10808
+ if show not in valid_states:
10809
+ raise ValueError(f"Invalid show value '{show}'. Must be one of {valid_states}")
10810
+ state = show
10811
+ else:
10812
+ state = "visible" if show else "veryHidden"
10813
+ # Modify sheet visibility
10814
+ for idx in target_indices:
10815
+ ws= wb[sheet_names[idx]]
10816
+ if ws.title in excludes:
10817
+ if verbose:
10818
+ print(f"Skipping excluded sheet: '{ws.title}'")
10819
+ continue
10820
+ ws.sheet_state = state
10821
+ # Ensure at least one sheet is visible
10822
+ visible_sheets = [s for s in wb.worksheets if s.sheet_state == "visible"]
10823
+ not_visible_sheets = [s for s in wb.worksheets if s.sheet_state != "visible"]
10824
+ if not visible_sheets:
10825
+ fallback_sheet = wb.worksheets[0]
10826
+ fallback_sheet.sheet_state = "visible"
10827
+ if verbose:
10828
+ print(f"No visible sheets found. Setting '{fallback_sheet.title}' to visible.")
10829
+ if verbose:
10830
+ print(f"visible sheets:{[s.title for s in visible_sheets]}")
10831
+
10832
+ try:
10833
+ wb.save(fpath)
10834
+ except Exception as e:
10835
+ raise IOError(f"Error saving workbook: {e}")
10836
+
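+ # Usage sketch (path is hypothetical): unhide every sheet except "Config",
+ # then hide the first sheet with "veryHidden" (cannot be re-shown from the Excel UI):
+ # set_sheet_visible("report.xlsx", sheet_name=None, show=True, exclude=["Config"])
+ # set_sheet_visible("report.xlsx", sheet_name=0, show="veryHidden")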
9755
10837
 
9756
10838
  def format_excel(
9757
10839
  df: pd.DataFrame=None,
@@ -9780,7 +10862,7 @@ def format_excel(
9780
10862
  number_format:dict=None, # dict: e.g., {1:"0.00", 2:"#,##0",3:"0%",4:"$#,##0.00"}
9781
10863
  data_validation=None, # dict
9782
10864
  template:dict={},# e.g., template=dict(path="xx.xlsx",sheet_name=['sheet_name1',"sheet_name2"])
9783
- apply_filter:bool=True, # add filter
10865
+ apply_filter:bool=False, # add filter
9784
10866
  freeze :str= False,#"A2",
9785
10867
  conditional_format:dict=None, # dict
9786
10868
  verbose:bool=False,
@@ -9942,6 +11024,67 @@ def format_excel(
9942
11024
  if end_col_letter
9943
11025
  else f"{start_col_letter}{start_row}"
9944
11026
  )
11027
+
11028
+
11029
+ def is_merged_cell(ws, cell):
11030
+ """Check if a cell is part of any merged range."""
11031
+ for merged_range in ws.merged_cells.ranges:
11032
+ if cell.coordinate in merged_range:
11033
+ return True
11034
+ return False
11035
+
11036
+ def apply_auto_width(ws, width_factor=1.2, width_padding=2, width_max=50):
11037
+ """
11038
+ Automatically adjust column widths based on content length,
11039
+ with complete protection against merged cell errors.
11040
+
11041
+ Args:
11042
+ ws: Worksheet object
11043
+ width_factor: Multiplier for content length (default 1.2)
11044
+ width_padding: Additional padding (default 2)
11045
+ width_max: Maximum column width (default 50)
11046
+ """
11047
+ # First build a set of all merged cell coordinates
11048
+ merged_coords = set()
11049
+ for merged_range in ws.merged_cells.ranges:
11050
+ for row in ws.iter_rows(min_row=merged_range.min_row,
11051
+ max_row=merged_range.max_row,
11052
+ min_col=merged_range.min_col,
11053
+ max_col=merged_range.max_col):
11054
+ for cell in row:
11055
+ merged_coords.add(cell.coordinate)
11056
+
11057
+ for col in ws.columns:
11058
+ if not col:
11059
+ continue
11060
+
11061
+ col_letter = get_column_letter(col[0].column)
11062
+ max_length = 0
11063
+
11064
+ for cell in col:
11065
+ # Skip merged cells entirely
11066
+ if cell.coordinate in merged_coords:
11067
+ continue
11068
+
11069
+ try:
11070
+ if cell.value is not None:
11071
+ # Handle both single-line and multi-line content
11072
+ cell_value = str(cell.value)
11073
+ lines = cell_value.split('\n')
11074
+ current_max = max(len(line) for line in lines)
11075
+ max_length = max(max_length, current_max)
11076
+ except Exception as e:
11077
+ print(f"Skipping cell {cell.coordinate} due to error: {e}")
11078
+ continue
11079
+
11080
+ # Calculate width with constraints
11081
+ adjusted_width = min(
11082
+ max(1, (max_length * width_factor) + width_padding),
11083
+ width_max if width_max is not None else float('inf')
11084
+ )
11085
+
11086
+ ws.column_dimensions[col_letter].width = adjusted_width
11087
+
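+ # Sketch: widen columns from the longest cell line, capped at 40 characters
+ # (ws is an already-loaded openpyxl worksheet):
+ # apply_auto_width(ws, width_factor=1.1, width_padding=3, width_max=40)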
9945
11088
  def apply_color_to_worksheet(ws=None, sheet_name=None, conditions=None, cell_idx=None,where="text"):
9946
11089
  """
9947
11090
  Apply text color formatting to a specific cell range in an openpyxl workbook based on conditions.
@@ -10047,6 +11190,11 @@ def format_excel(
10047
11190
 
10048
11191
  def apply_format(ws, cell, cell_range):
10049
11192
  """Apply cell formatting to a specified range."""
11193
+ # Get all merged cell coordinates first
11194
+ merged_cells = set()
11195
+ for merged_range in ws.merged_cells.ranges:
11196
+ for coord in merged_range.cells:
11197
+ merged_cells.add(coord)
10050
11198
  cell_font, cell_fill, cell_alignment, border = None, None, None, None
10051
11199
  kws_cell = ["font", "fill", "alignment", "border"]
10052
11200
  for K, _ in cell.items():
@@ -10244,6 +11392,7 @@ def format_excel(
10244
11392
  )
10245
11393
  # get colors config
10246
11394
  for k, v in cell.get(K, {}).items():
11395
+ print(k, v,strcmp(k, kws_border)[0])
10247
11396
  if strcmp(k, kws_border)[0] in ["color"]:
10248
11397
  border_color_all = hex2argb(v)
10249
11398
  # 如果设置了color,表示其它的所有的都设置成为一样的
@@ -10374,6 +11523,8 @@ def format_excel(
10374
11523
  #! final apply configs
10375
11524
  for row in ws[cell_range]:
10376
11525
  for cell_ in row:
11526
+ if cell_.coordinate in merged_cells:
11527
+ continue # Skip merged cells
10377
11528
  if cell_font:
10378
11529
  cell_.font = cell_font
10379
11530
  if cell_fill:
@@ -10451,11 +11602,9 @@ def format_excel(
10451
11602
  if not os.path.exists(filename) or mode=="w":
10452
11603
  # ws=wb.active
10453
11604
  # ws.title = sheet_name
10454
- ws = wb.create_sheet(title=sheet_name)
10455
- print(1)
11605
+ ws = wb.create_sheet(title=sheet_name)
10456
11606
  else:# file exists
10457
- wb = load_workbook(filename)
10458
- print(2)
11607
+ wb = load_workbook(filename)
10459
11608
  # with pd.ExcelWriter(filename, mode="a", engine=engine, if_sheet_exists=if_sheet_exists) as writer:
10460
11609
  # for ws in wb.worksheets: # Iterate through worksheets in the input workbook
10461
11610
  # ws_df = pd.DataFrame(ws.values)
@@ -10782,44 +11931,62 @@ def format_excel(
10782
11931
  if freeze:
10783
11932
  ws.freeze_panes = freeze # Freeze everything above and to the left of A2
10784
11933
  # !widths
10785
- if isinstance(width,bool):
11934
+ if isinstance(width, bool):
10786
11935
  width=None if width else False
10787
11936
  if isinstance(height,bool):
10788
11937
  height=None if height else False
10789
- if width is None or width=={}: # automatic adust width
10790
- for col in ws.columns:
10791
- max_length = 0
10792
- """column = col[0].column_letter # Get the column letter"""
10793
- # Check the first cell in the column to get the column letter
10794
- cell_first = col[0]
10795
11938
 
10796
- # Check if the cell is part of a merged range
10797
- if not any(cell_first.coordinate in range_ for range_ in ws.merged_cells.ranges):
10798
- column = get_column_letter(cell_first.column) # Get the column letter from the first cell
10799
- else:
10800
- # Skip the column if the first cell is merged
11939
+ merged_cells = set()
11940
+ for merged_range in ws.merged_cells.ranges:
11941
+ for row in ws.iter_rows(min_row=merged_range.min_row,
11942
+ max_row=merged_range.max_row,
11943
+ min_col=merged_range.min_col,
11944
+ max_col=merged_range.max_col):
11945
+ for cell in row:
11946
+ merged_cells.add(cell.coordinate)
11947
+ if width is None or width == {}: # automatic adjust width
11948
+ print("auto-width")
11949
+ for col in ws.columns:
11950
+ if not col:
10801
11951
  continue
10802
- for cell_ in col:
10803
- try:
10804
- if cell_.value:
10805
- max_length = max(max_length, len(str(cell_.value)))
10806
- except Exception:
10807
- pass
10808
- adjusted_width = max_length*width_factor+width_padding
10809
- if width_max is not None:
10810
- adjusted_width = min(adjusted_width, width_max)
10811
- ws.column_dimensions[column].width = max(5,adjusted_width)
10812
- elif isinstance(width, (int, float)): # set all columns to this value
11952
+ try:
11953
+ col_letter = get_column_letter(col[0].column)
11954
+
11955
+ # Skip entire column if any cell is merged
11956
+ if any(cell.coordinate in merged_cells for cell in col):
11957
+ continue
11958
+
11959
+ max_length = 0
11960
+ for cell in col:
11961
+ try:
11962
+ if cell.value:
11963
+ cell_value = str(cell.value)
11964
+ if '\n' in cell_value:
11965
+ max_line_length = max(len(line) for line in cell_value.split('\n'))
11966
+ max_length = max(max_length, max_line_length)
11967
+ else:
11968
+ max_length = max(max_length, len(cell_value))
11969
+ except:
11970
+ pass
11971
+
11972
+ adjusted_width = (max_length * width_factor) + width_padding
11973
+ if width_max is not None:
11974
+ adjusted_width = min(adjusted_width, width_max)
11975
+ ws.column_dimensions[col_letter].width = max(5, adjusted_width)
11976
+
11977
+ except Exception as e:
11978
+ print(f"Error adjusting width for column: {e}")
11979
+ continue
11980
+ elif isinstance(width, (int, float)): # set all columns to this value
11981
+ print("set to fixed width {}".format(width))
10813
11982
  for col in ws.columns:
10814
- column=get_column_letter(col[0].column)
10815
- ws.column_dimensions[column].width=width*width_factor+width_padding
10816
- elif isinstance(width,bool):
10817
- pass
10818
- else:
11983
+ column = get_column_letter(col[0].column)
11984
+ ws.column_dimensions[column].width = width * width_factor + width_padding
11985
+ elif isinstance(width, dict): # custom widths per column
10819
11986
  for col_idx, width_ in width.items():
10820
11987
  col_letter = get_column_letter(col_idx)
10821
11988
  ws.column_dimensions[col_letter].width = width_
10822
-
11989
+
10823
11990
  # !heights
10824
11991
  if height is None or height=={}: # automatic adust height
10825
11992
  for row in ws.iter_rows(min_row=1, max_row=ws.max_row):
@@ -11276,9 +12443,28 @@ def format_excel(
11276
12443
 
11277
12444
  # ungroup sheets
11278
12445
  for sheet in wb.worksheets:
11279
- sheet.sheet_view.tabSelected = False
12446
+ sheet.sheet_view.tabSelected = False
11280
12447
  # !Save the workbook
11281
- wb.save(filename)
12448
+ try:
12449
+ wb.save(filename)
12450
+ except Exception as e:
12451
+ print(f"Error saving workbook: {str(e)}")
12452
+ # Replace your final save operation with this:
12453
+ # try:
12454
+ # # Create a temporary file for safer saving
12455
+ # temp_filename = filename + '.tmp'
12456
+ # wb.save(temp_filename)
12457
+
12458
+ # # If save succeeds, replace original file
12459
+ # if os.path.exists(filename):
12460
+ # os.remove(filename)
12461
+ # os.rename(temp_filename, filename)
12462
+
12463
+ # except Exception as e:
12464
+ # print(f"Error saving workbook: {str(e)}")
12465
+ # if os.path.exists(temp_filename):
12466
+ # os.remove(temp_filename)
12467
+ # raise
11282
12468
 
11283
12469
 
11284
12470
  def preview(var):
@@ -13282,61 +14468,630 @@ def df_fillna(
13282
14468
  strategy="constant", fill_value=constant
13283
14469
  )
13284
14470
  else:
13285
- non_numeric_imputer = SimpleImputer(strategy="most_frequent")
14471
+ non_numeric_imputer = SimpleImputer(strategy="most_frequent")
14472
+
14473
+ # Impute non-numeric columns column-wise (axis=0)
14474
+ imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
14475
+
14476
+ # Convert imputed non-numeric array back to DataFrame with original index and column names
14477
+ imputed_non_numeric_df = pd.DataFrame(
14478
+ imputed_non_numeric,
14479
+ index=non_numeric_data.index,
14480
+ columns=non_numeric_data.columns,
14481
+ )
14482
+ else:
14483
+ imputed_non_numeric_df = pd.DataFrame(index=data.index)
14484
+
14485
+ imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
14486
+ columns=data.columns
14487
+ )
14488
+
14489
+ if inplace:
14490
+ # Modify the original DataFrame
14491
+ data[:] = imputed_data[col_names_org]
14492
+ return None
14493
+ else:
14494
+ # Return the modified DataFrame
14495
+ return imputed_data[col_names_org]
14496
+
14497
+
14498
+ # # example
14499
+ # data = {
14500
+ # "A": [1, 2, np.nan, 4, 5],
14501
+ # "B": [np.nan, 2, 3, 4, np.nan],
14502
+ # "C": [1, np.nan, 3, 4, 5],
14503
+ # "D": [1, 2, 3, 4, np.nan],
14504
+ # }
14505
+
14506
+ # # Define a function to test each imputation method
14507
+ # methods = [
14508
+ # "mean",
14509
+ # "median",
14510
+ # "most_frequent",
14511
+ # "constant",
14512
+ # "knn",
14513
+ # "iterative",
14514
+ # # "missforest",
14515
+ # # "softimpute",
14516
+ # # "svd",
14517
+ # ]
14518
+
14519
+ # # Create a dictionary to hold results
14520
+ # results = {}
14521
+
14522
+ # for method_name in methods:
14523
+ # print(method_name)
14524
+ # display(df)
14525
+ # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
14526
+ def df_cut(
14527
+ df: pd.DataFrame,
14528
+ column: str,
14529
+ *,
14530
+ new_col_name: Optional[str] = None,
14531
+ bins: Optional[
14532
+ Union[int, List[float], Dict[str, Union[float, str, pd.Timestamp]]]
14533
+ ] = None,
14534
+ range_start: Optional[Union[float, str, pd.Timestamp]] = None,
14535
+ range_end: Optional[Union[float, str, pd.Timestamp]] = None,
14536
+ step: Optional[Union[float, str, pd.Timedelta]] = None,
14537
+ labels: Optional[List[str]] = None,
14538
+ label_format: Optional[Union[str, Callable[[float, float], str]]] = None,
14539
+ include_overflow: bool = True,
14540
+ include_underflow: bool = False,
14541
+ right: bool = False,
14542
+ drop_original: bool = False,
14543
+ precision: int = 2,
14544
+ show_count: bool = False,
14545
+ symbol_count: str = "n=",
14546
+ show_percentage: bool = False,
14547
+ symbol_percentage: str = "%",
14548
+ show_total_count: bool = False,
14549
+ symbol_total_count: str = "∑n=",
14550
+ sep_between: str = " | ",
14551
+ sort_labels: bool = True,
14552
+ na_action: str = "keep",
14553
+ na_fill_value: Optional[str] = None,
14554
+ dtype: Optional[Union[str, pd.CategoricalDtype]] = None,
14555
+ ordered: bool = True,
14556
+ inplace: bool = False,
14557
+ datetime_format: str = "%Y-%m-%d",
14558
+ categorical_agg: str = "count",
14559
+ ) -> Optional[pd.DataFrame]:
14560
+ """
14561
+ Enhanced binning function that works with numeric, datetime, and categorical columns.
14562
+
14563
+ Features:
14564
+ - Automatic type detection (numeric, datetime, categorical)
14565
+ - Flexible bin specification (number of bins, explicit edges, or range+step)
14566
+ - Customizable labels with formatting
14567
+ - Count and percentage display options
14568
+ - NA value handling
14569
+ Square brackets denote inclusive bin edges;
14570
+ parentheses denote exclusive bin edges.
14571
+ Parameters:
14572
+ -----------
14573
+ df : pd.DataFrame
14574
+ Input DataFrame containing the column to bin
14575
+ column : str
14576
+ Name of column to bin
14577
+ new_col_name : str, optional
14578
+ Name for binned column (default: f"{column}_binned")
14579
+ bins : int, list, or dict, optional
14580
+ - int: Number of equal-width bins
14581
+ - list: Explicit bin edges
14582
+ - dict: {'start': x, 'end': y, 'step': z} for range specification
14583
+ range_start : float or datetime-like, optional
14584
+ Start value for bin range (required if bins is None or dict)
14585
+ range_end : float or datetime-like, optional
14586
+ End value for bin range (default: max of column)
14587
+ step : float or timedelta-like, optional
14588
+ Step size for bin creation (required if bins is None or dict)
14589
+ labels : list of str, optional
14590
+ Custom labels for bins (must match number of bins)
14591
+ label_format : str or callable, optional
14592
+ Format string or function for bin labels
14593
+ include_overflow : bool, default True
14594
+ Include catch-all bin for values above range_end
14595
+ include_underflow : bool, default False
14596
+ Include catch-all bin for values below range_start
14597
+ right : bool, default False
14598
+ Whether bins include the right edge
14599
+ drop_original : bool, default False
14600
+ Drop original column after binning
14601
+ precision : int, default 2
14602
+ Decimal precision for numeric bin labels
14603
+ show_count : bool, default False
14604
+ Show count of items in each bin
14605
+ show_percentage : bool, default False
14606
+ Show percentage of items in each bin
14607
+ show_total_count : bool, default False
14608
+ Show total count in labels
14609
+ na_action : str, default 'keep'
14610
+ How to handle NA values ('keep', 'drop', or 'fill')
14611
+ na_fill_value : str, optional
14612
+ Value to fill NAs with if na_action='fill'
14613
+ dtype : dtype or CategoricalDtype, optional
14614
+ Output dtype for binned column
14615
+ ordered : bool, default True
14616
+ Whether bins are ordered
14617
+ inplace : bool, default False
14618
+ Modify DataFrame in place
14619
+ datetime_format : str, default "%Y-%m-%d"
14620
+ Format string for datetime labels
14621
+ categorical_agg : str, default 'count'
14622
+ For categorical data: 'count' or 'ratio'
14623
+
14624
+ Returns:
14625
+ --------
14626
+ pd.DataFrame or None
14627
+ Returns modified DataFrame unless inplace=True
14628
+
14629
+ Examples:
14630
+ --------
14631
+ # Numeric binning
14632
+ df_cut(df, 'age', bins=5)
14633
+ df_cut(df, 'price', range_start=0, range_end=1000, step=100)
14634
+
14635
+ # Datetime binning
14636
+ df_cut(df, 'date', bins={'start': '2023-01-01', 'end': '2023-12-31', 'step': '1M'})
14637
+
14638
+ # Categorical binning
14639
+ df_cut(df, 'category', bins=5, categorical_agg='ratio')
14640
+
14641
+ # Sample datetime data
14642
+ dates = pd.date_range("2020-01-01", "2023-12-31", freq="D")
14643
+ df = pd.DataFrame(
14644
+ {
14645
+ "order_date": np.random.choice(dates, 500),
14646
+ "delivery_time": np.random.randint(1, 72, 500), # hours
14647
+ }
14648
+ )
14649
+ # Example 1: Monthly bins
14650
+ # Monthly binning with exact month boundaries
14651
+ df_cut(
14652
+ df,
14653
+ "order_date",
14654
+ bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1Y"},
14655
+ datetime_format="%Y-%m-%d",
14656
+ label_format="%m-%d",
14657
+ show_count=True,
14658
+ show_percentage=True,
14659
+ show_total_count=True,
14660
+ )
14661
+ # Weekly binning
14662
+ df_cut(
14663
+ df,
14664
+ "order_date",
14665
+ bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1W"},
14666
+ label_format="%Y-%m-%d",
14667
+ datetime_format="%Y-%m-%d",
14668
+ show_count=True,
14669
+ show_percentage=True,
14670
+ show_total_count=True,
14671
+ )
14672
+
14673
+
14674
+ # Sample numeric data
14675
+ df = pd.DataFrame(
14676
+ {"price": np.random.uniform(10, 1000, 1000), "age": np.random.randint(18, 80, 1000)}
14677
+ )
14678
+
14679
+ # Example 1: Equal-width bins
14680
+ df_cut(df, "price", bins=5, show_count=True)
14681
+
14682
+ # Example 2: Custom range with step
14683
+ df_cut(
14684
+ df,
14685
+ "price",
14686
+ range_start=0,
14687
+ range_end=1000,
14688
+ step=200,
14689
+ label_format="${left:.0f}-${right:.0f}",
14690
+ show_percentage=True,
14691
+ )
14692
+ df_cut(
14693
+ df,
14694
+ "price",
14695
+ bins={"start": 0, "end": 1000, "step": 200},
14696
+ # label_format="${left:.0f}-${right:.0f}",
14697
+ show_percentage=True,
14698
+ )
14699
+ """
14700
+ from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
14701
+
14702
+ def _process_time_step(step: Union[str, int, float, pd.Timedelta]) -> str:
14703
+ """Convert step to pandas frequency string."""
14704
+ if isinstance(step, pd.Timedelta):
14705
+ return step.freqstr if step.freqstr else str(step)
14706
+
14707
+ if isinstance(step, (int, float)):
14708
+ return f"{step}S" # Interpret numbers as seconds
14709
+
14710
+ if isinstance(step, str):
14711
+ step = step.strip().lower()
14712
+ match = re.match(r"(\d*\.?\d+)?\s*([a-z]+)", step)
14713
+ if not match:
14714
+ raise ValueError(f"Invalid time step format: {step}")
14715
+
14716
+ num_part, unit_part = match.groups()
14717
+ num = float(num_part) if num_part else 1.0
14718
+
14719
+ unit_map = {
14720
+ "y": "Y",
14721
+ "yr": "Y",
14722
+ "yrs": "Y",
14723
+ "year": "Y",
14724
+ "years": "Y",
14725
+ "m": "M",
14726
+ "mo": "M",
14727
+ "mon": "M",
14728
+ "month": "M",
14729
+ "months": "M",
14730
+ "w": "W",
14731
+ "wk": "W",
14732
+ "wks": "W",
14733
+ "week": "W",
14734
+ "weeks": "W",
14735
+ "d": "D",
14736
+ "day": "D",
14737
+ "days": "D",
14738
+ "h": "H",
14739
+ "hr": "H",
14740
+ "hrs": "H",
14741
+ "hour": "H",
14742
+ "hours": "H",
14743
+ "min": "T",
14744
+ "mins": "T",
14745
+ "minute": "T",
14746
+ "minutes": "T",
14747
+ "s": "S",
14748
+ "sec": "S",
14749
+ "secs": "S",
14750
+ "second": "S",
14751
+ "seconds": "S",
14752
+ }
14753
+
14754
+ if unit_part not in unit_map:
14755
+ raise ValueError(f"Unknown time unit: {unit_part}")
14756
+
14757
+ freq = unit_map[unit_part]
14758
+ if num.is_integer():
14759
+ num = int(num)
14760
+ return f"{num}{freq}"
14761
+
14762
+ raise TypeError(f"Unsupported step type: {type(step)}")
14763
+
14764
+
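+ # Sketch of the step-to-frequency mapping implemented above:
+ # _process_time_step("1y")      -> "1Y"   (yearly bins)
+ # _process_time_step("2 weeks") -> "2W"
+ # _process_time_step("30 mins") -> "30T"  (30-minute bins)
+ # _process_time_step(90)        -> "90S"  (bare numbers are treated as seconds)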
14765
+ def _process_datetime_column(
14766
+ col: pd.Series,
14767
+ bins: Optional[Union[int, List[pd.Timestamp]]],
14768
+ range_start: Optional[Union[str, pd.Timestamp]],
14769
+ range_end: Optional[Union[str, pd.Timestamp]],
14770
+ step: Optional[Union[str, pd.Timedelta]],
14771
+ labels: Optional[List[str]],
14772
+ label_format: Optional[Union[str, Callable]],
14773
+ datetime_format: str,
14774
+ right: bool,
14775
+ include_underflow: bool,
14776
+ include_overflow: bool,
14777
+ ) -> Tuple[pd.Categorical, List[str]]:
14778
+ """Process datetime column with accurate counting."""
14779
+ col = pd.to_datetime(col)
14780
+
14781
+ # Handle bin edges
14782
+ if bins is None:
14783
+ if step is None:
14784
+ raise ValueError("Step must be provided for datetime binning")
14785
+
14786
+ # Convert step to pandas frequency string
14787
+ step_freq = _process_time_step(step)
14788
+
14789
+ # Set default range if needed
14790
+ range_start = (
14791
+ pd.to_datetime(range_start) if range_start is not None else col.min()
14792
+ )
14793
+ range_end = pd.to_datetime(range_end) if range_end is not None else col.max()
14794
+
14795
+ # Generate bins
14796
+ try:
14797
+ bin_edges = pd.date_range(start=range_start, end=range_end, freq=step_freq)
14798
+ if len(bin_edges) == 0:
14799
+ bin_edges = pd.date_range(start=range_start, end=range_end, periods=2)
14800
+ elif bin_edges[-1] < range_end:
14801
+ bin_edges = bin_edges.append(pd.DatetimeIndex([range_end]))
14802
+ except ValueError as e:
14803
+ raise ValueError(f"Invalid frequency specification: {step_freq}") from e
14804
+ elif isinstance(bins, int):
14805
+ bin_edges = pd.date_range(start=col.min(), end=col.max(), periods=bins + 1)
14806
+ else:
14807
+ bin_edges = pd.to_datetime(bins)
14808
+
14809
+ # Add overflow/underflow bins
14810
+ if include_underflow:
14811
+ bin_edges = bin_edges.insert(0, pd.Timestamp.min)
14812
+ if include_overflow:
14813
+ bin_edges = bin_edges.append(pd.DatetimeIndex([pd.Timestamp.max]))
14814
+
14815
+ # Perform the cut - this is where we ensure proper binning
14816
+ binned = pd.cut(
14817
+ col.astype("int64"), # Convert to nanoseconds for precise binning
14818
+ bins=bin_edges.astype("int64"),
14819
+ right=right,
14820
+ include_lowest=True,
14821
+ )
14822
+
14823
+ # Generate labels if not provided
14824
+ if labels is None:
14825
+ labels = []
14826
+ for i in range(len(bin_edges) - 1):
14827
+ left = bin_edges[i]
14828
+ right_ = bin_edges[i + 1]
14829
+
14830
+ # Handle special cases
14831
+ if left == pd.Timestamp.min:
14832
+ left_str = "<"
14833
+ else:
14834
+ left_str = left.strftime(datetime_format)
14835
+
14836
+ if right_ == pd.Timestamp.max:
14837
+ right_str = ">"
14838
+ else:
14839
+ right_str = right_.strftime(datetime_format)
14840
+
14841
+ # Apply label formatting
14842
+ if callable(label_format):
14843
+ label = label_format(left, right_)
14844
+ elif isinstance(label_format, str):
14845
+ try:
14846
+ if left != pd.Timestamp.min and right_ != pd.Timestamp.max:
14847
+ label = f"{left.strftime(label_format)}-{right_.strftime(label_format)}"
14848
+ else:
14849
+ label = f"{left_str}-{right_str}"
14850
+ except (ValueError, AttributeError):
14851
+ label = f"{left_str}-{right_str}"
14852
+ else:
14853
+ label = f"{left_str}-{right_str}"
14854
+
14855
+ labels.append(label)
14856
+
14857
+ return binned, labels
14858
+
14859
+
14860
+ def _process_categorical_column(
14861
+ col: pd.Series,
14862
+ bins: Optional[Union[int, List[str]]],
14863
+ labels: Optional[List[str]],
14864
+ categorical_agg: str,
14865
+ ) -> Tuple[pd.Categorical, List[str]]:
14866
+ value_counts = col.value_counts(normalize=(categorical_agg == "ratio"))
14867
+
14868
+ if bins is not None and isinstance(bins, int):
14869
+ top_categories = value_counts.head(bins).index
14870
+ binned = col.where(col.isin(top_categories), "Other")
14871
+ elif isinstance(bins, list):
14872
+ binned = col.where(col.isin(bins), "Other")
14873
+ else:
14874
+ binned = col
14875
+
14876
+ binned = binned.astype("category")
14877
+
14878
+ if labels is not None:
14879
+ binned = binned.cat.rename_categories(dict(zip(binned.cat.categories, labels)))
14880
+
14881
+ return binned, list(binned.cat.categories)
14882
+
14883
+
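+ # Sketch (column name is hypothetical): with bins=3 the three most frequent
+ # categories are kept and the rest collapse into "Other"; a list keeps only the listed values:
+ # binned, cats = _process_categorical_column(df["category"], bins=3, labels=None, categorical_agg="count")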
14884
+ def _process_numeric_column(
14885
+ col: pd.Series,
14886
+ bins: Optional[Union[int, List[float]]],
14887
+ range_start: Optional[float],
14888
+ range_end: Optional[float],
14889
+ step: Optional[float],
14890
+ labels: Optional[List[str]],
14891
+ label_format: Optional[Union[str, Callable]],
14892
+ precision: int,
14893
+ right: bool,
14894
+ include_underflow: bool,
14895
+ include_overflow: bool,
14896
+ ) -> Tuple[pd.Categorical, List[str]]:
14897
+ if bins is None:
14898
+ if range_start is None or step is None:
14899
+ raise ValueError("If bins not provided, must set range_start and step")
14900
+ if range_end is None:
14901
+ range_end = col.max()
14902
+
14903
+ bin_edges = list(np.arange(range_start, range_end + step, step))
14904
+ elif isinstance(bins, int):
14905
+ bin_edges = np.linspace(col.min(), col.max(), bins + 1).tolist()
14906
+ else:
14907
+ bin_edges = list(bins)
14908
+
14909
+ # Add overflow/underflow bins if needed
14910
+ if include_underflow and not np.isinf(bin_edges[0]):
14911
+ bin_edges.insert(0, float("-inf"))
14912
+ if include_overflow and not np.isinf(bin_edges[-1]):
14913
+ bin_edges.append(float("inf"))
14914
+
14915
+ # Generate labels if not provided
14916
+ if labels is None:
14917
+ labels = []
14918
+ for i in range(len(bin_edges) - 1):
14919
+ left = round(bin_edges[i], precision)
14920
+ right_ = round(bin_edges[i + 1], precision)
14921
+
14922
+ if label_format:
14923
+ label = (
14924
+ label_format(left, right_)
14925
+ if callable(label_format)
14926
+ else label_format.format(left=left, right=right_)
14927
+ )
14928
+ else:
14929
+ if np.isinf(left) and left < 0:
14930
+ label = f"<{right_}"
14931
+ elif np.isinf(right_):
14932
+ label = f">{left}"
14933
+ else:
14934
+ label = f"[{left}, {right_}{']' if right else ')'}"
13286
14935
 
13287
- # Impute non-numeric columns column-wise (axis=0)
13288
- imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
14936
+ labels.append(label)
13289
14937
 
13290
- # Convert imputed non-numeric array back to DataFrame with original index and column names
13291
- imputed_non_numeric_df = pd.DataFrame(
13292
- imputed_non_numeric,
13293
- index=non_numeric_data.index,
13294
- columns=non_numeric_data.columns,
14938
+ binned = pd.cut(
14939
+ col, bins=bin_edges, labels=labels, right=right, include_lowest=True
13295
14940
  )
13296
- else:
13297
- imputed_non_numeric_df = pd.DataFrame(index=data.index)
14941
+ return binned, labels
14942
+
14943
+
14944
+ def _handle_na_values(
14945
+ col: pd.Series, na_action: str, na_fill_value: Optional[str]
14946
+ ) -> pd.Series:
14947
+ if na_action == "drop":
14948
+ return col.dropna()
14949
+ elif na_action == "fill" and na_fill_value is not None:
14950
+ return col.fillna(na_fill_value)
14951
+ return col
14952
+
14953
+
14954
+ def _add_statistical_labels(
14955
+ binned: pd.Categorical,
14956
+ labels: List[str],
14957
+ show_count: bool,
14958
+ show_percentage: bool,
14959
+ show_total_count: bool,
14960
+ symbol_count: str,
14961
+ symbol_percentage: str,
14962
+ symbol_total_count: str,
14963
+ sep_between: str,
14964
+ ) -> List[str]:
14965
+ """Add statistical information with accurate counts."""
14966
+ # Get counts by matching the exact bin intervals
14967
+ value_counts = binned.value_counts()
14968
+ total = len(binned.dropna())
14969
+
14970
+ new_labels = []
14971
+ for i, (label, category) in enumerate(zip(labels, binned.cat.categories)):
14972
+ count = value_counts.get(category, 0)
14973
+ parts = [label]
14974
+
14975
+ if show_count:
14976
+ parts.append(f"{symbol_count}{count}")
14977
+ if show_percentage:
14978
+ percentage = (count / total * 100) if total > 0 else 0
14979
+ parts.append(f"{percentage:.1f}{symbol_percentage}")
14980
+ if show_total_count:
14981
+ parts.append(f"{symbol_total_count}{total}")
14982
+
14983
+ # Ensure unique labels
14984
+ new_label = sep_between.join(parts)
14985
+ if new_label in new_labels:
14986
+ new_label = f"{new_label}_{i}"
14987
+ new_labels.append(new_label)
14988
+
14989
+ return new_labels
14990
+
14991
+
14992
+ def _sort_bin_labels(binned: pd.Categorical, labels: List[str]) -> pd.Categorical:
14993
+ try:
14994
+ # Attempt to sort by the underlying intervals
14995
+ sorted_categories = sorted(binned.cat.categories)
14996
+ binned = binned.cat.reorder_categories(sorted_categories, ordered=True)
14997
+ except Exception:
14998
+ # If sorting fails (e.g., string labels), fallback to given label order
14999
+ binned = binned.cat.set_categories(labels, ordered=True)
15000
+ return binned
15001
+ # Input validation
15002
+ if column not in df.columns:
15003
+ raise ValueError(f"Column '{column}' not found in DataFrame")
13298
15004
 
13299
- imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
13300
- columns=data.columns
13301
- )
15005
+ if not inplace:
15006
+ df = df.copy()
13302
15007
 
13303
- if inplace:
13304
- # Modify the original DataFrame
13305
- data[:] = imputed_data[col_names_org]
13306
- return None
15008
+ col_data = df[column]
15009
+
15010
+ # Determine column type
15011
+ if is_datetime64_any_dtype(col_data):
15012
+ col_type = "datetime"
15013
+ col_data = pd.to_datetime(col_data)
15014
+ elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == "object":
15015
+ col_type = "categorical"
15016
+ elif is_numeric_dtype(col_data):
15017
+ col_type = "numeric"
13307
15018
  else:
13308
- # Return the modified DataFrame
13309
- return imputed_data[col_names_org]
15019
+ raise TypeError(f"Unsupported column type: {col_data.dtype}")
15020
+
15021
+ # Handle dictionary bin specification
15022
+ if isinstance(bins, dict):
15023
+ range_start = bins.get("start", range_start)
15024
+ range_end = bins.get("end", range_end)
15025
+ step = bins.get("step", step)
15026
+ bins = None
15027
+
15028
+ # Process based on column type
15029
+ if col_type == "datetime":
15030
+ binned, bin_labels = _process_datetime_column(
15031
+ col_data,
15032
+ bins,
15033
+ range_start,
15034
+ range_end,
15035
+ step,
15036
+ labels,
15037
+ label_format,
15038
+ datetime_format,
15039
+ right,
15040
+ include_underflow,
15041
+ include_overflow,
15042
+ )
15043
+ elif col_type == "categorical":
15044
+ binned, bin_labels = _process_categorical_column(
15045
+ col_data, bins, labels, categorical_agg
15046
+ )
15047
+ else:
15048
+ binned, bin_labels = _process_numeric_column(
15049
+ col_data,
15050
+ bins,
15051
+ range_start,
15052
+ range_end,
15053
+ step,
15054
+ labels,
15055
+ label_format,
15056
+ precision,
15057
+ right,
15058
+ include_underflow,
15059
+ include_overflow,
15060
+ )
13310
15061
 
15062
+ # Handle NA values
15063
+ binned = _handle_na_values(binned, na_action, na_fill_value)
15064
+
15065
+ # Add statistical information to labels if requested
15066
+ if show_count or show_percentage or show_total_count:
15067
+ bin_labels = _add_statistical_labels(
15068
+ binned,
15069
+ bin_labels,
15070
+ show_count,
15071
+ show_percentage,
15072
+ show_total_count,
15073
+ symbol_count,
15074
+ symbol_percentage,
15075
+ symbol_total_count,
15076
+ sep_between,
15077
+ )
15078
+ binned = binned.cat.rename_categories(
15079
+ dict(zip(binned.cat.categories, bin_labels))
15080
+ )
13311
15081
 
13312
- # # example
13313
- # data = {
13314
- # "A": [1, 2, np.nan, 4, 5],
13315
- # "B": [np.nan, 2, 3, 4, np.nan],
13316
- # "C": [1, np.nan, 3, 4, 5],
13317
- # "D": [1, 2, 3, 4, np.nan],
13318
- # }
15082
+ # Sort labels if requested
15083
+ if sort_labels and not right and len(bin_labels) > 1:
15084
+ binned = _sort_bin_labels(binned, bin_labels)
13319
15085
 
13320
- # # Define a function to test each imputation method
13321
- # methods = [
13322
- # "mean",
13323
- # "median",
13324
- # "most_frequent",
13325
- # "constant",
13326
- # "knn",
13327
- # "iterative",
13328
- # # "missforest",
13329
- # # "softimpute",
13330
- # # "svd",
13331
- # ]
15086
+ # Create final output column
15087
+ new_col = new_col_name or f"{column}_binned"
15088
+ df[new_col] = binned.astype(dtype) if dtype else binned
13332
15089
 
13333
- # # Create a dictionary to hold results
13334
- # results = {}
15090
+ if drop_original:
15091
+ df.drop(columns=[column], inplace=True)
15092
+
15093
+ return None if inplace else df
13335
15094
 
13336
- # for method_name in methods:
13337
- # print(method_name)
13338
- # display(df)
13339
- # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
13340
15095
 
13341
15096
 
13342
15097
  def df_encoder(
@@ -14580,209 +16335,213 @@ def df_reducer(
14580
16335
 
14581
16336
  # example:
14582
16337
  # df_reducer(data=data_log, columns=markers, n_components=2)
16338
+
14583
16339
 
14584
16340
 
14585
- def get_df_format(data, threshold_unique=0.5, verbose=False):
16341
+ def get_df_format(data, threshold_unique=0.5, verbose=False, sample_size=1000):
14586
16342
  """
14587
- 检测表格: long, wide or uncertain.
14588
-
16343
+ Detect whether a DataFrame is in long or wide format with optimized performance and accuracy.
16344
+
14589
16345
  Parameters:
14590
- - data (pd.DataFrame): DataFrame to check.
14591
- - threshold_unique (float): Proportion threshold for detecting categorical columns.
14592
-
16346
+ - data (pd.DataFrame): DataFrame to analyze
16347
+ - threshold_unique (float): Threshold for categorical column detection (0-1)
16348
+ - verbose (bool): Whether to print diagnostic messages
16349
+ - sample_size (int): Maximum number of rows/columns to sample for large datasets
16350
+
14593
16351
  Returns:
14594
- - "long" if detected as long format,
16352
+ - "long" if detected as long format
14595
16353
  - "wide" if detected as wide format
14596
- - "uncertain" if ambiguous.
16354
+ - "uncertain" if format is ambiguous
14597
16355
  """
16356
+ import pandas as pd
16357
+ import numpy as np
14598
16358
  from scipy.stats import entropy
14599
16359
  from sklearn.cluster import AgglomerativeClustering
14600
16360
  from sklearn.preprocessing import StandardScaler
14601
-
14602
- long_score,wide_score,fs = 0,0,500
14603
- n_rows, n_cols = data.shape
14604
- # -----to reduce memory, only check 500 rows/columns----
14605
- if n_rows > fs:
14606
- if verbose:
14607
- print(f"Sampling {fs} rows from {n_rows} rows.")
14608
- data = data.sample(n=fs, random_state=1)
14609
- if n_cols > fs:
14610
- if verbose:
14611
- print(f"Using first {fs} columns out of {n_cols} columns.")
14612
- data = data.iloc[:, :fs]
16361
+ from sklearn.metrics import pairwise_distances
16362
+ from collections import Counter
16363
+ import re
16364
+ # ----- Initial Setup and Sampling -----
14613
16365
  n_rows, n_cols = data.shape
16366
+ if verbose:
16367
+ print(f"Initial shape: {n_rows} rows, {n_cols} columns")
14614
16368
 
14615
- # Step 1: Row-Column Ratio Heuristic
14616
- if n_rows > 3 * n_cols:
14617
- long_score += 2
14618
- if verbose:
14619
- print(
14620
- "Row-Column Ratio suggests long format (many rows relative to columns)."
14621
- )
14622
- elif n_cols > 3 * n_rows:
14623
- wide_score += 2
14624
- if verbose:
14625
- print(
14626
- "Row-Column Ratio suggests wide format (many columns relative to rows)."
14627
- )
14628
-
14629
- # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
14630
- unique_counts = data.apply(lambda x: x.nunique())
16369
+ # Sample data if too large
16370
+ if n_rows > sample_size:
16371
+ data = data.sample(n=sample_size, random_state=42)
16372
+ n_rows = sample_size
16373
+ if n_cols > sample_size:
16374
+ data = data.iloc[:, :sample_size]
16375
+ n_cols = sample_size
16376
+
16377
+ # Early exit for tiny datasets
16378
+ if n_rows < 3 or n_cols < 3:
16379
+ return "uncertain"
16380
+
16381
+ long_score = 0
16382
+ wide_score = 0
16383
+
16384
+ # ----- Feature Extraction -----
16385
+ # Basic statistics
16386
+ row_col_ratio = n_rows / n_cols if n_cols != 0 else float('inf')
16387
+
16388
+ # Column types
16389
+ numeric_cols = data.select_dtypes(include=np.number).columns
16390
+ cat_cols = data.select_dtypes(include=['object', 'category']).columns
16391
+ other_cols = [col for col in data.columns if col not in numeric_cols and col not in cat_cols]
16392
+
16393
+ # Unique value analysis
16394
+ unique_counts = data.nunique(dropna=False)
14631
16395
  duplicate_ratio = 1 - unique_counts / n_rows
14632
- if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
14633
- wide_score += 2
14634
- if verbose:
14635
- print("High duplicate values in columns suggest wide format.")
14636
- else:
14637
- long_score += 1
14638
- if verbose:
14639
- print(
14640
- "Lower duplicate ratio suggests long format (higher row variability)."
14641
- )
14642
-
14643
- # Calculate entropy for categorical columns
14644
- categorical_cols = data.select_dtypes(include=["object", "category"]).columns
14645
- if len(categorical_cols) > 0:
14646
- for col in categorical_cols:
14647
- counts = data[col].value_counts(normalize=True)
14648
- col_entropy = entropy(counts)
14649
- if col_entropy < 1.5:
14650
- long_score += 1
14651
- if verbose:
14652
- print(
14653
- f"Column '{col}' entropy suggests categorical, supporting long format."
14654
- )
14655
- else:
14656
- wide_score += 1
14657
- if verbose:
14658
- print(f"Column '{col}' entropy is higher, supporting wide format.")
14659
-
14660
- # Step 3: Column grouping analysis for patterns in suffixes/prefixes
16396
+
16397
+ # Missing values
16398
+ missing_per_row = data.isna().sum(axis=1)
16399
+ missing_per_col = data.isna().sum()
16400
+
16401
+ # Column name patterns
14661
16402
  col_names = data.columns.astype(str)
14662
- suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
14663
- if suffix_count > 0.3 * n_cols:
16403
+ has_suffix = sum(bool(re.search(r'(_\d+|\d+_?$)', col)) for col in col_names)
16404
+ has_time = sum(bool(re.search(r'(^time|^date|^year|^month|^day|^t\d+)', col.lower())) for col in col_names)
16405
+
16406
+ # ----- Scoring Rules -----
16407
+
16408
+ # 1. Row-Column Ratio (weighted)
16409
+ if row_col_ratio > 5:
16410
+ long_score += 3
16411
+ if verbose: print(f"High row/col ratio ({row_col_ratio:.1f}) → long +3")
16412
+ elif row_col_ratio < 0.2:
16413
+ wide_score += 3
16414
+ if verbose: print(f"Low row/col ratio ({row_col_ratio:.1f}) → wide +3")
16415
+ elif row_col_ratio > 2:
16416
+ long_score += 1
16417
+ if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → long +1")
16418
+ elif row_col_ratio < 0.5:
16419
+ wide_score += 1
16420
+ if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → wide +1")
16421
+
16422
+ # 2. Duplication Patterns
16423
+ high_dupe_cols = sum(duplicate_ratio > 0.3)
16424
+ if high_dupe_cols > 0.6 * n_cols:
14664
16425
  wide_score += 2
14665
- if verbose:
14666
- print(
14667
- "Detected suffix/prefix patterns in column names, suggesting wide format."
14668
- )
14669
-
14670
- # Step 4: Entity identifier detection for long format with categorical columns
14671
- if len(categorical_cols) > 0 and n_rows > n_cols:
14672
- entity_identifier_count = sum(
14673
- data.duplicated(subset=categorical_cols, keep=False)
14674
- )
14675
- if entity_identifier_count > 0.2 * n_rows:
16426
+ if verbose: print(f"Many columns ({high_dupe_cols}/{n_cols}) with duplicates → wide +2")
16427
+ elif high_dupe_cols < 0.2 * n_cols:
16428
+ long_score += 1
16429
+ if verbose: print(f"Few columns ({high_dupe_cols}/{n_cols}) with duplicates → long +1")
16430
+
16431
+ # 3. Categorical Column Analysis
16432
+ if len(cat_cols) > 0:
16433
+ # Entropy analysis
16434
+ cat_entropies = []
16435
+ for col in cat_cols:
16436
+ counts = data[col].value_counts(normalize=True, dropna=False)
16437
+ cat_entropies.append(entropy(counts))
16438
+
16439
+ avg_cat_entropy = np.mean(cat_entropies) if cat_entropies else 0
16440
+ if avg_cat_entropy < 1.2:
14676
16441
  long_score += 2
14677
- if verbose:
14678
- print(
14679
- "Significant duplicate rows based on categorical columns, suggesting long format."
14680
- )
14681
-
14682
- # Step 5: Clustering analysis on numerical columns for correlation in wide format
14683
- numeric_cols = data.select_dtypes(include="number").columns
14684
- if len(numeric_cols) > 1:
14685
- try:
14686
- scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
14687
- clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
14688
- cluster_labels = pd.Series(clustering.labels_)
14689
- if cluster_labels.nunique() < len(numeric_cols) * 0.5:
14690
- wide_score += 2
14691
- if verbose:
14692
- print(
14693
- "Clustering on columns shows grouping, suggesting wide format."
14694
- )
14695
- except Exception as e:
14696
- print(e) if verbose else None
14697
-
14698
- # Step 6: Inter-column correlation analysis
14699
- if len(numeric_cols) > 1:
16442
+ if verbose: print(f"Low categorical entropy ({avg_cat_entropy:.2f}) → long +2")
16443
+ elif avg_cat_entropy > 2:
16444
+ wide_score += 1
16445
+ if verbose: print(f"High categorical entropy ({avg_cat_entropy:.2f}) → wide +1")
16446
+
16447
+ # Entity identifier detection
16448
+ if len(cat_cols) >= 2 and n_rows > 10:
16449
+ dup_rows = data.duplicated(subset=cat_cols.tolist()[:2], keep=False).sum()
16450
+ if dup_rows > 0.3 * n_rows:
16451
+ long_score += 2
16452
+ if verbose: print(f"Duplicate rows in categorical cols ({dup_rows}/{n_rows}) → long +2")
16453
+
16454
+ # 4. Column Name Patterns
16455
+ if has_suffix > 0.4 * n_cols:
16456
+ wide_score += 2
16457
+ if verbose: print(f"Many suffix patterns ({has_suffix}/{n_cols}) → wide +2")
16458
+ if has_time > 0.3 * n_cols:
16459
+ wide_score += 1
16460
+ if verbose: print(f"Time-like columns ({has_time}/{n_cols}) → wide +1")
16461
+
16462
+ # 5. Numeric Column Analysis (only if enough numeric columns)
16463
+ if len(numeric_cols) > 2:
16464
+ # Correlation analysis
14700
16465
  corr_matrix = data[numeric_cols].corr().abs()
14701
- avg_corr = (
14702
- corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
14703
- )
14704
- if avg_corr > 0.6:
16466
+ avg_corr = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)].mean()
16467
+
16468
+ if avg_corr > 0.5:
14705
16469
  wide_score += 2
14706
- if verbose:
14707
- print("High inter-column correlation suggests wide format.")
14708
-
14709
- # Step 7: Missing value pattern analysis
14710
- missing_patterns = data.isna().sum(axis=1)
14711
- if missing_patterns.std() < 2:
16470
+ if verbose: print(f"High numeric correlation ({avg_corr:.2f}) → wide +2")
16471
+ elif avg_corr < 0.2:
16472
+ long_score += 1
16473
+ if verbose: print(f"Low numeric correlation ({avg_corr:.2f}) → long +1")
16474
+
16475
+ # Entropy analysis
16476
+ try:
16477
+ numeric_data = data[numeric_cols].dropna()
16478
+ if len(numeric_data) > 10:
16479
+ numeric_entropy = numeric_data.apply(lambda x: entropy(pd.cut(x, bins=min(10, len(x.unique()))).value_counts(normalize=True)))
16480
+ if numeric_entropy.mean() < 1.5:
16481
+ wide_score += 1
16482
+ if verbose: print(f"Low numeric entropy ({numeric_entropy.mean():.2f}) → wide +1")
16483
+ except Exception as e:
16484
+ if verbose: print(f"Numeric entropy failed: {str(e)}")
16485
+
16486
+ # 6. Missing Value Patterns
16487
+ missing_row_std = missing_per_row.std()
16488
+ if missing_row_std < 1 and missing_per_row.mean() > 0.1 * n_cols:
14712
16489
  wide_score += 1
14713
- if verbose:
14714
- print(
14715
- "Low variation in missing patterns across rows, supporting wide format."
14716
- )
14717
- elif missing_patterns.mean() < 1:
16490
+ if verbose: print(f"Uniform missing pattern (std={missing_row_std:.2f}) → wide +1")
16491
+ elif missing_per_row.mean() < 0.05 * n_cols:
14718
16492
  long_score += 1
14719
- if verbose:
14720
- print("Lower missing pattern suggests long format (less structured).")
14721
-
14722
- # Step 8: Multi-level clustering on rows to detect block structure for wide format
14723
- if len(numeric_cols) > 1 and n_rows > 5:
16493
+ if verbose: print(f"Few missing values → long +1")
16494
+
16495
+ # 7. Advanced Clustering (only for medium/large datasets)
16496
+ if len(numeric_cols) > 3 and n_rows > 10 and n_cols > 5:
14724
16497
  try:
14725
- clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
14726
- if pd.Series(clustering_rows.labels_).nunique() < 2:
14727
- wide_score += 2
14728
- if verbose:
14729
- print("Row clustering reveals homogeneity, suggesting wide format.")
16498
+ # Efficient clustering with sampling
16499
+ sample_data = data[numeric_cols].sample(n=min(100, n_rows), random_state=42)
16500
+ scaled_data = StandardScaler().fit_transform(sample_data.dropna())
16501
+
16502
+ if scaled_data.shape[0] > 5:
16503
+ # Column clustering
16504
+ col_dist = pairwise_distances(scaled_data.T)
16505
+ col_clusters = AgglomerativeClustering(n_clusters=2,
16506
+ affinity='precomputed',
16507
+ linkage='complete').fit(col_dist)
16508
+ cluster_counts = Counter(col_clusters.labels_)
16509
+ if max(cluster_counts.values()) > 0.7 * len(numeric_cols):
16510
+ wide_score += 2
16511
+ if verbose: print(f"Column clustering shows dominant group → wide +2")
16512
+
16513
+ # Row clustering
16514
+ row_clusters = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
16515
+ row_cluster_counts = Counter(row_clusters.labels_)
16516
+ if max(row_cluster_counts.values()) > 0.8 * scaled_data.shape[0]:
16517
+ wide_score += 1
16518
+ if verbose: print(f"Row clustering shows homogeneity → wide +1")
14730
16519
  except Exception as e:
14731
- print(e) if verbose else None
14732
-
14733
- # Step 9: Sequential name detection for time-series pattern in wide format
14734
- if any(col.isdigit() or col.startswith("T") for col in col_names):
14735
- wide_score += 1
14736
- if verbose:
14737
- print("Detected time-like sequential column names, supporting wide format.")
14738
-
14739
- # Step 10: Entropy of numeric columns
14740
- try:
14741
- numeric_entropy = data[numeric_cols].apply(
14742
- lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
14743
- )
14744
- if numeric_entropy.mean() < 2:
14745
- wide_score += 2
14746
- if verbose:
14747
- print(
14748
- "Low entropy in numeric columns indicates stability across columns, supporting wide format."
14749
- )
14750
- except Exception as e:
14751
- print(e) if verbose else None
14752
-
14753
- # Step 11: Tie-breaking strategy if scores are equal
14754
- if wide_score == long_score:
14755
- if n_cols > n_rows:
14756
- wide_score += 1
14757
- if verbose:
14758
- print(
14759
- "Tie-breaking based on column-major structure, favoring wide format."
14760
- )
14761
- elif n_rows > n_cols:
14762
- long_score += 1
14763
- if verbose:
14764
- print(
14765
- "Tie-breaking based on row-major structure, favoring long format."
14766
- )
14767
- else:
14768
- if verbose:
14769
- print("Tie-breaking inconclusive; returning 'uncertain'.")
14770
- return "uncertain"
14771
-
14772
- # Final decision
14773
- if wide_score > long_score:
14774
- if verbose:
14775
- print("Final decision: Wide format.")
14776
- return "wide"
14777
- elif long_score > wide_score:
14778
- if verbose:
14779
- print("Final decision: Long format.")
14780
- return "long"
16520
+ if verbose: print(f"Clustering skipped: {str(e)}")
16521
+
16522
+ # ----- Decision Logic -----
16523
+ score_diff = long_score - wide_score
16524
+ abs_diff = abs(score_diff)
16525
+
16526
+ if verbose:
16527
+ print(f"\nFinal scores - Long: {long_score}, Wide: {wide_score}")
16528
+
16529
+ if abs_diff >= 3:
16530
+ return "long" if score_diff > 0 else "wide"
16531
+ elif abs_diff >= 1:
+ return "long" if score_diff > 0 else "wide"
+ # Tie-breakers when the two scores are exactly equal
+ elif row_col_ratio > 1.5:
+ return "long"
+ elif row_col_ratio < 0.67:
+ return "wide"
+ elif len(cat_cols) > len(numeric_cols):
+ return "long"
+ elif len(numeric_cols) > len(cat_cols):
+ return "wide"
14781
16543
  else:
14782
- if verbose:
14783
- print("Final decision: Uncertain format.")
14784
16544
  return "uncertain"
14785
-
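For context, a minimal usage sketch of the reworked get_df_format (illustrative only, not part of the released diff; it assumes the function is importable as py2ls.ips.get_df_format and that pandas and numpy are installed):

# Illustrative usage sketch (not part of the diff); assumes py2ls is installed.
import numpy as np
import pandas as pd
from py2ls.ips import get_df_format

rng = np.random.default_rng(0)
subjects = [f"s{i}" for i in range(1, 13)]
timepoints = [f"t{j}" for j in range(1, 7)]

# Long format: one measurement per row, repeated subject ids.
df_long = pd.DataFrame(
    [(s, t, rng.normal()) for s in subjects for t in timepoints],
    columns=["subject", "timepoint", "value"],
)

# Wide format: one row per subject, one column per timepoint.
df_wide = df_long.pivot(index="subject", columns="timepoint", values="value").reset_index()

print(get_df_format(df_long, verbose=True))   # tends toward "long" for data shaped like this
print(get_df_format(df_wide))                 # tends toward "wide", though "uncertain" is possible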
14786
16545
  #! ========== workbook, worksheet, wb,ws =============
14787
16546
 
14788
16547
  import openpyxl
@@ -15917,7 +17676,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):
15917
17676
  def use_pd(
15918
17677
  func_name="excel",
15919
17678
  verbose=True,
15920
- dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
17679
+ dir_json="./data/usages_pd.json",
15921
17680
  ):
15922
17681
  try:
15923
17682
  default_settings = fload(dir_json, output="json")
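The new default "./data/usages_pd.json" is resolved against the current working directory rather than the installed package. A hedged workaround sketch (assuming py2ls ships py2ls/data/usages_pd.json, as the old absolute default suggests, and that use_pd is importable from py2ls.ips):

# Illustrative sketch (not part of the diff): point use_pd at the bundled JSON
# instead of relying on the working-directory-relative default.
from pathlib import Path

import py2ls
from py2ls.ips import use_pd

usages_json = Path(py2ls.__file__).parent / "data" / "usages_pd.json"
use_pd(func_name="excel", dir_json=str(usages_json))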
@@ -17221,3 +18980,290 @@ def set_theme(
17221
18980
  color_codes=color_codes,
17222
18981
  rc=rc_params,
17223
18982
  )
18983
+
18984
+
18985
+
18986
+ def df_wide_long(df):
18987
+ rows, columns = df.shape
18988
+ if columns > rows:
18989
+ return "Wide"
18990
+ elif rows > columns:
18991
+ return "Long"
18992
+
18993
+ def df2array(data: pd.DataFrame, x=None, y=None, hue=None, sort=False):
18994
+
18995
+ def sort_rows_move_nan(arr, sort=False):
18996
+ # Handle edge cases where all values are NaN
18997
+ if np.all(np.isnan(arr)):
18998
+ return arr # Return unchanged if the entire array is NaN
18999
+
19000
+ if sort:
19001
+ # Replace NaNs with a temporary large value for sorting
19002
+ temp_value = (
19003
+ np.nanmax(arr[np.isfinite(arr)]) + 1 if np.any(np.isfinite(arr)) else np.inf
19004
+ )
19005
+ arr_no_nan = np.where(np.isnan(arr), temp_value, arr)
19006
+
19007
+ # Sort each row
19008
+ sorted_arr = np.sort(arr_no_nan, axis=1)
19009
+
19010
+ # Move NaNs to the end
19011
+ result_arr = np.where(sorted_arr == temp_value, np.nan, sorted_arr)
19012
+ else:
19013
+ result_rows = []
19014
+ for row in arr:
19015
+ # Separate non-NaN and NaN values
19016
+ non_nan_values = row[~np.isnan(row)]
19017
+ nan_count = np.isnan(row).sum()
19018
+ # Create a new row with non-NaN values followed by NaNs
19019
+ new_row = np.concatenate([non_nan_values, [np.nan] * nan_count])
19020
+ result_rows.append(new_row)
19021
+ # Convert the list of rows back into a 2D NumPy array
19022
+ result_arr = np.array(result_rows)
19023
+
19024
+ # Remove rows/columns that contain only NaNs
19025
+ clean_arr = result_arr[~np.isnan(result_arr).all(axis=1)]
19026
+ clean_arr_ = clean_arr[:, ~np.isnan(clean_arr).all(axis=0)]
19027
+
19028
+ return clean_arr_
19029
+ # data = data.copy()
19030
+ # data[y] = pd.to_numeric(data[y], errors="coerce")
19031
+ # data = data.dropna(subset=[y])
19032
+ if hue is None:
19033
+ a = []
19034
+ if sort:
19035
+ cat_x = np.sort(data[x].unique().tolist()).tolist()
19036
+ else:
19037
+ cat_x = data[x].unique().tolist()
19038
+ for i, x_ in enumerate(cat_x):
19039
+ new_ = data.loc[data[x] == x_, y].to_list()
19040
+ a = padcat(a, new_, axis=0)
19041
+ return sort_rows_move_nan(a).T
19042
+ else:
19043
+ a = []
19044
+ if sort:
19045
+ cat_x = np.sort(data[x].unique().tolist()).tolist()
19046
+ cat_hue = np.sort(data[hue].unique().tolist()).tolist()
19047
+ else:
19048
+ cat_x = data[x].unique().tolist()
19049
+ cat_hue = data[hue].unique().tolist()
19050
+ for i, x_ in enumerate(cat_x):
19051
+ for j, hue_ in enumerate(cat_hue):
19052
+ new_ = data.loc[(data[x] == x_) & (data[hue] == hue_), y].to_list()
19053
+ a = padcat(a, new_, axis=0)
19054
+ return sort_rows_move_nan(a).T
19055
+
19056
+
19057
+ def array2df(data: np.ndarray):
19058
+ df = pd.DataFrame()
19059
+ df["group"] = (
19060
+ np.tile(
19061
+ ["group" + str(i) for i in range(1, data.shape[1] + 1)], [data.shape[0], 1]
19062
+ )
19063
+ .reshape(-1, 1, order="F")[:, 0]
19064
+ .tolist()
19065
+ )
19066
+ df["value"] = data.reshape(-1, 1, order="F")
19067
+ return df
19068
+
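A brief sketch of the intended round trip between a long-format table and a NaN-padded 2-D array (illustrative only, not part of the diff; assumes df2array, array2df, and padcat below are importable from py2ls.ips):

# Illustrative usage sketch (not part of the diff).
import pandas as pd
from py2ls.ips import array2df, df2array

df = pd.DataFrame({
    "group": ["a", "a", "a", "b", "b"],
    "score": [1.0, 2.0, 3.0, 4.0, 5.0],
})

# One column per level of `x`; shorter groups are padded with NaN at the bottom.
arr = df2array(df, x="group", y="score")   # shape (3, 2) for this input
print(arr)

# array2df goes the other way: columns are labelled "group1", "group2", ...
df_back = array2df(arr)
print(df_back)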
19069
+
19070
+ def padcat(*args, fill_value=np.nan, axis=1, order="row"):
19071
+ """
19072
+ Concatenate vectors with padding.
19073
+
19074
+ Parameters:
19075
+ *args : variable number of list or 1D arrays
19076
+ Input arrays to concatenate.
19077
+ fill_value : scalar, optional
19078
+ The value to use for padding the shorter lists (default is np.nan).
19079
+ axis : int, optional
19080
+ The axis along which to concatenate (0 for rows, 1 for columns, default is 1).
19081
+ order : str, optional
19082
+ The order for flattening when required: "row" or "column" (default is "row").
19083
+
19084
+ Returns:
19085
+ np.ndarray
19086
+ A 2D array with the input arrays concatenated along the specified axis,
19087
+ padded with fill_value where necessary.
19088
+
19089
+
19090
+ # Example usage:
19091
+ a = [1, np.nan]
19092
+ b = [1, 3, 4, np.nan, 2, np.nan]
19093
+ c = [1, 2, 3, 4, 5, 6, 7, 8, 10]
19094
+ d = padcat(a, b)
19095
+ result1 = padcat(d, c)
19096
+ result2 = padcat(a, b, c)
19097
+ print("Result of padcat(d, c):\n", result1)
19098
+ print("Result of padcat(a, b, c):\n", result2)
19099
+ """
19100
+ # Set the order for processing
19101
+ if "ro" in order.lower():
19102
+ order = "C" # row-major order
19103
+ else:
19104
+ order = "F" # column-major order
19105
+
19106
+ # Process input arrays based on their dimensions
19107
+ processed_arrays = []
19108
+ for arg in args:
19109
+ arr = np.asarray(arg)
19110
+ if arr.ndim == 1:
19111
+ processed_arrays.append(arr) # Keep 1D arrays as is
19112
+ elif arr.ndim == 2:
19113
+ if axis == 0:
19114
+ # If concatenating along rows, split 2D arrays into 1D arrays row-wise
19115
+ processed_arrays.extend(arr)
19116
+ elif axis == 1:
19117
+ # If concatenating along columns, split 2D arrays into 1D arrays column-wise
19118
+ processed_arrays.extend(arr.T)
19119
+ else:
19120
+ raise ValueError("axis must be 0 or 1")
19121
+ else:
19122
+ raise ValueError("Input arrays must be 1D or 2D")
19123
+
19124
+ if axis == 0:
19125
+ # Concatenate along rows
19126
+ max_len = max(arr.size for arr in processed_arrays)
19127
+ result = np.full((len(processed_arrays), max_len), fill_value)
19128
+ for i, arr in enumerate(processed_arrays):
19129
+ result[i, : arr.size] = arr
19130
+ elif axis == 1:
19131
+ # Concatenate along columns
19132
+ max_len = max(arr.size for arr in processed_arrays)
19133
+ result = np.full((max_len, len(processed_arrays)), fill_value)
19134
+ for i, arr in enumerate(processed_arrays):
19135
+ result[: arr.size, i] = arr
19136
+ else:
19137
+ raise ValueError("axis must be 0 or 1")
19138
+
19139
+ return result
19140
+
19141
+
19142
+ # ========== memory cleaner ==========
19143
+ import gc
19144
+ import os
19145
+ import sys
19146
+ import psutil
19147
+ import platform
19148
+ import ctypes
19149
+ import subprocess
19150
+ import warnings
19151
+ import time
19152
+
19153
+ class MemoryOptimizer:
19154
+ def __init__(self, verbose: bool = True, aggressive_mode: bool = True):
19155
+ self.verbose = verbose
19156
+ self.aggressive_mode = aggressive_mode
19157
+ self.system = platform.system()
19158
+ self.process = psutil.Process(os.getpid())
19159
+ self.start_time = time.time()
19160
+ self.memory_history = []
19161
+
19162
+ def log(self, msg: str, level: str = "INFO"):
19163
+ if self.verbose:
19164
+ rss = self.process.memory_info().rss / (1024 ** 2)
19165
+ elapsed = time.time() - self.start_time
19166
+ print(f"[{level}][{elapsed:.2f}s][{rss:.1f}MB] {msg}")
19167
+
19168
+ def collect_garbage(self):
19169
+ self.log("Performing deep garbage collection...")
19170
+ stats = {}
19171
+ before_mem = self.process.memory_info().rss
19172
+ for gen in reversed(range(3)):
19173
+ collected = gc.collect(gen)
19174
+ self.log(f"GC Gen {gen}: Collected {collected}")
19175
+ gc.garbage.clear()
19176
+ after_mem = self.process.memory_info().rss
19177
+ stats['freed_mb'] = (before_mem - after_mem) / (1024 ** 2)
19178
+ return stats
19179
+
19180
+ def clear_frameworks(self):
19181
+ result = {}
19182
+ try:
19183
+ import torch
19184
+ if torch.cuda.is_available():
19185
+ self.log("Clearing PyTorch cache...")
19186
+ torch.cuda.empty_cache()
19187
+ torch.cuda.ipc_collect()
19188
+ result['pytorch'] = 'cleared'
19189
+ except Exception as e:
19190
+ self.log(f"PyTorch skipped: {e}", "WARNING")
19191
+
19192
+ try:
19193
+ import tensorflow as tf
19194
+ self.log("Clearing TensorFlow session...")
19195
+ tf.keras.backend.clear_session()
19196
+ result['tensorflow'] = 'cleared'
19197
+ except Exception as e:
19198
+ self.log(f"TensorFlow skipped: {e}", "WARNING")
19199
+
19200
+ try:
19201
+ import cv2
19202
+ self.log("Closing OpenCV windows...")
19203
+ cv2.destroyAllWindows()
19204
+ result['opencv'] = 'cleared'
19205
+ except Exception:
19206
+ pass
19207
+
19208
+ try:
19209
+ import matplotlib.pyplot as plt
19210
+ self.log("Closing matplotlib figures...")
19211
+ plt.close('all')
19212
+ result['matplotlib'] = 'cleared'
19213
+ except Exception:
19214
+ pass
19215
+
19216
+ return result
19217
+
19218
+ def clear_system_caches(self):
19219
+ result = {}
19220
+ self.log("Attempting full system cache clearance...")
19221
+ try:
19222
+ if self.system == "Linux":
19223
+ subprocess.run(["sync"], check=True)
19224
+ subprocess.run(["sudo", "sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"], check=True)
19225
+ result['linux'] = 'caches dropped'
19226
+ elif self.system == "Darwin":
19227
+ subprocess.run(["sudo", "purge"], check=True)
19228
+ result['macos'] = 'purge run'
19229
+ elif self.system == "Windows":
19230
+ ctypes.windll.psapi.EmptyWorkingSet(-1)
19231
+ if self.aggressive_mode:
19232
+ ctypes.windll.kernel32.SetProcessWorkingSetSizeEx(
19233
+ -1, ctypes.c_size_t(-1), ctypes.c_size_t(-1), ctypes.c_uint(0x1)
19234
+ )
19235
+ result['windows'] = 'working set emptied'
19236
+ except Exception as e:
19237
+ self.log(f"System cache clearing failed: {e}", "ERROR")
19238
+ return result
19239
+
19240
+ def profile(self) -> Dict[str, Any]:
19241
+ mem = self.process.memory_info()
19242
+ vm = psutil.virtual_memory()
19243
+ profile = {
19244
+ 'rss_mb': mem.rss / (1024 ** 2),
19245
+ 'vms_mb': mem.vms / (1024 ** 2),
19246
+ 'used_gb': vm.used / (1024 ** 3),
19247
+ 'available_gb': vm.available / (1024 ** 3),
19248
+ 'percent': vm.percent,
19249
+ }
19250
+ self.memory_history.append(profile)
19251
+ return profile
19252
+
19253
+ def optimize(self) -> Dict[str, Any]:
19254
+ result = {}
19255
+ result['before'] = self.profile()
19256
+ result['gc'] = self.collect_garbage()
19257
+ result['frameworks'] = self.clear_frameworks()
19258
+ result['system'] = self.clear_system_caches()
19259
+ result['after'] = self.profile()
19260
+ saved = result['before']['rss_mb'] - result['after']['rss_mb']
19261
+ result['saved_mb'] = saved
19262
+ result['saved_percent'] = (saved / result['before']['rss_mb']) * 100 if result['before']['rss_mb'] else 0
19263
+ self.log(f"Optimization complete: Saved {saved:.2f} MB ({result['saved_percent']:.1f}%)", "SUCCESS")
19264
+ return result
19265
+
19266
+
19267
+ def cleaner(verbose: bool = True, aggressive: bool = True) -> Dict[str, Any]:
19268
+ optimizer = MemoryOptimizer(verbose=verbose, aggressive_mode=aggressive)
19269
+ return optimizer.optimize()
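A hedged usage sketch for the new memory cleaner (assumes psutil is installed and that these helpers land at module level in py2ls.ips; the system-cache step may require sudo and is logged as an error otherwise):

# Illustrative usage sketch (not part of the diff).
from py2ls.ips import MemoryOptimizer, cleaner

# One-shot cleanup with logging; returns before/after memory profiles.
report = cleaner(verbose=True, aggressive=False)
print(f"freed ~{report['saved_mb']:.1f} MB ({report['saved_percent']:.1f}%)")

# Finer-grained control: run only the garbage-collection pass.
opt = MemoryOptimizer(verbose=False)
print(opt.collect_garbage())   # e.g. {'freed_mb': ...}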