py2ls 0.2.5.12__py3-none-any.whl → 0.2.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -12,8 +12,7 @@ import re
12
12
  import stat
13
13
  import platform
14
14
 
15
- from typing import Dict, List, Optional, Union, Any,Tuple
16
-
15
+ from typing import Dict, List, Optional, Union, Any, Tuple, Literal
17
16
  from regex import X
18
17
 
19
18
  try:
@@ -1663,75 +1662,464 @@ def flatten(nested: Any, unique_list=True, verbose=False):
1663
1662
  return flattened_list
1664
1663
 
1665
1664
 
1666
- # def strcmp(
1667
- # search_term,
1668
- # candidates,
1669
- # ignore_case=True,
1670
- # get_rank=False,
1671
- # verbose=False,
1672
- # scorer="WR",
1673
- # method=None,
1674
- # ):
1675
- # """
1676
- # Compares a search term with a list of candidate strings and finds the best match based on similarity score.
1665
+ #! ===========extract_text===========
1666
+ def extract_text(
1667
+ text: Union[str, List[str]],
1668
+ patterns: Union[str, List[str]],
1669
+ *,
1670
+ mode: Literal["between", "split", "extract"] = "between",
1671
+ keep: Literal["none", "left", "right", "both", "markers"] = "none",
1672
+ case: Literal["sensitive", "insensitive"] = "insensitive",
1673
+ all_matches: bool = False,
1674
+ positions: bool = False,
1675
+ regex: bool = False,
1676
+ delimiter: Optional[str] = None,
1677
+ trim: bool = True,
1678
+ as_dict: bool = False,
1679
+ verbose: bool = False,
1680
+ **kwargs,
1681
+ ) -> Union[List[str], Tuple[int, str], Dict[str, Any], List[Dict[str, Any]], None]:
1682
+ """
1683
+ Ultimate text extraction tool with enhanced reliability and features.
1677
1684
 
1678
- # Parameters:
1679
- # search_term (str): The term to be searched for.
1680
- # candidates (list of str): A list of candidate strings to compare against the search term.
1681
- # ignore_case (bool): If True, the comparison ignores case differences.
1682
- # verbose (bool): If True, prints the similarity score and the best match.
1685
+ Key improvements:
1686
+ - Robust split mode with proper delimiter handling
1687
+ - Consistent return types across all modes
1688
+ - Improved pattern matching logic
1689
+ - Better edge case handling
1683
1690
 
1684
- # Returns:
1685
- # tuple: A tuple containing the best match and its index in the candidates list.
1686
- # """
1687
- # from fuzzywuzzy import fuzz, process
1688
-
1689
- # def to_lower(s, ignore_case=True):
1690
- # # Converts a string or list of strings to lowercase if ignore_case is True.
1691
- # if ignore_case:
1692
- # if isinstance(s, str):
1693
- # return s.lower()
1694
- # elif isinstance(s, list):
1695
- # s = [str(i) for i in s] # convert all to str
1696
- # return [elem.lower() for elem in s]
1697
- # return s
1698
- # scorer = str(method).lower() if method is not None else scorer
1699
- # str1_, str2_ = to_lower(search_term, ignore_case), to_lower(candidates, ignore_case)
1700
- # if isinstance(str2_, list):
1701
- # if "part" in scorer.lower():
1702
- # similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
1703
- # elif "w" in scorer.lower():
1704
- # similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
1705
- # elif "ratio" in scorer.lower() or "stri" in scorer.lower(): # Ratio (Strictest)
1706
- # similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
1707
- # else:
1708
- # similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
1709
- # if get_rank:
1710
- # idx = [
1711
- # similarity_scores.index(i)
1712
- # for i in sorted(similarity_scores, reverse=True)
1713
- # ]
1714
- # if verbose:
1715
- # display([candidates[ii] for ii in idx])
1716
- # return [candidates[ii] for ii in idx]
1717
- # best_match_index = similarity_scores.index(max(similarity_scores))
1718
- # best_match_score = similarity_scores[best_match_index]
1719
- # else:
1720
- # best_match_index = 0
1721
- # if "part" in scorer.lower():
1722
- # best_match_score = fuzz.partial_ratio(str1_, str2_)
1723
- # elif "w" in scorer.lower():
1724
- # best_match_score = fuzz.WRatio(str1_, str2_)
1725
- # elif "Ratio" in scorer.lower():
1726
- # best_match_score = fuzz.ratio(str1_, str2_)
1727
- # else:
1728
- # best_match_score = fuzz.WRatio(str1_, str2_)
1729
- # if verbose:
1730
- # print(f"\nbest_match is: {candidates[best_match_index],best_match_score}")
1731
- # best_match = process.extract(search_term, candidates)
1732
- # print(f"建议: {best_match}")
1733
- # return candidates[best_match_index], best_match_index
1734
1691
 
1692
+ print(extract_text("A,B,C", ",", mode="split", keep="none", all_matches=True))
1693
+ # Correctly returns: ['A', 'B', 'C']
1694
+
1695
+ print(extract_text("A,B,C", ",", mode="split", keep="left"))
1696
+ # Returns: ['A,', 'B,', 'C']
1697
+
1698
+ print(extract_text("A,B,C", ",", mode="split", keep="right"))
1699
+ # Returns: [',B', ',C']
1700
+
1701
+ print(extract_text("A,B,C", ",", mode="split", keep="both"))
1702
+ # Returns: ['A', ',', 'B', ',', 'C']
1703
+ """
1704
+ if verbose:
1705
+ print("""
1706
+ extract_text(
1707
+ text: Union[str, List[str]],
1708
+ patterns: Union[str, List[str]],
1709
+ *,
1710
+ mode: Literal["between", "split", "extract"] = "between",
1711
+ keep: Literal["none", "left", "right", "both", "markers"] = "none",
1712
+ case: Literal["sensitive", "insensitive"] = "insensitive",
1713
+ all_matches: bool = False,
1714
+ positions: bool = False,
1715
+ regex: bool = False,
1716
+ delimiter: Optional[str] = None,
1717
+ trim: bool = True,
1718
+ as_dict: bool = False,
1719
+ verbose: bool = False,
1720
+ **kwargs,
1721
+ )
1722
+ """)
1723
+ # Normalization and validation
1724
+ text = _normalize_text(text, delimiter)
1725
+ patterns = _validate_patterns(patterns)
1726
+ flags = re.IGNORECASE if case == "insensitive" else 0
1727
+
1728
+ # Find all matches with enhanced validation
1729
+ matches = _find_matches(text, patterns, regex, flags)
1730
+ if not matches:
1731
+ return None
1732
+
1733
+ # Mode-specific processing
1734
+ if mode == "extract":
1735
+ return _handle_extract(matches, all_matches, as_dict, positions, trim)
1736
+ elif mode == "split":
1737
+ return _handle_split(text, matches, keep, all_matches, as_dict, positions, trim)
1738
+ elif mode == "between":
1739
+ return _handle_between(text, matches, patterns, keep, as_dict, positions, trim)
1740
+ else:
1741
+ raise ValueError(f"Invalid mode: {mode}")
1742
+
1743
+
1744
+ def _normalize_text(text: Union[str, List[str]], delimiter: Optional[str]) -> str:
1745
+ """Normalize text input to single string"""
1746
+ if isinstance(text, list):
1747
+ return delimiter.join(text) if delimiter else " ".join(text)
1748
+ return text
1749
+
1750
+
1751
+ def _validate_patterns(patterns: Union[str, List[str]]) -> List[str]:
1752
+ """Validate and normalize patterns"""
1753
+ if isinstance(patterns, str):
1754
+ return [patterns]
1755
+ if not patterns:
1756
+ raise ValueError("At least one pattern required")
1757
+ return patterns
1758
+
1759
+
1760
+ def _find_matches(
1761
+ text: str, patterns: List[str], regex: bool, flags: int
1762
+ ) -> List[dict]:
1763
+ """Find all pattern matches with enhanced regex handling"""
1764
+ matches = []
1765
+ for pattern in patterns:
1766
+ try:
1767
+ search_pattern = pattern if regex else re.escape(pattern)
1768
+ for match in re.finditer(search_pattern, text, flags=flags):
1769
+ matches.append(
1770
+ {
1771
+ "text": match.group(),
1772
+ "start": match.start(),
1773
+ "end": match.end(),
1774
+ "pattern": pattern,
1775
+ "full_match": match,
1776
+ }
1777
+ )
1778
+ except re.error as e:
1779
+ raise ValueError(f"Invalid pattern '{pattern}': {e}")
1780
+ return sorted(matches, key=lambda x: x["start"])
1781
+
1782
+
1783
+ def _handle_extract(
1784
+ matches: List[dict], all_matches: bool, as_dict: bool, positions: bool, trim: bool
1785
+ ) -> Union[List, dict]:
1786
+ """Handle text extraction of matched patterns"""
1787
+ results = []
1788
+ for match in matches if all_matches else [matches[0]]:
1789
+ content = match["text"].strip() if trim else match["text"]
1790
+ result = (
1791
+ {
1792
+ "text": content,
1793
+ "start": match["start"],
1794
+ "end": match["end"],
1795
+ "pattern": match["pattern"],
1796
+ }
1797
+ if as_dict
1798
+ else content
1799
+ )
1800
+ if positions and as_dict:
1801
+ result["positions"] = [(match["start"], match["end"])]
1802
+ results.append(result)
1803
+
1804
+ return results[0] if not all_matches else results
1805
+
1806
+
1807
+ def _create_part(
1808
+ content: str,
1809
+ start: int,
1810
+ end: int,
1811
+ match: Optional[dict],
1812
+ as_dict: bool,
1813
+ positions: bool,
1814
+ trim: bool,
1815
+ ) -> Union[str, dict]:
1816
+ """Create a standardized result part"""
1817
+ content = content.strip() if trim else content
1818
+ if not as_dict:
1819
+ return content
1820
+
1821
+ part = {
1822
+ "text": content,
1823
+ "start": start,
1824
+ "end": end,
1825
+ "pattern": match["pattern"] if match else None,
1826
+ }
1827
+ if positions and match:
1828
+ part["positions"] = [(match["start"], match["end"])]
1829
+ return part
1830
+
1831
+
1832
+ def _handle_between(
1833
+ text: str,
1834
+ matches: List[dict],
1835
+ patterns: List[str],
1836
+ keep: str,
1837
+ as_dict: bool,
1838
+ positions: bool,
1839
+ trim: bool,
1840
+ ) -> Union[Tuple, dict]:
1841
+ """Reliable between-mode implementation with boundary checks"""
1842
+ first_pattern, last_pattern = patterns[0], patterns[-1]
1843
+ first_matches = [m for m in matches if m["pattern"] == first_pattern]
1844
+ last_matches = [m for m in matches if m["pattern"] == last_pattern]
1845
+
1846
+ if not first_matches or not last_matches:
1847
+ return None
1848
+
1849
+ first = first_matches[0]
1850
+ last = last_matches[-1]
1851
+
1852
+ if first["start"] > last["start"]:
1853
+ return None
1854
+
1855
+ # Calculate extraction window
1856
+ start, end = first["start"], last["end"]
1857
+ if keep == "none":
1858
+ start, end = first["end"], last["start"]
1859
+ elif keep == "left":
1860
+ end = last["start"]
1861
+ elif keep == "right":
1862
+ start = first["end"]
1863
+
1864
+ extracted = text[start:end].strip() if trim else text[start:end]
1865
+
1866
+ if as_dict:
1867
+ result = {
1868
+ "text": extracted,
1869
+ "start": start,
1870
+ "end": end,
1871
+ "patterns": patterns,
1872
+ "match_positions": [(m["start"], m["end"]) for m in matches],
1873
+ }
1874
+ return result
1875
+
1876
+ return (
1877
+ (start, extracted)
1878
+ if not positions
1879
+ else (start, extracted, [(m["start"], m["end"]) for m in matches])
1880
+ )
1881
+
1882
+
1883
+ def _handle_split(
1884
+ text: str,
1885
+ matches: List[dict],
1886
+ keep: str,
1887
+ all_matches: bool,
1888
+ as_dict: bool,
1889
+ positions: bool,
1890
+ trim: bool,
1891
+ ) -> Union[List, dict]:
1892
+ """Split text with proper handling of keep='both' to include delimiters on both sides"""
1893
+ if not matches:
1894
+ return (
1895
+ [text]
1896
+ if not as_dict
1897
+ else [{"text": text, "start": 0, "end": len(text), "pattern": None}]
1898
+ )
1899
+
1900
+ parts = []
1901
+ prev_end = 0
1902
+ process_matches = matches if all_matches else [matches[0]]
1903
+
1904
+ # Special handling for keep="both"
1905
+ if keep == "both":
1906
+ for i, match in enumerate(process_matches):
1907
+ start, end = match["start"], match["end"]
1908
+ matched_text = text[start:end]
1909
+
1910
+ # First segment (text before first delimiter + first delimiter)
1911
+ if i == 0:
1912
+ segment = text[prev_end:end] # From start to end of first delimiter
1913
+ if trim:
1914
+ segment = segment.strip()
1915
+ if segment or not trim:
1916
+ if as_dict:
1917
+ parts.append(
1918
+ {
1919
+ "text": segment,
1920
+ "start": prev_end,
1921
+ "end": end,
1922
+ "pattern": match["pattern"],
1923
+ **({"positions": [(start, end)]} if positions else {}),
1924
+ }
1925
+ )
1926
+ else:
1927
+ parts.append(segment)
1928
+ prev_end = end
1929
+
1930
+ # Middle segments (delimiter + text + next delimiter)
1931
+ if i > 0 and i < len(process_matches):
1932
+ next_match = process_matches[i]
1933
+ next_start, next_end = next_match["start"], next_match["end"]
1934
+ segment = text[
1935
+ prev_end:next_end
1936
+ ] # From prev_end to end of next delimiter
1937
+ if trim:
1938
+ segment = segment.strip()
1939
+ if segment or not trim:
1940
+ if as_dict:
1941
+ parts.append(
1942
+ {
1943
+ "text": segment,
1944
+ "start": prev_end,
1945
+ "end": next_end,
1946
+ "pattern": next_match["pattern"],
1947
+ **(
1948
+ {"positions": [(next_start, next_end)]}
1949
+ if positions
1950
+ else {}
1951
+ ),
1952
+ }
1953
+ )
1954
+ else:
1955
+ parts.append(segment)
1956
+ prev_end = next_end
1957
+
1958
+ # Last segment (last delimiter + remaining text)
1959
+ if process_matches and prev_end < len(text):
1960
+ last_match = process_matches[-1]
1961
+ segment = text[
1962
+ last_match["start"] : len(text)
1963
+ ] # From last delimiter to end
1964
+ if trim:
1965
+ segment = segment.strip()
1966
+ if segment or not trim:
1967
+ if as_dict:
1968
+ parts.append(
1969
+ {
1970
+ "text": segment,
1971
+ "start": last_match["start"],
1972
+ "end": len(text),
1973
+ "pattern": last_match["pattern"],
1974
+ **(
1975
+ {
1976
+ "positions": [
1977
+ (last_match["start"], last_match["end"])
1978
+ ]
1979
+ }
1980
+ if positions
1981
+ else {}
1982
+ ),
1983
+ }
1984
+ )
1985
+ else:
1986
+ parts.append(segment)
1987
+
1988
+ return parts
1989
+
1990
+ # Original handling for other keep modes
1991
+ for i, match in enumerate(process_matches):
1992
+ start, end = match["start"], match["end"]
1993
+ matched_text = text[start:end]
1994
+
1995
+ # Handle text before the match
1996
+ if prev_end < start:
1997
+ before = text[prev_end:start]
1998
+ if trim:
1999
+ before = before.strip()
2000
+ if before or not trim:
2001
+ if as_dict:
2002
+ parts.append(
2003
+ {
2004
+ "text": before,
2005
+ "start": prev_end,
2006
+ "end": start,
2007
+ "pattern": None,
2008
+ **({"positions": []} if positions else {}),
2009
+ }
2010
+ )
2011
+ else:
2012
+ parts.append(before)
2013
+
2014
+ # Handle the match based on keep mode
2015
+ if keep == "none":
2016
+ pass # Skip the delimiter
2017
+ elif keep == "left":
2018
+ if parts:
2019
+ if as_dict:
2020
+ parts[-1]["text"] += matched_text
2021
+ parts[-1]["end"] = end
2022
+ else:
2023
+ parts[-1] += matched_text
2024
+ else:
2025
+ if as_dict:
2026
+ parts.append(
2027
+ {
2028
+ "text": matched_text,
2029
+ "start": start,
2030
+ "end": end,
2031
+ "pattern": match["pattern"],
2032
+ **({"positions": [(start, end)]} if positions else {}),
2033
+ }
2034
+ )
2035
+ else:
2036
+ parts.append(matched_text)
2037
+ elif keep == "right":
2038
+ if i < len(process_matches) - 1:
2039
+ next_start = process_matches[i + 1]["start"]
2040
+ if end < next_start:
2041
+ between = text[end:next_start]
2042
+ if as_dict:
2043
+ parts.append(
2044
+ {
2045
+ "text": matched_text + between,
2046
+ "start": start,
2047
+ "end": next_start,
2048
+ "pattern": match["pattern"],
2049
+ **({"positions": [(start, end)]} if positions else {}),
2050
+ }
2051
+ )
2052
+ else:
2053
+ parts.append(matched_text + between)
2054
+ prev_end = next_start
2055
+ continue
2056
+
2057
+ prev_end = end
2058
+
2059
+ # Handle remaining text after last match
2060
+ if prev_end < len(text):
2061
+ remaining = text[prev_end:]
2062
+ if trim:
2063
+ remaining = remaining.strip()
2064
+ if remaining or not trim:
2065
+ if keep == "right" and parts and process_matches:
2066
+ last_match = process_matches[-1]
2067
+ matched_text = text[last_match["start"] : last_match["end"]]
2068
+ if as_dict:
2069
+ parts.append(
2070
+ {
2071
+ "text": matched_text + remaining,
2072
+ "start": last_match["start"],
2073
+ "end": len(text),
2074
+ "pattern": last_match["pattern"],
2075
+ **(
2076
+ {
2077
+ "positions": [
2078
+ (last_match["start"], last_match["end"])
2079
+ ]
2080
+ }
2081
+ if positions
2082
+ else {}
2083
+ ),
2084
+ }
2085
+ )
2086
+ else:
2087
+ parts.append(matched_text + remaining)
2088
+ else:
2089
+ if as_dict:
2090
+ parts.append(
2091
+ {
2092
+ "text": remaining,
2093
+ "start": prev_end,
2094
+ "end": len(text),
2095
+ "pattern": None,
2096
+ **({"positions": []} if positions else {}),
2097
+ }
2098
+ )
2099
+ else:
2100
+ parts.append(remaining)
2101
+
2102
+ # Filter empty parts if trimming
2103
+ if trim:
2104
+ parts = [p for p in parts if (p["text"].strip() if as_dict else p.strip())]
2105
+
2106
+ return parts
2107
+
2108
+
2109
+ def _merge_parts(
2110
+ parts: List[Union[str, dict]], text: str, as_dict: bool, trim: bool
2111
+ ) -> Union[str, dict]:
2112
+ """Merge adjacent parts for keep=left mode"""
2113
+ if as_dict:
2114
+ merged_text = "".join(p["text"] for p in parts)
2115
+ return {
2116
+ "text": merged_text.strip() if trim else merged_text,
2117
+ "start": parts[0]["start"],
2118
+ "end": parts[-1]["end"],
2119
+ "patterns": list(set(p["pattern"] for p in parts if p["pattern"])),
2120
+ }
2121
+ return "".join(parts).strip() if trim else "".join(parts)
2122
+ #! ===========extract_text===========
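# Hedged usage sketch for the "between" mode implemented above (inferred from _handle_between,
# not taken from the package's own examples; offsets assume the literal text shown):
#   extract_text("start[MIDDLE]end", ["[", "]"], mode="between", keep="none")
#   → (6, "MIDDLE")      # (start_offset, extracted) tuple when positions=False and as_dict=False
#   extract_text("start[MIDDLE]end", ["[", "]"], mode="between", keep="markers")
#   → (5, "[MIDDLE]")    # keep values other than "none"/"left"/"right" retain both markers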
1735
2123
 
1736
2124
  def strcmp(
1737
2125
  search_term: str,
@@ -2794,73 +3182,6 @@ def text2audio(
2794
3182
 
2795
3183
  # from datetime import datetime
2796
3184
  from dateutil import parser
2797
- # import re
2798
- # from typing import Union, Optional, Dict, Any
2799
- # def str2time(time_str, fmt="24"):
2800
- # """
2801
- # Convert a time string into the specified format.
2802
- # Parameters:
2803
- # - time_str (str): The time string to be converted.
2804
- # - fmt (str): The format to convert the time to. Defaults to '%H:%M:%S'.
2805
- # Returns:
2806
- # %I represents the hour in 12-hour format.
2807
- # %H represents the hour in 24-hour format (00 through 23).
2808
- # %M represents the minute.
2809
- # %S represents the second.
2810
- # %p represents AM or PM.
2811
- # - str: The converted time string.
2812
- # """
2813
- # from datetime import datetime
2814
-
2815
- # def time_len_corr(time_str):
2816
- # time_str_ = (
2817
- # ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
2818
- # )
2819
- # time_str_split = []
2820
- # [time_str_split.append(i) for i in time_str_ if is_num(i)]
2821
- # if time_str_split:
2822
- # if len(time_str_split) == 2:
2823
- # H, M = time_str_split
2824
- # time_str_full = H + ":" + M + ":00"
2825
- # elif len(time_str_split) == 3:
2826
- # H, M, S = time_str_split
2827
- # time_str_full = H + ":" + M + ":" + S
2828
- # else:
2829
- # time_str_full = time_str_
2830
- # if "am" in time_str.lower():
2831
- # time_str_full += " AM"
2832
- # elif "pm" in time_str.lower():
2833
- # time_str_full += " PM"
2834
- # return time_str_full
2835
-
2836
- # if "12" in fmt:
2837
- # fmt = "%I:%M:%S %p"
2838
- # elif "24" in fmt:
2839
- # fmt = "%H:%M:%S"
2840
-
2841
- # try:
2842
- # # Try to parse the time string assuming it could be in 24-hour or 12-hour format
2843
- # time_obj = datetime.strptime(time_len_corr(time_str), "%H:%M:%S")
2844
- # except ValueError:
2845
- # try:
2846
- # time_obj = datetime.strptime(time_len_corr(time_str), "%I:%M:%S %p")
2847
- # except ValueError as e:
2848
- # raise ValueError(f"Unable to parse time string: {time_str}. Error: {e}")
2849
-
2850
- # # Format the time object to the desired output format
2851
- # formatted_time = time_obj.strftime(fmt)
2852
- # return formatted_time
2853
-
2854
-
2855
- # # # Example usage:
2856
- # # time_str1 = "14:30:45"
2857
- # # time_str2 = "02:30:45 PM"
2858
-
2859
- # # formatted_time1 = str2time(time_str1, fmt='12') # Convert to 12-hour format
2860
- # # formatted_time2 = str2time(time_str2, fmt='24') # Convert to 24-hour format
2861
-
2862
- # # print(formatted_time1) # Output: 02:30:45 PM
2863
- # # print(formatted_time2) # Output: 14:30:45
2864
3185
  def str2time(
2865
3186
  time_str: str,
2866
3187
  fmt: str = "24",
@@ -2964,57 +3285,6 @@ def str2time(
2964
3285
  raise ValueError(f"Unable to parse time string: '{time_str}'. Error: {e}")
2965
3286
  return default
2966
3287
 
2967
-
2968
- # def str2date(date_str, original_fmt=None, fmt="%Y-%m-%d"):
2969
- # """
2970
- # Convert a date string to the desired format and extract components if needed.
2971
- # Usage:
2972
- # str2date(x, fmt="%d.%m.%y",original_fmt="%d.%m.%y")
2973
- # Parameters:
2974
- # - date_str (str): The input date string.
2975
- # - original_fmt (str, optional): The original format of the date string. If not provided, it will be auto-detected.
2976
- # - fmt (str): The desired format for the output date string. Defaults to '%Y-%m-%d'.
2977
-
2978
- # Returns:
2979
- # - dict: A dictionary containing the converted date string and its components (year, month, day).
2980
-
2981
- # Raises:
2982
- # - ValueError: If the date cannot be parsed.
2983
- # """
2984
- # from dateutil import parser
2985
- # try:
2986
- # if not isinstance(date_str,str):
2987
- # date_str=str(date_str)
2988
- # # Parse the date using the provided original format or auto-detect
2989
- # if original_fmt:
2990
- # try:
2991
- # date_obj = datetime.strptime(date_str, original_fmt)
2992
- # except Exception as e:
2993
- # print(e)
2994
- # date_obj=None
2995
- # else:
2996
- # try:
2997
- # date_obj = parser.parse(date_str)
2998
- # except Exception as e:
2999
- # print(e)
3000
- # date_obj=None
3001
- # # Return formatted string if `fmt` is specified, otherwise return the datetime object
3002
- # if date_obj is not None:
3003
- # if fmt:
3004
- # date_obj=date_obj.strftime(fmt)
3005
- # else:
3006
- # date_obj=date_str
3007
- # return date_obj
3008
-
3009
- # except (ValueError, TypeError) as e:
3010
- # raise ValueError(f"Unable to process date string: '{date_str}'. Error: {e}")
3011
-
3012
-
3013
- # # str1=str2date(num2str(20240625),fmt="%a %d-%B-%Y")
3014
- # # print(str1)
3015
- # # str2=str2num(str2date(str1,fmt='%a %Y%m%d'))
3016
- # # print(str2)
3017
-
3018
3288
  def str2date(
3019
3289
  date_str: Union[str, int, float],
3020
3290
  fmt: Optional[str] = "%Y-%m-%d",
@@ -4054,8 +4324,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
4054
4324
 
4055
4325
 
4056
4326
  def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_results: bool = True, **kws):
4057
- """
4058
- # Determines the splitting strategy:
4327
+ """# Determines the splitting strategy:
4059
4328
  # - "space", "whitespace", "sp": split by whitespace (default)
4060
4329
  # - "word": split into words using NLTK's word_tokenize
4061
4330
  # - "sentence", "sent": split into sentences using NLTK's sent_tokenize
@@ -4172,13 +4441,6 @@ def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_re
4172
4441
 
4173
4442
  def split_by_regex_end(text, pattern):
4174
4443
  return re.split(f"(?={pattern})", text)
4175
-
4176
- # def split_by_sentence_endings(text):
4177
- # return re.split(r"(?<=[.!?])", text)
4178
- # def split_non_ascii(text):
4179
- # # return re.split(r"([^\x00-\x7F\w\s,.!?:\"'()\-]+)", text)
4180
- # # return re.split(r"[^\x00-\x7F]+", text)
4181
- # return re.split(r"([^\x00-\x7F]+)", text)
4182
4444
  def split_non_ascii(text, keep_delimiters=False):
4183
4445
  """
4184
4446
  Split text at non-ASCII characters.
@@ -4903,145 +5165,6 @@ def _backup_validations(sheet, verbose=False):
4903
5165
 
4904
5166
  return backup
4905
5167
 
4906
- # def _backup_validations(sheet):
4907
- # """
4908
- # Complete validation backup with XML-level cross-sheet detection
4909
- # """
4910
- # from openpyxl.utils import get_column_letter
4911
- # import re
4912
- # from openpyxl.worksheet.datavalidation import DataValidation
4913
- # from openpyxl.xml.functions import fromstring
4914
-
4915
- # backup = {
4916
- # "validations": [],
4917
- # "conditional_formatting": [],
4918
- # "merged_cells": [str(mr) for mr in sheet.merged_cells.ranges],
4919
- # "_metadata": {
4920
- # "validated_cells": set(),
4921
- # "validated_columns": set(),
4922
- # "validation_types": set(),
4923
- # "cross_sheet_validations": set()
4924
- # }
4925
- # }
4926
-
4927
- # # METHOD 1: Primary validation backup (standard method)
4928
- # for dv in sheet.data_validations:
4929
- # # ... (existing standard validation backup code) ...
4930
-
4931
- # # METHOD 2: XML-based cross-sheet validation detection
4932
- # print("Performing deep XML scan for cross-sheet validations...")
4933
-
4934
- # # Access the worksheet XML directly
4935
- # xml_source = sheet._worksheet.xml
4936
- # if not xml_source:
4937
- # print("Warning: Could not access worksheet XML source")
4938
- # return backup
4939
-
4940
- # try:
4941
- # # Parse the XML
4942
- # root = fromstring(xml_source)
4943
- # ns = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
4944
-
4945
- # # Find all dataValidation elements
4946
- # for dv_xml in root.findall('.//ns:dataValidation', ns):
4947
- # try:
4948
- # # Extract validation attributes
4949
- # dv_type = dv_xml.get('type', 'none')
4950
- # formula1 = dv_xml.find('.//ns:formula1', ns)
4951
- # formula_text = formula1.text if formula1 is not None else None
4952
-
4953
- # # Skip if not a list type or no formula
4954
- # if dv_type != 'list' or not formula_text:
4955
- # continue
4956
-
4957
- # # Clean the formula
4958
- # clean_formula = formula_text.strip('"\'')
4959
-
4960
- # # Check for cross-sheet patterns
4961
- # cross_sheet_patterns = [
4962
- # (r'^[\w\s]+!\$?[A-Za-z]+\$?\d+(?::\$?[A-Za-z]+\$?\d+)?$', "direct sheet reference"),
4963
- # (r'INDIRECT\(["\'][\w\s]+![A-Za-z]+\d+(?::[A-Za-z]+\d+)?["\']\)', "INDIRECT sheet reference"),
4964
- # (r'^[^\s!]+$', "potential named range"),
4965
- # ]
4966
-
4967
- # # Determine if this is a cross-sheet reference
4968
- # is_cross_sheet = False
4969
- # detection_method = ""
4970
-
4971
- # for pattern, description in cross_sheet_patterns:
4972
- # if re.match(pattern, clean_formula, re.IGNORECASE):
4973
- # is_cross_sheet = True
4974
- # detection_method = description
4975
- # break
4976
-
4977
- # if not is_cross_sheet:
4978
- # continue
4979
-
4980
- # # Process the ranges
4981
- # ranges = []
4982
- # sqref = dv_xml.get('sqref', '')
4983
- # for range_str in sqref.split():
4984
- # try:
4985
- # # Convert range to coordinates
4986
- # if ':' in range_str:
4987
- # start, end = range_str.split(':')
4988
- # col_start = int(''.join(filter(str.isdigit, start)))
4989
- # col_end = int(''.join(filter(str.isdigit, end)))
4990
- # row_start = int(''.join(filter(str.isalpha, start)))
4991
- # row_end = int(''.join(filter(str.isalpha, end)))
4992
- # ranges.append({
4993
- # 'range': range_str,
4994
- # 'cells': [f"{get_column_letter(col)}{row}"
4995
- # for col in range(col_start, col_end+1)
4996
- # for row in range(row_start, row_end+1)]
4997
- # })
4998
- # else:
4999
- # col = int(''.join(filter(str.isdigit, range_str)))
5000
- # row = int(''.join(filter(str.isalpha, range_str)))
5001
- # ranges.append({
5002
- # 'range': range_str,
5003
- # 'cells': [f"{get_column_letter(col)}{row}"]
5004
- # })
5005
- # except Exception as e:
5006
- # print(f"Error parsing range {range_str}: {e}")
5007
-
5008
- # # Create validation record
5009
- # validation_data = {
5010
- # 'type': 'list',
5011
- # 'formula1': formula_text,
5012
- # 'formula2': None,
5013
- # 'allow_blank': dv_xml.get('allowBlank', '1') == '1',
5014
- # 'showDropDown': dv_xml.get('showDropDown', '1') == '1',
5015
- # 'showInputMessage': dv_xml.get('showInputMessage', '1') == '1',
5016
- # 'showErrorMessage': dv_xml.get('showErrorMessage', '0') == '1',
5017
- # 'errorTitle': dv_xml.get('errorTitle', ''),
5018
- # 'error': dv_xml.get('error', ''),
5019
- # 'promptTitle': dv_xml.get('promptTitle', ''),
5020
- # 'prompt': dv_xml.get('prompt', ''),
5021
- # 'ranges': ranges,
5022
- # '_source': 'xml_validation',
5023
- # '_detection_method': detection_method,
5024
- # '_is_cross_sheet': True,
5025
- # '_formula_clean': clean_formula
5026
- # }
5027
-
5028
- # # Add to backup
5029
- # backup['validations'].append(validation_data)
5030
- # for rng in ranges:
5031
- # for cell_ref in rng['cells']:
5032
- # backup['_metadata']['validated_cells'].add(cell_ref)
5033
- # backup['_metadata']['validated_columns'].add(''.join(filter(str.isalpha, cell_ref)))
5034
- # backup['_metadata']['validation_types'].add('list')
5035
- # backup['_metadata']['cross_sheet_validations'].add(clean_formula.split('!')[0])
5036
-
5037
- # except Exception as e:
5038
- # print(f"Error processing XML validation: {e}")
5039
-
5040
- # except Exception as e:
5041
- # print(f"Error parsing worksheet XML: {e}")
5042
-
5043
- # return backup
5044
-
5045
5168
  def _restore_validations(sheet, backup,verbose=False):
5046
5169
  """
5047
5170
  Restore data validation and conditional-formatting rules to the worksheet
@@ -5247,11 +5370,6 @@ def fload(fpath, kind=None, **kwargs):
5247
5370
  with open(fpath, "r") as file:
5248
5371
  content = file.read()
5249
5372
  return content
5250
-
5251
- # def load_html(fpath):
5252
- # with open(fpath, "r") as file:
5253
- # content = file.read()
5254
- # return content
5255
5373
  def load_html(fpath, **kwargs):
5256
5374
  return pd.read_html(fpath, **kwargs)
5257
5375
 
@@ -7118,7 +7236,7 @@ def listdir(
7118
7236
  hidden=False, # Include hidden files/folders
7119
7237
  orient="list",
7120
7238
  output="df", # "df", 'list','dict','records','index','series'
7121
- verbose=True,
7239
+ verbose=False,
7122
7240
  ):
7123
7241
  def is_hidden(filepath):
7124
7242
  """Check if a file or folder is hidden."""
@@ -7348,7 +7466,7 @@ def listdir(
7348
7466
  if "se" in orient.lower(): # records
7349
7467
  return Box(f.to_dict(orient="series"))
7350
7468
 
7351
-
7469
+
7352
7470
  def listpkg(where="env", verbose=False):
7353
7471
  """list all pacakages"""
7354
7472
 
@@ -7829,87 +7947,7 @@ def split_path(fpath):
7829
7947
  dir_par = f_slash.join(fpath.split(f_slash)[:-1])
7830
7948
  dir_ch = "".join(fpath.split(f_slash)[-1:])
7831
7949
  return dir_par, dir_ch
7832
-
7833
-
7834
- def figsave(*args, dpi=300, **kwargs):
7835
- import matplotlib.pyplot as plt
7836
- from PIL import Image
7837
- bbox_inches = kwargs.pop("bbox_inches", "tight")
7838
- pad_inches = kwargs.pop("pad_inches", 0)
7839
- facecolor = kwargs.pop("facecolor", "white")
7840
- edgecolor = kwargs.pop("edgecolor", "auto")
7841
-
7842
- dir_save = None
7843
- fname = None
7844
- img = None
7845
-
7846
- for arg in args:
7847
- if isinstance(arg, str):
7848
- path = Path(arg)
7849
- if path.suffix: # Has file extension
7850
- fname = path.name
7851
- dir_save = path.parent
7852
- else:
7853
- dir_save = path
7854
- elif isinstance(arg, (Image.Image, np.ndarray)):
7855
- img = arg # Store PIL image or numpy array
7856
-
7857
- # Set default save directory
7858
- dir_save = Path(dir_save) if dir_save else Path(".")
7859
- dir_save.mkdir(parents=True, exist_ok=True)
7860
-
7861
- # Handle filename and extension
7862
- if fname is None:
7863
- fname = "figure"
7864
- fname = dir_save / fname
7865
- if fname.suffix == "":
7866
- fname = fname.with_suffix(".pdf") # Default format
7867
-
7868
- ftype = fname.suffix.lstrip(".").lower()
7869
-
7870
- # Save figure based on file type
7871
- if ftype == "eps":
7872
- plt.savefig(fname, format="eps", bbox_inches=bbox_inches)
7873
- plt.savefig(fname.with_suffix(".pdf"), format="pdf", dpi=dpi,
7874
- pad_inches=pad_inches, bbox_inches=bbox_inches,
7875
- facecolor=facecolor, edgecolor=edgecolor)
7876
- elif ftype == "pdf":
7877
- plt.savefig(fname, format="pdf", dpi=dpi, pad_inches=pad_inches,
7878
- bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
7879
- elif ftype in ["jpg", "jpeg", "png", "tiff", "tif"]:
7880
- if img is not None: # If an image is provided
7881
- if isinstance(img, Image.Image):
7882
- img = img.convert("RGB") if img.mode == "RGBA" else img
7883
- img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
7884
- elif isinstance(img, np.ndarray):
7885
- import cv2
7886
- if img.ndim == 2:
7887
- Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
7888
- elif img.ndim == 3:
7889
- if img.shape[2] == 3:
7890
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
7891
- elif img.shape[2] == 4:
7892
- img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
7893
- Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
7894
- else:
7895
- raise ValueError("Unexpected image dimensions.")
7896
- else:
7897
- plt.savefig(fname, format=ftype, dpi=dpi, pad_inches=pad_inches,
7898
- bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
7899
- elif ftype == "ico":
7900
- if img is None:
7901
- plt.savefig(fname, dpi=dpi, pad_inches=pad_inches,
7902
- bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
7903
- img = Image.open(fname)
7904
- img = img.convert("RGBA")
7905
- icon_sizes = [(32, 32), (64, 64), (128, 128), (256, 256)]
7906
- img.save(fname, format="ICO", sizes=icon_sizes)
7907
- print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
7908
- else:
7909
- raise ValueError(f"Unsupported file format: {ftype}")
7910
-
7911
- print(f"\nSaved @ {fname} (dpi={dpi})")
7912
-
7950
+
7913
7951
  def figsave(*args, dpi=300, **kwargs):
7914
7952
  """
7915
7953
  Save a Matplotlib figure or image file in various formats.
@@ -8038,7 +8076,7 @@ def figsave(*args, dpi=300, **kwargs):
8038
8076
  img = img.convert("RGBA")
8039
8077
  img.save(fname, format="ICO", sizes=icon_sizes)
8040
8078
  print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
8041
- print(f"\n✅ Saved @: dpi={dpi}\n{fname}")
8079
+ print(f"\nSaved @: dpi={dpi}\n{fname}")
8042
8080
 
8043
8081
 
8044
8082
  def is_str_color(s):
@@ -8806,7 +8844,8 @@ def detect_angle(image, by="median", template=None):
8806
8844
 
8807
8845
  # Use Hough transform to detect lines
8808
8846
  lines = transform.probabilistic_hough_line(edges)
8809
-
8847
+ if isinstance(by, bool):
8848
+ by="mean" if by else 0
8810
8849
  if not lines and any(["me" in by, "pca" in by]):
8811
8850
  print("No lines detected. Adjust the edge detection parameters.")
8812
8851
  return 0
@@ -9180,7 +9219,7 @@ def imgsets(
9180
9219
  elif "cro" in k.lower() or "cut" in k.lower():
9181
9220
  img_update = img_update.crop(value)
9182
9221
  elif "rota" in k.lower():
9183
- if isinstance(value, str):
9222
+ if isinstance(value, (str,bool)):
9184
9223
  value = detect_angle(img_update, by=value)
9185
9224
  print(f"rotated by {value}°")
9186
9225
  img_update = img_update.rotate(value)
@@ -9524,11 +9563,252 @@ def finfo(fpath, output='json', verbose=False):
9524
9563
  extra_info=extra_info,
9525
9564
  )
9526
9565
 
9527
-
9528
-
9566
+ def color2rgb(
9567
+ color_input: str | tuple | list | None,
9568
+ alpha: float | None = None
9569
+ ) -> tuple | None:
9570
+ """
9571
+ Ultimate color conversion utility with support for multiple formats and transparency.
9572
+
9573
+ Parameters:
9574
+ -----------
9575
+ color_input : str | tuple | list | None
9576
+ Supported formats:
9577
+ - Hex strings ("#RRGGBB", "#RGB")
9578
+ - Named colors ("red", "blue")
9579
+ - RGB tuples ((0.2, 0.4, 0.6))
9580
+ - RGBA tuples ((0.2, 0.4, 0.6, 0.8))
9581
+ - HTML/CSS colors ("cornflowerblue")
9582
+ - CSS formats:
9583
+ - rgb(100,200,50)
9584
+ - rgba(100,200,50,0.8)
9585
+ - hsl(120,60%,70%)
9586
+ - hsla(120,60%,70%,0.8)
9587
+ alpha : float | None, optional
9588
+ Opacity value (0.0-1.0). If provided, adds/overrides alpha channel.
9589
+
9590
+ Returns:
9591
+ --------
9592
+ tuple | None
9593
+ (R, G, B) or (R, G, B, A) tuple in 0-1 range, or None if invalid
9594
+ """
9595
+ from matplotlib import colors as mcolors
9596
+ import re
9597
+
9598
+ if color_input is None:
9599
+ return None
9600
+
9601
+ # Case 1: Already in RGB/RGBA tuple format
9602
+ if isinstance(color_input, (tuple, list)):
9603
+ if 3 <= len(color_input) <= 4:
9604
+ if all(0 <= x <= 1 for x in color_input):
9605
+ if alpha is not None and len(color_input) == 3:
9606
+ return (*color_input, alpha)
9607
+ return tuple(color_input)
9608
+
9609
+ # Case 2: String input
9610
+ if isinstance(color_input, str):
9611
+ # Remove whitespace and make lowercase
9612
+ color_str = color_input.strip().lower()
9613
+
9614
+ # Handle CSS rgb/rgba format
9615
+ if color_str.startswith(('rgb(', 'rgba(')):
9616
+ try:
9617
+ nums = list(map(float, re.findall(r"[\d.]+", color_str)))
9618
+ if 3 <= len(nums) <= 4:
9619
+ rgb = tuple(x/255 if i < 3 else x for i, x in enumerate(nums))
9620
+ if alpha is not None:
9621
+ return (*rgb[:3], alpha)
9622
+ return rgb[:4] if len(rgb) == 4 else rgb[:3]
9623
+ except:
9624
+ pass
9625
+
9626
+ # Handle CSS hsl/hsla format
9627
+ elif color_str.startswith(('hsl(', 'hsla(')):
9628
+ try:
9629
+ nums = list(map(float, re.findall(r"[\d.]+", color_str)))
9630
+ if 3 <= len(nums) <= 4:
9631
+ h, s, l = nums[0]/360, nums[1]/100, nums[2]/100
9632
+ rgb = mcolors.hsv_to_rgb((h, s, l))
9633
+ if len(nums) == 4:
9634
+ rgb += (nums[3],)
9635
+ if alpha is not None:
9636
+ return (*rgb[:3], alpha)
9637
+ return rgb[:4] if len(rgb) == 4 else rgb[:3]
9638
+ except:
9639
+ pass
9640
+
9641
+ # Standard hex/named color processing
9642
+ try:
9643
+ rgb = mcolors.to_rgba(color_str)
9644
+ if alpha is not None:
9645
+ return (*rgb[:3], alpha)
9646
+ return rgb if len(rgb) == 4 and rgb[3] != 1 else rgb[:3]
9647
+ except ValueError:
9648
+ pass
9649
+
9650
+ # Fallback for invalid colors
9651
+ print(f"Warning: Invalid color format '{color_input}'")
9652
+ return None
9653
+
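# Hedged usage sketch for color2rgb as implemented above (illustrative values only):
#   color2rgb("#ff0000")           → (1.0, 0.0, 0.0)        # an alpha of 1 is dropped
#   color2rgb("red", alpha=0.5)    → (1.0, 0.0, 0.0, 0.5)   # explicit alpha is appended
#   color2rgb("rgb(255, 0, 0)")    → (1.0, 0.0, 0.0)        # CSS rgb() values scaled to 0-1
# Note that the hsl()/hsla() branch passes (h, s, l) to matplotlib's hsv_to_rgb, so HSL input
# is effectively interpreted as HSV.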
9654
+ def color2hex(
9655
+ color_input: str | tuple | list | dict | int | None,
9656
+ keep_alpha: bool = False,
9657
+ force_long: bool = False,
9658
+ uppercase: bool = False,
9659
+ prefix: str = "#",
9660
+ allow_short: bool = True
9661
+ ) -> str | None:
9662
+ """
9663
+ Ultimate color to hex converter with comprehensive format support.
9664
+
9665
+ Parameters:
9666
+ -----------
9667
+ color_input : str | tuple | list | dict | int | None
9668
+ Input color in any of these formats:
9669
+ - Hex strings ("#RRGGBB", "#RGB", "RRGGBB", "RGB")
9670
+ - Named colors ("red", "blue", "transparent")
9671
+ - RGB/RGBA tuples ((0.2, 0.4, 0.6), (255, 0, 0), (100, 100, 100, 0.5))
9672
+ - CSS formats:
9673
+ - rgb(100,200,50)
9674
+ - rgba(100,200,50,0.8)
9675
+ - hsl(120,60%,70%)
9676
+ - hsla(120,60%,70%,0.8)
9677
+ - Integer RGB (0xFF0000 for red)
9678
+ - Dictionary {"r": 255, "g": 0, "b": 0} or {"h": 0, "s": 100, "l": 50}
9679
+ keep_alpha : bool, optional
9680
+ Whether to include alpha channel in hex format (#RRGGBBAA)
9681
+ force_long : bool, optional
9682
+ Force 6/8-digit hex even when 3/4-digit would be possible
9683
+ uppercase : bool, optional
9684
+ Use uppercase hex characters (False for lowercase)
9685
+ prefix : str, optional
9686
+ Prefix for hex string ("#" for CSS, "0x" for programming, "" for raw)
9687
+ allow_short : bool, optional
9688
+ Allow shortened 3/4-digit hex when possible
9689
+
9690
+ Returns:
9691
+ --------
9692
+ str | None
9693
+ Hex color string or None if invalid
9694
+
9695
+ Examples:
9696
+ ---------
9697
+ >>> color2hex((0.5, 0.2, 0.8)) → "#7f33cc"
9698
+ >>> color2hex("rgb(127, 51, 204)") → "#7f33cc"
9699
+ >>> color2hex((0.2, 0.4, 0.6, 0.8), True) → "#336699cc"
9700
+ >>> color2hex(0xFF0000, uppercase=True) → "#FF0000"
9701
+ >>> color2hex({"r": 255, "g": 165, "b": 0}, prefix="") → "ffa500"
9702
+ >>> color2hex("hsl(120, 100%, 50%)") → "#00ff00"
9703
+ """
9704
+ from matplotlib import colors as mcolors
9705
+ import re
9706
+
9707
+ def to_rgba(color) -> tuple | None:
9708
+ """Internal conversion to RGBA tuple"""
9709
+ # Handle None
9710
+ if color is None:
9711
+ return None
9712
+
9713
+ # Handle integer RGB
9714
+ if isinstance(color, int):
9715
+ if color < 0:
9716
+ return None
9717
+ return (
9718
+ (color >> 16) & 0xFF,
9719
+ (color >> 8) & 0xFF,
9720
+ color & 0xFF,
9721
+ 255
9722
+ )
9723
+
9724
+ # Handle dictionary formats
9725
+ if isinstance(color, dict):
9726
+ keys = set(color.keys())
9727
+ if {'r','g','b'}.issubset(keys):
9728
+ return (
9729
+ color['r'] / 255 if color['r'] > 1 else color['r'],
9730
+ color['g'] / 255 if color['g'] > 1 else color['g'],
9731
+ color['b'] / 255 if color['b'] > 1 else color['b'],
9732
+ color.get('a', 1.0)
9733
+ )
9734
+ elif {'h','s','l'}.issubset(keys):
9735
+ return mcolors.hsv_to_rgb((
9736
+ color['h'] / 360,
9737
+ color['s'] / 100,
9738
+ color['l'] / 100
9739
+ )) + (color.get('a', 1.0),)
9740
+ return None
9741
+
9742
+ # Handle string formats
9743
+ if isinstance(color, str):
9744
+ color = color.strip().lower()
9745
+
9746
+ # Handle hex without prefix
9747
+ if re.match(r'^[0-9a-f]{3,8}$', color):
9748
+ return mcolors.to_rgba(f"#{color}")
9749
+
9750
+ # Handle CSS functions
9751
+ if color.startswith(('rgb(', 'rgba(', 'hsl(', 'hsla(')):
9752
+ try:
9753
+ return mcolors.to_rgba(color)
9754
+ except ValueError:
9755
+ return None
9756
+
9757
+ # Handle named colors (including 'transparent')
9758
+ try:
9759
+ return mcolors.to_rgba(color)
9760
+ except ValueError:
9761
+ return None
9762
+
9763
+ # Handle tuple/list formats
9764
+ if isinstance(color, (tuple, list)):
9765
+ if len(color) in (3, 4):
9766
+ # Normalize values
9767
+ normalized = []
9768
+ for i, v in enumerate(color):
9769
+ if i < 3: # RGB channels
9770
+ if isinstance(v, int):
9771
+ normalized.append(v / 255 if v > 1 else v)
9772
+ else:
9773
+ normalized.append(float(v))
9774
+ else: # Alpha channel
9775
+ normalized.append(float(v))
9776
+ return tuple(normalized)
9777
+
9778
+ return None
9779
+
9780
+ # Convert input to RGBA
9781
+ rgba = to_rgba(color_input)
9782
+ if rgba is None:
9783
+ return None
9784
+
9785
+ # Extract components
9786
+ components = []
9787
+ for i, c in enumerate(rgba):
9788
+ if i == 3 and not keep_alpha:
9789
+ break
9790
+ components.append(round(c * 255 if c <= 1 else c))
9791
+
9792
+ # Determine if we can use short format
9793
+ use_short = (allow_short and
9794
+ not force_long and
9795
+ len(components) in (3, 4) and
9796
+ all((x % 17 == 0) for x in components[:3]))
9797
+
9798
+ # Format the hex string
9799
+ if use_short:
9800
+ short_components = [x//17 for x in components[:3]] + components[3:]
9801
+ hex_str = "".join(f"{x:1x}" for x in short_components)
9802
+ else:
9803
+ hex_str = "".join(f"{x:02x}" for x in components)
9804
+
9805
+ # Apply case and prefix
9806
+ if uppercase:
9807
+ hex_str = hex_str.upper()
9808
+
9809
+ return f"{prefix}{hex_str}"
9529
9810
  # ! format excel file
9530
-
9531
-
9811
+
9532
9812
  def hex2argb(color):
9533
9813
  """
9534
9814
  Convert a color name or hex code to ARGB format required by openpyxl.
@@ -9753,6 +10033,105 @@ def copy_format(
9753
10033
  if "wb_target" in locals():
9754
10034
  wb_target.close()
9755
10035
 
10036
+ def set_sheet_visible(
10037
+ fpath: str,
10038
+ sheet_name: Union[int, str, None,list] = 1,
10039
+ show: Union[bool, str] = True,
10040
+ exclude: Union[List[str], None,list,int] = None,
10041
+ verbose: bool = False
10042
+ ) -> None:
10043
+ """
10044
+ Modify sheet visibility in an Excel workbook.
10045
+ set_sheet_visible(fpath=dir_data_collection,sheet_name=None,show=1,verbose=1)
10046
+ Args:
10047
+ fpath (str): Path to the Excel workbook.
10048
+ sheet_name (int | str | None): Index or name of the sheet to apply visibility to.
10049
+ If None, all sheets are considered.
10050
+ show (bool | str): Visibility mode. Can be:
10051
+ - True -> visible
10052
+ - False -> veryHidden
10053
+ - 'visible', 'hidden', 'veryHidden' as str
10054
+ exclude (list[str] | None): List of sheet names to exclude from changes.
10055
+ verbose (bool): If True, logs actions.
10056
+ """
10057
+
10058
+ try:
10059
+ wb = fload(fpath, output="bit", get_validations=1)
10060
+ except Exception as e:
10061
+ raise FileNotFoundError(f"Unable to load workbook: {e}")
10062
+
10063
+ sheet_names = wb.sheetnames
10064
+ if verbose:
10065
+ print("Workbook loaded with sheets:")
10066
+ for i, name in enumerate(sheet_names):
10067
+ print(f" [{i}] {name}")
10068
+
10069
+ excludes=[]
10070
+ if exclude is None:
10071
+ exclude=[]
10072
+ if not isinstance(exclude, list):
10073
+ exclude = [exclude]
10074
+ for exclude_ in exclude:
10075
+ if isinstance(exclude_, str):
10076
+ excludes.append(strcmp(exclude_, sheet_names)[0])
10077
+ elif isinstance(exclude_, int):
10078
+ if 0 <= exclude_ < len(sheet_names):
10079
+ excludes.append(sheet_names[exclude_])
10080
+ else:
10081
+ raise IndexError(f"sheet_name index {exclude_} is out of range:0~{len(sheet_names)-1}.")
10082
+
10083
+ # Resolve the sheet_name target
10084
+ target_indices = []
10085
+ if not isinstance(sheet_name,list):
10086
+ sheet_name=[sheet_name]
10087
+ for sheet_name_ in sheet_name:
10088
+ if sheet_name_ is None:
10089
+ target_indices = list(range(len(sheet_names)))
10090
+ break
10091
+ elif isinstance(sheet_name_, int):
10092
+ if 0 <= sheet_name_ < len(sheet_names):
10093
+ target_indices.append(sheet_name_)
10094
+ else:
10095
+ raise IndexError(f"sheet_name index {sheet_name_} is out of range :0~{len(sheet_names)-1}.")
10096
+ elif isinstance(sheet_name_, str):
10097
+ idx = strcmp(sheet_name_, sheet_names)[1]
10098
+ if idx == -1:
10099
+ raise ValueError(f"Sheet '{sheet_name_}' not found.")
10100
+ target_indices.append(idx)
10101
+
10102
+ # Map show argument to valid state
10103
+ valid_states = ["veryHidden", "visible", "hidden"]
10104
+ if isinstance(show, str):
10105
+ if show not in valid_states:
10106
+ raise ValueError(f"Invalid show value '{show}'. Must be one of {valid_states}")
10107
+ state = show
10108
+ else:
10109
+ state = "visible" if show else "veryHidden"
10110
+ # Modify sheet visibility
10111
+ for idx in target_indices:
10112
+ ws= wb[sheet_names[idx]]
10113
+ if ws.title in excludes:
10114
+ if verbose:
10115
+ print(f"Skipping excluded sheet: '{ws.title}'")
10116
+ continue
10117
+ ws.sheet_state = state
10118
+ # Ensure at least one sheet is visible
10119
+ visible_sheets = [s for s in wb.worksheets if s.sheet_state == "visible"]
10120
+ not_visible_sheets = [s for s in wb.worksheets if s.sheet_state != "visible"]
10121
+ if not visible_sheets:
10122
+ fallback_sheet = wb.worksheets[0]
10123
+ fallback_sheet.sheet_state = "visible"
10124
+ if verbose:
10125
+ print(f"No visible sheets found. Setting '{fallback_sheet.title}' to visible.")
10126
+ if verbose:
10127
+ print(f"visible sheets:{[s.title for s in visible_sheets]}")
10128
+
10129
+ try:
10130
+ wb.save(fpath)
10131
+ except Exception as e:
10132
+ raise IOError(f"Error saving workbook: {e}")
10133
+
10134
+
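# Hedged usage sketch for set_sheet_visible above ("report.xlsx" and the sheet names are
# placeholders, not taken from the package):
#   set_sheet_visible("report.xlsx", sheet_name=None, show=True)        # unhide every sheet
#   set_sheet_visible("report.xlsx", sheet_name=["Raw", 2], show="hidden", exclude=["Summary"])
# If the requested state would leave no sheet visible, the first worksheet is forced back to
# "visible" before the workbook is saved.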
9756
10135
  def format_excel(
9757
10136
  df: pd.DataFrame=None,
9758
10137
  filename:str=None,
@@ -14580,209 +14959,213 @@ def df_reducer(
14580
14959
 
14581
14960
  # example:
14582
14961
  # df_reducer(data=data_log, columns=markers, n_components=2)
14962
+
14583
14963
 
14584
14964
 
14585
- def get_df_format(data, threshold_unique=0.5, verbose=False):
14965
+ def get_df_format(data, threshold_unique=0.5, verbose=False, sample_size=1000):
14586
14966
  """
14587
- 检测表格: long, wide or uncertain.
14588
-
14967
+ Detect whether a DataFrame is in long or wide format with optimized performance and accuracy.
14968
+
14589
14969
  Parameters:
14590
- - data (pd.DataFrame): DataFrame to check.
14591
- - threshold_unique (float): Proportion threshold for detecting categorical columns.
14592
-
14970
+ - data (pd.DataFrame): DataFrame to analyze
14971
+ - threshold_unique (float): Threshold for categorical column detection (0-1)
14972
+ - verbose (bool): Whether to print diagnostic messages
14973
+ - sample_size (int): Maximum number of rows/columns to sample for large datasets
14974
+
14593
14975
  Returns:
14594
- - "long" if detected as long format,
14976
+ - "long" if detected as long format
14595
14977
  - "wide" if detected as wide format
14596
- - "uncertain" if ambiguous.
14978
+ - "uncertain" if format is ambiguous
14597
14979
  """
14980
+ import pandas as pd
14981
+ import numpy as np
14598
14982
  from scipy.stats import entropy
14599
14983
  from sklearn.cluster import AgglomerativeClustering
14600
14984
  from sklearn.preprocessing import StandardScaler
14601
-
14602
- long_score,wide_score,fs = 0,0,500
14603
- n_rows, n_cols = data.shape
14604
- # -----to reduce memory, only check 500 rows/columns----
14605
- if n_rows > fs:
14606
- if verbose:
14607
- print(f"Sampling {fs} rows from {n_rows} rows.")
14608
- data = data.sample(n=fs, random_state=1)
14609
- if n_cols > fs:
14610
- if verbose:
14611
- print(f"Using first {fs} columns out of {n_cols} columns.")
14612
- data = data.iloc[:, :fs]
14985
+ from sklearn.metrics import pairwise_distances
14986
+ from collections import Counter
14987
+ import re
14988
+ # ----- Initial Setup and Sampling -----
14613
14989
  n_rows, n_cols = data.shape
14990
+ if verbose:
14991
+ print(f"Initial shape: {n_rows} rows, {n_cols} columns")
14614
14992
 
14615
- # Step 1: Row-Column Ratio Heuristic
14616
- if n_rows > 3 * n_cols:
14617
- long_score += 2
14618
- if verbose:
14619
- print(
14620
- "Row-Column Ratio suggests long format (many rows relative to columns)."
14621
- )
14622
- elif n_cols > 3 * n_rows:
14623
- wide_score += 2
14624
- if verbose:
14625
- print(
14626
- "Row-Column Ratio suggests wide format (many columns relative to rows)."
14627
- )
14628
-
14629
- # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
14630
- unique_counts = data.apply(lambda x: x.nunique())
14993
+ # Sample data if too large
14994
+ if n_rows > sample_size:
14995
+ data = data.sample(n=sample_size, random_state=42)
14996
+ n_rows = sample_size
14997
+ if n_cols > sample_size:
14998
+ data = data.iloc[:, :sample_size]
14999
+ n_cols = sample_size
15000
+
15001
+ # Early exit for tiny datasets
15002
+ if n_rows < 3 or n_cols < 3:
15003
+ return "uncertain"
15004
+
15005
+ long_score = 0
15006
+ wide_score = 0
15007
+
15008
+ # ----- Feature Extraction -----
15009
+ # Basic statistics
15010
+ row_col_ratio = n_rows / n_cols if n_cols != 0 else float('inf')
15011
+
15012
+ # Column types
15013
+ numeric_cols = data.select_dtypes(include=np.number).columns
15014
+ cat_cols = data.select_dtypes(include=['object', 'category']).columns
15015
+ other_cols = [col for col in data.columns if col not in numeric_cols and col not in cat_cols]
15016
+
15017
+ # Unique value analysis
15018
+ unique_counts = data.nunique(dropna=False)
14631
15019
  duplicate_ratio = 1 - unique_counts / n_rows
14632
- if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
14633
- wide_score += 2
14634
- if verbose:
14635
- print("High duplicate values in columns suggest wide format.")
14636
- else:
14637
- long_score += 1
14638
- if verbose:
14639
- print(
14640
- "Lower duplicate ratio suggests long format (higher row variability)."
14641
- )
14642
-
14643
- # Calculate entropy for categorical columns
14644
- categorical_cols = data.select_dtypes(include=["object", "category"]).columns
14645
- if len(categorical_cols) > 0:
14646
- for col in categorical_cols:
14647
- counts = data[col].value_counts(normalize=True)
14648
- col_entropy = entropy(counts)
14649
- if col_entropy < 1.5:
14650
- long_score += 1
14651
- if verbose:
14652
- print(
14653
- f"Column '{col}' entropy suggests categorical, supporting long format."
14654
- )
14655
- else:
14656
- wide_score += 1
14657
- if verbose:
14658
- print(f"Column '{col}' entropy is higher, supporting wide format.")
14659
-
14660
- # Step 3: Column grouping analysis for patterns in suffixes/prefixes
15020
+
15021
+ # Missing values
15022
+ missing_per_row = data.isna().sum(axis=1)
15023
+ missing_per_col = data.isna().sum()
15024
+
15025
+ # Column name patterns
14661
15026
  col_names = data.columns.astype(str)
14662
- suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
14663
- if suffix_count > 0.3 * n_cols:
15027
+ has_suffix = sum(bool(re.search(r'(_\d+|\d+_?$)', col)) for col in col_names)
15028
+ has_time = sum(bool(re.search(r'(^time|^date|^year|^month|^day|^t\d+)', col.lower())) for col in col_names)
15029
+
15030
+ # ----- Scoring Rules -----
15031
+
15032
+ # 1. Row-Column Ratio (weighted)
15033
+ if row_col_ratio > 5:
15034
+ long_score += 3
15035
+ if verbose: print(f"High row/col ratio ({row_col_ratio:.1f}) → long +3")
15036
+ elif row_col_ratio < 0.2:
15037
+ wide_score += 3
15038
+ if verbose: print(f"Low row/col ratio ({row_col_ratio:.1f}) → wide +3")
15039
+ elif row_col_ratio > 2:
15040
+ long_score += 1
15041
+ if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → long +1")
15042
+ elif row_col_ratio < 0.5:
15043
+ wide_score += 1
15044
+ if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → wide +1")
15045
+
15046
+ # 2. Duplication Patterns
15047
+ high_dupe_cols = sum(duplicate_ratio > 0.3)
15048
+ if high_dupe_cols > 0.6 * n_cols:
14664
15049
  wide_score += 2
14665
- if verbose:
14666
- print(
14667
- "Detected suffix/prefix patterns in column names, suggesting wide format."
14668
- )
14669
-
14670
- # Step 4: Entity identifier detection for long format with categorical columns
14671
- if len(categorical_cols) > 0 and n_rows > n_cols:
14672
- entity_identifier_count = sum(
14673
- data.duplicated(subset=categorical_cols, keep=False)
14674
- )
14675
- if entity_identifier_count > 0.2 * n_rows:
15050
+ if verbose: print(f"Many columns ({high_dupe_cols}/{n_cols}) with duplicates → wide +2")
15051
+ elif high_dupe_cols < 0.2 * n_cols:
15052
+ long_score += 1
15053
+ if verbose: print(f"Few columns ({high_dupe_cols}/{n_cols}) with duplicates → long +1")
15054
+
15055
+ # 3. Categorical Column Analysis
15056
+ if len(cat_cols) > 0:
15057
+ # Entropy analysis
15058
+ cat_entropies = []
15059
+ for col in cat_cols:
15060
+ counts = data[col].value_counts(normalize=True, dropna=False)
15061
+ cat_entropies.append(entropy(counts))
15062
+
15063
+ avg_cat_entropy = np.mean(cat_entropies) if cat_entropies else 0
15064
+ if avg_cat_entropy < 1.2:
14676
15065
  long_score += 2
14677
- if verbose:
14678
- print(
14679
- "Significant duplicate rows based on categorical columns, suggesting long format."
14680
- )
14681
-
14682
- # Step 5: Clustering analysis on numerical columns for correlation in wide format
14683
- numeric_cols = data.select_dtypes(include="number").columns
14684
- if len(numeric_cols) > 1:
14685
- try:
14686
- scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
14687
- clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
14688
- cluster_labels = pd.Series(clustering.labels_)
14689
- if cluster_labels.nunique() < len(numeric_cols) * 0.5:
14690
- wide_score += 2
14691
- if verbose:
14692
- print(
14693
- "Clustering on columns shows grouping, suggesting wide format."
14694
- )
14695
- except Exception as e:
14696
- print(e) if verbose else None
14697
-
14698
- # Step 6: Inter-column correlation analysis
14699
- if len(numeric_cols) > 1:
15066
+ if verbose: print(f"Low categorical entropy ({avg_cat_entropy:.2f}) → long +2")
15067
+ elif avg_cat_entropy > 2:
15068
+ wide_score += 1
15069
+ if verbose: print(f"High categorical entropy ({avg_cat_entropy:.2f}) → wide +1")
15070
+
15071
+ # Entity identifier detection
15072
+ if len(cat_cols) >= 2 and n_rows > 10:
15073
+ dup_rows = data.duplicated(subset=cat_cols.tolist()[:2], keep=False).sum()
15074
+ if dup_rows > 0.3 * n_rows:
15075
+ long_score += 2
15076
+ if verbose: print(f"Duplicate rows in categorical cols ({dup_rows}/{n_rows}) → long +2")
15077
+
15078
+ # 4. Column Name Patterns
15079
+ if has_suffix > 0.4 * n_cols:
15080
+ wide_score += 2
15081
+ if verbose: print(f"Many suffix patterns ({has_suffix}/{n_cols}) → wide +2")
15082
+ if has_time > 0.3 * n_cols:
15083
+ wide_score += 1
15084
+ if verbose: print(f"Time-like columns ({has_time}/{n_cols}) → wide +1")
15085
+
15086
+ # 5. Numeric Column Analysis (only if enough numeric columns)
15087
+ if len(numeric_cols) > 2:
15088
+ # Correlation analysis
14700
15089
  corr_matrix = data[numeric_cols].corr().abs()
14701
- avg_corr = (
14702
- corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
14703
- )
14704
- if avg_corr > 0.6:
15090
+ avg_corr = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)].mean()
15091
+
15092
+ if avg_corr > 0.5:
14705
15093
  wide_score += 2
14706
- if verbose:
14707
- print("High inter-column correlation suggests wide format.")
14708
-
14709
- # Step 7: Missing value pattern analysis
14710
- missing_patterns = data.isna().sum(axis=1)
14711
- if missing_patterns.std() < 2:
15094
+ if verbose: print(f"High numeric correlation ({avg_corr:.2f}) → wide +2")
15095
+ elif avg_corr < 0.2:
15096
+ long_score += 1
15097
+ if verbose: print(f"Low numeric correlation ({avg_corr:.2f}) → long +1")
15098
+
15099
+ # Entropy analysis
15100
+ try:
15101
+ numeric_data = data[numeric_cols].dropna()
15102
+ if len(numeric_data) > 10:
15103
+ numeric_entropy = numeric_data.apply(lambda x: entropy(pd.cut(x, bins=min(10, len(x.unique()))).value_counts(normalize=True)))
15104
+ if numeric_entropy.mean() < 1.5:
15105
+ wide_score += 1
15106
+ if verbose: print(f"Low numeric entropy ({numeric_entropy.mean():.2f}) → wide +1")
15107
+ except Exception as e:
15108
+ if verbose: print(f"Numeric entropy failed: {str(e)}")
15109
+
15110
+ # 6. Missing Value Patterns
15111
+ missing_row_std = missing_per_row.std()
15112
+ if missing_row_std < 1 and missing_per_row.mean() > 0.1 * n_cols:
14712
15113
  wide_score += 1
14713
- if verbose:
14714
- print(
14715
- "Low variation in missing patterns across rows, supporting wide format."
14716
- )
14717
- elif missing_patterns.mean() < 1:
15114
+ if verbose: print(f"Uniform missing pattern (std={missing_row_std:.2f}) → wide +1")
15115
+ elif missing_per_row.mean() < 0.05 * n_cols:
14718
15116
  long_score += 1
14719
- if verbose:
14720
- print("Lower missing pattern suggests long format (less structured).")
14721
-
14722
- # Step 8: Multi-level clustering on rows to detect block structure for wide format
14723
- if len(numeric_cols) > 1 and n_rows > 5:
15117
+ if verbose: print(f"Few missing values → long +1")
15118
+
15119
+ # 7. Advanced Clustering (only for medium/large datasets)
15120
+ if len(numeric_cols) > 3 and n_rows > 10 and n_cols > 5:
14724
15121
  try:
14725
- clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
14726
- if pd.Series(clustering_rows.labels_).nunique() < 2:
14727
- wide_score += 2
14728
- if verbose:
14729
- print("Row clustering reveals homogeneity, suggesting wide format.")
15122
+ # Efficient clustering with sampling
15123
+ sample_data = data[numeric_cols].sample(n=min(100, n_rows), random_state=42)
15124
+ scaled_data = StandardScaler().fit_transform(sample_data.dropna())
15125
+
15126
+ if scaled_data.shape[0] > 5:
15127
+ # Column clustering
15128
+ col_dist = pairwise_distances(scaled_data.T)
15129
+ col_clusters = AgglomerativeClustering(n_clusters=2,
15130
+ metric='precomputed',  # scikit-learn >= 1.2 renamed 'affinity' to 'metric' (removed in 1.4)
15131
+ linkage='complete').fit(col_dist)
15132
+ cluster_counts = Counter(col_clusters.labels_)
15133
+ if max(cluster_counts.values()) > 0.7 * len(numeric_cols):
15134
+ wide_score += 2
15135
+ if verbose: print(f"Column clustering shows dominant group → wide +2")
15136
+
15137
+ # Row clustering
15138
+ row_clusters = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
15139
+ row_cluster_counts = Counter(row_clusters.labels_)
15140
+ if max(row_cluster_counts.values()) > 0.8 * scaled_data.shape[0]:
15141
+ wide_score += 1
15142
+ if verbose: print(f"Row clustering shows homogeneity → wide +1")
14730
15143
  except Exception as e:
14731
- print(e) if verbose else None
14732
-
14733
- # Step 9: Sequential name detection for time-series pattern in wide format
14734
- if any(col.isdigit() or col.startswith("T") for col in col_names):
14735
- wide_score += 1
14736
- if verbose:
14737
- print("Detected time-like sequential column names, supporting wide format.")
14738
-
14739
- # Step 10: Entropy of numeric columns
14740
- try:
14741
- numeric_entropy = data[numeric_cols].apply(
14742
- lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
14743
- )
14744
- if numeric_entropy.mean() < 2:
14745
- wide_score += 2
14746
- if verbose:
14747
- print(
14748
- "Low entropy in numeric columns indicates stability across columns, supporting wide format."
14749
- )
14750
- except Exception as e:
14751
- print(e) if verbose else None
14752
-
14753
- # Step 11: Tie-breaking strategy if scores are equal
14754
- if wide_score == long_score:
14755
- if n_cols > n_rows:
14756
- wide_score += 1
14757
- if verbose:
14758
- print(
14759
- "Tie-breaking based on column-major structure, favoring wide format."
14760
- )
14761
- elif n_rows > n_cols:
14762
- long_score += 1
14763
- if verbose:
14764
- print(
14765
- "Tie-breaking based on row-major structure, favoring long format."
14766
- )
14767
- else:
14768
- if verbose:
14769
- print("Tie-breaking inconclusive; returning 'uncertain'.")
14770
- return "uncertain"
14771
-
14772
- # Final decision
14773
- if wide_score > long_score:
14774
- if verbose:
14775
- print("Final decision: Wide format.")
14776
- return "wide"
14777
- elif long_score > wide_score:
14778
- if verbose:
14779
- print("Final decision: Long format.")
14780
- return "long"
15144
+ if verbose: print(f"Clustering skipped: {str(e)}")
15145
+
15146
+ # ----- Decision Logic -----
15147
+ score_diff = long_score - wide_score
15148
+ abs_diff = abs(score_diff)
15149
+
15150
+ if verbose:
15151
+ print(f"\nFinal scores - Long: {long_score}, Wide: {wide_score}")
15152
+
15153
+ if abs_diff >= 3:
15154
+ return "long" if score_diff > 0 else "wide"
15155
+ elif abs_diff >= 1:
+ return "long" if score_diff > 0 else "wide"
+ # Scores tied: fall back to structural tie-breakers
+ elif row_col_ratio > 1.5:
+ return "long"
+ elif row_col_ratio < 0.67:
+ return "wide"
+ elif len(cat_cols) > len(numeric_cols):
+ return "long"
14781
15167
  else:
14782
- if verbose:
14783
- print("Final decision: Uncertain format.")
14784
15168
  return "uncertain"
14785
-
14786
15169
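+
+ # A minimal usage sketch (illustrative only; `detect_format` is a placeholder name,
+ # since the enclosing detector's signature lies outside this hunk):
+ #
+ #   df_long = pd.DataFrame({"id": [1, 1, 2, 2], "time": [1, 2, 1, 2], "value": [3.0, 4.0, 5.0, 6.0]})
+ #   df_wide = df_long.pivot(index="id", columns="time", values="value")
+ #   detect_format(df_long, verbose=True)   # expected to lean towards "long"
+ #   detect_format(df_wide, verbose=True)   # expected to lean towards "wide"
+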
  #! ========== workbook, worksheet, wb,ws =============
14787
15170
 
14788
15171
  import openpyxl
@@ -17221,3 +17604,290 @@ def set_theme(
17221
17604
  color_codes=color_codes,
17222
17605
  rc=rc_params,
17223
17606
  )
17607
+
17608
+
17609
+
17610
+ def df_wide_long(df):
+ """Quick shape-based guess: more columns than rows suggests "Wide", more rows suggests "Long"."""
+ rows, columns = df.shape
+ if columns > rows:
+ return "Wide"
+ elif rows > columns:
+ return "Long"
+ return None  # square frames are ambiguous by shape alone
17616
+
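+ # For example (hypothetical frames): df_wide_long(pd.DataFrame(np.zeros((10, 3)))) returns "Long",
+ # a 3 x 10 frame returns "Wide", and equal dimensions fall through to None.
+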
17617
+ def df2array(data: pd.DataFrame, x=None, y=None, hue=None, sort=False):
17618
+
17619
+ def sort_rows_move_nan(arr, sort=False):
17620
+ # Handle edge cases where all values are NaN
17621
+ if np.all(np.isnan(arr)):
17622
+ return arr # Return unchanged if the entire array is NaN
17623
+
17624
+ if sort:
17625
+ # Replace NaNs with a temporary large value for sorting
17626
+ temp_value = (
17627
+ np.nanmax(arr[np.isfinite(arr)]) + 1 if np.any(np.isfinite(arr)) else np.inf
17628
+ )
17629
+ arr_no_nan = np.where(np.isnan(arr), temp_value, arr)
17630
+
17631
+ # Sort each row
17632
+ sorted_arr = np.sort(arr_no_nan, axis=1)
17633
+
17634
+ # Move NaNs to the end
17635
+ result_arr = np.where(sorted_arr == temp_value, np.nan, sorted_arr)
17636
+ else:
17637
+ result_rows = []
17638
+ for row in arr:
17639
+ # Separate non-NaN and NaN values
17640
+ non_nan_values = row[~np.isnan(row)]
17641
+ nan_count = np.isnan(row).sum()
17642
+ # Create a new row with non-NaN values followed by NaNs
17643
+ new_row = np.concatenate([non_nan_values, [np.nan] * nan_count])
17644
+ result_rows.append(new_row)
17645
+ # Convert the list of rows back into a 2D NumPy array
17646
+ result_arr = np.array(result_rows)
17647
+
17648
+ # Remove rows/columns that contain only NaNs
17649
+ clean_arr = result_arr[~np.isnan(result_arr).all(axis=1)]
17650
+ clean_arr_ = clean_arr[:, ~np.isnan(clean_arr).all(axis=0)]
17651
+
17652
+ return clean_arr_
17653
+ # data = data.copy()
17654
+ # data[y] = pd.to_numeric(data[y], errors="coerce")
17655
+ # data = data.dropna(subset=[y])
17656
+ if hue is None:
17657
+ a = []
17658
+ if sort:
17659
+ cat_x = np.sort(data[x].unique().tolist()).tolist()
17660
+ else:
17661
+ cat_x = data[x].unique().tolist()
17662
+ for i, x_ in enumerate(cat_x):
17663
+ new_ = data.loc[data[x] == x_, y].to_list()
17664
+ a = padcat(a, new_, axis=0)
17665
+ return sort_rows_move_nan(a).T
17666
+ else:
17667
+ a = []
17668
+ if sort:
17669
+ cat_x = np.sort(data[x].unique().tolist()).tolist()
17670
+ cat_hue = np.sort(data[hue].unique().tolist()).tolist()
17671
+ else:
17672
+ cat_x = data[x].unique().tolist()
17673
+ cat_hue = data[hue].unique().tolist()
17674
+ for i, x_ in enumerate(cat_x):
17675
+ for j, hue_ in enumerate(cat_hue):
17676
+ new_ = data.loc[(data[x] == x_) & (data[hue] == hue_), y].to_list()
17677
+ a = padcat(a, new_, axis=0)
17678
+ return sort_rows_move_nan(a).T
17679
+
17680
+
17681
+ def array2df(data: np.ndarray):
17682
+ df = pd.DataFrame()
17683
+ df["group"] = (
17684
+ np.tile(
17685
+ ["group" + str(i) for i in range(1, data.shape[1] + 1)], [data.shape[0], 1]
17686
+ )
17687
+ .reshape(-1, 1, order="F")[:, 0]
17688
+ .tolist()
17689
+ )
17690
+ df["value"] = data.reshape(-1, 1, order="F")
17691
+ return df
17692
+
17693
+
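+ # Illustrative round trip (assumed sample data, not part of this diff): df2array packs the
+ # `y` values of a long-format frame into one NaN-padded column per `x` category (and per
+ # `hue` level when given); array2df goes the other way, labelling columns group1..groupN.
+ #
+ #   df = pd.DataFrame({"x": ["a", "a", "b", "b", "b"], "y": [1.0, 2.0, 3.0, 4.0, 5.0]})
+ #   arr = df2array(df, x="x", y="y")   # shape (3, 2): one column per category, padded with NaN
+ #   back = array2df(arr)               # long frame with "group" and "value" columns
+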
17694
+ def padcat(*args, fill_value=np.nan, axis=1, order="row"):
17695
+ """
17696
+ Concatenate vectors with padding.
17697
+
17698
+ Parameters:
17699
+ *args : variable number of list or 1D arrays
17700
+ Input arrays to concatenate.
17701
+ fill_value : scalar, optional
17702
+ The value to use for padding the shorter lists (default is np.nan).
17703
+ axis : int, optional
17704
+ The axis along which to concatenate (0 for rows, 1 for columns, default is 1).
17705
+ order : str, optional
17706
+ The order for flattening when required: "row" or "column" (default is "row").
17707
+
17708
+ Returns:
17709
+ np.ndarray
17710
+ A 2D array with the input arrays concatenated along the specified axis,
17711
+ padded with fill_value where necessary.
17712
+
17713
+
17714
+ # Example usage:
17715
+ a = [1, np.nan]
17716
+ b = [1, 3, 4, np.nan, 2, np.nan]
17717
+ c = [1, 2, 3, 4, 5, 6, 7, 8, 10]
17718
+ d = padcat(a, b)
17719
+ result1 = padcat(d, c)
17720
+ result2 = padcat(a, b, c)
17721
+ print("Result of padcat(d, c):\n", result1)
17722
+ print("Result of padcat(a, b, c):\n", result2)
17723
+ """
17724
+ # Set the order for processing
17725
+ if "ro" in order.lower():
17726
+ order = "C" # row-major order
17727
+ else:
17728
+ order = "F" # column-major order
17729
+
17730
+ # Process input arrays based on their dimensions
17731
+ processed_arrays = []
17732
+ for arg in args:
17733
+ arr = np.asarray(arg)
17734
+ if arr.ndim == 1:
17735
+ processed_arrays.append(arr) # Keep 1D arrays as is
17736
+ elif arr.ndim == 2:
17737
+ if axis == 0:
17738
+ # If concatenating along rows, split 2D arrays into 1D arrays row-wise
17739
+ processed_arrays.extend(arr)
17740
+ elif axis == 1:
17741
+ # If concatenating along columns, split 2D arrays into 1D arrays column-wise
17742
+ processed_arrays.extend(arr.T)
17743
+ else:
17744
+ raise ValueError("axis must be 0 or 1")
17745
+ else:
17746
+ raise ValueError("Input arrays must be 1D or 2D")
17747
+
17748
+ if axis == 0:
17749
+ # Concatenate along rows
17750
+ max_len = max(arr.size for arr in processed_arrays)
17751
+ result = np.full((len(processed_arrays), max_len), fill_value)
17752
+ for i, arr in enumerate(processed_arrays):
17753
+ result[i, : arr.size] = arr
17754
+ elif axis == 1:
17755
+ # Concatenate along columns
17756
+ max_len = max(arr.size for arr in processed_arrays)
17757
+ result = np.full((max_len, len(processed_arrays)), fill_value)
17758
+ for i, arr in enumerate(processed_arrays):
17759
+ result[: arr.size, i] = arr
17760
+ else:
17761
+ raise ValueError("axis must be 0 or 1")
17762
+
17763
+ return result
17764
+
17765
+
17766
+ # ========== memory cleaner ==========
17767
+ import gc
17768
+ import os
17769
+ import sys
17770
+ import psutil
17771
+ import platform
17772
+ import ctypes
17773
+ import subprocess
17774
+ import warnings
17775
+ import time
17776
+
17777
+ class MemoryOptimizer:
17778
+ def __init__(self, verbose: bool = True, aggressive_mode: bool = True):
17779
+ self.verbose = verbose
17780
+ self.aggressive_mode = aggressive_mode
17781
+ self.system = platform.system()
17782
+ self.process = psutil.Process(os.getpid())
17783
+ self.start_time = time.time()
17784
+ self.memory_history = []
17785
+
17786
+ def log(self, msg: str, level: str = "INFO"):
17787
+ if self.verbose:
17788
+ rss = self.process.memory_info().rss / (1024 ** 2)
17789
+ elapsed = time.time() - self.start_time
17790
+ print(f"[{level}][{elapsed:.2f}s][{rss:.1f}MB] {msg}")
17791
+
17792
+ def collect_garbage(self):
17793
+ self.log("Performing deep garbage collection...")
17794
+ stats = {}
17795
+ before_mem = self.process.memory_info().rss
17796
+ for gen in reversed(range(3)):
17797
+ collected = gc.collect(gen)
17798
+ self.log(f"GC Gen {gen}: Collected {collected}")
17799
+ gc.garbage.clear()
17800
+ after_mem = self.process.memory_info().rss
17801
+ stats['freed_mb'] = (before_mem - after_mem) / (1024 ** 2)
17802
+ return stats
17803
+
17804
+ def clear_frameworks(self):
17805
+ result = {}
17806
+ try:
17807
+ import torch
17808
+ if torch.cuda.is_available():
17809
+ self.log("Clearing PyTorch cache...")
17810
+ torch.cuda.empty_cache()
17811
+ torch.cuda.ipc_collect()
17812
+ result['pytorch'] = 'cleared'
17813
+ except Exception as e:
17814
+ self.log(f"PyTorch skipped: {e}", "WARNING")
17815
+
17816
+ try:
17817
+ import tensorflow as tf
17818
+ self.log("Clearing TensorFlow session...")
17819
+ tf.keras.backend.clear_session()
17820
+ result['tensorflow'] = 'cleared'
17821
+ except Exception as e:
17822
+ self.log(f"TensorFlow skipped: {e}", "WARNING")
17823
+
17824
+ try:
17825
+ import cv2
17826
+ self.log("Closing OpenCV windows...")
17827
+ cv2.destroyAllWindows()
17828
+ result['opencv'] = 'cleared'
17829
+ except Exception:
17830
+ pass
17831
+
17832
+ try:
17833
+ import matplotlib.pyplot as plt
17834
+ self.log("Closing matplotlib figures...")
17835
+ plt.close('all')
17836
+ result['matplotlib'] = 'cleared'
17837
+ except Exception:
17838
+ pass
17839
+
17840
+ return result
17841
+
17842
+ def clear_system_caches(self):
17843
+ result = {}
17844
+ self.log("Attempting full system cache clearance...")
17845
+ try:
17846
+ if self.system == "Linux":
17847
+ subprocess.run(["sync"], check=True)
17848
+ subprocess.run(["sudo", "sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"], check=True)
17849
+ result['linux'] = 'caches dropped'
17850
+ elif self.system == "Darwin":
17851
+ subprocess.run(["sudo", "purge"], check=True)
17852
+ result['macos'] = 'purge run'
17853
+ elif self.system == "Windows":
17854
+ ctypes.windll.psapi.EmptyWorkingSet(-1)
17855
+ if self.aggressive_mode:
17856
+ ctypes.windll.kernel32.SetProcessWorkingSetSizeEx(
17857
+ -1, ctypes.c_size_t(-1), ctypes.c_size_t(-1), ctypes.c_uint(0x1)
17858
+ )
17859
+ result['windows'] = 'working set emptied'
17860
+ except Exception as e:
17861
+ self.log(f"System cache clearing failed: {e}", "ERROR")
17862
+ return result
17863
+
17864
+ def profile(self) -> Dict[str, Any]:
17865
+ mem = self.process.memory_info()
17866
+ vm = psutil.virtual_memory()
17867
+ profile = {
17868
+ 'rss_mb': mem.rss / (1024 ** 2),
17869
+ 'vms_mb': mem.vms / (1024 ** 2),
17870
+ 'used_gb': vm.used / (1024 ** 3),
17871
+ 'available_gb': vm.available / (1024 ** 3),
17872
+ 'percent': vm.percent,
17873
+ }
17874
+ self.memory_history.append(profile)
17875
+ return profile
17876
+
17877
+ def optimize(self) -> Dict[str, Any]:
17878
+ result = {}
17879
+ result['before'] = self.profile()
17880
+ result['gc'] = self.collect_garbage()
17881
+ result['frameworks'] = self.clear_frameworks()
17882
+ result['system'] = self.clear_system_caches()
17883
+ result['after'] = self.profile()
17884
+ saved = result['before']['rss_mb'] - result['after']['rss_mb']
17885
+ result['saved_mb'] = saved
17886
+ result['saved_percent'] = (saved / result['before']['rss_mb']) * 100 if result['before']['rss_mb'] else 0
17887
+ self.log(f"Optimization complete: Saved {saved:.2f} MB ({result['saved_percent']:.1f}%)", "SUCCESS")
17888
+ return result
17889
+
17890
+
17891
+ def cleaner(verbose: bool = True, aggressive: bool = True) -> Dict[str, Any]:
17892
+ optimizer = MemoryOptimizer(verbose=verbose, aggressive_mode=aggressive)
17893
+ return optimizer.optimize()
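+
+
+ # Minimal usage sketch (relies only on the psutil-backed profiling above): run one full
+ # optimization pass and inspect how much resident memory was released.
+ #
+ #   report = cleaner(verbose=True, aggressive=True)
+ #   print(report["saved_mb"], report["saved_percent"])
+ #   print(report["before"]["rss_mb"], "->", report["after"]["rss_mb"])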