py2ls 0.2.5.12__py3-none-any.whl → 0.2.5.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/ich2ls.py +1955 -296
- py2ls/ips.py +1278 -608
- py2ls/netfinder.py +12 -5
- py2ls/plot.py +13 -7
- py2ls/stats.py +1 -144
- {py2ls-0.2.5.12.dist-info → py2ls-0.2.5.14.dist-info}/METADATA +1 -1
- {py2ls-0.2.5.12.dist-info → py2ls-0.2.5.14.dist-info}/RECORD +8 -8
- {py2ls-0.2.5.12.dist-info → py2ls-0.2.5.14.dist-info}/WHEEL +1 -1
py2ls/ips.py
CHANGED
@@ -12,8 +12,7 @@ import re
|
|
12
12
|
import stat
|
13
13
|
import platform
|
14
14
|
|
15
|
-
from typing import Dict, List, Optional, Union, Any,Tuple
|
16
|
-
|
15
|
+
from typing import Dict, List, Optional, Union, Any, Tuple, Literal
|
17
16
|
from regex import X
|
18
17
|
|
19
18
|
try:
|
@@ -1663,75 +1662,464 @@ def flatten(nested: Any, unique_list=True, verbose=False):
|
|
1663
1662
|
return flattened_list
|
1664
1663
|
|
1665
1664
|
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1665
|
+
#! ===========extract_text===========
|
1666
|
+
def extract_text(
|
1667
|
+
text: Union[str, List[str]],
|
1668
|
+
patterns: Union[str, List[str]],
|
1669
|
+
*,
|
1670
|
+
mode: Literal["between", "split", "extract"] = "between",
|
1671
|
+
keep: Literal["none", "left", "right", "both", "markers"] = "none",
|
1672
|
+
case: Literal["sensitive", "insensitive"] = "insensitive",
|
1673
|
+
all_matches: bool = False,
|
1674
|
+
positions: bool = False,
|
1675
|
+
regex: bool = False,
|
1676
|
+
delimiter: Optional[str] = None,
|
1677
|
+
trim: bool = True,
|
1678
|
+
as_dict: bool = False,
|
1679
|
+
verbose: bool = False,
|
1680
|
+
**kwargs,
|
1681
|
+
) -> Union[List[str], Tuple[int, str], Dict[str, Any], List[Dict[str, Any]], None]:
|
1682
|
+
"""
|
1683
|
+
Ultimate text extraction tool with enhanced reliability and features.
|
1677
1684
|
|
1678
|
-
|
1679
|
-
|
1680
|
-
|
1681
|
-
|
1682
|
-
|
1685
|
+
Key improvements:
|
1686
|
+
- Robust split mode with proper delimiter handling
|
1687
|
+
- Consistent return types across all modes
|
1688
|
+
- Improved pattern matching logic
|
1689
|
+
- Better edge case handling
|
1683
1690
|
|
1684
|
-
# Returns:
|
1685
|
-
# tuple: A tuple containing the best match and its index in the candidates list.
|
1686
|
-
# """
|
1687
|
-
# from fuzzywuzzy import fuzz, process
|
1688
|
-
|
1689
|
-
# def to_lower(s, ignore_case=True):
|
1690
|
-
# # Converts a string or list of strings to lowercase if ignore_case is True.
|
1691
|
-
# if ignore_case:
|
1692
|
-
# if isinstance(s, str):
|
1693
|
-
# return s.lower()
|
1694
|
-
# elif isinstance(s, list):
|
1695
|
-
# s = [str(i) for i in s] # convert all to str
|
1696
|
-
# return [elem.lower() for elem in s]
|
1697
|
-
# return s
|
1698
|
-
# scorer = str(method).lower() if method is not None else scorer
|
1699
|
-
# str1_, str2_ = to_lower(search_term, ignore_case), to_lower(candidates, ignore_case)
|
1700
|
-
# if isinstance(str2_, list):
|
1701
|
-
# if "part" in scorer.lower():
|
1702
|
-
# similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
|
1703
|
-
# elif "w" in scorer.lower():
|
1704
|
-
# similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
|
1705
|
-
# elif "ratio" in scorer.lower() or "stri" in scorer.lower(): # Ratio (Strictest)
|
1706
|
-
# similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
|
1707
|
-
# else:
|
1708
|
-
# similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
|
1709
|
-
# if get_rank:
|
1710
|
-
# idx = [
|
1711
|
-
# similarity_scores.index(i)
|
1712
|
-
# for i in sorted(similarity_scores, reverse=True)
|
1713
|
-
# ]
|
1714
|
-
# if verbose:
|
1715
|
-
# display([candidates[ii] for ii in idx])
|
1716
|
-
# return [candidates[ii] for ii in idx]
|
1717
|
-
# best_match_index = similarity_scores.index(max(similarity_scores))
|
1718
|
-
# best_match_score = similarity_scores[best_match_index]
|
1719
|
-
# else:
|
1720
|
-
# best_match_index = 0
|
1721
|
-
# if "part" in scorer.lower():
|
1722
|
-
# best_match_score = fuzz.partial_ratio(str1_, str2_)
|
1723
|
-
# elif "w" in scorer.lower():
|
1724
|
-
# best_match_score = fuzz.WRatio(str1_, str2_)
|
1725
|
-
# elif "Ratio" in scorer.lower():
|
1726
|
-
# best_match_score = fuzz.ratio(str1_, str2_)
|
1727
|
-
# else:
|
1728
|
-
# best_match_score = fuzz.WRatio(str1_, str2_)
|
1729
|
-
# if verbose:
|
1730
|
-
# print(f"\nbest_match is: {candidates[best_match_index],best_match_score}")
|
1731
|
-
# best_match = process.extract(search_term, candidates)
|
1732
|
-
# print(f"建议: {best_match}")
|
1733
|
-
# return candidates[best_match_index], best_match_index
|
1734
1691
|
|
1692
|
+
print(extract_text("A,B,C", ",", mode="split", keep="none", all_matches=True))
|
1693
|
+
# Correctly returns: ['A', 'B', 'C']
|
1694
|
+
|
1695
|
+
print(extract_text("A,B,C", ",", mode="split", keep="left"))
|
1696
|
+
# Returns: ['A,', 'B,', 'C']
|
1697
|
+
|
1698
|
+
print(extract_text("A,B,C", ",", mode="split", keep="right"))
|
1699
|
+
# Returns: [',B', ',C']
|
1700
|
+
|
1701
|
+
print(extract_text("A,B,C", ",", mode="split", keep="both"))
|
1702
|
+
# Returns: ['A', ',', 'B', ',', 'C']
|
1703
|
+
"""
|
1704
|
+
if verbose:
|
1705
|
+
print("""
|
1706
|
+
extract_text(
|
1707
|
+
text: Union[str, List[str]],
|
1708
|
+
patterns: Union[str, List[str]],
|
1709
|
+
*,
|
1710
|
+
mode: Literal["between", "split", "extract"] = "between",
|
1711
|
+
keep: Literal["none", "left", "right", "both", "markers"] = "none",
|
1712
|
+
case: Literal["sensitive", "insensitive"] = "insensitive",
|
1713
|
+
all_matches: bool = False,
|
1714
|
+
positions: bool = False,
|
1715
|
+
regex: bool = False,
|
1716
|
+
delimiter: Optional[str] = None,
|
1717
|
+
trim: bool = True,
|
1718
|
+
as_dict: bool = False,
|
1719
|
+
verbose: bool = False,
|
1720
|
+
**kwargs,
|
1721
|
+
)
|
1722
|
+
""")
|
1723
|
+
# Normalization and validation
|
1724
|
+
text = _normalize_text(text, delimiter)
|
1725
|
+
patterns = _validate_patterns(patterns)
|
1726
|
+
flags = re.IGNORECASE if case == "insensitive" else 0
|
1727
|
+
|
1728
|
+
# Find all matches with enhanced validation
|
1729
|
+
matches = _find_matches(text, patterns, regex, flags)
|
1730
|
+
if not matches:
|
1731
|
+
return None
|
1732
|
+
|
1733
|
+
# Mode-specific processing
|
1734
|
+
if mode == "extract":
|
1735
|
+
return _handle_extract(matches, all_matches, as_dict, positions, trim)
|
1736
|
+
elif mode == "split":
|
1737
|
+
return _handle_split(text, matches, keep, all_matches, as_dict, positions, trim)
|
1738
|
+
elif mode == "between":
|
1739
|
+
return _handle_between(text, matches, patterns, keep, as_dict, positions, trim)
|
1740
|
+
else:
|
1741
|
+
raise ValueError(f"Invalid mode: {mode}")
|
1742
|
+
|
1743
|
+
|
1744
|
+
def _normalize_text(text: Union[str, List[str]], delimiter: Optional[str]) -> str:
|
1745
|
+
"""Normalize text input to single string"""
|
1746
|
+
if isinstance(text, list):
|
1747
|
+
return delimiter.join(text) if delimiter else " ".join(text)
|
1748
|
+
return text
|
1749
|
+
|
1750
|
+
|
1751
|
+
def _validate_patterns(patterns: Union[str, List[str]]) -> List[str]:
|
1752
|
+
"""Validate and normalize patterns"""
|
1753
|
+
if isinstance(patterns, str):
|
1754
|
+
return [patterns]
|
1755
|
+
if not patterns:
|
1756
|
+
raise ValueError("At least one pattern required")
|
1757
|
+
return patterns
|
1758
|
+
|
1759
|
+
|
1760
|
+
def _find_matches(
|
1761
|
+
text: str, patterns: List[str], regex: bool, flags: int
|
1762
|
+
) -> List[dict]:
|
1763
|
+
"""Find all pattern matches with enhanced regex handling"""
|
1764
|
+
matches = []
|
1765
|
+
for pattern in patterns:
|
1766
|
+
try:
|
1767
|
+
search_pattern = pattern if regex else re.escape(pattern)
|
1768
|
+
for match in re.finditer(search_pattern, text, flags=flags):
|
1769
|
+
matches.append(
|
1770
|
+
{
|
1771
|
+
"text": match.group(),
|
1772
|
+
"start": match.start(),
|
1773
|
+
"end": match.end(),
|
1774
|
+
"pattern": pattern,
|
1775
|
+
"full_match": match,
|
1776
|
+
}
|
1777
|
+
)
|
1778
|
+
except re.error as e:
|
1779
|
+
raise ValueError(f"Invalid pattern '{pattern}': {e}")
|
1780
|
+
return sorted(matches, key=lambda x: x["start"])
|
1781
|
+
|
1782
|
+
|
1783
|
+
def _handle_extract(
|
1784
|
+
matches: List[dict], all_matches: bool, as_dict: bool, positions: bool, trim: bool
|
1785
|
+
) -> Union[List, dict]:
|
1786
|
+
"""Handle text extraction of matched patterns"""
|
1787
|
+
results = []
|
1788
|
+
for match in matches if all_matches else [matches[0]]:
|
1789
|
+
content = match["text"].strip() if trim else match["text"]
|
1790
|
+
result = (
|
1791
|
+
{
|
1792
|
+
"text": content,
|
1793
|
+
"start": match["start"],
|
1794
|
+
"end": match["end"],
|
1795
|
+
"pattern": match["pattern"],
|
1796
|
+
}
|
1797
|
+
if as_dict
|
1798
|
+
else content
|
1799
|
+
)
|
1800
|
+
if positions and as_dict:
|
1801
|
+
result["positions"] = [(match["start"], match["end"])]
|
1802
|
+
results.append(result)
|
1803
|
+
|
1804
|
+
return results[0] if not all_matches else results
|
1805
|
+
|
1806
|
+
|
1807
|
+
def _create_part(
|
1808
|
+
content: str,
|
1809
|
+
start: int,
|
1810
|
+
end: int,
|
1811
|
+
match: Optional[dict],
|
1812
|
+
as_dict: bool,
|
1813
|
+
positions: bool,
|
1814
|
+
trim: bool,
|
1815
|
+
) -> Union[str, dict]:
|
1816
|
+
"""Create a standardized result part"""
|
1817
|
+
content = content.strip() if trim else content
|
1818
|
+
if not as_dict:
|
1819
|
+
return content
|
1820
|
+
|
1821
|
+
part = {
|
1822
|
+
"text": content,
|
1823
|
+
"start": start,
|
1824
|
+
"end": end,
|
1825
|
+
"pattern": match["pattern"] if match else None,
|
1826
|
+
}
|
1827
|
+
if positions and match:
|
1828
|
+
part["positions"] = [(match["start"], match["end"])]
|
1829
|
+
return part
|
1830
|
+
|
1831
|
+
|
1832
|
+
def _handle_between(
|
1833
|
+
text: str,
|
1834
|
+
matches: List[dict],
|
1835
|
+
patterns: List[str],
|
1836
|
+
keep: str,
|
1837
|
+
as_dict: bool,
|
1838
|
+
positions: bool,
|
1839
|
+
trim: bool,
|
1840
|
+
) -> Union[Tuple, dict]:
|
1841
|
+
"""Reliable between-mode implementation with boundary checks"""
|
1842
|
+
first_pattern, last_pattern = patterns[0], patterns[-1]
|
1843
|
+
first_matches = [m for m in matches if m["pattern"] == first_pattern]
|
1844
|
+
last_matches = [m for m in matches if m["pattern"] == last_pattern]
|
1845
|
+
|
1846
|
+
if not first_matches or not last_matches:
|
1847
|
+
return None
|
1848
|
+
|
1849
|
+
first = first_matches[0]
|
1850
|
+
last = last_matches[-1]
|
1851
|
+
|
1852
|
+
if first["start"] > last["start"]:
|
1853
|
+
return None
|
1854
|
+
|
1855
|
+
# Calculate extraction window
|
1856
|
+
start, end = first["start"], last["end"]
|
1857
|
+
if keep == "none":
|
1858
|
+
start, end = first["end"], last["start"]
|
1859
|
+
elif keep == "left":
|
1860
|
+
end = last["start"]
|
1861
|
+
elif keep == "right":
|
1862
|
+
start = first["end"]
|
1863
|
+
|
1864
|
+
extracted = text[start:end].strip() if trim else text[start:end]
|
1865
|
+
|
1866
|
+
if as_dict:
|
1867
|
+
result = {
|
1868
|
+
"text": extracted,
|
1869
|
+
"start": start,
|
1870
|
+
"end": end,
|
1871
|
+
"patterns": patterns,
|
1872
|
+
"match_positions": [(m["start"], m["end"]) for m in matches],
|
1873
|
+
}
|
1874
|
+
return result
|
1875
|
+
|
1876
|
+
return (
|
1877
|
+
(start, extracted)
|
1878
|
+
if not positions
|
1879
|
+
else (start, extracted, [(m["start"], m["end"]) for m in matches])
|
1880
|
+
)
|
1881
|
+
|
1882
|
+
|
1883
|
+
def _handle_split(
|
1884
|
+
text: str,
|
1885
|
+
matches: List[dict],
|
1886
|
+
keep: str,
|
1887
|
+
all_matches: bool,
|
1888
|
+
as_dict: bool,
|
1889
|
+
positions: bool,
|
1890
|
+
trim: bool,
|
1891
|
+
) -> Union[List, dict]:
|
1892
|
+
"""Split text with proper handling of keep='both' to include delimiters on both sides"""
|
1893
|
+
if not matches:
|
1894
|
+
return (
|
1895
|
+
[text]
|
1896
|
+
if not as_dict
|
1897
|
+
else [{"text": text, "start": 0, "end": len(text), "pattern": None}]
|
1898
|
+
)
|
1899
|
+
|
1900
|
+
parts = []
|
1901
|
+
prev_end = 0
|
1902
|
+
process_matches = matches if all_matches else [matches[0]]
|
1903
|
+
|
1904
|
+
# Special handling for keep="both"
|
1905
|
+
if keep == "both":
|
1906
|
+
for i, match in enumerate(process_matches):
|
1907
|
+
start, end = match["start"], match["end"]
|
1908
|
+
matched_text = text[start:end]
|
1909
|
+
|
1910
|
+
# First segment (text before first delimiter + first delimiter)
|
1911
|
+
if i == 0:
|
1912
|
+
segment = text[prev_end:end] # From start to end of first delimiter
|
1913
|
+
if trim:
|
1914
|
+
segment = segment.strip()
|
1915
|
+
if segment or not trim:
|
1916
|
+
if as_dict:
|
1917
|
+
parts.append(
|
1918
|
+
{
|
1919
|
+
"text": segment,
|
1920
|
+
"start": prev_end,
|
1921
|
+
"end": end,
|
1922
|
+
"pattern": match["pattern"],
|
1923
|
+
**({"positions": [(start, end)]} if positions else {}),
|
1924
|
+
}
|
1925
|
+
)
|
1926
|
+
else:
|
1927
|
+
parts.append(segment)
|
1928
|
+
prev_end = end
|
1929
|
+
|
1930
|
+
# Middle segments (delimiter + text + next delimiter)
|
1931
|
+
if i > 0 and i < len(process_matches):
|
1932
|
+
next_match = process_matches[i]
|
1933
|
+
next_start, next_end = next_match["start"], next_match["end"]
|
1934
|
+
segment = text[
|
1935
|
+
prev_end:next_end
|
1936
|
+
] # From prev_end to end of next delimiter
|
1937
|
+
if trim:
|
1938
|
+
segment = segment.strip()
|
1939
|
+
if segment or not trim:
|
1940
|
+
if as_dict:
|
1941
|
+
parts.append(
|
1942
|
+
{
|
1943
|
+
"text": segment,
|
1944
|
+
"start": prev_end,
|
1945
|
+
"end": next_end,
|
1946
|
+
"pattern": next_match["pattern"],
|
1947
|
+
**(
|
1948
|
+
{"positions": [(next_start, next_end)]}
|
1949
|
+
if positions
|
1950
|
+
else {}
|
1951
|
+
),
|
1952
|
+
}
|
1953
|
+
)
|
1954
|
+
else:
|
1955
|
+
parts.append(segment)
|
1956
|
+
prev_end = next_end
|
1957
|
+
|
1958
|
+
# Last segment (last delimiter + remaining text)
|
1959
|
+
if process_matches and prev_end < len(text):
|
1960
|
+
last_match = process_matches[-1]
|
1961
|
+
segment = text[
|
1962
|
+
last_match["start"] : len(text)
|
1963
|
+
] # From last delimiter to end
|
1964
|
+
if trim:
|
1965
|
+
segment = segment.strip()
|
1966
|
+
if segment or not trim:
|
1967
|
+
if as_dict:
|
1968
|
+
parts.append(
|
1969
|
+
{
|
1970
|
+
"text": segment,
|
1971
|
+
"start": last_match["start"],
|
1972
|
+
"end": len(text),
|
1973
|
+
"pattern": last_match["pattern"],
|
1974
|
+
**(
|
1975
|
+
{
|
1976
|
+
"positions": [
|
1977
|
+
(last_match["start"], last_match["end"])
|
1978
|
+
]
|
1979
|
+
}
|
1980
|
+
if positions
|
1981
|
+
else {}
|
1982
|
+
),
|
1983
|
+
}
|
1984
|
+
)
|
1985
|
+
else:
|
1986
|
+
parts.append(segment)
|
1987
|
+
|
1988
|
+
return parts
|
1989
|
+
|
1990
|
+
# Original handling for other keep modes
|
1991
|
+
for i, match in enumerate(process_matches):
|
1992
|
+
start, end = match["start"], match["end"]
|
1993
|
+
matched_text = text[start:end]
|
1994
|
+
|
1995
|
+
# Handle text before the match
|
1996
|
+
if prev_end < start:
|
1997
|
+
before = text[prev_end:start]
|
1998
|
+
if trim:
|
1999
|
+
before = before.strip()
|
2000
|
+
if before or not trim:
|
2001
|
+
if as_dict:
|
2002
|
+
parts.append(
|
2003
|
+
{
|
2004
|
+
"text": before,
|
2005
|
+
"start": prev_end,
|
2006
|
+
"end": start,
|
2007
|
+
"pattern": None,
|
2008
|
+
**({"positions": []} if positions else {}),
|
2009
|
+
}
|
2010
|
+
)
|
2011
|
+
else:
|
2012
|
+
parts.append(before)
|
2013
|
+
|
2014
|
+
# Handle the match based on keep mode
|
2015
|
+
if keep == "none":
|
2016
|
+
pass # Skip the delimiter
|
2017
|
+
elif keep == "left":
|
2018
|
+
if parts:
|
2019
|
+
if as_dict:
|
2020
|
+
parts[-1]["text"] += matched_text
|
2021
|
+
parts[-1]["end"] = end
|
2022
|
+
else:
|
2023
|
+
parts[-1] += matched_text
|
2024
|
+
else:
|
2025
|
+
if as_dict:
|
2026
|
+
parts.append(
|
2027
|
+
{
|
2028
|
+
"text": matched_text,
|
2029
|
+
"start": start,
|
2030
|
+
"end": end,
|
2031
|
+
"pattern": match["pattern"],
|
2032
|
+
**({"positions": [(start, end)]} if positions else {}),
|
2033
|
+
}
|
2034
|
+
)
|
2035
|
+
else:
|
2036
|
+
parts.append(matched_text)
|
2037
|
+
elif keep == "right":
|
2038
|
+
if i < len(process_matches) - 1:
|
2039
|
+
next_start = process_matches[i + 1]["start"]
|
2040
|
+
if end < next_start:
|
2041
|
+
between = text[end:next_start]
|
2042
|
+
if as_dict:
|
2043
|
+
parts.append(
|
2044
|
+
{
|
2045
|
+
"text": matched_text + between,
|
2046
|
+
"start": start,
|
2047
|
+
"end": next_start,
|
2048
|
+
"pattern": match["pattern"],
|
2049
|
+
**({"positions": [(start, end)]} if positions else {}),
|
2050
|
+
}
|
2051
|
+
)
|
2052
|
+
else:
|
2053
|
+
parts.append(matched_text + between)
|
2054
|
+
prev_end = next_start
|
2055
|
+
continue
|
2056
|
+
|
2057
|
+
prev_end = end
|
2058
|
+
|
2059
|
+
# Handle remaining text after last match
|
2060
|
+
if prev_end < len(text):
|
2061
|
+
remaining = text[prev_end:]
|
2062
|
+
if trim:
|
2063
|
+
remaining = remaining.strip()
|
2064
|
+
if remaining or not trim:
|
2065
|
+
if keep == "right" and parts and process_matches:
|
2066
|
+
last_match = process_matches[-1]
|
2067
|
+
matched_text = text[last_match["start"] : last_match["end"]]
|
2068
|
+
if as_dict:
|
2069
|
+
parts.append(
|
2070
|
+
{
|
2071
|
+
"text": matched_text + remaining,
|
2072
|
+
"start": last_match["start"],
|
2073
|
+
"end": len(text),
|
2074
|
+
"pattern": last_match["pattern"],
|
2075
|
+
**(
|
2076
|
+
{
|
2077
|
+
"positions": [
|
2078
|
+
(last_match["start"], last_match["end"])
|
2079
|
+
]
|
2080
|
+
}
|
2081
|
+
if positions
|
2082
|
+
else {}
|
2083
|
+
),
|
2084
|
+
}
|
2085
|
+
)
|
2086
|
+
else:
|
2087
|
+
parts.append(matched_text + remaining)
|
2088
|
+
else:
|
2089
|
+
if as_dict:
|
2090
|
+
parts.append(
|
2091
|
+
{
|
2092
|
+
"text": remaining,
|
2093
|
+
"start": prev_end,
|
2094
|
+
"end": len(text),
|
2095
|
+
"pattern": None,
|
2096
|
+
**({"positions": []} if positions else {}),
|
2097
|
+
}
|
2098
|
+
)
|
2099
|
+
else:
|
2100
|
+
parts.append(remaining)
|
2101
|
+
|
2102
|
+
# Filter empty parts if trimming
|
2103
|
+
if trim:
|
2104
|
+
parts = [p for p in parts if (p["text"].strip() if as_dict else p.strip())]
|
2105
|
+
|
2106
|
+
return parts
|
2107
|
+
|
2108
|
+
|
2109
|
+
def _merge_parts(
|
2110
|
+
parts: List[Union[str, dict]], text: str, as_dict: bool, trim: bool
|
2111
|
+
) -> Union[str, dict]:
|
2112
|
+
"""Merge adjacent parts for keep=left mode"""
|
2113
|
+
if as_dict:
|
2114
|
+
merged_text = "".join(p["text"] for p in parts)
|
2115
|
+
return {
|
2116
|
+
"text": merged_text.strip() if trim else merged_text,
|
2117
|
+
"start": parts[0]["start"],
|
2118
|
+
"end": parts[-1]["end"],
|
2119
|
+
"patterns": list(set(p["pattern"] for p in parts if p["pattern"])),
|
2120
|
+
}
|
2121
|
+
return "".join(parts).strip() if trim else "".join(parts)
|
2122
|
+
#! ===========extract_text===========
|
1735
2123
|
|
1736
2124
|
def strcmp(
|
1737
2125
|
search_term: str,
|
@@ -2794,73 +3182,6 @@ def text2audio(
|
|
2794
3182
|
|
2795
3183
|
# from datetime import datetime
|
2796
3184
|
from dateutil import parser
|
2797
|
-
# import re
|
2798
|
-
# from typing import Union, Optional, Dict, Any
|
2799
|
-
# def str2time(time_str, fmt="24"):
|
2800
|
-
# """
|
2801
|
-
# Convert a time string into the specified format.
|
2802
|
-
# Parameters:
|
2803
|
-
# - time_str (str): The time string to be converted.
|
2804
|
-
# - fmt (str): The format to convert the time to. Defaults to '%H:%M:%S'.
|
2805
|
-
# Returns:
|
2806
|
-
# %I represents the hour in 12-hour format.
|
2807
|
-
# %H represents the hour in 24-hour format (00 through 23).
|
2808
|
-
# %M represents the minute.
|
2809
|
-
# %S represents the second.
|
2810
|
-
# %p represents AM or PM.
|
2811
|
-
# - str: The converted time string.
|
2812
|
-
# """
|
2813
|
-
# from datetime import datetime
|
2814
|
-
|
2815
|
-
# def time_len_corr(time_str):
|
2816
|
-
# time_str_ = (
|
2817
|
-
# ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
|
2818
|
-
# )
|
2819
|
-
# time_str_split = []
|
2820
|
-
# [time_str_split.append(i) for i in time_str_ if is_num(i)]
|
2821
|
-
# if time_str_split:
|
2822
|
-
# if len(time_str_split) == 2:
|
2823
|
-
# H, M = time_str_split
|
2824
|
-
# time_str_full = H + ":" + M + ":00"
|
2825
|
-
# elif len(time_str_split) == 3:
|
2826
|
-
# H, M, S = time_str_split
|
2827
|
-
# time_str_full = H + ":" + M + ":" + S
|
2828
|
-
# else:
|
2829
|
-
# time_str_full = time_str_
|
2830
|
-
# if "am" in time_str.lower():
|
2831
|
-
# time_str_full += " AM"
|
2832
|
-
# elif "pm" in time_str.lower():
|
2833
|
-
# time_str_full += " PM"
|
2834
|
-
# return time_str_full
|
2835
|
-
|
2836
|
-
# if "12" in fmt:
|
2837
|
-
# fmt = "%I:%M:%S %p"
|
2838
|
-
# elif "24" in fmt:
|
2839
|
-
# fmt = "%H:%M:%S"
|
2840
|
-
|
2841
|
-
# try:
|
2842
|
-
# # Try to parse the time string assuming it could be in 24-hour or 12-hour format
|
2843
|
-
# time_obj = datetime.strptime(time_len_corr(time_str), "%H:%M:%S")
|
2844
|
-
# except ValueError:
|
2845
|
-
# try:
|
2846
|
-
# time_obj = datetime.strptime(time_len_corr(time_str), "%I:%M:%S %p")
|
2847
|
-
# except ValueError as e:
|
2848
|
-
# raise ValueError(f"Unable to parse time string: {time_str}. Error: {e}")
|
2849
|
-
|
2850
|
-
# # Format the time object to the desired output format
|
2851
|
-
# formatted_time = time_obj.strftime(fmt)
|
2852
|
-
# return formatted_time
|
2853
|
-
|
2854
|
-
|
2855
|
-
# # # Example usage:
|
2856
|
-
# # time_str1 = "14:30:45"
|
2857
|
-
# # time_str2 = "02:30:45 PM"
|
2858
|
-
|
2859
|
-
# # formatted_time1 = str2time(time_str1, fmt='12') # Convert to 12-hour format
|
2860
|
-
# # formatted_time2 = str2time(time_str2, fmt='24') # Convert to 24-hour format
|
2861
|
-
|
2862
|
-
# # print(formatted_time1) # Output: 02:30:45 PM
|
2863
|
-
# # print(formatted_time2) # Output: 14:30:45
|
2864
3185
|
def str2time(
|
2865
3186
|
time_str: str,
|
2866
3187
|
fmt: str = "24",
|
@@ -2964,57 +3285,6 @@ def str2time(
|
|
2964
3285
|
raise ValueError(f"Unable to parse time string: '{time_str}'. Error: {e}")
|
2965
3286
|
return default
|
2966
3287
|
|
2967
|
-
|
2968
|
-
# def str2date(date_str, original_fmt=None, fmt="%Y-%m-%d"):
|
2969
|
-
# """
|
2970
|
-
# Convert a date string to the desired format and extract components if needed.
|
2971
|
-
# Usage:
|
2972
|
-
# str2date(x, fmt="%d.%m.%y",original_fmt="%d.%m.%y")
|
2973
|
-
# Parameters:
|
2974
|
-
# - date_str (str): The input date string.
|
2975
|
-
# - original_fmt (str, optional): The original format of the date string. If not provided, it will be auto-detected.
|
2976
|
-
# - fmt (str): The desired format for the output date string. Defaults to '%Y-%m-%d'.
|
2977
|
-
|
2978
|
-
# Returns:
|
2979
|
-
# - dict: A dictionary containing the converted date string and its components (year, month, day).
|
2980
|
-
|
2981
|
-
# Raises:
|
2982
|
-
# - ValueError: If the date cannot be parsed.
|
2983
|
-
# """
|
2984
|
-
# from dateutil import parser
|
2985
|
-
# try:
|
2986
|
-
# if not isinstance(date_str,str):
|
2987
|
-
# date_str=str(date_str)
|
2988
|
-
# # Parse the date using the provided original format or auto-detect
|
2989
|
-
# if original_fmt:
|
2990
|
-
# try:
|
2991
|
-
# date_obj = datetime.strptime(date_str, original_fmt)
|
2992
|
-
# except Exception as e:
|
2993
|
-
# print(e)
|
2994
|
-
# date_obj=None
|
2995
|
-
# else:
|
2996
|
-
# try:
|
2997
|
-
# date_obj = parser.parse(date_str)
|
2998
|
-
# except Exception as e:
|
2999
|
-
# print(e)
|
3000
|
-
# date_obj=None
|
3001
|
-
# # Return formatted string if `fmt` is specified, otherwise return the datetime object
|
3002
|
-
# if date_obj is not None:
|
3003
|
-
# if fmt:
|
3004
|
-
# date_obj=date_obj.strftime(fmt)
|
3005
|
-
# else:
|
3006
|
-
# date_obj=date_str
|
3007
|
-
# return date_obj
|
3008
|
-
|
3009
|
-
# except (ValueError, TypeError) as e:
|
3010
|
-
# raise ValueError(f"Unable to process date string: '{date_str}'. Error: {e}")
|
3011
|
-
|
3012
|
-
|
3013
|
-
# # str1=str2date(num2str(20240625),fmt="%a %d-%B-%Y")
|
3014
|
-
# # print(str1)
|
3015
|
-
# # str2=str2num(str2date(str1,fmt='%a %Y%m%d'))
|
3016
|
-
# # print(str2)
|
3017
|
-
|
3018
3288
|
def str2date(
|
3019
3289
|
date_str: Union[str, int, float],
|
3020
3290
|
fmt: Optional[str] = "%Y-%m-%d",
|
@@ -4054,8 +4324,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
|
|
4054
4324
|
|
4055
4325
|
|
4056
4326
|
def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_results: bool = True, **kws):
|
4057
|
-
"""
|
4058
|
-
# Determines the splitting strategy:
|
4327
|
+
"""# Determines the splitting strategy:
|
4059
4328
|
# - "space", "whitespace", "sp": split by whitespace (default)
|
4060
4329
|
# - "word": split into words using NLTK's word_tokenize
|
4061
4330
|
# - "sentence", "sent": split into sentences using NLTK's sent_tokenize
|
@@ -4172,13 +4441,6 @@ def ssplit(text, by="space", verbose: bool =False, strict: bool =False, strip_re
|
|
4172
4441
|
|
4173
4442
|
def split_by_regex_end(text, pattern):
|
4174
4443
|
return re.split(f"(?={pattern})", text)
|
4175
|
-
|
4176
|
-
# def split_by_sentence_endings(text):
|
4177
|
-
# return re.split(r"(?<=[.!?])", text)
|
4178
|
-
# def split_non_ascii(text):
|
4179
|
-
# # return re.split(r"([^\x00-\x7F\w\s,.!?:\"'()\-]+)", text)
|
4180
|
-
# # return re.split(r"[^\x00-\x7F]+", text)
|
4181
|
-
# return re.split(r"([^\x00-\x7F]+)", text)
|
4182
4444
|
def split_non_ascii(text, keep_delimiters=False):
|
4183
4445
|
"""
|
4184
4446
|
Split text at non-ASCII characters.
|
@@ -4903,145 +5165,6 @@ def _backup_validations(sheet, verbose=False):
|
|
4903
5165
|
|
4904
5166
|
return backup
|
4905
5167
|
|
4906
|
-
# def _backup_validations(sheet):
|
4907
|
-
# """
|
4908
|
-
# Complete validation backup with XML-level cross-sheet detection
|
4909
|
-
# """
|
4910
|
-
# from openpyxl.utils import get_column_letter
|
4911
|
-
# import re
|
4912
|
-
# from openpyxl.worksheet.datavalidation import DataValidation
|
4913
|
-
# from openpyxl.xml.functions import fromstring
|
4914
|
-
|
4915
|
-
# backup = {
|
4916
|
-
# "validations": [],
|
4917
|
-
# "conditional_formatting": [],
|
4918
|
-
# "merged_cells": [str(mr) for mr in sheet.merged_cells.ranges],
|
4919
|
-
# "_metadata": {
|
4920
|
-
# "validated_cells": set(),
|
4921
|
-
# "validated_columns": set(),
|
4922
|
-
# "validation_types": set(),
|
4923
|
-
# "cross_sheet_validations": set()
|
4924
|
-
# }
|
4925
|
-
# }
|
4926
|
-
|
4927
|
-
# # METHOD 1: Primary validation backup (standard method)
|
4928
|
-
# for dv in sheet.data_validations:
|
4929
|
-
# # ... (existing standard validation backup code) ...
|
4930
|
-
|
4931
|
-
# # METHOD 2: XML-based cross-sheet validation detection
|
4932
|
-
# print("Performing deep XML scan for cross-sheet validations...")
|
4933
|
-
|
4934
|
-
# # Access the worksheet XML directly
|
4935
|
-
# xml_source = sheet._worksheet.xml
|
4936
|
-
# if not xml_source:
|
4937
|
-
# print("Warning: Could not access worksheet XML source")
|
4938
|
-
# return backup
|
4939
|
-
|
4940
|
-
# try:
|
4941
|
-
# # Parse the XML
|
4942
|
-
# root = fromstring(xml_source)
|
4943
|
-
# ns = {'ns': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
4944
|
-
|
4945
|
-
# # Find all dataValidation elements
|
4946
|
-
# for dv_xml in root.findall('.//ns:dataValidation', ns):
|
4947
|
-
# try:
|
4948
|
-
# # Extract validation attributes
|
4949
|
-
# dv_type = dv_xml.get('type', 'none')
|
4950
|
-
# formula1 = dv_xml.find('.//ns:formula1', ns)
|
4951
|
-
# formula_text = formula1.text if formula1 is not None else None
|
4952
|
-
|
4953
|
-
# # Skip if not a list type or no formula
|
4954
|
-
# if dv_type != 'list' or not formula_text:
|
4955
|
-
# continue
|
4956
|
-
|
4957
|
-
# # Clean the formula
|
4958
|
-
# clean_formula = formula_text.strip('"\'')
|
4959
|
-
|
4960
|
-
# # Check for cross-sheet patterns
|
4961
|
-
# cross_sheet_patterns = [
|
4962
|
-
# (r'^[\w\s]+!\$?[A-Za-z]+\$?\d+(?::\$?[A-Za-z]+\$?\d+)?$', "direct sheet reference"),
|
4963
|
-
# (r'INDIRECT\(["\'][\w\s]+![A-Za-z]+\d+(?::[A-Za-z]+\d+)?["\']\)', "INDIRECT sheet reference"),
|
4964
|
-
# (r'^[^\s!]+$', "potential named range"),
|
4965
|
-
# ]
|
4966
|
-
|
4967
|
-
# # Determine if this is a cross-sheet reference
|
4968
|
-
# is_cross_sheet = False
|
4969
|
-
# detection_method = ""
|
4970
|
-
|
4971
|
-
# for pattern, description in cross_sheet_patterns:
|
4972
|
-
# if re.match(pattern, clean_formula, re.IGNORECASE):
|
4973
|
-
# is_cross_sheet = True
|
4974
|
-
# detection_method = description
|
4975
|
-
# break
|
4976
|
-
|
4977
|
-
# if not is_cross_sheet:
|
4978
|
-
# continue
|
4979
|
-
|
4980
|
-
# # Process the ranges
|
4981
|
-
# ranges = []
|
4982
|
-
# sqref = dv_xml.get('sqref', '')
|
4983
|
-
# for range_str in sqref.split():
|
4984
|
-
# try:
|
4985
|
-
# # Convert range to coordinates
|
4986
|
-
# if ':' in range_str:
|
4987
|
-
# start, end = range_str.split(':')
|
4988
|
-
# col_start = int(''.join(filter(str.isdigit, start)))
|
4989
|
-
# col_end = int(''.join(filter(str.isdigit, end)))
|
4990
|
-
# row_start = int(''.join(filter(str.isalpha, start)))
|
4991
|
-
# row_end = int(''.join(filter(str.isalpha, end)))
|
4992
|
-
# ranges.append({
|
4993
|
-
# 'range': range_str,
|
4994
|
-
# 'cells': [f"{get_column_letter(col)}{row}"
|
4995
|
-
# for col in range(col_start, col_end+1)
|
4996
|
-
# for row in range(row_start, row_end+1)]
|
4997
|
-
# })
|
4998
|
-
# else:
|
4999
|
-
# col = int(''.join(filter(str.isdigit, range_str)))
|
5000
|
-
# row = int(''.join(filter(str.isalpha, range_str)))
|
5001
|
-
# ranges.append({
|
5002
|
-
# 'range': range_str,
|
5003
|
-
# 'cells': [f"{get_column_letter(col)}{row}"]
|
5004
|
-
# })
|
5005
|
-
# except Exception as e:
|
5006
|
-
# print(f"Error parsing range {range_str}: {e}")
|
5007
|
-
|
5008
|
-
# # Create validation record
|
5009
|
-
# validation_data = {
|
5010
|
-
# 'type': 'list',
|
5011
|
-
# 'formula1': formula_text,
|
5012
|
-
# 'formula2': None,
|
5013
|
-
# 'allow_blank': dv_xml.get('allowBlank', '1') == '1',
|
5014
|
-
# 'showDropDown': dv_xml.get('showDropDown', '1') == '1',
|
5015
|
-
# 'showInputMessage': dv_xml.get('showInputMessage', '1') == '1',
|
5016
|
-
# 'showErrorMessage': dv_xml.get('showErrorMessage', '0') == '1',
|
5017
|
-
# 'errorTitle': dv_xml.get('errorTitle', ''),
|
5018
|
-
# 'error': dv_xml.get('error', ''),
|
5019
|
-
# 'promptTitle': dv_xml.get('promptTitle', ''),
|
5020
|
-
# 'prompt': dv_xml.get('prompt', ''),
|
5021
|
-
# 'ranges': ranges,
|
5022
|
-
# '_source': 'xml_validation',
|
5023
|
-
# '_detection_method': detection_method,
|
5024
|
-
# '_is_cross_sheet': True,
|
5025
|
-
# '_formula_clean': clean_formula
|
5026
|
-
# }
|
5027
|
-
|
5028
|
-
# # Add to backup
|
5029
|
-
# backup['validations'].append(validation_data)
|
5030
|
-
# for rng in ranges:
|
5031
|
-
# for cell_ref in rng['cells']:
|
5032
|
-
# backup['_metadata']['validated_cells'].add(cell_ref)
|
5033
|
-
# backup['_metadata']['validated_columns'].add(''.join(filter(str.isalpha, cell_ref)))
|
5034
|
-
# backup['_metadata']['validation_types'].add('list')
|
5035
|
-
# backup['_metadata']['cross_sheet_validations'].add(clean_formula.split('!')[0])
|
5036
|
-
|
5037
|
-
# except Exception as e:
|
5038
|
-
# print(f"Error processing XML validation: {e}")
|
5039
|
-
|
5040
|
-
# except Exception as e:
|
5041
|
-
# print(f"Error parsing worksheet XML: {e}")
|
5042
|
-
|
5043
|
-
# return backup
|
5044
|
-
|
5045
5168
|
def _restore_validations(sheet, backup,verbose=False):
|
5046
5169
|
"""
|
5047
5170
|
恢复数据验证和条件格式规则到工作表
|
@@ -5247,11 +5370,6 @@ def fload(fpath, kind=None, **kwargs):
|
|
5247
5370
|
with open(fpath, "r") as file:
|
5248
5371
|
content = file.read()
|
5249
5372
|
return content
|
5250
|
-
|
5251
|
-
# def load_html(fpath):
|
5252
|
-
# with open(fpath, "r") as file:
|
5253
|
-
# content = file.read()
|
5254
|
-
# return content
|
5255
5373
|
def load_html(fpath, **kwargs):
|
5256
5374
|
return pd.read_html(fpath, **kwargs)
|
5257
5375
|
|
@@ -7118,7 +7236,7 @@ def listdir(
|
|
7118
7236
|
hidden=False, # Include hidden files/folders
|
7119
7237
|
orient="list",
|
7120
7238
|
output="df", # "df", 'list','dict','records','index','series'
|
7121
|
-
verbose=
|
7239
|
+
verbose=False,
|
7122
7240
|
):
|
7123
7241
|
def is_hidden(filepath):
|
7124
7242
|
"""Check if a file or folder is hidden."""
|
@@ -7348,7 +7466,7 @@ def listdir(
|
|
7348
7466
|
if "se" in orient.lower(): # records
|
7349
7467
|
return Box(f.to_dict(orient="series"))
|
7350
7468
|
|
7351
|
-
|
7469
|
+
|
7352
7470
|
def listpkg(where="env", verbose=False):
|
7353
7471
|
"""list all pacakages"""
|
7354
7472
|
|
@@ -7829,87 +7947,7 @@ def split_path(fpath):
|
|
7829
7947
|
dir_par = f_slash.join(fpath.split(f_slash)[:-1])
|
7830
7948
|
dir_ch = "".join(fpath.split(f_slash)[-1:])
|
7831
7949
|
return dir_par, dir_ch
|
7832
|
-
|
7833
|
-
|
7834
|
-
def figsave(*args, dpi=300, **kwargs):
|
7835
|
-
import matplotlib.pyplot as plt
|
7836
|
-
from PIL import Image
|
7837
|
-
bbox_inches = kwargs.pop("bbox_inches", "tight")
|
7838
|
-
pad_inches = kwargs.pop("pad_inches", 0)
|
7839
|
-
facecolor = kwargs.pop("facecolor", "white")
|
7840
|
-
edgecolor = kwargs.pop("edgecolor", "auto")
|
7841
|
-
|
7842
|
-
dir_save = None
|
7843
|
-
fname = None
|
7844
|
-
img = None
|
7845
|
-
|
7846
|
-
for arg in args:
|
7847
|
-
if isinstance(arg, str):
|
7848
|
-
path = Path(arg)
|
7849
|
-
if path.suffix: # Has file extension
|
7850
|
-
fname = path.name
|
7851
|
-
dir_save = path.parent
|
7852
|
-
else:
|
7853
|
-
dir_save = path
|
7854
|
-
elif isinstance(arg, (Image.Image, np.ndarray)):
|
7855
|
-
img = arg # Store PIL image or numpy array
|
7856
|
-
|
7857
|
-
# Set default save directory
|
7858
|
-
dir_save = Path(dir_save) if dir_save else Path(".")
|
7859
|
-
dir_save.mkdir(parents=True, exist_ok=True)
|
7860
|
-
|
7861
|
-
# Handle filename and extension
|
7862
|
-
if fname is None:
|
7863
|
-
fname = "figure"
|
7864
|
-
fname = dir_save / fname
|
7865
|
-
if fname.suffix == "":
|
7866
|
-
fname = fname.with_suffix(".pdf") # Default format
|
7867
|
-
|
7868
|
-
ftype = fname.suffix.lstrip(".").lower()
|
7869
|
-
|
7870
|
-
# Save figure based on file type
|
7871
|
-
if ftype == "eps":
|
7872
|
-
plt.savefig(fname, format="eps", bbox_inches=bbox_inches)
|
7873
|
-
plt.savefig(fname.with_suffix(".pdf"), format="pdf", dpi=dpi,
|
7874
|
-
pad_inches=pad_inches, bbox_inches=bbox_inches,
|
7875
|
-
facecolor=facecolor, edgecolor=edgecolor)
|
7876
|
-
elif ftype == "pdf":
|
7877
|
-
plt.savefig(fname, format="pdf", dpi=dpi, pad_inches=pad_inches,
|
7878
|
-
bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
|
7879
|
-
elif ftype in ["jpg", "jpeg", "png", "tiff", "tif"]:
|
7880
|
-
if img is not None: # If an image is provided
|
7881
|
-
if isinstance(img, Image.Image):
|
7882
|
-
img = img.convert("RGB") if img.mode == "RGBA" else img
|
7883
|
-
img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
|
7884
|
-
elif isinstance(img, np.ndarray):
|
7885
|
-
import cv2
|
7886
|
-
if img.ndim == 2:
|
7887
|
-
Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
|
7888
|
-
elif img.ndim == 3:
|
7889
|
-
if img.shape[2] == 3:
|
7890
|
-
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
7891
|
-
elif img.shape[2] == 4:
|
7892
|
-
img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
|
7893
|
-
Image.fromarray(img).save(fname, format=ftype.upper(), dpi=(dpi, dpi))
|
7894
|
-
else:
|
7895
|
-
raise ValueError("Unexpected image dimensions.")
|
7896
|
-
else:
|
7897
|
-
plt.savefig(fname, format=ftype, dpi=dpi, pad_inches=pad_inches,
|
7898
|
-
bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
|
7899
|
-
elif ftype == "ico":
|
7900
|
-
if img is None:
|
7901
|
-
plt.savefig(fname, dpi=dpi, pad_inches=pad_inches,
|
7902
|
-
bbox_inches=bbox_inches, facecolor=facecolor, edgecolor=edgecolor)
|
7903
|
-
img = Image.open(fname)
|
7904
|
-
img = img.convert("RGBA")
|
7905
|
-
icon_sizes = [(32, 32), (64, 64), (128, 128), (256, 256)]
|
7906
|
-
img.save(fname, format="ICO", sizes=icon_sizes)
|
7907
|
-
print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
|
7908
|
-
else:
|
7909
|
-
raise ValueError(f"Unsupported file format: {ftype}")
|
7910
|
-
|
7911
|
-
print(f"\nSaved @ {fname} (dpi={dpi})")
|
7912
|
-
|
7950
|
+
|
7913
7951
|
def figsave(*args, dpi=300, **kwargs):
|
7914
7952
|
"""
|
7915
7953
|
Save a Matplotlib figure or image file in various formats.
|
@@ -8038,7 +8076,7 @@ def figsave(*args, dpi=300, **kwargs):
|
|
8038
8076
|
img = img.convert("RGBA")
|
8039
8077
|
img.save(fname, format="ICO", sizes=icon_sizes)
|
8040
8078
|
print(f"Icon saved @: {fname} with sizes: {icon_sizes}")
|
8041
|
-
print(f"\
|
8079
|
+
print(f"\nSaved @: dpi={dpi}\n{fname}")
|
8042
8080
|
|
8043
8081
|
|
8044
8082
|
def is_str_color(s):
|
@@ -8806,7 +8844,8 @@ def detect_angle(image, by="median", template=None):
|
|
8806
8844
|
|
8807
8845
|
# Use Hough transform to detect lines
|
8808
8846
|
lines = transform.probabilistic_hough_line(edges)
|
8809
|
-
|
8847
|
+
if isinstance(by, bool):
|
8848
|
+
by="mean" if by else 0
|
8810
8849
|
if not lines and any(["me" in by, "pca" in by]):
|
8811
8850
|
print("No lines detected. Adjust the edge detection parameters.")
|
8812
8851
|
return 0
|
@@ -9180,7 +9219,7 @@ def imgsets(
|
|
9180
9219
|
elif "cro" in k.lower() or "cut" in k.lower():
|
9181
9220
|
img_update = img_update.crop(value)
|
9182
9221
|
elif "rota" in k.lower():
|
9183
|
-
if isinstance(value, str):
|
9222
|
+
if isinstance(value, (str,bool)):
|
9184
9223
|
value = detect_angle(img_update, by=value)
|
9185
9224
|
print(f"rotated by {value}°")
|
9186
9225
|
img_update = img_update.rotate(value)
|
@@ -9524,11 +9563,252 @@ def finfo(fpath, output='json', verbose=False):
|
|
9524
9563
|
extra_info=extra_info,
|
9525
9564
|
)
|
9526
9565
|
|
9527
|
-
|
9528
|
-
|
9566
|
+
def color2rgb(
|
9567
|
+
color_input: str | tuple | list | None,
|
9568
|
+
alpha: float | None = None
|
9569
|
+
) -> tuple | None:
|
9570
|
+
"""
|
9571
|
+
Ultimate color conversion utility with support for multiple formats and transparency.
|
9572
|
+
|
9573
|
+
Parameters:
|
9574
|
+
-----------
|
9575
|
+
color_input : str | tuple | list | None
|
9576
|
+
Supported formats:
|
9577
|
+
- Hex strings ("#RRGGBB", "#RGB")
|
9578
|
+
- Named colors ("red", "blue")
|
9579
|
+
- RGB tuples ((0.2, 0.4, 0.6))
|
9580
|
+
- RGBA tuples ((0.2, 0.4, 0.6, 0.8))
|
9581
|
+
- HTML/CSS colors ("cornflowerblue")
|
9582
|
+
- CSS formats:
|
9583
|
+
- rgb(100,200,50)
|
9584
|
+
- rgba(100,200,50,0.8)
|
9585
|
+
- hsl(120,60%,70%)
|
9586
|
+
- hsla(120,60%,70%,0.8)
|
9587
|
+
alpha : float | None, optional
|
9588
|
+
Opacity value (0.0-1.0). If provided, adds/overrides alpha channel.
|
9589
|
+
|
9590
|
+
Returns:
|
9591
|
+
--------
|
9592
|
+
tuple | None
|
9593
|
+
(R, G, B) or (R, G, B, A) tuple in 0-1 range, or None if invalid
|
9594
|
+
"""
|
9595
|
+
from matplotlib import colors as mcolors
|
9596
|
+
import re
|
9597
|
+
|
9598
|
+
if color_input is None:
|
9599
|
+
return None
|
9600
|
+
|
9601
|
+
# Case 1: Already in RGB/RGBA tuple format
|
9602
|
+
if isinstance(color_input, (tuple, list)):
|
9603
|
+
if 3 <= len(color_input) <= 4:
|
9604
|
+
if all(0 <= x <= 1 for x in color_input):
|
9605
|
+
if alpha is not None and len(color_input) == 3:
|
9606
|
+
return (*color_input, alpha)
|
9607
|
+
return tuple(color_input)
|
9608
|
+
|
9609
|
+
# Case 2: String input
|
9610
|
+
if isinstance(color_input, str):
|
9611
|
+
# Remove whitespace and make lowercase
|
9612
|
+
color_str = color_input.strip().lower()
|
9613
|
+
|
9614
|
+
# Handle CSS rgb/rgba format
|
9615
|
+
if color_str.startswith(('rgb(', 'rgba(')):
|
9616
|
+
try:
|
9617
|
+
nums = list(map(float, re.findall(r"[\d.]+", color_str)))
|
9618
|
+
if 3 <= len(nums) <= 4:
|
9619
|
+
rgb = tuple(x/255 if i < 3 else x for i, x in enumerate(nums))
|
9620
|
+
if alpha is not None:
|
9621
|
+
return (*rgb[:3], alpha)
|
9622
|
+
return rgb[:4] if len(rgb) == 4 else rgb[:3]
|
9623
|
+
except:
|
9624
|
+
pass
|
9625
|
+
|
9626
|
+
# Handle CSS hsl/hsla format
|
9627
|
+
elif color_str.startswith(('hsl(', 'hsla(')):
|
9628
|
+
try:
|
9629
|
+
nums = list(map(float, re.findall(r"[\d.]+", color_str)))
|
9630
|
+
if 3 <= len(nums) <= 4:
|
9631
|
+
h, s, l = nums[0]/360, nums[1]/100, nums[2]/100
|
9632
|
+
rgb = mcolors.hsv_to_rgb((h, s, l))
|
9633
|
+
if len(nums) == 4:
|
9634
|
+
rgb += (nums[3],)
|
9635
|
+
if alpha is not None:
|
9636
|
+
return (*rgb[:3], alpha)
|
9637
|
+
return rgb[:4] if len(rgb) == 4 else rgb[:3]
|
9638
|
+
except:
|
9639
|
+
pass
|
9640
|
+
|
9641
|
+
# Standard hex/named color processing
|
9642
|
+
try:
|
9643
|
+
rgb = mcolors.to_rgba(color_str)
|
9644
|
+
if alpha is not None:
|
9645
|
+
return (*rgb[:3], alpha)
|
9646
|
+
return rgb if len(rgb) == 4 and rgb[3] != 1 else rgb[:3]
|
9647
|
+
except ValueError:
|
9648
|
+
pass
|
9649
|
+
|
9650
|
+
# Fallback for invalid colors
|
9651
|
+
print(f"Warning: Invalid color format '{color_input}'")
|
9652
|
+
return None
|
9653
|
+
|
9654
|
+
def color2hex(
|
9655
|
+
color_input: str | tuple | list | dict | int | None,
|
9656
|
+
keep_alpha: bool = False,
|
9657
|
+
force_long: bool = False,
|
9658
|
+
uppercase: bool = False,
|
9659
|
+
prefix: str = "#",
|
9660
|
+
allow_short: bool = True
|
9661
|
+
) -> str | None:
|
9662
|
+
"""
|
9663
|
+
Ultimate color to hex converter with comprehensive format support.
|
9664
|
+
|
9665
|
+
Parameters:
|
9666
|
+
-----------
|
9667
|
+
color_input : str | tuple | list | dict | int | None
|
9668
|
+
Input color in any of these formats:
|
9669
|
+
- Hex strings ("#RRGGBB", "#RGB", "RRGGBB", "RGB")
|
9670
|
+
- Named colors ("red", "blue", "transparent")
|
9671
|
+
- RGB/RGBA tuples ((0.2, 0.4, 0.6), (255, 0, 0), (100, 100, 100, 0.5))
|
9672
|
+
- CSS formats:
|
9673
|
+
- rgb(100,200,50)
|
9674
|
+
- rgba(100,200,50,0.8)
|
9675
|
+
- hsl(120,60%,70%)
|
9676
|
+
- hsla(120,60%,70%,0.8)
|
9677
|
+
- Integer RGB (0xFF0000 for red)
|
9678
|
+
- Dictionary {"r": 255, "g": 0, "b": 0} or {"h": 0, "s": 100, "l": 50}
|
9679
|
+
keep_alpha : bool, optional
|
9680
|
+
Whether to include alpha channel in hex format (#RRGGBBAA)
|
9681
|
+
force_long : bool, optional
|
9682
|
+
Force 6/8-digit hex even when 3/4-digit would be possible
|
9683
|
+
uppercase : bool, optional
|
9684
|
+
Use uppercase hex characters (False for lowercase)
|
9685
|
+
prefix : str, optional
|
9686
|
+
Prefix for hex string ("#" for CSS, "0x" for programming, "" for raw)
|
9687
|
+
allow_short : bool, optional
|
9688
|
+
Allow shortened 3/4-digit hex when possible
|
9689
|
+
|
9690
|
+
Returns:
|
9691
|
+
--------
|
9692
|
+
str | None
|
9693
|
+
Hex color string or None if invalid
|
9694
|
+
|
9695
|
+
Examples:
|
9696
|
+
---------
|
9697
|
+
>>> color2hex((0.5, 0.2, 0.8)) → "#7f33cc"
|
9698
|
+
>>> color2hex("rgb(127, 51, 204)") → "#7f33cc"
|
9699
|
+
>>> color2hex((0.2, 0.4, 0.6, 0.8), True) → "#336699cc"
|
9700
|
+
>>> color2hex(0xFF0000, uppercase=True) → "#FF0000"
|
9701
|
+
>>> color2hex({"r": 255, "g": 165, "b": 0}, prefix="") → "ffa500"
|
9702
|
+
>>> color2hex("hsl(120, 100%, 50%)") → "#00ff00"
|
9703
|
+
"""
|
9704
|
+
from matplotlib import colors as mcolors
|
9705
|
+
import re
|
9706
|
+
|
9707
|
+
def to_rgba(color) -> tuple | None:
|
9708
|
+
"""Internal conversion to RGBA tuple"""
|
9709
|
+
# Handle None
|
9710
|
+
if color is None:
|
9711
|
+
return None
|
9712
|
+
|
9713
|
+
# Handle integer RGB
|
9714
|
+
if isinstance(color, int):
|
9715
|
+
if color < 0:
|
9716
|
+
return None
|
9717
|
+
return (
|
9718
|
+
(color >> 16) & 0xFF,
|
9719
|
+
(color >> 8) & 0xFF,
|
9720
|
+
color & 0xFF,
|
9721
|
+
255
|
9722
|
+
)
|
9723
|
+
|
9724
|
+
# Handle dictionary formats
|
9725
|
+
if isinstance(color, dict):
|
9726
|
+
keys = set(color.keys())
|
9727
|
+
if {'r','g','b'}.issubset(keys):
|
9728
|
+
return (
|
9729
|
+
color['r'] / 255 if color['r'] > 1 else color['r'],
|
9730
|
+
color['g'] / 255 if color['g'] > 1 else color['g'],
|
9731
|
+
color['b'] / 255 if color['b'] > 1 else color['b'],
|
9732
|
+
color.get('a', 1.0)
|
9733
|
+
)
|
9734
|
+
elif {'h','s','l'}.issubset(keys):
|
9735
|
+
return mcolors.hsv_to_rgb((
|
9736
|
+
color['h'] / 360,
|
9737
|
+
color['s'] / 100,
|
9738
|
+
color['l'] / 100
|
9739
|
+
)) + (color.get('a', 1.0),)
|
9740
|
+
return None
|
9741
|
+
|
9742
|
+
# Handle string formats
|
9743
|
+
if isinstance(color, str):
|
9744
|
+
color = color.strip().lower()
|
9745
|
+
|
9746
|
+
# Handle hex without prefix
|
9747
|
+
if re.match(r'^[0-9a-f]{3,8}$', color):
|
9748
|
+
return mcolors.to_rgba(f"#{color}")
|
9749
|
+
|
9750
|
+
# Handle CSS functions
|
9751
|
+
if color.startswith(('rgb(', 'rgba(', 'hsl(', 'hsla(')):
|
9752
|
+
try:
|
9753
|
+
return mcolors.to_rgba(color)
|
9754
|
+
except ValueError:
|
9755
|
+
return None
|
9756
|
+
|
9757
|
+
# Handle named colors (including 'transparent')
|
9758
|
+
try:
|
9759
|
+
return mcolors.to_rgba(color)
|
9760
|
+
except ValueError:
|
9761
|
+
return None
|
9762
|
+
|
9763
|
+
# Handle tuple/list formats
|
9764
|
+
if isinstance(color, (tuple, list)):
|
9765
|
+
if len(color) in (3, 4):
|
9766
|
+
# Normalize values
|
9767
|
+
normalized = []
|
9768
|
+
for i, v in enumerate(color):
|
9769
|
+
if i < 3: # RGB channels
|
9770
|
+
if isinstance(v, int):
|
9771
|
+
normalized.append(v / 255 if v > 1 else v)
|
9772
|
+
else:
|
9773
|
+
normalized.append(float(v))
|
9774
|
+
else: # Alpha channel
|
9775
|
+
normalized.append(float(v))
|
9776
|
+
return tuple(normalized)
|
9777
|
+
|
9778
|
+
return None
|
9779
|
+
|
9780
|
+
# Convert input to RGBA
|
9781
|
+
rgba = to_rgba(color_input)
|
9782
|
+
if rgba is None:
|
9783
|
+
return None
|
9784
|
+
|
9785
|
+
# Extract components
|
9786
|
+
components = []
|
9787
|
+
for i, c in enumerate(rgba):
|
9788
|
+
if i == 3 and not keep_alpha:
|
9789
|
+
break
|
9790
|
+
components.append(round(c * 255 if c <= 1 else c))
|
9791
|
+
|
9792
|
+
# Determine if we can use short format
|
9793
|
+
use_short = (allow_short and
|
9794
|
+
not force_long and
|
9795
|
+
len(components) in (3, 4) and
|
9796
|
+
all((x % 17 == 0) for x in components[:3]))
|
9797
|
+
|
9798
|
+
# Format the hex string
|
9799
|
+
if use_short:
|
9800
|
+
short_components = [x//17 for x in components[:3]] + components[3:]
|
9801
|
+
hex_str = "".join(f"{x:1x}" for x in short_components)
|
9802
|
+
else:
|
9803
|
+
hex_str = "".join(f"{x:02x}" for x in components)
|
9804
|
+
|
9805
|
+
# Apply case and prefix
|
9806
|
+
if uppercase:
|
9807
|
+
hex_str = hex_str.upper()
|
9808
|
+
|
9809
|
+
return f"{prefix}{hex_str}"
|
9529
9810
|
# ! format excel file
|
9530
|
-
|
9531
|
-
|
9811
|
+
|
9532
9812
|
def hex2argb(color):
|
9533
9813
|
"""
|
9534
9814
|
Convert a color name or hex code to aARGB format required by openpyxl.
|
@@ -9753,6 +10033,105 @@ def copy_format(
|
|
9753
10033
|
if "wb_target" in locals():
|
9754
10034
|
wb_target.close()
|
9755
10035
|
|
10036
|
+
def set_sheet_visible(
|
10037
|
+
fpath: str,
|
10038
|
+
sheet_name: Union[int, str, None,list] = 1,
|
10039
|
+
show: Union[bool, str] = True,
|
10040
|
+
exclude: Union[List[str], None,list,int] = None,
|
10041
|
+
verbose: bool = False
|
10042
|
+
) -> None:
|
10043
|
+
"""
|
10044
|
+
Modify sheet visibility in an Excel workbook.
|
10045
|
+
set_sheet_visible(fpath=dir_data_collection,sheet_name=None,show=1,verbose=1)
|
10046
|
+
Args:
|
10047
|
+
fpath (str): Path to the Excel workbook.
|
10048
|
+
sheet_name (int | str | None): Index or name of the sheet to apply visibility to.
|
10049
|
+
If None, all sheets are considered.
|
10050
|
+
show (bool | str): Visibility mode. Can be:
|
10051
|
+
- True -> visible
|
10052
|
+
- False -> veryHidden
|
10053
|
+
- 'visible', 'hidden', 'veryHidden' as str
|
10054
|
+
exclude (list[str] | None): List of sheet names to exclude from changes.
|
10055
|
+
verbose (bool): If True, logs actions.
|
10056
|
+
"""
|
10057
|
+
|
10058
|
+
try:
|
10059
|
+
wb = fload(fpath, output="bit", get_validations=1)
|
10060
|
+
except Exception as e:
|
10061
|
+
raise FileNotFoundError(f"Unable to load workbook: {e}")
|
10062
|
+
|
10063
|
+
sheet_names = wb.sheetnames
|
10064
|
+
if verbose:
|
10065
|
+
print("Workbook loaded with sheets:")
|
10066
|
+
for i, name in enumerate(sheet_names):
|
10067
|
+
print(f" [{i}] {name}")
|
10068
|
+
|
10069
|
+
excludes=[]
|
10070
|
+
if exclude is None:
|
10071
|
+
exclude=[]
|
10072
|
+
if ~isinstance(exclude, list):
|
10073
|
+
exclude = [exclude]
|
10074
|
+
for exclude_ in exclude:
|
10075
|
+
if isinstance(exclude_, str):
|
10076
|
+
excludes.append(strcmp(exclude_, sheet_names)[0])
|
10077
|
+
elif isinstance(exclude_, int):
|
10078
|
+
if 0 <= exclude_ < len(sheet_names):
|
10079
|
+
excludes.append(sheet_names[exclude_])
|
10080
|
+
else:
|
10081
|
+
raise IndexError(f"sheet_name index {exclude_} is out of range:0~{len(sheet_names)-1}.")
|
10082
|
+
|
10083
|
+
# Resolve the sheet_name target
|
10084
|
+
target_indices = []
|
10085
|
+
if not isinstance(sheet_name,list):
|
10086
|
+
sheet_name=[sheet_name]
|
10087
|
+
for sheet_name_ in sheet_name:
|
10088
|
+
if sheet_name_ is None:
|
10089
|
+
target_indices = list(range(len(sheet_names)))
|
10090
|
+
break
|
10091
|
+
elif isinstance(sheet_name_, int):
|
10092
|
+
if 0 <= sheet_name_ < len(sheet_names):
|
10093
|
+
target_indices.append(sheet_name_)
|
10094
|
+
else:
|
10095
|
+
raise IndexError(f"sheet_name index {sheet_name_} is out of range :0~{len(sheet_names)-1}.")
|
10096
|
+
elif isinstance(sheet_name_, str):
|
10097
|
+
idx = strcmp(sheet_name_, sheet_names)[1]
|
10098
|
+
if idx == -1:
|
10099
|
+
raise ValueError(f"Sheet '{sheet_name_}' not found.")
|
10100
|
+
target_indices.append(idx)
|
10101
|
+
|
10102
|
+
# Map show argument to valid state
|
10103
|
+
valid_states = ["veryHidden", "visible", "hidden"]
|
10104
|
+
if isinstance(show, str):
|
10105
|
+
if show not in valid_states:
|
10106
|
+
raise ValueError(f"Invalid show value '{show}'. Must be one of {valid_states}")
|
10107
|
+
state = show
|
10108
|
+
else:
|
10109
|
+
state = "visible" if show else "veryHidden"
|
10110
|
+
# Modify sheet visibility
|
10111
|
+
for idx in target_indices:
|
10112
|
+
ws= wb[sheet_names[idx]]
|
10113
|
+
if ws.title in excludes:
|
10114
|
+
if verbose:
|
10115
|
+
print(f"Skipping excluded sheet: '{ws.title}'")
|
10116
|
+
continue
|
10117
|
+
ws.sheet_state = state
|
10118
|
+
# Ensure at least one sheet is visible
|
10119
|
+
visible_sheets = [s for s in wb.worksheets if s.sheet_state == "visible"]
|
10120
|
+
not_visible_sheets = [s for s in wb.worksheets if s.sheet_state != "visible"]
|
10121
|
+
if not visible_sheets:
|
10122
|
+
fallback_sheet = wb.worksheets[0]
|
10123
|
+
fallback_sheet.sheet_state = "visible"
|
10124
|
+
if verbose:
|
10125
|
+
print(f"No visible sheets found. Setting '{fallback_sheet.title}' to visible.")
|
10126
|
+
if verbose:
|
10127
|
+
print(f"visible sheets:{[s.title for s in visible_sheets]}")
|
10128
|
+
|
10129
|
+
try:
|
10130
|
+
wb.save(fpath)
|
10131
|
+
except Exception as e:
|
10132
|
+
raise IOError(f"Error saving workbook: {e}")
|
10133
|
+
|
10134
|
+
|
9756
10135
|
def format_excel(
|
9757
10136
|
df: pd.DataFrame=None,
|
9758
10137
|
filename:str=None,
|
@@ -14580,209 +14959,213 @@ def df_reducer(
|
|
14580
14959
|
|
14581
14960
|
# example:
|
14582
14961
|
# df_reducer(data=data_log, columns=markers, n_components=2)
|
14962
|
+
|
14583
14963
|
|
14584
14964
|
|
14585
|
-
def get_df_format(data, threshold_unique=0.5, verbose=False):
|
14965
|
+
def get_df_format(data, threshold_unique=0.5, verbose=False, sample_size=1000):
|
14586
14966
|
"""
|
14587
|
-
|
14588
|
-
|
14967
|
+
Detect whether a DataFrame is in long or wide format with optimized performance and accuracy.
|
14968
|
+
|
14589
14969
|
Parameters:
|
14590
|
-
- data (pd.DataFrame): DataFrame to
|
14591
|
-
- threshold_unique (float):
|
14592
|
-
|
14970
|
+
- data (pd.DataFrame): DataFrame to analyze
|
14971
|
+
- threshold_unique (float): Threshold for categorical column detection (0-1)
|
14972
|
+
- verbose (bool): Whether to print diagnostic messages
|
14973
|
+
- sample_size (int): Maximum number of rows/columns to sample for large datasets
|
14974
|
+
|
14593
14975
|
Returns:
|
14594
|
-
- "long" if detected as long format
|
14976
|
+
- "long" if detected as long format
|
14595
14977
|
- "wide" if detected as wide format
|
14596
|
-
- "uncertain" if ambiguous
|
14978
|
+
- "uncertain" if format is ambiguous
|
14597
14979
|
"""
|
14980
|
+
import pandas as pd
|
14981
|
+
import numpy as np
|
14598
14982
|
from scipy.stats import entropy
|
14599
14983
|
from sklearn.cluster import AgglomerativeClustering
|
14600
14984
|
    from sklearn.preprocessing import StandardScaler
-    ...
-    # -----
-    if n_rows > fs:
-        if verbose:
-            print(f"Sampling {fs} rows from {n_rows} rows.")
-        data = data.sample(n=fs, random_state=1)
-    if n_cols > fs:
-        if verbose:
-            print(f"Using first {fs} columns out of {n_cols} columns.")
-        data = data.iloc[:, :fs]
+    from sklearn.metrics import pairwise_distances
+    from collections import Counter
+    import re
+    # ----- Initial Setup and Sampling -----
    n_rows, n_cols = data.shape
+    if verbose:
+        print(f"Initial shape: {n_rows} rows, {n_cols} columns")

-    #
-    if n_rows >
-    ...
+    # Sample data if too large
+    if n_rows > sample_size:
+        data = data.sample(n=sample_size, random_state=42)
+        n_rows = sample_size
+    if n_cols > sample_size:
+        data = data.iloc[:, :sample_size]
+        n_cols = sample_size
+
+    # Early exit for tiny datasets
+    if n_rows < 3 or n_cols < 3:
+        return "uncertain"
+
+    long_score = 0
+    wide_score = 0
+
+    # ----- Feature Extraction -----
+    # Basic statistics
+    row_col_ratio = n_rows / n_cols if n_cols != 0 else float('inf')
+
+    # Column types
+    numeric_cols = data.select_dtypes(include=np.number).columns
+    cat_cols = data.select_dtypes(include=['object', 'category']).columns
+    other_cols = [col for col in data.columns if col not in numeric_cols and col not in cat_cols]
+
+    # Unique value analysis
+    unique_counts = data.nunique(dropna=False)
    duplicate_ratio = 1 - unique_counts / n_rows
-    ...
-        if verbose:
-            print(
-                "Lower duplicate ratio suggests long format (higher row variability)."
-            )
-
-    # Calculate entropy for categorical columns
-    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
-    if len(categorical_cols) > 0:
-        for col in categorical_cols:
-            counts = data[col].value_counts(normalize=True)
-            col_entropy = entropy(counts)
-            if col_entropy < 1.5:
-                long_score += 1
-                if verbose:
-                    print(
-                        f"Column '{col}' entropy suggests categorical, supporting long format."
-                    )
-            else:
-                wide_score += 1
-                if verbose:
-                    print(f"Column '{col}' entropy is higher, supporting wide format.")
-
-    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+
+    # Missing values
+    missing_per_row = data.isna().sum(axis=1)
+    missing_per_col = data.isna().sum()
+
+    # Column name patterns
    col_names = data.columns.astype(str)
-    ...
+    has_suffix = sum(bool(re.search(r'(_\d+|\d+_?$)', col)) for col in col_names)
+    has_time = sum(bool(re.search(r'(^time|^date|^year|^month|^day|^t\d+)', col.lower())) for col in col_names)
+
+    # ----- Scoring Rules -----
+
+    # 1. Row-Column Ratio (weighted)
+    if row_col_ratio > 5:
+        long_score += 3
+        if verbose: print(f"High row/col ratio ({row_col_ratio:.1f}) → long +3")
+    elif row_col_ratio < 0.2:
+        wide_score += 3
+        if verbose: print(f"Low row/col ratio ({row_col_ratio:.1f}) → wide +3")
+    elif row_col_ratio > 2:
+        long_score += 1
+        if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → long +1")
+    elif row_col_ratio < 0.5:
+        wide_score += 1
+        if verbose: print(f"Moderate row/col ratio ({row_col_ratio:.1f}) → wide +1")
+
+    # 2. Duplication Patterns
+    high_dupe_cols = sum(duplicate_ratio > 0.3)
+    if high_dupe_cols > 0.6 * n_cols:
        wide_score += 2
-        if verbose:
-    ...
-    #
-    if len(
-    ...
+        if verbose: print(f"Many columns ({high_dupe_cols}/{n_cols}) with duplicates → wide +2")
+    elif high_dupe_cols < 0.2 * n_cols:
+        long_score += 1
+        if verbose: print(f"Few columns ({high_dupe_cols}/{n_cols}) with duplicates → long +1")
+
+    # 3. Categorical Column Analysis
+    if len(cat_cols) > 0:
+        # Entropy analysis
+        cat_entropies = []
+        for col in cat_cols:
+            counts = data[col].value_counts(normalize=True, dropna=False)
+            cat_entropies.append(entropy(counts))
+
+        avg_cat_entropy = np.mean(cat_entropies) if cat_entropies else 0
+        if avg_cat_entropy < 1.2:
            long_score += 2
-            if verbose:
-    ...
+            if verbose: print(f"Low categorical entropy ({avg_cat_entropy:.2f}) → long +2")
+        elif avg_cat_entropy > 2:
+            wide_score += 1
+            if verbose: print(f"High categorical entropy ({avg_cat_entropy:.2f}) → wide +1")
+
+        # Entity identifier detection
+        if len(cat_cols) >= 2 and n_rows > 10:
+            dup_rows = data.duplicated(subset=cat_cols.tolist()[:2], keep=False).sum()
+            if dup_rows > 0.3 * n_rows:
+                long_score += 2
+                if verbose: print(f"Duplicate rows in categorical cols ({dup_rows}/{n_rows}) → long +2")
+
+    # 4. Column Name Patterns
+    if has_suffix > 0.4 * n_cols:
+        wide_score += 2
+        if verbose: print(f"Many suffix patterns ({has_suffix}/{n_cols}) → wide +2")
+    if has_time > 0.3 * n_cols:
+        wide_score += 1
+        if verbose: print(f"Time-like columns ({has_time}/{n_cols}) → wide +1")
+
+    # 5. Numeric Column Analysis (only if enough numeric columns)
+    if len(numeric_cols) > 2:
+        # Correlation analysis
        corr_matrix = data[numeric_cols].corr().abs()
-        avg_corr = (
-    ...
-        if avg_corr > 0.6:
+        avg_corr = corr_matrix.values[np.triu_indices_from(corr_matrix, k=1)].mean()
+
+        if avg_corr > 0.5:
            wide_score += 2
-            if verbose:
-    ...
+            if verbose: print(f"High numeric correlation ({avg_corr:.2f}) → wide +2")
+        elif avg_corr < 0.2:
+            long_score += 1
+            if verbose: print(f"Low numeric correlation ({avg_corr:.2f}) → long +1")
+
+        # Entropy analysis
+        try:
+            numeric_data = data[numeric_cols].dropna()
+            if len(numeric_data) > 10:
+                numeric_entropy = numeric_data.apply(lambda x: entropy(pd.cut(x, bins=min(10, len(x.unique())).value_counts(normalize=True))))
+                if numeric_entropy.mean() < 1.5:
+                    wide_score += 1
+                    if verbose: print(f"Low numeric entropy ({numeric_entropy.mean():.2f}) → wide +1")
+        except Exception as e:
+            if verbose: print(f"Numeric entropy failed: {str(e)}")
+
+    # 6. Missing Value Patterns
+    missing_row_std = missing_per_row.std()
+    if missing_row_std < 1 and missing_per_row.mean() > 0.1 * n_cols:
        wide_score += 1
-        if verbose:
-
-                "Low variation in missing patterns across rows, supporting wide format."
-            )
-    elif missing_patterns.mean() < 1:
+        if verbose: print(f"Uniform missing pattern (std={missing_row_std:.2f}) → wide +1")
+    elif missing_per_row.mean() < 0.05 * n_cols:
        long_score += 1
-        if verbose:
-    ...
-    if len(numeric_cols) > 1 and n_rows > 5:
+        if verbose: print(f"Few missing values → long +1")
+
+    # 7. Advanced Clustering (only for medium/large datasets)
+    if len(numeric_cols) > 3 and n_rows > 10 and n_cols > 5:
        try:
-    ...
+            # Efficient clustering with sampling
+            sample_data = data[numeric_cols].sample(n=min(100, n_rows), random_state=42)
+            scaled_data = StandardScaler().fit_transform(sample_data.dropna())
+
+            if scaled_data.shape[0] > 5:
+                # Column clustering
+                col_dist = pairwise_distances(scaled_data.T)
+                col_clusters = AgglomerativeClustering(n_clusters=2,
+                                                       affinity='precomputed',
+                                                       linkage='complete').fit(col_dist)
+                cluster_counts = Counter(col_clusters.labels_)
+                if max(cluster_counts.values()) > 0.7 * len(numeric_cols):
+                    wide_score += 2
+                    if verbose: print(f"Column clustering shows dominant group → wide +2")
+
+                # Row clustering
+                row_clusters = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+                row_cluster_counts = Counter(row_clusters.labels_)
+                if max(row_cluster_counts.values()) > 0.8 * scaled_data.shape[0]:
+                    wide_score += 1
+                    if verbose: print(f"Row clustering shows homogeneity → wide +1")
        except Exception as e:
-    ...
-    #
-    ...
-    if
-    ...
-    if wide_score == long_score:
-        if n_cols > n_rows:
-            wide_score += 1
-            if verbose:
-                print(
-                    "Tie-breaking based on column-major structure, favoring wide format."
-                )
-        elif n_rows > n_cols:
-            long_score += 1
-            if verbose:
-                print(
-                    "Tie-breaking based on row-major structure, favoring long format."
-                )
-        else:
-            if verbose:
-                print("Tie-breaking inconclusive; returning 'uncertain'.")
-            return "uncertain"
-
-    # Final decision
-    if wide_score > long_score:
-        if verbose:
-            print("Final decision: Wide format.")
-        return "wide"
-    elif long_score > wide_score:
-        if verbose:
-            print("Final decision: Long format.")
-        return "long"
+            if verbose: print(f"Clustering skipped: {str(e)}")
+
+    # ----- Decision Logic -----
+    score_diff = long_score - wide_score
+    abs_diff = abs(score_diff)
+
+    if verbose:
+        print(f"\nFinal scores - Long: {long_score}, Wide: {wide_score}")
+
+    if abs_diff >= 3:
+        return "long" if score_diff > 0 else "wide"
+    elif abs_diff >= 1:
+        # Additional tie-breakers
+        if score_diff == 0:
+            if row_col_ratio > 1.5:
+                return "long"
+            elif row_col_ratio < 0.67:
+                return "wide"
+            elif len(cat_cols) > len(numeric_cols):
+                return "long"
+            else:
+                return "wide"
+        return "long" if score_diff > 0 else "wide"
    else:
-        if verbose:
-            print("Final decision: Uncertain format.")
        return "uncertain"
-
#! ========== workbook, worksheet, wb,ws =============

import openpyxl
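For orientation: the hunk above replaces the old fs-based sampling and per-column entropy checks with a weighted long/wide scoring scheme (row/column ratio, duplication, categorical entropy, column-name patterns, numeric correlation, missing-value patterns, clustering). The standalone sketch below is illustrative only and is not part of the diff; it mirrors just two of those rules (the ratio rule and the numeric-suffix pattern) on synthetic frames, with thresholds copied from the added lines. The toy_orientation_score helper, the data, and the simplified tie-breaking are hypothetical.

import re
import numpy as np
import pandas as pd

# Long-style toy frame: one measurement per row (18 rows x 3 columns).
long_df = pd.DataFrame({
    "subject": np.repeat([f"s{i}" for i in range(6)], 3),
    "condition": ["a", "b", "c"] * 6,
    "score": np.random.default_rng(0).normal(size=18),
})

# Wide-style toy frame: one row per subject, repeated-measure columns (3 rows x 7 columns).
wide_df = pd.DataFrame(
    np.random.default_rng(1).normal(size=(3, 6)),
    columns=[f"score_{i}" for i in range(1, 7)],
).assign(subject=["s0", "s1", "s2"])

def toy_orientation_score(df):
    # Only the ratio rule (#1) and the suffix-pattern rule (#4) from the diff above.
    n_rows, n_cols = df.shape
    ratio = n_rows / n_cols if n_cols else float("inf")
    suffixed = sum(bool(re.search(r"(_\d+|\d+_?$)", c)) for c in df.columns.astype(str))
    long_score = 3 if ratio > 5 else (1 if ratio > 2 else 0)
    wide_score = (3 if ratio < 0.2 else (1 if ratio < 0.5 else 0)) + (2 if suffixed > 0.4 * n_cols else 0)
    if long_score == wide_score:
        return "uncertain"
    return "long" if long_score > wide_score else "wide"

print(toy_orientation_score(long_df))  # "long": 18/3 = 6.0 exceeds the >5 threshold
print(toy_orientation_score(wide_df))  # "wide": 3/7 is below 0.5 and 6 of 7 columns carry numeric suffixes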
@@ -17221,3 +17604,290 @@ def set_theme(
        color_codes=color_codes,
        rc=rc_params,
    )
+
+
+
+def df_wide_long(df):
+    rows, columns = df.shape
+    if columns > rows:
+        return "Wide"
+    elif rows > columns:
+        return "Long"
+
+def df2array(data: pd.DataFrame, x=None, y=None, hue=None, sort=False):
+
+    def sort_rows_move_nan(arr, sort=False):
+        # Handle edge cases where all values are NaN
+        if np.all(np.isnan(arr)):
+            return arr  # Return unchanged if the entire array is NaN
+
+        if sort:
+            # Replace NaNs with a temporary large value for sorting
+            temp_value = (
+                np.nanmax(arr[np.isfinite(arr)]) + 1 if np.any(np.isfinite(arr)) else np.inf
+            )
+            arr_no_nan = np.where(np.isnan(arr), temp_value, arr)
+
+            # Sort each row
+            sorted_arr = np.sort(arr_no_nan, axis=1)
+
+            # Move NaNs to the end
+            result_arr = np.where(sorted_arr == temp_value, np.nan, sorted_arr)
+        else:
+            result_rows = []
+            for row in arr:
+                # Separate non-NaN and NaN values
+                non_nan_values = row[~np.isnan(row)]
+                nan_count = np.isnan(row).sum()
+                # Create a new row with non-NaN values followed by NaNs
+                new_row = np.concatenate([non_nan_values, [np.nan] * nan_count])
+                result_rows.append(new_row)
+            # Convert the list of rows back into a 2D NumPy array
+            result_arr = np.array(result_rows)
+
+        # Remove rows/columns that contain only NaNs
+        clean_arr = result_arr[~np.isnan(result_arr).all(axis=1)]
+        clean_arr_ = clean_arr[:, ~np.isnan(clean_arr).all(axis=0)]
+
+        return clean_arr_
+    # data = data.copy()
+    # data[y] = pd.to_numeric(data[y], errors="coerce")
+    # data = data.dropna(subset=[y])
+    if hue is None:
+        a = []
+        if sort:
+            cat_x = np.sort(data[x].unique().tolist()).tolist()
+        else:
+            cat_x = data[x].unique().tolist()
+        for i, x_ in enumerate(cat_x):
+            new_ = data.loc[data[x] == x_, y].to_list()
+            a = padcat(a, new_, axis=0)
+        return sort_rows_move_nan(a).T
+    else:
+        a = []
+        if sort:
+            cat_x = np.sort(data[x].unique().tolist()).tolist()
+            cat_hue = np.sort(data[hue].unique().tolist()).tolist()
+        else:
+            cat_x = data[x].unique().tolist()
+            cat_hue = data[hue].unique().tolist()
+        for i, x_ in enumerate(cat_x):
+            for j, hue_ in enumerate(cat_hue):
+                new_ = data.loc[(data[x] == x_) & (data[hue] == hue_), y].to_list()
+                a = padcat(a, new_, axis=0)
+        return sort_rows_move_nan(a).T
+
+
+def array2df(data: np.ndarray):
+    df = pd.DataFrame()
+    df["group"] = (
+        np.tile(
+            ["group" + str(i) for i in range(1, data.shape[1] + 1)], [data.shape[0], 1]
+        )
+        .reshape(-1, 1, order="F")[:, 0]
+        .tolist()
+    )
+    df["value"] = data.reshape(-1, 1, order="F")
+    return df
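A brief usage sketch for the converters added above (illustrative, not part of the diff). It assumes this release of py2ls is installed so that df2array can be imported from py2ls.ips, and it relies on the padcat helper defined a few lines further down.

import pandas as pd
from py2ls.ips import df2array  # assumption: py2ls 0.2.5.14 is installed

long_df = pd.DataFrame({
    "group": ["a"] * 4 + ["b"] * 3,
    "value": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
})

# One column per level of `x`; the shorter group is padded with NaN,
# so the result here should be a 4 x 2 float array.
arr = df2array(long_df, x="group", y="value")
print(arr.shape)

# array2df goes the other way: a 2D array becomes a two-column long frame
# with generated "group1", "group2", ... labels in the "group" column.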
+
+
+def padcat(*args, fill_value=np.nan, axis=1, order="row"):
+    """
+    Concatenate vectors with padding.
+
+    Parameters:
+    *args : variable number of list or 1D arrays
+        Input arrays to concatenate.
+    fill_value : scalar, optional
+        The value to use for padding the shorter lists (default is np.nan).
+    axis : int, optional
+        The axis along which to concatenate (0 for rows, 1 for columns, default is 1).
+    order : str, optional
+        The order for flattening when required: "row" or "column" (default is "row").
+
+    Returns:
+    np.ndarray
+        A 2D array with the input arrays concatenated along the specified axis,
+        padded with fill_value where necessary.
+
+
+    # Example usage:
+    a = [1, np.nan]
+    b = [1, 3, 4, np.nan, 2, np.nan]
+    c = [1, 2, 3, 4, 5, 6, 7, 8, 10]
+    d = padcat(a, b)
+    result1 = padcat(d, c)
+    result2 = padcat(a, b, c)
+    print("Result of padcat(d, c):\n", result1)
+    print("Result of padcat(a, b, c):\n", result2)
+    """
+    # Set the order for processing
+    if "ro" in order.lower():
+        order = "C"  # row-major order
+    else:
+        order = "F"  # column-major order
+
+    # Process input arrays based on their dimensions
+    processed_arrays = []
+    for arg in args:
+        arr = np.asarray(arg)
+        if arr.ndim == 1:
+            processed_arrays.append(arr)  # Keep 1D arrays as is
+        elif arr.ndim == 2:
+            if axis == 0:
+                # If concatenating along rows, split 2D arrays into 1D arrays row-wise
+                processed_arrays.extend(arr)
+            elif axis == 1:
+                # If concatenating along columns, split 2D arrays into 1D arrays column-wise
+                processed_arrays.extend(arr.T)
+            else:
+                raise ValueError("axis must be 0 or 1")
+        else:
+            raise ValueError("Input arrays must be 1D or 2D")
+
+    if axis == 0:
+        # Concatenate along rows
+        max_len = max(arr.size for arr in processed_arrays)
+        result = np.full((len(processed_arrays), max_len), fill_value)
+        for i, arr in enumerate(processed_arrays):
+            result[i, : arr.size] = arr
+    elif axis == 1:
+        # Concatenate along columns
+        max_len = max(arr.size for arr in processed_arrays)
+        result = np.full((max_len, len(processed_arrays)), fill_value)
+        for i, arr in enumerate(processed_arrays):
+            result[: arr.size, i] = arr
+    else:
+        raise ValueError("axis must be 0 or 1")
+
+    return result
+
+
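A worked example of padcat, following its own docstring above (illustrative, not part of the diff; the import again assumes this release of py2ls is installed).

import numpy as np
from py2ls.ips import padcat  # assumption: py2ls 0.2.5.14 is installed

a = [1, np.nan]
b = [1, 3, 4, np.nan, 2, np.nan]
c = [1, 2, 3, 4, 5, 6, 7, 8, 10]

# Default axis=1: inputs become columns, each padded with NaN up to the
# longest input, so padcat(a, b, c) is a 9 x 3 array.
print(padcat(a, b, c).shape)          # (9, 3)

# axis=0: inputs become padded rows instead.
print(padcat(a, b, c, axis=0).shape)  # (3, 9)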
+# ========== memory cleaner ==========
+import gc
+import os
+import sys
+import psutil
+import platform
+import ctypes
+import subprocess
+import warnings
+import time
+
+class MemoryOptimizer:
+    def __init__(self, verbose: bool = True, aggressive_mode: bool = True):
+        self.verbose = verbose
+        self.aggressive_mode = aggressive_mode
+        self.system = platform.system()
+        self.process = psutil.Process(os.getpid())
+        self.start_time = time.time()
+        self.memory_history = []
+
+    def log(self, msg: str, level: str = "INFO"):
+        if self.verbose:
+            rss = self.process.memory_info().rss / (1024 ** 2)
+            elapsed = time.time() - self.start_time
+            print(f"[{level}][{elapsed:.2f}s][{rss:.1f}MB] {msg}")
+
+    def collect_garbage(self):
+        self.log("Performing deep garbage collection...")
+        stats = {}
+        before_mem = self.process.memory_info().rss
+        for gen in reversed(range(3)):
+            collected = gc.collect(gen)
+            self.log(f"GC Gen {gen}: Collected {collected}")
+        gc.garbage.clear()
+        after_mem = self.process.memory_info().rss
+        stats['freed_mb'] = (before_mem - after_mem) / (1024 ** 2)
+        return stats
+
+    def clear_frameworks(self):
+        result = {}
+        try:
+            import torch
+            if torch.cuda.is_available():
+                self.log("Clearing PyTorch cache...")
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+                result['pytorch'] = 'cleared'
+        except Exception as e:
+            self.log(f"PyTorch skipped: {e}", "WARNING")
+
+        try:
+            import tensorflow as tf
+            self.log("Clearing TensorFlow session...")
+            tf.keras.backend.clear_session()
+            result['tensorflow'] = 'cleared'
+        except Exception as e:
+            self.log(f"TensorFlow skipped: {e}", "WARNING")
+
+        try:
+            import cv2
+            self.log("Closing OpenCV windows...")
+            cv2.destroyAllWindows()
+            result['opencv'] = 'cleared'
+        except Exception:
+            pass
+
+        try:
+            import matplotlib.pyplot as plt
+            self.log("Closing matplotlib figures...")
+            plt.close('all')
+            result['matplotlib'] = 'cleared'
+        except Exception:
+            pass
+
+        return result
+
+    def clear_system_caches(self):
+        result = {}
+        self.log("Attempting full system cache clearance...")
+        try:
+            if self.system == "Linux":
+                subprocess.run(["sync"], check=True)
+                subprocess.run(["sudo", "sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"], check=True)
+                result['linux'] = 'caches dropped'
+            elif self.system == "Darwin":
+                subprocess.run(["sudo", "purge"], check=True)
+                result['macos'] = 'purge run'
+            elif self.system == "Windows":
+                ctypes.windll.psapi.EmptyWorkingSet(-1)
+                if self.aggressive_mode:
+                    ctypes.windll.kernel32.SetProcessWorkingSetSizeEx(
+                        -1, ctypes.c_size_t(-1), ctypes.c_size_t(-1), ctypes.c_uint(0x1)
+                    )
+                result['windows'] = 'working set emptied'
+        except Exception as e:
+            self.log(f"System cache clearing failed: {e}", "ERROR")
+        return result
+
+    def profile(self) -> Dict[str, Any]:
+        mem = self.process.memory_info()
+        vm = psutil.virtual_memory()
+        profile = {
+            'rss_mb': mem.rss / (1024 ** 2),
+            'vms_mb': mem.vms / (1024 ** 2),
+            'used_gb': vm.used / (1024 ** 3),
+            'available_gb': vm.available / (1024 ** 3),
+            'percent': vm.percent,
+        }
+        self.memory_history.append(profile)
+        return profile
+
+    def optimize(self) -> Dict[str, Any]:
+        result = {}
+        result['before'] = self.profile()
+        result['gc'] = self.collect_garbage()
+        result['frameworks'] = self.clear_frameworks()
+        result['system'] = self.clear_system_caches()
+        result['after'] = self.profile()
+        saved = result['before']['rss_mb'] - result['after']['rss_mb']
+        result['saved_mb'] = saved
+        result['saved_percent'] = (saved / result['before']['rss_mb']) * 100 if result['before']['rss_mb'] else 0
+        self.log(f"Optimization complete: Saved {saved:.2f} MB ({result['saved_percent']:.1f}%)", "SUCCESS")
+        return result
+
+
+def cleaner(verbose: bool = True, aggressive: bool = True) -> Dict[str, Any]:
+    optimizer = MemoryOptimizer(verbose=verbose, aggressive_mode=aggressive)
+    return optimizer.optimize()
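Finally, a usage sketch for the new memory cleaner (illustrative, not part of the diff): cleaner() simply wraps MemoryOptimizer.optimize() as defined above, and the keys read below are the ones optimize() populates. Note that clear_system_caches() may invoke sudo on Linux/macOS, and the import assumes this release of py2ls is installed.

from py2ls.ips import cleaner  # assumption: py2ls 0.2.5.14 is installed

report = cleaner(verbose=True, aggressive=False)
print(f"freed ~{report['saved_mb']:.1f} MB "
      f"({report['saved_percent']:.1f}% of the process RSS)")
# report["before"] / report["after"] hold the psutil-based snapshots;
# report["gc"], report["frameworks"], report["system"] hold the per-step results.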