py2ls 0.2.5.14__py3-none-any.whl → 0.2.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/__init__.py +1 -5
- py2ls/ips.py +1446 -70
- {py2ls-0.2.5.14.dist-info → py2ls-0.2.6.1.dist-info}/METADATA +89 -232
- {py2ls-0.2.5.14.dist-info → py2ls-0.2.6.1.dist-info}/RECORD +7 -8
- py2ls/ips_lab.py +0 -17172
- {py2ls-0.2.5.14.dist-info → py2ls-0.2.6.1.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -1,18 +1,18 @@
 from tkinter import FALSE
 import numpy as np
 import pandas as pd
-import sys
-import os
+import sys # built-in
+import os # built-in
 from IPython.display import display
 import shutil
 import logging
 from pathlib import Path
 from datetime import datetime, date, time
-import re
+import re # built-in
 import stat
 import platform
 
-from typing import Dict, List, Optional, Union, Any, Tuple, Literal
+from typing import Dict, List, Optional, Union, Any, Tuple, Literal,Callable
 from regex import X
 
 try:
@@ -26,7 +26,218 @@ import warnings
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
 warnings.filterwarnings("ignore")
+try:
+    import pkg_resources
+except ImportError:
+    pkg_resources = None
+import glob # built-in
+import pkg_resources # built-in
+class PkgManager:
+    """
+    PkgManager.uninstall("py2ls")
+    PkgManager.uninstall("py2ls", mode="startswith")
+    PkgManager.uninstall("py2ls", mode="endswith")
+    PkgManager.uninstall("py2ls", mode="contains")
+    PkgManager.uninstall("py2ls", mode="regex")
+
+    PkgManager.timemachine()
+    """
 
+    @staticmethod
+    def uninstall(
+        kw: Union[str, List[str]],
+        mode: str = "exact",
+        dry_run: bool = False,
+        make_backup: bool = True,
+        make_log: bool = True,
+        station: Optional[str] = None,
+    ) -> None:
+        if station is None:
+            station = os.path.dirname(os.path.dirname(sys.executable))
+        os.makedirs(station, exist_ok=True)
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+        if isinstance(kw, str):
+            kw = [kw]
+        kw = [k.lower() for k in kw] if mode != "regex" else kw
+        mode = mode.lower()
+        valid_modes = {"exact", "startswith", "endswith", "contains", "regex"}
+        if mode not in valid_modes:
+            raise ValueError(f"Mode must be one of {valid_modes}")
+
+        installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
+        matched: Set[str] = set()
+
+        for name in installed_packages:
+            for key in kw:
+                if (
+                    (mode == "exact" and name == key)
+                    or (mode == "startswith" and name.startswith(key))
+                    or (mode == "endswith" and name.endswith(key))
+                    or (mode == "contains" and key in name)
+                    or (mode == "regex" and re.search(key, name))
+                ):
+                    matched.add(name)
+                    break
+
+        if not matched:
+            print("No packages matched the criteria.")
+            return
+
+        if make_backup and not dry_run:
+            backup_path = os.path.join(station, f"requirements_backup_{timestamp}.txt")
+            with open(backup_path, "w") as f:
+                subprocess.run(["pip", "freeze"], stdout=f, check=True)
+            print(f"Backup created at: '{backup_path}'")
+
+        if dry_run:
+            print("[DRY RUN] The following packages would be uninstalled:")
+            for pkg in sorted(matched):
+                print(f" - {pkg}=={installed_packages[pkg]}")
+            return
+
+        print(f"[UNINSTALLING] {len(matched)} packages:")
+        for pkg in sorted(matched):
+            print(f" - {pkg}=={installed_packages[pkg]}")
+            subprocess.run(["pip", "uninstall", "-y", pkg], check=True)
+
+        if make_log:
+            log_path = os.path.join(station, f"uninstall_{timestamp}.txt")
+            with open(log_path, "w") as f:
+                f.write(f"# Uninstallation log created at {timestamp}\n")
+                f.write(f"# Mode: {mode}, Keywords: {kw}\n\n")
+                for pkg in sorted(matched):
+                    f.write(f"{pkg}=={installed_packages[pkg]}\n")
+            print(f"Log written to '{log_path}'")
+
+    @staticmethod
+    def list_backups(station: Optional[str] = None) -> List[str]:
+        if station is None:
+            station = os.path.dirname(sys.executable)
+            if os.name == "nt":
+                station = os.path.dirname(station)
+        return sorted(glob.glob(os.path.join(station, "requirements_backup_*.txt")))
+
+    @staticmethod
+    def list_logs(station: Optional[str] = None) -> List[str]:
+        if station is None:
+            station = os.path.dirname(sys.executable)
+            if os.name == "nt":
+                station = os.path.dirname(station)
+        return sorted(glob.glob(os.path.join(station, "uninstall_*.txt")))
+
+    @staticmethod
+    def restore(
+        timestamp: Optional[str] = None,
+        station: Optional[str] = None,
+        dry_run: bool = False,
+    ) -> None:
+        if station is None:
+            station = os.path.dirname(sys.executable)
+            if os.name == "nt":
+                station = os.path.dirname(station)
+
+        backups = PkgManager.list_backups(station)
+        logs = PkgManager.list_logs(station)
+
+        if not timestamp:
+            print("Available restore points:\n\nBackups:")
+            for i, backup in enumerate(backups, 1):
+                ts = os.path.basename(backup)[18:-4]
+                print(f" {i}. {ts} (backup)")
+            print("\nUninstall logs:")
+            for i, log in enumerate(logs, len(backups) + 1):
+                ts = os.path.basename(log)[10:-4]
+                print(f" {i}. {ts} (log)")
+            print("\nSpecify timestamp or selection number to restore.")
+            return
+
+        try:
+            selection = int(timestamp)
+            all_files = backups + logs
+            if 1 <= selection <= len(all_files):
+                file_path = all_files[selection - 1]
+                is_log = selection > len(backups)
+            else:
+                raise ValueError("Invalid selection number")
+        except ValueError:
+            backup_pattern = os.path.join(
+                station, f"requirements_backup_{timestamp}.txt"
+            )
+            log_pattern = os.path.join(station, f"uninstall_{timestamp}.txt")
+            matching_backups = glob.glob(backup_pattern)
+            matching_logs = glob.glob(log_pattern)
+
+            if matching_backups:
+                file_path = matching_backups[0]
+                is_log = False
+            elif matching_logs:
+                file_path = matching_logs[0]
+                is_log = True
+            else:
+                print(f"No backup or log found for timestamp: {timestamp}")
+                return
+
+        with open(file_path, "r") as f:
+            packages = [
+                line.strip() for line in f if line.strip() and not line.startswith("#")
+            ]
+
+        if dry_run:
+            print(
+                f"[DRY RUN] Would restore {len(packages)} packages from:\n {file_path}"
+            )
+            for pkg in packages:
+                print(f" - {pkg}")
+            return
+
+        print(f"[RESTORING] {len(packages)} packages from:\n {file_path}")
+        for pkg in packages:
+            print(f" - Installing {pkg}")
+            subprocess.run(["pip", "install", pkg], check=True)
+
+    @staticmethod
+    def timemachine(station: Optional[str] = None) -> None:
+        if station is None:
+            station = os.path.dirname(sys.executable)
+            if os.name == "nt":
+                station = os.path.dirname(station)
+
+        backups = PkgManager.list_backups(station)
+        logs = PkgManager.list_logs(station)
+
+        if not backups and not logs:
+            print("No backup or log files found.")
+            return
+
+        print("\nTime Machine - Available Restore Points:")
+        print("--------------------------------------")
+        print("\nBackups (complete environment snapshots):")
+        for i, backup in enumerate(backups, 1):
+            ts = os.path.basename(backup)[18:-4]
+            print(f" {i}. {ts}")
+        print("\nUninstall Logs (specific package lists):")
+        for i, log in enumerate(logs, len(backups) + 1):
+            ts = os.path.basename(log)[10:-4]
+            print(f" {i}. {ts}")
+        print("\n0. Exit Time Machine")
+
+        while True:
+            try:
+                choice = input("\nSelect a restore point (number) or '0' to exit: ")
+                if choice == "0":
+                    return
+                selection = int(choice)
+                all_files = backups + logs
+                if 1 <= selection <= len(all_files):
+                    file_path = all_files[selection - 1]
+                    timestamp = os.path.basename(file_path).split("_")[-1][:-4]
+                    PkgManager.restore(timestamp, station)
+                    return
+                else:
+                    print("Invalid selection. Please try again.")
+            except ValueError:
+                print("Please enter a valid number.")
 
 def _yaoshi_fernet(mima="mimashigudingde",yan=b"mimashigudingde",verbose=True):
     import base64
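The new PkgManager class is driven by the call patterns listed in its docstring. Below is a minimal usage sketch, not part of the package itself: it assumes py2ls 0.2.6.1 is installed and that pip, pkg_resources, and subprocess are available in the environment, and it uses "py2ls" purely as an illustrative package name.

```python
# Hypothetical usage sketch of the PkgManager added in py2ls 0.2.6.1.
from py2ls.ips import PkgManager

# Preview what a prefix-based uninstall would remove, without touching anything.
PkgManager.uninstall("py2ls", mode="startswith", dry_run=True)

# Actually uninstall, writing a requirements backup and an uninstall log first.
PkgManager.uninstall(["py2ls"], mode="exact", make_backup=True, make_log=True)

# Later, list restore points and roll back interactively.
print(PkgManager.list_backups())
PkgManager.timemachine()
```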
@@ -5688,7 +5899,7 @@ def fload(fpath, kind=None, **kwargs):
     if output in ["dataframe", "df"]:
         if verbose:
             print("loading data as a DataFrame")
-        if not password:
+        if not bool(password):
             if verbose:
                 print("Reading Excel without password protection...")
             df = pd.read_excel(fpath, engine=engine, sheet_name=sheet_name, **kwargs)
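The only change in this fload hunk swaps `if not password:` for `if not bool(password):`. For the values this branch typically sees (None or a string), the two are equivalent in Python, since `not x` already applies truthiness; a quick standalone check, assuming nothing beyond standard Python semantics:

```python
# Both conditions behave identically for the usual password values.
for password in (None, "", "secret"):
    assert (not password) == (not bool(password))
```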
@@ -6636,27 +6847,6 @@ def fsave(
         print(
             f"Error:\n{kind} is not in the supported list ['docx', 'txt', 'md', 'html', 'pdf', 'csv', 'xlsx', 'json', 'xml', 'yaml']"
         )
-
-
-# # Example usage
-# text_content = ["Hello, this is a sample text file.", "This is the second paragraph."]
-# tabular_content = {"Name": ["Alice", "Bob"], "Age": [24, 30]}
-# json_content = {"name": "Alice", "age": 24}
-# yaml_content = {"Name": "Alice", "Age": 24}
-# xml_content = {"Name": "Alice", "Age": 24}
-# dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
-# fsave(dir_save + "sample.txt", text_content)
-# fsave(dir_save + "sample.md", text_content)
-# fsave(dir_save + "sample.html", text_content)
-# fsave(dir_save + "sample.pdf", text_content)
-# fsave(dir_save + "sample.docx", text_content)
-# fsave(dir_save + "sample.csv", tabular_content, index=False)
-# fsave(dir_save + "sample.xlsx", tabular_content, sheet_name="Sheet1", index=False)
-# fsave(dir_save + "sample.json", json_content, indent=4)
-# fsave(dir_save + "sample.yaml", yaml_content)
-# fsave(dir_save + "sample.xml", xml_content)
-
-
 def addpath(fpath):
     sys.path.insert(0, dir)
 
@@ -9410,12 +9600,6 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
     else:
         figsave(dirname(dir_save), fname)
 
-
-# usage:
-# fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
-# thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
-
-
 # search and fine the director of the libary, which installed at local
 def dir_lib(lib_oi):
     """
@@ -9832,7 +10016,7 @@ def hex2argb(color):
 
     # Validate hex format
     if not re.fullmatch(r"[A-F0-9]{6,8}", color):
-        raise ValueError(f"
+        raise ValueError(f"格式错误: {color}, 应该使用 RRGGBB, #RRGGBB, or aARRGGBB format.")
 
     # If already in aARRGGBB format (8 chars), return as is
     if len(color) == 8:
@@ -10032,7 +10216,526 @@ def copy_format(
|
|
10032
10216
|
wb_source.close()
|
10033
10217
|
if "wb_target" in locals():
|
10034
10218
|
wb_target.close()
|
10219
|
+
# ! =========(below) interact with worrkbook and DataFrame===========
|
10220
|
+
import pandas as pd
|
10221
|
+
from openpyxl import load_workbook
|
10222
|
+
from openpyxl.workbook.workbook import Workbook
|
10223
|
+
from openpyxl.utils import get_column_letter
|
10224
|
+
|
10225
|
+
class DataFrameAlignExcel:
|
10226
|
+
"""
|
10227
|
+
A powerful tool for updating Excel files with data from DataFrames with various matching strategies.
|
10228
|
+
|
10229
|
+
Features:
|
10230
|
+
- Accepts either file path or open Workbook object
|
10231
|
+
- Multiple matching strategies (exact, contains, starts_with, ends_with, regex)
|
10232
|
+
- Multiple value update strategies (overwrite, add, subtract, multiply, divide, append)
|
10233
|
+
- Support for multiple worksheets
|
10234
|
+
- Automatic column creation
|
10235
|
+
- Value normalization options
|
10236
|
+
- Detailed logging and dry-run mode
|
10237
|
+
- Progress reporting
|
10238
|
+
- Data validation
|
10239
|
+
- make_backup functionality
|
10240
|
+
"""
|
10241
|
+
|
10242
|
+
def __init__(self, fpath: Union[str, Workbook], df: pd.DataFrame = None):
|
10243
|
+
"""
|
10244
|
+
Initialize the DataFrameAlignExcel.
|
10245
|
+
|
10246
|
+
Args:
|
10247
|
+
fpath: Path to the Excel file (str) or open Workbook object
|
10248
|
+
df: Optional DataFrame to use for updates
|
10249
|
+
"""
|
10250
|
+
self.fpath_or_wb = fpath
|
10251
|
+
self.df = df
|
10252
|
+
self.wb = None
|
10253
|
+
self.backup_path = None
|
10254
|
+
self.log = []
|
10255
|
+
self.owns_workbook = (
|
10256
|
+
False # Track whether we created the workbook or it was passed in
|
10257
|
+
)
|
10258
|
+
|
10259
|
+
def load_workbook(self) -> None:
|
10260
|
+
"""Load the Excel workbook if a path was provided."""
|
10261
|
+
if isinstance(self.fpath_or_wb, str):
|
10262
|
+
if not os.path.exists(self.fpath_or_wb):
|
10263
|
+
raise FileNotFoundError(f"Excel file not found: {self.fpath_or_wb}")
|
10264
|
+
self.wb = load_workbook(self.fpath_or_wb)
|
10265
|
+
self.owns_workbook = True
|
10266
|
+
elif isinstance(self.fpath_or_wb, Workbook):
|
10267
|
+
self.wb = self.fpath_or_wb
|
10268
|
+
self.owns_workbook = False
|
10269
|
+
else:
|
10270
|
+
raise TypeError(
|
10271
|
+
"fpath must be either a string path or an openpyxl Workbook object"
|
10272
|
+
)
|
10273
|
+
|
10274
|
+
def create_make_backup(self) -> None:
|
10275
|
+
"""Create a make_backup of the original Excel file (only if we loaded from a file)."""
|
10276
|
+
if not isinstance(self.fpath_or_wb, str):
|
10277
|
+
self.log.append(
|
10278
|
+
"Skipping make_backup - working with Workbook object directly"
|
10279
|
+
)
|
10280
|
+
return
|
10281
|
+
|
10282
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
10283
|
+
self.backup_path = os.path.join(
|
10284
|
+
os.path.dirname(self.fpath_or_wb),
|
10285
|
+
f"backup_{timestamp}_{os.path.basename(self.fpath_or_wb)}",
|
10286
|
+
)
|
10287
|
+
self.wb.save(self.backup_path)
|
10288
|
+
self.log.append(f"Created make_backup at: {self.backup_path}")
|
10289
|
+
|
10290
|
+
def save_workbook(self, dir_save: str = None) -> None:
|
10291
|
+
"""
|
10292
|
+
Save the workbook to a file.
|
10293
|
+
|
10294
|
+
Args:
|
10295
|
+
dir_save: Optional path to save to. If None and we loaded from a file,
|
10296
|
+
saves to the original path.
|
10297
|
+
"""
|
10298
|
+
if self.wb is None:
|
10299
|
+
raise ValueError("No workbook loaded")
|
10300
|
+
|
10301
|
+
if dir_save is None:
|
10302
|
+
if isinstance(self.fpath_or_wb, str):
|
10303
|
+
dir_save = self.fpath_or_wb
|
10304
|
+
else:
|
10305
|
+
dir_save = datetime.now().strftime("%Y%m%d_%H%M%S") + ".xlsx"
|
10306
|
+
print(
|
10307
|
+
f"No save path provided and original input was a Workbook object, so used : {dir_save}"
|
10308
|
+
)
|
10309
|
+
self.wb.save(dir_save)
|
10310
|
+
self.log.append(f"Saved workbook to: {dir_save}")
|
10311
|
+
|
10312
|
+
def normalize_value(self, value, clean_keys: str = "strip_split_first") -> str:
|
10313
|
+
"""
|
10314
|
+
Normalize a value based on the specified method.
|
10315
|
+
|
10316
|
+
Args:
|
10317
|
+
value: Value to normalize
|
10318
|
+
clean_keys: One of:
|
10319
|
+
- 'strip': just strip whitespace
|
10320
|
+
- 'strip_lower': strip and lowercase
|
10321
|
+
- 'strip_split_first': strip and take first part before comma
|
10322
|
+
- 'strip_split_last': strip and take last part after comma
|
10323
|
+
- None: no normalization
|
10324
|
+
|
10325
|
+
Returns:
|
10326
|
+
Normalized value
|
10327
|
+
"""
|
10328
|
+
if value is None:
|
10329
|
+
return None
|
10330
|
+
|
10331
|
+
value = str(value)
|
10332
|
+
|
10333
|
+
if clean_keys is None:
|
10334
|
+
return value
|
10335
|
+
|
10336
|
+
if clean_keys == "strip":
|
10337
|
+
return value.strip()
|
10338
|
+
elif clean_keys == "strip_lower":
|
10339
|
+
return value.strip().lower()
|
10340
|
+
elif clean_keys == "strip_split_first":
|
10341
|
+
return value.strip().split(",")[0].strip()
|
10342
|
+
elif clean_keys == "strip_split_last":
|
10343
|
+
parts = value.strip().split(",")
|
10344
|
+
return parts[-1].strip() if len(parts) > 1 else value.strip()
|
10345
|
+
else:
|
10346
|
+
warnings.warn(f"Unknown clean_keys: {clean_keys}. Using 'strip'.")
|
10347
|
+
return value.strip()
|
10348
|
+
|
10349
|
+
def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
|
10350
|
+
"""
|
10351
|
+
Efficiently find the column index (1-based) for a given column name,
|
10352
|
+
considering only non-empty cells and limiting search range.
|
10353
|
+
|
10354
|
+
Args:
|
10355
|
+
ws: Worksheet object
|
10356
|
+
header_row: Row number containing headers (1-based)
|
10357
|
+
column_name: Column name to find
|
10358
|
+
max_search_columns: Max number of columns to search (to prevent infinite loops)
|
10359
|
+
|
10360
|
+
Returns:
|
10361
|
+
Column index (1-based), or -1 if not found
|
10362
|
+
"""
|
10363
|
+
row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
|
10364
|
+
for row in row_iter:
|
10365
|
+
for cell in row:
|
10366
|
+
if cell.value and str(cell.value).strip().lower() == column_name.lower():
|
10367
|
+
return cell.column
|
10368
|
+
break # Only process the header row
|
10369
|
+
return -1
|
10370
|
+
# def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
|
10371
|
+
# """
|
10372
|
+
# Find the column index (1-based) for a given column name.
|
10373
|
+
# If not found, return the last non-empty header column index.
|
10374
|
+
|
10375
|
+
# Args:
|
10376
|
+
# ws: Worksheet object
|
10377
|
+
# header_row: Row number containing headers (1-based)
|
10378
|
+
# column_name: Column name to find
|
10379
|
+
# max_search_columns: Max number of columns to search
|
10380
|
+
|
10381
|
+
# Returns:
|
10382
|
+
# Column index (1-based)
|
10383
|
+
# """
|
10384
|
+
# row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
|
10385
|
+
# last_non_empty_col = -1
|
10386
|
+
|
10387
|
+
# for row in row_iter:
|
10388
|
+
# for cell in row:
|
10389
|
+
# if cell.value and str(cell.value).strip():
|
10390
|
+
# last_non_empty_col = cell.column
|
10391
|
+
# if str(cell.value).strip().lower() == column_name.lower():
|
10392
|
+
# return cell.column
|
10393
|
+
# break # Only one row being read
|
10394
|
+
|
10395
|
+
# return last_non_empty_col
|
10396
|
+
|
10397
|
+
def update_values(
|
10398
|
+
self,
|
10399
|
+
df: pd.DataFrame = None,
|
10400
|
+
sheet_name: Union[str, int, List[Union[str, int]]] = 0,
|
10401
|
+
header_row: int = 1,
|
10402
|
+
column_match: Union[Dict[str, str], List[Tuple[str, str]]] = None,
|
10403
|
+
column_mapping: Union[Dict[str, str], List[Tuple[str, str]]] = None,
|
10404
|
+
clean_keys: str = "strip_split_first",
|
10405
|
+
match_method: str = "exact",
|
10406
|
+
update_strategy: str = "overwrite",
|
10407
|
+
create_missing_columns: bool = True,
|
10408
|
+
preview_only: bool = False,
|
10409
|
+
show_progress: bool = True,
|
10410
|
+
skip_no_match: bool = True,
|
10411
|
+
make_backup: bool = True,
|
10412
|
+
dir_save: str = None,
|
10413
|
+
row_max=500
|
10414
|
+
) -> Dict[str, int]:
|
10415
|
+
"""
|
10416
|
+
Update Excel with values from DataFrame.
|
10417
|
+
|
10418
|
+
Args:
|
10419
|
+
df: DataFrame containing update data (if None, uses self.df)
|
10420
|
+
sheet_name: Sheet name(s) to update (str, int, or list of these)
|
10421
|
+
header_row: Row number containing headers (1-based)
|
10422
|
+
column_match: Dict or list of tuples mapping DataFrame columns to Excel columns for matching
|
10423
|
+
e.g., {'SampleID': 'ID'} or [('SampleID', 'ID'), ('Batch', 'Lot')]
|
10424
|
+
column_mapping: Dict or list of tuples mapping DataFrame columns to Excel columns to update
|
10425
|
+
e.g., {'Vials': 'Qty'} or [('Vials', 'Qty'), ('Status', 'State')]
|
10426
|
+
clean_keys: How to normalize matching values (see normalize_value())
|
10427
|
+
match_method: How to match values ('exact', 'contains', 'starts_with', 'ends_with', 'regex')
|
10428
|
+
update_strategy: How to update values ('overwrite', 'add', 'subtract', 'multiply', 'divide', 'append')
|
10429
|
+
create_missing_columns: Whether to create columns that don't exist
|
10430
|
+
preview_only: If True, don't actually update the Excel file
|
10431
|
+
show_progress: If True, print progress updates
|
10432
|
+
skip_no_match: If True, skip rows where match columns don't match
|
10433
|
+
make_backup: If True, create a make_backup before updating (only if working with file path)
|
10434
|
+
dir_save: Optional path to save to. If None and we loaded from a file,
|
10435
|
+
saves to the original path. Ignored if preview_only=True.
|
10436
|
+
|
10437
|
+
Returns:
|
10438
|
+
Dictionary with update statistics
|
10439
|
+
"""
|
10440
|
+
# Initialize
|
10441
|
+
start_time = datetime.now()
|
10442
|
+
if df is None:
|
10443
|
+
df = self.df
|
10444
|
+
if df is None:
|
10445
|
+
raise ValueError("No DataFrame provided")
|
10446
|
+
|
10447
|
+
if not isinstance(column_match, (dict, list)) or not column_match:
|
10448
|
+
raise ValueError(
|
10449
|
+
"column_match must be a non-empty dict or list of tuples"
|
10450
|
+
)
|
10451
|
+
|
10452
|
+
if not isinstance(column_mapping, (dict, list)) or not column_mapping:
|
10453
|
+
raise ValueError("column_mapping must be a non-empty dict or list of tuples")
|
10454
|
+
|
10455
|
+
# Convert match/update columns to consistent format
|
10456
|
+
if isinstance(column_match, dict):
|
10457
|
+
column_match = list(column_match.items())
|
10458
|
+
if isinstance(column_mapping, dict):
|
10459
|
+
column_mapping = list(column_mapping.items())
|
10460
|
+
|
10461
|
+
# Load workbook if not already loaded
|
10462
|
+
if self.wb is None:
|
10463
|
+
self.load_workbook()
|
10464
|
+
|
10465
|
+
# Create make_backup (only if we're working with a file path)
|
10466
|
+
if not preview_only:
|
10467
|
+
self.create_make_backup()
|
10468
|
+
|
10469
|
+
# Prepare statistics
|
10470
|
+
stats = {
|
10471
|
+
"processed_sheet_names":[],
|
10472
|
+
"processed_sheets": 0,
|
10473
|
+
"total_updates": 0,
|
10474
|
+
"skipped_rows": 0,
|
10475
|
+
"created_columns": 0,
|
10476
|
+
}
|
10477
|
+
|
10478
|
+
# Normalize sheet names
|
10479
|
+
if not isinstance(sheet_name, list):
|
10480
|
+
sheet_name = [sheet_name]
|
10481
|
+
|
10482
|
+
# Process each sheet
|
10483
|
+
for sheet in sheet_name:
|
10484
|
+
try:
|
10485
|
+
if isinstance(sheet, str):
|
10486
|
+
ws = self.wb[sheet]
|
10487
|
+
elif isinstance(sheet, int):
|
10488
|
+
ws = self.wb.worksheets[sheet]
|
10489
|
+
else:
|
10490
|
+
ws = self.wb.active
|
10491
|
+
|
10492
|
+
sheet_name = ws.title
|
10493
|
+
self.log.append(f"\nProcessing sheet: {sheet_name}")
|
10494
|
+
|
10495
|
+
# Prepare matching data
|
10496
|
+
match_dict = {}
|
10497
|
+
for df_col, excel_col in column_match:
|
10498
|
+
if clean_keys:
|
10499
|
+
match_dict[excel_col] = dict(
|
10500
|
+
zip(
|
10501
|
+
df[df_col].apply(
|
10502
|
+
lambda x: self.normalize_value(x, clean_keys)
|
10503
|
+
),
|
10504
|
+
df.index,
|
10505
|
+
)
|
10506
|
+
)
|
10507
|
+
else:
|
10508
|
+
match_dict[excel_col] = dict(zip(df[df_col], df.index))
|
10509
|
+
|
10510
|
+
# Find or create update columns
|
10511
|
+
update_col_indices = {}
|
10512
|
+
for df_col, excel_col in column_mapping:
|
10513
|
+
col_idx = self.find_column_index(ws, header_row, excel_col)
|
10514
|
+
if col_idx == -1:
|
10515
|
+
if create_missing_columns:
|
10516
|
+
# Find last column
|
10517
|
+
last_col = max(
|
10518
|
+
[cell.column for cell in ws[header_row] if cell.value is not None], default=0
|
10519
|
+
)
|
10520
|
+
col_idx = last_col + 1
|
10521
|
+
ws.cell(row=header_row, column=col_idx, value=excel_col)
|
10522
|
+
update_col_indices[excel_col] = col_idx
|
10523
|
+
stats["created_columns"] += 1
|
10524
|
+
self.log.append(
|
10525
|
+
f"Created new column '{excel_col}' at position {col_idx}"
|
10526
|
+
)
|
10527
|
+
else:
|
10528
|
+
raise ValueError(
|
10529
|
+
f"Column '{excel_col}' not found and create_missing_columns=False"
|
10530
|
+
)
|
10531
|
+
else:
|
10532
|
+
update_col_indices[excel_col] = col_idx
|
10533
|
+
|
10534
|
+
# Process rows
|
10535
|
+
for row in ws.iter_rows(min_row=header_row + 1):
|
10536
|
+
match_values = {}
|
10537
|
+
match_failed = False
|
10538
|
+
|
10539
|
+
for excel_col in match_dict.keys():
|
10540
|
+
col_idx = self.find_column_index(ws, header_row, excel_col)
|
10541
|
+
if col_idx == -1:
|
10542
|
+
if skip_no_match:
|
10543
|
+
match_failed = True
|
10544
|
+
break
|
10545
|
+
else:
|
10546
|
+
raise ValueError(
|
10547
|
+
f"Match column '{excel_col}' not found in sheet"
|
10548
|
+
)
|
10549
|
+
|
10550
|
+
cell_value = row[
|
10551
|
+
col_idx - 1
|
10552
|
+
].value # -1 because iter_rows returns 0-based list
|
10553
|
+
if clean_keys:
|
10554
|
+
cell_value = self.normalize_value(cell_value, clean_keys)
|
10555
|
+
|
10556
|
+
match_values[excel_col] = cell_value
|
10557
|
+
|
10558
|
+
if match_failed:
|
10559
|
+
stats["skipped_rows"] += 1
|
10560
|
+
continue
|
10561
|
+
|
10562
|
+
# Find matching DataFrame row
|
10563
|
+
df_index = None
|
10564
|
+
for excel_col, value in match_values.items():
|
10565
|
+
if value in match_dict[excel_col]:
|
10566
|
+
if df_index is None:
|
10567
|
+
df_index = match_dict[excel_col][value]
|
10568
|
+
elif df_index != match_dict[excel_col][value]:
|
10569
|
+
# Multiple match columns point to different rows - skip
|
10570
|
+
df_index = None
|
10571
|
+
break
|
10572
|
+
|
10573
|
+
if df_index is None:
|
10574
|
+
stats["skipped_rows"] += 1
|
10575
|
+
continue
|
10576
|
+
|
10577
|
+
# Update cells
|
10578
|
+
for df_col, excel_col in column_mapping:
|
10579
|
+
col_idx = update_col_indices[excel_col]
|
10580
|
+
cell = row[
|
10581
|
+
col_idx - 1
|
10582
|
+
] # -1 because iter_rows returns 0-based list
|
10583
|
+
new_value = df.at[df_index, df_col]
|
10584
|
+
|
10585
|
+
# Apply update strategy
|
10586
|
+
if update_strategy == "overwrite":
|
10587
|
+
cell.value = new_value
|
10588
|
+
elif update_strategy in (
|
10589
|
+
"add",
|
10590
|
+
"subtract",
|
10591
|
+
"multiply",
|
10592
|
+
"divide",
|
10593
|
+
):
|
10594
|
+
try:
|
10595
|
+
old_value = (
|
10596
|
+
float(cell.value) if cell.value is not None else 0
|
10597
|
+
)
|
10598
|
+
new_value = (
|
10599
|
+
float(new_value) if new_value is not None else 0
|
10600
|
+
)
|
10601
|
+
if update_strategy == "add":
|
10602
|
+
cell.value = old_value + new_value
|
10603
|
+
elif update_strategy == "subtract":
|
10604
|
+
cell.value = old_value - new_value
|
10605
|
+
elif update_strategy == "multiply":
|
10606
|
+
cell.value = old_value * new_value
|
10607
|
+
elif update_strategy == "divide":
|
10608
|
+
cell.value = (
|
10609
|
+
old_value / new_value
|
10610
|
+
if new_value != 0
|
10611
|
+
else old_value
|
10612
|
+
)
|
10613
|
+
except (ValueError, TypeError):
|
10614
|
+
if skip_no_match:
|
10615
|
+
continue
|
10616
|
+
raise ValueError(
|
10617
|
+
f"Could not perform {update_strategy} operation on non-numeric values"
|
10618
|
+
)
|
10619
|
+
elif update_strategy == "append":
|
10620
|
+
separator = ", " if cell.value else ""
|
10621
|
+
cell.value = (
|
10622
|
+
f"{cell.value}{separator}{new_value}"
|
10623
|
+
if cell.value
|
10624
|
+
else new_value
|
10625
|
+
)
|
10626
|
+
else:
|
10627
|
+
raise ValueError(
|
10628
|
+
f"Unknown update_strategy: {update_strategy}"
|
10629
|
+
)
|
10630
|
+
|
10631
|
+
stats["total_updates"] += 1
|
10632
|
+
|
10633
|
+
stats["processed_sheets"] += 1
|
10634
|
+
stats["processed_sheet_names"].append(sheet_name)
|
10635
|
+
except Exception as e:
|
10636
|
+
self.log.append(f"Error processing sheet {sheet}: {str(e)}")
|
10637
|
+
if (
|
10638
|
+
not preview_only
|
10639
|
+
and self.backup_path
|
10640
|
+
and isinstance(self.fpath_or_wb, str)
|
10641
|
+
):
|
10642
|
+
self.log.append("Restoring from make_backup due to error")
|
10643
|
+
self.wb = load_workbook(self.backup_path)
|
10644
|
+
raise
|
10645
|
+
|
10646
|
+
# Save changes if not dry run
|
10647
|
+
if not preview_only:
|
10648
|
+
self.save_workbook(dir_save)
|
10649
|
+
if not make_backup:
|
10650
|
+
if os.path.exists(self.backup_path):
|
10651
|
+
os.remove(self.backup_path)
|
10652
|
+
else:
|
10653
|
+
self.log.append("\nDry run complete - no changes saved")
|
10654
|
+
|
10655
|
+
# Print summary
|
10656
|
+
summary = (
|
10657
|
+
f"\nUpdate Summary:\n"
|
10658
|
+
f"\tProcessed {stats["processed_sheets"]} sheetnames: {stats['processed_sheet_names']}\n"
|
10659
|
+
f"\tTotal updates: {stats['total_updates']}\n"
|
10660
|
+
f"\tSkipped rows: {stats['skipped_rows']}\n"
|
10661
|
+
)
|
10662
|
+
self.log.append(summary)
|
10663
|
+
|
10664
|
+
if show_progress:
|
10665
|
+
print(summary)
|
10035
10666
|
|
10667
|
+
return stats
|
10668
|
+
|
10669
|
+
def get_log(self) -> str:
|
10670
|
+
"""Get the operation log as a string."""
|
10671
|
+
return "\n".join(self.log)
|
10672
|
+
|
10673
|
+
def close(self) -> None:
|
10674
|
+
"""Close the workbook if we own it."""
|
10675
|
+
if self.wb is not None and self.owns_workbook:
|
10676
|
+
self.wb.close()
|
10677
|
+
self.wb = None
|
10678
|
+
|
10679
|
+
|
10680
|
+
DFToExcelMapping = Union[Dict[str, str], List[Tuple[str, str]]]
|
10681
|
+
def df_align(
|
10682
|
+
fpath: Union[str, Workbook],
|
10683
|
+
df: pd.DataFrame,
|
10684
|
+
sheet_name: Union[str, int, List[Union[str, int]]] = 0,
|
10685
|
+
header_row: int = 1,
|
10686
|
+
column_match: DFToExcelMapping = None,
|
10687
|
+
column_mapping: DFToExcelMapping = None,
|
10688
|
+
clean_keys: str = "strip_split_first",
|
10689
|
+
match_method: str = "exact",
|
10690
|
+
update_strategy: str = "overwrite",
|
10691
|
+
create_missing_columns: bool = True,
|
10692
|
+
preview_only: bool = False,
|
10693
|
+
show_progress: bool = True,
|
10694
|
+
skip_no_match: bool = True,
|
10695
|
+
make_backup: bool = True,
|
10696
|
+
dir_save: str = None,
|
10697
|
+
) -> Dict[str, int]:
|
10698
|
+
"""
|
10699
|
+
wb = fload(
|
10700
|
+
dir_aml,
|
10701
|
+
password="XBuzwVk4xsC2361cHzyi9JFgfJHaTSerjBOQ0JAJU24=",
|
10702
|
+
sheet_name=0,
|
10703
|
+
header=1,
|
10704
|
+
output="bit",
|
10705
|
+
)
|
10706
|
+
ws = wb[wb.sheetnames[0]]
|
10707
|
+
df_align(
|
10708
|
+
fpath=wb,
|
10709
|
+
df=df_,
|
10710
|
+
sheet_name=None,
|
10711
|
+
header_row=2,
|
10712
|
+
column_match={"SampleID": "SampleID"},# key是 df中的列名, value是 excel中,
|
10713
|
+
column_mapping={"Vials": "Vials", "Vials_": "Total Vials"}, # key是 df中的列名, value是 excel中,
|
10714
|
+
)
|
10715
|
+
"""
|
10716
|
+
updater = DataFrameAlignExcel(fpath, df)
|
10717
|
+
try:
|
10718
|
+
result = updater.update_values(
|
10719
|
+
sheet_name=sheet_name,
|
10720
|
+
header_row=header_row,
|
10721
|
+
column_match=column_match,
|
10722
|
+
column_mapping=column_mapping,
|
10723
|
+
clean_keys=clean_keys,
|
10724
|
+
match_method=match_method,
|
10725
|
+
update_strategy=update_strategy,
|
10726
|
+
create_missing_columns=create_missing_columns,
|
10727
|
+
preview_only=preview_only,
|
10728
|
+
show_progress=show_progress,
|
10729
|
+
skip_no_match=skip_no_match,
|
10730
|
+
make_backup=make_backup,
|
10731
|
+
dir_save=dir_save,
|
10732
|
+
)
|
10733
|
+
return result
|
10734
|
+
finally:
|
10735
|
+
updater.close()
|
10736
|
+
|
10737
|
+
|
10738
|
+
# ! =========(Above) interact with worrkbook and DataFrame===========
|
10036
10739
|
def set_sheet_visible(
|
10037
10740
|
fpath: str,
|
10038
10741
|
sheet_name: Union[int, str, None,list] = 1,
|
@@ -10159,7 +10862,7 @@ def format_excel(
     number_format:dict=None, # dict: e.g., {1:"0.00", 2:"#,##0",3:"0%",4:"$#,##0.00"}
     data_validation=None, # dict
     template:dict={},# e.g., template=dict(path="xx.xlsx",sheet_name=['sheet_name1',"sheet_name2"])
-    apply_filter:bool=
+    apply_filter:bool=False, # add filter
     freeze :str= False,#"A2",
     conditional_format:dict=None, # dict
     verbose:bool=False,
@@ -10321,6 +11024,67 @@ def format_excel(
         if end_col_letter
         else f"{start_col_letter}{start_row}"
     )
+
+
+    def is_merged_cell(ws, cell):
+        """Check if a cell is part of any merged range."""
+        for merged_range in ws.merged_cells.ranges:
+            if cell.coordinate in merged_range:
+                return True
+        return False
+
+    def apply_auto_width(ws, width_factor=1.2, width_padding=2, width_max=50):
+        """
+        Automatically adjust column widths based on content length,
+        with complete protection against merged cell errors.
+
+        Args:
+            ws: Worksheet object
+            width_factor: Multiplier for content length (default 1.2)
+            width_padding: Additional padding (default 2)
+            width_max: Maximum column width (default 50)
+        """
+        # First build a set of all merged cell coordinates
+        merged_coords = set()
+        for merged_range in ws.merged_cells.ranges:
+            for row in ws.iter_rows(min_row=merged_range.min_row,
+                                    max_row=merged_range.max_row,
+                                    min_col=merged_range.min_col,
+                                    max_col=merged_range.max_col):
+                for cell in row:
+                    merged_coords.add(cell.coordinate)
+
+        for col in ws.columns:
+            if not col:
+                continue
+
+            col_letter = get_column_letter(col[0].column)
+            max_length = 0
+
+            for cell in col:
+                # Skip merged cells entirely
+                if cell.coordinate in merged_coords:
+                    continue
+
+                try:
+                    if cell.value is not None:
+                        # Handle both single-line and multi-line content
+                        cell_value = str(cell.value)
+                        lines = cell_value.split('\n')
+                        current_max = max(len(line) for line in lines)
+                        max_length = max(max_length, current_max)
+                except Exception as e:
+                    print(f"Skipping cell {cell.coordinate} due to error: {e}")
+                    continue
+
+            # Calculate width with constraints
+            adjusted_width = min(
+                max(1, (max_length * width_factor) + width_padding),
+                width_max if width_max is not None else float('inf')
+            )
+
+            ws.column_dimensions[col_letter].width = adjusted_width
+
     def apply_color_to_worksheet(ws=None, sheet_name=None, conditions=None, cell_idx=None,where="text"):
         """
         Apply text color formatting to a specific cell range in an openpyxl workbook based on conditions.
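The new apply_auto_width helper sizes each column as roughly `max_length * width_factor + width_padding`, clamped to `width_max`, while skipping merged cells. Since it is defined inside format_excel, the sketch below re-creates the same sizing rule at module level with plain openpyxl calls; the file name is a placeholder and merged-cell handling is omitted for brevity.

```python
# Illustrative sketch only; mirrors the auto-width rule added in format_excel.
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

wb = load_workbook("report.xlsx")  # placeholder path
ws = wb.active

width_factor, width_padding, width_max = 1.2, 2, 50
for col in ws.columns:
    # Longest rendered value in this column decides its width.
    longest = max((len(str(c.value)) for c in col if c.value is not None), default=0)
    ws.column_dimensions[get_column_letter(col[0].column)].width = min(
        max(1, longest * width_factor + width_padding), width_max
    )
wb.save("report_autowidth.xlsx")
```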
@@ -10426,6 +11190,11 @@ def format_excel(
 
     def apply_format(ws, cell, cell_range):
         """Apply cell formatting to a specified range."""
+        # Get all merged cell coordinates first
+        merged_cells = set()
+        for merged_range in ws.merged_cells.ranges:
+            for coord in merged_range.cells:
+                merged_cells.add(coord)
         cell_font, cell_fill, cell_alignment, border = None, None, None, None
         kws_cell = ["font", "fill", "alignment", "border"]
         for K, _ in cell.items():
@@ -10623,6 +11392,7 @@ def format_excel(
                 )
             # get colors config
             for k, v in cell.get(K, {}).items():
+                print(k, v,strcmp(k, kws_border)[0])
                 if strcmp(k, kws_border)[0] in ["color"]:
                     border_color_all = hex2argb(v)
                     # 如果设置了color,表示其它的所有的都设置成为一样的
@@ -10753,6 +11523,8 @@ def format_excel(
         #! final apply configs
         for row in ws[cell_range]:
             for cell_ in row:
+                if cell_.coordinate in merged_cells:
+                    continue # Skip merged cells
                 if cell_font:
                     cell_.font = cell_font
                 if cell_fill:
@@ -10830,11 +11602,9 @@ def format_excel(
     if not os.path.exists(filename) or mode=="w":
         # ws=wb.active
         # ws.title = sheet_name
-        ws = wb.create_sheet(title=sheet_name)
-        print(1)
+        ws = wb.create_sheet(title=sheet_name)
     else:# file exists
-        wb = load_workbook(filename)
-        print(2)
+        wb = load_workbook(filename)
         # with pd.ExcelWriter(filename, mode="a", engine=engine, if_sheet_exists=if_sheet_exists) as writer:
         # for ws in wb.worksheets: # Iterate through worksheets in the input workbook
         # ws_df = pd.DataFrame(ws.values)
@@ -11161,44 +11931,62 @@ def format_excel(
     if freeze:
         ws.freeze_panes = freeze # Freeze everything above and to the left of A2
     # !widths
-    if isinstance(width,bool):
+    if isinstance(width, bool):
         width=None if width else False
     if isinstance(height,bool):
         height=None if height else False
-    if width is None or width=={}: # automatic adust width
-        for col in ws.columns:
-            max_length = 0
-            """column = col[0].column_letter # Get the column letter"""
-            # Check the first cell in the column to get the column letter
-            cell_first = col[0]
 
-
-
-
-
-
+    merged_cells = set()
+    for merged_range in ws.merged_cells.ranges:
+        for row in ws.iter_rows(min_row=merged_range.min_row,
+                                max_row=merged_range.max_row,
+                                min_col=merged_range.min_col,
+                                max_col=merged_range.max_col):
+            for cell in row:
+                merged_cells.add(cell.coordinate)
+    if width is None or width == {}: # automatic adjust width
+        print("auto-width")
+        for col in ws.columns:
+            if not col:
                 continue
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                col_letter = get_column_letter(col[0].column)
+
+                # Skip entire column if any cell is merged
+                if any(cell.coordinate in merged_cells for cell in col):
+                    continue
+
+                max_length = 0
+                for cell in col:
+                    try:
+                        if cell.value:
+                            cell_value = str(cell.value)
+                            if '\n' in cell_value:
+                                max_line_length = max(len(line) for line in cell_value.split('\n'))
+                                max_length = max(max_length, max_line_length)
+                            else:
+                                max_length = max(max_length, len(cell_value))
+                    except:
+                        pass
+
+                adjusted_width = (max_length * width_factor) + width_padding
+                if width_max is not None:
+                    adjusted_width = min(adjusted_width, width_max)
+                ws.column_dimensions[col_letter].width = max(5, adjusted_width)
+
+            except Exception as e:
+                print(f"Error adjusting width for column: {e}")
+                continue
+    elif isinstance(width, (int, float)): # set all columns to this value
+        print("set to fixed width {}".format(width))
         for col in ws.columns:
-            column=get_column_letter(col[0].column)
-            ws.column_dimensions[column].width=width*width_factor+width_padding
-        elif isinstance(width,
-            pass
-        else:
+            column = get_column_letter(col[0].column)
+            ws.column_dimensions[column].width = width * width_factor + width_padding
+    elif isinstance(width, dict): # custom widths per column
         for col_idx, width_ in width.items():
             col_letter = get_column_letter(col_idx)
             ws.column_dimensions[col_letter].width = width_
-
+
     # !heights
     if height is None or height=={}: # automatic adust height
         for row in ws.iter_rows(min_row=1, max_row=ws.max_row):
@@ -11655,9 +12443,28 @@ def format_excel(
 
     # ungroup sheets
     for sheet in wb.worksheets:
-        sheet.sheet_view.tabSelected = False
+        sheet.sheet_view.tabSelected = False
     # !Save the workbook
-
+    try:
+        wb.save(filename)
+    except Exception as e:
+        print(f"Error saving workbook: {str(e)}")
+    # Replace your final save operation with this:
+    # try:
+    #     # Create a temporary file for safer saving
+    #     temp_filename = filename + '.tmp'
+    #     wb.save(temp_filename)
+
+    #     # If save succeeds, replace original file
+    #     if os.path.exists(filename):
+    #         os.remove(filename)
+    #     os.rename(temp_filename, filename)
+
+    # except Exception as e:
+    #     print(f"Error saving workbook: {str(e)}")
+    #     if os.path.exists(temp_filename):
+    #         os.remove(temp_filename)
+    #     raise
 
 
 def preview(var):
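The commented-out block in this hunk hints at a safer save pattern: write to a temporary file first and only replace the original on success. A minimal standalone sketch of that idea, assuming an open openpyxl workbook `wb` and a target `filename` (os.replace is used because it overwrites atomically on the same filesystem):

```python
import os

def save_workbook_safely(wb, filename):
    """Write to a temp file first so a failed save cannot corrupt the original."""
    temp_filename = filename + ".tmp"
    try:
        wb.save(temp_filename)
        os.replace(temp_filename, filename)  # atomic replace on the same filesystem
    except Exception:
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
        raise
```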
@@ -13716,6 +14523,575 @@ def df_fillna(
|
|
13716
14523
|
# print(method_name)
|
13717
14524
|
# display(df)
|
13718
14525
|
# display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
|
14526
|
+
def df_cut(
|
14527
|
+
df: pd.DataFrame,
|
14528
|
+
column: str,
|
14529
|
+
*,
|
14530
|
+
new_col_name: Optional[str] = None,
|
14531
|
+
bins: Optional[
|
14532
|
+
Union[int, List[float], Dict[str, Union[float, str, pd.Timestamp]]]
|
14533
|
+
] = None,
|
14534
|
+
range_start: Optional[Union[float, str, pd.Timestamp]] = None,
|
14535
|
+
range_end: Optional[Union[float, str, pd.Timestamp]] = None,
|
14536
|
+
step: Optional[Union[float, str, pd.Timedelta]] = None,
|
14537
|
+
labels: Optional[List[str]] = None,
|
14538
|
+
label_format: Optional[Union[str, Callable[[float, float], str]]] = None,
|
14539
|
+
include_overflow: bool = True,
|
14540
|
+
include_underflow: bool = False,
|
14541
|
+
right: bool = False,
|
14542
|
+
drop_original: bool = False,
|
14543
|
+
precision: int = 2,
|
14544
|
+
show_count: bool = False,
|
14545
|
+
symbol_count: str = "n=",
|
14546
|
+
show_percentage: bool = False,
|
14547
|
+
symbol_percentage: str = "%",
|
14548
|
+
show_total_count: bool = False,
|
14549
|
+
symbol_total_count: str = "∑n=",
|
14550
|
+
sep_between: str = " | ",
|
14551
|
+
sort_labels: bool = True,
|
14552
|
+
na_action: str = "keep",
|
14553
|
+
na_fill_value: Optional[str] = None,
|
14554
|
+
dtype: Optional[Union[str, pd.CategoricalDtype]] = None,
|
14555
|
+
ordered: bool = True,
|
14556
|
+
inplace: bool = False,
|
14557
|
+
datetime_format: str = "%Y-%m-%d",
|
14558
|
+
categorical_agg: str = "count",
|
14559
|
+
) -> Optional[pd.DataFrame]:
|
14560
|
+
"""
|
14561
|
+
Enhanced binning function that works with numeric, datetime, and categorical columns.
|
14562
|
+
|
14563
|
+
Features:
|
14564
|
+
- Automatic type detection (numeric, datetime, categorical)
|
14565
|
+
- Flexible bin specification (number of bins, explicit edges, or range+step)
|
14566
|
+
- Customizable labels with formatting
|
14567
|
+
- Count and percentage display options
|
14568
|
+
- NA value handling
|
14569
|
+
square bracket: means inclusive
|
14570
|
+
parenthesis: means exclusive
|
14571
|
+
Parameters:
|
14572
|
+
-----------
|
14573
|
+
df : pd.DataFrame
|
14574
|
+
Input DataFrame containing the column to bin
|
14575
|
+
column : str
|
14576
|
+
Name of column to bin
|
14577
|
+
new_col_name : str, optional
|
14578
|
+
Name for binned column (default: f"{column}_binned")
|
14579
|
+
bins : int, list, or dict, optional
|
14580
|
+
- int: Number of equal-width bins
|
14581
|
+
- list: Explicit bin edges
|
14582
|
+
- dict: {'start': x, 'end': y, 'step': z} for range specification
|
14583
|
+
range_start : float or datetime-like, optional
|
14584
|
+
Start value for bin range (required if bins is None or dict)
|
14585
|
+
range_end : float or datetime-like, optional
|
14586
|
+
End value for bin range (default: max of column)
|
14587
|
+
step : float or timedelta-like, optional
|
14588
|
+
Step size for bin creation (required if bins is None or dict)
|
14589
|
+
labels : list of str, optional
|
14590
|
+
Custom labels for bins (must match number of bins)
|
14591
|
+
label_format : str or callable, optional
|
14592
|
+
Format string or function for bin labels
|
14593
|
+
include_overflow : bool, default True
|
14594
|
+
Include catch-all bin for values above range_end
|
14595
|
+
include_underflow : bool, default False
|
14596
|
+
Include catch-all bin for values below range_start
|
14597
|
+
right : bool, default False
|
14598
|
+
Whether bins include the right edge
|
14599
|
+
drop_original : bool, default False
|
14600
|
+
Drop original column after binning
|
14601
|
+
precision : int, default 2
|
14602
|
+
Decimal precision for numeric bin labels
|
14603
|
+
show_count : bool, default False
|
14604
|
+
Show count of items in each bin
|
14605
|
+
show_percentage : bool, default False
|
14606
|
+
Show percentage of items in each bin
|
14607
|
+
show_total_count : bool, default False
|
14608
|
+
Show total count in labels
|
14609
|
+
na_action : str, default 'keep'
|
14610
|
+
How to handle NA values ('keep', 'drop', or 'fill')
|
14611
|
+
na_fill_value : str, optional
|
14612
|
+
Value to fill NAs with if na_action='fill'
|
14613
|
+
dtype : dtype or CategoricalDtype, optional
|
14614
|
+
Output dtype for binned column
|
14615
|
+
ordered : bool, default True
|
14616
|
+
Whether bins are ordered
|
14617
|
+
inplace : bool, default False
|
14618
|
+
Modify DataFrame in place
|
14619
|
+
datetime_format : str, default "%Y-%m-%d"
|
14620
|
+
Format string for datetime labels
|
14621
|
+
categorical_agg : str, default 'count'
|
14622
|
+
For categorical data: 'count' or 'ratio'
|
14623
|
+
|
14624
|
+
Returns:
|
14625
|
+
--------
|
14626
|
+
pd.DataFrame or None
|
14627
|
+
Returns modified DataFrame unless inplace=True
|
14628
|
+
|
14629
|
+
Examples:
|
14630
|
+
--------
|
14631
|
+
# Numeric binning
|
14632
|
+
df_cut(df, 'age', bins=5)
|
14633
|
+
df_cut(df, 'price', range_start=0, range_end=1000, step=100)
|
14634
|
+
|
14635
|
+
# Datetime binning
|
14636
|
+
df_cut(df, 'date', bins={'start': '2023-01-01', 'end': '2023-12-31', 'step': '1M'})
|
14637
|
+
|
14638
|
+
# Categorical binning
|
14639
|
+
df_cut(df, 'category', bins=5, categorical_agg='ratio')
|
14640
|
+
|
14641
|
+
# Sample datetime data
|
14642
|
+
dates = pd.date_range("2020-01-01", "2023-12-31", freq="D")
|
14643
|
+
df = pd.DataFrame(
|
14644
|
+
{
|
14645
|
+
"order_date": np.random.choice(dates, 500),
|
14646
|
+
"delivery_time": np.random.randint(1, 72, 500), # hours
|
14647
|
+
}
|
14648
|
+
)
|
14649
|
+
# Example 1: Monthly bins
|
14650
|
+
# Monthly binning with exact month boundaries
|
14651
|
+
df_cut(
|
14652
|
+
df,
|
14653
|
+
"order_date",
|
14654
|
+
bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1Y"},
|
14655
|
+
datetime_format="%Y-%m-%d",
|
14656
|
+
label_format="%m-%d",
|
14657
|
+
show_count=True,
|
14658
|
+
show_percentage=True,
|
14659
|
+
show_total_count=True,
|
14660
|
+
)
|
14661
|
+
# Weekly binning
|
14662
|
+
df_cut(
|
14663
|
+
df,
|
14664
|
+
"order_date",
|
14665
|
+
bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1W"},
|
14666
|
+
label_format="%Y-%m-%d",
|
14667
|
+
datetime_format="%Y-%m-%d",
|
14668
|
+
show_count=True,
|
14669
|
+
show_percentage=True,
|
14670
|
+
show_total_count=True,
|
14671
|
+
)
|
14672
|
+
|
14673
|
+
|
14674
|
+
# Sample numeric data
|
14675
|
+
df = pd.DataFrame(
|
14676
|
+
{"price": np.random.uniform(10, 1000, 1000), "age": np.random.randint(18, 80, 1000)}
|
14677
|
+
)
|
14678
|
+
|
14679
|
+
# Example 1: Equal-width bins
|
14680
|
+
df_cut(df, "price", bins=5, show_count=True)
|
14681
|
+
|
14682
|
+
# Example 2: Custom range with step
|
14683
|
+
df_cut(
|
14684
|
+
df,
|
14685
|
+
"price",
|
14686
|
+
range_start=0,
|
14687
|
+
range_end=1000,
|
14688
|
+
step=200,
|
14689
|
+
label_format="${left:.0f}-${right:.0f}",
|
14690
|
+
show_percentage=True,
|
14691
|
+
)
|
14692
|
+
df_cut(
|
14693
|
+
df,
|
14694
|
+
"price",
|
14695
|
+
bins={"start": 0, "end": 1000, "step": 200},
|
14696
|
+
# label_format="${left:.0f}-${right:.0f}",
|
14697
|
+
show_percentage=True,
|
14698
|
+
)
|
14699
|
+
"""
|
14700
|
+
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
|
14701
|
+
|
14702
|
+
def _process_time_step(step: Union[str, int, float, pd.Timedelta]) -> str:
|
14703
|
+
"""Convert step to pandas frequency string."""
|
14704
|
+
if isinstance(step, pd.Timedelta):
|
14705
|
+
return step.freqstr if step.freqstr else str(step)
|
14706
|
+
|
14707
|
+
if isinstance(step, (int, float)):
|
14708
|
+
return f"{step}S" # Interpret numbers as seconds
|
14709
|
+
|
14710
|
+
if isinstance(step, str):
|
14711
|
+
step = step.strip().lower()
|
14712
|
+
match = re.match(r"(\d*\.?\d+)?\s*([a-z]+)", step)
|
14713
|
+
if not match:
|
14714
|
+
raise ValueError(f"Invalid time step format: {step}")
|
14715
|
+
|
14716
|
+
num_part, unit_part = match.groups()
|
14717
|
+
num = float(num_part) if num_part else 1.0
|
14718
|
+
|
14719
|
+
unit_map = {
|
14720
|
+
"y": "Y",
|
14721
|
+
"yr": "Y",
|
14722
|
+
"yrs": "Y",
|
14723
|
+
"year": "Y",
|
14724
|
+
"years": "Y",
|
14725
|
+
"m": "M",
|
14726
|
+
"mo": "M",
|
14727
|
+
"mon": "M",
|
14728
|
+
"month": "M",
|
14729
|
+
"months": "M",
|
14730
|
+
"w": "W",
|
14731
|
+
"wk": "W",
|
14732
|
+
"wks": "W",
|
14733
|
+
"week": "W",
|
14734
|
+
"weeks": "W",
|
14735
|
+
"d": "D",
|
14736
|
+
"day": "D",
|
14737
|
+
"days": "D",
|
14738
|
+
"h": "H",
|
14739
|
+
"hr": "H",
|
14740
|
+
"hrs": "H",
|
14741
|
+
"hour": "H",
|
14742
|
+
"hours": "H",
|
14743
|
+
"min": "T",
|
14744
|
+
"mins": "T",
|
14745
|
+
"minute": "T",
|
14746
|
+
"minutes": "T",
|
14747
|
+
"s": "S",
|
14748
|
+
"sec": "S",
|
14749
|
+
"secs": "S",
|
14750
|
+
"second": "S",
|
14751
|
+
"seconds": "S",
|
14752
|
+
}
|
14753
|
+
|
14754
|
+
if unit_part not in unit_map:
|
14755
|
+
raise ValueError(f"Unknown time unit: {unit_part}")
|
14756
|
+
|
14757
|
+
freq = unit_map[unit_part]
|
14758
|
+
if num.is_integer():
|
14759
|
+
num = int(num)
|
14760
|
+
return f"{num}{freq}"
|
14761
|
+
|
14762
|
+
raise TypeError(f"Unsupported step type: {type(step)}")
|
14763
|
+
|
14764
|
+
|
14765
|
+
def _process_datetime_column(
|
14766
|
+
col: pd.Series,
|
14767
|
+
bins: Optional[Union[int, List[pd.Timestamp]]],
|
14768
|
+
range_start: Optional[Union[str, pd.Timestamp]],
|
14769
|
+
range_end: Optional[Union[str, pd.Timestamp]],
|
14770
|
+
step: Optional[Union[str, pd.Timedelta]],
|
14771
|
+
labels: Optional[List[str]],
|
14772
|
+
label_format: Optional[Union[str, Callable]],
|
14773
|
+
datetime_format: str,
|
14774
|
+
right: bool,
|
14775
|
+
include_underflow: bool,
|
14776
|
+
include_overflow: bool,
|
14777
|
+
) -> Tuple[pd.Categorical, List[str]]:
|
14778
|
+
"""Process datetime column with accurate counting."""
|
14779
|
+
col = pd.to_datetime(col)
|
14780
|
+
|
14781
|
+
# Handle bin edges
|
14782
|
+
if bins is None:
|
14783
|
+
if step is None:
|
14784
|
+
raise ValueError("Step must be provided for datetime binning")
|
14785
|
+
|
14786
|
+
# Convert step to pandas frequency string
|
14787
|
+
step_freq = _process_time_step(step)
|
14788
|
+
|
14789
|
+
# Set default range if needed
|
14790
|
+
range_start = (
|
14791
|
+
pd.to_datetime(range_start) if range_start is not None else col.min()
|
14792
|
+
)
|
14793
|
+
range_end = pd.to_datetime(range_end) if range_end is not None else col.max()
|
14794
|
+
|
14795
|
+
# Generate bins
|
14796
|
+
try:
|
14797
|
+
bin_edges = pd.date_range(start=range_start, end=range_end, freq=step_freq)
|
14798
|
+
if len(bin_edges) == 0:
|
14799
|
+
bin_edges = pd.date_range(start=range_start, end=range_end, periods=2)
|
14800
|
+
elif bin_edges[-1] < range_end:
|
14801
|
+
bin_edges = bin_edges.append(pd.DatetimeIndex([range_end]))
|
14802
|
+
except ValueError as e:
|
14803
|
+
raise ValueError(f"Invalid frequency specification: {step_freq}") from e
|
14804
|
+
elif isinstance(bins, int):
|
14805
|
+
bin_edges = pd.date_range(start=col.min(), end=col.max(), periods=bins + 1)
|
14806
|
+
else:
|
14807
|
+
bin_edges = pd.to_datetime(bins)
|
14808
|
+
|
14809
|
+
# Add overflow/underflow bins
|
14810
|
+
if include_underflow:
|
14811
|
+
bin_edges = bin_edges.insert(0, pd.Timestamp.min)
|
14812
|
+
if include_overflow:
|
14813
|
+
bin_edges = bin_edges.append(pd.DatetimeIndex([pd.Timestamp.max]))
|
14814
|
+
|
14815
|
+
# Perform the cut - this is where we ensure proper binning
|
14816
|
+
binned = pd.cut(
|
14817
|
+
col.astype("int64"), # Convert to nanoseconds for precise binning
|
14818
|
+
bins=bin_edges.astype("int64"),
|
14819
|
+
right=right,
|
14820
|
+
include_lowest=True,
|
14821
|
+
)
|
14822
|
+
|
14823
|
+
# Generate labels if not provided
|
14824
|
+
if labels is None:
|
14825
|
+
labels = []
|
14826
|
+
for i in range(len(bin_edges) - 1):
|
14827
|
+
left = bin_edges[i]
|
14828
|
+
right_ = bin_edges[i + 1]
|
14829
|
+
|
14830
|
+
# Handle special cases
|
14831
|
+
if left == pd.Timestamp.min:
|
14832
|
+
left_str = "<"
|
14833
|
+
else:
|
14834
|
+
left_str = left.strftime(datetime_format)
|
14835
|
+
|
14836
|
+
if right_ == pd.Timestamp.max:
|
14837
|
+
right_str = ">"
|
14838
|
+
else:
|
14839
|
+
right_str = right_.strftime(datetime_format)
|
14840
|
+
|
14841
|
+
# Apply label formatting
|
14842
|
+
if callable(label_format):
|
14843
|
+
label = label_format(left, right_)
|
14844
|
+
elif isinstance(label_format, str):
|
14845
|
+
try:
|
14846
|
+
if left != pd.Timestamp.min and right_ != pd.Timestamp.max:
|
14847
|
+
label = f"{left.strftime(label_format)}-{right_.strftime(label_format)}"
|
14848
|
+
else:
|
14849
|
+
label = f"{left_str}-{right_str}"
|
14850
|
+
except (ValueError, AttributeError):
|
14851
|
+
label = f"{left_str}-{right_str}"
|
14852
|
+
else:
|
14853
|
+
label = f"{left_str}-{right_str}"
|
14854
|
+
|
14855
|
+
labels.append(label)
|
14856
|
+
|
14857
|
+
return binned, labels
|
14858
|
+
|
14859
|
+
|
14860
|
+
+    def _process_categorical_column(
+        col: pd.Series,
+        bins: Optional[Union[int, List[str]]],
+        labels: Optional[List[str]],
+        categorical_agg: str,
+    ) -> Tuple[pd.Categorical, List[str]]:
+        value_counts = col.value_counts(normalize=(categorical_agg == "ratio"))
+
+        if bins is not None and isinstance(bins, int):
+            top_categories = value_counts.head(bins).index
+            binned = col.where(col.isin(top_categories), "Other")
+        elif isinstance(bins, list):
+            binned = col.where(col.isin(bins), "Other")
+        else:
+            binned = col
+
+        binned = binned.astype("category")
+
+        if labels is not None:
+            binned = binned.cat.rename_categories(dict(zip(binned.cat.categories, labels)))
+
+        return binned, list(binned.cat.categories)
+
+
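For categorical data the helper keeps either the `bins` most frequent categories (when `bins` is an int) or an explicit whitelist (when `bins` is a list) and lumps everything else into "Other". A toy illustration of the top-N case, independent of py2ls:

```python
import pandas as pd

s = pd.Series(["a", "a", "b", "b", "b", "c", "d"])
top = s.value_counts().head(2).index              # two most frequent categories
binned = s.where(s.isin(top), "Other").astype("category")
print(binned.value_counts())                       # counts: b -> 3, a -> 2, Other -> 2
```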
+    def _process_numeric_column(
+        col: pd.Series,
+        bins: Optional[Union[int, List[float]]],
+        range_start: Optional[float],
+        range_end: Optional[float],
+        step: Optional[float],
+        labels: Optional[List[str]],
+        label_format: Optional[Union[str, Callable]],
+        precision: int,
+        right: bool,
+        include_underflow: bool,
+        include_overflow: bool,
+    ) -> Tuple[pd.Categorical, List[str]]:
+        if bins is None:
+            if range_start is None or step is None:
+                raise ValueError("If bins not provided, must set range_start and step")
+            if range_end is None:
+                range_end = col.max()
+
+            bin_edges = list(np.arange(range_start, range_end + step, step))
+        elif isinstance(bins, int):
+            bin_edges = np.linspace(col.min(), col.max(), bins + 1).tolist()
+        else:
+            bin_edges = list(bins)
+
+        # Add overflow/underflow bins if needed
+        if include_underflow and not np.isinf(bin_edges[0]):
+            bin_edges.insert(0, float("-inf"))
+        if include_overflow and not np.isinf(bin_edges[-1]):
+            bin_edges.append(float("inf"))
+
+        # Generate labels if not provided
+        if labels is None:
+            labels = []
+            for i in range(len(bin_edges) - 1):
+                left = round(bin_edges[i], precision)
+                right_ = round(bin_edges[i + 1], precision)
+
+                if label_format:
+                    label = (
+                        label_format(left, right_)
+                        if callable(label_format)
+                        else label_format.format(left=left, right=right_)
+                    )
+                else:
+                    if np.isinf(left) and left < 0:
+                        label = f"<{right_}"
+                    elif np.isinf(right_):
+                        label = f">{left}"
+                    else:
+                        label = f"[{left}, {right_}{']' if right else ')'}"
+
+                labels.append(label)
+
+        binned = pd.cut(
+            col, bins=bin_edges, labels=labels, right=right, include_lowest=True
+        )
+        return binned, labels
+
+
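For numeric data the edges come from range_start/step, a bin count, or an explicit list, optionally padded with -inf/+inf underflow and overflow edges; the default labels then read "<x" and ">x" for those open-ended bins. A small self-contained example mimicking that labeling (values and edges are invented):

```python
import numpy as np
import pandas as pd

values = pd.Series([3, 7, 12, 25, 40])
edges = list(np.arange(0, 30 + 10, 10))        # [0, 10, 20, 30]
edges.insert(0, float("-inf"))                  # underflow bin
edges.append(float("inf"))                      # overflow bin
labels = ["<0", "[0, 10)", "[10, 20)", "[20, 30)", ">30"]

binned = pd.cut(values, bins=edges, labels=labels, right=False, include_lowest=True)
print(binned.tolist())                          # ['[0, 10)', '[0, 10)', '[10, 20)', '[20, 30)', '>30']
```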
+    def _handle_na_values(
+        col: pd.Series, na_action: str, na_fill_value: Optional[str]
+    ) -> pd.Series:
+        if na_action == "drop":
+            return col.dropna()
+        elif na_action == "fill" and na_fill_value is not None:
+            return col.fillna(na_fill_value)
+        return col
+
+
+    def _add_statistical_labels(
+        binned: pd.Categorical,
+        labels: List[str],
+        show_count: bool,
+        show_percentage: bool,
+        show_total_count: bool,
+        symbol_count: str,
+        symbol_percentage: str,
+        symbol_total_count: str,
+        sep_between: str,
+    ) -> List[str]:
+        """Add statistical information with accurate counts."""
+        # Get counts by matching the exact bin intervals
+        value_counts = binned.value_counts()
+        total = len(binned.dropna())
+
+        new_labels = []
+        for i, (label, category) in enumerate(zip(labels, binned.cat.categories)):
+            count = value_counts.get(category, 0)
+            parts = [label]
+
+            if show_count:
+                parts.append(f"{symbol_count}{count}")
+            if show_percentage:
+                percentage = (count / total * 100) if total > 0 else 0
+                parts.append(f"{percentage:.1f}{symbol_percentage}")
+            if show_total_count:
+                parts.append(f"{symbol_total_count}{total}")
+
+            # Ensure unique labels
+            new_label = sep_between.join(parts)
+            if new_label in new_labels:
+                new_label = f"{new_label}_{i}"
+            new_labels.append(new_label)
+
+        return new_labels
+
+
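_add_statistical_labels decorates each bin label with its count, percentage, and/or grand total using configurable symbols and a separator. A rough sketch of the same decoration in plain pandas, assuming "n=" and "%"-style symbols and a space separator (not the library's actual defaults):

```python
import pandas as pd

binned = pd.cut(pd.Series([1, 2, 8, 9]), bins=[0, 5, 10], labels=["low", "high"])
counts = binned.value_counts()
total = len(binned.dropna())

decorated = [
    f"{label} n={counts.get(label, 0)} {counts.get(label, 0) / total * 100:.1f}%"
    for label in binned.cat.categories
]
print(decorated)   # ['low n=2 50.0%', 'high n=2 50.0%']
```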
+    def _sort_bin_labels(binned: pd.Categorical, labels: List[str]) -> pd.Categorical:
+        try:
+            # Attempt to sort by the underlying intervals
+            sorted_categories = sorted(binned.cat.categories)
+            binned = binned.cat.reorder_categories(sorted_categories, ordered=True)
+        except Exception:
+            # If sorting fails (e.g., string labels), fallback to given label order
+            binned = binned.cat.set_categories(labels, ordered=True)
+        return binned
+    # Input validation
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' not found in DataFrame")
+
+    if not inplace:
+        df = df.copy()
+
+    col_data = df[column]
+
+    # Determine column type
+    if is_datetime64_any_dtype(col_data):
+        col_type = "datetime"
+        col_data = pd.to_datetime(col_data)
+    elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == "object":
+        col_type = "categorical"
+    elif is_numeric_dtype(col_data):
+        col_type = "numeric"
+    else:
+        raise TypeError(f"Unsupported column type: {col_data.dtype}")
+
+    # Handle dictionary bin specification
+    if isinstance(bins, dict):
+        range_start = bins.get("start", range_start)
+        range_end = bins.get("end", range_end)
+        step = bins.get("step", step)
+        bins = None
+
+    # Process based on column type
+    if col_type == "datetime":
+        binned, bin_labels = _process_datetime_column(
+            col_data,
+            bins,
+            range_start,
+            range_end,
+            step,
+            labels,
+            label_format,
+            datetime_format,
+            right,
+            include_underflow,
+            include_overflow,
+        )
+    elif col_type == "categorical":
+        binned, bin_labels = _process_categorical_column(
+            col_data, bins, labels, categorical_agg
+        )
+    else:
+        binned, bin_labels = _process_numeric_column(
+            col_data,
+            bins,
+            range_start,
+            range_end,
+            step,
+            labels,
+            label_format,
+            precision,
+            right,
+            include_underflow,
+            include_overflow,
+        )
+
+    # Handle NA values
+    binned = _handle_na_values(binned, na_action, na_fill_value)
+
+    # Add statistical information to labels if requested
+    if show_count or show_percentage or show_total_count:
+        bin_labels = _add_statistical_labels(
+            binned,
+            bin_labels,
+            show_count,
+            show_percentage,
+            show_total_count,
+            symbol_count,
+            symbol_percentage,
+            symbol_total_count,
+            sep_between,
+        )
+        binned = binned.cat.rename_categories(
+            dict(zip(binned.cat.categories, bin_labels))
+        )
+
+    # Sort labels if requested
+    if sort_labels and not right and len(bin_labels) > 1:
+        binned = _sort_bin_labels(binned, bin_labels)
+
+    # Create final output column
+    new_col = new_col_name or f"{column}_binned"
+    df[new_col] = binned.astype(dtype) if dtype else binned
+
+    if drop_original:
+        df.drop(columns=[column], inplace=True)
+
+    return None if inplace else df
+


 def df_encoder(
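The hunk above dispatches on the column's dtype (datetime first, then categorical/object, then numeric) before delegating to the matching helper. The snippet below isolates just that dispatch for illustration; `column_kind` is a hypothetical stand-alone helper, not part of py2ls:

```python
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype

def column_kind(col: pd.Series) -> str:
    # Mirrors the dispatch order used in the hunk above
    if is_datetime64_any_dtype(col):
        return "datetime"
    if isinstance(col.dtype, pd.CategoricalDtype) or col.dtype == "object":
        return "categorical"
    if is_numeric_dtype(col):
        return "numeric"
    raise TypeError(f"Unsupported column type: {col.dtype}")

df = pd.DataFrame({
    "when": pd.to_datetime(["2024-01-01", "2024-02-01"]),
    "group": ["a", "b"],
    "value": [1.5, 2.5],
})
print({c: column_kind(df[c]) for c in df.columns})
# {'when': 'datetime', 'group': 'categorical', 'value': 'numeric'}
```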
@@ -16300,7 +17676,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):
 def use_pd(
     func_name="excel",
     verbose=True,
-    dir_json="
+    dir_json="./data/usages_pd.json",
 ):
     try:
         default_settings = fload(dir_json, output="json")