py2ls 0.2.5.14__py3-none-any.whl → 0.2.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -1,18 +1,18 @@
  from tkinter import FALSE
  import numpy as np
  import pandas as pd
- import sys
- import os
+ import sys # built-in
+ import os # built-in
  from IPython.display import display
  import shutil
  import logging
  from pathlib import Path
  from datetime import datetime, date, time
- import re
+ import re # built-in
  import stat
  import platform
 
- from typing import Dict, List, Optional, Union, Any, Tuple, Literal
+ from typing import Dict, List, Optional, Union, Any, Tuple, Literal, Callable, Set
  from regex import X
 
  try:
@@ -26,7 +26,218 @@ import warnings
  warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
  warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
  warnings.filterwarnings("ignore")
+ try:
+     import pkg_resources  # provided by setuptools, not actually built-in
+ except ImportError:
+     pkg_resources = None
+ import glob  # built-in
+ import subprocess  # built-in; needed for the pip calls below
+ class PkgManager:
+     """
+     PkgManager.uninstall("py2ls")
+     PkgManager.uninstall("py2ls", mode="startswith")
+     PkgManager.uninstall("py2ls", mode="endswith")
+     PkgManager.uninstall("py2ls", mode="contains")
+     PkgManager.uninstall("py2ls", mode="regex")
+
+     PkgManager.timemachine()
+     """
 
+     @staticmethod
+     def uninstall(
+         kw: Union[str, List[str]],
+         mode: str = "exact",
+         dry_run: bool = False,
+         make_backup: bool = True,
+         make_log: bool = True,
+         station: Optional[str] = None,
+     ) -> None:
+         if station is None:
+             station = os.path.dirname(os.path.dirname(sys.executable))
+         os.makedirs(station, exist_ok=True)
+         timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+         if isinstance(kw, str):
+             kw = [kw]
+         kw = [k.lower() for k in kw] if mode != "regex" else kw
+         mode = mode.lower()
+         valid_modes = {"exact", "startswith", "endswith", "contains", "regex"}
+         if mode not in valid_modes:
+             raise ValueError(f"Mode must be one of {valid_modes}")
+
+         installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
+         matched: Set[str] = set()
+
+         for name in installed_packages:
+             for key in kw:
+                 if (
+                     (mode == "exact" and name == key)
+                     or (mode == "startswith" and name.startswith(key))
+                     or (mode == "endswith" and name.endswith(key))
+                     or (mode == "contains" and key in name)
+                     or (mode == "regex" and re.search(key, name))
+                 ):
+                     matched.add(name)
+                     break
+
+         if not matched:
+             print("No packages matched the criteria.")
+             return
+
+         if make_backup and not dry_run:
+             backup_path = os.path.join(station, f"requirements_backup_{timestamp}.txt")
+             with open(backup_path, "w") as f:
+                 subprocess.run(["pip", "freeze"], stdout=f, check=True)
+             print(f"Backup created at: '{backup_path}'")
+
+         if dry_run:
+             print("[DRY RUN] The following packages would be uninstalled:")
+             for pkg in sorted(matched):
+                 print(f" - {pkg}=={installed_packages[pkg]}")
+             return
+
+         print(f"[UNINSTALLING] {len(matched)} packages:")
+         for pkg in sorted(matched):
+             print(f" - {pkg}=={installed_packages[pkg]}")
+             subprocess.run(["pip", "uninstall", "-y", pkg], check=True)
+
+         if make_log:
+             log_path = os.path.join(station, f"uninstall_{timestamp}.txt")
+             with open(log_path, "w") as f:
+                 f.write(f"# Uninstallation log created at {timestamp}\n")
+                 f.write(f"# Mode: {mode}, Keywords: {kw}\n\n")
+                 for pkg in sorted(matched):
+                     f.write(f"{pkg}=={installed_packages[pkg]}\n")
+             print(f"Log written to '{log_path}'")
+
+     @staticmethod
+     def list_backups(station: Optional[str] = None) -> List[str]:
+         if station is None:
+             station = os.path.dirname(sys.executable)
+             if os.name == "nt":
+                 station = os.path.dirname(station)
+         return sorted(glob.glob(os.path.join(station, "requirements_backup_*.txt")))
+
+     @staticmethod
+     def list_logs(station: Optional[str] = None) -> List[str]:
+         if station is None:
+             station = os.path.dirname(sys.executable)
+             if os.name == "nt":
+                 station = os.path.dirname(station)
+         return sorted(glob.glob(os.path.join(station, "uninstall_*.txt")))
+
+     @staticmethod
+     def restore(
+         timestamp: Optional[str] = None,
+         station: Optional[str] = None,
+         dry_run: bool = False,
+     ) -> None:
+         if station is None:
+             station = os.path.dirname(sys.executable)
+             if os.name == "nt":
+                 station = os.path.dirname(station)
+
+         backups = PkgManager.list_backups(station)
+         logs = PkgManager.list_logs(station)
+
+         if not timestamp:
+             print("Available restore points:\n\nBackups:")
+             for i, backup in enumerate(backups, 1):
+                 ts = os.path.basename(backup)[18:-4]
+                 print(f" {i}. {ts} (backup)")
+             print("\nUninstall logs:")
+             for i, log in enumerate(logs, len(backups) + 1):
+                 ts = os.path.basename(log)[10:-4]
+                 print(f" {i}. {ts} (log)")
+             print("\nSpecify timestamp or selection number to restore.")
+             return
+
+         try:
+             selection = int(timestamp)
+             all_files = backups + logs
+             if 1 <= selection <= len(all_files):
+                 file_path = all_files[selection - 1]
+                 is_log = selection > len(backups)
+             else:
+                 raise ValueError("Invalid selection number")
+         except ValueError:
+             backup_pattern = os.path.join(
+                 station, f"requirements_backup_{timestamp}.txt"
+             )
+             log_pattern = os.path.join(station, f"uninstall_{timestamp}.txt")
+             matching_backups = glob.glob(backup_pattern)
+             matching_logs = glob.glob(log_pattern)
+
+             if matching_backups:
+                 file_path = matching_backups[0]
+                 is_log = False
+             elif matching_logs:
+                 file_path = matching_logs[0]
+                 is_log = True
+             else:
+                 print(f"No backup or log found for timestamp: {timestamp}")
+                 return
+
+         with open(file_path, "r") as f:
+             packages = [
+                 line.strip() for line in f if line.strip() and not line.startswith("#")
+             ]
+
+         if dry_run:
+             print(
+                 f"[DRY RUN] Would restore {len(packages)} packages from:\n {file_path}"
+             )
+             for pkg in packages:
+                 print(f" - {pkg}")
+             return
+
+         print(f"[RESTORING] {len(packages)} packages from:\n {file_path}")
+         for pkg in packages:
+             print(f" - Installing {pkg}")
+             subprocess.run(["pip", "install", pkg], check=True)
+
+     @staticmethod
+     def timemachine(station: Optional[str] = None) -> None:
+         if station is None:
+             station = os.path.dirname(sys.executable)
+             if os.name == "nt":
+                 station = os.path.dirname(station)
+
+         backups = PkgManager.list_backups(station)
+         logs = PkgManager.list_logs(station)
+
+         if not backups and not logs:
+             print("No backup or log files found.")
+             return
+
+         print("\nTime Machine - Available Restore Points:")
+         print("--------------------------------------")
+         print("\nBackups (complete environment snapshots):")
+         for i, backup in enumerate(backups, 1):
+             ts = os.path.basename(backup)[18:-4]
+             print(f" {i}. {ts}")
+         print("\nUninstall Logs (specific package lists):")
+         for i, log in enumerate(logs, len(backups) + 1):
+             ts = os.path.basename(log)[10:-4]
+             print(f" {i}. {ts}")
+         print("\n0. Exit Time Machine")
+
+         while True:
+             try:
+                 choice = input("\nSelect a restore point (number) or '0' to exit: ")
+                 if choice == "0":
+                     return
+                 selection = int(choice)
+                 all_files = backups + logs
+                 if 1 <= selection <= len(all_files):
+                     file_path = all_files[selection - 1]
+                     timestamp = os.path.basename(file_path).split("_")[-1][:-4]
+                     PkgManager.restore(timestamp, station)
+                     return
+                 else:
+                     print("Invalid selection. Please try again.")
+             except ValueError:
+                 print("Please enter a valid number.")
 
  def _yaoshi_fernet(mima="mimashigudingde",yan=b"mimashigudingde",verbose=True):
      import base64
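
For orientation, a minimal usage sketch of the new PkgManager helper (a hypothetical session; per the code above, backups and logs default to the directory two levels above the interpreter):

    from py2ls.ips import PkgManager

    # Preview what a pattern would remove, then uninstall with a backup:
    PkgManager.uninstall("py2ls", mode="startswith", dry_run=True)
    PkgManager.uninstall("py2ls", mode="startswith")
    # Browse the recorded snapshots and roll back interactively:
    PkgManager.timemachine()
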
@@ -5688,7 +5899,7 @@ def fload(fpath, kind=None, **kwargs):
          if output in ["dataframe", "df"]:
              if verbose:
                  print("loading data as a DataFrame")
-             if not password:
+             if not bool(password):
                  if verbose:
                      print("Reading Excel without password protection...")
                  df = pd.read_excel(fpath, engine=engine, sheet_name=sheet_name, **kwargs)
@@ -6636,27 +6847,6 @@ def fsave(
          print(
              f"Error:\n{kind} is not in the supported list ['docx', 'txt', 'md', 'html', 'pdf', 'csv', 'xlsx', 'json', 'xml', 'yaml']"
          )
-
-
- # # Example usage
- # text_content = ["Hello, this is a sample text file.", "This is the second paragraph."]
- # tabular_content = {"Name": ["Alice", "Bob"], "Age": [24, 30]}
- # json_content = {"name": "Alice", "age": 24}
- # yaml_content = {"Name": "Alice", "Age": 24}
- # xml_content = {"Name": "Alice", "Age": 24}
- # dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
- # fsave(dir_save + "sample.txt", text_content)
- # fsave(dir_save + "sample.md", text_content)
- # fsave(dir_save + "sample.html", text_content)
- # fsave(dir_save + "sample.pdf", text_content)
- # fsave(dir_save + "sample.docx", text_content)
- # fsave(dir_save + "sample.csv", tabular_content, index=False)
- # fsave(dir_save + "sample.xlsx", tabular_content, sheet_name="Sheet1", index=False)
- # fsave(dir_save + "sample.json", json_content, indent=4)
- # fsave(dir_save + "sample.yaml", yaml_content)
- # fsave(dir_save + "sample.xml", xml_content)
-
-
  def addpath(fpath):
      sys.path.insert(0, dir)
 
@@ -9410,12 +9600,6 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
      else:
          figsave(dirname(dir_save), fname)
 
-
- # usage:
- # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
- # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
-
-
  # search for and find the directory of the library, which is installed locally
  def dir_lib(lib_oi):
      """
@@ -9832,7 +10016,7 @@ def hex2argb(color):
 
      # Validate hex format
      if not re.fullmatch(r"[A-F0-9]{6,8}", color):
-         raise ValueError(f"格式错误❌: {color}, 应该使用 RRGGBB, #RRGGBB, or aARRGGBB format.")
+         raise ValueError(f"Format error: {color}; use RRGGBB, #RRGGBB, or aARRGGBB format.")
 
      # If already in aARRGGBB format (8 chars), return as is
      if len(color) == 8:
@@ -10032,7 +10216,526 @@ def copy_format(
          wb_source.close()
          if "wb_target" in locals():
              wb_target.close()
+ # ! =========(below) interact with workbook and DataFrame===========
+ import pandas as pd
+ from openpyxl import load_workbook
+ from openpyxl.workbook.workbook import Workbook
+ from openpyxl.utils import get_column_letter
+
+ class DataFrameAlignExcel:
+     """
+     A tool for updating Excel files with data from DataFrames, using various matching strategies.
+
+     Features:
+     - Accepts either a file path or an open Workbook object
+     - Multiple matching strategies (exact, contains, starts_with, ends_with, regex)
+     - Multiple value update strategies (overwrite, add, subtract, multiply, divide, append)
+     - Support for multiple worksheets
+     - Automatic column creation
+     - Value normalization options
+     - Detailed logging and dry-run mode
+     - Progress reporting
+     - Data validation
+     - Backup functionality
+     """
+
+     def __init__(self, fpath: Union[str, Workbook], df: pd.DataFrame = None):
+         """
+         Initialize the DataFrameAlignExcel.
+
+         Args:
+             fpath: Path to the Excel file (str) or open Workbook object
+             df: Optional DataFrame to use for updates
+         """
+         self.fpath_or_wb = fpath
+         self.df = df
+         self.wb = None
+         self.backup_path = None
+         self.log = []
+         self.owns_workbook = (
+             False  # Track whether we created the workbook or it was passed in
+         )
+
+     def load_workbook(self) -> None:
+         """Load the Excel workbook if a path was provided."""
+         if isinstance(self.fpath_or_wb, str):
+             if not os.path.exists(self.fpath_or_wb):
+                 raise FileNotFoundError(f"Excel file not found: {self.fpath_or_wb}")
+             self.wb = load_workbook(self.fpath_or_wb)
+             self.owns_workbook = True
+         elif isinstance(self.fpath_or_wb, Workbook):
+             self.wb = self.fpath_or_wb
+             self.owns_workbook = False
+         else:
+             raise TypeError(
+                 "fpath must be either a string path or an openpyxl Workbook object"
+             )
+
+     def create_make_backup(self) -> None:
+         """Create a backup of the original Excel file (only if we loaded from a file)."""
+         if not isinstance(self.fpath_or_wb, str):
+             self.log.append(
+                 "Skipping backup - working with a Workbook object directly"
+             )
+             return
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         self.backup_path = os.path.join(
+             os.path.dirname(self.fpath_or_wb),
+             f"backup_{timestamp}_{os.path.basename(self.fpath_or_wb)}",
+         )
+         self.wb.save(self.backup_path)
+         self.log.append(f"Created backup at: {self.backup_path}")
+
+     def save_workbook(self, dir_save: str = None) -> None:
+         """
+         Save the workbook to a file.
+
+         Args:
+             dir_save: Optional path to save to. If None and we loaded from a file,
+                 saves to the original path.
+         """
+         if self.wb is None:
+             raise ValueError("No workbook loaded")
+
+         if dir_save is None:
+             if isinstance(self.fpath_or_wb, str):
+                 dir_save = self.fpath_or_wb
+             else:
+                 dir_save = datetime.now().strftime("%Y%m%d_%H%M%S") + ".xlsx"
+                 print(
+                     f"No save path provided and the original input was a Workbook object, so saving to: {dir_save}"
+                 )
+         self.wb.save(dir_save)
+         self.log.append(f"Saved workbook to: {dir_save}")
+
+     def normalize_value(self, value, clean_keys: str = "strip_split_first") -> str:
+         """
+         Normalize a value based on the specified method.
+
+         Args:
+             value: Value to normalize
+             clean_keys: One of:
+                 - 'strip': just strip whitespace
+                 - 'strip_lower': strip and lowercase
+                 - 'strip_split_first': strip and take first part before comma
+                 - 'strip_split_last': strip and take last part after comma
+                 - None: no normalization
+
+         Returns:
+             Normalized value
+         """
+         if value is None:
+             return None
+
+         value = str(value)
+
+         if clean_keys is None:
+             return value
+
+         if clean_keys == "strip":
+             return value.strip()
+         elif clean_keys == "strip_lower":
+             return value.strip().lower()
+         elif clean_keys == "strip_split_first":
+             return value.strip().split(",")[0].strip()
+         elif clean_keys == "strip_split_last":
+             parts = value.strip().split(",")
+             return parts[-1].strip() if len(parts) > 1 else value.strip()
+         else:
+             warnings.warn(f"Unknown clean_keys: {clean_keys}. Using 'strip'.")
+             return value.strip()
+
+     def find_column_index(self, ws, header_row: int, column_name: str, max_search_columns: int = 100) -> int:
+         """
+         Efficiently find the column index (1-based) for a given column name,
+         considering only non-empty cells and limiting the search range.
+         (An earlier, commented-out variant instead fell back to the last
+         non-empty header column when the name was not found.)
+
+         Args:
+             ws: Worksheet object
+             header_row: Row number containing headers (1-based)
+             column_name: Column name to find
+             max_search_columns: Max number of columns to search (to prevent unbounded scans)
+
+         Returns:
+             Column index (1-based), or -1 if not found
+         """
+         row_iter = ws.iter_rows(min_row=header_row, max_row=header_row, max_col=max_search_columns, values_only=False)
+         for row in row_iter:
+             for cell in row:
+                 if cell.value and str(cell.value).strip().lower() == column_name.lower():
+                     return cell.column
+             break  # Only process the header row
+         return -1
+
+     def update_values(
+         self,
+         df: pd.DataFrame = None,
+         sheet_name: Union[str, int, List[Union[str, int]]] = 0,
+         header_row: int = 1,
+         column_match: Union[Dict[str, str], List[Tuple[str, str]]] = None,
+         column_mapping: Union[Dict[str, str], List[Tuple[str, str]]] = None,
+         clean_keys: str = "strip_split_first",
+         match_method: str = "exact",
+         update_strategy: str = "overwrite",
+         create_missing_columns: bool = True,
+         preview_only: bool = False,
+         show_progress: bool = True,
+         skip_no_match: bool = True,
+         make_backup: bool = True,
+         dir_save: str = None,
+         row_max=500,
+     ) -> Dict[str, int]:
+         """
+         Update Excel with values from a DataFrame.
+
+         Args:
+             df: DataFrame containing update data (if None, uses self.df)
+             sheet_name: Sheet name(s) to update (str, int, or list of these)
+             header_row: Row number containing headers (1-based)
+             column_match: Dict or list of tuples mapping DataFrame columns to Excel columns for matching,
+                 e.g., {'SampleID': 'ID'} or [('SampleID', 'ID'), ('Batch', 'Lot')]
+             column_mapping: Dict or list of tuples mapping DataFrame columns to Excel columns to update,
+                 e.g., {'Vials': 'Qty'} or [('Vials', 'Qty'), ('Status', 'State')]
+             clean_keys: How to normalize matching values (see normalize_value())
+             match_method: How to match values ('exact', 'contains', 'starts_with', 'ends_with', 'regex')
+             update_strategy: How to update values ('overwrite', 'add', 'subtract', 'multiply', 'divide', 'append')
+             create_missing_columns: Whether to create columns that don't exist
+             preview_only: If True, don't actually update the Excel file
+             show_progress: If True, print progress updates
+             skip_no_match: If True, skip rows where match columns don't match
+             make_backup: If True, create a backup before updating (only if working with a file path)
+             dir_save: Optional path to save to. If None and we loaded from a file,
+                 saves to the original path. Ignored if preview_only=True.
+
+         Returns:
+             Dictionary with update statistics
+         """
+         # Initialize
+         start_time = datetime.now()
+         if df is None:
+             df = self.df
+         if df is None:
+             raise ValueError("No DataFrame provided")
+
+         if not isinstance(column_match, (dict, list)) or not column_match:
+             raise ValueError(
+                 "column_match must be a non-empty dict or list of tuples"
+             )
+
+         if not isinstance(column_mapping, (dict, list)) or not column_mapping:
+             raise ValueError("column_mapping must be a non-empty dict or list of tuples")
+
+         # Convert match/update columns to a consistent format
+         if isinstance(column_match, dict):
+             column_match = list(column_match.items())
+         if isinstance(column_mapping, dict):
+             column_mapping = list(column_mapping.items())
+
+         # Load workbook if not already loaded
+         if self.wb is None:
+             self.load_workbook()
+
+         # Create a backup (only happens when working with a file path)
+         if not preview_only:
+             self.create_make_backup()
+
+         # Prepare statistics
+         stats = {
+             "processed_sheet_names": [],
+             "processed_sheets": 0,
+             "total_updates": 0,
+             "skipped_rows": 0,
+             "created_columns": 0,
+         }
+
+         # Normalize sheet names
+         if not isinstance(sheet_name, list):
+             sheet_name = [sheet_name]
+
+         # Process each sheet
+         for sheet in sheet_name:
+             try:
+                 if isinstance(sheet, str):
+                     ws = self.wb[sheet]
+                 elif isinstance(sheet, int):
+                     ws = self.wb.worksheets[sheet]
+                 else:
+                     ws = self.wb.active
+
+                 sheet_name = ws.title
+                 self.log.append(f"\nProcessing sheet: {sheet_name}")
+
+                 # Prepare matching data
+                 match_dict = {}
+                 for df_col, excel_col in column_match:
+                     if clean_keys:
+                         match_dict[excel_col] = dict(
+                             zip(
+                                 df[df_col].apply(
+                                     lambda x: self.normalize_value(x, clean_keys)
+                                 ),
+                                 df.index,
+                             )
+                         )
+                     else:
+                         match_dict[excel_col] = dict(zip(df[df_col], df.index))
+
+                 # Find or create update columns
+                 update_col_indices = {}
+                 for df_col, excel_col in column_mapping:
+                     col_idx = self.find_column_index(ws, header_row, excel_col)
+                     if col_idx == -1:
+                         if create_missing_columns:
+                             # Find last column
+                             last_col = max(
+                                 [cell.column for cell in ws[header_row] if cell.value is not None], default=0
+                             )
+                             col_idx = last_col + 1
+                             ws.cell(row=header_row, column=col_idx, value=excel_col)
+                             update_col_indices[excel_col] = col_idx
+                             stats["created_columns"] += 1
+                             self.log.append(
+                                 f"Created new column '{excel_col}' at position {col_idx}"
+                             )
+                         else:
+                             raise ValueError(
+                                 f"Column '{excel_col}' not found and create_missing_columns=False"
+                             )
+                     else:
+                         update_col_indices[excel_col] = col_idx
+
+                 # Process rows
+                 for row in ws.iter_rows(min_row=header_row + 1):
+                     match_values = {}
+                     match_failed = False
+
+                     for excel_col in match_dict.keys():
+                         col_idx = self.find_column_index(ws, header_row, excel_col)
+                         if col_idx == -1:
+                             if skip_no_match:
+                                 match_failed = True
+                                 break
+                             else:
+                                 raise ValueError(
+                                     f"Match column '{excel_col}' not found in sheet"
+                                 )
+
+                         cell_value = row[
+                             col_idx - 1
+                         ].value  # -1 because iter_rows returns a 0-based list
+                         if clean_keys:
+                             cell_value = self.normalize_value(cell_value, clean_keys)
+
+                         match_values[excel_col] = cell_value
+
+                     if match_failed:
+                         stats["skipped_rows"] += 1
+                         continue
+
+                     # Find matching DataFrame row
+                     df_index = None
+                     for excel_col, value in match_values.items():
+                         if value in match_dict[excel_col]:
+                             if df_index is None:
+                                 df_index = match_dict[excel_col][value]
+                             elif df_index != match_dict[excel_col][value]:
+                                 # Multiple match columns point to different rows - skip
+                                 df_index = None
+                                 break
+
+                     if df_index is None:
+                         stats["skipped_rows"] += 1
+                         continue
+
+                     # Update cells
+                     for df_col, excel_col in column_mapping:
+                         col_idx = update_col_indices[excel_col]
+                         cell = row[
+                             col_idx - 1
+                         ]  # -1 because iter_rows returns a 0-based list
+                         new_value = df.at[df_index, df_col]
+
+                         # Apply update strategy
+                         if update_strategy == "overwrite":
+                             cell.value = new_value
+                         elif update_strategy in (
+                             "add",
+                             "subtract",
+                             "multiply",
+                             "divide",
+                         ):
+                             try:
+                                 old_value = (
+                                     float(cell.value) if cell.value is not None else 0
+                                 )
+                                 new_value = (
+                                     float(new_value) if new_value is not None else 0
+                                 )
+                                 if update_strategy == "add":
+                                     cell.value = old_value + new_value
+                                 elif update_strategy == "subtract":
+                                     cell.value = old_value - new_value
+                                 elif update_strategy == "multiply":
+                                     cell.value = old_value * new_value
+                                 elif update_strategy == "divide":
+                                     cell.value = (
+                                         old_value / new_value
+                                         if new_value != 0
+                                         else old_value
+                                     )
+                             except (ValueError, TypeError):
+                                 if skip_no_match:
+                                     continue
+                                 raise ValueError(
+                                     f"Could not perform {update_strategy} operation on non-numeric values"
+                                 )
+                         elif update_strategy == "append":
+                             separator = ", " if cell.value else ""
+                             cell.value = (
+                                 f"{cell.value}{separator}{new_value}"
+                                 if cell.value
+                                 else new_value
+                             )
+                         else:
+                             raise ValueError(
+                                 f"Unknown update_strategy: {update_strategy}"
+                             )
+
+                         stats["total_updates"] += 1
+
+                 stats["processed_sheets"] += 1
+                 stats["processed_sheet_names"].append(sheet_name)
+             except Exception as e:
+                 self.log.append(f"Error processing sheet {sheet}: {str(e)}")
+                 if (
+                     not preview_only
+                     and self.backup_path
+                     and isinstance(self.fpath_or_wb, str)
+                 ):
+                     self.log.append("Restoring from backup due to error")
+                     self.wb = load_workbook(self.backup_path)
+                 raise
+
+         # Save changes if not a dry run
+         if not preview_only:
+             self.save_workbook(dir_save)
+             if not make_backup:
+                 if self.backup_path and os.path.exists(self.backup_path):
+                     os.remove(self.backup_path)
+         else:
+             self.log.append("\nDry run complete - no changes saved")
+
+         # Print summary
+         summary = (
+             f"\nUpdate Summary:\n"
+             f"\tProcessed {stats['processed_sheets']} sheets: {stats['processed_sheet_names']}\n"
+             f"\tTotal updates: {stats['total_updates']}\n"
+             f"\tSkipped rows: {stats['skipped_rows']}\n"
+         )
+         self.log.append(summary)
+
+         if show_progress:
+             print(summary)
 
+         return stats
+
+     def get_log(self) -> str:
+         """Get the operation log as a string."""
+         return "\n".join(self.log)
+
+     def close(self) -> None:
+         """Close the workbook if we own it."""
+         if self.wb is not None and self.owns_workbook:
+             self.wb.close()
+             self.wb = None
+
+
+ DFToExcelMapping = Union[Dict[str, str], List[Tuple[str, str]]]
+ def df_align(
+     fpath: Union[str, Workbook],
+     df: pd.DataFrame,
+     sheet_name: Union[str, int, List[Union[str, int]]] = 0,
+     header_row: int = 1,
+     column_match: DFToExcelMapping = None,
+     column_mapping: DFToExcelMapping = None,
+     clean_keys: str = "strip_split_first",
+     match_method: str = "exact",
+     update_strategy: str = "overwrite",
+     create_missing_columns: bool = True,
+     preview_only: bool = False,
+     show_progress: bool = True,
+     skip_no_match: bool = True,
+     make_backup: bool = True,
+     dir_save: str = None,
+ ) -> Dict[str, int]:
+     """
+     wb = fload(
+         dir_aml,
+         password="XBuzwVk4xsC2361cHzyi9JFgfJHaTSerjBOQ0JAJU24=",
+         sheet_name=0,
+         header=1,
+         output="bit",
+     )
+     ws = wb[wb.sheetnames[0]]
+     df_align(
+         fpath=wb,
+         df=df_,
+         sheet_name=None,
+         header_row=2,
+         column_match={"SampleID": "SampleID"},  # key is the df column name, value the Excel column name
+         column_mapping={"Vials": "Vials", "Vials_": "Total Vials"},  # key is the df column name, value the Excel column name
+     )
+     """
+     updater = DataFrameAlignExcel(fpath, df)
+     try:
+         result = updater.update_values(
+             sheet_name=sheet_name,
+             header_row=header_row,
+             column_match=column_match,
+             column_mapping=column_mapping,
+             clean_keys=clean_keys,
+             match_method=match_method,
+             update_strategy=update_strategy,
+             create_missing_columns=create_missing_columns,
+             preview_only=preview_only,
+             show_progress=show_progress,
+             skip_no_match=skip_no_match,
+             make_backup=make_backup,
+             dir_save=dir_save,
+         )
+         return result
+     finally:
+         updater.close()
+
+
+ # ! =========(Above) interact with workbook and DataFrame===========
  def set_sheet_visible(
      fpath: str,
      sheet_name: Union[int, str, None,list] = 1,
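
A minimal sketch of calling the new df_align wrapper (the file name and columns are illustrative, following the docstring example above):

    import pandas as pd
    from py2ls.ips import df_align

    updates = pd.DataFrame({"SampleID": ["S1", "S2"], "Vials": [3, 5]})
    stats = df_align(
        fpath="inventory.xlsx",                 # or an already-open openpyxl Workbook
        df=updates,
        column_match={"SampleID": "SampleID"},  # df column -> Excel column used as the key
        column_mapping={"Vials": "Vials"},      # df column -> Excel column to update
        preview_only=True,                      # dry run: nothing is saved
    )
    print(stats)  # counts of processed sheets, total updates, skipped rows, ...
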
@@ -10159,7 +10862,7 @@ def format_excel(
      number_format:dict=None, # dict: e.g., {1:"0.00", 2:"#,##0",3:"0%",4:"$#,##0.00"}
      data_validation=None, # dict
      template:dict={},# e.g., template=dict(path="xx.xlsx",sheet_name=['sheet_name1',"sheet_name2"])
-     apply_filter:bool=True, # add filter
+     apply_filter:bool=False, # add filter
      freeze :str= False,#"A2",
      conditional_format:dict=None, # dict
      verbose:bool=False,
@@ -10321,6 +11024,67 @@ def format_excel(
                  if end_col_letter
                  else f"{start_col_letter}{start_row}"
              )
+
+
+     def is_merged_cell(ws, cell):
+         """Check if a cell is part of any merged range."""
+         for merged_range in ws.merged_cells.ranges:
+             if cell.coordinate in merged_range:
+                 return True
+         return False
+
+     def apply_auto_width(ws, width_factor=1.2, width_padding=2, width_max=50):
+         """
+         Automatically adjust column widths based on content length,
+         with complete protection against merged cell errors.
+
+         Args:
+             ws: Worksheet object
+             width_factor: Multiplier for content length (default 1.2)
+             width_padding: Additional padding (default 2)
+             width_max: Maximum column width (default 50)
+         """
+         # First build a set of all merged cell coordinates
+         merged_coords = set()
+         for merged_range in ws.merged_cells.ranges:
+             for row in ws.iter_rows(min_row=merged_range.min_row,
+                                     max_row=merged_range.max_row,
+                                     min_col=merged_range.min_col,
+                                     max_col=merged_range.max_col):
+                 for cell in row:
+                     merged_coords.add(cell.coordinate)
+
+         for col in ws.columns:
+             if not col:
+                 continue
+
+             col_letter = get_column_letter(col[0].column)
+             max_length = 0
+
+             for cell in col:
+                 # Skip merged cells entirely
+                 if cell.coordinate in merged_coords:
+                     continue
+
+                 try:
+                     if cell.value is not None:
+                         # Handle both single-line and multi-line content
+                         cell_value = str(cell.value)
+                         lines = cell_value.split('\n')
+                         current_max = max(len(line) for line in lines)
+                         max_length = max(max_length, current_max)
+                 except Exception as e:
+                     print(f"Skipping cell {cell.coordinate} due to error: {e}")
+                     continue
+
+             # Calculate width with constraints
+             adjusted_width = min(
+                 max(1, (max_length * width_factor) + width_padding),
+                 width_max if width_max is not None else float('inf')
+             )
+
+             ws.column_dimensions[col_letter].width = adjusted_width
+
      def apply_color_to_worksheet(ws=None, sheet_name=None, conditions=None, cell_idx=None,where="text"):
          """
          Apply text color formatting to a specific cell range in an openpyxl workbook based on conditions.
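
If apply_auto_width were lifted out of format_excel, the merged-cell guard above could be exercised directly; a minimal sketch, assuming openpyxl is installed (the workbook contents are made up):

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws["A1"] = "a long header that should widen column A"
    ws.merge_cells("B1:C1")   # cells in this range are skipped by the width pass
    apply_auto_width(ws, width_factor=1.2, width_padding=2, width_max=50)
    print(ws.column_dimensions["A"].width)
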
@@ -10426,6 +11190,11 @@ def format_excel(
 
      def apply_format(ws, cell, cell_range):
          """Apply cell formatting to a specified range."""
+         # Get all merged cell coordinates first
+         # (CellRange.cells yields (row, col) tuples, so convert them to "A1"-style
+         # strings; otherwise the test against cell.coordinate below can never match)
+         merged_cells = set()
+         for merged_range in ws.merged_cells.ranges:
+             for row_, col_ in merged_range.cells:
+                 merged_cells.add(f"{get_column_letter(col_)}{row_}")
          cell_font, cell_fill, cell_alignment, border = None, None, None, None
          kws_cell = ["font", "fill", "alignment", "border"]
          for K, _ in cell.items():
@@ -10623,6 +11392,7 @@ def format_excel(
              )
              # get colors config
              for k, v in cell.get(K, {}).items():
+                 print(k, v, strcmp(k, kws_border)[0])
                  if strcmp(k, kws_border)[0] in ["color"]:
                      border_color_all = hex2argb(v)
                      # if "color" is set, all the other borders are given the same color
@@ -10753,6 +11523,8 @@ def format_excel(
          #! final apply configs
          for row in ws[cell_range]:
              for cell_ in row:
+                 if cell_.coordinate in merged_cells:
+                     continue  # Skip merged cells
                  if cell_font:
                      cell_.font = cell_font
                  if cell_fill:
@@ -10830,11 +11602,9 @@ def format_excel(
      if not os.path.exists(filename) or mode=="w":
          # ws=wb.active
          # ws.title = sheet_name
-         ws = wb.create_sheet(title=sheet_name)
-         print(1)
+         ws = wb.create_sheet(title=sheet_name)
      else:# file exists
-         wb = load_workbook(filename)
-         print(2)
+         wb = load_workbook(filename)
      # with pd.ExcelWriter(filename, mode="a", engine=engine, if_sheet_exists=if_sheet_exists) as writer:
      #     for ws in wb.worksheets: # Iterate through worksheets in the input workbook
      #         ws_df = pd.DataFrame(ws.values)
@@ -11161,44 +11931,62 @@ def format_excel(
      if freeze:
          ws.freeze_panes = freeze  # Freeze everything above and to the left of A2
      # !widths
-     if isinstance(width,bool):
+     if isinstance(width, bool):
          width=None if width else False
      if isinstance(height,bool):
          height=None if height else False
 
+     merged_cells = set()
+     for merged_range in ws.merged_cells.ranges:
+         for row in ws.iter_rows(min_row=merged_range.min_row,
+                                 max_row=merged_range.max_row,
+                                 min_col=merged_range.min_col,
+                                 max_col=merged_range.max_col):
+             for cell in row:
+                 merged_cells.add(cell.coordinate)
-     if width is None or width=={}: # automatically adjust width
-         for col in ws.columns:
-             max_length = 0
-             """column = col[0].column_letter # Get the column letter"""
-             # Check the first cell in the column to get the column letter
-             cell_first = col[0]
-
-             # Check if the cell is part of a merged range
-             if not any(cell_first.coordinate in range_ for range_ in ws.merged_cells.ranges):
-                 column = get_column_letter(cell_first.column) # Get the column letter from the first cell
-             else:
-                 # Skip the column if the first cell is merged
+     if width is None or width == {}:  # automatically adjust width
+         print("auto-width")
+         for col in ws.columns:
+             if not col:
                  continue
-             for cell_ in col:
-                 try:
-                     if cell_.value:
-                         max_length = max(max_length, len(str(cell_.value)))
-                 except Exception:
-                     pass
-             adjusted_width = max_length*width_factor+width_padding
-             if width_max is not None:
-                 adjusted_width = min(adjusted_width, width_max)
-             ws.column_dimensions[column].width = max(5,adjusted_width)
+             try:
+                 col_letter = get_column_letter(col[0].column)
+
+                 # Skip entire column if any cell is merged
+                 if any(cell.coordinate in merged_cells for cell in col):
+                     continue
+
+                 max_length = 0
+                 for cell in col:
+                     try:
+                         if cell.value:
+                             cell_value = str(cell.value)
+                             if '\n' in cell_value:
+                                 max_line_length = max(len(line) for line in cell_value.split('\n'))
+                                 max_length = max(max_length, max_line_length)
+                             else:
+                                 max_length = max(max_length, len(cell_value))
+                     except Exception:
+                         pass
+
+                 adjusted_width = (max_length * width_factor) + width_padding
+                 if width_max is not None:
+                     adjusted_width = min(adjusted_width, width_max)
+                 ws.column_dimensions[col_letter].width = max(5, adjusted_width)
+
+             except Exception as e:
+                 print(f"Error adjusting width for column: {e}")
+                 continue
-     elif isinstance(width, (int, float)): # set all columns to this value
+     elif isinstance(width, (int, float)):  # set all columns to this value
+         print("set to fixed width {}".format(width))
          for col in ws.columns:
-             column=get_column_letter(col[0].column)
-             ws.column_dimensions[column].width=width*width_factor+width_padding
-     elif isinstance(width,bool):
-         pass
-     else:
+             column = get_column_letter(col[0].column)
+             ws.column_dimensions[column].width = width * width_factor + width_padding
+     elif isinstance(width, dict):  # custom widths per column
          for col_idx, width_ in width.items():
              col_letter = get_column_letter(col_idx)
              ws.column_dimensions[col_letter].width = width_
-
+
      # !heights
      if height is None or height=={}: # automatically adjust height
          for row in ws.iter_rows(min_row=1, max_row=ws.max_row):
@@ -11655,9 +12443,28 @@ def format_excel(
 
      # ungroup sheets
      for sheet in wb.worksheets:
-         sheet.sheet_view.tabSelected = False
+         sheet.sheet_view.tabSelected = False
      # !Save the workbook
-     wb.save(filename)
+     try:
+         wb.save(filename)
+     except Exception as e:
+         print(f"Error saving workbook: {str(e)}")
+     # Alternative: replace the final save operation with a temp-file swap:
+     # try:
+     #     # Create a temporary file for safer saving
+     #     temp_filename = filename + '.tmp'
+     #     wb.save(temp_filename)
+
+     #     # If save succeeds, replace original file
+     #     if os.path.exists(filename):
+     #         os.remove(filename)
+     #     os.rename(temp_filename, filename)
+
+     # except Exception as e:
+     #     print(f"Error saving workbook: {str(e)}")
+     #     if os.path.exists(temp_filename):
+     #         os.remove(temp_filename)
+     #     raise
 
 
  def preview(var):
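
The commented-out block above sketches a safer save; the same idea can be written more compactly with os.replace, which swaps the files in a single step on the same filesystem (a sketch of the technique, not what the released code does; safe_save is a hypothetical helper):

    import os

    def safe_save(wb, filename):
        tmp = filename + ".tmp"    # temporary sibling file
        wb.save(tmp)               # if this raises, the original file is untouched
        os.replace(tmp, filename)  # atomic overwrite on POSIX and Windows
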
@@ -13716,6 +14523,575 @@ def df_fillna(
      #     print(method_name)
      #     display(df)
      #     display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
+ def df_cut(
+     df: pd.DataFrame,
+     column: str,
+     *,
+     new_col_name: Optional[str] = None,
+     bins: Optional[
+         Union[int, List[float], Dict[str, Union[float, str, pd.Timestamp]]]
+     ] = None,
+     range_start: Optional[Union[float, str, pd.Timestamp]] = None,
+     range_end: Optional[Union[float, str, pd.Timestamp]] = None,
+     step: Optional[Union[float, str, pd.Timedelta]] = None,
+     labels: Optional[List[str]] = None,
+     label_format: Optional[Union[str, Callable[[float, float], str]]] = None,
+     include_overflow: bool = True,
+     include_underflow: bool = False,
+     right: bool = False,
+     drop_original: bool = False,
+     precision: int = 2,
+     show_count: bool = False,
+     symbol_count: str = "n=",
+     show_percentage: bool = False,
+     symbol_percentage: str = "%",
+     show_total_count: bool = False,
+     symbol_total_count: str = "∑n=",
+     sep_between: str = " | ",
+     sort_labels: bool = True,
+     na_action: str = "keep",
+     na_fill_value: Optional[str] = None,
+     dtype: Optional[Union[str, pd.CategoricalDtype]] = None,
+     ordered: bool = True,
+     inplace: bool = False,
+     datetime_format: str = "%Y-%m-%d",
+     categorical_agg: str = "count",
+ ) -> Optional[pd.DataFrame]:
+     """
+     Enhanced binning function that works with numeric, datetime, and categorical columns.
+
+     Features:
+     - Automatic type detection (numeric, datetime, categorical)
+     - Flexible bin specification (number of bins, explicit edges, or range+step)
+     - Customizable labels with formatting
+     - Count and percentage display options
+     - NA value handling
+     Square brackets in labels mean inclusive bounds; parentheses mean exclusive bounds.
+
+     Parameters:
+     -----------
+     df : pd.DataFrame
+         Input DataFrame containing the column to bin
+     column : str
+         Name of column to bin
+     new_col_name : str, optional
+         Name for binned column (default: f"{column}_binned")
+     bins : int, list, or dict, optional
+         - int: Number of equal-width bins
+         - list: Explicit bin edges
+         - dict: {'start': x, 'end': y, 'step': z} for range specification
+     range_start : float or datetime-like, optional
+         Start value for bin range (required if bins is None or dict)
+     range_end : float or datetime-like, optional
+         End value for bin range (default: max of column)
+     step : float or timedelta-like, optional
+         Step size for bin creation (required if bins is None or dict)
+     labels : list of str, optional
+         Custom labels for bins (must match number of bins)
+     label_format : str or callable, optional
+         Format string or function for bin labels
+     include_overflow : bool, default True
+         Include catch-all bin for values above range_end
+     include_underflow : bool, default False
+         Include catch-all bin for values below range_start
+     right : bool, default False
+         Whether bins include the right edge
+     drop_original : bool, default False
+         Drop original column after binning
+     precision : int, default 2
+         Decimal precision for numeric bin labels
+     show_count : bool, default False
+         Show count of items in each bin
+     show_percentage : bool, default False
+         Show percentage of items in each bin
+     show_total_count : bool, default False
+         Show total count in labels
+     na_action : str, default 'keep'
+         How to handle NA values ('keep', 'drop', or 'fill')
+     na_fill_value : str, optional
+         Value to fill NAs with if na_action='fill'
+     dtype : dtype or CategoricalDtype, optional
+         Output dtype for binned column
+     ordered : bool, default True
+         Whether bins are ordered
+     inplace : bool, default False
+         Modify DataFrame in place
+     datetime_format : str, default "%Y-%m-%d"
+         Format string for datetime labels
+     categorical_agg : str, default 'count'
+         For categorical data: 'count' or 'ratio'
+
+     Returns:
+     --------
+     pd.DataFrame or None
+         Returns the modified DataFrame unless inplace=True
+
+     Examples:
+     --------
+     # Numeric binning
+     df_cut(df, 'age', bins=5)
+     df_cut(df, 'price', range_start=0, range_end=1000, step=100)
+
+     # Datetime binning
+     df_cut(df, 'date', bins={'start': '2023-01-01', 'end': '2023-12-31', 'step': '1M'})
+
+     # Categorical binning
+     df_cut(df, 'category', bins=5, categorical_agg='ratio')
+
+     # Sample datetime data
+     dates = pd.date_range("2020-01-01", "2023-12-31", freq="D")
+     df = pd.DataFrame(
+         {
+             "order_date": np.random.choice(dates, 500),
+             "delivery_time": np.random.randint(1, 72, 500),  # hours
+         }
+     )
+     # Example 1: Yearly bins with exact boundaries
+     df_cut(
+         df,
+         "order_date",
+         bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1Y"},
+         datetime_format="%Y-%m-%d",
+         label_format="%m-%d",
+         show_count=True,
+         show_percentage=True,
+         show_total_count=True,
+     )
+     # Weekly binning
+     df_cut(
+         df,
+         "order_date",
+         bins={"start": "2019-01-01", "end": "2023-12-31", "step": "1W"},
+         label_format="%Y-%m-%d",
+         datetime_format="%Y-%m-%d",
+         show_count=True,
+         show_percentage=True,
+         show_total_count=True,
+     )
+
+
+     # Sample numeric data
+     df = pd.DataFrame(
+         {"price": np.random.uniform(10, 1000, 1000), "age": np.random.randint(18, 80, 1000)}
+     )
+
+     # Example 1: Equal-width bins
+     df_cut(df, "price", bins=5, show_count=True)
+
+     # Example 2: Custom range with step
+     df_cut(
+         df,
+         "price",
+         range_start=0,
+         range_end=1000,
+         step=200,
+         label_format="${left:.0f}-${right:.0f}",
+         show_percentage=True,
+     )
+     df_cut(
+         df,
+         "price",
+         bins={"start": 0, "end": 1000, "step": 200},
+         # label_format="${left:.0f}-${right:.0f}",
+         show_percentage=True,
+     )
+     """
+     from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
+
+     def _process_time_step(step: Union[str, int, float, pd.Timedelta]) -> str:
+         """Convert step to a pandas frequency string."""
+         if isinstance(step, pd.Timedelta):
+             return step.freqstr if step.freqstr else str(step)
+
+         if isinstance(step, (int, float)):
+             return f"{step}S"  # Interpret bare numbers as seconds
+
+         if isinstance(step, str):
+             step = step.strip().lower()
+             match = re.match(r"(\d*\.?\d+)?\s*([a-z]+)", step)
+             if not match:
+                 raise ValueError(f"Invalid time step format: {step}")
+
+             num_part, unit_part = match.groups()
+             num = float(num_part) if num_part else 1.0
+
+             unit_map = {
+                 "y": "Y",
+                 "yr": "Y",
+                 "yrs": "Y",
+                 "year": "Y",
+                 "years": "Y",
+                 "m": "M",
+                 "mo": "M",
+                 "mon": "M",
+                 "month": "M",
+                 "months": "M",
+                 "w": "W",
+                 "wk": "W",
+                 "wks": "W",
+                 "week": "W",
+                 "weeks": "W",
+                 "d": "D",
+                 "day": "D",
+                 "days": "D",
+                 "h": "H",
+                 "hr": "H",
+                 "hrs": "H",
+                 "hour": "H",
+                 "hours": "H",
+                 "min": "T",
+                 "mins": "T",
+                 "minute": "T",
+                 "minutes": "T",
+                 "s": "S",
+                 "sec": "S",
+                 "secs": "S",
+                 "second": "S",
+                 "seconds": "S",
+             }
+
+             if unit_part not in unit_map:
+                 raise ValueError(f"Unknown time unit: {unit_part}")
+
+             freq = unit_map[unit_part]
+             if num.is_integer():
+                 num = int(num)
+             return f"{num}{freq}"
+
+         raise TypeError(f"Unsupported step type: {type(step)}")
+
+     def _process_datetime_column(
+         col: pd.Series,
+         bins: Optional[Union[int, List[pd.Timestamp]]],
+         range_start: Optional[Union[str, pd.Timestamp]],
+         range_end: Optional[Union[str, pd.Timestamp]],
+         step: Optional[Union[str, pd.Timedelta]],
+         labels: Optional[List[str]],
+         label_format: Optional[Union[str, Callable]],
+         datetime_format: str,
+         right: bool,
+         include_underflow: bool,
+         include_overflow: bool,
+     ) -> Tuple[pd.Categorical, List[str]]:
+         """Process a datetime column with accurate counting."""
+         col = pd.to_datetime(col)
+
+         # Handle bin edges
+         if bins is None:
+             if step is None:
+                 raise ValueError("Step must be provided for datetime binning")
+
+             # Convert step to a pandas frequency string
+             step_freq = _process_time_step(step)
+
+             # Set default range if needed
+             range_start = (
+                 pd.to_datetime(range_start) if range_start is not None else col.min()
+             )
+             range_end = pd.to_datetime(range_end) if range_end is not None else col.max()
+
+             # Generate bins
+             try:
+                 bin_edges = pd.date_range(start=range_start, end=range_end, freq=step_freq)
+                 if len(bin_edges) == 0:
+                     bin_edges = pd.date_range(start=range_start, end=range_end, periods=2)
+                 elif bin_edges[-1] < range_end:
+                     bin_edges = bin_edges.append(pd.DatetimeIndex([range_end]))
+             except ValueError as e:
+                 raise ValueError(f"Invalid frequency specification: {step_freq}") from e
+         elif isinstance(bins, int):
+             bin_edges = pd.date_range(start=col.min(), end=col.max(), periods=bins + 1)
+         else:
+             bin_edges = pd.to_datetime(bins)
+
+         # Add overflow/underflow bins
+         if include_underflow:
+             bin_edges = bin_edges.insert(0, pd.Timestamp.min)
+         if include_overflow:
+             bin_edges = bin_edges.append(pd.DatetimeIndex([pd.Timestamp.max]))
+
+         # Perform the cut - this is where we ensure proper binning
+         binned = pd.cut(
+             col.astype("int64"),  # Convert to nanoseconds for precise binning
+             bins=bin_edges.astype("int64"),
+             right=right,
+             include_lowest=True,
+         )
+
+         # Generate labels if not provided
+         if labels is None:
+             labels = []
+             for i in range(len(bin_edges) - 1):
+                 left = bin_edges[i]
+                 right_ = bin_edges[i + 1]
+
+                 # Handle special cases
+                 if left == pd.Timestamp.min:
+                     left_str = "<"
+                 else:
+                     left_str = left.strftime(datetime_format)
+
+                 if right_ == pd.Timestamp.max:
+                     right_str = ">"
+                 else:
+                     right_str = right_.strftime(datetime_format)
+
+                 # Apply label formatting
+                 if callable(label_format):
+                     label = label_format(left, right_)
+                 elif isinstance(label_format, str):
+                     try:
+                         if left != pd.Timestamp.min and right_ != pd.Timestamp.max:
+                             label = f"{left.strftime(label_format)}-{right_.strftime(label_format)}"
+                         else:
+                             label = f"{left_str}-{right_str}"
+                     except (ValueError, AttributeError):
+                         label = f"{left_str}-{right_str}"
+                 else:
+                     label = f"{left_str}-{right_str}"
+
+                 labels.append(label)
+
+         return binned, labels
+
+     def _process_categorical_column(
+         col: pd.Series,
+         bins: Optional[Union[int, List[str]]],
+         labels: Optional[List[str]],
+         categorical_agg: str,
+     ) -> Tuple[pd.Categorical, List[str]]:
+         value_counts = col.value_counts(normalize=(categorical_agg == "ratio"))
+
+         if bins is not None and isinstance(bins, int):
+             top_categories = value_counts.head(bins).index
+             binned = col.where(col.isin(top_categories), "Other")
+         elif isinstance(bins, list):
+             binned = col.where(col.isin(bins), "Other")
+         else:
+             binned = col
+
+         binned = binned.astype("category")
+
+         if labels is not None:
+             binned = binned.cat.rename_categories(dict(zip(binned.cat.categories, labels)))
+
+         return binned, list(binned.cat.categories)
+
+     def _process_numeric_column(
+         col: pd.Series,
+         bins: Optional[Union[int, List[float]]],
+         range_start: Optional[float],
+         range_end: Optional[float],
+         step: Optional[float],
+         labels: Optional[List[str]],
+         label_format: Optional[Union[str, Callable]],
+         precision: int,
+         right: bool,
+         include_underflow: bool,
+         include_overflow: bool,
+     ) -> Tuple[pd.Categorical, List[str]]:
+         if bins is None:
+             if range_start is None or step is None:
+                 raise ValueError("If bins is not provided, range_start and step must be set")
+             if range_end is None:
+                 range_end = col.max()
+
+             bin_edges = list(np.arange(range_start, range_end + step, step))
+         elif isinstance(bins, int):
+             bin_edges = np.linspace(col.min(), col.max(), bins + 1).tolist()
+         else:
+             bin_edges = list(bins)
+
+         # Add overflow/underflow bins if needed
+         if include_underflow and not np.isinf(bin_edges[0]):
+             bin_edges.insert(0, float("-inf"))
+         if include_overflow and not np.isinf(bin_edges[-1]):
+             bin_edges.append(float("inf"))
+
+         # Generate labels if not provided
+         if labels is None:
+             labels = []
+             for i in range(len(bin_edges) - 1):
+                 left = round(bin_edges[i], precision)
+                 right_ = round(bin_edges[i + 1], precision)
+
+                 if label_format:
+                     label = (
+                         label_format(left, right_)
+                         if callable(label_format)
+                         else label_format.format(left=left, right=right_)
+                     )
+                 else:
+                     if np.isinf(left) and left < 0:
+                         label = f"<{right_}"
+                     elif np.isinf(right_):
+                         label = f">{left}"
+                     else:
+                         label = f"[{left}, {right_}{']' if right else ')'}"
+
+                 labels.append(label)
+
+         binned = pd.cut(
+             col, bins=bin_edges, labels=labels, right=right, include_lowest=True
+         )
+         return binned, labels
+
+     def _handle_na_values(
+         col: pd.Series, na_action: str, na_fill_value: Optional[str]
+     ) -> pd.Series:
+         if na_action == "drop":
+             return col.dropna()
+         elif na_action == "fill" and na_fill_value is not None:
+             return col.fillna(na_fill_value)
+         return col
+
+     def _add_statistical_labels(
+         binned: pd.Categorical,
+         labels: List[str],
+         show_count: bool,
+         show_percentage: bool,
+         show_total_count: bool,
+         symbol_count: str,
+         symbol_percentage: str,
+         symbol_total_count: str,
+         sep_between: str,
+     ) -> List[str]:
+         """Add statistical information with accurate counts."""
+         # Get counts by matching the exact bin intervals
+         value_counts = binned.value_counts()
+         total = len(binned.dropna())
+
+         new_labels = []
+         for i, (label, category) in enumerate(zip(labels, binned.cat.categories)):
+             count = value_counts.get(category, 0)
+             parts = [label]
+
+             if show_count:
+                 parts.append(f"{symbol_count}{count}")
+             if show_percentage:
+                 percentage = (count / total * 100) if total > 0 else 0
+                 parts.append(f"{percentage:.1f}{symbol_percentage}")
+             if show_total_count:
+                 parts.append(f"{symbol_total_count}{total}")
+
+             # Ensure unique labels
+             new_label = sep_between.join(parts)
+             if new_label in new_labels:
+                 new_label = f"{new_label}_{i}"
+             new_labels.append(new_label)
+
+         return new_labels
+
+     def _sort_bin_labels(binned: pd.Categorical, labels: List[str]) -> pd.Categorical:
+         try:
+             # Attempt to sort by the underlying intervals
+             sorted_categories = sorted(binned.cat.categories)
+             binned = binned.cat.reorder_categories(sorted_categories, ordered=True)
+         except Exception:
+             # If sorting fails (e.g., string labels), fall back to the given label order
+             binned = binned.cat.set_categories(labels, ordered=True)
+         return binned
+
+     # Input validation
+     if column not in df.columns:
+         raise ValueError(f"Column '{column}' not found in DataFrame")
+
+     if not inplace:
+         df = df.copy()
+
+     col_data = df[column]
+
+     # Determine column type
+     if is_datetime64_any_dtype(col_data):
+         col_type = "datetime"
+         col_data = pd.to_datetime(col_data)
+     elif isinstance(col_data.dtype, pd.CategoricalDtype) or col_data.dtype == "object":
+         col_type = "categorical"
+     elif is_numeric_dtype(col_data):
+         col_type = "numeric"
+     else:
+         raise TypeError(f"Unsupported column type: {col_data.dtype}")
+
+     # Handle dictionary bin specification
+     if isinstance(bins, dict):
+         range_start = bins.get("start", range_start)
+         range_end = bins.get("end", range_end)
+         step = bins.get("step", step)
+         bins = None
+
+     # Process based on column type
+     if col_type == "datetime":
+         binned, bin_labels = _process_datetime_column(
+             col_data,
+             bins,
+             range_start,
+             range_end,
+             step,
+             labels,
+             label_format,
+             datetime_format,
+             right,
+             include_underflow,
+             include_overflow,
+         )
+     elif col_type == "categorical":
+         binned, bin_labels = _process_categorical_column(
+             col_data, bins, labels, categorical_agg
+         )
+     else:
+         binned, bin_labels = _process_numeric_column(
+             col_data,
+             bins,
+             range_start,
+             range_end,
+             step,
+             labels,
+             label_format,
+             precision,
+             right,
+             include_underflow,
+             include_overflow,
+         )
+
+     # Handle NA values
+     binned = _handle_na_values(binned, na_action, na_fill_value)
+
+     # Add statistical information to labels if requested
+     if show_count or show_percentage or show_total_count:
+         bin_labels = _add_statistical_labels(
+             binned,
+             bin_labels,
+             show_count,
+             show_percentage,
+             show_total_count,
+             symbol_count,
+             symbol_percentage,
+             symbol_total_count,
+             sep_between,
+         )
+         binned = binned.cat.rename_categories(
+             dict(zip(binned.cat.categories, bin_labels))
+         )
+
+     # Sort labels if requested
+     if sort_labels and not right and len(bin_labels) > 1:
+         binned = _sort_bin_labels(binned, bin_labels)
+
+     # Create the final output column
+     new_col = new_col_name or f"{column}_binned"
+     df[new_col] = binned.astype(dtype) if dtype else binned
+
+     if drop_original:
+         df.drop(columns=[column], inplace=True)
+
+     return None if inplace else df
+
 
 
  def df_encoder(
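
A compact, runnable check of the new df_cut (random data, so the counts shown in the labels will vary):

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_cut

    df = pd.DataFrame({"price": np.random.uniform(10, 1000, 100)})
    out = df_cut(
        df,
        "price",
        bins={"start": 0, "end": 1000, "step": 250},
        show_count=True,
        show_percentage=True,
    )
    print(out["price_binned"].value_counts())
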
@@ -16300,7 +17676,7 @@ def df_corr(df: pd.DataFrame, method="pearson"):
  def use_pd(
      func_name="excel",
      verbose=True,
-     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
+     dir_json="./data/usages_pd.json",
  ):
      try:
          default_settings = fload(dir_json, output="json")