py2ls 0.2.4.6__py3-none-any.whl → 0.2.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -1,62 +1,38 @@
1
1
  import numpy as np
2
- import pandas as pd
3
-
4
- import json
5
- import matplotlib
6
- import matplotlib.pyplot as plt
7
- import matplotlib.ticker as tck
8
- from cycler import cycler
9
- from mpl_toolkits.mplot3d import Axes3D
10
- import seaborn as sns
11
-
12
- from sklearn.kernel_approximation import KERNEL_PARAMS
13
- from sympy import is_increasing
14
- import sys, os, shutil, re, yaml, json, subprocess
15
- import importlib.util
16
- import time
17
- from dateutil import parser
18
- from datetime import datetime
19
- import schedule
20
-
21
- from PIL import Image, ImageEnhance, ImageOps, ImageFilter
22
- from rembg import remove, new_session
23
-
24
- import docx
25
- from fpdf import FPDF
26
- from lxml import etree
27
- from docx import Document
28
- from PyPDF2 import PdfReader
29
- from pptx import Presentation
30
- from pptx.util import Inches
31
- from pdf2image import convert_from_path, pdfinfo_from_path
32
- from nltk.tokenize import sent_tokenize, word_tokenize
33
- import nltk # nltk.download("punkt")
34
- from docx2pdf import convert
35
- import img2pdf as image2pdf
36
- import nbformat
37
- from nbconvert import MarkdownExporter
38
-
39
- from itertools import pairwise
40
- from box import Box, BoxList
41
- from numerizer import numerize
42
- from tqdm import tqdm
43
- import mimetypes
44
- from pprint import pp
45
- from collections import Counter
46
- from fuzzywuzzy import fuzz, process
47
- from langdetect import detect
48
- from duckduckgo_search import DDGS
2
+ import pandas as pd
3
+ import sys, os
4
+ from IPython.display import display
49
5
  from typing import List, Optional, Union
50
- from bs4 import BeautifulSoup
51
-
52
- from . import netfinder
53
-
54
6
  try:
55
7
  get_ipython().run_line_magic("load_ext", "autoreload")
56
8
  get_ipython().run_line_magic("autoreload", "2")
57
9
  except NameError:
58
10
  pass
59
11
 
12
+ import warnings
13
+ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
14
+ warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
15
+
16
+ def run_once_within(duration=60): # default 60s
17
+ import time
18
+ """
19
+ usage:
20
+ if run_once_within():
21
+ print("This code runs once per minute.")
22
+ else:
23
+ print("The code has already been run in the last minute.")
24
+ """
25
+ if not hasattr(run_once_within, "time_last"):
26
+ run_once_within.time_last = None
27
+ time_curr = time.time()
28
+
29
+ if (run_once_within.time_last is None) or (time_curr - run_once_within.time_last >= duration):
30
+ run_once_within.time_last = time_curr # Update the last execution time
31
+ return True
32
+ else:
33
+ return False
34
+
35
+
60
36
  def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
61
37
  """
62
38
  Add the Chinese (default) font to the font manager
@@ -155,6 +131,8 @@ def run_every(when: str = None, job=None, wait: int = 60):
155
131
  :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
156
132
  :param job: The function to be scheduled.
157
133
  """
134
+ import schedule
135
+ import time
158
136
  if job is None:
159
137
  print("No job provided!")
160
138
  return
@@ -200,6 +178,8 @@ def run_at(when: str, job=None, wait: int = 60):
200
178
  :param job: The function to be scheduled.
201
179
  :param wait: The sleep interval between checks in seconds.
202
180
  """
181
+ from datetime import datetime
182
+ import time
203
183
  if job is None:
204
184
  print("No job provided!")
205
185
  return
@@ -279,6 +259,7 @@ def get_timezone(timezone: str | list = None):
279
259
 
280
260
  def is_package_installed(package_name):
281
261
  """Check if a package is installed."""
262
+ import importlib.util
282
263
  package_spec = importlib.util.find_spec(package_name)
283
264
  return package_spec is not None
284
265
 
@@ -291,6 +272,7 @@ def upgrade(module="py2ls",uninstall=False):
291
272
  module (str): The name of the module to install/upgrade.
292
273
  uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
293
274
  """
275
+ import subprocess
294
276
  if not is_package_installed(module):
295
277
  try:
296
278
  subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +309,7 @@ def get_version(pkg):
327
309
 
328
310
 
329
311
  def rm_folder(folder_path, verbose=True):
312
+ import shutil
330
313
  try:
331
314
  shutil.rmtree(folder_path)
332
315
  if verbose:
@@ -345,6 +328,7 @@ def fremove(path, verbose=True):
345
328
  """
346
329
  try:
347
330
  if os.path.isdir(path):
331
+ import shutil
348
332
  shutil.rmtree(path)
349
333
  if verbose:
350
334
  print(f"Successfully deleted folder {path}")
@@ -360,23 +344,30 @@ def fremove(path, verbose=True):
360
344
  print(f"Failed to delete {path}. Reason: {e}")
361
345
 
362
346
 
363
- def get_cwd(verbose: bool = True):
364
- """
365
- get_cwd: to get the current working directory
366
- Args:
367
- verbose (bool, optional): to show which function is use. Defaults to True.
368
- """
369
- try:
370
- script_dir = os.path.dirname(os.path.abspath(__file__))
371
- if verbose:
372
- print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
373
- except NameError:
374
- # This works in an interactive environment (like a Jupyter notebook)
375
- script_dir = os.getcwd()
376
- if verbose:
377
- print("os.getcwd():", script_dir)
378
- return script_dir
379
-
347
+ # def get_cwd(verbose: bool = True):
348
+ # """
349
+ # get_cwd: to get the current working directory
350
+ # Args:
351
+ # verbose (bool, optional): to show which function is use. Defaults to True.
352
+ # """
353
+ # try:
354
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
355
+ # if verbose:
356
+ # print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
357
+ # except NameError:
358
+ # # This works in an interactive environment (like a Jupyter notebook)
359
+ # script_dir = os.getcwd()
360
+ # if verbose:
361
+ # print("os.getcwd():", script_dir)
362
+ # return script_dir
363
+
364
+
365
+ def get_cwd():
366
+ from pathlib import Path
367
+ # Get the current script's directory as a Path object
368
+ current_directory = Path(__file__).resolve().parent
369
+
370
+ return current_directory
380
371
 
381
372
  def search(
382
373
  query,
@@ -388,7 +379,7 @@ def search(
388
379
  dir_save=dir_save,
389
380
  **kwargs,
390
381
  ):
391
-
382
+ from duckduckgo_search import DDGS
392
383
  if "te" in kind.lower():
393
384
  results = DDGS().text(query, max_results=limit)
394
385
  res = pd.DataFrame(results)
@@ -421,7 +412,7 @@ def echo(*args, **kwargs):
421
412
  str: the answer from ai
422
413
  """
423
414
  global dir_save
424
-
415
+ from duckduckgo_search import DDGS
425
416
  query = None
426
417
  model = kwargs.get("model", "gpt")
427
418
  verbose = kwargs.get("verbose", True)
@@ -469,8 +460,11 @@ def echo(*args, **kwargs):
469
460
  model_valid = valid_mod_name(model)
470
461
  res = DDGS().chat(query, model=model_valid)
471
462
  if verbose:
463
+ from pprint import pp
472
464
  pp(res)
473
465
  if log:
466
+ from datetime import datetime
467
+ import time
474
468
  dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
475
469
  res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
476
470
  if bool(os.path.basename(dir_save)):
@@ -492,6 +486,7 @@ def ai(*args, **kwargs):
492
486
 
493
487
 
494
488
  def detect_lang(text, output="lang", verbose=True):
489
+ from langdetect import detect
495
490
  dir_curr_script = os.path.dirname(os.path.abspath(__file__))
496
491
  dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
497
492
  print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -550,19 +545,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
550
545
  for lst in flattened_lists[1:]:
551
546
  shared_elements.intersection_update(lst)
552
547
  else:
548
+ from collections import Counter
553
549
  all_elements = [item for sublist in flattened_lists for item in sublist]
554
550
  element_count = Counter(all_elements)
555
551
  # Get elements that appear in at least n_shared lists
556
552
  shared_elements = [item for item, count in element_count.items() if count >= n_shared]
557
553
 
558
- shared_elements = flatten(shared_elements, verbose=verbose)
554
+ shared_elements = flatten(shared_elements, verbose=verbose)
559
555
  if verbose:
560
556
  elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
561
557
  print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
562
558
  print("********* checking shared elements *********")
563
559
  return shared_elements
564
560
 
565
- def flatten(nested: Any, unique_list=True, verbose=True):
561
+ def not_shared(*args, strict=True, n_shared=2, verbose=False):
562
+ """
563
+ To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
564
+ usage:
565
+ list1 = [1, 8, 3, 3, 4, 5]
566
+ list2 = [4, 5, 6, 7, 8]
567
+ not_shared(list1,list2)# output [1,3]
568
+ """
569
+ _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
570
+ list1 = flatten(args[0], verbose=verbose)
571
+ _not_shared=[item for item in list1 if item not in _common]
572
+ return _not_shared
573
+
574
+
575
+ def flatten(nested: Any, unique_list=True, verbose=False):
566
576
  """
567
577
  Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
568
578
  Parameters:
@@ -603,7 +613,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
603
613
  Returns:
604
614
  tuple: A tuple containing the best match and its index in the candidates list.
605
615
  """
606
-
616
+ from fuzzywuzzy import fuzz, process
607
617
  def to_lower(s, ignore_case=True):
608
618
  # Converts a string or list of strings to lowercase if ignore_case is True.
609
619
  if ignore_case:
@@ -729,6 +739,7 @@ def cn2pinyin(
729
739
  return pinyin_flat
730
740
 
731
741
  def counter(list_, verbose=True):
742
+ from collections import Counter
732
743
  c = Counter(list_)
733
744
  # Print the name counts
734
745
  for item, count in c.items():
@@ -757,7 +768,7 @@ def str2time(time_str, fmt="24"):
757
768
  %p represents AM or PM.
758
769
  - str: The converted time string.
759
770
  """
760
-
771
+ from datetime import datetime
761
772
  def time_len_corr(time_str):
762
773
  time_str_ = (
763
774
  ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -818,6 +829,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
818
829
  Returns:
819
830
  - str: The converted date string.
820
831
  """
832
+ from dateutil import parser
821
833
  try:
822
834
  date_obj = parser.parse(date_str)
823
835
  except ValueError as e:
@@ -834,6 +846,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
834
846
 
835
847
 
836
848
  def str2num(s, *args, **kwargs):
849
+ import re
837
850
  delimiter = kwargs.get("sep", None)
838
851
  round_digits = kwargs.get("round", None)
839
852
  if delimiter is not None:
@@ -849,6 +862,7 @@ def str2num(s, *args, **kwargs):
849
862
  try:
850
863
  num = float(s)
851
864
  except ValueError:
865
+ from numerizer import numerize
852
866
  try:
853
867
  numerized = numerize(s)
854
868
  num = int(numerized) if "." not in numerized else float(numerized)
@@ -1016,7 +1030,7 @@ def px2inch(*px, dpi=300) -> list:
1016
1030
  return [i / dpi for i in px]
1017
1031
 
1018
1032
 
1019
- def cm2inch(*cm) -> list:
1033
+ def inch2cm(*cm) -> list:
1020
1034
  """
1021
1035
  cm2inch: converts centimeter measurements to inches.
1022
1036
  Usage:
@@ -1037,24 +1051,31 @@ def cm2inch(*cm) -> list:
1037
1051
  def inch2px(*inch, dpi=300) -> list:
1038
1052
  """
1039
1053
  inch2px: converts inch measurements to pixels based on the given dpi.
1054
+
1040
1055
  Usage:
1041
1056
  inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
1057
+
1058
+ Parameters:
1059
+ inch : float, list, or tuple
1060
+ Single or multiple measurements in inches to convert to pixels.
1061
+ dpi : int, optional (default=300)
1062
+ Dots per inch (DPI), representing the pixel density.
1063
+
1042
1064
  Returns:
1043
- list: in pixels
1065
+ list: Converted measurements in pixels.
1044
1066
  """
1045
- # Case 1: When the user passes a single argument that is a list or tuple, such as inch2px([1, 2]) or inch2px((1, 2))
1067
+ # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
1046
1068
  if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
1047
- # If the input is a single list or tuple, we unpack its elements and convert each to pixels
1048
1069
  return [i * dpi for i in inch[0]]
1049
- # Case 2: When the user passes multiple arguments directly, such as inch2px(1, 2)
1070
+
1071
+ # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
1050
1072
  else:
1051
- # Here, we convert each individual argument directly to pixels
1052
1073
  return [i * dpi for i in inch]
1053
1074
 
1054
1075
 
1055
- def inch2cm(*inch) -> list:
1076
+
1077
+ def cm2inch(*inch) -> list:
1056
1078
  """
1057
- inch2cm: converts inch measurements to centimeters.
1058
1079
  Usage:
1059
1080
  inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
1060
1081
  Returns:
@@ -1169,6 +1190,7 @@ def paper_size(paper_type_str="a4"):
1169
1190
 
1170
1191
 
1171
1192
  def docx2pdf(dir_docx, dir_pdf=None):
1193
+ from docx2pdf import convert
1172
1194
  if dir_pdf:
1173
1195
  convert(dir_docx, dir_pdf)
1174
1196
  else:
@@ -1176,6 +1198,7 @@ def docx2pdf(dir_docx, dir_pdf=None):
1176
1198
 
1177
1199
 
1178
1200
  def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
1201
+ import img2pdf as image2pdf
1179
1202
  def mm_to_point(size):
1180
1203
  return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))
1181
1204
 
@@ -1227,6 +1250,9 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
1227
1250
 
1228
1251
 
1229
1252
  def pdf2ppt(dir_pdf, dir_ppt):
1253
+ from PyPDF2 import PdfReader
1254
+ from pptx.util import Inches
1255
+ from pptx import Presentation
1230
1256
  prs = Presentation()
1231
1257
 
1232
1258
  # Open the PDF file
@@ -1255,6 +1281,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
1255
1281
 
1256
1282
 
1257
1283
  def ssplit(text, by="space", verbose=False, strict=False, **kws):
1284
+ import re
1258
1285
  if isinstance(text, list):
1259
1286
  nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
1260
1287
  flat_list = [item for sublist in nested_list for item in sublist]
@@ -1302,6 +1329,8 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1302
1329
  return [text[i : i + length] for i in range(0, len(text), length)]
1303
1330
 
1304
1331
  def split_by_sent_num(text, n=10):
1332
+ from nltk.tokenize import sent_tokenize
1333
+ from itertools import pairwise
1305
1334
  # split text into sentences
1306
1335
  text_split_by_sent = sent_tokenize(text)
1307
1336
  cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1374,10 +1403,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1374
1403
  print(f"splited by camel_case")
1375
1404
  return split_by_camel_case(text)
1376
1405
  elif ("word" in by) and not strict:
1406
+ from nltk.tokenize import word_tokenize
1377
1407
  if verbose:
1378
1408
  print(f"splited by word")
1379
1409
  return word_tokenize(text)
1380
1410
  elif ("sen" in by and not "num" in by) and not strict:
1411
+ from nltk.tokenize import sent_tokenize
1381
1412
  if verbose:
1382
1413
  print(f"splited by sentence")
1383
1414
  return sent_tokenize(text)
@@ -1427,9 +1458,11 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1427
1458
 
1428
1459
 
1429
1460
  def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
1461
+ from pdf2image import convert_from_path, pdfinfo_from_path
1430
1462
  df_dir_img_single_page = pd.DataFrame()
1431
1463
  dir_single_page = []
1432
1464
  if verbose:
1465
+ from pprint import pp
1433
1466
  pp(pdfinfo_from_path(dir_pdf))
1434
1467
  if isinstance(page, tuple) and page:
1435
1468
  page = list(page)
@@ -1548,6 +1581,7 @@ def unzip(dir_path, output_dir=None):
1548
1581
  # If the output directory already exists, remove it and replace it
1549
1582
  if os.path.exists(output_dir):
1550
1583
  if os.path.isdir(output_dir): # check if it is a folder
1584
+ import shutil
1551
1585
  shutil.rmtree(output_dir) # remove folder
1552
1586
  else:
1553
1587
  os.remove(output_dir) # remove file
@@ -1560,13 +1594,27 @@ def unzip(dir_path, output_dir=None):
1560
1594
  tar_ref.extractall(output_dir)
1561
1595
  return output_dir
1562
1596
  # Handle .gz files
1563
- if dir_path.endswith(".gz"):
1597
+ if dir_path.endswith(".gz") or dir_path.endswith(".gzip"):
1564
1598
  import gzip
1565
1599
 
1566
1600
  output_file = os.path.splitext(dir_path)[0] # remove the .gz extension
1567
- with gzip.open(dir_path, "rb") as gz_file:
1568
- with open(output_file, "wb") as out_file:
1569
- shutil.copyfileobj(gz_file, out_file)
1601
+ try:
1602
+ import shutil
1603
+ with gzip.open(dir_path, "rb") as gz_file:
1604
+ with open(output_file, "wb") as out_file:
1605
+ shutil.copyfileobj(gz_file, out_file)
1606
+ print(f"unzipped '{dir_path}' to '{output_file}'")
1607
+ except FileNotFoundError:
1608
+ print(f"Error: The file '{dir_path}' was not found.")
1609
+ except PermissionError:
1610
+ print(f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'.")
1611
+ except Exception as e:
1612
+ try:
1613
+ import tarfile
1614
+ with tarfile.open(dir_path, 'r:gz') as tar:
1615
+ tar.extractall(path=output_file)
1616
+ except Exception as final_e:
1617
+ print(f"An final unexpected error occurred: {final_e}")
1570
1618
  return output_file
1571
1619
 
1572
1620
  # Handle .zip files
@@ -1648,6 +1696,11 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1648
1696
  False: normal
1649
1697
 
1650
1698
  """
1699
+ if not isinstance(df, pd.DataFrame):
1700
+ if verbose:
1701
+ print('not pd.DataFrame')
1702
+ return False
1703
+ df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
1651
1704
  # Initialize a list to hold messages about abnormalities
1652
1705
  messages = []
1653
1706
  is_abnormal = False
@@ -1675,25 +1728,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1675
1728
  is_abnormal = True
1676
1729
  if verbose:
1677
1730
  print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
1678
-
1731
+ if verbose:
1732
+ print("1",is_abnormal)
1679
1733
  if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
1680
1734
  messages.append("Abnormal: Too many delimiters in column names.")
1681
1735
  is_abnormal = True
1682
1736
  if verbose:
1683
1737
  print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
1684
-
1738
+ if verbose:
1739
+ print("2",is_abnormal)
1685
1740
  if delimiter_counts[""] > 3:
1686
1741
  messages.append("Abnormal: There are empty column names.")
1687
1742
  is_abnormal = True
1688
1743
  if verbose:
1689
1744
  print(f'delimiter_counts[""] > 3')
1690
-
1745
+ if verbose:
1746
+ print("3",is_abnormal)
1691
1747
  if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
1692
1748
  messages.append("Abnormal: Some column names contain unexpected characters.")
1693
1749
  is_abnormal = True
1694
1750
  if verbose:
1695
1751
  print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
1696
-
1752
+ if verbose:
1753
+ print("4",is_abnormal)
1697
1754
  # # Check for missing values
1698
1755
  # missing_values = df.isnull().sum()
1699
1756
  # if missing_values.any():
@@ -1713,7 +1770,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1713
1770
  is_abnormal = True
1714
1771
  if verbose:
1715
1772
  print(f'df.columns[df.nunique() == 1].tolist()')
1716
-
1773
+ if verbose:
1774
+ print("5",is_abnormal)
1717
1775
  # Check for an unreasonable number of rows or columns
1718
1776
  if actual_shape[0] < 2 or actual_shape[1] < 2:
1719
1777
  messages.append(
@@ -1722,7 +1780,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1722
1780
  is_abnormal = True
1723
1781
  if verbose:
1724
1782
  print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
1725
-
1783
+ if verbose:
1784
+ print("6",is_abnormal)
1726
1785
  # Compile results
1727
1786
  if verbose:
1728
1787
  print("\n".join(messages))
@@ -1739,20 +1798,40 @@ def fload(fpath, kind=None, **kwargs):
1739
1798
  Returns:
1740
1799
  content: The content loaded from the file.
1741
1800
  """
1742
-
1801
+ def read_mplstyle(style_file):
1802
+ import matplotlib.pyplot as plt
1803
+ # Load the style file
1804
+ plt.style.use(style_file)
1805
+
1806
+ # Get the current style properties
1807
+ style_dict = plt.rcParams
1808
+
1809
+ # Convert to dictionary
1810
+ style_dict = dict(style_dict)
1811
+ # Print the style dictionary
1812
+ for i, j in style_dict.items():
1813
+ print(f"\n{i}::::{j}")
1814
+ return style_dict
1815
+ # #example usage:
1816
+ # style_file = "/ std-colors.mplstyle"
1817
+ # style_dict = read_mplstyle(style_file)
1818
+
1743
1819
  def load_txt_md(fpath):
1744
1820
  with open(fpath, "r") as file:
1745
1821
  content = file.read()
1746
1822
  return content
1747
1823
 
1748
- def load_html(fpath):
1749
- with open(fpath, "r") as file:
1750
- content = file.read()
1751
- return content
1824
+ # def load_html(fpath):
1825
+ # with open(fpath, "r") as file:
1826
+ # content = file.read()
1827
+ # return content
1828
+ def load_html(fpath,**kwargs):
1829
+ return pd.read_html(fpath,**kwargs)
1752
1830
 
1753
1831
  def load_json(fpath, **kwargs):
1754
1832
  output=kwargs.pop("output","json")
1755
1833
  if output=='json':
1834
+ import json
1756
1835
  with open(fpath, "r") as file:
1757
1836
  content = json.load(file)
1758
1837
  return content
@@ -1760,12 +1839,14 @@ def fload(fpath, kind=None, **kwargs):
1760
1839
  return pd.read_json(fpath,**kwargs)
1761
1840
 
1762
1841
  def load_yaml(fpath):
1842
+ import yaml
1763
1843
  with open(fpath, "r") as file:
1764
1844
  content = yaml.safe_load(file)
1765
1845
  return content
1766
1846
 
1767
1847
 
1768
1848
  def load_xml(fpath, fsize_thr: int = 100):
1849
+ from lxml import etree
1769
1850
  def load_small_xml(fpath):
1770
1851
  tree = etree.parse(fpath)
1771
1852
  root = tree.getroot()
@@ -1824,6 +1905,15 @@ def fload(fpath, kind=None, **kwargs):
1824
1905
  if line.startswith(char):
1825
1906
  return char
1826
1907
  return None
1908
+
1909
+ def _get_chunks(df_fake):
1910
+ """
1911
+ helper func for 'load_csv'
1912
+ """
1913
+ chunks = []
1914
+ for chunk in df_fake:
1915
+ chunks.append(chunk)
1916
+ return pd.concat(chunks, ignore_index=True)
1827
1917
 
1828
1918
  def load_csv(fpath, **kwargs):
1829
1919
  from pandas.errors import EmptyDataError
@@ -1837,16 +1927,19 @@ def fload(fpath, kind=None, **kwargs):
1837
1927
  on_bad_lines = kwargs.pop("on_bad_lines", "skip")
1838
1928
  comment = kwargs.pop("comment", None)
1839
1929
  fmt=kwargs.pop("fmt",False)
1930
+ chunksize=kwargs.pop("chunksize", None)
1931
+ engine='c' if chunksize else engine # when chunksize, recommend 'c'
1932
+ low_memory=kwargs.pop("low_memory",True)
1933
+ low_memory=False if chunksize else True # when chunksize, recommend low_memory=False
1840
1934
  verbose=kwargs.pop("verbose",False)
1841
- if verbose:
1935
+ if run_once_within():
1842
1936
  use_pd("read_csv", verbose=verbose)
1843
- return
1844
1937
 
1845
1938
  if comment is None:
1846
1939
  comment = get_comment(
1847
1940
  fpath, comment=None, encoding="utf-8", lines_to_check=5
1848
1941
  )
1849
-
1942
+
1850
1943
  try:
1851
1944
  df = pd.read_csv(
1852
1945
  fpath,
@@ -1858,14 +1951,19 @@ def fload(fpath, kind=None, **kwargs):
1858
1951
  skipinitialspace=skipinitialspace,
1859
1952
  sep=sep,
1860
1953
  on_bad_lines=on_bad_lines,
1954
+ chunksize=chunksize,
1955
+ low_memory=low_memory,
1861
1956
  **kwargs,
1862
1957
  )
1863
- if is_df_abnormal(df, verbose=0):
1958
+ if chunksize:
1959
+ df=_get_chunks(df)
1960
+ print(df.shape)
1961
+ if is_df_abnormal(df, verbose=0): # raise error
1864
1962
  raise ValueError("the df is abnormal")
1865
1963
  except:
1866
1964
  try:
1867
1965
  try:
1868
- if engine == "pyarrow":
1966
+ if engine == "pyarrow" and not chunksize:
1869
1967
  df = pd.read_csv(
1870
1968
  fpath,
1871
1969
  engine=engine,
@@ -1874,6 +1972,7 @@ def fload(fpath, kind=None, **kwargs):
1874
1972
  sep=sep,
1875
1973
  on_bad_lines=on_bad_lines,
1876
1974
  comment=comment,
1975
+ low_memory=low_memory,
1877
1976
  **kwargs,
1878
1977
  )
1879
1978
  else:
@@ -1887,14 +1986,19 @@ def fload(fpath, kind=None, **kwargs):
1887
1986
  skipinitialspace=skipinitialspace,
1888
1987
  on_bad_lines=on_bad_lines,
1889
1988
  comment=comment,
1989
+ chunksize=chunksize,
1990
+ low_memory=low_memory,
1890
1991
  **kwargs,
1891
1992
  )
1993
+ if chunksize:
1994
+ df=_get_chunks(df)
1995
+ print(df.shape)
1892
1996
  if is_df_abnormal(df, verbose=0):
1893
1997
  raise ValueError("the df is abnormal")
1894
1998
  except (UnicodeDecodeError, ValueError):
1895
1999
  encoding = get_encoding(fpath)
1896
2000
  # print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
1897
- if engine == "pyarrow":
2001
+ if engine == "pyarrow" and not chunksize:
1898
2002
  df = pd.read_csv(
1899
2003
  fpath,
1900
2004
  engine=engine,
@@ -1903,6 +2007,7 @@ def fload(fpath, kind=None, **kwargs):
1903
2007
  sep=sep,
1904
2008
  on_bad_lines=on_bad_lines,
1905
2009
  comment=comment,
2010
+ low_memory=low_memory,
1906
2011
  **kwargs,
1907
2012
  )
1908
2013
  else:
@@ -1916,8 +2021,13 @@ def fload(fpath, kind=None, **kwargs):
1916
2021
  skipinitialspace=skipinitialspace,
1917
2022
  on_bad_lines=on_bad_lines,
1918
2023
  comment=comment,
2024
+ chunksize=chunksize,
2025
+ low_memory=low_memory,
1919
2026
  **kwargs,
1920
2027
  )
2028
+ if chunksize:
2029
+ df=_get_chunks(df)
2030
+ print(df.shape)
1921
2031
  if is_df_abnormal(df, verbose=0):
1922
2032
  raise ValueError("the df is abnormal")
1923
2033
  except Exception as e:
@@ -1934,8 +2044,13 @@ def fload(fpath, kind=None, **kwargs):
1934
2044
  sep=sep,
1935
2045
  on_bad_lines=on_bad_lines,
1936
2046
  comment=comment,
2047
+ chunksize=chunksize,
2048
+ low_memory=low_memory,
1937
2049
  **kwargs,
1938
2050
  )
2051
+ if chunksize:
2052
+ df=_get_chunks(df)
2053
+ print(df.shape)
1939
2054
  if not is_df_abnormal(df, verbose=0): # normal
1940
2055
  display(df.head(2))
1941
2056
  print(f"shape: {df.shape}")
@@ -1943,32 +2058,38 @@ def fload(fpath, kind=None, **kwargs):
1943
2058
  except:
1944
2059
  pass
1945
2060
  else:
1946
- engines = [None,"c", "python"]
1947
- for engine in engines:
1948
- separators = [",", "\t", ";", "|", " "]
1949
- for sep in separators:
1950
- try:
1951
- # sep2show = sep if sep != "\t" else "\\t"
1952
- # print(f"trying with: engine={engine}, sep='{sep2show}'")
1953
- # print(".")
1954
- df = pd.read_csv(
1955
- fpath,
1956
- engine=engine,
1957
- sep=sep,
1958
- on_bad_lines=on_bad_lines,
1959
- comment=comment,
1960
- **kwargs,
1961
- )
1962
- # display(df.head(2))
1963
- # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
1964
- if not is_df_abnormal(df, verbose=0):
1965
- display(df.head(2))
1966
- print(f"shape: {df.shape}")
1967
- return df
1968
- except EmptyDataError as e:
1969
- continue
1970
- else:
1971
- pass
2061
+ if not chunksize:
2062
+ engines = [None,"c", "python"]
2063
+ for engine in engines:
2064
+ separators = [",", "\t", ";", "|", " "]
2065
+ for sep in separators:
2066
+ try:
2067
+ # sep2show = sep if sep != "\t" else "\\t"
2068
+ # print(f"trying with: engine={engine}, sep='{sep2show}'")
2069
+ # print(".")
2070
+ df = pd.read_csv(
2071
+ fpath,
2072
+ engine=engine,
2073
+ sep=sep,
2074
+ on_bad_lines=on_bad_lines,
2075
+ comment=comment,
2076
+ chunksize=chunksize,
2077
+ low_memory=low_memory,
2078
+ **kwargs,
2079
+ )
2080
+ # display(df.head(2))
2081
+ # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
2082
+ if chunksize:
2083
+ df=_get_chunks(df)
2084
+ print(df.shape)
2085
+ if not is_df_abnormal(df, verbose=0):
2086
+ display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
2087
+ print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
2088
+ return df
2089
+ except EmptyDataError as e:
2090
+ continue
2091
+ else:
2092
+ pass
1972
2093
  display(df.head(2))
1973
2094
  print(f"shape: {df.shape}")
1974
2095
  return df
@@ -1976,7 +2097,7 @@ def fload(fpath, kind=None, **kwargs):
1976
2097
  def load_excel(fpath, **kwargs):
1977
2098
  engine = kwargs.get("engine", "openpyxl")
1978
2099
  verbose=kwargs.pop("verbose",False)
1979
- if verbose:
2100
+ if run_once_within():
1980
2101
  use_pd("read_excel", verbose=verbose)
1981
2102
  df = pd.read_excel(fpath, engine=engine, **kwargs)
1982
2103
  try:
@@ -1987,7 +2108,45 @@ def fload(fpath, kind=None, **kwargs):
1987
2108
  pass
1988
2109
  return df
1989
2110
 
2111
+
2112
+ def load_parquet(fpath, **kwargs):
2113
+ """
2114
+ Load a Parquet file into a Pandas DataFrame with advanced options.
2115
+
2116
+ Parameters:
2117
+ - fpath (str): The file path to the Parquet file.
2118
+ - engine (str): The engine to use for reading the Parquet file (default is 'pyarrow').
2119
+ - columns (list): List of columns to load. If None, loads all columns.
2120
+ - verbose (bool): If True, prints additional information about the loading process.
2121
+ - filters (list): List of filter conditions for predicate pushdown.
2122
+ - **kwargs: Additional keyword arguments for `pd.read_parquet`.
2123
+
2124
+ Returns:
2125
+ - df (DataFrame): The loaded DataFrame.
2126
+ """
2127
+
2128
+ engine = kwargs.get("engine", "pyarrow")
2129
+ verbose = kwargs.pop("verbose", False)
2130
+
2131
+ if run_once_within():
2132
+ use_pd("read_parquet", verbose=verbose)
2133
+ try:
2134
+ df = pd.read_parquet(fpath, engine=engine, **kwargs)
2135
+ if verbose:
2136
+ if 'columns' in kwargs:
2137
+ print(f"Loaded columns: {kwargs['columns']}")
2138
+ else:
2139
+ print("Loaded all columns.")
2140
+ print(f"shape: {df.shape}")
2141
+ except Exception as e:
2142
+ print(f"An error occurred while loading the Parquet file: {e}")
2143
+ df = None
2144
+
2145
+ return df
2146
+
1990
2147
  def load_ipynb(fpath, **kwargs):
2148
+ import nbformat
2149
+ from nbconvert import MarkdownExporter
1991
2150
  as_version = kwargs.get("as_version", 4)
1992
2151
  with open(fpath, "r") as file:
1993
2152
  nb = nbformat.read(file, as_version=as_version)
@@ -2017,6 +2176,7 @@ def fload(fpath, kind=None, **kwargs):
2017
2176
  If page is an integer, it returns the text of the specified page number.
2018
2177
  If the specified page is not found, it returns the string "Page is not found".
2019
2178
  """
2179
+ from PyPDF2 import PdfReader
2020
2180
  text_dict = {}
2021
2181
  with open(fpath, "rb") as file:
2022
2182
  pdf_reader = PdfReader(file)
@@ -2046,6 +2206,7 @@ def fload(fpath, kind=None, **kwargs):
2046
2206
  return text_dict.get(int(page), "Page is not found")
2047
2207
 
2048
2208
  def load_docx(fpath):
2209
+ from docx import Document
2049
2210
  doc = Document(fpath)
2050
2211
  content = [para.text for para in doc.paragraphs]
2051
2212
  return content
@@ -2055,51 +2216,21 @@ def fload(fpath, kind=None, **kwargs):
2055
2216
  kind = kind.lower()
2056
2217
  kind = kind.lstrip(".").lower()
2057
2218
  img_types = [
2058
- "bmp",
2059
- "eps",
2060
- "gif",
2061
- "icns",
2062
- "ico",
2063
- "im",
2064
- "jpg",
2065
- "jpeg",
2066
- "jpeg2000",
2067
- "msp",
2068
- "pcx",
2069
- "png",
2070
- "ppm",
2071
- "sgi",
2072
- "spider",
2073
- "tga",
2074
- "tiff",
2075
- "tif",
2076
- "webp",
2077
- "json",
2219
+ "bmp","eps","gif","png","jpg","jpeg","jpeg2000","tiff","tif",
2220
+ "icns","ico","im","msp","pcx","ppm","sgi","spider","tga","webp",
2078
2221
  ]
2079
2222
  doc_types = [
2080
- "docx",
2081
- "txt",
2082
- "md",
2083
- "html",
2084
- "json",
2085
- "yaml",
2086
- "xml",
2087
- "csv",
2088
- "xlsx",
2089
- "pdf",
2223
+ "docx","pdf",
2224
+ "txt","csv","xlsx","tsv","parquet","snappy",
2225
+ "md","html",
2226
+ "json","yaml","xml",
2090
2227
  "ipynb",
2228
+ "mtx"
2091
2229
  ]
2092
2230
  zip_types = [
2093
- "gz",
2094
- "zip",
2095
- "7z",
2096
- "tar",
2097
- "tar.gz",
2098
- "tar.bz2",
2099
- "bz2",
2100
- "xz",
2101
- "rar",
2102
- "tgz",
2231
+ "gz","zip","7z","rar","tgz",
2232
+ "tar","tar.gz","tar.bz2",
2233
+ "bz2","xz","gzip"
2103
2234
  ]
2104
2235
  other_types = ["fcs"]
2105
2236
  supported_types = [*doc_types, *img_types, *zip_types, *other_types]
@@ -2128,16 +2259,24 @@ def fload(fpath, kind=None, **kwargs):
2128
2259
  elif kind == "txt" or kind == "md":
2129
2260
  return load_txt_md(fpath)
2130
2261
  elif kind == "html":
2131
- return load_html(fpath)
2262
+ return load_html(fpath, **kwargs)
2132
2263
  elif kind == "json":
2133
- return load_json(fpath)
2264
+ return load_json(fpath, **kwargs)
2134
2265
  elif kind == "yaml":
2135
2266
  return load_yaml(fpath)
2136
2267
  elif kind == "xml":
2137
2268
  return load_xml(fpath)
2138
- elif kind == "csv":
2269
+ elif kind in ["csv","tsv"]:
2270
+ verbose=kwargs.pop('verbose',False)
2271
+ if run_once_within():
2272
+ use_pd("read_csv")
2139
2273
  content = load_csv(fpath, **kwargs)
2140
2274
  return content
2275
+ elif kind=='pkl':
2276
+ verbose=kwargs.pop('verbose',False)
2277
+ if run_once_within():
2278
+ use_pd("read_pickle")
2279
+ return pd.read_pickle(fpath,**kwargs)
2141
2280
  elif kind in ["ods", "ods", "odt"]:
2142
2281
  engine = kwargs.get("engine", "odf")
2143
2282
  kwargs.pop("engine", None)
@@ -2146,14 +2285,40 @@ def fload(fpath, kind=None, **kwargs):
2146
2285
  engine = kwargs.get("engine", "xlrd")
2147
2286
  kwargs.pop("engine", None)
2148
2287
  content = load_excel(fpath, engine=engine, **kwargs)
2149
- display(content.head(3))
2288
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2289
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2150
2290
  return content
2151
2291
  elif kind == "xlsx":
2152
2292
  content = load_excel(fpath, **kwargs)
2153
- display(content.head(3))
2293
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2294
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2295
+ return content
2296
+ elif kind=='mtx':
2297
+ from scipy.io import mmread
2298
+ dat_mtx=mmread(fpath)
2299
+ content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
2300
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2301
+ print(f"shape: {content.shape}")
2154
2302
  return content
2155
2303
  elif kind == "ipynb":
2156
2304
  return load_ipynb(fpath, **kwargs)
2305
+ elif kind in ['parquet','snappy']:
2306
+ verbose=kwargs.pop('verbose',False)
2307
+ if run_once_within():
2308
+ use_pd("read_parquet")
2309
+ return load_parquet(fpath,**kwargs)
2310
+ elif kind =='feather':
2311
+ verbose=kwargs.pop('verbose',False)
2312
+ if run_once_within():
2313
+ use_pd("read_feather")
2314
+ content=pd.read_feather(fpath,**kwargs)
2315
+ return content
2316
+ elif kind =='h5':
2317
+ content=pd.read_hdf(fpath,**kwargs)
2318
+ return content
2319
+ elif kind =='pkl':
2320
+ content=pd.read_pickle(fpath,**kwargs)
2321
+ return content
2157
2322
  elif kind == "pdf":
2158
2323
  # print('usage:load_pdf(fpath, page="all", verbose=False)')
2159
2324
  return load_pdf(fpath, **kwargs)
@@ -2164,6 +2329,7 @@ def fload(fpath, kind=None, **kwargs):
2164
2329
  import GEOparse
2165
2330
  return GEOparse.get_GEO(filepath=fpath)
2166
2331
  elif kind.lower() in zip_types:
2332
+ from pprint import pp
2167
2333
  keep = kwargs.get("keep", False)
2168
2334
  fpath_unzip = unzip(fpath)
2169
2335
  if os.path.isdir(fpath_unzip):
@@ -2198,10 +2364,11 @@ def fload(fpath, kind=None, **kwargs):
2198
2364
  meta, data = fcsparser.parse(fpath, reformat_meta=True)
2199
2365
  return meta, data
2200
2366
 
2367
+ elif kind=="mplstyle":
2368
+ return read_mplstyle(fpath)
2369
+
2201
2370
  else:
2202
- # try:
2203
- # content = load_csv(fpath, **kwargs)
2204
- # except:
2371
+ print("direct reading...")
2205
2372
  try:
2206
2373
  try:
2207
2374
  with open(fpath, "r", encoding="utf-8") as f:
@@ -2311,6 +2478,7 @@ def filter_kwargs(kws, valid_kwargs):
2311
2478
  }
2312
2479
  return kwargs_filtered
2313
2480
 
2481
+ str_space_speed='sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
2314
2482
 
2315
2483
  def fsave(
2316
2484
  fpath,
@@ -2346,6 +2514,7 @@ def fsave(
2346
2514
  fappend(fpath, content=content)
2347
2515
 
2348
2516
  def save_docx(fpath, content, font_name, font_size, spacing):
2517
+ import docx
2349
2518
  if isinstance(content, str):
2350
2519
  content = content.split(". ")
2351
2520
  doc = docx.Document()
@@ -2373,6 +2542,7 @@ def fsave(
2373
2542
  save_content(fpath, html_content, mode)
2374
2543
 
2375
2544
  def save_pdf(fpath, content, font_name, font_size):
2545
+ from fpdf import FPDF
2376
2546
  pdf = FPDF()
2377
2547
  pdf.add_page()
2378
2548
  # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2386,7 +2556,7 @@ def fsave(
2386
2556
  # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
2387
2557
 
2388
2558
  verbose=kwargs.pop("verbose",False)
2389
- if verbose:
2559
+ if run_once_within():
2390
2560
  use_pd("to_csv", verbose=verbose)
2391
2561
  kwargs_csv = dict(
2392
2562
  path_or_buf=None,
@@ -2418,7 +2588,7 @@ def fsave(
2418
2588
  def save_xlsx(fpath, data, **kwargs):
2419
2589
  verbose=kwargs.pop("verbose",False)
2420
2590
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2421
- if verbose:
2591
+ if run_once_within():
2422
2592
  use_pd("to_excel", verbose=verbose)
2423
2593
  if any(kwargs):
2424
2594
  format_excel(df=data, filename=fpath, **kwargs)
@@ -2444,9 +2614,10 @@ def fsave(
2444
2614
 
2445
2615
  def save_ipynb(fpath, data, **kwargs):
2446
2616
  # Split the content by code fences to distinguish between code and markdown
2617
+ import nbformat
2447
2618
  parts = data.split("```")
2448
2619
  cells = []
2449
-
2620
+
2450
2621
  for i, part in enumerate(parts):
2451
2622
  if i % 2 == 0:
2452
2623
  # Even index: markdown content
@@ -2466,6 +2637,7 @@ def fsave(
2466
2637
  # json.dump(data, file, **kwargs)
2467
2638
 
2468
2639
  def save_json(fpath_fname, var_dict_or_df):
2640
+ import json
2469
2641
  def _convert_js(data):
2470
2642
  if isinstance(data, pd.DataFrame):
2471
2643
  return data.to_dict(orient="list")
@@ -2487,10 +2659,12 @@ def fsave(
2487
2659
  # # setss = jsonload("/.json")
2488
2660
 
2489
2661
  def save_yaml(fpath, data, **kwargs):
2662
+ import yaml
2490
2663
  with open(fpath, "w") as file:
2491
2664
  yaml.dump(data, file, **kwargs)
2492
2665
 
2493
2666
  def save_xml(fpath, data):
2667
+ from lxml import etree
2494
2668
  root = etree.Element("root")
2495
2669
  if isinstance(data, dict):
2496
2670
  for key, val in data.items():
@@ -2501,6 +2675,25 @@ def fsave(
2501
2675
  tree = etree.ElementTree(root)
2502
2676
  tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
2503
2677
 
2678
+ def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
2679
+ engine = kwargs.pop("engine","auto") # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
2680
+ compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2681
+ try:
2682
+ # Attempt to save with "pyarrow" if engine is set to "auto"
2683
+ data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
2684
+ print(f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression.")
2685
+ except Exception as e:
2686
+ print(f"Error using with engine '{engine}' and {compression} compression: {e}")
2687
+ if "Sparse" in str(e):
2688
+ try:
2689
+ # Handle sparse data by converting columns to dense
2690
+ print("Attempting to convert sparse columns to dense format...")
2691
+ data = data.apply(lambda x: x.sparse.to_dense() if pd.api.types.is_sparse(x) else x)
2692
+ save_parquet(fpath, data=data,**kwargs)
2693
+ except Exception as last_e:
2694
+ print(f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}")
2695
+
2696
+
2504
2697
  if kind is None:
2505
2698
  _, kind = os.path.splitext(fpath)
2506
2699
  kind = kind.lower()
@@ -2546,7 +2739,92 @@ def fsave(
2546
2739
  save_yaml(fpath, content, **kwargs)
2547
2740
  elif kind == "ipynb":
2548
2741
  save_ipynb(fpath, content, **kwargs)
2742
+ elif kind.lower() in ["parquet","pq","big","par"]:
2743
+ verbose=kwargs.pop('verbose',False)
2744
+ if verbose:
2745
+ print(str_space_speed)
2746
+ use_pd("to_parquet")
2747
+ return None
2748
+ compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2749
+ # fix the fpath ends
2750
+ _fpath, _ext = os.path.splitext(fpath)
2751
+ fpath = _fpath+_ext.replace(kind, 'parquet')
2752
+ if compression is not None:
2753
+ if not fpath.endswith(compression):
2754
+ fpath=fpath+f".{compression}"
2755
+ save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
2756
+ elif kind.lower() in ["pkl","pk","pickle","pick"]:
2757
+ # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
2758
+ # Pickle is convenient if you want to preserve exact Python object types.
2759
+ verbose=kwargs.pop('verbose',False)
2760
+ if verbose:
2761
+ print(str_space_speed)
2762
+ use_pd("to_pickle")
2763
+ return None
2764
+ _fpath, _ext = os.path.splitext(fpath)
2765
+ fpath = _fpath+_ext.replace(kind, 'pkl')
2766
+ compression=kwargs.pop("compression",None)
2767
+ if compression is not None:
2768
+ if not fpath.endswith(compression["method"]):
2769
+ fpath=fpath+f".{compression["method"]}"
2770
+ if isinstance(content, pd.DataFrame):
2771
+ content.to_pickle(fpath,**kwargs)
2772
+ else:
2773
+ try:
2774
+ print("trying to convert it as a DataFrame...")
2775
+ content=pd.DataFrame(content)
2776
+ content.to_pickle(fpath,**kwargs)
2777
+ except Exception as e:
2778
+ raise ValueError(
2779
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2780
+ )
2781
+ elif kind.lower() in ["fea",'feather','ft','fe','feat','fether']:
2782
+ # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
2783
+ # optimized for data analytics tasks and is especially fast when working with Pandas.
2784
+
2785
+ verbose=kwargs.pop('verbose',False)
2786
+ if verbose:
2787
+ print(str_space_speed)
2788
+ use_pd("to_feather")
2789
+ return None
2790
+ _fpath, _ext = os.path.splitext(fpath)
2791
+ fpath = _fpath+_ext.replace(kind, 'feather')
2792
+ if isinstance(content, pd.DataFrame):
2793
+ content.to_feather(fpath,**kwargs)
2794
+ else:
2795
+ try:
2796
+ print("trying to convert it as a DataFrame...")
2797
+ content=pd.DataFrame(content)
2798
+ content.to_feather(fpath, **kwargs)
2799
+ except Exception as e:
2800
+ raise ValueError(
2801
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2802
+ )
2803
+ elif kind.lower() in ["hd",'hdf','h','h5']:
2804
+ # particularly useful for large datasets and can handle complex data structures
2805
+ verbose=kwargs.pop('verbose',False)
2806
+ if verbose:
2807
+ print(str_space_speed)
2808
+ use_pd("to_hdf")
2809
+ _fpath, _ext = os.path.splitext(fpath)
2810
+ fpath = _fpath+_ext.replace(kind, 'h5')
2811
+ compression=kwargs.pop("compression",None)
2812
+ if compression is not None:
2813
+ if not fpath.endswith(compression):
2814
+ fpath=fpath+f".{compression}"
2815
+ if isinstance(content, pd.DataFrame):
2816
+ content.to_hdf(fpath,key='content',**kwargs)
2817
+ else:
2818
+ try:
2819
+ print("trying to convert it as a DataFrame...")
2820
+ content=pd.DataFrame(content)
2821
+ content.to_hdf(fpath,**kwargs)
2822
+ except Exception as e:
2823
+ raise ValueError(
2824
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2825
+ )
2549
2826
  else:
2827
+ from . import netfinder
2550
2828
  try:
2551
2829
  netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
2552
2830
  except:
@@ -2669,6 +2947,7 @@ def isa(content, kind):
2669
2947
  elif "color" in kind.lower(): # file
2670
2948
  return is_str_color(content)
2671
2949
  elif "html" in kind.lower():
2950
+ import re
2672
2951
  if content is None or not isinstance(content, str):
2673
2952
  return False
2674
2953
  # Remove leading and trailing whitespace
@@ -2828,6 +3107,7 @@ def listdir(
2828
3107
  display(f.head())
2829
3108
  return f
2830
3109
  else:
3110
+ from box import Box
2831
3111
  if "l" in orient.lower(): # list # default
2832
3112
  res_output = Box(f.to_dict(orient="list"))
2833
3113
  return res_output
@@ -2868,13 +3148,10 @@ def mkdir_nest(fpath: str) -> str:
2868
3148
  Returns:
2869
3149
  - str: The path of the created directory.
2870
3150
  """
2871
-
2872
-
2873
3151
  # Split the full path into directories
2874
3152
  f_slash = "/" if "mac" in get_os().lower() else "\\"
2875
3153
  if os.path.isdir(fpath):
2876
3154
  fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
2877
- print(fpath)
2878
3155
  return fpath
2879
3156
  dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
2880
3157
 
@@ -2945,7 +3222,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
2945
3222
  if len(rootdir) == 1:
2946
3223
  rootdir = rootdir[0]
2947
3224
  rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
2948
- print(rootdir)
3225
+
2949
3226
  return rootdir
2950
3227
 
2951
3228
 
@@ -2957,6 +3234,8 @@ def split_path(fpath):
2957
3234
 
2958
3235
 
2959
3236
  def figsave(*args, dpi=300):
3237
+ import matplotlib.pyplot as plt
3238
+ from PIL import Image
2960
3239
  dir_save = None
2961
3240
  fname = None
2962
3241
  img = None
@@ -2972,13 +3251,13 @@ def figsave(*args, dpi=300):
2972
3251
 
2973
3252
  if dir_save is None:
2974
3253
  dir_save="./"
2975
- print(dir_save)
3254
+
2976
3255
  # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
2977
3256
  dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
2978
3257
  dir_ch = "".join(dir_save.split(f_slash)[-1:])
2979
3258
  if not dir_par.endswith(f_slash):
2980
3259
  dir_par += f_slash
2981
- print(dir_par)
3260
+
2982
3261
  if fname is None:
2983
3262
  fname = dir_ch
2984
3263
  mkdir(dir_par)
@@ -3065,6 +3344,7 @@ def figsave(*args, dpi=300):
3065
3344
  def is_str_color(s):
3066
3345
  # Regular expression pattern for hexadecimal color codes
3067
3346
  if isinstance(s,str):
3347
+ import re
3068
3348
  color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
3069
3349
  return re.match(color_code_pattern, s) is not None
3070
3350
  else:
@@ -3091,6 +3371,7 @@ def isnum(s):
3091
3371
 
3092
3372
 
3093
3373
  def is_image(fpath):
3374
+ import mimetypes
3094
3375
  mime_type, _ = mimetypes.guess_type(fpath)
3095
3376
  if mime_type and mime_type.startswith("image"):
3096
3377
  return True
@@ -3099,6 +3380,7 @@ def is_image(fpath):
3099
3380
 
3100
3381
 
3101
3382
  def is_document(fpath):
3383
+ import mimetypes
3102
3384
  mime_type, _ = mimetypes.guess_type(fpath)
3103
3385
  if mime_type and (
3104
3386
  mime_type.startswith("text/")
@@ -3119,6 +3401,7 @@ def is_document(fpath):
3119
3401
 
3120
3402
 
3121
3403
  def is_zip(fpath):
3404
+ import mimetypes
3122
3405
  mime_type, _ = mimetypes.guess_type(fpath)
3123
3406
  if mime_type == "application/zip":
3124
3407
  return True
@@ -3127,6 +3410,7 @@ def is_zip(fpath):
3127
3410
 
3128
3411
 
3129
3412
  def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
3413
+ import matplotlib.pyplot as plt
3130
3414
  if ax is None:
3131
3415
  ax = plt.gca()
3132
3416
  for loc, spine in ax.spines.items():
@@ -3215,7 +3499,7 @@ def apply_filter(img, *args):
3215
3499
  Returns:
3216
3500
  PIL.Image: The filtered image.
3217
3501
  """
3218
-
3502
+ from PIL import ImageFilter
3219
3503
  def correct_filter_name(filter_name):
3220
3504
  if "bl" in filter_name.lower() and "box" not in filter_name.lower():
3221
3505
  return "BLUR"
@@ -3457,6 +3741,8 @@ def imgsets(img, **kwargs):
3457
3741
  avg_contrast_factor = sum(contrast_factors) / num_channels
3458
3742
  return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
3459
3743
 
3744
+ import matplotlib.pyplot as plt
3745
+ from PIL import ImageEnhance,ImageOps
3460
3746
  # Load image if input is a file path
3461
3747
  if isinstance(img, str):
3462
3748
  img = load_img(img)
@@ -3520,6 +3806,7 @@ def imgsets(img, **kwargs):
3520
3806
  elif "pad" in k.lower():
3521
3807
  img_update = ImageOps.pad(img_update, size=value)
3522
3808
  elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
3809
+ from rembg import remove, new_session
3523
3810
  if isinstance(value, bool):
3524
3811
  session = new_session("isnet-general-use")
3525
3812
  img_update = remove(img_update, session=session)
@@ -3558,6 +3845,7 @@ def imgsets(img, **kwargs):
3558
3845
  else:
3559
3846
  img_update = remove(img_update)
3560
3847
  elif "bg" in k.lower() and "color" in k.lower():
3848
+ from rembg import remove
3561
3849
  if isinstance(value, list):
3562
3850
  value = tuple(value)
3563
3851
  if isinstance(value, tuple): # replace the background color
@@ -3589,6 +3877,8 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3589
3877
  Args:
3590
3878
  dir_img_list (list): List of the Directory containing the images.
3591
3879
  """
3880
+ import matplotlib.pyplot as plt
3881
+ from PIL import Image
3592
3882
  num_images = len(dir_img_list)
3593
3883
  if not kind.startswith("."):
3594
3884
  kind = "." + kind
@@ -3625,28 +3915,15 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3625
3915
  # usage:
3626
3916
  # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
3627
3917
  # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
3628
- def read_mplstyle(style_file):
3629
- # Load the style file
3630
- plt.style.use(style_file)
3631
-
3632
- # Get the current style properties
3633
- style_dict = plt.rcParams
3634
-
3635
- # Convert to dictionary
3636
- style_dict = dict(style_dict)
3637
- # Print the style dictionary
3638
- for i, j in style_dict.items():
3639
- print(f"\n{i}::::{j}")
3640
- return style_dict
3641
-
3642
3918
 
3643
- # #example usage:
3644
- # style_file = "/ std-colors.mplstyle"
3645
- # style_dict = read_mplstyle(style_file)
3646
3919
 
3647
3920
 
3648
3921
  # search and fine the director of the libary, which installed at local
3649
3922
  def dir_lib(lib_oi):
3923
+ """
3924
+ # example usage:
3925
+ # dir_lib("seaborn")
3926
+ """
3650
3927
  import site
3651
3928
 
3652
3929
  # Get the site-packages directory
@@ -3664,23 +3941,6 @@ def dir_lib(lib_oi):
3664
3941
  print(f"Cannot find the {lib_oi} in site-packages directory.")
3665
3942
  return dir_list
3666
3943
 
3667
-
3668
- # example usage:
3669
- # dir_lib("seaborn")
3670
-
3671
- """
3672
- # n = 7
3673
- # clist = get_color(n, cmap="auto", how="linspace") # get_color(100)
3674
- # plt.figure(figsize=[8, 5], dpi=100)
3675
- # x = np.linspace(0, 2 * np.pi, 50) * 100
3676
- # y = np.sin(x)
3677
- # for i in range(1, n + 1):
3678
- # plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
3679
- # plt.legend()
3680
- # plt.ylim(-2, 20)
3681
- # figsets(plt.gca(), {"style": "whitegrid"}) """
3682
-
3683
-
3684
3944
  class FileInfo:
3685
3945
  def __init__(
3686
3946
  self,
@@ -3757,6 +4017,7 @@ class FileInfo:
3757
4017
 
3758
4018
 
3759
4019
  def finfo(fpath):
4020
+ import time
3760
4021
  fname, fmt = os.path.splitext(fpath)
3761
4022
  dir_par = os.path.dirname(fpath) + "/"
3762
4023
  data = {
@@ -3771,6 +4032,7 @@ def finfo(fpath):
3771
4032
  }
3772
4033
  extra_info = {}
3773
4034
  if data["kind"] == ".pdf":
4035
+ from pdf2image import pdfinfo_from_path
3774
4036
  extra_info = pdfinfo_from_path(fpath)
3775
4037
 
3776
4038
  return FileInfo(
@@ -3785,18 +4047,7 @@ def finfo(fpath):
3785
4047
  extra_info=extra_info,
3786
4048
  )
3787
4049
 
3788
-
3789
4050
  # ! format excel file
3790
- import pandas as pd
3791
- from datetime import datetime
3792
- from openpyxl import load_workbook
3793
- from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
3794
- from openpyxl.utils import get_column_letter
3795
- from openpyxl.worksheet.datavalidation import DataValidation
3796
- from openpyxl.comments import Comment
3797
- from openpyxl.formatting.rule import ColorScaleRule
3798
-
3799
-
3800
4051
  def hex2argb(hex_color):
3801
4052
  """
3802
4053
  Convert a hex color code to aARGB format required by openpyxl.
@@ -3827,341 +4078,7 @@ def hex2argb(hex_color):
3827
4078
  return hex_color[-9:]
3828
4079
  else:
3829
4080
  return "F" * (9 - len(hex_color)) + hex_color
3830
- raise ValueError(
3831
- "Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
3832
- )
3833
-
3834
-
3835
- def convert_indices_to_range(row_slice, col_slice):
3836
- """Convert numerical row and column slices to Excel-style range strings."""
3837
- start_row = row_slice.start + 1
3838
- end_row = row_slice.stop if row_slice.stop is not None else None
3839
- start_col = col_slice.start + 1
3840
- end_col = col_slice.stop if col_slice.stop is not None else None
3841
-
3842
- start_col_letter = get_column_letter(start_col)
3843
- end_col_letter = get_column_letter(end_col) if end_col else None
3844
- return (
3845
- f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
3846
- if end_col_letter
3847
- else f"{start_col_letter}{start_row}"
3848
- )
3849
-
3850
-
3851
- def apply_format(ws, cell, cell_range):
3852
- """Apply cell formatting to a specified range."""
3853
- cell_font, cell_fill, cell_alignment, border = None, None, None, None
3854
- kws_cell = ["font", "fill", "alignment", "border"]
3855
- for K, _ in cell.items():
3856
- if strcmp(K, kws_cell)[0] == "font":
3857
- #! font
3858
- font_color = "000000"
3859
- font_name = "Arial"
3860
- font_underline = "none"
3861
- font_size = 14
3862
- font_bold = False
3863
- font_strike = False
3864
- font_italic = False
3865
- kws_font = [
3866
- "name",
3867
- "size",
3868
- "bold",
3869
- "underline",
3870
- "color",
3871
- "strike",
3872
- "italic",
3873
- ]
3874
- for k_, v_ in cell.get(K, {}).items():
3875
- if strcmp(k_, kws_font)[0] == "name":
3876
- font_name = v_
3877
- elif strcmp(k_, kws_font)[0] == "size":
3878
- font_size = v_
3879
- elif strcmp(k_, kws_font)[0] == "bold":
3880
- font_bold = v_
3881
- elif strcmp(k_, kws_font)[0] == "underline":
3882
- font_underline = strcmp(v_, ["none", "single", "double"])[0]
3883
- elif strcmp(k_, kws_font)[0] == "color":
3884
- font_color = hex2argb(v_)
3885
- elif strcmp(k_, kws_font)[0] == "strike":
3886
- font_strike = v_
3887
- elif strcmp(k_, kws_font)[0] == "italic":
3888
- font_italic = v_
3889
-
3890
- cell_font = Font(
3891
- name=font_name,
3892
- size=font_size,
3893
- bold=font_bold,
3894
- italic=font_italic,
3895
- underline=font_underline,
3896
- strike=font_strike,
3897
- color=font_color,
3898
- )
3899
-
3900
- if strcmp(K, kws_cell)[0] == "fill":
3901
- #! fill
3902
- kws_fill = ["start_color", "end_color", "fill_type", "color"]
3903
- kws_fill_type = [
3904
- "darkVertical",
3905
- "lightDown",
3906
- "lightGrid",
3907
- "solid",
3908
- "darkDown",
3909
- "lightGray",
3910
- "lightUp",
3911
- "gray0625",
3912
- "lightVertical",
3913
- "lightHorizontal",
3914
- "darkHorizontal",
3915
- "gray125",
3916
- "darkUp",
3917
- "mediumGray",
3918
- "darkTrellis",
3919
- "darkGray",
3920
- "lightTrellis",
3921
- "darkGrid",
3922
- ]
3923
- start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
3924
- for k, v in cell.get(K, {}).items():
3925
- if strcmp(k, kws_fill)[0] == "color":
3926
- start_color, end_color = hex2argb(v), hex2argb(v)
3927
- break
3928
- for k, v in cell.get(K, {}).items():
3929
- if strcmp(k, kws_fill)[0] == "start_color":
3930
- start_color = hex2argb(v)
3931
- elif strcmp(k, kws_fill)[0] == "end_color":
3932
- end_color = hex2argb(v)
3933
- elif strcmp(k, kws_fill)[0] == "fill_type":
3934
- fill_type = strcmp(v, kws_fill_type)[0]
3935
- cell_fill = PatternFill(
3936
- start_color=start_color,
3937
- end_color=end_color,
3938
- fill_type=fill_type,
3939
- )
3940
-
3941
- if strcmp(K, kws_cell)[0] == "alignment":
3942
- #! alignment
3943
- # default
3944
- align_horizontal = "general"
3945
- align_vertical = "center"
3946
- align_rot = 0
3947
- align_wrap = False
3948
- align_shrink = False
3949
- align_indent = 0
3950
- kws_align = [
3951
- "horizontal",
3952
- "ha",
3953
- "vertical",
3954
- "va",
3955
- "text_rotation",
3956
- "rotat",
3957
- "rot",
3958
- "wrap_text",
3959
- "wrap",
3960
- "shrink_to_fit",
3961
- "shrink",
3962
- "indent",
3963
- ]
3964
- for k, v in cell.get(K, {}).items():
3965
- if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
3966
- align_horizontal = strcmp(
3967
- v, ["general", "left", "right", "center"]
3968
- )[0]
3969
- elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
3970
- align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
3971
- elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
3972
- align_rot = v
3973
- elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
3974
- align_wrap = v
3975
- elif strcmp(k, kws_align)[0] in [
3976
- "shrink_to_fit",
3977
- "shrink",
3978
- "wrap_text",
3979
- "wrap",
3980
- ]:
3981
- align_shrink = v
3982
- elif strcmp(k, kws_align)[0] in ["indent"]:
3983
- align_indent = v
3984
- cell_alignment = Alignment(
3985
- horizontal=align_horizontal,
3986
- vertical=align_vertical,
3987
- text_rotation=align_rot,
3988
- wrap_text=align_wrap,
3989
- shrink_to_fit=align_shrink,
3990
- indent=align_indent,
3991
- )
3992
-
3993
- if strcmp(K, kws_cell)[0] == "border":
3994
- #! border
3995
- kws_border = [
3996
- "color_left",
3997
- "color_l",
3998
- "color_right",
3999
- "color_r",
4000
- "color_top",
4001
- "color_t",
4002
- "color_bottom",
4003
- "color_b",
4004
- "color_diagonal",
4005
- "color_d",
4006
- "color_outline",
4007
- "color_o",
4008
- "color_vertical",
4009
- "color_v",
4010
- "color_horizontal",
4011
- "color_h",
4012
- "color",
4013
- "style_left",
4014
- "style_l",
4015
- "style_right",
4016
- "style_r",
4017
- "style_top",
4018
- "style_t",
4019
- "style_bottom",
4020
- "style_b",
4021
- "style_diagonal",
4022
- "style_d",
4023
- "style_outline",
4024
- "style_o",
4025
- "style_vertical",
4026
- "style_v",
4027
- "style_horizontal",
4028
- "style_h",
4029
- "style",
4030
- ]
4031
- # * border color
4032
- border_color_l, border_color_r, border_color_t, border_color_b = (
4033
- "FF000000",
4034
- "FF000000",
4035
- "FF000000",
4036
- "FF000000",
4037
- )
4038
- border_color_d, border_color_o, border_color_v, border_color_h = (
4039
- "FF000000",
4040
- "FF000000",
4041
- "FF000000",
4042
- "FF000000",
4043
- )
4044
- # get colors config
4045
- for k, v in cell.get(K, {}).items():
4046
- if strcmp(k, kws_border)[0] in ["color"]:
4047
- border_color_all = hex2argb(v)
4048
- # if "color" is given, apply the same color to every border side first
4049
- # individual side colors defined below can then override it
4050
- border_color_l, border_color_r, border_color_t, border_color_b = (
4051
- border_color_all,
4052
- border_color_all,
4053
- border_color_all,
4054
- border_color_all,
4055
- )
4056
- border_color_d, border_color_o, border_color_v, border_color_h = (
4057
- border_color_all,
4058
- border_color_all,
4059
- border_color_all,
4060
- border_color_all,
4061
- )
4062
- elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
4063
- border_color_l = hex2argb(v)
4064
- elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
4065
- border_color_r = hex2argb(v)
4066
- elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
4067
- border_color_t = hex2argb(v)
4068
- elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
4069
- border_color_b = hex2argb(v)
4070
- elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
4071
- border_color_d = hex2argb(v)
4072
- elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
4073
- border_color_o = hex2argb(v)
4074
- elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
4075
- border_color_v = hex2argb(v)
4076
- elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4077
- border_color_h = hex2argb(v)
4078
- # *border style
4079
- border_styles = [
4080
- "thin",
4081
- "medium",
4082
- "thick",
4083
- "dotted",
4084
- "dashed",
4085
- "hair",
4086
- "mediumDashed",
4087
- "dashDot",
4088
- "dashDotDot",
4089
- "slantDashDot",
4090
- "none",
4091
- ]
4092
- border_style_l, border_style_r, border_style_t, border_style_b = (
4093
- None,
4094
- None,
4095
- None,
4096
- None,
4097
- )
4098
- border_style_d, border_style_o, border_style_v, border_style_h = (
4099
- None,
4100
- None,
4101
- None,
4102
- None,
4103
- )
4104
- # get styles config
4105
- for k, v in cell.get(K, {}).items():
4106
- # if not "style" in k:
4107
- # break
4108
- if strcmp(k, kws_border)[0] in ["style"]:
4109
- border_style_all = strcmp(v, border_styles)[0]
4110
- # if "style" is given, apply the same style to every border side first
4111
- # individual side styles defined below can then override it
4112
- border_style_l, border_style_r, border_style_t, border_style_b = (
4113
- border_style_all,
4114
- border_style_all,
4115
- border_style_all,
4116
- border_style_all,
4117
- )
4118
- border_style_d, border_style_o, border_style_v, border_style_h = (
4119
- border_style_all,
4120
- border_style_all,
4121
- border_style_all,
4122
- border_style_all,
4123
- )
4124
- elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
4125
- border_style_l = strcmp(v, border_styles)[0]
4126
- elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
4127
- border_style_r = strcmp(v, border_styles)[0]
4128
- elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
4129
- border_style_t = strcmp(v, border_styles)[0]
4130
- elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
4131
- border_style_b = strcmp(v, border_styles)[0]
4132
- elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
4133
- border_style_d = strcmp(v, border_styles)[0]
4134
- elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
4135
- border_style_o = strcmp(v, border_styles)[0]
4136
- elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
4137
- border_style_v = strcmp(v, border_styles)[0]
4138
- elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
4139
- border_style_h = strcmp(v, border_styles)[0]
4140
- # * apply border config
4141
- border = Border(
4142
- left=Side(border_style=border_style_l, color=border_color_l),
4143
- right=Side(border_style=border_style_r, color=border_color_r),
4144
- top=Side(border_style=border_style_t, color=border_color_t),
4145
- bottom=Side(border_style=border_style_b, color=border_color_b),
4146
- diagonal=Side(border_style=border_style_d, color=border_color_d),
4147
- diagonal_direction=0,
4148
- outline=Side(border_style=border_style_o, color=border_color_o),
4149
- vertical=Side(border_style=border_style_v, color=border_color_v),
4150
- horizontal=Side(border_style=border_style_h, color=border_color_h),
4151
- )
4152
-
4153
- #! final apply configs
4154
- for row in ws[cell_range]:
4155
- for cell_ in row:
4156
- if cell_font:
4157
- cell_.font = cell_font
4158
- if cell_fill:
4159
- cell_.fill = cell_fill
4160
- if cell_alignment:
4161
- cell_.alignment = cell_alignment
4162
- if border:
4163
- cell_.border = border
4164
-
4081
+ raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
4165
4082
 
4166
4083
  def format_excel(
4167
4084
  df=None,
@@ -4182,6 +4099,255 @@ def format_excel(
4182
4099
  conditional_format=None, # dict
4183
4100
  **kwargs,
4184
4101
  ):
4102
+ import pandas as pd
4103
+ from datetime import datetime
4104
+ from openpyxl import load_workbook
4105
+ from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
4106
+ from openpyxl.utils import get_column_letter
4107
+ from openpyxl.worksheet.datavalidation import DataValidation
4108
+ from openpyxl.comments import Comment
4109
+ from openpyxl.formatting.rule import ColorScaleRule
4110
+
4111
+ def convert_indices_to_range(row_slice, col_slice):
4112
+ """Convert numerical row and column slices to Excel-style range strings."""
4113
+ start_row = row_slice.start + 1
4114
+ end_row = row_slice.stop if row_slice.stop is not None else None
4115
+ start_col = col_slice.start + 1
4116
+ end_col = col_slice.stop if col_slice.stop is not None else None
4117
+
4118
+ start_col_letter = get_column_letter(start_col)
4119
+ end_col_letter = get_column_letter(end_col) if end_col else None
4120
+ return (
4121
+ f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
4122
+ if end_col_letter
4123
+ else f"{start_col_letter}{start_row}"
4124
+ )
4125
+
4126
+ def apply_format(ws, cell, cell_range):
4127
+ """Apply cell formatting to a specified range."""
4128
+ cell_font, cell_fill, cell_alignment, border = None, None, None, None
4129
+ kws_cell = ["font", "fill", "alignment", "border"]
4130
+ for K, _ in cell.items():
4131
+ if strcmp(K, kws_cell)[0] == "font":
4132
+ #! font
4133
+ font_color = "000000"
4134
+ font_name = "Arial"
4135
+ font_underline = "none"
4136
+ font_size = 14
4137
+ font_bold = False
4138
+ font_strike = False
4139
+ font_italic = False
4140
+ kws_font = ["name","size","bold","underline","color","strike","italic"]
4141
+ for k_, v_ in cell.get(K, {}).items():
4142
+ if strcmp(k_, kws_font)[0] == "name":
4143
+ font_name = v_
4144
+ elif strcmp(k_, kws_font)[0] == "size":
4145
+ font_size = v_
4146
+ elif strcmp(k_, kws_font)[0] == "bold":
4147
+ font_bold = v_
4148
+ elif strcmp(k_, kws_font)[0] == "underline":
4149
+ font_underline = strcmp(v_, ["none", "single", "double"])[0]
4150
+ elif strcmp(k_, kws_font)[0] == "color":
4151
+ font_color = hex2argb(v_)
4152
+ elif strcmp(k_, kws_font)[0] == "strike":
4153
+ font_strike = v_
4154
+ elif strcmp(k_, kws_font)[0] == "italic":
4155
+ font_italic = v_
4156
+
4157
+ cell_font = Font(
4158
+ name=font_name,
4159
+ size=font_size,
4160
+ bold=font_bold,
4161
+ italic=font_italic,
4162
+ underline=font_underline,
4163
+ strike=font_strike,
4164
+ color=font_color,
4165
+ )
4166
+
4167
+ if strcmp(K, kws_cell)[0] == "fill":
4168
+ #! fill
4169
+ kws_fill = ["start_color", "end_color", "fill_type", "color"]
4170
+ kws_fill_type = ["darkVertical","lightDown","lightGrid","solid","darkDown","lightGray","lightUp","gray0625","lightVertical","lightHorizontal",
4171
+ "darkHorizontal","gray125","darkUp","mediumGray","darkTrellis","darkGray","lightTrellis","darkGrid"]
4172
+ start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
4173
+ for k, v in cell.get(K, {}).items():
4174
+ if strcmp(k, kws_fill)[0] == "color":
4175
+ start_color, end_color = hex2argb(v), hex2argb(v)
4176
+ break
4177
+ for k, v in cell.get(K, {}).items():
4178
+ if strcmp(k, kws_fill)[0] == "start_color":
4179
+ start_color = hex2argb(v)
4180
+ elif strcmp(k, kws_fill)[0] == "end_color":
4181
+ end_color = hex2argb(v)
4182
+ elif strcmp(k, kws_fill)[0] == "fill_type":
4183
+ fill_type = strcmp(v, kws_fill_type)[0]
4184
+ cell_fill = PatternFill(
4185
+ start_color=start_color,
4186
+ end_color=end_color,
4187
+ fill_type=fill_type,
4188
+ )
4189
+
4190
+ if strcmp(K, kws_cell)[0] == "alignment":
4191
+ #! alignment
4192
+ # default
4193
+ align_horizontal = "general"
4194
+ align_vertical = "center"
4195
+ align_rot = 0
4196
+ align_wrap = False
4197
+ align_shrink = False
4198
+ align_indent = 0
4199
+ kws_align = [
4200
+ "horizontal",
4201
+ "ha",
4202
+ "vertical",
4203
+ "va",
4204
+ "text_rotation",
4205
+ "rotat",
4206
+ "rot",
4207
+ "wrap_text",
4208
+ "wrap",
4209
+ "shrink_to_fit",
4210
+ "shrink",
4211
+ "indent",
4212
+ ]
4213
+ for k, v in cell.get(K, {}).items():
4214
+ if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
4215
+ align_horizontal = strcmp(
4216
+ v, ["general", "left", "right", "center"]
4217
+ )[0]
4218
+ elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
4219
+ align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
4220
+ elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
4221
+ align_rot = v
4222
+ elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
4223
+ align_wrap = v
4224
+ elif strcmp(k, kws_align)[0] in [
4225
+ "shrink_to_fit",
4226
+ "shrink",
4227
+ "wrap_text",
4228
+ "wrap",
4229
+ ]:
4230
+ align_shrink = v
4231
+ elif strcmp(k, kws_align)[0] in ["indent"]:
4232
+ align_indent = v
4233
+ cell_alignment = Alignment(
4234
+ horizontal=align_horizontal,
4235
+ vertical=align_vertical,
4236
+ text_rotation=align_rot,
4237
+ wrap_text=align_wrap,
4238
+ shrink_to_fit=align_shrink,
4239
+ indent=align_indent,
4240
+ )
4241
+
4242
+ if strcmp(K, kws_cell)[0] == "border":
4243
+ #! border
4244
+ kws_border = ["color_left","color_l","color_right","color_r","color_top","color_t","color_bottom","color_b",
4245
+ "color_diagonal","color_d","color_outline","color_o","color_vertical","color_v","color_horizontal",
4246
+ "color_h","color","style_left","style_l","style_right","style_r","style_top","style_t","style_bottom","style_b",
4247
+ "style_diagonal","style_d","style_outline","style_o","style_vertical","style_v","style_horizontal",
4248
+ "style_h","style"]
4249
+ # * border color
4250
+ border_color_l, border_color_r, border_color_t, border_color_b = ("FF000000","FF000000","FF000000","FF000000")
4251
+ border_color_d, border_color_o, border_color_v, border_color_h = ("FF000000","FF000000","FF000000","FF000000")
4252
+ # get colors config
4253
+ for k, v in cell.get(K, {}).items():
4254
+ if strcmp(k, kws_border)[0] in ["color"]:
4255
+ border_color_all = hex2argb(v)
4256
+ # if "color" is given, apply the same color to every border side first
4257
+ # individual side colors defined below can then override it
4258
+ border_color_l, border_color_r, border_color_t, border_color_b = (
4259
+ border_color_all,
4260
+ border_color_all,
4261
+ border_color_all,
4262
+ border_color_all,
4263
+ )
4264
+ border_color_d, border_color_o, border_color_v, border_color_h = (
4265
+ border_color_all,
4266
+ border_color_all,
4267
+ border_color_all,
4268
+ border_color_all,
4269
+ )
4270
+ elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
4271
+ border_color_l = hex2argb(v)
4272
+ elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
4273
+ border_color_r = hex2argb(v)
4274
+ elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
4275
+ border_color_t = hex2argb(v)
4276
+ elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
4277
+ border_color_b = hex2argb(v)
4278
+ elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
4279
+ border_color_d = hex2argb(v)
4280
+ elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
4281
+ border_color_o = hex2argb(v)
4282
+ elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
4283
+ border_color_v = hex2argb(v)
4284
+ elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4285
+ border_color_h = hex2argb(v)
4286
+ # *border style
4287
+ border_styles = ["thin","medium","thick","dotted","dashed",
4288
+ "hair","mediumDashed","dashDot","dashDotDot","slantDashDot","none"]
4289
+ border_style_l, border_style_r, border_style_t, border_style_b = (None,None,None,None)
4290
+ border_style_d, border_style_o, border_style_v, border_style_h = (None,None,None,None)
4291
+ # get styles config
4292
+ for k, v in cell.get(K, {}).items():
4293
+ # if not "style" in k:
4294
+ # break
4295
+ if strcmp(k, kws_border)[0] in ["style"]:
4296
+ border_style_all = strcmp(v, border_styles)[0]
4297
+ # if "style" is given, apply the same style to every border side first
4298
+ # individual side styles defined below can then override it
4299
+ border_style_l, border_style_r, border_style_t, border_style_b = (
4300
+ border_style_all,
4301
+ border_style_all,
4302
+ border_style_all,
4303
+ border_style_all,
4304
+ )
4305
+ border_style_d, border_style_o, border_style_v, border_style_h = (
4306
+ border_style_all,
4307
+ border_style_all,
4308
+ border_style_all,
4309
+ border_style_all,
4310
+ )
4311
+ elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
4312
+ border_style_l = strcmp(v, border_styles)[0]
4313
+ elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
4314
+ border_style_r = strcmp(v, border_styles)[0]
4315
+ elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
4316
+ border_style_t = strcmp(v, border_styles)[0]
4317
+ elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
4318
+ border_style_b = strcmp(v, border_styles)[0]
4319
+ elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
4320
+ border_style_d = strcmp(v, border_styles)[0]
4321
+ elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
4322
+ border_style_o = strcmp(v, border_styles)[0]
4323
+ elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
4324
+ border_style_v = strcmp(v, border_styles)[0]
4325
+ elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
4326
+ border_style_h = strcmp(v, border_styles)[0]
4327
+ # * apply border config
4328
+ border = Border(
4329
+ left=Side(border_style=border_style_l, color=border_color_l),
4330
+ right=Side(border_style=border_style_r, color=border_color_r),
4331
+ top=Side(border_style=border_style_t, color=border_color_t),
4332
+ bottom=Side(border_style=border_style_b, color=border_color_b),
4333
+ diagonal=Side(border_style=border_style_d, color=border_color_d),
4334
+ diagonal_direction=0,
4335
+ outline=Side(border_style=border_style_o, color=border_color_o),
4336
+ vertical=Side(border_style=border_style_v, color=border_color_v),
4337
+ horizontal=Side(border_style=border_style_h, color=border_color_h),
4338
+ )
4339
+
4340
+ #! final apply configs
4341
+ for row in ws[cell_range]:
4342
+ for cell_ in row:
4343
+ if cell_font:
4344
+ cell_.font = cell_font
4345
+ if cell_fill:
4346
+ cell_.fill = cell_fill
4347
+ if cell_alignment:
4348
+ cell_.alignment = cell_alignment
4349
+ if border:
4350
+ cell_.border = border
4185
4351
  if not isinstance(df, pd.DataFrame):
4186
4352
  try:
4187
4353
  print(f"is loading file {os.path.basename(df)}")
@@ -4527,12 +4693,10 @@ format_excel(
4527
4693
  print(f"Formatted Excel file saved as:\n{filename}")
4528
4694
 
4529
4695
 
4530
- from IPython.display import display, HTML, Markdown
4531
-
4532
-
4533
4696
  def preview(var):
4534
4697
  """Master function to preview formatted variables in Jupyter."""
4535
-
4698
+ from bs4 import BeautifulSoup
4699
+ from IPython.display import display, HTML, Markdown
4536
4700
  if isinstance(var, str):
4537
4701
  if isa(var, "html"):
4538
4702
  display(HTML(var)) # Render as HTML
@@ -4549,6 +4713,7 @@ def preview(var):
4549
4713
  display(var)
4550
4714
 
4551
4715
  elif isinstance(var, list) or isinstance(var, dict):
4716
+ import json
4552
4717
  # Display JSON
4553
4718
  json_str = json.dumps(var, indent=4)
4554
4719
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4562,6 +4727,7 @@ def preview(var):
4562
4727
  display(Image(filename=var))
4563
4728
 
4564
4729
  elif isinstance(var, dict):
4730
+ import json
4565
4731
  # Handle dictionary formatting
4566
4732
  json_str = json.dumps(var, indent=4)
4567
4733
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4569,13 +4735,154 @@ def preview(var):
4569
4735
  else:
4570
4736
  # If the format is not recognized, print a message
4571
4737
  print("Format not recognized or unsupported.")
4572
-
4573
-
4574
4738
  # # Example usages:
4575
4739
  # preview("This is a plain text message.")
4576
4740
  # preview("# This is a Markdown header")
4577
4741
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4578
4742
  # preview({"key": "value", "numbers": [1, 2, 3]})
4743
+
4744
+ def _df_outlier(
4745
+ data,
4746
+ columns=None,
4747
+ method=["zscore", "iqr", "percentile", "iforest"],
4748
+ min_outlier_method=3, # minimum number of methods that must flag a row as an outlier
4749
+ zscore_threshold=3,
4750
+ iqr_threshold=1.5,
4751
+ lower_percentile=5,
4752
+ upper_percentile=95,
4753
+ ):
4754
+ from scipy.stats import zscore
4755
+ from sklearn.ensemble import IsolationForest
4756
+ from sklearn.preprocessing import StandardScaler
4757
+
4758
+ col_names_org = data.columns.tolist()
4759
+ index_names_org = data.index.tolist()
4760
+ # Separate numeric and non-numeric columns
4761
+ numeric_data = data.select_dtypes(include=[np.number])
4762
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
4763
+
4764
+ if columns is not None:
4765
+ numeric_data = numeric_data[columns]
4766
+ elif numeric_data.empty:
4767
+ raise ValueError("Input data must contain numeric columns.")
4768
+
4769
+ outliers_df = pd.DataFrame(index=numeric_data.index)
4770
+ if isinstance(method, str):
4771
+ method = [method]
4772
+
4773
+ # Z-score method
4774
+ if "zscore" in method:
4775
+ z_scores = np.abs(zscore(numeric_data))
4776
+ outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
4777
+
4778
+ # IQR method
4779
+ if "iqr" in method:
4780
+ Q1 = numeric_data.quantile(0.25)
4781
+ Q3 = numeric_data.quantile(0.75)
4782
+ IQR = Q3 - Q1
4783
+ lower_bound = Q1 - iqr_threshold * IQR
4784
+ upper_bound = Q3 + iqr_threshold * IQR
4785
+ outliers_df["iqr"] = (
4786
+ (numeric_data < lower_bound) | (numeric_data > upper_bound)
4787
+ ).any(axis=1)
4788
+
4789
+ # Percentile method
4790
+ if "percentile" in method:
4791
+ lower_bound = numeric_data.quantile(lower_percentile / 100)
4792
+ upper_bound = numeric_data.quantile(upper_percentile / 100)
4793
+ outliers_df["percentile"] = (
4794
+ (numeric_data < lower_bound) | (numeric_data > upper_bound)
4795
+ ).any(axis=1)
4796
+
4797
+ # Isolation Forest method
4798
+ if "iforest" in method:
4799
+ # iforest cannot handle NaNs, so fill them with the column mean first
4800
+ numeric_data_ = numeric_data.fillna(numeric_data.mean())
4801
+ scaler = StandardScaler()
4802
+ scaled_data = scaler.fit_transform(numeric_data_)
4803
+ iso_forest = IsolationForest(contamination=0.05)
4804
+ outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
4805
+
4806
+ # Combine all outlier detections
4807
+ if len(method) == 4: # all methods are used
4808
+ outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
4809
+ else:
4810
+ outliers_df["outlier"] = outliers_df.any(axis=1)
4811
+
4812
+ # Handling Outliers: Remove or Winsorize or Replace with NaN
4813
+ processed_data = numeric_data.copy()
4814
+
4815
+ processed_data.loc[outliers_df["outlier"]] = np.nan
4816
+
4817
+ return processed_data
4818
+
4819
+
4820
+ def df_outlier(
4821
+ data,
4822
+ columns=None,
4823
+ method=["zscore", "iqr", "percentile", "iforest"],
4824
+ min_outlier_method=2, # minimum number of methods that must flag a row as an outlier
4825
+ zscore_threshold=3,
4826
+ iqr_threshold=1.5,
4827
+ lower_percentile=5,
4828
+ upper_percentile=95,
4829
+ ):
4830
+ """
4831
+ Usage:
4832
+ data_out = df_outlier(
4833
+ data,
4834
+ columns=["income"],
4835
+ method="iforest",
4836
+ min_outlier_method=1)
4837
+
4838
+ Advanced outlier detection and handling function.
4839
+
4840
+ Parameters:
4841
+ - data: DataFrame, the input data (numerical).
4842
+ - method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
4843
+ - zscore_threshold: float, threshold for Z-score outlier detection (default 3).
4844
+ - iqr_threshold: float, threshold for IQR method (default 1.5).
4845
+ - lower_percentile: float, lower percentile for percentile-based outliers (default 5).
4846
+ - upper_percentile: float, upper percentile for percentile-based outliers (default 95).
4847
+ - keep_nan: bool, whether to replace outliers with NaN (default True).
4848
+ - plot: bool, whether to visualize the outliers (default False).
4849
+ - min_outlier_method: int, minimum number of methods that need to flag a row as an outlier (default 2).
4850
+ - inplace: bool, whether to modify the original `data` DataFrame (default False).
4851
+
4852
+ Returns:
4853
+ - processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
4854
+ """
4855
+ col_names_org = data.columns.tolist()
4856
+ index_names_org = data.index.tolist()
4857
+
4858
+ numeric_data = data.select_dtypes(include=[np.number])
4859
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
4860
+
4861
+ _outlier_df_tmp = pd.DataFrame()
4862
+ for col in numeric_data.columns:
4863
+ _outlier_df_tmp = pd.concat(
4864
+ [
4865
+ _outlier_df_tmp,
4866
+ _df_outlier(
4867
+ data=data,
4868
+ columns=[col],
4869
+ method=method,
4870
+ min_outlier_method=min_outlier_method, # minimum number of methods that must flag an outlier
4871
+ zscore_threshold=zscore_threshold,
4872
+ iqr_threshold=iqr_threshold,
4873
+ lower_percentile=lower_percentile,
4874
+ upper_percentile=upper_percentile,
4875
+ ),
4876
+ ],
4877
+ axis=1,
4878
+ # join="inner",
4879
+ )
4880
+ processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
4881
+ processed_data = processed_data[col_names_org]
4882
+ return processed_data
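A short usage sketch of the new helper pair: each requested detector votes per row, and a flagged value is blanked to NaN once enough detectors agree (with all four detectors the threshold is min_outlier_method; with a subset, any single flag is enough, per the combination step above).

import numpy as np
import pandas as pd

df = pd.DataFrame({"income": [3200, 3400, 3100, 3300, 250000],
                   "age":    [25, 31, 28, 27, 29]})
clean = df_outlier(df, min_outlier_method=2)   # all four detectors by default
print(clean["income"].isna().sum())            # the 250000 entry should be among the NaNs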
4883
+
4884
+
4885
+
4579
4886
  def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
4580
4887
  """
4581
4888
  Extend a DataFrame by the list elements in the column.
@@ -4967,6 +5274,7 @@ def df_drop_duplicates(
4967
5274
  return None
4968
5275
  else:
4969
5276
  return result
5277
+ #! fillna()
4970
5278
  def df_fillna(
4971
5279
  data: pd.DataFrame,
4972
5280
  method: str = "knn",
@@ -4974,8 +5282,8 @@ def df_fillna(
4974
5282
  constant: float = None,
4975
5283
  n_neighbors: int = 5, # KNN-specific
4976
5284
  max_iter: int = 10, # Iterative methods specific
4977
- inplace: bool = True,
4978
- random_state:int = None
5285
+ inplace: bool = False,
5286
+ random_state:int = 1
4979
5287
  ) -> pd.DataFrame:
4980
5288
  """
4981
5289
  Fill missing values in a DataFrame using specified imputation method.
@@ -5003,7 +5311,18 @@ def df_fillna(
5003
5311
  inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
5004
5312
 
5005
5313
  """
5006
-
5314
+ if isinstance(data, pd.Series):
5315
+ data=pd.DataFrame(data)
5316
+ # handle None
5317
+ for col in data.columns:
5318
+ data[col] = data[col].apply(lambda x: np.nan if x is None else x)
5319
+
5320
+ col_names_org = data.columns.tolist()
5321
+ index_names_org = data.index.tolist()
5322
+ # Separate numeric and non-numeric columns
5323
+ numeric_data = data.select_dtypes(include=[np.number])
5324
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
5325
+
5007
5326
  if data.empty:
5008
5327
  raise ValueError("Input DataFrame is empty.")
5009
5328
 
@@ -5032,15 +5351,6 @@ def df_fillna(
5032
5351
  from sklearn.impute import IterativeImputer
5033
5352
 
5034
5353
  imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5035
- # elif method == "missforest":
5036
- # from missingpy import MissForest
5037
- # imputer = MissForest(max_iter=max_iter, random_state=random_state)
5038
- # elif method == "softimpute":
5039
- # from fancyimpute import SoftImpute
5040
- # imputer = SoftImpute()
5041
- # elif method == "svd":
5042
- # from fancyimpute import IterativeSVD
5043
- # imputer = IterativeSVD(max_iters=max_iter)
5044
5354
  else: # mean, median, most_frequent
5045
5355
  from sklearn.impute import SimpleImputer
5046
5356
  imputer = SimpleImputer(strategy=method)
@@ -5048,26 +5358,49 @@ def df_fillna(
5048
5358
  # Fit and transform the data
5049
5359
  if axis == 0:
5050
5360
  # Impute column-wise
5051
- imputed_data = imputer.fit_transform(data)
5052
- imputed_data.shape
5361
+ imputed_data = imputer.fit_transform(numeric_data)
5053
5362
  elif axis == 1:
5054
5363
  # Impute row-wise
5055
- imputed_data = imputer.fit_transform(data.T)
5056
- imputed_data.shape
5364
+ imputed_data = imputer.fit_transform(numeric_data.T)
5057
5365
  else:
5058
5366
  raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
5059
5367
 
5060
- df_filled = pd.DataFrame(
5368
+ imputed_data = pd.DataFrame(
5061
5369
  imputed_data if axis == 0 else imputed_data.T,
5062
- index=data.index,# if axis == 0 else data.columns,
5063
- columns=data.columns,# if axis == 0 else data.index,
5370
+ index=numeric_data.index if axis == 0 else data.columns,
5371
+ columns=numeric_data.columns if axis == 0 else data.index,
5064
5372
  )
5373
+ for col in imputed_data.select_dtypes(include=[np.number]).columns:
5374
+ imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
5375
+
5376
+ # Handle non-numeric data imputation
5377
+ if not non_numeric_data.empty:
5378
+ from sklearn.impute import SimpleImputer
5379
+ if method == "constant":
5380
+ non_numeric_imputer = SimpleImputer(strategy="constant", fill_value=constant)
5381
+ else:
5382
+ non_numeric_imputer = SimpleImputer(strategy="most_frequent")
5383
+
5384
+ # Impute non-numeric columns column-wise (axis=0)
5385
+ imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
5386
+
5387
+ # Convert imputed non-numeric array back to DataFrame with original index and column names
5388
+ imputed_non_numeric_df = pd.DataFrame(
5389
+ imputed_non_numeric, index=non_numeric_data.index, columns=non_numeric_data.columns
5390
+ )
5391
+ else:
5392
+ imputed_non_numeric_df = pd.DataFrame(index=data.index)
5393
+
5394
+
5395
+ imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(columns=data.columns)
5065
5396
 
5066
5397
  if inplace:
5067
- data.update(df_filled)
5068
- return None # replace original
5398
+ # Modify the original DataFrame
5399
+ data[:] = imputed_data[col_names_org]
5400
+ return None
5069
5401
  else:
5070
- return df_filled
5402
+ # Return the modified DataFrame
5403
+ return imputed_data[col_names_org]
5071
5404
  # # example
5072
5405
  # data = {
5073
5406
  # "A": [1, 2, np.nan, 4, 5],
@@ -5097,7 +5430,94 @@ def df_fillna(
5097
5430
  # display(df)
5098
5431
  # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
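One behavioural change that the commented examples above do not cover: df_fillna now splits numeric from non-numeric columns, imputes the former with the chosen method and the latter with a most-frequent (or constant) strategy, then restores the original column order. A hedged sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "A": [1.0, 2.0, np.nan, 4.0],
    "group": ["x", None, "y", "x"],   # non-numeric column with a missing entry
})
filled = df_fillna(df, method="knn", n_neighbors=2, inplace=False)
print(filled)   # "A" imputed via KNN, "group" via most-frequent, columns in original order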
5099
5432
 
5100
-
5433
+ def df_encoder(
5434
+ data: pd.DataFrame,
5435
+ method: str = "dummy",#'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
5436
+ columns=None,
5437
+ target_column=None, # Required for 'target' encoding method
5438
+ **kwargs
5439
+ ) -> pd.DataFrame:
5440
+ """
5441
+ Methods explained:
5442
+ - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
5443
+
5444
+ - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
5445
+
5446
+ - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
5447
+
5448
+ - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
5449
+
5450
+ - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
5451
+
5452
+ - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
5453
+ """
5454
+
5455
+ # Select categorical columns
5456
+ categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
5457
+ methods = ["dummy","onehot", "ordinal", "label", "target", "binary"]
5458
+ method = strcmp(method, methods)[0]
5459
+
5460
+ if columns is None:
5461
+ columns = categorical_cols
5462
+
5463
+ # pd.get_dummies()
5464
+ if method=='dummy':
5465
+ dtype=kwargs.pop("dtype",int)
5466
+ drop_first=kwargs.pop("drop_first",True)
5467
+ try:
5468
+ encoded_df = pd.get_dummies(data[columns], drop_first=drop_first, dtype=dtype, **kwargs)
5469
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5470
+ except Exception as e:
5471
+ # print(f"Warning, 没有进行转换, 因为: {e}")
5472
+ return data
5473
+ # One-hot encoding
5474
+ elif method == "onehot":
5475
+ from sklearn.preprocessing import OneHotEncoder
5476
+
5477
+ encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
5478
+ encoded_data = encoder.fit_transform(data[columns])
5479
+ encoded_df = pd.DataFrame(
5480
+ encoded_data,
5481
+ columns=encoder.get_feature_names_out(columns),
5482
+ index=data.index,
5483
+ )
5484
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5485
+
5486
+ # Ordinal encoding
5487
+ elif method == "ordinal":
5488
+ from sklearn.preprocessing import OrdinalEncoder
5489
+
5490
+ encoder = OrdinalEncoder(**kwargs)
5491
+ encoded_data = encoder.fit_transform(data[columns])
5492
+ encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
5493
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5494
+
5495
+ # Label encoding
5496
+ elif method == "label":
5497
+ from sklearn.preprocessing import LabelEncoder
5498
+
5499
+ encoder = LabelEncoder()
5500
+ encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
5501
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5502
+
5503
+ # Target encoding (Mean of the target for each category)
5504
+ elif method == "target":
5505
+ if target_column is None:
5506
+ raise ValueError("target_column must be provided for target encoding.")
5507
+ from category_encoders import TargetEncoder
5508
+
5509
+ encoder = TargetEncoder(cols=columns, **kwargs)
5510
+ encoded_data = encoder.fit_transform(data[columns], data[target_column])
5511
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5512
+
5513
+ # Binary encoding (for high-cardinality categorical variables)
5514
+ elif method == "binary":
5515
+ from category_encoders import BinaryEncoder
5516
+
5517
+ encoder = BinaryEncoder(cols=columns, **kwargs)
5518
+ encoded_data = encoder.fit_transform(data[columns])
5519
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
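A hedged usage sketch of df_encoder covering the two lightest paths; the "target" and "binary" branches additionally require the optional category_encoders package imported above.

import pandas as pd

df = pd.DataFrame({"city": ["Berlin", "Paris", "Berlin"], "price": [10, 12, 11]})

dummies = df_encoder(df, method="dummy")                      # get_dummies, drop_first=True, int dtype
ordinal = df_encoder(df, method="ordinal", columns=["city"])  # integer codes instead of indicator columns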
5520
+
5101
5521
  def df_scaler(
5102
5522
  data: pd.DataFrame, # should be numeric dtype
5103
5523
  method="standard",
@@ -5143,9 +5563,8 @@ def df_scaler(
5143
5563
  if axis == 0:
5144
5564
  # Column-wise scaling (default)
5145
5565
  if columns is None:
5146
- columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
5566
+ columns = data.select_dtypes(include=np.number).columns.tolist()
5147
5567
  non_numeric_columns = data.columns.difference(columns)
5148
- print(f"Scaling columns")
5149
5568
 
5150
5569
  scaled_data = scaler.fit_transform(data[columns])
5151
5570
 
@@ -5167,7 +5586,7 @@ def df_scaler(
5167
5586
  # Row-wise scaling
5168
5587
  if columns is None:
5169
5588
  columns = data.index.tolist()
5170
- numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
5589
+ numeric_rows = data.loc[columns].select_dtypes(include=np.number)
5171
5590
  if numeric_rows.empty:
5172
5591
  raise ValueError("No numeric rows to scale.")
5173
5592
 
@@ -5184,7 +5603,31 @@ def df_scaler(
5184
5603
  scaled_df = data.copy()
5185
5604
  scaled_df.loc[numeric_rows.index] = scaled_data
5186
5605
  return scaled_df
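A brief sketch of df_scaler as rewritten here, assuming the default column-wise call returns the scaled frame as the row-wise branch above does: numeric columns are picked via select_dtypes(include=np.number) when columns is None, and non-numeric columns appear to be carried through untouched.

import pandas as pd

df = pd.DataFrame({"height": [1.60, 1.75, 1.90], "label": ["a", "b", "c"]})
scaled = df_scaler(df, method="standard")   # z-scores "height", leaves "label" as-is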
5606
+ def df_special_characters_cleaner(
5607
+ data: pd.DataFrame, where=["column", "content", "index"]
5608
+ ) -> pd.DataFrame:
5609
+ """
5610
+ to clean special characters:
5611
+ usage:
5612
+ df_special_characters_cleaner(data=df, where='column')
5613
+ """
5614
+ if not isinstance(where, list):
5615
+ where = [where]
5616
+ where_to_clean = ["column", "content", "index"]
5617
+ where_ = [strcmp(i, where_to_clean)[0] for i in where]
5618
+
5619
+ # 1. Clean column names by replacing special characters with underscores
5620
+ if "column" in where_:
5621
+ data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
5622
+
5623
+ # 2. Clean only object-type columns (text columns)
5624
+ if "content" in where_:
5625
+ for col in data.select_dtypes(include=["object"]).columns:
5626
+ data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
5627
+ if data.index.dtype == "object" and index in where_:
5628
+ data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
5187
5629
 
5630
+ return data
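A quick, hedged illustration of the cleaner added above: punctuation in column names becomes "_", punctuation inside object-typed cells is stripped, and the index is only touched when "index" is requested.

import pandas as pd

df = pd.DataFrame({"price ($)": [9.99], "note!": ["50% off!!"]})
df = df_special_characters_cleaner(df, where=["column", "content"])
print(df.columns.tolist())   # roughly ['price ___', 'note_']
print(df.iloc[0, 1])         # roughly "50 off" -- '%' and '!' removed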
5188
5631
  def df_cluster(
5189
5632
  data: pd.DataFrame,
5190
5633
  columns: Optional[list] = None,
@@ -5193,8 +5636,8 @@ def df_cluster(
5193
5636
  scale: bool = True,
5194
5637
  plot: Union[str, list] = "all",
5195
5638
  inplace: bool = True,
5196
- ax: Optional[plt.Axes] = None,
5197
- ) -> tuple[pd.DataFrame, int, Optional[plt.Axes]]:
5639
+ ax = None,
5640
+ ):
5198
5641
  from sklearn.preprocessing import StandardScaler
5199
5642
  from sklearn.cluster import KMeans
5200
5643
  from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5202,7 +5645,6 @@ def df_cluster(
5202
5645
  import numpy as np
5203
5646
  import pandas as pd
5204
5647
  import matplotlib.pyplot as plt
5205
- import seaborn as sns
5206
5648
 
5207
5649
  """
5208
5650
  Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5510,80 +5952,61 @@ def df_reducer(
5510
5952
  umap_neighbors: int = 15, # UMAP-specific
5511
5953
  umap_min_dist: float = 0.1, # UMAP-specific
5512
5954
  tsne_perplexity: int = 30, # t-SNE-specific
5955
+ hue:str = None,# lda-specific
5513
5956
  scale: bool = True,
5514
5957
  fill_missing: bool = True,
5515
5958
  debug: bool = False,
5516
5959
  inplace: bool = True, # replace the original data
5517
5960
  plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
5518
- ) -> pd.DataFrame:
5519
- """
5520
- Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
5521
-
5522
- Parameters:
5523
- -----------
5524
- data : pd.DataFrame
5525
- The input DataFrame (samples x features).
5526
-
5527
- columns : List[str], optional
5528
- List of column names to reduce. If None, all columns are used.
5529
-
5530
- method : str, optional, default="umap"
5531
- Dimensionality reduction method, either "pca" or "umap".
5532
-
5533
- n_components : int, optional, default=50
5534
- Number of components for PCA or UMAP.
5535
-
5536
- umap_neighbors : int, optional, default=15
5537
- Number of neighbors considered for UMAP embedding.
5538
-
5539
- umap_min_dist : float, optional, default=0.1
5540
- Minimum distance between points in UMAP embedding.
5541
-
5542
- scale : bool, optional, default=True
5543
- Whether to scale the data using StandardScaler.
5544
-
5545
- fill_missing : bool, optional, default=True
5546
- Whether to fill missing values using the mean before applying PCA/UMAP.
5961
+ random_state=1,
5962
+ ax = None,
5963
+ figsize=None,
5964
+ **kwargs
5965
+ ) -> pd.DataFrame:
5966
+ dict_methods = {
5967
+ #!Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
5968
+ "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
5969
+ "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
5970
+ "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
5971
+ "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
5972
+
5973
+ #! Non-linear Dimensionality Reduction (Manifold Learning)
5974
+ "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
5975
+ "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global. relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
5976
+ "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
5977
+ "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
5978
+ "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
5979
+ "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing).is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
5980
+
5981
+ #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
5982
+ "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
5983
+ }
5547
5984
 
5548
- Returns:
5549
- --------
5550
- reduced_df : pd.DataFrame
5551
- DataFrame with the reduced dimensions.
5552
- """
5553
-
5554
- """
5555
- PCA: explained_variance:
5556
- indicates the proportion of the dataset's total variance that each principal
5557
- component (PC) explains. It gives you a sense of how much information
5558
- (or variance) is captured by each PC
5559
- Interpretation:
5560
- - Higher values indicate that the corresponding PC captures more variance.
5561
- - The sum of the explained variances for all PCs equals 1 (or 100%).
5562
- - If the first few components explain a high percentage (e.g., 90%),
5563
- it means you can reduce the dimensionality of the data significantly without losing much information.
5564
- Use case:
5565
- You may plot a scree plot, which shows the explained variance for each PC, to help decide
5566
- how many components to keep for analysis.
5567
-
5568
- PCA: Singular values:
5569
- represent the magnitude of variance along each principal component. Mathematically,
5570
- they are the square roots of the eigenvalues of the covariance matrix.
5571
- Interpretation:
5572
- Larger singular values indicate that the associated PC captures more variance.
5573
- Singular values are related to the scale of the data. If the data are scaled
5574
- before PCA (e.g., standardized), then the singular values will provide a measure
5575
- of the spread of data along each PC.
5576
- Use case:
5577
- Singular values help quantify the contribution of each principal component in a
5578
- similar way to the explained variance. They are useful in understanding the overall
5579
- structure of the data.
5580
- """
5581
5985
  from sklearn.preprocessing import StandardScaler
5582
5986
  from sklearn.impute import SimpleImputer
5583
-
5584
- # Select columns if specified, else use all columns
5585
- X = data[columns].values if columns else data.values
5586
- print(X.shape,type(X))
5987
+ if plot_:
5988
+ import matplotlib.pyplot as plt
5989
+ import seaborn as sns
5990
+ # Check valid method input
5991
+ methods=["pca", "umap","tsne","factor","isolation_forest","lda","kpca","ica","mds","lle","svd"]
5992
+ method=strcmp(method, methods)[0]
5993
+ print(f"\nprocessing with using {dict_methods[method]}:")
5994
+ xlabel,ylabel=None,None
5995
+ if columns is None:
5996
+ columns = data.select_dtypes(include='number').columns.tolist()
5997
+ if hue is None:
5998
+ hue = data.select_dtypes(exclude='number').columns.tolist()
5999
+ if isinstance(hue, list):
6000
+ print("Warning: hue is a list, only select the 1st one")
6001
+ hue=hue[0]
6002
+ if not hue:
6003
+ # Select columns if specified, else use all columns
6004
+ X = data[columns].values if columns else data.values
6005
+ else:
6006
+ # Select columns to reduce and hue for LDA
6007
+ X = data[columns].values if columns else data.drop(columns=[hue]).values
6008
+ y = data[hue].values
6009
+ print(X.shape)
5587
6010
  # Handle missing values
5588
6011
  if fill_missing:
5589
6012
  imputer = SimpleImputer(strategy="mean")
@@ -5594,9 +6017,6 @@ def df_reducer(
5594
6017
  scaler = StandardScaler()
5595
6018
  X = scaler.fit_transform(X)
5596
6019
 
5597
- # Check valid method input
5598
- methods=["pca", "umap","tsne","factor","isolation_forest"]
5599
- method=strcmp(method, methods)[0]
5600
6020
  # Apply PCA if selected
5601
6021
  if method == "pca":
5602
6022
  from sklearn.decomposition import PCA
@@ -5640,7 +6060,27 @@ def df_reducer(
5640
6060
  pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
5641
6061
  for i in range(n_components):
5642
6062
  pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
6063
+ if hue:
6064
+ pca_df[hue]=y
6065
+ elif method =='lda':
6066
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
6067
+
6068
+ if "hue" not in locals() or hue is None:
6069
+ raise ValueError("LDA requires a 'hue' col parameter to specify class labels.")
5643
6070
 
6071
+ lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
6072
+ X_reduced = lda_reducer.fit_transform(X, y)
6073
+
6074
+ # Prepare reduced DataFrame with additional LDA info
6075
+ lda_df = pd.DataFrame(
6076
+ X_reduced, index=data.index,
6077
+ columns=[f"LDA_{i+1}" for i in range(n_components)]
6078
+ )
6079
+ if debug:
6080
+ print(f"LDA completed: Reduced to {n_components} components.")
6081
+ print("Class separability achieved by LDA.")
6082
+ if hue:
6083
+ lda_df[hue]=y
5644
6084
  # Apply UMAP if selected
5645
6085
  elif method == "umap":
5646
6086
  import umap
@@ -5667,32 +6107,36 @@ def df_reducer(
5667
6107
  )
5668
6108
  umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
5669
6109
  umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
6110
+ if hue:
6111
+ umap_df[hue]=y
5670
6112
  elif method == "tsne":
5671
6113
  from sklearn.manifold import TSNE
5672
- tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
5673
- X_reduced = tsne.fit_transform(X)
5674
-
5675
- # Prepare reduced DataFrame with additional t-SNE info
6114
+ tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=random_state)
6115
+ X_reduced = tsne.fit_transform(X)
5676
6116
  tsne_df = pd.DataFrame(
5677
- X_reduced, index=data.index,
6117
+ X_reduced,
6118
+ index=data.index,
5678
6119
  columns=[f"tSNE_{i+1}" for i in range(n_components)]
5679
6120
  )
5680
6121
  tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
5681
-
6122
+ if hue:
6123
+ tsne_df[hue]=y
5682
6124
  # Apply Factor Analysis if selected
5683
6125
  elif method == "factor":
5684
6126
  from sklearn.decomposition import FactorAnalysis
5685
- factor = FactorAnalysis(n_components=n_components, random_state=1)
6127
+ factor = FactorAnalysis(n_components=n_components, random_state=random_state)
5686
6128
  X_reduced = factor.fit_transform(X)
5687
6129
  # Factor Analysis does not directly provide explained variance, but we can approximate it
5688
6130
  fa_variance = factor.noise_variance_
5689
6131
  # Prepare reduced DataFrame with additional Factor Analysis info
5690
6132
  factor_df = pd.DataFrame(
5691
- X_reduced, index=data.index,
6133
+ X_reduced,
6134
+ index=data.index,
5692
6135
  columns=[f"Factor_{i+1}" for i in range(n_components)]
5693
6136
  )
5694
6137
  factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
5695
-
6138
+ if hue:
6139
+ factor_df[hue]=y
5696
6140
  # Apply Isolation Forest for outlier detection if selected
5697
6141
  elif method == "isolation_forest":
5698
6142
  from sklearn.decomposition import PCA
@@ -5723,48 +6167,100 @@ def df_reducer(
5723
6167
  iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
5724
6168
  for i in range(n_components):
5725
6169
  iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
6170
+ if hue:
6171
+ iso_forest_df[hue]=y
6172
+ #* Apply Kernel PCA if selected
6173
+ elif method == "kpca":
6174
+ from sklearn.decomposition import KernelPCA
6175
+ kpca = KernelPCA(n_components=n_components, kernel="rbf", random_state=random_state)
6176
+ X_reduced = kpca.fit_transform(X)
6177
+
6178
+ # Prepare reduced DataFrame with KPCA info
6179
+ kpca_df = pd.DataFrame(
6180
+ X_reduced,
6181
+ index=data.index,
6182
+ columns=[f"KPCA_{i+1}" for i in range(n_components)]
6183
+ )
6184
+ if debug:
6185
+ print("Kernel PCA completed with RBF kernel.")
6186
+ if hue:
6187
+ kpca_df[hue]=y
6188
+ #* Apply ICA if selected
6189
+ elif method == "ica":
6190
+ from sklearn.decomposition import FastICA
6191
+ ica = FastICA(n_components=n_components, random_state=random_state)
6192
+ X_reduced = ica.fit_transform(X)
6193
+
6194
+ # Prepare reduced DataFrame with ICA info
6195
+ ica_df = pd.DataFrame(
6196
+ X_reduced, index=data.index,
6197
+ columns=[f"ICA_{i+1}" for i in range(n_components)]
6198
+ )
6199
+ if debug:
6200
+ print("Independent Component Analysis (ICA) completed.")
6201
+ if hue:
6202
+ ica_df[hue]=y
6203
+ #* Apply MDS if selected
6204
+ elif method == "mds":
6205
+ from sklearn.manifold import MDS
6206
+ mds = MDS(n_components=n_components, random_state=random_state)
6207
+ X_reduced = mds.fit_transform(X)
6208
+
6209
+ # Prepare reduced DataFrame with MDS info
6210
+ mds_df = pd.DataFrame(
6211
+ X_reduced, index=data.index,
6212
+ columns=[f"MDS_{i+1}" for i in range(n_components)]
6213
+ )
6214
+ if debug:
6215
+ print("Multidimensional Scaling (MDS) completed.")
6216
+ if hue:
6217
+ mds_df[hue]=y
6218
+ #* Apply Locally Linear Embedding (LLE) if selected
6219
+ elif method == "lle":
6220
+ from sklearn.manifold import LocallyLinearEmbedding
6221
+ lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=umap_neighbors, random_state=random_state)
6222
+ X_reduced = lle.fit_transform(X)
6223
+
6224
+ # Prepare reduced DataFrame with LLE info
6225
+ lle_df = pd.DataFrame(
6226
+ X_reduced, index=data.index,
6227
+ columns=[f"LLE_{i+1}" for i in range(n_components)]
6228
+ )
6229
+ if debug:
6230
+ print("Locally Linear Embedding (LLE) completed.")
6231
+ if hue:
6232
+ lle_df[hue]=y
6233
+ #* Apply Singular Value Decomposition (SVD) if selected
6234
+ elif method == "svd":
6235
+ # Using NumPy's SVD for dimensionality reduction
6236
+ U, s, Vt = np.linalg.svd(X, full_matrices=False)
6237
+ X_reduced = U[:, :n_components] * s[:n_components]
6238
+
6239
+ # Prepare reduced DataFrame with SVD info
6240
+ svd_df = pd.DataFrame(
6241
+ X_reduced, index=data.index,
6242
+ columns=[f"SVD_{i+1}" for i in range(n_components)]
6243
+ )
6244
+ if hue:
6245
+ svd_df[hue]=y
6246
+ if debug:
6247
+ print("Singular Value Decomposition (SVD) completed.")
5726
6248
 
5727
6249
  # Return reduced data and info as a new DataFrame with the same index
5728
6250
  if method == "pca":
5729
6251
  reduced_df = pca_df
5730
6252
  colname_met = "PC_"
5731
- if plot_:
5732
- sns.scatterplot(
5733
- data=pca_df,
5734
- x="PC_1",
5735
- y="PC_2",
5736
- # hue="condition",
5737
- )
6253
+ xlabel= f"PC_1 ({pca_df["Explained Variance PC_1"].tolist()[0]})"
6254
+ ylabel= f"PC_2 ({pca_df["Explained Variance PC_2"].tolist()[0]})"
5738
6255
  elif method == "umap":
5739
6256
  reduced_df = umap_df
5740
- colname_met = "UMAP_"
5741
- if plot_:
5742
- sns.scatterplot(
5743
- data=umap_df,
5744
- x="UMAP_1",
5745
- y="UMAP_2",
5746
- # hue="condition",
5747
- )
6257
+ colname_met = "UMAP_"
5748
6258
  elif method == "tsne":
5749
6259
  reduced_df = tsne_df
5750
- colname_met = "t-SNE_"
5751
- if plot_:
5752
- sns.scatterplot(
5753
- data=tsne_df,
5754
- x="tSNE_1",
5755
- y="tSNE_2",
5756
- # hue="batch",
5757
- )
6260
+ colname_met = "tSNE_"
5758
6261
  elif method == "factor":
5759
6262
  reduced_df = factor_df
5760
- colname_met = "Factor_"
5761
- if plot_:
5762
- sns.scatterplot(
5763
- data=factor_df,
5764
- x="Factor_1",
5765
- y="Factor_2",
5766
- # hue="batch",
5767
- )
6263
+ colname_met = "Factor_"
5768
6264
  elif method == "isolation_forest":
5769
6265
  reduced_df = iso_forest_df # Already a DataFrame for outliers
5770
6266
  colname_met = "PC_"
@@ -5783,33 +6279,71 @@ def df_reducer(
5783
6279
  c="r",
5784
6280
  label="outlier", marker="+", s=30,
5785
6281
  )
5786
-
6282
+ elif method=='lda':
6283
+ reduced_df=lda_df
6284
+ colname_met="LDA_"
6285
+ elif method=="kpca":
6286
+ reduced_df=kpca_df
6287
+ colname_met="KPCA_"
6288
+ elif method=="ica":
6289
+ reduced_df=ica_df
6290
+ colname_met="ICA_"
6291
+ elif method=="mds":
6292
+ reduced_df=mds_df
6293
+ colname_met="MDS_"
6294
+ elif method=="lle":
6295
+ reduced_df=lle_df
6296
+ colname_met="LLE_"
6297
+ elif method=="svd":
6298
+ reduced_df=svd_df
6299
+ colname_met="SVD_"
6300
+ # Quick plots
6301
+ if plot_ and (not method in ["isolation_forest"]):
6302
+ from .plot import plotxy
6303
+ if ax is None:
6304
+ if figsize is None:
6305
+ _, ax = plt.subplots(figsize=cm2inch(8,8))
6306
+ else:
6307
+ _, ax = plt.subplots(figsize=figsize)
6308
+ else:
6309
+ ax.cla()  # clear the provided axes in place (Axes.cla() returns None)
6310
+ ax=plotxy(data=reduced_df,
6311
+ x=colname_met+"1",
6312
+ y=colname_met+"2",
6313
+ hue=hue,
6314
+ s=1,
6315
+ edgecolor='none',
6316
+ kind='scatter',
6317
+ figsets=dict(legend=dict(loc='best',markerscale=4),
6318
+ xlabel=xlabel if xlabel else None,
6319
+ ylabel=ylabel if ylabel else None),
6320
+ ax=ax,
6321
+ verbose=False,
6322
+ **kwargs
6323
+ )
5787
6324
 
5788
6325
  if inplace:
5789
6326
  # If inplace=True, add components back into the original data
5790
6327
  for col_idx in range(n_components):
5791
- data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
6328
+ data.loc[:,f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5792
6329
  # Add extra info for PCA/UMAP
5793
6330
  if method == "pca":
5794
6331
  for i in range(n_components):
5795
- data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
6332
+ data.loc[:,f"Explained Variance PC_{i+1}"] = reduced_df.loc[:,f"Explained Variance PC_{i+1}"]
5796
6333
  for i in range(n_components):
5797
- data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
6334
+ data.loc[:,f"Singular Values PC_{i+1}"] = reduced_df.loc[:,f"Singular Values PC_{i+1}"]
5798
6335
  elif method == "umap":
5799
6336
  for i in range(n_components):
5800
- data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
5801
- data["Embedding"] = reduced_df["Embedding"]
5802
- data["Trustworthiness"] = reduced_df["Trustworthiness"]
6337
+ data.loc[:,f"UMAP_{i+1}"]=reduced_df.loc[:,f"UMAP_{i+1}"]
6338
+ data.loc[:,"Embedding"] = reduced_df.loc[:,"Embedding"]
6339
+ data.loc[:,"Trustworthiness"] = reduced_df.loc[:,"Trustworthiness"]
6340
+
5803
6341
  return None # No return when inplace=True
5804
-
5805
6342
 
5806
6343
  return reduced_df
5807
-
5808
-
5809
6344
  # example:
5810
6345
  # df_reducer(data=data_log, columns=markers, n_components=2)
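The rewritten df_reducer also exposes the supervised and non-linear methods described in dict_methods; below is a hedged sketch of the new LDA path, which needs a categorical column passed as hue to provide class labels (the column names here are illustrative, not from the diff).

import pandas as pd

demo = pd.DataFrame({"m1": [1.0, 2.1, 0.9, 3.2],
                     "m2": [0.5, 1.5, 0.4, 1.8],
                     "condition": ["a", "a", "b", "b"]})
reduced = df_reducer(
    data=demo,
    columns=["m1", "m2"],
    method="lda",        # method names are fuzzy-matched via strcmp
    hue="condition",     # class labels required by the LDA branch
    n_components=1,      # LDA allows at most n_classes - 1 components
    inplace=False,       # return the reduced frame instead of writing back into demo
)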
5811
6346
 
5812
-
5813
6347
  def plot_cluster(
5814
6348
  data: pd.DataFrame,
5815
6349
  labels: np.ndarray,
@@ -5833,7 +6367,7 @@ def plot_cluster(
5833
6367
  """
5834
6368
  import seaborn as sns
5835
6369
  from sklearn.metrics import silhouette_samples
5836
-
6370
+ import matplotlib.pyplot as plt
5837
6371
  if metrics is None:
5838
6372
  metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
5839
6373