py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -1,62 +1,38 @@
1
1
  import numpy as np
2
- import pandas as pd
3
-
4
- import json
5
- import matplotlib
6
- import matplotlib.pyplot as plt
7
- import matplotlib.ticker as tck
8
- from cycler import cycler
9
- from mpl_toolkits.mplot3d import Axes3D
10
- import seaborn as sns
11
-
12
- from sklearn.kernel_approximation import KERNEL_PARAMS
13
- from sympy import is_increasing
14
- import sys, os, shutil, re, yaml, json, subprocess
15
- import importlib.util
16
- import time
17
- from dateutil import parser
18
- from datetime import datetime
19
- import schedule
20
-
21
- from PIL import Image, ImageEnhance, ImageOps, ImageFilter
22
- from rembg import remove, new_session
23
-
24
- import docx
25
- from fpdf import FPDF
26
- from lxml import etree
27
- from docx import Document
28
- from PyPDF2 import PdfReader
29
- from pptx import Presentation
30
- from pptx.util import Inches
31
- from pdf2image import convert_from_path, pdfinfo_from_path
32
- from nltk.tokenize import sent_tokenize, word_tokenize
33
- import nltk # nltk.download("punkt")
34
- from docx2pdf import convert
35
- import img2pdf as image2pdf
36
- import nbformat
37
- from nbconvert import MarkdownExporter
38
-
39
- from itertools import pairwise
40
- from box import Box, BoxList
41
- from numerizer import numerize
42
- from tqdm import tqdm
43
- import mimetypes
44
- from pprint import pp
45
- from collections import Counter
46
- from fuzzywuzzy import fuzz, process
47
- from langdetect import detect
48
- from duckduckgo_search import DDGS
2
+ import pandas as pd
3
+ import sys, os
4
+ from IPython.display import display
49
5
  from typing import List, Optional, Union
50
- from bs4 import BeautifulSoup
51
-
52
- from . import netfinder
53
-
54
6
  try:
55
7
  get_ipython().run_line_magic("load_ext", "autoreload")
56
8
  get_ipython().run_line_magic("autoreload", "2")
57
9
  except NameError:
58
10
  pass
59
11
 
12
+ import warnings
13
+ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
14
+ warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
15
+
16
+ def run_once_within(duration=60): # default 60s
17
+ import time
18
+ """
19
+ usage:
20
+ if run_once_within():
21
+ print("This code runs once per minute.")
22
+ else:
23
+ print("The code has already been run in the last minute.")
24
+ """
25
+ if not hasattr(run_once_within, "time_last"):
26
+ run_once_within.time_last = None
27
+ time_curr = time.time()
28
+
29
+ if (run_once_within.time_last is None) or (time_curr - run_once_within.time_last >= duration):
30
+ run_once_within.time_last = time_curr # Update the last execution time
31
+ return True
32
+ else:
33
+ return False
34
+
35
+
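run_once_within is the throttle gate used further down (for example around the use_pd hints in load_csv, load_excel and fsave): it lets a message fire at most once per interval instead of on every call. A minimal usage sketch, assuming the function as defined above:

    # prints the hint at most once per 60 s, no matter how often the caller runs
    if run_once_within(duration=60):
        print("pandas.read_csv usage hint ...")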
60
36
  def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
61
37
  """
62
38
  Add the Chinese (default) font to the font manager
@@ -155,6 +131,8 @@ def run_every(when: str = None, job=None, wait: int = 60):
155
131
  :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
156
132
  :param job: The function to be scheduled.
157
133
  """
134
+ import schedule
135
+ import time
158
136
  if job is None:
159
137
  print("No job provided!")
160
138
  return
@@ -200,6 +178,8 @@ def run_at(when: str, job=None, wait: int = 60):
200
178
  :param job: The function to be scheduled.
201
179
  :param wait: The sleep interval between checks in seconds.
202
180
  """
181
+ from datetime import datetime
182
+ import time
203
183
  if job is None:
204
184
  print("No job provided!")
205
185
  return
@@ -279,6 +259,7 @@ def get_timezone(timezone: str | list = None):
279
259
 
280
260
  def is_package_installed(package_name):
281
261
  """Check if a package is installed."""
262
+ import importlib.util
282
263
  package_spec = importlib.util.find_spec(package_name)
283
264
  return package_spec is not None
284
265
 
@@ -291,6 +272,7 @@ def upgrade(module="py2ls",uninstall=False):
291
272
  module (str): The name of the module to install/upgrade.
292
273
  uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
293
274
  """
275
+ import subprocess
294
276
  if not is_package_installed(module):
295
277
  try:
296
278
  subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +309,7 @@ def get_version(pkg):
327
309
 
328
310
 
329
311
  def rm_folder(folder_path, verbose=True):
312
+ import shutil
330
313
  try:
331
314
  shutil.rmtree(folder_path)
332
315
  if verbose:
@@ -345,6 +328,7 @@ def fremove(path, verbose=True):
345
328
  """
346
329
  try:
347
330
  if os.path.isdir(path):
331
+ import shutil
348
332
  shutil.rmtree(path)
349
333
  if verbose:
350
334
  print(f"Successfully deleted folder {path}")
@@ -360,23 +344,30 @@ def fremove(path, verbose=True):
360
344
  print(f"Failed to delete {path}. Reason: {e}")
361
345
 
362
346
 
363
- def get_cwd(verbose: bool = True):
364
- """
365
- get_cwd: to get the current working directory
366
- Args:
367
- verbose (bool, optional): to show which function is use. Defaults to True.
368
- """
369
- try:
370
- script_dir = os.path.dirname(os.path.abspath(__file__))
371
- if verbose:
372
- print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
373
- except NameError:
374
- # This works in an interactive environment (like a Jupyter notebook)
375
- script_dir = os.getcwd()
376
- if verbose:
377
- print("os.getcwd():", script_dir)
378
- return script_dir
379
-
347
+ # def get_cwd(verbose: bool = True):
348
+ # """
349
+ # get_cwd: to get the current working directory
350
+ # Args:
351
+ # verbose (bool, optional): to show which function is use. Defaults to True.
352
+ # """
353
+ # try:
354
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
355
+ # if verbose:
356
+ # print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
357
+ # except NameError:
358
+ # # This works in an interactive environment (like a Jupyter notebook)
359
+ # script_dir = os.getcwd()
360
+ # if verbose:
361
+ # print("os.getcwd():", script_dir)
362
+ # return script_dir
363
+
364
+
365
+ def get_cwd():
366
+ from pathlib import Path
367
+ # Get the current script's directory as a Path object
368
+ current_directory = Path(__file__).resolve().parent
369
+
370
+ return current_directory
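The rewritten get_cwd resolves the package directory from __file__, which is undefined in notebooks and the interactive interpreter (the case the commented-out version caught with NameError). A hedged sketch of a fallback variant, under a hypothetical name, in case that behaviour is still wanted:

    from pathlib import Path

    def get_cwd_or_pwd():
        # hypothetical helper: fall back to the process cwd when __file__ is missing
        try:
            return Path(__file__).resolve().parent
        except NameError:
            return Path.cwd()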
380
371
 
381
372
  def search(
382
373
  query,
@@ -388,7 +379,7 @@ def search(
388
379
  dir_save=dir_save,
389
380
  **kwargs,
390
381
  ):
391
-
382
+ from duckduckgo_search import DDGS
392
383
  if "te" in kind.lower():
393
384
  results = DDGS().text(query, max_results=limit)
394
385
  res = pd.DataFrame(results)
@@ -421,7 +412,7 @@ def echo(*args, **kwargs):
421
412
  str: the answer from ai
422
413
  """
423
414
  global dir_save
424
-
415
+ from duckduckgo_search import DDGS
425
416
  query = None
426
417
  model = kwargs.get("model", "gpt")
427
418
  verbose = kwargs.get("verbose", True)
@@ -469,8 +460,11 @@ def echo(*args, **kwargs):
469
460
  model_valid = valid_mod_name(model)
470
461
  res = DDGS().chat(query, model=model_valid)
471
462
  if verbose:
463
+ from pprint import pp
472
464
  pp(res)
473
465
  if log:
466
+ from datetime import datetime
467
+ import time
474
468
  dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
475
469
  res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
476
470
  if bool(os.path.basename(dir_save)):
@@ -492,6 +486,7 @@ def ai(*args, **kwargs):
492
486
 
493
487
 
494
488
  def detect_lang(text, output="lang", verbose=True):
489
+ from langdetect import detect
495
490
  dir_curr_script = os.path.dirname(os.path.abspath(__file__))
496
491
  dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
497
492
  print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -550,6 +545,7 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
550
545
  for lst in flattened_lists[1:]:
551
546
  shared_elements.intersection_update(lst)
552
547
  else:
548
+ from collections import Counter
553
549
  all_elements = [item for sublist in flattened_lists for item in sublist]
554
550
  element_count = Counter(all_elements)
555
551
  # Get elements that appear in at least n_shared lists
@@ -571,9 +567,9 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
571
567
  not_shared(list1,list2)# output [1,3]
572
568
  """
573
569
  _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
574
- list1 = args[0]
570
+ list1 = flatten(args[0], verbose=verbose)
575
571
  _not_shared=[item for item in list1 if item not in _common]
576
- return flatten(_not_shared, verbose=verbose)
572
+ return _not_shared
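With this change args[0] is flattened before the comparison and the filtered list is returned as-is. An illustrative call, assuming shared() returns the common elements as a list (ordering may differ):

    shared([1, 2, 3], [3, 2])      # common elements, e.g. [2, 3]
    not_shared([1, 2, 3], [3, 2])  # elements only in the first list: [1]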
577
573
 
578
574
 
579
575
  def flatten(nested: Any, unique_list=True, verbose=False):
@@ -617,7 +613,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
617
613
  Returns:
618
614
  tuple: A tuple containing the best match and its index in the candidates list.
619
615
  """
620
-
616
+ from fuzzywuzzy import fuzz, process
621
617
  def to_lower(s, ignore_case=True):
622
618
  # Converts a string or list of strings to lowercase if ignore_case is True.
623
619
  if ignore_case:
@@ -743,6 +739,7 @@ def cn2pinyin(
743
739
  return pinyin_flat
744
740
 
745
741
  def counter(list_, verbose=True):
742
+ from collections import Counter
746
743
  c = Counter(list_)
747
744
  # Print the name counts
748
745
  for item, count in c.items():
@@ -771,7 +768,7 @@ def str2time(time_str, fmt="24"):
771
768
  %p represents AM or PM.
772
769
  - str: The converted time string.
773
770
  """
774
-
771
+ from datetime import datetime
775
772
  def time_len_corr(time_str):
776
773
  time_str_ = (
777
774
  ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -832,6 +829,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
832
829
  Returns:
833
830
  - str: The converted date string.
834
831
  """
832
+ from dateutil import parser
835
833
  try:
836
834
  date_obj = parser.parse(date_str)
837
835
  except ValueError as e:
@@ -848,6 +846,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
848
846
 
849
847
 
850
848
  def str2num(s, *args, **kwargs):
849
+ import re
851
850
  delimiter = kwargs.get("sep", None)
852
851
  round_digits = kwargs.get("round", None)
853
852
  if delimiter is not None:
@@ -863,6 +862,7 @@ def str2num(s, *args, **kwargs):
863
862
  try:
864
863
  num = float(s)
865
864
  except ValueError:
865
+ from numerizer import numerize
866
866
  try:
867
867
  numerized = numerize(s)
868
868
  num = int(numerized) if "." not in numerized else float(numerized)
@@ -1030,7 +1030,7 @@ def px2inch(*px, dpi=300) -> list:
1030
1030
  return [i / dpi for i in px]
1031
1031
 
1032
1032
 
1033
- def cm2inch(*cm) -> list:
1033
+ def inch2cm(*cm) -> list:
1034
1034
  """
1035
1035
  cm2inch: converts centimeter measurements to inches.
1036
1036
  Usage:
@@ -1051,24 +1051,31 @@ def cm2inch(*cm) -> list:
1051
1051
  def inch2px(*inch, dpi=300) -> list:
1052
1052
  """
1053
1053
  inch2px: converts inch measurements to pixels based on the given dpi.
1054
+
1054
1055
  Usage:
1055
1056
  inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
1057
+
1058
+ Parameters:
1059
+ inch : float, list, or tuple
1060
+ Single or multiple measurements in inches to convert to pixels.
1061
+ dpi : int, optional (default=300)
1062
+ Dots per inch (DPI), representing the pixel density.
1063
+
1056
1064
  Returns:
1057
- list: in pixels
1065
+ list: Converted measurements in pixels.
1058
1066
  """
1059
- # Case 1: When the user passes a single argument that is a list or tuple, such as inch2px([1, 2]) or inch2px((1, 2))
1067
+ # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
1060
1068
  if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
1061
- # If the input is a single list or tuple, we unpack its elements and convert each to pixels
1062
1069
  return [i * dpi for i in inch[0]]
1063
- # Case 2: When the user passes multiple arguments directly, such as inch2px(1, 2)
1070
+
1071
+ # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
1064
1072
  else:
1065
- # Here, we convert each individual argument directly to pixels
1066
1073
  return [i * dpi for i in inch]
1067
1074
 
1068
1075
 
1069
- def inch2cm(*inch) -> list:
1076
+
1077
+ def cm2inch(*inch) -> list:
1070
1078
  """
1071
- inch2cm: converts inch measurements to centimeters.
1072
1079
  Usage:
1073
1080
  inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
1074
1081
  Returns:
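The pixel/inch helpers above are plain dpi scalings, so the conversions can be checked by hand (values below assume the default dpi=300):

    inch2px(1, 2, dpi=300)      # -> [300, 600]
    inch2px([1, 2], dpi=300)    # -> [300, 600], list input is unpacked
    px2inch(300, 600, dpi=300)  # -> [1.0, 2.0]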
@@ -1183,6 +1190,7 @@ def paper_size(paper_type_str="a4"):
1183
1190
 
1184
1191
 
1185
1192
  def docx2pdf(dir_docx, dir_pdf=None):
1193
+ from docx2pdf import convert
1186
1194
  if dir_pdf:
1187
1195
  convert(dir_docx, dir_pdf)
1188
1196
  else:
@@ -1190,6 +1198,7 @@ def docx2pdf(dir_docx, dir_pdf=None):
1190
1198
 
1191
1199
 
1192
1200
  def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
1201
+ import img2pdf as image2pdf
1193
1202
  def mm_to_point(size):
1194
1203
  return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))
1195
1204
 
@@ -1241,6 +1250,9 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
1241
1250
 
1242
1251
 
1243
1252
  def pdf2ppt(dir_pdf, dir_ppt):
1253
+ from PyPDF2 import PdfReader
1254
+ from pptx.util import Inches
1255
+ from pptx import Presentation
1244
1256
  prs = Presentation()
1245
1257
 
1246
1258
  # Open the PDF file
@@ -1269,6 +1281,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
1269
1281
 
1270
1282
 
1271
1283
  def ssplit(text, by="space", verbose=False, strict=False, **kws):
1284
+ import re
1272
1285
  if isinstance(text, list):
1273
1286
  nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
1274
1287
  flat_list = [item for sublist in nested_list for item in sublist]
@@ -1316,6 +1329,8 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1316
1329
  return [text[i : i + length] for i in range(0, len(text), length)]
1317
1330
 
1318
1331
  def split_by_sent_num(text, n=10):
1332
+ from nltk.tokenize import sent_tokenize
1333
+ from itertools import pairwise
1319
1334
  # split text into sentences
1320
1335
  text_split_by_sent = sent_tokenize(text)
1321
1336
  cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1388,10 +1403,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1388
1403
  print(f"splited by camel_case")
1389
1404
  return split_by_camel_case(text)
1390
1405
  elif ("word" in by) and not strict:
1406
+ from nltk.tokenize import word_tokenize
1391
1407
  if verbose:
1392
1408
  print(f"splited by word")
1393
1409
  return word_tokenize(text)
1394
1410
  elif ("sen" in by and not "num" in by) and not strict:
1411
+ from nltk.tokenize import sent_tokenize
1395
1412
  if verbose:
1396
1413
  print(f"splited by sentence")
1397
1414
  return sent_tokenize(text)
@@ -1441,9 +1458,11 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1441
1458
 
1442
1459
 
1443
1460
  def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
1461
+ from pdf2image import convert_from_path, pdfinfo_from_path
1444
1462
  df_dir_img_single_page = pd.DataFrame()
1445
1463
  dir_single_page = []
1446
1464
  if verbose:
1465
+ from pprint import pp
1447
1466
  pp(pdfinfo_from_path(dir_pdf))
1448
1467
  if isinstance(page, tuple) and page:
1449
1468
  page = list(page)
@@ -1562,6 +1581,7 @@ def unzip(dir_path, output_dir=None):
1562
1581
  # If the output directory already exists, remove it and replace it
1563
1582
  if os.path.exists(output_dir):
1564
1583
  if os.path.isdir(output_dir): # check if it is a folder
1584
+ import shutil
1565
1585
  shutil.rmtree(output_dir) # remove folder
1566
1586
  else:
1567
1587
  os.remove(output_dir) # remove file
@@ -1579,6 +1599,7 @@ def unzip(dir_path, output_dir=None):
1579
1599
 
1580
1600
  output_file = os.path.splitext(dir_path)[0] # remove the .gz extension
1581
1601
  try:
1602
+ import shutil
1582
1603
  with gzip.open(dir_path, "rb") as gz_file:
1583
1604
  with open(output_file, "wb") as out_file:
1584
1605
  shutil.copyfileobj(gz_file, out_file)
@@ -1676,11 +1697,13 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1676
1697
 
1677
1698
  """
1678
1699
  if not isinstance(df, pd.DataFrame):
1700
+ if verbose:
1701
+ print('not pd.DataFrame')
1679
1702
  return False
1680
1703
  df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
1681
1704
  # Initialize a list to hold messages about abnormalities
1682
1705
  messages = []
1683
- is_abnormal = True
1706
+ is_abnormal = False
1684
1707
  # Check the shape of the DataFrame
1685
1708
  actual_shape = df.shape
1686
1709
  messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1705,25 +1728,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1705
1728
  is_abnormal = True
1706
1729
  if verbose:
1707
1730
  print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
1708
-
1731
+ if verbose:
1732
+ print("1",is_abnormal)
1709
1733
  if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
1710
1734
  messages.append("Abnormal: Too many delimiters in column names.")
1711
1735
  is_abnormal = True
1712
1736
  if verbose:
1713
1737
  print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
1714
-
1738
+ if verbose:
1739
+ print("2",is_abnormal)
1715
1740
  if delimiter_counts[""] > 3:
1716
1741
  messages.append("Abnormal: There are empty column names.")
1717
1742
  is_abnormal = True
1718
1743
  if verbose:
1719
1744
  print(f'delimiter_counts[""] > 3')
1720
-
1745
+ if verbose:
1746
+ print("3",is_abnormal)
1721
1747
  if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
1722
1748
  messages.append("Abnormal: Some column names contain unexpected characters.")
1723
1749
  is_abnormal = True
1724
1750
  if verbose:
1725
1751
  print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
1726
-
1752
+ if verbose:
1753
+ print("4",is_abnormal)
1727
1754
  # # Check for missing values
1728
1755
  # missing_values = df.isnull().sum()
1729
1756
  # if missing_values.any():
@@ -1743,7 +1770,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1743
1770
  is_abnormal = True
1744
1771
  if verbose:
1745
1772
  print(f'df.columns[df.nunique() == 1].tolist()')
1746
-
1773
+ if verbose:
1774
+ print("5",is_abnormal)
1747
1775
  # Check for an unreasonable number of rows or columns
1748
1776
  if actual_shape[0] < 2 or actual_shape[1] < 2:
1749
1777
  messages.append(
@@ -1752,7 +1780,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1752
1780
  is_abnormal = True
1753
1781
  if verbose:
1754
1782
  print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
1755
-
1783
+ if verbose:
1784
+ print("6",is_abnormal)
1756
1785
  # Compile results
1757
1786
  if verbose:
1758
1787
  print("\n".join(messages))
@@ -1769,7 +1798,24 @@ def fload(fpath, kind=None, **kwargs):
1769
1798
  Returns:
1770
1799
  content: The content loaded from the file.
1771
1800
  """
1772
-
1801
+ def read_mplstyle(style_file):
1802
+ import matplotlib.pyplot as plt
1803
+ # Load the style file
1804
+ plt.style.use(style_file)
1805
+
1806
+ # Get the current style properties
1807
+ style_dict = plt.rcParams
1808
+
1809
+ # Convert to dictionary
1810
+ style_dict = dict(style_dict)
1811
+ # Print the style dictionary
1812
+ for i, j in style_dict.items():
1813
+ print(f"\n{i}::::{j}")
1814
+ return style_dict
1815
+ # #example usage:
1816
+ # style_file = "/ std-colors.mplstyle"
1817
+ # style_dict = read_mplstyle(style_file)
1818
+
1773
1819
  def load_txt_md(fpath):
1774
1820
  with open(fpath, "r") as file:
1775
1821
  content = file.read()
@@ -1785,6 +1831,7 @@ def fload(fpath, kind=None, **kwargs):
1785
1831
  def load_json(fpath, **kwargs):
1786
1832
  output=kwargs.pop("output","json")
1787
1833
  if output=='json':
1834
+ import json
1788
1835
  with open(fpath, "r") as file:
1789
1836
  content = json.load(file)
1790
1837
  return content
@@ -1792,12 +1839,14 @@ def fload(fpath, kind=None, **kwargs):
1792
1839
  return pd.read_json(fpath,**kwargs)
1793
1840
 
1794
1841
  def load_yaml(fpath):
1842
+ import yaml
1795
1843
  with open(fpath, "r") as file:
1796
1844
  content = yaml.safe_load(file)
1797
1845
  return content
1798
1846
 
1799
1847
 
1800
1848
  def load_xml(fpath, fsize_thr: int = 100):
1849
+ from lxml import etree
1801
1850
  def load_small_xml(fpath):
1802
1851
  tree = etree.parse(fpath)
1803
1852
  root = tree.getroot()
@@ -1856,6 +1905,15 @@ def fload(fpath, kind=None, **kwargs):
1856
1905
  if line.startswith(char):
1857
1906
  return char
1858
1907
  return None
1908
+
1909
+ def _get_chunks(df_fake):
1910
+ """
1911
+ helper func for 'load_csv'
1912
+ """
1913
+ chunks = []
1914
+ for chunk in df_fake:
1915
+ chunks.append(chunk)
1916
+ return pd.concat(chunks, ignore_index=True)
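When a chunksize is passed, pandas.read_csv returns an iterator of DataFrame chunks rather than a DataFrame, and _get_chunks simply materializes it. The same idea outside load_csv, with a hypothetical file name:

    import pandas as pd
    reader = pd.read_csv("big_table.csv", chunksize=100_000, low_memory=False)
    df = pd.concat(reader, ignore_index=True)   # what _get_chunks(reader) does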
1859
1917
 
1860
1918
  def load_csv(fpath, **kwargs):
1861
1919
  from pandas.errors import EmptyDataError
@@ -1869,16 +1927,19 @@ def fload(fpath, kind=None, **kwargs):
1869
1927
  on_bad_lines = kwargs.pop("on_bad_lines", "skip")
1870
1928
  comment = kwargs.pop("comment", None)
1871
1929
  fmt=kwargs.pop("fmt",False)
1930
+ chunksize=kwargs.pop("chunksize", None)
1931
+ engine='c' if chunksize else engine # when chunksize, recommend 'c'
1932
+ low_memory=kwargs.pop("low_memory",True)
1933
+ low_memory=False if chunksize else low_memory # when chunksize, recommend low_memory=False
1872
1934
  verbose=kwargs.pop("verbose",False)
1873
- if verbose:
1935
+ if run_once_within():
1874
1936
  use_pd("read_csv", verbose=verbose)
1875
- return
1876
1937
 
1877
1938
  if comment is None:
1878
1939
  comment = get_comment(
1879
1940
  fpath, comment=None, encoding="utf-8", lines_to_check=5
1880
1941
  )
1881
-
1942
+
1882
1943
  try:
1883
1944
  df = pd.read_csv(
1884
1945
  fpath,
@@ -1890,14 +1951,19 @@ def fload(fpath, kind=None, **kwargs):
1890
1951
  skipinitialspace=skipinitialspace,
1891
1952
  sep=sep,
1892
1953
  on_bad_lines=on_bad_lines,
1954
+ chunksize=chunksize,
1955
+ low_memory=low_memory,
1893
1956
  **kwargs,
1894
1957
  )
1895
- if is_df_abnormal(df, verbose=0):
1958
+ if chunksize:
1959
+ df=_get_chunks(df)
1960
+ print(df.shape)
1961
+ if is_df_abnormal(df, verbose=0): # raise error
1896
1962
  raise ValueError("the df is abnormal")
1897
1963
  except:
1898
1964
  try:
1899
1965
  try:
1900
- if engine == "pyarrow":
1966
+ if engine == "pyarrow" and not chunksize:
1901
1967
  df = pd.read_csv(
1902
1968
  fpath,
1903
1969
  engine=engine,
@@ -1906,6 +1972,7 @@ def fload(fpath, kind=None, **kwargs):
1906
1972
  sep=sep,
1907
1973
  on_bad_lines=on_bad_lines,
1908
1974
  comment=comment,
1975
+ low_memory=low_memory,
1909
1976
  **kwargs,
1910
1977
  )
1911
1978
  else:
@@ -1919,14 +1986,19 @@ def fload(fpath, kind=None, **kwargs):
1919
1986
  skipinitialspace=skipinitialspace,
1920
1987
  on_bad_lines=on_bad_lines,
1921
1988
  comment=comment,
1989
+ chunksize=chunksize,
1990
+ low_memory=low_memory,
1922
1991
  **kwargs,
1923
1992
  )
1993
+ if chunksize:
1994
+ df=_get_chunks(df)
1995
+ print(df.shape)
1924
1996
  if is_df_abnormal(df, verbose=0):
1925
1997
  raise ValueError("the df is abnormal")
1926
1998
  except (UnicodeDecodeError, ValueError):
1927
1999
  encoding = get_encoding(fpath)
1928
2000
  # print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
1929
- if engine == "pyarrow":
2001
+ if engine == "pyarrow" and not chunksize:
1930
2002
  df = pd.read_csv(
1931
2003
  fpath,
1932
2004
  engine=engine,
@@ -1935,6 +2007,7 @@ def fload(fpath, kind=None, **kwargs):
1935
2007
  sep=sep,
1936
2008
  on_bad_lines=on_bad_lines,
1937
2009
  comment=comment,
2010
+ low_memory=low_memory,
1938
2011
  **kwargs,
1939
2012
  )
1940
2013
  else:
@@ -1948,8 +2021,13 @@ def fload(fpath, kind=None, **kwargs):
1948
2021
  skipinitialspace=skipinitialspace,
1949
2022
  on_bad_lines=on_bad_lines,
1950
2023
  comment=comment,
2024
+ chunksize=chunksize,
2025
+ low_memory=low_memory,
1951
2026
  **kwargs,
1952
2027
  )
2028
+ if chunksize:
2029
+ df=_get_chunks(df)
2030
+ print(df.shape)
1953
2031
  if is_df_abnormal(df, verbose=0):
1954
2032
  raise ValueError("the df is abnormal")
1955
2033
  except Exception as e:
@@ -1966,8 +2044,13 @@ def fload(fpath, kind=None, **kwargs):
1966
2044
  sep=sep,
1967
2045
  on_bad_lines=on_bad_lines,
1968
2046
  comment=comment,
2047
+ chunksize=chunksize,
2048
+ low_memory=low_memory,
1969
2049
  **kwargs,
1970
2050
  )
2051
+ if chunksize:
2052
+ df=_get_chunks(df)
2053
+ print(df.shape)
1971
2054
  if not is_df_abnormal(df, verbose=0): # normal
1972
2055
  display(df.head(2))
1973
2056
  print(f"shape: {df.shape}")
@@ -1975,32 +2058,38 @@ def fload(fpath, kind=None, **kwargs):
1975
2058
  except:
1976
2059
  pass
1977
2060
  else:
1978
- engines = [None,"c", "python"]
1979
- for engine in engines:
1980
- separators = [",", "\t", ";", "|", " "]
1981
- for sep in separators:
1982
- try:
1983
- # sep2show = sep if sep != "\t" else "\\t"
1984
- # print(f"trying with: engine={engine}, sep='{sep2show}'")
1985
- # print(".")
1986
- df = pd.read_csv(
1987
- fpath,
1988
- engine=engine,
1989
- sep=sep,
1990
- on_bad_lines=on_bad_lines,
1991
- comment=comment,
1992
- **kwargs,
1993
- )
1994
- # display(df.head(2))
1995
- # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
1996
- if not is_df_abnormal(df, verbose=0):
1997
- display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
1998
- print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
1999
- return df
2000
- except EmptyDataError as e:
2001
- continue
2002
- else:
2003
- pass
2061
+ if not chunksize:
2062
+ engines = [None,"c", "python"]
2063
+ for engine in engines:
2064
+ separators = [",", "\t", ";", "|", " "]
2065
+ for sep in separators:
2066
+ try:
2067
+ # sep2show = sep if sep != "\t" else "\\t"
2068
+ # print(f"trying with: engine={engine}, sep='{sep2show}'")
2069
+ # print(".")
2070
+ df = pd.read_csv(
2071
+ fpath,
2072
+ engine=engine,
2073
+ sep=sep,
2074
+ on_bad_lines=on_bad_lines,
2075
+ comment=comment,
2076
+ chunksize=chunksize,
2077
+ low_memory=low_memory,
2078
+ **kwargs,
2079
+ )
2080
+ # display(df.head(2))
2081
+ # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
2082
+ if chunksize:
2083
+ df=_get_chunks(df)
2084
+ print(df.shape)
2085
+ if not is_df_abnormal(df, verbose=0):
2086
+ display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
2087
+ print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
2088
+ return df
2089
+ except EmptyDataError as e:
2090
+ continue
2091
+ else:
2092
+ pass
2004
2093
  display(df.head(2))
2005
2094
  print(f"shape: {df.shape}")
2006
2095
  return df
@@ -2008,7 +2097,7 @@ def fload(fpath, kind=None, **kwargs):
2008
2097
  def load_excel(fpath, **kwargs):
2009
2098
  engine = kwargs.get("engine", "openpyxl")
2010
2099
  verbose=kwargs.pop("verbose",False)
2011
- if verbose:
2100
+ if run_once_within():
2012
2101
  use_pd("read_excel", verbose=verbose)
2013
2102
  df = pd.read_excel(fpath, engine=engine, **kwargs)
2014
2103
  try:
@@ -2039,7 +2128,7 @@ def fload(fpath, kind=None, **kwargs):
2039
2128
  engine = kwargs.get("engine", "pyarrow")
2040
2129
  verbose = kwargs.pop("verbose", False)
2041
2130
 
2042
- if verbose:
2131
+ if run_once_within():
2043
2132
  use_pd("read_parquet", verbose=verbose)
2044
2133
  try:
2045
2134
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2056,6 +2145,8 @@ def fload(fpath, kind=None, **kwargs):
2056
2145
  return df
2057
2146
 
2058
2147
  def load_ipynb(fpath, **kwargs):
2148
+ import nbformat
2149
+ from nbconvert import MarkdownExporter
2059
2150
  as_version = kwargs.get("as_version", 4)
2060
2151
  with open(fpath, "r") as file:
2061
2152
  nb = nbformat.read(file, as_version=as_version)
@@ -2085,6 +2176,7 @@ def fload(fpath, kind=None, **kwargs):
2085
2176
  If page is an integer, it returns the text of the specified page number.
2086
2177
  If the specified page is not found, it returns the string "Page is not found".
2087
2178
  """
2179
+ from PyPDF2 import PdfReader
2088
2180
  text_dict = {}
2089
2181
  with open(fpath, "rb") as file:
2090
2182
  pdf_reader = PdfReader(file)
@@ -2114,6 +2206,7 @@ def fload(fpath, kind=None, **kwargs):
2114
2206
  return text_dict.get(int(page), "Page is not found")
2115
2207
 
2116
2208
  def load_docx(fpath):
2209
+ from docx import Document
2117
2210
  doc = Document(fpath)
2118
2211
  content = [para.text for para in doc.paragraphs]
2119
2212
  return content
@@ -2174,8 +2267,16 @@ def fload(fpath, kind=None, **kwargs):
2174
2267
  elif kind == "xml":
2175
2268
  return load_xml(fpath)
2176
2269
  elif kind in ["csv","tsv"]:
2270
+ verbose=kwargs.pop('verbose',False)
2271
+ if run_once_within():
2272
+ use_pd("read_csv")
2177
2273
  content = load_csv(fpath, **kwargs)
2178
2274
  return content
2275
+ elif kind=='pkl':
2276
+ verbose=kwargs.pop('verbose',False)
2277
+ if run_once_within():
2278
+ use_pd("read_pickle")
2279
+ return pd.read_pickle(fpath,**kwargs)
2179
2280
  elif kind in ["ods", "ods", "odt"]:
2180
2281
  engine = kwargs.get("engine", "odf")
2181
2282
  kwargs.pop("engine", None)
@@ -2184,25 +2285,40 @@ def fload(fpath, kind=None, **kwargs):
2184
2285
  engine = kwargs.get("engine", "xlrd")
2185
2286
  kwargs.pop("engine", None)
2186
2287
  content = load_excel(fpath, engine=engine, **kwargs)
2187
- print(f"shape: {content.shape}")
2188
- display(content.head(3))
2288
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2289
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2189
2290
  return content
2190
2291
  elif kind == "xlsx":
2191
2292
  content = load_excel(fpath, **kwargs)
2192
- display(content.head(3))
2193
- print(f"shape: {content.shape}")
2293
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2294
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2194
2295
  return content
2195
2296
  elif kind=='mtx':
2196
2297
  from scipy.io import mmread
2197
2298
  dat_mtx=mmread(fpath)
2198
2299
  content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
2199
- display(content.head(3))
2300
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2200
2301
  print(f"shape: {content.shape}")
2201
2302
  return content
2202
2303
  elif kind == "ipynb":
2203
2304
  return load_ipynb(fpath, **kwargs)
2204
2305
  elif kind in ['parquet','snappy']:
2306
+ verbose=kwargs.pop('verbose',False)
2307
+ if run_once_within():
2308
+ use_pd("read_parquet")
2205
2309
  return load_parquet(fpath,**kwargs)
2310
+ elif kind =='feather':
2311
+ verbose=kwargs.pop('verbose',False)
2312
+ if run_once_within():
2313
+ use_pd("read_feather")
2314
+ content=pd.read_feather(fpath,**kwargs)
2315
+ return content
2316
+ elif kind =='h5':
2317
+ content=pd.read_hdf(fpath,**kwargs)
2318
+ return content
2319
+ elif kind =='pkl':
2320
+ content=pd.read_pickle(fpath,**kwargs)
2321
+ return content
2206
2322
  elif kind == "pdf":
2207
2323
  # print('usage:load_pdf(fpath, page="all", verbose=False)')
2208
2324
  return load_pdf(fpath, **kwargs)
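The new 'feather', 'h5' and 'pkl' kinds route straight to the corresponding pandas readers. An illustrative round-trip with hypothetical file names (read_feather needs pyarrow installed, read_hdf needs the optional PyTables package):

    df = pd.read_feather("data.feather")   # kind='feather'
    df = pd.read_hdf("data.h5")            # kind='h5'
    df = pd.read_pickle("data.pkl")        # kind='pkl'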
@@ -2213,6 +2329,7 @@ def fload(fpath, kind=None, **kwargs):
2213
2329
  import GEOparse
2214
2330
  return GEOparse.get_GEO(filepath=fpath)
2215
2331
  elif kind.lower() in zip_types:
2332
+ from pprint import pp
2216
2333
  keep = kwargs.get("keep", False)
2217
2334
  fpath_unzip = unzip(fpath)
2218
2335
  if os.path.isdir(fpath_unzip):
@@ -2247,6 +2364,9 @@ def fload(fpath, kind=None, **kwargs):
2247
2364
  meta, data = fcsparser.parse(fpath, reformat_meta=True)
2248
2365
  return meta, data
2249
2366
 
2367
+ elif kind=="mplstyle":
2368
+ return read_mplstyle(fpath)
2369
+
2250
2370
  else:
2251
2371
  print("direct reading...")
2252
2372
  try:
@@ -2358,6 +2478,7 @@ def filter_kwargs(kws, valid_kwargs):
2358
2478
  }
2359
2479
  return kwargs_filtered
2360
2480
 
2481
+ str_space_speed='space cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
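The string above records an informal size/speed comparison used as a hint for the new serialization branches below. A hedged sketch of producing the files being compared (df and paths are hypothetical; to_feather needs pyarrow, to_hdf needs PyTables):

    df.to_csv("data.csv")               # largest / slowest in the note above
    df.to_parquet("data.parquet")       # smallest on disk in the note above
    df.to_feather("data.feather")
    df.to_pickle("data.pkl")
    df.to_hdf("data.h5", key="content")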
2361
2482
 
2362
2483
  def fsave(
2363
2484
  fpath,
@@ -2393,6 +2514,7 @@ def fsave(
2393
2514
  fappend(fpath, content=content)
2394
2515
 
2395
2516
  def save_docx(fpath, content, font_name, font_size, spacing):
2517
+ import docx
2396
2518
  if isinstance(content, str):
2397
2519
  content = content.split(". ")
2398
2520
  doc = docx.Document()
@@ -2420,6 +2542,7 @@ def fsave(
2420
2542
  save_content(fpath, html_content, mode)
2421
2543
 
2422
2544
  def save_pdf(fpath, content, font_name, font_size):
2545
+ from fpdf import FPDF
2423
2546
  pdf = FPDF()
2424
2547
  pdf.add_page()
2425
2548
  # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2433,7 +2556,7 @@ def fsave(
2433
2556
  # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
2434
2557
 
2435
2558
  verbose=kwargs.pop("verbose",False)
2436
- if verbose:
2559
+ if run_once_within():
2437
2560
  use_pd("to_csv", verbose=verbose)
2438
2561
  kwargs_csv = dict(
2439
2562
  path_or_buf=None,
@@ -2465,7 +2588,7 @@ def fsave(
2465
2588
  def save_xlsx(fpath, data, **kwargs):
2466
2589
  verbose=kwargs.pop("verbose",False)
2467
2590
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2468
- if verbose:
2591
+ if run_once_within():
2469
2592
  use_pd("to_excel", verbose=verbose)
2470
2593
  if any(kwargs):
2471
2594
  format_excel(df=data, filename=fpath, **kwargs)
@@ -2491,9 +2614,10 @@ def fsave(
2491
2614
 
2492
2615
  def save_ipynb(fpath, data, **kwargs):
2493
2616
  # Split the content by code fences to distinguish between code and markdown
2617
+ import nbformat
2494
2618
  parts = data.split("```")
2495
2619
  cells = []
2496
-
2620
+
2497
2621
  for i, part in enumerate(parts):
2498
2622
  if i % 2 == 0:
2499
2623
  # Even index: markdown content
@@ -2513,6 +2637,7 @@ def fsave(
2513
2637
  # json.dump(data, file, **kwargs)
2514
2638
 
2515
2639
  def save_json(fpath_fname, var_dict_or_df):
2640
+ import json
2516
2641
  def _convert_js(data):
2517
2642
  if isinstance(data, pd.DataFrame):
2518
2643
  return data.to_dict(orient="list")
@@ -2534,10 +2659,12 @@ def fsave(
2534
2659
  # # setss = jsonload("/.json")
2535
2660
 
2536
2661
  def save_yaml(fpath, data, **kwargs):
2662
+ import yaml
2537
2663
  with open(fpath, "w") as file:
2538
2664
  yaml.dump(data, file, **kwargs)
2539
2665
 
2540
2666
  def save_xml(fpath, data):
2667
+ from lxml import etree
2541
2668
  root = etree.Element("root")
2542
2669
  if isinstance(data, dict):
2543
2670
  for key, val in data.items():
@@ -2613,15 +2740,91 @@ def fsave(
2613
2740
  elif kind == "ipynb":
2614
2741
  save_ipynb(fpath, content, **kwargs)
2615
2742
  elif kind.lower() in ["parquet","pq","big","par"]:
2743
+ verbose=kwargs.pop('verbose',False)
2744
+ if verbose:
2745
+ print(str_space_speed)
2746
+ use_pd("to_parquet")
2747
+ return None
2616
2748
  compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2617
2749
  # fix the fpath ends
2618
- if not '.parquet' in fpath:
2619
- fpath=fpath.replace(kind, 'parquet')
2750
+ _fpath, _ext = os.path.splitext(fpath)
2751
+ fpath = _fpath+_ext.replace(kind, 'parquet')
2620
2752
  if compression is not None:
2621
2753
  if not fpath.endswith(compression):
2622
2754
  fpath=fpath+f".{compression}"
2623
2755
  save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
2756
+ elif kind.lower() in ["pkl","pk","pickle","pick"]:
2757
+ # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
2758
+ # Pickle is convenient if you want to preserve exact Python object types.
2759
+ verbose=kwargs.pop('verbose',False)
2760
+ if verbose:
2761
+ print(str_space_speed)
2762
+ use_pd("to_pickle")
2763
+ return None
2764
+ _fpath, _ext = os.path.splitext(fpath)
2765
+ fpath = _fpath+_ext.replace(kind, 'pkl')
2766
+ compression=kwargs.pop("compression",None)
2767
+ if compression is not None:
2768
+ if not fpath.endswith(compression["method"]):
2769
+ fpath=fpath+f".{compression["method"]}"
2770
+ if isinstance(content, pd.DataFrame):
2771
+ content.to_pickle(fpath,**kwargs)
2772
+ else:
2773
+ try:
2774
+ print("trying to convert it as a DataFrame...")
2775
+ content=pd.DataFrame(content)
2776
+ content.to_pickle(fpath,**kwargs)
2777
+ except Exception as e:
2778
+ raise ValueError(
2779
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2780
+ )
2781
+ elif kind.lower() in ["fea",'feather','ft','fe','feat','fether']:
2782
+ # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
2783
+ # optimized for data analytics tasks and is especially fast when working with Pandas.
2784
+
2785
+ verbose=kwargs.pop('verbose',False)
2786
+ if verbose:
2787
+ print(str_space_speed)
2788
+ use_pd("to_feather")
2789
+ return None
2790
+ _fpath, _ext = os.path.splitext(fpath)
2791
+ fpath = _fpath+_ext.replace(kind, 'feather')
2792
+ if isinstance(content, pd.DataFrame):
2793
+ content.to_feather(fpath,**kwargs)
2794
+ else:
2795
+ try:
2796
+ print("trying to convert it as a DataFrame...")
2797
+ content=pd.DataFrame(content)
2798
+ content.to_feather(fpath, **kwargs)
2799
+ except Exception as e:
2800
+ raise ValueError(
2801
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2802
+ )
2803
+ elif kind.lower() in ["hd",'hdf','h','h5']:
2804
+ # particularly useful for large datasets and can handle complex data structures
2805
+ verbose=kwargs.pop('verbose',False)
2806
+ if verbose:
2807
+ print(str_space_speed)
2808
+ use_pd("to_hdf")
2809
+ _fpath, _ext = os.path.splitext(fpath)
2810
+ fpath = _fpath+_ext.replace(kind, 'h5')
2811
+ compression=kwargs.pop("compression",None)
2812
+ if compression is not None:
2813
+ if not fpath.endswith(compression):
2814
+ fpath=fpath+f".{compression}"
2815
+ if isinstance(content, pd.DataFrame):
2816
+ content.to_hdf(fpath,key='content',**kwargs)
2817
+ else:
2818
+ try:
2819
+ print("trying to convert it as a DataFrame...")
2820
+ content=pd.DataFrame(content)
2821
+ content.to_hdf(fpath,key='content',**kwargs)
2822
+ except Exception as e:
2823
+ raise ValueError(
2824
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2825
+ )
2624
2826
  else:
2827
+ from . import netfinder
2625
2828
  try:
2626
2829
  netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
2627
2830
  except:
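The parquet, pkl, feather and h5 branches above now normalize the target extension with os.path.splitext instead of replacing the kind string anywhere in the path. A small illustration of that step (file name hypothetical):

    import os
    _fpath, _ext = os.path.splitext("results.pq")
    _fpath + _ext.replace("pq", "parquet")   # -> 'results.parquet'; only the suffix changes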
@@ -2744,6 +2947,7 @@ def isa(content, kind):
2744
2947
  elif "color" in kind.lower(): # file
2745
2948
  return is_str_color(content)
2746
2949
  elif "html" in kind.lower():
2950
+ import re
2747
2951
  if content is None or not isinstance(content, str):
2748
2952
  return False
2749
2953
  # Remove leading and trailing whitespace
@@ -2903,6 +3107,7 @@ def listdir(
2903
3107
  display(f.head())
2904
3108
  return f
2905
3109
  else:
3110
+ from box import Box
2906
3111
  if "l" in orient.lower(): # list # default
2907
3112
  res_output = Box(f.to_dict(orient="list"))
2908
3113
  return res_output
@@ -2943,13 +3148,10 @@ def mkdir_nest(fpath: str) -> str:
2943
3148
  Returns:
2944
3149
  - str: The path of the created directory.
2945
3150
  """
2946
-
2947
-
2948
3151
  # Split the full path into directories
2949
3152
  f_slash = "/" if "mac" in get_os().lower() else "\\"
2950
3153
  if os.path.isdir(fpath):
2951
3154
  fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
2952
- print(fpath)
2953
3155
  return fpath
2954
3156
  dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
2955
3157
 
@@ -3020,7 +3222,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
3020
3222
  if len(rootdir) == 1:
3021
3223
  rootdir = rootdir[0]
3022
3224
  rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
3023
- print(rootdir)
3225
+
3024
3226
  return rootdir
3025
3227
 
3026
3228
 
@@ -3032,6 +3234,8 @@ def split_path(fpath):
3032
3234
 
3033
3235
 
3034
3236
  def figsave(*args, dpi=300):
3237
+ import matplotlib.pyplot as plt
3238
+ from PIL import Image
3035
3239
  dir_save = None
3036
3240
  fname = None
3037
3241
  img = None
@@ -3047,13 +3251,13 @@ def figsave(*args, dpi=300):
3047
3251
 
3048
3252
  if dir_save is None:
3049
3253
  dir_save="./"
3050
- print(dir_save)
3254
+
3051
3255
  # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
3052
3256
  dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
3053
3257
  dir_ch = "".join(dir_save.split(f_slash)[-1:])
3054
3258
  if not dir_par.endswith(f_slash):
3055
3259
  dir_par += f_slash
3056
- print(dir_par)
3260
+
3057
3261
  if fname is None:
3058
3262
  fname = dir_ch
3059
3263
  mkdir(dir_par)
@@ -3140,6 +3344,7 @@ def figsave(*args, dpi=300):
3140
3344
  def is_str_color(s):
3141
3345
  # Regular expression pattern for hexadecimal color codes
3142
3346
  if isinstance(s,str):
3347
+ import re
3143
3348
  color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
3144
3349
  return re.match(color_code_pattern, s) is not None
3145
3350
  else:
@@ -3166,6 +3371,7 @@ def isnum(s):
3166
3371
 
3167
3372
 
3168
3373
  def is_image(fpath):
3374
+ import mimetypes
3169
3375
  mime_type, _ = mimetypes.guess_type(fpath)
3170
3376
  if mime_type and mime_type.startswith("image"):
3171
3377
  return True
@@ -3174,6 +3380,7 @@ def is_image(fpath):
3174
3380
 
3175
3381
 
3176
3382
  def is_document(fpath):
3383
+ import mimetypes
3177
3384
  mime_type, _ = mimetypes.guess_type(fpath)
3178
3385
  if mime_type and (
3179
3386
  mime_type.startswith("text/")
@@ -3194,6 +3401,7 @@ def is_document(fpath):
3194
3401
 
3195
3402
 
3196
3403
  def is_zip(fpath):
3404
+ import mimetypes
3197
3405
  mime_type, _ = mimetypes.guess_type(fpath)
3198
3406
  if mime_type == "application/zip":
3199
3407
  return True
@@ -3202,6 +3410,7 @@ def is_zip(fpath):
3202
3410
 
3203
3411
 
3204
3412
  def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
3413
+ import matplotlib.pyplot as plt
3205
3414
  if ax is None:
3206
3415
  ax = plt.gca()
3207
3416
  for loc, spine in ax.spines.items():
@@ -3290,7 +3499,7 @@ def apply_filter(img, *args):
3290
3499
  Returns:
3291
3500
  PIL.Image: The filtered image.
3292
3501
  """
3293
-
3502
+ from PIL import ImageFilter
3294
3503
  def correct_filter_name(filter_name):
3295
3504
  if "bl" in filter_name.lower() and "box" not in filter_name.lower():
3296
3505
  return "BLUR"
@@ -3532,6 +3741,8 @@ def imgsets(img, **kwargs):
3532
3741
  avg_contrast_factor = sum(contrast_factors) / num_channels
3533
3742
  return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
3534
3743
 
3744
+ import matplotlib.pyplot as plt
3745
+ from PIL import ImageEnhance,ImageOps
3535
3746
  # Load image if input is a file path
3536
3747
  if isinstance(img, str):
3537
3748
  img = load_img(img)
@@ -3595,6 +3806,7 @@ def imgsets(img, **kwargs):
3595
3806
  elif "pad" in k.lower():
3596
3807
  img_update = ImageOps.pad(img_update, size=value)
3597
3808
  elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
3809
+ from rembg import remove, new_session
3598
3810
  if isinstance(value, bool):
3599
3811
  session = new_session("isnet-general-use")
3600
3812
  img_update = remove(img_update, session=session)
@@ -3633,6 +3845,7 @@ def imgsets(img, **kwargs):
3633
3845
  else:
3634
3846
  img_update = remove(img_update)
3635
3847
  elif "bg" in k.lower() and "color" in k.lower():
3848
+ from rembg import remove
3636
3849
  if isinstance(value, list):
3637
3850
  value = tuple(value)
3638
3851
  if isinstance(value, tuple): # replace the background color
@@ -3664,6 +3877,8 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3664
3877
  Args:
3665
3878
  dir_img_list (list): List of the Directory containing the images.
3666
3879
  """
3880
+ import matplotlib.pyplot as plt
3881
+ from PIL import Image
3667
3882
  num_images = len(dir_img_list)
3668
3883
  if not kind.startswith("."):
3669
3884
  kind = "." + kind
@@ -3700,28 +3915,15 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3700
3915
  # usage:
3701
3916
  # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
3702
3917
  # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
3703
- def read_mplstyle(style_file):
3704
- # Load the style file
3705
- plt.style.use(style_file)
3706
-
3707
- # Get the current style properties
3708
- style_dict = plt.rcParams
3709
-
3710
- # Convert to dictionary
3711
- style_dict = dict(style_dict)
3712
- # Print the style dictionary
3713
- for i, j in style_dict.items():
3714
- print(f"\n{i}::::{j}")
3715
- return style_dict
3716
3918
 
3717
3919
 
3718
- # #example usage:
3719
- # style_file = "/ std-colors.mplstyle"
3720
- # style_dict = read_mplstyle(style_file)
3721
-
3722
3920
 
3723
3921
  # search and fine the director of the libary, which installed at local
3724
3922
  def dir_lib(lib_oi):
3923
+ """
3924
+ # example usage:
3925
+ # dir_lib("seaborn")
3926
+ """
3725
3927
  import site
3726
3928
 
3727
3929
  # Get the site-packages directory
@@ -3739,23 +3941,6 @@ def dir_lib(lib_oi):
3739
3941
  print(f"Cannot find the {lib_oi} in site-packages directory.")
3740
3942
  return dir_list
3741
3943
 
3742
-
3743
- # example usage:
3744
- # dir_lib("seaborn")
3745
-
3746
- """
3747
- # n = 7
3748
- # clist = get_color(n, cmap="auto", how="linspace") # get_color(100)
3749
- # plt.figure(figsize=[8, 5], dpi=100)
3750
- # x = np.linspace(0, 2 * np.pi, 50) * 100
3751
- # y = np.sin(x)
3752
- # for i in range(1, n + 1):
3753
- # plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
3754
- # plt.legend()
3755
- # plt.ylim(-2, 20)
3756
- # figsets(plt.gca(), {"style": "whitegrid"}) """
3757
-
3758
-
3759
3944
  class FileInfo:
3760
3945
  def __init__(
3761
3946
  self,
@@ -3832,6 +4017,7 @@ class FileInfo:
3832
4017
 
3833
4018
 
3834
4019
  def finfo(fpath):
4020
+ import time
3835
4021
  fname, fmt = os.path.splitext(fpath)
3836
4022
  dir_par = os.path.dirname(fpath) + "/"
3837
4023
  data = {
@@ -3846,6 +4032,7 @@ def finfo(fpath):
3846
4032
  }
3847
4033
  extra_info = {}
3848
4034
  if data["kind"] == ".pdf":
4035
+ from pdf2image import pdfinfo_from_path
3849
4036
  extra_info = pdfinfo_from_path(fpath)
3850
4037
 
3851
4038
  return FileInfo(
@@ -3860,18 +4047,7 @@ def finfo(fpath):
3860
4047
  extra_info=extra_info,
3861
4048
  )
3862
4049
 
3863
-
3864
4050
  # ! format excel file
3865
- import pandas as pd
3866
- from datetime import datetime
3867
- from openpyxl import load_workbook
3868
- from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
3869
- from openpyxl.utils import get_column_letter
3870
- from openpyxl.worksheet.datavalidation import DataValidation
3871
- from openpyxl.comments import Comment
3872
- from openpyxl.formatting.rule import ColorScaleRule
3873
-
3874
-
3875
4051
  def hex2argb(hex_color):
3876
4052
  """
3877
4053
  Convert a hex color code to aARGB format required by openpyxl.
@@ -3902,341 +4078,7 @@ def hex2argb(hex_color):
3902
4078
  return hex_color[-9:]
3903
4079
  else:
3904
4080
  return "F" * (9 - len(hex_color)) + hex_color
3905
- raise ValueError(
3906
- "Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
3907
- )
3908
-
3909
-
3910
- def convert_indices_to_range(row_slice, col_slice):
3911
- """Convert numerical row and column slices to Excel-style range strings."""
3912
- start_row = row_slice.start + 1
3913
- end_row = row_slice.stop if row_slice.stop is not None else None
3914
- start_col = col_slice.start + 1
3915
- end_col = col_slice.stop if col_slice.stop is not None else None
3916
-
3917
- start_col_letter = get_column_letter(start_col)
3918
- end_col_letter = get_column_letter(end_col) if end_col else None
3919
- return (
3920
- f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
3921
- if end_col_letter
3922
- else f"{start_col_letter}{start_row}"
3923
- )
3924
-
3925
-
3926
- def apply_format(ws, cell, cell_range):
3927
- """Apply cell formatting to a specified range."""
3928
- cell_font, cell_fill, cell_alignment, border = None, None, None, None
3929
- kws_cell = ["font", "fill", "alignment", "border"]
3930
- for K, _ in cell.items():
3931
- if strcmp(K, kws_cell)[0] == "font":
3932
- #! font
3933
- font_color = "000000"
3934
- font_name = "Arial"
3935
- font_underline = "none"
3936
- font_size = 14
3937
- font_bold = False
3938
- font_strike = False
3939
- font_italic = False
3940
- kws_font = [
3941
- "name",
3942
- "size",
3943
- "bold",
3944
- "underline",
3945
- "color",
3946
- "strike",
3947
- "italic",
3948
- ]
3949
- for k_, v_ in cell.get(K, {}).items():
3950
- if strcmp(k_, kws_font)[0] == "name":
3951
- font_name = v_
3952
- elif strcmp(k_, kws_font)[0] == "size":
3953
- font_size = v_
3954
- elif strcmp(k_, kws_font)[0] == "bold":
3955
- font_bold = v_
3956
- elif strcmp(k_, kws_font)[0] == "underline":
3957
- font_underline = strcmp(v_, ["none", "single", "double"])[0]
3958
- elif strcmp(k_, kws_font)[0] == "color":
3959
- font_color = hex2argb(v_)
3960
- elif strcmp(k_, kws_font)[0] == "strike":
3961
- font_strike = v_
3962
- elif strcmp(k_, kws_font)[0] == "italic":
3963
- font_italic = v_
3964
-
3965
- cell_font = Font(
3966
- name=font_name,
3967
- size=font_size,
3968
- bold=font_bold,
3969
- italic=font_italic,
3970
- underline=font_underline,
3971
- strike=font_strike,
3972
- color=font_color,
3973
- )
3974
-
3975
- if strcmp(K, kws_cell)[0] == "fill":
3976
- #! fill
3977
- kws_fill = ["start_color", "end_color", "fill_type", "color"]
3978
- kws_fill_type = [
3979
- "darkVertical",
3980
- "lightDown",
3981
- "lightGrid",
3982
- "solid",
3983
- "darkDown",
3984
- "lightGray",
3985
- "lightUp",
3986
- "gray0625",
3987
- "lightVertical",
3988
- "lightHorizontal",
3989
- "darkHorizontal",
3990
- "gray125",
3991
- "darkUp",
3992
- "mediumGray",
3993
- "darkTrellis",
3994
- "darkGray",
3995
- "lightTrellis",
3996
- "darkGrid",
3997
- ]
3998
- start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
3999
- for k, v in cell.get(K, {}).items():
4000
- if strcmp(k, kws_fill)[0] == "color":
4001
- start_color, end_color = hex2argb(v), hex2argb(v)
4002
- break
4003
- for k, v in cell.get(K, {}).items():
4004
- if strcmp(k, kws_fill)[0] == "start_color":
4005
- start_color = hex2argb(v)
4006
- elif strcmp(k, kws_fill)[0] == "end_color":
4007
- end_color = hex2argb(v)
4008
- elif strcmp(k, kws_fill)[0] == "fill_type":
4009
- fill_type = strcmp(v, kws_fill_type)[0]
4010
- cell_fill = PatternFill(
4011
- start_color=start_color,
4012
- end_color=end_color,
4013
- fill_type=fill_type,
4014
- )
4015
-
4016
- if strcmp(K, kws_cell)[0] == "alignment":
4017
- #! alignment
4018
- # default
4019
- align_horizontal = "general"
4020
- align_vertical = "center"
4021
- align_rot = 0
4022
- align_wrap = False
4023
- align_shrink = False
4024
- align_indent = 0
4025
- kws_align = [
4026
- "horizontal",
4027
- "ha",
4028
- "vertical",
4029
- "va",
4030
- "text_rotation",
4031
- "rotat",
4032
- "rot",
4033
- "wrap_text",
4034
- "wrap",
4035
- "shrink_to_fit",
4036
- "shrink",
4037
- "indent",
4038
- ]
4039
- for k, v in cell.get(K, {}).items():
4040
- if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
4041
- align_horizontal = strcmp(
4042
- v, ["general", "left", "right", "center"]
4043
- )[0]
4044
- elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
4045
- align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
4046
- elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
4047
- align_rot = v
4048
- elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
4049
- align_wrap = v
4050
- elif strcmp(k, kws_align)[0] in [
4051
- "shrink_to_fit",
4052
- "shrink",
4053
- "wrap_text",
4054
- "wrap",
4055
- ]:
4056
- align_shrink = v
4057
- elif strcmp(k, kws_align)[0] in ["indent"]:
4058
- align_indent = v
4059
- cell_alignment = Alignment(
4060
- horizontal=align_horizontal,
4061
- vertical=align_vertical,
4062
- text_rotation=align_rot,
4063
- wrap_text=align_wrap,
4064
- shrink_to_fit=align_shrink,
4065
- indent=align_indent,
4066
- )
4067
-
4068
- if strcmp(K, kws_cell)[0] == "border":
4069
- #! border
4070
- kws_border = [
4071
- "color_left",
4072
- "color_l",
4073
- "color_right",
4074
- "color_r",
4075
- "color_top",
4076
- "color_t",
4077
- "color_bottom",
4078
- "color_b",
4079
- "color_diagonal",
4080
- "color_d",
4081
- "color_outline",
4082
- "color_o",
4083
- "color_vertical",
4084
- "color_v",
4085
- "color_horizontal",
4086
- "color_h",
4087
- "color",
4088
- "style_left",
4089
- "style_l",
4090
- "style_right",
4091
- "style_r",
4092
- "style_top",
4093
- "style_t",
4094
- "style_bottom",
4095
- "style_b",
4096
- "style_diagonal",
4097
- "style_d",
4098
- "style_outline",
4099
- "style_o",
4100
- "style_vertical",
4101
- "style_v",
4102
- "style_horizontal",
4103
- "style_h",
4104
- "style",
4105
- ]
4106
- # * border color
4107
- border_color_l, border_color_r, border_color_t, border_color_b = (
4108
- "FF000000",
4109
- "FF000000",
4110
- "FF000000",
4111
- "FF000000",
4112
- )
4113
- border_color_d, border_color_o, border_color_v, border_color_h = (
4114
- "FF000000",
4115
- "FF000000",
4116
- "FF000000",
4117
- "FF000000",
4118
- )
4119
- # get colors config
4120
- for k, v in cell.get(K, {}).items():
4121
- if strcmp(k, kws_border)[0] in ["color"]:
4122
- border_color_all = hex2argb(v)
4123
- # 如果设置了color,表示其它的所有的都设置成为一样的
4124
- # 然后再才开始自己定义其它的color
4125
- border_color_l, border_color_r, border_color_t, border_color_b = (
4126
- border_color_all,
4127
- border_color_all,
4128
- border_color_all,
4129
- border_color_all,
4130
- )
4131
- border_color_d, border_color_o, border_color_v, border_color_h = (
4132
- border_color_all,
4133
- border_color_all,
4134
- border_color_all,
4135
- border_color_all,
4136
- )
4137
- elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
4138
- border_color_l = hex2argb(v)
4139
- elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
4140
- border_color_r = hex2argb(v)
4141
- elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
4142
- border_color_t = hex2argb(v)
4143
- elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
4144
- border_color_b = hex2argb(v)
4145
- elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
4146
- border_color_d = hex2argb(v)
4147
- elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
4148
- border_color_o = hex2argb(v)
4149
- elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
4150
- border_color_v = hex2argb(v)
4151
- elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4152
- border_color_h = hex2argb(v)
4153
- # *border style
4154
- border_styles = [
4155
- "thin",
4156
- "medium",
4157
- "thick",
4158
- "dotted",
4159
- "dashed",
4160
- "hair",
4161
- "mediumDashed",
4162
- "dashDot",
4163
- "dashDotDot",
4164
- "slantDashDot",
4165
- "none",
4166
- ]
4167
- border_style_l, border_style_r, border_style_t, border_style_b = (
4168
- None,
4169
- None,
4170
- None,
4171
- None,
4172
- )
4173
- border_style_d, border_style_o, border_style_v, border_style_h = (
4174
- None,
4175
- None,
4176
- None,
4177
- None,
4178
- )
4179
- # get styles config
4180
- for k, v in cell.get(K, {}).items():
4181
- # if not "style" in k:
4182
- # break
4183
- if strcmp(k, kws_border)[0] in ["style"]:
4184
- border_style_all = strcmp(v, border_styles)[0]
4185
- # 如果设置了style,表示其它的所有的都设置成为一样的
4186
- # 然后再才开始自己定义其它的style
4187
- border_style_l, border_style_r, border_style_t, border_style_b = (
4188
- border_style_all,
4189
- border_style_all,
4190
- border_style_all,
4191
- border_style_all,
4192
- )
4193
- border_style_d, border_style_o, border_style_v, border_style_h = (
4194
- border_style_all,
4195
- border_style_all,
4196
- border_style_all,
4197
- border_style_all,
4198
- )
4199
- elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
4200
- border_style_l = strcmp(v, border_styles)[0]
4201
- elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
4202
- border_style_r = strcmp(v, border_styles)[0]
4203
- elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
4204
- border_style_t = strcmp(v, border_styles)[0]
4205
- elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
4206
- border_style_b = strcmp(v, border_styles)[0]
4207
- elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
4208
- border_style_d = strcmp(v, border_styles)[0]
4209
- elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
4210
- border_style_o = strcmp(v, border_styles)[0]
4211
- elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
4212
- border_style_v = strcmp(v, border_styles)[0]
4213
- elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
4214
- border_style_h = strcmp(v, border_styles)[0]
4215
- # * apply border config
4216
- border = Border(
4217
- left=Side(border_style=border_style_l, color=border_color_l),
4218
- right=Side(border_style=border_style_r, color=border_color_r),
4219
- top=Side(border_style=border_style_t, color=border_color_t),
4220
- bottom=Side(border_style=border_style_b, color=border_color_b),
4221
- diagonal=Side(border_style=border_style_d, color=border_color_d),
4222
- diagonal_direction=0,
4223
- outline=Side(border_style=border_style_o, color=border_color_o),
4224
- vertical=Side(border_style=border_style_v, color=border_color_v),
4225
- horizontal=Side(border_style=border_style_h, color=border_color_h),
4226
- )
4227
-
4228
- #! final apply configs
4229
- for row in ws[cell_range]:
4230
- for cell_ in row:
4231
- if cell_font:
4232
- cell_.font = cell_font
4233
- if cell_fill:
4234
- cell_.fill = cell_fill
4235
- if cell_alignment:
4236
- cell_.alignment = cell_alignment
4237
- if border:
4238
- cell_.border = border
4239
-
4081
+ raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
4240
4082
 
4241
4083
  def format_excel(
4242
4084
  df=None,
@@ -4257,6 +4099,255 @@ def format_excel(
4257
4099
  conditional_format=None, # dict
4258
4100
  **kwargs,
4259
4101
  ):
4102
+ import pandas as pd
4103
+ from datetime import datetime
4104
+ from openpyxl import load_workbook
4105
+ from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
4106
+ from openpyxl.utils import get_column_letter
4107
+ from openpyxl.worksheet.datavalidation import DataValidation
4108
+ from openpyxl.comments import Comment
4109
+ from openpyxl.formatting.rule import ColorScaleRule
4110
+
4111
+ def convert_indices_to_range(row_slice, col_slice):
4112
+ """Convert numerical row and column slices to Excel-style range strings."""
4113
+ start_row = row_slice.start + 1
4114
+ end_row = row_slice.stop if row_slice.stop is not None else None
4115
+ start_col = col_slice.start + 1
4116
+ end_col = col_slice.stop if col_slice.stop is not None else None
4117
+
4118
+ start_col_letter = get_column_letter(start_col)
4119
+ end_col_letter = get_column_letter(end_col) if end_col else None
4120
+ return (
4121
+ f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
4122
+ if end_col_letter
4123
+ else f"{start_col_letter}{start_row}"
4124
+ )
4125
+
4126
+ def apply_format(ws, cell, cell_range):
4127
+ """Apply cell formatting to a specified range."""
4128
+ cell_font, cell_fill, cell_alignment, border = None, None, None, None
4129
+ kws_cell = ["font", "fill", "alignment", "border"]
4130
+ for K, _ in cell.items():
4131
+ if strcmp(K, kws_cell)[0] == "font":
4132
+ #! font
4133
+ font_color = "000000"
4134
+ font_name = "Arial"
4135
+ font_underline = "none"
4136
+ font_size = 14
4137
+ font_bold = False
4138
+ font_strike = False
4139
+ font_italic = False
4140
+ kws_font = ["name","size","bold","underline","color","strike","italic"]
4141
+ for k_, v_ in cell.get(K, {}).items():
4142
+ if strcmp(k_, kws_font)[0] == "name":
4143
+ font_name = v_
4144
+ elif strcmp(k_, kws_font)[0] == "size":
4145
+ font_size = v_
4146
+ elif strcmp(k_, kws_font)[0] == "bold":
4147
+ font_bold = v_
4148
+ elif strcmp(k_, kws_font)[0] == "underline":
4149
+ font_underline = strcmp(v_, ["none", "single", "double"])[0]
4150
+ elif strcmp(k_, kws_font)[0] == "color":
4151
+ font_color = hex2argb(v_)
4152
+ elif strcmp(k_, kws_font)[0] == "strike":
4153
+ font_strike = v_
4154
+ elif strcmp(k_, kws_font)[0] == "italic":
4155
+ font_italic = v_
4156
+
4157
+ cell_font = Font(
4158
+ name=font_name,
4159
+ size=font_size,
4160
+ bold=font_bold,
4161
+ italic=font_italic,
4162
+ underline=font_underline,
4163
+ strike=font_strike,
4164
+ color=font_color,
4165
+ )
4166
+
4167
+ if strcmp(K, kws_cell)[0] == "fill":
4168
+ #! fill
4169
+ kws_fill = ["start_color", "end_color", "fill_type", "color"]
4170
+ kws_fill_type = ["darkVertical","lightDown","lightGrid","solid","darkDown","lightGray","lightUp","gray0625","lightVertical","lightHorizontal",
4171
+ "darkHorizontal","gray125","darkUp","mediumGray","darkTrellis","darkGray","lightTrellis","darkGrid"]
4172
+ start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
4173
+ for k, v in cell.get(K, {}).items():
4174
+ if strcmp(k, kws_fill)[0] == "color":
4175
+ start_color, end_color = hex2argb(v), hex2argb(v)
4176
+ break
4177
+ for k, v in cell.get(K, {}).items():
4178
+ if strcmp(k, kws_fill)[0] == "start_color":
4179
+ start_color = hex2argb(v)
4180
+ elif strcmp(k, kws_fill)[0] == "end_color":
4181
+ end_color = hex2argb(v)
4182
+ elif strcmp(k, kws_fill)[0] == "fill_type":
4183
+ fill_type = strcmp(v, kws_fill_type)[0]
4184
+ cell_fill = PatternFill(
4185
+ start_color=start_color,
4186
+ end_color=end_color,
4187
+ fill_type=fill_type,
4188
+ )
4189
+
4190
+ if strcmp(K, kws_cell)[0] == "alignment":
4191
+ #! alignment
4192
+ # default
4193
+ align_horizontal = "general"
4194
+ align_vertical = "center"
4195
+ align_rot = 0
4196
+ align_wrap = False
4197
+ align_shrink = False
4198
+ align_indent = 0
4199
+ kws_align = [
4200
+ "horizontal",
4201
+ "ha",
4202
+ "vertical",
4203
+ "va",
4204
+ "text_rotation",
4205
+ "rotat",
4206
+ "rot",
4207
+ "wrap_text",
4208
+ "wrap",
4209
+ "shrink_to_fit",
4210
+ "shrink",
4211
+ "indent",
4212
+ ]
4213
+ for k, v in cell.get(K, {}).items():
4214
+ if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
4215
+ align_horizontal = strcmp(
4216
+ v, ["general", "left", "right", "center"]
4217
+ )[0]
4218
+ elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
4219
+ align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
4220
+ elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
4221
+ align_rot = v
4222
+ elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
4223
+ align_wrap = v
4224
+ elif strcmp(k, kws_align)[0] in [
4225
+ "shrink_to_fit",
4226
+ "shrink",
4227
+ "wrap_text",
4228
+ "wrap",
4229
+ ]:
4230
+ align_shrink = v
4231
+ elif strcmp(k, kws_align)[0] in ["indent"]:
4232
+ align_indent = v
4233
+ cell_alignment = Alignment(
4234
+ horizontal=align_horizontal,
4235
+ vertical=align_vertical,
4236
+ text_rotation=align_rot,
4237
+ wrap_text=align_wrap,
4238
+ shrink_to_fit=align_shrink,
4239
+ indent=align_indent,
4240
+ )
4241
+
4242
+ if strcmp(K, kws_cell)[0] == "border":
4243
+ #! border
4244
+ kws_border = ["color_left","color_l","color_right","color_r","color_top","color_t","color_bottom","color_b",
4245
+ "color_diagonal","color_d","color_outline","color_o","color_vertical","color_v","color_horizontal",
4246
+ "color_h","color","style_left","style_l","style_right","style_r","style_top","style_t","style_bottom","style_b",
4247
+ "style_diagonal","style_d","style_outline","style_o","style_vertical","style_v","style_horizontal",
4248
+ "style_h","style"]
4249
+ # * border color
4250
+ border_color_l, border_color_r, border_color_t, border_color_b = ("FF000000","FF000000","FF000000","FF000000")
4251
+ border_color_d, border_color_o, border_color_v, border_color_h = ("FF000000","FF000000","FF000000","FF000000")
4252
+ # get colors config
4253
+ for k, v in cell.get(K, {}).items():
4254
+ if strcmp(k, kws_border)[0] in ["color"]:
4255
+ border_color_all = hex2argb(v)
4256
+ # if 'color' is set, the same color is applied to all sides first
4257
+ # individual side colors can then override it below
4258
+ border_color_l, border_color_r, border_color_t, border_color_b = (
4259
+ border_color_all,
4260
+ border_color_all,
4261
+ border_color_all,
4262
+ border_color_all,
4263
+ )
4264
+ border_color_d, border_color_o, border_color_v, border_color_h = (
4265
+ border_color_all,
4266
+ border_color_all,
4267
+ border_color_all,
4268
+ border_color_all,
4269
+ )
4270
+ elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
4271
+ border_color_l = hex2argb(v)
4272
+ elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
4273
+ border_color_r = hex2argb(v)
4274
+ elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
4275
+ border_color_t = hex2argb(v)
4276
+ elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
4277
+ border_color_b = hex2argb(v)
4278
+ elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
4279
+ border_color_d = hex2argb(v)
4280
+ elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
4281
+ border_color_o = hex2argb(v)
4282
+ elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
4283
+ border_color_v = hex2argb(v)
4284
+ elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4285
+ border_color_h = hex2argb(v)
4286
+ # *border style
4287
+ border_styles = ["thin","medium","thick","dotted","dashed",
4288
+ "hair","mediumDashed","dashDot","dashDotDot","slantDashDot","none"]
4289
+ border_style_l, border_style_r, border_style_t, border_style_b = (None,None,None,None)
4290
+ border_style_d, border_style_o, border_style_v, border_style_h = (None,None,None,None)
4291
+ # get styles config
4292
+ for k, v in cell.get(K, {}).items():
4293
+ # if not "style" in k:
4294
+ # break
4295
+ if strcmp(k, kws_border)[0] in ["style"]:
4296
+ border_style_all = strcmp(v, border_styles)[0]
4297
+ # if 'style' is set, the same style is applied to all sides first
4298
+ # individual side styles can then override it below
4299
+ border_style_l, border_style_r, border_style_t, border_style_b = (
4300
+ border_style_all,
4301
+ border_style_all,
4302
+ border_style_all,
4303
+ border_style_all,
4304
+ )
4305
+ border_style_d, border_style_o, border_style_v, border_style_h = (
4306
+ border_style_all,
4307
+ border_style_all,
4308
+ border_style_all,
4309
+ border_style_all,
4310
+ )
4311
+ elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
4312
+ border_style_l = strcmp(v, border_styles)[0]
4313
+ elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
4314
+ border_style_r = strcmp(v, border_styles)[0]
4315
+ elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
4316
+ border_style_t = strcmp(v, border_styles)[0]
4317
+ elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
4318
+ border_style_b = strcmp(v, border_styles)[0]
4319
+ elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
4320
+ border_style_d = strcmp(v, border_styles)[0]
4321
+ elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
4322
+ border_style_o = strcmp(v, border_styles)[0]
4323
+ elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
4324
+ border_style_v = strcmp(v, border_styles)[0]
4325
+ elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
4326
+ border_style_h = strcmp(v, border_styles)[0]
4327
+ # * apply border config
4328
+ border = Border(
4329
+ left=Side(border_style=border_style_l, color=border_color_l),
4330
+ right=Side(border_style=border_style_r, color=border_color_r),
4331
+ top=Side(border_style=border_style_t, color=border_color_t),
4332
+ bottom=Side(border_style=border_style_b, color=border_color_b),
4333
+ diagonal=Side(border_style=border_style_d, color=border_color_d),
4334
+ diagonal_direction=0,
4335
+ outline=Side(border_style=border_style_o, color=border_color_o),
4336
+ vertical=Side(border_style=border_style_v, color=border_color_v),
4337
+ horizontal=Side(border_style=border_style_h, color=border_color_h),
4338
+ )
4339
+
4340
+ #! final apply configs
4341
+ for row in ws[cell_range]:
4342
+ for cell_ in row:
4343
+ if cell_font:
4344
+ cell_.font = cell_font
4345
+ if cell_fill:
4346
+ cell_.fill = cell_fill
4347
+ if cell_alignment:
4348
+ cell_.alignment = cell_alignment
4349
+ if border:
4350
+ cell_.border = border
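# A sketch of the cell-style dictionary apply_format consumes; the key names come from the
# kws_font / kws_fill / kws_align / kws_border lists above, while the concrete values and the
# direct call are hypothetical:
# cell_style = {
#     "font": {"name": "Arial", "size": 12, "bold": True, "color": "333333"},
#     "fill": {"color": "FFF2CC", "fill_type": "solid"},
#     "alignment": {"ha": "center", "va": "center", "wrap": True},
#     "border": {"color": "000000", "style": "thin"},
# }
# apply_format(ws, cell_style, "A1:D10")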
4260
4351
  if not isinstance(df, pd.DataFrame):
4261
4352
  try:
4262
4353
  print(f"is loading file {os.path.basename(df)}")
@@ -4602,12 +4693,10 @@ format_excel(
4602
4693
  print(f"Formatted Excel file saved as:\n{filename}")
4603
4694
 
4604
4695
 
4605
- from IPython.display import display, HTML, Markdown
4606
-
4607
-
4608
4696
  def preview(var):
4609
4697
  """Master function to preview formatted variables in Jupyter."""
4610
-
4698
+ from bs4 import BeautifulSoup
4699
+ from IPython.display import display, HTML, Markdown
4611
4700
  if isinstance(var, str):
4612
4701
  if isa(var, "html"):
4613
4702
  display(HTML(var)) # Render as HTML
@@ -4624,6 +4713,7 @@ def preview(var):
4624
4713
  display(var)
4625
4714
 
4626
4715
  elif isinstance(var, list) or isinstance(var, dict):
4716
+ import json
4627
4717
  # Display JSON
4628
4718
  json_str = json.dumps(var, indent=4)
4629
4719
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4637,6 +4727,7 @@ def preview(var):
4637
4727
  display(Image(filename=var))
4638
4728
 
4639
4729
  elif isinstance(var, dict):
4730
+ import json
4640
4731
  # Handle dictionary formatting
4641
4732
  json_str = json.dumps(var, indent=4)
4642
4733
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4644,13 +4735,154 @@ def preview(var):
4644
4735
  else:
4645
4736
  # If the format is not recognized, print a message
4646
4737
  print("Format not recognized or unsupported.")
4647
-
4648
-
4649
4738
  # # Example usages:
4650
4739
  # preview("This is a plain text message.")
4651
4740
  # preview("# This is a Markdown header")
4652
4741
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4653
4742
  # preview({"key": "value", "numbers": [1, 2, 3]})
4743
+
4744
+ def _df_outlier(
4745
+ data,
4746
+ columns=None,
4747
+ method=["zscore", "iqr", "percentile", "iforest"],
4748
+ min_outlier_method=3, # minimum number of methods that must flag a row as an outlier
4749
+ zscore_threshold=3,
4750
+ iqr_threshold=1.5,
4751
+ lower_percentile=5,
4752
+ upper_percentile=95,
4753
+ ):
4754
+ from scipy.stats import zscore
4755
+ from sklearn.ensemble import IsolationForest
4756
+ from sklearn.preprocessing import StandardScaler
4757
+
4758
+ col_names_org = data.columns.tolist()
4759
+ index_names_org = data.index.tolist()
4760
+ # Separate numeric and non-numeric columns
4761
+ numeric_data = data.select_dtypes(include=[np.number])
4762
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
4763
+
4764
+ if columns is not None:
4765
+ numeric_data = numeric_data[columns]
4766
+ elif numeric_data.empty:
4767
+ raise ValueError("Input data must contain numeric columns.")
4768
+
4769
+ outliers_df = pd.DataFrame(index=numeric_data.index)
4770
+ if isinstance(method, str):
4771
+ method = [method]
4772
+
4773
+ # Z-score method
4774
+ if "zscore" in method:
4775
+ z_scores = np.abs(zscore(numeric_data))
4776
+ outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
4777
+
4778
+ # IQR method
4779
+ if "iqr" in method:
4780
+ Q1 = numeric_data.quantile(0.25)
4781
+ Q3 = numeric_data.quantile(0.75)
4782
+ IQR = Q3 - Q1
4783
+ lower_bound = Q1 - iqr_threshold * IQR
4784
+ upper_bound = Q3 + iqr_threshold * IQR
4785
+ outliers_df["iqr"] = (
4786
+ (numeric_data < lower_bound) | (numeric_data > upper_bound)
4787
+ ).any(axis=1)
4788
+
4789
+ # Percentile method
4790
+ if "percentile" in method:
4791
+ lower_bound = numeric_data.quantile(lower_percentile / 100)
4792
+ upper_bound = numeric_data.quantile(upper_percentile / 100)
4793
+ outliers_df["percentile"] = (
4794
+ (numeric_data < lower_bound) | (numeric_data > upper_bound)
4795
+ ).any(axis=1)
4796
+
4797
+ # Isolation Forest method
4798
+ if "iforest" in method:
4799
+ # iforest method cannot handle NaNs, then fillna with mean
4800
+ numeric_data_ = numeric_data.fillna(numeric_data.mean())
4801
+ scaler = StandardScaler()
4802
+ scaled_data = scaler.fit_transform(numeric_data_)
4803
+ iso_forest = IsolationForest(contamination=0.05)
4804
+ outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
4805
+
4806
+ # Combine all outlier detections
4807
+ if len(method) == 4: # all four methods are used
4808
+ outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
4809
+ else:
4810
+ outliers_df["outlier"] = outliers_df.any(axis=1)
4811
+
4812
+ # Handling Outliers: Remove or Winsorize or Replace with NaN
4813
+ processed_data = numeric_data.copy()
4814
+
4815
+ processed_data.loc[outliers_df["outlier"]] = np.nan
4816
+
4817
+ return processed_data
4818
+
4819
+
4820
+ def df_outlier(
4821
+ data,
4822
+ columns=None,
4823
+ method=["zscore", "iqr", "percentile", "iforest"],
4824
+ min_outlier_method=2, # minimum number of methods that must flag a row as an outlier
4825
+ zscore_threshold=3,
4826
+ iqr_threshold=1.5,
4827
+ lower_percentile=5,
4828
+ upper_percentile=95,
4829
+ ):
4830
+ """
4831
+ Usage:
4832
+ data_out = df_outlier(
4833
+ data,
4834
+ columns=["income"],
4835
+ method="iforest",
4836
+ min_outlier_method=1)
4837
+
4838
+ Advanced outlier detection and handling function.
4839
+
4840
+ Parameters:
4841
+ - data: DataFrame, the input data (numerical).
4842
+ - method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
4843
+ - zscore_threshold: float, threshold for Z-score outlier detection (default 3).
4844
+ - iqr_threshold: float, threshold for IQR method (default 1.5).
4845
+ - lower_percentile: float, lower percentile for percentile-based outliers (default 5).
4846
+ - upper_percentile: float, upper percentile for percentile-based outliers (default 95).
4847
+ - keep_nan: bool, whether to replace outliers with NaN (default True).
4848
+ - plot: bool, whether to visualize the outliers (default False).
4849
+ - min_outlier_method: int, minimum number of methods that need to flag a row as an outlier (default 2).
4850
+ - inplace: bool, whether to modify the original `data` DataFrame (default False).
4851
+
4852
+ Returns:
4853
+ - processed_data: DataFrame in which values flagged as outliers are replaced with NaN; non-numeric columns are kept unchanged.
4854
+ """
4855
+ col_names_org = data.columns.tolist()
4856
+ index_names_org = data.index.tolist()
4857
+
4858
+ numeric_data = data.select_dtypes(include=[np.number])
4859
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
4860
+
4861
+ _outlier_df_tmp = pd.DataFrame()
4862
+ for col in numeric_data.columns:
4863
+ _outlier_df_tmp = pd.concat(
4864
+ [
4865
+ _outlier_df_tmp,
4866
+ _df_outlier(
4867
+ data=data,
4868
+ columns=[col],
4869
+ method=method,
4870
+ min_outlier_method=min_outlier_method, # minimum number of methods that must flag an outlier
4871
+ zscore_threshold=zscore_threshold,
4872
+ iqr_threshold=iqr_threshold,
4873
+ lower_percentile=lower_percentile,
4874
+ upper_percentile=upper_percentile,
4875
+ ),
4876
+ ],
4877
+ axis=1,
4878
+ # join="inner",
4879
+ )
4880
+ processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
4881
+ processed_data = processed_data[col_names_org]
4882
+ return processed_data
4883
+
4884
+
4885
+
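# A hypothetical round trip through df_outlier (frame and values invented for illustration):
# df = pd.DataFrame({"income": [3200, 3400, 3100, 250000, 2900], "city": list("ABCDE")})
# clean = df_outlier(df, columns=["income"], method=["zscore", "iqr"], min_outlier_method=1)
# the IQR rule flags 250000, so that cell is returned as NaN while "city" stays unchanged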
4654
4886
  def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
4655
4887
  """
4656
4888
  Extend a DataFrame by the list elements in the column.
@@ -5042,6 +5274,7 @@ def df_drop_duplicates(
5042
5274
  return None
5043
5275
  else:
5044
5276
  return result
5277
+ #! fillna()
5045
5278
  def df_fillna(
5046
5279
  data: pd.DataFrame,
5047
5280
  method: str = "knn",
@@ -5049,8 +5282,8 @@ def df_fillna(
5049
5282
  constant: float = None,
5050
5283
  n_neighbors: int = 5, # KNN-specific
5051
5284
  max_iter: int = 10, # Iterative methods specific
5052
- inplace: bool = True,
5053
- random_state:int = None
5285
+ inplace: bool = False,
5286
+ random_state:int = 1
5054
5287
  ) -> pd.DataFrame:
5055
5288
  """
5056
5289
  Fill missing values in a DataFrame using specified imputation method.
@@ -5078,7 +5311,18 @@ def df_fillna(
5078
5311
  inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
5079
5312
 
5080
5313
  """
5081
-
5314
+ if isinstance(data, pd.Series):
5315
+ data=pd.DataFrame(data)
5316
+ # handle None
5317
+ for col in data.columns:
5318
+ data[col] = data[col].apply(lambda x: np.nan if x is None else x)
5319
+
5320
+ col_names_org = data.columns.tolist()
5321
+ index_names_org = data.index.tolist()
5322
+ # Separate numeric and non-numeric columns
5323
+ numeric_data = data.select_dtypes(include=[np.number])
5324
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
5325
+
5082
5326
  if data.empty:
5083
5327
  raise ValueError("Input DataFrame is empty.")
5084
5328
 
@@ -5107,15 +5351,6 @@ def df_fillna(
5107
5351
  from sklearn.impute import IterativeImputer
5108
5352
 
5109
5353
  imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5110
- # elif method == "missforest":
5111
- # from missingpy import MissForest
5112
- # imputer = MissForest(max_iter=max_iter, random_state=random_state)
5113
- # elif method == "softimpute":
5114
- # from fancyimpute import SoftImpute
5115
- # imputer = SoftImpute()
5116
- # elif method == "svd":
5117
- # from fancyimpute import IterativeSVD
5118
- # imputer = IterativeSVD(max_iters=max_iter)
5119
5354
  else: # mean, median, most_frequent
5120
5355
  from sklearn.impute import SimpleImputer
5121
5356
  imputer = SimpleImputer(strategy=method)
@@ -5123,26 +5358,49 @@ def df_fillna(
5123
5358
  # Fit and transform the data
5124
5359
  if axis == 0:
5125
5360
  # Impute column-wise
5126
- imputed_data = imputer.fit_transform(data)
5127
- imputed_data.shape
5361
+ imputed_data = imputer.fit_transform(numeric_data)
5128
5362
  elif axis == 1:
5129
5363
  # Impute row-wise
5130
- imputed_data = imputer.fit_transform(data.T)
5131
- imputed_data.shape
5364
+ imputed_data = imputer.fit_transform(numeric_data.T)
5132
5365
  else:
5133
5366
  raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
5134
5367
 
5135
- df_filled = pd.DataFrame(
5368
+ imputed_data = pd.DataFrame(
5136
5369
  imputed_data if axis == 0 else imputed_data.T,
5137
- index=data.index,# if axis == 0 else data.columns,
5138
- columns=data.columns,# if axis == 0 else data.index,
5370
+ index=numeric_data.index if axis == 0 else data.columns,
5371
+ columns=numeric_data.columns if axis == 0 else data.index,
5139
5372
  )
5373
+ for col in imputed_data.select_dtypes(include=[np.number]).columns:
5374
+ imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
5375
+
5376
+ # Handle non-numeric data imputation
5377
+ if not non_numeric_data.empty:
5378
+ from sklearn.impute import SimpleImputer
5379
+ if method == "constant":
5380
+ non_numeric_imputer = SimpleImputer(strategy="constant", fill_value=constant)
5381
+ else:
5382
+ non_numeric_imputer = SimpleImputer(strategy="most_frequent")
5383
+
5384
+ # Impute non-numeric columns column-wise (axis=0)
5385
+ imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
5386
+
5387
+ # Convert imputed non-numeric array back to DataFrame with original index and column names
5388
+ imputed_non_numeric_df = pd.DataFrame(
5389
+ imputed_non_numeric, index=non_numeric_data.index, columns=non_numeric_data.columns
5390
+ )
5391
+ else:
5392
+ imputed_non_numeric_df = pd.DataFrame(index=data.index)
5393
+
5394
+
5395
+ imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(columns=data.columns)
5140
5396
 
5141
5397
  if inplace:
5142
- data.update(df_filled)
5143
- return None # replace original
5398
+ # Modify the original DataFrame
5399
+ data[:] = imputed_data[col_names_org]
5400
+ return None
5144
5401
  else:
5145
- return df_filled
5402
+ # Return the modified DataFrame
5403
+ return imputed_data[col_names_org]
5146
5404
  # # example
5147
5405
  # data = {
5148
5406
  # "A": [1, 2, np.nan, 4, 5],
@@ -5172,7 +5430,94 @@ def df_fillna(
5172
5430
  # display(df)
5173
5431
  # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
5174
5432
 
5175
-
5433
+ def df_encoder(
5434
+ data: pd.DataFrame,
5435
+ method: str = "dummy",#'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
5436
+ columns=None,
5437
+ target_column=None, # Required for 'target' encoding method
5438
+ **kwargs
5439
+ ) -> pd.DataFrame:
5440
+ """
5441
+ Methods explained:
5442
+ - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
5443
+
5444
+ - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
5445
+
5446
+ - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
5447
+
5448
+ - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
5449
+
5450
+ - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
5451
+
5452
+ - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
5453
+ """
5454
+
5455
+ # Select categorical columns
5456
+ categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
5457
+ methods = ["dummy","onehot", "ordinal", "label", "target", "binary"]
5458
+ method = strcmp(method, methods)[0]
5459
+
5460
+ if columns is None:
5461
+ columns = categorical_cols
5462
+
5463
+ # pd.get_dummies()
5464
+ if method=='dummy':
5465
+ dtype=kwargs.pop("dtype",int)
5466
+ drop_first=kwargs.pop("drop_first",True)
5467
+ try:
5468
+ encoded_df = pd.get_dummies(data[columns], drop_first=drop_first, dtype=dtype, **kwargs)
5469
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5470
+ except Exception as e:
5471
+ # print(f"Warning, 没有进行转换, 因为: {e}")
5472
+ return data
5473
+ # One-hot encoding
5474
+ elif method == "onehot":
5475
+ from sklearn.preprocessing import OneHotEncoder
5476
+
5477
+ encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
5478
+ encoded_data = encoder.fit_transform(data[columns])
5479
+ encoded_df = pd.DataFrame(
5480
+ encoded_data,
5481
+ columns=encoder.get_feature_names_out(columns),
5482
+ index=data.index,
5483
+ )
5484
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5485
+
5486
+ # Ordinal encoding
5487
+ elif method == "ordinal":
5488
+ from sklearn.preprocessing import OrdinalEncoder
5489
+
5490
+ encoder = OrdinalEncoder(**kwargs)
5491
+ encoded_data = encoder.fit_transform(data[columns])
5492
+ encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
5493
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5494
+
5495
+ # Label encoding
5496
+ elif method == "label":
5497
+ from sklearn.preprocessing import LabelEncoder
5498
+
5499
+ encoder = LabelEncoder()
5500
+ encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
5501
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5502
+
5503
+ # Target encoding (Mean of the target for each category)
5504
+ elif method == "target":
5505
+ if target_column is None:
5506
+ raise ValueError("target_column must be provided for target encoding.")
5507
+ from category_encoders import TargetEncoder
5508
+
5509
+ encoder = TargetEncoder(cols=columns, **kwargs)
5510
+ encoded_data = encoder.fit_transform(data[columns], data[target_column])
5511
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5512
+
5513
+ # Binary encoding (for high-cardinality categorical variables)
5514
+ elif method == "binary":
5515
+ from category_encoders import BinaryEncoder
5516
+
5517
+ encoder = BinaryEncoder(cols=columns, **kwargs)
5518
+ encoded_data = encoder.fit_transform(data[columns])
5519
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5520
+
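# Hypothetical usage of df_encoder on a toy frame (column names invented):
# df = pd.DataFrame({"grade": ["low", "high", "medium"], "city": ["Bonn", "Köln", "Bonn"], "y": [0, 1, 1]})
# df_encoder(df, method="dummy", columns=["city"])                      # pd.get_dummies with drop_first=True
# df_encoder(df, method="ordinal", columns=["grade"])                   # integer codes per category
# df_encoder(df, method="target", columns=["city"], target_column="y")  # mean of y per city (needs category_encoders)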
5176
5521
  def df_scaler(
5177
5522
  data: pd.DataFrame, # should be numeric dtype
5178
5523
  method="standard",
@@ -5218,9 +5563,8 @@ def df_scaler(
5218
5563
  if axis == 0:
5219
5564
  # Column-wise scaling (default)
5220
5565
  if columns is None:
5221
- columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
5566
+ columns = data.select_dtypes(include=np.number).columns.tolist()
5222
5567
  non_numeric_columns = data.columns.difference(columns)
5223
- print(f"Scaling columns")
5224
5568
 
5225
5569
  scaled_data = scaler.fit_transform(data[columns])
5226
5570
 
@@ -5242,7 +5586,7 @@ def df_scaler(
5242
5586
  # Row-wise scaling
5243
5587
  if columns is None:
5244
5588
  columns = data.index.tolist()
5245
- numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
5589
+ numeric_rows = data.loc[columns].select_dtypes(include=np.number)
5246
5590
  if numeric_rows.empty:
5247
5591
  raise ValueError("No numeric rows to scale.")
5248
5592
 
@@ -5259,7 +5603,31 @@ def df_scaler(
5259
5603
  scaled_df = data.copy()
5260
5604
  scaled_df.loc[numeric_rows.index] = scaled_data
5261
5605
  return scaled_df
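# Hypothetical usage of df_scaler, assuming the columns/axis arguments used in the body are
# exposed in the signature: scale only the listed numeric columns, column-wise.
# scaled = df_scaler(data=df, method="standard", columns=["income", "age"], axis=0)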
5606
+ def df_special_characters_cleaner(
5607
+ data: pd.DataFrame, where=["column", "content", "index"]
5608
+ ) -> pd.DataFrame:
5609
+ """
5610
+ clean special characters from column names, cell contents, and/or the index:
5611
+ usage:
5612
+ df_special_characters_cleaner(data=df, where='column')
5613
+ """
5614
+ if not isinstance(where, list):
5615
+ where = [where]
5616
+ where_to_clean = ["column", "content", "index"]
5617
+ where_ = [strcmp(i, where_to_clean)[0] for i in where]
5618
+
5619
+ # 1. Clean column names by replacing special characters with underscores
5620
+ if "column" in where_:
5621
+ data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
5622
+
5623
+ # 2. Clean only object-type columns (text columns)
5624
+ if "content" in where_:
5625
+ for col in data.select_dtypes(include=["object"]).columns:
5626
+ data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
5627
+ if data.index.dtype == "object" and index in where_:
5628
+ data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
5262
5629
 
5630
+ return data
5263
5631
  def df_cluster(
5264
5632
  data: pd.DataFrame,
5265
5633
  columns: Optional[list] = None,
@@ -5268,8 +5636,8 @@ def df_cluster(
5268
5636
  scale: bool = True,
5269
5637
  plot: Union[str, list] = "all",
5270
5638
  inplace: bool = True,
5271
- ax: Optional[plt.Axes] = None,
5272
- ) -> tuple[pd.DataFrame, int, Optional[plt.Axes]]:
5639
+ ax = None,
5640
+ ):
5273
5641
  from sklearn.preprocessing import StandardScaler
5274
5642
  from sklearn.cluster import KMeans
5275
5643
  from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5277,7 +5645,6 @@ def df_cluster(
5277
5645
  import numpy as np
5278
5646
  import pandas as pd
5279
5647
  import matplotlib.pyplot as plt
5280
- import seaborn as sns
5281
5648
 
5282
5649
  """
5283
5650
  Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5585,94 +5952,61 @@ def df_reducer(
5585
5952
  umap_neighbors: int = 15, # UMAP-specific
5586
5953
  umap_min_dist: float = 0.1, # UMAP-specific
5587
5954
  tsne_perplexity: int = 30, # t-SNE-specific
5955
+ hue:str = None,# lda-specific
5588
5956
  scale: bool = True,
5589
5957
  fill_missing: bool = True,
5590
5958
  debug: bool = False,
5591
5959
  inplace: bool = True, # replace the original data
5592
5960
  plot_: bool = False,  # plot a scatterplot of the first two components; without 'hue' it is not very informative
5593
- ) -> pd.DataFrame:
5594
- """
5595
- Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
5596
- method:
5597
- 1. 'umap':
5598
- - big dataset and global structure, often preferred in large-scale datasets for
5599
- visualization and dimensionality reduction, balancing speed and quality of visualization.
5600
- - t-SNE excels at preserving local structure (i.e., clusters), but it often loses global
5601
- relationships, causing clusters to appear in arbitrary proximities to each other.
5602
- 2. 'pca':
5603
- - t-SNE excels at preserving local structure (i.e., clusters), but it often loses global
5604
- relationships, causing clusters to appear in arbitrary proximities to each other.
5605
- - useful as a preprocessing step and in datasets where linear relationships dominate.
5606
- 3. 't-SNE':
5607
- a. t-SNE excels at preserving local structure (i.e., clusters), but it often loses global
5608
- relationships, causing clusters to appear in arbitrary proximities to each other.
5609
- b. often preferred in large-scale datasets for visualization and dimensionality
5610
- reduction, balancing speed and quality of visualization.
5611
- Parameters:
5612
- -----------
5613
- data : pd.DataFrame
5614
- The input DataFrame (samples x features).
5615
-
5616
- columns : List[str], optional
5617
- List of column names to reduce. If None, all columns are used.
5618
-
5619
- method : str, optional, default="umap"
5620
- Dimensionality reduction method, either "pca" or "umap".
5621
-
5622
- n_components : int, optional, default=50
5623
- Number of components for PCA or UMAP.
5624
-
5625
- umap_neighbors : int, optional, default=15
5626
- Number of neighbors considered for UMAP embedding.
5627
-
5628
- umap_min_dist : float, optional, default=0.1
5629
- Minimum distance between points in UMAP embedding.
5630
-
5631
- scale : bool, optional, default=True
5632
- Whether to scale the data using StandardScaler.
5633
-
5634
- fill_missing : bool, optional, default=True
5635
- Whether to fill missing values using the mean before applying PCA/UMAP.
5961
+ random_state=1,
5962
+ ax = None,
5963
+ figsize=None,
5964
+ **kwargs
5965
+ ) -> pd.DataFrame:
5966
+ dict_methods = {
5967
+ #!Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
5968
+ "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
5969
+ "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
5970
+ "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
5971
+ "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
5972
+
5973
+ #! Non-linear Dimensionality Reduction (Manifold Learning)
5974
+ "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
5975
+ "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global. relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
5976
+ "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
5977
+ "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
5978
+ "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
5979
+ "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing).is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
5980
+
5981
+ #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
5982
+ "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
5983
+ }
5636
5984
 
5637
- Returns:
5638
- --------
5639
- reduced_df : pd.DataFrame
5640
- DataFrame with the reduced dimensions.
5641
- """
5642
-
5643
- """
5644
- PCA: explained_variance:
5645
- indicates the proportion of the dataset's total variance that each principal
5646
- component (PC) explains. It gives you a sense of how much information
5647
- (or variance) is captured by each PC
5648
- Interpretation:
5649
- - Higher values indicate that the corresponding PC captures more variance.
5650
- - The sum of the explained variances for all PCs equals 1 (or 100%).
5651
- - If the first few components explain a high percentage (e.g., 90%),
5652
- it means you can reduce the dimensionality of the data significantly without losing much information.
5653
- Use case:
5654
- You may plot a scree plot, which shows the explained variance for each PC, to help decide
5655
- how many components to keep for analysis.
5656
-
5657
- PCA: Singular values:
5658
- represent the magnitude of variance along each principal component. Mathematically,
5659
- they are the square roots of the eigenvalues of the covariance matrix.
5660
- Interpretation:
5661
- Larger singular values indicate that the associated PC captures more variance.
5662
- Singular values are related to the scale of the data. If the data are scaled
5663
- before PCA (e.g., standardized), then the singular values will provide a measure
5664
- of the spread of data along each PC.
5665
- Use case:
5666
- Singular values help quantify the contribution of each principal component in a
5667
- similar way to the explained variance. They are useful in understanding the overall
5668
- structure of the data.
5669
- """
5670
5985
  from sklearn.preprocessing import StandardScaler
5671
5986
  from sklearn.impute import SimpleImputer
5672
-
5673
- # Select columns if specified, else use all columns
5674
- X = data[columns].values if columns else data.values
5675
- print(X.shape,type(X))
5987
+ if plot_:
5988
+ import matplotlib.pyplot as plt
5989
+ import seaborn as sns
5990
+ # Check valid method input
5991
+ methods=["pca", "umap","tsne","factor","isolation_forest","lda","kpca","ica","mds","lle","svd"]
5992
+ method=strcmp(method, methods)[0]
5993
+ print(f"\nprocessing with using {dict_methods[method]}:")
5994
+ xlabel,ylabel=None,None
5995
+ if columns is None:
5996
+ columns = data.select_dtypes(include='number').columns.tolist()
5997
+ if hue is None:
5998
+ hue = data.select_dtypes(exclude='number').columns.tolist()
5999
+ if isinstance(hue, list) and hue:  # guard: hue may be an empty list if there are no non-numeric columns
6000
+ print("Warning: hue is a list, only select the 1st one")
6001
+ hue=hue[0]
6002
+ if not hue:
6003
+ # Select columns if specified, else use all columns
6004
+ X = data[columns].values if columns else data.values
6005
+ else:
6006
+ # Select columns to reduce and hue for LDA
6007
+ X = data[columns].values if columns else data.drop(columns=[hue]).values
6008
+ y = data[hue].values
6009
+ print(X.shape)
5676
6010
  # Handle missing values
5677
6011
  if fill_missing:
5678
6012
  imputer = SimpleImputer(strategy="mean")
@@ -5683,9 +6017,6 @@ def df_reducer(
5683
6017
  scaler = StandardScaler()
5684
6018
  X = scaler.fit_transform(X)
5685
6019
 
5686
- # Check valid method input
5687
- methods=["pca", "umap","tsne","factor","isolation_forest"]
5688
- method=strcmp(method, methods)[0]
5689
6020
  # Apply PCA if selected
5690
6021
  if method == "pca":
5691
6022
  from sklearn.decomposition import PCA
@@ -5729,7 +6060,27 @@ def df_reducer(
5729
6060
  pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
5730
6061
  for i in range(n_components):
5731
6062
  pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
6063
+ if hue:
6064
+ pca_df[hue]=y
6065
+ elif method =='lda':
6066
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
6067
+
6068
+ if "hue" not in locals() or hue is None:
6069
+ raise ValueError("LDA requires a 'hue' col parameter to specify class labels.")
5732
6070
 
6071
+ lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
6072
+ X_reduced = lda_reducer.fit_transform(X, y)
6073
+
6074
+ # Prepare reduced DataFrame with additional LDA info
6075
+ lda_df = pd.DataFrame(
6076
+ X_reduced, index=data.index,
6077
+ columns=[f"LDA_{i+1}" for i in range(n_components)]
6078
+ )
6079
+ if debug:
6080
+ print(f"LDA completed: Reduced to {n_components} components.")
6081
+ print("Class separability achieved by LDA.")
6082
+ if hue:
6083
+ lda_df[hue]=y
5733
6084
  # Apply UMAP if selected
5734
6085
  elif method == "umap":
5735
6086
  import umap
@@ -5756,32 +6107,36 @@ def df_reducer(
5756
6107
  )
5757
6108
  umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
5758
6109
  umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
6110
+ if hue:
6111
+ umap_df[hue]=y
5759
6112
  elif method == "tsne":
5760
6113
  from sklearn.manifold import TSNE
5761
- tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
5762
- X_reduced = tsne.fit_transform(X)
5763
-
5764
- # Prepare reduced DataFrame with additional t-SNE info
6114
+ tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=random_state)
6115
+ X_reduced = tsne.fit_transform(X)
5765
6116
  tsne_df = pd.DataFrame(
5766
- X_reduced, index=data.index,
6117
+ X_reduced,
6118
+ index=data.index,
5767
6119
  columns=[f"tSNE_{i+1}" for i in range(n_components)]
5768
6120
  )
5769
6121
  tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
5770
-
6122
+ if hue:
6123
+ tsne_df[hue]=y
5771
6124
  # Apply Factor Analysis if selected
5772
6125
  elif method == "factor":
5773
6126
  from sklearn.decomposition import FactorAnalysis
5774
- factor = FactorAnalysis(n_components=n_components, random_state=1)
6127
+ factor = FactorAnalysis(n_components=n_components, random_state=random_state)
5775
6128
  X_reduced = factor.fit_transform(X)
5776
6129
  # Factor Analysis does not directly provide explained variance, but we can approximate it
5777
6130
  fa_variance = factor.noise_variance_
5778
6131
  # Prepare reduced DataFrame with additional Factor Analysis info
5779
6132
  factor_df = pd.DataFrame(
5780
- X_reduced, index=data.index,
6133
+ X_reduced,
6134
+ index=data.index,
5781
6135
  columns=[f"Factor_{i+1}" for i in range(n_components)]
5782
6136
  )
5783
6137
  factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
5784
-
6138
+ if hue:
6139
+ factor_df[hue]=y
5785
6140
  # Apply Isolation Forest for outlier detection if selected
5786
6141
  elif method == "isolation_forest":
5787
6142
  from sklearn.decomposition import PCA
@@ -5812,48 +6167,100 @@ def df_reducer(
5812
6167
  iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
5813
6168
  for i in range(n_components):
5814
6169
  iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
6170
+ if hue:
6171
+ iso_forest_df[hue]=y
6172
+ #* Apply Kernel PCA if selected
6173
+ elif method == "kpca":
6174
+ from sklearn.decomposition import KernelPCA
6175
+ kpca = KernelPCA(n_components=n_components, kernel="rbf", random_state=random_state)
6176
+ X_reduced = kpca.fit_transform(X)
6177
+
6178
+ # Prepare reduced DataFrame with KPCA info
6179
+ kpca_df = pd.DataFrame(
6180
+ X_reduced,
6181
+ index=data.index,
6182
+ columns=[f"KPCA_{i+1}" for i in range(n_components)]
6183
+ )
6184
+ if debug:
6185
+ print("Kernel PCA completed with RBF kernel.")
6186
+ if hue:
6187
+ kpca_df[hue]=y
6188
+ #* Apply ICA if selected
6189
+ elif method == "ica":
6190
+ from sklearn.decomposition import FastICA
6191
+ ica = FastICA(n_components=n_components, random_state=random_state)
6192
+ X_reduced = ica.fit_transform(X)
6193
+
6194
+ # Prepare reduced DataFrame with ICA info
6195
+ ica_df = pd.DataFrame(
6196
+ X_reduced, index=data.index,
6197
+ columns=[f"ICA_{i+1}" for i in range(n_components)]
6198
+ )
6199
+ if debug:
6200
+ print("Independent Component Analysis (ICA) completed.")
6201
+ if hue:
6202
+ ica_df[hue]=y
6203
+ #* Apply MDS if selected
6204
+ elif method == "mds":
6205
+ from sklearn.manifold import MDS
6206
+ mds = MDS(n_components=n_components, random_state=random_state)
6207
+ X_reduced = mds.fit_transform(X)
6208
+
6209
+ # Prepare reduced DataFrame with MDS info
6210
+ mds_df = pd.DataFrame(
6211
+ X_reduced, index=data.index,
6212
+ columns=[f"MDS_{i+1}" for i in range(n_components)]
6213
+ )
6214
+ if debug:
6215
+ print("Multidimensional Scaling (MDS) completed.")
6216
+ if hue:
6217
+ mds_df[hue]=y
6218
+ #* Apply Locally Linear Embedding (LLE) if selected
6219
+ elif method == "lle":
6220
+ from sklearn.manifold import LocallyLinearEmbedding
6221
+ lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=umap_neighbors, random_state=random_state)
6222
+ X_reduced = lle.fit_transform(X)
6223
+
6224
+ # Prepare reduced DataFrame with LLE info
6225
+ lle_df = pd.DataFrame(
6226
+ X_reduced, index=data.index,
6227
+ columns=[f"LLE_{i+1}" for i in range(n_components)]
6228
+ )
6229
+ if debug:
6230
+ print("Locally Linear Embedding (LLE) completed.")
6231
+ if hue:
6232
+ lle_df[hue]=y
6233
+ #* Apply Singular Value Decomposition (SVD) if selected
6234
+ elif method == "svd":
6235
+ # Using NumPy's SVD for dimensionality reduction
6236
+ U, s, Vt = np.linalg.svd(X, full_matrices=False)
6237
+ X_reduced = U[:, :n_components] * s[:n_components]
6238
+
6239
+ # Prepare reduced DataFrame with SVD info
6240
+ svd_df = pd.DataFrame(
6241
+ X_reduced, index=data.index,
6242
+ columns=[f"SVD_{i+1}" for i in range(n_components)]
6243
+ )
6244
+ if hue:
6245
+ svd_df[hue]=y
6246
+ if debug:
6247
+ print("Singular Value Decomposition (SVD) completed.")
5815
6248
 
5816
6249
  # Return reduced data and info as a new DataFrame with the same index
5817
6250
  if method == "pca":
5818
6251
  reduced_df = pca_df
5819
6252
  colname_met = "PC_"
5820
- if plot_:
5821
- sns.scatterplot(
5822
- data=pca_df,
5823
- x="PC_1",
5824
- y="PC_2",
5825
- # hue="condition",
5826
- )
6253
+ xlabel= f"PC_1 ({pca_df["Explained Variance PC_1"].tolist()[0]})"
6254
+ ylabel= f"PC_2 ({pca_df["Explained Variance PC_2"].tolist()[0]})"
5827
6255
  elif method == "umap":
5828
6256
  reduced_df = umap_df
5829
- colname_met = "UMAP_"
5830
- if plot_:
5831
- sns.scatterplot(
5832
- data=umap_df,
5833
- x="UMAP_1",
5834
- y="UMAP_2",
5835
- # hue="condition",
5836
- )
6257
+ colname_met = "UMAP_"
5837
6258
  elif method == "tsne":
5838
6259
  reduced_df = tsne_df
5839
- colname_met = "t-SNE_"
5840
- if plot_:
5841
- sns.scatterplot(
5842
- data=tsne_df,
5843
- x="tSNE_1",
5844
- y="tSNE_2",
5845
- # hue="batch",
5846
- )
6260
+ colname_met = "tSNE_"
5847
6261
  elif method == "factor":
5848
6262
  reduced_df = factor_df
5849
- colname_met = "Factor_"
5850
- if plot_:
5851
- sns.scatterplot(
5852
- data=factor_df,
5853
- x="Factor_1",
5854
- y="Factor_2",
5855
- # hue="batch",
5856
- )
6263
+ colname_met = "Factor_"
5857
6264
  elif method == "isolation_forest":
5858
6265
  reduced_df = iso_forest_df # Already a DataFrame for outliers
5859
6266
  colname_met = "PC_"
@@ -5872,33 +6279,71 @@ def df_reducer(
5872
6279
  c="r",
5873
6280
  label="outlier", marker="+", s=30,
5874
6281
  )
5875
-
6282
+ elif method=='lda':
6283
+ reduced_df=lda_df
6284
+ colname_met="LDA_"
6285
+ elif method=="kpca":
6286
+ reduced_df=kpca_df
6287
+ colname_met="KPCA_"
6288
+ elif method=="ica":
6289
+ reduced_df=ica_df
6290
+ colname_met="ICA_"
6291
+ elif method=="mds":
6292
+ reduced_df=mds_df
6293
+ colname_met="MDS_"
6294
+ elif method=="lle":
6295
+ reduced_df=lle_df
6296
+ colname_met="LLE_"
6297
+ elif method=="svd":
6298
+ reduced_df=svd_df
6299
+ colname_met="SVD_"
6300
+ # Quick plots
6301
+ if plot_ and (method not in ["isolation_forest"]):
6302
+ from .plot import plotxy
6303
+ if ax is None:
6304
+ if figsize is None:
6305
+ _, ax = plt.subplots(figsize=cm2inch(8,8))
6306
+ else:
6307
+ _, ax = plt.subplots(figsize=figsize)
6308
+ else:
6309
+ ax.cla()  # cla() clears the axes in place and returns None, so do not reassign
6310
+ ax=plotxy(data=reduced_df,
6311
+ x=colname_met+"1",
6312
+ y=colname_met+"2",
6313
+ hue=hue,
6314
+ s=1,
6315
+ edgecolor='none',
6316
+ kind='scatter',
6317
+ figsets=dict(legend=dict(loc='best',markerscale=4),
6318
+ xlabel=xlabel if xlabel else None,
6319
+ ylabel=ylabel if ylabel else None),
6320
+ ax=ax,
6321
+ verbose=False,
6322
+ **kwargs
6323
+ )
5876
6324
 
5877
6325
  if inplace:
5878
6326
  # If inplace=True, add components back into the original data
5879
6327
  for col_idx in range(n_components):
5880
- data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
6328
+ data.loc[:,f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5881
6329
  # Add extra info for PCA/UMAP
5882
6330
  if method == "pca":
5883
6331
  for i in range(n_components):
5884
- data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
6332
+ data.loc[:,f"Explained Variance PC_{i+1}"] = reduced_df.loc[:,f"Explained Variance PC_{i+1}"]
5885
6333
  for i in range(n_components):
5886
- data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
6334
+ data.loc[:,f"Singular Values PC_{i+1}"] = reduced_df.loc[:,f"Singular Values PC_{i+1}"]
5887
6335
  elif method == "umap":
5888
6336
  for i in range(n_components):
5889
- data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
5890
- data["Embedding"] = reduced_df["Embedding"]
5891
- data["Trustworthiness"] = reduced_df["Trustworthiness"]
6337
+ data.loc[:,f"UMAP_{i+1}"]=reduced_df.loc[:,f"UMAP_{i+1}"]
6338
+ data.loc[:,"Embedding"] = reduced_df.loc[:,"Embedding"]
6339
+ data.loc[:,"Trustworthiness"] = reduced_df.loc[:,"Trustworthiness"]
6340
+
5892
6341
  return None # No return when inplace=True
5893
-
5894
6342
 
5895
6343
  return reduced_df
5896
-
5897
-
5898
6344
  # example:
5899
6345
  # df_reducer(data=data_log, columns=markers, n_components=2)
5900
6346
 
5901
-
5902
6347
  def plot_cluster(
5903
6348
  data: pd.DataFrame,
5904
6349
  labels: np.ndarray,
@@ -5922,7 +6367,7 @@ def plot_cluster(
5922
6367
  """
5923
6368
  import seaborn as sns
5924
6369
  from sklearn.metrics import silhouette_samples
5925
-
6370
+ import matplotlib.pyplot as plt
5926
6371
  if metrics is None:
5927
6372
  metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
5928
6373