py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
py2ls/ips.py CHANGED
@@ -1,55 +1,8 @@
1
1
  import numpy as np
2
2
  import pandas as pd
3
-
4
- import json
5
- import matplotlib
6
- import matplotlib.pyplot as plt
7
- import matplotlib.ticker as tck
8
- from cycler import cycler
9
- from mpl_toolkits.mplot3d import Axes3D
10
- import seaborn as sns
11
-
12
- from sklearn.kernel_approximation import KERNEL_PARAMS
13
- from sympy import is_increasing
14
- import sys, os, shutil, re, yaml, json, subprocess
15
- import importlib.util
16
- import time
17
- from dateutil import parser
18
- from datetime import datetime
19
- import schedule
20
-
21
- from PIL import Image, ImageEnhance, ImageOps, ImageFilter
22
- from rembg import remove, new_session
23
-
24
- import docx
25
- from fpdf import FPDF
26
- from lxml import etree
27
- from docx import Document
28
- from PyPDF2 import PdfReader
29
- from pptx import Presentation
30
- from pptx.util import Inches
31
- from pdf2image import convert_from_path, pdfinfo_from_path
32
- from nltk.tokenize import sent_tokenize, word_tokenize
33
- import nltk # nltk.download("punkt")
34
- from docx2pdf import convert
35
- import img2pdf as image2pdf
36
- import nbformat
37
- from nbconvert import MarkdownExporter
38
-
39
- from itertools import pairwise
40
- from box import Box, BoxList
41
- from numerizer import numerize
42
- from tqdm import tqdm
43
- import mimetypes
44
- from pprint import pp
45
- from collections import Counter
46
- from fuzzywuzzy import fuzz, process
47
- from langdetect import detect
48
- from duckduckgo_search import DDGS
3
+ import sys, os
4
+ from IPython.display import display
49
5
  from typing import List, Optional, Union
50
- from bs4 import BeautifulSoup
51
-
52
- from . import netfinder
53
6
 
54
7
  try:
55
8
  get_ipython().run_line_magic("load_ext", "autoreload")
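In 0.2.4.9 the long module-level import block above is removed and the heavy dependencies are imported inside the functions that need them, so `import py2ls.ips` only pulls in numpy, pandas, and IPython's display up front. A minimal sketch of that lazy-import pattern (hypothetical function name, not from the package):

def docx2pdf_sketch(src, dst=None):
    # the heavy dependency is imported only when the function is actually called
    from docx2pdf import convert
    convert(src, dst) if dst else convert(src)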
@@ -57,6 +10,35 @@ try:
57
10
  except NameError:
58
11
  pass
59
12
 
13
+ import warnings
14
+
15
+ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
16
+ warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
17
+
18
+
19
+ def run_once_within(duration=60): # default 60s
20
+ import time
21
+
22
+ """
23
+ usage:
24
+ if run_once_within():
25
+ print("This code runs once per minute.")
26
+ else:
27
+ print("The code has already been run in the last minute.")
28
+ """
29
+ if not hasattr(run_once_within, "time_last"):
30
+ run_once_within.time_last = None
31
+ time_curr = time.time()
32
+
33
+ if (run_once_within.time_last is None) or (
34
+ time_curr - run_once_within.time_last >= duration
35
+ ):
36
+ run_once_within.time_last = time_curr # Update the last execution time
37
+ return True
38
+ else:
39
+ return False
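A minimal usage sketch for the new run_once_within() guard shown above, assuming py2ls 0.2.4.9 is installed:

from py2ls.ips import run_once_within

for i in range(3):
    if run_once_within(duration=60):   # True only on the first call within any 60 s window
        print("printing the usage hint")
    else:
        print("hint already shown recently")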
40
+
41
+
60
42
  def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
61
43
  """
62
44
  Add the Chinese (default) font to the font manager
@@ -66,13 +48,14 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
66
48
  """
67
49
  import matplotlib.pyplot as plt
68
50
  from matplotlib import font_manager
69
- slashtype = "/" if 'mac' in get_os() else "\\"
51
+
52
+ slashtype = "/" if "mac" in get_os() else "\\"
70
53
  if slashtype in dir_font:
71
54
  font_manager.fontManager.addfont(dir_font)
72
55
  fontname = os.path.basename(dir_font).split(".")[0]
73
56
  else:
74
57
  if "cn" in dir_font.lower() or "ch" in dir_font.lower():
75
- fontname = "Hiragino Sans GB" # default Chinese font
58
+ fontname = "Hiragino Sans GB" # default Chinese font
76
59
  else:
77
60
  fontname = dir_font
78
61
 
@@ -86,6 +69,7 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
86
69
  plt.rcParams["font.sans-serif"] = ["Arial"]
87
70
  return fontname
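Usage sketch for plt_font(), assuming py2ls 0.2.4.9 on macOS (the default font path above is macOS-specific):

from py2ls.ips import plt_font

fontname = plt_font("/System/Library/Fonts/Hiragino Sans GB.ttc")
print(fontname)   # "Hiragino Sans GB" once the font file has been registered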
88
71
 
72
+
89
73
  # set 'dir_save'
90
74
  if "dar" in sys.platform:
91
75
  dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
@@ -155,6 +139,9 @@ def run_every(when: str = None, job=None, wait: int = 60):
155
139
  :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
156
140
  :param job: The function to be scheduled.
157
141
  """
142
+ import schedule
143
+ import time
144
+
158
145
  if job is None:
159
146
  print("No job provided!")
160
147
  return
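Usage sketch for run_every(), which now imports schedule lazily; the call itself is left commented out because it blocks:

from py2ls.ips import run_every

def heartbeat():
    print("still alive")

# run_every(when="2 minutes", job=heartbeat)   # blocks, calling heartbeat() every 2 minutes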
@@ -200,6 +187,9 @@ def run_at(when: str, job=None, wait: int = 60):
200
187
  :param job: The function to be scheduled.
201
188
  :param wait: The sleep interval between checks in seconds.
202
189
  """
190
+ from datetime import datetime
191
+ import time
192
+
203
193
  if job is None:
204
194
  print("No job provided!")
205
195
  return
@@ -279,11 +269,13 @@ def get_timezone(timezone: str | list = None):
279
269
 
280
270
  def is_package_installed(package_name):
281
271
  """Check if a package is installed."""
272
+ import importlib.util
273
+
282
274
  package_spec = importlib.util.find_spec(package_name)
283
275
  return package_spec is not None
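Usage sketch for is_package_installed(), which wraps importlib.util.find_spec():

from py2ls.ips import is_package_installed

print(is_package_installed("pandas"))          # True in any environment where py2ls runs
print(is_package_installed("no_such_module"))  # False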
284
276
 
285
277
 
286
- def upgrade(module="py2ls",uninstall=False):
278
+ def upgrade(module="py2ls", uninstall=False):
287
279
  """
288
280
  Installs or upgrades a specified Python module.
289
281
 
@@ -291,6 +283,8 @@ def upgrade(module="py2ls",uninstall=False):
291
283
  module (str): The name of the module to install/upgrade.
292
284
  uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
293
285
  """
286
+ import subprocess
287
+
294
288
  if not is_package_installed(module):
295
289
  try:
296
290
  subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +321,8 @@ def get_version(pkg):
327
321
 
328
322
 
329
323
  def rm_folder(folder_path, verbose=True):
324
+ import shutil
325
+
330
326
  try:
331
327
  shutil.rmtree(folder_path)
332
328
  if verbose:
@@ -345,6 +341,8 @@ def fremove(path, verbose=True):
345
341
  """
346
342
  try:
347
343
  if os.path.isdir(path):
344
+ import shutil
345
+
348
346
  shutil.rmtree(path)
349
347
  if verbose:
350
348
  print(f"Successfully deleted folder {path}")
@@ -360,22 +358,31 @@ def fremove(path, verbose=True):
360
358
  print(f"Failed to delete {path}. Reason: {e}")
361
359
 
362
360
 
363
- def get_cwd(verbose: bool = True):
364
- """
365
- get_cwd: to get the current working directory
366
- Args:
367
- verbose (bool, optional): to show which function is use. Defaults to True.
368
- """
369
- try:
370
- script_dir = os.path.dirname(os.path.abspath(__file__))
371
- if verbose:
372
- print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
373
- except NameError:
374
- # This works in an interactive environment (like a Jupyter notebook)
375
- script_dir = os.getcwd()
376
- if verbose:
377
- print("os.getcwd():", script_dir)
378
- return script_dir
361
+ # def get_cwd(verbose: bool = True):
362
+ # """
363
+ # get_cwd: to get the current working directory
364
+ # Args:
365
+ # verbose (bool, optional): to show which function is use. Defaults to True.
366
+ # """
367
+ # try:
368
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
369
+ # if verbose:
370
+ # print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
371
+ # except NameError:
372
+ # # This works in an interactive environment (like a Jupyter notebook)
373
+ # script_dir = os.getcwd()
374
+ # if verbose:
375
+ # print("os.getcwd():", script_dir)
376
+ # return script_dir
377
+
378
+
379
+ def get_cwd():
380
+ from pathlib import Path
381
+
382
+ # Get the current script's directory as a Path object
383
+ current_directory = Path(__file__).resolve().parent
384
+
385
+ return current_directory
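The rewritten get_cwd() returns a pathlib.Path instead of a string; a usage sketch assuming py2ls 0.2.4.9 is installed:

from py2ls.ips import get_cwd

here = get_cwd()           # pathlib.Path of the directory containing ips.py
print(here, type(here))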
379
386
 
380
387
 
381
388
  def search(
@@ -388,6 +395,7 @@ def search(
388
395
  dir_save=dir_save,
389
396
  **kwargs,
390
397
  ):
398
+ from duckduckgo_search import DDGS
391
399
 
392
400
  if "te" in kind.lower():
393
401
  results = DDGS().text(query, max_results=limit)
@@ -421,6 +429,7 @@ def echo(*args, **kwargs):
421
429
  str: the answer from ai
422
430
  """
423
431
  global dir_save
432
+ from duckduckgo_search import DDGS
424
433
 
425
434
  query = None
426
435
  model = kwargs.get("model", "gpt")
@@ -469,8 +478,13 @@ def echo(*args, **kwargs):
469
478
  model_valid = valid_mod_name(model)
470
479
  res = DDGS().chat(query, model=model_valid)
471
480
  if verbose:
481
+ from pprint import pp
482
+
472
483
  pp(res)
473
484
  if log:
485
+ from datetime import datetime
486
+ import time
487
+
474
488
  dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
475
489
  res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
476
490
  if bool(os.path.basename(dir_save)):
@@ -492,6 +506,8 @@ def ai(*args, **kwargs):
492
506
 
493
507
 
494
508
  def detect_lang(text, output="lang", verbose=True):
509
+ from langdetect import detect
510
+
495
511
  dir_curr_script = os.path.dirname(os.path.abspath(__file__))
496
512
  dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
497
513
  print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -521,13 +537,14 @@ def is_text(s):
521
537
 
522
538
  from typing import Any, Union
523
539
 
540
+
524
541
  def shared(*args, strict=True, n_shared=2, verbose=True):
525
542
  """
526
543
  check the shared elelements in two list.
527
544
  usage:
528
545
  list1 = [1, 2, 3, 4, 5]
529
546
  list2 = [4, 5, 6, 7, 8]
530
- list3 = [5, 6, 9, 10]
547
+ list3 = [5, 6, 9, 10]
531
548
  a = shared(list1, list2,list3)
532
549
  """
533
550
  if verbose:
@@ -543,25 +560,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
543
560
  print(f"{' ' * 2}All inputs must be lists.")
544
561
  return []
545
562
  first_list = flattened_lists[0]
546
- shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
563
+ shared_elements = [
564
+ item for item in first_list if all(item in lst for lst in flattened_lists)
565
+ ]
547
566
  if strict:
548
- # Strict mode: require elements to be in all lists
549
- shared_elements = set(flattened_lists[0])
550
- for lst in flattened_lists[1:]:
551
- shared_elements.intersection_update(lst)
567
+ # Strict mode: require elements to be in all lists
568
+ shared_elements = set(flattened_lists[0])
569
+ for lst in flattened_lists[1:]:
570
+ shared_elements.intersection_update(lst)
552
571
  else:
572
+ from collections import Counter
573
+
553
574
  all_elements = [item for sublist in flattened_lists for item in sublist]
554
575
  element_count = Counter(all_elements)
555
576
  # Get elements that appear in at least n_shared lists
556
- shared_elements = [item for item, count in element_count.items() if count >= n_shared]
577
+ shared_elements = [
578
+ item for item, count in element_count.items() if count >= n_shared
579
+ ]
557
580
 
558
581
  shared_elements = flatten(shared_elements, verbose=verbose)
559
582
  if verbose:
560
- elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
583
+ elements2show = (
584
+ shared_elements if len(shared_elements) < 10 else shared_elements[:5]
585
+ )
561
586
  print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
562
587
  print("********* checking shared elements *********")
563
588
  return shared_elements
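Usage sketch for shared(), following the docstring above; the exact ordering of the result depends on the internal flatten() step:

from py2ls.ips import shared

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]
print(shared(list1, list2, list3))                 # strict: only 5 appears in every list
print(shared(list1, list2, list3, strict=False))   # n_shared=2: elements found in at least two lists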
564
589
 
590
+
565
591
  def not_shared(*args, strict=True, n_shared=2, verbose=False):
566
592
  """
567
593
  To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
@@ -571,9 +597,9 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
571
597
  not_shared(list1,list2)# output [1,3]
572
598
  """
573
599
  _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
574
- list1 = args[0]
575
- _not_shared=[item for item in list1 if item not in _common]
576
- return flatten(_not_shared, verbose=verbose)
600
+ list1 = flatten(args[0], verbose=verbose)
601
+ _not_shared = [item for item in list1 if item not in _common]
602
+ return _not_shared
577
603
 
578
604
 
579
605
  def flatten(nested: Any, unique_list=True, verbose=False):
@@ -582,29 +608,41 @@ def flatten(nested: Any, unique_list=True, verbose=False):
582
608
  Parameters:
583
609
  nested : Any, Can be a list, tuple, dictionary, or set.
584
610
  Returns: list, A flattened list.
585
- """
611
+ """
586
612
  flattened_list = []
587
613
  stack = [nested]
588
614
  while stack:
589
615
  current = stack.pop()
590
616
  if isinstance(current, dict):
591
- stack.extend(current.values())
617
+ stack.extend(current.values())
592
618
  elif isinstance(current, (list, tuple, set)):
593
619
  stack.extend(current)
594
620
  elif isinstance(current, pd.Series):
595
621
  stack.extend(current)
596
- elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
622
+ elif isinstance(
623
+ current, (pd.Index, np.ndarray)
624
+ ): # df.columns df.index are object of type pd.Index
597
625
  stack.extend(current.tolist())
598
626
  else:
599
627
  flattened_list.append(current)
600
628
  if verbose:
601
- print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
629
+ print(
630
+ f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>"
631
+ )
602
632
  if unique_list:
603
633
  return unique(flattened_list)[::-1]
604
634
  else:
605
635
  return flattened_list
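Usage sketch for flatten(); by default the result is de-duplicated, and the element order follows the internal stack walk:

from py2ls.ips import flatten

nested = {"a": [1, 2, (3, 4)], "b": {5, 6}}
print(flatten(nested, verbose=True))   # the unique leaf values 1 through 6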
606
-
607
- def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=False, scorer="WR"):
636
+
637
+
638
+ def strcmp(
639
+ search_term,
640
+ candidates,
641
+ ignore_case=True,
642
+ get_rank=False,
643
+ verbose=False,
644
+ scorer="WR",
645
+ ):
608
646
  """
609
647
  Compares a search term with a list of candidate strings and finds the best match based on similarity score.
610
648
 
@@ -617,6 +655,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
617
655
  Returns:
618
656
  tuple: A tuple containing the best match and its index in the candidates list.
619
657
  """
658
+ from fuzzywuzzy import fuzz, process
620
659
 
621
660
  def to_lower(s, ignore_case=True):
622
661
  # Converts a string or list of strings to lowercase if ignore_case is True.
@@ -624,7 +663,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
624
663
  if isinstance(s, str):
625
664
  return s.lower()
626
665
  elif isinstance(s, list):
627
- s=[str(i) for i in s]# convert all to str
666
+ s = [str(i) for i in s] # convert all to str
628
667
  return [elem.lower() for elem in s]
629
668
  return s
630
669
 
@@ -634,12 +673,15 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
634
673
  similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
635
674
  elif "W" in scorer.lower():
636
675
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
637
- elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
676
+ elif "ratio" in scorer.lower() or "stri" in scorer.lower(): # Ratio (Strictest)
638
677
  similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
639
678
  else:
640
679
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
641
680
  if get_rank:
642
- idx = [similarity_scores.index(i) for i in sorted(similarity_scores,reverse=True)]
681
+ idx = [
682
+ similarity_scores.index(i)
683
+ for i in sorted(similarity_scores, reverse=True)
684
+ ]
643
685
  if verbose:
644
686
  display([candidates[ii] for ii in idx])
645
687
  return [candidates[ii] for ii in idx]
@@ -667,6 +709,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
667
709
  # str2 = ['PLoS Computational Biology', 'PLOS BIOLOGY']
668
710
  # best_match, idx = strcmp(str1, str2, ignore_case=1)
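A runnable version of the commented example above, assuming py2ls 0.2.4.9 and fuzzywuzzy are installed:

from py2ls.ips import strcmp

str1 = "PLoS Biology"
str2 = ["PLoS Computational Biology", "PLOS BIOLOGY"]
best_match, idx = strcmp(str1, str2, ignore_case=True)
print(best_match, idx)   # per the docstring: the best match ("PLOS BIOLOGY") and its index (1)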
669
711
 
712
+
670
713
  def cn2pinyin(
671
714
  cn_str: Union[str, list] = None,
672
715
  sep: str = " ",
@@ -731,18 +774,21 @@ def cn2pinyin(
731
774
  style = Style.PL
732
775
  else:
733
776
  style = Style.NORMAL
734
- if not isinstance(cn_str,list):
735
- cn_str=[cn_str]
736
- pinyin_flat=[]
777
+ if not isinstance(cn_str, list):
778
+ cn_str = [cn_str]
779
+ pinyin_flat = []
737
780
  for cn_str_ in cn_str:
738
781
  pinyin_string = pinyin(cn_str_, style=style)
739
782
  pinyin_flat.append(sep.join([item[0] for item in pinyin_string]))
740
- if len(pinyin_flat)==1:
783
+ if len(pinyin_flat) == 1:
741
784
  return pinyin_flat[0]
742
785
  else:
743
786
  return pinyin_flat
744
787
 
788
+
745
789
  def counter(list_, verbose=True):
790
+ from collections import Counter
791
+
746
792
  c = Counter(list_)
747
793
  # Print the name counts
748
794
  for item, count in c.items():
@@ -771,6 +817,7 @@ def str2time(time_str, fmt="24"):
771
817
  %p represents AM or PM.
772
818
  - str: The converted time string.
773
819
  """
820
+ from datetime import datetime
774
821
 
775
822
  def time_len_corr(time_str):
776
823
  time_str_ = (
@@ -832,6 +879,8 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
832
879
  Returns:
833
880
  - str: The converted date string.
834
881
  """
882
+ from dateutil import parser
883
+
835
884
  try:
836
885
  date_obj = parser.parse(date_str)
837
886
  except ValueError as e:
@@ -848,6 +897,8 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
848
897
 
849
898
 
850
899
  def str2num(s, *args, **kwargs):
900
+ import re
901
+
851
902
  delimiter = kwargs.get("sep", None)
852
903
  round_digits = kwargs.get("round", None)
853
904
  if delimiter is not None:
@@ -863,6 +914,8 @@ def str2num(s, *args, **kwargs):
863
914
  try:
864
915
  num = float(s)
865
916
  except ValueError:
917
+ from numerizer import numerize
918
+
866
919
  try:
867
920
  numerized = numerize(s)
868
921
  num = int(numerized) if "." not in numerized else float(numerized)
@@ -1030,7 +1083,7 @@ def px2inch(*px, dpi=300) -> list:
1030
1083
  return [i / dpi for i in px]
1031
1084
 
1032
1085
 
1033
- def cm2inch(*cm) -> list:
1086
+ def inch2cm(*cm) -> list:
1034
1087
  """
1035
1088
  cm2inch: converts centimeter measurements to inches.
1036
1089
  Usage:
@@ -1051,24 +1104,30 @@ def cm2inch(*cm) -> list:
1051
1104
  def inch2px(*inch, dpi=300) -> list:
1052
1105
  """
1053
1106
  inch2px: converts inch measurements to pixels based on the given dpi.
1107
+
1054
1108
  Usage:
1055
1109
  inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
1110
+
1111
+ Parameters:
1112
+ inch : float, list, or tuple
1113
+ Single or multiple measurements in inches to convert to pixels.
1114
+ dpi : int, optional (default=300)
1115
+ Dots per inch (DPI), representing the pixel density.
1116
+
1056
1117
  Returns:
1057
- list: in pixels
1118
+ list: Converted measurements in pixels.
1058
1119
  """
1059
- # Case 1: When the user passes a single argument that is a list or tuple, such as inch2px([1, 2]) or inch2px((1, 2))
1120
+ # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
1060
1121
  if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
1061
- # If the input is a single list or tuple, we unpack its elements and convert each to pixels
1062
1122
  return [i * dpi for i in inch[0]]
1063
- # Case 2: When the user passes multiple arguments directly, such as inch2px(1, 2)
1123
+
1124
+ # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
1064
1125
  else:
1065
- # Here, we convert each individual argument directly to pixels
1066
1126
  return [i * dpi for i in inch]
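inch2px() scales each value by dpi and accepts either separate values or a single list/tuple, as shown above:

from py2ls.ips import inch2px

print(inch2px(1, 2, dpi=300))     # [300, 600]
print(inch2px([1, 2], dpi=300))   # [300, 600]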
1067
1127
 
1068
1128
 
1069
- def inch2cm(*inch) -> list:
1129
+ def cm2inch(*inch) -> list:
1070
1130
  """
1071
- inch2cm: converts inch measurements to centimeters.
1072
1131
  Usage:
1073
1132
  inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
1074
1133
  Returns:
@@ -1183,6 +1242,8 @@ def paper_size(paper_type_str="a4"):
1183
1242
 
1184
1243
 
1185
1244
  def docx2pdf(dir_docx, dir_pdf=None):
1245
+ from docx2pdf import convert
1246
+
1186
1247
  if dir_pdf:
1187
1248
  convert(dir_docx, dir_pdf)
1188
1249
  else:
@@ -1190,6 +1251,8 @@ def docx2pdf(dir_docx, dir_pdf=None):
1190
1251
 
1191
1252
 
1192
1253
  def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
1254
+ import img2pdf as image2pdf
1255
+
1193
1256
  def mm_to_point(size):
1194
1257
  return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))
1195
1258
 
@@ -1241,6 +1304,10 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
1241
1304
 
1242
1305
 
1243
1306
  def pdf2ppt(dir_pdf, dir_ppt):
1307
+ from PyPDF2 import PdfReader
1308
+ from pptx.util import Inches
1309
+ from pptx import Presentation
1310
+
1244
1311
  prs = Presentation()
1245
1312
 
1246
1313
  # Open the PDF file
@@ -1269,6 +1336,8 @@ def pdf2ppt(dir_pdf, dir_ppt):
1269
1336
 
1270
1337
 
1271
1338
  def ssplit(text, by="space", verbose=False, strict=False, **kws):
1339
+ import re
1340
+
1272
1341
  if isinstance(text, list):
1273
1342
  nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
1274
1343
  flat_list = [item for sublist in nested_list for item in sublist]
@@ -1316,6 +1385,9 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1316
1385
  return [text[i : i + length] for i in range(0, len(text), length)]
1317
1386
 
1318
1387
  def split_by_sent_num(text, n=10):
1388
+ from nltk.tokenize import sent_tokenize
1389
+ from itertools import pairwise
1390
+
1319
1391
  # split text into sentences
1320
1392
  text_split_by_sent = sent_tokenize(text)
1321
1393
  cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1388,10 +1460,14 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1388
1460
  print(f"splited by camel_case")
1389
1461
  return split_by_camel_case(text)
1390
1462
  elif ("word" in by) and not strict:
1463
+ from nltk.tokenize import word_tokenize
1464
+
1391
1465
  if verbose:
1392
1466
  print(f"splited by word")
1393
1467
  return word_tokenize(text)
1394
1468
  elif ("sen" in by and not "num" in by) and not strict:
1469
+ from nltk.tokenize import sent_tokenize
1470
+
1395
1471
  if verbose:
1396
1472
  print(f"splited by sentence")
1397
1473
  return sent_tokenize(text)
@@ -1441,9 +1517,13 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
1441
1517
 
1442
1518
 
1443
1519
  def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
1520
+ from pdf2image import convert_from_path, pdfinfo_from_path
1521
+
1444
1522
  df_dir_img_single_page = pd.DataFrame()
1445
1523
  dir_single_page = []
1446
1524
  if verbose:
1525
+ from pprint import pp
1526
+
1447
1527
  pp(pdfinfo_from_path(dir_pdf))
1448
1528
  if isinstance(page, tuple) and page:
1449
1529
  page = list(page)
@@ -1562,6 +1642,8 @@ def unzip(dir_path, output_dir=None):
1562
1642
  # If the output directory already exists, remove it and replace it
1563
1643
  if os.path.exists(output_dir):
1564
1644
  if os.path.isdir(output_dir): # check if it is a folder
1645
+ import shutil
1646
+
1565
1647
  shutil.rmtree(output_dir) # remove folder
1566
1648
  else:
1567
1649
  os.remove(output_dir) # remove file
@@ -1579,6 +1661,8 @@ def unzip(dir_path, output_dir=None):
1579
1661
 
1580
1662
  output_file = os.path.splitext(dir_path)[0] # remove the .gz extension
1581
1663
  try:
1664
+ import shutil
1665
+
1582
1666
  with gzip.open(dir_path, "rb") as gz_file:
1583
1667
  with open(output_file, "wb") as out_file:
1584
1668
  shutil.copyfileobj(gz_file, out_file)
@@ -1586,11 +1670,14 @@ def unzip(dir_path, output_dir=None):
1586
1670
  except FileNotFoundError:
1587
1671
  print(f"Error: The file '{dir_path}' was not found.")
1588
1672
  except PermissionError:
1589
- print(f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'.")
1673
+ print(
1674
+ f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'."
1675
+ )
1590
1676
  except Exception as e:
1591
1677
  try:
1592
1678
  import tarfile
1593
- with tarfile.open(dir_path, 'r:gz') as tar:
1679
+
1680
+ with tarfile.open(dir_path, "r:gz") as tar:
1594
1681
  tar.extractall(path=output_file)
1595
1682
  except Exception as final_e:
1596
1683
  print(f"An final unexpected error occurred: {final_e}")
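The tar.gz fallback above, as a standalone sketch with hypothetical paths:

import tarfile

with tarfile.open("archive.tar.gz", "r:gz") as tar:   # hypothetical archive name
    tar.extractall(path="archive_extracted")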
@@ -1676,11 +1763,13 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1676
1763
 
1677
1764
  """
1678
1765
  if not isinstance(df, pd.DataFrame):
1766
+ if verbose:
1767
+ print("not pd.DataFrame")
1679
1768
  return False
1680
- df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
1769
+ df.columns = df.columns.astype(str) # 把它变成str, 这样就可以进行counts运算了
1681
1770
  # Initialize a list to hold messages about abnormalities
1682
1771
  messages = []
1683
- is_abnormal = True
1772
+ is_abnormal = False
1684
1773
  # Check the shape of the DataFrame
1685
1774
  actual_shape = df.shape
1686
1775
  messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1705,25 +1794,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1705
1794
  is_abnormal = True
1706
1795
  if verbose:
1707
1796
  print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
1708
-
1797
+ if verbose:
1798
+ print("1", is_abnormal)
1709
1799
  if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
1710
1800
  messages.append("Abnormal: Too many delimiters in column names.")
1711
1801
  is_abnormal = True
1712
1802
  if verbose:
1713
1803
  print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
1714
-
1804
+ if verbose:
1805
+ print("2", is_abnormal)
1715
1806
  if delimiter_counts[""] > 3:
1716
1807
  messages.append("Abnormal: There are empty column names.")
1717
1808
  is_abnormal = True
1718
1809
  if verbose:
1719
1810
  print(f'delimiter_counts[""] > 3')
1720
-
1811
+ if verbose:
1812
+ print("3", is_abnormal)
1721
1813
  if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
1722
1814
  messages.append("Abnormal: Some column names contain unexpected characters.")
1723
1815
  is_abnormal = True
1724
1816
  if verbose:
1725
1817
  print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
1726
-
1818
+ if verbose:
1819
+ print("4", is_abnormal)
1727
1820
  # # Check for missing values
1728
1821
  # missing_values = df.isnull().sum()
1729
1822
  # if missing_values.any():
@@ -1742,8 +1835,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1742
1835
  messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
1743
1836
  is_abnormal = True
1744
1837
  if verbose:
1745
- print(f'df.columns[df.nunique() == 1].tolist()')
1746
-
1838
+ print(f"df.columns[df.nunique() == 1].tolist()")
1839
+ if verbose:
1840
+ print("5", is_abnormal)
1747
1841
  # Check for an unreasonable number of rows or columns
1748
1842
  if actual_shape[0] < 2 or actual_shape[1] < 2:
1749
1843
  messages.append(
@@ -1751,8 +1845,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
1751
1845
  )
1752
1846
  is_abnormal = True
1753
1847
  if verbose:
1754
- print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
1755
-
1848
+ print(f"actual_shape[0] < 2 or actual_shape[1] < 2")
1849
+ if verbose:
1850
+ print("6", is_abnormal)
1756
1851
  # Compile results
1757
1852
  if verbose:
1758
1853
  print("\n".join(messages))
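With is_abnormal now initialised to False, a well-formed frame is no longer flagged; a usage sketch assuming py2ls 0.2.4.9 is installed:

import pandas as pd
from py2ls.ips import is_df_abnormal

df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
print(is_df_abnormal(df, verbose=False))   # expected: False for a normal 3x2 frame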
@@ -1770,6 +1865,26 @@ def fload(fpath, kind=None, **kwargs):
1770
1865
  content: The content loaded from the file.
1771
1866
  """
1772
1867
 
1868
+ def read_mplstyle(style_file):
1869
+ import matplotlib.pyplot as plt
1870
+
1871
+ # Load the style file
1872
+ plt.style.use(style_file)
1873
+
1874
+ # Get the current style properties
1875
+ style_dict = plt.rcParams
1876
+
1877
+ # Convert to dictionary
1878
+ style_dict = dict(style_dict)
1879
+ # Print the style dictionary
1880
+ for i, j in style_dict.items():
1881
+ print(f"\n{i}::::{j}")
1882
+ return style_dict
1883
+
1884
+ # #example usage:
1885
+ # style_file = "/ std-colors.mplstyle"
1886
+ # style_dict = read_mplstyle(style_file)
1887
+
1773
1888
  def load_txt_md(fpath):
1774
1889
  with open(fpath, "r") as file:
1775
1890
  content = file.read()
@@ -1779,25 +1894,30 @@ def fload(fpath, kind=None, **kwargs):
1779
1894
  # with open(fpath, "r") as file:
1780
1895
  # content = file.read()
1781
1896
  # return content
1782
- def load_html(fpath,**kwargs):
1783
- return pd.read_html(fpath,**kwargs)
1897
+ def load_html(fpath, **kwargs):
1898
+ return pd.read_html(fpath, **kwargs)
1784
1899
 
1785
1900
  def load_json(fpath, **kwargs):
1786
- output=kwargs.pop("output","json")
1787
- if output=='json':
1901
+ output = kwargs.pop("output", "json")
1902
+ if output == "json":
1903
+ import json
1904
+
1788
1905
  with open(fpath, "r") as file:
1789
1906
  content = json.load(file)
1790
1907
  return content
1791
1908
  else:
1792
- return pd.read_json(fpath,**kwargs)
1909
+ return pd.read_json(fpath, **kwargs)
1793
1910
 
1794
1911
  def load_yaml(fpath):
1912
+ import yaml
1913
+
1795
1914
  with open(fpath, "r") as file:
1796
1915
  content = yaml.safe_load(file)
1797
1916
  return content
1798
1917
 
1799
-
1800
1918
  def load_xml(fpath, fsize_thr: int = 100):
1919
+ from lxml import etree
1920
+
1801
1921
  def load_small_xml(fpath):
1802
1922
  tree = etree.parse(fpath)
1803
1923
  root = tree.getroot()
@@ -1857,6 +1977,15 @@ def fload(fpath, kind=None, **kwargs):
1857
1977
  return char
1858
1978
  return None
1859
1979
 
1980
+ def _get_chunks(df_fake):
1981
+ """
1982
+ helper func for 'load_csv'
1983
+ """
1984
+ chunks = []
1985
+ for chunk in df_fake:
1986
+ chunks.append(chunk)
1987
+ return pd.concat(chunks, ignore_index=True)
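With the new chunksize pass-through, fload() reads a large csv in chunks and _get_chunks() concatenates them back into one DataFrame; a usage sketch with a hypothetical file name:

from py2ls.ips import fload

df = fload("big_table.csv", chunksize=100_000)   # hypothetical path; chunks are re-assembled
print(df.shape)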
1988
+
1860
1989
  def load_csv(fpath, **kwargs):
1861
1990
  from pandas.errors import EmptyDataError
1862
1991
 
@@ -1868,12 +1997,17 @@ def fload(fpath, kind=None, **kwargs):
1868
1997
  encoding = kwargs.pop("encoding", "utf-8")
1869
1998
  on_bad_lines = kwargs.pop("on_bad_lines", "skip")
1870
1999
  comment = kwargs.pop("comment", None)
1871
- fmt=kwargs.pop("fmt",False)
1872
- verbose=kwargs.pop("verbose",False)
1873
- if verbose:
2000
+ fmt = kwargs.pop("fmt", False)
2001
+ chunksize = kwargs.pop("chunksize", None)
2002
+ engine = "c" if chunksize else engine # when chunksize, recommend 'c'
2003
+ low_memory = kwargs.pop("low_memory", True)
2004
+ low_memory = (
2005
+ False if chunksize else True
2006
+ ) # when chunksize, recommend low_memory=False
2007
+ verbose = kwargs.pop("verbose", False)
2008
+ if run_once_within():
1874
2009
  use_pd("read_csv", verbose=verbose)
1875
- return
1876
-
2010
+
1877
2011
  if comment is None:
1878
2012
  comment = get_comment(
1879
2013
  fpath, comment=None, encoding="utf-8", lines_to_check=5
@@ -1890,14 +2024,19 @@ def fload(fpath, kind=None, **kwargs):
1890
2024
  skipinitialspace=skipinitialspace,
1891
2025
  sep=sep,
1892
2026
  on_bad_lines=on_bad_lines,
2027
+ chunksize=chunksize,
2028
+ low_memory=low_memory,
1893
2029
  **kwargs,
1894
2030
  )
1895
- if is_df_abnormal(df, verbose=0):
2031
+ if chunksize:
2032
+ df = _get_chunks(df)
2033
+ print(df.shape)
2034
+ if is_df_abnormal(df, verbose=0): # raise error
1896
2035
  raise ValueError("the df is abnormal")
1897
2036
  except:
1898
2037
  try:
1899
2038
  try:
1900
- if engine == "pyarrow":
2039
+ if engine == "pyarrow" and not chunksize:
1901
2040
  df = pd.read_csv(
1902
2041
  fpath,
1903
2042
  engine=engine,
@@ -1906,6 +2045,7 @@ def fload(fpath, kind=None, **kwargs):
1906
2045
  sep=sep,
1907
2046
  on_bad_lines=on_bad_lines,
1908
2047
  comment=comment,
2048
+ low_memory=low_memory,
1909
2049
  **kwargs,
1910
2050
  )
1911
2051
  else:
@@ -1919,14 +2059,19 @@ def fload(fpath, kind=None, **kwargs):
1919
2059
  skipinitialspace=skipinitialspace,
1920
2060
  on_bad_lines=on_bad_lines,
1921
2061
  comment=comment,
2062
+ chunksize=chunksize,
2063
+ low_memory=low_memory,
1922
2064
  **kwargs,
1923
2065
  )
2066
+ if chunksize:
2067
+ df = _get_chunks(df)
2068
+ print(df.shape)
1924
2069
  if is_df_abnormal(df, verbose=0):
1925
2070
  raise ValueError("the df is abnormal")
1926
2071
  except (UnicodeDecodeError, ValueError):
1927
2072
  encoding = get_encoding(fpath)
1928
2073
  # print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
1929
- if engine == "pyarrow":
2074
+ if engine == "pyarrow" and not chunksize:
1930
2075
  df = pd.read_csv(
1931
2076
  fpath,
1932
2077
  engine=engine,
@@ -1935,6 +2080,7 @@ def fload(fpath, kind=None, **kwargs):
1935
2080
  sep=sep,
1936
2081
  on_bad_lines=on_bad_lines,
1937
2082
  comment=comment,
2083
+ low_memory=low_memory,
1938
2084
  **kwargs,
1939
2085
  )
1940
2086
  else:
@@ -1948,8 +2094,13 @@ def fload(fpath, kind=None, **kwargs):
1948
2094
  skipinitialspace=skipinitialspace,
1949
2095
  on_bad_lines=on_bad_lines,
1950
2096
  comment=comment,
2097
+ chunksize=chunksize,
2098
+ low_memory=low_memory,
1951
2099
  **kwargs,
1952
2100
  )
2101
+ if chunksize:
2102
+ df = _get_chunks(df)
2103
+ print(df.shape)
1953
2104
  if is_df_abnormal(df, verbose=0):
1954
2105
  raise ValueError("the df is abnormal")
1955
2106
  except Exception as e:
@@ -1966,8 +2117,13 @@ def fload(fpath, kind=None, **kwargs):
1966
2117
  sep=sep,
1967
2118
  on_bad_lines=on_bad_lines,
1968
2119
  comment=comment,
2120
+ chunksize=chunksize,
2121
+ low_memory=low_memory,
1969
2122
  **kwargs,
1970
2123
  )
2124
+ if chunksize:
2125
+ df = _get_chunks(df)
2126
+ print(df.shape)
1971
2127
  if not is_df_abnormal(df, verbose=0): # normal
1972
2128
  display(df.head(2))
1973
2129
  print(f"shape: {df.shape}")
@@ -1975,51 +2131,64 @@ def fload(fpath, kind=None, **kwargs):
1975
2131
  except:
1976
2132
  pass
1977
2133
  else:
1978
- engines = [None,"c", "python"]
1979
- for engine in engines:
1980
- separators = [",", "\t", ";", "|", " "]
1981
- for sep in separators:
1982
- try:
1983
- # sep2show = sep if sep != "\t" else "\\t"
1984
- # print(f"trying with: engine={engine}, sep='{sep2show}'")
1985
- # print(".")
1986
- df = pd.read_csv(
1987
- fpath,
1988
- engine=engine,
1989
- sep=sep,
1990
- on_bad_lines=on_bad_lines,
1991
- comment=comment,
1992
- **kwargs,
1993
- )
1994
- # display(df.head(2))
1995
- # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
1996
- if not is_df_abnormal(df, verbose=0):
1997
- display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
1998
- print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
1999
- return df
2000
- except EmptyDataError as e:
2001
- continue
2002
- else:
2003
- pass
2134
+ if not chunksize:
2135
+ engines = [None, "c", "python"]
2136
+ for engine in engines:
2137
+ separators = [",", "\t", ";", "|", " "]
2138
+ for sep in separators:
2139
+ try:
2140
+ # sep2show = sep if sep != "\t" else "\\t"
2141
+ # print(f"trying with: engine={engine}, sep='{sep2show}'")
2142
+ # print(".")
2143
+ df = pd.read_csv(
2144
+ fpath,
2145
+ engine=engine,
2146
+ sep=sep,
2147
+ on_bad_lines=on_bad_lines,
2148
+ comment=comment,
2149
+ chunksize=chunksize,
2150
+ low_memory=low_memory,
2151
+ **kwargs,
2152
+ )
2153
+ # display(df.head(2))
2154
+ # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
2155
+ if chunksize:
2156
+ df = _get_chunks(df)
2157
+ print(df.shape)
2158
+ if not is_df_abnormal(df, verbose=0):
2159
+ (
2160
+ display(df.head(2))
2161
+ if isinstance(df, pd.DataFrame)
2162
+ else display("it is not a DataFrame")
2163
+ )
2164
+ (
2165
+ print(f"shape: {df.shape}")
2166
+ if isinstance(df, pd.DataFrame)
2167
+ else display("it is not a DataFrame")
2168
+ )
2169
+ return df
2170
+ except EmptyDataError as e:
2171
+ continue
2172
+ else:
2173
+ pass
2004
2174
  display(df.head(2))
2005
2175
  print(f"shape: {df.shape}")
2006
2176
  return df
2007
2177
 
2008
2178
  def load_excel(fpath, **kwargs):
2009
2179
  engine = kwargs.get("engine", "openpyxl")
2010
- verbose=kwargs.pop("verbose",False)
2011
- if verbose:
2180
+ verbose = kwargs.pop("verbose", False)
2181
+ if run_once_within():
2012
2182
  use_pd("read_excel", verbose=verbose)
2013
2183
  df = pd.read_excel(fpath, engine=engine, **kwargs)
2014
2184
  try:
2015
- meata=pd.ExcelFile(fpath)
2185
+ meata = pd.ExcelFile(fpath)
2016
2186
  print(f"n_sheet={len(meata.sheet_names)},\t'sheetname = 0 (default)':")
2017
- [print(f"{i}:\t{i_}") for i,i_ in enumerate(meata.sheet_names)]
2187
+ [print(f"{i}:\t{i_}") for i, i_ in enumerate(meata.sheet_names)]
2018
2188
  except:
2019
2189
  pass
2020
2190
  return df
2021
2191
 
2022
-
2023
2192
  def load_parquet(fpath, **kwargs):
2024
2193
  """
2025
2194
  Load a Parquet file into a Pandas DataFrame with advanced options.
@@ -2035,16 +2204,16 @@ def fload(fpath, kind=None, **kwargs):
2035
2204
  Returns:
2036
2205
  - df (DataFrame): The loaded DataFrame.
2037
2206
  """
2038
-
2207
+
2039
2208
  engine = kwargs.get("engine", "pyarrow")
2040
2209
  verbose = kwargs.pop("verbose", False)
2041
-
2042
- if verbose:
2210
+
2211
+ if run_once_within():
2043
2212
  use_pd("read_parquet", verbose=verbose)
2044
2213
  try:
2045
2214
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
2046
2215
  if verbose:
2047
- if 'columns' in kwargs:
2216
+ if "columns" in kwargs:
2048
2217
  print(f"Loaded columns: {kwargs['columns']}")
2049
2218
  else:
2050
2219
  print("Loaded all columns.")
@@ -2053,9 +2222,12 @@ def fload(fpath, kind=None, **kwargs):
2053
2222
  print(f"An error occurred while loading the Parquet file: {e}")
2054
2223
  df = None
2055
2224
 
2056
- return df
2225
+ return df
2057
2226
 
2058
2227
  def load_ipynb(fpath, **kwargs):
2228
+ import nbformat
2229
+ from nbconvert import MarkdownExporter
2230
+
2059
2231
  as_version = kwargs.get("as_version", 4)
2060
2232
  with open(fpath, "r") as file:
2061
2233
  nb = nbformat.read(file, as_version=as_version)
@@ -2085,6 +2257,8 @@ def fload(fpath, kind=None, **kwargs):
2085
2257
  If page is an integer, it returns the text of the specified page number.
2086
2258
  If the specified page is not found, it returns the string "Page is not found".
2087
2259
  """
2260
+ from PyPDF2 import PdfReader
2261
+
2088
2262
  text_dict = {}
2089
2263
  with open(fpath, "rb") as file:
2090
2264
  pdf_reader = PdfReader(file)
@@ -2114,6 +2288,8 @@ def fload(fpath, kind=None, **kwargs):
2114
2288
  return text_dict.get(int(page), "Page is not found")
2115
2289
 
2116
2290
  def load_docx(fpath):
2291
+ from docx import Document
2292
+
2117
2293
  doc = Document(fpath)
2118
2294
  content = [para.text for para in doc.paragraphs]
2119
2295
  return content
@@ -2123,21 +2299,55 @@ def fload(fpath, kind=None, **kwargs):
2123
2299
  kind = kind.lower()
2124
2300
  kind = kind.lstrip(".").lower()
2125
2301
  img_types = [
2126
- "bmp","eps","gif","png","jpg","jpeg","jpeg2000","tiff","tif",
2127
- "icns","ico","im","msp","pcx","ppm","sgi","spider","tga","webp",
2302
+ "bmp",
2303
+ "eps",
2304
+ "gif",
2305
+ "png",
2306
+ "jpg",
2307
+ "jpeg",
2308
+ "jpeg2000",
2309
+ "tiff",
2310
+ "tif",
2311
+ "icns",
2312
+ "ico",
2313
+ "im",
2314
+ "msp",
2315
+ "pcx",
2316
+ "ppm",
2317
+ "sgi",
2318
+ "spider",
2319
+ "tga",
2320
+ "webp",
2128
2321
  ]
2129
2322
  doc_types = [
2130
- "docx","pdf",
2131
- "txt","csv","xlsx","tsv","parquet","snappy",
2132
- "md","html",
2133
- "json","yaml","xml",
2323
+ "docx",
2324
+ "pdf",
2325
+ "txt",
2326
+ "csv",
2327
+ "xlsx",
2328
+ "tsv",
2329
+ "parquet",
2330
+ "snappy",
2331
+ "md",
2332
+ "html",
2333
+ "json",
2334
+ "yaml",
2335
+ "xml",
2134
2336
  "ipynb",
2135
- "mtx"
2337
+ "mtx",
2136
2338
  ]
2137
2339
  zip_types = [
2138
- "gz","zip","7z","rar","tgz",
2139
- "tar","tar.gz","tar.bz2",
2140
- "bz2","xz","gzip"
2340
+ "gz",
2341
+ "zip",
2342
+ "7z",
2343
+ "rar",
2344
+ "tgz",
2345
+ "tar",
2346
+ "tar.gz",
2347
+ "tar.bz2",
2348
+ "bz2",
2349
+ "xz",
2350
+ "gzip",
2141
2351
  ]
2142
2352
  other_types = ["fcs"]
2143
2353
  supported_types = [*doc_types, *img_types, *zip_types, *other_types]
@@ -2173,9 +2383,17 @@ def fload(fpath, kind=None, **kwargs):
2173
2383
  return load_yaml(fpath)
2174
2384
  elif kind == "xml":
2175
2385
  return load_xml(fpath)
2176
- elif kind in ["csv","tsv"]:
2386
+ elif kind in ["csv", "tsv"]:
2387
+ verbose = kwargs.pop("verbose", False)
2388
+ if run_once_within():
2389
+ use_pd("read_csv")
2177
2390
  content = load_csv(fpath, **kwargs)
2178
2391
  return content
2392
+ elif kind == "pkl":
2393
+ verbose = kwargs.pop("verbose", False)
2394
+ if run_once_within():
2395
+ use_pd("read_pickle")
2396
+ return pd.read_pickle(fpath, **kwargs)
2179
2397
  elif kind in ["ods", "ods", "odt"]:
2180
2398
  engine = kwargs.get("engine", "odf")
2181
2399
  kwargs.pop("engine", None)
@@ -2184,35 +2402,54 @@ def fload(fpath, kind=None, **kwargs):
2184
2402
  engine = kwargs.get("engine", "xlrd")
2185
2403
  kwargs.pop("engine", None)
2186
2404
  content = load_excel(fpath, engine=engine, **kwargs)
2187
- print(f"shape: {content.shape}")
2188
- display(content.head(3))
2405
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2406
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2189
2407
  return content
2190
2408
  elif kind == "xlsx":
2191
2409
  content = load_excel(fpath, **kwargs)
2192
- display(content.head(3))
2193
- print(f"shape: {content.shape}")
2410
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2411
+ print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
2194
2412
  return content
2195
- elif kind=='mtx':
2413
+ elif kind == "mtx":
2196
2414
  from scipy.io import mmread
2197
- dat_mtx=mmread(fpath)
2198
- content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
2199
- display(content.head(3))
2415
+
2416
+ dat_mtx = mmread(fpath)
2417
+ content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
2418
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
2200
2419
  print(f"shape: {content.shape}")
2201
2420
  return content
2202
2421
  elif kind == "ipynb":
2203
2422
  return load_ipynb(fpath, **kwargs)
2204
- elif kind in ['parquet','snappy']:
2205
- return load_parquet(fpath,**kwargs)
2423
+ elif kind in ["parquet", "snappy"]:
2424
+ verbose = kwargs.pop("verbose", False)
2425
+ if run_once_within():
2426
+ use_pd("read_parquet")
2427
+ return load_parquet(fpath, **kwargs)
2428
+ elif kind == "feather":
2429
+ verbose = kwargs.pop("verbose", False)
2430
+ if run_once_within():
2431
+ use_pd("read_feather")
2432
+ content = pd.read_feather(fpath, **kwargs)
2433
+ return content
2434
+ elif kind == "h5":
2435
+ content = pd.read_hdf(fpath, **kwargs)
2436
+ return content
2437
+ elif kind == "pkl":
2438
+ content = pd.read_pickle(fpath, **kwargs)
2439
+ return content
2206
2440
  elif kind == "pdf":
2207
2441
  # print('usage:load_pdf(fpath, page="all", verbose=False)')
2208
2442
  return load_pdf(fpath, **kwargs)
2209
2443
  elif kind.lower() in img_types:
2210
2444
  print(f'Image ".{kind}" is loaded.')
2211
2445
  return load_img(fpath)
2212
- elif kind=="gz" and fpath.endswith(".soft.gz"):
2446
+ elif kind == "gz" and fpath.endswith(".soft.gz"):
2213
2447
  import GEOparse
2448
+
2214
2449
  return GEOparse.get_GEO(filepath=fpath)
2215
2450
  elif kind.lower() in zip_types:
2451
+ from pprint import pp
2452
+
2216
2453
  keep = kwargs.get("keep", False)
2217
2454
  fpath_unzip = unzip(fpath)
2218
2455
  if os.path.isdir(fpath_unzip):
@@ -2247,6 +2484,9 @@ def fload(fpath, kind=None, **kwargs):
2247
2484
  meta, data = fcsparser.parse(fpath, reformat_meta=True)
2248
2485
  return meta, data
2249
2486
 
2487
+ elif kind == "mplstyle":
2488
+ return read_mplstyle(fpath)
2489
+
2250
2490
  else:
2251
2491
  print("direct reading...")
2252
2492
  try:
@@ -2288,7 +2528,7 @@ def fupdate(fpath, content=None, how="head"):
2288
2528
  """
2289
2529
  Update a file by adding new content at the top and moving the old content to the bottom.
2290
2530
  If the file is a JSON file, merge the new content with the old content.
2291
-
2531
+
2292
2532
  Parameters
2293
2533
  ----------
2294
2534
  fpath : str
@@ -2296,7 +2536,7 @@ def fupdate(fpath, content=None, how="head"):
2296
2536
  content : str or dict, optional
2297
2537
  The new content to add at the top of the file (for text) or merge (for JSON).
2298
2538
  If not provided, the function will not add any new content.
2299
-
2539
+
2300
2540
  Notes
2301
2541
  -----
2302
2542
  - If the file at `fpath` does not exist, it will be created.
@@ -2305,14 +2545,20 @@ def fupdate(fpath, content=None, how="head"):
2305
2545
  """
2306
2546
  content = content or ""
2307
2547
  file_ext = os.path.splitext(fpath)[1]
2308
- how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
2548
+ how_s = ["head", "tail", "start", "end", "beginning", "stop", "last", "before"]
2309
2549
  how = strcmp(how, how_s)[0]
2310
2550
  print(how)
2311
- add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
2551
+ add_where = "head" if how in ["head", "start", "beginning", "before"] else "tail"
2312
2552
  if "json" in file_ext.lower():
2313
- old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
2314
- updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
2315
- fsave(fpath,updated_content)
2553
+ old_content = fload(fpath, kind="json") if os.path.exists(fpath) else {}
2554
+ updated_content = (
2555
+ {**content, **old_content}
2556
+ if add_where == "head"
2557
+ else (
2558
+ {**old_content, **content} if isinstance(content, dict) else old_content
2559
+ )
2560
+ )
2561
+ fsave(fpath, updated_content)
2316
2562
  else:
2317
2563
  # Handle text file
2318
2564
  if os.path.exists(fpath):
@@ -2323,7 +2569,7 @@ def fupdate(fpath, content=None, how="head"):
2323
2569
 
2324
2570
  # Write new content at the top followed by old content
2325
2571
  with open(fpath, "w") as file:
2326
- if add_where=="head":
2572
+ if add_where == "head":
2327
2573
  file.write(content + "\n")
2328
2574
  file.write(old_content)
2329
2575
  else:
@@ -2359,6 +2605,9 @@ def filter_kwargs(kws, valid_kwargs):
2359
2605
  return kwargs_filtered
2360
2606
 
2361
2607
 
2608
+ str_space_speed = 'sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
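The new fsave() branches below pick the writer from the file extension, in line with the space/speed notes in str_space_speed above; a sketch with hypothetical output paths, assuming pyarrow is available for parquet and feather:

import pandas as pd
from py2ls.ips import fsave

df = pd.DataFrame({"a": range(3), "b": list("xyz")})
fsave("out.parquet", df)   # smallest on disk in the comparison above
fsave("out.feather", df)   # fast Arrow-based I/O
fsave("out.pkl", df)       # preserves exact pandas dtypes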
2609
+
2610
+
2362
2611
  def fsave(
2363
2612
  fpath,
2364
2613
  content,
@@ -2393,6 +2642,8 @@ def fsave(
2393
2642
  fappend(fpath, content=content)
2394
2643
 
2395
2644
  def save_docx(fpath, content, font_name, font_size, spacing):
2645
+ import docx
2646
+
2396
2647
  if isinstance(content, str):
2397
2648
  content = content.split(". ")
2398
2649
  doc = docx.Document()
@@ -2420,6 +2671,8 @@ def fsave(
2420
2671
  save_content(fpath, html_content, mode)
2421
2672
 
2422
2673
  def save_pdf(fpath, content, font_name, font_size):
2674
+ from fpdf import FPDF
2675
+
2423
2676
  pdf = FPDF()
2424
2677
  pdf.add_page()
2425
2678
  # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2432,8 +2685,8 @@ def fsave(
2432
2685
  def save_csv(fpath, data, **kwargs):
2433
2686
  # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
2434
2687
 
2435
- verbose=kwargs.pop("verbose",False)
2436
- if verbose:
2688
+ verbose = kwargs.pop("verbose", False)
2689
+ if run_once_within():
2437
2690
  use_pd("to_csv", verbose=verbose)
2438
2691
  kwargs_csv = dict(
2439
2692
  path_or_buf=None,
@@ -2463,18 +2716,30 @@ def fsave(
2463
2716
  df.to_csv(fpath, **kwargs_valid)
2464
2717
 
2465
2718
  def save_xlsx(fpath, data, **kwargs):
2466
- verbose=kwargs.pop("verbose",False)
2719
+ verbose = kwargs.pop("verbose", False)
2467
2720
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2468
- if verbose:
2721
+ if run_once_within():
2469
2722
  use_pd("to_excel", verbose=verbose)
2470
2723
  if any(kwargs):
2471
2724
  format_excel(df=data, filename=fpath, **kwargs)
2472
2725
  else:
2473
2726
  # Remove non-relevant kwargs
2474
2727
  irrelevant_keys = [
2475
- "format", "usage", "cell", "width", "height", "height_max", "merge",
2476
- "shade", "comment", "link", "protect", "number_format", "conditional_format",
2477
- "index_default"]
2728
+ "format",
2729
+ "usage",
2730
+ "cell",
2731
+ "width",
2732
+ "height",
2733
+ "height_max",
2734
+ "merge",
2735
+ "shade",
2736
+ "comment",
2737
+ "link",
2738
+ "protect",
2739
+ "number_format",
2740
+ "conditional_format",
2741
+ "index_default",
2742
+ ]
2478
2743
  for key in irrelevant_keys:
2479
2744
  kwargs.pop(key, None)
2480
2745
 
@@ -2482,15 +2747,18 @@ def fsave(
2482
2747
  # Check if the file exists, then append the sheet, otherwise create a new file
2483
2748
  try:
2484
2749
  # Use ExcelWriter with append mode if the file exists
2485
- with pd.ExcelWriter(fpath, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
2750
+ with pd.ExcelWriter(
2751
+ fpath, engine="openpyxl", mode="a", if_sheet_exists="new"
2752
+ ) as writer:
2486
2753
  df.to_excel(writer, sheet_name=sheet_name, index=False, **kwargs)
2487
2754
  except FileNotFoundError:
2488
2755
  # If file doesn't exist, create a new one
2489
2756
  df.to_excel(fpath, sheet_name=sheet_name, index=False, **kwargs)
2490
2757
 
2491
-
2492
2758
  def save_ipynb(fpath, data, **kwargs):
2493
2759
  # Split the content by code fences to distinguish between code and markdown
2760
+ import nbformat
2761
+
2494
2762
  parts = data.split("```")
2495
2763
  cells = []
2496
2764
 
@@ -2513,17 +2781,19 @@ def fsave(
2513
2781
  # json.dump(data, file, **kwargs)
2514
2782
 
2515
2783
  def save_json(fpath_fname, var_dict_or_df):
2784
+ import json
2785
+
2516
2786
  def _convert_js(data):
2517
2787
  if isinstance(data, pd.DataFrame):
2518
- return data.to_dict(orient="list")
2788
+ return data.to_dict(orient="list")
2519
2789
  elif isinstance(data, np.ndarray):
2520
2790
  return data.tolist()
2521
2791
  elif isinstance(data, dict):
2522
2792
  return {key: _convert_js(value) for key, value in data.items()}
2523
- return data
2793
+ return data
2524
2794
 
2525
2795
  serializable_data = _convert_js(var_dict_or_df)
2526
-
2796
+
2527
2797
  # Save the serializable data to the JSON file
2528
2798
  with open(fpath_fname, "w") as f_json:
2529
2799
  json.dump(serializable_data, f_json, indent=4)
@@ -2534,10 +2804,14 @@ def fsave(
2534
2804
  # # setss = jsonload("/.json")
2535
2805
 
2536
2806
  def save_yaml(fpath, data, **kwargs):
2807
+ import yaml
2808
+
2537
2809
  with open(fpath, "w") as file:
2538
2810
  yaml.dump(data, file, **kwargs)
2539
2811
 
2540
2812
  def save_xml(fpath, data):
2813
+ from lxml import etree
2814
+
2541
2815
  root = etree.Element("root")
2542
2816
  if isinstance(data, dict):
2543
2817
  for key, val in data.items():
@@ -2548,24 +2822,37 @@ def fsave(
2548
2822
  tree = etree.ElementTree(root)
2549
2823
  tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
2550
2824
 
2551
- def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
2552
- engine = kwargs.pop("engine","auto") # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
2553
- compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2825
+ def save_parquet(fpath: str, data: pd.DataFrame, **kwargs):
2826
+ engine = kwargs.pop(
2827
+ "engine", "auto"
2828
+ ) # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
2829
+ compression = kwargs.pop(
2830
+ "compression", None
2831
+ ) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2554
2832
  try:
2555
2833
  # Attempt to save with "pyarrow" if engine is set to "auto"
2556
- data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
2557
- print(f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression.")
2834
+ data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
2835
+ print(
2836
+ f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression."
2837
+ )
2558
2838
  except Exception as e:
2559
- print(f"Error using with engine '{engine}' and {compression} compression: {e}")
2839
+ print(
2840
+ f"Error using with engine '{engine}' and {compression} compression: {e}"
2841
+ )
2560
2842
  if "Sparse" in str(e):
2561
2843
  try:
2562
2844
  # Handle sparse data by converting columns to dense
2563
2845
  print("Attempting to convert sparse columns to dense format...")
2564
- data = data.apply(lambda x: x.sparse.to_dense() if pd.api.types.is_sparse(x) else x)
2565
- save_parquet(fpath, data=data,**kwargs)
2846
+ data = data.apply(
2847
+ lambda x: (
2848
+ x.sparse.to_dense() if pd.api.types.is_sparse(x) else x
2849
+ )
2850
+ )
2851
+ save_parquet(fpath, data=data, **kwargs)
2566
2852
  except Exception as last_e:
2567
- print(f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}")
2568
-
2853
+ print(
2854
+ f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}"
2855
+ )
2569
2856
 
2570
2857
  if kind is None:
2571
2858
  _, kind = os.path.splitext(fpath)
@@ -2612,16 +2899,95 @@ def fsave(
2612
2899
  save_yaml(fpath, content, **kwargs)
2613
2900
  elif kind == "ipynb":
2614
2901
  save_ipynb(fpath, content, **kwargs)
2615
- elif kind.lower() in ["parquet","pq","big","par"]:
2616
- compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2902
+ elif kind.lower() in ["parquet", "pq", "big", "par"]:
2903
+ verbose = kwargs.pop("verbose", False)
2904
+ if verbose:
2905
+ print(str_space_speed)
2906
+ use_pd("to_parquet")
2907
+ return None
2908
+ compression = kwargs.pop(
2909
+ "compression", None
2910
+ ) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
2617
2911
  # fix the fpath ends
2618
- if not '.parquet' in fpath:
2619
- fpath=fpath.replace(kind, 'parquet')
2912
+ _fpath, _ext = os.path.splitext(fpath)
2913
+ fpath = _fpath + _ext.replace(kind, "parquet")
2620
2914
  if compression is not None:
2621
2915
  if not fpath.endswith(compression):
2622
- fpath=fpath+f".{compression}"
2623
- save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
2916
+ fpath = fpath + f".{compression}"
2917
+ save_parquet(fpath=fpath, data=content, compression=compression, **kwargs)
2918
+ elif kind.lower() in ["pkl", "pk", "pickle", "pick"]:
2919
+ # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
2920
+ # Pickle is convenient if you want to preserve exact Python object types.
2921
+ verbose = kwargs.pop("verbose", False)
2922
+ if verbose:
2923
+ print(str_space_speed)
2924
+ use_pd("to_pickle")
2925
+ return None
2926
+ _fpath, _ext = os.path.splitext(fpath)
2927
+ fpath = _fpath + _ext.replace(kind, "pkl")
2928
+ compression = kwargs.pop("compression", None)
2929
+ if compression is not None:
2930
+ if not fpath.endswith(compression["method"]):
2931
+ fpath = fpath + f".{compression['method']}"
2932
+ if isinstance(content, pd.DataFrame):
2933
+ content.to_pickle(fpath, **kwargs)
2934
+ else:
2935
+ try:
2936
+ print("trying to convert it as a DataFrame...")
2937
+ content = pd.DataFrame(content)
2938
+ content.to_pickle(fpath, **kwargs)
2939
+ except Exception as e:
2940
+ raise ValueError(
2941
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2942
+ )
2943
+ elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
2944
+ # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
2945
+ # optimized for data analytics tasks and is especially fast when working with Pandas.
2946
+
2947
+ verbose = kwargs.pop("verbose", False)
2948
+ if verbose:
2949
+ print(str_space_speed)
2950
+ use_pd("to_feather")
2951
+ return None
2952
+ _fpath, _ext = os.path.splitext(fpath)
2953
+ fpath = _fpath + _ext.replace(kind, "feather")
2954
+ if isinstance(content, pd.DataFrame):
2955
+ content.to_feather(fpath, **kwargs)
2956
+ else:
2957
+ try:
2958
+ print("trying to convert it as a DataFrame...")
2959
+ content = pd.DataFrame(content)
2960
+ content.to_feather(fpath, **kwargs)
2961
+ except Exception as e:
2962
+ raise ValueError(
2963
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2964
+ )
2965
+ elif kind.lower() in ["hd", "hdf", "h", "h5"]:
2966
+ # particularly useful for large datasets and can handle complex data structures
2967
+ verbose = kwargs.pop("verbose", False)
2968
+ if verbose:
2969
+ print(str_space_speed)
2970
+ use_pd("to_hdf")
2971
+ _fpath, _ext = os.path.splitext(fpath)
2972
+ fpath = _fpath + _ext.replace(kind, "h5")
2973
+ compression = kwargs.pop("compression", None)
2974
+ if compression is not None:
2975
+ if not fpath.endswith(compression):
2976
+ fpath = fpath + f".{compression}"
2977
+ if isinstance(content, pd.DataFrame):
2978
+ content.to_hdf(fpath, key="content", **kwargs)
2979
+ else:
2980
+ try:
2981
+ print("trying to convert it as a DataFrame...")
2982
+ content = pd.DataFrame(content)
2983
+ content.to_hdf(fpath, key="content", **kwargs)
2984
+ except Exception as e:
2985
+ raise ValueError(
2986
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
2987
+ )
2624
2988
  else:
2989
+ from . import netfinder
2990
+
2625
2991
  try:
2626
2992
  netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
2627
2993
  except:
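For orientation, the new parquet/pickle/feather/HDF branches above all follow the same pattern: normalize the file extension, optionally append a compression suffix, and fall back to pd.DataFrame(content) when the payload is not already a DataFrame. A minimal usage sketch, assuming fsave is imported from py2ls.ips and that the format is inferred from the extension as the branches above suggest:

import pandas as pd
from py2ls.ips import fsave  # assumed import path

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
fsave("./demo.parquet", df, compression="snappy")  # parquet; a ".snappy" suffix is appended
fsave("./demo.feather", df)                        # feather via DataFrame.to_feather
fsave("./demo.pkl", df)                            # pickle via DataFrame.to_pickle
fsave("./demo.h5", df)                             # HDF5, stored under key="content"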
@@ -2744,6 +3110,8 @@ def isa(content, kind):
2744
3110
  elif "color" in kind.lower(): # file
2745
3111
  return is_str_color(content)
2746
3112
  elif "html" in kind.lower():
3113
+ import re
3114
+
2747
3115
  if content is None or not isinstance(content, str):
2748
3116
  return False
2749
3117
  # Remove leading and trailing whitespace
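The "color" and "html" branches shown here only need a plain string to test. A short sketch, assuming isa is imported from py2ls.ips:

from py2ls.ips import isa  # assumed import path

isa("#1F77B4", "color")                   # True: matches the 6/8-digit hex pattern used by is_str_color
isa("<p>Hello <b>world</b></p>", "html")  # expected True for a string containing HTML tags
isa(42, "html")                           # False: non-strings are rejected up front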
@@ -2793,8 +3161,8 @@ def listdir(
2793
3161
  verbose=True,
2794
3162
  ):
2795
3163
  if kind is None:
2796
- ls=os.listdir(rootdir)
2797
- ls = [f for f in ls if not f.startswith('.') and not f.startswith('~')]
3164
+ ls = os.listdir(rootdir)
3165
+ ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
2798
3166
  print(ls)
2799
3167
  df_all = pd.DataFrame(
2800
3168
  {
@@ -2825,7 +3193,7 @@ def listdir(
2825
3193
 
2826
3194
  if os.path.isdir(rootdir):
2827
3195
  ls = os.listdir(rootdir)
2828
- ls = [f for f in ls if not f.startswith('.') and not f.startswith('~')]
3196
+ ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
2829
3197
  fd = [".fd", ".fld", ".fol", ".fd", ".folder"]
2830
3198
  i = 0
2831
3199
  f = {
@@ -2903,6 +3271,8 @@ def listdir(
2903
3271
  display(f.head())
2904
3272
  return f
2905
3273
  else:
3274
+ from box import Box
3275
+
2906
3276
  if "l" in orient.lower(): # list # default
2907
3277
  res_output = Box(f.to_dict(orient="list"))
2908
3278
  return res_output
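As the orient handling above shows, listdir returns a DataFrame by default and a Box when a list-style orient is requested. A hedged sketch (the second positional argument as a file extension follows the existing listdir(fpath, 'png') usage further below; other options are outside this hunk):

from py2ls.ips import listdir  # assumed import path

files = listdir("./data", "csv")                   # DataFrame; hidden "." and "~" entries are skipped
paths = files.fpath.to_list()                      # same access pattern as the thumbnail() example
as_box = listdir("./data", "csv", orient="list")   # Box of lists instead of a DataFrame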
@@ -2943,13 +3313,10 @@ def mkdir_nest(fpath: str) -> str:
2943
3313
  Returns:
2944
3314
  - str: The path of the created directory.
2945
3315
  """
2946
-
2947
-
2948
3316
  # Split the full path into directories
2949
3317
  f_slash = "/" if "mac" in get_os().lower() else "\\"
2950
3318
  if os.path.isdir(fpath):
2951
- fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
2952
- print(fpath)
3319
+ fpath = fpath + f_slash if not fpath.endswith(f_slash) else fpath
2953
3320
  return fpath
2954
3321
  dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
2955
3322
 
@@ -2979,27 +3346,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
2979
3346
  - str: The path of the created directory or an error message.
2980
3347
  """
2981
3348
 
2982
- rootdir = []
3349
+ rootdir = []
2983
3350
  if chdir is None:
2984
3351
  return mkdir_nest(pardir)
2985
3352
  if isinstance(chdir, str):
2986
- chdir = [chdir]
3353
+ chdir = [chdir]
2987
3354
  chdir = list(set(chdir))
2988
3355
  if isinstance(pardir, str): # Dir_parents should be 'str' type
2989
- pardir = os.path.normpath(pardir)
3356
+ pardir = os.path.normpath(pardir)
2990
3357
  if "mac" in get_os().lower() or "lin" in get_os().lower():
2991
3358
  stype = "/"
2992
3359
  elif "win" in get_os().lower():
2993
3360
  stype = "\\"
2994
3361
  else:
2995
3362
  stype = "/"
2996
-
3363
+
2997
3364
  if os.path.isdir(pardir):
2998
3365
  os.chdir(pardir) # Set current path
2999
3366
  # Check if subdirectories are not empty
3000
3367
  if chdir:
3001
- chdir.sort()
3002
- for folder in chdir:
3368
+ chdir.sort()
3369
+ for folder in chdir:
3003
3370
  child_tmp = os.path.join(pardir, folder)
3004
3371
  if not os.path.isdir(child_tmp):
3005
3372
  os.mkdir("./" + folder)
@@ -3019,8 +3386,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
3019
3386
  # Dir is the main output, if only one dir, then str type is inconvenient
3020
3387
  if len(rootdir) == 1:
3021
3388
  rootdir = rootdir[0]
3022
- rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
3023
- print(rootdir)
3389
+ rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
3390
+
3024
3391
  return rootdir
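In short, mkdir normalizes the parent, creates any missing subfolders listed in chdir, and returns a single path (still with a trailing separator, now without the debug print) when only one directory results, otherwise a list. A small sketch, assuming the import path:

from py2ls.ips import mkdir, mkdir_nest  # assumed import path

out = mkdir("./results", ["figs", "tables"])  # creates ./results/figs and ./results/tables
one = mkdir("./results", "figs")              # a single child comes back as a string ending with the separator
nested = mkdir_nest("./a/b/c")                # creates the whole chain and returns the path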
3025
3392
 
3026
3393
 
@@ -3032,6 +3399,9 @@ def split_path(fpath):
3032
3399
 
3033
3400
 
3034
3401
  def figsave(*args, dpi=300):
3402
+ import matplotlib.pyplot as plt
3403
+ from PIL import Image
3404
+
3035
3405
  dir_save = None
3036
3406
  fname = None
3037
3407
  img = None
@@ -3046,14 +3416,14 @@ def figsave(*args, dpi=300):
3046
3416
  img = arg # Store the PIL image if provided
3047
3417
 
3048
3418
  if dir_save is None:
3049
- dir_save="./"
3050
- print(dir_save)
3419
+ dir_save = "./"
3420
+
3051
3421
  # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
3052
3422
  dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
3053
3423
  dir_ch = "".join(dir_save.split(f_slash)[-1:])
3054
3424
  if not dir_par.endswith(f_slash):
3055
3425
  dir_par += f_slash
3056
- print(dir_par)
3426
+
3057
3427
  if fname is None:
3058
3428
  fname = dir_ch
3059
3429
  mkdir(dir_par)
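The cleanup above drops the debug prints but keeps the behaviour: a string argument is split into a parent directory and a file name, a PIL image may be passed instead of the current figure, and the parent directory is created on demand via mkdir. A hedged sketch:

import matplotlib.pyplot as plt
from py2ls.ips import figsave  # assumed import path

plt.plot([1, 2, 3], [4, 5, 6])
figsave("./figures/", "line_plot.pdf", dpi=300)  # directory and file name as separate strings
figsave("./figures/line_plot.png")               # or one combined path; the last component becomes the file name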
@@ -3139,7 +3509,9 @@ def figsave(*args, dpi=300):
3139
3509
 
3140
3510
  def is_str_color(s):
3141
3511
  # Regular expression pattern for hexadecimal color codes
3142
- if isinstance(s,str):
3512
+ if isinstance(s, str):
3513
+ import re
3514
+
3143
3515
  color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
3144
3516
  return re.match(color_code_pattern, s) is not None
3145
3517
  else:
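The regex only admits the #RRGGBB and #RRGGBBAA forms, so:

is_str_color("#1A2B3C")    # True  (6 hex digits)
is_str_color("#1A2B3C80")  # True  (8 hex digits, including alpha)
is_str_color("red")        # False (named colours do not match the pattern)
is_str_color(42)           # False (non-strings fall through to the else branch)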
@@ -3166,6 +3538,8 @@ def isnum(s):
3166
3538
 
3167
3539
 
3168
3540
  def is_image(fpath):
3541
+ import mimetypes
3542
+
3169
3543
  mime_type, _ = mimetypes.guess_type(fpath)
3170
3544
  if mime_type and mime_type.startswith("image"):
3171
3545
  return True
@@ -3174,6 +3548,8 @@ def is_image(fpath):
3174
3548
 
3175
3549
 
3176
3550
  def is_document(fpath):
3551
+ import mimetypes
3552
+
3177
3553
  mime_type, _ = mimetypes.guess_type(fpath)
3178
3554
  if mime_type and (
3179
3555
  mime_type.startswith("text/")
@@ -3194,6 +3570,8 @@ def is_document(fpath):
3194
3570
 
3195
3571
 
3196
3572
  def is_zip(fpath):
3573
+ import mimetypes
3574
+
3197
3575
  mime_type, _ = mimetypes.guess_type(fpath)
3198
3576
  if mime_type == "application/zip":
3199
3577
  return True
@@ -3202,6 +3580,8 @@ def is_zip(fpath):
3202
3580
 
3203
3581
 
3204
3582
  def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
3583
+ import matplotlib.pyplot as plt
3584
+
3205
3585
  if ax is None:
3206
3586
  ax = plt.gca()
3207
3587
  for loc, spine in ax.spines.items():
@@ -3290,6 +3670,7 @@ def apply_filter(img, *args):
3290
3670
  Returns:
3291
3671
  PIL.Image: The filtered image.
3292
3672
  """
3673
+ from PIL import ImageFilter
3293
3674
 
3294
3675
  def correct_filter_name(filter_name):
3295
3676
  if "bl" in filter_name.lower() and "box" not in filter_name.lower():
@@ -3532,6 +3913,9 @@ def imgsets(img, **kwargs):
3532
3913
  avg_contrast_factor = sum(contrast_factors) / num_channels
3533
3914
  return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
3534
3915
 
3916
+ import matplotlib.pyplot as plt
3917
+ from PIL import ImageEnhance, ImageOps
3918
+
3535
3919
  # Load image if input is a file path
3536
3920
  if isinstance(img, str):
3537
3921
  img = load_img(img)
@@ -3595,6 +3979,8 @@ def imgsets(img, **kwargs):
3595
3979
  elif "pad" in k.lower():
3596
3980
  img_update = ImageOps.pad(img_update, size=value)
3597
3981
  elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
3982
+ from rembg import remove, new_session
3983
+
3598
3984
  if isinstance(value, bool):
3599
3985
  session = new_session("isnet-general-use")
3600
3986
  img_update = remove(img_update, session=session)
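Background removal is keyed on substrings of the keyword name ("rem"/"rm"/"back"); a boolean value uses the default isnet-general-use rembg session. A hedged sketch, assuming the import path and a placeholder image file:

from py2ls.ips import imgsets  # assumed import path

cut = imgsets("photo.png", rm=True)  # "photo.png" is a placeholder; string inputs go through load_img, then rembg removes the background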
@@ -3633,6 +4019,8 @@ def imgsets(img, **kwargs):
3633
4019
  else:
3634
4020
  img_update = remove(img_update)
3635
4021
  elif "bg" in k.lower() and "color" in k.lower():
4022
+ from rembg import remove
4023
+
3636
4024
  if isinstance(value, list):
3637
4025
  value = tuple(value)
3638
4026
  if isinstance(value, tuple): # replace the background color
@@ -3664,6 +4052,9 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3664
4052
  Args:
3665
4053
  dir_img_list (list): List of the Directory containing the images.
3666
4054
  """
4055
+ import matplotlib.pyplot as plt
4056
+ from PIL import Image
4057
+
3667
4058
  num_images = len(dir_img_list)
3668
4059
  if not kind.startswith("."):
3669
4060
  kind = "." + kind
@@ -3700,28 +4091,14 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3700
4091
  # usage:
3701
4092
  # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
3702
4093
  # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
3703
- def read_mplstyle(style_file):
3704
- # Load the style file
3705
- plt.style.use(style_file)
3706
-
3707
- # Get the current style properties
3708
- style_dict = plt.rcParams
3709
-
3710
- # Convert to dictionary
3711
- style_dict = dict(style_dict)
3712
- # Print the style dictionary
3713
- for i, j in style_dict.items():
3714
- print(f"\n{i}::::{j}")
3715
- return style_dict
3716
-
3717
-
3718
- # #example usage:
3719
- # style_file = "/ std-colors.mplstyle"
3720
- # style_dict = read_mplstyle(style_file)
3721
4094
 
3722
4095
 
3723
4096
  # search and fine the director of the libary, which installed at local
3724
4097
  def dir_lib(lib_oi):
4098
+ """
4099
+ # example usage:
4100
+ # dir_lib("seaborn")
4101
+ """
3725
4102
  import site
3726
4103
 
3727
4104
  # Get the site-packages directory
@@ -3740,22 +4117,6 @@ def dir_lib(lib_oi):
3740
4117
  return dir_list
3741
4118
 
3742
4119
 
3743
- # example usage:
3744
- # dir_lib("seaborn")
3745
-
3746
- """
3747
- # n = 7
3748
- # clist = get_color(n, cmap="auto", how="linspace") # get_color(100)
3749
- # plt.figure(figsize=[8, 5], dpi=100)
3750
- # x = np.linspace(0, 2 * np.pi, 50) * 100
3751
- # y = np.sin(x)
3752
- # for i in range(1, n + 1):
3753
- # plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
3754
- # plt.legend()
3755
- # plt.ylim(-2, 20)
3756
- # figsets(plt.gca(), {"style": "whitegrid"}) """
3757
-
3758
-
3759
4120
  class FileInfo:
3760
4121
  def __init__(
3761
4122
  self,
@@ -3832,6 +4193,8 @@ class FileInfo:
3832
4193
 
3833
4194
 
3834
4195
  def finfo(fpath):
4196
+ import time
4197
+
3835
4198
  fname, fmt = os.path.splitext(fpath)
3836
4199
  dir_par = os.path.dirname(fpath) + "/"
3837
4200
  data = {
@@ -3846,6 +4209,8 @@ def finfo(fpath):
3846
4209
  }
3847
4210
  extra_info = {}
3848
4211
  if data["kind"] == ".pdf":
4212
+ from pdf2image import pdfinfo_from_path
4213
+
3849
4214
  extra_info = pdfinfo_from_path(fpath)
3850
4215
 
3851
4216
  return FileInfo(
@@ -3862,16 +4227,6 @@ def finfo(fpath):
3862
4227
 
3863
4228
 
3864
4229
  # ! format excel file
3865
- import pandas as pd
3866
- from datetime import datetime
3867
- from openpyxl import load_workbook
3868
- from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
3869
- from openpyxl.utils import get_column_letter
3870
- from openpyxl.worksheet.datavalidation import DataValidation
3871
- from openpyxl.comments import Comment
3872
- from openpyxl.formatting.rule import ColorScaleRule
3873
-
3874
-
3875
4230
  def hex2argb(hex_color):
3876
4231
  """
3877
4232
  Convert a hex color code to the ARGB format required by openpyxl.
@@ -3907,337 +4262,6 @@ def hex2argb(hex_color):
3907
4262
  )
3908
4263
 
3909
4264
 
3910
- def convert_indices_to_range(row_slice, col_slice):
3911
- """Convert numerical row and column slices to Excel-style range strings."""
3912
- start_row = row_slice.start + 1
3913
- end_row = row_slice.stop if row_slice.stop is not None else None
3914
- start_col = col_slice.start + 1
3915
- end_col = col_slice.stop if col_slice.stop is not None else None
3916
-
3917
- start_col_letter = get_column_letter(start_col)
3918
- end_col_letter = get_column_letter(end_col) if end_col else None
3919
- return (
3920
- f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
3921
- if end_col_letter
3922
- else f"{start_col_letter}{start_row}"
3923
- )
3924
-
3925
-
3926
- def apply_format(ws, cell, cell_range):
3927
- """Apply cell formatting to a specified range."""
3928
- cell_font, cell_fill, cell_alignment, border = None, None, None, None
3929
- kws_cell = ["font", "fill", "alignment", "border"]
3930
- for K, _ in cell.items():
3931
- if strcmp(K, kws_cell)[0] == "font":
3932
- #! font
3933
- font_color = "000000"
3934
- font_name = "Arial"
3935
- font_underline = "none"
3936
- font_size = 14
3937
- font_bold = False
3938
- font_strike = False
3939
- font_italic = False
3940
- kws_font = [
3941
- "name",
3942
- "size",
3943
- "bold",
3944
- "underline",
3945
- "color",
3946
- "strike",
3947
- "italic",
3948
- ]
3949
- for k_, v_ in cell.get(K, {}).items():
3950
- if strcmp(k_, kws_font)[0] == "name":
3951
- font_name = v_
3952
- elif strcmp(k_, kws_font)[0] == "size":
3953
- font_size = v_
3954
- elif strcmp(k_, kws_font)[0] == "bold":
3955
- font_bold = v_
3956
- elif strcmp(k_, kws_font)[0] == "underline":
3957
- font_underline = strcmp(v_, ["none", "single", "double"])[0]
3958
- elif strcmp(k_, kws_font)[0] == "color":
3959
- font_color = hex2argb(v_)
3960
- elif strcmp(k_, kws_font)[0] == "strike":
3961
- font_strike = v_
3962
- elif strcmp(k_, kws_font)[0] == "italic":
3963
- font_italic = v_
3964
-
3965
- cell_font = Font(
3966
- name=font_name,
3967
- size=font_size,
3968
- bold=font_bold,
3969
- italic=font_italic,
3970
- underline=font_underline,
3971
- strike=font_strike,
3972
- color=font_color,
3973
- )
3974
-
3975
- if strcmp(K, kws_cell)[0] == "fill":
3976
- #! fill
3977
- kws_fill = ["start_color", "end_color", "fill_type", "color"]
3978
- kws_fill_type = [
3979
- "darkVertical",
3980
- "lightDown",
3981
- "lightGrid",
3982
- "solid",
3983
- "darkDown",
3984
- "lightGray",
3985
- "lightUp",
3986
- "gray0625",
3987
- "lightVertical",
3988
- "lightHorizontal",
3989
- "darkHorizontal",
3990
- "gray125",
3991
- "darkUp",
3992
- "mediumGray",
3993
- "darkTrellis",
3994
- "darkGray",
3995
- "lightTrellis",
3996
- "darkGrid",
3997
- ]
3998
- start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
3999
- for k, v in cell.get(K, {}).items():
4000
- if strcmp(k, kws_fill)[0] == "color":
4001
- start_color, end_color = hex2argb(v), hex2argb(v)
4002
- break
4003
- for k, v in cell.get(K, {}).items():
4004
- if strcmp(k, kws_fill)[0] == "start_color":
4005
- start_color = hex2argb(v)
4006
- elif strcmp(k, kws_fill)[0] == "end_color":
4007
- end_color = hex2argb(v)
4008
- elif strcmp(k, kws_fill)[0] == "fill_type":
4009
- fill_type = strcmp(v, kws_fill_type)[0]
4010
- cell_fill = PatternFill(
4011
- start_color=start_color,
4012
- end_color=end_color,
4013
- fill_type=fill_type,
4014
- )
4015
-
4016
- if strcmp(K, kws_cell)[0] == "alignment":
4017
- #! alignment
4018
- # default
4019
- align_horizontal = "general"
4020
- align_vertical = "center"
4021
- align_rot = 0
4022
- align_wrap = False
4023
- align_shrink = False
4024
- align_indent = 0
4025
- kws_align = [
4026
- "horizontal",
4027
- "ha",
4028
- "vertical",
4029
- "va",
4030
- "text_rotation",
4031
- "rotat",
4032
- "rot",
4033
- "wrap_text",
4034
- "wrap",
4035
- "shrink_to_fit",
4036
- "shrink",
4037
- "indent",
4038
- ]
4039
- for k, v in cell.get(K, {}).items():
4040
- if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
4041
- align_horizontal = strcmp(
4042
- v, ["general", "left", "right", "center"]
4043
- )[0]
4044
- elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
4045
- align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
4046
- elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
4047
- align_rot = v
4048
- elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
4049
- align_wrap = v
4050
- elif strcmp(k, kws_align)[0] in [
4051
- "shrink_to_fit",
4052
- "shrink",
4053
- "wrap_text",
4054
- "wrap",
4055
- ]:
4056
- align_shrink = v
4057
- elif strcmp(k, kws_align)[0] in ["indent"]:
4058
- align_indent = v
4059
- cell_alignment = Alignment(
4060
- horizontal=align_horizontal,
4061
- vertical=align_vertical,
4062
- text_rotation=align_rot,
4063
- wrap_text=align_wrap,
4064
- shrink_to_fit=align_shrink,
4065
- indent=align_indent,
4066
- )
4067
-
4068
- if strcmp(K, kws_cell)[0] == "border":
4069
- #! border
4070
- kws_border = [
4071
- "color_left",
4072
- "color_l",
4073
- "color_right",
4074
- "color_r",
4075
- "color_top",
4076
- "color_t",
4077
- "color_bottom",
4078
- "color_b",
4079
- "color_diagonal",
4080
- "color_d",
4081
- "color_outline",
4082
- "color_o",
4083
- "color_vertical",
4084
- "color_v",
4085
- "color_horizontal",
4086
- "color_h",
4087
- "color",
4088
- "style_left",
4089
- "style_l",
4090
- "style_right",
4091
- "style_r",
4092
- "style_top",
4093
- "style_t",
4094
- "style_bottom",
4095
- "style_b",
4096
- "style_diagonal",
4097
- "style_d",
4098
- "style_outline",
4099
- "style_o",
4100
- "style_vertical",
4101
- "style_v",
4102
- "style_horizontal",
4103
- "style_h",
4104
- "style",
4105
- ]
4106
- # * border color
4107
- border_color_l, border_color_r, border_color_t, border_color_b = (
4108
- "FF000000",
4109
- "FF000000",
4110
- "FF000000",
4111
- "FF000000",
4112
- )
4113
- border_color_d, border_color_o, border_color_v, border_color_h = (
4114
- "FF000000",
4115
- "FF000000",
4116
- "FF000000",
4117
- "FF000000",
4118
- )
4119
- # get colors config
4120
- for k, v in cell.get(K, {}).items():
4121
- if strcmp(k, kws_border)[0] in ["color"]:
4122
- border_color_all = hex2argb(v)
4123
- # 如果设置了color,表示其它的所有的都设置成为一样的
4124
- # 然后再才开始自己定义其它的color
4125
- border_color_l, border_color_r, border_color_t, border_color_b = (
4126
- border_color_all,
4127
- border_color_all,
4128
- border_color_all,
4129
- border_color_all,
4130
- )
4131
- border_color_d, border_color_o, border_color_v, border_color_h = (
4132
- border_color_all,
4133
- border_color_all,
4134
- border_color_all,
4135
- border_color_all,
4136
- )
4137
- elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
4138
- border_color_l = hex2argb(v)
4139
- elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
4140
- border_color_r = hex2argb(v)
4141
- elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
4142
- border_color_t = hex2argb(v)
4143
- elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
4144
- border_color_b = hex2argb(v)
4145
- elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
4146
- border_color_d = hex2argb(v)
4147
- elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
4148
- border_color_o = hex2argb(v)
4149
- elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
4150
- border_color_v = hex2argb(v)
4151
- elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4152
- border_color_h = hex2argb(v)
4153
- # *border style
4154
- border_styles = [
4155
- "thin",
4156
- "medium",
4157
- "thick",
4158
- "dotted",
4159
- "dashed",
4160
- "hair",
4161
- "mediumDashed",
4162
- "dashDot",
4163
- "dashDotDot",
4164
- "slantDashDot",
4165
- "none",
4166
- ]
4167
- border_style_l, border_style_r, border_style_t, border_style_b = (
4168
- None,
4169
- None,
4170
- None,
4171
- None,
4172
- )
4173
- border_style_d, border_style_o, border_style_v, border_style_h = (
4174
- None,
4175
- None,
4176
- None,
4177
- None,
4178
- )
4179
- # get styles config
4180
- for k, v in cell.get(K, {}).items():
4181
- # if not "style" in k:
4182
- # break
4183
- if strcmp(k, kws_border)[0] in ["style"]:
4184
- border_style_all = strcmp(v, border_styles)[0]
4185
- # 如果设置了style,表示其它的所有的都设置成为一样的
4186
- # 然后再才开始自己定义其它的style
4187
- border_style_l, border_style_r, border_style_t, border_style_b = (
4188
- border_style_all,
4189
- border_style_all,
4190
- border_style_all,
4191
- border_style_all,
4192
- )
4193
- border_style_d, border_style_o, border_style_v, border_style_h = (
4194
- border_style_all,
4195
- border_style_all,
4196
- border_style_all,
4197
- border_style_all,
4198
- )
4199
- elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
4200
- border_style_l = strcmp(v, border_styles)[0]
4201
- elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
4202
- border_style_r = strcmp(v, border_styles)[0]
4203
- elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
4204
- border_style_t = strcmp(v, border_styles)[0]
4205
- elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
4206
- border_style_b = strcmp(v, border_styles)[0]
4207
- elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
4208
- border_style_d = strcmp(v, border_styles)[0]
4209
- elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
4210
- border_style_o = strcmp(v, border_styles)[0]
4211
- elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
4212
- border_style_v = strcmp(v, border_styles)[0]
4213
- elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
4214
- border_style_h = strcmp(v, border_styles)[0]
4215
- # * apply border config
4216
- border = Border(
4217
- left=Side(border_style=border_style_l, color=border_color_l),
4218
- right=Side(border_style=border_style_r, color=border_color_r),
4219
- top=Side(border_style=border_style_t, color=border_color_t),
4220
- bottom=Side(border_style=border_style_b, color=border_color_b),
4221
- diagonal=Side(border_style=border_style_d, color=border_color_d),
4222
- diagonal_direction=0,
4223
- outline=Side(border_style=border_style_o, color=border_color_o),
4224
- vertical=Side(border_style=border_style_v, color=border_color_v),
4225
- horizontal=Side(border_style=border_style_h, color=border_color_h),
4226
- )
4227
-
4228
- #! final apply configs
4229
- for row in ws[cell_range]:
4230
- for cell_ in row:
4231
- if cell_font:
4232
- cell_.font = cell_font
4233
- if cell_fill:
4234
- cell_.fill = cell_fill
4235
- if cell_alignment:
4236
- cell_.alignment = cell_alignment
4237
- if border:
4238
- cell_.border = border
4239
-
4240
-
4241
4265
  def format_excel(
4242
4266
  df=None,
4243
4267
  filename=None,
@@ -4257,6 +4281,368 @@ def format_excel(
4257
4281
  conditional_format=None, # dict
4258
4282
  **kwargs,
4259
4283
  ):
4284
+ import pandas as pd
4285
+ from datetime import datetime
4286
+ from openpyxl import load_workbook
4287
+ from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
4288
+ from openpyxl.utils import get_column_letter
4289
+ from openpyxl.worksheet.datavalidation import DataValidation
4290
+ from openpyxl.comments import Comment
4291
+ from openpyxl.formatting.rule import ColorScaleRule
4292
+
4293
+ def convert_indices_to_range(row_slice, col_slice):
4294
+ """Convert numerical row and column slices to Excel-style range strings."""
4295
+ start_row = row_slice.start + 1
4296
+ end_row = row_slice.stop if row_slice.stop is not None else None
4297
+ start_col = col_slice.start + 1
4298
+ end_col = col_slice.stop if col_slice.stop is not None else None
4299
+
4300
+ start_col_letter = get_column_letter(start_col)
4301
+ end_col_letter = get_column_letter(end_col) if end_col else None
4302
+ return (
4303
+ f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
4304
+ if end_col_letter
4305
+ else f"{start_col_letter}{start_row}"
4306
+ )
4307
+
4308
+ def apply_format(ws, cell, cell_range):
4309
+ """Apply cell formatting to a specified range."""
4310
+ cell_font, cell_fill, cell_alignment, border = None, None, None, None
4311
+ kws_cell = ["font", "fill", "alignment", "border"]
4312
+ for K, _ in cell.items():
4313
+ if strcmp(K, kws_cell)[0] == "font":
4314
+ #! font
4315
+ font_color = "000000"
4316
+ font_name = "Arial"
4317
+ font_underline = "none"
4318
+ font_size = 14
4319
+ font_bold = False
4320
+ font_strike = False
4321
+ font_italic = False
4322
+ kws_font = [
4323
+ "name",
4324
+ "size",
4325
+ "bold",
4326
+ "underline",
4327
+ "color",
4328
+ "strike",
4329
+ "italic",
4330
+ ]
4331
+ for k_, v_ in cell.get(K, {}).items():
4332
+ if strcmp(k_, kws_font)[0] == "name":
4333
+ font_name = v_
4334
+ elif strcmp(k_, kws_font)[0] == "size":
4335
+ font_size = v_
4336
+ elif strcmp(k_, kws_font)[0] == "bold":
4337
+ font_bold = v_
4338
+ elif strcmp(k_, kws_font)[0] == "underline":
4339
+ font_underline = strcmp(v_, ["none", "single", "double"])[0]
4340
+ elif strcmp(k_, kws_font)[0] == "color":
4341
+ font_color = hex2argb(v_)
4342
+ elif strcmp(k_, kws_font)[0] == "strike":
4343
+ font_strike = v_
4344
+ elif strcmp(k_, kws_font)[0] == "italic":
4345
+ font_italic = v_
4346
+
4347
+ cell_font = Font(
4348
+ name=font_name,
4349
+ size=font_size,
4350
+ bold=font_bold,
4351
+ italic=font_italic,
4352
+ underline=font_underline,
4353
+ strike=font_strike,
4354
+ color=font_color,
4355
+ )
4356
+
4357
+ if strcmp(K, kws_cell)[0] == "fill":
4358
+ #! fill
4359
+ kws_fill = ["start_color", "end_color", "fill_type", "color"]
4360
+ kws_fill_type = [
4361
+ "darkVertical",
4362
+ "lightDown",
4363
+ "lightGrid",
4364
+ "solid",
4365
+ "darkDown",
4366
+ "lightGray",
4367
+ "lightUp",
4368
+ "gray0625",
4369
+ "lightVertical",
4370
+ "lightHorizontal",
4371
+ "darkHorizontal",
4372
+ "gray125",
4373
+ "darkUp",
4374
+ "mediumGray",
4375
+ "darkTrellis",
4376
+ "darkGray",
4377
+ "lightTrellis",
4378
+ "darkGrid",
4379
+ ]
4380
+ start_color, end_color, fill_type = (
4381
+ "FFFFFF",
4382
+ "FFFFFF",
4383
+ "solid",
4384
+ ) # default
4385
+ for k, v in cell.get(K, {}).items():
4386
+ if strcmp(k, kws_fill)[0] == "color":
4387
+ start_color, end_color = hex2argb(v), hex2argb(v)
4388
+ break
4389
+ for k, v in cell.get(K, {}).items():
4390
+ if strcmp(k, kws_fill)[0] == "start_color":
4391
+ start_color = hex2argb(v)
4392
+ elif strcmp(k, kws_fill)[0] == "end_color":
4393
+ end_color = hex2argb(v)
4394
+ elif strcmp(k, kws_fill)[0] == "fill_type":
4395
+ fill_type = strcmp(v, kws_fill_type)[0]
4396
+ cell_fill = PatternFill(
4397
+ start_color=start_color,
4398
+ end_color=end_color,
4399
+ fill_type=fill_type,
4400
+ )
4401
+
4402
+ if strcmp(K, kws_cell)[0] == "alignment":
4403
+ #! alignment
4404
+ # default
4405
+ align_horizontal = "general"
4406
+ align_vertical = "center"
4407
+ align_rot = 0
4408
+ align_wrap = False
4409
+ align_shrink = False
4410
+ align_indent = 0
4411
+ kws_align = [
4412
+ "horizontal",
4413
+ "ha",
4414
+ "vertical",
4415
+ "va",
4416
+ "text_rotation",
4417
+ "rotat",
4418
+ "rot",
4419
+ "wrap_text",
4420
+ "wrap",
4421
+ "shrink_to_fit",
4422
+ "shrink",
4423
+ "indent",
4424
+ ]
4425
+ for k, v in cell.get(K, {}).items():
4426
+ if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
4427
+ align_horizontal = strcmp(
4428
+ v, ["general", "left", "right", "center"]
4429
+ )[0]
4430
+ elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
4431
+ align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
4432
+ elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
4433
+ align_rot = v
4434
+ elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
4435
+ align_wrap = v
4436
+ elif strcmp(k, kws_align)[0] in [
4437
+ "shrink_to_fit",
4438
+ "shrink",
4439
+ "wrap_text",
4440
+ "wrap",
4441
+ ]:
4442
+ align_shrink = v
4443
+ elif strcmp(k, kws_align)[0] in ["indent"]:
4444
+ align_indent = v
4445
+ cell_alignment = Alignment(
4446
+ horizontal=align_horizontal,
4447
+ vertical=align_vertical,
4448
+ text_rotation=align_rot,
4449
+ wrap_text=align_wrap,
4450
+ shrink_to_fit=align_shrink,
4451
+ indent=align_indent,
4452
+ )
4453
+
4454
+ if strcmp(K, kws_cell)[0] == "border":
4455
+ #! border
4456
+ kws_border = [
4457
+ "color_left",
4458
+ "color_l",
4459
+ "color_right",
4460
+ "color_r",
4461
+ "color_top",
4462
+ "color_t",
4463
+ "color_bottom",
4464
+ "color_b",
4465
+ "color_diagonal",
4466
+ "color_d",
4467
+ "color_outline",
4468
+ "color_o",
4469
+ "color_vertical",
4470
+ "color_v",
4471
+ "color_horizontal",
4472
+ "color_h",
4473
+ "color",
4474
+ "style_left",
4475
+ "style_l",
4476
+ "style_right",
4477
+ "style_r",
4478
+ "style_top",
4479
+ "style_t",
4480
+ "style_bottom",
4481
+ "style_b",
4482
+ "style_diagonal",
4483
+ "style_d",
4484
+ "style_outline",
4485
+ "style_o",
4486
+ "style_vertical",
4487
+ "style_v",
4488
+ "style_horizontal",
4489
+ "style_h",
4490
+ "style",
4491
+ ]
4492
+ # * border color
4493
+ border_color_l, border_color_r, border_color_t, border_color_b = (
4494
+ "FF000000",
4495
+ "FF000000",
4496
+ "FF000000",
4497
+ "FF000000",
4498
+ )
4499
+ border_color_d, border_color_o, border_color_v, border_color_h = (
4500
+ "FF000000",
4501
+ "FF000000",
4502
+ "FF000000",
4503
+ "FF000000",
4504
+ )
4505
+ # get colors config
4506
+ for k, v in cell.get(K, {}).items():
4507
+ if strcmp(k, kws_border)[0] in ["color"]:
4508
+ border_color_all = hex2argb(v)
4509
+ # if "color" is given, first apply the same color to every side
4510
+ # the per-side color keys below can then override it
4511
+ (
4512
+ border_color_l,
4513
+ border_color_r,
4514
+ border_color_t,
4515
+ border_color_b,
4516
+ ) = (
4517
+ border_color_all,
4518
+ border_color_all,
4519
+ border_color_all,
4520
+ border_color_all,
4521
+ )
4522
+ (
4523
+ border_color_d,
4524
+ border_color_o,
4525
+ border_color_v,
4526
+ border_color_h,
4527
+ ) = (
4528
+ border_color_all,
4529
+ border_color_all,
4530
+ border_color_all,
4531
+ border_color_all,
4532
+ )
4533
+ elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
4534
+ border_color_l = hex2argb(v)
4535
+ elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
4536
+ border_color_r = hex2argb(v)
4537
+ elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
4538
+ border_color_t = hex2argb(v)
4539
+ elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
4540
+ border_color_b = hex2argb(v)
4541
+ elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
4542
+ border_color_d = hex2argb(v)
4543
+ elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
4544
+ border_color_o = hex2argb(v)
4545
+ elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
4546
+ border_color_v = hex2argb(v)
4547
+ elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4548
+ border_color_h = hex2argb(v)
4549
+ # *border style
4550
+ border_styles = [
4551
+ "thin",
4552
+ "medium",
4553
+ "thick",
4554
+ "dotted",
4555
+ "dashed",
4556
+ "hair",
4557
+ "mediumDashed",
4558
+ "dashDot",
4559
+ "dashDotDot",
4560
+ "slantDashDot",
4561
+ "none",
4562
+ ]
4563
+ border_style_l, border_style_r, border_style_t, border_style_b = (
4564
+ None,
4565
+ None,
4566
+ None,
4567
+ None,
4568
+ )
4569
+ border_style_d, border_style_o, border_style_v, border_style_h = (
4570
+ None,
4571
+ None,
4572
+ None,
4573
+ None,
4574
+ )
4575
+ # get styles config
4576
+ for k, v in cell.get(K, {}).items():
4577
+ # if not "style" in k:
4578
+ # break
4579
+ if strcmp(k, kws_border)[0] in ["style"]:
4580
+ border_style_all = strcmp(v, border_styles)[0]
4581
+ # if "style" is given, first apply the same style to every side
4582
+ # the per-side style keys below can then override it
4583
+ (
4584
+ border_style_l,
4585
+ border_style_r,
4586
+ border_style_t,
4587
+ border_style_b,
4588
+ ) = (
4589
+ border_style_all,
4590
+ border_style_all,
4591
+ border_style_all,
4592
+ border_style_all,
4593
+ )
4594
+ (
4595
+ border_style_d,
4596
+ border_style_o,
4597
+ border_style_v,
4598
+ border_style_h,
4599
+ ) = (
4600
+ border_style_all,
4601
+ border_style_all,
4602
+ border_style_all,
4603
+ border_style_all,
4604
+ )
4605
+ elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
4606
+ border_style_l = strcmp(v, border_styles)[0]
4607
+ elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
4608
+ border_style_r = strcmp(v, border_styles)[0]
4609
+ elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
4610
+ border_style_t = strcmp(v, border_styles)[0]
4611
+ elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
4612
+ border_style_b = strcmp(v, border_styles)[0]
4613
+ elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
4614
+ border_style_d = strcmp(v, border_styles)[0]
4615
+ elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
4616
+ border_style_o = strcmp(v, border_styles)[0]
4617
+ elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
4618
+ border_style_v = strcmp(v, border_styles)[0]
4619
+ elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
4620
+ border_style_h = strcmp(v, border_styles)[0]
4621
+ # * apply border config
4622
+ border = Border(
4623
+ left=Side(border_style=border_style_l, color=border_color_l),
4624
+ right=Side(border_style=border_style_r, color=border_color_r),
4625
+ top=Side(border_style=border_style_t, color=border_color_t),
4626
+ bottom=Side(border_style=border_style_b, color=border_color_b),
4627
+ diagonal=Side(border_style=border_style_d, color=border_color_d),
4628
+ diagonal_direction=0,
4629
+ outline=Side(border_style=border_style_o, color=border_color_o),
4630
+ vertical=Side(border_style=border_style_v, color=border_color_v),
4631
+ horizontal=Side(border_style=border_style_h, color=border_color_h),
4632
+ )
4633
+
4634
+ #! final apply configs
4635
+ for row in ws[cell_range]:
4636
+ for cell_ in row:
4637
+ if cell_font:
4638
+ cell_.font = cell_font
4639
+ if cell_fill:
4640
+ cell_.fill = cell_fill
4641
+ if cell_alignment:
4642
+ cell_.alignment = cell_alignment
4643
+ if border:
4644
+ cell_.border = border
4645
+
4260
4646
  if not isinstance(df, pd.DataFrame):
4261
4647
  try:
4262
4648
  print(f"is loading file {os.path.basename(df)}")
@@ -4602,11 +4988,10 @@ format_excel(
4602
4988
  print(f"Formatted Excel file saved as:\n{filename}")
4603
4989
 
4604
4990
 
4605
- from IPython.display import display, HTML, Markdown
4606
-
4607
-
4608
4991
  def preview(var):
4609
4992
  """Master function to preview formatted variables in Jupyter."""
4993
+ from bs4 import BeautifulSoup
4994
+ from IPython.display import display, HTML, Markdown
4610
4995
 
4611
4996
  if isinstance(var, str):
4612
4997
  if isa(var, "html"):
@@ -4624,6 +5009,8 @@ def preview(var):
4624
5009
  display(var)
4625
5010
 
4626
5011
  elif isinstance(var, list) or isinstance(var, dict):
5012
+ import json
5013
+
4627
5014
  # Display JSON
4628
5015
  json_str = json.dumps(var, indent=4)
4629
5016
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4637,6 +5024,8 @@ def preview(var):
4637
5024
  display(Image(filename=var))
4638
5025
 
4639
5026
  elif isinstance(var, dict):
5027
+ import json
5028
+
4640
5029
  # Handle dictionary formatting
4641
5030
  json_str = json.dumps(var, indent=4)
4642
5031
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4651,48 +5040,194 @@ def preview(var):
4651
5040
  # preview("# This is a Markdown header")
4652
5041
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4653
5042
  # preview({"key": "value", "numbers": [1, 2, 3]})
5043
+
5044
+
5045
+ def _df_outlier(
5046
+ data,
5047
+ columns=None,
5048
+ method=["zscore", "iqr", "percentile", "iforest"],
5049
+ min_outlier_method=3, # an outlier must be flagged by at least this many methods
5050
+ zscore_threshold=3,
5051
+ iqr_threshold=1.5,
5052
+ lower_percentile=5,
5053
+ upper_percentile=95,
5054
+ ):
5055
+ from scipy.stats import zscore
5056
+ from sklearn.ensemble import IsolationForest
5057
+ from sklearn.preprocessing import StandardScaler
5058
+
5059
+ col_names_org = data.columns.tolist()
5060
+ index_names_org = data.index.tolist()
5061
+ # Separate numeric and non-numeric columns
5062
+ numeric_data = data.select_dtypes(include=[np.number])
5063
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
5064
+
5065
+ if columns is not None:
5066
+ numeric_data = numeric_data[columns]
5067
+ elif numeric_data.empty:
5068
+ raise ValueError("Input data must contain numeric columns.")
5069
+
5070
+ outliers_df = pd.DataFrame(index=numeric_data.index)
5071
+ if isinstance(method, str):
5072
+ method = [method]
5073
+
5074
+ # Z-score method
5075
+ if "zscore" in method:
5076
+ z_scores = np.abs(zscore(numeric_data))
5077
+ outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
5078
+
5079
+ # IQR method
5080
+ if "iqr" in method:
5081
+ Q1 = numeric_data.quantile(0.25)
5082
+ Q3 = numeric_data.quantile(0.75)
5083
+ IQR = Q3 - Q1
5084
+ lower_bound = Q1 - iqr_threshold * IQR
5085
+ upper_bound = Q3 + iqr_threshold * IQR
5086
+ outliers_df["iqr"] = (
5087
+ (numeric_data < lower_bound) | (numeric_data > upper_bound)
5088
+ ).any(axis=1)
5089
+
5090
+ # Percentile method
5091
+ if "percentile" in method:
5092
+ lower_bound = numeric_data.quantile(lower_percentile / 100)
5093
+ upper_bound = numeric_data.quantile(upper_percentile / 100)
5094
+ outliers_df["percentile"] = (
5095
+ (numeric_data < lower_bound) | (numeric_data > upper_bound)
5096
+ ).any(axis=1)
5097
+
5098
+ # Isolation Forest method
5099
+ if "iforest" in method:
5100
+ # iforest cannot handle NaNs, so fill them with the column mean first
5101
+ numeric_data_ = numeric_data.fillna(numeric_data.mean())
5102
+ scaler = StandardScaler()
5103
+ scaled_data = scaler.fit_transform(numeric_data_)
5104
+ iso_forest = IsolationForest(contamination=0.05)
5105
+ outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
5106
+
5107
+ # Combine all outlier detections
5108
+ if len(method) == 4: # all method are used:
5109
+ outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
5110
+ else:
5111
+ outliers_df["outlier"] = outliers_df.any(axis=1)
5112
+
5113
+ # Handling Outliers: Remove or Winsorize or Replace with NaN
5114
+ processed_data = numeric_data.copy()
5115
+
5116
+ processed_data.loc[outliers_df["outlier"]] = np.nan
5117
+
5118
+ return processed_data
5119
+
5120
+
5121
+ def df_outlier(
5122
+ data,
5123
+ columns=None,
5124
+ method=["zscore", "iqr", "percentile", "iforest"],
5125
+ min_outlier_method=2, # an outlier must be flagged by at least this many methods
5126
+ zscore_threshold=3,
5127
+ iqr_threshold=1.5,
5128
+ lower_percentile=5,
5129
+ upper_percentile=95,
5130
+ ):
5131
+ """
5132
+ Usage:
5133
+ data_out = df_outlier(
5134
+ data,
5135
+ columns=["income"],
5136
+ method="iforest",
5137
+ min_outlier_method=1)
5138
+
5139
+ Advanced outlier detection and handling function.
5140
+
5141
+ Parameters:
5142
+ - data: DataFrame, the input data (numerical).
5143
+ - method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
5144
+ - zscore_threshold: float, threshold for Z-score outlier detection (default 3).
5145
+ - iqr_threshold: float, threshold for IQR method (default 1.5).
5146
+ - lower_percentile: float, lower percentile for percentile-based outliers (default 5).
5147
+ - upper_percentile: float, upper percentile for percentile-based outliers (default 95).
5148
+ - keep_nan: bool, whether to replace outliers with NaN (default True).
5149
+ - plot: bool, whether to visualize the outliers (default False).
5150
+ - min_outlier_method: int, minimum number of methods that must flag a row as an outlier (default 2).
5151
+ - inplace: bool, whether to modify the original `data` DataFrame (default False).
5152
+
5153
+ Returns:
5154
+ - processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
5155
+ """
5156
+ col_names_org = data.columns.tolist()
5157
+ index_names_org = data.index.tolist()
5158
+
5159
+ numeric_data = data.select_dtypes(include=[np.number])
5160
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
5161
+
5162
+ _outlier_df_tmp = pd.DataFrame()
5163
+ for col in numeric_data.columns:
5164
+ _outlier_df_tmp = pd.concat(
5165
+ [
5166
+ _outlier_df_tmp,
5167
+ _df_outlier(
5168
+ data=data,
5169
+ columns=[col],
5170
+ method=method,
5171
+ min_outlier_method=min_outlier_method, # an outlier must be flagged by at least this many methods
5172
+ zscore_threshold=zscore_threshold,
5173
+ iqr_threshold=iqr_threshold,
5174
+ lower_percentile=lower_percentile,
5175
+ upper_percentile=upper_percentile,
5176
+ ),
5177
+ ],
5178
+ axis=1,
5179
+ # join="inner",
5180
+ )
5181
+ processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
5182
+ processed_data = processed_data[col_names_org]
5183
+ return processed_data
5184
+
5185
+
4654
5186
  def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
4655
5187
  """
4656
5188
  Extend a DataFrame by the list elements in the column.
4657
-
5189
+
4658
5190
  Parameters:
4659
5191
  ----------
4660
5192
  data : pd.DataFrame
4661
5193
  The input DataFrame to be extended.
4662
-
5194
+
4663
5195
  column : str
4664
5196
  The name of the column to be split.
4665
-
5197
+
4666
5198
  axis : int, optional
4667
- The axis along which to expand the DataFrame.
5199
+ The axis along which to expand the DataFrame.
4668
5200
  - 0 (default): Expand the specified column into multiple rows.
4669
5201
  - 1: Expand the specified column into multiple columns.
4670
-
5202
+
4671
5203
  sep : str, optional
4672
5204
  The separator used to split the values in the specified column.
4673
5205
  Must be provided for the function to work correctly.
4674
5206
  """
4675
-
4676
- data = data.copy()
5207
+
5208
+ data = data.copy()
4677
5209
  mask = data[column].str.contains(sep, na=False)
4678
5210
  data = data.copy()
4679
5211
  if mask.any():
4680
- data[column] = (
4681
- data[column]
4682
- .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
4683
- )
4684
-
5212
+ data[column] = data[column].apply(
5213
+ lambda x: x.split(sep) if isinstance(x, str) else x
5214
+ ) # Only split if x is a string
5215
+
4685
5216
  # Strip spaces from each item in the lists
4686
- data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
4687
-
5217
+ data[column] = data[column].apply(
5218
+ lambda x: [item.strip() for item in x] if isinstance(x, list) else x
5219
+ )
5220
+
4688
5221
  data = data.explode(column, ignore_index=True)
4689
5222
  return data
5223
+
5224
+
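Concretely, cells containing the separator are split, stripped and exploded into one row per item:

import pandas as pd
from py2ls.ips import df_extend  # assumed import path

df = pd.DataFrame({"paper": ["p1", "p2"], "tags": ["ml; nlp", "stats"]})
df_extend(df, column="tags", sep=";")
#   paper   tags
# 0    p1     ml
# 1    p1    nlp
# 2    p2  stats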
4690
5225
  # ! DataFrame
4691
5226
  def df_astype(
4692
5227
  data: pd.DataFrame,
4693
5228
  columns: Optional[Union[str, List[str]]] = None,
4694
5229
  astype: str = "datetime",
4695
- skip_row:Union[str,list]=None,
5230
+ skip_row: Union[str, list] = None,
4696
5231
  fmt: Optional[str] = None,
4697
5232
  inplace: bool = True,
4698
5233
  errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
@@ -4750,7 +5285,8 @@ def df_astype(
4750
5285
  "second",
4751
5286
  "time",
4752
5287
  "week",
4753
- "date","day",
5288
+ "date",
5289
+ "day",
4754
5290
  "month",
4755
5291
  "year",
4756
5292
  ]
@@ -4758,18 +5294,18 @@ def df_astype(
4758
5294
  if not inplace:
4759
5295
  data = data.copy()
4760
5296
  if skip_row is not None:
4761
- data = data.drop(index=skip_row, errors='ignore')
5297
+ data = data.drop(index=skip_row, errors="ignore")
4762
5298
  # If columns is None, apply to all columns
4763
5299
  if columns is None:
4764
5300
  columns = data.columns.tolist()
4765
5301
  # correct the astype input
4766
- if isinstance(astype,str):
5302
+ if isinstance(astype, str):
4767
5303
  astype = strcmp(astype, astypes)[0]
4768
5304
  print(f"converting as type: {astype}")
4769
- elif isinstance(astype,dict):
5305
+ elif isinstance(astype, dict):
4770
5306
  for col, dtype in astype.items():
4771
- dtype='date' if dtype=="day" else dtype
4772
- data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
5307
+ dtype = "date" if dtype == "day" else dtype
5308
+ data["col"] = data["col"].adtype(strcmp(dtype, astypes)[0])
4773
5309
  return data if not inplace else None
4774
5310
 
4775
5311
  # Ensure columns is a list
@@ -4880,13 +5416,15 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
4880
5416
  if column not in data.columns:
4881
5417
  raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
4882
5418
 
4883
- if isinstance(by, str) and 'count' in by.lower():
5419
+ if isinstance(by, str) and "count" in by.lower():
4884
5420
  # Count occurrences of each value in the specified column
4885
5421
  value_counts = df[column].value_counts()
4886
5422
 
4887
5423
  # Determine the order based on counts
4888
5424
  count_ascending = kwargs.pop("count_ascending", ascending)
4889
- sorted_counts = value_counts.sort_values(ascending=count_ascending).index.tolist()
5425
+ sorted_counts = value_counts.sort_values(
5426
+ ascending=count_ascending
5427
+ ).index.tolist()
4890
5428
 
4891
5429
  # Convert to a categorical type with the new order
4892
5430
  df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
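With by='count' the column is first turned into an ordered categorical whose categories follow the value counts; a brief sketch, assuming the import path:

import pandas as pd
from py2ls.ips import df_sort_values  # assumed import path

df = pd.DataFrame({"city": ["Bonn", "Mainz", "Bonn", "Bonn", "Mainz", "Trier"]})
df_sort_values(df, "city", by="count", ascending=False)  # intended: rows ordered by how frequent each city is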
@@ -5004,6 +5542,7 @@ def df_merge(
5004
5542
  )
5005
5543
  return df_merged
5006
5544
 
5545
+
5007
5546
  def df_drop_duplicates(
5008
5547
  data: pd.DataFrame,
5009
5548
  by: Union[
@@ -5012,16 +5551,16 @@ def df_drop_duplicates(
5012
5551
  keep="first", # Options: 'first', 'last', or False (drop all duplicates)
5013
5552
  ignore_index=True,
5014
5553
  inplace: bool = False,
5015
- verbose=True
5554
+ verbose=True,
5016
5555
  ):
5017
5556
  """
5018
5557
  data (pd.DataFrame): DataFrame to drop duplicates from.
5019
5558
  by (str): Specify by to drop duplicates:
5020
5559
  - 'index': Drop duplicates based on the DataFrame index.
5021
5560
  - Column name(s) for row-wise duplicate checking.
5022
- keep (str): Which duplicates to keep:
5023
- 'first',
5024
- 'last',
5561
+ keep (str): Which duplicates to keep:
5562
+ 'first',
5563
+ 'last',
5025
5564
  False (drop all duplicates).
5026
5565
  inplace (bool): Whether to modify the original DataFrame in place.
5027
5566
  """
@@ -5031,8 +5570,8 @@ def df_drop_duplicates(
5031
5570
  result = data[~data.index.duplicated(keep=keep)]
5032
5571
  else:
5033
5572
  # Drop duplicates row-wise based on column(s)
5034
- result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
5035
- if original_shape!=result.shape or verbose:
5573
+ result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
5574
+ if original_shape != result.shape or verbose:
5036
5575
  print(f"\nshape:{original_shape} (before drop_duplicates)")
5037
5576
  print(f"shape:{result.shape} (after drop_duplicates)")
5038
5577
  if inplace:
@@ -5042,15 +5581,18 @@ def df_drop_duplicates(
5042
5581
  return None
5043
5582
  else:
5044
5583
  return result
5584
+
5585
+
5586
+ #! fillna()
5045
5587
  def df_fillna(
5046
5588
  data: pd.DataFrame,
5047
5589
  method: str = "knn",
5048
- axis: int = 0,# column-wise
5590
+ axis: int = 0, # column-wise
5049
5591
  constant: float = None,
5050
5592
  n_neighbors: int = 5, # KNN-specific
5051
- max_iter: int = 10, # Iterative methods specific
5052
- inplace: bool = True,
5053
- random_state:int = None
5593
+ max_iter: int = 10, # Iterative methods specific
5594
+ inplace: bool = False,
5595
+ random_state: int = 1,
5054
5596
  ) -> pd.DataFrame:
5055
5597
  """
5056
5598
  Fill missing values in a DataFrame using specified imputation method.
@@ -5066,11 +5608,11 @@ def df_fillna(
5066
5608
  - 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
5067
5609
  - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
5068
5610
  # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
5069
- # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
5611
+ # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
5070
5612
  # reconstructing the data matrix using low-rank approximation
5071
5613
  # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
5072
5614
  # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
5073
-
5615
+
5074
5616
  axis (int): The axis along which to impute:
5075
5617
  - 0: Impute column-wise (default).
5076
5618
  - 1: Impute row-wise.
@@ -5078,13 +5620,30 @@ def df_fillna(
5078
5620
  inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
5079
5621
 
5080
5622
  """
5623
+ if isinstance(data, pd.Series):
5624
+ data = pd.DataFrame(data)
5625
+ # handle None
5626
+ for col in data.columns:
5627
+ data[col] = data[col].apply(lambda x: np.nan if x is None else x)
5628
+
5629
+ col_names_org = data.columns.tolist()
5630
+ index_names_org = data.index.tolist()
5631
+ # Separate numeric and non-numeric columns
5632
+ numeric_data = data.select_dtypes(include=[np.number])
5633
+ non_numeric_data = data.select_dtypes(exclude=[np.number])
5081
5634
 
5082
5635
  if data.empty:
5083
5636
  raise ValueError("Input DataFrame is empty.")
5084
5637
 
5085
5638
  # Validate method
5086
- methods = ["mean", "median", "most_frequent",
5087
- "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
5639
+ methods = [
5640
+ "mean",
5641
+ "median",
5642
+ "most_frequent",
5643
+ "constant",
5644
+ "knn",
5645
+ "iterative",
5646
+ ] # ,"missforest","softimpute","svd"]
5088
5647
  method = strcmp(method, methods)[0]
5089
5648
 
5090
5649
  # If using constant method, ask for a constant value
@@ -5098,51 +5657,76 @@ def df_fillna(
5098
5657
  # Initialize SimpleImputer with the chosen method
5099
5658
  if method == "constant":
5100
5659
  from sklearn.impute import SimpleImputer
5660
+
5101
5661
  imputer = SimpleImputer(strategy=method, fill_value=constant)
5102
5662
  elif method == "knn":
5103
5663
  from sklearn.impute import KNNImputer
5664
+
5104
5665
  imputer = KNNImputer(n_neighbors=n_neighbors)
5105
5666
  elif method == "iterative" or method == "mice":
5106
5667
  from sklearn.experimental import enable_iterative_imputer
5107
5668
  from sklearn.impute import IterativeImputer
5108
5669
 
5109
- imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5110
- # elif method == "missforest":
5111
- # from missingpy import MissForest
5112
- # imputer = MissForest(max_iter=max_iter, random_state=random_state)
5113
- # elif method == "softimpute":
5114
- # from fancyimpute import SoftImpute
5115
- # imputer = SoftImpute()
5116
- # elif method == "svd":
5117
- # from fancyimpute import IterativeSVD
5118
- # imputer = IterativeSVD(max_iters=max_iter)
5119
- else: # mean, median, most_frequent
5670
+ imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5671
+ else: # mean, median, most_frequent
5120
5672
  from sklearn.impute import SimpleImputer
5673
+
5121
5674
  imputer = SimpleImputer(strategy=method)
5122
5675
 
5123
5676
  # Fit and transform the data
5124
5677
  if axis == 0:
5125
5678
  # Impute column-wise
5126
- imputed_data = imputer.fit_transform(data)
5127
- imputed_data.shape
5679
+ imputed_data = imputer.fit_transform(numeric_data)
5128
5680
  elif axis == 1:
5129
5681
  # Impute row-wise
5130
- imputed_data = imputer.fit_transform(data.T)
5131
- imputed_data.shape
5682
+ imputed_data = imputer.fit_transform(numeric_data.T)
5132
5683
  else:
5133
5684
  raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
5134
5685
 
5135
- df_filled = pd.DataFrame(
5686
+ imputed_data = pd.DataFrame(
5136
5687
  imputed_data if axis == 0 else imputed_data.T,
5137
- index=data.index,# if axis == 0 else data.columns,
5138
- columns=data.columns,# if axis == 0 else data.index,
5688
+ index=numeric_data.index if axis == 0 else data.columns,
5689
+ columns=numeric_data.columns if axis == 0 else data.index,
5690
+ )
5691
+ for col in imputed_data.select_dtypes(include=[np.number]).columns:
5692
+ imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
5693
+
5694
+ # Handle non-numeric data imputation
5695
+ if not non_numeric_data.empty:
5696
+ from sklearn.impute import SimpleImputer
5697
+
5698
+ if method == "constant":
5699
+ non_numeric_imputer = SimpleImputer(
5700
+ strategy="constant", fill_value=constant
5701
+ )
5702
+ else:
5703
+ non_numeric_imputer = SimpleImputer(strategy="most_frequent")
5704
+
5705
+ # Impute non-numeric columns column-wise (axis=0)
5706
+ imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
5707
+
5708
+ # Convert imputed non-numeric array back to DataFrame with original index and column names
5709
+ imputed_non_numeric_df = pd.DataFrame(
5710
+ imputed_non_numeric,
5711
+ index=non_numeric_data.index,
5712
+ columns=non_numeric_data.columns,
5713
+ )
5714
+ else:
5715
+ imputed_non_numeric_df = pd.DataFrame(index=data.index)
5716
+
5717
+ imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
5718
+ columns=data.columns
5139
5719
  )
5140
5720
 
5141
5721
  if inplace:
5142
- data.update(df_filled)
5143
- return None # replace original
5722
+ # Modify the original DataFrame
5723
+ data[:] = imputed_data[col_names_org]
5724
+ return None
5144
5725
  else:
5145
- return df_filled
5726
+ # Return the modified DataFrame
5727
+ return imputed_data[col_names_org]
5728
+
5729
+
5146
5730
  # # example
5147
5731
  # data = {
5148
5732
  # "A": [1, 2, np.nan, 4, 5],
@@ -5172,9 +5756,100 @@ def df_fillna(
5172
5756
  # display(df)
5173
5757
  # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
5174
5758
 
5175
-
5759
+
5760
+ def df_encoder(
5761
+ data: pd.DataFrame,
5762
+ method: str = "dummy", #'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
5763
+ columns=None,
5764
+ target_column=None, # Required for 'target' encoding method
5765
+ **kwargs,
5766
+ ) -> pd.DataFrame:
5767
+ """
5768
+ Methods explained:
5769
+ - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
5770
+
5771
+ - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
5772
+
5773
+ - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
5774
+
5775
+ - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
5776
+
5777
+ - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
5778
+
5779
+ - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
5780
+ """
5781
+
5782
+ # Select categorical columns
5783
+ categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
5784
+ methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
5785
+ method = strcmp(method, methods)[0]
5786
+
5787
+ if columns is None:
5788
+ columns = categorical_cols
5789
+
5790
+ # pd.get_dummies()
5791
+ if method == "dummy":
5792
+ dtype = kwargs.pop("dtype", int)
5793
+ drop_first = kwargs.pop("drop_first", True)
5794
+ try:
5795
+ encoded_df = pd.get_dummies(
5796
+ data[columns], drop_first=drop_first, dtype=dtype, **kwargs
5797
+ )
5798
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5799
+ except Exception as e:
5800
+ # print(f"Warning, 没有进行转换, 因为: {e}")
5801
+ return data
5802
+ # One-hot encoding
5803
+ elif method == "onehot":
5804
+ from sklearn.preprocessing import OneHotEncoder
5805
+
5806
+ encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
5807
+ encoded_data = encoder.fit_transform(data[columns])
5808
+ encoded_df = pd.DataFrame(
5809
+ encoded_data,
5810
+ columns=encoder.get_feature_names_out(columns),
5811
+ index=data.index,
5812
+ )
5813
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5814
+
5815
+ # Ordinal encoding
5816
+ elif method == "ordinal":
5817
+ from sklearn.preprocessing import OrdinalEncoder
5818
+
5819
+ encoder = OrdinalEncoder(**kwargs)
5820
+ encoded_data = encoder.fit_transform(data[columns])
5821
+ encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
5822
+ return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5823
+
5824
+ # Label encoding
5825
+ elif method == "label":
5826
+ from sklearn.preprocessing import LabelEncoder
5827
+
5828
+ encoder = LabelEncoder()
5829
+ encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
5830
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5831
+
5832
+ # Target encoding (Mean of the target for each category)
5833
+ elif method == "target":
5834
+ if target_column is None:
5835
+ raise ValueError("target_column must be provided for target encoding.")
5836
+ from category_encoders import TargetEncoder
5837
+
5838
+ encoder = TargetEncoder(cols=columns, **kwargs)
5839
+ encoded_data = encoder.fit_transform(data[columns], data[target_column])
5840
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5841
+
5842
+ # Binary encoding (for high-cardinality categorical variables)
5843
+ elif method == "binary":
5844
+ from category_encoders import BinaryEncoder
5845
+
5846
+ encoder = BinaryEncoder(cols=columns, **kwargs)
5847
+ encoded_data = encoder.fit_transform(data[columns])
5848
+ return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5849
+
5850
+
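A minimal usage sketch for df_encoder (editor's illustration, not part of the package diff): it assumes the function is importable as py2ls.ips.df_encoder and that the category_encoders package is installed for the 'target' and 'binary' methods.

import pandas as pd
from py2ls.ips import df_encoder  # assumed import path

df = pd.DataFrame(
    {
        "size": ["low", "high", "medium", "low"],
        "color": ["red", "blue", "red", "green"],
        "price": [10.0, 25.0, 18.0, 12.0],
    }
)
# dummy/one-hot style encoding of the two categorical columns
encoded = df_encoder(df, method="dummy", columns=["size", "color"])
# target encoding additionally needs target_column (and category_encoders installed)
encoded_target = df_encoder(df, method="target", columns=["color"], target_column="price")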
5176
5851
  def df_scaler(
5177
- data: pd.DataFrame, # should be numeric dtype
5852
+ data: pd.DataFrame, # should be numeric dtype
5178
5853
  method="standard",
5179
5854
  columns=None, # default, select all numeric col/row
5180
5855
  inplace=False,
@@ -5218,9 +5893,8 @@ def df_scaler(
5218
5893
  if axis == 0:
5219
5894
  # Column-wise scaling (default)
5220
5895
  if columns is None:
5221
- columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
5896
+ columns = data.select_dtypes(include=np.number).columns.tolist()
5222
5897
  non_numeric_columns = data.columns.difference(columns)
5223
- print(f"Scaling columns")
5224
5898
 
5225
5899
  scaled_data = scaler.fit_transform(data[columns])
5226
5900
 
@@ -5242,7 +5916,7 @@ def df_scaler(
5242
5916
  # Row-wise scaling
5243
5917
  if columns is None:
5244
5918
  columns = data.index.tolist()
5245
- numeric_rows = data.loc[columns].select_dtypes(include=["float64", "int64"])
5919
+ numeric_rows = data.loc[columns].select_dtypes(include=np.number)
5246
5920
  if numeric_rows.empty:
5247
5921
  raise ValueError("No numeric rows to scale.")
5248
5922
 
@@ -5260,6 +5934,34 @@ def df_scaler(
5260
5934
  scaled_df.loc[numeric_rows.index] = scaled_data
5261
5935
  return scaled_df
5262
5936
 
5937
+
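A short df_scaler sketch (editor's illustration; it assumes the function is importable from py2ls.ips and that, with inplace=False, the scaled copy is returned while non-numeric columns pass through unchanged).

import pandas as pd
from py2ls.ips import df_scaler  # assumed import path

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0], "group": ["x", "y", "z"]})
# column-wise (axis=0) standard scaling of the numeric columns only
scaled = df_scaler(df, method="standard", axis=0, inplace=False)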
5938
+ def df_special_characters_cleaner(
5939
+ data: pd.DataFrame, where=["column", "content", "index"]
5940
+ ) -> pd.DataFrame:
5941
+ """
5942
+ Clean special characters from column names, cell contents, and/or the index.
5943
+ usage:
5944
+ df_special_characters_cleaner(data=df, where='column')
5945
+ """
5946
+ if not isinstance(where, list):
5947
+ where = [where]
5948
+ where_to_clean = ["column", "content", "index"]
5949
+ where_ = [strcmp(i, where_to_clean)[0] for i in where]
5950
+
5951
+ # 1. Clean column names by replacing special characters with underscores
5952
+ if "column" in where_:
5953
+ data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
5954
+
5955
+ # 2. Clean only object-type columns (text columns)
5956
+ if "content" in where_:
5957
+ for col in data.select_dtypes(include=["object"]).columns:
5958
+ data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
5959
+ if data.index.dtype == "object" and "index" in where_:
5960
+ data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
5961
+
5962
+ return data
5963
+
5964
+
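A small example of the cleaner (editor's sketch, assuming the function is importable from py2ls.ips):

import pandas as pd
from py2ls.ips import df_special_characters_cleaner  # assumed import path

df = pd.DataFrame({"price ($)": [1, 2], "name!": ["a#b", "c@d"]})
df = df_special_characters_cleaner(df, where=["column", "content"])
# special characters in column names are replaced with "_";
# special characters inside string cells are removed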
5263
5965
  def df_cluster(
5264
5966
  data: pd.DataFrame,
5265
5967
  columns: Optional[list] = None,
@@ -5268,8 +5970,8 @@ def df_cluster(
5268
5970
  scale: bool = True,
5269
5971
  plot: Union[str, list] = "all",
5270
5972
  inplace: bool = True,
5271
- ax: Optional[plt.Axes] = None,
5272
- ) -> tuple[pd.DataFrame, int, Optional[plt.Axes]]:
5973
+ ax=None,
5974
+ ):
5273
5975
  from sklearn.preprocessing import StandardScaler
5274
5976
  from sklearn.cluster import KMeans
5275
5977
  from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5277,7 +5979,6 @@ def df_cluster(
5277
5979
  import numpy as np
5278
5980
  import pandas as pd
5279
5981
  import matplotlib.pyplot as plt
5280
- import seaborn as sns
5281
5982
 
5282
5983
  """
5283
5984
  Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5585,94 +6286,72 @@ def df_reducer(
5585
6286
  umap_neighbors: int = 15, # UMAP-specific
5586
6287
  umap_min_dist: float = 0.1, # UMAP-specific
5587
6288
  tsne_perplexity: int = 30, # t-SNE-specific
6289
+ hue: str = None, # lda-specific
5588
6290
  scale: bool = True,
5589
6291
  fill_missing: bool = True,
5590
6292
  debug: bool = False,
5591
6293
  inplace: bool = True, # replace the original data
5592
- plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
6294
+ plot_: bool = False, # plot a scatterplot of the first two components (colored by 'hue' when provided)
6295
+ random_state=1,
6296
+ ax=None,
6297
+ figsize=None,
6298
+ **kwargs,
5593
6299
  ) -> pd.DataFrame:
5594
- """
5595
- Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
5596
- method:
5597
- 1. 'umap':
5598
- - big dataset and global structure, often preferred in large-scale datasets for
5599
- visualization and dimensionality reduction, balancing speed and quality of visualization.
5600
- - t-SNE excels at preserving local structure (i.e., clusters), but it often loses global
5601
- relationships, causing clusters to appear in arbitrary proximities to each other.
5602
- 2. 'pca':
5603
- - t-SNE excels at preserving local structure (i.e., clusters), but it often loses global
5604
- relationships, causing clusters to appear in arbitrary proximities to each other.
5605
- - useful as a preprocessing step and in datasets where linear relationships dominate.
5606
- 3. 't-SNE':
5607
- a. t-SNE excels at preserving local structure (i.e., clusters), but it often loses global
5608
- relationships, causing clusters to appear in arbitrary proximities to each other.
5609
- b. often preferred in large-scale datasets for visualization and dimensionality
5610
- reduction, balancing speed and quality of visualization.
5611
- Parameters:
5612
- -----------
5613
- data : pd.DataFrame
5614
- The input DataFrame (samples x features).
5615
-
5616
- columns : List[str], optional
5617
- List of column names to reduce. If None, all columns are used.
5618
-
5619
- method : str, optional, default="umap"
5620
- Dimensionality reduction method, either "pca" or "umap".
5621
-
5622
- n_components : int, optional, default=50
5623
- Number of components for PCA or UMAP.
5624
-
5625
- umap_neighbors : int, optional, default=15
5626
- Number of neighbors considered for UMAP embedding.
5627
-
5628
- umap_min_dist : float, optional, default=0.1
5629
- Minimum distance between points in UMAP embedding.
5630
-
5631
- scale : bool, optional, default=True
5632
- Whether to scale the data using StandardScaler.
5633
-
5634
- fill_missing : bool, optional, default=True
5635
- Whether to fill missing values using the mean before applying PCA/UMAP.
6300
+ dict_methods = {
6301
+ #!Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
6302
+ "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
6303
+ "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
6304
+ "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
6305
+ "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
6306
+ #! Non-linear Dimensionality Reduction (Manifold Learning)
6307
+ "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
6308
+ "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global. relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
6309
+ "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
6310
+ "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
6311
+ "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
6312
+ "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing).is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
6313
+ #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
6314
+ "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
6315
+ }
5636
6316
 
5637
- Returns:
5638
- --------
5639
- reduced_df : pd.DataFrame
5640
- DataFrame with the reduced dimensions.
5641
- """
5642
-
5643
- """
5644
- PCA: explained_variance:
5645
- indicates the proportion of the dataset's total variance that each principal
5646
- component (PC) explains. It gives you a sense of how much information
5647
- (or variance) is captured by each PC
5648
- Interpretation:
5649
- - Higher values indicate that the corresponding PC captures more variance.
5650
- - The sum of the explained variances for all PCs equals 1 (or 100%).
5651
- - If the first few components explain a high percentage (e.g., 90%),
5652
- it means you can reduce the dimensionality of the data significantly without losing much information.
5653
- Use case:
5654
- You may plot a scree plot, which shows the explained variance for each PC, to help decide
5655
- how many components to keep for analysis.
5656
-
5657
- PCA: Singular values:
5658
- represent the magnitude of variance along each principal component. Mathematically,
5659
- they are the square roots of the eigenvalues of the covariance matrix.
5660
- Interpretation:
5661
- Larger singular values indicate that the associated PC captures more variance.
5662
- Singular values are related to the scale of the data. If the data are scaled
5663
- before PCA (e.g., standardized), then the singular values will provide a measure
5664
- of the spread of data along each PC.
5665
- Use case:
5666
- Singular values help quantify the contribution of each principal component in a
5667
- similar way to the explained variance. They are useful in understanding the overall
5668
- structure of the data.
5669
- """
5670
6317
  from sklearn.preprocessing import StandardScaler
5671
6318
  from sklearn.impute import SimpleImputer
5672
6319
 
5673
- # Select columns if specified, else use all columns
5674
- X = data[columns].values if columns else data.values
5675
- print(X.shape,type(X))
6320
+ if plot_:
6321
+ import matplotlib.pyplot as plt
6322
+ import seaborn as sns
6323
+ # Check valid method input
6324
+ methods = [
6325
+ "pca",
6326
+ "umap",
6327
+ "tsne",
6328
+ "factor",
6329
+ "isolation_forest",
6330
+ "lda",
6331
+ "kpca",
6332
+ "ica",
6333
+ "mds",
6334
+ "lle",
6335
+ "svd",
6336
+ ]
6337
+ method = strcmp(method, methods)[0]
6338
+ print(f"\nprocessing with using {dict_methods[method]}:")
6339
+ xlabel, ylabel = None, None
6340
+ if columns is None:
6341
+ columns = data.select_dtypes(include="number").columns.tolist()
6342
+ if hue is None:
6343
+ hue = data.select_dtypes(exclude="number").columns.tolist()
6344
+ if isinstance(hue, list):
6345
+ print("Warning: hue is a list, only select the 1st one")
6346
+ hue = hue[0] if hue else None
6347
+ if not hue:
6348
+ # Select columns if specified, else use all columns
6349
+ X = data[columns].values if columns else data.values
6350
+ else:
6351
+ # Select columns to reduce and hue for LDA
6352
+ X = data[columns].values if columns else data.drop(columns=[hue]).values
6353
+ y = data[hue].values
6354
+ if debug: print(X.shape)
5676
6355
  # Handle missing values
5677
6356
  if fill_missing:
5678
6357
  imputer = SimpleImputer(strategy="mean")
@@ -5683,15 +6362,13 @@ def df_reducer(
5683
6362
  scaler = StandardScaler()
5684
6363
  X = scaler.fit_transform(X)
5685
6364
 
5686
- # Check valid method input
5687
- methods=["pca", "umap","tsne","factor","isolation_forest"]
5688
- method=strcmp(method, methods)[0]
5689
6365
  # Apply PCA if selected
5690
- if method == "pca":
6366
+ if method == "pca":
5691
6367
  from sklearn.decomposition import PCA
6368
+
5692
6369
  pca = PCA(n_components=n_components)
5693
6370
  X_reduced = pca.fit_transform(X)
5694
-
6371
+
5695
6372
  # Additional PCA information
5696
6373
  explained_variance = pca.explained_variance_ratio_
5697
6374
  singular_values = pca.singular_values_
@@ -5707,36 +6384,72 @@ def df_reducer(
5707
6384
  # Plot explained variance
5708
6385
  cumulative_variance = np.cumsum(explained_variance)
5709
6386
  plt.figure(figsize=(8, 5))
5710
- plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
6387
+ plt.plot(
6388
+ range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o"
6389
+ )
5711
6390
  plt.title("Cumulative Explained Variance by Principal Components")
5712
6391
  plt.xlabel("Number of Principal Components")
5713
6392
  plt.ylabel("Cumulative Explained Variance")
5714
6393
  plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
5715
- plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
6394
+ plt.axvline(
6395
+ x=n_components,
6396
+ color="g",
6397
+ linestyle="--",
6398
+ label=f"n_components = {n_components}",
6399
+ )
5716
6400
  plt.legend()
5717
6401
  plt.grid()
5718
6402
  plt.show()
5719
6403
 
5720
6404
  # Prepare reduced DataFrame with additional PCA info
5721
6405
  pca_df = pd.DataFrame(
5722
- X_reduced, index=data.index,
5723
- columns=[f"PC_{i+1}" for i in range(n_components)]
5724
- )
6406
+ X_reduced,
6407
+ index=data.index,
6408
+ columns=[f"PC_{i+1}" for i in range(n_components)],
6409
+ )
5725
6410
  # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
5726
6411
  # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
5727
6412
  # Expand explained variance to multiple columns if needed
5728
6413
  for i in range(n_components):
5729
- pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
6414
+ pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
6415
+ format(explained_variance[i] * 100, ".3f") + "%", (pca_df.shape[0], 1)
6416
+ )
5730
6417
  for i in range(n_components):
5731
- pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
6418
+ pca_df[f"Singular Values PC_{i+1}"] = np.tile(
6419
+ singular_values[i], (pca_df.shape[0], 1)
6420
+ )
6421
+ if hue:
6422
+ pca_df[hue] = y
6423
+ elif method == "lda":
6424
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
5732
6425
 
6426
+ if "hue" not in locals() or hue is None:
6427
+ raise ValueError(
6428
+ "LDA requires a 'hue' col parameter to specify class labels."
6429
+ )
6430
+
6431
+ lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
6432
+ X_reduced = lda_reducer.fit_transform(X, y)
6433
+
6434
+ # Prepare reduced DataFrame with additional LDA info
6435
+ lda_df = pd.DataFrame(
6436
+ X_reduced,
6437
+ index=data.index,
6438
+ columns=[f"LDA_{i+1}" for i in range(n_components)],
6439
+ )
6440
+ if debug:
6441
+ print(f"LDA completed: Reduced to {n_components} components.")
6442
+ print("Class separability achieved by LDA.")
6443
+ if hue:
6444
+ lda_df[hue] = y
5733
6445
  # Apply UMAP if selected
5734
6446
  elif method == "umap":
5735
6447
  import umap
6448
+
5736
6449
  umap_reducer = umap.UMAP(
5737
6450
  n_neighbors=umap_neighbors,
5738
6451
  min_dist=umap_min_dist,
5739
- n_components=n_components
6452
+ n_components=n_components,
5740
6453
  )
5741
6454
  X_reduced = umap_reducer.fit_transform(X)
5742
6455
 
@@ -5751,41 +6464,57 @@ def df_reducer(
5751
6464
 
5752
6465
  # Prepare reduced DataFrame with additional UMAP info
5753
6466
  umap_df = pd.DataFrame(
5754
- X_reduced, index=data.index,
5755
- columns=[f"UMAP_{i+1}" for i in range(n_components)]
6467
+ X_reduced,
6468
+ index=data.index,
6469
+ columns=[f"UMAP_{i+1}" for i in range(n_components)],
5756
6470
  )
5757
6471
  umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
5758
6472
  umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
6473
+ if hue:
6474
+ umap_df[hue] = y
5759
6475
  elif method == "tsne":
5760
6476
  from sklearn.manifold import TSNE
5761
- tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
5762
- X_reduced = tsne.fit_transform(X)
5763
6477
 
5764
- # Prepare reduced DataFrame with additional t-SNE info
6478
+ tsne = TSNE(
6479
+ n_components=n_components,
6480
+ perplexity=tsne_perplexity,
6481
+ random_state=random_state,
6482
+ )
6483
+ X_reduced = tsne.fit_transform(X)
5765
6484
  tsne_df = pd.DataFrame(
5766
- X_reduced, index=data.index,
5767
- columns=[f"tSNE_{i+1}" for i in range(n_components)]
6485
+ X_reduced,
6486
+ index=data.index,
6487
+ columns=[f"tSNE_{i+1}" for i in range(n_components)],
5768
6488
  )
5769
- tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
5770
-
6489
+ tsne_df["Perplexity"] = np.tile(
6490
+ f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1)
6491
+ )
6492
+ if hue:
6493
+ tsne_df[hue] = y
5771
6494
  # Apply Factor Analysis if selected
5772
6495
  elif method == "factor":
5773
6496
  from sklearn.decomposition import FactorAnalysis
5774
- factor = FactorAnalysis(n_components=n_components, random_state=1)
6497
+
6498
+ factor = FactorAnalysis(n_components=n_components, random_state=random_state)
5775
6499
  X_reduced = factor.fit_transform(X)
5776
6500
  # Factor Analysis does not directly provide explained variance, but we can approximate it
5777
6501
  fa_variance = factor.noise_variance_
5778
6502
  # Prepare reduced DataFrame with additional Factor Analysis info
5779
6503
  factor_df = pd.DataFrame(
5780
- X_reduced, index=data.index,
5781
- columns=[f"Factor_{i+1}" for i in range(n_components)]
6504
+ X_reduced,
6505
+ index=data.index,
6506
+ columns=[f"Factor_{i+1}" for i in range(n_components)],
5782
6507
  )
5783
- factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
5784
-
6508
+ factor_df["Noise Variance"] = np.tile(
6509
+ format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1)
6510
+ )
6511
+ if hue:
6512
+ factor_df[hue] = y
5785
6513
  # Apply Isolation Forest for outlier detection if selected
5786
6514
  elif method == "isolation_forest":
5787
6515
  from sklearn.decomposition import PCA
5788
6516
  from sklearn.ensemble import IsolationForest
6517
+
5789
6518
  # Step 1: Apply PCA for dimensionality reduction to 2 components
5790
6519
  pca = PCA(n_components=n_components)
5791
6520
  X_pca = pca.fit_transform(X)
@@ -5795,65 +6524,139 @@ def df_reducer(
5795
6524
 
5796
6525
  # Prepare reduced DataFrame with additional PCA info
5797
6526
  iso_forest_df = pd.DataFrame(
5798
- X_pca, index=data.index,
5799
- columns=[f"PC_{i+1}" for i in range(n_components)]
6527
+ X_pca, index=data.index, columns=[f"PC_{i+1}" for i in range(n_components)]
5800
6528
  )
5801
6529
 
5802
- isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
6530
+ isolation_forest = IsolationForest(
6531
+ n_estimators=100, contamination="auto", random_state=1
6532
+ )
5803
6533
  isolation_forest.fit(X)
5804
- anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
6534
+ anomaly_scores = isolation_forest.decision_function(
6535
+ X
6536
+ ) # Anomaly score: larger is less anomalous
5805
6537
  # Predict labels: 1 (normal), -1 (anomaly)
5806
- anomaly_labels = isolation_forest.fit_predict(X)
6538
+ anomaly_labels = isolation_forest.predict(X)  # reuse the fitted model instead of refitting
5807
6539
  # Add anomaly scores and labels to the DataFrame
5808
6540
  iso_forest_df["Anomaly Score"] = anomaly_scores
5809
6541
  iso_forest_df["Anomaly Label"] = anomaly_labels
5810
6542
  # add info from pca
5811
6543
  for i in range(n_components):
5812
- iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
6544
+ iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
6545
+ format(explained_variance[i] * 100, ".3f") + "%",
6546
+ (iso_forest_df.shape[0], 1),
6547
+ )
5813
6548
  for i in range(n_components):
5814
- iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
6549
+ iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
6550
+ singular_values[i], (iso_forest_df.shape[0], 1)
6551
+ )
6552
+ if hue:
6553
+ iso_forest_df[hue] = y
6554
+ # * Apply Kernel PCA if selected
6555
+ elif method == "kpca":
6556
+ from sklearn.decomposition import KernelPCA
6557
+
6558
+ kpca = KernelPCA(
6559
+ n_components=n_components, kernel="rbf", random_state=random_state
6560
+ )
6561
+ X_reduced = kpca.fit_transform(X)
6562
+
6563
+ # Prepare reduced DataFrame with KPCA info
6564
+ kpca_df = pd.DataFrame(
6565
+ X_reduced,
6566
+ index=data.index,
6567
+ columns=[f"KPCA_{i+1}" for i in range(n_components)],
6568
+ )
6569
+ if debug:
6570
+ print("Kernel PCA completed with RBF kernel.")
6571
+ if hue:
6572
+ kpca_df[hue] = y
6573
+ # * Apply ICA if selected
6574
+ elif method == "ica":
6575
+ from sklearn.decomposition import FastICA
6576
+
6577
+ ica = FastICA(n_components=n_components, random_state=random_state)
6578
+ X_reduced = ica.fit_transform(X)
6579
+
6580
+ # Prepare reduced DataFrame with ICA info
6581
+ ica_df = pd.DataFrame(
6582
+ X_reduced,
6583
+ index=data.index,
6584
+ columns=[f"ICA_{i+1}" for i in range(n_components)],
6585
+ )
6586
+ if debug:
6587
+ print("Independent Component Analysis (ICA) completed.")
6588
+ if hue:
6589
+ ica_df[hue] = y
6590
+ # * Apply MDS if selected
6591
+ elif method == "mds":
6592
+ from sklearn.manifold import MDS
6593
+
6594
+ mds = MDS(n_components=n_components, random_state=random_state)
6595
+ X_reduced = mds.fit_transform(X)
6596
+
6597
+ # Prepare reduced DataFrame with MDS info
6598
+ mds_df = pd.DataFrame(
6599
+ X_reduced,
6600
+ index=data.index,
6601
+ columns=[f"MDS_{i+1}" for i in range(n_components)],
6602
+ )
6603
+ if debug:
6604
+ print("Multidimensional Scaling (MDS) completed.")
6605
+ if hue:
6606
+ mds_df[hue] = y
6607
+ # * Apply Locally Linear Embedding (LLE) if selected
6608
+ elif method == "lle":
6609
+ from sklearn.manifold import LocallyLinearEmbedding
6610
+
6611
+ lle = LocallyLinearEmbedding(
6612
+ n_components=n_components,
6613
+ n_neighbors=umap_neighbors,
6614
+ random_state=random_state,
6615
+ )
6616
+ X_reduced = lle.fit_transform(X)
6617
+
6618
+ # Prepare reduced DataFrame with LLE info
6619
+ lle_df = pd.DataFrame(
6620
+ X_reduced,
6621
+ index=data.index,
6622
+ columns=[f"LLE_{i+1}" for i in range(n_components)],
6623
+ )
6624
+ if debug:
6625
+ print("Locally Linear Embedding (LLE) completed.")
6626
+ if hue:
6627
+ lle_df[hue] = y
6628
+ # * Apply Singular Value Decomposition (SVD) if selected
6629
+ elif method == "svd":
6630
+ # Using NumPy's SVD for dimensionality reduction
6631
+ U, s, Vt = np.linalg.svd(X, full_matrices=False)
6632
+ X_reduced = U[:, :n_components] * s[:n_components]
6633
+
6634
+ # Prepare reduced DataFrame with SVD info
6635
+ svd_df = pd.DataFrame(
6636
+ X_reduced,
6637
+ index=data.index,
6638
+ columns=[f"SVD_{i+1}" for i in range(n_components)],
6639
+ )
6640
+ if hue:
6641
+ svd_df[hue] = y
6642
+ if debug:
6643
+ print("Singular Value Decomposition (SVD) completed.")
5815
6644
 
5816
6645
  # Return reduced data and info as a new DataFrame with the same index
5817
6646
  if method == "pca":
5818
6647
  reduced_df = pca_df
5819
6648
  colname_met = "PC_"
5820
- if plot_:
5821
- sns.scatterplot(
5822
- data=pca_df,
5823
- x="PC_1",
5824
- y="PC_2",
5825
- # hue="condition",
5826
- )
6649
+ xlabel = f"PC_1 ({pca_df["Explained Variance PC_1"].tolist()[0]})"
6650
+ ylabel = f"PC_2 ({pca_df["Explained Variance PC_2"].tolist()[0]})"
5827
6651
  elif method == "umap":
5828
6652
  reduced_df = umap_df
5829
6653
  colname_met = "UMAP_"
5830
- if plot_:
5831
- sns.scatterplot(
5832
- data=umap_df,
5833
- x="UMAP_1",
5834
- y="UMAP_2",
5835
- # hue="condition",
5836
- )
5837
6654
  elif method == "tsne":
5838
6655
  reduced_df = tsne_df
5839
- colname_met = "t-SNE_"
5840
- if plot_:
5841
- sns.scatterplot(
5842
- data=tsne_df,
5843
- x="tSNE_1",
5844
- y="tSNE_2",
5845
- # hue="batch",
5846
- )
6656
+ colname_met = "tSNE_"
5847
6657
  elif method == "factor":
5848
6658
  reduced_df = factor_df
5849
6659
  colname_met = "Factor_"
5850
- if plot_:
5851
- sns.scatterplot(
5852
- data=factor_df,
5853
- x="Factor_1",
5854
- y="Factor_2",
5855
- # hue="batch",
5856
- )
5857
6660
  elif method == "isolation_forest":
5858
6661
  reduced_df = iso_forest_df # Already a DataFrame for outliers
5859
6662
  colname_met = "PC_"
@@ -5862,7 +6665,8 @@ def df_reducer(
5862
6665
  data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
5863
6666
  x="PC_1",
5864
6667
  y="PC_2",
5865
- label="normal", c="b",
6668
+ label="normal",
6669
+ c="b",
5866
6670
  )
5867
6671
  ax = sns.scatterplot(
5868
6672
  ax=ax,
@@ -5870,29 +6674,80 @@ def df_reducer(
5870
6674
  x="PC_1",
5871
6675
  y="PC_2",
5872
6676
  c="r",
5873
- label="outlier", marker="+", s=30,
6677
+ label="outlier",
6678
+ marker="+",
6679
+ s=30,
5874
6680
  )
6681
+ elif method == "lda":
6682
+ reduced_df = lda_df
6683
+ colname_met = "LDA_"
6684
+ elif method == "kpca":
6685
+ reduced_df = kpca_df
6686
+ colname_met = "KPCA_"
6687
+ elif method == "ica":
6688
+ reduced_df = ica_df
6689
+ colname_met = "ICA_"
6690
+ elif method == "mds":
6691
+ reduced_df = mds_df
6692
+ colname_met = "MDS_"
6693
+ elif method == "lle":
6694
+ reduced_df = lle_df
6695
+ colname_met = "LLE_"
6696
+ elif method == "svd":
6697
+ reduced_df = svd_df
6698
+ colname_met = "SVD_"
6699
+ # Quick plots
6700
+ if plot_ and (method not in ["isolation_forest"]):
6701
+ from .plot import plotxy
5875
6702
 
6703
+ if ax is None:
6704
+ if figsize is None:
6705
+ _, ax = plt.subplots(figsize=cm2inch(8, 8))
6706
+ else:
6707
+ _, ax = plt.subplots(figsize=figsize)
6708
+ else:
6709
+ ax.cla()  # cla() returns None, so clear the provided axes without reassigning ax
6710
+ ax = plotxy(
6711
+ data=reduced_df,
6712
+ x=colname_met + "1",
6713
+ y=colname_met + "2",
6714
+ hue=hue,
6715
+ s=1,
6716
+ edgecolor="none",
6717
+ kind="scater",
6718
+ figsets=dict(
6719
+ legend=dict(loc="best", markerscale=4),
6720
+ xlabel=xlabel if xlabel else None,
6721
+ ylabel=ylabel if ylabel else None,
6722
+ ),
6723
+ ax=ax,
6724
+ verbose=False,
6725
+ **kwargs,
6726
+ )
5876
6727
 
5877
6728
  if inplace:
5878
6729
  # If inplace=True, add components back into the original data
5879
6730
  for col_idx in range(n_components):
5880
- data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
6731
+ data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5881
6732
  # Add extra info for PCA/UMAP
5882
6733
  if method == "pca":
5883
6734
  for i in range(n_components):
5884
- data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
6735
+ data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[
6736
+ :, f"Explained Variance PC_{i+1}"
6737
+ ]
5885
6738
  for i in range(n_components):
5886
- data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
5887
- elif method == "umap":
6739
+ data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[
6740
+ :, f"Singular Values PC_{i+1}"
6741
+ ]
6742
+ elif method == "umap":
5888
6743
  for i in range(n_components):
5889
- data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
5890
- data["Embedding"] = reduced_df["Embedding"]
5891
- data["Trustworthiness"] = reduced_df["Trustworthiness"]
6744
+ data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
6745
+ data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
6746
+ data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
6747
+
5892
6748
  return None # No return when inplace=True
5893
-
5894
6749
 
5895
- return reduced_df
6750
+ return reduced_df
5896
6751
 
5897
6752
 
5898
6753
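A minimal df_reducer sketch (editor's illustration, separate from the package's own example; it assumes the function is importable from py2ls.ips, with scikit-learn available for 'pca'/'lda' and umap-learn needed only for 'umap'):

import numpy as np
import pandas as pd
from py2ls.ips import df_reducer  # assumed import path

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("ABCDE"))
df["group"] = rng.choice(["x", "y"], size=100)

# PCA to 2 components, returned as a new DataFrame (inplace=False)
pcs = df_reducer(df, method="pca", n_components=2, inplace=False)

# LDA is supervised: pass the class column via `hue` (2 classes -> at most 1 component)
lda = df_reducer(df, method="lda", n_components=1, hue="group", inplace=False)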
  # example:
@@ -5922,6 +6777,7 @@ def plot_cluster(
5922
6777
  """
5923
6778
  import seaborn as sns
5924
6779
  from sklearn.metrics import silhouette_samples
6780
+ import matplotlib.pyplot as plt
5925
6781
 
5926
6782
  if metrics is None:
5927
6783
  metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
@@ -6152,10 +7008,10 @@ def use_pd(
6152
7008
  verbose=True,
6153
7009
  dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
6154
7010
  ):
6155
- default_settings = fload(dir_json, output='json')
7011
+ default_settings = fload(dir_json, output="json")
6156
7012
  valid_kinds = list(default_settings.keys())
6157
7013
  kind = strcmp(func_name, valid_kinds)[0]
6158
- usage=default_settings[kind]
7014
+ usage = default_settings[kind]
6159
7015
  if verbose:
6160
7016
  for i, i_ in enumerate(ssplit(usage, by=",")):
6161
7017
  i_ = i_.replace("=", "\t= ") + ","