py2ls 0.2.4.8__py3-none-any.whl → 0.2.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py CHANGED
@@ -1,8 +1,9 @@
  import numpy as np
- import pandas as pd
+ import pandas as pd
  import sys, os
- from IPython.display import display
+ from IPython.display import display
  from typing import List, Optional, Union
+
  try:
  get_ipython().run_line_magic("load_ext", "autoreload")
  get_ipython().run_line_magic("autoreload", "2")
@@ -10,11 +11,14 @@ except NameError:
  pass

  import warnings
+
  warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
  warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

- def run_once_within(duration=60): # default 60s
+
+ def run_once_within(duration=60): # default 60s
  import time
+
  """
  usage:
  if run_once_within():
@@ -26,7 +30,9 @@ def run_once_within(duration=60): # default 60s
  run_once_within.time_last = None
  time_curr = time.time()

- if (run_once_within.time_last is None) or (time_curr - run_once_within.time_last >= duration):
+ if (run_once_within.time_last is None) or (
+ time_curr - run_once_within.time_last >= duration
+ ):
  run_once_within.time_last = time_curr  # Update the last execution time
  return True
  else:
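
Illustration (editorial sketch, not part of the released diff): the reworked `run_once_within` above throttles repeated execution by caching a timestamp as an attribute on the function object and returning True only when `duration` seconds have passed since the last True result. A minimal usage sketch, assuming the function is imported from py2ls.ips:

    from py2ls.ips import run_once_within

    for _ in range(3):
        if run_once_within(duration=60):  # True on the first call, False for the next 60 s
            print("expensive setup runs once")
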
@@ -42,13 +48,14 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
  """
  import matplotlib.pyplot as plt
  from matplotlib import font_manager
- slashtype = "/" if 'mac' in get_os() else "\\"
+
+ slashtype = "/" if "mac" in get_os() else "\\"
  if slashtype in dir_font:
  font_manager.fontManager.addfont(dir_font)
  fontname = os.path.basename(dir_font).split(".")[0]
  else:
  if "cn" in dir_font.lower() or "ch" in dir_font.lower():
- fontname = "Hiragino Sans GB" # default Chinese font
+ fontname = "Hiragino Sans GB"  # default Chinese font
  else:
  fontname = dir_font

@@ -62,6 +69,7 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
  plt.rcParams["font.sans-serif"] = ["Arial"]
  return fontname

+
  # set 'dir_save'
  if "dar" in sys.platform:
  dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
@@ -133,6 +141,7 @@ def run_every(when: str = None, job=None, wait: int = 60):
  """
  import schedule
  import time
+
  if job is None:
  print("No job provided!")
  return
@@ -180,6 +189,7 @@ def run_at(when: str, job=None, wait: int = 60):
  """
  from datetime import datetime
  import time
+
  if job is None:
  print("No job provided!")
  return
@@ -260,11 +270,12 @@ def get_timezone(timezone: str | list = None):
  def is_package_installed(package_name):
  """Check if a package is installed."""
  import importlib.util
+
  package_spec = importlib.util.find_spec(package_name)
  return package_spec is not None


- def upgrade(module="py2ls",uninstall=False):
+ def upgrade(module="py2ls", uninstall=False):
  """
  Installs or upgrades a specified Python module.

@@ -273,6 +284,7 @@ def upgrade(module="py2ls",uninstall=False):
  uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
  """
  import subprocess
+
  if not is_package_installed(module):
  try:
  subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -310,6 +322,7 @@ def get_version(pkg):

  def rm_folder(folder_path, verbose=True):
  import shutil
+
  try:
  shutil.rmtree(folder_path)
  if verbose:
@@ -329,6 +342,7 @@ def fremove(path, verbose=True):
  try:
  if os.path.isdir(path):
  import shutil
+
  shutil.rmtree(path)
  if verbose:
  print(f"Successfully deleted folder {path}")
@@ -364,11 +378,13 @@ def fremove(path, verbose=True):

  def get_cwd():
  from pathlib import Path
+
  # Get the current script's directory as a Path object
- current_directory = Path(__file__).resolve().parent
-
+ current_directory = Path(__file__).resolve().parent
+
  return current_directory

+
  def search(
  query,
  limit=5,
@@ -380,6 +396,7 @@ def search(
  **kwargs,
  ):
  from duckduckgo_search import DDGS
+
  if "te" in kind.lower():
  results = DDGS().text(query, max_results=limit)
  res = pd.DataFrame(results)
@@ -413,6 +430,7 @@ def echo(*args, **kwargs):
  """
  global dir_save
  from duckduckgo_search import DDGS
+
  query = None
  model = kwargs.get("model", "gpt")
  verbose = kwargs.get("verbose", True)
@@ -461,10 +479,12 @@ def echo(*args, **kwargs):
  res = DDGS().chat(query, model=model_valid)
  if verbose:
  from pprint import pp
+
  pp(res)
  if log:
  from datetime import datetime
  import time
+
  dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
  res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
  if bool(os.path.basename(dir_save)):
@@ -487,6 +507,7 @@ def ai(*args, **kwargs):

  def detect_lang(text, output="lang", verbose=True):
  from langdetect import detect
+
  dir_curr_script = os.path.dirname(os.path.abspath(__file__))
  dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
  print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -516,13 +537,14 @@ def is_text(s):

  from typing import Any, Union

+
  def shared(*args, strict=True, n_shared=2, verbose=True):
  """
  check the shared elelements in two list.
  usage:
  list1 = [1, 2, 3, 4, 5]
  list2 = [4, 5, 6, 7, 8]
- list3 = [5, 6, 9, 10]
+ list3 = [5, 6, 9, 10]
  a = shared(list1, list2,list3)
  """
  if verbose:
@@ -538,26 +560,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
  print(f"{' ' * 2}All inputs must be lists.")
  return []
  first_list = flattened_lists[0]
- shared_elements = [item for item in first_list if all(item in lst for lst in flattened_lists)]
+ shared_elements = [
+ item for item in first_list if all(item in lst for lst in flattened_lists)
+ ]
  if strict:
- # Strict mode: require elements to be in all lists
- shared_elements = set(flattened_lists[0])
- for lst in flattened_lists[1:]:
- shared_elements.intersection_update(lst)
+ # Strict mode: require elements to be in all lists
+ shared_elements = set(flattened_lists[0])
+ for lst in flattened_lists[1:]:
+ shared_elements.intersection_update(lst)
  else:
  from collections import Counter
+
  all_elements = [item for sublist in flattened_lists for item in sublist]
  element_count = Counter(all_elements)
  # Get elements that appear in at least n_shared lists
- shared_elements = [item for item, count in element_count.items() if count >= n_shared]
+ shared_elements = [
+ item for item, count in element_count.items() if count >= n_shared
+ ]

  shared_elements = flatten(shared_elements, verbose=verbose)
  if verbose:
- elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
+ elements2show = (
+ shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+ )
  print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
  print("********* checking shared elements *********")
  return shared_elements

+
  def not_shared(*args, strict=True, n_shared=2, verbose=False):
  """
  To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
@@ -568,7 +598,7 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
  """
  _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
  list1 = flatten(args[0], verbose=verbose)
- _not_shared=[item for item in list1 if item not in _common]
+ _not_shared = [item for item in list1 if item not in _common]
  return _not_shared


@@ -578,29 +608,41 @@ def flatten(nested: Any, unique_list=True, verbose=False):
  Parameters:
  nested : Any, Can be a list, tuple, dictionary, or set.
  Returns: list, A flattened list.
- """
+ """
  flattened_list = []
  stack = [nested]
  while stack:
  current = stack.pop()
  if isinstance(current, dict):
- stack.extend(current.values())
+ stack.extend(current.values())
  elif isinstance(current, (list, tuple, set)):
  stack.extend(current)
  elif isinstance(current, pd.Series):
  stack.extend(current)
- elif isinstance(current, (pd.Index,np.ndarray)): # df.columns df.index are object of type pd.Index
+ elif isinstance(
+ current, (pd.Index, np.ndarray)
+ ): # df.columns df.index are object of type pd.Index
  stack.extend(current.tolist())
  else:
  flattened_list.append(current)
  if verbose:
- print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
+ print(
+ f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>"
+ )
  if unique_list:
  return unique(flattened_list)[::-1]
  else:
  return flattened_list
-
- def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=False, scorer="WR"):
+
+
+ def strcmp(
+ search_term,
+ candidates,
+ ignore_case=True,
+ get_rank=False,
+ verbose=False,
+ scorer="WR",
+ ):
  """
  Compares a search term with a list of candidate strings and finds the best match based on similarity score.

@@ -614,13 +656,14 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
  tuple: A tuple containing the best match and its index in the candidates list.
  """
  from fuzzywuzzy import fuzz, process
+
  def to_lower(s, ignore_case=True):
  # Converts a string or list of strings to lowercase if ignore_case is True.
  if ignore_case:
  if isinstance(s, str):
  return s.lower()
  elif isinstance(s, list):
- s=[str(i) for i in s]# convert all to str
+ s = [str(i) for i in s]  # convert all to str
  return [elem.lower() for elem in s]
  return s

@@ -630,12 +673,15 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
  similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
  elif "W" in scorer.lower():
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
- elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
+ elif "ratio" in scorer.lower() or "stri" in scorer.lower():  # Ratio (Strictest)
  similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
  else:
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
  if get_rank:
- idx = [similarity_scores.index(i) for i in sorted(similarity_scores,reverse=True)]
+ idx = [
+ similarity_scores.index(i)
+ for i in sorted(similarity_scores, reverse=True)
+ ]
  if verbose:
  display([candidates[ii] for ii in idx])
  return [candidates[ii] for ii in idx]
@@ -663,6 +709,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
  # str2 = ['PLoS Computational Biology', 'PLOS BIOLOGY']
  # best_match, idx = strcmp(str1, str2, ignore_case=1)

+
  def cn2pinyin(
  cn_str: Union[str, list] = None,
  sep: str = " ",
@@ -727,19 +774,21 @@ def cn2pinyin(
  style = Style.PL
  else:
  style = Style.NORMAL
- if not isinstance(cn_str,list):
- cn_str=[cn_str]
- pinyin_flat=[]
+ if not isinstance(cn_str, list):
+ cn_str = [cn_str]
+ pinyin_flat = []
  for cn_str_ in cn_str:
  pinyin_string = pinyin(cn_str_, style=style)
  pinyin_flat.append(sep.join([item[0] for item in pinyin_string]))
- if len(pinyin_flat)==1:
+ if len(pinyin_flat) == 1:
  return pinyin_flat[0]
  else:
  return pinyin_flat

+
  def counter(list_, verbose=True):
  from collections import Counter
+
  c = Counter(list_)
  # Print the name counts
  for item, count in c.items():
@@ -769,6 +818,7 @@ def str2time(time_str, fmt="24"):
  - str: The converted time string.
  """
  from datetime import datetime
+
  def time_len_corr(time_str):
  time_str_ = (
  ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -830,6 +880,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
  - str: The converted date string.
  """
  from dateutil import parser
+
  try:
  date_obj = parser.parse(date_str)
  except ValueError as e:
@@ -847,6 +898,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):

  def str2num(s, *args, **kwargs):
  import re
+
  delimiter = kwargs.get("sep", None)
  round_digits = kwargs.get("round", None)
  if delimiter is not None:
@@ -863,6 +915,7 @@ def str2num(s, *args, **kwargs):
  num = float(s)
  except ValueError:
  from numerizer import numerize
+
  try:
  numerized = numerize(s)
  num = int(numerized) if "." not in numerized else float(numerized)
@@ -1067,13 +1120,12 @@ def inch2px(*inch, dpi=300) -> list:
  # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
  if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
  return [i * dpi for i in inch[0]]
-
+
  # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
  else:
  return [i * dpi for i in inch]


-
  def cm2inch(*inch) -> list:
  """
  Usage:
@@ -1191,6 +1243,7 @@ def paper_size(paper_type_str="a4"):

  def docx2pdf(dir_docx, dir_pdf=None):
  from docx2pdf import convert
+
  if dir_pdf:
  convert(dir_docx, dir_pdf)
  else:
@@ -1199,6 +1252,7 @@ def docx2pdf(dir_docx, dir_pdf=None):

  def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
  import img2pdf as image2pdf
+
  def mm_to_point(size):
  return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))

@@ -1253,6 +1307,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
  from PyPDF2 import PdfReader
  from pptx.util import Inches
  from pptx import Presentation
+
  prs = Presentation()

  # Open the PDF file
@@ -1282,6 +1337,7 @@ def pdf2ppt(dir_pdf, dir_ppt):

  def ssplit(text, by="space", verbose=False, strict=False, **kws):
  import re
+
  if isinstance(text, list):
  nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
  flat_list = [item for sublist in nested_list for item in sublist]
@@ -1331,6 +1387,7 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
  def split_by_sent_num(text, n=10):
  from nltk.tokenize import sent_tokenize
  from itertools import pairwise
+
  # split text into sentences
  text_split_by_sent = sent_tokenize(text)
  cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1404,11 +1461,13 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
  return split_by_camel_case(text)
  elif ("word" in by) and not strict:
  from nltk.tokenize import word_tokenize
+
  if verbose:
  print(f"splited by word")
  return word_tokenize(text)
  elif ("sen" in by and not "num" in by) and not strict:
  from nltk.tokenize import sent_tokenize
+
  if verbose:
  print(f"splited by sentence")
  return sent_tokenize(text)
@@ -1459,10 +1518,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):

  def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
  from pdf2image import convert_from_path, pdfinfo_from_path
+
  df_dir_img_single_page = pd.DataFrame()
  dir_single_page = []
  if verbose:
  from pprint import pp
+
  pp(pdfinfo_from_path(dir_pdf))
  if isinstance(page, tuple) and page:
  page = list(page)
@@ -1582,6 +1643,7 @@ def unzip(dir_path, output_dir=None):
  if os.path.exists(output_dir):
  if os.path.isdir(output_dir): # check if it is a folder
  import shutil
+
  shutil.rmtree(output_dir) # remove folder
  else:
  os.remove(output_dir) # remove file
@@ -1600,6 +1662,7 @@ def unzip(dir_path, output_dir=None):
  output_file = os.path.splitext(dir_path)[0] # remove the .gz extension
  try:
  import shutil
+
  with gzip.open(dir_path, "rb") as gz_file:
  with open(output_file, "wb") as out_file:
  shutil.copyfileobj(gz_file, out_file)
@@ -1607,11 +1670,14 @@ def unzip(dir_path, output_dir=None):
  except FileNotFoundError:
  print(f"Error: The file '{dir_path}' was not found.")
  except PermissionError:
- print(f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'.")
+ print(
+ f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'."
+ )
  except Exception as e:
  try:
  import tarfile
- with tarfile.open(dir_path, 'r:gz') as tar:
+
+ with tarfile.open(dir_path, "r:gz") as tar:
  tar.extractall(path=output_file)
  except Exception as final_e:
  print(f"An final unexpected error occurred: {final_e}")
@@ -1698,9 +1764,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
  """
  if not isinstance(df, pd.DataFrame):
  if verbose:
- print('not pd.DataFrame')
+ print("not pd.DataFrame")
  return False
- df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
+ df.columns = df.columns.astype(str) # 把它变成str, 这样就可以进行counts运算了
  # Initialize a list to hold messages about abnormalities
  messages = []
  is_abnormal = False
@@ -1729,28 +1795,28 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
  if verbose:
  print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
  if verbose:
- print("1",is_abnormal)
+ print("1", is_abnormal)
  if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
  messages.append("Abnormal: Too many delimiters in column names.")
  is_abnormal = True
  if verbose:
  print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
  if verbose:
- print("2",is_abnormal)
+ print("2", is_abnormal)
  if delimiter_counts[""] > 3:
  messages.append("Abnormal: There are empty column names.")
  is_abnormal = True
  if verbose:
  print(f'delimiter_counts[""] > 3')
  if verbose:
- print("3",is_abnormal)
+ print("3", is_abnormal)
  if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
  messages.append("Abnormal: Some column names contain unexpected characters.")
  is_abnormal = True
  if verbose:
  print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
  if verbose:
- print("4",is_abnormal)
+ print("4", is_abnormal)
  # # Check for missing values
  # missing_values = df.isnull().sum()
  # if missing_values.any():
@@ -1769,9 +1835,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
  messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
  is_abnormal = True
  if verbose:
- print(f'df.columns[df.nunique() == 1].tolist()')
+ print(f"df.columns[df.nunique() == 1].tolist()")
  if verbose:
- print("5",is_abnormal)
+ print("5", is_abnormal)
  # Check for an unreasonable number of rows or columns
  if actual_shape[0] < 2 or actual_shape[1] < 2:
  messages.append(
@@ -1779,9 +1845,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
  )
  is_abnormal = True
  if verbose:
- print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
+ print(f"actual_shape[0] < 2 or actual_shape[1] < 2")
  if verbose:
- print("6",is_abnormal)
+ print("6", is_abnormal)
  # Compile results
  if verbose:
  print("\n".join(messages))
@@ -1798,8 +1864,10 @@ def fload(fpath, kind=None, **kwargs):
  Returns:
  content: The content loaded from the file.
  """
+
  def read_mplstyle(style_file):
  import matplotlib.pyplot as plt
+
  # Load the style file
  plt.style.use(style_file)

@@ -1812,10 +1880,11 @@ def fload(fpath, kind=None, **kwargs):
  for i, j in style_dict.items():
  print(f"\n{i}::::{j}")
  return style_dict
+
  # #example usage:
  # style_file = "/ std-colors.mplstyle"
  # style_dict = read_mplstyle(style_file)
-
+
  def load_txt_md(fpath):
  with open(fpath, "r") as file:
  content = file.read()
@@ -1825,28 +1894,30 @@ def fload(fpath, kind=None, **kwargs):
  # with open(fpath, "r") as file:
  # content = file.read()
  # return content
- def load_html(fpath,**kwargs):
- return pd.read_html(fpath,**kwargs)
+ def load_html(fpath, **kwargs):
+ return pd.read_html(fpath, **kwargs)

  def load_json(fpath, **kwargs):
- output=kwargs.pop("output","json")
- if output=='json':
+ output = kwargs.pop("output", "json")
+ if output == "json":
  import json
+
  with open(fpath, "r") as file:
  content = json.load(file)
  return content
  else:
- return pd.read_json(fpath,**kwargs)
+ return pd.read_json(fpath, **kwargs)

  def load_yaml(fpath):
  import yaml
+
  with open(fpath, "r") as file:
  content = yaml.safe_load(file)
  return content

-
  def load_xml(fpath, fsize_thr: int = 100):
  from lxml import etree
+
  def load_small_xml(fpath):
  tree = etree.parse(fpath)
  root = tree.getroot()
@@ -1905,7 +1976,7 @@ def fload(fpath, kind=None, **kwargs):
  if line.startswith(char):
  return char
  return None
-
+
  def _get_chunks(df_fake):
  """
  helper func for 'load_csv'
@@ -1926,20 +1997,22 @@ def fload(fpath, kind=None, **kwargs):
  encoding = kwargs.pop("encoding", "utf-8")
  on_bad_lines = kwargs.pop("on_bad_lines", "skip")
  comment = kwargs.pop("comment", None)
- fmt=kwargs.pop("fmt",False)
- chunksize=kwargs.pop("chunksize", None)
- engine='c' if chunksize else engine # when chunksize, recommend 'c'
- low_memory=kwargs.pop("low_memory",True)
- low_memory=False if chunksize else True # when chunksize, recommend low_memory=False
- verbose=kwargs.pop("verbose",False)
+ fmt = kwargs.pop("fmt", False)
+ chunksize = kwargs.pop("chunksize", None)
+ engine = "c" if chunksize else engine # when chunksize, recommend 'c'
+ low_memory = kwargs.pop("low_memory", True)
+ low_memory = (
+ False if chunksize else True
+ ) # when chunksize, recommend low_memory=False
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("read_csv", verbose=verbose)
-
+
  if comment is None:
  comment = get_comment(
  fpath, comment=None, encoding="utf-8", lines_to_check=5
  )
-
+
  try:
  df = pd.read_csv(
  fpath,
@@ -1956,9 +2029,9 @@ def fload(fpath, kind=None, **kwargs):
  **kwargs,
  )
  if chunksize:
- df=_get_chunks(df)
+ df = _get_chunks(df)
  print(df.shape)
- if is_df_abnormal(df, verbose=0): # raise error
+ if is_df_abnormal(df, verbose=0):  # raise error
  raise ValueError("the df is abnormal")
  except:
  try:
@@ -1991,7 +2064,7 @@ def fload(fpath, kind=None, **kwargs):
  **kwargs,
  )
  if chunksize:
- df=_get_chunks(df)
+ df = _get_chunks(df)
  print(df.shape)
  if is_df_abnormal(df, verbose=0):
  raise ValueError("the df is abnormal")
@@ -2026,7 +2099,7 @@ def fload(fpath, kind=None, **kwargs):
  **kwargs,
  )
  if chunksize:
- df=_get_chunks(df)
+ df = _get_chunks(df)
  print(df.shape)
  if is_df_abnormal(df, verbose=0):
  raise ValueError("the df is abnormal")
@@ -2049,7 +2122,7 @@ def fload(fpath, kind=None, **kwargs):
  **kwargs,
  )
  if chunksize:
- df=_get_chunks(df)
+ df = _get_chunks(df)
  print(df.shape)
  if not is_df_abnormal(df, verbose=0): # normal
  display(df.head(2))
@@ -2059,7 +2132,7 @@ def fload(fpath, kind=None, **kwargs):
  pass
  else:
  if not chunksize:
- engines = [None,"c", "python"]
+ engines = [None, "c", "python"]
  for engine in engines:
  separators = [",", "\t", ";", "|", " "]
  for sep in separators:
@@ -2080,11 +2153,19 @@ def fload(fpath, kind=None, **kwargs):
  # display(df.head(2))
  # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
  if chunksize:
- df=_get_chunks(df)
+ df = _get_chunks(df)
  print(df.shape)
  if not is_df_abnormal(df, verbose=0):
- display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
- print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+ (
+ display(df.head(2))
+ if isinstance(df, pd.DataFrame)
+ else display("it is not a DataFrame")
+ )
+ (
+ print(f"shape: {df.shape}")
+ if isinstance(df, pd.DataFrame)
+ else display("it is not a DataFrame")
+ )
  return df
  except EmptyDataError as e:
  continue
@@ -2096,19 +2177,18 @@ def fload(fpath, kind=None, **kwargs):

  def load_excel(fpath, **kwargs):
  engine = kwargs.get("engine", "openpyxl")
- verbose=kwargs.pop("verbose",False)
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("read_excel", verbose=verbose)
  df = pd.read_excel(fpath, engine=engine, **kwargs)
  try:
- meata=pd.ExcelFile(fpath)
+ meata = pd.ExcelFile(fpath)
  print(f"n_sheet={len(meata.sheet_names)},\t'sheetname = 0 (default)':")
- [print(f"{i}:\t{i_}") for i,i_ in enumerate(meata.sheet_names)]
+ [print(f"{i}:\t{i_}") for i, i_ in enumerate(meata.sheet_names)]
  except:
  pass
  return df

-
  def load_parquet(fpath, **kwargs):
  """
  Load a Parquet file into a Pandas DataFrame with advanced options.
@@ -2124,16 +2204,16 @@ def fload(fpath, kind=None, **kwargs):
  Returns:
  - df (DataFrame): The loaded DataFrame.
  """
-
+
  engine = kwargs.get("engine", "pyarrow")
  verbose = kwargs.pop("verbose", False)
-
+
  if run_once_within():
  use_pd("read_parquet", verbose=verbose)
  try:
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
  if verbose:
- if 'columns' in kwargs:
+ if "columns" in kwargs:
  print(f"Loaded columns: {kwargs['columns']}")
  else:
  print("Loaded all columns.")
@@ -2142,11 +2222,12 @@ def fload(fpath, kind=None, **kwargs):
  print(f"An error occurred while loading the Parquet file: {e}")
  df = None

- return df
+ return df

  def load_ipynb(fpath, **kwargs):
  import nbformat
  from nbconvert import MarkdownExporter
+
  as_version = kwargs.get("as_version", 4)
  with open(fpath, "r") as file:
  nb = nbformat.read(file, as_version=as_version)
@@ -2177,6 +2258,7 @@ def fload(fpath, kind=None, **kwargs):
  If the specified page is not found, it returns the string "Page is not found".
  """
  from PyPDF2 import PdfReader
+
  text_dict = {}
  with open(fpath, "rb") as file:
  pdf_reader = PdfReader(file)
@@ -2207,6 +2289,7 @@ def fload(fpath, kind=None, **kwargs):

  def load_docx(fpath):
  from docx import Document
+
  doc = Document(fpath)
  content = [para.text for para in doc.paragraphs]
  return content
@@ -2216,21 +2299,55 @@ def fload(fpath, kind=None, **kwargs):
  kind = kind.lower()
  kind = kind.lstrip(".").lower()
  img_types = [
- "bmp","eps","gif","png","jpg","jpeg","jpeg2000","tiff","tif",
- "icns","ico","im","msp","pcx","ppm","sgi","spider","tga","webp",
+ "bmp",
+ "eps",
+ "gif",
+ "png",
+ "jpg",
+ "jpeg",
+ "jpeg2000",
+ "tiff",
+ "tif",
+ "icns",
+ "ico",
+ "im",
+ "msp",
+ "pcx",
+ "ppm",
+ "sgi",
+ "spider",
+ "tga",
+ "webp",
  ]
  doc_types = [
- "docx","pdf",
- "txt","csv","xlsx","tsv","parquet","snappy",
- "md","html",
- "json","yaml","xml",
+ "docx",
+ "pdf",
+ "txt",
+ "csv",
+ "xlsx",
+ "tsv",
+ "parquet",
+ "snappy",
+ "md",
+ "html",
+ "json",
+ "yaml",
+ "xml",
  "ipynb",
- "mtx"
+ "mtx",
  ]
  zip_types = [
- "gz","zip","7z","rar","tgz",
- "tar","tar.gz","tar.bz2",
- "bz2","xz","gzip"
+ "gz",
+ "zip",
+ "7z",
+ "rar",
+ "tgz",
+ "tar",
+ "tar.gz",
+ "tar.bz2",
+ "bz2",
+ "xz",
+ "gzip",
  ]
  other_types = ["fcs"]
  supported_types = [*doc_types, *img_types, *zip_types, *other_types]
@@ -2266,17 +2383,17 @@ def fload(fpath, kind=None, **kwargs):
  return load_yaml(fpath)
  elif kind == "xml":
  return load_xml(fpath)
- elif kind in ["csv","tsv"]:
- verbose=kwargs.pop('verbose',False)
+ elif kind in ["csv", "tsv"]:
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("read_csv")
  content = load_csv(fpath, **kwargs)
  return content
- elif kind=='pkl':
- verbose=kwargs.pop('verbose',False)
+ elif kind == "pkl":
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("read_pickle")
- return pd.read_pickle(fpath,**kwargs)
+ return pd.read_pickle(fpath, **kwargs)
  elif kind in ["ods", "ods", "odt"]:
  engine = kwargs.get("engine", "odf")
  kwargs.pop("engine", None)
@@ -2286,38 +2403,39 @@ def fload(fpath, kind=None, **kwargs):
  kwargs.pop("engine", None)
  content = load_excel(fpath, engine=engine, **kwargs)
  print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
- display(content.head(3)) if isinstance(content, pd.DataFrame) else None
+ display(content.head(3)) if isinstance(content, pd.DataFrame) else None
  return content
  elif kind == "xlsx":
  content = load_excel(fpath, **kwargs)
  display(content.head(3)) if isinstance(content, pd.DataFrame) else None
  print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
  return content
- elif kind=='mtx':
+ elif kind == "mtx":
  from scipy.io import mmread
- dat_mtx=mmread(fpath)
- content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
+
+ dat_mtx = mmread(fpath)
+ content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
  display(content.head(3)) if isinstance(content, pd.DataFrame) else None
  print(f"shape: {content.shape}")
  return content
  elif kind == "ipynb":
  return load_ipynb(fpath, **kwargs)
- elif kind in ['parquet','snappy']:
- verbose=kwargs.pop('verbose',False)
+ elif kind in ["parquet", "snappy"]:
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("read_parquet")
- return load_parquet(fpath,**kwargs)
- elif kind =='feather':
- verbose=kwargs.pop('verbose',False)
+ return load_parquet(fpath, **kwargs)
+ elif kind == "feather":
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("read_feather")
- content=pd.read_feather(fpath,**kwargs)
+ content = pd.read_feather(fpath, **kwargs)
  return content
- elif kind =='h5':
- content=pd.read_hdf(fpath,**kwargs)
+ elif kind == "h5":
+ content = pd.read_hdf(fpath, **kwargs)
  return content
- elif kind =='pkl':
- content=pd.read_pickle(fpath,**kwargs)
+ elif kind == "pkl":
+ content = pd.read_pickle(fpath, **kwargs)
  return content
  elif kind == "pdf":
  # print('usage:load_pdf(fpath, page="all", verbose=False)')
@@ -2325,11 +2443,13 @@ def fload(fpath, kind=None, **kwargs):
  elif kind.lower() in img_types:
  print(f'Image ".{kind}" is loaded.')
  return load_img(fpath)
- elif kind=="gz" and fpath.endswith(".soft.gz"):
+ elif kind == "gz" and fpath.endswith(".soft.gz"):
  import GEOparse
+
  return GEOparse.get_GEO(filepath=fpath)
  elif kind.lower() in zip_types:
  from pprint import pp
+
  keep = kwargs.get("keep", False)
  fpath_unzip = unzip(fpath)
  if os.path.isdir(fpath_unzip):
@@ -2364,7 +2484,7 @@ def fload(fpath, kind=None, **kwargs):
  meta, data = fcsparser.parse(fpath, reformat_meta=True)
  return meta, data

- elif kind=="mplstyle":
+ elif kind == "mplstyle":
  return read_mplstyle(fpath)

  else:
@@ -2408,7 +2528,7 @@ def fupdate(fpath, content=None, how="head"):
  """
  Update a file by adding new content at the top and moving the old content to the bottom.
  If the file is a JSON file, merge the new content with the old content.
-
+
  Parameters
  ----------
  fpath : str
@@ -2416,7 +2536,7 @@ def fupdate(fpath, content=None, how="head"):
  content : str or dict, optional
  The new content to add at the top of the file (for text) or merge (for JSON).
  If not provided, the function will not add any new content.
-
+
  Notes
  -----
  - If the file at `fpath` does not exist, it will be created.
@@ -2425,14 +2545,20 @@ def fupdate(fpath, content=None, how="head"):
  """
  content = content or ""
  file_ext = os.path.splitext(fpath)[1]
- how_s=["head", "tail","start","end","beginning", "stop",'last',"before"]
+ how_s = ["head", "tail", "start", "end", "beginning", "stop", "last", "before"]
  how = strcmp(how, how_s)[0]
  print(how)
- add_where = 'head' if how in ["head", "start","beginning", "before"] else "tail"
+ add_where = "head" if how in ["head", "start", "beginning", "before"] else "tail"
  if "json" in file_ext.lower():
- old_content=fload(fpath,kind='json') if os.path.exists(fpath) else {}
- updated_content = {**content,**old_content} if add_where=="head" else {**old_content, **content} if isinstance(content, dict) else old_content
- fsave(fpath,updated_content)
+ old_content = fload(fpath, kind="json") if os.path.exists(fpath) else {}
+ updated_content = (
+ {**content, **old_content}
+ if add_where == "head"
+ else (
+ {**old_content, **content} if isinstance(content, dict) else old_content
+ )
+ )
+ fsave(fpath, updated_content)
  else:
  # Handle text file
  if os.path.exists(fpath):
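
Illustration (editorial sketch, not part of the released diff): in the reformatted JSON branch of `fupdate` above, the dict-unpacking order decides which side wins when keys collide. A minimal sketch with hypothetical data:

    old_content = {"a": 1, "b": 2}      # content already on disk
    content = {"b": 99, "c": 3}         # new content passed to fupdate

    # add_where == "head": existing values take precedence on conflicts
    head = {**content, **old_content}   # {"b": 2, "c": 3, "a": 1}
    # otherwise: the new content overwrites existing keys
    tail = {**old_content, **content}   # {"a": 1, "b": 99, "c": 3}
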
@@ -2443,7 +2569,7 @@ def fupdate(fpath, content=None, how="head"):

  # Write new content at the top followed by old content
  with open(fpath, "w") as file:
- if add_where=="head":
+ if add_where == "head":
  file.write(content + "\n")
  file.write(old_content)
  else:
@@ -2478,7 +2604,9 @@ def filter_kwargs(kws, valid_kwargs):
  }
  return kwargs_filtered

- str_space_speed='sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
+
+ str_space_speed = 'sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
+

  def fsave(
  fpath,
@@ -2515,6 +2643,7 @@ def fsave(

  def save_docx(fpath, content, font_name, font_size, spacing):
  import docx
+
  if isinstance(content, str):
  content = content.split(". ")
  doc = docx.Document()
@@ -2543,6 +2672,7 @@ def fsave(

  def save_pdf(fpath, content, font_name, font_size):
  from fpdf import FPDF
+
  pdf = FPDF()
  pdf.add_page()
  # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2555,7 +2685,7 @@ def fsave(
  def save_csv(fpath, data, **kwargs):
  # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

- verbose=kwargs.pop("verbose",False)
+ verbose = kwargs.pop("verbose", False)
  if run_once_within():
  use_pd("to_csv", verbose=verbose)
  kwargs_csv = dict(
@@ -2586,7 +2716,7 @@ def fsave(
  df.to_csv(fpath, **kwargs_valid)

  def save_xlsx(fpath, data, **kwargs):
- verbose=kwargs.pop("verbose",False)
+ verbose = kwargs.pop("verbose", False)
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
  if run_once_within():
  use_pd("to_excel", verbose=verbose)
@@ -2595,9 +2725,21 @@ def fsave(
  else:
  # Remove non-relevant kwargs
  irrelevant_keys = [
- "format", "usage", "cell", "width", "height", "height_max", "merge",
- "shade", "comment", "link", "protect", "number_format", "conditional_format",
- "index_default"]
+ "format",
+ "usage",
+ "cell",
+ "width",
+ "height",
+ "height_max",
+ "merge",
+ "shade",
+ "comment",
+ "link",
+ "protect",
+ "number_format",
+ "conditional_format",
+ "index_default",
+ ]
  for key in irrelevant_keys:
  kwargs.pop(key, None)

@@ -2605,19 +2747,21 @@ def fsave(
  # Check if the file exists, then append the sheet, otherwise create a new file
  try:
  # Use ExcelWriter with append mode if the file exists
- with pd.ExcelWriter(fpath, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
+ with pd.ExcelWriter(
+ fpath, engine="openpyxl", mode="a", if_sheet_exists="new"
+ ) as writer:
  df.to_excel(writer, sheet_name=sheet_name, index=False, **kwargs)
  except FileNotFoundError:
  # If file doesn't exist, create a new one
  df.to_excel(fpath, sheet_name=sheet_name, index=False, **kwargs)

-
  def save_ipynb(fpath, data, **kwargs):
  # Split the content by code fences to distinguish between code and markdown
  import nbformat
+
  parts = data.split("```")
  cells = []
-
+
  for i, part in enumerate(parts):
  if i % 2 == 0:
  # Even index: markdown content
@@ -2638,17 +2782,18 @@ def fsave(

  def save_json(fpath_fname, var_dict_or_df):
  import json
+
  def _convert_js(data):
  if isinstance(data, pd.DataFrame):
- return data.to_dict(orient="list")
+ return data.to_dict(orient="list")
  elif isinstance(data, np.ndarray):
  return data.tolist()
  elif isinstance(data, dict):
  return {key: _convert_js(value) for key, value in data.items()}
- return data
+ return data

  serializable_data = _convert_js(var_dict_or_df)
-
+
  # Save the serializable data to the JSON file
  with open(fpath_fname, "w") as f_json:
  json.dump(serializable_data, f_json, indent=4)
@@ -2660,11 +2805,13 @@ def fsave(

  def save_yaml(fpath, data, **kwargs):
  import yaml
+
  with open(fpath, "w") as file:
  yaml.dump(data, file, **kwargs)

  def save_xml(fpath, data):
  from lxml import etree
+
  root = etree.Element("root")
  if isinstance(data, dict):
  for key, val in data.items():
@@ -2675,24 +2822,37 @@ def fsave(
  tree = etree.ElementTree(root)
  tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")

- def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
- engine = kwargs.pop("engine","auto") # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
- compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
+ def save_parquet(fpath: str, data: pd.DataFrame, **kwargs):
+ engine = kwargs.pop(
+ "engine", "auto"
+ ) # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
+ compression = kwargs.pop(
+ "compression", None
+ ) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
  try:
  # Attempt to save with "pyarrow" if engine is set to "auto"
- data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
- print(f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression.")
+ data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
+ print(
+ f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression."
+ )
  except Exception as e:
- print(f"Error using with engine '{engine}' and {compression} compression: {e}")
+ print(
+ f"Error using with engine '{engine}' and {compression} compression: {e}"
+ )
  if "Sparse" in str(e):
  try:
  # Handle sparse data by converting columns to dense
  print("Attempting to convert sparse columns to dense format...")
- data = data.apply(lambda x: x.sparse.to_dense() if pd.api.types.is_sparse(x) else x)
- save_parquet(fpath, data=data,**kwargs)
+ data = data.apply(
+ lambda x: (
+ x.sparse.to_dense() if pd.api.types.is_sparse(x) else x
+ )
+ )
+ save_parquet(fpath, data=data, **kwargs)
  except Exception as last_e:
- print(f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}")
-
+ print(
+ f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}"
+ )

  if kind is None:
  _, kind = os.path.splitext(fpath)
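
Illustration (editorial sketch, not part of the released diff): the `save_parquet` helper above falls back to densifying sparse columns and retrying when `DataFrame.to_parquet` rejects sparse dtypes. A standalone sketch of that fallback, using a hypothetical DataFrame and output path:

    import pandas as pd

    df = pd.DataFrame({"x": pd.arrays.SparseArray([0, 0, 1])})
    try:
        df.to_parquet("out.parquet", engine="auto", compression=None)
    except Exception as e:
        if "Sparse" in str(e):
            # convert sparse columns to dense, then retry once
            df = df.apply(
                lambda c: c.sparse.to_dense() if isinstance(c.dtype, pd.SparseDtype) else c
            )
            df.to_parquet("out.parquet", engine="auto", compression=None)
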
@@ -2739,92 +2899,95 @@ def fsave(
  save_yaml(fpath, content, **kwargs)
  elif kind == "ipynb":
  save_ipynb(fpath, content, **kwargs)
- elif kind.lower() in ["parquet","pq","big","par"]:
- verbose=kwargs.pop('verbose',False)
+ elif kind.lower() in ["parquet", "pq", "big", "par"]:
+ verbose = kwargs.pop("verbose", False)
  if verbose:
  print(str_space_speed)
  use_pd("to_parquet")
  return None
- compression=kwargs.pop("compression",None) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
+ compression = kwargs.pop(
+ "compression", None
+ ) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
  # fix the fpath ends
  _fpath, _ext = os.path.splitext(fpath)
- fpath = _fpath+_ext.replace(kind, 'parquet')
+ fpath = _fpath + _ext.replace(kind, "parquet")
  if compression is not None:
  if not fpath.endswith(compression):
- fpath=fpath+f".{compression}"
- save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
- elif kind.lower() in ["pkl","pk","pickle","pick"]:
- # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
- # Pickle is convenient if you want to preserve exact Python object types.
- verbose=kwargs.pop('verbose',False)
+ fpath = fpath + f".{compression}"
+ save_parquet(fpath=fpath, data=content, compression=compression, **kwargs)
+ elif kind.lower() in ["pkl", "pk", "pickle", "pick"]:
+ # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+ # Pickle is convenient if you want to preserve exact Python object types.
+ verbose = kwargs.pop("verbose", False)
  if verbose:
  print(str_space_speed)
  use_pd("to_pickle")
  return None
  _fpath, _ext = os.path.splitext(fpath)
- fpath = _fpath+_ext.replace(kind, 'pkl')
- compression=kwargs.pop("compression",None)
+ fpath = _fpath + _ext.replace(kind, "pkl")
+ compression = kwargs.pop("compression", None)
  if compression is not None:
  if not fpath.endswith(compression["method"]):
- fpath=fpath+f".{compression["method"]}"
+ fpath = fpath + f".{compression['method']}"
  if isinstance(content, pd.DataFrame):
- content.to_pickle(fpath,**kwargs)
+ content.to_pickle(fpath, **kwargs)
  else:
  try:
  print("trying to convert it as a DataFrame...")
- content=pd.DataFrame(content)
- content.to_pickle(fpath,**kwargs)
+ content = pd.DataFrame(content)
+ content.to_pickle(fpath, **kwargs)
  except Exception as e:
  raise ValueError(
- f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
- )
- elif kind.lower() in ["fea",'feather','ft','fe','feat','fether']:
- # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
- # optimized for data analytics tasks and is especially fast when working with Pandas.
-
- verbose=kwargs.pop('verbose',False)
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+ )
+ elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
+ # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+ # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+ verbose = kwargs.pop("verbose", False)
  if verbose:
  print(str_space_speed)
  use_pd("to_feather")
  return None
  _fpath, _ext = os.path.splitext(fpath)
- fpath = _fpath+_ext.replace(kind, 'feather')
+ fpath = _fpath + _ext.replace(kind, "feather")
  if isinstance(content, pd.DataFrame):
- content.to_feather(fpath,**kwargs)
+ content.to_feather(fpath, **kwargs)
  else:
  try:
  print("trying to convert it as a DataFrame...")
- content=pd.DataFrame(content)
+ content = pd.DataFrame(content)
  content.to_feather(fpath, **kwargs)
  except Exception as e:
  raise ValueError(
- f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
- )
- elif kind.lower() in ["hd",'hdf','h','h5']:
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+ )
+ elif kind.lower() in ["hd", "hdf", "h", "h5"]:
  # particularly useful for large datasets and can handle complex data structures
- verbose=kwargs.pop('verbose',False)
+ verbose = kwargs.pop("verbose", False)
  if verbose:
  print(str_space_speed)
  use_pd("to_hdf")
  _fpath, _ext = os.path.splitext(fpath)
- fpath = _fpath+_ext.replace(kind, 'h5')
- compression=kwargs.pop("compression",None)
+ fpath = _fpath + _ext.replace(kind, "h5")
+ compression = kwargs.pop("compression", None)
  if compression is not None:
  if not fpath.endswith(compression):
- fpath=fpath+f".{compression}"
+ fpath = fpath + f".{compression}"
  if isinstance(content, pd.DataFrame):
- content.to_hdf(fpath,key='content',**kwargs)
+ content.to_hdf(fpath, key="content", **kwargs)
  else:
  try:
  print("trying to convert it as a DataFrame...")
- content=pd.DataFrame(content)
- content.to_hdf(fpath,**kwargs)
+ content = pd.DataFrame(content)
+ content.to_hdf(fpath, **kwargs)
  except Exception as e:
  raise ValueError(
- f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
- )
+ f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+ )
  else:
  from . import netfinder
+
  try:
  netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
  except:
@@ -2948,6 +3111,7 @@ def isa(content, kind):
  return is_str_color(content)
  elif "html" in kind.lower():
  import re
+
  if content is None or not isinstance(content, str):
  return False
  # Remove leading and trailing whitespace
@@ -2997,8 +3161,8 @@ def listdir(
  verbose=True,
  ):
  if kind is None:
- ls=os.listdir(rootdir)
- ls = [f for f in ls if not f.startswith('.') and not f.startswith('~')]
+ ls = os.listdir(rootdir)
+ ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
  print(ls)
  df_all = pd.DataFrame(
  {
@@ -3029,7 +3193,7 @@ def listdir(

  if os.path.isdir(rootdir):
  ls = os.listdir(rootdir)
- ls = [f for f in ls if not f.startswith('.') and not f.startswith('~')]
+ ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
  fd = [".fd", ".fld", ".fol", ".fd", ".folder"]
  i = 0
  f = {
@@ -3108,6 +3272,7 @@ def listdir(
  return f
  else:
  from box import Box
+
  if "l" in orient.lower(): # list # default
  res_output = Box(f.to_dict(orient="list"))
  return res_output
@@ -3151,7 +3316,7 @@ def mkdir_nest(fpath: str) -> str:
  # Split the full path into directories
  f_slash = "/" if "mac" in get_os().lower() else "\\"
  if os.path.isdir(fpath):
- fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+ fpath = fpath + f_slash if not fpath.endswith(f_slash) else fpath
  return fpath
  dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator

@@ -3181,27 +3346,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
  - str: The path of the created directory or an error message.
  """

- rootdir = []
+ rootdir = []
  if chdir is None:
  return mkdir_nest(pardir)
  if isinstance(chdir, str):
- chdir = [chdir]
+ chdir = [chdir]
  chdir = list(set(chdir))
  if isinstance(pardir, str): # Dir_parents should be 'str' type
- pardir = os.path.normpath(pardir)
+ pardir = os.path.normpath(pardir)
  if "mac" in get_os().lower() or "lin" in get_os().lower():
  stype = "/"
  elif "win" in get_os().lower():
  stype = "\\"
  else:
  stype = "/"
-
+
  if os.path.isdir(pardir):
  os.chdir(pardir) # Set current path
  # Check if subdirectories are not empty
  if chdir:
- chdir.sort()
- for folder in chdir:
+ chdir.sort()
+ for folder in chdir:
  child_tmp = os.path.join(pardir, folder)
  if not os.path.isdir(child_tmp):
  os.mkdir("./" + folder)
@@ -3221,7 +3386,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
  # Dir is the main output, if only one dir, then str type is inconvenient
  if len(rootdir) == 1:
  rootdir = rootdir[0]
- rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+ rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir

  return rootdir

@@ -3236,6 +3401,7 @@ def split_path(fpath):
3236
3401
  def figsave(*args, dpi=300):
3237
3402
  import matplotlib.pyplot as plt
3238
3403
  from PIL import Image
3404
+
3239
3405
  dir_save = None
3240
3406
  fname = None
3241
3407
  img = None
@@ -3250,7 +3416,7 @@ def figsave(*args, dpi=300):
3250
3416
  img = arg # Store the PIL image if provided
3251
3417
 
3252
3418
  if dir_save is None:
3253
- dir_save="./"
3419
+ dir_save = "./"
3254
3420
 
3255
3421
  # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
3256
3422
  dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
@@ -3343,8 +3509,9 @@ def figsave(*args, dpi=300):
3343
3509
 
3344
3510
  def is_str_color(s):
3345
3511
  # Regular expression pattern for hexadecimal color codes
3346
- if isinstance(s,str):
3512
+ if isinstance(s, str):
3347
3513
  import re
3514
+
3348
3515
  color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
3349
3516
  return re.match(color_code_pattern, s) is not None
3350
3517
  else:
@@ -3372,6 +3539,7 @@ def isnum(s):
3372
3539
 
3373
3540
  def is_image(fpath):
3374
3541
  import mimetypes
3542
+
3375
3543
  mime_type, _ = mimetypes.guess_type(fpath)
3376
3544
  if mime_type and mime_type.startswith("image"):
3377
3545
  return True
@@ -3381,6 +3549,7 @@ def is_image(fpath):
3381
3549
 
3382
3550
  def is_document(fpath):
3383
3551
  import mimetypes
3552
+
3384
3553
  mime_type, _ = mimetypes.guess_type(fpath)
3385
3554
  if mime_type and (
3386
3555
  mime_type.startswith("text/")
@@ -3402,6 +3571,7 @@ def is_document(fpath):
3402
3571
 
3403
3572
  def is_zip(fpath):
3404
3573
  import mimetypes
3574
+
3405
3575
  mime_type, _ = mimetypes.guess_type(fpath)
3406
3576
  if mime_type == "application/zip":
3407
3577
  return True
@@ -3411,6 +3581,7 @@ def is_zip(fpath):
3411
3581
 
3412
3582
  def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
3413
3583
  import matplotlib.pyplot as plt
3584
+
3414
3585
  if ax is None:
3415
3586
  ax = plt.gca()
3416
3587
  for loc, spine in ax.spines.items():
@@ -3500,6 +3671,7 @@ def apply_filter(img, *args):
3500
3671
  PIL.Image: The filtered image.
3501
3672
  """
3502
3673
  from PIL import ImageFilter
3674
+
3503
3675
  def correct_filter_name(filter_name):
3504
3676
  if "bl" in filter_name.lower() and "box" not in filter_name.lower():
3505
3677
  return "BLUR"
@@ -3742,7 +3914,8 @@ def imgsets(img, **kwargs):
3742
3914
  return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
3743
3915
 
3744
3916
  import matplotlib.pyplot as plt
3745
- from PIL import ImageEnhance,ImageOps
3917
+ from PIL import ImageEnhance, ImageOps
3918
+
3746
3919
  # Load image if input is a file path
3747
3920
  if isinstance(img, str):
3748
3921
  img = load_img(img)
@@ -3807,6 +3980,7 @@ def imgsets(img, **kwargs):
3807
3980
  img_update = ImageOps.pad(img_update, size=value)
3808
3981
  elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
3809
3982
  from rembg import remove, new_session
3983
+
3810
3984
  if isinstance(value, bool):
3811
3985
  session = new_session("isnet-general-use")
3812
3986
  img_update = remove(img_update, session=session)
@@ -3846,6 +4020,7 @@ def imgsets(img, **kwargs):
3846
4020
  img_update = remove(img_update)
3847
4021
  elif "bg" in k.lower() and "color" in k.lower():
3848
4022
  from rembg import remove
4023
+
3849
4024
  if isinstance(value, list):
3850
4025
  value = tuple(value)
3851
4026
  if isinstance(value, tuple): # replace the background color
@@ -3879,6 +4054,7 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3879
4054
  """
3880
4055
  import matplotlib.pyplot as plt
3881
4056
  from PIL import Image
4057
+
3882
4058
  num_images = len(dir_img_list)
3883
4059
  if not kind.startswith("."):
3884
4060
  kind = "." + kind
@@ -3917,12 +4093,11 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
3917
4093
  # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
3918
4094
 
3919
4095
 
3920
-
3921
4096
  # search and find the directory of the library installed locally
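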
3922
4097
  def dir_lib(lib_oi):
3923
4098
  """
3924
4099
  # example usage:
3925
- # dir_lib("seaborn")
4100
+ # dir_lib("seaborn")
3926
4101
  """
3927
4102
  import site
3928
4103
 
@@ -3941,6 +4116,7 @@ def dir_lib(lib_oi):
3941
4116
  print(f"Cannot find the {lib_oi} in site-packages directory.")
3942
4117
  return dir_list
3943
4118
 
4119
+
3944
4120
  class FileInfo:
3945
4121
  def __init__(
3946
4122
  self,
@@ -4018,6 +4194,7 @@ class FileInfo:
4018
4194
 
4019
4195
  def finfo(fpath):
4020
4196
  import time
4197
+
4021
4198
  fname, fmt = os.path.splitext(fpath)
4022
4199
  dir_par = os.path.dirname(fpath) + "/"
4023
4200
  data = {
@@ -4033,6 +4210,7 @@ def finfo(fpath):
4033
4210
  extra_info = {}
4034
4211
  if data["kind"] == ".pdf":
4035
4212
  from pdf2image import pdfinfo_from_path
4213
+
4036
4214
  extra_info = pdfinfo_from_path(fpath)
4037
4215
 
4038
4216
  return FileInfo(
@@ -4047,6 +4225,7 @@ def finfo(fpath):
4047
4225
  extra_info=extra_info,
4048
4226
  )
4049
4227
 
4228
+
4050
4229
  # ! format excel file
4051
4230
  def hex2argb(hex_color):
4052
4231
  """
@@ -4078,7 +4257,10 @@ def hex2argb(hex_color):
4078
4257
  return hex_color[-9:]
4079
4258
  else:
4080
4259
  return "F" * (9 - len(hex_color)) + hex_color
4081
- raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
4260
+ raise ValueError(
4261
+ "Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
4262
+ )
4263
+
4082
4264
 
4083
4265
  def format_excel(
4084
4266
  df=None,
@@ -4137,7 +4319,15 @@ def format_excel(
4137
4319
  font_bold = False
4138
4320
  font_strike = False
4139
4321
  font_italic = False
4140
- kws_font = ["name","size","bold","underline","color","strike","italic"]
4322
+ kws_font = [
4323
+ "name",
4324
+ "size",
4325
+ "bold",
4326
+ "underline",
4327
+ "color",
4328
+ "strike",
4329
+ "italic",
4330
+ ]
4141
4331
  for k_, v_ in cell.get(K, {}).items():
4142
4332
  if strcmp(k_, kws_font)[0] == "name":
4143
4333
  font_name = v_
@@ -4167,9 +4357,31 @@ def format_excel(
4167
4357
  if strcmp(K, kws_cell)[0] == "fill":
4168
4358
  #! fill
4169
4359
  kws_fill = ["start_color", "end_color", "fill_type", "color"]
4170
- kws_fill_type = ["darkVertical","lightDown","lightGrid","solid","darkDown","lightGray","lightUp","gray0625","lightVertical","lightHorizontal",
4171
- "darkHorizontal","gray125","darkUp","mediumGray","darkTrellis","darkGray","lightTrellis","darkGrid"]
4172
- start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
4360
+ kws_fill_type = [
4361
+ "darkVertical",
4362
+ "lightDown",
4363
+ "lightGrid",
4364
+ "solid",
4365
+ "darkDown",
4366
+ "lightGray",
4367
+ "lightUp",
4368
+ "gray0625",
4369
+ "lightVertical",
4370
+ "lightHorizontal",
4371
+ "darkHorizontal",
4372
+ "gray125",
4373
+ "darkUp",
4374
+ "mediumGray",
4375
+ "darkTrellis",
4376
+ "darkGray",
4377
+ "lightTrellis",
4378
+ "darkGrid",
4379
+ ]
4380
+ start_color, end_color, fill_type = (
4381
+ "FFFFFF",
4382
+ "FFFFFF",
4383
+ "solid",
4384
+ ) # default
4173
4385
  for k, v in cell.get(K, {}).items():
4174
4386
  if strcmp(k, kws_fill)[0] == "color":
4175
4387
  start_color, end_color = hex2argb(v), hex2argb(v)
@@ -4241,27 +4453,78 @@ def format_excel(
4241
4453
 
4242
4454
  if strcmp(K, kws_cell)[0] == "border":
4243
4455
  #! border
4244
- kws_border = ["color_left","color_l","color_right","color_r","color_top","color_t","color_bottom","color_b",
4245
- "color_diagonal","color_d","color_outline","color_o","color_vertical","color_v","color_horizontal",
4246
- "color_h","color","style_left","style_l","style_right","style_r","style_top","style_t","style_bottom","style_b",
4247
- "style_diagonal","style_d","style_outline","style_o","style_vertical","style_v","style_horizontal",
4248
- "style_h","style"]
4456
+ kws_border = [
4457
+ "color_left",
4458
+ "color_l",
4459
+ "color_right",
4460
+ "color_r",
4461
+ "color_top",
4462
+ "color_t",
4463
+ "color_bottom",
4464
+ "color_b",
4465
+ "color_diagonal",
4466
+ "color_d",
4467
+ "color_outline",
4468
+ "color_o",
4469
+ "color_vertical",
4470
+ "color_v",
4471
+ "color_horizontal",
4472
+ "color_h",
4473
+ "color",
4474
+ "style_left",
4475
+ "style_l",
4476
+ "style_right",
4477
+ "style_r",
4478
+ "style_top",
4479
+ "style_t",
4480
+ "style_bottom",
4481
+ "style_b",
4482
+ "style_diagonal",
4483
+ "style_d",
4484
+ "style_outline",
4485
+ "style_o",
4486
+ "style_vertical",
4487
+ "style_v",
4488
+ "style_horizontal",
4489
+ "style_h",
4490
+ "style",
4491
+ ]
4249
4492
  # * border color
4250
- border_color_l, border_color_r, border_color_t, border_color_b = ("FF000000","FF000000","FF000000","FF000000")
4251
- border_color_d, border_color_o, border_color_v, border_color_h = ("FF000000","FF000000","FF000000","FF000000")
4493
+ border_color_l, border_color_r, border_color_t, border_color_b = (
4494
+ "FF000000",
4495
+ "FF000000",
4496
+ "FF000000",
4497
+ "FF000000",
4498
+ )
4499
+ border_color_d, border_color_o, border_color_v, border_color_h = (
4500
+ "FF000000",
4501
+ "FF000000",
4502
+ "FF000000",
4503
+ "FF000000",
4504
+ )
4252
4505
  # get colors config
4253
4506
  for k, v in cell.get(K, {}).items():
4254
4507
  if strcmp(k, kws_border)[0] in ["color"]:
4255
4508
  border_color_all = hex2argb(v)
4256
4509
  # if a single color is given, all border colors are first set to that same value
4257
4510
  # only afterwards are the individual border colors overridden
4258
- border_color_l, border_color_r, border_color_t, border_color_b = (
4511
+ (
4512
+ border_color_l,
4513
+ border_color_r,
4514
+ border_color_t,
4515
+ border_color_b,
4516
+ ) = (
4259
4517
  border_color_all,
4260
4518
  border_color_all,
4261
4519
  border_color_all,
4262
4520
  border_color_all,
4263
4521
  )
4264
- border_color_d, border_color_o, border_color_v, border_color_h = (
4522
+ (
4523
+ border_color_d,
4524
+ border_color_o,
4525
+ border_color_v,
4526
+ border_color_h,
4527
+ ) = (
4265
4528
  border_color_all,
4266
4529
  border_color_all,
4267
4530
  border_color_all,
@@ -4284,10 +4547,31 @@ def format_excel(
4284
4547
  elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
4285
4548
  border_color_h = hex2argb(v)
4286
4549
  # *border style
4287
- border_styles = ["thin","medium","thick","dotted","dashed",
4288
- "hair","mediumDashed","dashDot","dashDotDot","slantDashDot","none"]
4289
- border_style_l, border_style_r, border_style_t, border_style_b = (None,None,None,None)
4290
- border_style_d, border_style_o, border_style_v, border_style_h = (None,None,None,None)
4550
+ border_styles = [
4551
+ "thin",
4552
+ "medium",
4553
+ "thick",
4554
+ "dotted",
4555
+ "dashed",
4556
+ "hair",
4557
+ "mediumDashed",
4558
+ "dashDot",
4559
+ "dashDotDot",
4560
+ "slantDashDot",
4561
+ "none",
4562
+ ]
4563
+ border_style_l, border_style_r, border_style_t, border_style_b = (
4564
+ None,
4565
+ None,
4566
+ None,
4567
+ None,
4568
+ )
4569
+ border_style_d, border_style_o, border_style_v, border_style_h = (
4570
+ None,
4571
+ None,
4572
+ None,
4573
+ None,
4574
+ )
4291
4575
  # get styles config
4292
4576
  for k, v in cell.get(K, {}).items():
4293
4577
  # if not "style" in k:
@@ -4296,13 +4580,23 @@ def format_excel(
4296
4580
  border_style_all = strcmp(v, border_styles)[0]
4297
4581
  # if a single style is given, all border styles are first set to that same value
4298
4582
  # only afterwards are the individual border styles overridden
4299
- border_style_l, border_style_r, border_style_t, border_style_b = (
4583
+ (
4584
+ border_style_l,
4585
+ border_style_r,
4586
+ border_style_t,
4587
+ border_style_b,
4588
+ ) = (
4300
4589
  border_style_all,
4301
4590
  border_style_all,
4302
4591
  border_style_all,
4303
4592
  border_style_all,
4304
4593
  )
4305
- border_style_d, border_style_o, border_style_v, border_style_h = (
4594
+ (
4595
+ border_style_d,
4596
+ border_style_o,
4597
+ border_style_v,
4598
+ border_style_h,
4599
+ ) = (
4306
4600
  border_style_all,
4307
4601
  border_style_all,
4308
4602
  border_style_all,
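For illustration only (not part of the package diff): the font, fill and border keyword lists in this function mirror openpyxl's styling vocabulary (aRGB colors such as "FF000000", fill types such as "solid", border styles such as "thin"). Assuming openpyxl is the backend, as the later cell_.border / cell_.alignment assignments suggest, a minimal sketch of the style objects format_excel assembles:

    # Illustrative sketch, assuming openpyxl as the styling backend.
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill, Border, Side, Alignment

    wb = Workbook()
    ws = wb.active
    ws["A1"] = "header"
    ws["A1"].font = Font(name="Arial", size=11, bold=True, color="FF000000")
    ws["A1"].fill = PatternFill(start_color="FFFFFF00", end_color="FFFFFF00", fill_type="solid")
    thin = Side(style="thin", color="FF000000")
    ws["A1"].border = Border(left=thin, right=thin, top=thin, bottom=thin)
    ws["A1"].alignment = Alignment(horizontal="center", vertical="center")
    wb.save("formatted.xlsx")  # hypothetical output path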
@@ -4348,6 +4642,7 @@ def format_excel(
4348
4642
  cell_.alignment = cell_alignment
4349
4643
  if border:
4350
4644
  cell_.border = border
4645
+
4351
4646
  if not isinstance(df, pd.DataFrame):
4352
4647
  try:
4353
4648
  print(f"is loading file {os.path.basename(df)}")
@@ -4697,6 +4992,7 @@ def preview(var):
4697
4992
  """Master function to preview formatted variables in Jupyter."""
4698
4993
  from bs4 import BeautifulSoup
4699
4994
  from IPython.display import display, HTML, Markdown
4995
+
4700
4996
  if isinstance(var, str):
4701
4997
  if isa(var, "html"):
4702
4998
  display(HTML(var)) # Render as HTML
@@ -4714,6 +5010,7 @@ def preview(var):
4714
5010
 
4715
5011
  elif isinstance(var, list) or isinstance(var, dict):
4716
5012
  import json
5013
+
4717
5014
  # Display JSON
4718
5015
  json_str = json.dumps(var, indent=4)
4719
5016
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4728,6 +5025,7 @@ def preview(var):
4728
5025
 
4729
5026
  elif isinstance(var, dict):
4730
5027
  import json
5028
+
4731
5029
  # Handle dictionary formatting
4732
5030
  json_str = json.dumps(var, indent=4)
4733
5031
  display(Markdown(f"```json\n{json_str}\n```"))
@@ -4735,12 +5033,15 @@ def preview(var):
4735
5033
  else:
4736
5034
  # If the format is not recognized, print a message
4737
5035
  print("Format not recognized or unsupported.")
5036
+
5037
+
4738
5038
  # # Example usages:
4739
5039
  # preview("This is a plain text message.")
4740
5040
  # preview("# This is a Markdown header")
4741
5041
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4742
5042
  # preview({"key": "value", "numbers": [1, 2, 3]})
4743
5043
 
5044
+
4744
5045
  def _df_outlier(
4745
5046
  data,
4746
5047
  columns=None,
@@ -4880,51 +5181,53 @@ def df_outlier(
4880
5181
  processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
4881
5182
  processed_data = processed_data[col_names_org]
4882
5183
  return processed_data
4883
-
4884
5184
 
4885
5185
 
4886
5186
  def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
4887
5187
  """
4888
5188
  Extend a DataFrame by the list elements in the column.
4889
-
5189
+
4890
5190
  Parameters:
4891
5191
  ----------
4892
5192
  data : pd.DataFrame
4893
5193
  The input DataFrame to be extended.
4894
-
5194
+
4895
5195
  column : str
4896
5196
  The name of the column to be split.
4897
-
5197
+
4898
5198
  axis : int, optional
4899
- The axis along which to expand the DataFrame.
5199
+ The axis along which to expand the DataFrame.
4900
5200
  - 0 (default): Expand the specified column into multiple rows.
4901
5201
  - 1: Expand the specified column into multiple columns.
4902
-
5202
+
4903
5203
  sep : str, optional
4904
5204
  The separator used to split the values in the specified column.
4905
5205
  Must be provided for the function to work correctly.
4906
5206
  """
4907
-
4908
- data = data.copy()
5207
+
5208
+ data = data.copy()
4909
5209
  mask = data[column].str.contains(sep, na=False)
4910
5210
  data = data.copy()
4911
5211
  if mask.any():
4912
- data[column] = (
4913
- data[column]
4914
- .apply(lambda x: x.split(sep) if isinstance(x, str) else x) # Only split if x is a string
4915
- )
4916
-
5212
+ data[column] = data[column].apply(
5213
+ lambda x: x.split(sep) if isinstance(x, str) else x
5214
+ ) # Only split if x is a string
5215
+
4917
5216
  # Strip spaces from each item in the lists
4918
- data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
4919
-
5217
+ data[column] = data[column].apply(
5218
+ lambda x: [item.strip() for item in x] if isinstance(x, list) else x
5219
+ )
5220
+
4920
5221
  data = data.explode(column, ignore_index=True)
4921
5222
  return data
5223
+
5224
+
4922
5225
  # ! DataFrame
4923
5226
  def df_astype(
4924
5227
  data: pd.DataFrame,
4925
5228
  columns: Optional[Union[str, List[str]]] = None,
4926
5229
  astype: str = "datetime",
4927
- skip_row:Union[str,list]=None,
5230
+ skip_row: Union[str, list] = None,
4928
5231
  fmt: Optional[str] = None,
4929
5232
  inplace: bool = True,
4930
5233
  errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
@@ -4982,7 +5285,8 @@ def df_astype(
4982
5285
  "second",
4983
5286
  "time",
4984
5287
  "week",
4985
- "date","day",
5288
+ "date",
5289
+ "day",
4986
5290
  "month",
4987
5291
  "year",
4988
5292
  ]
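For illustration only (not part of the package diff): the list above enumerates the date/time keywords that df_astype resolves through strcmp. A hypothetical call, based only on the signature visible earlier in this diff (the DataFrame, column name and format string are invented):

    # Hypothetical usage sketch -- not part of the package code.
    import pandas as pd
    from py2ls.ips import df_astype  # the module this diff modifies

    df = pd.DataFrame({"when": ["2024-01-01", "2024-02-15"]})
    # inplace=True by default per the signature above, so the column is converted in place
    df_astype(df, columns="when", astype="datetime", fmt="%Y-%m-%d")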
@@ -4990,18 +5294,18 @@ def df_astype(
4990
5294
  if not inplace:
4991
5295
  data = data.copy()
4992
5296
  if skip_row is not None:
4993
- data = data.drop(index=skip_row, errors='ignore')
5297
+ data = data.drop(index=skip_row, errors="ignore")
4994
5298
  # If columns is None, apply to all columns
4995
5299
  if columns is None:
4996
5300
  columns = data.columns.tolist()
4997
5301
  # correct the astype input
4998
- if isinstance(astype,str):
5302
+ if isinstance(astype, str):
4999
5303
  astype = strcmp(astype, astypes)[0]
5000
5304
  print(f"converting as type: {astype}")
5001
- elif isinstance(astype,dict):
5305
+ elif isinstance(astype, dict):
5002
5306
  for col, dtype in astype.items():
5003
- dtype='date' if dtype=="day" else dtype
5004
- data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
5307
+ dtype = "date" if dtype == "day" else dtype
5308
+ data["col"] = data["col"].adtype(strcmp(dtype, astypes)[0])
5005
5309
  return data if not inplace else None
5006
5310
 
5007
5311
  # Ensure columns is a list
@@ -5112,13 +5416,15 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
5112
5416
  if column not in data.columns:
5113
5417
  raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
5114
5418
 
5115
- if isinstance(by, str) and 'count' in by.lower():
5419
+ if isinstance(by, str) and "count" in by.lower():
5116
5420
  # Count occurrences of each value in the specified column
5117
5421
  value_counts = df[column].value_counts()
5118
5422
 
5119
5423
  # Determine the order based on counts
5120
5424
  count_ascending = kwargs.pop("count_ascending", ascending)
5121
- sorted_counts = value_counts.sort_values(ascending=count_ascending).index.tolist()
5425
+ sorted_counts = value_counts.sort_values(
5426
+ ascending=count_ascending
5427
+ ).index.tolist()
5122
5428
 
5123
5429
  # Convert to a categorical type with the new order
5124
5430
  df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
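For illustration only (not part of the package diff): the branch above orders a column by how often each value occurs, by turning the column into an ordered Categorical. The same idea as a standalone pandas sketch (the sample data is invented):

    # Standalone pandas sketch of sorting rows by value frequency.
    import pandas as pd

    df = pd.DataFrame({"fruit": ["apple", "pear", "apple", "plum", "apple", "pear"]})
    order = df["fruit"].value_counts().sort_values(ascending=False).index.tolist()
    df["fruit"] = pd.Categorical(df["fruit"], categories=order, ordered=True)
    print(df.sort_values("fruit"))  # rows with the most frequent value come first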
@@ -5236,6 +5542,7 @@ def df_merge(
5236
5542
  )
5237
5543
  return df_merged
5238
5544
 
5545
+
5239
5546
  def df_drop_duplicates(
5240
5547
  data: pd.DataFrame,
5241
5548
  by: Union[
@@ -5244,16 +5551,16 @@ def df_drop_duplicates(
5244
5551
  keep="first", # Options: 'first', 'last', or False (drop all duplicates)
5245
5552
  ignore_index=True,
5246
5553
  inplace: bool = False,
5247
- verbose=True
5554
+ verbose=True,
5248
5555
  ):
5249
5556
  """
5250
5557
  data (pd.DataFrame): DataFrame to drop duplicates from.
5251
5558
  by (str): Specify by to drop duplicates:
5252
5559
  - 'index': Drop duplicates based on the DataFrame index.
5253
5560
  - Column name(s) for row-wise duplicate checking.
5254
- keep (str): Which duplicates to keep:
5255
- 'first',
5256
- 'last',
5561
+ keep (str): Which duplicates to keep:
5562
+ 'first',
5563
+ 'last',
5257
5564
  False (drop all duplicates).
5258
5565
  inplace (bool): Whether to modify the original DataFrame in place.
5259
5566
  """
@@ -5263,8 +5570,8 @@ def df_drop_duplicates(
5263
5570
  result = data[~data.index.duplicated(keep=keep)]
5264
5571
  else:
5265
5572
  # Drop duplicates row-wise based on column(s)
5266
- result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
5267
- if original_shape!=result.shape or verbose:
5573
+ result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
5574
+ if original_shape != result.shape or verbose:
5268
5575
  print(f"\nshape:{original_shape} (before drop_duplicates)")
5269
5576
  print(f"shape:{result.shape} (after drop_duplicates)")
5270
5577
  if inplace:
@@ -5274,16 +5581,18 @@ def df_drop_duplicates(
5274
5581
  return None
5275
5582
  else:
5276
5583
  return result
5584
+
5585
+
5277
5586
  #! fillna()
5278
5587
  def df_fillna(
5279
5588
  data: pd.DataFrame,
5280
5589
  method: str = "knn",
5281
- axis: int = 0,# column-wise
5590
+ axis: int = 0, # column-wise
5282
5591
  constant: float = None,
5283
5592
  n_neighbors: int = 5, # KNN-specific
5284
- max_iter: int = 10, # Iterative methods specific
5593
+ max_iter: int = 10, # Iterative methods specific
5285
5594
  inplace: bool = False,
5286
- random_state:int = 1
5595
+ random_state: int = 1,
5287
5596
  ) -> pd.DataFrame:
5288
5597
  """
5289
5598
  Fill missing values in a DataFrame using specified imputation method.
@@ -5299,11 +5608,11 @@ def df_fillna(
5299
5608
  - 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
5300
5609
  - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
5301
5610
  # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
5302
- # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
5611
+ # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
5303
5612
  # reconstructing the data matrix using low-rank approximation
5304
5613
  # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
5305
5614
  # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
5306
-
5615
+
5307
5616
  axis (int): The axis along which to impute:
5308
5617
  - 0: Impute column-wise (default).
5309
5618
  - 1: Impute row-wise.
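For illustration only (not part of the package diff): hypothetical calls for two of the imputation methods described above; see also the commented example further down in this diff (the toy DataFrame is invented):

    # Hypothetical usage sketch -- not part of the package code.
    import numpy as np
    import pandas as pd
    from py2ls.ips import df_fillna  # the module this diff modifies

    df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [np.nan, 5.0, 6.0]})
    filled_knn = df_fillna(df, method="knn", n_neighbors=2)        # KNN-based imputation
    filled_const = df_fillna(df, method="constant", constant=0.0)  # fill with a fixed value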
@@ -5312,7 +5621,7 @@ def df_fillna(
5312
5621
 
5313
5622
  """
5314
5623
  if isinstance(data, pd.Series):
5315
- data=pd.DataFrame(data)
5624
+ data = pd.DataFrame(data)
5316
5625
  # handle None
5317
5626
  for col in data.columns:
5318
5627
  data[col] = data[col].apply(lambda x: np.nan if x is None else x)
@@ -5322,13 +5631,19 @@ def df_fillna(
5322
5631
  # Separate numeric and non-numeric columns
5323
5632
  numeric_data = data.select_dtypes(include=[np.number])
5324
5633
  non_numeric_data = data.select_dtypes(exclude=[np.number])
5325
-
5634
+
5326
5635
  if data.empty:
5327
5636
  raise ValueError("Input DataFrame is empty.")
5328
5637
 
5329
5638
  # Validate method
5330
- methods = ["mean", "median", "most_frequent",
5331
- "constant", "knn", "iterative"]#,"missforest","softimpute","svd"]
5639
+ methods = [
5640
+ "mean",
5641
+ "median",
5642
+ "most_frequent",
5643
+ "constant",
5644
+ "knn",
5645
+ "iterative",
5646
+ ] # ,"missforest","softimpute","svd"]
5332
5647
  method = strcmp(method, methods)[0]
5333
5648
 
5334
5649
  # If using constant method, ask for a constant value
@@ -5342,17 +5657,20 @@ def df_fillna(
5342
5657
  # Initialize SimpleImputer with the chosen method
5343
5658
  if method == "constant":
5344
5659
  from sklearn.impute import SimpleImputer
5660
+
5345
5661
  imputer = SimpleImputer(strategy=method, fill_value=constant)
5346
5662
  elif method == "knn":
5347
5663
  from sklearn.impute import KNNImputer
5664
+
5348
5665
  imputer = KNNImputer(n_neighbors=n_neighbors)
5349
5666
  elif method == "iterative" or method == "mice":
5350
5667
  from sklearn.experimental import enable_iterative_imputer
5351
5668
  from sklearn.impute import IterativeImputer
5352
5669
 
5353
- imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5354
- else: # mean, median, most_frequent
5670
+ imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
5671
+ else: # mean, median, most_frequent
5355
5672
  from sklearn.impute import SimpleImputer
5673
+
5356
5674
  imputer = SimpleImputer(strategy=method)
5357
5675
 
5358
5676
  # Fit and transform the data
@@ -5376,23 +5694,29 @@ def df_fillna(
5376
5694
  # Handle non-numeric data imputation
5377
5695
  if not non_numeric_data.empty:
5378
5696
  from sklearn.impute import SimpleImputer
5697
+
5379
5698
  if method == "constant":
5380
- non_numeric_imputer = SimpleImputer(strategy="constant", fill_value=constant)
5699
+ non_numeric_imputer = SimpleImputer(
5700
+ strategy="constant", fill_value=constant
5701
+ )
5381
5702
  else:
5382
5703
  non_numeric_imputer = SimpleImputer(strategy="most_frequent")
5383
-
5704
+
5384
5705
  # Impute non-numeric columns column-wise (axis=0)
5385
5706
  imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
5386
-
5707
+
5387
5708
  # Convert imputed non-numeric array back to DataFrame with original index and column names
5388
5709
  imputed_non_numeric_df = pd.DataFrame(
5389
- imputed_non_numeric, index=non_numeric_data.index, columns=non_numeric_data.columns
5710
+ imputed_non_numeric,
5711
+ index=non_numeric_data.index,
5712
+ columns=non_numeric_data.columns,
5390
5713
  )
5391
5714
  else:
5392
5715
  imputed_non_numeric_df = pd.DataFrame(index=data.index)
5393
5716
 
5394
-
5395
- imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(columns=data.columns)
5717
+ imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
5718
+ columns=data.columns
5719
+ )
5396
5720
 
5397
5721
  if inplace:
5398
5722
  # Modify the original DataFrame
@@ -5401,6 +5725,8 @@ def df_fillna(
5401
5725
  else:
5402
5726
  # Return the modified DataFrame
5403
5727
  return imputed_data[col_names_org]
5728
+
5729
+
5404
5730
  # # example
5405
5731
  # data = {
5406
5732
  # "A": [1, 2, np.nan, 4, 5],
@@ -5430,14 +5756,15 @@ def df_fillna(
5430
5756
  # display(df)
5431
5757
  # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
5432
5758
 
5759
+
5433
5760
  def df_encoder(
5434
5761
  data: pd.DataFrame,
5435
- method: str = "dummy",#'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
5762
+ method: str = "dummy", #'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
5436
5763
  columns=None,
5437
5764
  target_column=None, # Required for 'target' encoding method
5438
- **kwargs
5765
+ **kwargs,
5439
5766
  ) -> pd.DataFrame:
5440
- """
5767
+ """
5441
5768
  Methods explained:
5442
5769
  - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
5443
5770
 
@@ -5454,18 +5781,20 @@ def df_encoder(
5454
5781
 
5455
5782
  # Select categorical columns
5456
5783
  categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
5457
- methods = ["dummy","onehot", "ordinal", "label", "target", "binary"]
5784
+ methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
5458
5785
  method = strcmp(method, methods)[0]
5459
5786
 
5460
5787
  if columns is None:
5461
5788
  columns = categorical_cols
5462
5789
 
5463
5790
  # pd.get_dummies()
5464
- if method=='dummy':
5465
- dtype=kwargs.pop("dtype",int)
5466
- drop_first=kwargs.pop("drop_first",True)
5791
+ if method == "dummy":
5792
+ dtype = kwargs.pop("dtype", int)
5793
+ drop_first = kwargs.pop("drop_first", True)
5467
5794
  try:
5468
- encoded_df = pd.get_dummies(data[columns], drop_first=drop_first, dtype=dtype, **kwargs)
5795
+ encoded_df = pd.get_dummies(
5796
+ data[columns], drop_first=drop_first, dtype=dtype, **kwargs
5797
+ )
5469
5798
  return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
5470
5799
  except Exception as e:
5471
5800
  # print(f"Warning, 没有进行转换, 因为: {e}")
@@ -5518,8 +5847,9 @@ def df_encoder(
5518
5847
  encoded_data = encoder.fit_transform(data[columns])
5519
5848
  return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
5520
5849
 
5850
+
5521
5851
  def df_scaler(
5522
- data: pd.DataFrame, # should be numeric dtype
5852
+ data: pd.DataFrame, # should be numeric dtype
5523
5853
  method="standard",
5524
5854
  columns=None, # default, select all numeric col/row
5525
5855
  inplace=False,
@@ -5603,6 +5933,8 @@ def df_scaler(
5603
5933
  scaled_df = data.copy()
5604
5934
  scaled_df.loc[numeric_rows.index] = scaled_data
5605
5935
  return scaled_df
5936
+
5937
+
5606
5938
  def df_special_characters_cleaner(
5607
5939
  data: pd.DataFrame, where=["column", "content", "index"]
5608
5940
  ) -> pd.DataFrame:
@@ -5628,6 +5960,8 @@ def df_special_characters_cleaner(
5628
5960
  data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
5629
5961
 
5630
5962
  return data
5963
+
5964
+
5631
5965
  def df_cluster(
5632
5966
  data: pd.DataFrame,
5633
5967
  columns: Optional[list] = None,
@@ -5636,7 +5970,7 @@ def df_cluster(
5636
5970
  scale: bool = True,
5637
5971
  plot: Union[str, list] = "all",
5638
5972
  inplace: bool = True,
5639
- ax = None,
5973
+ ax=None,
5640
5974
  ):
5641
5975
  from sklearn.preprocessing import StandardScaler
5642
5976
  from sklearn.cluster import KMeans
@@ -5952,24 +6286,23 @@ def df_reducer(
5952
6286
  umap_neighbors: int = 15, # UMAP-specific
5953
6287
  umap_min_dist: float = 0.1, # UMAP-specific
5954
6288
  tsne_perplexity: int = 30, # t-SNE-specific
5955
- hue:str = None,# lda-specific
6289
+ hue: str = None, # lda-specific
5956
6290
  scale: bool = True,
5957
6291
  fill_missing: bool = True,
5958
6292
  debug: bool = False,
5959
6293
  inplace: bool = True, # replace the original data
5960
- plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
6294
+ plot_: bool = False, # plot scatterplot, but no 'hue',so it is meaningless
5961
6295
  random_state=1,
5962
- ax = None,
6296
+ ax=None,
5963
6297
  figsize=None,
5964
- **kwargs
5965
- ) -> pd.DataFrame:
6298
+ **kwargs,
6299
+ ) -> pd.DataFrame:
5966
6300
  dict_methods = {
5967
6301
  #!Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
5968
6302
  "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
5969
6303
  "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
5970
6304
  "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
5971
6305
  "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
5972
-
5973
6306
  #! Non-linear Dimensionality Reduction (Manifold Learning)
5974
6307
  "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
5975
6308
  "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global. relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
@@ -5977,28 +6310,40 @@ def df_reducer(
5977
6310
  "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
5978
6311
  "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
5979
6312
  "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing).is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
5980
-
5981
6313
  #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
5982
6314
  "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
5983
6315
  }
5984
6316
 
5985
6317
  from sklearn.preprocessing import StandardScaler
5986
6318
  from sklearn.impute import SimpleImputer
5987
- if plot_:
5988
- import matplotlib.pyplot as plt
6319
+
6320
+ if plot_:
6321
+ import matplotlib.pyplot as plt
5989
6322
  import seaborn as sns
5990
6323
  # Check valid method input
5991
- methods=["pca", "umap","tsne","factor","isolation_forest","lda","kpca","ica","mds","lle","svd"]
5992
- method=strcmp(method, methods)[0]
6324
+ methods = [
6325
+ "pca",
6326
+ "umap",
6327
+ "tsne",
6328
+ "factor",
6329
+ "isolation_forest",
6330
+ "lda",
6331
+ "kpca",
6332
+ "ica",
6333
+ "mds",
6334
+ "lle",
6335
+ "svd",
6336
+ ]
6337
+ method = strcmp(method, methods)[0]
5993
6338
  print(f"\nprocessing with using {dict_methods[method]}:")
5994
- xlabel,ylabel=None,None
6339
+ xlabel, ylabel = None, None
5995
6340
  if columns is None:
5996
- columns = data.select_dtypes(include='number').columns.tolist()
6341
+ columns = data.select_dtypes(include="number").columns.tolist()
5997
6342
  if hue is None:
5998
- hue = data.select_dtypes(exclude='number').columns.tolist()
6343
+ hue = data.select_dtypes(exclude="number").columns.tolist()
5999
6344
  if isinstance(hue, list):
6000
6345
  print("Warning: hue is a list, only select the 1st one")
6001
- hue=hue[0]
6346
+ hue = hue[0]
6002
6347
  if not hue:
6003
6348
  # Select columns if specified, else use all columns
6004
6349
  X = data[columns].values if columns else data.values
@@ -6018,11 +6363,12 @@ def df_reducer(
6018
6363
  X = scaler.fit_transform(X)
6019
6364
 
6020
6365
  # Apply PCA if selected
6021
- if method == "pca":
6366
+ if method == "pca":
6022
6367
  from sklearn.decomposition import PCA
6368
+
6023
6369
  pca = PCA(n_components=n_components)
6024
6370
  X_reduced = pca.fit_transform(X)
6025
-
6371
+
6026
6372
  # Additional PCA information
6027
6373
  explained_variance = pca.explained_variance_ratio_
6028
6374
  singular_values = pca.singular_values_
@@ -6038,56 +6384,72 @@ def df_reducer(
6038
6384
  # Plot explained variance
6039
6385
  cumulative_variance = np.cumsum(explained_variance)
6040
6386
  plt.figure(figsize=(8, 5))
6041
- plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
6387
+ plt.plot(
6388
+ range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o"
6389
+ )
6042
6390
  plt.title("Cumulative Explained Variance by Principal Components")
6043
6391
  plt.xlabel("Number of Principal Components")
6044
6392
  plt.ylabel("Cumulative Explained Variance")
6045
6393
  plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
6046
- plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
6394
+ plt.axvline(
6395
+ x=n_components,
6396
+ color="g",
6397
+ linestyle="--",
6398
+ label=f"n_components = {n_components}",
6399
+ )
6047
6400
  plt.legend()
6048
6401
  plt.grid()
6049
6402
  plt.show()
6050
6403
 
6051
6404
  # Prepare reduced DataFrame with additional PCA info
6052
6405
  pca_df = pd.DataFrame(
6053
- X_reduced, index=data.index,
6054
- columns=[f"PC_{i+1}" for i in range(n_components)]
6055
- )
6406
+ X_reduced,
6407
+ index=data.index,
6408
+ columns=[f"PC_{i+1}" for i in range(n_components)],
6409
+ )
6056
6410
  # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
6057
6411
  # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
6058
6412
  # Expand explained variance to multiple columns if needed
6059
6413
  for i in range(n_components):
6060
- pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
6414
+ pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
6415
+ format(explained_variance[i] * 100, ".3f") + "%", (pca_df.shape[0], 1)
6416
+ )
6061
6417
  for i in range(n_components):
6062
- pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
6418
+ pca_df[f"Singular Values PC_{i+1}"] = np.tile(
6419
+ singular_values[i], (pca_df.shape[0], 1)
6420
+ )
6063
6421
  if hue:
6064
- pca_df[hue]=y
6065
- elif method =='lda':
6422
+ pca_df[hue] = y
6423
+ elif method == "lda":
6066
6424
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
6067
-
6425
+
6068
6426
  if "hue" not in locals() or hue is None:
6069
- raise ValueError("LDA requires a 'hue' col parameter to specify class labels.")
6427
+ raise ValueError(
6428
+ "LDA requires a 'hue' col parameter to specify class labels."
6429
+ )
6070
6430
 
6071
6431
  lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
6072
6432
  X_reduced = lda_reducer.fit_transform(X, y)
6073
-
6433
+
6074
6434
  # Prepare reduced DataFrame with additional LDA info
6075
6435
  lda_df = pd.DataFrame(
6076
- X_reduced, index=data.index,
6077
- columns=[f"LDA_{i+1}" for i in range(n_components)]
6436
+ X_reduced,
6437
+ index=data.index,
6438
+ columns=[f"LDA_{i+1}" for i in range(n_components)],
6078
6439
  )
6079
6440
  if debug:
6080
6441
  print(f"LDA completed: Reduced to {n_components} components.")
6081
6442
  print("Class separability achieved by LDA.")
6082
6443
  if hue:
6083
- lda_df[hue]=y
6444
+ lda_df[hue] = y
6084
6445
  # Apply UMAP if selected
6085
6446
  elif method == "umap":
6086
6447
  import umap
6448
+
6087
6449
  umap_reducer = umap.UMAP(
6088
6450
  n_neighbors=umap_neighbors,
6089
6451
  min_dist=umap_min_dist,
6090
- n_components=n_components
6452
+ n_components=n_components,
6091
6453
  )
6092
6454
  X_reduced = umap_reducer.fit_transform(X)
6093
6455
 
@@ -6102,45 +6464,57 @@ def df_reducer(
6102
6464
 
6103
6465
  # Prepare reduced DataFrame with additional UMAP info
6104
6466
  umap_df = pd.DataFrame(
6105
- X_reduced, index=data.index,
6106
- columns=[f"UMAP_{i+1}" for i in range(n_components)]
6467
+ X_reduced,
6468
+ index=data.index,
6469
+ columns=[f"UMAP_{i+1}" for i in range(n_components)],
6107
6470
  )
6108
6471
  umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
6109
6472
  umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
6110
6473
  if hue:
6111
- umap_df[hue]=y
6474
+ umap_df[hue] = y
6112
6475
  elif method == "tsne":
6113
6476
  from sklearn.manifold import TSNE
6114
- tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=random_state)
6115
- X_reduced = tsne.fit_transform(X)
6477
+
6478
+ tsne = TSNE(
6479
+ n_components=n_components,
6480
+ perplexity=tsne_perplexity,
6481
+ random_state=random_state,
6482
+ )
6483
+ X_reduced = tsne.fit_transform(X)
6116
6484
  tsne_df = pd.DataFrame(
6117
- X_reduced,
6485
+ X_reduced,
6118
6486
  index=data.index,
6119
- columns=[f"tSNE_{i+1}" for i in range(n_components)]
6487
+ columns=[f"tSNE_{i+1}" for i in range(n_components)],
6488
+ )
6489
+ tsne_df["Perplexity"] = np.tile(
6490
+ f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1)
6120
6491
  )
6121
- tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
6122
6492
  if hue:
6123
- tsne_df[hue]=y
6493
+ tsne_df[hue] = y
6124
6494
  # Apply Factor Analysis if selected
6125
6495
  elif method == "factor":
6126
6496
  from sklearn.decomposition import FactorAnalysis
6497
+
6127
6498
  factor = FactorAnalysis(n_components=n_components, random_state=random_state)
6128
6499
  X_reduced = factor.fit_transform(X)
6129
6500
  # Factor Analysis does not directly provide explained variance, but we can approximate it
6130
6501
  fa_variance = factor.noise_variance_
6131
6502
  # Prepare reduced DataFrame with additional Factor Analysis info
6132
6503
  factor_df = pd.DataFrame(
6133
- X_reduced,
6504
+ X_reduced,
6134
6505
  index=data.index,
6135
- columns=[f"Factor_{i+1}" for i in range(n_components)]
6506
+ columns=[f"Factor_{i+1}" for i in range(n_components)],
6507
+ )
6508
+ factor_df["Noise Variance"] = np.tile(
6509
+ format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1)
6136
6510
  )
6137
- factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
6138
6511
  if hue:
6139
- factor_df[hue]=y
6512
+ factor_df[hue] = y
6140
6513
  # Apply Isolation Forest for outlier detection if selected
6141
6514
  elif method == "isolation_forest":
6142
6515
  from sklearn.decomposition import PCA
6143
6516
  from sklearn.ensemble import IsolationForest
6517
+
6144
6518
  # Step 1: Apply PCA for dimensionality reduction to 2 components
6145
6519
  pca = PCA(n_components=n_components)
6146
6520
  X_pca = pca.fit_transform(X)
@@ -6150,87 +6524,108 @@ def df_reducer(
6150
6524
 
6151
6525
  # Prepare reduced DataFrame with additional PCA info
6152
6526
  iso_forest_df = pd.DataFrame(
6153
- X_pca, index=data.index,
6154
- columns=[f"PC_{i+1}" for i in range(n_components)]
6527
+ X_pca, index=data.index, columns=[f"PC_{i+1}" for i in range(n_components)]
6155
6528
  )
6156
6529
 
6157
- isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
6530
+ isolation_forest = IsolationForest(
6531
+ n_estimators=100, contamination="auto", random_state=1
6532
+ )
6158
6533
  isolation_forest.fit(X)
6159
- anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
6534
+ anomaly_scores = isolation_forest.decision_function(
6535
+ X
6536
+ ) # Anomaly score: larger is less anomalous
6160
6537
  # Predict labels: 1 (normal), -1 (anomaly)
6161
- anomaly_labels = isolation_forest.fit_predict(X)
6538
+ anomaly_labels = isolation_forest.fit_predict(X)
6162
6539
  # Add anomaly scores and labels to the DataFrame
6163
6540
  iso_forest_df["Anomaly Score"] = anomaly_scores
6164
6541
  iso_forest_df["Anomaly Label"] = anomaly_labels
6165
6542
  # add info from pca
6166
6543
  for i in range(n_components):
6167
- iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
6544
+ iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
6545
+ format(explained_variance[i] * 100, ".3f") + "%",
6546
+ (iso_forest_df.shape[0], 1),
6547
+ )
6168
6548
  for i in range(n_components):
6169
- iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
6549
+ iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
6550
+ singular_values[i], (iso_forest_df.shape[0], 1)
6551
+ )
6170
6552
  if hue:
6171
- iso_forest_df[hue]=y
6172
- #* Apply Kernel PCA if selected
6553
+ iso_forest_df[hue] = y
6554
+ # * Apply Kernel PCA if selected
6173
6555
  elif method == "kpca":
6174
6556
  from sklearn.decomposition import KernelPCA
6175
- kpca = KernelPCA(n_components=n_components, kernel="rbf", random_state=random_state)
6557
+
6558
+ kpca = KernelPCA(
6559
+ n_components=n_components, kernel="rbf", random_state=random_state
6560
+ )
6176
6561
  X_reduced = kpca.fit_transform(X)
6177
6562
 
6178
6563
  # Prepare reduced DataFrame with KPCA info
6179
6564
  kpca_df = pd.DataFrame(
6180
- X_reduced,
6565
+ X_reduced,
6181
6566
  index=data.index,
6182
- columns=[f"KPCA_{i+1}" for i in range(n_components)]
6567
+ columns=[f"KPCA_{i+1}" for i in range(n_components)],
6183
6568
  )
6184
6569
  if debug:
6185
6570
  print("Kernel PCA completed with RBF kernel.")
6186
6571
  if hue:
6187
- kpca_df[hue]=y
6188
- #* Apply ICA if selected
6572
+ kpca_df[hue] = y
6573
+ # * Apply ICA if selected
6189
6574
  elif method == "ica":
6190
6575
  from sklearn.decomposition import FastICA
6576
+
6191
6577
  ica = FastICA(n_components=n_components, random_state=random_state)
6192
6578
  X_reduced = ica.fit_transform(X)
6193
6579
 
6194
6580
  # Prepare reduced DataFrame with ICA info
6195
6581
  ica_df = pd.DataFrame(
6196
- X_reduced, index=data.index,
6197
- columns=[f"ICA_{i+1}" for i in range(n_components)]
6582
+ X_reduced,
6583
+ index=data.index,
6584
+ columns=[f"ICA_{i+1}" for i in range(n_components)],
6198
6585
  )
6199
6586
  if debug:
6200
6587
  print("Independent Component Analysis (ICA) completed.")
6201
6588
  if hue:
6202
- ica_df[hue]=y
6203
- #* Apply MDS if selected
6589
+ ica_df[hue] = y
6590
+ # * Apply MDS if selected
6204
6591
  elif method == "mds":
6205
6592
  from sklearn.manifold import MDS
6593
+
6206
6594
  mds = MDS(n_components=n_components, random_state=random_state)
6207
6595
  X_reduced = mds.fit_transform(X)
6208
6596
 
6209
6597
  # Prepare reduced DataFrame with MDS info
6210
6598
  mds_df = pd.DataFrame(
6211
- X_reduced, index=data.index,
6212
- columns=[f"MDS_{i+1}" for i in range(n_components)]
6599
+ X_reduced,
6600
+ index=data.index,
6601
+ columns=[f"MDS_{i+1}" for i in range(n_components)],
6213
6602
  )
6214
6603
  if debug:
6215
6604
  print("Multidimensional Scaling (MDS) completed.")
6216
6605
  if hue:
6217
- mds_df[hue]=y
6218
- #* Apply Locally Linear Embedding (LLE) if selected
6606
+ mds_df[hue] = y
6607
+ # * Apply Locally Linear Embedding (LLE) if selected
6219
6608
  elif method == "lle":
6220
6609
  from sklearn.manifold import LocallyLinearEmbedding
6221
- lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=umap_neighbors, random_state=random_state)
6610
+
6611
+ lle = LocallyLinearEmbedding(
6612
+ n_components=n_components,
6613
+ n_neighbors=umap_neighbors,
6614
+ random_state=random_state,
6615
+ )
6222
6616
  X_reduced = lle.fit_transform(X)
6223
6617
 
6224
6618
  # Prepare reduced DataFrame with LLE info
6225
6619
  lle_df = pd.DataFrame(
6226
- X_reduced, index=data.index,
6227
- columns=[f"LLE_{i+1}" for i in range(n_components)]
6620
+ X_reduced,
6621
+ index=data.index,
6622
+ columns=[f"LLE_{i+1}" for i in range(n_components)],
6228
6623
  )
6229
6624
  if debug:
6230
6625
  print("Locally Linear Embedding (LLE) completed.")
6231
6626
  if hue:
6232
- lle_df[hue]=y
6233
- #* Apply Singular Value Decomposition (SVD) if selected
6627
+ lle_df[hue] = y
6628
+ # * Apply Singular Value Decomposition (SVD) if selected
6234
6629
  elif method == "svd":
6235
6630
  # Using NumPy's SVD for dimensionality reduction
6236
6631
  U, s, Vt = np.linalg.svd(X, full_matrices=False)
@@ -6238,11 +6633,12 @@ def df_reducer(
6238
6633
 
6239
6634
  # Prepare reduced DataFrame with SVD info
6240
6635
  svd_df = pd.DataFrame(
6241
- X_reduced, index=data.index,
6242
- columns=[f"SVD_{i+1}" for i in range(n_components)]
6636
+ X_reduced,
6637
+ index=data.index,
6638
+ columns=[f"SVD_{i+1}" for i in range(n_components)],
6243
6639
  )
6244
6640
  if hue:
6245
- svd_df[hue]=y
6641
+ svd_df[hue] = y
6246
6642
  if debug:
6247
6643
  print("Singular Value Decomposition (SVD) completed.")
6248
6644
 
@@ -6250,17 +6646,17 @@ def df_reducer(
6250
6646
  if method == "pca":
6251
6647
  reduced_df = pca_df
6252
6648
  colname_met = "PC_"
6253
- xlabel= f"PC_1 ({pca_df["Explained Variance PC_1"].tolist()[0]})"
6254
- ylabel= f"PC_2 ({pca_df["Explained Variance PC_2"].tolist()[0]})"
6649
+ xlabel = f"PC_1 ({pca_df["Explained Variance PC_1"].tolist()[0]})"
6650
+ ylabel = f"PC_2 ({pca_df["Explained Variance PC_2"].tolist()[0]})"
6255
6651
  elif method == "umap":
6256
6652
  reduced_df = umap_df
6257
- colname_met = "UMAP_"
6653
+ colname_met = "UMAP_"
6258
6654
  elif method == "tsne":
6259
6655
  reduced_df = tsne_df
6260
- colname_met = "tSNE_"
6656
+ colname_met = "tSNE_"
6261
6657
  elif method == "factor":
6262
6658
  reduced_df = factor_df
6263
- colname_met = "Factor_"
6659
+ colname_met = "Factor_"
6264
6660
  elif method == "isolation_forest":
6265
6661
  reduced_df = iso_forest_df # Already a DataFrame for outliers
6266
6662
  colname_met = "PC_"
@@ -6269,7 +6665,8 @@ def df_reducer(
6269
6665
  data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
6270
6666
  x="PC_1",
6271
6667
  y="PC_2",
6272
- label="normal", c="b",
6668
+ label="normal",
6669
+ c="b",
6273
6670
  )
6274
6671
  ax = sns.scatterplot(
6275
6672
  ax=ax,
@@ -6277,73 +6674,86 @@ def df_reducer(
6277
6674
  x="PC_1",
6278
6675
  y="PC_2",
6279
6676
  c="r",
6280
- label="outlier", marker="+", s=30,
6677
+ label="outlier",
6678
+ marker="+",
6679
+ s=30,
6281
6680
  )
6282
- elif method=='lda':
6283
- reduced_df=lda_df
6284
- colname_met="LDA_"
6285
- elif method=="kpca":
6286
- reduced_df=kpca_df
6287
- colname_met="KPCA_"
6288
- elif method=="ica":
6289
- reduced_df=ica_df
6290
- colname_met="ICA_"
6291
- elif method=="mds":
6292
- reduced_df=mds_df
6293
- colname_met="MDS_"
6294
- elif method=="lle":
6295
- reduced_df=lle_df
6296
- colname_met="LLE_"
6297
- elif method=="svd":
6298
- reduced_df=svd_df
6299
- colname_met="SVD_"
6681
+ elif method == "lda":
6682
+ reduced_df = lda_df
6683
+ colname_met = "LDA_"
6684
+ elif method == "kpca":
6685
+ reduced_df = kpca_df
6686
+ colname_met = "KPCA_"
6687
+ elif method == "ica":
6688
+ reduced_df = ica_df
6689
+ colname_met = "ICA_"
6690
+ elif method == "mds":
6691
+ reduced_df = mds_df
6692
+ colname_met = "MDS_"
6693
+ elif method == "lle":
6694
+ reduced_df = lle_df
6695
+ colname_met = "LLE_"
6696
+ elif method == "svd":
6697
+ reduced_df = svd_df
6698
+ colname_met = "SVD_"
6300
6699
  # Quick plots
6301
6700
  if plot_ and (not method in ["isolation_forest"]):
6302
6701
  from .plot import plotxy
6702
+
6303
6703
  if ax is None:
6304
6704
  if figsize is None:
6305
- _, ax = plt.subplots(figsize=cm2inch(8,8))
6705
+ _, ax = plt.subplots(figsize=cm2inch(8, 8))
6306
6706
  else:
6307
6707
  _, ax = plt.subplots(figsize=figsize)
6308
6708
  else:
6309
- ax=ax.cla()
6310
- ax=plotxy(data=reduced_df,
6311
- x=colname_met+"1",
6312
- y=colname_met+"2",
6313
- hue=hue,
6314
- s=1,
6315
- edgecolor='none',
6316
- kind='scater',
6317
- figsets=dict(legend=dict(loc='best',markerscale=4),
6318
- xlabel=xlabel if xlabel else None,
6319
- ylabel=ylabel if ylabel else None),
6320
- ax=ax,
6321
- verbose=False,
6322
- **kwargs
6323
- )
6709
+ ax = ax.cla()
6710
+ ax = plotxy(
6711
+ data=reduced_df,
6712
+ x=colname_met + "1",
6713
+ y=colname_met + "2",
6714
+ hue=hue,
6715
+ s=1,
6716
+ edgecolor="none",
6717
+ kind="scater",
6718
+ figsets=dict(
6719
+ legend=dict(loc="best", markerscale=4),
6720
+ xlabel=xlabel if xlabel else None,
6721
+ ylabel=ylabel if ylabel else None,
6722
+ ),
6723
+ ax=ax,
6724
+ verbose=False,
6725
+ **kwargs,
6726
+ )
6324
6727
 
6325
6728
  if inplace:
6326
6729
  # If inplace=True, add components back into the original data
6327
6730
  for col_idx in range(n_components):
6328
- data.loc[:,f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
6731
+ data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
6329
6732
  # Add extra info for PCA/UMAP
6330
6733
  if method == "pca":
6331
6734
  for i in range(n_components):
6332
- data.loc[:,f"Explained Variance PC_{i+1}"] = reduced_df.loc[:,f"Explained Variance PC_{i+1}"]
6735
+ data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[
6736
+ :, f"Explained Variance PC_{i+1}"
6737
+ ]
6333
6738
  for i in range(n_components):
6334
- data.loc[:,f"Singular Values PC_{i+1}"] = reduced_df.loc[:,f"Singular Values PC_{i+1}"]
6335
- elif method == "umap":
6739
+ data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[
6740
+ :, f"Singular Values PC_{i+1}"
6741
+ ]
6742
+ elif method == "umap":
6336
6743
  for i in range(n_components):
6337
- data.loc[:,f"UMAP_{i+1}"]=reduced_df.loc[:,f"UMAP_{i+1}"]
6338
- data.loc[:,"Embedding"] = reduced_df.loc[:,"Embedding"]
6339
- data.loc[:,"Trustworthiness"] = reduced_df.loc[:,"Trustworthiness"]
6340
-
6744
+ data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
6745
+ data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
6746
+ data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
6747
+
6341
6748
  return None # No return when inplace=True
6342
6749
 
6343
- return reduced_df
6750
+ return reduced_df
6751
+
6752
+
6344
6753
  # example:
6345
6754
  # df_reducer(data=data_log, columns=markers, n_components=2)
6346
6755
 
6756
+
6347
6757
  def plot_cluster(
6348
6758
  data: pd.DataFrame,
6349
6759
  labels: np.ndarray,
@@ -6368,6 +6778,7 @@ def plot_cluster(
6368
6778
  import seaborn as sns
6369
6779
  from sklearn.metrics import silhouette_samples
6370
6780
  import matplotlib.pyplot as plt
6781
+
6371
6782
  if metrics is None:
6372
6783
  metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
6373
6784
 
@@ -6597,10 +7008,10 @@ def use_pd(
6597
7008
  verbose=True,
6598
7009
  dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
6599
7010
  ):
6600
- default_settings = fload(dir_json, output='json')
7011
+ default_settings = fload(dir_json, output="json")
6601
7012
  valid_kinds = list(default_settings.keys())
6602
7013
  kind = strcmp(func_name, valid_kinds)[0]
6603
- usage=default_settings[kind]
7014
+ usage = default_settings[kind]
6604
7015
  if verbose:
6605
7016
  for i, i_ in enumerate(ssplit(usage, by=",")):
6606
7017
  i_ = i_.replace("=", "\t= ") + ","