py2ls 0.2.4.8__py3-none-any.whl → 0.2.4.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/ips.py
CHANGED
@@ -1,8 +1,9 @@
 import numpy as np
-import pandas as pd
+import pandas as pd
 import sys, os
-from IPython.display import display
+from IPython.display import display
 from typing import List, Optional, Union
+
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -10,11 +11,14 @@ except NameError:
     pass

 import warnings
+
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

-
+
+def run_once_within(duration=60): # default 60s
     import time
+
     """
     usage:
     if run_once_within():
@@ -26,7 +30,9 @@ def run_once_within(duration=60): # default 60s
     run_once_within.time_last = None
     time_curr = time.time()

-    if (run_once_within.time_last is None) or (
+    if (run_once_within.time_last is None) or (
+        time_curr - run_once_within.time_last >= duration
+    ):
         run_once_within.time_last = time_curr  # Update the last execution time
         return True
     else:
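For context, a minimal usage sketch of this throttle helper (import path assumed to be py2ls.ips, values hypothetical); only the first call inside each window triggers the work, exactly as the reformatted condition above implements:

from py2ls.ips import run_once_within  # assumed import path

def refresh_cache():
    print("refreshing...")

# only the first call inside any 60-second window runs refresh_cache()
for _ in range(3):
    if run_once_within(duration=60):
        refresh_cache()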
@@ -42,13 +48,14 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
-
+
+    slashtype = "/" if "mac" in get_os() else "\\"
     if slashtype in dir_font:
         font_manager.fontManager.addfont(dir_font)
         fontname = os.path.basename(dir_font).split(".")[0]
     else:
         if "cn" in dir_font.lower() or "ch" in dir_font.lower():
-            fontname = "Hiragino Sans GB"
+            fontname = "Hiragino Sans GB"  # default Chinese font
         else:
             fontname = dir_font

@@ -62,6 +69,7 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
         plt.rcParams["font.sans-serif"] = ["Arial"]
     return fontname

+
 # set 'dir_save'
 if "dar" in sys.platform:
     dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
@@ -133,6 +141,7 @@ def run_every(when: str = None, job=None, wait: int = 60):
     """
     import schedule
     import time
+
     if job is None:
         print("No job provided!")
         return
@@ -180,6 +189,7 @@ def run_at(when: str, job=None, wait: int = 60):
     """
     from datetime import datetime
     import time
+
     if job is None:
         print("No job provided!")
         return
@@ -260,11 +270,12 @@ def get_timezone(timezone: str | list = None):
 def is_package_installed(package_name):
     """Check if a package is installed."""
     import importlib.util
+
     package_spec = importlib.util.find_spec(package_name)
     return package_spec is not None


-def upgrade(module="py2ls",uninstall=False):
+def upgrade(module="py2ls", uninstall=False):
     """
     Installs or upgrades a specified Python module.

@@ -273,6 +284,7 @@ def upgrade(module="py2ls",uninstall=False):
         uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
     """
     import subprocess
+
     if not is_package_installed(module):
         try:
             subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -310,6 +322,7 @@ def get_version(pkg):

 def rm_folder(folder_path, verbose=True):
     import shutil
+
     try:
         shutil.rmtree(folder_path)
         if verbose:
@@ -329,6 +342,7 @@ def fremove(path, verbose=True):
     try:
         if os.path.isdir(path):
             import shutil
+
             shutil.rmtree(path)
             if verbose:
                 print(f"Successfully deleted folder {path}")
@@ -364,11 +378,13 @@ def fremove(path, verbose=True):

 def get_cwd():
     from pathlib import Path
+
     # Get the current script's directory as a Path object
-    current_directory = Path(__file__).resolve().parent
-
+    current_directory = Path(__file__).resolve().parent
+
     return current_directory

+
 def search(
     query,
     limit=5,
@@ -380,6 +396,7 @@ def search(
     **kwargs,
 ):
     from duckduckgo_search import DDGS
+
     if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
         res = pd.DataFrame(results)
@@ -413,6 +430,7 @@ def echo(*args, **kwargs):
     """
     global dir_save
     from duckduckgo_search import DDGS
+
     query = None
     model = kwargs.get("model", "gpt")
     verbose = kwargs.get("verbose", True)
@@ -461,10 +479,12 @@ def echo(*args, **kwargs):
     res = DDGS().chat(query, model=model_valid)
     if verbose:
         from pprint import pp
+
         pp(res)
     if log:
         from datetime import datetime
         import time
+
         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
         if bool(os.path.basename(dir_save)):
@@ -487,6 +507,7 @@ def ai(*args, **kwargs):

 def detect_lang(text, output="lang", verbose=True):
     from langdetect import detect
+
     dir_curr_script = os.path.dirname(os.path.abspath(__file__))
     dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -516,13 +537,14 @@ def is_text(s):

 from typing import Any, Union

+
 def shared(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
         list1 = [1, 2, 3, 4, 5]
         list2 = [4, 5, 6, 7, 8]
-        list3 = [5, 6, 9, 10]
+        list3 = [5, 6, 9, 10]
         a = shared(list1, list2,list3)
     """
     if verbose:
@@ -538,26 +560,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         print(f"{' ' * 2}All inputs must be lists.")
         return []
     first_list = flattened_lists[0]
-    shared_elements = [
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
     if strict:
-
-
-
-
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
     else:
         from collections import Counter
+
         all_elements = [item for sublist in flattened_lists for item in sublist]
         element_count = Counter(all_elements)
         # Get elements that appear in at least n_shared lists
-        shared_elements = [
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]

     shared_elements = flatten(shared_elements, verbose=verbose)
     if verbose:
-        elements2show =
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
     print("********* checking shared elements *********")
     return shared_elements

+
 def not_shared(*args, strict=True, n_shared=2, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
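A short sketch of how the reworked shared() behaves, using the same hypothetical lists as the docstring above (strict=True intersects all inputs; strict=False keeps items seen in at least n_shared lists):

from py2ls.ips import shared  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

common_all = shared(list1, list2, list3)                          # strict: only 5 survives
common_two = shared(list1, list2, list3, strict=False, n_shared=2)  # items in >= 2 lists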
@@ -568,7 +598,7 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
     """
     _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
-    _not_shared=[item for item in list1 if item not in _common]
+    _not_shared = [item for item in list1 if item not in _common]
     return _not_shared

@@ -578,29 +608,41 @@ def flatten(nested: Any, unique_list=True, verbose=False):
     Parameters:
         nested : Any, Can be a list, tuple, dictionary, or set.
     Returns: list, A flattened list.
-    """
+    """
     flattened_list = []
     stack = [nested]
     while stack:
         current = stack.pop()
         if isinstance(current, dict):
-            stack.extend(current.values())
+            stack.extend(current.values())
         elif isinstance(current, (list, tuple, set)):
             stack.extend(current)
         elif isinstance(current, pd.Series):
             stack.extend(current)
-        elif isinstance(
+        elif isinstance(
+            current, (pd.Index, np.ndarray)
+        ):  # df.columns df.index are object of type pd.Index
             stack.extend(current.tolist())
         else:
             flattened_list.append(current)
     if verbose:
-        print(
+        print(
+            f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>"
+        )
     if unique_list:
         return unique(flattened_list)[::-1]
     else:
         return flattened_list
-
-
+
+
+def strcmp(
+    search_term,
+    candidates,
+    ignore_case=True,
+    get_rank=False,
+    verbose=False,
+    scorer="WR",
+):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.

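A sketch of the flatten() behaviour that the new pd.Index / np.ndarray branch is meant to cover (data hypothetical; with unique_list=True the result is also de-duplicated):

import numpy as np
import pandas as pd
from py2ls.ips import flatten  # assumed import path

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
mixed = [df.columns, np.array([1, 2, 2]), {"k": [5, 6]}]
# nested containers, Series, Index and arrays all collapse into one flat list
print(flatten(mixed))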
@@ -614,13 +656,14 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
         tuple: A tuple containing the best match and its index in the candidates list.
     """
     from fuzzywuzzy import fuzz, process
+
     def to_lower(s, ignore_case=True):
         # Converts a string or list of strings to lowercase if ignore_case is True.
         if ignore_case:
             if isinstance(s, str):
                 return s.lower()
             elif isinstance(s, list):
-                s=[str(i) for i in s]# convert all to str
+                s = [str(i) for i in s]  # convert all to str
                 return [elem.lower() for elem in s]
         return s

@@ -630,12 +673,15 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower() or "stri" in scorer.lower()
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():  # Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
     if get_rank:
-        idx = [
+        idx = [
+            similarity_scores.index(i)
+            for i in sorted(similarity_scores, reverse=True)
+        ]
         if verbose:
             display([candidates[ii] for ii in idx])
         return [candidates[ii] for ii in idx]
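For reference, a sketch of the fuzzy scoring that strcmp wraps here (the default scorer "WR" maps to fuzz.WRatio; candidate strings are hypothetical):

from fuzzywuzzy import fuzz

candidates = ["PLoS Computational Biology", "PLOS BIOLOGY"]
scores = [fuzz.WRatio("plos biology", c.lower()) for c in candidates]
best = candidates[scores.index(max(scores))]
print(best, max(scores))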
@@ -663,6 +709,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     # str2 = ['PLoS Computational Biology', 'PLOS BIOLOGY']
     # best_match, idx = strcmp(str1, str2, ignore_case=1)

+
 def cn2pinyin(
     cn_str: Union[str, list] = None,
     sep: str = " ",
@@ -727,19 +774,21 @@ def cn2pinyin(
         style = Style.PL
     else:
         style = Style.NORMAL
-    if not isinstance(cn_str,list):
-        cn_str=[cn_str]
-    pinyin_flat=[]
+    if not isinstance(cn_str, list):
+        cn_str = [cn_str]
+    pinyin_flat = []
     for cn_str_ in cn_str:
         pinyin_string = pinyin(cn_str_, style=style)
         pinyin_flat.append(sep.join([item[0] for item in pinyin_string]))
-    if len(pinyin_flat)==1:
+    if len(pinyin_flat) == 1:
         return pinyin_flat[0]
     else:
         return pinyin_flat

+
 def counter(list_, verbose=True):
     from collections import Counter
+
     c = Counter(list_)
     # Print the name counts
     for item, count in c.items():
@@ -769,6 +818,7 @@ def str2time(time_str, fmt="24"):
     - str: The converted time string.
     """
     from datetime import datetime
+
     def time_len_corr(time_str):
         time_str_ = (
             ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -830,6 +880,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
     - str: The converted date string.
     """
     from dateutil import parser
+
     try:
         date_obj = parser.parse(date_str)
     except ValueError as e:
@@ -847,6 +898,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):

 def str2num(s, *args, **kwargs):
     import re
+
     delimiter = kwargs.get("sep", None)
     round_digits = kwargs.get("round", None)
     if delimiter is not None:
@@ -863,6 +915,7 @@ def str2num(s, *args, **kwargs):
         num = float(s)
     except ValueError:
         from numerizer import numerize
+
         try:
             numerized = numerize(s)
             num = int(numerized) if "." not in numerized else float(numerized)
@@ -1067,13 +1120,12 @@ def inch2px(*inch, dpi=300) -> list:
     # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
     if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
         return [i * dpi for i in inch[0]]
-
+
     # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
     else:
         return [i * dpi for i in inch]


-
 def cm2inch(*inch) -> list:
     """
     Usage:
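The conversion itself is just inches × dpi; a quick sketch of both calling styles handled above (values hypothetical):

def inch2px_demo(*inch, dpi=300):
    # same two cases as the function in the diff: one sequence argument, or several scalars
    if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
        return [i * dpi for i in inch[0]]
    return [i * dpi for i in inch]

print(inch2px_demo([8.27, 11.69]))  # A4 at 300 dpi -> [2481.0, 3507.0]
print(inch2px_demo(1, 2))           # -> [300, 600]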
@@ -1191,6 +1243,7 @@ def paper_size(paper_type_str="a4"):

 def docx2pdf(dir_docx, dir_pdf=None):
     from docx2pdf import convert
+
     if dir_pdf:
         convert(dir_docx, dir_pdf)
     else:
@@ -1199,6 +1252,7 @@ def docx2pdf(dir_docx, dir_pdf=None):

 def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
     import img2pdf as image2pdf
+
     def mm_to_point(size):
         return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))

@@ -1253,6 +1307,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
     from PyPDF2 import PdfReader
     from pptx.util import Inches
     from pptx import Presentation
+
     prs = Presentation()

     # Open the PDF file
@@ -1282,6 +1337,7 @@ def pdf2ppt(dir_pdf, dir_ppt):

 def ssplit(text, by="space", verbose=False, strict=False, **kws):
     import re
+
     if isinstance(text, list):
         nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
         flat_list = [item for sublist in nested_list for item in sublist]
@@ -1331,6 +1387,7 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
     def split_by_sent_num(text, n=10):
         from nltk.tokenize import sent_tokenize
         from itertools import pairwise
+
         # split text into sentences
         text_split_by_sent = sent_tokenize(text)
         cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1404,11 +1461,13 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
         return split_by_camel_case(text)
     elif ("word" in by) and not strict:
         from nltk.tokenize import word_tokenize
+
         if verbose:
             print(f"splited by word")
         return word_tokenize(text)
     elif ("sen" in by and not "num" in by) and not strict:
         from nltk.tokenize import sent_tokenize
+
         if verbose:
             print(f"splited by sentence")
         return sent_tokenize(text)
@@ -1459,10 +1518,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):

 def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
     from pdf2image import convert_from_path, pdfinfo_from_path
+
     df_dir_img_single_page = pd.DataFrame()
     dir_single_page = []
     if verbose:
         from pprint import pp
+
         pp(pdfinfo_from_path(dir_pdf))
     if isinstance(page, tuple) and page:
         page = list(page)
@@ -1582,6 +1643,7 @@ def unzip(dir_path, output_dir=None):
     if os.path.exists(output_dir):
         if os.path.isdir(output_dir):  # check if it is a folder
             import shutil
+
             shutil.rmtree(output_dir)  # remove folder
         else:
             os.remove(output_dir)  # remove file
@@ -1600,6 +1662,7 @@ def unzip(dir_path, output_dir=None):
         output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
         try:
             import shutil
+
             with gzip.open(dir_path, "rb") as gz_file:
                 with open(output_file, "wb") as out_file:
                     shutil.copyfileobj(gz_file, out_file)
@@ -1607,11 +1670,14 @@ def unzip(dir_path, output_dir=None):
         except FileNotFoundError:
             print(f"Error: The file '{dir_path}' was not found.")
         except PermissionError:
-            print(
+            print(
+                f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'."
+            )
         except Exception as e:
             try:
                 import tarfile
-
+
+                with tarfile.open(dir_path, "r:gz") as tar:
                     tar.extractall(path=output_file)
             except Exception as final_e:
                 print(f"An final unexpected error occurred: {final_e}")
@@ -1698,9 +1764,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     """
     if not isinstance(df, pd.DataFrame):
         if verbose:
-            print(
+            print("not pd.DataFrame")
         return False
-    df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
+    df.columns = df.columns.astype(str)  # 把它变成str, 这样就可以进行counts运算了
     # Initialize a list to hold messages about abnormalities
     messages = []
     is_abnormal = False
@@ -1729,28 +1795,28 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         if verbose:
             print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
     if verbose:
-        print("1",is_abnormal)
+        print("1", is_abnormal)
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
     if verbose:
-        print("2",is_abnormal)
+        print("2", is_abnormal)
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
         if verbose:
             print(f'delimiter_counts[""] > 3')
     if verbose:
-        print("3",is_abnormal)
+        print("3", is_abnormal)
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
     if verbose:
-        print("4",is_abnormal)
+        print("4", is_abnormal)
     # # Check for missing values
     # missing_values = df.isnull().sum()
     # if missing_values.any():
@@ -1769,9 +1835,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
         if verbose:
-            print(f
+            print(f"df.columns[df.nunique() == 1].tolist()")
     if verbose:
-        print("5",is_abnormal)
+        print("5", is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1779,9 +1845,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         )
         is_abnormal = True
         if verbose:
-            print(f
+            print(f"actual_shape[0] < 2 or actual_shape[1] < 2")
     if verbose:
-        print("6",is_abnormal)
+        print("6", is_abnormal)
     # Compile results
     if verbose:
         print("\n".join(messages))
@@ -1798,8 +1864,10 @@ def fload(fpath, kind=None, **kwargs):
     Returns:
         content: The content loaded from the file.
     """
+
     def read_mplstyle(style_file):
         import matplotlib.pyplot as plt
+
         # Load the style file
         plt.style.use(style_file)

@@ -1812,10 +1880,11 @@ def fload(fpath, kind=None, **kwargs):
         for i, j in style_dict.items():
             print(f"\n{i}::::{j}")
         return style_dict
+
     # #example usage:
     # style_file = "/ std-colors.mplstyle"
     # style_dict = read_mplstyle(style_file)
-
+
     def load_txt_md(fpath):
         with open(fpath, "r") as file:
             content = file.read()
@@ -1825,28 +1894,30 @@ def fload(fpath, kind=None, **kwargs):
     # with open(fpath, "r") as file:
     #     content = file.read()
     #     return content
-    def load_html(fpath
-        return pd.read_html(fpath
+    def load_html(fpath, **kwargs):
+        return pd.read_html(fpath, **kwargs)

     def load_json(fpath, **kwargs):
-        output=kwargs.pop("output","json")
-        if output==
+        output = kwargs.pop("output", "json")
+        if output == "json":
             import json
+
             with open(fpath, "r") as file:
                 content = json.load(file)
             return content
         else:
-            return pd.read_json(fpath
+            return pd.read_json(fpath, **kwargs)

     def load_yaml(fpath):
         import yaml
+
         with open(fpath, "r") as file:
             content = yaml.safe_load(file)
         return content

-
     def load_xml(fpath, fsize_thr: int = 100):
         from lxml import etree
+
         def load_small_xml(fpath):
             tree = etree.parse(fpath)
             root = tree.getroot()
@@ -1905,7 +1976,7 @@ def fload(fpath, kind=None, **kwargs):
             if line.startswith(char):
                 return char
         return None
-
+
     def _get_chunks(df_fake):
         """
         helper func for 'load_csv'
@@ -1926,20 +1997,22 @@ def fload(fpath, kind=None, **kwargs):
         encoding = kwargs.pop("encoding", "utf-8")
         on_bad_lines = kwargs.pop("on_bad_lines", "skip")
         comment = kwargs.pop("comment", None)
-        fmt=kwargs.pop("fmt",False)
-        chunksize=kwargs.pop("chunksize", None)
-        engine=
-        low_memory=kwargs.pop("low_memory",True)
-        low_memory=
-
+        fmt = kwargs.pop("fmt", False)
+        chunksize = kwargs.pop("chunksize", None)
+        engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
+        low_memory = kwargs.pop("low_memory", True)
+        low_memory = (
+            False if chunksize else True
+        )  # when chunksize, recommend low_memory=False
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv", verbose=verbose)
-
+
         if comment is None:
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
+
         try:
             df = pd.read_csv(
                 fpath,
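The chunked branch above leans on pandas' own chunking; a minimal sketch of the pattern (engine "c" plus low_memory=False when a chunksize is given, roughly what a _get_chunks helper would do; the file path is hypothetical):

import pandas as pd

reader = pd.read_csv("data.csv", chunksize=100_000, engine="c", low_memory=False)
df = pd.concat(reader, ignore_index=True)  # stitch the chunks back into one DataFrame
print(df.shape)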
@@ -1956,9 +2029,9 @@ def fload(fpath, kind=None, **kwargs):
                 **kwargs,
             )
             if chunksize:
-                df=_get_chunks(df)
+                df = _get_chunks(df)
                 print(df.shape)
-            if is_df_abnormal(df, verbose=0):
+            if is_df_abnormal(df, verbose=0):  # raise error
                 raise ValueError("the df is abnormal")
         except:
             try:
@@ -1991,7 +2064,7 @@ def fload(fpath, kind=None, **kwargs):
                     **kwargs,
                 )
                 if chunksize:
-                    df=_get_chunks(df)
+                    df = _get_chunks(df)
                     print(df.shape)
                 if is_df_abnormal(df, verbose=0):
                     raise ValueError("the df is abnormal")
@@ -2026,7 +2099,7 @@ def fload(fpath, kind=None, **kwargs):
                         **kwargs,
                     )
                     if chunksize:
-                        df=_get_chunks(df)
+                        df = _get_chunks(df)
                         print(df.shape)
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
@@ -2049,7 +2122,7 @@ def fload(fpath, kind=None, **kwargs):
                         **kwargs,
                     )
                     if chunksize:
-                        df=_get_chunks(df)
+                        df = _get_chunks(df)
                         print(df.shape)
                     if not is_df_abnormal(df, verbose=0):  # normal
                         display(df.head(2))
@@ -2059,7 +2132,7 @@ def fload(fpath, kind=None, **kwargs):
                     pass
             else:
                 if not chunksize:
-                    engines = [None,"c", "python"]
+                    engines = [None, "c", "python"]
                     for engine in engines:
                         separators = [",", "\t", ";", "|", " "]
                         for sep in separators:
@@ -2080,11 +2153,19 @@ def fload(fpath, kind=None, **kwargs):
                                 # display(df.head(2))
                                 # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
                                 if chunksize:
-                                    df=_get_chunks(df)
+                                    df = _get_chunks(df)
                                     print(df.shape)
                                 if not is_df_abnormal(df, verbose=0):
-
-
+                                    (
+                                        display(df.head(2))
+                                        if isinstance(df, pd.DataFrame)
+                                        else display("it is not a DataFrame")
+                                    )
+                                    (
+                                        print(f"shape: {df.shape}")
+                                        if isinstance(df, pd.DataFrame)
+                                        else display("it is not a DataFrame")
+                                    )
                                     return df
                             except EmptyDataError as e:
                                 continue
@@ -2096,19 +2177,18 @@ def fload(fpath, kind=None, **kwargs):

     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
-        verbose=kwargs.pop("verbose",False)
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
-            meata=pd.ExcelFile(fpath)
+            meata = pd.ExcelFile(fpath)
             print(f"n_sheet={len(meata.sheet_names)},\t'sheetname = 0 (default)':")
-            [print(f"{i}:\t{i_}") for i,i_ in enumerate(meata.sheet_names)]
+            [print(f"{i}:\t{i_}") for i, i_ in enumerate(meata.sheet_names)]
         except:
             pass
         return df

-
     def load_parquet(fpath, **kwargs):
         """
         Load a Parquet file into a Pandas DataFrame with advanced options.
@@ -2124,16 +2204,16 @@ def fload(fpath, kind=None, **kwargs):
         Returns:
         - df (DataFrame): The loaded DataFrame.
         """
-
+
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)
-
+
         if run_once_within():
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
             if verbose:
-                if
+                if "columns" in kwargs:
                     print(f"Loaded columns: {kwargs['columns']}")
                 else:
                     print("Loaded all columns.")
@@ -2142,11 +2222,12 @@ def fload(fpath, kind=None, **kwargs):
             print(f"An error occurred while loading the Parquet file: {e}")
             df = None

-        return df
+        return df

     def load_ipynb(fpath, **kwargs):
         import nbformat
         from nbconvert import MarkdownExporter
+
         as_version = kwargs.get("as_version", 4)
         with open(fpath, "r") as file:
             nb = nbformat.read(file, as_version=as_version)
@@ -2177,6 +2258,7 @@ def fload(fpath, kind=None, **kwargs):
         If the specified page is not found, it returns the string "Page is not found".
         """
         from PyPDF2 import PdfReader
+
         text_dict = {}
         with open(fpath, "rb") as file:
             pdf_reader = PdfReader(file)
@@ -2207,6 +2289,7 @@ def fload(fpath, kind=None, **kwargs):

     def load_docx(fpath):
         from docx import Document
+
         doc = Document(fpath)
         content = [para.text for para in doc.paragraphs]
         return content
@@ -2216,21 +2299,55 @@ def fload(fpath, kind=None, **kwargs):
         kind = kind.lower()
     kind = kind.lstrip(".").lower()
     img_types = [
-        "bmp",
-        "
+        "bmp",
+        "eps",
+        "gif",
+        "png",
+        "jpg",
+        "jpeg",
+        "jpeg2000",
+        "tiff",
+        "tif",
+        "icns",
+        "ico",
+        "im",
+        "msp",
+        "pcx",
+        "ppm",
+        "sgi",
+        "spider",
+        "tga",
+        "webp",
     ]
     doc_types = [
-        "docx",
-        "
-        "
-        "
+        "docx",
+        "pdf",
+        "txt",
+        "csv",
+        "xlsx",
+        "tsv",
+        "parquet",
+        "snappy",
+        "md",
+        "html",
+        "json",
+        "yaml",
+        "xml",
         "ipynb",
-        "mtx"
+        "mtx",
     ]
     zip_types = [
-        "gz",
-        "
-        "
+        "gz",
+        "zip",
+        "7z",
+        "rar",
+        "tgz",
+        "tar",
+        "tar.gz",
+        "tar.bz2",
+        "bz2",
+        "xz",
+        "gzip",
     ]
     other_types = ["fcs"]
     supported_types = [*doc_types, *img_types, *zip_types, *other_types]
@@ -2266,17 +2383,17 @@ def fload(fpath, kind=None, **kwargs):
         return load_yaml(fpath)
     elif kind == "xml":
         return load_xml(fpath)
-    elif kind in ["csv","tsv"]:
-        verbose=kwargs.pop(
+    elif kind in ["csv", "tsv"]:
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
-    elif kind==
-        verbose=kwargs.pop(
+    elif kind == "pkl":
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_pickle")
-        return pd.read_pickle(fpath
+        return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
         engine = kwargs.get("engine", "odf")
         kwargs.pop("engine", None)
@@ -2286,38 +2403,39 @@ def fload(fpath, kind=None, **kwargs):
         kwargs.pop("engine", None)
         content = load_excel(fpath, engine=engine, **kwargs)
         print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
-        display(content.head(3))
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "xlsx":
         content = load_excel(fpath, **kwargs)
         display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
         return content
-    elif kind==
+    elif kind == "mtx":
         from scipy.io import mmread
-
-
+
+        dat_mtx = mmread(fpath)
+        content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
         display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         print(f"shape: {content.shape}")
         return content
     elif kind == "ipynb":
         return load_ipynb(fpath, **kwargs)
-    elif kind in [
-        verbose=kwargs.pop(
+    elif kind in ["parquet", "snappy"]:
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_parquet")
-        return load_parquet(fpath
-    elif kind ==
-        verbose=kwargs.pop(
+        return load_parquet(fpath, **kwargs)
+    elif kind == "feather":
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_feather")
-        content=pd.read_feather(fpath
+        content = pd.read_feather(fpath, **kwargs)
         return content
-    elif kind ==
-        content=pd.read_hdf(fpath
+    elif kind == "h5":
+        content = pd.read_hdf(fpath, **kwargs)
         return content
-    elif kind ==
-        content=pd.read_pickle(fpath
+    elif kind == "pkl":
+        content = pd.read_pickle(fpath, **kwargs)
         return content
     elif kind == "pdf":
         # print('usage:load_pdf(fpath, page="all", verbose=False)')
@@ -2325,11 +2443,13 @@ def fload(fpath, kind=None, **kwargs):
     elif kind.lower() in img_types:
         print(f'Image ".{kind}" is loaded.')
         return load_img(fpath)
-    elif kind=="gz" and fpath.endswith(".soft.gz"):
+    elif kind == "gz" and fpath.endswith(".soft.gz"):
         import GEOparse
+
         return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
         from pprint import pp
+
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
         if os.path.isdir(fpath_unzip):
@@ -2364,7 +2484,7 @@ def fload(fpath, kind=None, **kwargs):
         meta, data = fcsparser.parse(fpath, reformat_meta=True)
         return meta, data

-    elif kind=="mplstyle":
+    elif kind == "mplstyle":
         return read_mplstyle(fpath)

     else:
@@ -2408,7 +2528,7 @@ def fupdate(fpath, content=None, how="head"):
     """
     Update a file by adding new content at the top and moving the old content to the bottom.
    If the file is a JSON file, merge the new content with the old content.
-
+
     Parameters
     ----------
     fpath : str
@@ -2416,7 +2536,7 @@ def fupdate(fpath, content=None, how="head"):
     content : str or dict, optional
         The new content to add at the top of the file (for text) or merge (for JSON).
         If not provided, the function will not add any new content.
-
+
     Notes
     -----
     - If the file at `fpath` does not exist, it will be created.
@@ -2425,14 +2545,20 @@ def fupdate(fpath, content=None, how="head"):
     """
     content = content or ""
     file_ext = os.path.splitext(fpath)[1]
-    how_s=["head", "tail","start","end","beginning", "stop",
+    how_s = ["head", "tail", "start", "end", "beginning", "stop", "last", "before"]
     how = strcmp(how, how_s)[0]
     print(how)
-    add_where =
+    add_where = "head" if how in ["head", "start", "beginning", "before"] else "tail"
     if "json" in file_ext.lower():
-        old_content=fload(fpath,kind=
-        updated_content =
-
+        old_content = fload(fpath, kind="json") if os.path.exists(fpath) else {}
+        updated_content = (
+            {**content, **old_content}
+            if add_where == "head"
+            else (
+                {**old_content, **content} if isinstance(content, dict) else old_content
+            )
+        )
+        fsave(fpath, updated_content)
     else:
         # Handle text file
         if os.path.exists(fpath):
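The JSON branch above merges dictionaries, and dict-unpacking order decides which side wins on duplicate keys; a sketch with hypothetical content:

old_content = {"a": 1, "b": 2}
content = {"b": 99, "c": 3}

head_merge = {**content, **old_content}  # add_where == "head": existing values win on clashes
tail_merge = {**old_content, **content}  # add_where == "tail": new values win on clashes
print(head_merge, tail_merge)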
@@ -2443,7 +2569,7 @@ def fupdate(fpath, content=None, how="head"):

         # Write new content at the top followed by old content
         with open(fpath, "w") as file:
-            if add_where=="head":
+            if add_where == "head":
                 file.write(content + "\n")
                 file.write(old_content)
             else:
@@ -2478,7 +2604,9 @@ def filter_kwargs(kws, valid_kwargs):
     }
     return kwargs_filtered

-
+
+str_space_speed = 'sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
+

 def fsave(
     fpath,
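The str_space_speed note added above records a rough size/speed comparison between formats; a sketch for reproducing such a comparison on your own DataFrame (file names hypothetical; assumes pyarrow is installed for parquet and feather):

import os
import pandas as pd

df = pd.DataFrame({"x": range(1_000_000)})
df.to_parquet("demo.parquet")
df.to_feather("demo.feather")
df.to_pickle("demo.pkl")
for p in ["demo.parquet", "demo.feather", "demo.pkl"]:
    print(p, os.path.getsize(p), "bytes")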
@@ -2515,6 +2643,7 @@ def fsave(

     def save_docx(fpath, content, font_name, font_size, spacing):
         import docx
+
         if isinstance(content, str):
             content = content.split(". ")
         doc = docx.Document()
@@ -2543,6 +2672,7 @@ def fsave(

     def save_pdf(fpath, content, font_name, font_size):
         from fpdf import FPDF
+
         pdf = FPDF()
         pdf.add_page()
         # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2555,7 +2685,7 @@ def fsave(
     def save_csv(fpath, data, **kwargs):
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

-        verbose=kwargs.pop("verbose",False)
+        verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
@@ -2586,7 +2716,7 @@ def fsave(
         df.to_csv(fpath, **kwargs_valid)

     def save_xlsx(fpath, data, **kwargs):
-        verbose=kwargs.pop("verbose",False)
+        verbose = kwargs.pop("verbose", False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
         if run_once_within():
             use_pd("to_excel", verbose=verbose)
@@ -2595,9 +2725,21 @@ def fsave(
         else:
             # Remove non-relevant kwargs
             irrelevant_keys = [
-
-
-
+                "format",
+                "usage",
+                "cell",
+                "width",
+                "height",
+                "height_max",
+                "merge",
+                "shade",
+                "comment",
+                "link",
+                "protect",
+                "number_format",
+                "conditional_format",
+                "index_default",
+            ]
             for key in irrelevant_keys:
                 kwargs.pop(key, None)

@@ -2605,19 +2747,21 @@ def fsave(
             # Check if the file exists, then append the sheet, otherwise create a new file
             try:
                 # Use ExcelWriter with append mode if the file exists
-                with pd.ExcelWriter(
+                with pd.ExcelWriter(
+                    fpath, engine="openpyxl", mode="a", if_sheet_exists="new"
+                ) as writer:
                     df.to_excel(writer, sheet_name=sheet_name, index=False, **kwargs)
             except FileNotFoundError:
                 # If file doesn't exist, create a new one
                 df.to_excel(fpath, sheet_name=sheet_name, index=False, **kwargs)

-
     def save_ipynb(fpath, data, **kwargs):
         # Split the content by code fences to distinguish between code and markdown
         import nbformat
+
         parts = data.split("```")
         cells = []
-
+
         for i, part in enumerate(parts):
             if i % 2 == 0:
                 # Even index: markdown content
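A sketch of the append-then-fallback pattern used above for Excel output (openpyxl engine; file name hypothetical; pandas >= 1.3 supports if_sheet_exists):

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
try:
    # append a new sheet to an existing workbook
    with pd.ExcelWriter("report.xlsx", engine="openpyxl", mode="a", if_sheet_exists="new") as writer:
        df.to_excel(writer, sheet_name="Sheet1", index=False)
except FileNotFoundError:
    # workbook does not exist yet: create it
    df.to_excel("report.xlsx", sheet_name="Sheet1", index=False)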
@@ -2638,17 +2782,18 @@ def fsave(

     def save_json(fpath_fname, var_dict_or_df):
         import json
+
         def _convert_js(data):
             if isinstance(data, pd.DataFrame):
-                return data.to_dict(orient="list")
+                return data.to_dict(orient="list")
             elif isinstance(data, np.ndarray):
                 return data.tolist()
             elif isinstance(data, dict):
                 return {key: _convert_js(value) for key, value in data.items()}
-            return data
+            return data

         serializable_data = _convert_js(var_dict_or_df)
-
+
         # Save the serializable data to the JSON file
         with open(fpath_fname, "w") as f_json:
             json.dump(serializable_data, f_json, indent=4)
@@ -2660,11 +2805,13 @@ def fsave(

     def save_yaml(fpath, data, **kwargs):
         import yaml
+
         with open(fpath, "w") as file:
             yaml.dump(data, file, **kwargs)

     def save_xml(fpath, data):
         from lxml import etree
+
         root = etree.Element("root")
         if isinstance(data, dict):
             for key, val in data.items():
@@ -2675,24 +2822,37 @@ def fsave(
         tree = etree.ElementTree(root)
         tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")

-    def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
-        engine = kwargs.pop(
-
+    def save_parquet(fpath: str, data: pd.DataFrame, **kwargs):
+        engine = kwargs.pop(
+            "engine", "auto"
+        )  # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
+        compression = kwargs.pop(
+            "compression", None
+        )  # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
         try:
             # Attempt to save with "pyarrow" if engine is set to "auto"
-
-
+            data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
+            print(
+                f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression."
+            )
         except Exception as e:
-            print(
+            print(
+                f"Error using with engine '{engine}' and {compression} compression: {e}"
+            )
             if "Sparse" in str(e):
                 try:
                     # Handle sparse data by converting columns to dense
                     print("Attempting to convert sparse columns to dense format...")
-                    data = data.apply(
-
+                    data = data.apply(
+                        lambda x: (
+                            x.sparse.to_dense() if pd.api.types.is_sparse(x) else x
+                        )
+                    )
+                    save_parquet(fpath, data=data, **kwargs)
                 except Exception as last_e:
-                    print(
-
+                    print(
+                        f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}"
+                    )

     if kind is None:
         _, kind = os.path.splitext(fpath)
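The sparse fallback above densifies columns before retrying; a sketch of that conversion (file name hypothetical, assumes pyarrow; note pd.api.types.is_sparse is deprecated in newer pandas, so a dtype check is shown as an alternative):

import pandas as pd

df = pd.DataFrame({"s": pd.arrays.SparseArray([0, 0, 1]), "x": [1, 2, 3]})
dense = df.apply(
    lambda col: col.sparse.to_dense()          # densify sparse columns only
    if isinstance(col.dtype, pd.SparseDtype)
    else col
)
dense.to_parquet("demo.parquet", engine="auto", compression=None)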
@@ -2739,92 +2899,95 @@ def fsave(
         save_yaml(fpath, content, **kwargs)
     elif kind == "ipynb":
         save_ipynb(fpath, content, **kwargs)
-    elif kind.lower() in ["parquet","pq","big","par"]:
-        verbose=kwargs.pop(
+    elif kind.lower() in ["parquet", "pq", "big", "par"]:
+        verbose = kwargs.pop("verbose", False)
         if verbose:
             print(str_space_speed)
             use_pd("to_parquet")
             return None
-        compression=kwargs.pop(
+        compression = kwargs.pop(
+            "compression", None
+        )  # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
         # fix the fpath ends
         _fpath, _ext = os.path.splitext(fpath)
-        fpath = _fpath+_ext.replace(kind,
+        fpath = _fpath + _ext.replace(kind, "parquet")
         if compression is not None:
             if not fpath.endswith(compression):
-                fpath=fpath+f".{compression}"
-        save_parquet(fpath=fpath, data=content,compression=compression
-    elif kind.lower() in ["pkl","pk","pickle","pick"]:
-        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
-        # Pickle is convenient if you want to preserve exact Python object types.
-        verbose=kwargs.pop(
+                fpath = fpath + f".{compression}"
+        save_parquet(fpath=fpath, data=content, compression=compression, **kwargs)
+    elif kind.lower() in ["pkl", "pk", "pickle", "pick"]:
+        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+        # Pickle is convenient if you want to preserve exact Python object types.
+        verbose = kwargs.pop("verbose", False)
         if verbose:
             print(str_space_speed)
             use_pd("to_pickle")
             return None
         _fpath, _ext = os.path.splitext(fpath)
-        fpath = _fpath+_ext.replace(kind,
-        compression=kwargs.pop("compression",None)
+        fpath = _fpath + _ext.replace(kind, "pkl")
+        compression = kwargs.pop("compression", None)
         if compression is not None:
             if not fpath.endswith(compression["method"]):
-                fpath=fpath+f".{compression[
+                fpath = fpath + f".{compression['method']}"
         if isinstance(content, pd.DataFrame):
-            content.to_pickle(fpath
+            content.to_pickle(fpath, **kwargs)
         else:
             try:
                 print("trying to convert it as a DataFrame...")
-                content=pd.DataFrame(content)
-                content.to_pickle(fpath
+                content = pd.DataFrame(content)
+                content.to_pickle(fpath, **kwargs)
             except Exception as e:
                 raise ValueError(
-
-
-    elif kind.lower() in ["fea",
-        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
-        # optimized for data analytics tasks and is especially fast when working with Pandas.
-
-        verbose=kwargs.pop(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
+        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+        # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+        verbose = kwargs.pop("verbose", False)
         if verbose:
             print(str_space_speed)
             use_pd("to_feather")
             return None
         _fpath, _ext = os.path.splitext(fpath)
-        fpath = _fpath+_ext.replace(kind,
+        fpath = _fpath + _ext.replace(kind, "feather")
         if isinstance(content, pd.DataFrame):
-            content.to_feather(fpath
+            content.to_feather(fpath, **kwargs)
         else:
             try:
                 print("trying to convert it as a DataFrame...")
-                content=pd.DataFrame(content)
+                content = pd.DataFrame(content)
                 content.to_feather(fpath, **kwargs)
             except Exception as e:
                 raise ValueError(
-
-
-    elif kind.lower() in ["hd",
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["hd", "hdf", "h", "h5"]:
         # particularly useful for large datasets and can handle complex data structures
-        verbose=kwargs.pop(
+        verbose = kwargs.pop("verbose", False)
         if verbose:
             print(str_space_speed)
             use_pd("to_hdf")
         _fpath, _ext = os.path.splitext(fpath)
-        fpath = _fpath+_ext.replace(kind,
-        compression=kwargs.pop("compression",None)
+        fpath = _fpath + _ext.replace(kind, "h5")
+        compression = kwargs.pop("compression", None)
         if compression is not None:
             if not fpath.endswith(compression):
-                fpath=fpath+f".{compression}"
+                fpath = fpath + f".{compression}"
         if isinstance(content, pd.DataFrame):
-            content.to_hdf(fpath,key=
+            content.to_hdf(fpath, key="content", **kwargs)
         else:
             try:
                 print("trying to convert it as a DataFrame...")
-                content=pd.DataFrame(content)
-                content.to_hdf(fpath
+                content = pd.DataFrame(content)
+                content.to_hdf(fpath, **kwargs)
             except Exception as e:
                 raise ValueError(
-
-
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
     else:
         from . import netfinder
+
         try:
             netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         except:
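Taken together, the branches above dispatch on the requested extension; a hedged sketch of calling the fsave entry point (import path and file names assumed, df is a hypothetical DataFrame; routing follows the elif chain shown in this hunk):

import pandas as pd
from py2ls.ips import fsave  # assumed import path

df = pd.DataFrame({"a": [1, 2, 3]})
fsave("results.csv", df)                              # routed to save_csv
fsave("results.parquet", df, compression="gzip")      # routed to save_parquet
fsave("results.pkl", df)                              # routed to the pickle branch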
@@ -2948,6 +3111,7 @@ def isa(content, kind):
         return is_str_color(content)
     elif "html" in kind.lower():
         import re
+
         if content is None or not isinstance(content, str):
             return False
         # Remove leading and trailing whitespace
@@ -2997,8 +3161,8 @@ def listdir(
     verbose=True,
 ):
     if kind is None:
-        ls=os.listdir(rootdir)
-        ls = [f for f in ls if not f.startswith(
+        ls = os.listdir(rootdir)
+        ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
         print(ls)
         df_all = pd.DataFrame(
             {
@@ -3029,7 +3193,7 @@ def listdir(

     if os.path.isdir(rootdir):
         ls = os.listdir(rootdir)
-        ls = [f for f in ls if not f.startswith(
+        ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
         fd = [".fd", ".fld", ".fol", ".fd", ".folder"]
         i = 0
         f = {
@@ -3108,6 +3272,7 @@ def listdir(
         return f
     else:
         from box import Box
+
         if "l" in orient.lower():  # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -3151,7 +3316,7 @@ def mkdir_nest(fpath: str) -> str:
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
     if os.path.isdir(fpath):
-        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
+        fpath = fpath + f_slash if not fpath.endswith(f_slash) else fpath
         return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator

@@ -3181,27 +3346,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """

-    rootdir = []
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str):  # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
+
     if os.path.isdir(pardir):
         os.chdir(pardir)  # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-            for folder in chdir:
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
@@ -3221,7 +3386,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
-        rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
+        rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir

     return rootdir
@@ -3250,7 +3416,7 @@ def figsave(*args, dpi=300):
|
|
3250
3416
|
img = arg # Store the PIL image if provided
|
3251
3417
|
|
3252
3418
|
if dir_save is None:
|
3253
|
-
dir_save="./"
|
3419
|
+
dir_save = "./"
|
3254
3420
|
|
3255
3421
|
# dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
|
3256
3422
|
dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
|
@@ -3343,8 +3509,9 @@ def figsave(*args, dpi=300):
|
|
3343
3509
|
|
3344
3510
|
def is_str_color(s):
|
3345
3511
|
# Regular expression pattern for hexadecimal color codes
|
3346
|
-
if isinstance(s,str):
|
3512
|
+
if isinstance(s, str):
|
3347
3513
|
import re
|
3514
|
+
|
3348
3515
|
color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
|
3349
3516
|
return re.match(color_code_pattern, s) is not None
|
3350
3517
|
else:
|
@@ -3372,6 +3539,7 @@ def isnum(s):
|
|
3372
3539
|
|
3373
3540
|
def is_image(fpath):
|
3374
3541
|
import mimetypes
|
3542
|
+
|
3375
3543
|
mime_type, _ = mimetypes.guess_type(fpath)
|
3376
3544
|
if mime_type and mime_type.startswith("image"):
|
3377
3545
|
return True
|
@@ -3381,6 +3549,7 @@ def is_image(fpath):
|
|
3381
3549
|
|
3382
3550
|
def is_document(fpath):
|
3383
3551
|
import mimetypes
|
3552
|
+
|
3384
3553
|
mime_type, _ = mimetypes.guess_type(fpath)
|
3385
3554
|
if mime_type and (
|
3386
3555
|
mime_type.startswith("text/")
|
@@ -3402,6 +3571,7 @@ def is_document(fpath):
|
|
3402
3571
|
|
3403
3572
|
def is_zip(fpath):
|
3404
3573
|
import mimetypes
|
3574
|
+
|
3405
3575
|
mime_type, _ = mimetypes.guess_type(fpath)
|
3406
3576
|
if mime_type == "application/zip":
|
3407
3577
|
return True
|
@@ -3411,6 +3581,7 @@ def is_zip(fpath):
|
|
3411
3581
|
|
3412
3582
|
def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
|
3413
3583
|
import matplotlib.pyplot as plt
|
3584
|
+
|
3414
3585
|
if ax is None:
|
3415
3586
|
ax = plt.gca()
|
3416
3587
|
for loc, spine in ax.spines.items():
|
@@ -3500,6 +3671,7 @@ def apply_filter(img, *args):
|
|
3500
3671
|
PIL.Image: The filtered image.
|
3501
3672
|
"""
|
3502
3673
|
from PIL import ImageFilter
|
3674
|
+
|
3503
3675
|
def correct_filter_name(filter_name):
|
3504
3676
|
if "bl" in filter_name.lower() and "box" not in filter_name.lower():
|
3505
3677
|
return "BLUR"
|
@@ -3742,7 +3914,8 @@ def imgsets(img, **kwargs):
|
|
3742
3914
|
return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
|
3743
3915
|
|
3744
3916
|
import matplotlib.pyplot as plt
|
3745
|
-
from PIL import ImageEnhance,ImageOps
|
3917
|
+
from PIL import ImageEnhance, ImageOps
|
3918
|
+
|
3746
3919
|
# Load image if input is a file path
|
3747
3920
|
if isinstance(img, str):
|
3748
3921
|
img = load_img(img)
|
@@ -3807,6 +3980,7 @@ def imgsets(img, **kwargs):
|
|
3807
3980
|
img_update = ImageOps.pad(img_update, size=value)
|
3808
3981
|
elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
|
3809
3982
|
from rembg import remove, new_session
|
3983
|
+
|
3810
3984
|
if isinstance(value, bool):
|
3811
3985
|
session = new_session("isnet-general-use")
|
3812
3986
|
img_update = remove(img_update, session=session)
|
@@ -3846,6 +4020,7 @@ def imgsets(img, **kwargs):
|
|
3846
4020
|
img_update = remove(img_update)
|
3847
4021
|
elif "bg" in k.lower() and "color" in k.lower():
|
3848
4022
|
from rembg import remove
|
4023
|
+
|
3849
4024
|
if isinstance(value, list):
|
3850
4025
|
value = tuple(value)
|
3851
4026
|
if isinstance(value, tuple): # replace the background color
|
@@ -3879,6 +4054,7 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
|
|
3879
4054
|
"""
|
3880
4055
|
import matplotlib.pyplot as plt
|
3881
4056
|
from PIL import Image
|
4057
|
+
|
3882
4058
|
num_images = len(dir_img_list)
|
3883
4059
|
if not kind.startswith("."):
|
3884
4060
|
kind = "." + kind
|
@@ -3917,12 +4093,11 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
|
|
3917
4093
|
# thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
|
3918
4094
|
|
3919
4095
|
|
3920
|
-
|
3921
4096
|
# search and fine the director of the libary, which installed at local
|
3922
4097
|
def dir_lib(lib_oi):
|
3923
4098
|
"""
|
3924
4099
|
# example usage:
|
3925
|
-
# dir_lib("seaborn")
|
4100
|
+
# dir_lib("seaborn")
|
3926
4101
|
"""
|
3927
4102
|
import site
|
3928
4103
|
|
@@ -3941,6 +4116,7 @@ def dir_lib(lib_oi):
|
|
3941
4116
|
print(f"Cannot find the {lib_oi} in site-packages directory.")
|
3942
4117
|
return dir_list
|
3943
4118
|
|
4119
|
+
|
3944
4120
|
class FileInfo:
|
3945
4121
|
def __init__(
|
3946
4122
|
self,
|
@@ -4018,6 +4194,7 @@ class FileInfo:
|
|
4018
4194
|
|
4019
4195
|
def finfo(fpath):
|
4020
4196
|
import time
|
4197
|
+
|
4021
4198
|
fname, fmt = os.path.splitext(fpath)
|
4022
4199
|
dir_par = os.path.dirname(fpath) + "/"
|
4023
4200
|
data = {
|
@@ -4033,6 +4210,7 @@ def finfo(fpath):
|
|
4033
4210
|
extra_info = {}
|
4034
4211
|
if data["kind"] == ".pdf":
|
4035
4212
|
from pdf2image import pdfinfo_from_path
|
4213
|
+
|
4036
4214
|
extra_info = pdfinfo_from_path(fpath)
|
4037
4215
|
|
4038
4216
|
return FileInfo(
|
@@ -4047,6 +4225,7 @@ def finfo(fpath):
|
|
4047
4225
|
extra_info=extra_info,
|
4048
4226
|
)
|
4049
4227
|
|
4228
|
+
|
4050
4229
|
# ! format excel file
|
4051
4230
|
def hex2argb(hex_color):
|
4052
4231
|
"""
|
@@ -4078,7 +4257,10 @@ def hex2argb(hex_color):
|
|
4078
4257
|
return hex_color[-9:]
|
4079
4258
|
else:
|
4080
4259
|
return "F" * (9 - len(hex_color)) + hex_color
|
4081
|
-
raise ValueError(
|
4260
|
+
raise ValueError(
|
4261
|
+
"Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
|
4262
|
+
)
|
4263
|
+
|
4082
4264
|
|
4083
4265
|
def format_excel(
|
4084
4266
|
df=None,
|
@@ -4137,7 +4319,15 @@ def format_excel(
|
|
4137
4319
|
font_bold = False
|
4138
4320
|
font_strike = False
|
4139
4321
|
font_italic = False
|
4140
|
-
kws_font = [
|
4322
|
+
kws_font = [
|
4323
|
+
"name",
|
4324
|
+
"size",
|
4325
|
+
"bold",
|
4326
|
+
"underline",
|
4327
|
+
"color",
|
4328
|
+
"strike",
|
4329
|
+
"italic",
|
4330
|
+
]
|
4141
4331
|
for k_, v_ in cell.get(K, {}).items():
|
4142
4332
|
if strcmp(k_, kws_font)[0] == "name":
|
4143
4333
|
font_name = v_
|
@@ -4167,9 +4357,31 @@ def format_excel(
|
|
4167
4357
|
if strcmp(K, kws_cell)[0] == "fill":
|
4168
4358
|
#! fill
|
4169
4359
|
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
4170
|
-
kws_fill_type = [
|
4171
|
-
|
4172
|
-
|
4360
|
+
kws_fill_type = [
|
4361
|
+
"darkVertical",
|
4362
|
+
"lightDown",
|
4363
|
+
"lightGrid",
|
4364
|
+
"solid",
|
4365
|
+
"darkDown",
|
4366
|
+
"lightGray",
|
4367
|
+
"lightUp",
|
4368
|
+
"gray0625",
|
4369
|
+
"lightVertical",
|
4370
|
+
"lightHorizontal",
|
4371
|
+
"darkHorizontal",
|
4372
|
+
"gray125",
|
4373
|
+
"darkUp",
|
4374
|
+
"mediumGray",
|
4375
|
+
"darkTrellis",
|
4376
|
+
"darkGray",
|
4377
|
+
"lightTrellis",
|
4378
|
+
"darkGrid",
|
4379
|
+
]
|
4380
|
+
start_color, end_color, fill_type = (
|
4381
|
+
"FFFFFF",
|
4382
|
+
"FFFFFF",
|
4383
|
+
"solid",
|
4384
|
+
) # default
|
4173
4385
|
for k, v in cell.get(K, {}).items():
|
4174
4386
|
if strcmp(k, kws_fill)[0] == "color":
|
4175
4387
|
start_color, end_color = hex2argb(v), hex2argb(v)
|
@@ -4241,27 +4453,78 @@ def format_excel(
|
|
4241
4453
|
|
4242
4454
|
if strcmp(K, kws_cell)[0] == "border":
|
4243
4455
|
#! border
|
4244
|
-
kws_border = [
|
4245
|
-
"
|
4246
|
-
"
|
4247
|
-
"
|
4248
|
-
"
|
4456
|
+
kws_border = [
|
4457
|
+
"color_left",
|
4458
|
+
"color_l",
|
4459
|
+
"color_right",
|
4460
|
+
"color_r",
|
4461
|
+
"color_top",
|
4462
|
+
"color_t",
|
4463
|
+
"color_bottom",
|
4464
|
+
"color_b",
|
4465
|
+
"color_diagonal",
|
4466
|
+
"color_d",
|
4467
|
+
"color_outline",
|
4468
|
+
"color_o",
|
4469
|
+
"color_vertical",
|
4470
|
+
"color_v",
|
4471
|
+
"color_horizontal",
|
4472
|
+
"color_h",
|
4473
|
+
"color",
|
4474
|
+
"style_left",
|
4475
|
+
"style_l",
|
4476
|
+
"style_right",
|
4477
|
+
"style_r",
|
4478
|
+
"style_top",
|
4479
|
+
"style_t",
|
4480
|
+
"style_bottom",
|
4481
|
+
"style_b",
|
4482
|
+
"style_diagonal",
|
4483
|
+
"style_d",
|
4484
|
+
"style_outline",
|
4485
|
+
"style_o",
|
4486
|
+
"style_vertical",
|
4487
|
+
"style_v",
|
4488
|
+
"style_horizontal",
|
4489
|
+
"style_h",
|
4490
|
+
"style",
|
4491
|
+
]
|
4249
4492
|
# * border color
|
4250
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4251
|
-
|
4493
|
+
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4494
|
+
"FF000000",
|
4495
|
+
"FF000000",
|
4496
|
+
"FF000000",
|
4497
|
+
"FF000000",
|
4498
|
+
)
|
4499
|
+
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4500
|
+
"FF000000",
|
4501
|
+
"FF000000",
|
4502
|
+
"FF000000",
|
4503
|
+
"FF000000",
|
4504
|
+
)
|
4252
4505
|
# get colors config
|
4253
4506
|
for k, v in cell.get(K, {}).items():
|
4254
4507
|
if strcmp(k, kws_border)[0] in ["color"]:
|
4255
4508
|
border_color_all = hex2argb(v)
|
4256
4509
|
# 如果设置了color,表示其它的所有的都设置成为一样的
|
4257
4510
|
# 然后再才开始自己定义其它的color
|
4258
|
-
|
4511
|
+
(
|
4512
|
+
border_color_l,
|
4513
|
+
border_color_r,
|
4514
|
+
border_color_t,
|
4515
|
+
border_color_b,
|
4516
|
+
) = (
|
4259
4517
|
border_color_all,
|
4260
4518
|
border_color_all,
|
4261
4519
|
border_color_all,
|
4262
4520
|
border_color_all,
|
4263
4521
|
)
|
4264
|
-
|
4522
|
+
(
|
4523
|
+
border_color_d,
|
4524
|
+
border_color_o,
|
4525
|
+
border_color_v,
|
4526
|
+
border_color_h,
|
4527
|
+
) = (
|
4265
4528
|
border_color_all,
|
4266
4529
|
border_color_all,
|
4267
4530
|
border_color_all,
|
@@ -4284,10 +4547,31 @@ def format_excel(
|
|
4284
4547
|
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4285
4548
|
border_color_h = hex2argb(v)
|
4286
4549
|
# *border style
|
4287
|
-
border_styles = [
|
4288
|
-
"
|
4289
|
-
|
4290
|
-
|
4550
|
+
border_styles = [
|
4551
|
+
"thin",
|
4552
|
+
"medium",
|
4553
|
+
"thick",
|
4554
|
+
"dotted",
|
4555
|
+
"dashed",
|
4556
|
+
"hair",
|
4557
|
+
"mediumDashed",
|
4558
|
+
"dashDot",
|
4559
|
+
"dashDotDot",
|
4560
|
+
"slantDashDot",
|
4561
|
+
"none",
|
4562
|
+
]
|
4563
|
+
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4564
|
+
None,
|
4565
|
+
None,
|
4566
|
+
None,
|
4567
|
+
None,
|
4568
|
+
)
|
4569
|
+
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4570
|
+
None,
|
4571
|
+
None,
|
4572
|
+
None,
|
4573
|
+
None,
|
4574
|
+
)
|
4291
4575
|
# get styles config
|
4292
4576
|
for k, v in cell.get(K, {}).items():
|
4293
4577
|
# if not "style" in k:
|
@@ -4296,13 +4580,23 @@ def format_excel(
|
|
4296
4580
|
border_style_all = strcmp(v, border_styles)[0]
|
4297
4581
|
# 如果设置了style,表示其它的所有的都设置成为一样的
|
4298
4582
|
# 然后再才开始自己定义其它的style
|
4299
|
-
|
4583
|
+
(
|
4584
|
+
border_style_l,
|
4585
|
+
border_style_r,
|
4586
|
+
border_style_t,
|
4587
|
+
border_style_b,
|
4588
|
+
) = (
|
4300
4589
|
border_style_all,
|
4301
4590
|
border_style_all,
|
4302
4591
|
border_style_all,
|
4303
4592
|
border_style_all,
|
4304
4593
|
)
|
4305
|
-
|
4594
|
+
(
|
4595
|
+
border_style_d,
|
4596
|
+
border_style_o,
|
4597
|
+
border_style_v,
|
4598
|
+
border_style_h,
|
4599
|
+
) = (
|
4306
4600
|
border_style_all,
|
4307
4601
|
border_style_all,
|
4308
4602
|
border_style_all,
|
@@ -4348,6 +4642,7 @@ def format_excel(
|
|
4348
4642
|
cell_.alignment = cell_alignment
|
4349
4643
|
if border:
|
4350
4644
|
cell_.border = border
|
4645
|
+
|
4351
4646
|
if not isinstance(df, pd.DataFrame):
|
4352
4647
|
try:
|
4353
4648
|
print(f"is loading file {os.path.basename(df)}")
|
@@ -4697,6 +4992,7 @@ def preview(var):
|
|
4697
4992
|
"""Master function to preview formatted variables in Jupyter."""
|
4698
4993
|
from bs4 import BeautifulSoup
|
4699
4994
|
from IPython.display import display, HTML, Markdown
|
4995
|
+
|
4700
4996
|
if isinstance(var, str):
|
4701
4997
|
if isa(var, "html"):
|
4702
4998
|
display(HTML(var)) # Render as HTML
|
@@ -4714,6 +5010,7 @@ def preview(var):
|
|
4714
5010
|
|
4715
5011
|
elif isinstance(var, list) or isinstance(var, dict):
|
4716
5012
|
import json
|
5013
|
+
|
4717
5014
|
# Display JSON
|
4718
5015
|
json_str = json.dumps(var, indent=4)
|
4719
5016
|
display(Markdown(f"```json\n{json_str}\n```"))
|
@@ -4728,6 +5025,7 @@ def preview(var):
|
|
4728
5025
|
|
4729
5026
|
elif isinstance(var, dict):
|
4730
5027
|
import json
|
5028
|
+
|
4731
5029
|
# Handle dictionary formatting
|
4732
5030
|
json_str = json.dumps(var, indent=4)
|
4733
5031
|
display(Markdown(f"```json\n{json_str}\n```"))
|
@@ -4735,12 +5033,15 @@ def preview(var):
|
|
4735
5033
|
else:
|
4736
5034
|
# If the format is not recognized, print a message
|
4737
5035
|
print("Format not recognized or unsupported.")
|
5036
|
+
|
5037
|
+
|
4738
5038
|
# # Example usages:
|
4739
5039
|
# preview("This is a plain text message.")
|
4740
5040
|
# preview("# This is a Markdown header")
|
4741
5041
|
# preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
|
4742
5042
|
# preview({"key": "value", "numbers": [1, 2, 3]})
|
4743
5043
|
|
5044
|
+
|
4744
5045
|
def _df_outlier(
|
4745
5046
|
data,
|
4746
5047
|
columns=None,
|
@@ -4880,51 +5181,53 @@ def df_outlier(
|
|
4880
5181
|
processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
|
4881
5182
|
processed_data = processed_data[col_names_org]
|
4882
5183
|
return processed_data
|
4883
|
-
|
4884
5184
|
|
4885
5185
|
|
4886
5186
|
def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
|
4887
5187
|
"""
|
4888
5188
|
Extend a DataFrame by the list elecments in the column.
|
4889
|
-
|
5189
|
+
|
4890
5190
|
Parameters:
|
4891
5191
|
----------
|
4892
5192
|
data : pd.DataFrame
|
4893
5193
|
The input DataFrame to be extended.
|
4894
|
-
|
5194
|
+
|
4895
5195
|
column : str
|
4896
5196
|
The name of the column to be split.
|
4897
|
-
|
5197
|
+
|
4898
5198
|
axis : int, optional
|
4899
|
-
The axis along which to expand the DataFrame.
|
5199
|
+
The axis along which to expand the DataFrame.
|
4900
5200
|
- 0 (default): Expand the specified column into multiple rows.
|
4901
5201
|
- 1: Expand the specified column into multiple columns.
|
4902
|
-
|
5202
|
+
|
4903
5203
|
sep : str, optional
|
4904
5204
|
The separator used to split the values in the specified column.
|
4905
5205
|
Must be provided for the function to work correctly.
|
4906
5206
|
"""
|
4907
|
-
|
4908
|
-
data = data.copy()
|
5207
|
+
|
5208
|
+
data = data.copy()
|
4909
5209
|
mask = data[column].str.contains(sep, na=False)
|
4910
5210
|
data = data.copy()
|
4911
5211
|
if mask.any():
|
4912
|
-
data[column] = (
|
4913
|
-
|
4914
|
-
|
4915
|
-
|
4916
|
-
|
5212
|
+
data[column] = data[column].apply(
|
5213
|
+
lambda x: x.split(sep) if isinstance(x, str) else x
|
5214
|
+
) # Only split if x is a string
|
5215
|
+
|
4917
5216
|
# Strip spaces from each item in the lists
|
4918
|
-
data[column] = data[column].apply(
|
4919
|
-
|
5217
|
+
data[column] = data[column].apply(
|
5218
|
+
lambda x: [item.strip() for item in x] if isinstance(x, list) else x
|
5219
|
+
)
|
5220
|
+
|
4920
5221
|
data = data.explode(column, ignore_index=True)
|
4921
5222
|
return data
|
5223
|
+
|
5224
|
+
|
4922
5225
|
# ! DataFrame
|
4923
5226
|
def df_astype(
|
4924
5227
|
data: pd.DataFrame,
|
4925
5228
|
columns: Optional[Union[str, List[str]]] = None,
|
4926
5229
|
astype: str = "datetime",
|
4927
|
-
skip_row:Union[str,list]=None,
|
5230
|
+
skip_row: Union[str, list] = None,
|
4928
5231
|
fmt: Optional[str] = None,
|
4929
5232
|
inplace: bool = True,
|
4930
5233
|
errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
|
@@ -4982,7 +5285,8 @@ def df_astype(
|
|
4982
5285
|
"second",
|
4983
5286
|
"time",
|
4984
5287
|
"week",
|
4985
|
-
"date",
|
5288
|
+
"date",
|
5289
|
+
"day",
|
4986
5290
|
"month",
|
4987
5291
|
"year",
|
4988
5292
|
]
|
@@ -4990,18 +5294,18 @@ def df_astype(
|
|
4990
5294
|
if not inplace:
|
4991
5295
|
data = data.copy()
|
4992
5296
|
if skip_row is not None:
|
4993
|
-
data = data.drop(index=skip_row, errors=
|
5297
|
+
data = data.drop(index=skip_row, errors="ignore")
|
4994
5298
|
# If columns is None, apply to all columns
|
4995
5299
|
if columns is None:
|
4996
5300
|
columns = data.columns.tolist()
|
4997
5301
|
# correct the astype input
|
4998
|
-
if isinstance(astype,str):
|
5302
|
+
if isinstance(astype, str):
|
4999
5303
|
astype = strcmp(astype, astypes)[0]
|
5000
5304
|
print(f"converting as type: {astype}")
|
5001
|
-
elif isinstance(astype,dict):
|
5305
|
+
elif isinstance(astype, dict):
|
5002
5306
|
for col, dtype in astype.items():
|
5003
|
-
dtype=
|
5004
|
-
data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
|
5307
|
+
dtype = "date" if dtype == "day" else dtype
|
5308
|
+
data["col"] = data["col"].adtype(strcmp(dtype, astypes)[0])
|
5005
5309
|
return data if not inplace else None
|
5006
5310
|
|
5007
5311
|
# Ensure columns is a list
|
@@ -5112,13 +5416,15 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
|
|
5112
5416
|
if column not in data.columns:
|
5113
5417
|
raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
|
5114
5418
|
|
5115
|
-
if isinstance(by, str) and
|
5419
|
+
if isinstance(by, str) and "count" in by.lower():
|
5116
5420
|
# Count occurrences of each value in the specified column
|
5117
5421
|
value_counts = df[column].value_counts()
|
5118
5422
|
|
5119
5423
|
# Determine the order based on counts
|
5120
5424
|
count_ascending = kwargs.pop("count_ascending", ascending)
|
5121
|
-
sorted_counts = value_counts.sort_values(
|
5425
|
+
sorted_counts = value_counts.sort_values(
|
5426
|
+
ascending=count_ascending
|
5427
|
+
).index.tolist()
|
5122
5428
|
|
5123
5429
|
# Convert to a categorical type with the new order
|
5124
5430
|
df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
|
@@ -5236,6 +5542,7 @@ def df_merge(
|
|
5236
5542
|
)
|
5237
5543
|
return df_merged
|
5238
5544
|
|
5545
|
+
|
5239
5546
|
def df_drop_duplicates(
|
5240
5547
|
data: pd.DataFrame,
|
5241
5548
|
by: Union[
|
@@ -5244,16 +5551,16 @@ def df_drop_duplicates(
|
|
5244
5551
|
keep="first", # Options: 'first', 'last', or False (drop all duplicates)
|
5245
5552
|
ignore_index=True,
|
5246
5553
|
inplace: bool = False,
|
5247
|
-
verbose=True
|
5554
|
+
verbose=True,
|
5248
5555
|
):
|
5249
5556
|
"""
|
5250
5557
|
data (pd.DataFrame): DataFrame to drop duplicates from.
|
5251
5558
|
by (str): Specify by to drop duplicates:
|
5252
5559
|
- 'index': Drop duplicates based on the DataFrame index.
|
5253
5560
|
- Column name(s) for row-wise duplicate checking.
|
5254
|
-
keep (str): Which duplicates to keep:
|
5255
|
-
'first',
|
5256
|
-
'last',
|
5561
|
+
keep (str): Which duplicates to keep:
|
5562
|
+
'first',
|
5563
|
+
'last',
|
5257
5564
|
False (drop all duplicates).
|
5258
5565
|
inplace (bool): Whether to modify the original DataFrame in place.
|
5259
5566
|
"""
|
@@ -5263,8 +5570,8 @@ def df_drop_duplicates(
|
|
5263
5570
|
result = data[~data.index.duplicated(keep=keep)]
|
5264
5571
|
else:
|
5265
5572
|
# Drop duplicates row-wise based on column(s)
|
5266
|
-
result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
|
5267
|
-
if original_shape!=result.shape or verbose:
|
5573
|
+
result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
|
5574
|
+
if original_shape != result.shape or verbose:
|
5268
5575
|
print(f"\nshape:{original_shape} (before drop_duplicates)")
|
5269
5576
|
print(f"shape:{result.shape} (after drop_duplicates)")
|
5270
5577
|
if inplace:
|
@@ -5274,16 +5581,18 @@ def df_drop_duplicates(
|
|
5274
5581
|
return None
|
5275
5582
|
else:
|
5276
5583
|
return result
|
5584
|
+
|
5585
|
+
|
5277
5586
|
#! fillna()
|
5278
5587
|
def df_fillna(
|
5279
5588
|
data: pd.DataFrame,
|
5280
5589
|
method: str = "knn",
|
5281
|
-
axis: int = 0
|
5590
|
+
axis: int = 0, # column-wise
|
5282
5591
|
constant: float = None,
|
5283
5592
|
n_neighbors: int = 5, # KNN-specific
|
5284
|
-
max_iter: int = 10,
|
5593
|
+
max_iter: int = 10, # Iterative methods specific
|
5285
5594
|
inplace: bool = False,
|
5286
|
-
random_state:int = 1
|
5595
|
+
random_state: int = 1,
|
5287
5596
|
) -> pd.DataFrame:
|
5288
5597
|
"""
|
5289
5598
|
Fill missing values in a DataFrame using specified imputation method.
|
@@ -5299,11 +5608,11 @@ def df_fillna(
|
|
5299
5608
|
- 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
|
5300
5609
|
- 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
|
5301
5610
|
# - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
|
5302
|
-
# - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
|
5611
|
+
# - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
|
5303
5612
|
# reconstructing the data matrix using low-rank approximation
|
5304
5613
|
# - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
|
5305
5614
|
# - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
|
5306
|
-
|
5615
|
+
|
5307
5616
|
axis (int): The axis along which to impute:
|
5308
5617
|
- 0: Impute column-wise (default).
|
5309
5618
|
- 1: Impute row-wise.
|
@@ -5312,7 +5621,7 @@ def df_fillna(
|
|
5312
5621
|
|
5313
5622
|
"""
|
5314
5623
|
if isinstance(data, pd.Series):
|
5315
|
-
data=pd.DataFrame(data)
|
5624
|
+
data = pd.DataFrame(data)
|
5316
5625
|
# handle None
|
5317
5626
|
for col in data.columns:
|
5318
5627
|
data[col] = data[col].apply(lambda x: np.nan if x is None else x)
|
@@ -5322,13 +5631,19 @@ def df_fillna(
|
|
5322
5631
|
# Separate numeric and non-numeric columns
|
5323
5632
|
numeric_data = data.select_dtypes(include=[np.number])
|
5324
5633
|
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
5325
|
-
|
5634
|
+
|
5326
5635
|
if data.empty:
|
5327
5636
|
raise ValueError("Input DataFrame is empty.")
|
5328
5637
|
|
5329
5638
|
# Validate method
|
5330
|
-
methods = [
|
5331
|
-
|
5639
|
+
methods = [
|
5640
|
+
"mean",
|
5641
|
+
"median",
|
5642
|
+
"most_frequent",
|
5643
|
+
"constant",
|
5644
|
+
"knn",
|
5645
|
+
"iterative",
|
5646
|
+
] # ,"missforest","softimpute","svd"]
|
5332
5647
|
method = strcmp(method, methods)[0]
|
5333
5648
|
|
5334
5649
|
# If using constant method, ask for a constant value
|
@@ -5342,17 +5657,20 @@ def df_fillna(
|
|
5342
5657
|
# Initialize SimpleImputer with the chosen method
|
5343
5658
|
if method == "constant":
|
5344
5659
|
from sklearn.impute import SimpleImputer
|
5660
|
+
|
5345
5661
|
imputer = SimpleImputer(strategy=method, fill_value=constant)
|
5346
5662
|
elif method == "knn":
|
5347
5663
|
from sklearn.impute import KNNImputer
|
5664
|
+
|
5348
5665
|
imputer = KNNImputer(n_neighbors=n_neighbors)
|
5349
5666
|
elif method == "iterative" or method == "mice":
|
5350
5667
|
from sklearn.experimental import enable_iterative_imputer
|
5351
5668
|
from sklearn.impute import IterativeImputer
|
5352
5669
|
|
5353
|
-
imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
|
5354
|
-
else:
|
5670
|
+
imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
|
5671
|
+
else: # mean, median, most_frequent
|
5355
5672
|
from sklearn.impute import SimpleImputer
|
5673
|
+
|
5356
5674
|
imputer = SimpleImputer(strategy=method)
|
5357
5675
|
|
5358
5676
|
# Fit and transform the data
|
@@ -5376,23 +5694,29 @@ def df_fillna(
|
|
5376
5694
|
# Handle non-numeric data imputation
|
5377
5695
|
if not non_numeric_data.empty:
|
5378
5696
|
from sklearn.impute import SimpleImputer
|
5697
|
+
|
5379
5698
|
if method == "constant":
|
5380
|
-
non_numeric_imputer = SimpleImputer(
|
5699
|
+
non_numeric_imputer = SimpleImputer(
|
5700
|
+
strategy="constant", fill_value=constant
|
5701
|
+
)
|
5381
5702
|
else:
|
5382
5703
|
non_numeric_imputer = SimpleImputer(strategy="most_frequent")
|
5383
|
-
|
5704
|
+
|
5384
5705
|
# Impute non-numeric columns column-wise (axis=0)
|
5385
5706
|
imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
|
5386
|
-
|
5707
|
+
|
5387
5708
|
# Convert imputed non-numeric array back to DataFrame with original index and column names
|
5388
5709
|
imputed_non_numeric_df = pd.DataFrame(
|
5389
|
-
imputed_non_numeric,
|
5710
|
+
imputed_non_numeric,
|
5711
|
+
index=non_numeric_data.index,
|
5712
|
+
columns=non_numeric_data.columns,
|
5390
5713
|
)
|
5391
5714
|
else:
|
5392
5715
|
imputed_non_numeric_df = pd.DataFrame(index=data.index)
|
5393
5716
|
|
5394
|
-
|
5395
|
-
|
5717
|
+
imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
|
5718
|
+
columns=data.columns
|
5719
|
+
)
|
5396
5720
|
|
5397
5721
|
if inplace:
|
5398
5722
|
# Modify the original DataFrame
|
@@ -5401,6 +5725,8 @@ def df_fillna(
|
|
5401
5725
|
else:
|
5402
5726
|
# Return the modified DataFrame
|
5403
5727
|
return imputed_data[col_names_org]
|
5728
|
+
|
5729
|
+
|
5404
5730
|
# # example
|
5405
5731
|
# data = {
|
5406
5732
|
# "A": [1, 2, np.nan, 4, 5],
|
@@ -5430,14 +5756,15 @@ def df_fillna(
|
|
5430
5756
|
# display(df)
|
5431
5757
|
# display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
|
5432
5758
|
|
5759
|
+
|
5433
5760
|
def df_encoder(
|
5434
5761
|
data: pd.DataFrame,
|
5435
|
-
method: str = "dummy"
|
5762
|
+
method: str = "dummy", #'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
|
5436
5763
|
columns=None,
|
5437
5764
|
target_column=None, # Required for 'target' encoding method
|
5438
|
-
**kwargs
|
5765
|
+
**kwargs,
|
5439
5766
|
) -> pd.DataFrame:
|
5440
|
-
"""
|
5767
|
+
"""
|
5441
5768
|
Methods explained:
|
5442
5769
|
- 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
|
5443
5770
|
|
@@ -5454,18 +5781,20 @@ def df_encoder(
|
|
5454
5781
|
|
5455
5782
|
# Select categorical columns
|
5456
5783
|
categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
|
5457
|
-
methods = ["dummy","onehot", "ordinal", "label", "target", "binary"]
|
5784
|
+
methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
|
5458
5785
|
method = strcmp(method, methods)[0]
|
5459
5786
|
|
5460
5787
|
if columns is None:
|
5461
5788
|
columns = categorical_cols
|
5462
5789
|
|
5463
5790
|
# pd.get_dummies()
|
5464
|
-
if method==
|
5465
|
-
dtype=kwargs.pop("dtype",int)
|
5466
|
-
drop_first=kwargs.pop("drop_first",True)
|
5791
|
+
if method == "dummy":
|
5792
|
+
dtype = kwargs.pop("dtype", int)
|
5793
|
+
drop_first = kwargs.pop("drop_first", True)
|
5467
5794
|
try:
|
5468
|
-
encoded_df = pd.get_dummies(
|
5795
|
+
encoded_df = pd.get_dummies(
|
5796
|
+
data[columns], drop_first=drop_first, dtype=dtype, **kwargs
|
5797
|
+
)
|
5469
5798
|
return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
|
5470
5799
|
except Exception as e:
|
5471
5800
|
# print(f"Warning, 没有进行转换, 因为: {e}")
|
@@ -5518,8 +5847,9 @@ def df_encoder(
|
|
5518
5847
|
encoded_data = encoder.fit_transform(data[columns])
|
5519
5848
|
return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
|
5520
5849
|
|
5850
|
+
|
5521
5851
|
def df_scaler(
|
5522
|
-
data: pd.DataFrame,
|
5852
|
+
data: pd.DataFrame, # should be numeric dtype
|
5523
5853
|
method="standard",
|
5524
5854
|
columns=None, # default, select all numeric col/row
|
5525
5855
|
inplace=False,
|
@@ -5603,6 +5933,8 @@ def df_scaler(
|
|
5603
5933
|
scaled_df = data.copy()
|
5604
5934
|
scaled_df.loc[numeric_rows.index] = scaled_data
|
5605
5935
|
return scaled_df
|
5936
|
+
|
5937
|
+
|
5606
5938
|
def df_special_characters_cleaner(
|
5607
5939
|
data: pd.DataFrame, where=["column", "content", "index"]
|
5608
5940
|
) -> pd.DataFrame:
|
@@ -5628,6 +5960,8 @@ def df_special_characters_cleaner(
|
|
5628
5960
|
data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
|
5629
5961
|
|
5630
5962
|
return data
|
5963
|
+
|
5964
|
+
|
5631
5965
|
def df_cluster(
|
5632
5966
|
data: pd.DataFrame,
|
5633
5967
|
columns: Optional[list] = None,
|
@@ -5636,7 +5970,7 @@ def df_cluster(
|
|
5636
5970
|
scale: bool = True,
|
5637
5971
|
plot: Union[str, list] = "all",
|
5638
5972
|
inplace: bool = True,
|
5639
|
-
ax
|
5973
|
+
ax=None,
|
5640
5974
|
):
|
5641
5975
|
from sklearn.preprocessing import StandardScaler
|
5642
5976
|
from sklearn.cluster import KMeans
|
@@ -5952,24 +6286,23 @@ def df_reducer(
|
|
5952
6286
|
umap_neighbors: int = 15, # UMAP-specific
|
5953
6287
|
umap_min_dist: float = 0.1, # UMAP-specific
|
5954
6288
|
tsne_perplexity: int = 30, # t-SNE-specific
|
5955
|
-
hue:str = None
|
6289
|
+
hue: str = None, # lda-specific
|
5956
6290
|
scale: bool = True,
|
5957
6291
|
fill_missing: bool = True,
|
5958
6292
|
debug: bool = False,
|
5959
6293
|
inplace: bool = True, # replace the oringinal data
|
5960
|
-
plot_:bool = False
|
6294
|
+
plot_: bool = False, # plot scatterplot, but no 'hue',so it is meaningless
|
5961
6295
|
random_state=1,
|
5962
|
-
ax
|
6296
|
+
ax=None,
|
5963
6297
|
figsize=None,
|
5964
|
-
**kwargs
|
5965
|
-
) -> pd.DataFrame:
|
6298
|
+
**kwargs,
|
6299
|
+
) -> pd.DataFrame:
|
5966
6300
|
dict_methods = {
|
5967
6301
|
#!Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
|
5968
6302
|
"pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
|
5969
6303
|
"lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
|
5970
6304
|
"factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
|
5971
6305
|
"svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
|
5972
|
-
|
5973
6306
|
#! Non-linear Dimensionality Reduction (Manifold Learning)
|
5974
6307
|
"umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
|
5975
6308
|
"tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global. relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
|
@@ -5977,28 +6310,40 @@ def df_reducer(
|
|
5977
6310
|
"lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
|
5978
6311
|
"kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
|
5979
6312
|
"ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing).is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
|
5980
|
-
|
5981
6313
|
#! Anomaly Detection: Specialized for detecting outliers or unusual patterns
|
5982
6314
|
"isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
|
5983
6315
|
}
|
5984
6316
|
|
5985
6317
|
from sklearn.preprocessing import StandardScaler
|
5986
6318
|
from sklearn.impute import SimpleImputer
|
5987
|
-
|
5988
|
-
|
6319
|
+
|
6320
|
+
if plot_:
|
6321
|
+
import matplotlib.pyplot as plt
|
5989
6322
|
import seaborn as sns
|
5990
6323
|
# Check valid method input
|
5991
|
-
methods=[
|
5992
|
-
|
6324
|
+
methods = [
|
6325
|
+
"pca",
|
6326
|
+
"umap",
|
6327
|
+
"tsne",
|
6328
|
+
"factor",
|
6329
|
+
"isolation_forest",
|
6330
|
+
"lda",
|
6331
|
+
"kpca",
|
6332
|
+
"ica",
|
6333
|
+
"mds",
|
6334
|
+
"lle",
|
6335
|
+
"svd",
|
6336
|
+
]
|
6337
|
+
method = strcmp(method, methods)[0]
|
5993
6338
|
print(f"\nprocessing with using {dict_methods[method]}:")
|
5994
|
-
xlabel,ylabel=None,None
|
6339
|
+
xlabel, ylabel = None, None
|
5995
6340
|
if columns is None:
|
5996
|
-
columns = data.select_dtypes(include=
|
6341
|
+
columns = data.select_dtypes(include="number").columns.tolist()
|
5997
6342
|
if hue is None:
|
5998
|
-
hue
|
6343
|
+
hue = data.select_dtypes(exclude="number").columns.tolist()
|
5999
6344
|
if isinstance(hue, list):
|
6000
6345
|
print("Warning: hue is a list, only select the 1st one")
|
6001
|
-
hue=hue[0]
|
6346
|
+
hue = hue[0]
|
6002
6347
|
if not hue:
|
6003
6348
|
# Select columns if specified, else use all columns
|
6004
6349
|
X = data[columns].values if columns else data.values
|
@@ -6018,11 +6363,12 @@ def df_reducer(
|
|
6018
6363
|
X = scaler.fit_transform(X)
|
6019
6364
|
|
6020
6365
|
# Apply PCA if selected
|
6021
|
-
if method == "pca":
|
6366
|
+
if method == "pca":
|
6022
6367
|
from sklearn.decomposition import PCA
|
6368
|
+
|
6023
6369
|
pca = PCA(n_components=n_components)
|
6024
6370
|
X_reduced = pca.fit_transform(X)
|
6025
|
-
|
6371
|
+
|
6026
6372
|
# Additional PCA information
|
6027
6373
|
explained_variance = pca.explained_variance_ratio_
|
6028
6374
|
singular_values = pca.singular_values_
|
@@ -6038,56 +6384,72 @@ def df_reducer(
|
|
6038
6384
|
# Plot explained variance
|
6039
6385
|
cumulative_variance = np.cumsum(explained_variance)
|
6040
6386
|
plt.figure(figsize=(8, 5))
|
6041
|
-
plt.plot(
|
6387
|
+
plt.plot(
|
6388
|
+
range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o"
|
6389
|
+
)
|
6042
6390
|
plt.title("Cumulative Explained Variance by Principal Components")
|
6043
6391
|
plt.xlabel("Number of Principal Components")
|
6044
6392
|
plt.ylabel("Cumulative Explained Variance")
|
6045
6393
|
plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
|
6046
|
-
plt.axvline(
|
6394
|
+
plt.axvline(
|
6395
|
+
x=n_components,
|
6396
|
+
color="g",
|
6397
|
+
linestyle="--",
|
6398
|
+
label=f"n_components = {n_components}",
|
6399
|
+
)
|
6047
6400
|
plt.legend()
|
6048
6401
|
plt.grid()
|
6049
6402
|
plt.show()
|
6050
6403
|
|
6051
6404
|
# Prepare reduced DataFrame with additional PCA info
|
6052
6405
|
pca_df = pd.DataFrame(
|
6053
|
-
X_reduced,
|
6054
|
-
|
6055
|
-
|
6406
|
+
X_reduced,
|
6407
|
+
index=data.index,
|
6408
|
+
columns=[f"PC_{i+1}" for i in range(n_components)],
|
6409
|
+
)
|
6056
6410
|
# pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
|
6057
6411
|
# pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
|
6058
6412
|
# Expand explained variance to multiple columns if needed
|
6059
6413
|
for i in range(n_components):
|
6060
|
-
pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
|
6414
|
+
pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
|
6415
|
+
format(explained_variance[i] * 100, ".3f") + "%", (pca_df.shape[0], 1)
|
6416
|
+
)
|
6061
6417
|
for i in range(n_components):
|
6062
|
-
pca_df[f"Singular Values PC_{i+1}"] = np.tile(
|
6418
|
+
pca_df[f"Singular Values PC_{i+1}"] = np.tile(
|
6419
|
+
singular_values[i], (pca_df.shape[0], 1)
|
6420
|
+
)
|
6063
6421
|
if hue:
|
6064
|
-
pca_df[hue]=y
|
6065
|
-
elif method ==
|
6422
|
+
pca_df[hue] = y
|
6423
|
+
elif method == "lda":
|
6066
6424
|
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
6067
|
-
|
6425
|
+
|
6068
6426
|
if "hue" not in locals() or hue is None:
|
6069
|
-
raise ValueError(
|
6427
|
+
raise ValueError(
|
6428
|
+
"LDA requires a 'hue' col parameter to specify class labels."
|
6429
|
+
)
|
6070
6430
|
|
6071
6431
|
lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
|
6072
6432
|
X_reduced = lda_reducer.fit_transform(X, y)
|
6073
|
-
|
6433
|
+
|
6074
6434
|
# Prepare reduced DataFrame with additional LDA info
|
6075
6435
|
lda_df = pd.DataFrame(
|
6076
|
-
X_reduced,
|
6077
|
-
|
6436
|
+
X_reduced,
|
6437
|
+
index=data.index,
|
6438
|
+
columns=[f"LDA_{i+1}" for i in range(n_components)],
|
6078
6439
|
)
|
6079
6440
|
if debug:
|
6080
6441
|
print(f"LDA completed: Reduced to {n_components} components.")
|
6081
6442
|
print("Class separability achieved by LDA.")
|
6082
6443
|
if hue:
|
6083
|
-
lda_df[hue]=y
|
6444
|
+
lda_df[hue] = y
|
6084
6445
|
# Apply UMAP if selected
|
6085
6446
|
elif method == "umap":
|
6086
6447
|
import umap
|
6448
|
+
|
6087
6449
|
umap_reducer = umap.UMAP(
|
6088
6450
|
n_neighbors=umap_neighbors,
|
6089
6451
|
min_dist=umap_min_dist,
|
6090
|
-
n_components=n_components
|
6452
|
+
n_components=n_components,
|
6091
6453
|
)
|
6092
6454
|
X_reduced = umap_reducer.fit_transform(X)
|
6093
6455
|
|
@@ -6102,45 +6464,57 @@ def df_reducer(
|
|
6102
6464
|
|
6103
6465
|
# Prepare reduced DataFrame with additional UMAP info
|
6104
6466
|
umap_df = pd.DataFrame(
|
6105
|
-
X_reduced,
|
6106
|
-
|
6467
|
+
X_reduced,
|
6468
|
+
index=data.index,
|
6469
|
+
columns=[f"UMAP_{i+1}" for i in range(n_components)],
|
6107
6470
|
)
|
6108
6471
|
umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
|
6109
6472
|
umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
|
6110
6473
|
if hue:
|
6111
|
-
umap_df[hue]=y
|
6474
|
+
umap_df[hue] = y
|
6112
6475
|
elif method == "tsne":
|
6113
6476
|
from sklearn.manifold import TSNE
|
6114
|
-
|
6115
|
-
|
6477
|
+
|
6478
|
+
tsne = TSNE(
|
6479
|
+
n_components=n_components,
|
6480
|
+
perplexity=tsne_perplexity,
|
6481
|
+
random_state=random_state,
|
6482
|
+
)
|
6483
|
+
X_reduced = tsne.fit_transform(X)
|
6116
6484
|
tsne_df = pd.DataFrame(
|
6117
|
-
X_reduced,
|
6485
|
+
X_reduced,
|
6118
6486
|
index=data.index,
|
6119
|
-
columns=[f"tSNE_{i+1}" for i in range(n_components)]
|
6487
|
+
columns=[f"tSNE_{i+1}" for i in range(n_components)],
|
6488
|
+
)
|
6489
|
+
tsne_df["Perplexity"] = np.tile(
|
6490
|
+
f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1)
|
6120
6491
|
)
|
6121
|
-
tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
|
6122
6492
|
if hue:
|
6123
|
-
tsne_df[hue]=y
|
6493
|
+
tsne_df[hue] = y
|
6124
6494
|
# Apply Factor Analysis if selected
|
6125
6495
|
elif method == "factor":
|
6126
6496
|
from sklearn.decomposition import FactorAnalysis
|
6497
|
+
|
6127
6498
|
factor = FactorAnalysis(n_components=n_components, random_state=random_state)
|
6128
6499
|
X_reduced = factor.fit_transform(X)
|
6129
6500
|
# Factor Analysis does not directly provide explained variance, but we can approximate it
|
6130
6501
|
fa_variance = factor.noise_variance_
|
6131
6502
|
# Prepare reduced DataFrame with additional Factor Analysis info
|
6132
6503
|
factor_df = pd.DataFrame(
|
6133
|
-
X_reduced,
|
6504
|
+
X_reduced,
|
6134
6505
|
index=data.index,
|
6135
|
-
columns=[f"Factor_{i+1}" for i in range(n_components)]
|
6506
|
+
columns=[f"Factor_{i+1}" for i in range(n_components)],
|
6507
|
+
)
|
6508
|
+
factor_df["Noise Variance"] = np.tile(
|
6509
|
+
format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1)
|
6136
6510
|
)
|
6137
|
-
factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
|
6138
6511
|
if hue:
|
6139
|
-
factor_df[hue]=y
|
6512
|
+
factor_df[hue] = y
|
6140
6513
|
# Apply Isolation Forest for outlier detection if selected
|
6141
6514
|
elif method == "isolation_forest":
|
6142
6515
|
from sklearn.decomposition import PCA
|
6143
6516
|
from sklearn.ensemble import IsolationForest
|
6517
|
+
|
6144
6518
|
# Step 1: Apply PCA for dimensionality reduction to 2 components
|
6145
6519
|
pca = PCA(n_components=n_components)
|
6146
6520
|
X_pca = pca.fit_transform(X)
|
@@ -6150,87 +6524,108 @@ def df_reducer(
|
|
6150
6524
|
|
6151
6525
|
# Prepare reduced DataFrame with additional PCA info
|
6152
6526
|
iso_forest_df = pd.DataFrame(
|
6153
|
-
X_pca, index=data.index,
|
6154
|
-
columns=[f"PC_{i+1}" for i in range(n_components)]
|
6527
|
+
X_pca, index=data.index, columns=[f"PC_{i+1}" for i in range(n_components)]
|
6155
6528
|
)
|
6156
6529
|
|
6157
|
-
isolation_forest = IsolationForest(
|
6530
|
+
isolation_forest = IsolationForest(
|
6531
|
+
n_estimators=100, contamination="auto", random_state=1
|
6532
|
+
)
|
6158
6533
|
isolation_forest.fit(X)
|
6159
|
-
anomaly_scores = isolation_forest.decision_function(
|
6534
|
+
anomaly_scores = isolation_forest.decision_function(
|
6535
|
+
X
|
6536
|
+
) # Anomaly score: larger is less anomalous
|
6160
6537
|
# Predict labels: 1 (normal), -1 (anomaly)
|
6161
|
-
anomaly_labels = isolation_forest.fit_predict(X)
|
6538
|
+
anomaly_labels = isolation_forest.fit_predict(X)
|
6162
6539
|
# Add anomaly scores and labels to the DataFrame
|
6163
6540
|
iso_forest_df["Anomaly Score"] = anomaly_scores
|
6164
6541
|
iso_forest_df["Anomaly Label"] = anomaly_labels
|
6165
6542
|
# add info from pca
|
6166
6543
|
for i in range(n_components):
|
6167
|
-
iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
|
6544
|
+
iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
|
6545
|
+
format(explained_variance[i] * 100, ".3f") + "%",
|
6546
|
+
(iso_forest_df.shape[0], 1),
|
6547
|
+
)
|
6168
6548
|
for i in range(n_components):
|
6169
|
-
iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
|
6549
|
+
iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
|
6550
|
+
singular_values[i], (iso_forest_df.shape[0], 1)
|
6551
|
+
)
|
6170
6552
|
if hue:
|
6171
|
-
iso_forest_df[hue]=y
|
6172
|
-
|
6553
|
+
iso_forest_df[hue] = y
|
6554
|
+
# * Apply Kernel PCA if selected
|
6173
6555
|
elif method == "kpca":
|
6174
6556
|
from sklearn.decomposition import KernelPCA
|
6175
|
-
|
6557
|
+
|
6558
|
+
kpca = KernelPCA(
|
6559
|
+
n_components=n_components, kernel="rbf", random_state=random_state
|
6560
|
+
)
|
6176
6561
|
X_reduced = kpca.fit_transform(X)
|
6177
6562
|
|
6178
6563
|
# Prepare reduced DataFrame with KPCA info
|
6179
6564
|
kpca_df = pd.DataFrame(
|
6180
|
-
X_reduced,
|
6565
|
+
X_reduced,
|
6181
6566
|
index=data.index,
|
6182
|
-
columns=[f"KPCA_{i+1}" for i in range(n_components)]
|
6567
|
+
columns=[f"KPCA_{i+1}" for i in range(n_components)],
|
6183
6568
|
)
|
6184
6569
|
if debug:
|
6185
6570
|
print("Kernel PCA completed with RBF kernel.")
|
6186
6571
|
if hue:
|
6187
|
-
kpca_df[hue]=y
|
6188
|
-
|
6572
|
+
kpca_df[hue] = y
|
6573
|
+
# * Apply ICA if selected
|
6189
6574
|
elif method == "ica":
|
6190
6575
|
from sklearn.decomposition import FastICA
|
6576
|
+
|
6191
6577
|
ica = FastICA(n_components=n_components, random_state=random_state)
|
6192
6578
|
X_reduced = ica.fit_transform(X)
|
6193
6579
|
|
6194
6580
|
# Prepare reduced DataFrame with ICA info
|
6195
6581
|
ica_df = pd.DataFrame(
|
6196
|
-
X_reduced,
|
6197
|
-
|
6582
|
+
X_reduced,
|
6583
|
+
index=data.index,
|
6584
|
+
columns=[f"ICA_{i+1}" for i in range(n_components)],
|
6198
6585
|
)
|
6199
6586
|
if debug:
|
6200
6587
|
print("Independent Component Analysis (ICA) completed.")
|
6201
6588
|
if hue:
|
6202
|
-
ica_df[hue]=y
|
6203
|
-
|
6589
|
+
ica_df[hue] = y
|
6590
|
+
# * Apply MDS if selected
|
6204
6591
|
elif method == "mds":
|
6205
6592
|
from sklearn.manifold import MDS
|
6593
|
+
|
6206
6594
|
mds = MDS(n_components=n_components, random_state=random_state)
|
6207
6595
|
X_reduced = mds.fit_transform(X)
|
6208
6596
|
|
6209
6597
|
# Prepare reduced DataFrame with MDS info
|
6210
6598
|
mds_df = pd.DataFrame(
|
6211
|
-
X_reduced,
|
6212
|
-
|
6599
|
+
X_reduced,
|
6600
|
+
index=data.index,
|
6601
|
+
columns=[f"MDS_{i+1}" for i in range(n_components)],
|
6213
6602
|
)
|
6214
6603
|
if debug:
|
6215
6604
|
print("Multidimensional Scaling (MDS) completed.")
|
6216
6605
|
if hue:
|
6217
|
-
mds_df[hue]=y
|
6218
|
-
|
6606
|
+
mds_df[hue] = y
|
6607
|
+
# * Apply Locally Linear Embedding (LLE) if selected
|
6219
6608
|
elif method == "lle":
|
6220
6609
|
from sklearn.manifold import LocallyLinearEmbedding
|
6221
|
-
|
6610
|
+
|
6611
|
+
lle = LocallyLinearEmbedding(
|
6612
|
+
n_components=n_components,
|
6613
|
+
n_neighbors=umap_neighbors,
|
6614
|
+
random_state=random_state,
|
6615
|
+
)
|
6222
6616
|
X_reduced = lle.fit_transform(X)
|
6223
6617
|
|
6224
6618
|
# Prepare reduced DataFrame with LLE info
|
6225
6619
|
lle_df = pd.DataFrame(
|
6226
|
-
X_reduced,
|
6227
|
-
|
6620
|
+
X_reduced,
|
6621
|
+
index=data.index,
|
6622
|
+
columns=[f"LLE_{i+1}" for i in range(n_components)],
|
6228
6623
|
)
|
6229
6624
|
if debug:
|
6230
6625
|
print("Locally Linear Embedding (LLE) completed.")
|
6231
6626
|
if hue:
|
6232
|
-
lle_df[hue]=y
|
6233
|
-
|
6627
|
+
lle_df[hue] = y
|
6628
|
+
# * Apply Singular Value Decomposition (SVD) if selected
|
6234
6629
|
elif method == "svd":
|
6235
6630
|
# Using NumPy's SVD for dimensionality reduction
|
6236
6631
|
U, s, Vt = np.linalg.svd(X, full_matrices=False)
|
@@ -6238,11 +6633,12 @@ def df_reducer(
|
|
6238
6633
|
|
6239
6634
|
# Prepare reduced DataFrame with SVD info
|
6240
6635
|
svd_df = pd.DataFrame(
|
6241
|
-
X_reduced,
|
6242
|
-
|
6636
|
+
X_reduced,
|
6637
|
+
index=data.index,
|
6638
|
+
columns=[f"SVD_{i+1}" for i in range(n_components)],
|
6243
6639
|
)
|
6244
6640
|
if hue:
|
6245
|
-
svd_df[hue]=y
|
6641
|
+
svd_df[hue] = y
|
6246
6642
|
if debug:
|
6247
6643
|
print("Singular Value Decomposition (SVD) completed.")
|
6248
6644
|
|
@@ -6250,17 +6646,17 @@ def df_reducer(
|
|
6250
6646
|
if method == "pca":
|
6251
6647
|
reduced_df = pca_df
|
6252
6648
|
colname_met = "PC_"
|
6253
|
-
xlabel= f"PC_1 ({pca_df[
|
6254
|
-
ylabel= f"PC_2 ({pca_df[
|
6649
|
+
xlabel = f"PC_1 ({pca_df['Explained Variance PC_1'].tolist()[0]})"
|
6650
|
+
ylabel = f"PC_2 ({pca_df['Explained Variance PC_2'].tolist()[0]})"
|
6255
6651
|
elif method == "umap":
|
6256
6652
|
reduced_df = umap_df
|
6257
|
-
colname_met = "UMAP_"
|
6653
|
+
colname_met = "UMAP_"
|
6258
6654
|
elif method == "tsne":
|
6259
6655
|
reduced_df = tsne_df
|
6260
|
-
colname_met = "tSNE_"
|
6656
|
+
colname_met = "tSNE_"
|
6261
6657
|
elif method == "factor":
|
6262
6658
|
reduced_df = factor_df
|
6263
|
-
colname_met = "Factor_"
|
6659
|
+
colname_met = "Factor_"
|
6264
6660
|
elif method == "isolation_forest":
|
6265
6661
|
reduced_df = iso_forest_df # Already a DataFrame for outliers
|
6266
6662
|
colname_met = "PC_"
|
@@ -6269,7 +6665,8 @@ def df_reducer(
|
|
6269
6665
|
data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
|
6270
6666
|
x="PC_1",
|
6271
6667
|
y="PC_2",
|
6272
|
-
label="normal",
|
6668
|
+
label="normal",
|
6669
|
+
c="b",
|
6273
6670
|
)
|
6274
6671
|
ax = sns.scatterplot(
|
6275
6672
|
ax=ax,
|
@@ -6277,73 +6674,86 @@ def df_reducer(
|
|
6277
6674
|
x="PC_1",
|
6278
6675
|
y="PC_2",
|
6279
6676
|
c="r",
|
6280
|
-
label="outlier",
|
6677
|
+
label="outlier",
|
6678
|
+
marker="+",
|
6679
|
+
s=30,
|
6281
6680
|
)
|
6282
|
-
elif method==
|
6283
|
-
reduced_df=lda_df
|
6284
|
-
colname_met="LDA_"
|
6285
|
-
elif method=="kpca":
|
6286
|
-
reduced_df=kpca_df
|
6287
|
-
colname_met="KPCA_"
|
6288
|
-
elif method=="ica":
|
6289
|
-
reduced_df=ica_df
|
6290
|
-
colname_met="ICA_"
|
6291
|
-
elif method=="mds":
|
6292
|
-
reduced_df=mds_df
|
6293
|
-
colname_met="MDS_"
|
6294
|
-
elif method=="lle":
|
6295
|
-
reduced_df=lle_df
|
6296
|
-
colname_met="LLE_"
|
6297
|
-
elif method=="svd":
|
6298
|
-
reduced_df=svd_df
|
6299
|
-
colname_met="SVD_"
|
6681
|
+
elif method == "lda":
|
6682
|
+
reduced_df = lda_df
|
6683
|
+
colname_met = "LDA_"
|
6684
|
+
elif method == "kpca":
|
6685
|
+
reduced_df = kpca_df
|
6686
|
+
colname_met = "KPCA_"
|
6687
|
+
elif method == "ica":
|
6688
|
+
reduced_df = ica_df
|
6689
|
+
colname_met = "ICA_"
|
6690
|
+
elif method == "mds":
|
6691
|
+
reduced_df = mds_df
|
6692
|
+
colname_met = "MDS_"
|
6693
|
+
elif method == "lle":
|
6694
|
+
reduced_df = lle_df
|
6695
|
+
colname_met = "LLE_"
|
6696
|
+
elif method == "svd":
|
6697
|
+
reduced_df = svd_df
|
6698
|
+
colname_met = "SVD_"
|
6300
6699
|
# Quick plots
|
6301
6700
|
if plot_ and (not method in ["isolation_forest"]):
|
6302
6701
|
from .plot import plotxy
|
6702
|
+
|
6303
6703
|
if ax is None:
|
6304
6704
|
if figsize is None:
|
6305
|
-
_, ax = plt.subplots(figsize=cm2inch(8,8))
|
6705
|
+
_, ax = plt.subplots(figsize=cm2inch(8, 8))
|
6306
6706
|
else:
|
6307
6707
|
_, ax = plt.subplots(figsize=figsize)
|
6308
6708
|
else:
|
6309
|
-
ax=ax.cla()
|
6310
|
-
ax=plotxy(
|
6311
|
-
|
6312
|
-
|
6313
|
-
|
6314
|
-
|
6315
|
-
|
6316
|
-
|
6317
|
-
|
6318
|
-
|
6319
|
-
|
6320
|
-
|
6321
|
-
|
6322
|
-
|
6323
|
-
|
6709
|
+
ax = ax.cla()
|
6710
|
+
ax = plotxy(
|
6711
|
+
data=reduced_df,
|
6712
|
+
x=colname_met + "1",
|
6713
|
+
y=colname_met + "2",
|
6714
|
+
hue=hue,
|
6715
|
+
s=1,
|
6716
|
+
edgecolor="none",
|
6717
|
+
kind="scater",
|
6718
|
+
figsets=dict(
|
6719
|
+
legend=dict(loc="best", markerscale=4),
|
6720
|
+
xlabel=xlabel if xlabel else None,
|
6721
|
+
ylabel=ylabel if ylabel else None,
|
6722
|
+
),
|
6723
|
+
ax=ax,
|
6724
|
+
verbose=False,
|
6725
|
+
**kwargs,
|
6726
|
+
)
|
6324
6727
|
|
6325
6728
|
if inplace:
|
6326
6729
|
# If inplace=True, add components back into the original data
|
6327
6730
|
for col_idx in range(n_components):
|
6328
|
-
data.loc[:,f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
|
6731
|
+
data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
|
6329
6732
|
# Add extra info for PCA/UMAP
|
6330
6733
|
if method == "pca":
|
6331
6734
|
for i in range(n_components):
|
6332
|
-
data.loc[:,f"Explained Variance PC_{i+1}"] = reduced_df.loc[
|
6735
|
+
data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[
|
6736
|
+
:, f"Explained Variance PC_{i+1}"
|
6737
|
+
]
|
6333
6738
|
for i in range(n_components):
|
6334
|
-
data.loc[:,f"Singular Values PC_{i+1}"] = reduced_df.loc[
|
6335
|
-
|
6739
|
+
data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[
|
6740
|
+
:, f"Singular Values PC_{i+1}"
|
6741
|
+
]
|
6742
|
+
elif method == "umap":
|
6336
6743
|
for i in range(n_components):
|
6337
|
-
data.loc[:,f"UMAP_{i+1}"]=reduced_df.loc[:,f"UMAP_{i+1}"]
|
6338
|
-
data.loc[:,"Embedding"] = reduced_df.loc[:,"Embedding"]
|
6339
|
-
data.loc[:,"Trustworthiness"] = reduced_df.loc[:,"Trustworthiness"]
|
6340
|
-
|
6744
|
+
data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
|
6745
|
+
data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
|
6746
|
+
data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
|
6747
|
+
|
6341
6748
|
return None # No return when inplace=True
|
6342
6749
|
|
6343
|
-
return reduced_df
|
6750
|
+
return reduced_df
|
6751
|
+
|
6752
|
+
|
6344
6753
|
# example:
|
6345
6754
|
# df_reducer(data=data_log, columns=markers, n_components=2)
|
6346
6755
|
|
6756
|
+
|
6347
6757
|
def plot_cluster(
|
6348
6758
|
data: pd.DataFrame,
|
6349
6759
|
labels: np.ndarray,
|
@@ -6368,6 +6778,7 @@ def plot_cluster(
|
|
6368
6778
|
import seaborn as sns
|
6369
6779
|
from sklearn.metrics import silhouette_samples
|
6370
6780
|
import matplotlib.pyplot as plt
|
6781
|
+
|
6371
6782
|
if metrics is None:
|
6372
6783
|
metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
|
6373
6784
|
|
@@ -6597,10 +7008,10 @@ def use_pd(
|
|
6597
7008
|
verbose=True,
|
6598
7009
|
dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
|
6599
7010
|
):
|
6600
|
-
default_settings = fload(dir_json, output=
|
7011
|
+
default_settings = fload(dir_json, output="json")
|
6601
7012
|
valid_kinds = list(default_settings.keys())
|
6602
7013
|
kind = strcmp(func_name, valid_kinds)[0]
|
6603
|
-
usage=default_settings[kind]
|
7014
|
+
usage = default_settings[kind]
|
6604
7015
|
if verbose:
|
6605
7016
|
for i, i_ in enumerate(ssplit(usage, by=",")):
|
6606
7017
|
i_ = i_.replace("=", "\t= ") + ","
|