py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/batman.py +32 -1
- py2ls/bio.py +3 -17
- py2ls/data/usages_sns.json +2 -1
- py2ls/ips.py +1694 -838
- py2ls/ml2ls.py +1877 -391
- py2ls/plot.py +500 -222
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/RECORD +10 -10
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/WHEEL +1 -1
py2ls/ips.py
CHANGED
````diff
@@ -1,55 +1,8 @@
 import numpy as np
 import pandas as pd
-
-import
-import matplotlib
-import matplotlib.pyplot as plt
-import matplotlib.ticker as tck
-from cycler import cycler
-from mpl_toolkits.mplot3d import Axes3D
-import seaborn as sns
-
-from sklearn.kernel_approximation import KERNEL_PARAMS
-from sympy import is_increasing
-import sys, os, shutil, re, yaml, json, subprocess
-import importlib.util
-import time
-from dateutil import parser
-from datetime import datetime
-import schedule
-
-from PIL import Image, ImageEnhance, ImageOps, ImageFilter
-from rembg import remove, new_session
-
-import docx
-from fpdf import FPDF
-from lxml import etree
-from docx import Document
-from PyPDF2 import PdfReader
-from pptx import Presentation
-from pptx.util import Inches
-from pdf2image import convert_from_path, pdfinfo_from_path
-from nltk.tokenize import sent_tokenize, word_tokenize
-import nltk  # nltk.download("punkt")
-from docx2pdf import convert
-import img2pdf as image2pdf
-import nbformat
-from nbconvert import MarkdownExporter
-
-from itertools import pairwise
-from box import Box, BoxList
-from numerizer import numerize
-from tqdm import tqdm
-import mimetypes
-from pprint import pp
-from collections import Counter
-from fuzzywuzzy import fuzz, process
-from langdetect import detect
-from duckduckgo_search import DDGS
+import sys, os
+from IPython.display import display
 from typing import List, Optional, Union
-from bs4 import BeautifulSoup
-
-from . import netfinder
 
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
@@ -57,6 +10,35 @@ try:
 except NameError:
     pass
 
+import warnings
+
+warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
+warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+
+
+def run_once_within(duration=60):  # default 60s
+    import time
+
+    """
+    usage:
+    if run_once_within():
+        print("This code runs once per minute.")
+    else:
+        print("The code has already been run in the last minute.")
+    """
+    if not hasattr(run_once_within, "time_last"):
+        run_once_within.time_last = None
+    time_curr = time.time()
+
+    if (run_once_within.time_last is None) or (
+        time_curr - run_once_within.time_last >= duration
+    ):
+        run_once_within.time_last = time_curr  # Update the last execution time
+        return True
+    else:
+        return False
+
+
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     Add the Chinese (default) font to the font manager
````
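Usage note (not part of the diff): the new `run_once_within()` helper added at the top of `ips.py` gates a block so it runs at most once per `duration` seconds, following its own docstring. A minimal sketch; the `from py2ls import ips` import path is an assumption, not something stated in this diff:

```python
from py2ls import ips  # assumed import path

for _ in range(3):
    # Runs the guarded block at most once per 60 seconds.
    if ips.run_once_within(duration=60):
        print("This code runs once per minute.")
    else:
        print("The code has already been run in the last minute.")
```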
````diff
@@ -66,13 +48,14 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
-
+
+    slashtype = "/" if "mac" in get_os() else "\\"
     if slashtype in dir_font:
         font_manager.fontManager.addfont(dir_font)
         fontname = os.path.basename(dir_font).split(".")[0]
     else:
         if "cn" in dir_font.lower() or "ch" in dir_font.lower():
-            fontname = "Hiragino Sans GB"
+            fontname = "Hiragino Sans GB"  # default Chinese font
         else:
             fontname = dir_font
 
@@ -86,6 +69,7 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     plt.rcParams["font.sans-serif"] = ["Arial"]
     return fontname
 
+
 # set 'dir_save'
 if "dar" in sys.platform:
     dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
@@ -155,6 +139,9 @@ def run_every(when: str = None, job=None, wait: int = 60):
     :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
     :param job: The function to be scheduled.
     """
+    import schedule
+    import time
+
     if job is None:
         print("No job provided!")
         return
@@ -200,6 +187,9 @@ def run_at(when: str, job=None, wait: int = 60):
     :param job: The function to be scheduled.
     :param wait: The sleep interval between checks in seconds.
     """
+    from datetime import datetime
+    import time
+
     if job is None:
         print("No job provided!")
         return
@@ -279,11 +269,13 @@ def get_timezone(timezone: str | list = None):
 
 
 def is_package_installed(package_name):
     """Check if a package is installed."""
+    import importlib.util
+
     package_spec = importlib.util.find_spec(package_name)
     return package_spec is not None
 
 
-def upgrade(module="py2ls",uninstall=False):
+def upgrade(module="py2ls", uninstall=False):
     """
     Installs or upgrades a specified Python module.
 
@@ -291,6 +283,8 @@ def upgrade(module="py2ls",uninstall=False):
         module (str): The name of the module to install/upgrade.
         uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
     """
+    import subprocess
+
     if not is_package_installed(module):
         try:
             subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +321,8 @@ def get_version(pkg):
 
 
 def rm_folder(folder_path, verbose=True):
+    import shutil
+
     try:
         shutil.rmtree(folder_path)
         if verbose:
@@ -345,6 +341,8 @@ def fremove(path, verbose=True):
     """
     try:
         if os.path.isdir(path):
+            import shutil
+
             shutil.rmtree(path)
             if verbose:
                 print(f"Successfully deleted folder {path}")
@@ -360,22 +358,31 @@ def fremove(path, verbose=True):
         print(f"Failed to delete {path}. Reason: {e}")
 
 
-def get_cwd(verbose: bool = True):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# def get_cwd(verbose: bool = True):
+#     """
+#     get_cwd: to get the current working directory
+#     Args:
+#         verbose (bool, optional): to show which function is use. Defaults to True.
+#     """
+#     try:
+#         script_dir = os.path.dirname(os.path.abspath(__file__))
+#         if verbose:
+#             print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
+#     except NameError:
+#         # This works in an interactive environment (like a Jupyter notebook)
+#         script_dir = os.getcwd()
+#         if verbose:
+#             print("os.getcwd():", script_dir)
+#     return script_dir
+
+
+def get_cwd():
+    from pathlib import Path
+
+    # Get the current script's directory as a Path object
+    current_directory = Path(__file__).resolve().parent
+
+    return current_directory
 
 
 def search(
@@ -388,6 +395,7 @@ def search(
     dir_save=dir_save,
     **kwargs,
 ):
+    from duckduckgo_search import DDGS
 
     if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
@@ -421,6 +429,7 @@ def echo(*args, **kwargs):
         str: the answer from ai
     """
     global dir_save
+    from duckduckgo_search import DDGS
 
     query = None
     model = kwargs.get("model", "gpt")
@@ -469,8 +478,13 @@ def echo(*args, **kwargs):
     model_valid = valid_mod_name(model)
     res = DDGS().chat(query, model=model_valid)
     if verbose:
+        from pprint import pp
+
         pp(res)
     if log:
+        from datetime import datetime
+        import time
+
         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
         if bool(os.path.basename(dir_save)):
@@ -492,6 +506,8 @@ def ai(*args, **kwargs):
 
 
 def detect_lang(text, output="lang", verbose=True):
+    from langdetect import detect
+
     dir_curr_script = os.path.dirname(os.path.abspath(__file__))
     dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -521,13 +537,14 @@ def is_text(s):
 
 from typing import Any, Union
 
+
 def shared(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
         list1 = [1, 2, 3, 4, 5]
         list2 = [4, 5, 6, 7, 8]
-        list3 = [5, 6, 9, 10]
+        list3 = [5, 6, 9, 10]
         a = shared(list1, list2,list3)
     """
     if verbose:
@@ -543,25 +560,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         print(f"{' ' * 2}All inputs must be lists.")
         return []
     first_list = flattened_lists[0]
-    shared_elements = [
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
     if strict:
-
-
-
-
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
     else:
+        from collections import Counter
+
         all_elements = [item for sublist in flattened_lists for item in sublist]
         element_count = Counter(all_elements)
         # Get elements that appear in at least n_shared lists
-        shared_elements = [
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
 
     shared_elements = flatten(shared_elements, verbose=verbose)
     if verbose:
-        elements2show =
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+
 def not_shared(*args, strict=True, n_shared=2, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
@@ -571,9 +597,9 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
         not_shared(list1,list2)# output [1,3]
     """
     _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
-    list1 = args[0]
-    _not_shared=[item for item in list1 if item not in _common]
-    return
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
 
 
 def flatten(nested: Any, unique_list=True, verbose=False):
````
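Usage note (not part of the diff): `shared()` now builds the strict intersection with `set.intersection_update`, and `not_shared()` flattens its first argument and actually returns the difference (the 0.2.4.7 version returned nothing). A hedged sketch based on the docstring examples above; the import path is an assumption:

```python
from py2ls import ips  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]

common = ips.shared(list1, list2)         # strict mode: items present in every list (4 and 5)
only_in_1 = ips.not_shared(list1, list2)  # items of list1 that are not shared (1, 2, 3)
flat = ips.flatten({"a": [1, 2], "b": (3, 4)})  # dicts/tuples/sets/pd.Index/np arrays flatten to a list
```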
````diff
@@ -582,29 +608,41 @@ def flatten(nested: Any, unique_list=True, verbose=False):
     Parameters:
         nested : Any, Can be a list, tuple, dictionary, or set.
     Returns: list, A flattened list.
-    """
+    """
     flattened_list = []
     stack = [nested]
     while stack:
         current = stack.pop()
         if isinstance(current, dict):
-            stack.extend(current.values())
+            stack.extend(current.values())
         elif isinstance(current, (list, tuple, set)):
             stack.extend(current)
         elif isinstance(current, pd.Series):
             stack.extend(current)
-        elif isinstance(
+        elif isinstance(
+            current, (pd.Index, np.ndarray)
+        ):  # df.columns df.index are object of type pd.Index
             stack.extend(current.tolist())
         else:
             flattened_list.append(current)
     if verbose:
-        print(
+        print(
+            f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>"
+        )
     if unique_list:
         return unique(flattened_list)[::-1]
     else:
         return flattened_list
-
-
+
+
+def strcmp(
+    search_term,
+    candidates,
+    ignore_case=True,
+    get_rank=False,
+    verbose=False,
+    scorer="WR",
+):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
 
@@ -617,6 +655,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     Returns:
     tuple: A tuple containing the best match and its index in the candidates list.
     """
+    from fuzzywuzzy import fuzz, process
 
     def to_lower(s, ignore_case=True):
         # Converts a string or list of strings to lowercase if ignore_case is True.
@@ -624,7 +663,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
-            s=[str(i) for i in s]# convert all to str
+            s = [str(i) for i in s]  # convert all to str
             return [elem.lower() for elem in s]
         return s
 
@@ -634,12 +673,15 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower() or "stri" in scorer.lower()
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():  # Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
     if get_rank:
-        idx = [
+        idx = [
+            similarity_scores.index(i)
+            for i in sorted(similarity_scores, reverse=True)
+        ]
         if verbose:
             display([candidates[ii] for ii in idx])
         return [candidates[ii] for ii in idx]
@@ -667,6 +709,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     # str2 = ['PLoS Computational Biology', 'PLOS BIOLOGY']
     # best_match, idx = strcmp(str1, str2, ignore_case=1)
 
+
 def cn2pinyin(
     cn_str: Union[str, list] = None,
     sep: str = " ",
````
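Usage note (not part of the diff): `strcmp()` gained an explicit multi-line signature with a `scorer` argument ("WR" for `fuzz.WRatio`, "partial" for `fuzz.partial_ratio`, "ratio"/"stri" for the strict `fuzz.ratio`). A sketch of the assumed call pattern, reusing the candidates from the commented example in the hunk above:

```python
from py2ls import ips  # assumed import path

candidates = ["PLoS Computational Biology", "PLOS BIOLOGY"]
best_match, idx = ips.strcmp("PLoS Comp Biol", candidates, ignore_case=True, scorer="WR")
ranking = ips.strcmp("PLoS Comp Biol", candidates, get_rank=True)  # candidates sorted by score
```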
````diff
@@ -731,18 +774,21 @@ def cn2pinyin(
         style = Style.PL
     else:
         style = Style.NORMAL
-    if not isinstance(cn_str,list):
-        cn_str=[cn_str]
-    pinyin_flat=[]
+    if not isinstance(cn_str, list):
+        cn_str = [cn_str]
+    pinyin_flat = []
     for cn_str_ in cn_str:
         pinyin_string = pinyin(cn_str_, style=style)
         pinyin_flat.append(sep.join([item[0] for item in pinyin_string]))
-    if len(pinyin_flat)==1:
+    if len(pinyin_flat) == 1:
         return pinyin_flat[0]
     else:
         return pinyin_flat
 
+
 def counter(list_, verbose=True):
+    from collections import Counter
+
     c = Counter(list_)
     # Print the name counts
     for item, count in c.items():
@@ -771,6 +817,7 @@ def str2time(time_str, fmt="24"):
         %p represents AM or PM.
     - str: The converted time string.
     """
+    from datetime import datetime
 
     def time_len_corr(time_str):
         time_str_ = (
@@ -832,6 +879,8 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
     Returns:
     - str: The converted date string.
     """
+    from dateutil import parser
+
     try:
         date_obj = parser.parse(date_str)
     except ValueError as e:
@@ -848,6 +897,8 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
 
 
 def str2num(s, *args, **kwargs):
+    import re
+
     delimiter = kwargs.get("sep", None)
     round_digits = kwargs.get("round", None)
     if delimiter is not None:
@@ -863,6 +914,8 @@ def str2num(s, *args, **kwargs):
     try:
         num = float(s)
     except ValueError:
+        from numerizer import numerize
+
         try:
             numerized = numerize(s)
             num = int(numerized) if "." not in numerized else float(numerized)
@@ -1030,7 +1083,7 @@ def px2inch(*px, dpi=300) -> list:
     return [i / dpi for i in px]
 
 
-def
+def inch2cm(*cm) -> list:
     """
     cm2inch: converts centimeter measurements to inches.
     Usage:
@@ -1051,24 +1104,30 @@ def cm2inch(*cm) -> list:
 def inch2px(*inch, dpi=300) -> list:
     """
     inch2px: converts inch measurements to pixels based on the given dpi.
+
     Usage:
     inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
+
+    Parameters:
+    inch : float, list, or tuple
+        Single or multiple measurements in inches to convert to pixels.
+    dpi : int, optional (default=300)
+        Dots per inch (DPI), representing the pixel density.
+
     Returns:
-    list: in pixels
+    list: Converted measurements in pixels.
     """
-    # Case 1: When the user passes a single argument that is a list or tuple,
+    # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
     if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
-        # If the input is a single list or tuple, we unpack its elements and convert each to pixels
         return [i * dpi for i in inch[0]]
-
+
+    # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
     else:
-        # Here, we convert each individual argument directly to pixels
         return [i * dpi for i in inch]
 
 
-def
+def cm2inch(*inch) -> list:
     """
-    inch2cm: converts inch measurements to centimeters.
     Usage:
     inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
     Returns:
````
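Usage note (not part of the diff): the DPI-based converters touched in this hunk are plain multiplications and divisions, so the results can be checked by hand; the import path is an assumption:

```python
from py2ls import ips  # assumed import path

ips.inch2px(1, 2, dpi=300)      # -> [300, 600]
ips.inch2px([1, 2], dpi=300)    # -> [300, 600]  (a single list/tuple is unpacked)
ips.px2inch(300, 600, dpi=300)  # -> [1.0, 2.0]
```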
````diff
@@ -1183,6 +1242,8 @@ def paper_size(paper_type_str="a4"):
 
 
 def docx2pdf(dir_docx, dir_pdf=None):
+    from docx2pdf import convert
+
     if dir_pdf:
         convert(dir_docx, dir_pdf)
     else:
@@ -1190,6 +1251,8 @@ def docx2pdf(dir_docx, dir_pdf=None):
 
 
 def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+    import img2pdf as image2pdf
+
     def mm_to_point(size):
         return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))
 
@@ -1241,6 +1304,10 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
 
 
 def pdf2ppt(dir_pdf, dir_ppt):
+    from PyPDF2 import PdfReader
+    from pptx.util import Inches
+    from pptx import Presentation
+
     prs = Presentation()
 
     # Open the PDF file
@@ -1269,6 +1336,8 @@ def pdf2ppt(dir_pdf, dir_ppt):
 
 
 def ssplit(text, by="space", verbose=False, strict=False, **kws):
+    import re
+
     if isinstance(text, list):
         nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
         flat_list = [item for sublist in nested_list for item in sublist]
@@ -1316,6 +1385,9 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
         return [text[i : i + length] for i in range(0, len(text), length)]
 
     def split_by_sent_num(text, n=10):
+        from nltk.tokenize import sent_tokenize
+        from itertools import pairwise
+
         # split text into sentences
         text_split_by_sent = sent_tokenize(text)
         cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1388,10 +1460,14 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
             print(f"splited by camel_case")
         return split_by_camel_case(text)
     elif ("word" in by) and not strict:
+        from nltk.tokenize import word_tokenize
+
         if verbose:
             print(f"splited by word")
         return word_tokenize(text)
     elif ("sen" in by and not "num" in by) and not strict:
+        from nltk.tokenize import sent_tokenize
+
         if verbose:
             print(f"splited by sentence")
         return sent_tokenize(text)
@@ -1441,9 +1517,13 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
 
 
 def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
+    from pdf2image import convert_from_path, pdfinfo_from_path
+
     df_dir_img_single_page = pd.DataFrame()
     dir_single_page = []
     if verbose:
+        from pprint import pp
+
         pp(pdfinfo_from_path(dir_pdf))
     if isinstance(page, tuple) and page:
         page = list(page)
@@ -1562,6 +1642,8 @@ def unzip(dir_path, output_dir=None):
     # If the output directory already exists, remove it and replace it
     if os.path.exists(output_dir):
         if os.path.isdir(output_dir):  # check if it is a folder
+            import shutil
+
             shutil.rmtree(output_dir)  # remove folder
         else:
             os.remove(output_dir)  # remove file
@@ -1579,6 +1661,8 @@ def unzip(dir_path, output_dir=None):
 
         output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
         try:
+            import shutil
+
             with gzip.open(dir_path, "rb") as gz_file:
                 with open(output_file, "wb") as out_file:
                     shutil.copyfileobj(gz_file, out_file)
@@ -1586,11 +1670,14 @@ def unzip(dir_path, output_dir=None):
         except FileNotFoundError:
             print(f"Error: The file '{dir_path}' was not found.")
         except PermissionError:
-            print(
+            print(
+                f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'."
+            )
         except Exception as e:
             try:
                 import tarfile
-
+
+                with tarfile.open(dir_path, "r:gz") as tar:
                     tar.extractall(path=output_file)
             except Exception as final_e:
                 print(f"An final unexpected error occurred: {final_e}")
@@ -1676,11 +1763,13 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
 
     """
     if not isinstance(df, pd.DataFrame):
+        if verbose:
+            print("not pd.DataFrame")
         return False
-    df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
+    df.columns = df.columns.astype(str)  # 把它变成str, 这样就可以进行counts运算了
     # Initialize a list to hold messages about abnormalities
     messages = []
-    is_abnormal =
+    is_abnormal = False
     # Check the shape of the DataFrame
     actual_shape = df.shape
     messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1705,25 +1794,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
             is_abnormal = True
             if verbose:
                 print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
-
+    if verbose:
+        print("1", is_abnormal)
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
-
+    if verbose:
+        print("2", is_abnormal)
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
         if verbose:
             print(f'delimiter_counts[""] > 3')
-
+    if verbose:
+        print("3", is_abnormal)
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
-
+    if verbose:
+        print("4", is_abnormal)
     # # Check for missing values
     # missing_values = df.isnull().sum()
     # if missing_values.any():
@@ -1742,8 +1835,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
         if verbose:
-            print(f
-
+            print(f"df.columns[df.nunique() == 1].tolist()")
+    if verbose:
+        print("5", is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1751,8 +1845,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         )
         is_abnormal = True
         if verbose:
-            print(f
-
+            print(f"actual_shape[0] < 2 or actual_shape[1] < 2")
+    if verbose:
+        print("6", is_abnormal)
     # Compile results
     if verbose:
         print("\n".join(messages))
````
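Usage note (not part of the diff): `is_df_abnormal()` is the sanity check that `fload()` uses below to decide whether a parsed CSV looks broken (a single tab-packed column, too many delimiters in headers, fewer than two rows or columns, and so on). A small, assumed illustration:

```python
import pandas as pd
from py2ls import ips  # assumed import path

ok = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
ips.is_df_abnormal(ok, verbose=True)             # expected: False for a well-formed frame

packed = pd.DataFrame({"x\ty\tz": ["1\t2\t3"]})  # one tab-packed column
ips.is_df_abnormal(packed, verbose=True)         # expected: True (flagged as abnormal)
```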
@@ -1770,6 +1865,26 @@ def fload(fpath, kind=None, **kwargs):
|
|
1770
1865
|
content: The content loaded from the file.
|
1771
1866
|
"""
|
1772
1867
|
|
1868
|
+
def read_mplstyle(style_file):
|
1869
|
+
import matplotlib.pyplot as plt
|
1870
|
+
|
1871
|
+
# Load the style file
|
1872
|
+
plt.style.use(style_file)
|
1873
|
+
|
1874
|
+
# Get the current style properties
|
1875
|
+
style_dict = plt.rcParams
|
1876
|
+
|
1877
|
+
# Convert to dictionary
|
1878
|
+
style_dict = dict(style_dict)
|
1879
|
+
# Print the style dictionary
|
1880
|
+
for i, j in style_dict.items():
|
1881
|
+
print(f"\n{i}::::{j}")
|
1882
|
+
return style_dict
|
1883
|
+
|
1884
|
+
# #example usage:
|
1885
|
+
# style_file = "/ std-colors.mplstyle"
|
1886
|
+
# style_dict = read_mplstyle(style_file)
|
1887
|
+
|
1773
1888
|
def load_txt_md(fpath):
|
1774
1889
|
with open(fpath, "r") as file:
|
1775
1890
|
content = file.read()
|
@@ -1779,25 +1894,30 @@ def fload(fpath, kind=None, **kwargs):
|
|
1779
1894
|
# with open(fpath, "r") as file:
|
1780
1895
|
# content = file.read()
|
1781
1896
|
# return content
|
1782
|
-
def load_html(fpath
|
1783
|
-
return pd.read_html(fpath
|
1897
|
+
def load_html(fpath, **kwargs):
|
1898
|
+
return pd.read_html(fpath, **kwargs)
|
1784
1899
|
|
1785
1900
|
def load_json(fpath, **kwargs):
|
1786
|
-
output=kwargs.pop("output","json")
|
1787
|
-
if output==
|
1901
|
+
output = kwargs.pop("output", "json")
|
1902
|
+
if output == "json":
|
1903
|
+
import json
|
1904
|
+
|
1788
1905
|
with open(fpath, "r") as file:
|
1789
1906
|
content = json.load(file)
|
1790
1907
|
return content
|
1791
1908
|
else:
|
1792
|
-
return pd.read_json(fpath
|
1909
|
+
return pd.read_json(fpath, **kwargs)
|
1793
1910
|
|
1794
1911
|
def load_yaml(fpath):
|
1912
|
+
import yaml
|
1913
|
+
|
1795
1914
|
with open(fpath, "r") as file:
|
1796
1915
|
content = yaml.safe_load(file)
|
1797
1916
|
return content
|
1798
1917
|
|
1799
|
-
|
1800
1918
|
def load_xml(fpath, fsize_thr: int = 100):
|
1919
|
+
from lxml import etree
|
1920
|
+
|
1801
1921
|
def load_small_xml(fpath):
|
1802
1922
|
tree = etree.parse(fpath)
|
1803
1923
|
root = tree.getroot()
|
@@ -1857,6 +1977,15 @@ def fload(fpath, kind=None, **kwargs):
|
|
1857
1977
|
return char
|
1858
1978
|
return None
|
1859
1979
|
|
1980
|
+
def _get_chunks(df_fake):
|
1981
|
+
"""
|
1982
|
+
helper func for 'load_csv'
|
1983
|
+
"""
|
1984
|
+
chunks = []
|
1985
|
+
for chunk in df_fake:
|
1986
|
+
chunks.append(chunk)
|
1987
|
+
return pd.concat(chunks, ignore_index=True)
|
1988
|
+
|
1860
1989
|
def load_csv(fpath, **kwargs):
|
1861
1990
|
from pandas.errors import EmptyDataError
|
1862
1991
|
|
@@ -1868,12 +1997,17 @@ def fload(fpath, kind=None, **kwargs):
|
|
1868
1997
|
encoding = kwargs.pop("encoding", "utf-8")
|
1869
1998
|
on_bad_lines = kwargs.pop("on_bad_lines", "skip")
|
1870
1999
|
comment = kwargs.pop("comment", None)
|
1871
|
-
fmt=kwargs.pop("fmt",False)
|
1872
|
-
|
1873
|
-
if
|
2000
|
+
fmt = kwargs.pop("fmt", False)
|
2001
|
+
chunksize = kwargs.pop("chunksize", None)
|
2002
|
+
engine = "c" if chunksize else engine # when chunksize, recommend 'c'
|
2003
|
+
low_memory = kwargs.pop("low_memory", True)
|
2004
|
+
low_memory = (
|
2005
|
+
False if chunksize else True
|
2006
|
+
) # when chunksize, recommend low_memory=False
|
2007
|
+
verbose = kwargs.pop("verbose", False)
|
2008
|
+
if run_once_within():
|
1874
2009
|
use_pd("read_csv", verbose=verbose)
|
1875
|
-
|
1876
|
-
|
2010
|
+
|
1877
2011
|
if comment is None:
|
1878
2012
|
comment = get_comment(
|
1879
2013
|
fpath, comment=None, encoding="utf-8", lines_to_check=5
|
@@ -1890,14 +2024,19 @@ def fload(fpath, kind=None, **kwargs):
|
|
1890
2024
|
skipinitialspace=skipinitialspace,
|
1891
2025
|
sep=sep,
|
1892
2026
|
on_bad_lines=on_bad_lines,
|
2027
|
+
chunksize=chunksize,
|
2028
|
+
low_memory=low_memory,
|
1893
2029
|
**kwargs,
|
1894
2030
|
)
|
1895
|
-
if
|
2031
|
+
if chunksize:
|
2032
|
+
df = _get_chunks(df)
|
2033
|
+
print(df.shape)
|
2034
|
+
if is_df_abnormal(df, verbose=0): # raise error
|
1896
2035
|
raise ValueError("the df is abnormal")
|
1897
2036
|
except:
|
1898
2037
|
try:
|
1899
2038
|
try:
|
1900
|
-
if engine == "pyarrow":
|
2039
|
+
if engine == "pyarrow" and not chunksize:
|
1901
2040
|
df = pd.read_csv(
|
1902
2041
|
fpath,
|
1903
2042
|
engine=engine,
|
@@ -1906,6 +2045,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
1906
2045
|
sep=sep,
|
1907
2046
|
on_bad_lines=on_bad_lines,
|
1908
2047
|
comment=comment,
|
2048
|
+
low_memory=low_memory,
|
1909
2049
|
**kwargs,
|
1910
2050
|
)
|
1911
2051
|
else:
|
@@ -1919,14 +2059,19 @@ def fload(fpath, kind=None, **kwargs):
|
|
1919
2059
|
skipinitialspace=skipinitialspace,
|
1920
2060
|
on_bad_lines=on_bad_lines,
|
1921
2061
|
comment=comment,
|
2062
|
+
chunksize=chunksize,
|
2063
|
+
low_memory=low_memory,
|
1922
2064
|
**kwargs,
|
1923
2065
|
)
|
2066
|
+
if chunksize:
|
2067
|
+
df = _get_chunks(df)
|
2068
|
+
print(df.shape)
|
1924
2069
|
if is_df_abnormal(df, verbose=0):
|
1925
2070
|
raise ValueError("the df is abnormal")
|
1926
2071
|
except (UnicodeDecodeError, ValueError):
|
1927
2072
|
encoding = get_encoding(fpath)
|
1928
2073
|
# print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
|
1929
|
-
if engine == "pyarrow":
|
2074
|
+
if engine == "pyarrow" and not chunksize:
|
1930
2075
|
df = pd.read_csv(
|
1931
2076
|
fpath,
|
1932
2077
|
engine=engine,
|
@@ -1935,6 +2080,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
1935
2080
|
sep=sep,
|
1936
2081
|
on_bad_lines=on_bad_lines,
|
1937
2082
|
comment=comment,
|
2083
|
+
low_memory=low_memory,
|
1938
2084
|
**kwargs,
|
1939
2085
|
)
|
1940
2086
|
else:
|
@@ -1948,8 +2094,13 @@ def fload(fpath, kind=None, **kwargs):
|
|
1948
2094
|
skipinitialspace=skipinitialspace,
|
1949
2095
|
on_bad_lines=on_bad_lines,
|
1950
2096
|
comment=comment,
|
2097
|
+
chunksize=chunksize,
|
2098
|
+
low_memory=low_memory,
|
1951
2099
|
**kwargs,
|
1952
2100
|
)
|
2101
|
+
if chunksize:
|
2102
|
+
df = _get_chunks(df)
|
2103
|
+
print(df.shape)
|
1953
2104
|
if is_df_abnormal(df, verbose=0):
|
1954
2105
|
raise ValueError("the df is abnormal")
|
1955
2106
|
except Exception as e:
|
@@ -1966,8 +2117,13 @@ def fload(fpath, kind=None, **kwargs):
|
|
1966
2117
|
sep=sep,
|
1967
2118
|
on_bad_lines=on_bad_lines,
|
1968
2119
|
comment=comment,
|
2120
|
+
chunksize=chunksize,
|
2121
|
+
low_memory=low_memory,
|
1969
2122
|
**kwargs,
|
1970
2123
|
)
|
2124
|
+
if chunksize:
|
2125
|
+
df = _get_chunks(df)
|
2126
|
+
print(df.shape)
|
1971
2127
|
if not is_df_abnormal(df, verbose=0): # normal
|
1972
2128
|
display(df.head(2))
|
1973
2129
|
print(f"shape: {df.shape}")
|
@@ -1975,51 +2131,64 @@ def fload(fpath, kind=None, **kwargs):
|
|
1975
2131
|
except:
|
1976
2132
|
pass
|
1977
2133
|
else:
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
1982
|
-
|
1983
|
-
|
1984
|
-
|
1985
|
-
|
1986
|
-
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1997
|
-
display(df.head(2))
|
1998
|
-
print(f"
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
|
2003
|
-
|
2134
|
+
if not chunksize:
|
2135
|
+
engines = [None, "c", "python"]
|
2136
|
+
for engine in engines:
|
2137
|
+
separators = [",", "\t", ";", "|", " "]
|
2138
|
+
for sep in separators:
|
2139
|
+
try:
|
2140
|
+
# sep2show = sep if sep != "\t" else "\\t"
|
2141
|
+
# print(f"trying with: engine={engine}, sep='{sep2show}'")
|
2142
|
+
# print(".")
|
2143
|
+
df = pd.read_csv(
|
2144
|
+
fpath,
|
2145
|
+
engine=engine,
|
2146
|
+
sep=sep,
|
2147
|
+
on_bad_lines=on_bad_lines,
|
2148
|
+
comment=comment,
|
2149
|
+
chunksize=chunksize,
|
2150
|
+
low_memory=low_memory,
|
2151
|
+
**kwargs,
|
2152
|
+
)
|
2153
|
+
# display(df.head(2))
|
2154
|
+
# print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
|
2155
|
+
if chunksize:
|
2156
|
+
df = _get_chunks(df)
|
2157
|
+
print(df.shape)
|
2158
|
+
if not is_df_abnormal(df, verbose=0):
|
2159
|
+
(
|
2160
|
+
display(df.head(2))
|
2161
|
+
if isinstance(df, pd.DataFrame)
|
2162
|
+
else display("it is not a DataFrame")
|
2163
|
+
)
|
2164
|
+
(
|
2165
|
+
print(f"shape: {df.shape}")
|
2166
|
+
if isinstance(df, pd.DataFrame)
|
2167
|
+
else display("it is not a DataFrame")
|
2168
|
+
)
|
2169
|
+
return df
|
2170
|
+
except EmptyDataError as e:
|
2171
|
+
continue
|
2172
|
+
else:
|
2173
|
+
pass
|
2004
2174
|
display(df.head(2))
|
2005
2175
|
print(f"shape: {df.shape}")
|
2006
2176
|
return df
|
2007
2177
|
|
2008
2178
|
def load_excel(fpath, **kwargs):
|
2009
2179
|
engine = kwargs.get("engine", "openpyxl")
|
2010
|
-
verbose=kwargs.pop("verbose",False)
|
2011
|
-
if
|
2180
|
+
verbose = kwargs.pop("verbose", False)
|
2181
|
+
if run_once_within():
|
2012
2182
|
use_pd("read_excel", verbose=verbose)
|
2013
2183
|
df = pd.read_excel(fpath, engine=engine, **kwargs)
|
2014
2184
|
try:
|
2015
|
-
meata=pd.ExcelFile(fpath)
|
2185
|
+
meata = pd.ExcelFile(fpath)
|
2016
2186
|
print(f"n_sheet={len(meata.sheet_names)},\t'sheetname = 0 (default)':")
|
2017
|
-
[print(f"{i}:\t{i_}") for i,i_ in enumerate(meata.sheet_names)]
|
2187
|
+
[print(f"{i}:\t{i_}") for i, i_ in enumerate(meata.sheet_names)]
|
2018
2188
|
except:
|
2019
2189
|
pass
|
2020
2190
|
return df
|
2021
2191
|
|
2022
|
-
|
2023
2192
|
def load_parquet(fpath, **kwargs):
|
2024
2193
|
"""
|
2025
2194
|
Load a Parquet file into a Pandas DataFrame with advanced options.
|
@@ -2035,16 +2204,16 @@ def fload(fpath, kind=None, **kwargs):
|
|
2035
2204
|
Returns:
|
2036
2205
|
- df (DataFrame): The loaded DataFrame.
|
2037
2206
|
"""
|
2038
|
-
|
2207
|
+
|
2039
2208
|
engine = kwargs.get("engine", "pyarrow")
|
2040
2209
|
verbose = kwargs.pop("verbose", False)
|
2041
|
-
|
2042
|
-
if
|
2210
|
+
|
2211
|
+
if run_once_within():
|
2043
2212
|
use_pd("read_parquet", verbose=verbose)
|
2044
2213
|
try:
|
2045
2214
|
df = pd.read_parquet(fpath, engine=engine, **kwargs)
|
2046
2215
|
if verbose:
|
2047
|
-
if
|
2216
|
+
if "columns" in kwargs:
|
2048
2217
|
print(f"Loaded columns: {kwargs['columns']}")
|
2049
2218
|
else:
|
2050
2219
|
print("Loaded all columns.")
|
@@ -2053,9 +2222,12 @@ def fload(fpath, kind=None, **kwargs):
|
|
2053
2222
|
print(f"An error occurred while loading the Parquet file: {e}")
|
2054
2223
|
df = None
|
2055
2224
|
|
2056
|
-
return df
|
2225
|
+
return df
|
2057
2226
|
|
2058
2227
|
def load_ipynb(fpath, **kwargs):
|
2228
|
+
import nbformat
|
2229
|
+
from nbconvert import MarkdownExporter
|
2230
|
+
|
2059
2231
|
as_version = kwargs.get("as_version", 4)
|
2060
2232
|
with open(fpath, "r") as file:
|
2061
2233
|
nb = nbformat.read(file, as_version=as_version)
|
@@ -2085,6 +2257,8 @@ def fload(fpath, kind=None, **kwargs):
|
|
2085
2257
|
If page is an integer, it returns the text of the specified page number.
|
2086
2258
|
If the specified page is not found, it returns the string "Page is not found".
|
2087
2259
|
"""
|
2260
|
+
from PyPDF2 import PdfReader
|
2261
|
+
|
2088
2262
|
text_dict = {}
|
2089
2263
|
with open(fpath, "rb") as file:
|
2090
2264
|
pdf_reader = PdfReader(file)
|
@@ -2114,6 +2288,8 @@ def fload(fpath, kind=None, **kwargs):
|
|
2114
2288
|
return text_dict.get(int(page), "Page is not found")
|
2115
2289
|
|
2116
2290
|
def load_docx(fpath):
|
2291
|
+
from docx import Document
|
2292
|
+
|
2117
2293
|
doc = Document(fpath)
|
2118
2294
|
content = [para.text for para in doc.paragraphs]
|
2119
2295
|
return content
|
@@ -2123,21 +2299,55 @@ def fload(fpath, kind=None, **kwargs):
|
|
2123
2299
|
kind = kind.lower()
|
2124
2300
|
kind = kind.lstrip(".").lower()
|
2125
2301
|
img_types = [
|
2126
|
-
"bmp",
|
2127
|
-
"
|
2302
|
+
"bmp",
|
2303
|
+
"eps",
|
2304
|
+
"gif",
|
2305
|
+
"png",
|
2306
|
+
"jpg",
|
2307
|
+
"jpeg",
|
2308
|
+
"jpeg2000",
|
2309
|
+
"tiff",
|
2310
|
+
"tif",
|
2311
|
+
"icns",
|
2312
|
+
"ico",
|
2313
|
+
"im",
|
2314
|
+
"msp",
|
2315
|
+
"pcx",
|
2316
|
+
"ppm",
|
2317
|
+
"sgi",
|
2318
|
+
"spider",
|
2319
|
+
"tga",
|
2320
|
+
"webp",
|
2128
2321
|
]
|
2129
2322
|
doc_types = [
|
2130
|
-
"docx",
|
2131
|
-
"
|
2132
|
-
"
|
2133
|
-
"
|
2323
|
+
"docx",
|
2324
|
+
"pdf",
|
2325
|
+
"txt",
|
2326
|
+
"csv",
|
2327
|
+
"xlsx",
|
2328
|
+
"tsv",
|
2329
|
+
"parquet",
|
2330
|
+
"snappy",
|
2331
|
+
"md",
|
2332
|
+
"html",
|
2333
|
+
"json",
|
2334
|
+
"yaml",
|
2335
|
+
"xml",
|
2134
2336
|
"ipynb",
|
2135
|
-
"mtx"
|
2337
|
+
"mtx",
|
2136
2338
|
]
|
2137
2339
|
zip_types = [
|
2138
|
-
"gz",
|
2139
|
-
"
|
2140
|
-
"
|
2340
|
+
"gz",
|
2341
|
+
"zip",
|
2342
|
+
"7z",
|
2343
|
+
"rar",
|
2344
|
+
"tgz",
|
2345
|
+
"tar",
|
2346
|
+
"tar.gz",
|
2347
|
+
"tar.bz2",
|
2348
|
+
"bz2",
|
2349
|
+
"xz",
|
2350
|
+
"gzip",
|
2141
2351
|
]
|
2142
2352
|
other_types = ["fcs"]
|
2143
2353
|
supported_types = [*doc_types, *img_types, *zip_types, *other_types]
|
@@ -2173,9 +2383,17 @@ def fload(fpath, kind=None, **kwargs):
|
|
2173
2383
|
return load_yaml(fpath)
|
2174
2384
|
elif kind == "xml":
|
2175
2385
|
return load_xml(fpath)
|
2176
|
-
elif kind in ["csv","tsv"]:
|
2386
|
+
elif kind in ["csv", "tsv"]:
|
2387
|
+
verbose = kwargs.pop("verbose", False)
|
2388
|
+
if run_once_within():
|
2389
|
+
use_pd("read_csv")
|
2177
2390
|
content = load_csv(fpath, **kwargs)
|
2178
2391
|
return content
|
2392
|
+
elif kind == "pkl":
|
2393
|
+
verbose = kwargs.pop("verbose", False)
|
2394
|
+
if run_once_within():
|
2395
|
+
use_pd("read_pickle")
|
2396
|
+
return pd.read_pickle(fpath, **kwargs)
|
2179
2397
|
elif kind in ["ods", "ods", "odt"]:
|
2180
2398
|
engine = kwargs.get("engine", "odf")
|
2181
2399
|
kwargs.pop("engine", None)
|
@@ -2184,35 +2402,54 @@ def fload(fpath, kind=None, **kwargs):
|
|
2184
2402
|
engine = kwargs.get("engine", "xlrd")
|
2185
2403
|
kwargs.pop("engine", None)
|
2186
2404
|
content = load_excel(fpath, engine=engine, **kwargs)
|
2187
|
-
print(f"shape: {content.shape}")
|
2188
|
-
display(content.head(3))
|
2405
|
+
print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
|
2406
|
+
display(content.head(3)) if isinstance(content, pd.DataFrame) else None
|
2189
2407
|
return content
|
2190
2408
|
elif kind == "xlsx":
|
2191
2409
|
content = load_excel(fpath, **kwargs)
|
2192
|
-
display(content.head(3))
|
2193
|
-
print(f"shape: {content.shape}")
|
2410
|
+
display(content.head(3)) if isinstance(content, pd.DataFrame) else None
|
2411
|
+
print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
|
2194
2412
|
return content
|
2195
|
-
elif kind==
|
2413
|
+
elif kind == "mtx":
|
2196
2414
|
from scipy.io import mmread
|
2197
|
-
|
2198
|
-
|
2199
|
-
|
2415
|
+
|
2416
|
+
dat_mtx = mmread(fpath)
|
2417
|
+
content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
|
2418
|
+
display(content.head(3)) if isinstance(content, pd.DataFrame) else None
|
2200
2419
|
print(f"shape: {content.shape}")
|
2201
2420
|
return content
|
2202
2421
|
elif kind == "ipynb":
|
2203
2422
|
return load_ipynb(fpath, **kwargs)
|
2204
|
-
elif kind in [
|
2205
|
-
|
2423
|
+
elif kind in ["parquet", "snappy"]:
|
2424
|
+
verbose = kwargs.pop("verbose", False)
|
2425
|
+
if run_once_within():
|
2426
|
+
use_pd("read_parquet")
|
2427
|
+
return load_parquet(fpath, **kwargs)
|
2428
|
+
elif kind == "feather":
|
2429
|
+
verbose = kwargs.pop("verbose", False)
|
2430
|
+
if run_once_within():
|
2431
|
+
use_pd("read_feather")
|
2432
|
+
content = pd.read_feather(fpath, **kwargs)
|
2433
|
+
return content
|
2434
|
+
elif kind == "h5":
|
2435
|
+
content = pd.read_hdf(fpath, **kwargs)
|
2436
|
+
return content
|
2437
|
+
elif kind == "pkl":
|
2438
|
+
content = pd.read_pickle(fpath, **kwargs)
|
2439
|
+
return content
|
2206
2440
|
elif kind == "pdf":
|
2207
2441
|
# print('usage:load_pdf(fpath, page="all", verbose=False)')
|
2208
2442
|
return load_pdf(fpath, **kwargs)
|
2209
2443
|
elif kind.lower() in img_types:
|
2210
2444
|
print(f'Image ".{kind}" is loaded.')
|
2211
2445
|
return load_img(fpath)
|
2212
|
-
elif kind=="gz" and fpath.endswith(".soft.gz"):
|
2446
|
+
elif kind == "gz" and fpath.endswith(".soft.gz"):
|
2213
2447
|
import GEOparse
|
2448
|
+
|
2214
2449
|
return GEOparse.get_GEO(filepath=fpath)
|
2215
2450
|
elif kind.lower() in zip_types:
|
2451
|
+
from pprint import pp
|
2452
|
+
|
2216
2453
|
keep = kwargs.get("keep", False)
|
2217
2454
|
fpath_unzip = unzip(fpath)
|
2218
2455
|
if os.path.isdir(fpath_unzip):
|
@@ -2247,6 +2484,9 @@ def fload(fpath, kind=None, **kwargs):
|
|
2247
2484
|
meta, data = fcsparser.parse(fpath, reformat_meta=True)
|
2248
2485
|
return meta, data
|
2249
2486
|
|
2487
|
+
elif kind == "mplstyle":
|
2488
|
+
return read_mplstyle(fpath)
|
2489
|
+
|
2250
2490
|
else:
|
2251
2491
|
print("direct reading...")
|
2252
2492
|
try:
|
@@ -2288,7 +2528,7 @@ def fupdate(fpath, content=None, how="head"):
|
|
2288
2528
|
"""
|
2289
2529
|
Update a file by adding new content at the top and moving the old content to the bottom.
|
2290
2530
|
If the file is a JSON file, merge the new content with the old content.
|
2291
|
-
|
2531
|
+
|
2292
2532
|
Parameters
|
2293
2533
|
----------
|
2294
2534
|
fpath : str
|
@@ -2296,7 +2536,7 @@ def fupdate(fpath, content=None, how="head"):
|
|
2296
2536
|
content : str or dict, optional
|
2297
2537
|
The new content to add at the top of the file (for text) or merge (for JSON).
|
2298
2538
|
If not provided, the function will not add any new content.
|
2299
|
-
|
2539
|
+
|
2300
2540
|
Notes
|
2301
2541
|
-----
|
2302
2542
|
- If the file at `fpath` does not exist, it will be created.
|
@@ -2305,14 +2545,20 @@ def fupdate(fpath, content=None, how="head"):
|
|
2305
2545
|
"""
|
2306
2546
|
content = content or ""
|
2307
2547
|
file_ext = os.path.splitext(fpath)[1]
|
2308
|
-
how_s=["head", "tail","start","end","beginning", "stop",
|
2548
|
+
how_s = ["head", "tail", "start", "end", "beginning", "stop", "last", "before"]
|
2309
2549
|
how = strcmp(how, how_s)[0]
|
2310
2550
|
print(how)
|
2311
|
-
add_where =
|
2551
|
+
add_where = "head" if how in ["head", "start", "beginning", "before"] else "tail"
|
2312
2552
|
if "json" in file_ext.lower():
|
2313
|
-
old_content=fload(fpath,kind=
|
2314
|
-
updated_content =
|
2315
|
-
|
2553
|
+
old_content = fload(fpath, kind="json") if os.path.exists(fpath) else {}
|
2554
|
+
updated_content = (
|
2555
|
+
{**content, **old_content}
|
2556
|
+
if add_where == "head"
|
2557
|
+
else (
|
2558
|
+
{**old_content, **content} if isinstance(content, dict) else old_content
|
2559
|
+
)
|
2560
|
+
)
|
2561
|
+
fsave(fpath, updated_content)
|
2316
2562
|
else:
|
2317
2563
|
# Handle text file
|
2318
2564
|
if os.path.exists(fpath):
|
@@ -2323,7 +2569,7 @@ def fupdate(fpath, content=None, how="head"):
|
|
2323
2569
|
|
2324
2570
|
# Write new content at the top followed by old content
|
2325
2571
|
with open(fpath, "w") as file:
|
2326
|
-
if add_where=="head":
|
2572
|
+
if add_where == "head":
|
2327
2573
|
file.write(content + "\n")
|
2328
2574
|
file.write(old_content)
|
2329
2575
|
else:
|
@@ -2359,6 +2605,9 @@ def filter_kwargs(kws, valid_kwargs):
|
|
2359
2605
|
return kwargs_filtered
|
2360
2606
|
|
2361
2607
|
|
2608
|
+
str_space_speed = 'sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
|
2609
|
+
|
2610
|
+
|
2362
2611
|
def fsave(
|
2363
2612
|
fpath,
|
2364
2613
|
content,
|
@@ -2393,6 +2642,8 @@ def fsave(
|
|
2393
2642
|
fappend(fpath, content=content)
|
2394
2643
|
|
2395
2644
|
def save_docx(fpath, content, font_name, font_size, spacing):
|
2645
|
+
import docx
|
2646
|
+
|
2396
2647
|
if isinstance(content, str):
|
2397
2648
|
content = content.split(". ")
|
2398
2649
|
doc = docx.Document()
|
@@ -2420,6 +2671,8 @@ def fsave(
|
|
2420
2671
|
save_content(fpath, html_content, mode)
|
2421
2672
|
|
2422
2673
|
def save_pdf(fpath, content, font_name, font_size):
|
2674
|
+
from fpdf import FPDF
|
2675
|
+
|
2423
2676
|
pdf = FPDF()
|
2424
2677
|
pdf.add_page()
|
2425
2678
|
# pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
|
@@ -2432,8 +2685,8 @@ def fsave(
|
|
2432
2685
|
def save_csv(fpath, data, **kwargs):
|
2433
2686
|
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
|
2434
2687
|
|
2435
|
-
verbose=kwargs.pop("verbose",False)
|
2436
|
-
if
|
2688
|
+
verbose = kwargs.pop("verbose", False)
|
2689
|
+
if run_once_within():
|
2437
2690
|
use_pd("to_csv", verbose=verbose)
|
2438
2691
|
kwargs_csv = dict(
|
2439
2692
|
path_or_buf=None,
|
@@ -2463,18 +2716,30 @@ def fsave(
|
|
2463
2716
|
df.to_csv(fpath, **kwargs_valid)
|
2464
2717
|
|
2465
2718
|
def save_xlsx(fpath, data, **kwargs):
|
2466
|
-
verbose=kwargs.pop("verbose",False)
|
2719
|
+
verbose = kwargs.pop("verbose", False)
|
2467
2720
|
sheet_name = kwargs.pop("sheet_name", "Sheet1")
|
2468
|
-
if
|
2721
|
+
if run_once_within():
|
2469
2722
|
use_pd("to_excel", verbose=verbose)
|
2470
2723
|
if any(kwargs):
|
2471
2724
|
format_excel(df=data, filename=fpath, **kwargs)
|
2472
2725
|
else:
|
2473
2726
|
# Remove non-relevant kwargs
|
2474
2727
|
irrelevant_keys = [
|
2475
|
-
|
2476
|
-
|
2477
|
-
|
2728
|
+
"format",
|
2729
|
+
"usage",
|
2730
|
+
"cell",
|
2731
|
+
"width",
|
2732
|
+
"height",
|
2733
|
+
"height_max",
|
2734
|
+
"merge",
|
2735
|
+
"shade",
|
2736
|
+
"comment",
|
2737
|
+
"link",
|
2738
|
+
"protect",
|
2739
|
+
"number_format",
|
2740
|
+
"conditional_format",
|
2741
|
+
"index_default",
|
2742
|
+
]
|
2478
2743
|
for key in irrelevant_keys:
|
2479
2744
|
kwargs.pop(key, None)
|
2480
2745
|
|
@@ -2482,15 +2747,18 @@ def fsave(
|
|
2482
2747
|
# Check if the file exists, then append the sheet, otherwise create a new file
|
2483
2748
|
try:
|
2484
2749
|
# Use ExcelWriter with append mode if the file exists
|
2485
|
-
with pd.ExcelWriter(
|
2750
|
+
with pd.ExcelWriter(
|
2751
|
+
fpath, engine="openpyxl", mode="a", if_sheet_exists="new"
|
2752
|
+
) as writer:
|
2486
2753
|
df.to_excel(writer, sheet_name=sheet_name, index=False, **kwargs)
|
2487
2754
|
except FileNotFoundError:
|
2488
2755
|
# If file doesn't exist, create a new one
|
2489
2756
|
df.to_excel(fpath, sheet_name=sheet_name, index=False, **kwargs)
|
2490
2757
|
|
2491
|
-
|
2492
2758
|
def save_ipynb(fpath, data, **kwargs):
|
2493
2759
|
# Split the content by code fences to distinguish between code and markdown
|
2760
|
+
import nbformat
|
2761
|
+
|
2494
2762
|
parts = data.split("```")
|
2495
2763
|
cells = []
|
2496
2764
|
|
@@ -2513,17 +2781,19 @@ def fsave(
|
|
2513
2781
|
# json.dump(data, file, **kwargs)
|
2514
2782
|
|
2515
2783
|
def save_json(fpath_fname, var_dict_or_df):
|
2784
|
+
import json
|
2785
|
+
|
2516
2786
|
def _convert_js(data):
|
2517
2787
|
if isinstance(data, pd.DataFrame):
|
2518
|
-
return data.to_dict(orient="list")
|
2788
|
+
return data.to_dict(orient="list")
|
2519
2789
|
elif isinstance(data, np.ndarray):
|
2520
2790
|
return data.tolist()
|
2521
2791
|
elif isinstance(data, dict):
|
2522
2792
|
return {key: _convert_js(value) for key, value in data.items()}
|
2523
|
-
return data
|
2793
|
+
return data
|
2524
2794
|
|
2525
2795
|
serializable_data = _convert_js(var_dict_or_df)
|
2526
|
-
|
2796
|
+
|
2527
2797
|
# Save the serializable data to the JSON file
|
2528
2798
|
with open(fpath_fname, "w") as f_json:
|
2529
2799
|
json.dump(serializable_data, f_json, indent=4)
|
@@ -2534,10 +2804,14 @@ def fsave(
|
|
2534
2804
|
# # setss = jsonload("/.json")
|
2535
2805
|
|
2536
2806
|
def save_yaml(fpath, data, **kwargs):
|
2807
|
+
import yaml
|
2808
|
+
|
2537
2809
|
with open(fpath, "w") as file:
|
2538
2810
|
yaml.dump(data, file, **kwargs)
|
2539
2811
|
|
2540
2812
|
def save_xml(fpath, data):
|
2813
|
+
from lxml import etree
|
2814
|
+
|
2541
2815
|
root = etree.Element("root")
|
2542
2816
|
if isinstance(data, dict):
|
2543
2817
|
for key, val in data.items():
|
@@ -2548,24 +2822,37 @@ def fsave(
|
|
2548
2822
|
tree = etree.ElementTree(root)
|
2549
2823
|
tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
|
2550
2824
|
|
2551
|
-
def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
|
2552
|
-
engine = kwargs.pop(
|
2553
|
-
|
2825
|
+
def save_parquet(fpath: str, data: pd.DataFrame, **kwargs):
|
2826
|
+
engine = kwargs.pop(
|
2827
|
+
"engine", "auto"
|
2828
|
+
) # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
|
2829
|
+
compression = kwargs.pop(
|
2830
|
+
"compression", None
|
2831
|
+
) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
|
2554
2832
|
try:
|
2555
2833
|
# Attempt to save with "pyarrow" if engine is set to "auto"
|
2556
|
-
|
2557
|
-
|
2834
|
+
data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
|
2835
|
+
print(
|
2836
|
+
f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression."
|
2837
|
+
)
|
2558
2838
|
except Exception as e:
|
2559
|
-
print(
|
2839
|
+
print(
|
2840
|
+
f"Error using with engine '{engine}' and {compression} compression: {e}"
|
2841
|
+
)
|
2560
2842
|
if "Sparse" in str(e):
|
2561
2843
|
try:
|
2562
2844
|
# Handle sparse data by converting columns to dense
|
2563
2845
|
print("Attempting to convert sparse columns to dense format...")
|
2564
|
-
data = data.apply(
|
2565
|
-
|
2846
|
+
data = data.apply(
|
2847
|
+
lambda x: (
|
2848
|
+
x.sparse.to_dense() if pd.api.types.is_sparse(x) else x
|
2849
|
+
)
|
2850
|
+
)
|
2851
|
+
save_parquet(fpath, data=data, **kwargs)
|
2566
2852
|
except Exception as last_e:
|
2567
|
-
print(
|
2568
|
-
|
2853
|
+
print(
|
2854
|
+
f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}"
|
2855
|
+
)
|
2569
2856
|
|
2570
2857
|
if kind is None:
|
2571
2858
|
_, kind = os.path.splitext(fpath)
|
@@ -2612,16 +2899,95 @@ def fsave(
         save_yaml(fpath, content, **kwargs)
     elif kind == "ipynb":
         save_ipynb(fpath, content, **kwargs)
-    elif kind.lower() in ["parquet","pq","big","par"]:
-
+    elif kind.lower() in ["parquet", "pq", "big", "par"]:
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_parquet")
+            return None
+        compression = kwargs.pop(
+            "compression", None
+        )  # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
         # fix the fpath ends
-
-
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "parquet")
         if compression is not None:
             if not fpath.endswith(compression):
-                fpath=fpath+f".{compression}"
-        save_parquet(fpath=fpath, data=content,compression=compression
+                fpath = fpath + f".{compression}"
+        save_parquet(fpath=fpath, data=content, compression=compression, **kwargs)
+    elif kind.lower() in ["pkl", "pk", "pickle", "pick"]:
+        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+        # Pickle is convenient if you want to preserve exact Python object types.
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_pickle")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "pkl")
+        compression = kwargs.pop("compression", None)
+        if compression is not None:
+            if not fpath.endswith(compression["method"]):
+                fpath = fpath + f".{compression['method']}"
+        if isinstance(content, pd.DataFrame):
+            content.to_pickle(fpath, **kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content = pd.DataFrame(content)
+                content.to_pickle(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
+        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+        # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_feather")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "feather")
+        if isinstance(content, pd.DataFrame):
+            content.to_feather(fpath, **kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content = pd.DataFrame(content)
+                content.to_feather(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["hd", "hdf", "h", "h5"]:
+        # particularly useful for large datasets and can handle complex data structures
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_hdf")
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "h5")
+        compression = kwargs.pop("compression", None)
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath = fpath + f".{compression}"
+        if isinstance(content, pd.DataFrame):
+            content.to_hdf(fpath, key="content", **kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content = pd.DataFrame(content)
+                content.to_hdf(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
     else:
+        from . import netfinder
+
         try:
             netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         except:
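The parquet, pickle, feather and HDF5 branches above all follow the same pattern: normalise the extension, append the compression method to the file name, then delegate to the matching pandas writer. A minimal sketch of that dispatch with plain pandas (file names and the sample frame are illustrative only; pyarrow and PyTables are assumed to be installed):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # parquet: the compression method also becomes part of the file suffix in fsave
    df.to_parquet("data.parquet.gzip", compression="gzip")

    # feather: Arrow-based, fast for analytics workloads
    df.to_feather("data.feather")

    # pickle: keeps exact Python object types; compression is passed as a dict
    df.to_pickle("data.pkl.gz", compression={"method": "gzip"})

    # HDF5: needs a key for the stored object, mirroring key="content" above
    df.to_hdf("data.h5", key="content")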
@@ -2744,6 +3110,8 @@ def isa(content, kind):
     elif "color" in kind.lower():  # file
         return is_str_color(content)
     elif "html" in kind.lower():
+        import re
+
         if content is None or not isinstance(content, str):
             return False
         # Remove leading and trailing whitespace
@@ -2793,8 +3161,8 @@ def listdir(
     verbose=True,
 ):
     if kind is None:
-        ls=os.listdir(rootdir)
-        ls = [f for f in ls if not f.startswith(
+        ls = os.listdir(rootdir)
+        ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
         print(ls)
         df_all = pd.DataFrame(
             {
@@ -2825,7 +3193,7 @@ def listdir(
 
     if os.path.isdir(rootdir):
         ls = os.listdir(rootdir)
-        ls = [f for f in ls if not f.startswith(
+        ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
         fd = [".fd", ".fld", ".fol", ".fd", ".folder"]
         i = 0
         f = {
@@ -2903,6 +3271,8 @@ def listdir(
         display(f.head())
         return f
     else:
+        from box import Box
+
         if "l" in orient.lower():  # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -2943,13 +3313,10 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-
-
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
     if os.path.isdir(fpath):
-        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
-        print(fpath)
+        fpath = fpath + f_slash if not fpath.endswith(f_slash) else fpath
        return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
@@ -2979,27 +3346,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str):  # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
+
     if os.path.isdir(pardir):
         os.chdir(pardir)  # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-            for folder in chdir:
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
@@ -3019,8 +3386,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
-        rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
-
+        rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
+
     return rootdir
 
 
@@ -3032,6 +3399,9 @@ def split_path(fpath):
 
 
 def figsave(*args, dpi=300):
+    import matplotlib.pyplot as plt
+    from PIL import Image
+
     dir_save = None
     fname = None
     img = None
@@ -3046,14 +3416,14 @@ def figsave(*args, dpi=300):
             img = arg  # Store the PIL image if provided
 
     if dir_save is None:
-        dir_save="./"
-
+        dir_save = "./"
+
     # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
-
+
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
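figsave splits dir_save into a parent folder and a file name, creates the folder, then writes the figure at the requested dpi. A rough equivalent with plain matplotlib (folder and file names are made up for the example):

    import os
    import matplotlib
    matplotlib.use("Agg")  # headless backend, so the sketch runs without a display
    import matplotlib.pyplot as plt

    plt.plot([0, 1, 2], [0, 1, 4])
    os.makedirs("figs", exist_ok=True)  # figsave calls mkdir() on the parent folder
    plt.savefig(os.path.join("figs", "demo.pdf"), dpi=300, bbox_inches="tight")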
@@ -3139,7 +3509,9 @@ def figsave(*args, dpi=300):
 
 def is_str_color(s):
     # Regular expression pattern for hexadecimal color codes
-    if isinstance(s,str):
+    if isinstance(s, str):
+        import re
+
         color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
         return re.match(color_code_pattern, s) is not None
     else:
@@ -3166,6 +3538,8 @@ def isnum(s):
 
 
 def is_image(fpath):
+    import mimetypes
+
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and mime_type.startswith("image"):
         return True
@@ -3174,6 +3548,8 @@ def is_image(fpath):
 
 
 def is_document(fpath):
+    import mimetypes
+
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and (
         mime_type.startswith("text/")
@@ -3194,6 +3570,8 @@ def is_document(fpath):
 
 
 def is_zip(fpath):
+    import mimetypes
+
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type == "application/zip":
         return True
@@ -3202,6 +3580,8 @@ def is_zip(fpath):
 
 
 def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
+    import matplotlib.pyplot as plt
+
     if ax is None:
         ax = plt.gca()
     for loc, spine in ax.spines.items():
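is_image, is_document and is_zip now import mimetypes inside the function and branch on the guessed MIME type. A self-contained sketch of that check (looks_like_image is a stand-in name for the example, not part of py2ls):

    import mimetypes

    def looks_like_image(fpath):
        # guess_type only looks at the extension, not at the file contents
        mime_type, _ = mimetypes.guess_type(fpath)
        return bool(mime_type and mime_type.startswith("image"))

    print(looks_like_image("photo.png"))  # True
    print(looks_like_image("notes.txt"))  # False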
@@ -3290,6 +3670,7 @@ def apply_filter(img, *args):
     Returns:
         PIL.Image: The filtered image.
     """
+    from PIL import ImageFilter
 
     def correct_filter_name(filter_name):
         if "bl" in filter_name.lower() and "box" not in filter_name.lower():
@@ -3532,6 +3913,9 @@ def imgsets(img, **kwargs):
         avg_contrast_factor = sum(contrast_factors) / num_channels
         return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
 
+    import matplotlib.pyplot as plt
+    from PIL import ImageEnhance, ImageOps
+
     # Load image if input is a file path
     if isinstance(img, str):
         img = load_img(img)
@@ -3595,6 +3979,8 @@ def imgsets(img, **kwargs):
         elif "pad" in k.lower():
             img_update = ImageOps.pad(img_update, size=value)
         elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
+            from rembg import remove, new_session
+
             if isinstance(value, bool):
                 session = new_session("isnet-general-use")
                 img_update = remove(img_update, session=session)
@@ -3633,6 +4019,8 @@ def imgsets(img, **kwargs):
             else:
                 img_update = remove(img_update)
         elif "bg" in k.lower() and "color" in k.lower():
+            from rembg import remove
+
             if isinstance(value, list):
                 value = tuple(value)
             if isinstance(value, tuple):  # replace the background color
@@ -3664,6 +4052,9 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
     Args:
         dir_img_list (list): List of the Directory containing the images.
     """
+    import matplotlib.pyplot as plt
+    from PIL import Image
+
     num_images = len(dir_img_list)
     if not kind.startswith("."):
         kind = "." + kind
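The background-removal branch of imgsets defers the rembg import and builds an "isnet-general-use" session when the option is passed as a bool. Roughly the same call outside of imgsets looks like this (input and output paths are hypothetical, and rembg downloads the model on first use):

    from PIL import Image
    from rembg import new_session, remove

    img = Image.open("input.png")
    session = new_session("isnet-general-use")  # same model name used above
    cut_out = remove(img, session=session)
    cut_out.save("output.png")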
@@ -3700,28 +4091,14 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
 # usage:
 # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
 # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
-def read_mplstyle(style_file):
-    # Load the style file
-    plt.style.use(style_file)
-
-    # Get the current style properties
-    style_dict = plt.rcParams
-
-    # Convert to dictionary
-    style_dict = dict(style_dict)
-    # Print the style dictionary
-    for i, j in style_dict.items():
-        print(f"\n{i}::::{j}")
-    return style_dict
-
-
-# #example usage:
-# style_file = "/ std-colors.mplstyle"
-# style_dict = read_mplstyle(style_file)
 
 
 # search and fine the director of the libary, which installed at local
 def dir_lib(lib_oi):
+    """
+    # example usage:
+    # dir_lib("seaborn")
+    """
     import site
 
     # Get the site-packages directory
@@ -3740,22 +4117,6 @@ def dir_lib(lib_oi):
     return dir_list
 
 
-# example usage:
-# dir_lib("seaborn")
-
-"""
-# n = 7
-# clist = get_color(n, cmap="auto", how="linspace") # get_color(100)
-# plt.figure(figsize=[8, 5], dpi=100)
-# x = np.linspace(0, 2 * np.pi, 50) * 100
-# y = np.sin(x)
-# for i in range(1, n + 1):
-# plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
-# plt.legend()
-# plt.ylim(-2, 20)
-# figsets(plt.gca(), {"style": "whitegrid"}) """
-
-
 class FileInfo:
     def __init__(
         self,
@@ -3832,6 +4193,8 @@ class FileInfo:
 
 
 def finfo(fpath):
+    import time
+
     fname, fmt = os.path.splitext(fpath)
     dir_par = os.path.dirname(fpath) + "/"
     data = {
@@ -3846,6 +4209,8 @@ def finfo(fpath):
     }
     extra_info = {}
     if data["kind"] == ".pdf":
+        from pdf2image import pdfinfo_from_path
+
         extra_info = pdfinfo_from_path(fpath)
 
     return FileInfo(
@@ -3862,16 +4227,6 @@ def finfo(fpath):
 
 
 # ! format excel file
-import pandas as pd
-from datetime import datetime
-from openpyxl import load_workbook
-from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
-from openpyxl.utils import get_column_letter
-from openpyxl.worksheet.datavalidation import DataValidation
-from openpyxl.comments import Comment
-from openpyxl.formatting.rule import ColorScaleRule
-
-
 def hex2argb(hex_color):
     """
     Convert a hex color code to aARGB format required by openpyxl.
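The block removed above is typical of this release: heavy module-level imports (openpyxl, datetime helpers) are pushed down into the functions that actually need them, so importing py2ls.ips stays fast. A minimal sketch of the deferred-import pattern under that assumption (the helper name is made up for illustration):

    def export_with_lazy_import(df, filename):
        # openpyxl is imported only when the function is called,
        # not when the module is first loaded
        from openpyxl.utils import get_column_letter

        df.to_excel(filename, index=False)
        return get_column_letter(df.shape[1])  # letter of the last used column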
@@ -3907,337 +4262,6 @@ def hex2argb(hex_color):
|
|
3907
4262
|
)
|
3908
4263
|
|
3909
4264
|
|
3910
|
-
def convert_indices_to_range(row_slice, col_slice):
|
3911
|
-
"""Convert numerical row and column slices to Excel-style range strings."""
|
3912
|
-
start_row = row_slice.start + 1
|
3913
|
-
end_row = row_slice.stop if row_slice.stop is not None else None
|
3914
|
-
start_col = col_slice.start + 1
|
3915
|
-
end_col = col_slice.stop if col_slice.stop is not None else None
|
3916
|
-
|
3917
|
-
start_col_letter = get_column_letter(start_col)
|
3918
|
-
end_col_letter = get_column_letter(end_col) if end_col else None
|
3919
|
-
return (
|
3920
|
-
f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
|
3921
|
-
if end_col_letter
|
3922
|
-
else f"{start_col_letter}{start_row}"
|
3923
|
-
)
|
3924
|
-
|
3925
|
-
|
3926
|
-
def apply_format(ws, cell, cell_range):
|
3927
|
-
"""Apply cell formatting to a specified range."""
|
3928
|
-
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
3929
|
-
kws_cell = ["font", "fill", "alignment", "border"]
|
3930
|
-
for K, _ in cell.items():
|
3931
|
-
if strcmp(K, kws_cell)[0] == "font":
|
3932
|
-
#! font
|
3933
|
-
font_color = "000000"
|
3934
|
-
font_name = "Arial"
|
3935
|
-
font_underline = "none"
|
3936
|
-
font_size = 14
|
3937
|
-
font_bold = False
|
3938
|
-
font_strike = False
|
3939
|
-
font_italic = False
|
3940
|
-
kws_font = [
|
3941
|
-
"name",
|
3942
|
-
"size",
|
3943
|
-
"bold",
|
3944
|
-
"underline",
|
3945
|
-
"color",
|
3946
|
-
"strike",
|
3947
|
-
"italic",
|
3948
|
-
]
|
3949
|
-
for k_, v_ in cell.get(K, {}).items():
|
3950
|
-
if strcmp(k_, kws_font)[0] == "name":
|
3951
|
-
font_name = v_
|
3952
|
-
elif strcmp(k_, kws_font)[0] == "size":
|
3953
|
-
font_size = v_
|
3954
|
-
elif strcmp(k_, kws_font)[0] == "bold":
|
3955
|
-
font_bold = v_
|
3956
|
-
elif strcmp(k_, kws_font)[0] == "underline":
|
3957
|
-
font_underline = strcmp(v_, ["none", "single", "double"])[0]
|
3958
|
-
elif strcmp(k_, kws_font)[0] == "color":
|
3959
|
-
font_color = hex2argb(v_)
|
3960
|
-
elif strcmp(k_, kws_font)[0] == "strike":
|
3961
|
-
font_strike = v_
|
3962
|
-
elif strcmp(k_, kws_font)[0] == "italic":
|
3963
|
-
font_italic = v_
|
3964
|
-
|
3965
|
-
cell_font = Font(
|
3966
|
-
name=font_name,
|
3967
|
-
size=font_size,
|
3968
|
-
bold=font_bold,
|
3969
|
-
italic=font_italic,
|
3970
|
-
underline=font_underline,
|
3971
|
-
strike=font_strike,
|
3972
|
-
color=font_color,
|
3973
|
-
)
|
3974
|
-
|
3975
|
-
if strcmp(K, kws_cell)[0] == "fill":
|
3976
|
-
#! fill
|
3977
|
-
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
3978
|
-
kws_fill_type = [
|
3979
|
-
"darkVertical",
|
3980
|
-
"lightDown",
|
3981
|
-
"lightGrid",
|
3982
|
-
"solid",
|
3983
|
-
"darkDown",
|
3984
|
-
"lightGray",
|
3985
|
-
"lightUp",
|
3986
|
-
"gray0625",
|
3987
|
-
"lightVertical",
|
3988
|
-
"lightHorizontal",
|
3989
|
-
"darkHorizontal",
|
3990
|
-
"gray125",
|
3991
|
-
"darkUp",
|
3992
|
-
"mediumGray",
|
3993
|
-
"darkTrellis",
|
3994
|
-
"darkGray",
|
3995
|
-
"lightTrellis",
|
3996
|
-
"darkGrid",
|
3997
|
-
]
|
3998
|
-
start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
|
3999
|
-
for k, v in cell.get(K, {}).items():
|
4000
|
-
if strcmp(k, kws_fill)[0] == "color":
|
4001
|
-
start_color, end_color = hex2argb(v), hex2argb(v)
|
4002
|
-
break
|
4003
|
-
for k, v in cell.get(K, {}).items():
|
4004
|
-
if strcmp(k, kws_fill)[0] == "start_color":
|
4005
|
-
start_color = hex2argb(v)
|
4006
|
-
elif strcmp(k, kws_fill)[0] == "end_color":
|
4007
|
-
end_color = hex2argb(v)
|
4008
|
-
elif strcmp(k, kws_fill)[0] == "fill_type":
|
4009
|
-
fill_type = strcmp(v, kws_fill_type)[0]
|
4010
|
-
cell_fill = PatternFill(
|
4011
|
-
start_color=start_color,
|
4012
|
-
end_color=end_color,
|
4013
|
-
fill_type=fill_type,
|
4014
|
-
)
|
4015
|
-
|
4016
|
-
if strcmp(K, kws_cell)[0] == "alignment":
|
4017
|
-
#! alignment
|
4018
|
-
# default
|
4019
|
-
align_horizontal = "general"
|
4020
|
-
align_vertical = "center"
|
4021
|
-
align_rot = 0
|
4022
|
-
align_wrap = False
|
4023
|
-
align_shrink = False
|
4024
|
-
align_indent = 0
|
4025
|
-
kws_align = [
|
4026
|
-
"horizontal",
|
4027
|
-
"ha",
|
4028
|
-
"vertical",
|
4029
|
-
"va",
|
4030
|
-
"text_rotation",
|
4031
|
-
"rotat",
|
4032
|
-
"rot",
|
4033
|
-
"wrap_text",
|
4034
|
-
"wrap",
|
4035
|
-
"shrink_to_fit",
|
4036
|
-
"shrink",
|
4037
|
-
"indent",
|
4038
|
-
]
|
4039
|
-
for k, v in cell.get(K, {}).items():
|
4040
|
-
if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
|
4041
|
-
align_horizontal = strcmp(
|
4042
|
-
v, ["general", "left", "right", "center"]
|
4043
|
-
)[0]
|
4044
|
-
elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
|
4045
|
-
align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
|
4046
|
-
elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
|
4047
|
-
align_rot = v
|
4048
|
-
elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
|
4049
|
-
align_wrap = v
|
4050
|
-
elif strcmp(k, kws_align)[0] in [
|
4051
|
-
"shrink_to_fit",
|
4052
|
-
"shrink",
|
4053
|
-
"wrap_text",
|
4054
|
-
"wrap",
|
4055
|
-
]:
|
4056
|
-
align_shrink = v
|
4057
|
-
elif strcmp(k, kws_align)[0] in ["indent"]:
|
4058
|
-
align_indent = v
|
4059
|
-
cell_alignment = Alignment(
|
4060
|
-
horizontal=align_horizontal,
|
4061
|
-
vertical=align_vertical,
|
4062
|
-
text_rotation=align_rot,
|
4063
|
-
wrap_text=align_wrap,
|
4064
|
-
shrink_to_fit=align_shrink,
|
4065
|
-
indent=align_indent,
|
4066
|
-
)
|
4067
|
-
|
4068
|
-
if strcmp(K, kws_cell)[0] == "border":
|
4069
|
-
#! border
|
4070
|
-
kws_border = [
|
4071
|
-
"color_left",
|
4072
|
-
"color_l",
|
4073
|
-
"color_right",
|
4074
|
-
"color_r",
|
4075
|
-
"color_top",
|
4076
|
-
"color_t",
|
4077
|
-
"color_bottom",
|
4078
|
-
"color_b",
|
4079
|
-
"color_diagonal",
|
4080
|
-
"color_d",
|
4081
|
-
"color_outline",
|
4082
|
-
"color_o",
|
4083
|
-
"color_vertical",
|
4084
|
-
"color_v",
|
4085
|
-
"color_horizontal",
|
4086
|
-
"color_h",
|
4087
|
-
"color",
|
4088
|
-
"style_left",
|
4089
|
-
"style_l",
|
4090
|
-
"style_right",
|
4091
|
-
"style_r",
|
4092
|
-
"style_top",
|
4093
|
-
"style_t",
|
4094
|
-
"style_bottom",
|
4095
|
-
"style_b",
|
4096
|
-
"style_diagonal",
|
4097
|
-
"style_d",
|
4098
|
-
"style_outline",
|
4099
|
-
"style_o",
|
4100
|
-
"style_vertical",
|
4101
|
-
"style_v",
|
4102
|
-
"style_horizontal",
|
4103
|
-
"style_h",
|
4104
|
-
"style",
|
4105
|
-
]
|
4106
|
-
# * border color
|
4107
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4108
|
-
"FF000000",
|
4109
|
-
"FF000000",
|
4110
|
-
"FF000000",
|
4111
|
-
"FF000000",
|
4112
|
-
)
|
4113
|
-
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4114
|
-
"FF000000",
|
4115
|
-
"FF000000",
|
4116
|
-
"FF000000",
|
4117
|
-
"FF000000",
|
4118
|
-
)
|
4119
|
-
# get colors config
|
4120
|
-
for k, v in cell.get(K, {}).items():
|
4121
|
-
if strcmp(k, kws_border)[0] in ["color"]:
|
4122
|
-
border_color_all = hex2argb(v)
|
4123
|
-
# 如果设置了color,表示其它的所有的都设置成为一样的
|
4124
|
-
# 然后再才开始自己定义其它的color
|
4125
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4126
|
-
border_color_all,
|
4127
|
-
border_color_all,
|
4128
|
-
border_color_all,
|
4129
|
-
border_color_all,
|
4130
|
-
)
|
4131
|
-
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4132
|
-
border_color_all,
|
4133
|
-
border_color_all,
|
4134
|
-
border_color_all,
|
4135
|
-
border_color_all,
|
4136
|
-
)
|
4137
|
-
elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
|
4138
|
-
border_color_l = hex2argb(v)
|
4139
|
-
elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
|
4140
|
-
border_color_r = hex2argb(v)
|
4141
|
-
elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
|
4142
|
-
border_color_t = hex2argb(v)
|
4143
|
-
elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
|
4144
|
-
border_color_b = hex2argb(v)
|
4145
|
-
elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
|
4146
|
-
border_color_d = hex2argb(v)
|
4147
|
-
elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
|
4148
|
-
border_color_o = hex2argb(v)
|
4149
|
-
elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
|
4150
|
-
border_color_v = hex2argb(v)
|
4151
|
-
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4152
|
-
border_color_h = hex2argb(v)
|
4153
|
-
# *border style
|
4154
|
-
border_styles = [
|
4155
|
-
"thin",
|
4156
|
-
"medium",
|
4157
|
-
"thick",
|
4158
|
-
"dotted",
|
4159
|
-
"dashed",
|
4160
|
-
"hair",
|
4161
|
-
"mediumDashed",
|
4162
|
-
"dashDot",
|
4163
|
-
"dashDotDot",
|
4164
|
-
"slantDashDot",
|
4165
|
-
"none",
|
4166
|
-
]
|
4167
|
-
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4168
|
-
None,
|
4169
|
-
None,
|
4170
|
-
None,
|
4171
|
-
None,
|
4172
|
-
)
|
4173
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4174
|
-
None,
|
4175
|
-
None,
|
4176
|
-
None,
|
4177
|
-
None,
|
4178
|
-
)
|
4179
|
-
# get styles config
|
4180
|
-
for k, v in cell.get(K, {}).items():
|
4181
|
-
# if not "style" in k:
|
4182
|
-
# break
|
4183
|
-
if strcmp(k, kws_border)[0] in ["style"]:
|
4184
|
-
border_style_all = strcmp(v, border_styles)[0]
|
4185
|
-
# 如果设置了style,表示其它的所有的都设置成为一样的
|
4186
|
-
# 然后再才开始自己定义其它的style
|
4187
|
-
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4188
|
-
border_style_all,
|
4189
|
-
border_style_all,
|
4190
|
-
border_style_all,
|
4191
|
-
border_style_all,
|
4192
|
-
)
|
4193
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4194
|
-
border_style_all,
|
4195
|
-
border_style_all,
|
4196
|
-
border_style_all,
|
4197
|
-
border_style_all,
|
4198
|
-
)
|
4199
|
-
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4200
|
-
border_style_l = strcmp(v, border_styles)[0]
|
4201
|
-
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4202
|
-
border_style_r = strcmp(v, border_styles)[0]
|
4203
|
-
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4204
|
-
border_style_t = strcmp(v, border_styles)[0]
|
4205
|
-
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4206
|
-
border_style_b = strcmp(v, border_styles)[0]
|
4207
|
-
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4208
|
-
border_style_d = strcmp(v, border_styles)[0]
|
4209
|
-
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4210
|
-
border_style_o = strcmp(v, border_styles)[0]
|
4211
|
-
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4212
|
-
border_style_v = strcmp(v, border_styles)[0]
|
4213
|
-
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4214
|
-
border_style_h = strcmp(v, border_styles)[0]
|
4215
|
-
# * apply border config
|
4216
|
-
border = Border(
|
4217
|
-
left=Side(border_style=border_style_l, color=border_color_l),
|
4218
|
-
right=Side(border_style=border_style_r, color=border_color_r),
|
4219
|
-
top=Side(border_style=border_style_t, color=border_color_t),
|
4220
|
-
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4221
|
-
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4222
|
-
diagonal_direction=0,
|
4223
|
-
outline=Side(border_style=border_style_o, color=border_color_o),
|
4224
|
-
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4225
|
-
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4226
|
-
)
|
4227
|
-
|
4228
|
-
#! final apply configs
|
4229
|
-
for row in ws[cell_range]:
|
4230
|
-
for cell_ in row:
|
4231
|
-
if cell_font:
|
4232
|
-
cell_.font = cell_font
|
4233
|
-
if cell_fill:
|
4234
|
-
cell_.fill = cell_fill
|
4235
|
-
if cell_alignment:
|
4236
|
-
cell_.alignment = cell_alignment
|
4237
|
-
if border:
|
4238
|
-
cell_.border = border
|
4239
|
-
|
4240
|
-
|
4241
4265
|
def format_excel(
|
4242
4266
|
df=None,
|
4243
4267
|
filename=None,
|
@@ -4257,6 +4281,368 @@ def format_excel(
|
|
4257
4281
|
conditional_format=None, # dict
|
4258
4282
|
**kwargs,
|
4259
4283
|
):
|
4284
|
+
import pandas as pd
|
4285
|
+
from datetime import datetime
|
4286
|
+
from openpyxl import load_workbook
|
4287
|
+
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
4288
|
+
from openpyxl.utils import get_column_letter
|
4289
|
+
from openpyxl.worksheet.datavalidation import DataValidation
|
4290
|
+
from openpyxl.comments import Comment
|
4291
|
+
from openpyxl.formatting.rule import ColorScaleRule
|
4292
|
+
|
4293
|
+
def convert_indices_to_range(row_slice, col_slice):
|
4294
|
+
"""Convert numerical row and column slices to Excel-style range strings."""
|
4295
|
+
start_row = row_slice.start + 1
|
4296
|
+
end_row = row_slice.stop if row_slice.stop is not None else None
|
4297
|
+
start_col = col_slice.start + 1
|
4298
|
+
end_col = col_slice.stop if col_slice.stop is not None else None
|
4299
|
+
|
4300
|
+
start_col_letter = get_column_letter(start_col)
|
4301
|
+
end_col_letter = get_column_letter(end_col) if end_col else None
|
4302
|
+
return (
|
4303
|
+
f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
|
4304
|
+
if end_col_letter
|
4305
|
+
else f"{start_col_letter}{start_row}"
|
4306
|
+
)
|
4307
|
+
|
4308
|
+
def apply_format(ws, cell, cell_range):
|
4309
|
+
"""Apply cell formatting to a specified range."""
|
4310
|
+
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
4311
|
+
kws_cell = ["font", "fill", "alignment", "border"]
|
4312
|
+
for K, _ in cell.items():
|
4313
|
+
if strcmp(K, kws_cell)[0] == "font":
|
4314
|
+
#! font
|
4315
|
+
font_color = "000000"
|
4316
|
+
font_name = "Arial"
|
4317
|
+
font_underline = "none"
|
4318
|
+
font_size = 14
|
4319
|
+
font_bold = False
|
4320
|
+
font_strike = False
|
4321
|
+
font_italic = False
|
4322
|
+
kws_font = [
|
4323
|
+
"name",
|
4324
|
+
"size",
|
4325
|
+
"bold",
|
4326
|
+
"underline",
|
4327
|
+
"color",
|
4328
|
+
"strike",
|
4329
|
+
"italic",
|
4330
|
+
]
|
4331
|
+
for k_, v_ in cell.get(K, {}).items():
|
4332
|
+
if strcmp(k_, kws_font)[0] == "name":
|
4333
|
+
font_name = v_
|
4334
|
+
elif strcmp(k_, kws_font)[0] == "size":
|
4335
|
+
font_size = v_
|
4336
|
+
elif strcmp(k_, kws_font)[0] == "bold":
|
4337
|
+
font_bold = v_
|
4338
|
+
elif strcmp(k_, kws_font)[0] == "underline":
|
4339
|
+
font_underline = strcmp(v_, ["none", "single", "double"])[0]
|
4340
|
+
elif strcmp(k_, kws_font)[0] == "color":
|
4341
|
+
font_color = hex2argb(v_)
|
4342
|
+
elif strcmp(k_, kws_font)[0] == "strike":
|
4343
|
+
font_strike = v_
|
4344
|
+
elif strcmp(k_, kws_font)[0] == "italic":
|
4345
|
+
font_italic = v_
|
4346
|
+
|
4347
|
+
cell_font = Font(
|
4348
|
+
name=font_name,
|
4349
|
+
size=font_size,
|
4350
|
+
bold=font_bold,
|
4351
|
+
italic=font_italic,
|
4352
|
+
underline=font_underline,
|
4353
|
+
strike=font_strike,
|
4354
|
+
color=font_color,
|
4355
|
+
)
|
4356
|
+
|
4357
|
+
if strcmp(K, kws_cell)[0] == "fill":
|
4358
|
+
#! fill
|
4359
|
+
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
4360
|
+
kws_fill_type = [
|
4361
|
+
"darkVertical",
|
4362
|
+
"lightDown",
|
4363
|
+
"lightGrid",
|
4364
|
+
"solid",
|
4365
|
+
"darkDown",
|
4366
|
+
"lightGray",
|
4367
|
+
"lightUp",
|
4368
|
+
"gray0625",
|
4369
|
+
"lightVertical",
|
4370
|
+
"lightHorizontal",
|
4371
|
+
"darkHorizontal",
|
4372
|
+
"gray125",
|
4373
|
+
"darkUp",
|
4374
|
+
"mediumGray",
|
4375
|
+
"darkTrellis",
|
4376
|
+
"darkGray",
|
4377
|
+
"lightTrellis",
|
4378
|
+
"darkGrid",
|
4379
|
+
]
|
4380
|
+
start_color, end_color, fill_type = (
|
4381
|
+
"FFFFFF",
|
4382
|
+
"FFFFFF",
|
4383
|
+
"solid",
|
4384
|
+
) # default
|
4385
|
+
for k, v in cell.get(K, {}).items():
|
4386
|
+
if strcmp(k, kws_fill)[0] == "color":
|
4387
|
+
start_color, end_color = hex2argb(v), hex2argb(v)
|
4388
|
+
break
|
4389
|
+
for k, v in cell.get(K, {}).items():
|
4390
|
+
if strcmp(k, kws_fill)[0] == "start_color":
|
4391
|
+
start_color = hex2argb(v)
|
4392
|
+
elif strcmp(k, kws_fill)[0] == "end_color":
|
4393
|
+
end_color = hex2argb(v)
|
4394
|
+
elif strcmp(k, kws_fill)[0] == "fill_type":
|
4395
|
+
fill_type = strcmp(v, kws_fill_type)[0]
|
4396
|
+
cell_fill = PatternFill(
|
4397
|
+
start_color=start_color,
|
4398
|
+
end_color=end_color,
|
4399
|
+
fill_type=fill_type,
|
4400
|
+
)
|
4401
|
+
|
4402
|
+
if strcmp(K, kws_cell)[0] == "alignment":
|
4403
|
+
#! alignment
|
4404
|
+
# default
|
4405
|
+
align_horizontal = "general"
|
4406
|
+
align_vertical = "center"
|
4407
|
+
align_rot = 0
|
4408
|
+
align_wrap = False
|
4409
|
+
align_shrink = False
|
4410
|
+
align_indent = 0
|
4411
|
+
kws_align = [
|
4412
|
+
"horizontal",
|
4413
|
+
"ha",
|
4414
|
+
"vertical",
|
4415
|
+
"va",
|
4416
|
+
"text_rotation",
|
4417
|
+
"rotat",
|
4418
|
+
"rot",
|
4419
|
+
"wrap_text",
|
4420
|
+
"wrap",
|
4421
|
+
"shrink_to_fit",
|
4422
|
+
"shrink",
|
4423
|
+
"indent",
|
4424
|
+
]
|
4425
|
+
for k, v in cell.get(K, {}).items():
|
4426
|
+
if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
|
4427
|
+
align_horizontal = strcmp(
|
4428
|
+
v, ["general", "left", "right", "center"]
|
4429
|
+
)[0]
|
4430
|
+
elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
|
4431
|
+
align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
|
4432
|
+
elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
|
4433
|
+
align_rot = v
|
4434
|
+
elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
|
4435
|
+
align_wrap = v
|
4436
|
+
elif strcmp(k, kws_align)[0] in [
|
4437
|
+
"shrink_to_fit",
|
4438
|
+
"shrink",
|
4439
|
+
"wrap_text",
|
4440
|
+
"wrap",
|
4441
|
+
]:
|
4442
|
+
align_shrink = v
|
4443
|
+
elif strcmp(k, kws_align)[0] in ["indent"]:
|
4444
|
+
align_indent = v
|
4445
|
+
cell_alignment = Alignment(
|
4446
|
+
horizontal=align_horizontal,
|
4447
|
+
vertical=align_vertical,
|
4448
|
+
text_rotation=align_rot,
|
4449
|
+
wrap_text=align_wrap,
|
4450
|
+
shrink_to_fit=align_shrink,
|
4451
|
+
indent=align_indent,
|
4452
|
+
)
|
4453
|
+
|
4454
|
+
if strcmp(K, kws_cell)[0] == "border":
|
4455
|
+
#! border
|
4456
|
+
kws_border = [
|
4457
|
+
"color_left",
|
4458
|
+
"color_l",
|
4459
|
+
"color_right",
|
4460
|
+
"color_r",
|
4461
|
+
"color_top",
|
4462
|
+
"color_t",
|
4463
|
+
"color_bottom",
|
4464
|
+
"color_b",
|
4465
|
+
"color_diagonal",
|
4466
|
+
"color_d",
|
4467
|
+
"color_outline",
|
4468
|
+
"color_o",
|
4469
|
+
"color_vertical",
|
4470
|
+
"color_v",
|
4471
|
+
"color_horizontal",
|
4472
|
+
"color_h",
|
4473
|
+
"color",
|
4474
|
+
"style_left",
|
4475
|
+
"style_l",
|
4476
|
+
"style_right",
|
4477
|
+
"style_r",
|
4478
|
+
"style_top",
|
4479
|
+
"style_t",
|
4480
|
+
"style_bottom",
|
4481
|
+
"style_b",
|
4482
|
+
"style_diagonal",
|
4483
|
+
"style_d",
|
4484
|
+
"style_outline",
|
4485
|
+
"style_o",
|
4486
|
+
"style_vertical",
|
4487
|
+
"style_v",
|
4488
|
+
"style_horizontal",
|
4489
|
+
"style_h",
|
4490
|
+
"style",
|
4491
|
+
]
|
4492
|
+
# * border color
|
4493
|
+
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4494
|
+
"FF000000",
|
4495
|
+
"FF000000",
|
4496
|
+
"FF000000",
|
4497
|
+
"FF000000",
|
4498
|
+
)
|
4499
|
+
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4500
|
+
"FF000000",
|
4501
|
+
"FF000000",
|
4502
|
+
"FF000000",
|
4503
|
+
"FF000000",
|
4504
|
+
)
|
4505
|
+
# get colors config
|
4506
|
+
for k, v in cell.get(K, {}).items():
|
4507
|
+
if strcmp(k, kws_border)[0] in ["color"]:
|
4508
|
+
border_color_all = hex2argb(v)
|
4509
|
+
# 如果设置了color,表示其它的所有的都设置成为一样的
|
4510
|
+
# 然后再才开始自己定义其它的color
|
4511
|
+
(
|
4512
|
+
border_color_l,
|
4513
|
+
border_color_r,
|
4514
|
+
border_color_t,
|
4515
|
+
border_color_b,
|
4516
|
+
) = (
|
4517
|
+
border_color_all,
|
4518
|
+
border_color_all,
|
4519
|
+
border_color_all,
|
4520
|
+
border_color_all,
|
4521
|
+
)
|
4522
|
+
(
|
4523
|
+
border_color_d,
|
4524
|
+
border_color_o,
|
4525
|
+
border_color_v,
|
4526
|
+
border_color_h,
|
4527
|
+
) = (
|
4528
|
+
border_color_all,
|
4529
|
+
border_color_all,
|
4530
|
+
border_color_all,
|
4531
|
+
border_color_all,
|
4532
|
+
)
|
4533
|
+
elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
|
4534
|
+
border_color_l = hex2argb(v)
|
4535
|
+
elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
|
4536
|
+
border_color_r = hex2argb(v)
|
4537
|
+
elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
|
4538
|
+
border_color_t = hex2argb(v)
|
4539
|
+
elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
|
4540
|
+
border_color_b = hex2argb(v)
|
4541
|
+
elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
|
4542
|
+
border_color_d = hex2argb(v)
|
4543
|
+
elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
|
4544
|
+
border_color_o = hex2argb(v)
|
4545
|
+
elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
|
4546
|
+
border_color_v = hex2argb(v)
|
4547
|
+
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4548
|
+
border_color_h = hex2argb(v)
|
4549
|
+
# *border style
|
4550
|
+
border_styles = [
|
4551
|
+
"thin",
|
4552
|
+
"medium",
|
4553
|
+
"thick",
|
4554
|
+
"dotted",
|
4555
|
+
"dashed",
|
4556
|
+
"hair",
|
4557
|
+
"mediumDashed",
|
4558
|
+
"dashDot",
|
4559
|
+
"dashDotDot",
|
4560
|
+
"slantDashDot",
|
4561
|
+
"none",
|
4562
|
+
]
|
4563
|
+
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4564
|
+
None,
|
4565
|
+
None,
|
4566
|
+
None,
|
4567
|
+
None,
|
4568
|
+
)
|
4569
|
+
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4570
|
+
None,
|
4571
|
+
None,
|
4572
|
+
None,
|
4573
|
+
None,
|
4574
|
+
)
|
4575
|
+
# get styles config
|
4576
|
+
for k, v in cell.get(K, {}).items():
|
4577
|
+
# if not "style" in k:
|
4578
|
+
# break
|
4579
|
+
if strcmp(k, kws_border)[0] in ["style"]:
|
4580
|
+
border_style_all = strcmp(v, border_styles)[0]
|
4581
|
+
# 如果设置了style,表示其它的所有的都设置成为一样的
|
4582
|
+
# 然后再才开始自己定义其它的style
|
4583
|
+
(
|
4584
|
+
border_style_l,
|
4585
|
+
border_style_r,
|
4586
|
+
border_style_t,
|
4587
|
+
border_style_b,
|
4588
|
+
) = (
|
4589
|
+
border_style_all,
|
4590
|
+
border_style_all,
|
4591
|
+
border_style_all,
|
4592
|
+
border_style_all,
|
4593
|
+
)
|
4594
|
+
(
|
4595
|
+
border_style_d,
|
4596
|
+
border_style_o,
|
4597
|
+
border_style_v,
|
4598
|
+
border_style_h,
|
4599
|
+
) = (
|
4600
|
+
border_style_all,
|
4601
|
+
border_style_all,
|
4602
|
+
border_style_all,
|
4603
|
+
border_style_all,
|
4604
|
+
)
|
4605
|
+
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4606
|
+
border_style_l = strcmp(v, border_styles)[0]
|
4607
|
+
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4608
|
+
border_style_r = strcmp(v, border_styles)[0]
|
4609
|
+
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4610
|
+
border_style_t = strcmp(v, border_styles)[0]
|
4611
|
+
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4612
|
+
border_style_b = strcmp(v, border_styles)[0]
|
4613
|
+
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4614
|
+
border_style_d = strcmp(v, border_styles)[0]
|
4615
|
+
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4616
|
+
border_style_o = strcmp(v, border_styles)[0]
|
4617
|
+
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4618
|
+
border_style_v = strcmp(v, border_styles)[0]
|
4619
|
+
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4620
|
+
border_style_h = strcmp(v, border_styles)[0]
|
4621
|
+
# * apply border config
|
4622
|
+
border = Border(
|
4623
|
+
left=Side(border_style=border_style_l, color=border_color_l),
|
4624
|
+
right=Side(border_style=border_style_r, color=border_color_r),
|
4625
|
+
top=Side(border_style=border_style_t, color=border_color_t),
|
4626
|
+
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4627
|
+
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4628
|
+
diagonal_direction=0,
|
4629
|
+
outline=Side(border_style=border_style_o, color=border_color_o),
|
4630
|
+
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4631
|
+
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4632
|
+
)
|
4633
|
+
|
4634
|
+
#! final apply configs
|
4635
|
+
for row in ws[cell_range]:
|
4636
|
+
for cell_ in row:
|
4637
|
+
if cell_font:
|
4638
|
+
cell_.font = cell_font
|
4639
|
+
if cell_fill:
|
4640
|
+
cell_.fill = cell_fill
|
4641
|
+
if cell_alignment:
|
4642
|
+
cell_.alignment = cell_alignment
|
4643
|
+
if border:
|
4644
|
+
cell_.border = border
|
4645
|
+
|
4260
4646
|
if not isinstance(df, pd.DataFrame):
|
4261
4647
|
try:
|
4262
4648
|
print(f"is loading file {os.path.basename(df)}")
|
@@ -4602,11 +4988,10 @@ format_excel(
     print(f"Formatted Excel file saved as:\n{filename}")
 
 
-from IPython.display import display, HTML, Markdown
-
-
 def preview(var):
     """Master function to preview formatted variables in Jupyter."""
+    from bs4 import BeautifulSoup
+    from IPython.display import display, HTML, Markdown
 
     if isinstance(var, str):
         if isa(var, "html"):
@@ -4624,6 +5009,8 @@ def preview(var):
         display(var)
 
     elif isinstance(var, list) or isinstance(var, dict):
+        import json
+
         # Display JSON
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
@@ -4637,6 +5024,8 @@ def preview(var):
         display(Image(filename=var))
 
     elif isinstance(var, dict):
+        import json
+
         # Handle dictionary formatting
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
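preview now renders lists and dicts as a fenced JSON block through IPython's Markdown display, mirroring the two branches added above. The same idea in isolation (only meaningful inside a Jupyter session):

    import json
    from IPython.display import Markdown, display

    payload = {"key": "value", "numbers": [1, 2, 3]}
    json_str = json.dumps(payload, indent=4)
    display(Markdown(f"```json\n{json_str}\n```"))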
@@ -4651,48 +5040,194 @@ def preview(var):
|
|
4651
5040
|
# preview("# This is a Markdown header")
|
4652
5041
|
# preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
|
4653
5042
|
# preview({"key": "value", "numbers": [1, 2, 3]})
|
5043
|
+
|
5044
|
+
|
5045
|
+
def _df_outlier(
|
5046
|
+
data,
|
5047
|
+
columns=None,
|
5048
|
+
method=["zscore", "iqr", "percentile", "iforest"],
|
5049
|
+
min_outlier_method=3, # 至少两种方法检查出outlier
|
5050
|
+
zscore_threshold=3,
|
5051
|
+
iqr_threshold=1.5,
|
5052
|
+
lower_percentile=5,
|
5053
|
+
upper_percentile=95,
|
5054
|
+
):
|
5055
|
+
from scipy.stats import zscore
|
5056
|
+
from sklearn.ensemble import IsolationForest
|
5057
|
+
from sklearn.preprocessing import StandardScaler
|
5058
|
+
|
5059
|
+
col_names_org = data.columns.tolist()
|
5060
|
+
index_names_org = data.index.tolist()
|
5061
|
+
# Separate numeric and non-numeric columns
|
5062
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
5063
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
5064
|
+
|
5065
|
+
if columns is not None:
|
5066
|
+
numeric_data = numeric_data[columns]
|
5067
|
+
elif numeric_data.empty:
|
5068
|
+
raise ValueError("Input data must contain numeric columns.")
|
5069
|
+
|
5070
|
+
outliers_df = pd.DataFrame(index=numeric_data.index)
|
5071
|
+
if isinstance(method, str):
|
5072
|
+
method = [method]
|
5073
|
+
|
5074
|
+
# Z-score method
|
5075
|
+
if "zscore" in method:
|
5076
|
+
z_scores = np.abs(zscore(numeric_data))
|
5077
|
+
outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
|
5078
|
+
|
5079
|
+
# IQR method
|
5080
|
+
if "iqr" in method:
|
5081
|
+
Q1 = numeric_data.quantile(0.25)
|
5082
|
+
Q3 = numeric_data.quantile(0.75)
|
5083
|
+
IQR = Q3 - Q1
|
5084
|
+
lower_bound = Q1 - iqr_threshold * IQR
|
5085
|
+
upper_bound = Q3 + iqr_threshold * IQR
|
5086
|
+
outliers_df["iqr"] = (
|
5087
|
+
(numeric_data < lower_bound) | (numeric_data > upper_bound)
|
5088
|
+
).any(axis=1)
|
5089
|
+
|
5090
|
+
# Percentile method
|
5091
|
+
if "percentile" in method:
|
5092
|
+
lower_bound = numeric_data.quantile(lower_percentile / 100)
|
5093
|
+
upper_bound = numeric_data.quantile(upper_percentile / 100)
|
5094
|
+
outliers_df["percentile"] = (
|
5095
|
+
(numeric_data < lower_bound) | (numeric_data > upper_bound)
|
5096
|
+
).any(axis=1)
|
5097
|
+
|
5098
|
+
# Isolation Forest method
|
5099
|
+
if "iforest" in method:
|
5100
|
+
# iforest method cannot handle NaNs, then fillna with mean
|
5101
|
+
numeric_data_ = numeric_data.fillna(numeric_data.mean())
|
5102
|
+
scaler = StandardScaler()
|
5103
|
+
scaled_data = scaler.fit_transform(numeric_data_)
|
5104
|
+
iso_forest = IsolationForest(contamination=0.05)
|
5105
|
+
outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
|
5106
|
+
|
5107
|
+
# Combine all outlier detections
|
5108
|
+
if len(method) == 4: # all method are used:
|
5109
|
+
outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
|
5110
|
+
else:
|
5111
|
+
outliers_df["outlier"] = outliers_df.any(axis=1)
|
5112
|
+
|
5113
|
+
# Handling Outliers: Remove or Winsorize or Replace with NaN
|
5114
|
+
processed_data = numeric_data.copy()
|
5115
|
+
|
5116
|
+
processed_data.loc[outliers_df["outlier"]] = np.nan
|
5117
|
+
|
5118
|
+
return processed_data
|
5119
|
+
|
5120
|
+
|
5121
|
+
def df_outlier(
|
5122
|
+
data,
|
5123
|
+
columns=None,
|
5124
|
+
method=["zscore", "iqr", "percentile", "iforest"],
|
5125
|
+
min_outlier_method=2, # 至少两种方法检查出outlier
|
5126
|
+
zscore_threshold=3,
|
5127
|
+
iqr_threshold=1.5,
|
5128
|
+
lower_percentile=5,
|
5129
|
+
upper_percentile=95,
|
5130
|
+
):
|
5131
|
+
"""
|
5132
|
+
Usage:
|
5133
|
+
data_out = df_outlier(
|
5134
|
+
data,
|
5135
|
+
columns=["income"],
|
5136
|
+
method="iforest",
|
5137
|
+
min_outlier_method=1)
|
5138
|
+
|
5139
|
+
Advanced outlier detection and handling function.
|
5140
|
+
|
5141
|
+
Parameters:
|
5142
|
+
- data: DataFrame, the input data (numerical).
|
5143
|
+
- method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
|
5144
|
+
- zscore_threshold: float, threshold for Z-score outlier detection (default 3).
|
5145
|
+
- iqr_threshold: float, threshold for IQR method (default 1.5).
|
5146
|
+
- lower_percentile: float, lower percentile for percentile-based outliers (default 5).
|
5147
|
+
- upper_percentile: float, upper percentile for percentile-based outliers (default 95).
|
5148
|
+
- keep_nan: bool, whether to replace outliers with NaN (default True).
|
5149
|
+
- plot: bool, whether to visualize the outliers (default False).
|
5150
|
+
- min_outlier_method: int, minimum number of method that need to flag a row as an outlier (default 2).
|
5151
|
+
- inplace: bool, whether to modify the original `data` DataFrame (default False).
|
5152
|
+
|
5153
|
+
Returns:
|
5154
|
+
- processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
|
5155
|
+
"""
|
5156
|
+
col_names_org = data.columns.tolist()
|
5157
|
+
index_names_org = data.index.tolist()
|
5158
|
+
|
5159
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
5160
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
5161
|
+
|
5162
|
+
_outlier_df_tmp = pd.DataFrame()
|
5163
|
+
for col in numeric_data.columns:
|
5164
|
+
_outlier_df_tmp = pd.concat(
|
5165
|
+
[
|
5166
|
+
_outlier_df_tmp,
|
5167
|
+
_df_outlier(
|
5168
|
+
data=data,
|
5169
|
+
columns=[col],
|
5170
|
+
method=method,
|
5171
|
+
min_outlier_method=min_outlier_method, # 至少两种方法检查出outlier
|
5172
|
+
zscore_threshold=zscore_threshold,
|
5173
|
+
iqr_threshold=iqr_threshold,
|
5174
|
+
lower_percentile=lower_percentile,
|
5175
|
+
upper_percentile=upper_percentile,
|
5176
|
+
),
|
5177
|
+
],
|
5178
|
+
axis=1,
|
5179
|
+
# join="inner",
|
5180
|
+
)
|
5181
|
+
processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
|
5182
|
+
processed_data = processed_data[col_names_org]
|
5183
|
+
return processed_data
|
5184
|
+
|
5185
|
+
|
4654
5186
|
def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
|
4655
5187
|
"""
|
4656
5188
|
Extend a DataFrame by the list elecments in the column.
|
4657
|
-
|
5189
|
+
|
4658
5190
|
Parameters:
|
4659
5191
|
----------
|
4660
5192
|
data : pd.DataFrame
|
4661
5193
|
The input DataFrame to be extended.
|
4662
|
-
|
5194
|
+
|
4663
5195
|
column : str
|
4664
5196
|
The name of the column to be split.
|
4665
|
-
|
5197
|
+
|
4666
5198
|
axis : int, optional
|
4667
|
-
The axis along which to expand the DataFrame.
|
5199
|
+
The axis along which to expand the DataFrame.
|
4668
5200
|
- 0 (default): Expand the specified column into multiple rows.
|
4669
5201
|
- 1: Expand the specified column into multiple columns.
|
4670
|
-
|
5202
|
+
|
4671
5203
|
sep : str, optional
|
4672
5204
|
The separator used to split the values in the specified column.
|
4673
5205
|
Must be provided for the function to work correctly.
|
4674
5206
|
"""
|
4675
|
-
|
4676
|
-
data = data.copy()
|
5207
|
+
|
5208
|
+
data = data.copy()
|
4677
5209
|
mask = data[column].str.contains(sep, na=False)
|
4678
5210
|
data = data.copy()
|
4679
5211
|
if mask.any():
|
4680
|
-
data[column] = (
|
4681
|
-
|
4682
|
-
|
4683
|
-
|
4684
|
-
|
5212
|
+
data[column] = data[column].apply(
|
5213
|
+
lambda x: x.split(sep) if isinstance(x, str) else x
|
5214
|
+
) # Only split if x is a string
|
5215
|
+
|
4685
5216
|
# Strip spaces from each item in the lists
|
4686
|
-
data[column] = data[column].apply(
|
4687
|
-
|
5217
|
+
data[column] = data[column].apply(
|
5218
|
+
lambda x: [item.strip() for item in x] if isinstance(x, list) else x
|
5219
|
+
)
|
5220
|
+
|
4688
5221
|
data = data.explode(column, ignore_index=True)
|
4689
5222
|
return data
|
5223
|
+
|
5224
|
+
|
4690
5225
|
# ! DataFrame
|
4691
5226
|
def df_astype(
|
4692
5227
|
data: pd.DataFrame,
|
4693
5228
|
columns: Optional[Union[str, List[str]]] = None,
|
4694
5229
|
astype: str = "datetime",
|
4695
|
-
skip_row:Union[str,list]=None,
|
5230
|
+
skip_row: Union[str, list] = None,
|
4696
5231
|
fmt: Optional[str] = None,
|
4697
5232
|
inplace: bool = True,
|
4698
5233
|
errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
|
@@ -4750,7 +5285,8 @@ def df_astype(
             "second",
             "time",
             "week",
-            "date",
+            "date",
+            "day",
             "month",
             "year",
         ]
@@ -4758,18 +5294,18 @@
     if not inplace:
         data = data.copy()
     if skip_row is not None:
-        data = data.drop(index=skip_row, errors=
+        data = data.drop(index=skip_row, errors="ignore")
     # If columns is None, apply to all columns
     if columns is None:
         columns = data.columns.tolist()
     # correct the astype input
-    if isinstance(astype,str):
+    if isinstance(astype, str):
         astype = strcmp(astype, astypes)[0]
         print(f"converting as type: {astype}")
-    elif isinstance(astype,dict):
+    elif isinstance(astype, dict):
         for col, dtype in astype.items():
-            dtype=
-            data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+            dtype = "date" if dtype == "day" else dtype
+            data["col"] = data["col"].adtype(strcmp(dtype, astypes)[0])
         return data if not inplace else None
 
     # Ensure columns is a list
@@ -4880,13 +5416,15 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
-    if isinstance(by, str) and
+    if isinstance(by, str) and "count" in by.lower():
         # Count occurrences of each value in the specified column
         value_counts = df[column].value_counts()
 
         # Determine the order based on counts
         count_ascending = kwargs.pop("count_ascending", ascending)
-        sorted_counts = value_counts.sort_values(
+        sorted_counts = value_counts.sort_values(
+            ascending=count_ascending
+        ).index.tolist()
 
         # Convert to a categorical type with the new order
         df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
@@ -5004,6 +5542,7 @@ def df_merge(
     )
     return df_merged
 
+
 def df_drop_duplicates(
     data: pd.DataFrame,
     by: Union[
@@ -5012,16 +5551,16 @@ def df_drop_duplicates(
     keep="first",  # Options: 'first', 'last', or False (drop all duplicates)
     ignore_index=True,
     inplace: bool = False,
-    verbose=True
+    verbose=True,
 ):
     """
     data (pd.DataFrame): DataFrame to drop duplicates from.
     by (str): Specify by to drop duplicates:
         - 'index': Drop duplicates based on the DataFrame index.
        - Column name(s) for row-wise duplicate checking.
-    keep (str): Which duplicates to keep:
-        'first',
-        'last',
+    keep (str): Which duplicates to keep:
+        'first',
+        'last',
         False (drop all duplicates).
     inplace (bool): Whether to modify the original DataFrame in place.
     """
@@ -5031,8 +5570,8 @@ def df_drop_duplicates(
         result = data[~data.index.duplicated(keep=keep)]
     else:
         # Drop duplicates row-wise based on column(s)
-        result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
-    if original_shape!=result.shape or verbose:
+        result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
+    if original_shape != result.shape or verbose:
         print(f"\nshape:{original_shape} (before drop_duplicates)")
         print(f"shape:{result.shape} (after drop_duplicates)")
     if inplace:
@@ -5042,15 +5581,18 @@ def df_drop_duplicates(
         return None
     else:
         return result
+
+
+#! fillna()
 def df_fillna(
     data: pd.DataFrame,
     method: str = "knn",
-    axis: int = 0
+    axis: int = 0,  # column-wise
     constant: float = None,
     n_neighbors: int = 5,  # KNN-specific
-    max_iter: int = 10,
-    inplace: bool =
-    random_state:int =
+    max_iter: int = 10,  # Iterative methods specific
+    inplace: bool = False,
+    random_state: int = 1,
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
@@ -5066,11 +5608,11 @@ def df_fillna(
     - 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
     - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
     # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
-    # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
+    # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
     #   reconstructing the data matrix using low-rank approximation
     # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
     # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
-
+
     axis (int): The axis along which to impute:
     - 0: Impute column-wise (default).
     - 1: Impute row-wise.
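The 'knn' and 'iterative'/'mice' options described above map onto scikit-learn's imputers. A small sketch of both on a toy frame, assuming scikit-learn is installed (df_fillna wires the same classes up with its n_neighbors, max_iter and random_state arguments):

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    df = pd.DataFrame({"A": [1.0, 2.0, np.nan, 4.0], "B": [10.0, np.nan, 30.0, 40.0]})

    knn = KNNImputer(n_neighbors=2)
    df_knn = pd.DataFrame(knn.fit_transform(df), columns=df.columns)

    mice = IterativeImputer(max_iter=10, random_state=1)
    df_mice = pd.DataFrame(mice.fit_transform(df), columns=df.columns)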
@@ -5078,13 +5620,30 @@ def df_fillna(
     inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
 
     """
+    if isinstance(data, pd.Series):
+        data = pd.DataFrame(data)
+    # handle None
+    for col in data.columns:
+        data[col] = data[col].apply(lambda x: np.nan if x is None else x)
+
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+    # Separate numeric and non-numeric columns
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
 
     if data.empty:
         raise ValueError("Input DataFrame is empty.")
 
     # Validate method
-    methods = [
-
+    methods = [
+        "mean",
+        "median",
+        "most_frequent",
+        "constant",
+        "knn",
+        "iterative",
+    ]  # ,"missforest","softimpute","svd"]
     method = strcmp(method, methods)[0]
 
     # If using constant method, ask for a constant value
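The lines added above split the frame with select_dtypes so numeric and non-numeric columns can be imputed separately (df_fillna later falls back to a most_frequent imputer for the non-numeric part). The same split in isolation, on a tiny illustrative frame:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    df = pd.DataFrame({"x": [1.0, np.nan, 3.0], "label": ["a", np.nan, "a"]})

    numeric = df.select_dtypes(include=[np.number])
    non_numeric = df.select_dtypes(exclude=[np.number])

    num_filled = pd.DataFrame(
        SimpleImputer(strategy="mean").fit_transform(numeric),
        columns=numeric.columns, index=numeric.index,
    )
    cat_filled = pd.DataFrame(
        SimpleImputer(strategy="most_frequent").fit_transform(non_numeric),
        columns=non_numeric.columns, index=non_numeric.index,
    )
    result = pd.concat([num_filled, cat_filled], axis=1)[df.columns]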
@@ -5098,51 +5657,76 @@ def df_fillna(
     # Initialize SimpleImputer with the chosen method
     if method == "constant":
         from sklearn.impute import SimpleImputer
+
         imputer = SimpleImputer(strategy=method, fill_value=constant)
     elif method == "knn":
         from sklearn.impute import KNNImputer
+
         imputer = KNNImputer(n_neighbors=n_neighbors)
     elif method == "iterative" or method == "mice":
         from sklearn.experimental import enable_iterative_imputer
         from sklearn.impute import IterativeImputer

-        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
-        #
-        # from missingpy import MissForest
-        # imputer = MissForest(max_iter=max_iter, random_state=random_state)
-    # elif method == "softimpute":
-    #     from fancyimpute import SoftImpute
-    #     imputer = SoftImpute()
-    # elif method == "svd":
-    #     from fancyimpute import IterativeSVD
-    #     imputer = IterativeSVD(max_iters=max_iter)
-    else:  # mean, median, most_frequent
+        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
+    else:  # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
+
         imputer = SimpleImputer(strategy=method)

     # Fit and transform the data
     if axis == 0:
         # Impute column-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data)
     elif axis == 1:
         # Impute row-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")

-
+    imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=
-        columns=
+        index=numeric_data.index if axis == 0 else data.columns,
+        columns=numeric_data.columns if axis == 0 else data.index,
+    )
+    for col in imputed_data.select_dtypes(include=[np.number]).columns:
+        imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
+
+    # Handle non-numeric data imputation
+    if not non_numeric_data.empty:
+        from sklearn.impute import SimpleImputer
+
+        if method == "constant":
+            non_numeric_imputer = SimpleImputer(
+                strategy="constant", fill_value=constant
+            )
+        else:
+            non_numeric_imputer = SimpleImputer(strategy="most_frequent")
+
+        # Impute non-numeric columns column-wise (axis=0)
+        imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
+
+        # Convert imputed non-numeric array back to DataFrame with original index and column names
+        imputed_non_numeric_df = pd.DataFrame(
+            imputed_non_numeric,
+            index=non_numeric_data.index,
+            columns=non_numeric_data.columns,
+        )
+    else:
+        imputed_non_numeric_df = pd.DataFrame(index=data.index)
+
+    imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
+        columns=data.columns
     )

     if inplace:
-
-
+        # Modify the original DataFrame
+        data[:] = imputed_data[col_names_org]
+        return None
     else:
-
+        # Return the modified DataFrame
+        return imputed_data[col_names_org]
+
+
     # # example
     # data = {
     #     "A": [1, 2, np.nan, 4, 5],
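The new df_fillna body follows a split-impute-recombine pattern: numeric columns go through the chosen imputer, object columns get most_frequent (or the constant), and the pieces are concatenated and reindexed back to the original column order. A hedged sketch of that pattern with assumed toy data:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    df = pd.DataFrame({"x": [1.0, np.nan, 3.0], "label": ["a", np.nan, "a"]})
    num = df.select_dtypes(include=[np.number])
    cat = df.select_dtypes(exclude=[np.number])
    num_filled = pd.DataFrame(SimpleImputer(strategy="mean").fit_transform(num),
                              columns=num.columns, index=num.index)
    cat_filled = pd.DataFrame(SimpleImputer(strategy="most_frequent").fit_transform(cat),
                              columns=cat.columns, index=cat.index)
    # recombine in the original column order
    out = pd.concat([num_filled, cat_filled], axis=1).reindex(columns=df.columns)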
@@ -5172,9 +5756,100 @@ def df_fillna(
     # display(df)
     # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))

-
+
+
+def df_encoder(
+    data: pd.DataFrame,
+    method: str = "dummy",  # 'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
+    columns=None,
+    target_column=None,  # Required for 'target' encoding method
+    **kwargs,
+) -> pd.DataFrame:
+    """
+    Methods explained:
+    - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
+
+    - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
+
+    - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
+
+    - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
+
+    - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
+
+    - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
+    """
+
+    # Select categorical columns
+    categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
+    methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
+    method = strcmp(method, methods)[0]
+
+    if columns is None:
+        columns = categorical_cols
+
+    # pd.get_dummies()
+    if method == "dummy":
+        dtype = kwargs.pop("dtype", int)
+        drop_first = kwargs.pop("drop_first", True)
+        try:
+            encoded_df = pd.get_dummies(
+                data[columns], drop_first=drop_first, dtype=dtype, **kwargs
+            )
+            return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+        except Exception as e:
+            # print(f"Warning, no conversion was applied because: {e}")
+            return data
+    # One-hot encoding
+    elif method == "onehot":
+        from sklearn.preprocessing import OneHotEncoder
+
+        encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(
+            encoded_data,
+            columns=encoder.get_feature_names_out(columns),
+            index=data.index,
+        )
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Ordinal encoding
+    elif method == "ordinal":
+        from sklearn.preprocessing import OrdinalEncoder
+
+        encoder = OrdinalEncoder(**kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Label encoding
+    elif method == "label":
+        from sklearn.preprocessing import LabelEncoder
+
+        encoder = LabelEncoder()
+        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Target encoding (Mean of the target for each category)
+    elif method == "target":
+        if target_column is None:
+            raise ValueError("target_column must be provided for target encoding.")
+        from category_encoders import TargetEncoder
+
+        encoder = TargetEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns], data[target_column])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Binary encoding (for high-cardinality categorical variables)
+    elif method == "binary":
+        from category_encoders import BinaryEncoder
+
+        encoder = BinaryEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+
 def df_scaler(
-    data: pd.DataFrame,
+    data: pd.DataFrame,  # should be numeric dtype
     method="standard",
     columns=None,  # default, select all numeric col/row
     inplace=False,
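A usage sketch for the df_encoder function added in the hunk above (toy frame; the 'target' and 'binary' paths additionally require the optional category_encoders package). The method="dummy" branch is a thin wrapper around pandas' get_dummies:

    import pandas as pd

    df = pd.DataFrame({"size": ["S", "M", "L", "M"], "price": [1.0, 2.0, 3.0, 2.5]})
    dummies = pd.get_dummies(df[["size"]], drop_first=True, dtype=int)  # what method="dummy" wraps
    encoded = pd.concat([df.drop(columns=["size"]), dummies], axis=1)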
@@ -5218,9 +5893,8 @@ def df_scaler(
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
-            columns = data.select_dtypes(include=
+            columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
-        print(f"Scaling columns")

         scaled_data = scaler.fit_transform(data[columns])

@@ -5242,7 +5916,7 @@ def df_scaler(
         # Row-wise scaling
         if columns is None:
             columns = data.index.tolist()
-        numeric_rows = data.loc[columns].select_dtypes(include=
+        numeric_rows = data.loc[columns].select_dtypes(include=np.number)
         if numeric_rows.empty:
             raise ValueError("No numeric rows to scale.")

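A short sketch of the axis semantics used by df_scaler (assumed toy data): axis=0 standardizes each numeric column, while axis=1 standardizes across each row by fitting on the transposed values:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
    col_scaled = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns, index=df.index)
    row_scaled = pd.DataFrame(StandardScaler().fit_transform(df.T).T, columns=df.columns, index=df.index)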
@@ -5260,6 +5934,34 @@ def df_scaler(
         scaled_df.loc[numeric_rows.index] = scaled_data
         return scaled_df

+
+def df_special_characters_cleaner(
+    data: pd.DataFrame, where=["column", "content", "index"]
+) -> pd.DataFrame:
+    """
+    to clean special characters:
+    usage:
+        df_special_characters_cleaner(data=df, where='column')
+    """
+    if not isinstance(where, list):
+        where = [where]
+    where_to_clean = ["column", "content", "index"]
+    where_ = [strcmp(i, where_to_clean)[0] for i in where]
+
+    # 1. Clean column names by replacing special characters with underscores
+    if "column" in where_:
+        data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+
+    # 2. Clean only object-type columns (text columns)
+    if "content" in where_:
+        for col in data.select_dtypes(include=["object"]).columns:
+            data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+    if data.index.dtype == "object" and "index" in where_:
+        data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+
+    return data
+
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
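What the r"[^\w\s]" pattern in df_special_characters_cleaner does, on an assumed example column name: every character that is neither a word character nor whitespace is replaced:

    import pandas as pd

    df = pd.DataFrame({"value (mg/L)": [1, 2]})
    df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)  # -> "value _mg_L_"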
@@ -5268,8 +5970,8 @@ def df_cluster(
     scale: bool = True,
     plot: Union[str, list] = "all",
     inplace: bool = True,
-    ax
-)
+    ax=None,
+):
     from sklearn.preprocessing import StandardScaler
     from sklearn.cluster import KMeans
     from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5277,7 +5979,6 @@ def df_cluster(
     import numpy as np
     import pandas as pd
     import matplotlib.pyplot as plt
-    import seaborn as sns

     """
     Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5585,94 +6286,72 @@ def df_reducer(
     umap_neighbors: int = 15,  # UMAP-specific
     umap_min_dist: float = 0.1,  # UMAP-specific
     tsne_perplexity: int = 30,  # t-SNE-specific
+    hue: str = None,  # lda-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True,  # replace the original data
-    plot_:bool = False
+    plot_: bool = False,  # plot scatterplot; without 'hue' the plot is not very informative
+    random_state=1,
+    ax=None,
+    figsize=None,
+    **kwargs,
 ) -> pd.DataFrame:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    reduction, balancing speed and quality of visualization.
-    Parameters:
-    -----------
-    data : pd.DataFrame
-        The input DataFrame (samples x features).
-
-    columns : List[str], optional
-        List of column names to reduce. If None, all columns are used.
-
-    method : str, optional, default="umap"
-        Dimensionality reduction method, either "pca" or "umap".
-
-    n_components : int, optional, default=50
-        Number of components for PCA or UMAP.
-
-    umap_neighbors : int, optional, default=15
-        Number of neighbors considered for UMAP embedding.
-
-    umap_min_dist : float, optional, default=0.1
-        Minimum distance between points in UMAP embedding.
-
-    scale : bool, optional, default=True
-        Whether to scale the data using StandardScaler.
-
-    fill_missing : bool, optional, default=True
-        Whether to fill missing values using the mean before applying PCA/UMAP.
+    dict_methods = {
+        #! Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
+        "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
+        "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
+        "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
+        "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
+        #! Non-linear Dimensionality Reduction (Manifold Learning)
+        "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
+        "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
+        "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
+        "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
+        "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
+        "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing). ICA is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
+        #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
+        "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
+    }

-    Returns:
-    --------
-    reduced_df : pd.DataFrame
-        DataFrame with the reduced dimensions.
-    """
-
-    """
-    PCA: explained_variance:
-        indicates the proportion of the dataset's total variance that each principal
-        component (PC) explains. It gives you a sense of how much information
-        (or variance) is captured by each PC
-    Interpretation:
-        - Higher values indicate that the corresponding PC captures more variance.
-        - The sum of the explained variances for all PCs equals 1 (or 100%).
-        - If the first few components explain a high percentage (e.g., 90%),
-          it means you can reduce the dimensionality of the data significantly without losing much information.
-    Use case:
-        You may plot a scree plot, which shows the explained variance for each PC, to help decide
-        how many components to keep for analysis.
-
-    PCA: Singular values:
-        represent the magnitude of variance along each principal component. Mathematically,
-        they are the square roots of the eigenvalues of the covariance matrix.
-    Interpretation:
-        Larger singular values indicate that the associated PC captures more variance.
-        Singular values are related to the scale of the data. If the data are scaled
-        before PCA (e.g., standardized), then the singular values will provide a measure
-        of the spread of data along each PC.
-    Use case:
-        Singular values help quantify the contribution of each principal component in a
-        similar way to the explained variance. They are useful in understanding the overall
-        structure of the data.
-    """
     from sklearn.preprocessing import StandardScaler
     from sklearn.impute import SimpleImputer

-
-
-
+    if plot_:
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+    # Check valid method input
+    methods = [
+        "pca",
+        "umap",
+        "tsne",
+        "factor",
+        "isolation_forest",
+        "lda",
+        "kpca",
+        "ica",
+        "mds",
+        "lle",
+        "svd",
+    ]
+    method = strcmp(method, methods)[0]
+    print(f"\nprocessing with using {dict_methods[method]}:")
+    xlabel, ylabel = None, None
+    if columns is None:
+        columns = data.select_dtypes(include="number").columns.tolist()
+    if hue is None:
+        hue = data.select_dtypes(exclude="number").columns.tolist()
+    if isinstance(hue, list):
+        print("Warning: hue is a list, only select the 1st one")
+        hue = hue[0]
+    if not hue:
+        # Select columns if specified, else use all columns
+        X = data[columns].values if columns else data.values
+    else:
+        # Select columns to reduce and hue for LDA
+        X = data[columns].values if columns else data.drop(columns=[hue]).values
+        y = data[hue].values
+    print(X.shape)
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
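For orientation on the method catalogue added above, a minimal sketch of two of the reducers (assumes scikit-learn; the UMAP line additionally needs the optional umap-learn package and is therefore shown commented out):

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(1).rand(100, 10)
    X_pca = PCA(n_components=2).fit_transform(X)  # linear, variance-preserving projection
    # import umap
    # X_umap = umap.UMAP(n_neighbors=15, min_dist=0.1).fit_transform(X)  # non-linear embedding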
@@ -5683,15 +6362,13 @@ def df_reducer(
         scaler = StandardScaler()
         X = scaler.fit_transform(X)

-    # Check valid method input
-    methods=["pca", "umap","tsne","factor","isolation_forest"]
-    method=strcmp(method, methods)[0]
     # Apply PCA if selected
-    if method == "pca":
+    if method == "pca":
         from sklearn.decomposition import PCA
+
         pca = PCA(n_components=n_components)
         X_reduced = pca.fit_transform(X)
-
+
         # Additional PCA information
         explained_variance = pca.explained_variance_ratio_
         singular_values = pca.singular_values_
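A short sketch relating the two PCA attributes used here, on assumed random data: explained_variance_ratio_ gives the fraction of total variance per component, and the squared singular values divided by (n_samples - 1) reproduce the per-component variances:

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(0).rand(50, 5)
    pca = PCA(n_components=3).fit(X)
    print(pca.explained_variance_ratio_)               # fraction of total variance per PC
    print(pca.singular_values_ ** 2 / (X.shape[0] - 1))  # equals pca.explained_variance_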
@@ -5707,36 +6384,72 @@ def df_reducer(
         # Plot explained variance
         cumulative_variance = np.cumsum(explained_variance)
         plt.figure(figsize=(8, 5))
-        plt.plot(
+        plt.plot(
+            range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o"
+        )
         plt.title("Cumulative Explained Variance by Principal Components")
         plt.xlabel("Number of Principal Components")
         plt.ylabel("Cumulative Explained Variance")
         plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
-        plt.axvline(
+        plt.axvline(
+            x=n_components,
+            color="g",
+            linestyle="--",
+            label=f"n_components = {n_components}",
+        )
         plt.legend()
         plt.grid()
         plt.show()

         # Prepare reduced DataFrame with additional PCA info
         pca_df = pd.DataFrame(
-            X_reduced,
-
-
+            X_reduced,
+            index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)],
+        )
         # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
         # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
         # Expand explained variance to multiple columns if needed
         for i in range(n_components):
-            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
+            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
+                format(explained_variance[i] * 100, ".3f") + "%", (pca_df.shape[0], 1)
+            )
         for i in range(n_components):
-            pca_df[f"Singular Values PC_{i+1}"] = np.tile(
+            pca_df[f"Singular Values PC_{i+1}"] = np.tile(
+                singular_values[i], (pca_df.shape[0], 1)
+            )
+        if hue:
+            pca_df[hue] = y
+    elif method == "lda":
+        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

+        if "hue" not in locals() or hue is None:
+            raise ValueError(
+                "LDA requires a 'hue' col parameter to specify class labels."
+            )
+
+        lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
+        X_reduced = lda_reducer.fit_transform(X, y)
+
+        # Prepare reduced DataFrame with additional LDA info
+        lda_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"LDA_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print(f"LDA completed: Reduced to {n_components} components.")
+            print("Class separability achieved by LDA.")
+        if hue:
+            lda_df[hue] = y
     # Apply UMAP if selected
     elif method == "umap":
         import umap
+
         umap_reducer = umap.UMAP(
             n_neighbors=umap_neighbors,
             min_dist=umap_min_dist,
-            n_components=n_components
+            n_components=n_components,
         )
         X_reduced = umap_reducer.fit_transform(X)

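The cumulative-variance plot above is typically used to pick a component count; a hedged sketch of that choice, assuming the same 95% threshold drawn by the axhline in the code:

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(0).rand(200, 20)
    ratios = PCA().fit(X).explained_variance_ratio_
    n_keep = int(np.searchsorted(np.cumsum(ratios), 0.95) + 1)  # smallest k reaching 95% variance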
@@ -5751,41 +6464,57 @@ def df_reducer(

         # Prepare reduced DataFrame with additional UMAP info
         umap_df = pd.DataFrame(
-            X_reduced,
-
+            X_reduced,
+            index=data.index,
+            columns=[f"UMAP_{i+1}" for i in range(n_components)],
         )
         umap_df["Embedding"] = embedding[:, 0]  # Example of embedding data
         umap_df["Trustworthiness"] = trustworthiness[:, 0]  # Trustworthiness metric
+        if hue:
+            umap_df[hue] = y
     elif method == "tsne":
         from sklearn.manifold import TSNE
-        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
-        X_reduced = tsne.fit_transform(X)

-
+        tsne = TSNE(
+            n_components=n_components,
+            perplexity=tsne_perplexity,
+            random_state=random_state,
+        )
+        X_reduced = tsne.fit_transform(X)
         tsne_df = pd.DataFrame(
-            X_reduced,
-
+            X_reduced,
+            index=data.index,
+            columns=[f"tSNE_{i+1}" for i in range(n_components)],
         )
-        tsne_df["Perplexity"] = np.tile(
-
+        tsne_df["Perplexity"] = np.tile(
+            f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1)
+        )
+        if hue:
+            tsne_df[hue] = y
     # Apply Factor Analysis if selected
     elif method == "factor":
         from sklearn.decomposition import FactorAnalysis
-
+
+        factor = FactorAnalysis(n_components=n_components, random_state=random_state)
         X_reduced = factor.fit_transform(X)
         # Factor Analysis does not directly provide explained variance, but we can approximate it
         fa_variance = factor.noise_variance_
         # Prepare reduced DataFrame with additional Factor Analysis info
         factor_df = pd.DataFrame(
-            X_reduced,
-
+            X_reduced,
+            index=data.index,
+            columns=[f"Factor_{i+1}" for i in range(n_components)],
         )
-        factor_df["Noise Variance"] = np.tile(
-
+        factor_df["Noise Variance"] = np.tile(
+            format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1)
+        )
+        if hue:
+            factor_df[hue] = y
     # Apply Isolation Forest for outlier detection if selected
     elif method == "isolation_forest":
         from sklearn.decomposition import PCA
         from sklearn.ensemble import IsolationForest
+
         # Step 1: Apply PCA for dimensionality reduction to 2 components
         pca = PCA(n_components=n_components)
         X_pca = pca.fit_transform(X)
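A hedged sketch of the isolation-forest path just shown (assumed toy data): PCA is used only to get plottable coordinates, while the forest itself is fit on the full feature matrix and contributes a score and a normal/outlier label per row:

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.ensemble import IsolationForest

    X = np.random.RandomState(1).rand(300, 8)
    X_pca = PCA(n_components=2).fit_transform(X)
    iso = IsolationForest(n_estimators=100, contamination="auto", random_state=1).fit(X)
    scores = iso.decision_function(X)  # larger = less anomalous
    labels = iso.predict(X)            # 1 = normal, -1 = outlier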
@@ -5795,65 +6524,139 @@ def df_reducer(

         # Prepare reduced DataFrame with additional PCA info
         iso_forest_df = pd.DataFrame(
-            X_pca, index=data.index,
-            columns=[f"PC_{i+1}" for i in range(n_components)]
+            X_pca, index=data.index, columns=[f"PC_{i+1}" for i in range(n_components)]
         )

-        isolation_forest = IsolationForest(
+        isolation_forest = IsolationForest(
+            n_estimators=100, contamination="auto", random_state=1
+        )
         isolation_forest.fit(X)
-        anomaly_scores = isolation_forest.decision_function(
+        anomaly_scores = isolation_forest.decision_function(
+            X
+        )  # Anomaly score: larger is less anomalous
         # Predict labels: 1 (normal), -1 (anomaly)
-        anomaly_labels = isolation_forest.fit_predict(X)
+        anomaly_labels = isolation_forest.fit_predict(X)
         # Add anomaly scores and labels to the DataFrame
         iso_forest_df["Anomaly Score"] = anomaly_scores
         iso_forest_df["Anomaly Label"] = anomaly_labels
         # add info from pca
         for i in range(n_components):
-            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
+            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
+                format(explained_variance[i] * 100, ".3f") + "%",
+                (iso_forest_df.shape[0], 1),
+            )
         for i in range(n_components):
-            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
+            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
+                singular_values[i], (iso_forest_df.shape[0], 1)
+            )
+        if hue:
+            iso_forest_df[hue] = y
+    # * Apply Kernel PCA if selected
+    elif method == "kpca":
+        from sklearn.decomposition import KernelPCA
+
+        kpca = KernelPCA(
+            n_components=n_components, kernel="rbf", random_state=random_state
+        )
+        X_reduced = kpca.fit_transform(X)
+
+        # Prepare reduced DataFrame with KPCA info
+        kpca_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"KPCA_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Kernel PCA completed with RBF kernel.")
+        if hue:
+            kpca_df[hue] = y
+    # * Apply ICA if selected
+    elif method == "ica":
+        from sklearn.decomposition import FastICA
+
+        ica = FastICA(n_components=n_components, random_state=random_state)
+        X_reduced = ica.fit_transform(X)
+
+        # Prepare reduced DataFrame with ICA info
+        ica_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"ICA_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Independent Component Analysis (ICA) completed.")
+        if hue:
+            ica_df[hue] = y
+    # * Apply MDS if selected
+    elif method == "mds":
+        from sklearn.manifold import MDS
+
+        mds = MDS(n_components=n_components, random_state=random_state)
+        X_reduced = mds.fit_transform(X)
+
+        # Prepare reduced DataFrame with MDS info
+        mds_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"MDS_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Multidimensional Scaling (MDS) completed.")
+        if hue:
+            mds_df[hue] = y
+    # * Apply Locally Linear Embedding (LLE) if selected
+    elif method == "lle":
+        from sklearn.manifold import LocallyLinearEmbedding
+
+        lle = LocallyLinearEmbedding(
+            n_components=n_components,
+            n_neighbors=umap_neighbors,
+            random_state=random_state,
+        )
+        X_reduced = lle.fit_transform(X)
+
+        # Prepare reduced DataFrame with LLE info
+        lle_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"LLE_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Locally Linear Embedding (LLE) completed.")
+        if hue:
+            lle_df[hue] = y
+    # * Apply Singular Value Decomposition (SVD) if selected
+    elif method == "svd":
+        # Using NumPy's SVD for dimensionality reduction
+        U, s, Vt = np.linalg.svd(X, full_matrices=False)
+        X_reduced = U[:, :n_components] * s[:n_components]
+
+        # Prepare reduced DataFrame with SVD info
+        svd_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"SVD_{i+1}" for i in range(n_components)],
+        )
+        if hue:
+            svd_df[hue] = y
+        if debug:
+            print("Singular Value Decomposition (SVD) completed.")

     # Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
         colname_met = "PC_"
-
-
-            data=pca_df,
-            x="PC_1",
-            y="PC_2",
-            # hue="condition",
-        )
+        xlabel = f"PC_1 ({pca_df['Explained Variance PC_1'].tolist()[0]})"
+        ylabel = f"PC_2 ({pca_df['Explained Variance PC_2'].tolist()[0]})"
     elif method == "umap":
         reduced_df = umap_df
         colname_met = "UMAP_"
-        if plot_:
-            sns.scatterplot(
-                data=umap_df,
-                x="UMAP_1",
-                y="UMAP_2",
-                # hue="condition",
-            )
     elif method == "tsne":
         reduced_df = tsne_df
-        colname_met = "
-        if plot_:
-            sns.scatterplot(
-                data=tsne_df,
-                x="tSNE_1",
-                y="tSNE_2",
-                # hue="batch",
-            )
+        colname_met = "tSNE_"
     elif method == "factor":
         reduced_df = factor_df
         colname_met = "Factor_"
-        if plot_:
-            sns.scatterplot(
-                data=factor_df,
-                x="Factor_1",
-                y="Factor_2",
-                # hue="batch",
-            )
     elif method == "isolation_forest":
         reduced_df = iso_forest_df  # Already a DataFrame for outliers
         colname_met = "PC_"
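The new "svd" branch above performs a plain rank-k truncation; as a sketch on assumed data, the projected coordinates U[:, :k] * s[:k] are the same as projecting X onto the first k right singular vectors:

    import numpy as np

    X = np.random.RandomState(0).rand(30, 10)
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    X_k = U[:, :3] * s[:3]  # equals X @ Vt[:3].T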
@@ -5862,7 +6665,8 @@ def df_reducer(
             data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
             x="PC_1",
             y="PC_2",
-            label="normal",
+            label="normal",
+            c="b",
         )
         ax = sns.scatterplot(
             ax=ax,
@@ -5870,29 +6674,80 @@ def df_reducer(
             x="PC_1",
             y="PC_2",
             c="r",
-            label="outlier",
+            label="outlier",
+            marker="+",
+            s=30,
         )
+    elif method == "lda":
+        reduced_df = lda_df
+        colname_met = "LDA_"
+    elif method == "kpca":
+        reduced_df = kpca_df
+        colname_met = "KPCA_"
+    elif method == "ica":
+        reduced_df = ica_df
+        colname_met = "ICA_"
+    elif method == "mds":
+        reduced_df = mds_df
+        colname_met = "MDS_"
+    elif method == "lle":
+        reduced_df = lle_df
+        colname_met = "LLE_"
+    elif method == "svd":
+        reduced_df = svd_df
+        colname_met = "SVD_"
+    # Quick plots
+    if plot_ and (not method in ["isolation_forest"]):
+        from .plot import plotxy

+        if ax is None:
+            if figsize is None:
+                _, ax = plt.subplots(figsize=cm2inch(8, 8))
+            else:
+                _, ax = plt.subplots(figsize=figsize)
+        else:
+            ax = ax.cla()
+        ax = plotxy(
+            data=reduced_df,
+            x=colname_met + "1",
+            y=colname_met + "2",
+            hue=hue,
+            s=1,
+            edgecolor="none",
+            kind="scater",
+            figsets=dict(
+                legend=dict(loc="best", markerscale=4),
+                xlabel=xlabel if xlabel else None,
+                ylabel=ylabel if ylabel else None,
+            ),
+            ax=ax,
+            verbose=False,
+            **kwargs,
+        )

     if inplace:
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+            data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
         # Add extra info for PCA/UMAP
         if method == "pca":
             for i in range(n_components):
-                data[f"Explained Variance PC_{i+1}"] = reduced_df[
+                data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[
+                    :, f"Explained Variance PC_{i+1}"
+                ]
             for i in range(n_components):
-                data[f"Singular Values PC_{i+1}"] = reduced_df[
-
+                data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[
+                    :, f"Singular Values PC_{i+1}"
+                ]
+        elif method == "umap":
             for i in range(n_components):
-                data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
-                data["Embedding"] = reduced_df["Embedding"]
-                data["Trustworthiness"] = reduced_df["Trustworthiness"]
+                data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
+            data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
+            data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
+
         return None  # No return when inplace=True
-

-    return reduced_df
+    return reduced_df


     # example:
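The switch from plain column assignment to .loc writes in the inplace branch above is the usual way to write back into a frame that may be a view; a tiny sketch of the pattern (assumed names mirroring the code):

    import pandas as pd

    data = pd.DataFrame({"a": [1, 2, 3]})
    reduced_df = pd.DataFrame({"PC_1": [0.1, 0.2, 0.3]}, index=data.index)
    data.loc[:, "PC_1"] = reduced_df.loc[:, "PC_1"]  # index-aligned assignment on the original frame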
@@ -5922,6 +6777,7 @@ def plot_cluster(
     """
     import seaborn as sns
     from sklearn.metrics import silhouette_samples
+    import matplotlib.pyplot as plt

     if metrics is None:
         metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
@@ -6152,10 +7008,10 @@ def use_pd(
     verbose=True,
     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
 ):
-    default_settings = fload(dir_json, output=
+    default_settings = fload(dir_json, output="json")
     valid_kinds = list(default_settings.keys())
     kind = strcmp(func_name, valid_kinds)[0]
-    usage=default_settings[kind]
+    usage = default_settings[kind]
     if verbose:
         for i, i_ in enumerate(ssplit(usage, by=",")):
             i_ = i_.replace("=", "\t= ") + ","
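use_pd fuzzy-matches a pandas function name against the bundled usages_pd.json and prints its arguments one per line. A hedged, standard-library sketch of the same lookup pattern (file path and keys are hypothetical; the real function relies on py2ls' fload and strcmp helpers):

    import json, difflib

    with open("usages_pd.json") as fh:  # assumed local copy of the bundled JSON
        usages = json.load(fh)
    kind = difflib.get_close_matches("read_csv", list(usages), n=1)[0]
    print(usages[kind])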