py2ls 0.2.4.6__py3-none-any.whl → 0.2.4.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- py2ls/.git/index +0 -0
- py2ls/batman.py +32 -1
- py2ls/bio.py +25 -22
- py2ls/data/usages_sns.json +2 -1
- py2ls/ips.py +1258 -724
- py2ls/ml2ls.py +1841 -390
- py2ls/plot.py +500 -235
- {py2ls-0.2.4.6.dist-info → py2ls-0.2.4.8.dist-info}/METADATA +2 -2
- {py2ls-0.2.4.6.dist-info → py2ls-0.2.4.8.dist-info}/RECORD +10 -10
- {py2ls-0.2.4.6.dist-info → py2ls-0.2.4.8.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -1,62 +1,38 @@
 import numpy as np
-import pandas as pd
-
-import
-import matplotlib
-import matplotlib.pyplot as plt
-import matplotlib.ticker as tck
-from cycler import cycler
-from mpl_toolkits.mplot3d import Axes3D
-import seaborn as sns
-
-from sklearn.kernel_approximation import KERNEL_PARAMS
-from sympy import is_increasing
-import sys, os, shutil, re, yaml, json, subprocess
-import importlib.util
-import time
-from dateutil import parser
-from datetime import datetime
-import schedule
-
-from PIL import Image, ImageEnhance, ImageOps, ImageFilter
-from rembg import remove, new_session
-
-import docx
-from fpdf import FPDF
-from lxml import etree
-from docx import Document
-from PyPDF2 import PdfReader
-from pptx import Presentation
-from pptx.util import Inches
-from pdf2image import convert_from_path, pdfinfo_from_path
-from nltk.tokenize import sent_tokenize, word_tokenize
-import nltk  # nltk.download("punkt")
-from docx2pdf import convert
-import img2pdf as image2pdf
-import nbformat
-from nbconvert import MarkdownExporter
-
-from itertools import pairwise
-from box import Box, BoxList
-from numerizer import numerize
-from tqdm import tqdm
-import mimetypes
-from pprint import pp
-from collections import Counter
-from fuzzywuzzy import fuzz, process
-from langdetect import detect
-from duckduckgo_search import DDGS
+import pandas as pd
+import sys, os
+from IPython.display import display
 from typing import List, Optional, Union
-from bs4 import BeautifulSoup
-
-from . import netfinder
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
 except NameError:
     pass
 
+import warnings
+warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
+warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+
+def run_once_within(duration=60):  # default 60s
+    import time
+    """
+    usage:
+    if run_once_within():
+        print("This code runs once per minute.")
+    else:
+        print("The code has already been run in the last minute.")
+    """
+    if not hasattr(run_once_within, "time_last"):
+        run_once_within.time_last = None
+    time_curr = time.time()
+
+    if (run_once_within.time_last is None) or (time_curr - run_once_within.time_last >= duration):
+        run_once_within.time_last = time_curr  # Update the last execution time
+        return True
+    else:
+        return False
+
+
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     Add the Chinese (default) font to the font manager
@@ -155,6 +131,8 @@ def run_every(when: str = None, job=None, wait: int = 60):
     :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
     :param job: The function to be scheduled.
     """
+    import schedule
+    import time
     if job is None:
         print("No job provided!")
         return
@@ -200,6 +178,8 @@ def run_at(when: str, job=None, wait: int = 60):
     :param job: The function to be scheduled.
     :param wait: The sleep interval between checks in seconds.
     """
+    from datetime import datetime
+    import time
     if job is None:
         print("No job provided!")
         return
@@ -279,6 +259,7 @@ def get_timezone(timezone: str | list = None):
 
 def is_package_installed(package_name):
     """Check if a package is installed."""
+    import importlib.util
     package_spec = importlib.util.find_spec(package_name)
     return package_spec is not None
 
@@ -291,6 +272,7 @@ def upgrade(module="py2ls",uninstall=False):
         module (str): The name of the module to install/upgrade.
         uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
     """
+    import subprocess
     if not is_package_installed(module):
         try:
             subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +309,7 @@ def get_version(pkg):
 
 
 def rm_folder(folder_path, verbose=True):
+    import shutil
     try:
         shutil.rmtree(folder_path)
         if verbose:
@@ -345,6 +328,7 @@ def fremove(path, verbose=True):
     """
     try:
         if os.path.isdir(path):
+            import shutil
             shutil.rmtree(path)
             if verbose:
                 print(f"Successfully deleted folder {path}")
@@ -360,23 +344,30 @@ def fremove(path, verbose=True):
         print(f"Failed to delete {path}. Reason: {e}")
 
 
-def get_cwd(verbose: bool = True):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# def get_cwd(verbose: bool = True):
+#     """
+#     get_cwd: to get the current working directory
+#     Args:
+#         verbose (bool, optional): to show which function is use. Defaults to True.
+#     """
+#     try:
+#         script_dir = os.path.dirname(os.path.abspath(__file__))
+#         if verbose:
+#             print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
+#     except NameError:
+#         # This works in an interactive environment (like a Jupyter notebook)
+#         script_dir = os.getcwd()
+#         if verbose:
+#             print("os.getcwd():", script_dir)
+#     return script_dir
+
+
+def get_cwd():
+    from pathlib import Path
+    # Get the current script's directory as a Path object
+    current_directory = Path(__file__).resolve().parent
+
+    return current_directory
 
 def search(
     query,
@@ -388,7 +379,7 @@ def search(
     dir_save=dir_save,
     **kwargs,
 ):
-
+    from duckduckgo_search import DDGS
     if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
         res = pd.DataFrame(results)
@@ -421,7 +412,7 @@ def echo(*args, **kwargs):
        str: the answer from ai
    """
    global dir_save
-
+    from duckduckgo_search import DDGS
    query = None
    model = kwargs.get("model", "gpt")
    verbose = kwargs.get("verbose", True)
@@ -469,8 +460,11 @@ def echo(*args, **kwargs):
    model_valid = valid_mod_name(model)
    res = DDGS().chat(query, model=model_valid)
    if verbose:
+        from pprint import pp
        pp(res)
    if log:
+        from datetime import datetime
+        import time
        dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
        res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
        if bool(os.path.basename(dir_save)):
@@ -492,6 +486,7 @@ def ai(*args, **kwargs):
 
 
 def detect_lang(text, output="lang", verbose=True):
+    from langdetect import detect
     dir_curr_script = os.path.dirname(os.path.abspath(__file__))
     dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -550,19 +545,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         for lst in flattened_lists[1:]:
             shared_elements.intersection_update(lst)
     else:
+        from collections import Counter
         all_elements = [item for sublist in flattened_lists for item in sublist]
         element_count = Counter(all_elements)
         # Get elements that appear in at least n_shared lists
         shared_elements = [item for item, count in element_count.items() if count >= n_shared]
 
-    shared_elements = flatten(shared_elements, verbose=verbose)
+    shared_elements = flatten(shared_elements, verbose=verbose)
     if verbose:
         elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
     print("********* checking shared elements *********")
     return shared_elements
 
-def
+def not_shared(*args, strict=True, n_shared=2, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+        list1 = [1, 8, 3, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        not_shared(list1,list2)# output [1,3]
+    """
+    _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared=[item for item in list1 if item not in _common]
+    return _not_shared
+
+
+def flatten(nested: Any, unique_list=True, verbose=False):
     """
     Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
     Parameters:
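Note: `not_shared` builds on `shared` and `flatten` above. Based on the docstring in this hunk, a standalone sketch of the same set difference, re-implemented with plain Python so it runs without py2ls (the de-duplication that py2ls's `flatten` performs is pointed out in the comment):

```python
def not_shared_sketch(list1, list2):
    # Items of list1 that never occur in list2, original order preserved.
    common = set(list1) & set(list2)
    return [item for item in list1 if item not in common]

list1 = [1, 8, 3, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
print(not_shared_sketch(list1, list2))  # [1, 3, 3]; py2ls flattens/uniquifies, so its docstring reports [1, 3]
```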
@@ -603,7 +613,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     Returns:
     tuple: A tuple containing the best match and its index in the candidates list.
     """
-
+    from fuzzywuzzy import fuzz, process
     def to_lower(s, ignore_case=True):
         # Converts a string or list of strings to lowercase if ignore_case is True.
         if ignore_case:
@@ -729,6 +739,7 @@ def cn2pinyin(
     return pinyin_flat
 
 def counter(list_, verbose=True):
+    from collections import Counter
     c = Counter(list_)
     # Print the name counts
     for item, count in c.items():
@@ -757,7 +768,7 @@ def str2time(time_str, fmt="24"):
            %p represents AM or PM.
     - str: The converted time string.
     """
-
+    from datetime import datetime
     def time_len_corr(time_str):
         time_str_ = (
             ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -818,6 +829,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
     Returns:
     - str: The converted date string.
     """
+    from dateutil import parser
     try:
         date_obj = parser.parse(date_str)
     except ValueError as e:
@@ -834,6 +846,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
 
 
 def str2num(s, *args, **kwargs):
+    import re
     delimiter = kwargs.get("sep", None)
     round_digits = kwargs.get("round", None)
     if delimiter is not None:
@@ -849,6 +862,7 @@ def str2num(s, *args, **kwargs):
     try:
         num = float(s)
     except ValueError:
+        from numerizer import numerize
         try:
             numerized = numerize(s)
             num = int(numerized) if "." not in numerized else float(numerized)
@@ -1016,7 +1030,7 @@ def px2inch(*px, dpi=300) -> list:
     return [i / dpi for i in px]
 
 
-def
+def inch2cm(*cm) -> list:
     """
     cm2inch: converts centimeter measurements to inches.
     Usage:
@@ -1037,24 +1051,31 @@ def cm2inch(*cm) -> list:
 def inch2px(*inch, dpi=300) -> list:
     """
     inch2px: converts inch measurements to pixels based on the given dpi.
+
     Usage:
     inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
+
+    Parameters:
+    inch : float, list, or tuple
+        Single or multiple measurements in inches to convert to pixels.
+    dpi : int, optional (default=300)
+        Dots per inch (DPI), representing the pixel density.
+
     Returns:
-    list: in pixels
+    list: Converted measurements in pixels.
     """
-    # Case 1: When the user passes a single argument that is a list or tuple,
+    # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
     if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
-        # If the input is a single list or tuple, we unpack its elements and convert each to pixels
         return [i * dpi for i in inch[0]]
-
+
+    # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
     else:
-        # Here, we convert each individual argument directly to pixels
         return [i * dpi for i in inch]
 
 
-
+
+def cm2inch(*inch) -> list:
     """
-    inch2cm: converts inch measurements to centimeters.
     Usage:
     inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
     Returns:
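Note: the unit-conversion helpers in this region are thin wrappers around a dpi factor. A hedged sketch of the inch/pixel round trip described in the docstrings, re-implemented locally rather than imported from py2ls:

```python
def inch2px(*inch, dpi=300):
    # Accept inch2px(1, 2) or inch2px([1, 2]) and scale each value by dpi.
    values = inch[0] if len(inch) == 1 and isinstance(inch[0], (list, tuple)) else inch
    return [i * dpi for i in values]

def px2inch(*px, dpi=300):
    values = px[0] if len(px) == 1 and isinstance(px[0], (list, tuple)) else px
    return [i / dpi for i in values]

print(inch2px(1, 2, dpi=300))        # [300, 600]
print(px2inch([300, 600], dpi=300))  # [1.0, 2.0]
```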
@@ -1169,6 +1190,7 @@ def paper_size(paper_type_str="a4"):
 
 
 def docx2pdf(dir_docx, dir_pdf=None):
+    from docx2pdf import convert
     if dir_pdf:
         convert(dir_docx, dir_pdf)
     else:
@@ -1176,6 +1198,7 @@ def docx2pdf(dir_docx, dir_pdf=None):
 
 
 def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+    import img2pdf as image2pdf
     def mm_to_point(size):
         return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))
 
@@ -1227,6 +1250,9 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
 
 
 def pdf2ppt(dir_pdf, dir_ppt):
+    from PyPDF2 import PdfReader
+    from pptx.util import Inches
+    from pptx import Presentation
     prs = Presentation()
 
     # Open the PDF file
@@ -1255,6 +1281,7 @@ def pdf2ppt(dir_pdf, dir_ppt):
 
 
 def ssplit(text, by="space", verbose=False, strict=False, **kws):
+    import re
     if isinstance(text, list):
         nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
         flat_list = [item for sublist in nested_list for item in sublist]
@@ -1302,6 +1329,8 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
         return [text[i : i + length] for i in range(0, len(text), length)]
 
     def split_by_sent_num(text, n=10):
+        from nltk.tokenize import sent_tokenize
+        from itertools import pairwise
         # split text into sentences
         text_split_by_sent = sent_tokenize(text)
         cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1374,10 +1403,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
             print(f"splited by camel_case")
         return split_by_camel_case(text)
     elif ("word" in by) and not strict:
+        from nltk.tokenize import word_tokenize
         if verbose:
             print(f"splited by word")
         return word_tokenize(text)
     elif ("sen" in by and not "num" in by) and not strict:
+        from nltk.tokenize import sent_tokenize
         if verbose:
             print(f"splited by sentence")
         return sent_tokenize(text)
@@ -1427,9 +1458,11 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
 
 
 def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
+    from pdf2image import convert_from_path, pdfinfo_from_path
     df_dir_img_single_page = pd.DataFrame()
     dir_single_page = []
     if verbose:
+        from pprint import pp
         pp(pdfinfo_from_path(dir_pdf))
     if isinstance(page, tuple) and page:
         page = list(page)
@@ -1548,6 +1581,7 @@ def unzip(dir_path, output_dir=None):
     # If the output directory already exists, remove it and replace it
     if os.path.exists(output_dir):
         if os.path.isdir(output_dir):  # check if it is a folder
+            import shutil
             shutil.rmtree(output_dir)  # remove folder
         else:
             os.remove(output_dir)  # remove file
@@ -1560,13 +1594,27 @@ def unzip(dir_path, output_dir=None):
             tar_ref.extractall(output_dir)
         return output_dir
     # Handle .gz files
-    if dir_path.endswith(".gz"):
+    if dir_path.endswith(".gz") or dir_path.endswith(".gzip"):
        import gzip
 
        output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
-
-
-
+        try:
+            import shutil
+            with gzip.open(dir_path, "rb") as gz_file:
+                with open(output_file, "wb") as out_file:
+                    shutil.copyfileobj(gz_file, out_file)
+            print(f"unzipped '{dir_path}' to '{output_file}'")
+        except FileNotFoundError:
+            print(f"Error: The file '{dir_path}' was not found.")
+        except PermissionError:
+            print(f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'.")
+        except Exception as e:
+            try:
+                import tarfile
+                with tarfile.open(dir_path, 'r:gz') as tar:
+                    tar.extractall(path=output_file)
+            except Exception as final_e:
+                print(f"An final unexpected error occurred: {final_e}")
        return output_file
 
    # Handle .zip files
@@ -1648,6 +1696,11 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     False: normal
 
     """
+    if not isinstance(df, pd.DataFrame):
+        if verbose:
+            print('not pd.DataFrame')
+        return False
+    df.columns = df.columns.astype(str)  # cast column names to str so the delimiter counts below work
     # Initialize a list to hold messages about abnormalities
     messages = []
     is_abnormal = False
@@ -1675,25 +1728,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
-
+    if verbose:
+        print("1",is_abnormal)
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
-
+    if verbose:
+        print("2",is_abnormal)
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
         if verbose:
             print(f'delimiter_counts[""] > 3')
-
+    if verbose:
+        print("3",is_abnormal)
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
-
+    if verbose:
+        print("4",is_abnormal)
     # # Check for missing values
     # missing_values = df.isnull().sum()
     # if missing_values.any():
@@ -1713,7 +1770,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'df.columns[df.nunique() == 1].tolist()')
-
+    if verbose:
+        print("5",is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1722,7 +1780,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
-
+    if verbose:
+        print("6",is_abnormal)
     # Compile results
     if verbose:
         print("\n".join(messages))
@@ -1739,20 +1798,40 @@ def fload(fpath, kind=None, **kwargs):
     Returns:
         content: The content loaded from the file.
     """
-
+    def read_mplstyle(style_file):
+        import matplotlib.pyplot as plt
+        # Load the style file
+        plt.style.use(style_file)
+
+        # Get the current style properties
+        style_dict = plt.rcParams
+
+        # Convert to dictionary
+        style_dict = dict(style_dict)
+        # Print the style dictionary
+        for i, j in style_dict.items():
+            print(f"\n{i}::::{j}")
+        return style_dict
+    # #example usage:
+    # style_file = "/ std-colors.mplstyle"
+    # style_dict = read_mplstyle(style_file)
+
     def load_txt_md(fpath):
         with open(fpath, "r") as file:
             content = file.read()
         return content
 
-    def load_html(fpath):
-
-
-
+    # def load_html(fpath):
+    #     with open(fpath, "r") as file:
+    #         content = file.read()
+    #     return content
+    def load_html(fpath,**kwargs):
+        return pd.read_html(fpath,**kwargs)
 
     def load_json(fpath, **kwargs):
         output=kwargs.pop("output","json")
         if output=='json':
+            import json
             with open(fpath, "r") as file:
                 content = json.load(file)
             return content
@@ -1760,12 +1839,14 @@ def fload(fpath, kind=None, **kwargs):
             return pd.read_json(fpath,**kwargs)
 
     def load_yaml(fpath):
+        import yaml
         with open(fpath, "r") as file:
             content = yaml.safe_load(file)
         return content
 
 
     def load_xml(fpath, fsize_thr: int = 100):
+        from lxml import etree
         def load_small_xml(fpath):
             tree = etree.parse(fpath)
             root = tree.getroot()
@@ -1824,6 +1905,15 @@ def fload(fpath, kind=None, **kwargs):
             if line.startswith(char):
                 return char
         return None
+
+    def _get_chunks(df_fake):
+        """
+        helper func for 'load_csv'
+        """
+        chunks = []
+        for chunk in df_fake:
+            chunks.append(chunk)
+        return pd.concat(chunks, ignore_index=True)
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
@@ -1837,16 +1927,19 @@ def fload(fpath, kind=None, **kwargs):
         on_bad_lines = kwargs.pop("on_bad_lines", "skip")
         comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        chunksize=kwargs.pop("chunksize", None)
+        engine='c' if chunksize else engine  # when chunksize, recommend 'c'
+        low_memory=kwargs.pop("low_memory",True)
+        low_memory=False if chunksize else True  # when chunksize, recommend low_memory=False
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("read_csv", verbose=verbose)
-        return
 
         if comment is None:
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
+
         try:
             df = pd.read_csv(
                 fpath,
@@ -1858,14 +1951,19 @@ def fload(fpath, kind=None, **kwargs):
                 skipinitialspace=skipinitialspace,
                 sep=sep,
                 on_bad_lines=on_bad_lines,
+                chunksize=chunksize,
+                low_memory=low_memory,
                 **kwargs,
             )
-            if
+            if chunksize:
+                df=_get_chunks(df)
+                print(df.shape)
+            if is_df_abnormal(df, verbose=0):  # raise error
                 raise ValueError("the df is abnormal")
         except:
             try:
                 try:
-                    if engine == "pyarrow":
+                    if engine == "pyarrow" and not chunksize:
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1874,6 +1972,7 @@ def fload(fpath, kind=None, **kwargs):
                             sep=sep,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            low_memory=low_memory,
                             **kwargs,
                         )
                     else:
@@ -1887,14 +1986,19 @@ def fload(fpath, kind=None, **kwargs):
                             skipinitialspace=skipinitialspace,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            chunksize=chunksize,
+                            low_memory=low_memory,
                             **kwargs,
                         )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                    if is_df_abnormal(df, verbose=0):
                        raise ValueError("the df is abnormal")
                except (UnicodeDecodeError, ValueError):
                    encoding = get_encoding(fpath)
                    # print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
-                    if engine == "pyarrow":
+                    if engine == "pyarrow" and not chunksize:
                        df = pd.read_csv(
                            fpath,
                            engine=engine,
@@ -1903,6 +2007,7 @@ def fload(fpath, kind=None, **kwargs):
                            sep=sep,
                            on_bad_lines=on_bad_lines,
                            comment=comment,
+                            low_memory=low_memory,
                            **kwargs,
                        )
                    else:
@@ -1916,8 +2021,13 @@ def fload(fpath, kind=None, **kwargs):
                            skipinitialspace=skipinitialspace,
                            on_bad_lines=on_bad_lines,
                            comment=comment,
+                            chunksize=chunksize,
+                            low_memory=low_memory,
                            **kwargs,
                        )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                    if is_df_abnormal(df, verbose=0):
                        raise ValueError("the df is abnormal")
            except Exception as e:
@@ -1934,8 +2044,13 @@ def fload(fpath, kind=None, **kwargs):
                    sep=sep,
                    on_bad_lines=on_bad_lines,
                    comment=comment,
+                    chunksize=chunksize,
+                    low_memory=low_memory,
                    **kwargs,
                )
+                if chunksize:
+                    df=_get_chunks(df)
+                    print(df.shape)
                if not is_df_abnormal(df, verbose=0):  # normal
                    display(df.head(2))
                    print(f"shape: {df.shape}")
@@ -1943,32 +2058,38 @@ def fload(fpath, kind=None, **kwargs):
            except:
                pass
        else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            display(df.head(2))
-            print(f"
-
-
-
-
-
+            if not chunksize:
+                engines = [None,"c", "python"]
+                for engine in engines:
+                    separators = [",", "\t", ";", "|", " "]
+                    for sep in separators:
+                        try:
+                            # sep2show = sep if sep != "\t" else "\\t"
+                            # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                            # print(".")
+                            df = pd.read_csv(
+                                fpath,
+                                engine=engine,
+                                sep=sep,
+                                on_bad_lines=on_bad_lines,
+                                comment=comment,
+                                chunksize=chunksize,
+                                low_memory=low_memory,
+                                **kwargs,
+                            )
+                            # display(df.head(2))
+                            # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
+                            if chunksize:
+                                df=_get_chunks(df)
+                                print(df.shape)
+                            if not is_df_abnormal(df, verbose=0):
+                                display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                                print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                                return df
+                        except EmptyDataError as e:
+                            continue
+            else:
+                pass
        display(df.head(2))
        print(f"shape: {df.shape}")
        return df
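Note: the `chunksize` path added to `load_csv` reads the file through pandas' chunk iterator and concatenates the pieces (see `_get_chunks` above). A hedged sketch of the same idea with plain pandas, using the C engine and `low_memory=False` as the diff recommends (the file name is hypothetical):

```python
import pandas as pd

def read_csv_in_chunks(fpath, chunksize=100_000, **kwargs):
    # Assemble one DataFrame from the chunk iterator returned by read_csv.
    reader = pd.read_csv(fpath, engine="c", low_memory=False, chunksize=chunksize, **kwargs)
    return pd.concat(reader, ignore_index=True)

# df = read_csv_in_chunks("big_table.csv", chunksize=500_000)  # hypothetical file
# print(df.shape)
```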
@@ -1976,7 +2097,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -1987,7 +2108,45 @@ def fload(fpath, kind=None, **kwargs):
             pass
         return df
 
+
+    def load_parquet(fpath, **kwargs):
+        """
+        Load a Parquet file into a Pandas DataFrame with advanced options.
+
+        Parameters:
+        - fpath (str): The file path to the Parquet file.
+        - engine (str): The engine to use for reading the Parquet file (default is 'pyarrow').
+        - columns (list): List of columns to load. If None, loads all columns.
+        - verbose (bool): If True, prints additional information about the loading process.
+        - filters (list): List of filter conditions for predicate pushdown.
+        - **kwargs: Additional keyword arguments for `pd.read_parquet`.
+
+        Returns:
+        - df (DataFrame): The loaded DataFrame.
+        """
+
+        engine = kwargs.get("engine", "pyarrow")
+        verbose = kwargs.pop("verbose", False)
+
+        if run_once_within():
+            use_pd("read_parquet", verbose=verbose)
+        try:
+            df = pd.read_parquet(fpath, engine=engine, **kwargs)
+            if verbose:
+                if 'columns' in kwargs:
+                    print(f"Loaded columns: {kwargs['columns']}")
+                else:
+                    print("Loaded all columns.")
+            print(f"shape: {df.shape}")
+        except Exception as e:
+            print(f"An error occurred while loading the Parquet file: {e}")
+            df = None
+
+        return df
+
     def load_ipynb(fpath, **kwargs):
+        import nbformat
+        from nbconvert import MarkdownExporter
         as_version = kwargs.get("as_version", 4)
         with open(fpath, "r") as file:
             nb = nbformat.read(file, as_version=as_version)
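Note: `load_parquet` above is essentially a guarded wrapper around `pandas.read_parquet`. A self-contained example of the underlying call with column selection (pyarrow must be installed; file and column names are made up):

```python
import pandas as pd

# Write a tiny Parquet file, then read back only the columns of interest.
pd.DataFrame({"subject": ["a", "b"], "score": [1.0, 2.0], "extra": [0, 1]}).to_parquet(
    "demo.parquet", engine="pyarrow"
)
df = pd.read_parquet("demo.parquet", engine="pyarrow", columns=["subject", "score"])
print(df.shape)  # (2, 2)
```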
@@ -2017,6 +2176,7 @@ def fload(fpath, kind=None, **kwargs):
         If page is an integer, it returns the text of the specified page number.
         If the specified page is not found, it returns the string "Page is not found".
         """
+        from PyPDF2 import PdfReader
         text_dict = {}
         with open(fpath, "rb") as file:
             pdf_reader = PdfReader(file)
@@ -2046,6 +2206,7 @@ def fload(fpath, kind=None, **kwargs):
             return text_dict.get(int(page), "Page is not found")
 
     def load_docx(fpath):
+        from docx import Document
         doc = Document(fpath)
         content = [para.text for para in doc.paragraphs]
         return content
@@ -2055,51 +2216,21 @@ def fload(fpath, kind=None, **kwargs):
     kind = kind.lower()
     kind = kind.lstrip(".").lower()
     img_types = [
-        "bmp",
-        "
-        "gif",
-        "icns",
-        "ico",
-        "im",
-        "jpg",
-        "jpeg",
-        "jpeg2000",
-        "msp",
-        "pcx",
-        "png",
-        "ppm",
-        "sgi",
-        "spider",
-        "tga",
-        "tiff",
-        "tif",
-        "webp",
-        "json",
+        "bmp","eps","gif","png","jpg","jpeg","jpeg2000","tiff","tif",
+        "icns","ico","im","msp","pcx","ppm","sgi","spider","tga","webp",
     ]
     doc_types = [
-        "docx",
-        "txt",
-        "md",
-        "
-        "json",
-        "yaml",
-        "xml",
-        "csv",
-        "xlsx",
-        "pdf",
+        "docx","pdf",
+        "txt","csv","xlsx","tsv","parquet","snappy",
+        "md","html",
+        "json","yaml","xml",
         "ipynb",
+        "mtx"
     ]
     zip_types = [
-        "gz",
-        "
-        "
-        "tar",
-        "tar.gz",
-        "tar.bz2",
-        "bz2",
-        "xz",
-        "rar",
-        "tgz",
+        "gz","zip","7z","rar","tgz",
+        "tar","tar.gz","tar.bz2",
+        "bz2","xz","gzip"
     ]
     other_types = ["fcs"]
     supported_types = [*doc_types, *img_types, *zip_types, *other_types]
@@ -2128,16 +2259,24 @@ def fload(fpath, kind=None, **kwargs):
     elif kind == "txt" or kind == "md":
         return load_txt_md(fpath)
     elif kind == "html":
-        return load_html(fpath)
+        return load_html(fpath, **kwargs)
     elif kind == "json":
-        return load_json(fpath)
+        return load_json(fpath, **kwargs)
     elif kind == "yaml":
         return load_yaml(fpath)
     elif kind == "xml":
         return load_xml(fpath)
-    elif kind
+    elif kind in ["csv","tsv"]:
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
+    elif kind=='pkl':
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_pickle")
+        return pd.read_pickle(fpath,**kwargs)
     elif kind in ["ods", "ods", "odt"]:
         engine = kwargs.get("engine", "odf")
         kwargs.pop("engine", None)
@@ -2146,14 +2285,40 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "xlrd")
         kwargs.pop("engine", None)
         content = load_excel(fpath, engine=engine, **kwargs)
-
+        print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "xlsx":
         content = load_excel(fpath, **kwargs)
-        display(content.head(3))
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
+        print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
+        return content
+    elif kind=='mtx':
+        from scipy.io import mmread
+        dat_mtx=mmread(fpath)
+        content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
+        print(f"shape: {content.shape}")
         return content
     elif kind == "ipynb":
         return load_ipynb(fpath, **kwargs)
+    elif kind in ['parquet','snappy']:
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_parquet")
+        return load_parquet(fpath,**kwargs)
+    elif kind =='feather':
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_feather")
+        content=pd.read_feather(fpath,**kwargs)
+        return content
+    elif kind =='h5':
+        content=pd.read_hdf(fpath,**kwargs)
+        return content
+    elif kind =='pkl':
+        content=pd.read_pickle(fpath,**kwargs)
+        return content
     elif kind == "pdf":
         # print('usage:load_pdf(fpath, page="all", verbose=False)')
         return load_pdf(fpath, **kwargs)
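Note: `fload` dispatches on the lowered, dot-stripped extension, and this release adds csv/tsv, pkl, mtx, parquet/snappy, feather and h5 branches. A compact sketch of the same dispatch pattern (the handler table below is a stand-in, not the py2ls implementation):

```python
import os
import pandas as pd

def fload_sketch(fpath, kind=None, **kwargs):
    # Normalize the extension the same way fload does, then dispatch to a reader.
    kind = (kind or os.path.splitext(fpath)[1]).lstrip(".").lower()
    readers = {
        "csv": pd.read_csv,
        "tsv": lambda p, **kw: pd.read_csv(p, sep="\t", **kw),
        "xlsx": pd.read_excel,
        "parquet": pd.read_parquet,
        "feather": pd.read_feather,
        "pkl": pd.read_pickle,
        "h5": pd.read_hdf,
    }
    if kind not in readers:
        raise ValueError(f"unsupported kind: {kind}")
    return readers[kind](fpath, **kwargs)

# df = fload_sketch("results.parquet")  # hypothetical file
```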
@@ -2164,6 +2329,7 @@ def fload(fpath, kind=None, **kwargs):
         import GEOparse
         return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
+        from pprint import pp
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
         if os.path.isdir(fpath_unzip):
@@ -2198,10 +2364,11 @@ def fload(fpath, kind=None, **kwargs):
         meta, data = fcsparser.parse(fpath, reformat_meta=True)
         return meta, data
 
+    elif kind=="mplstyle":
+        return read_mplstyle(fpath)
+
     else:
-
-        # content = load_csv(fpath, **kwargs)
-        # except:
+        print("direct reading...")
         try:
             try:
                 with open(fpath, "r", encoding="utf-8") as f:
@@ -2311,6 +2478,7 @@ def filter_kwargs(kws, valid_kwargs):
     }
     return kwargs_filtered
 
+str_space_speed='sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
 
 def fsave(
     fpath,
@@ -2346,6 +2514,7 @@ def fsave(
         fappend(fpath, content=content)
 
     def save_docx(fpath, content, font_name, font_size, spacing):
+        import docx
         if isinstance(content, str):
             content = content.split(". ")
         doc = docx.Document()
@@ -2373,6 +2542,7 @@ def fsave(
         save_content(fpath, html_content, mode)
 
     def save_pdf(fpath, content, font_name, font_size):
+        from fpdf import FPDF
         pdf = FPDF()
         pdf.add_page()
         # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2386,7 +2556,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2418,7 +2588,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose=kwargs.pop("verbose",False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if
+        if run_once_within():
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -2444,9 +2614,10 @@ def fsave(
 
     def save_ipynb(fpath, data, **kwargs):
         # Split the content by code fences to distinguish between code and markdown
+        import nbformat
         parts = data.split("```")
         cells = []
-
+
         for i, part in enumerate(parts):
             if i % 2 == 0:
                 # Even index: markdown content
@@ -2466,6 +2637,7 @@ def fsave(
         # json.dump(data, file, **kwargs)
 
     def save_json(fpath_fname, var_dict_or_df):
+        import json
         def _convert_js(data):
             if isinstance(data, pd.DataFrame):
                 return data.to_dict(orient="list")
@@ -2487,10 +2659,12 @@ def fsave(
     # # setss = jsonload("/.json")
 
     def save_yaml(fpath, data, **kwargs):
+        import yaml
         with open(fpath, "w") as file:
             yaml.dump(data, file, **kwargs)
 
     def save_xml(fpath, data):
+        from lxml import etree
         root = etree.Element("root")
         if isinstance(data, dict):
             for key, val in data.items():
@@ -2501,6 +2675,25 @@ def fsave(
         tree = etree.ElementTree(root)
         tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
 
+    def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
+        engine = kwargs.pop("engine","auto")  # "auto" tries pyarrow first and falls back to fastparquet; options: {'auto', 'pyarrow', 'fastparquet'}
+        compression=kwargs.pop("compression",None)  # Use None for no compression. Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+        try:
+            # Attempt to save with "pyarrow" if engine is set to "auto"
+            data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
+            print(f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression.")
+        except Exception as e:
+            print(f"Error using with engine '{engine}' and {compression} compression: {e}")
+            if "Sparse" in str(e):
+                try:
+                    # Handle sparse data by converting columns to dense
+                    print("Attempting to convert sparse columns to dense format...")
+                    data = data.apply(lambda x: x.sparse.to_dense() if pd.api.types.is_sparse(x) else x)
+                    save_parquet(fpath, data=data,**kwargs)
+                except Exception as last_e:
+                    print(f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}")
+
+
     if kind is None:
         _, kind = os.path.splitext(fpath)
         kind = kind.lower()
@@ -2546,7 +2739,92 @@ def fsave(
         save_yaml(fpath, content, **kwargs)
     elif kind == "ipynb":
         save_ipynb(fpath, content, **kwargs)
+    elif kind.lower() in ["parquet","pq","big","par"]:
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_parquet")
+            return None
+        compression=kwargs.pop("compression",None)  # Use None for no compression. Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+        # fix the fpath ends
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'parquet')
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath=fpath+f".{compression}"
+        save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
+    elif kind.lower() in ["pkl","pk","pickle","pick"]:
+        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+        # Pickle is convenient if you want to preserve exact Python object types.
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_pickle")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'pkl')
+        compression=kwargs.pop("compression",None)
+        if compression is not None:
+            if not fpath.endswith(compression["method"]):
+                fpath=fpath+f".{compression["method"]}"
+        if isinstance(content, pd.DataFrame):
+            content.to_pickle(fpath,**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_pickle(fpath,**kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["fea",'feather','ft','fe','feat','fether']:
+        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+        # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_feather")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'feather')
+        if isinstance(content, pd.DataFrame):
+            content.to_feather(fpath,**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_feather(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["hd",'hdf','h','h5']:
+        # particularly useful for large datasets and can handle complex data structures
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_hdf")
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'h5')
+        compression=kwargs.pop("compression",None)
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath=fpath+f".{compression}"
+        if isinstance(content, pd.DataFrame):
+            content.to_hdf(fpath,key='content',**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_hdf(fpath,**kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
     else:
+        from . import netfinder
         try:
             netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         except:
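Note: the `str_space_speed` summary and the new `fsave` branches boil down to: Parquet is the most compact on disk, pickle and Feather are the fastest to write and read, HDF5 and CSV are the slowest. A small round-trip sketch through the three binary formats (file names are arbitrary; pyarrow is assumed for Parquet and Feather):

```python
import pandas as pd

df = pd.DataFrame({"x": range(5), "y": list("abcde")})

df.to_parquet("demo.parquet", compression="gzip")  # smallest on disk in the diff's comparison
df.to_feather("demo.feather")                      # fast Arrow-based I/O
df.to_pickle("demo.pkl")                           # preserves exact pandas dtypes/objects

print(pd.read_parquet("demo.parquet").equals(df))  # True
```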
@@ -2669,6 +2947,7 @@ def isa(content, kind):
|
|
2669
2947
|
elif "color" in kind.lower(): # file
|
2670
2948
|
return is_str_color(content)
|
2671
2949
|
elif "html" in kind.lower():
|
2950
|
+
import re
|
2672
2951
|
if content is None or not isinstance(content, str):
|
2673
2952
|
return False
|
2674
2953
|
# Remove leading and trailing whitespace
|
@@ -2828,6 +3107,7 @@ def listdir(
|
|
2828
3107
|
display(f.head())
|
2829
3108
|
return f
|
2830
3109
|
else:
|
3110
|
+
from box import Box
|
2831
3111
|
if "l" in orient.lower(): # list # default
|
2832
3112
|
res_output = Box(f.to_dict(orient="list"))
|
2833
3113
|
return res_output
|
@@ -2868,13 +3148,10 @@ def mkdir_nest(fpath: str) -> str:
|
|
2868
3148
|
Returns:
|
2869
3149
|
- str: The path of the created directory.
|
2870
3150
|
"""
|
2871
|
-
|
2872
|
-
|
2873
3151
|
# Split the full path into directories
|
2874
3152
|
f_slash = "/" if "mac" in get_os().lower() else "\\"
|
2875
3153
|
if os.path.isdir(fpath):
|
2876
3154
|
fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
|
2877
|
-
print(fpath)
|
2878
3155
|
return fpath
|
2879
3156
|
dir_parts = fpath.split(f_slash) # Split the path by the OS-specific separator
|
2880
3157
|
|
@@ -2945,7 +3222,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
|
|
2945
3222
|
if len(rootdir) == 1:
|
2946
3223
|
rootdir = rootdir[0]
|
2947
3224
|
rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
|
2948
|
-
|
3225
|
+
|
2949
3226
|
return rootdir
|
2950
3227
|
|
2951
3228
|
|
@@ -2957,6 +3234,8 @@ def split_path(fpath):
|
|
2957
3234
|
|
2958
3235
|
|
2959
3236
|
def figsave(*args, dpi=300):
|
3237
|
+
import matplotlib.pyplot as plt
|
3238
|
+
from PIL import Image
|
2960
3239
|
dir_save = None
|
2961
3240
|
fname = None
|
2962
3241
|
img = None
|
@@ -2972,13 +3251,13 @@ def figsave(*args, dpi=300):
|
|
2972
3251
|
|
2973
3252
|
if dir_save is None:
|
2974
3253
|
dir_save="./"
|
2975
|
-
|
3254
|
+
|
2976
3255
|
# dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
|
2977
3256
|
dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
|
2978
3257
|
dir_ch = "".join(dir_save.split(f_slash)[-1:])
|
2979
3258
|
if not dir_par.endswith(f_slash):
|
2980
3259
|
dir_par += f_slash
|
2981
|
-
|
3260
|
+
|
2982
3261
|
if fname is None:
|
2983
3262
|
fname = dir_ch
|
2984
3263
|
mkdir(dir_par)
|
@@ -3065,6 +3344,7 @@ def figsave(*args, dpi=300):
|
|
3065
3344
|
def is_str_color(s):
|
3066
3345
|
# Regular expression pattern for hexadecimal color codes
|
3067
3346
|
if isinstance(s,str):
|
3347
|
+
import re
|
3068
3348
|
color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
|
3069
3349
|
return re.match(color_code_pattern, s) is not None
|
3070
3350
|
else:
|
@@ -3091,6 +3371,7 @@ def isnum(s):
|
|
3091
3371
|
|
3092
3372
|
|
3093
3373
|
def is_image(fpath):
|
3374
|
+
import mimetypes
|
3094
3375
|
mime_type, _ = mimetypes.guess_type(fpath)
|
3095
3376
|
if mime_type and mime_type.startswith("image"):
|
3096
3377
|
return True
|
@@ -3099,6 +3380,7 @@ def is_image(fpath):
|
|
3099
3380
|
|
3100
3381
|
|
3101
3382
|
def is_document(fpath):
|
3383
|
+
import mimetypes
|
3102
3384
|
mime_type, _ = mimetypes.guess_type(fpath)
|
3103
3385
|
if mime_type and (
|
3104
3386
|
mime_type.startswith("text/")
|
@@ -3119,6 +3401,7 @@ def is_document(fpath):
|
|
3119
3401
|
|
3120
3402
|
|
3121
3403
|
def is_zip(fpath):
|
3404
|
+
import mimetypes
|
3122
3405
|
mime_type, _ = mimetypes.guess_type(fpath)
|
3123
3406
|
if mime_type == "application/zip":
|
3124
3407
|
return True
|
@@ -3127,6 +3410,7 @@ def is_zip(fpath):
|
|
3127
3410
|
|
3128
3411
|
|
3129
3412
|
def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
|
3413
|
+
import matplotlib.pyplot as plt
|
3130
3414
|
if ax is None:
|
3131
3415
|
ax = plt.gca()
|
3132
3416
|
for loc, spine in ax.spines.items():
|
@@ -3215,7 +3499,7 @@ def apply_filter(img, *args):
|
|
3215
3499
|
Returns:
|
3216
3500
|
PIL.Image: The filtered image.
|
3217
3501
|
"""
|
3218
|
-
|
3502
|
+
from PIL import ImageFilter
|
3219
3503
|
def correct_filter_name(filter_name):
|
3220
3504
|
if "bl" in filter_name.lower() and "box" not in filter_name.lower():
|
3221
3505
|
return "BLUR"
|
@@ -3457,6 +3741,8 @@ def imgsets(img, **kwargs):
|
|
3457
3741
|
avg_contrast_factor = sum(contrast_factors) / num_channels
|
3458
3742
|
return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
|
3459
3743
|
|
3744
|
+
import matplotlib.pyplot as plt
|
3745
|
+
from PIL import ImageEnhance,ImageOps
|
3460
3746
|
# Load image if input is a file path
|
3461
3747
|
if isinstance(img, str):
|
3462
3748
|
img = load_img(img)
|
@@ -3520,6 +3806,7 @@ def imgsets(img, **kwargs):
|
|
3520
3806
|
elif "pad" in k.lower():
|
3521
3807
|
img_update = ImageOps.pad(img_update, size=value)
|
3522
3808
|
elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
|
3809
|
+
from rembg import remove, new_session
|
3523
3810
|
if isinstance(value, bool):
|
3524
3811
|
session = new_session("isnet-general-use")
|
3525
3812
|
img_update = remove(img_update, session=session)
|
@@ -3558,6 +3845,7 @@ def imgsets(img, **kwargs):
|
|
3558
3845
|
else:
|
3559
3846
|
img_update = remove(img_update)
|
3560
3847
|
elif "bg" in k.lower() and "color" in k.lower():
|
3848
|
+
from rembg import remove
|
3561
3849
|
if isinstance(value, list):
|
3562
3850
|
value = tuple(value)
|
3563
3851
|
if isinstance(value, tuple): # replace the background color
|
@@ -3589,6 +3877,8 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
|
|
3589
3877
|
Args:
|
3590
3878
|
dir_img_list (list): List of the Directory containing the images.
|
3591
3879
|
"""
|
3880
|
+
import matplotlib.pyplot as plt
|
3881
|
+
from PIL import Image
|
3592
3882
|
num_images = len(dir_img_list)
|
3593
3883
|
if not kind.startswith("."):
|
3594
3884
|
kind = "." + kind
|
@@ -3625,28 +3915,15 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
|
|
3625
3915
|
# usage:
|
3626
3916
|
# fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
|
3627
3917
|
# thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
|
3628
|
-
def read_mplstyle(style_file):
|
3629
|
-
# Load the style file
|
3630
|
-
plt.style.use(style_file)
|
3631
|
-
|
3632
|
-
# Get the current style properties
|
3633
|
-
style_dict = plt.rcParams
|
3634
|
-
|
3635
|
-
# Convert to dictionary
|
3636
|
-
style_dict = dict(style_dict)
|
3637
|
-
# Print the style dictionary
|
3638
|
-
for i, j in style_dict.items():
|
3639
|
-
print(f"\n{i}::::{j}")
|
3640
|
-
return style_dict
|
3641
|
-
|
3642
3918
|
|
3643
|
-
# #example usage:
|
3644
|
-
# style_file = "/ std-colors.mplstyle"
|
3645
|
-
# style_dict = read_mplstyle(style_file)
|
3646
3919
|
|
3647
3920
|
|
3648
3921
|
# search and fine the director of the libary, which installed at local
|
3649
3922
|
def dir_lib(lib_oi):
|
3923
|
+
"""
|
3924
|
+
# example usage:
|
3925
|
+
# dir_lib("seaborn")
|
3926
|
+
"""
|
3650
3927
|
import site
|
3651
3928
|
|
3652
3929
|
# Get the site-packages directory
|
@@ -3664,23 +3941,6 @@ def dir_lib(lib_oi):
|
|
3664
3941
|
print(f"Cannot find the {lib_oi} in site-packages directory.")
|
3665
3942
|
return dir_list
|
3666
3943
|
|
3667
|
-
|
3668
|
-
# example usage:
|
3669
|
-
# dir_lib("seaborn")
|
3670
|
-
|
3671
|
-
"""
|
3672
|
-
# n = 7
|
3673
|
-
# clist = get_color(n, cmap="auto", how="linspace") # get_color(100)
|
3674
|
-
# plt.figure(figsize=[8, 5], dpi=100)
|
3675
|
-
# x = np.linspace(0, 2 * np.pi, 50) * 100
|
3676
|
-
# y = np.sin(x)
|
3677
|
-
# for i in range(1, n + 1):
|
3678
|
-
# plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
|
3679
|
-
# plt.legend()
|
3680
|
-
# plt.ylim(-2, 20)
|
3681
|
-
# figsets(plt.gca(), {"style": "whitegrid"}) """
|
3682
|
-
|
3683
|
-
|
3684
3944
|
class FileInfo:
|
3685
3945
|
def __init__(
|
3686
3946
|
self,
|
@@ -3757,6 +4017,7 @@ class FileInfo:
|
|
3757
4017
|
|
3758
4018
|
|
3759
4019
|
def finfo(fpath):
|
4020
|
+
import time
|
3760
4021
|
fname, fmt = os.path.splitext(fpath)
|
3761
4022
|
dir_par = os.path.dirname(fpath) + "/"
|
3762
4023
|
data = {
|
@@ -3771,6 +4032,7 @@ def finfo(fpath):
|
|
3771
4032
|
}
|
3772
4033
|
extra_info = {}
|
3773
4034
|
if data["kind"] == ".pdf":
|
4035
|
+
from pdf2image import pdfinfo_from_path
|
3774
4036
|
extra_info = pdfinfo_from_path(fpath)
|
3775
4037
|
|
3776
4038
|
return FileInfo(
|
@@ -3785,18 +4047,7 @@ def finfo(fpath):
|
|
3785
4047
|
extra_info=extra_info,
|
3786
4048
|
)
|
3787
4049
|
|
3788
|
-
|
3789
4050
|
# ! format excel file
|
3790
|
-
import pandas as pd
|
3791
|
-
from datetime import datetime
|
3792
|
-
from openpyxl import load_workbook
|
3793
|
-
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
3794
|
-
from openpyxl.utils import get_column_letter
|
3795
|
-
from openpyxl.worksheet.datavalidation import DataValidation
|
3796
|
-
from openpyxl.comments import Comment
|
3797
|
-
from openpyxl.formatting.rule import ColorScaleRule
|
3798
|
-
|
3799
|
-
|
3800
4051
|
def hex2argb(hex_color):
|
3801
4052
|
"""
|
3802
4053
|
Convert a hex color code to the ARGB format required by openpyxl.
|
@@ -3827,341 +4078,7 @@ def hex2argb(hex_color):
|
|
3827
4078
|
return hex_color[-9:]
|
3828
4079
|
else:
|
3829
4080
|
return "F" * (9 - len(hex_color)) + hex_color
|
3830
|
-
raise ValueError(
|
3831
|
-
"Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
|
3832
|
-
)
|
3833
|
-
|
3834
|
-
|
3835
|
-
def convert_indices_to_range(row_slice, col_slice):
|
3836
|
-
"""Convert numerical row and column slices to Excel-style range strings."""
|
3837
|
-
start_row = row_slice.start + 1
|
3838
|
-
end_row = row_slice.stop if row_slice.stop is not None else None
|
3839
|
-
start_col = col_slice.start + 1
|
3840
|
-
end_col = col_slice.stop if col_slice.stop is not None else None
|
3841
|
-
|
3842
|
-
start_col_letter = get_column_letter(start_col)
|
3843
|
-
end_col_letter = get_column_letter(end_col) if end_col else None
|
3844
|
-
return (
|
3845
|
-
f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
|
3846
|
-
if end_col_letter
|
3847
|
-
else f"{start_col_letter}{start_row}"
|
3848
|
-
)
|
3849
|
-
|
3850
|
-
|
3851
|
-
def apply_format(ws, cell, cell_range):
|
3852
|
-
"""Apply cell formatting to a specified range."""
|
3853
|
-
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
3854
|
-
kws_cell = ["font", "fill", "alignment", "border"]
|
3855
|
-
for K, _ in cell.items():
|
3856
|
-
if strcmp(K, kws_cell)[0] == "font":
|
3857
|
-
#! font
|
3858
|
-
font_color = "000000"
|
3859
|
-
font_name = "Arial"
|
3860
|
-
font_underline = "none"
|
3861
|
-
font_size = 14
|
3862
|
-
font_bold = False
|
3863
|
-
font_strike = False
|
3864
|
-
font_italic = False
|
3865
|
-
kws_font = [
|
3866
|
-
"name",
|
3867
|
-
"size",
|
3868
|
-
"bold",
|
3869
|
-
"underline",
|
3870
|
-
"color",
|
3871
|
-
"strike",
|
3872
|
-
"italic",
|
3873
|
-
]
|
3874
|
-
for k_, v_ in cell.get(K, {}).items():
|
3875
|
-
if strcmp(k_, kws_font)[0] == "name":
|
3876
|
-
font_name = v_
|
3877
|
-
elif strcmp(k_, kws_font)[0] == "size":
|
3878
|
-
font_size = v_
|
3879
|
-
elif strcmp(k_, kws_font)[0] == "bold":
|
3880
|
-
font_bold = v_
|
3881
|
-
elif strcmp(k_, kws_font)[0] == "underline":
|
3882
|
-
font_underline = strcmp(v_, ["none", "single", "double"])[0]
|
3883
|
-
elif strcmp(k_, kws_font)[0] == "color":
|
3884
|
-
font_color = hex2argb(v_)
|
3885
|
-
elif strcmp(k_, kws_font)[0] == "strike":
|
3886
|
-
font_strike = v_
|
3887
|
-
elif strcmp(k_, kws_font)[0] == "italic":
|
3888
|
-
font_italic = v_
|
3889
|
-
|
3890
|
-
cell_font = Font(
|
3891
|
-
name=font_name,
|
3892
|
-
size=font_size,
|
3893
|
-
bold=font_bold,
|
3894
|
-
italic=font_italic,
|
3895
|
-
underline=font_underline,
|
3896
|
-
strike=font_strike,
|
3897
|
-
color=font_color,
|
3898
|
-
)
|
3899
|
-
|
3900
|
-
if strcmp(K, kws_cell)[0] == "fill":
|
3901
|
-
#! fill
|
3902
|
-
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
3903
|
-
kws_fill_type = [
|
3904
|
-
"darkVertical",
|
3905
|
-
"lightDown",
|
3906
|
-
"lightGrid",
|
3907
|
-
"solid",
|
3908
|
-
"darkDown",
|
3909
|
-
"lightGray",
|
3910
|
-
"lightUp",
|
3911
|
-
"gray0625",
|
3912
|
-
"lightVertical",
|
3913
|
-
"lightHorizontal",
|
3914
|
-
"darkHorizontal",
|
3915
|
-
"gray125",
|
3916
|
-
"darkUp",
|
3917
|
-
"mediumGray",
|
3918
|
-
"darkTrellis",
|
3919
|
-
"darkGray",
|
3920
|
-
"lightTrellis",
|
3921
|
-
"darkGrid",
|
3922
|
-
]
|
3923
|
-
start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
|
3924
|
-
for k, v in cell.get(K, {}).items():
|
3925
|
-
if strcmp(k, kws_fill)[0] == "color":
|
3926
|
-
start_color, end_color = hex2argb(v), hex2argb(v)
|
3927
|
-
break
|
3928
|
-
for k, v in cell.get(K, {}).items():
|
3929
|
-
if strcmp(k, kws_fill)[0] == "start_color":
|
3930
|
-
start_color = hex2argb(v)
|
3931
|
-
elif strcmp(k, kws_fill)[0] == "end_color":
|
3932
|
-
end_color = hex2argb(v)
|
3933
|
-
elif strcmp(k, kws_fill)[0] == "fill_type":
|
3934
|
-
fill_type = strcmp(v, kws_fill_type)[0]
|
3935
|
-
cell_fill = PatternFill(
|
3936
|
-
start_color=start_color,
|
3937
|
-
end_color=end_color,
|
3938
|
-
fill_type=fill_type,
|
3939
|
-
)
|
3940
|
-
|
3941
|
-
if strcmp(K, kws_cell)[0] == "alignment":
|
3942
|
-
#! alignment
|
3943
|
-
# default
|
3944
|
-
align_horizontal = "general"
|
3945
|
-
align_vertical = "center"
|
3946
|
-
align_rot = 0
|
3947
|
-
align_wrap = False
|
3948
|
-
align_shrink = False
|
3949
|
-
align_indent = 0
|
3950
|
-
kws_align = [
|
3951
|
-
"horizontal",
|
3952
|
-
"ha",
|
3953
|
-
"vertical",
|
3954
|
-
"va",
|
3955
|
-
"text_rotation",
|
3956
|
-
"rotat",
|
3957
|
-
"rot",
|
3958
|
-
"wrap_text",
|
3959
|
-
"wrap",
|
3960
|
-
"shrink_to_fit",
|
3961
|
-
"shrink",
|
3962
|
-
"indent",
|
3963
|
-
]
|
3964
|
-
for k, v in cell.get(K, {}).items():
|
3965
|
-
if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
|
3966
|
-
align_horizontal = strcmp(
|
3967
|
-
v, ["general", "left", "right", "center"]
|
3968
|
-
)[0]
|
3969
|
-
elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
|
3970
|
-
align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
|
3971
|
-
elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
|
3972
|
-
align_rot = v
|
3973
|
-
elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
|
3974
|
-
align_wrap = v
|
3975
|
-
elif strcmp(k, kws_align)[0] in [
|
3976
|
-
"shrink_to_fit",
|
3977
|
-
"shrink",
|
3978
|
-
"wrap_text",
|
3979
|
-
"wrap",
|
3980
|
-
]:
|
3981
|
-
align_shrink = v
|
3982
|
-
elif strcmp(k, kws_align)[0] in ["indent"]:
|
3983
|
-
align_indent = v
|
3984
|
-
cell_alignment = Alignment(
|
3985
|
-
horizontal=align_horizontal,
|
3986
|
-
vertical=align_vertical,
|
3987
|
-
text_rotation=align_rot,
|
3988
|
-
wrap_text=align_wrap,
|
3989
|
-
shrink_to_fit=align_shrink,
|
3990
|
-
indent=align_indent,
|
3991
|
-
)
|
3992
|
-
|
3993
|
-
if strcmp(K, kws_cell)[0] == "border":
|
3994
|
-
#! border
|
3995
|
-
kws_border = [
|
3996
|
-
"color_left",
|
3997
|
-
"color_l",
|
3998
|
-
"color_right",
|
3999
|
-
"color_r",
|
4000
|
-
"color_top",
|
4001
|
-
"color_t",
|
4002
|
-
"color_bottom",
|
4003
|
-
"color_b",
|
4004
|
-
"color_diagonal",
|
4005
|
-
"color_d",
|
4006
|
-
"color_outline",
|
4007
|
-
"color_o",
|
4008
|
-
"color_vertical",
|
4009
|
-
"color_v",
|
4010
|
-
"color_horizontal",
|
4011
|
-
"color_h",
|
4012
|
-
"color",
|
4013
|
-
"style_left",
|
4014
|
-
"style_l",
|
4015
|
-
"style_right",
|
4016
|
-
"style_r",
|
4017
|
-
"style_top",
|
4018
|
-
"style_t",
|
4019
|
-
"style_bottom",
|
4020
|
-
"style_b",
|
4021
|
-
"style_diagonal",
|
4022
|
-
"style_d",
|
4023
|
-
"style_outline",
|
4024
|
-
"style_o",
|
4025
|
-
"style_vertical",
|
4026
|
-
"style_v",
|
4027
|
-
"style_horizontal",
|
4028
|
-
"style_h",
|
4029
|
-
"style",
|
4030
|
-
]
|
4031
|
-
# * border color
|
4032
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4033
|
-
"FF000000",
|
4034
|
-
"FF000000",
|
4035
|
-
"FF000000",
|
4036
|
-
"FF000000",
|
4037
|
-
)
|
4038
|
-
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4039
|
-
"FF000000",
|
4040
|
-
"FF000000",
|
4041
|
-
"FF000000",
|
4042
|
-
"FF000000",
|
4043
|
-
)
|
4044
|
-
# get colors config
|
4045
|
-
for k, v in cell.get(K, {}).items():
|
4046
|
-
if strcmp(k, kws_border)[0] in ["color"]:
|
4047
|
-
border_color_all = hex2argb(v)
|
4048
|
-
# if 'color' is given, every border color is first set to this same value
|
4049
|
-
# the individual border colors can then still be overridden below
|
4050
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4051
|
-
border_color_all,
|
4052
|
-
border_color_all,
|
4053
|
-
border_color_all,
|
4054
|
-
border_color_all,
|
4055
|
-
)
|
4056
|
-
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4057
|
-
border_color_all,
|
4058
|
-
border_color_all,
|
4059
|
-
border_color_all,
|
4060
|
-
border_color_all,
|
4061
|
-
)
|
4062
|
-
elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
|
4063
|
-
border_color_l = hex2argb(v)
|
4064
|
-
elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
|
4065
|
-
border_color_r = hex2argb(v)
|
4066
|
-
elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
|
4067
|
-
border_color_t = hex2argb(v)
|
4068
|
-
elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
|
4069
|
-
border_color_b = hex2argb(v)
|
4070
|
-
elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
|
4071
|
-
border_color_d = hex2argb(v)
|
4072
|
-
elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
|
4073
|
-
border_color_o = hex2argb(v)
|
4074
|
-
elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
|
4075
|
-
border_color_v = hex2argb(v)
|
4076
|
-
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4077
|
-
border_color_h = hex2argb(v)
|
4078
|
-
# *border style
|
4079
|
-
border_styles = [
|
4080
|
-
"thin",
|
4081
|
-
"medium",
|
4082
|
-
"thick",
|
4083
|
-
"dotted",
|
4084
|
-
"dashed",
|
4085
|
-
"hair",
|
4086
|
-
"mediumDashed",
|
4087
|
-
"dashDot",
|
4088
|
-
"dashDotDot",
|
4089
|
-
"slantDashDot",
|
4090
|
-
"none",
|
4091
|
-
]
|
4092
|
-
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4093
|
-
None,
|
4094
|
-
None,
|
4095
|
-
None,
|
4096
|
-
None,
|
4097
|
-
)
|
4098
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4099
|
-
None,
|
4100
|
-
None,
|
4101
|
-
None,
|
4102
|
-
None,
|
4103
|
-
)
|
4104
|
-
# get styles config
|
4105
|
-
for k, v in cell.get(K, {}).items():
|
4106
|
-
# if not "style" in k:
|
4107
|
-
# break
|
4108
|
-
if strcmp(k, kws_border)[0] in ["style"]:
|
4109
|
-
border_style_all = strcmp(v, border_styles)[0]
|
4110
|
-
# if 'style' is given, every border style is first set to this same value
|
4111
|
-
# the individual border styles can then still be overridden below
|
4112
|
-
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4113
|
-
border_style_all,
|
4114
|
-
border_style_all,
|
4115
|
-
border_style_all,
|
4116
|
-
border_style_all,
|
4117
|
-
)
|
4118
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4119
|
-
border_style_all,
|
4120
|
-
border_style_all,
|
4121
|
-
border_style_all,
|
4122
|
-
border_style_all,
|
4123
|
-
)
|
4124
|
-
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4125
|
-
border_style_l = strcmp(v, border_styles)[0]
|
4126
|
-
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4127
|
-
border_style_r = strcmp(v, border_styles)[0]
|
4128
|
-
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4129
|
-
border_style_t = strcmp(v, border_styles)[0]
|
4130
|
-
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4131
|
-
border_style_b = strcmp(v, border_styles)[0]
|
4132
|
-
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4133
|
-
border_style_d = strcmp(v, border_styles)[0]
|
4134
|
-
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4135
|
-
border_style_o = strcmp(v, border_styles)[0]
|
4136
|
-
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4137
|
-
border_style_v = strcmp(v, border_styles)[0]
|
4138
|
-
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4139
|
-
border_style_h = strcmp(v, border_styles)[0]
|
4140
|
-
# * apply border config
|
4141
|
-
border = Border(
|
4142
|
-
left=Side(border_style=border_style_l, color=border_color_l),
|
4143
|
-
right=Side(border_style=border_style_r, color=border_color_r),
|
4144
|
-
top=Side(border_style=border_style_t, color=border_color_t),
|
4145
|
-
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4146
|
-
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4147
|
-
diagonal_direction=0,
|
4148
|
-
outline=Side(border_style=border_style_o, color=border_color_o),
|
4149
|
-
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4150
|
-
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4151
|
-
)
|
4152
|
-
|
4153
|
-
#! final apply configs
|
4154
|
-
for row in ws[cell_range]:
|
4155
|
-
for cell_ in row:
|
4156
|
-
if cell_font:
|
4157
|
-
cell_.font = cell_font
|
4158
|
-
if cell_fill:
|
4159
|
-
cell_.fill = cell_fill
|
4160
|
-
if cell_alignment:
|
4161
|
-
cell_.alignment = cell_alignment
|
4162
|
-
if border:
|
4163
|
-
cell_.border = border
|
4164
|
-
|
4081
|
+
raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
|
4165
4082
|
|
4166
4083
|
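To make the padding rule above concrete: openpyxl expects an 8-digit AARRGGBB string, so a plain RGB value gains an opaque alpha prefix. A small standalone sketch of that conversion (not the packaged hex2argb, whose exact edge cases are not shown in this hunk):

def rgb_hex_to_argb(hex_color: str) -> str:
    """'#RRGGBB' or 'RRGGBB' -> 'AARRGGBB' with full opacity, as openpyxl expects."""
    h = hex_color.lstrip("#").upper()
    if len(h) == 6:   # plain RGB: prepend an opaque alpha channel
        return "FF" + h
    if len(h) == 8:   # already AARRGGBB
        return h
    raise ValueError("Use RRGGBB, #RRGGBB, or AARRGGBB format.")

# rgb_hex_to_argb("#1f77b4") -> 'FF1F77B4'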
def format_excel(
|
4167
4084
|
df=None,
|
@@ -4182,6 +4099,255 @@ def format_excel(
|
|
4182
4099
|
conditional_format=None, # dict
|
4183
4100
|
**kwargs,
|
4184
4101
|
):
|
4102
|
+
import pandas as pd
|
4103
|
+
from datetime import datetime
|
4104
|
+
from openpyxl import load_workbook
|
4105
|
+
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
4106
|
+
from openpyxl.utils import get_column_letter
|
4107
|
+
from openpyxl.worksheet.datavalidation import DataValidation
|
4108
|
+
from openpyxl.comments import Comment
|
4109
|
+
from openpyxl.formatting.rule import ColorScaleRule
|
4110
|
+
|
4111
|
+
def convert_indices_to_range(row_slice, col_slice):
|
4112
|
+
"""Convert numerical row and column slices to Excel-style range strings."""
|
4113
|
+
start_row = row_slice.start + 1
|
4114
|
+
end_row = row_slice.stop if row_slice.stop is not None else None
|
4115
|
+
start_col = col_slice.start + 1
|
4116
|
+
end_col = col_slice.stop if col_slice.stop is not None else None
|
4117
|
+
|
4118
|
+
start_col_letter = get_column_letter(start_col)
|
4119
|
+
end_col_letter = get_column_letter(end_col) if end_col else None
|
4120
|
+
return (
|
4121
|
+
f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
|
4122
|
+
if end_col_letter
|
4123
|
+
else f"{start_col_letter}{start_row}"
|
4124
|
+
)
|
4125
|
+
|
4126
|
+
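A worked example of the slice-to-range conversion above (values assumed from the code, using openpyxl's 1-based columns):

# convert_indices_to_range(slice(0, 3), slice(0, 2))
#   -> start_row=1, end_row=3, start_col='A', end_col='B'  =>  "A1:B3"
# convert_indices_to_range(slice(4, None), slice(0, None))
#   -> open-ended column slice, so only the start cell is returned  =>  "A5"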
def apply_format(ws, cell, cell_range):
|
4127
|
+
"""Apply cell formatting to a specified range."""
|
4128
|
+
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
4129
|
+
kws_cell = ["font", "fill", "alignment", "border"]
|
4130
|
+
for K, _ in cell.items():
|
4131
|
+
if strcmp(K, kws_cell)[0] == "font":
|
4132
|
+
#! font
|
4133
|
+
font_color = "000000"
|
4134
|
+
font_name = "Arial"
|
4135
|
+
font_underline = "none"
|
4136
|
+
font_size = 14
|
4137
|
+
font_bold = False
|
4138
|
+
font_strike = False
|
4139
|
+
font_italic = False
|
4140
|
+
kws_font = ["name","size","bold","underline","color","strike","italic"]
|
4141
|
+
for k_, v_ in cell.get(K, {}).items():
|
4142
|
+
if strcmp(k_, kws_font)[0] == "name":
|
4143
|
+
font_name = v_
|
4144
|
+
elif strcmp(k_, kws_font)[0] == "size":
|
4145
|
+
font_size = v_
|
4146
|
+
elif strcmp(k_, kws_font)[0] == "bold":
|
4147
|
+
font_bold = v_
|
4148
|
+
elif strcmp(k_, kws_font)[0] == "underline":
|
4149
|
+
font_underline = strcmp(v_, ["none", "single", "double"])[0]
|
4150
|
+
elif strcmp(k_, kws_font)[0] == "color":
|
4151
|
+
font_color = hex2argb(v_)
|
4152
|
+
elif strcmp(k_, kws_font)[0] == "strike":
|
4153
|
+
font_strike = v_
|
4154
|
+
elif strcmp(k_, kws_font)[0] == "italic":
|
4155
|
+
font_italic = v_
|
4156
|
+
|
4157
|
+
cell_font = Font(
|
4158
|
+
name=font_name,
|
4159
|
+
size=font_size,
|
4160
|
+
bold=font_bold,
|
4161
|
+
italic=font_italic,
|
4162
|
+
underline=font_underline,
|
4163
|
+
strike=font_strike,
|
4164
|
+
color=font_color,
|
4165
|
+
)
|
4166
|
+
|
4167
|
+
if strcmp(K, kws_cell)[0] == "fill":
|
4168
|
+
#! fill
|
4169
|
+
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
4170
|
+
kws_fill_type = ["darkVertical","lightDown","lightGrid","solid","darkDown","lightGray","lightUp","gray0625","lightVertical","lightHorizontal",
|
4171
|
+
"darkHorizontal","gray125","darkUp","mediumGray","darkTrellis","darkGray","lightTrellis","darkGrid"]
|
4172
|
+
start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
|
4173
|
+
for k, v in cell.get(K, {}).items():
|
4174
|
+
if strcmp(k, kws_fill)[0] == "color":
|
4175
|
+
start_color, end_color = hex2argb(v), hex2argb(v)
|
4176
|
+
break
|
4177
|
+
for k, v in cell.get(K, {}).items():
|
4178
|
+
if strcmp(k, kws_fill)[0] == "start_color":
|
4179
|
+
start_color = hex2argb(v)
|
4180
|
+
elif strcmp(k, kws_fill)[0] == "end_color":
|
4181
|
+
end_color = hex2argb(v)
|
4182
|
+
elif strcmp(k, kws_fill)[0] == "fill_type":
|
4183
|
+
fill_type = strcmp(v, kws_fill_type)[0]
|
4184
|
+
cell_fill = PatternFill(
|
4185
|
+
start_color=start_color,
|
4186
|
+
end_color=end_color,
|
4187
|
+
fill_type=fill_type,
|
4188
|
+
)
|
4189
|
+
|
4190
|
+
if strcmp(K, kws_cell)[0] == "alignment":
|
4191
|
+
#! alignment
|
4192
|
+
# default
|
4193
|
+
align_horizontal = "general"
|
4194
|
+
align_vertical = "center"
|
4195
|
+
align_rot = 0
|
4196
|
+
align_wrap = False
|
4197
|
+
align_shrink = False
|
4198
|
+
align_indent = 0
|
4199
|
+
kws_align = [
|
4200
|
+
"horizontal",
|
4201
|
+
"ha",
|
4202
|
+
"vertical",
|
4203
|
+
"va",
|
4204
|
+
"text_rotation",
|
4205
|
+
"rotat",
|
4206
|
+
"rot",
|
4207
|
+
"wrap_text",
|
4208
|
+
"wrap",
|
4209
|
+
"shrink_to_fit",
|
4210
|
+
"shrink",
|
4211
|
+
"indent",
|
4212
|
+
]
|
4213
|
+
for k, v in cell.get(K, {}).items():
|
4214
|
+
if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
|
4215
|
+
align_horizontal = strcmp(
|
4216
|
+
v, ["general", "left", "right", "center"]
|
4217
|
+
)[0]
|
4218
|
+
elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
|
4219
|
+
align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
|
4220
|
+
elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
|
4221
|
+
align_rot = v
|
4222
|
+
elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
|
4223
|
+
align_wrap = v
|
4224
|
+
elif strcmp(k, kws_align)[0] in [
|
4225
|
+
"shrink_to_fit",
|
4226
|
+
"shrink",
|
4227
|
+
"wrap_text",
|
4228
|
+
"wrap",
|
4229
|
+
]:
|
4230
|
+
align_shrink = v
|
4231
|
+
elif strcmp(k, kws_align)[0] in ["indent"]:
|
4232
|
+
align_indent = v
|
4233
|
+
cell_alignment = Alignment(
|
4234
|
+
horizontal=align_horizontal,
|
4235
|
+
vertical=align_vertical,
|
4236
|
+
text_rotation=align_rot,
|
4237
|
+
wrap_text=align_wrap,
|
4238
|
+
shrink_to_fit=align_shrink,
|
4239
|
+
indent=align_indent,
|
4240
|
+
)
|
4241
|
+
|
4242
|
+
if strcmp(K, kws_cell)[0] == "border":
|
4243
|
+
#! border
|
4244
|
+
kws_border = ["color_left","color_l","color_right","color_r","color_top","color_t","color_bottom","color_b",
|
4245
|
+
"color_diagonal","color_d","color_outline","color_o","color_vertical","color_v","color_horizontal",
|
4246
|
+
"color_h","color","style_left","style_l","style_right","style_r","style_top","style_t","style_bottom","style_b",
|
4247
|
+
"style_diagonal","style_d","style_outline","style_o","style_vertical","style_v","style_horizontal",
|
4248
|
+
"style_h","style"]
|
4249
|
+
# * border color
|
4250
|
+
border_color_l, border_color_r, border_color_t, border_color_b = ("FF000000","FF000000","FF000000","FF000000")
|
4251
|
+
border_color_d, border_color_o, border_color_v, border_color_h = ("FF000000","FF000000","FF000000","FF000000")
|
4252
|
+
# get colors config
|
4253
|
+
for k, v in cell.get(K, {}).items():
|
4254
|
+
if strcmp(k, kws_border)[0] in ["color"]:
|
4255
|
+
border_color_all = hex2argb(v)
|
4256
|
+
# if 'color' is given, every border color is first set to this same value
|
4257
|
+
# the individual border colors can then still be overridden below
|
4258
|
+
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4259
|
+
border_color_all,
|
4260
|
+
border_color_all,
|
4261
|
+
border_color_all,
|
4262
|
+
border_color_all,
|
4263
|
+
)
|
4264
|
+
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4265
|
+
border_color_all,
|
4266
|
+
border_color_all,
|
4267
|
+
border_color_all,
|
4268
|
+
border_color_all,
|
4269
|
+
)
|
4270
|
+
elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
|
4271
|
+
border_color_l = hex2argb(v)
|
4272
|
+
elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
|
4273
|
+
border_color_r = hex2argb(v)
|
4274
|
+
elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
|
4275
|
+
border_color_t = hex2argb(v)
|
4276
|
+
elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
|
4277
|
+
border_color_b = hex2argb(v)
|
4278
|
+
elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
|
4279
|
+
border_color_d = hex2argb(v)
|
4280
|
+
elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
|
4281
|
+
border_color_o = hex2argb(v)
|
4282
|
+
elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
|
4283
|
+
border_color_v = hex2argb(v)
|
4284
|
+
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4285
|
+
border_color_h = hex2argb(v)
|
4286
|
+
# *border style
|
4287
|
+
border_styles = ["thin","medium","thick","dotted","dashed",
|
4288
|
+
"hair","mediumDashed","dashDot","dashDotDot","slantDashDot","none"]
|
4289
|
+
border_style_l, border_style_r, border_style_t, border_style_b = (None,None,None,None)
|
4290
|
+
border_style_d, border_style_o, border_style_v, border_style_h = (None,None,None,None)
|
4291
|
+
# get styles config
|
4292
|
+
for k, v in cell.get(K, {}).items():
|
4293
|
+
# if not "style" in k:
|
4294
|
+
# break
|
4295
|
+
if strcmp(k, kws_border)[0] in ["style"]:
|
4296
|
+
border_style_all = strcmp(v, border_styles)[0]
|
4297
|
+
# if 'style' is given, every border style is first set to this same value
|
4298
|
+
# the individual border styles can then still be overridden below
|
4299
|
+
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4300
|
+
border_style_all,
|
4301
|
+
border_style_all,
|
4302
|
+
border_style_all,
|
4303
|
+
border_style_all,
|
4304
|
+
)
|
4305
|
+
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4306
|
+
border_style_all,
|
4307
|
+
border_style_all,
|
4308
|
+
border_style_all,
|
4309
|
+
border_style_all,
|
4310
|
+
)
|
4311
|
+
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4312
|
+
border_style_l = strcmp(v, border_styles)[0]
|
4313
|
+
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4314
|
+
border_style_r = strcmp(v, border_styles)[0]
|
4315
|
+
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4316
|
+
border_style_t = strcmp(v, border_styles)[0]
|
4317
|
+
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4318
|
+
border_style_b = strcmp(v, border_styles)[0]
|
4319
|
+
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4320
|
+
border_style_d = strcmp(v, border_styles)[0]
|
4321
|
+
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4322
|
+
border_style_o = strcmp(v, border_styles)[0]
|
4323
|
+
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4324
|
+
border_style_v = strcmp(v, border_styles)[0]
|
4325
|
+
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4326
|
+
border_style_h = strcmp(v, border_styles)[0]
|
4327
|
+
# * apply border config
|
4328
|
+
border = Border(
|
4329
|
+
left=Side(border_style=border_style_l, color=border_color_l),
|
4330
|
+
right=Side(border_style=border_style_r, color=border_color_r),
|
4331
|
+
top=Side(border_style=border_style_t, color=border_color_t),
|
4332
|
+
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4333
|
+
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4334
|
+
diagonal_direction=0,
|
4335
|
+
outline=Side(border_style=border_style_o, color=border_color_o),
|
4336
|
+
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4337
|
+
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4338
|
+
)
|
4339
|
+
|
4340
|
+
#! final apply configs
|
4341
|
+
for row in ws[cell_range]:
|
4342
|
+
for cell_ in row:
|
4343
|
+
if cell_font:
|
4344
|
+
cell_.font = cell_font
|
4345
|
+
if cell_fill:
|
4346
|
+
cell_.fill = cell_fill
|
4347
|
+
if cell_alignment:
|
4348
|
+
cell_.alignment = cell_alignment
|
4349
|
+
if border:
|
4350
|
+
cell_.border = border
|
4185
4351
|
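Reading apply_format above, each styled range is described by a nested dict keyed (fuzzily, via strcmp) on font / fill / alignment / border. A hedged sketch of such a spec; the inner keys are taken from the code, while the exact outer shape of format_excel's cell argument is an assumption:

# cell_style = {
#     "font":      {"name": "Arial", "size": 12, "bold": True, "color": "#1F4E79"},
#     "fill":      {"color": "#D9E1F2", "fill_type": "solid"},
#     "alignment": {"ha": "center", "va": "center", "wrap": True},
#     "border":    {"color": "#999999", "style": "thin"},
# }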
if not isinstance(df, pd.DataFrame):
|
4186
4352
|
try:
|
4187
4353
|
print(f"is loading file {os.path.basename(df)}")
|
@@ -4527,12 +4693,10 @@ format_excel(
|
|
4527
4693
|
print(f"Formatted Excel file saved as:\n{filename}")
|
4528
4694
|
|
4529
4695
|
|
4530
|
-
from IPython.display import display, HTML, Markdown
|
4531
|
-
|
4532
|
-
|
4533
4696
|
def preview(var):
|
4534
4697
|
"""Master function to preview formatted variables in Jupyter."""
|
4535
|
-
|
4698
|
+
from bs4 import BeautifulSoup
|
4699
|
+
from IPython.display import display, HTML, Markdown
|
4536
4700
|
if isinstance(var, str):
|
4537
4701
|
if isa(var, "html"):
|
4538
4702
|
display(HTML(var)) # Render as HTML
|
@@ -4549,6 +4713,7 @@ def preview(var):
|
|
4549
4713
|
display(var)
|
4550
4714
|
|
4551
4715
|
elif isinstance(var, list) or isinstance(var, dict):
|
4716
|
+
import json
|
4552
4717
|
# Display JSON
|
4553
4718
|
json_str = json.dumps(var, indent=4)
|
4554
4719
|
display(Markdown(f"```json\n{json_str}\n```"))
|
@@ -4562,6 +4727,7 @@ def preview(var):
|
|
4562
4727
|
display(Image(filename=var))
|
4563
4728
|
|
4564
4729
|
elif isinstance(var, dict):
|
4730
|
+
import json
|
4565
4731
|
# Handle dictionary formatting
|
4566
4732
|
json_str = json.dumps(var, indent=4)
|
4567
4733
|
display(Markdown(f"```json\n{json_str}\n```"))
|
@@ -4569,13 +4735,154 @@ def preview(var):
|
|
4569
4735
|
else:
|
4570
4736
|
# If the format is not recognized, print a message
|
4571
4737
|
print("Format not recognized or unsupported.")
|
4572
|
-
|
4573
|
-
|
4574
4738
|
# # Example usages:
|
4575
4739
|
# preview("This is a plain text message.")
|
4576
4740
|
# preview("# This is a Markdown header")
|
4577
4741
|
# preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
|
4578
4742
|
# preview({"key": "value", "numbers": [1, 2, 3]})
|
4743
|
+
|
4744
|
+
def _df_outlier(
|
4745
|
+
data,
|
4746
|
+
columns=None,
|
4747
|
+
method=["zscore", "iqr", "percentile", "iforest"],
|
4748
|
+
min_outlier_method=3, # a value is treated as an outlier only if flagged by at least this many methods
|
4749
|
+
zscore_threshold=3,
|
4750
|
+
iqr_threshold=1.5,
|
4751
|
+
lower_percentile=5,
|
4752
|
+
upper_percentile=95,
|
4753
|
+
):
|
4754
|
+
from scipy.stats import zscore
|
4755
|
+
from sklearn.ensemble import IsolationForest
|
4756
|
+
from sklearn.preprocessing import StandardScaler
|
4757
|
+
|
4758
|
+
col_names_org = data.columns.tolist()
|
4759
|
+
index_names_org = data.index.tolist()
|
4760
|
+
# Separate numeric and non-numeric columns
|
4761
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
4762
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
4763
|
+
|
4764
|
+
if columns is not None:
|
4765
|
+
numeric_data = numeric_data[columns]
|
4766
|
+
elif numeric_data.empty:
|
4767
|
+
raise ValueError("Input data must contain numeric columns.")
|
4768
|
+
|
4769
|
+
outliers_df = pd.DataFrame(index=numeric_data.index)
|
4770
|
+
if isinstance(method, str):
|
4771
|
+
method = [method]
|
4772
|
+
|
4773
|
+
# Z-score method
|
4774
|
+
if "zscore" in method:
|
4775
|
+
z_scores = np.abs(zscore(numeric_data))
|
4776
|
+
outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
|
4777
|
+
|
4778
|
+
# IQR method
|
4779
|
+
if "iqr" in method:
|
4780
|
+
Q1 = numeric_data.quantile(0.25)
|
4781
|
+
Q3 = numeric_data.quantile(0.75)
|
4782
|
+
IQR = Q3 - Q1
|
4783
|
+
lower_bound = Q1 - iqr_threshold * IQR
|
4784
|
+
upper_bound = Q3 + iqr_threshold * IQR
|
4785
|
+
outliers_df["iqr"] = (
|
4786
|
+
(numeric_data < lower_bound) | (numeric_data > upper_bound)
|
4787
|
+
).any(axis=1)
|
4788
|
+
|
4789
|
+
# Percentile method
|
4790
|
+
if "percentile" in method:
|
4791
|
+
lower_bound = numeric_data.quantile(lower_percentile / 100)
|
4792
|
+
upper_bound = numeric_data.quantile(upper_percentile / 100)
|
4793
|
+
outliers_df["percentile"] = (
|
4794
|
+
(numeric_data < lower_bound) | (numeric_data > upper_bound)
|
4795
|
+
).any(axis=1)
|
4796
|
+
|
4797
|
+
# Isolation Forest method
|
4798
|
+
if "iforest" in method:
|
4799
|
+
# the iforest method cannot handle NaNs, so fill them with the column mean first
|
4800
|
+
numeric_data_ = numeric_data.fillna(numeric_data.mean())
|
4801
|
+
scaler = StandardScaler()
|
4802
|
+
scaled_data = scaler.fit_transform(numeric_data_)
|
4803
|
+
iso_forest = IsolationForest(contamination=0.05)
|
4804
|
+
outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
|
4805
|
+
|
4806
|
+
# Combine all outlier detections
|
4807
|
+
if len(method) == 4: # all methods are used:
|
4808
|
+
outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
|
4809
|
+
else:
|
4810
|
+
outliers_df["outlier"] = outliers_df.any(axis=1)
|
4811
|
+
|
4812
|
+
# Handling Outliers: Remove or Winsorize or Replace with NaN
|
4813
|
+
processed_data = numeric_data.copy()
|
4814
|
+
|
4815
|
+
processed_data.loc[outliers_df["outlier"]] = np.nan
|
4816
|
+
|
4817
|
+
return processed_data
|
4818
|
+
|
4819
|
+
|
4820
|
+
def df_outlier(
|
4821
|
+
data,
|
4822
|
+
columns=None,
|
4823
|
+
method=["zscore", "iqr", "percentile", "iforest"],
|
4824
|
+
min_outlier_method=2, # a row is treated as an outlier only if flagged by at least two methods
|
4825
|
+
zscore_threshold=3,
|
4826
|
+
iqr_threshold=1.5,
|
4827
|
+
lower_percentile=5,
|
4828
|
+
upper_percentile=95,
|
4829
|
+
):
|
4830
|
+
"""
|
4831
|
+
Usage:
|
4832
|
+
data_out = df_outlier(
|
4833
|
+
data,
|
4834
|
+
columns=["income"],
|
4835
|
+
method="iforest",
|
4836
|
+
min_outlier_method=1)
|
4837
|
+
|
4838
|
+
Advanced outlier detection and handling function.
|
4839
|
+
|
4840
|
+
Parameters:
|
4841
|
+
- data: DataFrame, the input data (numerical).
|
4842
|
+
- method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
|
4843
|
+
- zscore_threshold: float, threshold for Z-score outlier detection (default 3).
|
4844
|
+
- iqr_threshold: float, threshold for IQR method (default 1.5).
|
4845
|
+
- lower_percentile: float, lower percentile for percentile-based outliers (default 5).
|
4846
|
+
- upper_percentile: float, upper percentile for percentile-based outliers (default 95).
|
4847
|
+
- keep_nan: bool, whether to replace outliers with NaN (default True).
|
4848
|
+
- plot: bool, whether to visualize the outliers (default False).
|
4849
|
+
- min_outlier_method: int, minimum number of methods that need to flag a row as an outlier (default 2).
|
4850
|
+
- inplace: bool, whether to modify the original `data` DataFrame (default False).
|
4851
|
+
|
4852
|
+
Returns:
|
4853
|
+
- processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
|
4854
|
+
"""
|
4855
|
+
col_names_org = data.columns.tolist()
|
4856
|
+
index_names_org = data.index.tolist()
|
4857
|
+
|
4858
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
4859
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
4860
|
+
|
4861
|
+
_outlier_df_tmp = pd.DataFrame()
|
4862
|
+
for col in numeric_data.columns:
|
4863
|
+
_outlier_df_tmp = pd.concat(
|
4864
|
+
[
|
4865
|
+
_outlier_df_tmp,
|
4866
|
+
_df_outlier(
|
4867
|
+
data=data,
|
4868
|
+
columns=[col],
|
4869
|
+
method=method,
|
4870
|
+
min_outlier_method=min_outlier_method, # minimum number of methods that must agree
|
4871
|
+
zscore_threshold=zscore_threshold,
|
4872
|
+
iqr_threshold=iqr_threshold,
|
4873
|
+
lower_percentile=lower_percentile,
|
4874
|
+
upper_percentile=upper_percentile,
|
4875
|
+
),
|
4876
|
+
],
|
4877
|
+
axis=1,
|
4878
|
+
# join="inner",
|
4879
|
+
)
|
4880
|
+
processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
|
4881
|
+
processed_data = processed_data[col_names_org]
|
4882
|
+
return processed_data
|
4883
|
+
|
4884
|
+
|
4885
|
+
|
4579
4886
|
def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
|
4580
4887
|
"""
|
4581
4888
|
Extend a DataFrame by the list elements in the column.
|
@@ -4967,6 +5274,7 @@ def df_drop_duplicates(
|
|
4967
5274
|
return None
|
4968
5275
|
else:
|
4969
5276
|
return result
|
5277
|
+
#! fillna()
|
4970
5278
|
def df_fillna(
|
4971
5279
|
data: pd.DataFrame,
|
4972
5280
|
method: str = "knn",
|
@@ -4974,8 +5282,8 @@ def df_fillna(
|
|
4974
5282
|
constant: float = None,
|
4975
5283
|
n_neighbors: int = 5, # KNN-specific
|
4976
5284
|
max_iter: int = 10, # Iterative methods specific
|
4977
|
-
inplace: bool =
|
4978
|
-
random_state:int =
|
5285
|
+
inplace: bool = False,
|
5286
|
+
random_state:int = 1
|
4979
5287
|
) -> pd.DataFrame:
|
4980
5288
|
"""
|
4981
5289
|
Fill missing values in a DataFrame using specified imputation method.
|
@@ -5003,7 +5311,18 @@ def df_fillna(
|
|
5003
5311
|
inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
|
5004
5312
|
|
5005
5313
|
"""
|
5006
|
-
|
5314
|
+
if isinstance(data, pd.Series):
|
5315
|
+
data=pd.DataFrame(data)
|
5316
|
+
# handle None
|
5317
|
+
for col in data.columns:
|
5318
|
+
data[col] = data[col].apply(lambda x: np.nan if x is None else x)
|
5319
|
+
|
5320
|
+
col_names_org = data.columns.tolist()
|
5321
|
+
index_names_org = data.index.tolist()
|
5322
|
+
# Separate numeric and non-numeric columns
|
5323
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
5324
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
5325
|
+
|
5007
5326
|
if data.empty:
|
5008
5327
|
raise ValueError("Input DataFrame is empty.")
|
5009
5328
|
|
@@ -5032,15 +5351,6 @@ def df_fillna(
|
|
5032
5351
|
from sklearn.impute import IterativeImputer
|
5033
5352
|
|
5034
5353
|
imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
|
5035
|
-
# elif method == "missforest":
|
5036
|
-
# from missingpy import MissForest
|
5037
|
-
# imputer = MissForest(max_iter=max_iter, random_state=random_state)
|
5038
|
-
# elif method == "softimpute":
|
5039
|
-
# from fancyimpute import SoftImpute
|
5040
|
-
# imputer = SoftImpute()
|
5041
|
-
# elif method == "svd":
|
5042
|
-
# from fancyimpute import IterativeSVD
|
5043
|
-
# imputer = IterativeSVD(max_iters=max_iter)
|
5044
5354
|
else: # mean, median, most_frequent
|
5045
5355
|
from sklearn.impute import SimpleImputer
|
5046
5356
|
imputer = SimpleImputer(strategy=method)
|
@@ -5048,26 +5358,49 @@ def df_fillna(
|
|
5048
5358
|
# Fit and transform the data
|
5049
5359
|
if axis == 0:
|
5050
5360
|
# Impute column-wise
|
5051
|
-
imputed_data = imputer.fit_transform(
|
5052
|
-
imputed_data.shape
|
5361
|
+
imputed_data = imputer.fit_transform(numeric_data)
|
5053
5362
|
elif axis == 1:
|
5054
5363
|
# Impute row-wise
|
5055
|
-
imputed_data = imputer.fit_transform(
|
5056
|
-
imputed_data.shape
|
5364
|
+
imputed_data = imputer.fit_transform(numeric_data.T)
|
5057
5365
|
else:
|
5058
5366
|
raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
|
5059
5367
|
|
5060
|
-
|
5368
|
+
imputed_data = pd.DataFrame(
|
5061
5369
|
imputed_data if axis == 0 else imputed_data.T,
|
5062
|
-
index=
|
5063
|
-
columns=
|
5370
|
+
index=numeric_data.index if axis == 0 else data.columns,
|
5371
|
+
columns=numeric_data.columns if axis == 0 else data.index,
|
5064
5372
|
)
|
5373
|
+
for col in imputed_data.select_dtypes(include=[np.number]).columns:
|
5374
|
+
imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
|
5375
|
+
|
5376
|
+
# Handle non-numeric data imputation
|
5377
|
+
if not non_numeric_data.empty:
|
5378
|
+
from sklearn.impute import SimpleImputer
|
5379
|
+
if method == "constant":
|
5380
|
+
non_numeric_imputer = SimpleImputer(strategy="constant", fill_value=constant)
|
5381
|
+
else:
|
5382
|
+
non_numeric_imputer = SimpleImputer(strategy="most_frequent")
|
5383
|
+
|
5384
|
+
# Impute non-numeric columns column-wise (axis=0)
|
5385
|
+
imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
|
5386
|
+
|
5387
|
+
# Convert imputed non-numeric array back to DataFrame with original index and column names
|
5388
|
+
imputed_non_numeric_df = pd.DataFrame(
|
5389
|
+
imputed_non_numeric, index=non_numeric_data.index, columns=non_numeric_data.columns
|
5390
|
+
)
|
5391
|
+
else:
|
5392
|
+
imputed_non_numeric_df = pd.DataFrame(index=data.index)
|
5393
|
+
|
5394
|
+
|
5395
|
+
imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(columns=data.columns)
|
5065
5396
|
|
5066
5397
|
if inplace:
|
5067
|
-
|
5068
|
-
|
5398
|
+
# Modify the original DataFrame
|
5399
|
+
data[:] = imputed_data[col_names_org]
|
5400
|
+
return None
|
5069
5401
|
else:
|
5070
|
-
|
5402
|
+
# Return the modified DataFrame
|
5403
|
+
return imputed_data[col_names_org]
|
5071
5404
|
# # example
|
5072
5405
|
# data = {
|
5073
5406
|
# "A": [1, 2, np.nan, 4, 5],
|
@@ -5097,7 +5430,94 @@ def df_fillna(
|
|
5097
5430
|
# display(df)
|
5098
5431
|
# display(df_fillna(data=df, method=method_name, inplace=False, axis=0))
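The commented examples above exercise numeric imputation; what this version adds is separate handling of non-numeric columns (most_frequent, or a constant). A hedged sketch of that behaviour:

# df_mixed = pd.DataFrame({
#     "A": [1.0, 2.0, np.nan, 4.0],
#     "city": ["Berlin", None, "Berlin", "Hamburg"],
# })
# filled = df_fillna(df_mixed, method="mean", inplace=False)
# expected (assumption based on the code above): 'A' gets its column mean,
# and the missing 'city' entry becomes the most frequent value, "Berlin"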
|
5099
5432
|
|
5100
|
-
|
5433
|
+
def df_encoder(
|
5434
|
+
data: pd.DataFrame,
|
5435
|
+
method: str = "dummy",#'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
|
5436
|
+
columns=None,
|
5437
|
+
target_column=None, # Required for 'target' encoding method
|
5438
|
+
**kwargs
|
5439
|
+
) -> pd.DataFrame:
|
5440
|
+
"""
|
5441
|
+
Methods explained:
|
5442
|
+
- 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
|
5443
|
+
|
5444
|
+
- 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
|
5445
|
+
|
5446
|
+
- 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
|
5447
|
+
|
5448
|
+
- 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
|
5449
|
+
|
5450
|
+
- 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
|
5451
|
+
|
5452
|
+
- 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
|
5453
|
+
"""
|
5454
|
+
|
5455
|
+
# Select categorical columns
|
5456
|
+
categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
|
5457
|
+
methods = ["dummy","onehot", "ordinal", "label", "target", "binary"]
|
5458
|
+
method = strcmp(method, methods)[0]
|
5459
|
+
|
5460
|
+
if columns is None:
|
5461
|
+
columns = categorical_cols
|
5462
|
+
|
5463
|
+
# pd.get_dummies()
|
5464
|
+
if method=='dummy':
|
5465
|
+
dtype=kwargs.pop("dtype",int)
|
5466
|
+
drop_first=kwargs.pop("drop_first",True)
|
5467
|
+
try:
|
5468
|
+
encoded_df = pd.get_dummies(data[columns], drop_first=drop_first, dtype=dtype, **kwargs)
|
5469
|
+
return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
|
5470
|
+
except Exception as e:
|
5471
|
+
# print(f"Warning, 没有进行转换, 因为: {e}")
|
5472
|
+
return data
|
5473
|
+
# One-hot encoding
|
5474
|
+
elif method == "onehot":
|
5475
|
+
from sklearn.preprocessing import OneHotEncoder
|
5476
|
+
|
5477
|
+
encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
|
5478
|
+
encoded_data = encoder.fit_transform(data[columns])
|
5479
|
+
encoded_df = pd.DataFrame(
|
5480
|
+
encoded_data,
|
5481
|
+
columns=encoder.get_feature_names_out(columns),
|
5482
|
+
index=data.index,
|
5483
|
+
)
|
5484
|
+
return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
|
5485
|
+
|
5486
|
+
# Ordinal encoding
|
5487
|
+
elif method == "ordinal":
|
5488
|
+
from sklearn.preprocessing import OrdinalEncoder
|
5489
|
+
|
5490
|
+
encoder = OrdinalEncoder(**kwargs)
|
5491
|
+
encoded_data = encoder.fit_transform(data[columns])
|
5492
|
+
encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
|
5493
|
+
return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
|
5494
|
+
|
5495
|
+
# Label encoding
|
5496
|
+
elif method == "label":
|
5497
|
+
from sklearn.preprocessing import LabelEncoder
|
5498
|
+
|
5499
|
+
encoder = LabelEncoder()
|
5500
|
+
encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
|
5501
|
+
return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
|
5502
|
+
|
5503
|
+
# Target encoding (Mean of the target for each category)
|
5504
|
+
elif method == "target":
|
5505
|
+
if target_column is None:
|
5506
|
+
raise ValueError("target_column must be provided for target encoding.")
|
5507
|
+
from category_encoders import TargetEncoder
|
5508
|
+
|
5509
|
+
encoder = TargetEncoder(cols=columns, **kwargs)
|
5510
|
+
encoded_data = encoder.fit_transform(data[columns], data[target_column])
|
5511
|
+
return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
|
5512
|
+
|
5513
|
+
# Binary encoding (for high-cardinality categorical variables)
|
5514
|
+
elif method == "binary":
|
5515
|
+
from category_encoders import BinaryEncoder
|
5516
|
+
|
5517
|
+
encoder = BinaryEncoder(cols=columns, **kwargs)
|
5518
|
+
encoded_data = encoder.fit_transform(data[columns])
|
5519
|
+
return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
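Since the docstring above compares six encoding strategies, a compact hedged usage sketch may help; the toy column names are made up, and only the 'dummy' and 'label' calls rely strictly on code shown in this hunk.

import pandas as pd

df_enc = pd.DataFrame({"size": ["S", "M", "L", "M"], "price": [3.5, 4.0, 5.5, 4.2]})
dummies = df_encoder(df_enc, method="dummy")   # one 0/1 indicator column per non-first category
labels = df_encoder(df_enc, method="label")    # 'size' becomes integer codes
# target encoding additionally needs the target column (requires category_encoders):
# df_encoder(df_enc, method="target", columns=["size"], target_column="price")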
|
5520
|
+
|
5101
5521
|
def df_scaler(
|
5102
5522
|
data: pd.DataFrame, # should be numeric dtype
|
5103
5523
|
method="standard",
|
@@ -5143,9 +5563,8 @@ def df_scaler(
|
|
5143
5563
|
if axis == 0:
|
5144
5564
|
# Column-wise scaling (default)
|
5145
5565
|
if columns is None:
|
5146
|
-
columns = data.select_dtypes(include=
|
5566
|
+
columns = data.select_dtypes(include=np.number).columns.tolist()
|
5147
5567
|
non_numeric_columns = data.columns.difference(columns)
|
5148
|
-
print(f"Scaling columns")
|
5149
5568
|
|
5150
5569
|
scaled_data = scaler.fit_transform(data[columns])
|
5151
5570
|
|
@@ -5167,7 +5586,7 @@ def df_scaler(
|
|
5167
5586
|
# Row-wise scaling
|
5168
5587
|
if columns is None:
|
5169
5588
|
columns = data.index.tolist()
|
5170
|
-
numeric_rows = data.loc[columns].select_dtypes(include=
|
5589
|
+
numeric_rows = data.loc[columns].select_dtypes(include=np.number)
|
5171
5590
|
if numeric_rows.empty:
|
5172
5591
|
raise ValueError("No numeric rows to scale.")
|
5173
5592
|
|
@@ -5184,7 +5603,31 @@ def df_scaler(
|
|
5184
5603
|
scaled_df = data.copy()
|
5185
5604
|
scaled_df.loc[numeric_rows.index] = scaled_data
|
5186
5605
|
return scaled_df
|
5606
|
+
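A brief hedged example of the scaler above: by default numeric columns are scaled column-wise (axis=0) and non-numeric columns are passed through unchanged; the call below assumes the signature visible in this hunk.

df_num = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0], "label": ["x", "y", "z"]})
scaled = df_scaler(df_num, method="standard", axis=0)  # 'label' is expected to stay untouched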
def df_special_characters_cleaner(
|
5607
|
+
data: pd.DataFrame, where=["column", "content", "index"]
|
5608
|
+
) -> pd.DataFrame:
|
5609
|
+
"""
|
5610
|
+
to clean special characters:
|
5611
|
+
usage:
|
5612
|
+
df_special_characters_cleaner(data=df, where='column')
|
5613
|
+
"""
|
5614
|
+
if not isinstance(where, list):
|
5615
|
+
where = [where]
|
5616
|
+
where_to_clean = ["column", "content", "index"]
|
5617
|
+
where_ = [strcmp(i, where_to_clean)[0] for i in where]
|
5618
|
+
|
5619
|
+
# 1. Clean column names by replacing special characters with underscores
|
5620
|
+
if "column" in where_:
|
5621
|
+
data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
|
5622
|
+
|
5623
|
+
# 2. Clean only object-type columns (text columns)
|
5624
|
+
if "content" in where_:
|
5625
|
+
for col in data.select_dtypes(include=["object"]).columns:
|
5626
|
+
data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
|
5627
|
+
if data.index.dtype == "object" and index in where_:
|
5628
|
+
data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
|
5187
5629
|
|
5630
|
+
return data
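A one-line hedged illustration of the cleaner above (special characters in column names are replaced with underscores):

# df_special_characters_cleaner(pd.DataFrame({"weight (kg)": [70, 81]}), where="column")
# -> the column is expected to come back as "weight _kg_"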
|
5188
5631
|
def df_cluster(
|
5189
5632
|
data: pd.DataFrame,
|
5190
5633
|
columns: Optional[list] = None,
|
@@ -5193,8 +5636,8 @@ def df_cluster(
|
|
5193
5636
|
scale: bool = True,
|
5194
5637
|
plot: Union[str, list] = "all",
|
5195
5638
|
inplace: bool = True,
|
5196
|
-
ax
|
5197
|
-
)
|
5639
|
+
ax = None,
|
5640
|
+
):
|
5198
5641
|
from sklearn.preprocessing import StandardScaler
|
5199
5642
|
from sklearn.cluster import KMeans
|
5200
5643
|
from sklearn.metrics import silhouette_score, silhouette_samples
|
@@ -5202,7 +5645,6 @@ def df_cluster(
|
|
5202
5645
|
import numpy as np
|
5203
5646
|
import pandas as pd
|
5204
5647
|
import matplotlib.pyplot as plt
|
5205
|
-
import seaborn as sns
|
5206
5648
|
|
5207
5649
|
"""
|
5208
5650
|
Performs clustering analysis on the provided feature matrix using K-Means.
|
@@ -5510,80 +5952,61 @@ def df_reducer(
|
|
5510
5952
|
umap_neighbors: int = 15, # UMAP-specific
|
5511
5953
|
umap_min_dist: float = 0.1, # UMAP-specific
|
5512
5954
|
tsne_perplexity: int = 30, # t-SNE-specific
|
5955
|
+
hue:str = None,# lda-specific
|
5513
5956
|
scale: bool = True,
|
5514
5957
|
fill_missing: bool = True,
|
5515
5958
|
debug: bool = False,
|
5516
5959
|
inplace: bool = True, # replace the original data
|
5517
5960
|
plot_:bool = False,# plot scatterplot, but without 'hue' it is not very meaningful
|
5518
|
-
|
5519
|
-
|
5520
|
-
|
5521
|
-
|
5522
|
-
|
5523
|
-
|
5524
|
-
|
5525
|
-
|
5526
|
-
|
5527
|
-
|
5528
|
-
|
5529
|
-
|
5530
|
-
|
5531
|
-
|
5532
|
-
|
5533
|
-
|
5534
|
-
|
5535
|
-
|
5536
|
-
|
5537
|
-
|
5538
|
-
|
5539
|
-
|
5540
|
-
|
5541
|
-
|
5542
|
-
scale : bool, optional, default=True
|
5543
|
-
Whether to scale the data using StandardScaler.
|
5544
|
-
|
5545
|
-
fill_missing : bool, optional, default=True
|
5546
|
-
Whether to fill missing values using the mean before applying PCA/UMAP.
|
5961
|
+
random_state=1,
|
5962
|
+
ax = None,
|
5963
|
+
figsize=None,
|
5964
|
+
**kwargs
|
5965
|
+
) -> pd.DataFrame:
|
5966
|
+
dict_methods = {
|
5967
|
+
#!Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
|
5968
|
+
"pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
|
5969
|
+
"lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
|
5970
|
+
"factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
|
5971
|
+
"svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
|
5972
|
+
|
5973
|
+
#! Non-linear Dimensionality Reduction (Manifold Learning)
|
5974
|
+
"umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
|
5975
|
+
"tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global. relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
|
5976
|
+
"mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
|
5977
|
+
"lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
|
5978
|
+
"kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
|
5979
|
+
"ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing).is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
|
5980
|
+
|
5981
|
+
#! Anomaly Detection: Specialized for detecting outliers or unusual patterns
|
5982
|
+
"isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
|
5983
|
+
}
|
5547
5984
|
|
5548
|
-
Returns:
|
5549
|
-
--------
|
5550
|
-
reduced_df : pd.DataFrame
|
5551
|
-
DataFrame with the reduced dimensions.
|
5552
|
-
"""
|
5553
|
-
|
5554
|
-
"""
|
5555
|
-
PCA: explained_variance:
|
5556
|
-
indicates the proportion of the dataset's total variance that each principal
|
5557
|
-
component (PC) explains. It gives you a sense of how much information
|
5558
|
-
(or variance) is captured by each PC
|
5559
|
-
Interpretation:
|
5560
|
-
- Higher values indicate that the corresponding PC captures more variance.
|
5561
|
-
- The sum of the explained variances for all PCs equals 1 (or 100%).
|
5562
|
-
- If the first few components explain a high percentage (e.g., 90%),
|
5563
|
-
it means you can reduce the dimensionality of the data significantly without losing much information.
|
5564
|
-
Use case:
|
5565
|
-
You may plot a scree plot, which shows the explained variance for each PC, to help decide
|
5566
|
-
how many components to keep for analysis.
|
5567
|
-
|
5568
|
-
PCA: Singular values:
|
5569
|
-
represent the magnitude of variance along each principal component. Mathematically,
|
5570
|
-
they are the square roots of the eigenvalues of the covariance matrix.
|
5571
|
-
Interpretation:
|
5572
|
-
Larger singular values indicate that the associated PC captures more variance.
|
5573
|
-
Singular values are related to the scale of the data. If the data are scaled
|
5574
|
-
before PCA (e.g., standardized), then the singular values will provide a measure
|
5575
|
-
of the spread of data along each PC.
|
5576
|
-
Use case:
|
5577
|
-
Singular values help quantify the contribution of each principal component in a
|
5578
|
-
similar way to the explained variance. They are useful in understanding the overall
|
5579
|
-
structure of the data.
|
5580
|
-
"""
|
5581
5985
|
from sklearn.preprocessing import StandardScaler
|
5582
5986
|
from sklearn.impute import SimpleImputer
|
5583
|
-
|
5584
|
-
|
5585
|
-
|
5586
|
-
|
5987
|
+
if plot_:
|
5988
|
+
import matplotlib.pyplot as plt
|
5989
|
+
import seaborn as sns
|
5990
|
+
# Check valid method input
|
5991
|
+
methods=["pca", "umap","tsne","factor","isolation_forest","lda","kpca","ica","mds","lle","svd"]
|
5992
|
+
method=strcmp(method, methods)[0]
|
5993
|
+
print(f"\nprocessing with using {dict_methods[method]}:")
|
5994
|
+
xlabel,ylabel=None,None
|
5995
|
+
if columns is None:
|
5996
|
+
columns = data.select_dtypes(include='number').columns.tolist()
|
5997
|
+
if hue is None:
|
5998
|
+
hue = data.select_dtypes(exclude='number').columns.tolist()
|
5999
|
+
if isinstance(hue, list):
|
6000
|
+
print("Warning: hue is a list, only select the 1st one")
|
6001
|
+
hue=hue[0]
|
6002
|
+
if not hue:
|
6003
|
+
# Select columns if specified, else use all columns
|
6004
|
+
X = data[columns].values if columns else data.values
|
6005
|
+
else:
|
6006
|
+
# Select columns to reduce and hue for LDA
|
6007
|
+
X = data[columns].values if columns else data.drop(columns=[hue]).values
|
6008
|
+
y = data[hue].values
|
6009
|
+
print(X.shape)
|
5587
6010
|
# Handle missing values
|
5588
6011
|
if fill_missing:
|
5589
6012
|
imputer = SimpleImputer(strategy="mean")
|
@@ -5594,9 +6017,6 @@ def df_reducer(
|
|
5594
6017
|
scaler = StandardScaler()
|
5595
6018
|
X = scaler.fit_transform(X)
|
5596
6019
|
|
5597
|
-
# Check valid method input
|
5598
|
-
methods=["pca", "umap","tsne","factor","isolation_forest"]
|
5599
|
-
method=strcmp(method, methods)[0]
|
5600
6020
|
# Apply PCA if selected
|
5601
6021
|
if method == "pca":
|
5602
6022
|
from sklearn.decomposition import PCA
|
@@ -5640,7 +6060,27 @@ def df_reducer(
|
|
5640
6060
|
pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
|
5641
6061
|
for i in range(n_components):
|
5642
6062
|
pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
|
6063
|
+
if hue:
|
6064
|
+
pca_df[hue]=y
|
6065
|
+
elif method =='lda':
|
6066
|
+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
6067
|
+
|
6068
|
+
if "hue" not in locals() or hue is None:
|
6069
|
+
raise ValueError("LDA requires a 'hue' col parameter to specify class labels.")
|
5643
6070
|
|
6071
|
+
lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
|
6072
|
+
X_reduced = lda_reducer.fit_transform(X, y)
|
6073
|
+
|
6074
|
+
# Prepare reduced DataFrame with additional LDA info
|
6075
|
+
lda_df = pd.DataFrame(
|
6076
|
+
X_reduced, index=data.index,
|
6077
|
+
columns=[f"LDA_{i+1}" for i in range(n_components)]
|
6078
|
+
)
|
6079
|
+
if debug:
|
6080
|
+
print(f"LDA completed: Reduced to {n_components} components.")
|
6081
|
+
print("Class separability achieved by LDA.")
|
6082
|
+
if hue:
|
6083
|
+
lda_df[hue]=y
|
5644
6084
|
# Apply UMAP if selected
|
5645
6085
|
elif method == "umap":
|
5646
6086
|
import umap
|
@@ -5667,32 +6107,36 @@ def df_reducer(
|
|
5667
6107
|
)
|
5668
6108
|
umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
|
5669
6109
|
umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
|
6110
|
+
if hue:
|
6111
|
+
umap_df[hue]=y
|
5670
6112
|
elif method == "tsne":
|
5671
6113
|
from sklearn.manifold import TSNE
|
5672
|
-
tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=
|
5673
|
-
X_reduced = tsne.fit_transform(X)
|
5674
|
-
|
5675
|
-
# Prepare reduced DataFrame with additional t-SNE info
|
6114
|
+
tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=random_state)
|
6115
|
+
X_reduced = tsne.fit_transform(X)
|
5676
6116
|
tsne_df = pd.DataFrame(
|
5677
|
-
X_reduced,
|
6117
|
+
X_reduced,
|
6118
|
+
index=data.index,
|
5678
6119
|
columns=[f"tSNE_{i+1}" for i in range(n_components)]
|
5679
6120
|
)
|
5680
6121
|
tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
|
5681
|
-
|
6122
|
+
if hue:
|
6123
|
+
tsne_df[hue]=y
|
     # Apply Factor Analysis if selected
     elif method == "factor":
         from sklearn.decomposition import FactorAnalysis
-        factor = FactorAnalysis(n_components=n_components, random_state=
+        factor = FactorAnalysis(n_components=n_components, random_state=random_state)
         X_reduced = factor.fit_transform(X)
         # Factor Analysis does not directly provide explained variance, but we can approximate it
         fa_variance = factor.noise_variance_
         # Prepare reduced DataFrame with additional Factor Analysis info
         factor_df = pd.DataFrame(
-            X_reduced,
+            X_reduced,
+            index=data.index,
             columns=[f"Factor_{i+1}" for i in range(n_components)]
         )
         factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
-
+        if hue:
+            factor_df[hue]=y
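Factor Analysis likewise now uses the shared random_state. A minimal sketch, with noise_variance_ used as the rough per-feature noise estimate the comment above refers to (illustrative data):

import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.default_rng(2)
X = rng.normal(size=(150, 6))  # illustrative data

fa = FactorAnalysis(n_components=2, random_state=0)
X_reduced = fa.fit_transform(X)

# noise_variance_ holds one estimated noise variance per original feature
print(X_reduced.shape)           # (150, 2)
print(np.mean(fa.noise_variance_))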
     # Apply Isolation Forest for outlier detection if selected
     elif method == "isolation_forest":
         from sklearn.decomposition import PCA
@@ -5723,48 +6167,100 @@ def df_reducer(
             iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
         for i in range(n_components):
             iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+        if hue:
+            iso_forest_df[hue]=y
+    #* Apply Kernel PCA if selected
+    elif method == "kpca":
+        from sklearn.decomposition import KernelPCA
+        kpca = KernelPCA(n_components=n_components, kernel="rbf", random_state=random_state)
+        X_reduced = kpca.fit_transform(X)
+
+        # Prepare reduced DataFrame with KPCA info
+        kpca_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"KPCA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Kernel PCA completed with RBF kernel.")
+        if hue:
+            kpca_df[hue]=y
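A minimal standalone sketch of the new Kernel PCA branch with the RBF kernel; the concentric-circles data is illustrative and is a classic case where the kernel trick helps:

from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA

# Two concentric circles: not linearly separable in the original space
X, y = make_circles(n_samples=300, factor=0.3, noise=0.05, random_state=0)

kpca = KernelPCA(n_components=2, kernel="rbf", gamma=10, random_state=0)
X_kpca = kpca.fit_transform(X)
print(X_kpca.shape)  # (300, 2)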
+    #* Apply ICA if selected
+    elif method == "ica":
+        from sklearn.decomposition import FastICA
+        ica = FastICA(n_components=n_components, random_state=random_state)
+        X_reduced = ica.fit_transform(X)
+
+        # Prepare reduced DataFrame with ICA info
+        ica_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"ICA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Independent Component Analysis (ICA) completed.")
+        if hue:
+            ica_df[hue]=y
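A minimal sketch of the new ICA branch: FastICA tries to unmix statistically independent sources, here two artificial signals (purely illustrative):

import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.default_rng(3)
t = np.linspace(0, 8, 1000)
s1, s2 = np.sin(2 * t), np.sign(np.sin(3 * t))        # two independent sources
S = np.c_[s1, s2] + 0.1 * rng.normal(size=(1000, 2))  # add a little noise
A = np.array([[1.0, 0.5], [0.5, 2.0]])                # mixing matrix
X = S @ A.T                                           # observed mixtures

ica = FastICA(n_components=2, random_state=0)
S_est = ica.fit_transform(X)  # estimated independent components
print(S_est.shape)  # (1000, 2)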
+    #* Apply MDS if selected
+    elif method == "mds":
+        from sklearn.manifold import MDS
+        mds = MDS(n_components=n_components, random_state=random_state)
+        X_reduced = mds.fit_transform(X)
+
+        # Prepare reduced DataFrame with MDS info
+        mds_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"MDS_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Multidimensional Scaling (MDS) completed.")
+        if hue:
+            mds_df[hue]=y
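A minimal sketch of the new MDS branch, which looks for a low-dimensional layout that preserves pairwise distances (illustrative data):

import numpy as np
from sklearn.manifold import MDS

rng = np.random.default_rng(4)
X = rng.normal(size=(100, 8))  # illustrative data

mds = MDS(n_components=2, random_state=0)
X_mds = mds.fit_transform(X)
print(X_mds.shape)  # (100, 2)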
+    #* Apply Locally Linear Embedding (LLE) if selected
+    elif method == "lle":
+        from sklearn.manifold import LocallyLinearEmbedding
+        lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=umap_neighbors, random_state=random_state)
+        X_reduced = lle.fit_transform(X)
+
+        # Prepare reduced DataFrame with LLE info
+        lle_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"LLE_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Locally Linear Embedding (LLE) completed.")
+        if hue:
+            lle_df[hue]=y
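A minimal sketch of the new LLE branch; note that the code above reuses umap_neighbors for LLE's n_neighbors parameter. The swiss-roll data is illustrative:

from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

X, _ = make_swiss_roll(n_samples=500, random_state=0)  # 3-D manifold data

lle = LocallyLinearEmbedding(n_components=2, n_neighbors=15, random_state=0)
X_lle = lle.fit_transform(X)
print(X_lle.shape)  # (500, 2)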
+    #* Apply Singular Value Decomposition (SVD) if selected
+    elif method == "svd":
+        # Using NumPy's SVD for dimensionality reduction
+        U, s, Vt = np.linalg.svd(X, full_matrices=False)
+        X_reduced = U[:, :n_components] * s[:n_components]
+
+        # Prepare reduced DataFrame with SVD info
+        svd_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"SVD_{i+1}" for i in range(n_components)]
+        )
+        if hue:
+            svd_df[hue]=y
+        if debug:
+            print("Singular Value Decomposition (SVD) completed.")
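A minimal sketch of the new SVD branch: the reduced scores are the leading left singular vectors scaled by their singular values, matching the two lines added above (illustrative data):

import numpy as np

rng = np.random.default_rng(5)
X = rng.normal(size=(50, 6))  # illustrative data
n_components = 2

U, s, Vt = np.linalg.svd(X, full_matrices=False)
X_reduced = U[:, :n_components] * s[:n_components]  # project onto the top components
print(X_reduced.shape)  # (50, 2)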
 
     # Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
         colname_met = "PC_"
-
-
-                data=pca_df,
-                x="PC_1",
-                y="PC_2",
-                # hue="condition",
-            )
+        xlabel= f"PC_1 ({pca_df["Explained Variance PC_1"].tolist()[0]})"
+        ylabel= f"PC_2 ({pca_df["Explained Variance PC_2"].tolist()[0]})"
     elif method == "umap":
         reduced_df = umap_df
-        colname_met = "UMAP_"
-        if plot_:
-            sns.scatterplot(
-                data=umap_df,
-                x="UMAP_1",
-                y="UMAP_2",
-                # hue="condition",
-            )
+        colname_met = "UMAP_"
     elif method == "tsne":
         reduced_df = tsne_df
-        colname_met = "
-        if plot_:
-            sns.scatterplot(
-                data=tsne_df,
-                x="tSNE_1",
-                y="tSNE_2",
-                # hue="batch",
-            )
+        colname_met = "tSNE_"
     elif method == "factor":
         reduced_df = factor_df
-        colname_met = "Factor_"
-        if plot_:
-            sns.scatterplot(
-                data=factor_df,
-                x="Factor_1",
-                y="Factor_2",
-                # hue="batch",
-            )
+        colname_met = "Factor_"
     elif method == "isolation_forest":
         reduced_df = iso_forest_df  # Already a DataFrame for outliers
         colname_met = "PC_"
@@ -5783,33 +6279,71 @@ def df_reducer(
             c="r",
             label="outlier", marker="+", s=30,
         )
-
+    elif method=='lda':
+        reduced_df=lda_df
+        colname_met="LDA_"
+    elif method=="kpca":
+        reduced_df=kpca_df
+        colname_met="KPCA_"
+    elif method=="ica":
+        reduced_df=ica_df
+        colname_met="ICA_"
+    elif method=="mds":
+        reduced_df=mds_df
+        colname_met="MDS_"
+    elif method=="lle":
+        reduced_df=lle_df
+        colname_met="LLE_"
+    elif method=="svd":
+        reduced_df=svd_df
+        colname_met="SVD_"
+    # Quick plots
+    if plot_ and (not method in ["isolation_forest"]):
+        from .plot import plotxy
+        if ax is None:
+            if figsize is None:
+                _, ax = plt.subplots(figsize=cm2inch(8,8))
+            else:
+                _, ax = plt.subplots(figsize=figsize)
+        else:
+            ax=ax.cla()
+        ax=plotxy(data=reduced_df,
+                  x=colname_met+"1",
+                  y=colname_met+"2",
+                  hue=hue,
+                  s=1,
+                  edgecolor='none',
+                  kind='scater',
+                  figsets=dict(legend=dict(loc='best',markerscale=4),
+                               xlabel=xlabel if xlabel else None,
+                               ylabel=ylabel if ylabel else None),
+                  ax=ax,
+                  verbose=False,
+                  **kwargs
+                  )
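plotxy is py2ls's own helper (see py2ls/plot.py in this release). As a rough standalone stand-in for the quick-plot step, the first two reduced columns can be scattered and colored by the hue column with plain seaborn (column names below are illustrative):

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Illustrative reduced frame with a grouping column, as df_reducer returns
reduced_df = pd.DataFrame(
    {"PC_1": [0.1, 0.4, -0.2, 0.3],
     "PC_2": [1.0, -0.5, 0.2, 0.8],
     "condition": ["a", "a", "b", "b"]}
)

fig, ax = plt.subplots(figsize=(3, 3))
sns.scatterplot(data=reduced_df, x="PC_1", y="PC_2", hue="condition",
                s=10, edgecolor="none", ax=ax)
ax.legend(loc="best", markerscale=2)
plt.show()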
 
     if inplace:
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+            data.loc[:,f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
         # Add extra info for PCA/UMAP
         if method == "pca":
             for i in range(n_components):
-                data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
+                data.loc[:,f"Explained Variance PC_{i+1}"] = reduced_df.loc[:,f"Explained Variance PC_{i+1}"]
             for i in range(n_components):
-                data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
+                data.loc[:,f"Singular Values PC_{i+1}"] = reduced_df.loc[:,f"Singular Values PC_{i+1}"]
         elif method == "umap":
             for i in range(n_components):
-                data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
-            data["Embedding"] = reduced_df["Embedding"]
-            data["Trustworthiness"] = reduced_df["Trustworthiness"]
+                data.loc[:,f"UMAP_{i+1}"]=reduced_df.loc[:,f"UMAP_{i+1}"]
+            data.loc[:,"Embedding"] = reduced_df.loc[:,"Embedding"]
+            data.loc[:,"Trustworthiness"] = reduced_df.loc[:,"Trustworthiness"]
+
         return None  # No return when inplace=True
-
 
     return reduced_df
-
-
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)
 
-
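Echoing the example comment kept above, a typical call might look like the following. data_log and markers are placeholders for a log-transformed DataFrame and a list of its numeric columns; the keyword names are taken from the code shown in this diff, so treat this as an assumed sketch rather than documented usage:

import numpy as np
import pandas as pd
from py2ls import ips

# Placeholder data standing in for 'data_log' / 'markers' in the example comment
rng = np.random.default_rng(0)
data_log = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))
markers = ["a", "b", "c"]

# Reduce the selected columns to two components (plot_=False to skip the quick plot)
reduced = ips.df_reducer(data=data_log, columns=markers, n_components=2,
                         method="pca", plot_=False)
print(reduced.head())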
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
@@ -5833,7 +6367,7 @@ def plot_cluster(
     """
     import seaborn as sns
     from sklearn.metrics import silhouette_samples
-
+    import matplotlib.pyplot as plt
     if metrics is None:
         metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
 
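For context on the silhouette_samples import used by plot_cluster, a minimal sketch of computing per-sample silhouette scores for a clustering (illustrative data):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

# One silhouette value per sample, in [-1, 1]; higher means better-separated clusters
sil = silhouette_samples(X, labels)
print(sil.shape, sil.mean())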