py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/batman.py +32 -1
- py2ls/bio.py +3 -17
- py2ls/data/usages_sns.json +2 -1
- py2ls/ips.py +1136 -691
- py2ls/ml2ls.py +1841 -390
- py2ls/plot.py +499 -214
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.8.dist-info}/METADATA +2 -2
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.8.dist-info}/RECORD +10 -10
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.8.dist-info}/WHEEL +1 -1
py2ls/ips.py
CHANGED
@@ -1,62 +1,38 @@
 import numpy as np
-import pandas as pd
-
-import
-import matplotlib
-import matplotlib.pyplot as plt
-import matplotlib.ticker as tck
-from cycler import cycler
-from mpl_toolkits.mplot3d import Axes3D
-import seaborn as sns
-
-from sklearn.kernel_approximation import KERNEL_PARAMS
-from sympy import is_increasing
-import sys, os, shutil, re, yaml, json, subprocess
-import importlib.util
-import time
-from dateutil import parser
-from datetime import datetime
-import schedule
-
-from PIL import Image, ImageEnhance, ImageOps, ImageFilter
-from rembg import remove, new_session
-
-import docx
-from fpdf import FPDF
-from lxml import etree
-from docx import Document
-from PyPDF2 import PdfReader
-from pptx import Presentation
-from pptx.util import Inches
-from pdf2image import convert_from_path, pdfinfo_from_path
-from nltk.tokenize import sent_tokenize, word_tokenize
-import nltk # nltk.download("punkt")
-from docx2pdf import convert
-import img2pdf as image2pdf
-import nbformat
-from nbconvert import MarkdownExporter
-
-from itertools import pairwise
-from box import Box, BoxList
-from numerizer import numerize
-from tqdm import tqdm
-import mimetypes
-from pprint import pp
-from collections import Counter
-from fuzzywuzzy import fuzz, process
-from langdetect import detect
-from duckduckgo_search import DDGS
+import pandas as pd
+import sys, os
+from IPython.display import display
 from typing import List, Optional, Union
-from bs4 import BeautifulSoup
-
-from . import netfinder
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
 except NameError:
     pass

+import warnings
+warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
+warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+
+def run_once_within(duration=60):  # default 60s
+    import time
+    """
+    usage:
+    if run_once_within():
+        print("This code runs once per minute.")
+    else:
+        print("The code has already been run in the last minute.")
+    """
+    if not hasattr(run_once_within, "time_last"):
+        run_once_within.time_last = None
+    time_curr = time.time()
+
+    if (run_once_within.time_last is None) or (time_curr - run_once_within.time_last >= duration):
+        run_once_within.time_last = time_curr  # Update the last execution time
+        return True
+    else:
+        return False
+
+
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     Add the Chinese (default) font to the font manager
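Note: the headline refactor in 0.2.4.8 moves almost every top-level import into the function bodies that actually use them, so `import py2ls.ips` no longer pays the startup cost of heavy optional dependencies (nltk, rembg, PyPDF2, ...). The other addition here, `run_once_within`, throttles repeated side effects by stashing the last call time on the function object itself. A minimal sketch of that throttling idea (hypothetical names, not the library's API):

    import time

    def throttled(duration=60):
        """Return True at most once per `duration` seconds."""
        now = time.time()
        last = getattr(throttled, "time_last", None)  # function attribute as the memo
        if last is None or now - last >= duration:
            throttled.time_last = now
            return True
        return False

    if throttled():
        print("shown at most once per minute")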
@@ -155,6 +131,8 @@ def run_every(when: str = None, job=None, wait: int = 60):
     :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
     :param job: The function to be scheduled.
     """
+    import schedule
+    import time
     if job is None:
         print("No job provided!")
         return
@@ -200,6 +178,8 @@ def run_at(when: str, job=None, wait: int = 60):
     :param job: The function to be scheduled.
     :param wait: The sleep interval between checks in seconds.
     """
+    from datetime import datetime
+    import time
     if job is None:
         print("No job provided!")
         return
@@ -279,6 +259,7 @@ def get_timezone(timezone: str | list = None):

 def is_package_installed(package_name):
     """Check if a package is installed."""
+    import importlib.util
     package_spec = importlib.util.find_spec(package_name)
     return package_spec is not None

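`importlib.util.find_spec` is a good fit here: it resolves a module's spec without executing the module, so the check is cheap and side-effect free. A quick illustration:

    import importlib.util

    def is_installed(name: str) -> bool:
        # find_spec returns None when the module cannot be located
        return importlib.util.find_spec(name) is not None

    print(is_installed("json"))         # True (stdlib)
    print(is_installed("no_such_pkg"))  # False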
@@ -291,6 +272,7 @@ def upgrade(module="py2ls",uninstall=False):
     module (str): The name of the module to install/upgrade.
     uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
     """
+    import subprocess
     if not is_package_installed(module):
         try:
             subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +309,7 @@ def get_version(pkg):


 def rm_folder(folder_path, verbose=True):
+    import shutil
     try:
         shutil.rmtree(folder_path)
         if verbose:
@@ -345,6 +328,7 @@ def fremove(path, verbose=True):
     """
     try:
         if os.path.isdir(path):
+            import shutil
             shutil.rmtree(path)
             if verbose:
                 print(f"Successfully deleted folder {path}")
@@ -360,23 +344,30 @@ def fremove(path, verbose=True):
         print(f"Failed to delete {path}. Reason: {e}")


-def get_cwd(verbose: bool = True):
-    """
-    get_cwd: to get the current working directory
-    Args:
-        verbose (bool, optional): to show which function is use. Defaults to True.
-    """
-    try:
-        script_dir = os.path.dirname(os.path.abspath(__file__))
-        if verbose:
-            print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
-    except NameError:
-        # This works in an interactive environment (like a Jupyter notebook)
-        script_dir = os.getcwd()
-        if verbose:
-            print("os.getcwd():", script_dir)
-    return script_dir
+# def get_cwd(verbose: bool = True):
+#     """
+#     get_cwd: to get the current working directory
+#     Args:
+#         verbose (bool, optional): to show which function is use. Defaults to True.
+#     """
+#     try:
+#         script_dir = os.path.dirname(os.path.abspath(__file__))
+#         if verbose:
+#             print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
+#     except NameError:
+#         # This works in an interactive environment (like a Jupyter notebook)
+#         script_dir = os.getcwd()
+#         if verbose:
+#             print("os.getcwd():", script_dir)
+#     return script_dir
+
+
+def get_cwd():
+    from pathlib import Path
+    # Get the current script's directory as a Path object
+    current_directory = Path(__file__).resolve().parent
+
+    return current_directory

 def search(
     query,
@@ -388,7 +379,7 @@ def search(
     dir_save=dir_save,
     **kwargs,
 ):
-
+    from duckduckgo_search import DDGS
     if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
         res = pd.DataFrame(results)
@@ -421,7 +412,7 @@ def echo(*args, **kwargs):
         str: the answer from ai
     """
     global dir_save
-
+    from duckduckgo_search import DDGS
     query = None
     model = kwargs.get("model", "gpt")
     verbose = kwargs.get("verbose", True)
@@ -469,8 +460,11 @@ def echo(*args, **kwargs):
     model_valid = valid_mod_name(model)
     res = DDGS().chat(query, model=model_valid)
     if verbose:
+        from pprint import pp
         pp(res)
     if log:
+        from datetime import datetime
+        import time
         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
         if bool(os.path.basename(dir_save)):
@@ -492,6 +486,7 @@ def ai(*args, **kwargs):


 def detect_lang(text, output="lang", verbose=True):
+    from langdetect import detect
     dir_curr_script = os.path.dirname(os.path.abspath(__file__))
     dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -550,6 +545,7 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         for lst in flattened_lists[1:]:
             shared_elements.intersection_update(lst)
     else:
+        from collections import Counter
         all_elements = [item for sublist in flattened_lists for item in sublist]
         element_count = Counter(all_elements)
         # Get elements that appear in at least n_shared lists
@@ -571,9 +567,9 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
     not_shared(list1,list2)# output [1,3]
     """
     _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
-    list1 = args[0]
+    list1 = flatten(args[0], verbose=verbose)
     _not_shared=[item for item in list1 if item not in _common]
-    return
+    return _not_shared


 def flatten(nested: Any, unique_list=True, verbose=False):
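This hunk fixes a real bug: `not_shared` previously ended with a bare `return` (always `None`) and compared against the unflattened first argument. The intended semantics, as a self-contained sketch (simplified to plain lists, without the library's flattening helper):

    def shared_(a, b):
        return set(a) & set(b)

    def not_shared_(a, b):
        common = shared_(a, b)
        return [x for x in a if x not in common]

    print(not_shared_([1, 2, 3], [2, 4]))  # [1, 3], matching the docstring example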
@@ -617,7 +613,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     Returns:
     tuple: A tuple containing the best match and its index in the candidates list.
     """
-
+    from fuzzywuzzy import fuzz, process
     def to_lower(s, ignore_case=True):
         # Converts a string or list of strings to lowercase if ignore_case is True.
         if ignore_case:
@@ -743,6 +739,7 @@ def cn2pinyin(
     return pinyin_flat

 def counter(list_, verbose=True):
+    from collections import Counter
     c = Counter(list_)
     # Print the name counts
     for item, count in c.items():
@@ -771,7 +768,7 @@ def str2time(time_str, fmt="24"):
         %p represents AM or PM.
     - str: The converted time string.
     """
-
+    from datetime import datetime
     def time_len_corr(time_str):
         time_str_ = (
             ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -832,6 +829,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
     Returns:
     - str: The converted date string.
     """
+    from dateutil import parser
     try:
         date_obj = parser.parse(date_str)
     except ValueError as e:
@@ -848,6 +846,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):


 def str2num(s, *args, **kwargs):
+    import re
     delimiter = kwargs.get("sep", None)
     round_digits = kwargs.get("round", None)
     if delimiter is not None:
@@ -863,6 +862,7 @@ def str2num(s, *args, **kwargs):
     try:
         num = float(s)
     except ValueError:
+        from numerizer import numerize
         try:
             numerized = numerize(s)
             num = int(numerized) if "." not in numerized else float(numerized)
@@ -1030,7 +1030,7 @@ def px2inch(*px, dpi=300) -> list:
     return [i / dpi for i in px]


-def
+def inch2cm(*cm) -> list:
     """
     cm2inch: converts centimeter measurements to inches.
     Usage:
@@ -1051,24 +1051,31 @@ def cm2inch(*cm) -> list:
 def inch2px(*inch, dpi=300) -> list:
     """
     inch2px: converts inch measurements to pixels based on the given dpi.
+
     Usage:
     inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
+
+    Parameters:
+    inch : float, list, or tuple
+        Single or multiple measurements in inches to convert to pixels.
+    dpi : int, optional (default=300)
+        Dots per inch (DPI), representing the pixel density.
+
     Returns:
-    list: in pixels
+    list: Converted measurements in pixels.
     """
-    # Case 1: When the user passes a single argument that is a list or tuple,
+    # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
     if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
-        # If the input is a single list or tuple, we unpack its elements and convert each to pixels
         return [i * dpi for i in inch[0]]
-
+
+    # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
     else:
-        # Here, we convert each individual argument directly to pixels
         return [i * dpi for i in inch]


-
+
+def cm2inch(*inch) -> list:
     """
-    inch2cm: converts inch measurements to centimeters.
     Usage:
     inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
     Returns:
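The conversion helpers are all linear scalings: px = inch × dpi, inch = px / dpi, cm = inch × 2.54. For example:

    dpi = 300
    inches = [8.5, 11]                   # US letter
    pixels = [i * dpi for i in inches]   # inch2px -> [2550, 3300]
    back = [p / dpi for p in pixels]     # px2inch -> [8.5, 11.0]
    cms = [i * 2.54 for i in inches]     # inch -> cm: [21.59, 27.94]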
@@ -1183,6 +1190,7 @@ def paper_size(paper_type_str="a4"):


 def docx2pdf(dir_docx, dir_pdf=None):
+    from docx2pdf import convert
     if dir_pdf:
         convert(dir_docx, dir_pdf)
     else:
@@ -1190,6 +1198,7 @@ def docx2pdf(dir_docx, dir_pdf=None):


 def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+    import img2pdf as image2pdf
     def mm_to_point(size):
         return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))

@@ -1241,6 +1250,9 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=


 def pdf2ppt(dir_pdf, dir_ppt):
+    from PyPDF2 import PdfReader
+    from pptx.util import Inches
+    from pptx import Presentation
     prs = Presentation()

     # Open the PDF file
@@ -1269,6 +1281,7 @@ def pdf2ppt(dir_pdf, dir_ppt):


 def ssplit(text, by="space", verbose=False, strict=False, **kws):
+    import re
     if isinstance(text, list):
         nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
         flat_list = [item for sublist in nested_list for item in sublist]
@@ -1316,6 +1329,8 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
         return [text[i : i + length] for i in range(0, len(text), length)]

     def split_by_sent_num(text, n=10):
+        from nltk.tokenize import sent_tokenize
+        from itertools import pairwise
         # split text into sentences
         text_split_by_sent = sent_tokenize(text)
         cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1388,10 +1403,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
             print(f"splited by camel_case")
         return split_by_camel_case(text)
     elif ("word" in by) and not strict:
+        from nltk.tokenize import word_tokenize
         if verbose:
             print(f"splited by word")
         return word_tokenize(text)
     elif ("sen" in by and not "num" in by) and not strict:
+        from nltk.tokenize import sent_tokenize
         if verbose:
             print(f"splited by sentence")
         return sent_tokenize(text)
@@ -1441,9 +1458,11 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):


 def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
+    from pdf2image import convert_from_path, pdfinfo_from_path
     df_dir_img_single_page = pd.DataFrame()
     dir_single_page = []
     if verbose:
+        from pprint import pp
         pp(pdfinfo_from_path(dir_pdf))
     if isinstance(page, tuple) and page:
         page = list(page)
@@ -1562,6 +1581,7 @@ def unzip(dir_path, output_dir=None):
     # If the output directory already exists, remove it and replace it
     if os.path.exists(output_dir):
         if os.path.isdir(output_dir):  # check if it is a folder
+            import shutil
             shutil.rmtree(output_dir)  # remove folder
         else:
             os.remove(output_dir)  # remove file
@@ -1579,6 +1599,7 @@ def unzip(dir_path, output_dir=None):

         output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
         try:
+            import shutil
             with gzip.open(dir_path, "rb") as gz_file:
                 with open(output_file, "wb") as out_file:
                     shutil.copyfileobj(gz_file, out_file)
@@ -1676,11 +1697,13 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:

     """
     if not isinstance(df, pd.DataFrame):
+        if verbose:
+            print('not pd.DataFrame')
         return False
     df.columns = df.columns.astype(str)  # cast to str so the delimiter counts below can be computed
     # Initialize a list to hold messages about abnormalities
     messages = []
-    is_abnormal =
+    is_abnormal = False
     # Check the shape of the DataFrame
     actual_shape = df.shape
     messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1705,25 +1728,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
-
+    if verbose:
+        print("1",is_abnormal)
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
-
+    if verbose:
+        print("2",is_abnormal)
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
         if verbose:
             print(f'delimiter_counts[""] > 3')
-
+    if verbose:
+        print("3",is_abnormal)
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
-
+    if verbose:
+        print("4",is_abnormal)
     # # Check for missing values
     # missing_values = df.isnull().sum()
     # if missing_values.any():
@@ -1743,7 +1770,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'df.columns[df.nunique() == 1].tolist()')
-
+    if verbose:
+        print("5",is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1752,7 +1780,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
-
+    if verbose:
+        print("6",is_abnormal)
     # Compile results
     if verbose:
         print("\n".join(messages))
@@ -1769,7 +1798,24 @@ def fload(fpath, kind=None, **kwargs):
     Returns:
         content: The content loaded from the file.
     """
-
+    def read_mplstyle(style_file):
+        import matplotlib.pyplot as plt
+        # Load the style file
+        plt.style.use(style_file)
+
+        # Get the current style properties
+        style_dict = plt.rcParams
+
+        # Convert to dictionary
+        style_dict = dict(style_dict)
+        # Print the style dictionary
+        for i, j in style_dict.items():
+            print(f"\n{i}::::{j}")
+        return style_dict
+    # #example usage:
+    # style_file = "/ std-colors.mplstyle"
+    # style_dict = read_mplstyle(style_file)
+
     def load_txt_md(fpath):
         with open(fpath, "r") as file:
             content = file.read()
@@ -1785,6 +1831,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_json(fpath, **kwargs):
         output=kwargs.pop("output","json")
         if output=='json':
+            import json
             with open(fpath, "r") as file:
                 content = json.load(file)
             return content
@@ -1792,12 +1839,14 @@ def fload(fpath, kind=None, **kwargs):
             return pd.read_json(fpath,**kwargs)

     def load_yaml(fpath):
+        import yaml
         with open(fpath, "r") as file:
             content = yaml.safe_load(file)
         return content


     def load_xml(fpath, fsize_thr: int = 100):
+        from lxml import etree
         def load_small_xml(fpath):
             tree = etree.parse(fpath)
             root = tree.getroot()
@@ -1856,6 +1905,15 @@ def fload(fpath, kind=None, **kwargs):
             if line.startswith(char):
                 return char
         return None
+
+    def _get_chunks(df_fake):
+        """
+        helper func for 'load_csv'
+        """
+        chunks = []
+        for chunk in df_fake:
+            chunks.append(chunk)
+        return pd.concat(chunks, ignore_index=True)

     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
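`_get_chunks` exists because `pd.read_csv(..., chunksize=n)` returns a `TextFileReader` iterator rather than a DataFrame; concatenating the chunks bounds peak parse memory to one chunk at a time. A minimal sketch of the same pattern, assuming a plain CSV path:

    import pandas as pd

    def read_csv_chunked(path, chunksize=100_000, **kwargs):
        # the loader pairs chunking with engine="c" and low_memory=False
        reader = pd.read_csv(path, chunksize=chunksize, engine="c",
                             low_memory=False, **kwargs)
        return pd.concat(reader, ignore_index=True)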
@@ -1869,16 +1927,19 @@ def fload(fpath, kind=None, **kwargs):
         on_bad_lines = kwargs.pop("on_bad_lines", "skip")
         comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        chunksize=kwargs.pop("chunksize", None)
+        engine='c' if chunksize else engine # when chunksize, recommend 'c'
+        low_memory=kwargs.pop("low_memory",True)
+        low_memory=False if chunksize else True # when chunksize, recommend low_memory=False
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("read_csv", verbose=verbose)
-            return

         if comment is None:
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
+
         try:
             df = pd.read_csv(
                 fpath,
@@ -1890,14 +1951,19 @@ def fload(fpath, kind=None, **kwargs):
                 skipinitialspace=skipinitialspace,
                 sep=sep,
                 on_bad_lines=on_bad_lines,
+                chunksize=chunksize,
+                low_memory=low_memory,
                 **kwargs,
             )
-            if
+            if chunksize:
+                df=_get_chunks(df)
+                print(df.shape)
+            if is_df_abnormal(df, verbose=0): # raise error
                 raise ValueError("the df is abnormal")
         except:
             try:
                 try:
-                    if engine == "pyarrow":
+                    if engine == "pyarrow" and not chunksize:
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1906,6 +1972,7 @@ def fload(fpath, kind=None, **kwargs):
                             sep=sep,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            low_memory=low_memory,
                             **kwargs,
                         )
                     else:
@@ -1919,14 +1986,19 @@ def fload(fpath, kind=None, **kwargs):
                             skipinitialspace=skipinitialspace,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            chunksize=chunksize,
+                            low_memory=low_memory,
                             **kwargs,
                         )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
                 except (UnicodeDecodeError, ValueError):
                     encoding = get_encoding(fpath)
                     # print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
-                    if engine == "pyarrow":
+                    if engine == "pyarrow" and not chunksize:
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1935,6 +2007,7 @@ def fload(fpath, kind=None, **kwargs):
                             sep=sep,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            low_memory=low_memory,
                             **kwargs,
                         )
                     else:
@@ -1948,8 +2021,13 @@ def fload(fpath, kind=None, **kwargs):
                             skipinitialspace=skipinitialspace,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            chunksize=chunksize,
+                            low_memory=low_memory,
                             **kwargs,
                         )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
             except Exception as e:
@@ -1966,8 +2044,13 @@ def fload(fpath, kind=None, **kwargs):
                         sep=sep,
                         on_bad_lines=on_bad_lines,
                         comment=comment,
+                        chunksize=chunksize,
+                        low_memory=low_memory,
                         **kwargs,
                     )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                     if not is_df_abnormal(df, verbose=0):  # normal
                         display(df.head(2))
                         print(f"shape: {df.shape}")
@@ -1975,32 +2058,38 @@ def fload(fpath, kind=None, **kwargs):
         except:
             pass
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            display(df.head(2))
-            print(f"
-
-
-
-
-
+            if not chunksize:
+                engines = [None,"c", "python"]
+                for engine in engines:
+                    separators = [",", "\t", ";", "|", " "]
+                    for sep in separators:
+                        try:
+                            # sep2show = sep if sep != "\t" else "\\t"
+                            # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                            # print(".")
+                            df = pd.read_csv(
+                                fpath,
+                                engine=engine,
+                                sep=sep,
+                                on_bad_lines=on_bad_lines,
+                                comment=comment,
+                                chunksize=chunksize,
+                                low_memory=low_memory,
+                                **kwargs,
+                            )
+                            # display(df.head(2))
+                            # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
+                            if chunksize:
+                                df=_get_chunks(df)
+                                print(df.shape)
+                            if not is_df_abnormal(df, verbose=0):
+                                display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                                print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                                return df
+                        except EmptyDataError as e:
+                            continue
+            else:
+                pass
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
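The rewritten fallback replaces the old dead block with an explicit brute-force cascade: every engine in [None, "c", "python"] is tried against every candidate separator until `is_df_abnormal` accepts the result. A lighter-weight alternative for the separator half of that search is the stdlib sniffer; a sketch, assuming a readable text file:

    import csv

    def guess_sep(path, candidates=",\t;| "):
        with open(path, newline="", errors="replace") as f:
            sample = f.read(64_000)
        try:
            return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
        except csv.Error:
            return ","  # fall back to comma when sniffing fails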
@@ -2008,7 +2097,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2039,7 +2128,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)

-        if
+        if run_once_within():
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2056,6 +2145,8 @@ def fload(fpath, kind=None, **kwargs):
         return df

     def load_ipynb(fpath, **kwargs):
+        import nbformat
+        from nbconvert import MarkdownExporter
         as_version = kwargs.get("as_version", 4)
         with open(fpath, "r") as file:
             nb = nbformat.read(file, as_version=as_version)
@@ -2085,6 +2176,7 @@ def fload(fpath, kind=None, **kwargs):
         If page is an integer, it returns the text of the specified page number.
         If the specified page is not found, it returns the string "Page is not found".
         """
+        from PyPDF2 import PdfReader
         text_dict = {}
         with open(fpath, "rb") as file:
             pdf_reader = PdfReader(file)
@@ -2114,6 +2206,7 @@ def fload(fpath, kind=None, **kwargs):
             return text_dict.get(int(page), "Page is not found")

     def load_docx(fpath):
+        from docx import Document
         doc = Document(fpath)
         content = [para.text for para in doc.paragraphs]
         return content
@@ -2174,8 +2267,16 @@
     elif kind == "xml":
         return load_xml(fpath)
     elif kind in ["csv","tsv"]:
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
+    elif kind=='pkl':
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_pickle")
+        return pd.read_pickle(fpath,**kwargs)
     elif kind in ["ods", "ods", "odt"]:
         engine = kwargs.get("engine", "odf")
         kwargs.pop("engine", None)
@@ -2184,25 +2285,40 @@
         engine = kwargs.get("engine", "xlrd")
         kwargs.pop("engine", None)
         content = load_excel(fpath, engine=engine, **kwargs)
-        print(f"shape: {content.shape}")
-        display(content.head(3))
+        print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "xlsx":
         content = load_excel(fpath, **kwargs)
-        display(content.head(3))
-        print(f"shape: {content.shape}")
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
+        print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
         return content
     elif kind=='mtx':
         from scipy.io import mmread
         dat_mtx=mmread(fpath)
         content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
-        display(content.head(3))
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         print(f"shape: {content.shape}")
         return content
     elif kind == "ipynb":
         return load_ipynb(fpath, **kwargs)
     elif kind in ['parquet','snappy']:
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_parquet")
         return load_parquet(fpath,**kwargs)
+    elif kind =='feather':
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_feather")
+        content=pd.read_feather(fpath,**kwargs)
+        return content
+    elif kind =='h5':
+        content=pd.read_hdf(fpath,**kwargs)
+        return content
+    elif kind =='pkl':
+        content=pd.read_pickle(fpath,**kwargs)
+        return content
     elif kind == "pdf":
         # print('usage:load_pdf(fpath, page="all", verbose=False)')
         return load_pdf(fpath, **kwargs)
@@ -2213,6 +2329,7 @@
         import GEOparse
         return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
+        from pprint import pp
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
         if os.path.isdir(fpath_unzip):
@@ -2247,6 +2364,9 @@
         meta, data = fcsparser.parse(fpath, reformat_meta=True)
         return meta, data

+    elif kind=="mplstyle":
+        return read_mplstyle(fpath)
+
     else:
         print("direct reading...")
         try:
@@ -2358,6 +2478,7 @@ def filter_kwargs(kws, valid_kwargs):
     }
     return kwargs_filtered

+str_space_speed='sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'

 def fsave(
     fpath,
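`str_space_speed` records an informal benchmark the author ran: parquet is the smallest on disk, pickle and feather are the fastest to save and load, CSV is slowest by a wide margin. Numbers like these depend heavily on the data, so treat the following as a sketch for reproducing the timing half on your own frame (parquet needs pyarrow or fastparquet installed):

    import time
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(1_000_000, 10))
    for name, save in [("parquet", lambda: df.to_parquet("/tmp/x.parquet")),
                       ("pickle", lambda: df.to_pickle("/tmp/x.pkl")),
                       ("csv", lambda: df.to_csv("/tmp/x.csv"))]:
        t0 = time.perf_counter()
        save()
        print(f"{name}: {time.perf_counter() - t0:.2f}s")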
@@ -2393,6 +2514,7 @@ def fsave(
     fappend(fpath, content=content)

 def save_docx(fpath, content, font_name, font_size, spacing):
+        import docx
         if isinstance(content, str):
             content = content.split(". ")
         doc = docx.Document()
@@ -2420,6 +2542,7 @@ def fsave(
         save_content(fpath, html_content, mode)

     def save_pdf(fpath, content, font_name, font_size):
+        from fpdf import FPDF
         pdf = FPDF()
         pdf.add_page()
         # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2433,7 +2556,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2465,7 +2588,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose=kwargs.pop("verbose",False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if
+        if run_once_within():
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -2491,9 +2614,10 @@ def fsave(

     def save_ipynb(fpath, data, **kwargs):
         # Split the content by code fences to distinguish between code and markdown
+        import nbformat
         parts = data.split("```")
         cells = []
-
+
         for i, part in enumerate(parts):
             if i % 2 == 0:
                 # Even index: markdown content
@@ -2513,6 +2637,7 @@ def fsave(
         # json.dump(data, file, **kwargs)

     def save_json(fpath_fname, var_dict_or_df):
+        import json
         def _convert_js(data):
             if isinstance(data, pd.DataFrame):
                 return data.to_dict(orient="list")
@@ -2534,10 +2659,12 @@ def fsave(
     # # setss = jsonload("/.json")

     def save_yaml(fpath, data, **kwargs):
+        import yaml
         with open(fpath, "w") as file:
             yaml.dump(data, file, **kwargs)

     def save_xml(fpath, data):
+        from lxml import etree
         root = etree.Element("root")
         if isinstance(data, dict):
             for key, val in data.items():
@@ -2613,15 +2740,91 @@ def fsave(
     elif kind == "ipynb":
         save_ipynb(fpath, content, **kwargs)
     elif kind.lower() in ["parquet","pq","big","par"]:
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_parquet")
+            return None
         compression=kwargs.pop("compression",None)  # Use None for no compression. Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
         # fix the fpath ends
-
-
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'parquet')
         if compression is not None:
             if not fpath.endswith(compression):
                 fpath=fpath+f".{compression}"
         save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
+    elif kind.lower() in ["pkl","pk","pickle","pick"]:
+        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+        # Pickle is convenient if you want to preserve exact Python object types.
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_pickle")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'pkl')
+        compression=kwargs.pop("compression",None)
+        if compression is not None:
+            if not fpath.endswith(compression["method"]):
+                fpath=fpath+f".{compression['method']}"
+        if isinstance(content, pd.DataFrame):
+            content.to_pickle(fpath,**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_pickle(fpath,**kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["fea",'feather','ft','fe','feat','fether']:
+        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+        # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_feather")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'feather')
+        if isinstance(content, pd.DataFrame):
+            content.to_feather(fpath,**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_feather(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["hd",'hdf','h','h5']:
+        # particularly useful for large datasets and can handle complex data structures
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_hdf")
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'h5')
+        compression=kwargs.pop("compression",None)
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath=fpath+f".{compression}"
+        if isinstance(content, pd.DataFrame):
+            content.to_hdf(fpath,key='content',**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_hdf(fpath,**kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
     else:
+        from . import netfinder
         try:
             netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         except:
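The new `fsave` branches normalize the extension from the alias (e.g. "pick" becomes ".pkl") and coerce non-DataFrame content via `pd.DataFrame(...)` before writing. Each alias group maps onto a standard pandas writer; the plain-pandas equivalents, for reference:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df.to_pickle("/tmp/data.pkl")             # kind in {"pkl","pk","pickle","pick"}
    df.to_feather("/tmp/data.feather")        # kind in {"fea","feather",...}; needs pyarrow
    df.to_hdf("/tmp/data.h5", key="content")  # kind in {"hd","hdf","h","h5"}; needs PyTables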
@@ -2744,6 +2947,7 @@ def isa(content, kind):
     elif "color" in kind.lower():  # file
         return is_str_color(content)
     elif "html" in kind.lower():
+        import re
         if content is None or not isinstance(content, str):
             return False
         # Remove leading and trailing whitespace
@@ -2903,6 +3107,7 @@ def listdir(
             display(f.head())
             return f
     else:
+        from box import Box
         if "l" in orient.lower():  # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -2943,13 +3148,10 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-
-
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
     if os.path.isdir(fpath):
         fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
-        print(fpath)
         return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator

@@ -3020,7 +3222,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     if len(rootdir) == 1:
         rootdir = rootdir[0]
         rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
-
+
     return rootdir


@@ -3032,6 +3234,8 @@ def split_path(fpath):


 def figsave(*args, dpi=300):
+    import matplotlib.pyplot as plt
+    from PIL import Image
     dir_save = None
     fname = None
     img = None
@@ -3047,13 +3251,13 @@ def figsave(*args, dpi=300):

     if dir_save is None:
         dir_save="./"
-
+
     # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
-
+
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
@@ -3140,6 +3344,7 @@ def figsave(*args, dpi=300):
 def is_str_color(s):
     # Regular expression pattern for hexadecimal color codes
     if isinstance(s,str):
+        import re
         color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
         return re.match(color_code_pattern, s) is not None
     else:
@@ -3166,6 +3371,7 @@ def isnum(s):


 def is_image(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and mime_type.startswith("image"):
         return True
@@ -3174,6 +3380,7 @@ def is_image(fpath):


 def is_document(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and (
         mime_type.startswith("text/")
@@ -3194,6 +3401,7 @@ def is_document(fpath):


 def is_zip(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type == "application/zip":
         return True
@@ -3202,6 +3410,7 @@ def is_zip(fpath):


 def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
+    import matplotlib.pyplot as plt
     if ax is None:
         ax = plt.gca()
     for loc, spine in ax.spines.items():
@@ -3290,7 +3499,7 @@ def apply_filter(img, *args):
     Returns:
         PIL.Image: The filtered image.
     """
-
+    from PIL import ImageFilter
     def correct_filter_name(filter_name):
         if "bl" in filter_name.lower() and "box" not in filter_name.lower():
             return "BLUR"
@@ -3532,6 +3741,8 @@ def imgsets(img, **kwargs):
         avg_contrast_factor = sum(contrast_factors) / num_channels
         return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}

+    import matplotlib.pyplot as plt
+    from PIL import ImageEnhance,ImageOps
     # Load image if input is a file path
     if isinstance(img, str):
         img = load_img(img)
@@ -3595,6 +3806,7 @@ def imgsets(img, **kwargs):
         elif "pad" in k.lower():
             img_update = ImageOps.pad(img_update, size=value)
         elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
+            from rembg import remove, new_session
             if isinstance(value, bool):
                 session = new_session("isnet-general-use")
                 img_update = remove(img_update, session=session)
@@ -3633,6 +3845,7 @@ def imgsets(img, **kwargs):
             else:
                 img_update = remove(img_update)
         elif "bg" in k.lower() and "color" in k.lower():
+            from rembg import remove
             if isinstance(value, list):
                 value = tuple(value)
             if isinstance(value, tuple):  # replace the background color
@@ -3664,6 +3877,8 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
     Args:
         dir_img_list (list): List of the Directory containing the images.
     """
+    import matplotlib.pyplot as plt
+    from PIL import Image
     num_images = len(dir_img_list)
     if not kind.startswith("."):
         kind = "." + kind
@@ -3700,28 +3915,15 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
 # usage:
 # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
 # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
-def read_mplstyle(style_file):
-    # Load the style file
-    plt.style.use(style_file)
-
-    # Get the current style properties
-    style_dict = plt.rcParams
-
-    # Convert to dictionary
-    style_dict = dict(style_dict)
-    # Print the style dictionary
-    for i, j in style_dict.items():
-        print(f"\n{i}::::{j}")
-    return style_dict


-# #example usage:
-# style_file = "/ std-colors.mplstyle"
-# style_dict = read_mplstyle(style_file)
-

 # search and fine the director of the libary, which installed at local
 def dir_lib(lib_oi):
+    """
+    # example usage:
+    # dir_lib("seaborn")
+    """
     import site

     # Get the site-packages directory
@@ -3739,23 +3941,6 @@ def dir_lib(lib_oi):
     print(f"Cannot find the {lib_oi} in site-packages directory.")
     return dir_list

-
-# example usage:
-# dir_lib("seaborn")
-
-"""
-# n = 7
-# clist = get_color(n, cmap="auto", how="linspace")  # get_color(100)
-# plt.figure(figsize=[8, 5], dpi=100)
-# x = np.linspace(0, 2 * np.pi, 50) * 100
-# y = np.sin(x)
-# for i in range(1, n + 1):
-#     plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
-# plt.legend()
-# plt.ylim(-2, 20)
-# figsets(plt.gca(), {"style": "whitegrid"}) """
-
-
 class FileInfo:
     def __init__(
         self,
@@ -3832,6 +4017,7 @@ class FileInfo:


 def finfo(fpath):
+    import time
     fname, fmt = os.path.splitext(fpath)
     dir_par = os.path.dirname(fpath) + "/"
     data = {
@@ -3846,6 +4032,7 @@ def finfo(fpath):
     }
     extra_info = {}
     if data["kind"] == ".pdf":
+        from pdf2image import pdfinfo_from_path
         extra_info = pdfinfo_from_path(fpath)

     return FileInfo(
@@ -3860,18 +4047,7 @@ def finfo(fpath):
         extra_info=extra_info,
     )

-
 # ! format excel file
-import pandas as pd
-from datetime import datetime
-from openpyxl import load_workbook
-from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
-from openpyxl.utils import get_column_letter
-from openpyxl.worksheet.datavalidation import DataValidation
-from openpyxl.comments import Comment
-from openpyxl.formatting.rule import ColorScaleRule
-
-
 def hex2argb(hex_color):
     """
     Convert a hex color code to aARGB format required by openpyxl.
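`hex2argb` pads plain RGB codes to the 8-digit AARRGGBB form openpyxl expects, with the alpha channel first. A hedged re-implementation of the visible branches (the source's `9` constants, which appear to account for a leading '#', are normalized here to 8 hex digits):

    def hex2argb_sketch(hex_color: str) -> str:
        h = hex_color.lstrip("#").upper()
        if len(h) == 6:                 # RRGGBB -> fully opaque
            return "FF" + h
        if len(h) == 8:                 # already AARRGGBB
            return h
        return "F" * (8 - len(h)) + h   # pad short codes with opaque F's

    print(hex2argb_sketch("#1E90FF"))  # FF1E90FF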
@@ -3902,341 +4078,7 @@ def hex2argb(hex_color):
         return hex_color[-9:]
     else:
         return "F" * (9 - len(hex_color)) + hex_color
-    raise ValueError(
-        "Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
-    )
-
-
-def convert_indices_to_range(row_slice, col_slice):
-    """Convert numerical row and column slices to Excel-style range strings."""
-    start_row = row_slice.start + 1
-    end_row = row_slice.stop if row_slice.stop is not None else None
-    start_col = col_slice.start + 1
-    end_col = col_slice.stop if col_slice.stop is not None else None
-
-    start_col_letter = get_column_letter(start_col)
-    end_col_letter = get_column_letter(end_col) if end_col else None
-    return (
-        f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
-        if end_col_letter
-        else f"{start_col_letter}{start_row}"
-    )
-
-
-def apply_format(ws, cell, cell_range):
-    """Apply cell formatting to a specified range."""
-    cell_font, cell_fill, cell_alignment, border = None, None, None, None
-    kws_cell = ["font", "fill", "alignment", "border"]
-    for K, _ in cell.items():
-        if strcmp(K, kws_cell)[0] == "font":
-            #! font
-            font_color = "000000"
-            font_name = "Arial"
-            font_underline = "none"
-            font_size = 14
-            font_bold = False
-            font_strike = False
-            font_italic = False
-            kws_font = [
-                "name",
-                "size",
-                "bold",
-                "underline",
-                "color",
-                "strike",
-                "italic",
-            ]
-            for k_, v_ in cell.get(K, {}).items():
-                if strcmp(k_, kws_font)[0] == "name":
-                    font_name = v_
-                elif strcmp(k_, kws_font)[0] == "size":
-                    font_size = v_
-                elif strcmp(k_, kws_font)[0] == "bold":
-                    font_bold = v_
-                elif strcmp(k_, kws_font)[0] == "underline":
-                    font_underline = strcmp(v_, ["none", "single", "double"])[0]
-                elif strcmp(k_, kws_font)[0] == "color":
-                    font_color = hex2argb(v_)
-                elif strcmp(k_, kws_font)[0] == "strike":
-                    font_strike = v_
-                elif strcmp(k_, kws_font)[0] == "italic":
-                    font_italic = v_
-
-            cell_font = Font(
-                name=font_name,
-                size=font_size,
-                bold=font_bold,
-                italic=font_italic,
-                underline=font_underline,
-                strike=font_strike,
-                color=font_color,
-            )
-
-        if strcmp(K, kws_cell)[0] == "fill":
-            #! fill
-            kws_fill = ["start_color", "end_color", "fill_type", "color"]
-            kws_fill_type = [
-                "darkVertical",
-                "lightDown",
-                "lightGrid",
-                "solid",
-                "darkDown",
-                "lightGray",
-                "lightUp",
-                "gray0625",
-                "lightVertical",
-                "lightHorizontal",
-                "darkHorizontal",
-                "gray125",
-                "darkUp",
-                "mediumGray",
-                "darkTrellis",
-                "darkGray",
-                "lightTrellis",
-                "darkGrid",
-            ]
-            start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid"  # default
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_fill)[0] == "color":
-                    start_color, end_color = hex2argb(v), hex2argb(v)
-                    break
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_fill)[0] == "start_color":
-                    start_color = hex2argb(v)
-                elif strcmp(k, kws_fill)[0] == "end_color":
-                    end_color = hex2argb(v)
-                elif strcmp(k, kws_fill)[0] == "fill_type":
-                    fill_type = strcmp(v, kws_fill_type)[0]
-            cell_fill = PatternFill(
-                start_color=start_color,
-                end_color=end_color,
-                fill_type=fill_type,
-            )
-
-        if strcmp(K, kws_cell)[0] == "alignment":
-            #! alignment
-            # default
-            align_horizontal = "general"
-            align_vertical = "center"
-            align_rot = 0
-            align_wrap = False
-            align_shrink = False
-            align_indent = 0
-            kws_align = [
-                "horizontal",
-                "ha",
-                "vertical",
-                "va",
-                "text_rotation",
-                "rotat",
-                "rot",
-                "wrap_text",
-                "wrap",
-                "shrink_to_fit",
-                "shrink",
-                "indent",
-            ]
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
-                    align_horizontal = strcmp(
-                        v, ["general", "left", "right", "center"]
-                    )[0]
-                elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
-                    align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
-                elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
-                    align_rot = v
-                elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
-                    align_wrap = v
-                elif strcmp(k, kws_align)[0] in [
-                    "shrink_to_fit",
-                    "shrink",
-                    "wrap_text",
-                    "wrap",
-                ]:
-                    align_shrink = v
-                elif strcmp(k, kws_align)[0] in ["indent"]:
-                    align_indent = v
-            cell_alignment = Alignment(
-                horizontal=align_horizontal,
-                vertical=align_vertical,
-                text_rotation=align_rot,
-                wrap_text=align_wrap,
-                shrink_to_fit=align_shrink,
-                indent=align_indent,
-            )
-
-        if strcmp(K, kws_cell)[0] == "border":
-            #! border
-            kws_border = [
-                "color_left",
-                "color_l",
-                "color_right",
-                "color_r",
-                "color_top",
-                "color_t",
-                "color_bottom",
-                "color_b",
-                "color_diagonal",
-                "color_d",
-                "color_outline",
-                "color_o",
-                "color_vertical",
-                "color_v",
-                "color_horizontal",
-                "color_h",
-                "color",
-                "style_left",
-                "style_l",
-                "style_right",
-                "style_r",
-                "style_top",
-                "style_t",
-                "style_bottom",
-                "style_b",
-                "style_diagonal",
-                "style_d",
-                "style_outline",
-                "style_o",
-                "style_vertical",
-                "style_v",
-                "style_horizontal",
-                "style_h",
-                "style",
-            ]
-            # * border color
-            border_color_l, border_color_r, border_color_t, border_color_b = (
-                "FF000000",
-                "FF000000",
-                "FF000000",
-                "FF000000",
-            )
-            border_color_d, border_color_o, border_color_v, border_color_h = (
-                "FF000000",
-                "FF000000",
-                "FF000000",
-                "FF000000",
-            )
-            # get colors config
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_border)[0] in ["color"]:
-                    border_color_all = hex2argb(v)
-                    # if "color" is set, all sides are first given the same color;
-                    # side-specific colors can then be defined afterwards
-                    border_color_l, border_color_r, border_color_t, border_color_b = (
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                    )
-                    border_color_d, border_color_o, border_color_v, border_color_h = (
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                    )
-                elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
-                    border_color_l = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
-                    border_color_r = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
-                    border_color_t = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
-                    border_color_b = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
-                    border_color_d = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
-                    border_color_o = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
-                    border_color_v = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
-                    border_color_h = hex2argb(v)
-            # *border style
-            border_styles = [
-                "thin",
-                "medium",
-                "thick",
-                "dotted",
-                "dashed",
-                "hair",
-                "mediumDashed",
-                "dashDot",
-                "dashDotDot",
-                "slantDashDot",
-                "none",
-            ]
-            border_style_l, border_style_r, border_style_t, border_style_b = (
-                None,
-                None,
-                None,
-                None,
-            )
-            border_style_d, border_style_o, border_style_v, border_style_h = (
-                None,
-                None,
-                None,
-                None,
-            )
-            # get styles config
-            for k, v in cell.get(K, {}).items():
-                # if not "style" in k:
-                #     break
-                if strcmp(k, kws_border)[0] in ["style"]:
-                    border_style_all = strcmp(v, border_styles)[0]
-                    # if "style" is set, all sides are first given the same style;
-                    # side-specific styles can then be defined afterwards
-                    border_style_l, border_style_r, border_style_t, border_style_b = (
-                        border_style_all,
-                        border_style_all,
-                        border_style_all,
border_style_all,
|
4192
|
-
)
|
4193
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4194
|
-
border_style_all,
|
4195
|
-
border_style_all,
|
4196
|
-
border_style_all,
|
4197
|
-
border_style_all,
|
4198
|
-
)
|
4199
|
-
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4200
|
-
border_style_l = strcmp(v, border_styles)[0]
|
4201
|
-
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4202
|
-
border_style_r = strcmp(v, border_styles)[0]
|
4203
|
-
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4204
|
-
border_style_t = strcmp(v, border_styles)[0]
|
4205
|
-
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4206
|
-
border_style_b = strcmp(v, border_styles)[0]
|
4207
|
-
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4208
|
-
border_style_d = strcmp(v, border_styles)[0]
|
4209
|
-
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4210
|
-
border_style_o = strcmp(v, border_styles)[0]
|
4211
|
-
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4212
|
-
border_style_v = strcmp(v, border_styles)[0]
|
4213
|
-
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4214
|
-
border_style_h = strcmp(v, border_styles)[0]
|
4215
|
-
# * apply border config
|
4216
|
-
border = Border(
|
4217
|
-
left=Side(border_style=border_style_l, color=border_color_l),
|
4218
|
-
right=Side(border_style=border_style_r, color=border_color_r),
|
4219
|
-
top=Side(border_style=border_style_t, color=border_color_t),
|
4220
|
-
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4221
|
-
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4222
|
-
diagonal_direction=0,
|
4223
|
-
outline=Side(border_style=border_style_o, color=border_color_o),
|
4224
|
-
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4225
|
-
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4226
|
-
)
|
4227
|
-
|
4228
|
-
#! final apply configs
|
4229
|
-
for row in ws[cell_range]:
|
4230
|
-
for cell_ in row:
|
4231
|
-
if cell_font:
|
4232
|
-
cell_.font = cell_font
|
4233
|
-
if cell_fill:
|
4234
|
-
cell_.fill = cell_fill
|
4235
|
-
if cell_alignment:
|
4236
|
-
cell_.alignment = cell_alignment
|
4237
|
-
if border:
|
4238
|
-
cell_.border = border
|
4239
|
-
|
4081
|
+
raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
|
4240
4082
|
|
4241
4083
|
def format_excel(
|
4242
4084
|
df=None,
|
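The `raise ValueError` added in the hunk above tightens input validation in py2ls's hex2argb helper. As a rough standalone sketch of the normalization the error message implies (hex_to_argb is a hypothetical stand-in, not the package's function):

import re

def hex_to_argb(hex_color: str) -> str:
    # Accept RRGGBB, #RRGGBB, or AARRGGBB and normalize to 8-digit ARGB.
    hex_color = hex_color.lstrip("#")
    if not re.fullmatch(r"[0-9A-Fa-f]{6}|[0-9A-Fa-f]{8}", hex_color):
        raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
    if len(hex_color) == 6:
        return "FF" + hex_color.upper()  # assume fully opaque
    return hex_color.upper()            # alpha already present

print(hex_to_argb("#1f77b4"))  # FF1F77B4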
@@ -4257,6 +4099,255 @@ def format_excel(
     conditional_format=None,  # dict
     **kwargs,
 ):
+    import pandas as pd
+    from datetime import datetime
+    from openpyxl import load_workbook
+    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
+    from openpyxl.utils import get_column_letter
+    from openpyxl.worksheet.datavalidation import DataValidation
+    from openpyxl.comments import Comment
+    from openpyxl.formatting.rule import ColorScaleRule
+
+    def convert_indices_to_range(row_slice, col_slice):
+        """Convert numerical row and column slices to Excel-style range strings."""
+        start_row = row_slice.start + 1
+        end_row = row_slice.stop if row_slice.stop is not None else None
+        start_col = col_slice.start + 1
+        end_col = col_slice.stop if col_slice.stop is not None else None
+
+        start_col_letter = get_column_letter(start_col)
+        end_col_letter = get_column_letter(end_col) if end_col else None
+        return (
+            f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
+            if end_col_letter
+            else f"{start_col_letter}{start_row}"
+        )
+
+    def apply_format(ws, cell, cell_range):
+        """Apply cell formatting to a specified range."""
+        cell_font, cell_fill, cell_alignment, border = None, None, None, None
+        kws_cell = ["font", "fill", "alignment", "border"]
+        for K, _ in cell.items():
+            if strcmp(K, kws_cell)[0] == "font":
+                #! font
+                font_color = "000000"
+                font_name = "Arial"
+                font_underline = "none"
+                font_size = 14
+                font_bold = False
+                font_strike = False
+                font_italic = False
+                kws_font = ["name","size","bold","underline","color","strike","italic"]
+                for k_, v_ in cell.get(K, {}).items():
+                    if strcmp(k_, kws_font)[0] == "name":
+                        font_name = v_
+                    elif strcmp(k_, kws_font)[0] == "size":
+                        font_size = v_
+                    elif strcmp(k_, kws_font)[0] == "bold":
+                        font_bold = v_
+                    elif strcmp(k_, kws_font)[0] == "underline":
+                        font_underline = strcmp(v_, ["none", "single", "double"])[0]
+                    elif strcmp(k_, kws_font)[0] == "color":
+                        font_color = hex2argb(v_)
+                    elif strcmp(k_, kws_font)[0] == "strike":
+                        font_strike = v_
+                    elif strcmp(k_, kws_font)[0] == "italic":
+                        font_italic = v_
+
+                cell_font = Font(
+                    name=font_name,
+                    size=font_size,
+                    bold=font_bold,
+                    italic=font_italic,
+                    underline=font_underline,
+                    strike=font_strike,
+                    color=font_color,
+                )
+
+            if strcmp(K, kws_cell)[0] == "fill":
+                #! fill
+                kws_fill = ["start_color", "end_color", "fill_type", "color"]
+                kws_fill_type = ["darkVertical","lightDown","lightGrid","solid","darkDown","lightGray","lightUp","gray0625","lightVertical","lightHorizontal",
+                                 "darkHorizontal","gray125","darkUp","mediumGray","darkTrellis","darkGray","lightTrellis","darkGrid"]
+                start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid"  # default
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_fill)[0] == "color":
+                        start_color, end_color = hex2argb(v), hex2argb(v)
+                        break
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_fill)[0] == "start_color":
+                        start_color = hex2argb(v)
+                    elif strcmp(k, kws_fill)[0] == "end_color":
+                        end_color = hex2argb(v)
+                    elif strcmp(k, kws_fill)[0] == "fill_type":
+                        fill_type = strcmp(v, kws_fill_type)[0]
+                cell_fill = PatternFill(
+                    start_color=start_color,
+                    end_color=end_color,
+                    fill_type=fill_type,
+                )
+
+            if strcmp(K, kws_cell)[0] == "alignment":
+                #! alignment
+                # default
+                align_horizontal = "general"
+                align_vertical = "center"
+                align_rot = 0
+                align_wrap = False
+                align_shrink = False
+                align_indent = 0
+                kws_align = [
+                    "horizontal",
+                    "ha",
+                    "vertical",
+                    "va",
+                    "text_rotation",
+                    "rotat",
+                    "rot",
+                    "wrap_text",
+                    "wrap",
+                    "shrink_to_fit",
+                    "shrink",
+                    "indent",
+                ]
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
+                        align_horizontal = strcmp(
+                            v, ["general", "left", "right", "center"]
+                        )[0]
+                    elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
+                        align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
+                    elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
+                        align_rot = v
+                    elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
+                        align_wrap = v
+                    elif strcmp(k, kws_align)[0] in [
+                        "shrink_to_fit",
+                        "shrink",
+                        "wrap_text",
+                        "wrap",
+                    ]:
+                        align_shrink = v
+                    elif strcmp(k, kws_align)[0] in ["indent"]:
+                        align_indent = v
+                cell_alignment = Alignment(
+                    horizontal=align_horizontal,
+                    vertical=align_vertical,
+                    text_rotation=align_rot,
+                    wrap_text=align_wrap,
+                    shrink_to_fit=align_shrink,
+                    indent=align_indent,
+                )
+
+            if strcmp(K, kws_cell)[0] == "border":
+                #! border
+                kws_border = ["color_left","color_l","color_right","color_r","color_top","color_t","color_bottom","color_b",
+                              "color_diagonal","color_d","color_outline","color_o","color_vertical","color_v","color_horizontal",
+                              "color_h","color","style_left","style_l","style_right","style_r","style_top","style_t","style_bottom","style_b",
+                              "style_diagonal","style_d","style_outline","style_o","style_vertical","style_v","style_horizontal",
+                              "style_h","style"]
+                # * border color
+                border_color_l, border_color_r, border_color_t, border_color_b = ("FF000000","FF000000","FF000000","FF000000")
+                border_color_d, border_color_o, border_color_v, border_color_h = ("FF000000","FF000000","FF000000","FF000000")
+                # get colors config
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_border)[0] in ["color"]:
+                        border_color_all = hex2argb(v)
+                        # if a global "color" is given, apply it to every border side first
+                        # and only then let side-specific colors override it
+                        border_color_l, border_color_r, border_color_t, border_color_b = (
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                        )
+                        border_color_d, border_color_o, border_color_v, border_color_h = (
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                        )
+                    elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
+                        border_color_l = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
+                        border_color_r = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
+                        border_color_t = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
+                        border_color_b = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
+                        border_color_d = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
+                        border_color_o = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
+                        border_color_v = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
+                        border_color_h = hex2argb(v)
+                # *border style
+                border_styles = ["thin","medium","thick","dotted","dashed",
+                                 "hair","mediumDashed","dashDot","dashDotDot","slantDashDot","none"]
+                border_style_l, border_style_r, border_style_t, border_style_b = (None,None,None,None)
+                border_style_d, border_style_o, border_style_v, border_style_h = (None,None,None,None)
+                # get styles config
+                for k, v in cell.get(K, {}).items():
+                    # if not "style" in k:
+                    #     break
+                    if strcmp(k, kws_border)[0] in ["style"]:
+                        border_style_all = strcmp(v, border_styles)[0]
+                        # if a global "style" is given, apply it to every border side first
+                        # and only then let side-specific styles override it
+                        border_style_l, border_style_r, border_style_t, border_style_b = (
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                        )
+                        border_style_d, border_style_o, border_style_v, border_style_h = (
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                        )
+                    elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
+                        border_style_l = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
+                        border_style_r = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
+                        border_style_t = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
+                        border_style_b = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
+                        border_style_d = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
+                        border_style_o = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
+                        border_style_v = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
+                        border_style_h = strcmp(v, border_styles)[0]
+                # * apply border config
+                border = Border(
+                    left=Side(border_style=border_style_l, color=border_color_l),
+                    right=Side(border_style=border_style_r, color=border_color_r),
+                    top=Side(border_style=border_style_t, color=border_color_t),
+                    bottom=Side(border_style=border_style_b, color=border_color_b),
+                    diagonal=Side(border_style=border_style_d, color=border_color_d),
+                    diagonal_direction=0,
+                    outline=Side(border_style=border_style_o, color=border_color_o),
+                    vertical=Side(border_style=border_style_v, color=border_color_v),
+                    horizontal=Side(border_style=border_style_h, color=border_color_h),
+                )
+
+        #! final apply configs
+        for row in ws[cell_range]:
+            for cell_ in row:
+                if cell_font:
+                    cell_.font = cell_font
+                if cell_fill:
+                    cell_.fill = cell_fill
+                if cell_alignment:
+                    cell_.alignment = cell_alignment
+                if border:
+                    cell_.border = border
     if not isinstance(df, pd.DataFrame):
         try:
             print(f"is loading file {os.path.basename(df)}")
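The hunk above moves the openpyxl imports and the apply_format/convert_indices_to_range helpers inside format_excel, so openpyxl is only loaded when the function actually runs. A rough sketch of the nested "cell" config this parser accepts (keys are fuzzy-matched via strcmp) and the plain-openpyxl calls it roughly reduces to; the file name and style values here are invented:

from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment

cell_cfg = {
    "font": {"name": "Arial", "size": 12, "bold": True, "color": "FF0000"},
    "fill": {"color": "FFFF00", "fill_type": "solid"},
    "alignment": {"ha": "center", "va": "center", "wrap": True},
}

wb = Workbook()
ws = wb.active
ws["A1"] = "header"
# Approximately what apply_format does for the config above (colors in ARGB):
ws["A1"].font = Font(name="Arial", size=12, bold=True, color="FFFF0000")
ws["A1"].fill = PatternFill(start_color="FFFFFF00", end_color="FFFFFF00", fill_type="solid")
ws["A1"].alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
wb.save("styled_sketch.xlsx")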
@@ -4602,12 +4693,10 @@ format_excel(
     print(f"Formatted Excel file saved as:\n{filename}")


-from IPython.display import display, HTML, Markdown
-
-
 def preview(var):
     """Master function to preview formatted variables in Jupyter."""
-
+    from bs4 import BeautifulSoup
+    from IPython.display import display, HTML, Markdown
     if isinstance(var, str):
         if isa(var, "html"):
             display(HTML(var))  # Render as HTML
@@ -4624,6 +4713,7 @@ def preview(var):
         display(var)

     elif isinstance(var, list) or isinstance(var, dict):
+        import json
         # Display JSON
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
@@ -4637,6 +4727,7 @@ def preview(var):
         display(Image(filename=var))

     elif isinstance(var, dict):
+        import json
         # Handle dictionary formatting
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
@@ -4644,13 +4735,154 @@ def preview(var):
     else:
         # If the format is not recognized, print a message
         print("Format not recognized or unsupported.")
-
-
 # # Example usages:
 # preview("This is a plain text message.")
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
+
+def _df_outlier(
+    data,
+    columns=None,
+    method=["zscore", "iqr", "percentile", "iforest"],
+    min_outlier_method=3,  # at least this many methods must flag a row as an outlier
+    zscore_threshold=3,
+    iqr_threshold=1.5,
+    lower_percentile=5,
+    upper_percentile=95,
+):
+    from scipy.stats import zscore
+    from sklearn.ensemble import IsolationForest
+    from sklearn.preprocessing import StandardScaler
+
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+    # Separate numeric and non-numeric columns
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
+
+    if columns is not None:
+        numeric_data = numeric_data[columns]
+    elif numeric_data.empty:
+        raise ValueError("Input data must contain numeric columns.")
+
+    outliers_df = pd.DataFrame(index=numeric_data.index)
+    if isinstance(method, str):
+        method = [method]
+
+    # Z-score method
+    if "zscore" in method:
+        z_scores = np.abs(zscore(numeric_data))
+        outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
+
+    # IQR method
+    if "iqr" in method:
+        Q1 = numeric_data.quantile(0.25)
+        Q3 = numeric_data.quantile(0.75)
+        IQR = Q3 - Q1
+        lower_bound = Q1 - iqr_threshold * IQR
+        upper_bound = Q3 + iqr_threshold * IQR
+        outliers_df["iqr"] = (
+            (numeric_data < lower_bound) | (numeric_data > upper_bound)
+        ).any(axis=1)
+
+    # Percentile method
+    if "percentile" in method:
+        lower_bound = numeric_data.quantile(lower_percentile / 100)
+        upper_bound = numeric_data.quantile(upper_percentile / 100)
+        outliers_df["percentile"] = (
+            (numeric_data < lower_bound) | (numeric_data > upper_bound)
+        ).any(axis=1)

+    # Isolation Forest method
+    if "iforest" in method:
+        # iforest method cannot handle NaNs, then fillna with mean
+        numeric_data_ = numeric_data.fillna(numeric_data.mean())
+        scaler = StandardScaler()
+        scaled_data = scaler.fit_transform(numeric_data_)
+        iso_forest = IsolationForest(contamination=0.05)
+        outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
+
+    # Combine all outlier detections
+    if len(method) == 4:  # all methods are used:
+        outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
+    else:
+        outliers_df["outlier"] = outliers_df.any(axis=1)
+
+    # Handling Outliers: Remove or Winsorize or Replace with NaN
+    processed_data = numeric_data.copy()
+
+    processed_data.loc[outliers_df["outlier"]] = np.nan
+
+    return processed_data
+
+
+def df_outlier(
+    data,
+    columns=None,
+    method=["zscore", "iqr", "percentile", "iforest"],
+    min_outlier_method=2,  # at least this many methods must flag a row as an outlier
+    zscore_threshold=3,
+    iqr_threshold=1.5,
+    lower_percentile=5,
+    upper_percentile=95,
+):
+    """
+    Usage:
+    data_out = df_outlier(
+        data,
+        columns=["income"],
+        method="iforest",
+        min_outlier_method=1)
+
+    Advanced outlier detection and handling function.
+
+    Parameters:
+    - data: DataFrame, the input data (numerical).
+    - method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
+    - zscore_threshold: float, threshold for Z-score outlier detection (default 3).
+    - iqr_threshold: float, threshold for IQR method (default 1.5).
+    - lower_percentile: float, lower percentile for percentile-based outliers (default 5).
+    - upper_percentile: float, upper percentile for percentile-based outliers (default 95).
+    - keep_nan: bool, whether to replace outliers with NaN (default True).
+    - plot: bool, whether to visualize the outliers (default False).
+    - min_outlier_method: int, minimum number of methods that need to flag a row as an outlier (default 2).
+    - inplace: bool, whether to modify the original `data` DataFrame (default False).
+
+    Returns:
+    - processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
+    """
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
+
+    _outlier_df_tmp = pd.DataFrame()
+    for col in numeric_data.columns:
+        _outlier_df_tmp = pd.concat(
+            [
+                _outlier_df_tmp,
+                _df_outlier(
+                    data=data,
+                    columns=[col],
+                    method=method,
+                    min_outlier_method=min_outlier_method,  # at least this many methods must flag an outlier
+                    zscore_threshold=zscore_threshold,
+                    iqr_threshold=iqr_threshold,
+                    lower_percentile=lower_percentile,
+                    upper_percentile=upper_percentile,
+                ),
+            ],
+            axis=1,
+            # join="inner",
+        )
+    processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
+    processed_data = processed_data[col_names_org]
+    return processed_data
+
+
+
 def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     """
     Extend a DataFrame by the list elements in the column.
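The new _df_outlier/df_outlier pair added above combines several detectors and only flags a row when enough of them agree. A self-contained sketch of that voting idea (the column name "x" and the toy values are illustrative):

import numpy as np
import pandas as pd
from scipy.stats import zscore

df = pd.DataFrame({"x": [1.0, 2.0, 2.5, 3.0, 120.0]})
z_flag = pd.Series(np.abs(zscore(df["x"])) > 3, index=df.index)
q1, q3 = df["x"].quantile([0.25, 0.75])
iqr = q3 - q1
iqr_flag = (df["x"] < q1 - 1.5 * iqr) | (df["x"] > q3 + 1.5 * iqr)
pct_flag = (df["x"] < df["x"].quantile(0.05)) | (df["x"] > df["x"].quantile(0.95))

min_votes = 2  # mirrors min_outlier_method
votes = z_flag.astype(int) + iqr_flag.astype(int) + pct_flag.astype(int)
print(df.assign(votes=votes, outlier=votes >= min_votes))

Only the 120.0 row collects two votes (IQR and percentile) and is flagged; the extreme-but-plausible 1.0 gets a single percentile vote and survives, which is the point of requiring agreement.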
@@ -5042,6 +5274,7 @@ def df_drop_duplicates(
         return None
     else:
         return result
+#! fillna()
 def df_fillna(
     data: pd.DataFrame,
     method: str = "knn",
@@ -5049,8 +5282,8 @@ def df_fillna(
     constant: float = None,
     n_neighbors: int = 5,  # KNN-specific
     max_iter: int = 10,  # Iterative methods specific
-    inplace: bool =
-    random_state:int =
+    inplace: bool = False,
+    random_state: int = 1
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
@@ -5078,7 +5311,18 @@ def df_fillna(
         inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.

     """
-
+    if isinstance(data, pd.Series):
+        data = pd.DataFrame(data)
+    # handle None
+    for col in data.columns:
+        data[col] = data[col].apply(lambda x: np.nan if x is None else x)
+
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+    # Separate numeric and non-numeric columns
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
+
     if data.empty:
         raise ValueError("Input DataFrame is empty.")

@@ -5107,15 +5351,6 @@ def df_fillna(
         from sklearn.impute import IterativeImputer

         imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
-    # elif method == "missforest":
-    #     from missingpy import MissForest
-    #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
-    # elif method == "softimpute":
-    #     from fancyimpute import SoftImpute
-    #     imputer = SoftImpute()
-    # elif method == "svd":
-    #     from fancyimpute import IterativeSVD
-    #     imputer = IterativeSVD(max_iters=max_iter)
     else:  # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
         imputer = SimpleImputer(strategy=method)
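One caveat the retained import above glosses over: scikit-learn still ships IterativeImputer behind an experimental flag, so the enabling import must run first or `from sklearn.impute import IterativeImputer` raises ImportError. A minimal working call:

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must precede the import below)
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0]])
print(IterativeImputer(max_iter=10, random_state=1).fit_transform(X))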
@@ -5123,26 +5358,49 @@ def df_fillna(
     # Fit and transform the data
     if axis == 0:
         # Impute column-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data)
     elif axis == 1:
         # Impute row-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")

-
+    imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=
-        columns=
+        index=numeric_data.index if axis == 0 else data.columns,
+        columns=numeric_data.columns if axis == 0 else data.index,
     )
+    for col in imputed_data.select_dtypes(include=[np.number]).columns:
+        imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
+
+    # Handle non-numeric data imputation
+    if not non_numeric_data.empty:
+        from sklearn.impute import SimpleImputer
+        if method == "constant":
+            non_numeric_imputer = SimpleImputer(strategy="constant", fill_value=constant)
+        else:
+            non_numeric_imputer = SimpleImputer(strategy="most_frequent")
+
+        # Impute non-numeric columns column-wise (axis=0)
+        imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
+
+        # Convert imputed non-numeric array back to DataFrame with original index and column names
+        imputed_non_numeric_df = pd.DataFrame(
+            imputed_non_numeric, index=non_numeric_data.index, columns=non_numeric_data.columns
+        )
+    else:
+        imputed_non_numeric_df = pd.DataFrame(index=data.index)
+
+
+    imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(columns=data.columns)

     if inplace:
-
-
+        # Modify the original DataFrame
+        data[:] = imputed_data[col_names_org]
+        return None
     else:
-
+        # Return the modified DataFrame
+        return imputed_data[col_names_org]
 # # example
 # data = {
 #     "A": [1, 2, np.nan, 4, 5],
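The rewritten body above splits the frame, imputes numeric and non-numeric columns separately, and reassembles everything in the original column order. A compact sketch of that pattern (toy data, KNN for numeric columns, most_frequent for the rest):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": ["x", np.nan, "x"]})
num = df.select_dtypes(include=[np.number])
cat = df.select_dtypes(exclude=[np.number])

num_imp = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(num),
                       index=num.index, columns=num.columns)
cat_imp = pd.DataFrame(SimpleImputer(strategy="most_frequent").fit_transform(cat),
                       index=cat.index, columns=cat.columns)
print(pd.concat([num_imp, cat_imp], axis=1)[df.columns.tolist()])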
@@ -5172,7 +5430,94 @@ def df_fillna(
 # display(df)
 # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))

-
+def df_encoder(
+    data: pd.DataFrame,
+    method: str = "dummy",  # 'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
+    columns=None,
+    target_column=None,  # Required for 'target' encoding method
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Methods explained:
+    - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
+
+    - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
+
+    - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
+
+    - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
+
+    - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
+
+    - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
+    """
+
+    # Select categorical columns
+    categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
+    methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
+    method = strcmp(method, methods)[0]
+
+    if columns is None:
+        columns = categorical_cols
+
+    # pd.get_dummies()
+    if method == 'dummy':
+        dtype = kwargs.pop("dtype", int)
+        drop_first = kwargs.pop("drop_first", True)
+        try:
+            encoded_df = pd.get_dummies(data[columns], drop_first=drop_first, dtype=dtype, **kwargs)
+            return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+        except Exception as e:
+            # print(f"Warning: no conversion was performed, because: {e}")
+            return data
+    # One-hot encoding
+    elif method == "onehot":
+        from sklearn.preprocessing import OneHotEncoder
+
+        encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(
+            encoded_data,
+            columns=encoder.get_feature_names_out(columns),
+            index=data.index,
+        )
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Ordinal encoding
+    elif method == "ordinal":
+        from sklearn.preprocessing import OrdinalEncoder
+
+        encoder = OrdinalEncoder(**kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Label encoding
+    elif method == "label":
+        from sklearn.preprocessing import LabelEncoder
+
+        encoder = LabelEncoder()
+        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Target encoding (Mean of the target for each category)
+    elif method == "target":
+        if target_column is None:
+            raise ValueError("target_column must be provided for target encoding.")
+        from category_encoders import TargetEncoder
+
+        encoder = TargetEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns], data[target_column])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Binary encoding (for high-cardinality categorical variables)
+    elif method == "binary":
+        from category_encoders import BinaryEncoder
+
+        encoder = BinaryEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
 def df_scaler(
     data: pd.DataFrame,  # should be numeric dtype
     method="standard",
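To make the df_encoder options added above concrete, here is a small comparison of the 'dummy' and 'ordinal' strategies it wraps (the column name and category order are invented for illustration):

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({"size": ["low", "high", "medium", "low"]})

# 'dummy': one 0/1 column per non-reference category
print(pd.get_dummies(df["size"], drop_first=True, dtype=int))

# 'ordinal': integer codes that respect an explicit ranking
enc = OrdinalEncoder(categories=[["low", "medium", "high"]])
print(enc.fit_transform(df[["size"]]))  # low->0, medium->1, high->2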
@@ -5218,9 +5563,8 @@ def df_scaler(
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
-            columns = data.select_dtypes(include=
+            columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
-        print(f"Scaling columns")

         scaled_data = scaler.fit_transform(data[columns])

@@ -5242,7 +5586,7 @@ def df_scaler(
         # Row-wise scaling
         if columns is None:
             columns = data.index.tolist()
-        numeric_rows = data.loc[columns].select_dtypes(include=
+        numeric_rows = data.loc[columns].select_dtypes(include=np.number)
         if numeric_rows.empty:
             raise ValueError("No numeric rows to scale.")

@@ -5259,7 +5603,31 @@ def df_scaler(
     scaled_df = data.copy()
     scaled_df.loc[numeric_rows.index] = scaled_data
     return scaled_df
+def df_special_characters_cleaner(
+    data: pd.DataFrame, where=["column", "content", "index"]
+) -> pd.DataFrame:
+    """
+    to clean special characters:
+    usage:
+    df_special_characters_cleaner(data=df, where='column')
+    """
+    if not isinstance(where, list):
+        where = [where]
+    where_to_clean = ["column", "content", "index"]
+    where_ = [strcmp(i, where_to_clean)[0] for i in where]
+
+    # 1. Clean column names by replacing special characters with underscores
+    if "column" in where_:
+        data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+
+    # 2. Clean only object-type columns (text columns)
+    if "content" in where_:
+        for col in data.select_dtypes(include=["object"]).columns:
+            data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+    if data.index.dtype == "object" and "index" in where_:
+        data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)

+    return data
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
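df_special_characters_cleaner above is essentially three vectorized regex replaces; a standalone sketch with made-up labels:

import pandas as pd

df = pd.DataFrame({"price ($)": ["1,200", "3,400"]}, index=["row#1", "row#2"])
df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)   # column names
df.index = df.index.str.replace(r"[^\w\s]", "_", regex=True)       # index labels
for col in df.select_dtypes(include=["object"]).columns:           # cell contents
    df[col] = df[col].str.replace(r"[^\w\s]", "", regex=True)
print(df)  # punctuation in labels becomes "_", punctuation in cells is stripped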
@@ -5268,8 +5636,8 @@ def df_cluster(
     scale: bool = True,
     plot: Union[str, list] = "all",
     inplace: bool = True,
-    ax
-    )
+    ax = None,
+):
     from sklearn.preprocessing import StandardScaler
     from sklearn.cluster import KMeans
     from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5277,7 +5645,6 @@ def df_cluster(
     import numpy as np
     import pandas as pd
     import matplotlib.pyplot as plt
-    import seaborn as sns

     """
     Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5585,94 +5952,61 @@ def df_reducer(
     umap_neighbors: int = 15,  # UMAP-specific
     umap_min_dist: float = 0.1,  # UMAP-specific
     tsne_perplexity: int = 30,  # t-SNE-specific
+    hue: str = None,  # lda-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True,  # replace the original data
     plot_: bool = False,  # plot scatterplot, but no 'hue', so it is meaningless
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    columns : List[str], optional
-        List of column names to reduce. If None, all columns are used.
-
-    method : str, optional, default="umap"
-        Dimensionality reduction method, either "pca" or "umap".
-
-    n_components : int, optional, default=50
-        Number of components for PCA or UMAP.
-
-    umap_neighbors : int, optional, default=15
-        Number of neighbors considered for UMAP embedding.
-
-    umap_min_dist : float, optional, default=0.1
-        Minimum distance between points in UMAP embedding.
-
-    scale : bool, optional, default=True
-        Whether to scale the data using StandardScaler.
-
-    fill_missing : bool, optional, default=True
-        Whether to fill missing values using the mean before applying PCA/UMAP.
+    random_state=1,
+    ax = None,
+    figsize=None,
+    **kwargs
+) -> pd.DataFrame:
+    dict_methods = {
+        #! Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
+        "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
+        "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
+        "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
+        "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
+
+        #! Non-linear Dimensionality Reduction (Manifold Learning)
+        "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
+        "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
+        "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
+        "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
+        "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
+        "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing). It is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
+
+        #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
+        "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
+    }

-    Returns:
-    --------
-    reduced_df : pd.DataFrame
-        DataFrame with the reduced dimensions.
-    """
-
-    """
-    PCA: explained_variance:
-        indicates the proportion of the dataset's total variance that each principal
-        component (PC) explains. It gives you a sense of how much information
-        (or variance) is captured by each PC
-    Interpretation:
-        - Higher values indicate that the corresponding PC captures more variance.
-        - The sum of the explained variances for all PCs equals 1 (or 100%).
-        - If the first few components explain a high percentage (e.g., 90%),
-        it means you can reduce the dimensionality of the data significantly without losing much information.
-    Use case:
-        You may plot a scree plot, which shows the explained variance for each PC, to help decide
-        how many components to keep for analysis.
-
-    PCA: Singular values:
-        represent the magnitude of variance along each principal component. Mathematically,
-        they are the square roots of the eigenvalues of the covariance matrix.
-    Interpretation:
-        Larger singular values indicate that the associated PC captures more variance.
-        Singular values are related to the scale of the data. If the data are scaled
-        before PCA (e.g., standardized), then the singular values will provide a measure
-        of the spread of data along each PC.
-    Use case:
-        Singular values help quantify the contribution of each principal component in a
-        similar way to the explained variance. They are useful in understanding the overall
-        structure of the data.
-    """
     from sklearn.preprocessing import StandardScaler
     from sklearn.impute import SimpleImputer
-
-
-
-
+    if plot_:
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+    # Check valid method input
+    methods = ["pca", "umap", "tsne", "factor", "isolation_forest", "lda", "kpca", "ica", "mds", "lle", "svd"]
+    method = strcmp(method, methods)[0]
+    print(f"\nprocessing using {dict_methods[method]}:")
+    xlabel, ylabel = None, None
+    if columns is None:
+        columns = data.select_dtypes(include='number').columns.tolist()
+    if hue is None:
+        hue = data.select_dtypes(exclude='number').columns.tolist()
+        if isinstance(hue, list):
+            print("Warning: hue is a list, only select the 1st one")
+            hue = hue[0]
+    if not hue:
+        # Select columns if specified, else use all columns
+        X = data[columns].values if columns else data.values
+    else:
+        # Select columns to reduce and hue for LDA
+        X = data[columns].values if columns else data.drop(columns=[hue]).values
+        y = data[hue].values
+    print(X.shape)
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
@@ -5683,9 +6017,6 @@ def df_reducer(
         scaler = StandardScaler()
         X = scaler.fit_transform(X)

-    # Check valid method input
-    methods=["pca", "umap","tsne","factor","isolation_forest"]
-    method=strcmp(method, methods)[0]
     # Apply PCA if selected
     if method == "pca":
         from sklearn.decomposition import PCA
@@ -5729,7 +6060,27 @@ def df_reducer(
             pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
         for i in range(n_components):
             pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
+        if hue:
+            pca_df[hue] = y
+    elif method == 'lda':
+        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+        if "hue" not in locals() or hue is None:
+            raise ValueError("LDA requires a 'hue' col parameter to specify class labels.")

+        lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
+        X_reduced = lda_reducer.fit_transform(X, y)
+
+        # Prepare reduced DataFrame with additional LDA info
+        lda_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"LDA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print(f"LDA completed: Reduced to {n_components} components.")
+            print("Class separability achieved by LDA.")
+        if hue:
+            lda_df[hue] = y
     # Apply UMAP if selected
     elif method == "umap":
         import umap
@@ -5756,32 +6107,36 @@ def df_reducer(
         )
         umap_df["Embedding"] = embedding[:, 0]  # Example of embedding data
         umap_df["Trustworthiness"] = trustworthiness[:, 0]  # Trustworthiness metric
+        if hue:
+            umap_df[hue] = y
     elif method == "tsne":
         from sklearn.manifold import TSNE
-        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=
-        X_reduced = tsne.fit_transform(X)
-
-        # Prepare reduced DataFrame with additional t-SNE info
+        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=random_state)
+        X_reduced = tsne.fit_transform(X)
         tsne_df = pd.DataFrame(
-            X_reduced,
+            X_reduced,
+            index=data.index,
             columns=[f"tSNE_{i+1}" for i in range(n_components)]
         )
         tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
-
+        if hue:
+            tsne_df[hue] = y
     # Apply Factor Analysis if selected
     elif method == "factor":
         from sklearn.decomposition import FactorAnalysis
-        factor = FactorAnalysis(n_components=n_components, random_state=
+        factor = FactorAnalysis(n_components=n_components, random_state=random_state)
         X_reduced = factor.fit_transform(X)
         # Factor Analysis does not directly provide explained variance, but we can approximate it
         fa_variance = factor.noise_variance_
         # Prepare reduced DataFrame with additional Factor Analysis info
         factor_df = pd.DataFrame(
-            X_reduced,
+            X_reduced,
+            index=data.index,
             columns=[f"Factor_{i+1}" for i in range(n_components)]
         )
         factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
-
+        if hue:
+            factor_df[hue] = y
     # Apply Isolation Forest for outlier detection if selected
     elif method == "isolation_forest":
         from sklearn.decomposition import PCA
@@ -5812,48 +6167,100 @@ def df_reducer(
             iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
         for i in range(n_components):
             iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+        if hue:
+            iso_forest_df[hue] = y
+    #* Apply Kernel PCA if selected
+    elif method == "kpca":
+        from sklearn.decomposition import KernelPCA
+        kpca = KernelPCA(n_components=n_components, kernel="rbf", random_state=random_state)
+        X_reduced = kpca.fit_transform(X)
+
+        # Prepare reduced DataFrame with KPCA info
+        kpca_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"KPCA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Kernel PCA completed with RBF kernel.")
+        if hue:
+            kpca_df[hue] = y
+    #* Apply ICA if selected
+    elif method == "ica":
+        from sklearn.decomposition import FastICA
+        ica = FastICA(n_components=n_components, random_state=random_state)
+        X_reduced = ica.fit_transform(X)
+
+        # Prepare reduced DataFrame with ICA info
+        ica_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"ICA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Independent Component Analysis (ICA) completed.")
+        if hue:
+            ica_df[hue] = y
+    #* Apply MDS if selected
+    elif method == "mds":
+        from sklearn.manifold import MDS
+        mds = MDS(n_components=n_components, random_state=random_state)
+        X_reduced = mds.fit_transform(X)
+
+        # Prepare reduced DataFrame with MDS info
+        mds_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"MDS_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Multidimensional Scaling (MDS) completed.")
+        if hue:
+            mds_df[hue] = y
+    #* Apply Locally Linear Embedding (LLE) if selected
+    elif method == "lle":
+        from sklearn.manifold import LocallyLinearEmbedding
+        lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=umap_neighbors, random_state=random_state)
+        X_reduced = lle.fit_transform(X)
+
+        # Prepare reduced DataFrame with LLE info
+        lle_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"LLE_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Locally Linear Embedding (LLE) completed.")
+        if hue:
+            lle_df[hue] = y
+    #* Apply Singular Value Decomposition (SVD) if selected
+    elif method == "svd":
+        # Using NumPy's SVD for dimensionality reduction
+        U, s, Vt = np.linalg.svd(X, full_matrices=False)
+        X_reduced = U[:, :n_components] * s[:n_components]
+
+        # Prepare reduced DataFrame with SVD info
+        svd_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"SVD_{i+1}" for i in range(n_components)]
+        )
+        if hue:
+            svd_df[hue] = y
+        if debug:
+            print("Singular Value Decomposition (SVD) completed.")

     # Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
         colname_met = "PC_"
-        if plot_:
-            sns.scatterplot(
-                data=pca_df,
-                x="PC_1",
-                y="PC_2",
-                # hue="condition",
-            )
+        xlabel = f"PC_1 ({pca_df['Explained Variance PC_1'].tolist()[0]})"
+        ylabel = f"PC_2 ({pca_df['Explained Variance PC_2'].tolist()[0]})"
     elif method == "umap":
         reduced_df = umap_df
-        colname_met = "UMAP_"
-        if plot_:
-            sns.scatterplot(
-                data=umap_df,
-                x="UMAP_1",
-                y="UMAP_2",
-                # hue="condition",
-            )
+        colname_met = "UMAP_"
     elif method == "tsne":
         reduced_df = tsne_df
-        colname_met = "
-        if plot_:
-            sns.scatterplot(
-                data=tsne_df,
-                x="tSNE_1",
-                y="tSNE_2",
-                # hue="batch",
-            )
+        colname_met = "tSNE_"
     elif method == "factor":
         reduced_df = factor_df
-        colname_met = "Factor_"
-        if plot_:
-            sns.scatterplot(
-                data=factor_df,
-                x="Factor_1",
-                y="Factor_2",
-                # hue="batch",
-            )
+        colname_met = "Factor_"
     elif method == "isolation_forest":
         reduced_df = iso_forest_df  # Already a DataFrame for outliers
         colname_met = "PC_"
@@ -5872,33 +6279,71 @@ def df_reducer(
             c="r",
             label="outlier", marker="+", s=30,
         )
-
+    elif method == 'lda':
+        reduced_df = lda_df
+        colname_met = "LDA_"
+    elif method == "kpca":
+        reduced_df = kpca_df
+        colname_met = "KPCA_"
+    elif method == "ica":
+        reduced_df = ica_df
+        colname_met = "ICA_"
+    elif method == "mds":
+        reduced_df = mds_df
+        colname_met = "MDS_"
+    elif method == "lle":
+        reduced_df = lle_df
+        colname_met = "LLE_"
+    elif method == "svd":
+        reduced_df = svd_df
+        colname_met = "SVD_"
+    # Quick plots
+    if plot_ and (not method in ["isolation_forest"]):
+        from .plot import plotxy
+        if ax is None:
+            if figsize is None:
+                _, ax = plt.subplots(figsize=cm2inch(8,8))
+            else:
+                _, ax = plt.subplots(figsize=figsize)
+        else:
+            ax.cla()
+        ax = plotxy(data=reduced_df,
+                    x=colname_met + "1",
+                    y=colname_met + "2",
+                    hue=hue,
+                    s=1,
+                    edgecolor='none',
+                    kind='scatter',
+                    figsets=dict(legend=dict(loc='best', markerscale=4),
+                                 xlabel=xlabel if xlabel else None,
+                                 ylabel=ylabel if ylabel else None),
+                    ax=ax,
+                    verbose=False,
+                    **kwargs
+                    )

     if inplace:
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+            data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
         # Add extra info for PCA/UMAP
         if method == "pca":
             for i in range(n_components):
-                data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
+                data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[:, f"Explained Variance PC_{i+1}"]
             for i in range(n_components):
-                data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
+                data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[:, f"Singular Values PC_{i+1}"]
         elif method == "umap":
             for i in range(n_components):
-                data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
-            data["Embedding"] = reduced_df["Embedding"]
-            data["Trustworthiness"] = reduced_df["Trustworthiness"]
+                data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
+            data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
+            data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
+
         return None  # No return when inplace=True
-

     return reduced_df
-
-
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)

-
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
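The df_reducer changes above add supervised LDA alongside the unsupervised methods, keyed by the new `hue` argument. A hedged sketch of the underlying scikit-learn calls on the iris data (df_reducer itself wraps these plus the extra bookkeeping columns):

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# unsupervised: directions of maximal variance
pca_df = pd.DataFrame(PCA(n_components=2).fit_transform(X),
                      columns=["PC_1", "PC_2"], index=X.index)
# supervised: directions of maximal class separation (needs labels, i.e. `hue`)
lda_df = pd.DataFrame(LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y),
                      columns=["LDA_1", "LDA_2"], index=X.index)
print(pca_df.head(2))
print(lda_df.head(2))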
@@ -5922,7 +6367,7 @@ def plot_cluster(
     """
     import seaborn as sns
     from sklearn.metrics import silhouette_samples
-
+    import matplotlib.pyplot as plt
     if metrics is None:
         metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
