py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/batman.py +32 -1
- py2ls/bio.py +3 -17
- py2ls/data/usages_sns.json +2 -1
- py2ls/ips.py +1694 -838
- py2ls/ml2ls.py +1877 -391
- py2ls/plot.py +500 -222
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/RECORD +10 -10
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.9.dist-info}/WHEEL +1 -1
py2ls/ips.py
CHANGED
````diff
@@ -1,55 +1,8 @@
 import numpy as np
 import pandas as pd
-
-import
-import matplotlib
-import matplotlib.pyplot as plt
-import matplotlib.ticker as tck
-from cycler import cycler
-from mpl_toolkits.mplot3d import Axes3D
-import seaborn as sns
-
-from sklearn.kernel_approximation import KERNEL_PARAMS
-from sympy import is_increasing
-import sys, os, shutil, re, yaml, json, subprocess
-import importlib.util
-import time
-from dateutil import parser
-from datetime import datetime
-import schedule
-
-from PIL import Image, ImageEnhance, ImageOps, ImageFilter
-from rembg import remove, new_session
-
-import docx
-from fpdf import FPDF
-from lxml import etree
-from docx import Document
-from PyPDF2 import PdfReader
-from pptx import Presentation
-from pptx.util import Inches
-from pdf2image import convert_from_path, pdfinfo_from_path
-from nltk.tokenize import sent_tokenize, word_tokenize
-import nltk  # nltk.download("punkt")
-from docx2pdf import convert
-import img2pdf as image2pdf
-import nbformat
-from nbconvert import MarkdownExporter
-
-from itertools import pairwise
-from box import Box, BoxList
-from numerizer import numerize
-from tqdm import tqdm
-import mimetypes
-from pprint import pp
-from collections import Counter
-from fuzzywuzzy import fuzz, process
-from langdetect import detect
-from duckduckgo_search import DDGS
+import sys, os
+from IPython.display import display
 from typing import List, Optional, Union
-from bs4 import BeautifulSoup
-
-from . import netfinder
 
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
@@ -57,6 +10,35 @@ try:
 except NameError:
     pass
 
+import warnings
+
+warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
+warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+
+
+def run_once_within(duration=60):  # default 60s
+    import time
+
+    """
+    usage:
+    if run_once_within():
+        print("This code runs once per minute.")
+    else:
+        print("The code has already been run in the last minute.")
+    """
+    if not hasattr(run_once_within, "time_last"):
+        run_once_within.time_last = None
+    time_curr = time.time()
+
+    if (run_once_within.time_last is None) or (
+        time_curr - run_once_within.time_last >= duration
+    ):
+        run_once_within.time_last = time_curr  # Update the last execution time
+        return True
+    else:
+        return False
+
+
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     Add the Chinese (default) font to the font manager
````
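Usage note (not part of the diff): the new `run_once_within()` helper added at the top of `ips.py` gates a block so it runs at most once per `duration` seconds, following its own docstring. A minimal sketch; the `from py2ls import ips` import path is an assumption, not something stated in this diff:

```python
from py2ls import ips  # assumed import path

for _ in range(3):
    # Runs the guarded block at most once per 60 seconds.
    if ips.run_once_within(duration=60):
        print("This code runs once per minute.")
    else:
        print("The code has already been run in the last minute.")
```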
````diff
@@ -66,13 +48,14 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     import matplotlib.pyplot as plt
     from matplotlib import font_manager
-
+
+    slashtype = "/" if "mac" in get_os() else "\\"
     if slashtype in dir_font:
         font_manager.fontManager.addfont(dir_font)
         fontname = os.path.basename(dir_font).split(".")[0]
     else:
         if "cn" in dir_font.lower() or "ch" in dir_font.lower():
-            fontname = "Hiragino Sans GB"
+            fontname = "Hiragino Sans GB"  # default Chinese font
         else:
             fontname = dir_font
 
@@ -86,6 +69,7 @@ def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     plt.rcParams["font.sans-serif"] = ["Arial"]
     return fontname
 
+
 # set 'dir_save'
 if "dar" in sys.platform:
     dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
@@ -155,6 +139,9 @@ def run_every(when: str = None, job=None, wait: int = 60):
     :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
     :param job: The function to be scheduled.
     """
+    import schedule
+    import time
+
     if job is None:
         print("No job provided!")
         return
@@ -200,6 +187,9 @@ def run_at(when: str, job=None, wait: int = 60):
     :param job: The function to be scheduled.
     :param wait: The sleep interval between checks in seconds.
     """
+    from datetime import datetime
+    import time
+
     if job is None:
         print("No job provided!")
         return
@@ -279,11 +269,13 @@ def get_timezone(timezone: str | list = None):
 
 
 def is_package_installed(package_name):
     """Check if a package is installed."""
+    import importlib.util
+
     package_spec = importlib.util.find_spec(package_name)
     return package_spec is not None
 
 
-def upgrade(module="py2ls",uninstall=False):
+def upgrade(module="py2ls", uninstall=False):
     """
     Installs or upgrades a specified Python module.
 
@@ -291,6 +283,8 @@ def upgrade(module="py2ls",uninstall=False):
         module (str): The name of the module to install/upgrade.
         uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
     """
+    import subprocess
+
     if not is_package_installed(module):
         try:
             subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +321,8 @@ def get_version(pkg):
 
 
 def rm_folder(folder_path, verbose=True):
+    import shutil
+
     try:
         shutil.rmtree(folder_path)
         if verbose:
@@ -345,6 +341,8 @@ def fremove(path, verbose=True):
     """
     try:
         if os.path.isdir(path):
+            import shutil
+
             shutil.rmtree(path)
             if verbose:
                 print(f"Successfully deleted folder {path}")
@@ -360,22 +358,31 @@ def fremove(path, verbose=True):
         print(f"Failed to delete {path}. Reason: {e}")
 
 
-def get_cwd(verbose: bool = True):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# def get_cwd(verbose: bool = True):
+#     """
+#     get_cwd: to get the current working directory
+#     Args:
+#         verbose (bool, optional): to show which function is use. Defaults to True.
+#     """
+#     try:
+#         script_dir = os.path.dirname(os.path.abspath(__file__))
+#         if verbose:
+#             print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
+#     except NameError:
+#         # This works in an interactive environment (like a Jupyter notebook)
+#         script_dir = os.getcwd()
+#         if verbose:
+#             print("os.getcwd():", script_dir)
+#     return script_dir
+
+
+def get_cwd():
+    from pathlib import Path
+
+    # Get the current script's directory as a Path object
+    current_directory = Path(__file__).resolve().parent
+
+    return current_directory
 
 
 def search(
@@ -388,6 +395,7 @@ def search(
     dir_save=dir_save,
     **kwargs,
 ):
+    from duckduckgo_search import DDGS
 
     if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
@@ -421,6 +429,7 @@ def echo(*args, **kwargs):
         str: the answer from ai
     """
     global dir_save
+    from duckduckgo_search import DDGS
 
     query = None
     model = kwargs.get("model", "gpt")
@@ -469,8 +478,13 @@ def echo(*args, **kwargs):
     model_valid = valid_mod_name(model)
     res = DDGS().chat(query, model=model_valid)
     if verbose:
+        from pprint import pp
+
         pp(res)
     if log:
+        from datetime import datetime
+        import time
+
         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
         if bool(os.path.basename(dir_save)):
@@ -492,6 +506,8 @@ def ai(*args, **kwargs):
 
 
 def detect_lang(text, output="lang", verbose=True):
+    from langdetect import detect
+
     dir_curr_script = os.path.dirname(os.path.abspath(__file__))
     dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -521,13 +537,14 @@ def is_text(s):
 
 from typing import Any, Union
 
+
 def shared(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
         list1 = [1, 2, 3, 4, 5]
         list2 = [4, 5, 6, 7, 8]
-        list3 = [5, 6, 9, 10]
+        list3 = [5, 6, 9, 10]
         a = shared(list1, list2,list3)
     """
     if verbose:
@@ -543,25 +560,34 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         print(f"{' ' * 2}All inputs must be lists.")
         return []
     first_list = flattened_lists[0]
-    shared_elements = [
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
     if strict:
-
-
-
-
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
     else:
+        from collections import Counter
+
         all_elements = [item for sublist in flattened_lists for item in sublist]
         element_count = Counter(all_elements)
         # Get elements that appear in at least n_shared lists
-        shared_elements = [
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
 
     shared_elements = flatten(shared_elements, verbose=verbose)
     if verbose:
-        elements2show =
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+
 def not_shared(*args, strict=True, n_shared=2, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
@@ -571,9 +597,9 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
         not_shared(list1,list2)# output [1,3]
     """
     _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
-    list1 = args[0]
-    _not_shared=[item for item in list1 if item not in _common]
-    return
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
 
 
 def flatten(nested: Any, unique_list=True, verbose=False):
````
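Usage note (not part of the diff): `shared()` now builds the strict intersection with `set.intersection_update`, and `not_shared()` flattens its first argument and actually returns the difference (the 0.2.4.7 version returned nothing). A hedged sketch based on the docstring examples above; the import path is an assumption:

```python
from py2ls import ips  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]

common = ips.shared(list1, list2)         # strict mode: items present in every list (4 and 5)
only_in_1 = ips.not_shared(list1, list2)  # items of list1 that are not shared (1, 2, 3)
flat = ips.flatten({"a": [1, 2], "b": (3, 4)})  # dicts/tuples/sets/pd.Index/np arrays flatten to a list
```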
````diff
@@ -582,29 +608,41 @@ def flatten(nested: Any, unique_list=True, verbose=False):
     Parameters:
         nested : Any, Can be a list, tuple, dictionary, or set.
     Returns: list, A flattened list.
-    """
+    """
     flattened_list = []
     stack = [nested]
     while stack:
         current = stack.pop()
         if isinstance(current, dict):
-            stack.extend(current.values())
+            stack.extend(current.values())
         elif isinstance(current, (list, tuple, set)):
             stack.extend(current)
         elif isinstance(current, pd.Series):
             stack.extend(current)
-        elif isinstance(
+        elif isinstance(
+            current, (pd.Index, np.ndarray)
+        ):  # df.columns df.index are object of type pd.Index
             stack.extend(current.tolist())
         else:
             flattened_list.append(current)
     if verbose:
-        print(
+        print(
+            f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>"
+        )
     if unique_list:
         return unique(flattened_list)[::-1]
     else:
         return flattened_list
-
-
+
+
+def strcmp(
+    search_term,
+    candidates,
+    ignore_case=True,
+    get_rank=False,
+    verbose=False,
+    scorer="WR",
+):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
 
@@ -617,6 +655,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     Returns:
     tuple: A tuple containing the best match and its index in the candidates list.
     """
+    from fuzzywuzzy import fuzz, process
 
     def to_lower(s, ignore_case=True):
         # Converts a string or list of strings to lowercase if ignore_case is True.
@@ -624,7 +663,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
         if isinstance(s, str):
             return s.lower()
         elif isinstance(s, list):
-            s=[str(i) for i in s]# convert all to str
+            s = [str(i) for i in s]  # convert all to str
             return [elem.lower() for elem in s]
         return s
 
@@ -634,12 +673,15 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower() or "stri" in scorer.lower()
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():  # Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
     if get_rank:
-        idx = [
+        idx = [
+            similarity_scores.index(i)
+            for i in sorted(similarity_scores, reverse=True)
+        ]
         if verbose:
             display([candidates[ii] for ii in idx])
         return [candidates[ii] for ii in idx]
@@ -667,6 +709,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     # str2 = ['PLoS Computational Biology', 'PLOS BIOLOGY']
     # best_match, idx = strcmp(str1, str2, ignore_case=1)
 
+
 def cn2pinyin(
     cn_str: Union[str, list] = None,
     sep: str = " ",
````
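Usage note (not part of the diff): `strcmp()` gained an explicit multi-line signature with a `scorer` argument ("WR" for `fuzz.WRatio`, "partial" for `fuzz.partial_ratio`, "ratio"/"stri" for the strict `fuzz.ratio`). A sketch of the assumed call pattern, reusing the candidates from the commented example in the hunk above:

```python
from py2ls import ips  # assumed import path

candidates = ["PLoS Computational Biology", "PLOS BIOLOGY"]
best_match, idx = ips.strcmp("PLoS Comp Biol", candidates, ignore_case=True, scorer="WR")
ranking = ips.strcmp("PLoS Comp Biol", candidates, get_rank=True)  # candidates sorted by score
```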
````diff
@@ -731,18 +774,21 @@ def cn2pinyin(
         style = Style.PL
     else:
         style = Style.NORMAL
-    if not isinstance(cn_str,list):
-        cn_str=[cn_str]
-    pinyin_flat=[]
+    if not isinstance(cn_str, list):
+        cn_str = [cn_str]
+    pinyin_flat = []
     for cn_str_ in cn_str:
         pinyin_string = pinyin(cn_str_, style=style)
         pinyin_flat.append(sep.join([item[0] for item in pinyin_string]))
-    if len(pinyin_flat)==1:
+    if len(pinyin_flat) == 1:
         return pinyin_flat[0]
     else:
         return pinyin_flat
 
+
 def counter(list_, verbose=True):
+    from collections import Counter
+
     c = Counter(list_)
     # Print the name counts
     for item, count in c.items():
@@ -771,6 +817,7 @@ def str2time(time_str, fmt="24"):
         %p represents AM or PM.
     - str: The converted time string.
     """
+    from datetime import datetime
 
     def time_len_corr(time_str):
         time_str_ = (
@@ -832,6 +879,8 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
     Returns:
     - str: The converted date string.
     """
+    from dateutil import parser
+
     try:
         date_obj = parser.parse(date_str)
     except ValueError as e:
@@ -848,6 +897,8 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
 
 
 def str2num(s, *args, **kwargs):
+    import re
+
     delimiter = kwargs.get("sep", None)
     round_digits = kwargs.get("round", None)
     if delimiter is not None:
@@ -863,6 +914,8 @@ def str2num(s, *args, **kwargs):
     try:
         num = float(s)
     except ValueError:
+        from numerizer import numerize
+
         try:
             numerized = numerize(s)
             num = int(numerized) if "." not in numerized else float(numerized)
@@ -1030,7 +1083,7 @@ def px2inch(*px, dpi=300) -> list:
     return [i / dpi for i in px]
 
 
-def
+def inch2cm(*cm) -> list:
     """
     cm2inch: converts centimeter measurements to inches.
     Usage:
@@ -1051,24 +1104,30 @@ def cm2inch(*cm) -> list:
 def inch2px(*inch, dpi=300) -> list:
     """
     inch2px: converts inch measurements to pixels based on the given dpi.
+
     Usage:
     inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
+
+    Parameters:
+    inch : float, list, or tuple
+        Single or multiple measurements in inches to convert to pixels.
+    dpi : int, optional (default=300)
+        Dots per inch (DPI), representing the pixel density.
+
     Returns:
-    list: in pixels
+    list: Converted measurements in pixels.
     """
-    # Case 1: When the user passes a single argument that is a list or tuple,
+    # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
     if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
-        # If the input is a single list or tuple, we unpack its elements and convert each to pixels
         return [i * dpi for i in inch[0]]
-
+
+    # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
     else:
-        # Here, we convert each individual argument directly to pixels
         return [i * dpi for i in inch]
 
 
-def
+def cm2inch(*inch) -> list:
     """
-    inch2cm: converts inch measurements to centimeters.
     Usage:
     inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
     Returns:
````
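Usage note (not part of the diff): the DPI-based converters touched in this hunk are plain multiplications and divisions, so the results can be checked by hand; the import path is an assumption:

```python
from py2ls import ips  # assumed import path

ips.inch2px(1, 2, dpi=300)      # -> [300, 600]
ips.inch2px([1, 2], dpi=300)    # -> [300, 600]  (a single list/tuple is unpacked)
ips.px2inch(300, 600, dpi=300)  # -> [1.0, 2.0]
```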
````diff
@@ -1183,6 +1242,8 @@ def paper_size(paper_type_str="a4"):
 
 
 def docx2pdf(dir_docx, dir_pdf=None):
+    from docx2pdf import convert
+
     if dir_pdf:
         convert(dir_docx, dir_pdf)
     else:
@@ -1190,6 +1251,8 @@ def docx2pdf(dir_docx, dir_pdf=None):
 
 
 def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+    import img2pdf as image2pdf
+
     def mm_to_point(size):
         return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))
 
@@ -1241,6 +1304,10 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=
 
 
 def pdf2ppt(dir_pdf, dir_ppt):
+    from PyPDF2 import PdfReader
+    from pptx.util import Inches
+    from pptx import Presentation
+
     prs = Presentation()
 
     # Open the PDF file
@@ -1269,6 +1336,8 @@ def pdf2ppt(dir_pdf, dir_ppt):
 
 
 def ssplit(text, by="space", verbose=False, strict=False, **kws):
+    import re
+
     if isinstance(text, list):
         nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
         flat_list = [item for sublist in nested_list for item in sublist]
@@ -1316,6 +1385,9 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
         return [text[i : i + length] for i in range(0, len(text), length)]
 
     def split_by_sent_num(text, n=10):
+        from nltk.tokenize import sent_tokenize
+        from itertools import pairwise
+
         # split text into sentences
         text_split_by_sent = sent_tokenize(text)
         cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1388,10 +1460,14 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
             print(f"splited by camel_case")
         return split_by_camel_case(text)
     elif ("word" in by) and not strict:
+        from nltk.tokenize import word_tokenize
+
         if verbose:
             print(f"splited by word")
         return word_tokenize(text)
     elif ("sen" in by and not "num" in by) and not strict:
+        from nltk.tokenize import sent_tokenize
+
         if verbose:
             print(f"splited by sentence")
         return sent_tokenize(text)
@@ -1441,9 +1517,13 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
 
 
 def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
+    from pdf2image import convert_from_path, pdfinfo_from_path
+
     df_dir_img_single_page = pd.DataFrame()
     dir_single_page = []
     if verbose:
+        from pprint import pp
+
         pp(pdfinfo_from_path(dir_pdf))
     if isinstance(page, tuple) and page:
         page = list(page)
@@ -1562,6 +1642,8 @@ def unzip(dir_path, output_dir=None):
     # If the output directory already exists, remove it and replace it
     if os.path.exists(output_dir):
         if os.path.isdir(output_dir):  # check if it is a folder
+            import shutil
+
             shutil.rmtree(output_dir)  # remove folder
         else:
             os.remove(output_dir)  # remove file
@@ -1579,6 +1661,8 @@ def unzip(dir_path, output_dir=None):
 
         output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
         try:
+            import shutil
+
             with gzip.open(dir_path, "rb") as gz_file:
                 with open(output_file, "wb") as out_file:
                     shutil.copyfileobj(gz_file, out_file)
@@ -1586,11 +1670,14 @@ def unzip(dir_path, output_dir=None):
         except FileNotFoundError:
             print(f"Error: The file '{dir_path}' was not found.")
         except PermissionError:
-            print(
+            print(
+                f"Error: Permission denied when accessing '{dir_path}' or writing to '{output_file}'."
+            )
         except Exception as e:
             try:
                 import tarfile
-
+
+                with tarfile.open(dir_path, "r:gz") as tar:
                     tar.extractall(path=output_file)
             except Exception as final_e:
                 print(f"An final unexpected error occurred: {final_e}")
@@ -1676,11 +1763,13 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
 
     """
     if not isinstance(df, pd.DataFrame):
+        if verbose:
+            print("not pd.DataFrame")
         return False
-    df.columns = df.columns.astype(str)# 把它变成str, 这样就可以进行counts运算了
+    df.columns = df.columns.astype(str)  # 把它变成str, 这样就可以进行counts运算了
     # Initialize a list to hold messages about abnormalities
     messages = []
-    is_abnormal =
+    is_abnormal = False
     # Check the shape of the DataFrame
     actual_shape = df.shape
     messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1705,25 +1794,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
             is_abnormal = True
             if verbose:
                 print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
-
+    if verbose:
+        print("1", is_abnormal)
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
-
+    if verbose:
+        print("2", is_abnormal)
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
         if verbose:
             print(f'delimiter_counts[""] > 3')
-
+    if verbose:
+        print("3", is_abnormal)
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
-
+    if verbose:
+        print("4", is_abnormal)
     # # Check for missing values
     # missing_values = df.isnull().sum()
     # if missing_values.any():
@@ -1742,8 +1835,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
         is_abnormal = True
         if verbose:
-            print(f
-
+            print(f"df.columns[df.nunique() == 1].tolist()")
+    if verbose:
+        print("5", is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1751,8 +1845,9 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         )
         is_abnormal = True
         if verbose:
-            print(f
-
+            print(f"actual_shape[0] < 2 or actual_shape[1] < 2")
+    if verbose:
+        print("6", is_abnormal)
     # Compile results
     if verbose:
         print("\n".join(messages))
````
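Usage note (not part of the diff): `is_df_abnormal()` is the sanity check that `fload()` uses below to decide whether a parsed CSV looks broken (a single tab-packed column, too many delimiters in headers, fewer than two rows or columns, and so on). A small, assumed illustration:

```python
import pandas as pd
from py2ls import ips  # assumed import path

ok = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
ips.is_df_abnormal(ok, verbose=True)             # expected: False for a well-formed frame

packed = pd.DataFrame({"x\ty\tz": ["1\t2\t3"]})  # one tab-packed column
ips.is_df_abnormal(packed, verbose=True)         # expected: True (flagged as abnormal)
```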
@@ -1770,6 +1865,26 @@ def fload(fpath, kind=None, **kwargs):
|
|
1770
1865
|
content: The content loaded from the file.
|
1771
1866
|
"""
|
1772
1867
|
|
1868
|
+
def read_mplstyle(style_file):
|
1869
|
+
import matplotlib.pyplot as plt
|
1870
|
+
|
1871
|
+
# Load the style file
|
1872
|
+
plt.style.use(style_file)
|
1873
|
+
|
1874
|
+
# Get the current style properties
|
1875
|
+
style_dict = plt.rcParams
|
1876
|
+
|
1877
|
+
# Convert to dictionary
|
1878
|
+
style_dict = dict(style_dict)
|
1879
|
+
# Print the style dictionary
|
1880
|
+
for i, j in style_dict.items():
|
1881
|
+
print(f"\n{i}::::{j}")
|
1882
|
+
return style_dict
|
1883
|
+
|
1884
|
+
# #example usage:
|
1885
|
+
# style_file = "/ std-colors.mplstyle"
|
1886
|
+
# style_dict = read_mplstyle(style_file)
|
1887
|
+
|
1773
1888
|
def load_txt_md(fpath):
|
1774
1889
|
with open(fpath, "r") as file:
|
1775
1890
|
content = file.read()
|
@@ -1779,25 +1894,30 @@ def fload(fpath, kind=None, **kwargs):
|
|
1779
1894
|
# with open(fpath, "r") as file:
|
1780
1895
|
# content = file.read()
|
1781
1896
|
# return content
|
1782
|
-
def load_html(fpath
|
1783
|
-
return pd.read_html(fpath
|
1897
|
+
def load_html(fpath, **kwargs):
|
1898
|
+
return pd.read_html(fpath, **kwargs)
|
1784
1899
|
|
1785
1900
|
def load_json(fpath, **kwargs):
|
1786
|
-
output=kwargs.pop("output","json")
|
1787
|
-
if output==
|
1901
|
+
output = kwargs.pop("output", "json")
|
1902
|
+
if output == "json":
|
1903
|
+
import json
|
1904
|
+
|
1788
1905
|
with open(fpath, "r") as file:
|
1789
1906
|
content = json.load(file)
|
1790
1907
|
return content
|
1791
1908
|
else:
|
1792
|
-
return pd.read_json(fpath
|
1909
|
+
return pd.read_json(fpath, **kwargs)
|
1793
1910
|
|
1794
1911
|
def load_yaml(fpath):
|
1912
|
+
import yaml
|
1913
|
+
|
1795
1914
|
with open(fpath, "r") as file:
|
1796
1915
|
content = yaml.safe_load(file)
|
1797
1916
|
return content
|
1798
1917
|
|
1799
|
-
|
1800
1918
|
def load_xml(fpath, fsize_thr: int = 100):
|
1919
|
+
from lxml import etree
|
1920
|
+
|
1801
1921
|
def load_small_xml(fpath):
|
1802
1922
|
tree = etree.parse(fpath)
|
1803
1923
|
root = tree.getroot()
|
@@ -1857,6 +1977,15 @@ def fload(fpath, kind=None, **kwargs):
|
|
1857
1977
|
return char
|
1858
1978
|
return None
|
1859
1979
|
|
1980
|
+
def _get_chunks(df_fake):
|
1981
|
+
"""
|
1982
|
+
helper func for 'load_csv'
|
1983
|
+
"""
|
1984
|
+
chunks = []
|
1985
|
+
for chunk in df_fake:
|
1986
|
+
chunks.append(chunk)
|
1987
|
+
return pd.concat(chunks, ignore_index=True)
|
1988
|
+
|
1860
1989
|
def load_csv(fpath, **kwargs):
|
1861
1990
|
from pandas.errors import EmptyDataError
|
1862
1991
|
|
@@ -1868,12 +1997,17 @@ def fload(fpath, kind=None, **kwargs):
|
|
1868
1997
|
encoding = kwargs.pop("encoding", "utf-8")
|
1869
1998
|
on_bad_lines = kwargs.pop("on_bad_lines", "skip")
|
1870
1999
|
comment = kwargs.pop("comment", None)
|
1871
|
-
fmt=kwargs.pop("fmt",False)
|
1872
|
-
|
1873
|
-
if
|
2000
|
+
fmt = kwargs.pop("fmt", False)
|
2001
|
+
chunksize = kwargs.pop("chunksize", None)
|
2002
|
+
engine = "c" if chunksize else engine # when chunksize, recommend 'c'
|
2003
|
+
low_memory = kwargs.pop("low_memory", True)
|
2004
|
+
low_memory = (
|
2005
|
+
False if chunksize else True
|
2006
|
+
) # when chunksize, recommend low_memory=False
|
2007
|
+
verbose = kwargs.pop("verbose", False)
|
2008
|
+
if run_once_within():
|
1874
2009
|
use_pd("read_csv", verbose=verbose)
|
1875
|
-
|
1876
|
-
|
2010
|
+
|
1877
2011
|
if comment is None:
|
1878
2012
|
comment = get_comment(
|
1879
2013
|
fpath, comment=None, encoding="utf-8", lines_to_check=5
|
@@ -1890,14 +2024,19 @@ def fload(fpath, kind=None, **kwargs):
|
|
1890
2024
|
skipinitialspace=skipinitialspace,
|
1891
2025
|
sep=sep,
|
1892
2026
|
on_bad_lines=on_bad_lines,
|
2027
|
+
chunksize=chunksize,
|
2028
|
+
low_memory=low_memory,
|
1893
2029
|
**kwargs,
|
1894
2030
|
)
|
1895
|
-
if
|
2031
|
+
if chunksize:
|
2032
|
+
df = _get_chunks(df)
|
2033
|
+
print(df.shape)
|
2034
|
+
if is_df_abnormal(df, verbose=0): # raise error
|
1896
2035
|
raise ValueError("the df is abnormal")
|
1897
2036
|
except:
|
1898
2037
|
try:
|
1899
2038
|
try:
|
1900
|
-
if engine == "pyarrow":
|
2039
|
+
if engine == "pyarrow" and not chunksize:
|
1901
2040
|
df = pd.read_csv(
|
1902
2041
|
fpath,
|
1903
2042
|
engine=engine,
|
@@ -1906,6 +2045,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
1906
2045
|
sep=sep,
|
1907
2046
|
on_bad_lines=on_bad_lines,
|
1908
2047
|
comment=comment,
|
2048
|
+
low_memory=low_memory,
|
1909
2049
|
**kwargs,
|
1910
2050
|
)
|
1911
2051
|
else:
|
@@ -1919,14 +2059,19 @@ def fload(fpath, kind=None, **kwargs):
|
|
1919
2059
|
skipinitialspace=skipinitialspace,
|
1920
2060
|
on_bad_lines=on_bad_lines,
|
1921
2061
|
comment=comment,
|
2062
|
+
chunksize=chunksize,
|
2063
|
+
low_memory=low_memory,
|
1922
2064
|
**kwargs,
|
1923
2065
|
)
|
2066
|
+
if chunksize:
|
2067
|
+
df = _get_chunks(df)
|
2068
|
+
print(df.shape)
|
1924
2069
|
if is_df_abnormal(df, verbose=0):
|
1925
2070
|
raise ValueError("the df is abnormal")
|
1926
2071
|
except (UnicodeDecodeError, ValueError):
|
1927
2072
|
encoding = get_encoding(fpath)
|
1928
2073
|
# print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
|
1929
|
-
if engine == "pyarrow":
|
2074
|
+
if engine == "pyarrow" and not chunksize:
|
1930
2075
|
df = pd.read_csv(
|
1931
2076
|
fpath,
|
1932
2077
|
engine=engine,
|
@@ -1935,6 +2080,7 @@ def fload(fpath, kind=None, **kwargs):
|
|
1935
2080
|
sep=sep,
|
1936
2081
|
on_bad_lines=on_bad_lines,
|
1937
2082
|
comment=comment,
|
2083
|
+
low_memory=low_memory,
|
1938
2084
|
**kwargs,
|
1939
2085
|
)
|
1940
2086
|
else:
|
@@ -1948,8 +2094,13 @@ def fload(fpath, kind=None, **kwargs):
|
|
1948
2094
|
skipinitialspace=skipinitialspace,
|
1949
2095
|
on_bad_lines=on_bad_lines,
|
1950
2096
|
comment=comment,
|
2097
|
+
chunksize=chunksize,
|
2098
|
+
low_memory=low_memory,
|
1951
2099
|
**kwargs,
|
1952
2100
|
)
|
2101
|
+
if chunksize:
|
2102
|
+
df = _get_chunks(df)
|
2103
|
+
print(df.shape)
|
1953
2104
|
if is_df_abnormal(df, verbose=0):
|
1954
2105
|
raise ValueError("the df is abnormal")
|
1955
2106
|
except Exception as e:
|
@@ -1966,8 +2117,13 @@ def fload(fpath, kind=None, **kwargs):
|
|
1966
2117
|
sep=sep,
|
1967
2118
|
on_bad_lines=on_bad_lines,
|
1968
2119
|
comment=comment,
|
2120
|
+
chunksize=chunksize,
|
2121
|
+
low_memory=low_memory,
|
1969
2122
|
**kwargs,
|
1970
2123
|
)
|
2124
|
+
if chunksize:
|
2125
|
+
df = _get_chunks(df)
|
2126
|
+
print(df.shape)
|
1971
2127
|
if not is_df_abnormal(df, verbose=0): # normal
|
1972
2128
|
display(df.head(2))
|
1973
2129
|
print(f"shape: {df.shape}")
|
@@ -1975,51 +2131,64 @@ def fload(fpath, kind=None, **kwargs):
|
|
1975
2131
|
except:
|
1976
2132
|
pass
|
1977
2133
|
else:
|
1978
|
-
|
1979
|
-
|
1980
|
-
|
1981
|
-
|
1982
|
-
|
1983
|
-
|
1984
|
-
|
1985
|
-
|
1986
|
-
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1997
|
-
display(df.head(2))
|
1998
|
-
print(f"
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2002
|
-
|
2003
|
-
|
2134
|
+
if not chunksize:
|
2135
|
+
engines = [None, "c", "python"]
|
2136
|
+
for engine in engines:
|
2137
|
+
separators = [",", "\t", ";", "|", " "]
|
2138
|
+
for sep in separators:
|
2139
|
+
try:
|
2140
|
+
# sep2show = sep if sep != "\t" else "\\t"
|
2141
|
+
# print(f"trying with: engine={engine}, sep='{sep2show}'")
|
2142
|
+
# print(".")
|
2143
|
+
df = pd.read_csv(
|
2144
|
+
fpath,
|
2145
|
+
engine=engine,
|
2146
|
+
sep=sep,
|
2147
|
+
on_bad_lines=on_bad_lines,
|
2148
|
+
comment=comment,
|
2149
|
+
chunksize=chunksize,
|
2150
|
+
low_memory=low_memory,
|
2151
|
+
**kwargs,
|
2152
|
+
)
|
2153
|
+
# display(df.head(2))
|
2154
|
+
# print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
|
2155
|
+
if chunksize:
|
2156
|
+
df = _get_chunks(df)
|
2157
|
+
print(df.shape)
|
2158
|
+
if not is_df_abnormal(df, verbose=0):
|
2159
|
+
(
|
2160
|
+
display(df.head(2))
|
2161
|
+
if isinstance(df, pd.DataFrame)
|
2162
|
+
else display("it is not a DataFrame")
|
2163
|
+
)
|
2164
|
+
(
|
2165
|
+
print(f"shape: {df.shape}")
|
2166
|
+
if isinstance(df, pd.DataFrame)
|
2167
|
+
else display("it is not a DataFrame")
|
2168
|
+
)
|
2169
|
+
return df
|
2170
|
+
except EmptyDataError as e:
|
2171
|
+
continue
|
2172
|
+
else:
|
2173
|
+
pass
|
2004
2174
|
display(df.head(2))
|
2005
2175
|
print(f"shape: {df.shape}")
|
2006
2176
|
return df
|
2007
2177
|
|
2008
2178
|
def load_excel(fpath, **kwargs):
|
2009
2179
|
engine = kwargs.get("engine", "openpyxl")
|
2010
|
-
verbose=kwargs.pop("verbose",False)
|
2011
|
-
if
|
2180
|
+
verbose = kwargs.pop("verbose", False)
|
2181
|
+
if run_once_within():
|
2012
2182
|
use_pd("read_excel", verbose=verbose)
|
2013
2183
|
df = pd.read_excel(fpath, engine=engine, **kwargs)
|
2014
2184
|
try:
|
2015
|
-
meata=pd.ExcelFile(fpath)
|
2185
|
+
meata = pd.ExcelFile(fpath)
|
2016
2186
|
print(f"n_sheet={len(meata.sheet_names)},\t'sheetname = 0 (default)':")
|
2017
|
-
[print(f"{i}:\t{i_}") for i,i_ in enumerate(meata.sheet_names)]
|
2187
|
+
[print(f"{i}:\t{i_}") for i, i_ in enumerate(meata.sheet_names)]
|
2018
2188
|
except:
|
2019
2189
|
pass
|
2020
2190
|
return df
|
2021
2191
|
|
2022
|
-
|
2023
2192
|
def load_parquet(fpath, **kwargs):
|
2024
2193
|
"""
|
2025
2194
|
Load a Parquet file into a Pandas DataFrame with advanced options.
|
@@ -2035,16 +2204,16 @@ def fload(fpath, kind=None, **kwargs):
|
|
2035
2204
|
Returns:
|
2036
2205
|
- df (DataFrame): The loaded DataFrame.
|
2037
2206
|
"""
|
2038
|
-
|
2207
|
+
|
2039
2208
|
engine = kwargs.get("engine", "pyarrow")
|
2040
2209
|
verbose = kwargs.pop("verbose", False)
|
2041
|
-
|
2042
|
-
if
|
2210
|
+
|
2211
|
+
if run_once_within():
|
2043
2212
|
use_pd("read_parquet", verbose=verbose)
|
2044
2213
|
try:
|
2045
2214
|
df = pd.read_parquet(fpath, engine=engine, **kwargs)
|
2046
2215
|
if verbose:
|
2047
|
-
if
|
2216
|
+
if "columns" in kwargs:
|
2048
2217
|
print(f"Loaded columns: {kwargs['columns']}")
|
2049
2218
|
else:
|
2050
2219
|
print("Loaded all columns.")
|
@@ -2053,9 +2222,12 @@ def fload(fpath, kind=None, **kwargs):
|
|
2053
2222
|
print(f"An error occurred while loading the Parquet file: {e}")
|
2054
2223
|
df = None
|
2055
2224
|
|
2056
|
-
return df
|
2225
|
+
return df
|
2057
2226
|
|
2058
2227
|
def load_ipynb(fpath, **kwargs):
|
2228
|
+
import nbformat
|
2229
|
+
from nbconvert import MarkdownExporter
|
2230
|
+
|
2059
2231
|
as_version = kwargs.get("as_version", 4)
|
2060
2232
|
with open(fpath, "r") as file:
|
2061
2233
|
nb = nbformat.read(file, as_version=as_version)
|
@@ -2085,6 +2257,8 @@ def fload(fpath, kind=None, **kwargs):
|
|
2085
2257
|
If page is an integer, it returns the text of the specified page number.
|
2086
2258
|
If the specified page is not found, it returns the string "Page is not found".
|
2087
2259
|
"""
|
2260
|
+
from PyPDF2 import PdfReader
|
2261
|
+
|
2088
2262
|
text_dict = {}
|
2089
2263
|
with open(fpath, "rb") as file:
|
2090
2264
|
pdf_reader = PdfReader(file)
|
@@ -2114,6 +2288,8 @@ def fload(fpath, kind=None, **kwargs):
|
|
2114
2288
|
return text_dict.get(int(page), "Page is not found")
|
2115
2289
|
|
2116
2290
|
def load_docx(fpath):
|
2291
|
+
from docx import Document
|
2292
|
+
|
2117
2293
|
doc = Document(fpath)
|
2118
2294
|
content = [para.text for para in doc.paragraphs]
|
2119
2295
|
return content
|
@@ -2123,21 +2299,55 @@ def fload(fpath, kind=None, **kwargs):
|
|
2123
2299
|
kind = kind.lower()
|
2124
2300
|
kind = kind.lstrip(".").lower()
|
2125
2301
|
img_types = [
|
2126
|
-
"bmp",
|
2127
|
-
"
|
2302
|
+
"bmp",
|
2303
|
+
"eps",
|
2304
|
+
"gif",
|
2305
|
+
"png",
|
2306
|
+
"jpg",
|
2307
|
+
"jpeg",
|
2308
|
+
"jpeg2000",
|
2309
|
+
"tiff",
|
2310
|
+
"tif",
|
2311
|
+
"icns",
|
2312
|
+
"ico",
|
2313
|
+
"im",
|
2314
|
+
"msp",
|
2315
|
+
"pcx",
|
2316
|
+
"ppm",
|
2317
|
+
"sgi",
|
2318
|
+
"spider",
|
2319
|
+
"tga",
|
2320
|
+
"webp",
|
2128
2321
|
]
|
2129
2322
|
doc_types = [
|
2130
|
-
"docx",
|
2131
|
-
"
|
2132
|
-
"
|
2133
|
-
"
|
2323
|
+
"docx",
|
2324
|
+
"pdf",
|
2325
|
+
"txt",
|
2326
|
+
"csv",
|
2327
|
+
"xlsx",
|
2328
|
+
"tsv",
|
2329
|
+
"parquet",
|
2330
|
+
"snappy",
|
2331
|
+
"md",
|
2332
|
+
"html",
|
2333
|
+
"json",
|
2334
|
+
"yaml",
|
2335
|
+
"xml",
|
2134
2336
|
"ipynb",
|
2135
|
-
"mtx"
|
2337
|
+
"mtx",
|
2136
2338
|
]
|
2137
2339
|
zip_types = [
|
2138
|
-
"gz",
|
2139
|
-
"
|
2140
|
-
"
|
2340
|
+
"gz",
|
2341
|
+
"zip",
|
2342
|
+
"7z",
|
2343
|
+
"rar",
|
2344
|
+
"tgz",
|
2345
|
+
"tar",
|
2346
|
+
"tar.gz",
|
2347
|
+
"tar.bz2",
|
2348
|
+
"bz2",
|
2349
|
+
"xz",
|
2350
|
+
"gzip",
|
2141
2351
|
]
|
2142
2352
|
other_types = ["fcs"]
|
2143
2353
|
supported_types = [*doc_types, *img_types, *zip_types, *other_types]
|
@@ -2173,9 +2383,17 @@ def fload(fpath, kind=None, **kwargs):
|
|
2173
2383
|
return load_yaml(fpath)
|
2174
2384
|
elif kind == "xml":
|
2175
2385
|
return load_xml(fpath)
|
2176
|
-
elif kind in ["csv","tsv"]:
|
2386
|
+
elif kind in ["csv", "tsv"]:
|
2387
|
+
verbose = kwargs.pop("verbose", False)
|
2388
|
+
if run_once_within():
|
2389
|
+
use_pd("read_csv")
|
2177
2390
|
content = load_csv(fpath, **kwargs)
|
2178
2391
|
return content
|
2392
|
+
elif kind == "pkl":
|
2393
|
+
verbose = kwargs.pop("verbose", False)
|
2394
|
+
if run_once_within():
|
2395
|
+
use_pd("read_pickle")
|
2396
|
+
return pd.read_pickle(fpath, **kwargs)
|
2179
2397
|
elif kind in ["ods", "ods", "odt"]:
|
2180
2398
|
engine = kwargs.get("engine", "odf")
|
2181
2399
|
kwargs.pop("engine", None)
|
@@ -2184,35 +2402,54 @@ def fload(fpath, kind=None, **kwargs):
|
|
2184
2402
|
engine = kwargs.get("engine", "xlrd")
|
2185
2403
|
kwargs.pop("engine", None)
|
2186
2404
|
content = load_excel(fpath, engine=engine, **kwargs)
|
2187
|
-
print(f"shape: {content.shape}")
|
2188
|
-
display(content.head(3))
|
2405
|
+
print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
|
2406
|
+
display(content.head(3)) if isinstance(content, pd.DataFrame) else None
|
2189
2407
|
return content
|
2190
2408
|
elif kind == "xlsx":
|
2191
2409
|
content = load_excel(fpath, **kwargs)
|
2192
|
-
display(content.head(3))
|
2193
|
-
print(f"shape: {content.shape}")
|
2410
|
+
display(content.head(3)) if isinstance(content, pd.DataFrame) else None
|
2411
|
+
print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
|
2194
2412
|
return content
|
2195
|
-
elif kind==
|
2413
|
+
elif kind == "mtx":
|
2196
2414
|
from scipy.io import mmread
|
2197
|
-
|
2198
|
-
|
2199
|
-
|
2415
|
+
|
2416
|
+
dat_mtx = mmread(fpath)
|
2417
|
+
content = pd.DataFrame.sparse.from_spmatrix(dat_mtx, **kwargs)
|
2418
|
+
display(content.head(3)) if isinstance(content, pd.DataFrame) else None
|
2200
2419
|
print(f"shape: {content.shape}")
|
2201
2420
|
return content
|
2202
2421
|
elif kind == "ipynb":
|
2203
2422
|
return load_ipynb(fpath, **kwargs)
|
2204
|
-
elif kind in [
|
2205
|
-
|
2423
|
+
elif kind in ["parquet", "snappy"]:
|
2424
|
+
verbose = kwargs.pop("verbose", False)
|
2425
|
+
if run_once_within():
|
2426
|
+
use_pd("read_parquet")
|
2427
|
+
return load_parquet(fpath, **kwargs)
|
2428
|
+
elif kind == "feather":
|
2429
|
+
verbose = kwargs.pop("verbose", False)
|
2430
|
+
if run_once_within():
|
2431
|
+
use_pd("read_feather")
|
2432
|
+
content = pd.read_feather(fpath, **kwargs)
|
2433
|
+
return content
|
2434
|
+
elif kind == "h5":
|
2435
|
+
content = pd.read_hdf(fpath, **kwargs)
|
2436
|
+
return content
|
2437
|
+
elif kind == "pkl":
|
2438
|
+
content = pd.read_pickle(fpath, **kwargs)
|
2439
|
+
return content
|
2206
2440
|
elif kind == "pdf":
|
2207
2441
|
# print('usage:load_pdf(fpath, page="all", verbose=False)')
|
2208
2442
|
return load_pdf(fpath, **kwargs)
|
2209
2443
|
elif kind.lower() in img_types:
|
2210
2444
|
print(f'Image ".{kind}" is loaded.')
|
2211
2445
|
return load_img(fpath)
|
2212
|
-
elif kind=="gz" and fpath.endswith(".soft.gz"):
|
2446
|
+
elif kind == "gz" and fpath.endswith(".soft.gz"):
|
2213
2447
|
import GEOparse
|
2448
|
+
|
2214
2449
|
return GEOparse.get_GEO(filepath=fpath)
|
2215
2450
|
elif kind.lower() in zip_types:
|
2451
|
+
from pprint import pp
|
2452
|
+
|
2216
2453
|
keep = kwargs.get("keep", False)
|
2217
2454
|
fpath_unzip = unzip(fpath)
|
2218
2455
|
if os.path.isdir(fpath_unzip):
|
@@ -2247,6 +2484,9 @@ def fload(fpath, kind=None, **kwargs):
|
|
2247
2484
|
meta, data = fcsparser.parse(fpath, reformat_meta=True)
|
2248
2485
|
return meta, data
|
2249
2486
|
|
2487
|
+
elif kind == "mplstyle":
|
2488
|
+
return read_mplstyle(fpath)
|
2489
|
+
|
2250
2490
|
else:
|
2251
2491
|
print("direct reading...")
|
2252
2492
|
try:
|
@@ -2288,7 +2528,7 @@ def fupdate(fpath, content=None, how="head"):
|
|
2288
2528
|
"""
|
2289
2529
|
Update a file by adding new content at the top and moving the old content to the bottom.
|
2290
2530
|
If the file is a JSON file, merge the new content with the old content.
|
2291
|
-
|
2531
|
+
|
2292
2532
|
Parameters
|
2293
2533
|
----------
|
2294
2534
|
fpath : str
|
@@ -2296,7 +2536,7 @@ def fupdate(fpath, content=None, how="head"):
|
|
2296
2536
|
content : str or dict, optional
|
2297
2537
|
The new content to add at the top of the file (for text) or merge (for JSON).
|
2298
2538
|
If not provided, the function will not add any new content.
|
2299
|
-
|
2539
|
+
|
2300
2540
|
Notes
|
2301
2541
|
-----
|
2302
2542
|
- If the file at `fpath` does not exist, it will be created.
|
@@ -2305,14 +2545,20 @@ def fupdate(fpath, content=None, how="head"):
|
|
2305
2545
|
"""
|
2306
2546
|
content = content or ""
|
2307
2547
|
file_ext = os.path.splitext(fpath)[1]
|
2308
|
-
how_s=["head", "tail","start","end","beginning", "stop",
|
2548
|
+
how_s = ["head", "tail", "start", "end", "beginning", "stop", "last", "before"]
|
2309
2549
|
how = strcmp(how, how_s)[0]
|
2310
2550
|
print(how)
|
2311
|
-
add_where =
|
2551
|
+
add_where = "head" if how in ["head", "start", "beginning", "before"] else "tail"
|
2312
2552
|
if "json" in file_ext.lower():
|
2313
|
-
old_content=fload(fpath,kind=
|
2314
|
-
updated_content =
|
2315
|
-
|
2553
|
+
old_content = fload(fpath, kind="json") if os.path.exists(fpath) else {}
|
2554
|
+
updated_content = (
|
2555
|
+
{**content, **old_content}
|
2556
|
+
if add_where == "head"
|
2557
|
+
else (
|
2558
|
+
{**old_content, **content} if isinstance(content, dict) else old_content
|
2559
|
+
)
|
2560
|
+
)
|
2561
|
+
fsave(fpath, updated_content)
|
2316
2562
|
else:
|
2317
2563
|
# Handle text file
|
2318
2564
|
if os.path.exists(fpath):
|
@@ -2323,7 +2569,7 @@ def fupdate(fpath, content=None, how="head"):
|
|
2323
2569
|
|
2324
2570
|
# Write new content at the top followed by old content
|
2325
2571
|
with open(fpath, "w") as file:
|
2326
|
-
if add_where=="head":
|
2572
|
+
if add_where == "head":
|
2327
2573
|
file.write(content + "\n")
|
2328
2574
|
file.write(old_content)
|
2329
2575
|
else:
|
@@ -2359,6 +2605,9 @@ def filter_kwargs(kws, valid_kwargs):
|
|
2359
2605
|
return kwargs_filtered
|
2360
2606
|
|
2361
2607
|
|
2608
|
+
str_space_speed = 'sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'
|
2609
|
+
|
2610
|
+
|
2362
2611
|
def fsave(
|
2363
2612
|
fpath,
|
2364
2613
|
content,
|
@@ -2393,6 +2642,8 @@ def fsave(
|
|
2393
2642
|
fappend(fpath, content=content)
|
2394
2643
|
|
2395
2644
|
def save_docx(fpath, content, font_name, font_size, spacing):
|
2645
|
+
import docx
|
2646
|
+
|
2396
2647
|
if isinstance(content, str):
|
2397
2648
|
content = content.split(". ")
|
2398
2649
|
doc = docx.Document()
|
@@ -2420,6 +2671,8 @@ def fsave(
|
|
2420
2671
|
save_content(fpath, html_content, mode)
|
2421
2672
|
|
2422
2673
|
def save_pdf(fpath, content, font_name, font_size):
|
2674
|
+
from fpdf import FPDF
|
2675
|
+
|
2423
2676
|
pdf = FPDF()
|
2424
2677
|
pdf.add_page()
|
2425
2678
|
# pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
|
@@ -2432,8 +2685,8 @@ def fsave(
|
|
2432
2685
|
def save_csv(fpath, data, **kwargs):
|
2433
2686
|
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
|
2434
2687
|
|
2435
|
-
verbose=kwargs.pop("verbose",False)
|
2436
|
-
if
|
2688
|
+
verbose = kwargs.pop("verbose", False)
|
2689
|
+
if run_once_within():
|
2437
2690
|
use_pd("to_csv", verbose=verbose)
|
2438
2691
|
kwargs_csv = dict(
|
2439
2692
|
path_or_buf=None,
|
@@ -2463,18 +2716,30 @@ def fsave(
|
|
2463
2716
|
df.to_csv(fpath, **kwargs_valid)
|
2464
2717
|
|
2465
2718
|
def save_xlsx(fpath, data, **kwargs):
|
2466
|
-
verbose=kwargs.pop("verbose",False)
|
2719
|
+
verbose = kwargs.pop("verbose", False)
|
2467
2720
|
sheet_name = kwargs.pop("sheet_name", "Sheet1")
|
2468
|
-
if
|
2721
|
+
if run_once_within():
|
2469
2722
|
use_pd("to_excel", verbose=verbose)
|
2470
2723
|
if any(kwargs):
|
2471
2724
|
format_excel(df=data, filename=fpath, **kwargs)
|
2472
2725
|
else:
|
2473
2726
|
# Remove non-relevant kwargs
|
2474
2727
|
irrelevant_keys = [
|
2475
|
-
|
2476
|
-
|
2477
|
-
|
2728
|
+
"format",
|
2729
|
+
"usage",
|
2730
|
+
"cell",
|
2731
|
+
"width",
|
2732
|
+
"height",
|
2733
|
+
"height_max",
|
2734
|
+
"merge",
|
2735
|
+
"shade",
|
2736
|
+
"comment",
|
2737
|
+
"link",
|
2738
|
+
"protect",
|
2739
|
+
"number_format",
|
2740
|
+
"conditional_format",
|
2741
|
+
"index_default",
|
2742
|
+
]
|
2478
2743
|
for key in irrelevant_keys:
|
2479
2744
|
kwargs.pop(key, None)
|
2480
2745
|
|
@@ -2482,15 +2747,18 @@ def fsave(
|
|
2482
2747
|
# Check if the file exists, then append the sheet, otherwise create a new file
|
2483
2748
|
try:
|
2484
2749
|
# Use ExcelWriter with append mode if the file exists
|
2485
|
-
with pd.ExcelWriter(
|
2750
|
+
with pd.ExcelWriter(
|
2751
|
+
fpath, engine="openpyxl", mode="a", if_sheet_exists="new"
|
2752
|
+
) as writer:
|
2486
2753
|
df.to_excel(writer, sheet_name=sheet_name, index=False, **kwargs)
|
2487
2754
|
except FileNotFoundError:
|
2488
2755
|
# If file doesn't exist, create a new one
|
2489
2756
|
df.to_excel(fpath, sheet_name=sheet_name, index=False, **kwargs)
|
2490
2757
|
|
2491
|
-
|
2492
2758
|
def save_ipynb(fpath, data, **kwargs):
|
2493
2759
|
# Split the content by code fences to distinguish between code and markdown
|
2760
|
+
import nbformat
|
2761
|
+
|
2494
2762
|
parts = data.split("```")
|
2495
2763
|
cells = []
|
2496
2764
|
|
@@ -2513,17 +2781,19 @@ def fsave(
|
|
2513
2781
|
# json.dump(data, file, **kwargs)
|
2514
2782
|
|
2515
2783
|
def save_json(fpath_fname, var_dict_or_df):
|
2784
|
+
import json
|
2785
|
+
|
2516
2786
|
def _convert_js(data):
|
2517
2787
|
if isinstance(data, pd.DataFrame):
|
2518
|
-
return data.to_dict(orient="list")
|
2788
|
+
return data.to_dict(orient="list")
|
2519
2789
|
elif isinstance(data, np.ndarray):
|
2520
2790
|
return data.tolist()
|
2521
2791
|
elif isinstance(data, dict):
|
2522
2792
|
return {key: _convert_js(value) for key, value in data.items()}
|
2523
|
-
return data
|
2793
|
+
return data
|
2524
2794
|
|
2525
2795
|
serializable_data = _convert_js(var_dict_or_df)
|
2526
|
-
|
2796
|
+
|
2527
2797
|
# Save the serializable data to the JSON file
|
2528
2798
|
with open(fpath_fname, "w") as f_json:
|
2529
2799
|
json.dump(serializable_data, f_json, indent=4)
|
@@ -2534,10 +2804,14 @@ def fsave(
|
|
2534
2804
|
# # setss = jsonload("/.json")
|
2535
2805
|
|
2536
2806
|
def save_yaml(fpath, data, **kwargs):
|
2807
|
+
import yaml
|
2808
|
+
|
2537
2809
|
with open(fpath, "w") as file:
|
2538
2810
|
yaml.dump(data, file, **kwargs)
|
2539
2811
|
|
2540
2812
|
def save_xml(fpath, data):
|
2813
|
+
from lxml import etree
|
2814
|
+
|
2541
2815
|
root = etree.Element("root")
|
2542
2816
|
if isinstance(data, dict):
|
2543
2817
|
for key, val in data.items():
|
@@ -2548,24 +2822,37 @@ def fsave(
|
|
2548
2822
|
tree = etree.ElementTree(root)
|
2549
2823
|
tree.write(fpath, pretty_print=True, xml_declaration=True, encoding="UTF-8")
|
2550
2824
|
|
2551
|
-
def save_parquet(fpath:str, data:pd.DataFrame, **kwargs):
|
2552
|
-
engine = kwargs.pop(
|
2553
|
-
|
2825
|
+
def save_parquet(fpath: str, data: pd.DataFrame, **kwargs):
|
2826
|
+
engine = kwargs.pop(
|
2827
|
+
"engine", "auto"
|
2828
|
+
) # auto先试pyarrow, 不行就转为fastparquet, {‘auto’, ‘pyarrow’, ‘fastparquet’}
|
2829
|
+
compression = kwargs.pop(
|
2830
|
+
"compression", None
|
2831
|
+
) # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
|
2554
2832
|
try:
|
2555
2833
|
# Attempt to save with "pyarrow" if engine is set to "auto"
|
2556
|
-
|
2557
|
-
|
2834
|
+
data.to_parquet(fpath, engine=engine, compression=compression, **kwargs)
|
2835
|
+
print(
|
2836
|
+
f"DataFrame successfully saved to {fpath} with engine '{engine}' and {compression} compression."
|
2837
|
+
)
|
2558
2838
|
except Exception as e:
|
2559
|
-
print(
|
2839
|
+
print(
|
2840
|
+
f"Error using with engine '{engine}' and {compression} compression: {e}"
|
2841
|
+
)
|
2560
2842
|
if "Sparse" in str(e):
|
2561
2843
|
try:
|
2562
2844
|
# Handle sparse data by converting columns to dense
|
2563
2845
|
print("Attempting to convert sparse columns to dense format...")
|
2564
|
-
data = data.apply(
|
2565
|
-
|
2846
|
+
data = data.apply(
|
2847
|
+
lambda x: (
|
2848
|
+
x.sparse.to_dense() if pd.api.types.is_sparse(x) else x
|
2849
|
+
)
|
2850
|
+
)
|
2851
|
+
save_parquet(fpath, data=data, **kwargs)
|
2566
2852
|
except Exception as last_e:
|
2567
|
-
print(
|
2568
|
-
|
2853
|
+
print(
|
2854
|
+
f"After converted sparse columns to dense format, Error using with engine '{engine}' and {compression} compression: {last_e}"
|
2855
|
+
)
|
2569
2856
|
|
2570
2857
|
if kind is None:
|
2571
2858
|
_, kind = os.path.splitext(fpath)
|
@@ -2612,16 +2899,95 @@ def fsave(
         save_yaml(fpath, content, **kwargs)
     elif kind == "ipynb":
         save_ipynb(fpath, content, **kwargs)
-    elif kind.lower() in ["parquet","pq","big","par"]:
-
+    elif kind.lower() in ["parquet", "pq", "big", "par"]:
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_parquet")
+            return None
+        compression = kwargs.pop(
+            "compression", None
+        )  # Use None for no compression. Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’
         # fix the fpath ends
-
-
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "parquet")
         if compression is not None:
             if not fpath.endswith(compression):
-                fpath=fpath+f".{compression}"
-        save_parquet(fpath=fpath, data=content,compression=compression
+                fpath = fpath + f".{compression}"
+        save_parquet(fpath=fpath, data=content, compression=compression, **kwargs)
+    elif kind.lower() in ["pkl", "pk", "pickle", "pick"]:
+        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+        # Pickle is convenient if you want to preserve exact Python object types.
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_pickle")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "pkl")
+        compression = kwargs.pop("compression", None)
+        if compression is not None:
+            if not fpath.endswith(compression["method"]):
+                fpath = fpath + f".{compression['method']}"
+        if isinstance(content, pd.DataFrame):
+            content.to_pickle(fpath, **kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content = pd.DataFrame(content)
+                content.to_pickle(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["fea", "feather", "ft", "fe", "feat", "fether"]:
+        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+        # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_feather")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "feather")
+        if isinstance(content, pd.DataFrame):
+            content.to_feather(fpath, **kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content = pd.DataFrame(content)
+                content.to_feather(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["hd", "hdf", "h", "h5"]:
+        # particularly useful for large datasets and can handle complex data structures
+        verbose = kwargs.pop("verbose", False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_hdf")
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath + _ext.replace(kind, "h5")
+        compression = kwargs.pop("compression", None)
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath = fpath + f".{compression}"
+        if isinstance(content, pd.DataFrame):
+            content.to_hdf(fpath, key="content", **kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content = pd.DataFrame(content)
+                content.to_hdf(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
     else:
+        from . import netfinder
+
         try:
             netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         except:
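The parquet, pickle, feather and HDF5 branches above all follow the same pattern: normalise the extension, append the compression method to the file name, then delegate to the matching pandas writer. A minimal sketch of that dispatch with plain pandas (file names and the sample frame are illustrative only; pyarrow and PyTables are assumed to be installed):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # parquet: the compression method also becomes part of the file suffix in fsave
    df.to_parquet("data.parquet.gzip", compression="gzip")

    # feather: Arrow-based, fast for analytics workloads
    df.to_feather("data.feather")

    # pickle: keeps exact Python object types; compression is passed as a dict
    df.to_pickle("data.pkl.gz", compression={"method": "gzip"})

    # HDF5: needs a key for the stored object, mirroring key="content" above
    df.to_hdf("data.h5", key="content")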
@@ -2744,6 +3110,8 @@ def isa(content, kind):
     elif "color" in kind.lower():  # file
         return is_str_color(content)
     elif "html" in kind.lower():
+        import re
+
         if content is None or not isinstance(content, str):
             return False
         # Remove leading and trailing whitespace
@@ -2793,8 +3161,8 @@ def listdir(
     verbose=True,
 ):
     if kind is None:
-        ls=os.listdir(rootdir)
-        ls = [f for f in ls if not f.startswith(
+        ls = os.listdir(rootdir)
+        ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
         print(ls)
         df_all = pd.DataFrame(
             {
@@ -2825,7 +3193,7 @@ def listdir(
 
     if os.path.isdir(rootdir):
         ls = os.listdir(rootdir)
-        ls = [f for f in ls if not f.startswith(
+        ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
         fd = [".fd", ".fld", ".fol", ".fd", ".folder"]
         i = 0
         f = {
@@ -2903,6 +3271,8 @@ def listdir(
         display(f.head())
         return f
     else:
+        from box import Box
+
         if "l" in orient.lower():  # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -2943,13 +3313,10 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-
-
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
     if os.path.isdir(fpath):
-        fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
-        print(fpath)
+        fpath = fpath + f_slash if not fpath.endswith(f_slash) else fpath
        return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator
 
@@ -2979,27 +3346,27 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     - str: The path of the created directory or an error message.
     """
 
-    rootdir = []
+    rootdir = []
     if chdir is None:
         return mkdir_nest(pardir)
     if isinstance(chdir, str):
-        chdir = [chdir]
+        chdir = [chdir]
     chdir = list(set(chdir))
     if isinstance(pardir, str):  # Dir_parents should be 'str' type
-        pardir = os.path.normpath(pardir)
+        pardir = os.path.normpath(pardir)
     if "mac" in get_os().lower() or "lin" in get_os().lower():
         stype = "/"
     elif "win" in get_os().lower():
         stype = "\\"
     else:
         stype = "/"
-
+
     if os.path.isdir(pardir):
         os.chdir(pardir)  # Set current path
         # Check if subdirectories are not empty
         if chdir:
-            chdir.sort()
-            for folder in chdir:
+            chdir.sort()
+            for folder in chdir:
                 child_tmp = os.path.join(pardir, folder)
                 if not os.path.isdir(child_tmp):
                     os.mkdir("./" + folder)
@@ -3019,8 +3386,8 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     # Dir is the main output, if only one dir, then str type is inconvenient
     if len(rootdir) == 1:
         rootdir = rootdir[0]
-        rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
-
+        rootdir = rootdir + stype if not rootdir.endswith(stype) else rootdir
+
     return rootdir
 
 
@@ -3032,6 +3399,9 @@ def split_path(fpath):
 
 
 def figsave(*args, dpi=300):
+    import matplotlib.pyplot as plt
+    from PIL import Image
+
     dir_save = None
     fname = None
     img = None
@@ -3046,14 +3416,14 @@ def figsave(*args, dpi=300):
             img = arg  # Store the PIL image if provided
 
     if dir_save is None:
-        dir_save="./"
-
+        dir_save = "./"
+
     # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
-
+
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
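figsave splits dir_save into a parent folder and a file name, creates the folder, then writes the figure at the requested dpi. A rough equivalent with plain matplotlib (folder and file names are made up for the example):

    import os
    import matplotlib
    matplotlib.use("Agg")  # headless backend, so the sketch runs without a display
    import matplotlib.pyplot as plt

    plt.plot([0, 1, 2], [0, 1, 4])
    os.makedirs("figs", exist_ok=True)  # figsave calls mkdir() on the parent folder
    plt.savefig(os.path.join("figs", "demo.pdf"), dpi=300, bbox_inches="tight")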
@@ -3139,7 +3509,9 @@ def figsave(*args, dpi=300):
 
 def is_str_color(s):
     # Regular expression pattern for hexadecimal color codes
-    if isinstance(s,str):
+    if isinstance(s, str):
+        import re
+
         color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
         return re.match(color_code_pattern, s) is not None
     else:
@@ -3166,6 +3538,8 @@ def isnum(s):
 
 
 def is_image(fpath):
+    import mimetypes
+
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and mime_type.startswith("image"):
         return True
@@ -3174,6 +3548,8 @@ def is_image(fpath):
 
 
 def is_document(fpath):
+    import mimetypes
+
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and (
         mime_type.startswith("text/")
@@ -3194,6 +3570,8 @@ def is_document(fpath):
 
 
 def is_zip(fpath):
+    import mimetypes
+
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type == "application/zip":
         return True
@@ -3202,6 +3580,8 @@ def is_zip(fpath):
 
 
 def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
+    import matplotlib.pyplot as plt
+
     if ax is None:
         ax = plt.gca()
     for loc, spine in ax.spines.items():
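is_image, is_document and is_zip now import mimetypes inside the function and branch on the guessed MIME type. A self-contained sketch of that check (looks_like_image is a stand-in name for the example, not part of py2ls):

    import mimetypes

    def looks_like_image(fpath):
        # guess_type only looks at the extension, not at the file contents
        mime_type, _ = mimetypes.guess_type(fpath)
        return bool(mime_type and mime_type.startswith("image"))

    print(looks_like_image("photo.png"))  # True
    print(looks_like_image("notes.txt"))  # False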
@@ -3290,6 +3670,7 @@ def apply_filter(img, *args):
     Returns:
         PIL.Image: The filtered image.
     """
+    from PIL import ImageFilter
 
     def correct_filter_name(filter_name):
         if "bl" in filter_name.lower() and "box" not in filter_name.lower():
@@ -3532,6 +3913,9 @@ def imgsets(img, **kwargs):
         avg_contrast_factor = sum(contrast_factors) / num_channels
         return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}
 
+    import matplotlib.pyplot as plt
+    from PIL import ImageEnhance, ImageOps
+
     # Load image if input is a file path
     if isinstance(img, str):
         img = load_img(img)
@@ -3595,6 +3979,8 @@ def imgsets(img, **kwargs):
         elif "pad" in k.lower():
             img_update = ImageOps.pad(img_update, size=value)
         elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
+            from rembg import remove, new_session
+
             if isinstance(value, bool):
                 session = new_session("isnet-general-use")
                 img_update = remove(img_update, session=session)
@@ -3633,6 +4019,8 @@ def imgsets(img, **kwargs):
             else:
                 img_update = remove(img_update)
         elif "bg" in k.lower() and "color" in k.lower():
+            from rembg import remove
+
             if isinstance(value, list):
                 value = tuple(value)
             if isinstance(value, tuple):  # replace the background color
@@ -3664,6 +4052,9 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
     Args:
         dir_img_list (list): List of the Directory containing the images.
     """
+    import matplotlib.pyplot as plt
+    from PIL import Image
+
     num_images = len(dir_img_list)
     if not kind.startswith("."):
         kind = "." + kind
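The background-removal branch of imgsets defers the rembg import and builds an "isnet-general-use" session when the option is passed as a bool. Roughly the same call outside of imgsets looks like this (input and output paths are hypothetical, and rembg downloads the model on first use):

    from PIL import Image
    from rembg import new_session, remove

    img = Image.open("input.png")
    session = new_session("isnet-general-use")  # same model name used above
    cut_out = remove(img, session=session)
    cut_out.save("output.png")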
@@ -3700,28 +4091,14 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
 # usage:
 # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
 # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
-def read_mplstyle(style_file):
-    # Load the style file
-    plt.style.use(style_file)
-
-    # Get the current style properties
-    style_dict = plt.rcParams
-
-    # Convert to dictionary
-    style_dict = dict(style_dict)
-    # Print the style dictionary
-    for i, j in style_dict.items():
-        print(f"\n{i}::::{j}")
-    return style_dict
-
-
-# #example usage:
-# style_file = "/ std-colors.mplstyle"
-# style_dict = read_mplstyle(style_file)
 
 
 # search and fine the director of the libary, which installed at local
 def dir_lib(lib_oi):
+    """
+    # example usage:
+    # dir_lib("seaborn")
+    """
     import site
 
     # Get the site-packages directory
@@ -3740,22 +4117,6 @@ def dir_lib(lib_oi):
     return dir_list
 
 
-# example usage:
-# dir_lib("seaborn")
-
-"""
-# n = 7
-# clist = get_color(n, cmap="auto", how="linspace") # get_color(100)
-# plt.figure(figsize=[8, 5], dpi=100)
-# x = np.linspace(0, 2 * np.pi, 50) * 100
-# y = np.sin(x)
-# for i in range(1, n + 1):
-# plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
-# plt.legend()
-# plt.ylim(-2, 20)
-# figsets(plt.gca(), {"style": "whitegrid"}) """
-
-
 class FileInfo:
     def __init__(
         self,
@@ -3832,6 +4193,8 @@ class FileInfo:
 
 
 def finfo(fpath):
+    import time
+
     fname, fmt = os.path.splitext(fpath)
     dir_par = os.path.dirname(fpath) + "/"
     data = {
@@ -3846,6 +4209,8 @@ def finfo(fpath):
     }
     extra_info = {}
     if data["kind"] == ".pdf":
+        from pdf2image import pdfinfo_from_path
+
         extra_info = pdfinfo_from_path(fpath)
 
     return FileInfo(
@@ -3862,16 +4227,6 @@ def finfo(fpath):
 
 
 # ! format excel file
-import pandas as pd
-from datetime import datetime
-from openpyxl import load_workbook
-from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
-from openpyxl.utils import get_column_letter
-from openpyxl.worksheet.datavalidation import DataValidation
-from openpyxl.comments import Comment
-from openpyxl.formatting.rule import ColorScaleRule
-
-
 def hex2argb(hex_color):
     """
     Convert a hex color code to aARGB format required by openpyxl.
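The block removed above is typical of this release: heavy module-level imports (openpyxl, datetime helpers) are pushed down into the functions that actually need them, so importing py2ls.ips stays fast. A minimal sketch of the deferred-import pattern under that assumption (the helper name is made up for illustration):

    def export_with_lazy_import(df, filename):
        # openpyxl is imported only when the function is called,
        # not when the module is first loaded
        from openpyxl.utils import get_column_letter

        df.to_excel(filename, index=False)
        return get_column_letter(df.shape[1])  # letter of the last used column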
@@ -3907,337 +4262,6 @@ def hex2argb(hex_color):
|
|
3907
4262
|
)
|
3908
4263
|
|
3909
4264
|
|
3910
|
-
def convert_indices_to_range(row_slice, col_slice):
|
3911
|
-
"""Convert numerical row and column slices to Excel-style range strings."""
|
3912
|
-
start_row = row_slice.start + 1
|
3913
|
-
end_row = row_slice.stop if row_slice.stop is not None else None
|
3914
|
-
start_col = col_slice.start + 1
|
3915
|
-
end_col = col_slice.stop if col_slice.stop is not None else None
|
3916
|
-
|
3917
|
-
start_col_letter = get_column_letter(start_col)
|
3918
|
-
end_col_letter = get_column_letter(end_col) if end_col else None
|
3919
|
-
return (
|
3920
|
-
f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
|
3921
|
-
if end_col_letter
|
3922
|
-
else f"{start_col_letter}{start_row}"
|
3923
|
-
)
|
3924
|
-
|
3925
|
-
|
3926
|
-
def apply_format(ws, cell, cell_range):
|
3927
|
-
"""Apply cell formatting to a specified range."""
|
3928
|
-
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
3929
|
-
kws_cell = ["font", "fill", "alignment", "border"]
|
3930
|
-
for K, _ in cell.items():
|
3931
|
-
if strcmp(K, kws_cell)[0] == "font":
|
3932
|
-
#! font
|
3933
|
-
font_color = "000000"
|
3934
|
-
font_name = "Arial"
|
3935
|
-
font_underline = "none"
|
3936
|
-
font_size = 14
|
3937
|
-
font_bold = False
|
3938
|
-
font_strike = False
|
3939
|
-
font_italic = False
|
3940
|
-
kws_font = [
|
3941
|
-
"name",
|
3942
|
-
"size",
|
3943
|
-
"bold",
|
3944
|
-
"underline",
|
3945
|
-
"color",
|
3946
|
-
"strike",
|
3947
|
-
"italic",
|
3948
|
-
]
|
3949
|
-
for k_, v_ in cell.get(K, {}).items():
|
3950
|
-
if strcmp(k_, kws_font)[0] == "name":
|
3951
|
-
font_name = v_
|
3952
|
-
elif strcmp(k_, kws_font)[0] == "size":
|
3953
|
-
font_size = v_
|
3954
|
-
elif strcmp(k_, kws_font)[0] == "bold":
|
3955
|
-
font_bold = v_
|
3956
|
-
elif strcmp(k_, kws_font)[0] == "underline":
|
3957
|
-
font_underline = strcmp(v_, ["none", "single", "double"])[0]
|
3958
|
-
elif strcmp(k_, kws_font)[0] == "color":
|
3959
|
-
font_color = hex2argb(v_)
|
3960
|
-
elif strcmp(k_, kws_font)[0] == "strike":
|
3961
|
-
font_strike = v_
|
3962
|
-
elif strcmp(k_, kws_font)[0] == "italic":
|
3963
|
-
font_italic = v_
|
3964
|
-
|
3965
|
-
cell_font = Font(
|
3966
|
-
name=font_name,
|
3967
|
-
size=font_size,
|
3968
|
-
bold=font_bold,
|
3969
|
-
italic=font_italic,
|
3970
|
-
underline=font_underline,
|
3971
|
-
strike=font_strike,
|
3972
|
-
color=font_color,
|
3973
|
-
)
|
3974
|
-
|
3975
|
-
if strcmp(K, kws_cell)[0] == "fill":
|
3976
|
-
#! fill
|
3977
|
-
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
3978
|
-
kws_fill_type = [
|
3979
|
-
"darkVertical",
|
3980
|
-
"lightDown",
|
3981
|
-
"lightGrid",
|
3982
|
-
"solid",
|
3983
|
-
"darkDown",
|
3984
|
-
"lightGray",
|
3985
|
-
"lightUp",
|
3986
|
-
"gray0625",
|
3987
|
-
"lightVertical",
|
3988
|
-
"lightHorizontal",
|
3989
|
-
"darkHorizontal",
|
3990
|
-
"gray125",
|
3991
|
-
"darkUp",
|
3992
|
-
"mediumGray",
|
3993
|
-
"darkTrellis",
|
3994
|
-
"darkGray",
|
3995
|
-
"lightTrellis",
|
3996
|
-
"darkGrid",
|
3997
|
-
]
|
3998
|
-
start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid" # default
|
3999
|
-
for k, v in cell.get(K, {}).items():
|
4000
|
-
if strcmp(k, kws_fill)[0] == "color":
|
4001
|
-
start_color, end_color = hex2argb(v), hex2argb(v)
|
4002
|
-
break
|
4003
|
-
for k, v in cell.get(K, {}).items():
|
4004
|
-
if strcmp(k, kws_fill)[0] == "start_color":
|
4005
|
-
start_color = hex2argb(v)
|
4006
|
-
elif strcmp(k, kws_fill)[0] == "end_color":
|
4007
|
-
end_color = hex2argb(v)
|
4008
|
-
elif strcmp(k, kws_fill)[0] == "fill_type":
|
4009
|
-
fill_type = strcmp(v, kws_fill_type)[0]
|
4010
|
-
cell_fill = PatternFill(
|
4011
|
-
start_color=start_color,
|
4012
|
-
end_color=end_color,
|
4013
|
-
fill_type=fill_type,
|
4014
|
-
)
|
4015
|
-
|
4016
|
-
if strcmp(K, kws_cell)[0] == "alignment":
|
4017
|
-
#! alignment
|
4018
|
-
# default
|
4019
|
-
align_horizontal = "general"
|
4020
|
-
align_vertical = "center"
|
4021
|
-
align_rot = 0
|
4022
|
-
align_wrap = False
|
4023
|
-
align_shrink = False
|
4024
|
-
align_indent = 0
|
4025
|
-
kws_align = [
|
4026
|
-
"horizontal",
|
4027
|
-
"ha",
|
4028
|
-
"vertical",
|
4029
|
-
"va",
|
4030
|
-
"text_rotation",
|
4031
|
-
"rotat",
|
4032
|
-
"rot",
|
4033
|
-
"wrap_text",
|
4034
|
-
"wrap",
|
4035
|
-
"shrink_to_fit",
|
4036
|
-
"shrink",
|
4037
|
-
"indent",
|
4038
|
-
]
|
4039
|
-
for k, v in cell.get(K, {}).items():
|
4040
|
-
if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
|
4041
|
-
align_horizontal = strcmp(
|
4042
|
-
v, ["general", "left", "right", "center"]
|
4043
|
-
)[0]
|
4044
|
-
elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
|
4045
|
-
align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
|
4046
|
-
elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
|
4047
|
-
align_rot = v
|
4048
|
-
elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
|
4049
|
-
align_wrap = v
|
4050
|
-
elif strcmp(k, kws_align)[0] in [
|
4051
|
-
"shrink_to_fit",
|
4052
|
-
"shrink",
|
4053
|
-
"wrap_text",
|
4054
|
-
"wrap",
|
4055
|
-
]:
|
4056
|
-
align_shrink = v
|
4057
|
-
elif strcmp(k, kws_align)[0] in ["indent"]:
|
4058
|
-
align_indent = v
|
4059
|
-
cell_alignment = Alignment(
|
4060
|
-
horizontal=align_horizontal,
|
4061
|
-
vertical=align_vertical,
|
4062
|
-
text_rotation=align_rot,
|
4063
|
-
wrap_text=align_wrap,
|
4064
|
-
shrink_to_fit=align_shrink,
|
4065
|
-
indent=align_indent,
|
4066
|
-
)
|
4067
|
-
|
4068
|
-
if strcmp(K, kws_cell)[0] == "border":
|
4069
|
-
#! border
|
4070
|
-
kws_border = [
|
4071
|
-
"color_left",
|
4072
|
-
"color_l",
|
4073
|
-
"color_right",
|
4074
|
-
"color_r",
|
4075
|
-
"color_top",
|
4076
|
-
"color_t",
|
4077
|
-
"color_bottom",
|
4078
|
-
"color_b",
|
4079
|
-
"color_diagonal",
|
4080
|
-
"color_d",
|
4081
|
-
"color_outline",
|
4082
|
-
"color_o",
|
4083
|
-
"color_vertical",
|
4084
|
-
"color_v",
|
4085
|
-
"color_horizontal",
|
4086
|
-
"color_h",
|
4087
|
-
"color",
|
4088
|
-
"style_left",
|
4089
|
-
"style_l",
|
4090
|
-
"style_right",
|
4091
|
-
"style_r",
|
4092
|
-
"style_top",
|
4093
|
-
"style_t",
|
4094
|
-
"style_bottom",
|
4095
|
-
"style_b",
|
4096
|
-
"style_diagonal",
|
4097
|
-
"style_d",
|
4098
|
-
"style_outline",
|
4099
|
-
"style_o",
|
4100
|
-
"style_vertical",
|
4101
|
-
"style_v",
|
4102
|
-
"style_horizontal",
|
4103
|
-
"style_h",
|
4104
|
-
"style",
|
4105
|
-
]
|
4106
|
-
# * border color
|
4107
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4108
|
-
"FF000000",
|
4109
|
-
"FF000000",
|
4110
|
-
"FF000000",
|
4111
|
-
"FF000000",
|
4112
|
-
)
|
4113
|
-
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4114
|
-
"FF000000",
|
4115
|
-
"FF000000",
|
4116
|
-
"FF000000",
|
4117
|
-
"FF000000",
|
4118
|
-
)
|
4119
|
-
# get colors config
|
4120
|
-
for k, v in cell.get(K, {}).items():
|
4121
|
-
if strcmp(k, kws_border)[0] in ["color"]:
|
4122
|
-
border_color_all = hex2argb(v)
|
4123
|
-
# 如果设置了color,表示其它的所有的都设置成为一样的
|
4124
|
-
# 然后再才开始自己定义其它的color
|
4125
|
-
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4126
|
-
border_color_all,
|
4127
|
-
border_color_all,
|
4128
|
-
border_color_all,
|
4129
|
-
border_color_all,
|
4130
|
-
)
|
4131
|
-
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4132
|
-
border_color_all,
|
4133
|
-
border_color_all,
|
4134
|
-
border_color_all,
|
4135
|
-
border_color_all,
|
4136
|
-
)
|
4137
|
-
elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
|
4138
|
-
border_color_l = hex2argb(v)
|
4139
|
-
elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
|
4140
|
-
border_color_r = hex2argb(v)
|
4141
|
-
elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
|
4142
|
-
border_color_t = hex2argb(v)
|
4143
|
-
elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
|
4144
|
-
border_color_b = hex2argb(v)
|
4145
|
-
elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
|
4146
|
-
border_color_d = hex2argb(v)
|
4147
|
-
elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
|
4148
|
-
border_color_o = hex2argb(v)
|
4149
|
-
elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
|
4150
|
-
border_color_v = hex2argb(v)
|
4151
|
-
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4152
|
-
border_color_h = hex2argb(v)
|
4153
|
-
# *border style
|
4154
|
-
border_styles = [
|
4155
|
-
"thin",
|
4156
|
-
"medium",
|
4157
|
-
"thick",
|
4158
|
-
"dotted",
|
4159
|
-
"dashed",
|
4160
|
-
"hair",
|
4161
|
-
"mediumDashed",
|
4162
|
-
"dashDot",
|
4163
|
-
"dashDotDot",
|
4164
|
-
"slantDashDot",
|
4165
|
-
"none",
|
4166
|
-
]
|
4167
|
-
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4168
|
-
None,
|
4169
|
-
None,
|
4170
|
-
None,
|
4171
|
-
None,
|
4172
|
-
)
|
4173
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4174
|
-
None,
|
4175
|
-
None,
|
4176
|
-
None,
|
4177
|
-
None,
|
4178
|
-
)
|
4179
|
-
# get styles config
|
4180
|
-
for k, v in cell.get(K, {}).items():
|
4181
|
-
# if not "style" in k:
|
4182
|
-
# break
|
4183
|
-
if strcmp(k, kws_border)[0] in ["style"]:
|
4184
|
-
border_style_all = strcmp(v, border_styles)[0]
|
4185
|
-
# 如果设置了style,表示其它的所有的都设置成为一样的
|
4186
|
-
# 然后再才开始自己定义其它的style
|
4187
|
-
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4188
|
-
border_style_all,
|
4189
|
-
border_style_all,
|
4190
|
-
border_style_all,
|
4191
|
-
border_style_all,
|
4192
|
-
)
|
4193
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4194
|
-
border_style_all,
|
4195
|
-
border_style_all,
|
4196
|
-
border_style_all,
|
4197
|
-
border_style_all,
|
4198
|
-
)
|
4199
|
-
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4200
|
-
border_style_l = strcmp(v, border_styles)[0]
|
4201
|
-
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4202
|
-
border_style_r = strcmp(v, border_styles)[0]
|
4203
|
-
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4204
|
-
border_style_t = strcmp(v, border_styles)[0]
|
4205
|
-
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4206
|
-
border_style_b = strcmp(v, border_styles)[0]
|
4207
|
-
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4208
|
-
border_style_d = strcmp(v, border_styles)[0]
|
4209
|
-
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4210
|
-
border_style_o = strcmp(v, border_styles)[0]
|
4211
|
-
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4212
|
-
border_style_v = strcmp(v, border_styles)[0]
|
4213
|
-
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4214
|
-
border_style_h = strcmp(v, border_styles)[0]
|
4215
|
-
# * apply border config
|
4216
|
-
border = Border(
|
4217
|
-
left=Side(border_style=border_style_l, color=border_color_l),
|
4218
|
-
right=Side(border_style=border_style_r, color=border_color_r),
|
4219
|
-
top=Side(border_style=border_style_t, color=border_color_t),
|
4220
|
-
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4221
|
-
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4222
|
-
diagonal_direction=0,
|
4223
|
-
outline=Side(border_style=border_style_o, color=border_color_o),
|
4224
|
-
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4225
|
-
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4226
|
-
)
|
4227
|
-
|
4228
|
-
#! final apply configs
|
4229
|
-
for row in ws[cell_range]:
|
4230
|
-
for cell_ in row:
|
4231
|
-
if cell_font:
|
4232
|
-
cell_.font = cell_font
|
4233
|
-
if cell_fill:
|
4234
|
-
cell_.fill = cell_fill
|
4235
|
-
if cell_alignment:
|
4236
|
-
cell_.alignment = cell_alignment
|
4237
|
-
if border:
|
4238
|
-
cell_.border = border
|
4239
|
-
|
4240
|
-
|
4241
4265
|
def format_excel(
|
4242
4266
|
df=None,
|
4243
4267
|
filename=None,
|
@@ -4257,6 +4281,368 @@ def format_excel(
|
|
4257
4281
|
conditional_format=None, # dict
|
4258
4282
|
**kwargs,
|
4259
4283
|
):
|
4284
|
+
import pandas as pd
|
4285
|
+
from datetime import datetime
|
4286
|
+
from openpyxl import load_workbook
|
4287
|
+
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
4288
|
+
from openpyxl.utils import get_column_letter
|
4289
|
+
from openpyxl.worksheet.datavalidation import DataValidation
|
4290
|
+
from openpyxl.comments import Comment
|
4291
|
+
from openpyxl.formatting.rule import ColorScaleRule
|
4292
|
+
|
4293
|
+
def convert_indices_to_range(row_slice, col_slice):
|
4294
|
+
"""Convert numerical row and column slices to Excel-style range strings."""
|
4295
|
+
start_row = row_slice.start + 1
|
4296
|
+
end_row = row_slice.stop if row_slice.stop is not None else None
|
4297
|
+
start_col = col_slice.start + 1
|
4298
|
+
end_col = col_slice.stop if col_slice.stop is not None else None
|
4299
|
+
|
4300
|
+
start_col_letter = get_column_letter(start_col)
|
4301
|
+
end_col_letter = get_column_letter(end_col) if end_col else None
|
4302
|
+
return (
|
4303
|
+
f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
|
4304
|
+
if end_col_letter
|
4305
|
+
else f"{start_col_letter}{start_row}"
|
4306
|
+
)
|
4307
|
+
|
4308
|
+
def apply_format(ws, cell, cell_range):
|
4309
|
+
"""Apply cell formatting to a specified range."""
|
4310
|
+
cell_font, cell_fill, cell_alignment, border = None, None, None, None
|
4311
|
+
kws_cell = ["font", "fill", "alignment", "border"]
|
4312
|
+
for K, _ in cell.items():
|
4313
|
+
if strcmp(K, kws_cell)[0] == "font":
|
4314
|
+
#! font
|
4315
|
+
font_color = "000000"
|
4316
|
+
font_name = "Arial"
|
4317
|
+
font_underline = "none"
|
4318
|
+
font_size = 14
|
4319
|
+
font_bold = False
|
4320
|
+
font_strike = False
|
4321
|
+
font_italic = False
|
4322
|
+
kws_font = [
|
4323
|
+
"name",
|
4324
|
+
"size",
|
4325
|
+
"bold",
|
4326
|
+
"underline",
|
4327
|
+
"color",
|
4328
|
+
"strike",
|
4329
|
+
"italic",
|
4330
|
+
]
|
4331
|
+
for k_, v_ in cell.get(K, {}).items():
|
4332
|
+
if strcmp(k_, kws_font)[0] == "name":
|
4333
|
+
font_name = v_
|
4334
|
+
elif strcmp(k_, kws_font)[0] == "size":
|
4335
|
+
font_size = v_
|
4336
|
+
elif strcmp(k_, kws_font)[0] == "bold":
|
4337
|
+
font_bold = v_
|
4338
|
+
elif strcmp(k_, kws_font)[0] == "underline":
|
4339
|
+
font_underline = strcmp(v_, ["none", "single", "double"])[0]
|
4340
|
+
elif strcmp(k_, kws_font)[0] == "color":
|
4341
|
+
font_color = hex2argb(v_)
|
4342
|
+
elif strcmp(k_, kws_font)[0] == "strike":
|
4343
|
+
font_strike = v_
|
4344
|
+
elif strcmp(k_, kws_font)[0] == "italic":
|
4345
|
+
font_italic = v_
|
4346
|
+
|
4347
|
+
cell_font = Font(
|
4348
|
+
name=font_name,
|
4349
|
+
size=font_size,
|
4350
|
+
bold=font_bold,
|
4351
|
+
italic=font_italic,
|
4352
|
+
underline=font_underline,
|
4353
|
+
strike=font_strike,
|
4354
|
+
color=font_color,
|
4355
|
+
)
|
4356
|
+
|
4357
|
+
if strcmp(K, kws_cell)[0] == "fill":
|
4358
|
+
#! fill
|
4359
|
+
kws_fill = ["start_color", "end_color", "fill_type", "color"]
|
4360
|
+
kws_fill_type = [
|
4361
|
+
"darkVertical",
|
4362
|
+
"lightDown",
|
4363
|
+
"lightGrid",
|
4364
|
+
"solid",
|
4365
|
+
"darkDown",
|
4366
|
+
"lightGray",
|
4367
|
+
"lightUp",
|
4368
|
+
"gray0625",
|
4369
|
+
"lightVertical",
|
4370
|
+
"lightHorizontal",
|
4371
|
+
"darkHorizontal",
|
4372
|
+
"gray125",
|
4373
|
+
"darkUp",
|
4374
|
+
"mediumGray",
|
4375
|
+
"darkTrellis",
|
4376
|
+
"darkGray",
|
4377
|
+
"lightTrellis",
|
4378
|
+
"darkGrid",
|
4379
|
+
]
|
4380
|
+
start_color, end_color, fill_type = (
|
4381
|
+
"FFFFFF",
|
4382
|
+
"FFFFFF",
|
4383
|
+
"solid",
|
4384
|
+
) # default
|
4385
|
+
for k, v in cell.get(K, {}).items():
|
4386
|
+
if strcmp(k, kws_fill)[0] == "color":
|
4387
|
+
start_color, end_color = hex2argb(v), hex2argb(v)
|
4388
|
+
break
|
4389
|
+
for k, v in cell.get(K, {}).items():
|
4390
|
+
if strcmp(k, kws_fill)[0] == "start_color":
|
4391
|
+
start_color = hex2argb(v)
|
4392
|
+
elif strcmp(k, kws_fill)[0] == "end_color":
|
4393
|
+
end_color = hex2argb(v)
|
4394
|
+
elif strcmp(k, kws_fill)[0] == "fill_type":
|
4395
|
+
fill_type = strcmp(v, kws_fill_type)[0]
|
4396
|
+
cell_fill = PatternFill(
|
4397
|
+
start_color=start_color,
|
4398
|
+
end_color=end_color,
|
4399
|
+
fill_type=fill_type,
|
4400
|
+
)
|
4401
|
+
|
4402
|
+
if strcmp(K, kws_cell)[0] == "alignment":
|
4403
|
+
#! alignment
|
4404
|
+
# default
|
4405
|
+
align_horizontal = "general"
|
4406
|
+
align_vertical = "center"
|
4407
|
+
align_rot = 0
|
4408
|
+
align_wrap = False
|
4409
|
+
align_shrink = False
|
4410
|
+
align_indent = 0
|
4411
|
+
kws_align = [
|
4412
|
+
"horizontal",
|
4413
|
+
"ha",
|
4414
|
+
"vertical",
|
4415
|
+
"va",
|
4416
|
+
"text_rotation",
|
4417
|
+
"rotat",
|
4418
|
+
"rot",
|
4419
|
+
"wrap_text",
|
4420
|
+
"wrap",
|
4421
|
+
"shrink_to_fit",
|
4422
|
+
"shrink",
|
4423
|
+
"indent",
|
4424
|
+
]
|
4425
|
+
for k, v in cell.get(K, {}).items():
|
4426
|
+
if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
|
4427
|
+
align_horizontal = strcmp(
|
4428
|
+
v, ["general", "left", "right", "center"]
|
4429
|
+
)[0]
|
4430
|
+
elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
|
4431
|
+
align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
|
4432
|
+
elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
|
4433
|
+
align_rot = v
|
4434
|
+
elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
|
4435
|
+
align_wrap = v
|
4436
|
+
elif strcmp(k, kws_align)[0] in [
|
4437
|
+
"shrink_to_fit",
|
4438
|
+
"shrink",
|
4439
|
+
"wrap_text",
|
4440
|
+
"wrap",
|
4441
|
+
]:
|
4442
|
+
align_shrink = v
|
4443
|
+
elif strcmp(k, kws_align)[0] in ["indent"]:
|
4444
|
+
align_indent = v
|
4445
|
+
cell_alignment = Alignment(
|
4446
|
+
horizontal=align_horizontal,
|
4447
|
+
vertical=align_vertical,
|
4448
|
+
text_rotation=align_rot,
|
4449
|
+
wrap_text=align_wrap,
|
4450
|
+
shrink_to_fit=align_shrink,
|
4451
|
+
indent=align_indent,
|
4452
|
+
)
|
4453
|
+
|
4454
|
+
if strcmp(K, kws_cell)[0] == "border":
|
4455
|
+
#! border
|
4456
|
+
kws_border = [
|
4457
|
+
"color_left",
|
4458
|
+
"color_l",
|
4459
|
+
"color_right",
|
4460
|
+
"color_r",
|
4461
|
+
"color_top",
|
4462
|
+
"color_t",
|
4463
|
+
"color_bottom",
|
4464
|
+
"color_b",
|
4465
|
+
"color_diagonal",
|
4466
|
+
"color_d",
|
4467
|
+
"color_outline",
|
4468
|
+
"color_o",
|
4469
|
+
"color_vertical",
|
4470
|
+
"color_v",
|
4471
|
+
"color_horizontal",
|
4472
|
+
"color_h",
|
4473
|
+
"color",
|
4474
|
+
"style_left",
|
4475
|
+
"style_l",
|
4476
|
+
"style_right",
|
4477
|
+
"style_r",
|
4478
|
+
"style_top",
|
4479
|
+
"style_t",
|
4480
|
+
"style_bottom",
|
4481
|
+
"style_b",
|
4482
|
+
"style_diagonal",
|
4483
|
+
"style_d",
|
4484
|
+
"style_outline",
|
4485
|
+
"style_o",
|
4486
|
+
"style_vertical",
|
4487
|
+
"style_v",
|
4488
|
+
"style_horizontal",
|
4489
|
+
"style_h",
|
4490
|
+
"style",
|
4491
|
+
]
|
4492
|
+
# * border color
|
4493
|
+
border_color_l, border_color_r, border_color_t, border_color_b = (
|
4494
|
+
"FF000000",
|
4495
|
+
"FF000000",
|
4496
|
+
"FF000000",
|
4497
|
+
"FF000000",
|
4498
|
+
)
|
4499
|
+
border_color_d, border_color_o, border_color_v, border_color_h = (
|
4500
|
+
"FF000000",
|
4501
|
+
"FF000000",
|
4502
|
+
"FF000000",
|
4503
|
+
"FF000000",
|
4504
|
+
)
|
4505
|
+
# get colors config
|
4506
|
+
for k, v in cell.get(K, {}).items():
|
4507
|
+
if strcmp(k, kws_border)[0] in ["color"]:
|
4508
|
+
border_color_all = hex2argb(v)
|
4509
|
+
# 如果设置了color,表示其它的所有的都设置成为一样的
|
4510
|
+
# 然后再才开始自己定义其它的color
|
4511
|
+
(
|
4512
|
+
border_color_l,
|
4513
|
+
border_color_r,
|
4514
|
+
border_color_t,
|
4515
|
+
border_color_b,
|
4516
|
+
) = (
|
4517
|
+
border_color_all,
|
4518
|
+
border_color_all,
|
4519
|
+
border_color_all,
|
4520
|
+
border_color_all,
|
4521
|
+
)
|
4522
|
+
(
|
4523
|
+
border_color_d,
|
4524
|
+
border_color_o,
|
4525
|
+
border_color_v,
|
4526
|
+
border_color_h,
|
4527
|
+
) = (
|
4528
|
+
border_color_all,
|
4529
|
+
border_color_all,
|
4530
|
+
border_color_all,
|
4531
|
+
border_color_all,
|
4532
|
+
)
|
4533
|
+
elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
|
4534
|
+
border_color_l = hex2argb(v)
|
4535
|
+
elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
|
4536
|
+
border_color_r = hex2argb(v)
|
4537
|
+
elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
|
4538
|
+
border_color_t = hex2argb(v)
|
4539
|
+
elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
|
4540
|
+
border_color_b = hex2argb(v)
|
4541
|
+
elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
|
4542
|
+
border_color_d = hex2argb(v)
|
4543
|
+
elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
|
4544
|
+
border_color_o = hex2argb(v)
|
4545
|
+
elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
|
4546
|
+
border_color_v = hex2argb(v)
|
4547
|
+
elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
|
4548
|
+
border_color_h = hex2argb(v)
|
4549
|
+
# *border style
|
4550
|
+
border_styles = [
|
4551
|
+
"thin",
|
4552
|
+
"medium",
|
4553
|
+
"thick",
|
4554
|
+
"dotted",
|
4555
|
+
"dashed",
|
4556
|
+
"hair",
|
4557
|
+
"mediumDashed",
|
4558
|
+
"dashDot",
|
4559
|
+
"dashDotDot",
|
4560
|
+
"slantDashDot",
|
4561
|
+
"none",
|
4562
|
+
]
|
4563
|
+
border_style_l, border_style_r, border_style_t, border_style_b = (
|
4564
|
+
None,
|
4565
|
+
None,
|
4566
|
+
None,
|
4567
|
+
None,
|
4568
|
+
)
|
4569
|
+
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4570
|
+
None,
|
4571
|
+
None,
|
4572
|
+
None,
|
4573
|
+
None,
|
4574
|
+
)
|
4575
|
+
# get styles config
|
4576
|
+
for k, v in cell.get(K, {}).items():
|
4577
|
+
# if not "style" in k:
|
4578
|
+
# break
|
4579
|
+
if strcmp(k, kws_border)[0] in ["style"]:
|
4580
|
+
border_style_all = strcmp(v, border_styles)[0]
|
4581
|
+
# 如果设置了style,表示其它的所有的都设置成为一样的
|
4582
|
+
# 然后再才开始自己定义其它的style
|
4583
|
+
(
|
4584
|
+
border_style_l,
|
4585
|
+
border_style_r,
|
4586
|
+
border_style_t,
|
4587
|
+
border_style_b,
|
4588
|
+
) = (
|
4589
|
+
border_style_all,
|
4590
|
+
border_style_all,
|
4591
|
+
border_style_all,
|
4592
|
+
border_style_all,
|
4593
|
+
)
|
4594
|
+
(
|
4595
|
+
border_style_d,
|
4596
|
+
border_style_o,
|
4597
|
+
border_style_v,
|
4598
|
+
border_style_h,
|
4599
|
+
) = (
|
4600
|
+
border_style_all,
|
4601
|
+
border_style_all,
|
4602
|
+
border_style_all,
|
4603
|
+
border_style_all,
|
4604
|
+
)
|
4605
|
+
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4606
|
+
border_style_l = strcmp(v, border_styles)[0]
|
4607
|
+
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4608
|
+
border_style_r = strcmp(v, border_styles)[0]
|
4609
|
+
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4610
|
+
border_style_t = strcmp(v, border_styles)[0]
|
4611
|
+
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4612
|
+
border_style_b = strcmp(v, border_styles)[0]
|
4613
|
+
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4614
|
+
border_style_d = strcmp(v, border_styles)[0]
|
4615
|
+
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4616
|
+
border_style_o = strcmp(v, border_styles)[0]
|
4617
|
+
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4618
|
+
border_style_v = strcmp(v, border_styles)[0]
|
4619
|
+
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4620
|
+
border_style_h = strcmp(v, border_styles)[0]
|
4621
|
+
# * apply border config
|
4622
|
+
border = Border(
|
4623
|
+
left=Side(border_style=border_style_l, color=border_color_l),
|
4624
|
+
right=Side(border_style=border_style_r, color=border_color_r),
|
4625
|
+
top=Side(border_style=border_style_t, color=border_color_t),
|
4626
|
+
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4627
|
+
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4628
|
+
diagonal_direction=0,
|
4629
|
+
outline=Side(border_style=border_style_o, color=border_color_o),
|
4630
|
+
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4631
|
+
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4632
|
+
)
|
4633
|
+
|
4634
|
+
#! final apply configs
|
4635
|
+
for row in ws[cell_range]:
|
4636
|
+
for cell_ in row:
|
4637
|
+
if cell_font:
|
4638
|
+
cell_.font = cell_font
|
4639
|
+
if cell_fill:
|
4640
|
+
cell_.fill = cell_fill
|
4641
|
+
if cell_alignment:
|
4642
|
+
cell_.alignment = cell_alignment
|
4643
|
+
if border:
|
4644
|
+
cell_.border = border
|
4645
|
+
|
4260
4646
|
if not isinstance(df, pd.DataFrame):
|
4261
4647
|
try:
|
4262
4648
|
print(f"is loading file {os.path.basename(df)}")
|
@@ -4602,11 +4988,10 @@ format_excel(
     print(f"Formatted Excel file saved as:\n{filename}")
 
 
-from IPython.display import display, HTML, Markdown
-
-
 def preview(var):
     """Master function to preview formatted variables in Jupyter."""
+    from bs4 import BeautifulSoup
+    from IPython.display import display, HTML, Markdown
 
     if isinstance(var, str):
         if isa(var, "html"):
@@ -4624,6 +5009,8 @@ def preview(var):
         display(var)
 
     elif isinstance(var, list) or isinstance(var, dict):
+        import json
+
         # Display JSON
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
@@ -4637,6 +5024,8 @@ def preview(var):
         display(Image(filename=var))
 
     elif isinstance(var, dict):
+        import json
+
         # Handle dictionary formatting
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
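preview now renders lists and dicts as a fenced JSON block through IPython's Markdown display, mirroring the two branches added above. The same idea in isolation (only meaningful inside a Jupyter session):

    import json
    from IPython.display import Markdown, display

    payload = {"key": "value", "numbers": [1, 2, 3]}
    json_str = json.dumps(payload, indent=4)
    display(Markdown(f"```json\n{json_str}\n```"))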
@@ -4651,48 +5040,194 @@ def preview(var):
|
|
4651
5040
|
# preview("# This is a Markdown header")
|
4652
5041
|
# preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
|
4653
5042
|
# preview({"key": "value", "numbers": [1, 2, 3]})
|
5043
|
+
|
5044
|
+
|
5045
|
+
def _df_outlier(
|
5046
|
+
data,
|
5047
|
+
columns=None,
|
5048
|
+
method=["zscore", "iqr", "percentile", "iforest"],
|
5049
|
+
min_outlier_method=3, # 至少两种方法检查出outlier
|
5050
|
+
zscore_threshold=3,
|
5051
|
+
iqr_threshold=1.5,
|
5052
|
+
lower_percentile=5,
|
5053
|
+
upper_percentile=95,
|
5054
|
+
):
|
5055
|
+
from scipy.stats import zscore
|
5056
|
+
from sklearn.ensemble import IsolationForest
|
5057
|
+
from sklearn.preprocessing import StandardScaler
|
5058
|
+
|
5059
|
+
col_names_org = data.columns.tolist()
|
5060
|
+
index_names_org = data.index.tolist()
|
5061
|
+
# Separate numeric and non-numeric columns
|
5062
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
5063
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
5064
|
+
|
5065
|
+
if columns is not None:
|
5066
|
+
numeric_data = numeric_data[columns]
|
5067
|
+
elif numeric_data.empty:
|
5068
|
+
raise ValueError("Input data must contain numeric columns.")
|
5069
|
+
|
5070
|
+
outliers_df = pd.DataFrame(index=numeric_data.index)
|
5071
|
+
if isinstance(method, str):
|
5072
|
+
method = [method]
|
5073
|
+
|
5074
|
+
# Z-score method
|
5075
|
+
if "zscore" in method:
|
5076
|
+
z_scores = np.abs(zscore(numeric_data))
|
5077
|
+
outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
|
5078
|
+
|
5079
|
+
# IQR method
|
5080
|
+
if "iqr" in method:
|
5081
|
+
Q1 = numeric_data.quantile(0.25)
|
5082
|
+
Q3 = numeric_data.quantile(0.75)
|
5083
|
+
IQR = Q3 - Q1
|
5084
|
+
lower_bound = Q1 - iqr_threshold * IQR
|
5085
|
+
upper_bound = Q3 + iqr_threshold * IQR
|
5086
|
+
outliers_df["iqr"] = (
|
5087
|
+
(numeric_data < lower_bound) | (numeric_data > upper_bound)
|
5088
|
+
).any(axis=1)
|
5089
|
+
|
5090
|
+
# Percentile method
|
5091
|
+
if "percentile" in method:
|
5092
|
+
lower_bound = numeric_data.quantile(lower_percentile / 100)
|
5093
|
+
upper_bound = numeric_data.quantile(upper_percentile / 100)
|
5094
|
+
outliers_df["percentile"] = (
|
5095
|
+
(numeric_data < lower_bound) | (numeric_data > upper_bound)
|
5096
|
+
).any(axis=1)
|
5097
|
+
|
5098
|
+
# Isolation Forest method
|
5099
|
+
if "iforest" in method:
|
5100
|
+
# iforest method cannot handle NaNs, then fillna with mean
|
5101
|
+
numeric_data_ = numeric_data.fillna(numeric_data.mean())
|
5102
|
+
scaler = StandardScaler()
|
5103
|
+
scaled_data = scaler.fit_transform(numeric_data_)
|
5104
|
+
iso_forest = IsolationForest(contamination=0.05)
|
5105
|
+
outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
|
5106
|
+
|
5107
|
+
# Combine all outlier detections
|
5108
|
+
if len(method) == 4: # all method are used:
|
5109
|
+
outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
|
5110
|
+
else:
|
5111
|
+
outliers_df["outlier"] = outliers_df.any(axis=1)
|
5112
|
+
|
5113
|
+
# Handling Outliers: Remove or Winsorize or Replace with NaN
|
5114
|
+
processed_data = numeric_data.copy()
|
5115
|
+
|
5116
|
+
processed_data.loc[outliers_df["outlier"]] = np.nan
|
5117
|
+
|
5118
|
+
return processed_data
|
5119
|
+
|
5120
|
+
|
5121
|
+
def df_outlier(
|
5122
|
+
data,
|
5123
|
+
columns=None,
|
5124
|
+
method=["zscore", "iqr", "percentile", "iforest"],
|
5125
|
+
min_outlier_method=2, # 至少两种方法检查出outlier
|
5126
|
+
zscore_threshold=3,
|
5127
|
+
iqr_threshold=1.5,
|
5128
|
+
lower_percentile=5,
|
5129
|
+
upper_percentile=95,
|
5130
|
+
):
|
5131
|
+
"""
|
5132
|
+
Usage:
|
5133
|
+
data_out = df_outlier(
|
5134
|
+
data,
|
5135
|
+
columns=["income"],
|
5136
|
+
method="iforest",
|
5137
|
+
min_outlier_method=1)
|
5138
|
+
|
5139
|
+
Advanced outlier detection and handling function.
|
5140
|
+
|
5141
|
+
Parameters:
|
5142
|
+
- data: DataFrame, the input data (numerical).
|
5143
|
+
- method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
|
5144
|
+
- zscore_threshold: float, threshold for Z-score outlier detection (default 3).
|
5145
|
+
- iqr_threshold: float, threshold for IQR method (default 1.5).
|
5146
|
+
- lower_percentile: float, lower percentile for percentile-based outliers (default 5).
|
5147
|
+
- upper_percentile: float, upper percentile for percentile-based outliers (default 95).
|
5148
|
+
- keep_nan: bool, whether to replace outliers with NaN (default True).
|
5149
|
+
- plot: bool, whether to visualize the outliers (default False).
|
5150
|
+
- min_outlier_method: int, minimum number of method that need to flag a row as an outlier (default 2).
|
5151
|
+
- inplace: bool, whether to modify the original `data` DataFrame (default False).
|
5152
|
+
|
5153
|
+
Returns:
|
5154
|
+
- processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
|
5155
|
+
"""
|
5156
|
+
col_names_org = data.columns.tolist()
|
5157
|
+
index_names_org = data.index.tolist()
|
5158
|
+
|
5159
|
+
numeric_data = data.select_dtypes(include=[np.number])
|
5160
|
+
non_numeric_data = data.select_dtypes(exclude=[np.number])
|
5161
|
+
|
5162
|
+
_outlier_df_tmp = pd.DataFrame()
|
5163
|
+
for col in numeric_data.columns:
|
5164
|
+
_outlier_df_tmp = pd.concat(
|
5165
|
+
[
|
5166
|
+
_outlier_df_tmp,
|
5167
|
+
_df_outlier(
|
5168
|
+
data=data,
|
5169
|
+
columns=[col],
|
5170
|
+
method=method,
|
5171
|
+
min_outlier_method=min_outlier_method, # 至少两种方法检查出outlier
|
5172
|
+
zscore_threshold=zscore_threshold,
|
5173
|
+
iqr_threshold=iqr_threshold,
|
5174
|
+
lower_percentile=lower_percentile,
|
5175
|
+
upper_percentile=upper_percentile,
|
5176
|
+
),
|
5177
|
+
],
|
5178
|
+
axis=1,
|
5179
|
+
# join="inner",
|
5180
|
+
)
|
5181
|
+
processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
|
5182
|
+
processed_data = processed_data[col_names_org]
|
5183
|
+
return processed_data
|
5184
|
+
|
5185
|
+
|
4654
5186
|
def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
|
4655
5187
|
"""
|
4656
5188
|
Extend a DataFrame by the list elecments in the column.
|
4657
|
-
|
5189
|
+
|
4658
5190
|
Parameters:
|
4659
5191
|
----------
|
4660
5192
|
data : pd.DataFrame
|
4661
5193
|
The input DataFrame to be extended.
|
4662
|
-
|
5194
|
+
|
4663
5195
|
column : str
|
4664
5196
|
The name of the column to be split.
|
4665
|
-
|
5197
|
+
|
4666
5198
|
axis : int, optional
|
4667
|
-
The axis along which to expand the DataFrame.
|
5199
|
+
The axis along which to expand the DataFrame.
|
4668
5200
|
- 0 (default): Expand the specified column into multiple rows.
|
4669
5201
|
- 1: Expand the specified column into multiple columns.
|
4670
|
-
|
5202
|
+
|
4671
5203
|
sep : str, optional
|
4672
5204
|
The separator used to split the values in the specified column.
|
4673
5205
|
Must be provided for the function to work correctly.
|
4674
5206
|
"""
|
4675
|
-
|
4676
|
-
data = data.copy()
|
5207
|
+
|
5208
|
+
data = data.copy()
|
4677
5209
|
mask = data[column].str.contains(sep, na=False)
|
4678
5210
|
data = data.copy()
|
4679
5211
|
if mask.any():
|
4680
|
-
data[column] = (
|
4681
|
-
|
4682
|
-
|
4683
|
-
|
4684
|
-
|
5212
|
+
data[column] = data[column].apply(
|
5213
|
+
lambda x: x.split(sep) if isinstance(x, str) else x
|
5214
|
+
) # Only split if x is a string
|
5215
|
+
|
4685
5216
|
# Strip spaces from each item in the lists
|
4686
|
-
data[column] = data[column].apply(
|
4687
|
-
|
5217
|
+
data[column] = data[column].apply(
|
5218
|
+
lambda x: [item.strip() for item in x] if isinstance(x, list) else x
|
5219
|
+
)
|
5220
|
+
|
4688
5221
|
data = data.explode(column, ignore_index=True)
|
4689
5222
|
return data
|
5223
|
+
|
5224
|
+
|
4690
5225
|
# ! DataFrame
|
4691
5226
|
def df_astype(
|
4692
5227
|
data: pd.DataFrame,
|
4693
5228
|
columns: Optional[Union[str, List[str]]] = None,
|
4694
5229
|
astype: str = "datetime",
|
4695
|
-
skip_row:Union[str,list]=None,
|
5230
|
+
skip_row: Union[str, list] = None,
|
4696
5231
|
fmt: Optional[str] = None,
|
4697
5232
|
inplace: bool = True,
|
4698
5233
|
errors: str = "coerce", # Can be "ignore", "raise", or "coerce"
|
@@ -4750,7 +5285,8 @@ def df_astype(
             "second",
             "time",
             "week",
-            "date",
+            "date",
+            "day",
             "month",
             "year",
         ]
@@ -4758,18 +5294,18 @@
     if not inplace:
         data = data.copy()
     if skip_row is not None:
-        data = data.drop(index=skip_row, errors=
+        data = data.drop(index=skip_row, errors="ignore")
     # If columns is None, apply to all columns
     if columns is None:
         columns = data.columns.tolist()
     # correct the astype input
-    if isinstance(astype,str):
+    if isinstance(astype, str):
         astype = strcmp(astype, astypes)[0]
         print(f"converting as type: {astype}")
-    elif isinstance(astype,dict):
+    elif isinstance(astype, dict):
         for col, dtype in astype.items():
-            dtype=
-            data["col"]=data["col"].adtype(strcmp(dtype, astypes)[0])
+            dtype = "date" if dtype == "day" else dtype
+            data["col"] = data["col"].adtype(strcmp(dtype, astypes)[0])
         return data if not inplace else None
 
     # Ensure columns is a list
@@ -4880,13 +5416,15 @@ def df_sort_values(df, column, by=None, ascending=True, inplace=True, **kwargs):
     if column not in data.columns:
         raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
 
-    if isinstance(by, str) and
+    if isinstance(by, str) and "count" in by.lower():
         # Count occurrences of each value in the specified column
         value_counts = df[column].value_counts()
 
         # Determine the order based on counts
         count_ascending = kwargs.pop("count_ascending", ascending)
-        sorted_counts = value_counts.sort_values(
+        sorted_counts = value_counts.sort_values(
+            ascending=count_ascending
+        ).index.tolist()
 
         # Convert to a categorical type with the new order
         df[column] = pd.Categorical(df[column], categories=sorted_counts, ordered=True)
@@ -5004,6 +5542,7 @@ def df_merge(
     )
     return df_merged
 
+
 def df_drop_duplicates(
     data: pd.DataFrame,
     by: Union[
@@ -5012,16 +5551,16 @@ def df_drop_duplicates(
     keep="first",  # Options: 'first', 'last', or False (drop all duplicates)
     ignore_index=True,
     inplace: bool = False,
-    verbose=True
+    verbose=True,
 ):
     """
     data (pd.DataFrame): DataFrame to drop duplicates from.
     by (str): Specify by to drop duplicates:
         - 'index': Drop duplicates based on the DataFrame index.
        - Column name(s) for row-wise duplicate checking.
-    keep (str): Which duplicates to keep:
-        'first',
-        'last',
+    keep (str): Which duplicates to keep:
+        'first',
+        'last',
         False (drop all duplicates).
     inplace (bool): Whether to modify the original DataFrame in place.
     """
@@ -5031,8 +5570,8 @@ def df_drop_duplicates(
         result = data[~data.index.duplicated(keep=keep)]
     else:
         # Drop duplicates row-wise based on column(s)
-        result = data.drop_duplicates(subset=by, keep=keep,ignore_index=ignore_index)
-    if original_shape!=result.shape or verbose:
+        result = data.drop_duplicates(subset=by, keep=keep, ignore_index=ignore_index)
+    if original_shape != result.shape or verbose:
         print(f"\nshape:{original_shape} (before drop_duplicates)")
         print(f"shape:{result.shape} (after drop_duplicates)")
     if inplace:
@@ -5042,15 +5581,18 @@ def df_drop_duplicates(
         return None
     else:
         return result
+
+
+#! fillna()
 def df_fillna(
     data: pd.DataFrame,
     method: str = "knn",
-    axis: int = 0
+    axis: int = 0,  # column-wise
     constant: float = None,
     n_neighbors: int = 5,  # KNN-specific
-    max_iter: int = 10,
-    inplace: bool =
-    random_state:int =
+    max_iter: int = 10,  # Iterative methods specific
+    inplace: bool = False,
+    random_state: int = 1,
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
@@ -5066,11 +5608,11 @@ def df_fillna(
     - 'iterative': Use Iterative imputation; each feature with missing values as a function of other features and estimates them iteratively
     - 'mice' (Multivariate Imputation by Chained Equations): A special case of iterative imputation.
     # - 'missforest': A random forest-based imputation method. Uses a random forest model to predict and fill missing values
-    # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
+    # - 'softimpute': Matrix factorization imputation.A matrix factorization technique where missing values are imputed by
     #   reconstructing the data matrix using low-rank approximation
     # - EM (Expectation-Maximization): Often used in advanced statistics to estimate missing values in a probabilistic framework.
     # - 'svd': Use IterativeSVD (matrix factorization via Singular Value Decomposition).
-
+
     axis (int): The axis along which to impute:
     - 0: Impute column-wise (default).
     - 1: Impute row-wise.
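The 'knn' and 'iterative'/'mice' options described above map onto scikit-learn's imputers. A small sketch of both on a toy frame, assuming scikit-learn is installed (df_fillna wires the same classes up with its n_neighbors, max_iter and random_state arguments):

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    df = pd.DataFrame({"A": [1.0, 2.0, np.nan, 4.0], "B": [10.0, np.nan, 30.0, 40.0]})

    knn = KNNImputer(n_neighbors=2)
    df_knn = pd.DataFrame(knn.fit_transform(df), columns=df.columns)

    mice = IterativeImputer(max_iter=10, random_state=1)
    df_mice = pd.DataFrame(mice.fit_transform(df), columns=df.columns)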
@@ -5078,13 +5620,30 @@ def df_fillna(
     inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.
 
     """
+    if isinstance(data, pd.Series):
+        data = pd.DataFrame(data)
+    # handle None
+    for col in data.columns:
+        data[col] = data[col].apply(lambda x: np.nan if x is None else x)
+
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+    # Separate numeric and non-numeric columns
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
 
     if data.empty:
         raise ValueError("Input DataFrame is empty.")
 
     # Validate method
-    methods = [
-
+    methods = [
+        "mean",
+        "median",
+        "most_frequent",
+        "constant",
+        "knn",
+        "iterative",
+    ]  # ,"missforest","softimpute","svd"]
     method = strcmp(method, methods)[0]
 
     # If using constant method, ask for a constant value
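The lines added above split the frame with select_dtypes so numeric and non-numeric columns can be imputed separately (df_fillna later falls back to a most_frequent imputer for the non-numeric part). The same split in isolation, on a tiny illustrative frame:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    df = pd.DataFrame({"x": [1.0, np.nan, 3.0], "label": ["a", np.nan, "a"]})

    numeric = df.select_dtypes(include=[np.number])
    non_numeric = df.select_dtypes(exclude=[np.number])

    num_filled = pd.DataFrame(
        SimpleImputer(strategy="mean").fit_transform(numeric),
        columns=numeric.columns, index=numeric.index,
    )
    cat_filled = pd.DataFrame(
        SimpleImputer(strategy="most_frequent").fit_transform(non_numeric),
        columns=non_numeric.columns, index=non_numeric.index,
    )
    result = pd.concat([num_filled, cat_filled], axis=1)[df.columns]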
@@ -5098,51 +5657,76 @@ def df_fillna(
     # Initialize SimpleImputer with the chosen method
     if method == "constant":
         from sklearn.impute import SimpleImputer
+
         imputer = SimpleImputer(strategy=method, fill_value=constant)
     elif method == "knn":
         from sklearn.impute import KNNImputer
+
         imputer = KNNImputer(n_neighbors=n_neighbors)
     elif method == "iterative" or method == "mice":
         from sklearn.experimental import enable_iterative_imputer
         from sklearn.impute import IterativeImputer

-        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
-        #
-        # from missingpy import MissForest
-        # imputer = MissForest(max_iter=max_iter, random_state=random_state)
-    # elif method == "softimpute":
-    #     from fancyimpute import SoftImpute
-    #     imputer = SoftImpute()
-    # elif method == "svd":
-    #     from fancyimpute import IterativeSVD
-    #     imputer = IterativeSVD(max_iters=max_iter)
-    else:  # mean, median, most_frequent
+        imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
+    else:  # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
+
         imputer = SimpleImputer(strategy=method)

     # Fit and transform the data
     if axis == 0:
         # Impute column-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data)
     elif axis == 1:
         # Impute row-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")

-
+    imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=
-        columns=
+        index=numeric_data.index if axis == 0 else data.columns,
+        columns=numeric_data.columns if axis == 0 else data.index,
+    )
+    for col in imputed_data.select_dtypes(include=[np.number]).columns:
+        imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
+
+    # Handle non-numeric data imputation
+    if not non_numeric_data.empty:
+        from sklearn.impute import SimpleImputer
+
+        if method == "constant":
+            non_numeric_imputer = SimpleImputer(
+                strategy="constant", fill_value=constant
+            )
+        else:
+            non_numeric_imputer = SimpleImputer(strategy="most_frequent")
+
+        # Impute non-numeric columns column-wise (axis=0)
+        imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
+
+        # Convert imputed non-numeric array back to DataFrame with original index and column names
+        imputed_non_numeric_df = pd.DataFrame(
+            imputed_non_numeric,
+            index=non_numeric_data.index,
+            columns=non_numeric_data.columns,
+        )
+    else:
+        imputed_non_numeric_df = pd.DataFrame(index=data.index)
+
+    imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(
+        columns=data.columns
     )

     if inplace:
-
-
+        # Modify the original DataFrame
+        data[:] = imputed_data[col_names_org]
+        return None
     else:
-
+        # Return the modified DataFrame
+        return imputed_data[col_names_org]
+
+
     # # example
     # data = {
     #     "A": [1, 2, np.nan, 4, 5],
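The new df_fillna body follows a split-impute-recombine pattern: numeric columns go through the chosen imputer, object columns get most_frequent (or the constant), and the pieces are concatenated and reindexed back to the original column order. A hedged sketch of that pattern with assumed toy data:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    df = pd.DataFrame({"x": [1.0, np.nan, 3.0], "label": ["a", np.nan, "a"]})
    num = df.select_dtypes(include=[np.number])
    cat = df.select_dtypes(exclude=[np.number])
    num_filled = pd.DataFrame(SimpleImputer(strategy="mean").fit_transform(num),
                              columns=num.columns, index=num.index)
    cat_filled = pd.DataFrame(SimpleImputer(strategy="most_frequent").fit_transform(cat),
                              columns=cat.columns, index=cat.index)
    # recombine in the original column order
    out = pd.concat([num_filled, cat_filled], axis=1).reindex(columns=df.columns)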
@@ -5172,9 +5756,100 @@ def df_fillna(
     # display(df)
     # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))

-
+
+
+def df_encoder(
+    data: pd.DataFrame,
+    method: str = "dummy",  # 'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
+    columns=None,
+    target_column=None,  # Required for 'target' encoding method
+    **kwargs,
+) -> pd.DataFrame:
+    """
+    Methods explained:
+    - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
+
+    - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
+
+    - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
+
+    - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
+
+    - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
+
+    - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
+    """
+
+    # Select categorical columns
+    categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
+    methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
+    method = strcmp(method, methods)[0]
+
+    if columns is None:
+        columns = categorical_cols
+
+    # pd.get_dummies()
+    if method == "dummy":
+        dtype = kwargs.pop("dtype", int)
+        drop_first = kwargs.pop("drop_first", True)
+        try:
+            encoded_df = pd.get_dummies(
+                data[columns], drop_first=drop_first, dtype=dtype, **kwargs
+            )
+            return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+        except Exception as e:
+            # print(f"Warning, no conversion was applied because: {e}")
+            return data
+    # One-hot encoding
+    elif method == "onehot":
+        from sklearn.preprocessing import OneHotEncoder
+
+        encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(
+            encoded_data,
+            columns=encoder.get_feature_names_out(columns),
+            index=data.index,
+        )
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Ordinal encoding
+    elif method == "ordinal":
+        from sklearn.preprocessing import OrdinalEncoder
+
+        encoder = OrdinalEncoder(**kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Label encoding
+    elif method == "label":
+        from sklearn.preprocessing import LabelEncoder
+
+        encoder = LabelEncoder()
+        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Target encoding (Mean of the target for each category)
+    elif method == "target":
+        if target_column is None:
+            raise ValueError("target_column must be provided for target encoding.")
+        from category_encoders import TargetEncoder
+
+        encoder = TargetEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns], data[target_column])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Binary encoding (for high-cardinality categorical variables)
+    elif method == "binary":
+        from category_encoders import BinaryEncoder
+
+        encoder = BinaryEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+
 def df_scaler(
-    data: pd.DataFrame,
+    data: pd.DataFrame,  # should be numeric dtype
     method="standard",
     columns=None,  # default, select all numeric col/row
     inplace=False,
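A usage sketch for the df_encoder function added in the hunk above (toy frame; the 'target' and 'binary' paths additionally require the optional category_encoders package). The method="dummy" branch is a thin wrapper around pandas' get_dummies:

    import pandas as pd

    df = pd.DataFrame({"size": ["S", "M", "L", "M"], "price": [1.0, 2.0, 3.0, 2.5]})
    dummies = pd.get_dummies(df[["size"]], drop_first=True, dtype=int)  # what method="dummy" wraps
    encoded = pd.concat([df.drop(columns=["size"]), dummies], axis=1)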
@@ -5218,9 +5893,8 @@ def df_scaler(
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
-            columns = data.select_dtypes(include=
+            columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
-        print(f"Scaling columns")

         scaled_data = scaler.fit_transform(data[columns])

@@ -5242,7 +5916,7 @@ def df_scaler(
         # Row-wise scaling
         if columns is None:
             columns = data.index.tolist()
-        numeric_rows = data.loc[columns].select_dtypes(include=
+        numeric_rows = data.loc[columns].select_dtypes(include=np.number)
         if numeric_rows.empty:
             raise ValueError("No numeric rows to scale.")

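A short sketch of the axis semantics used by df_scaler (assumed toy data): axis=0 standardizes each numeric column, while axis=1 standardizes across each row by fitting on the transposed values:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
    col_scaled = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns, index=df.index)
    row_scaled = pd.DataFrame(StandardScaler().fit_transform(df.T).T, columns=df.columns, index=df.index)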
@@ -5260,6 +5934,34 @@ def df_scaler(
         scaled_df.loc[numeric_rows.index] = scaled_data
         return scaled_df

+
+def df_special_characters_cleaner(
+    data: pd.DataFrame, where=["column", "content", "index"]
+) -> pd.DataFrame:
+    """
+    to clean special characters:
+    usage:
+        df_special_characters_cleaner(data=df, where='column')
+    """
+    if not isinstance(where, list):
+        where = [where]
+    where_to_clean = ["column", "content", "index"]
+    where_ = [strcmp(i, where_to_clean)[0] for i in where]
+
+    # 1. Clean column names by replacing special characters with underscores
+    if "column" in where_:
+        data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+
+    # 2. Clean only object-type columns (text columns)
+    if "content" in where_:
+        for col in data.select_dtypes(include=["object"]).columns:
+            data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+    if data.index.dtype == "object" and "index" in where_:
+        data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+
+    return data
+
+
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
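What the r"[^\w\s]" pattern in df_special_characters_cleaner does, on an assumed example column name: every character that is neither a word character nor whitespace is replaced:

    import pandas as pd

    df = pd.DataFrame({"value (mg/L)": [1, 2]})
    df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)  # -> "value _mg_L_"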
@@ -5268,8 +5970,8 @@ def df_cluster(
     scale: bool = True,
     plot: Union[str, list] = "all",
     inplace: bool = True,
-    ax
-)
+    ax=None,
+):
     from sklearn.preprocessing import StandardScaler
     from sklearn.cluster import KMeans
     from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5277,7 +5979,6 @@ def df_cluster(
     import numpy as np
     import pandas as pd
     import matplotlib.pyplot as plt
-    import seaborn as sns

     """
     Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5585,94 +6286,72 @@ def df_reducer(
     umap_neighbors: int = 15,  # UMAP-specific
     umap_min_dist: float = 0.1,  # UMAP-specific
     tsne_perplexity: int = 30,  # t-SNE-specific
+    hue: str = None,  # lda-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True,  # replace the original data
-    plot_:bool = False
+    plot_: bool = False,  # plot scatterplot; without 'hue' the plot is not very informative
+    random_state=1,
+    ax=None,
+    figsize=None,
+    **kwargs,
 ) -> pd.DataFrame:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    reduction, balancing speed and quality of visualization.
-    Parameters:
-    -----------
-    data : pd.DataFrame
-        The input DataFrame (samples x features).
-
-    columns : List[str], optional
-        List of column names to reduce. If None, all columns are used.
-
-    method : str, optional, default="umap"
-        Dimensionality reduction method, either "pca" or "umap".
-
-    n_components : int, optional, default=50
-        Number of components for PCA or UMAP.
-
-    umap_neighbors : int, optional, default=15
-        Number of neighbors considered for UMAP embedding.
-
-    umap_min_dist : float, optional, default=0.1
-        Minimum distance between points in UMAP embedding.
-
-    scale : bool, optional, default=True
-        Whether to scale the data using StandardScaler.
-
-    fill_missing : bool, optional, default=True
-        Whether to fill missing values using the mean before applying PCA/UMAP.
+    dict_methods = {
+        #! Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
+        "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
+        "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
+        "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
+        "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
+        #! Non-linear Dimensionality Reduction (Manifold Learning)
+        "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
+        "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
+        "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
+        "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
+        "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
+        "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing). ICA is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
+        #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
+        "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
+    }

-    Returns:
-    --------
-    reduced_df : pd.DataFrame
-        DataFrame with the reduced dimensions.
-    """
-
-    """
-    PCA: explained_variance:
-        indicates the proportion of the dataset's total variance that each principal
-        component (PC) explains. It gives you a sense of how much information
-        (or variance) is captured by each PC
-    Interpretation:
-        - Higher values indicate that the corresponding PC captures more variance.
-        - The sum of the explained variances for all PCs equals 1 (or 100%).
-        - If the first few components explain a high percentage (e.g., 90%),
-          it means you can reduce the dimensionality of the data significantly without losing much information.
-    Use case:
-        You may plot a scree plot, which shows the explained variance for each PC, to help decide
-        how many components to keep for analysis.
-
-    PCA: Singular values:
-        represent the magnitude of variance along each principal component. Mathematically,
-        they are the square roots of the eigenvalues of the covariance matrix.
-    Interpretation:
-        Larger singular values indicate that the associated PC captures more variance.
-        Singular values are related to the scale of the data. If the data are scaled
-        before PCA (e.g., standardized), then the singular values will provide a measure
-        of the spread of data along each PC.
-    Use case:
-        Singular values help quantify the contribution of each principal component in a
-        similar way to the explained variance. They are useful in understanding the overall
-        structure of the data.
-    """
     from sklearn.preprocessing import StandardScaler
     from sklearn.impute import SimpleImputer

-
-
-
+    if plot_:
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+    # Check valid method input
+    methods = [
+        "pca",
+        "umap",
+        "tsne",
+        "factor",
+        "isolation_forest",
+        "lda",
+        "kpca",
+        "ica",
+        "mds",
+        "lle",
+        "svd",
+    ]
+    method = strcmp(method, methods)[0]
+    print(f"\nprocessing with using {dict_methods[method]}:")
+    xlabel, ylabel = None, None
+    if columns is None:
+        columns = data.select_dtypes(include="number").columns.tolist()
+    if hue is None:
+        hue = data.select_dtypes(exclude="number").columns.tolist()
+    if isinstance(hue, list):
+        print("Warning: hue is a list, only select the 1st one")
+        hue = hue[0]
+    if not hue:
+        # Select columns if specified, else use all columns
+        X = data[columns].values if columns else data.values
+    else:
+        # Select columns to reduce and hue for LDA
+        X = data[columns].values if columns else data.drop(columns=[hue]).values
+        y = data[hue].values
+    print(X.shape)
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
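For orientation on the method catalogue added above, a minimal sketch of two of the reducers (assumes scikit-learn; the UMAP line additionally needs the optional umap-learn package and is therefore shown commented out):

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(1).rand(100, 10)
    X_pca = PCA(n_components=2).fit_transform(X)  # linear, variance-preserving projection
    # import umap
    # X_umap = umap.UMAP(n_neighbors=15, min_dist=0.1).fit_transform(X)  # non-linear embedding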
@@ -5683,15 +6362,13 @@ def df_reducer(
         scaler = StandardScaler()
         X = scaler.fit_transform(X)

-    # Check valid method input
-    methods=["pca", "umap","tsne","factor","isolation_forest"]
-    method=strcmp(method, methods)[0]
     # Apply PCA if selected
-    if method == "pca":
+    if method == "pca":
         from sklearn.decomposition import PCA
+
         pca = PCA(n_components=n_components)
         X_reduced = pca.fit_transform(X)
-
+
         # Additional PCA information
         explained_variance = pca.explained_variance_ratio_
         singular_values = pca.singular_values_
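A short sketch relating the two PCA attributes used here, on assumed random data: explained_variance_ratio_ gives the fraction of total variance per component, and the squared singular values divided by (n_samples - 1) reproduce the per-component variances:

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(0).rand(50, 5)
    pca = PCA(n_components=3).fit(X)
    print(pca.explained_variance_ratio_)               # fraction of total variance per PC
    print(pca.singular_values_ ** 2 / (X.shape[0] - 1))  # equals pca.explained_variance_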
@@ -5707,36 +6384,72 @@ def df_reducer(
         # Plot explained variance
         cumulative_variance = np.cumsum(explained_variance)
         plt.figure(figsize=(8, 5))
-        plt.plot(
+        plt.plot(
+            range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o"
+        )
         plt.title("Cumulative Explained Variance by Principal Components")
         plt.xlabel("Number of Principal Components")
         plt.ylabel("Cumulative Explained Variance")
         plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
-        plt.axvline(
+        plt.axvline(
+            x=n_components,
+            color="g",
+            linestyle="--",
+            label=f"n_components = {n_components}",
+        )
         plt.legend()
         plt.grid()
         plt.show()

         # Prepare reduced DataFrame with additional PCA info
         pca_df = pd.DataFrame(
-            X_reduced,
-
-
+            X_reduced,
+            index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)],
+        )
         # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
         # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
         # Expand explained variance to multiple columns if needed
         for i in range(n_components):
-            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
+            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(
+                format(explained_variance[i] * 100, ".3f") + "%", (pca_df.shape[0], 1)
+            )
         for i in range(n_components):
-            pca_df[f"Singular Values PC_{i+1}"] = np.tile(
+            pca_df[f"Singular Values PC_{i+1}"] = np.tile(
+                singular_values[i], (pca_df.shape[0], 1)
+            )
+        if hue:
+            pca_df[hue] = y
+    elif method == "lda":
+        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

+        if "hue" not in locals() or hue is None:
+            raise ValueError(
+                "LDA requires a 'hue' col parameter to specify class labels."
+            )
+
+        lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
+        X_reduced = lda_reducer.fit_transform(X, y)
+
+        # Prepare reduced DataFrame with additional LDA info
+        lda_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"LDA_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print(f"LDA completed: Reduced to {n_components} components.")
+            print("Class separability achieved by LDA.")
+        if hue:
+            lda_df[hue] = y
     # Apply UMAP if selected
     elif method == "umap":
         import umap
+
         umap_reducer = umap.UMAP(
             n_neighbors=umap_neighbors,
             min_dist=umap_min_dist,
-            n_components=n_components
+            n_components=n_components,
         )
         X_reduced = umap_reducer.fit_transform(X)

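The cumulative-variance plot above is typically used to pick a component count; a hedged sketch of that choice, assuming the same 95% threshold drawn by the axhline in the code:

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(0).rand(200, 20)
    ratios = PCA().fit(X).explained_variance_ratio_
    n_keep = int(np.searchsorted(np.cumsum(ratios), 0.95) + 1)  # smallest k reaching 95% variance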
@@ -5751,41 +6464,57 @@ def df_reducer(

         # Prepare reduced DataFrame with additional UMAP info
         umap_df = pd.DataFrame(
-            X_reduced,
-
+            X_reduced,
+            index=data.index,
+            columns=[f"UMAP_{i+1}" for i in range(n_components)],
         )
         umap_df["Embedding"] = embedding[:, 0]  # Example of embedding data
         umap_df["Trustworthiness"] = trustworthiness[:, 0]  # Trustworthiness metric
+        if hue:
+            umap_df[hue] = y
     elif method == "tsne":
         from sklearn.manifold import TSNE
-        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
-        X_reduced = tsne.fit_transform(X)

-
+        tsne = TSNE(
+            n_components=n_components,
+            perplexity=tsne_perplexity,
+            random_state=random_state,
+        )
+        X_reduced = tsne.fit_transform(X)
         tsne_df = pd.DataFrame(
-            X_reduced,
-
+            X_reduced,
+            index=data.index,
+            columns=[f"tSNE_{i+1}" for i in range(n_components)],
         )
-        tsne_df["Perplexity"] = np.tile(
-
+        tsne_df["Perplexity"] = np.tile(
+            f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1)
+        )
+        if hue:
+            tsne_df[hue] = y
     # Apply Factor Analysis if selected
     elif method == "factor":
         from sklearn.decomposition import FactorAnalysis
-
+
+        factor = FactorAnalysis(n_components=n_components, random_state=random_state)
         X_reduced = factor.fit_transform(X)
         # Factor Analysis does not directly provide explained variance, but we can approximate it
         fa_variance = factor.noise_variance_
         # Prepare reduced DataFrame with additional Factor Analysis info
         factor_df = pd.DataFrame(
-            X_reduced,
-
+            X_reduced,
+            index=data.index,
+            columns=[f"Factor_{i+1}" for i in range(n_components)],
         )
-        factor_df["Noise Variance"] = np.tile(
-
+        factor_df["Noise Variance"] = np.tile(
+            format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1)
+        )
+        if hue:
+            factor_df[hue] = y
     # Apply Isolation Forest for outlier detection if selected
     elif method == "isolation_forest":
         from sklearn.decomposition import PCA
         from sklearn.ensemble import IsolationForest
+
         # Step 1: Apply PCA for dimensionality reduction to 2 components
         pca = PCA(n_components=n_components)
         X_pca = pca.fit_transform(X)
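A hedged sketch of the isolation-forest path just shown (assumed toy data): PCA is used only to get plottable coordinates, while the forest itself is fit on the full feature matrix and contributes a score and a normal/outlier label per row:

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.ensemble import IsolationForest

    X = np.random.RandomState(1).rand(300, 8)
    X_pca = PCA(n_components=2).fit_transform(X)
    iso = IsolationForest(n_estimators=100, contamination="auto", random_state=1).fit(X)
    scores = iso.decision_function(X)  # larger = less anomalous
    labels = iso.predict(X)            # 1 = normal, -1 = outlier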
@@ -5795,65 +6524,139 @@ def df_reducer(

         # Prepare reduced DataFrame with additional PCA info
         iso_forest_df = pd.DataFrame(
-            X_pca, index=data.index,
-            columns=[f"PC_{i+1}" for i in range(n_components)]
+            X_pca, index=data.index, columns=[f"PC_{i+1}" for i in range(n_components)]
         )

-        isolation_forest = IsolationForest(
+        isolation_forest = IsolationForest(
+            n_estimators=100, contamination="auto", random_state=1
+        )
         isolation_forest.fit(X)
-        anomaly_scores = isolation_forest.decision_function(
+        anomaly_scores = isolation_forest.decision_function(
+            X
+        )  # Anomaly score: larger is less anomalous
         # Predict labels: 1 (normal), -1 (anomaly)
-        anomaly_labels = isolation_forest.fit_predict(X)
+        anomaly_labels = isolation_forest.fit_predict(X)
         # Add anomaly scores and labels to the DataFrame
         iso_forest_df["Anomaly Score"] = anomaly_scores
         iso_forest_df["Anomaly Label"] = anomaly_labels
         # add info from pca
         for i in range(n_components):
-            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
+            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(
+                format(explained_variance[i] * 100, ".3f") + "%",
+                (iso_forest_df.shape[0], 1),
+            )
         for i in range(n_components):
-            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
+            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(
+                singular_values[i], (iso_forest_df.shape[0], 1)
+            )
+        if hue:
+            iso_forest_df[hue] = y
+    # * Apply Kernel PCA if selected
+    elif method == "kpca":
+        from sklearn.decomposition import KernelPCA
+
+        kpca = KernelPCA(
+            n_components=n_components, kernel="rbf", random_state=random_state
+        )
+        X_reduced = kpca.fit_transform(X)
+
+        # Prepare reduced DataFrame with KPCA info
+        kpca_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"KPCA_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Kernel PCA completed with RBF kernel.")
+        if hue:
+            kpca_df[hue] = y
+    # * Apply ICA if selected
+    elif method == "ica":
+        from sklearn.decomposition import FastICA
+
+        ica = FastICA(n_components=n_components, random_state=random_state)
+        X_reduced = ica.fit_transform(X)
+
+        # Prepare reduced DataFrame with ICA info
+        ica_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"ICA_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Independent Component Analysis (ICA) completed.")
+        if hue:
+            ica_df[hue] = y
+    # * Apply MDS if selected
+    elif method == "mds":
+        from sklearn.manifold import MDS
+
+        mds = MDS(n_components=n_components, random_state=random_state)
+        X_reduced = mds.fit_transform(X)
+
+        # Prepare reduced DataFrame with MDS info
+        mds_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"MDS_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Multidimensional Scaling (MDS) completed.")
+        if hue:
+            mds_df[hue] = y
+    # * Apply Locally Linear Embedding (LLE) if selected
+    elif method == "lle":
+        from sklearn.manifold import LocallyLinearEmbedding
+
+        lle = LocallyLinearEmbedding(
+            n_components=n_components,
+            n_neighbors=umap_neighbors,
+            random_state=random_state,
+        )
+        X_reduced = lle.fit_transform(X)
+
+        # Prepare reduced DataFrame with LLE info
+        lle_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"LLE_{i+1}" for i in range(n_components)],
+        )
+        if debug:
+            print("Locally Linear Embedding (LLE) completed.")
+        if hue:
+            lle_df[hue] = y
+    # * Apply Singular Value Decomposition (SVD) if selected
+    elif method == "svd":
+        # Using NumPy's SVD for dimensionality reduction
+        U, s, Vt = np.linalg.svd(X, full_matrices=False)
+        X_reduced = U[:, :n_components] * s[:n_components]
+
+        # Prepare reduced DataFrame with SVD info
+        svd_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"SVD_{i+1}" for i in range(n_components)],
+        )
+        if hue:
+            svd_df[hue] = y
+        if debug:
+            print("Singular Value Decomposition (SVD) completed.")

     # Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
         colname_met = "PC_"
-
-
-            data=pca_df,
-            x="PC_1",
-            y="PC_2",
-            # hue="condition",
-        )
+        xlabel = f"PC_1 ({pca_df['Explained Variance PC_1'].tolist()[0]})"
+        ylabel = f"PC_2 ({pca_df['Explained Variance PC_2'].tolist()[0]})"
     elif method == "umap":
         reduced_df = umap_df
         colname_met = "UMAP_"
-        if plot_:
-            sns.scatterplot(
-                data=umap_df,
-                x="UMAP_1",
-                y="UMAP_2",
-                # hue="condition",
-            )
     elif method == "tsne":
         reduced_df = tsne_df
-        colname_met = "
-        if plot_:
-            sns.scatterplot(
-                data=tsne_df,
-                x="tSNE_1",
-                y="tSNE_2",
-                # hue="batch",
-            )
+        colname_met = "tSNE_"
     elif method == "factor":
         reduced_df = factor_df
         colname_met = "Factor_"
-        if plot_:
-            sns.scatterplot(
-                data=factor_df,
-                x="Factor_1",
-                y="Factor_2",
-                # hue="batch",
-            )
     elif method == "isolation_forest":
         reduced_df = iso_forest_df  # Already a DataFrame for outliers
         colname_met = "PC_"
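The new "svd" branch above performs a plain rank-k truncation; as a sketch on assumed data, the projected coordinates U[:, :k] * s[:k] are the same as projecting X onto the first k right singular vectors:

    import numpy as np

    X = np.random.RandomState(0).rand(30, 10)
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    X_k = U[:, :3] * s[:3]  # equals X @ Vt[:3].T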
@@ -5862,7 +6665,8 @@ def df_reducer(
             data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
             x="PC_1",
             y="PC_2",
-            label="normal",
+            label="normal",
+            c="b",
         )
         ax = sns.scatterplot(
             ax=ax,
@@ -5870,29 +6674,80 @@ def df_reducer(
             x="PC_1",
             y="PC_2",
             c="r",
-            label="outlier",
+            label="outlier",
+            marker="+",
+            s=30,
         )
+    elif method == "lda":
+        reduced_df = lda_df
+        colname_met = "LDA_"
+    elif method == "kpca":
+        reduced_df = kpca_df
+        colname_met = "KPCA_"
+    elif method == "ica":
+        reduced_df = ica_df
+        colname_met = "ICA_"
+    elif method == "mds":
+        reduced_df = mds_df
+        colname_met = "MDS_"
+    elif method == "lle":
+        reduced_df = lle_df
+        colname_met = "LLE_"
+    elif method == "svd":
+        reduced_df = svd_df
+        colname_met = "SVD_"
+    # Quick plots
+    if plot_ and (not method in ["isolation_forest"]):
+        from .plot import plotxy

+        if ax is None:
+            if figsize is None:
+                _, ax = plt.subplots(figsize=cm2inch(8, 8))
+            else:
+                _, ax = plt.subplots(figsize=figsize)
+        else:
+            ax = ax.cla()
+        ax = plotxy(
+            data=reduced_df,
+            x=colname_met + "1",
+            y=colname_met + "2",
+            hue=hue,
+            s=1,
+            edgecolor="none",
+            kind="scater",
+            figsets=dict(
+                legend=dict(loc="best", markerscale=4),
+                xlabel=xlabel if xlabel else None,
+                ylabel=ylabel if ylabel else None,
+            ),
+            ax=ax,
+            verbose=False,
+            **kwargs,
+        )

     if inplace:
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+            data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
         # Add extra info for PCA/UMAP
         if method == "pca":
             for i in range(n_components):
-                data[f"Explained Variance PC_{i+1}"] = reduced_df[
+                data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[
+                    :, f"Explained Variance PC_{i+1}"
+                ]
             for i in range(n_components):
-                data[f"Singular Values PC_{i+1}"] = reduced_df[
-
+                data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[
+                    :, f"Singular Values PC_{i+1}"
+                ]
+        elif method == "umap":
             for i in range(n_components):
-                data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
-                data["Embedding"] = reduced_df["Embedding"]
-                data["Trustworthiness"] = reduced_df["Trustworthiness"]
+                data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
+            data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
+            data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
+
         return None  # No return when inplace=True
-

-    return reduced_df
+    return reduced_df


     # example:
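The switch from plain column assignment to .loc writes in the inplace branch above is the usual way to write back into a frame that may be a view; a tiny sketch of the pattern (assumed names mirroring the code):

    import pandas as pd

    data = pd.DataFrame({"a": [1, 2, 3]})
    reduced_df = pd.DataFrame({"PC_1": [0.1, 0.2, 0.3]}, index=data.index)
    data.loc[:, "PC_1"] = reduced_df.loc[:, "PC_1"]  # index-aligned assignment on the original frame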
@@ -5922,6 +6777,7 @@ def plot_cluster(
     """
     import seaborn as sns
     from sklearn.metrics import silhouette_samples
+    import matplotlib.pyplot as plt

     if metrics is None:
         metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
@@ -6152,10 +7008,10 @@ def use_pd(
     verbose=True,
     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
 ):
-    default_settings = fload(dir_json, output=
+    default_settings = fload(dir_json, output="json")
     valid_kinds = list(default_settings.keys())
     kind = strcmp(func_name, valid_kinds)[0]
-    usage=default_settings[kind]
+    usage = default_settings[kind]
     if verbose:
         for i, i_ in enumerate(ssplit(usage, by=",")):
             i_ = i_.replace("=", "\t= ") + ","
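use_pd fuzzy-matches a pandas function name against the bundled usages_pd.json and prints its arguments one per line. A hedged, standard-library sketch of the same lookup pattern (file path and keys are hypothetical; the real function relies on py2ls' fload and strcmp helpers):

    import json, difflib

    with open("usages_pd.json") as fh:  # assumed local copy of the bundled JSON
        usages = json.load(fh)
    kind = difflib.get_close_matches("read_csv", list(usages), n=1)[0]
    print(usages[kind])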