py2ls 0.2.4.7__py3-none-any.whl → 0.2.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/batman.py +32 -1
- py2ls/bio.py +3 -17
- py2ls/data/usages_sns.json +2 -1
- py2ls/ips.py +1136 -691
- py2ls/ml2ls.py +1841 -390
- py2ls/plot.py +499 -214
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.8.dist-info}/METADATA +2 -2
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.8.dist-info}/RECORD +10 -10
- {py2ls-0.2.4.7.dist-info → py2ls-0.2.4.8.dist-info}/WHEEL +1 -1
py2ls/ips.py
CHANGED
@@ -1,62 +1,38 @@
 import numpy as np
-import pandas as pd
-
-import
-import matplotlib
-import matplotlib.pyplot as plt
-import matplotlib.ticker as tck
-from cycler import cycler
-from mpl_toolkits.mplot3d import Axes3D
-import seaborn as sns
-
-from sklearn.kernel_approximation import KERNEL_PARAMS
-from sympy import is_increasing
-import sys, os, shutil, re, yaml, json, subprocess
-import importlib.util
-import time
-from dateutil import parser
-from datetime import datetime
-import schedule
-
-from PIL import Image, ImageEnhance, ImageOps, ImageFilter
-from rembg import remove, new_session
-
-import docx
-from fpdf import FPDF
-from lxml import etree
-from docx import Document
-from PyPDF2 import PdfReader
-from pptx import Presentation
-from pptx.util import Inches
-from pdf2image import convert_from_path, pdfinfo_from_path
-from nltk.tokenize import sent_tokenize, word_tokenize
-import nltk # nltk.download("punkt")
-from docx2pdf import convert
-import img2pdf as image2pdf
-import nbformat
-from nbconvert import MarkdownExporter
-
-from itertools import pairwise
-from box import Box, BoxList
-from numerizer import numerize
-from tqdm import tqdm
-import mimetypes
-from pprint import pp
-from collections import Counter
-from fuzzywuzzy import fuzz, process
-from langdetect import detect
-from duckduckgo_search import DDGS
+import pandas as pd
+import sys, os
+from IPython.display import display
 from typing import List, Optional, Union
-from bs4 import BeautifulSoup
-
-from . import netfinder
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
 except NameError:
     pass

+import warnings
+warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
+warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+
+def run_once_within(duration=60):  # default 60s
+    import time
+    """
+    usage:
+    if run_once_within():
+        print("This code runs once per minute.")
+    else:
+        print("The code has already been run in the last minute.")
+    """
+    if not hasattr(run_once_within, "time_last"):
+        run_once_within.time_last = None
+    time_curr = time.time()
+
+    if (run_once_within.time_last is None) or (time_curr - run_once_within.time_last >= duration):
+        run_once_within.time_last = time_curr  # Update the last execution time
+        return True
+    else:
+        return False
+
+
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
     """
     Add the Chinese (default) font to the font manager
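Note: the headline refactor in 0.2.4.8 moves almost every top-level import into the function bodies that actually use them, so `import py2ls.ips` no longer pays the startup cost of heavy optional dependencies (nltk, rembg, PyPDF2, ...). The other addition here, `run_once_within`, throttles repeated side effects by stashing the last call time on the function object itself. A minimal sketch of that throttling idea (hypothetical names, not the library's API):

    import time

    def throttled(duration=60):
        """Return True at most once per `duration` seconds."""
        now = time.time()
        last = getattr(throttled, "time_last", None)  # function attribute as the memo
        if last is None or now - last >= duration:
            throttled.time_last = now
            return True
        return False

    if throttled():
        print("shown at most once per minute")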
@@ -155,6 +131,8 @@ def run_every(when: str = None, job=None, wait: int = 60):
     :param when: String specifying the interval, e.g. '2 minutes', '4 hours', '1 day'.
     :param job: The function to be scheduled.
     """
+    import schedule
+    import time
     if job is None:
         print("No job provided!")
         return
@@ -200,6 +178,8 @@ def run_at(when: str, job=None, wait: int = 60):
     :param job: The function to be scheduled.
     :param wait: The sleep interval between checks in seconds.
     """
+    from datetime import datetime
+    import time
     if job is None:
         print("No job provided!")
         return
@@ -279,6 +259,7 @@ def get_timezone(timezone: str | list = None):

 def is_package_installed(package_name):
     """Check if a package is installed."""
+    import importlib.util
     package_spec = importlib.util.find_spec(package_name)
     return package_spec is not None

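`importlib.util.find_spec` is a good fit here: it resolves a module's spec without executing the module, so the check is cheap and side-effect free. A quick illustration:

    import importlib.util

    def is_installed(name: str) -> bool:
        # find_spec returns None when the module cannot be located
        return importlib.util.find_spec(name) is not None

    print(is_installed("json"))         # True (stdlib)
    print(is_installed("no_such_pkg"))  # False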
@@ -291,6 +272,7 @@ def upgrade(module="py2ls",uninstall=False):
     module (str): The name of the module to install/upgrade.
     uninstall (bool): If True, uninstalls the webdriver-manager before upgrading.
     """
+    import subprocess
     if not is_package_installed(module):
         try:
             subprocess.check_call([sys.executable, "-m", "pip", "install", module])
@@ -327,6 +309,7 @@ def get_version(pkg):


 def rm_folder(folder_path, verbose=True):
+    import shutil
     try:
         shutil.rmtree(folder_path)
         if verbose:
@@ -345,6 +328,7 @@ def fremove(path, verbose=True):
     """
     try:
         if os.path.isdir(path):
+            import shutil
             shutil.rmtree(path)
             if verbose:
                 print(f"Successfully deleted folder {path}")
@@ -360,23 +344,30 @@ def fremove(path, verbose=True):
         print(f"Failed to delete {path}. Reason: {e}")


-def get_cwd(verbose: bool = True):
-    """
-    get_cwd: to get the current working directory
-    Args:
-        verbose (bool, optional): to show which function is use. Defaults to True.
-    """
-    try:
-        script_dir = os.path.dirname(os.path.abspath(__file__))
-        if verbose:
-            print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
-    except NameError:
-        # This works in an interactive environment (like a Jupyter notebook)
-        script_dir = os.getcwd()
-        if verbose:
-            print("os.getcwd():", script_dir)
-    return script_dir
+# def get_cwd(verbose: bool = True):
+#     """
+#     get_cwd: to get the current working directory
+#     Args:
+#         verbose (bool, optional): to show which function is use. Defaults to True.
+#     """
+#     try:
+#         script_dir = os.path.dirname(os.path.abspath(__file__))
+#         if verbose:
+#             print("os.path.dirname(os.path.abspath(__file__)):", script_dir)
+#     except NameError:
+#         # This works in an interactive environment (like a Jupyter notebook)
+#         script_dir = os.getcwd()
+#         if verbose:
+#             print("os.getcwd():", script_dir)
+#     return script_dir
+
+
+def get_cwd():
+    from pathlib import Path
+    # Get the current script's directory as a Path object
+    current_directory = Path(__file__).resolve().parent
+
+    return current_directory

 def search(
     query,
@@ -388,7 +379,7 @@ def search(
     dir_save=dir_save,
     **kwargs,
 ):
-
+    from duckduckgo_search import DDGS
     if "te" in kind.lower():
         results = DDGS().text(query, max_results=limit)
         res = pd.DataFrame(results)
@@ -421,7 +412,7 @@ def echo(*args, **kwargs):
         str: the answer from ai
     """
     global dir_save
-
+    from duckduckgo_search import DDGS
     query = None
     model = kwargs.get("model", "gpt")
     verbose = kwargs.get("verbose", True)
@@ -469,8 +460,11 @@ def echo(*args, **kwargs):
     model_valid = valid_mod_name(model)
     res = DDGS().chat(query, model=model_valid)
     if verbose:
+        from pprint import pp
         pp(res)
     if log:
+        from datetime import datetime
+        import time
         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"\n\n####Q:{query}\n\n#####Ans:{dt_str}\n\n>{res}\n"
         if bool(os.path.basename(dir_save)):
@@ -492,6 +486,7 @@ def ai(*args, **kwargs):


 def detect_lang(text, output="lang", verbose=True):
+    from langdetect import detect
     dir_curr_script = os.path.dirname(os.path.abspath(__file__))
     dir_lang_code = dir_curr_script + "/data/lang_code_iso639.json"
     print(dir_curr_script, os.getcwd(), dir_lang_code)
@@ -550,6 +545,7 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         for lst in flattened_lists[1:]:
             shared_elements.intersection_update(lst)
     else:
+        from collections import Counter
         all_elements = [item for sublist in flattened_lists for item in sublist]
         element_count = Counter(all_elements)
         # Get elements that appear in at least n_shared lists
@@ -571,9 +567,9 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
     not_shared(list1,list2)# output [1,3]
     """
     _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
-    list1 = args[0]
+    list1 = flatten(args[0], verbose=verbose)
     _not_shared=[item for item in list1 if item not in _common]
-    return
+    return _not_shared


 def flatten(nested: Any, unique_list=True, verbose=False):
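This hunk fixes a real bug: `not_shared` previously ended with a bare `return` (always `None`) and compared against the unflattened first argument. The intended semantics, as a self-contained sketch (simplified to plain lists, without the library's flattening helper):

    def shared_(a, b):
        return set(a) & set(b)

    def not_shared_(a, b):
        common = shared_(a, b)
        return [x for x in a if x not in common]

    print(not_shared_([1, 2, 3], [2, 4]))  # [1, 3], matching the docstring example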
@@ -617,7 +613,7 @@ def strcmp(search_term, candidates, ignore_case=True,get_rank=False, verbose=Fal
     Returns:
     tuple: A tuple containing the best match and its index in the candidates list.
     """
-
+    from fuzzywuzzy import fuzz, process
     def to_lower(s, ignore_case=True):
         # Converts a string or list of strings to lowercase if ignore_case is True.
         if ignore_case:
@@ -743,6 +739,7 @@ def cn2pinyin(
     return pinyin_flat

 def counter(list_, verbose=True):
+    from collections import Counter
     c = Counter(list_)
     # Print the name counts
     for item, count in c.items():
@@ -771,7 +768,7 @@ def str2time(time_str, fmt="24"):
         %p represents AM or PM.
     - str: The converted time string.
     """
-
+    from datetime import datetime
     def time_len_corr(time_str):
         time_str_ = (
             ssplit(time_str, by=[":", " ", "digital_num"]) if ":" in time_str else None
@@ -832,6 +829,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):
     Returns:
     - str: The converted date string.
     """
+    from dateutil import parser
     try:
         date_obj = parser.parse(date_str)
     except ValueError as e:
@@ -848,6 +846,7 @@ def str2date(date_str, fmt="%Y-%m-%d_%H:%M:%S"):


 def str2num(s, *args, **kwargs):
+    import re
     delimiter = kwargs.get("sep", None)
     round_digits = kwargs.get("round", None)
     if delimiter is not None:
@@ -863,6 +862,7 @@ def str2num(s, *args, **kwargs):
     try:
         num = float(s)
     except ValueError:
+        from numerizer import numerize
         try:
             numerized = numerize(s)
             num = int(numerized) if "." not in numerized else float(numerized)
@@ -1030,7 +1030,7 @@ def px2inch(*px, dpi=300) -> list:
     return [i / dpi for i in px]


-def
+def inch2cm(*cm) -> list:
     """
     cm2inch: converts centimeter measurements to inches.
     Usage:
@@ -1051,24 +1051,31 @@ def cm2inch(*cm) -> list:
 def inch2px(*inch, dpi=300) -> list:
     """
     inch2px: converts inch measurements to pixels based on the given dpi.
+
     Usage:
     inch2px(1, 2, dpi=300); inch2px([1, 2], dpi=300)
+
+    Parameters:
+    inch : float, list, or tuple
+        Single or multiple measurements in inches to convert to pixels.
+    dpi : int, optional (default=300)
+        Dots per inch (DPI), representing the pixel density.
+
     Returns:
-    list: in pixels
+    list: Converted measurements in pixels.
     """
-    # Case 1: When the user passes a single argument that is a list or tuple,
+    # Case 1: When the user passes a single argument that is a list or tuple, e.g., inch2px([1, 2]) or inch2px((1, 2))
     if len(inch) == 1 and isinstance(inch[0], (list, tuple)):
-        # If the input is a single list or tuple, we unpack its elements and convert each to pixels
         return [i * dpi for i in inch[0]]
-
+
+    # Case 2: When the user passes multiple arguments directly, e.g., inch2px(1, 2)
     else:
-        # Here, we convert each individual argument directly to pixels
         return [i * dpi for i in inch]


-
+
+def cm2inch(*inch) -> list:
     """
-    inch2cm: converts inch measurements to centimeters.
     Usage:
     inch2cm(8,5); inch2cm((8,5)); inch2cm([8,5])
     Returns:
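The conversion helpers are all linear scalings: px = inch × dpi, inch = px / dpi, cm = inch × 2.54. For example:

    dpi = 300
    inches = [8.5, 11]                   # US letter
    pixels = [i * dpi for i in inches]   # inch2px -> [2550, 3300]
    back = [p / dpi for p in pixels]     # px2inch -> [8.5, 11.0]
    cms = [i * 2.54 for i in inches]     # inch -> cm: [21.59, 27.94]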
@@ -1183,6 +1190,7 @@ def paper_size(paper_type_str="a4"):


 def docx2pdf(dir_docx, dir_pdf=None):
+    from docx2pdf import convert
     if dir_pdf:
         convert(dir_docx, dir_pdf)
     else:
@@ -1190,6 +1198,7 @@ def docx2pdf(dir_docx, dir_pdf=None):


 def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=300):
+    import img2pdf as image2pdf
     def mm_to_point(size):
         return (image2pdf.mm_to_pt(size[0]), image2pdf.mm_to_pt(size[1]))

@@ -1241,6 +1250,9 @@ def img2pdf(dir_img, kind="jpeg", page=None, dir_save=None, page_size="a4", dpi=


 def pdf2ppt(dir_pdf, dir_ppt):
+    from PyPDF2 import PdfReader
+    from pptx.util import Inches
+    from pptx import Presentation
     prs = Presentation()

     # Open the PDF file
@@ -1269,6 +1281,7 @@ def pdf2ppt(dir_pdf, dir_ppt):


 def ssplit(text, by="space", verbose=False, strict=False, **kws):
+    import re
     if isinstance(text, list):
         nested_list = [ssplit(i, by=by, verbose=verbose, **kws) for i in text]
         flat_list = [item for sublist in nested_list for item in sublist]
@@ -1316,6 +1329,8 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
         return [text[i : i + length] for i in range(0, len(text), length)]

     def split_by_sent_num(text, n=10):
+        from nltk.tokenize import sent_tokenize
+        from itertools import pairwise
         # split text into sentences
         text_split_by_sent = sent_tokenize(text)
         cut_loc_array = np.arange(0, len(text_split_by_sent), n)
@@ -1388,10 +1403,12 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):
             print(f"splited by camel_case")
         return split_by_camel_case(text)
     elif ("word" in by) and not strict:
+        from nltk.tokenize import word_tokenize
         if verbose:
             print(f"splited by word")
         return word_tokenize(text)
     elif ("sen" in by and not "num" in by) and not strict:
+        from nltk.tokenize import sent_tokenize
         if verbose:
             print(f"splited by sentence")
         return sent_tokenize(text)
@@ -1441,9 +1458,11 @@ def ssplit(text, by="space", verbose=False, strict=False, **kws):


 def pdf2img(dir_pdf, dir_save=None, page=None, kind="png", verbose=True, **kws):
+    from pdf2image import convert_from_path, pdfinfo_from_path
     df_dir_img_single_page = pd.DataFrame()
     dir_single_page = []
     if verbose:
+        from pprint import pp
         pp(pdfinfo_from_path(dir_pdf))
     if isinstance(page, tuple) and page:
         page = list(page)
@@ -1562,6 +1581,7 @@ def unzip(dir_path, output_dir=None):
     # If the output directory already exists, remove it and replace it
     if os.path.exists(output_dir):
         if os.path.isdir(output_dir):  # check if it is a folder
+            import shutil
             shutil.rmtree(output_dir)  # remove folder
         else:
             os.remove(output_dir)  # remove file
@@ -1579,6 +1599,7 @@ def unzip(dir_path, output_dir=None):

         output_file = os.path.splitext(dir_path)[0]  # remove the .gz extension
         try:
+            import shutil
             with gzip.open(dir_path, "rb") as gz_file:
                 with open(output_file, "wb") as out_file:
                     shutil.copyfileobj(gz_file, out_file)
@@ -1676,11 +1697,13 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:

     """
     if not isinstance(df, pd.DataFrame):
+        if verbose:
+            print('not pd.DataFrame')
         return False
     df.columns = df.columns.astype(str)  # cast to str so the delimiter counts below can be computed
     # Initialize a list to hold messages about abnormalities
     messages = []
-    is_abnormal =
+    is_abnormal = False
     # Check the shape of the DataFrame
     actual_shape = df.shape
     messages.append(f"Shape of DataFrame: {actual_shape}")
@@ -1705,25 +1728,29 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'len(column_names) == 1 and delimiter_counts["\t"] > 1')
-
+    if verbose:
+        print("1",is_abnormal)
     if any(delimiter_counts[d] > 3 for d in delimiter_counts if d != ""):
         messages.append("Abnormal: Too many delimiters in column names.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in delimiter_counts if d != "")')
-
+    if verbose:
+        print("2",is_abnormal)
     if delimiter_counts[""] > 3:
         messages.append("Abnormal: There are empty column names.")
         is_abnormal = True
         if verbose:
             print(f'delimiter_counts[""] > 3')
-
+    if verbose:
+        print("3",is_abnormal)
     if any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"]):
         messages.append("Abnormal: Some column names contain unexpected characters.")
         is_abnormal = True
         if verbose:
             print(f'any(delimiter_counts[d] > 3 for d in ["\t", ",", "\n"])')
-
+    if verbose:
+        print("4",is_abnormal)
     # # Check for missing values
     # missing_values = df.isnull().sum()
     # if missing_values.any():
@@ -1743,7 +1770,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'df.columns[df.nunique() == 1].tolist()')
-
+    if verbose:
+        print("5",is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1752,7 +1780,8 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
         is_abnormal = True
         if verbose:
             print(f'actual_shape[0] < 2 or actual_shape[1] < 2')
-
+    if verbose:
+        print("6",is_abnormal)
     # Compile results
     if verbose:
         print("\n".join(messages))
@@ -1769,7 +1798,24 @@ def fload(fpath, kind=None, **kwargs):
     Returns:
         content: The content loaded from the file.
     """
-
+    def read_mplstyle(style_file):
+        import matplotlib.pyplot as plt
+        # Load the style file
+        plt.style.use(style_file)
+
+        # Get the current style properties
+        style_dict = plt.rcParams
+
+        # Convert to dictionary
+        style_dict = dict(style_dict)
+        # Print the style dictionary
+        for i, j in style_dict.items():
+            print(f"\n{i}::::{j}")
+        return style_dict
+    # #example usage:
+    # style_file = "/ std-colors.mplstyle"
+    # style_dict = read_mplstyle(style_file)
+
     def load_txt_md(fpath):
         with open(fpath, "r") as file:
             content = file.read()
@@ -1785,6 +1831,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_json(fpath, **kwargs):
         output=kwargs.pop("output","json")
         if output=='json':
+            import json
             with open(fpath, "r") as file:
                 content = json.load(file)
             return content
@@ -1792,12 +1839,14 @@ def fload(fpath, kind=None, **kwargs):
             return pd.read_json(fpath,**kwargs)

     def load_yaml(fpath):
+        import yaml
         with open(fpath, "r") as file:
             content = yaml.safe_load(file)
         return content


     def load_xml(fpath, fsize_thr: int = 100):
+        from lxml import etree
         def load_small_xml(fpath):
             tree = etree.parse(fpath)
             root = tree.getroot()
@@ -1856,6 +1905,15 @@ def fload(fpath, kind=None, **kwargs):
             if line.startswith(char):
                 return char
         return None
+
+    def _get_chunks(df_fake):
+        """
+        helper func for 'load_csv'
+        """
+        chunks = []
+        for chunk in df_fake:
+            chunks.append(chunk)
+        return pd.concat(chunks, ignore_index=True)

     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
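`_get_chunks` exists because `pd.read_csv(..., chunksize=n)` returns a `TextFileReader` iterator rather than a DataFrame; concatenating the chunks bounds peak parse memory to one chunk at a time. A minimal sketch of the same pattern, assuming a plain CSV path:

    import pandas as pd

    def read_csv_chunked(path, chunksize=100_000, **kwargs):
        # the loader pairs chunking with engine="c" and low_memory=False
        reader = pd.read_csv(path, chunksize=chunksize, engine="c",
                             low_memory=False, **kwargs)
        return pd.concat(reader, ignore_index=True)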
@@ -1869,16 +1927,19 @@ def fload(fpath, kind=None, **kwargs):
         on_bad_lines = kwargs.pop("on_bad_lines", "skip")
         comment = kwargs.pop("comment", None)
         fmt=kwargs.pop("fmt",False)
+        chunksize=kwargs.pop("chunksize", None)
+        engine='c' if chunksize else engine # when chunksize, recommend 'c'
+        low_memory=kwargs.pop("low_memory",True)
+        low_memory=False if chunksize else True # when chunksize, recommend low_memory=False
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("read_csv", verbose=verbose)
-            return

         if comment is None:
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
+
         try:
             df = pd.read_csv(
                 fpath,
@@ -1890,14 +1951,19 @@ def fload(fpath, kind=None, **kwargs):
                 skipinitialspace=skipinitialspace,
                 sep=sep,
                 on_bad_lines=on_bad_lines,
+                chunksize=chunksize,
+                low_memory=low_memory,
                 **kwargs,
             )
-            if
+            if chunksize:
+                df=_get_chunks(df)
+                print(df.shape)
+            if is_df_abnormal(df, verbose=0): # raise error
                 raise ValueError("the df is abnormal")
         except:
             try:
                 try:
-                    if engine == "pyarrow":
+                    if engine == "pyarrow" and not chunksize:
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1906,6 +1972,7 @@ def fload(fpath, kind=None, **kwargs):
                             sep=sep,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            low_memory=low_memory,
                             **kwargs,
                         )
                     else:
@@ -1919,14 +1986,19 @@ def fload(fpath, kind=None, **kwargs):
                             skipinitialspace=skipinitialspace,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            chunksize=chunksize,
+                            low_memory=low_memory,
                             **kwargs,
                         )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
                 except (UnicodeDecodeError, ValueError):
                     encoding = get_encoding(fpath)
                     # print(f"utf-8 failed. Retrying with detected encoding: {encoding}")
-                    if engine == "pyarrow":
+                    if engine == "pyarrow" and not chunksize:
                         df = pd.read_csv(
                             fpath,
                             engine=engine,
@@ -1935,6 +2007,7 @@ def fload(fpath, kind=None, **kwargs):
                             sep=sep,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            low_memory=low_memory,
                             **kwargs,
                         )
                     else:
@@ -1948,8 +2021,13 @@ def fload(fpath, kind=None, **kwargs):
                             skipinitialspace=skipinitialspace,
                             on_bad_lines=on_bad_lines,
                             comment=comment,
+                            chunksize=chunksize,
+                            low_memory=low_memory,
                             **kwargs,
                         )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                     if is_df_abnormal(df, verbose=0):
                         raise ValueError("the df is abnormal")
             except Exception as e:
@@ -1966,8 +2044,13 @@ def fload(fpath, kind=None, **kwargs):
                         sep=sep,
                         on_bad_lines=on_bad_lines,
                         comment=comment,
+                        chunksize=chunksize,
+                        low_memory=low_memory,
                         **kwargs,
                     )
+                    if chunksize:
+                        df=_get_chunks(df)
+                        print(df.shape)
                     if not is_df_abnormal(df, verbose=0):  # normal
                         display(df.head(2))
                         print(f"shape: {df.shape}")
@@ -1975,32 +2058,38 @@ def fload(fpath, kind=None, **kwargs):
         except:
             pass
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            display(df.head(2))
-            print(f"
-
-
-
-
-
+            if not chunksize:
+                engines = [None,"c", "python"]
+                for engine in engines:
+                    separators = [",", "\t", ";", "|", " "]
+                    for sep in separators:
+                        try:
+                            # sep2show = sep if sep != "\t" else "\\t"
+                            # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                            # print(".")
+                            df = pd.read_csv(
+                                fpath,
+                                engine=engine,
+                                sep=sep,
+                                on_bad_lines=on_bad_lines,
+                                comment=comment,
+                                chunksize=chunksize,
+                                low_memory=low_memory,
+                                **kwargs,
+                            )
+                            # display(df.head(2))
+                            # print(f"is_df_abnormal:{is_df_abnormal(df, verbose=0)}")
+                            if chunksize:
+                                df=_get_chunks(df)
+                                print(df.shape)
+                            if not is_df_abnormal(df, verbose=0):
+                                display(df.head(2)) if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                                print(f"shape: {df.shape}") if isinstance(df, pd.DataFrame) else display("it is not a DataFrame")
+                                return df
+                        except EmptyDataError as e:
+                            continue
+            else:
+                pass
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
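The rewritten fallback replaces the old dead block with an explicit brute-force cascade: every engine in [None, "c", "python"] is tried against every candidate separator until `is_df_abnormal` accepts the result. A lighter-weight alternative for the separator half of that search is the stdlib sniffer; a sketch, assuming a readable text file:

    import csv

    def guess_sep(path, candidates=",\t;| "):
        with open(path, newline="", errors="replace") as f:
            sample = f.read(64_000)
        try:
            return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
        except csv.Error:
            return ","  # fall back to comma when sniffing fails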
@@ -2008,7 +2097,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2039,7 +2128,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)

-        if
+        if run_once_within():
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2056,6 +2145,8 @@ def fload(fpath, kind=None, **kwargs):
         return df

     def load_ipynb(fpath, **kwargs):
+        import nbformat
+        from nbconvert import MarkdownExporter
         as_version = kwargs.get("as_version", 4)
         with open(fpath, "r") as file:
             nb = nbformat.read(file, as_version=as_version)
@@ -2085,6 +2176,7 @@ def fload(fpath, kind=None, **kwargs):
         If page is an integer, it returns the text of the specified page number.
         If the specified page is not found, it returns the string "Page is not found".
         """
+        from PyPDF2 import PdfReader
         text_dict = {}
         with open(fpath, "rb") as file:
             pdf_reader = PdfReader(file)
@@ -2114,6 +2206,7 @@ def fload(fpath, kind=None, **kwargs):
             return text_dict.get(int(page), "Page is not found")

     def load_docx(fpath):
+        from docx import Document
         doc = Document(fpath)
         content = [para.text for para in doc.paragraphs]
         return content
@@ -2174,8 +2267,16 @@
     elif kind == "xml":
         return load_xml(fpath)
     elif kind in ["csv","tsv"]:
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
+    elif kind=='pkl':
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_pickle")
+        return pd.read_pickle(fpath,**kwargs)
     elif kind in ["ods", "ods", "odt"]:
         engine = kwargs.get("engine", "odf")
         kwargs.pop("engine", None)
@@ -2184,25 +2285,40 @@
         engine = kwargs.get("engine", "xlrd")
         kwargs.pop("engine", None)
         content = load_excel(fpath, engine=engine, **kwargs)
-        print(f"shape: {content.shape}")
-        display(content.head(3))
+        print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         return content
     elif kind == "xlsx":
         content = load_excel(fpath, **kwargs)
-        display(content.head(3))
-        print(f"shape: {content.shape}")
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
+        print(f"shape: {content.shape}") if isinstance(content, pd.DataFrame) else None
         return content
     elif kind=='mtx':
         from scipy.io import mmread
         dat_mtx=mmread(fpath)
         content=pd.DataFrame.sparse.from_spmatrix(dat_mtx,**kwargs)
-        display(content.head(3))
+        display(content.head(3)) if isinstance(content, pd.DataFrame) else None
         print(f"shape: {content.shape}")
         return content
     elif kind == "ipynb":
         return load_ipynb(fpath, **kwargs)
     elif kind in ['parquet','snappy']:
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_parquet")
         return load_parquet(fpath,**kwargs)
+    elif kind =='feather':
+        verbose=kwargs.pop('verbose',False)
+        if run_once_within():
+            use_pd("read_feather")
+        content=pd.read_feather(fpath,**kwargs)
+        return content
+    elif kind =='h5':
+        content=pd.read_hdf(fpath,**kwargs)
+        return content
+    elif kind =='pkl':
+        content=pd.read_pickle(fpath,**kwargs)
+        return content
     elif kind == "pdf":
         # print('usage:load_pdf(fpath, page="all", verbose=False)')
         return load_pdf(fpath, **kwargs)
@@ -2213,6 +2329,7 @@
         import GEOparse
         return GEOparse.get_GEO(filepath=fpath)
     elif kind.lower() in zip_types:
+        from pprint import pp
         keep = kwargs.get("keep", False)
         fpath_unzip = unzip(fpath)
         if os.path.isdir(fpath_unzip):
@@ -2247,6 +2364,9 @@
         meta, data = fcsparser.parse(fpath, reformat_meta=True)
         return meta, data

+    elif kind=="mplstyle":
+        return read_mplstyle(fpath)
+
     else:
         print("direct reading...")
         try:
@@ -2358,6 +2478,7 @@ def filter_kwargs(kws, valid_kwargs):
     }
     return kwargs_filtered

+str_space_speed='sapce cmp:parquet(0.56GB)<feather(1.14GB)<csv(6.55GB)<pkl=h5("26.09GB")\nsaving time: pkl=feather("13s")<parquet("35s")<h5("2m31s")<csv("58m")\nloading time: pkl("6.9s")<parquet("16.1s")=feather("15s")<h5("2m 53s")<csv(">>>30m")'

 def fsave(
     fpath,
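`str_space_speed` records an informal benchmark the author ran: parquet is the smallest on disk, pickle and feather are the fastest to save and load, CSV is slowest by a wide margin. Numbers like these depend heavily on the data, so treat the following as a sketch for reproducing the timing half on your own frame (parquet needs pyarrow or fastparquet installed):

    import time
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.rand(1_000_000, 10))
    for name, save in [("parquet", lambda: df.to_parquet("/tmp/x.parquet")),
                       ("pickle", lambda: df.to_pickle("/tmp/x.pkl")),
                       ("csv", lambda: df.to_csv("/tmp/x.csv"))]:
        t0 = time.perf_counter()
        save()
        print(f"{name}: {time.perf_counter() - t0:.2f}s")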
@@ -2393,6 +2514,7 @@ def fsave(
     fappend(fpath, content=content)

 def save_docx(fpath, content, font_name, font_size, spacing):
+        import docx
         if isinstance(content, str):
             content = content.split(". ")
         doc = docx.Document()
@@ -2420,6 +2542,7 @@ def fsave(
         save_content(fpath, html_content, mode)

     def save_pdf(fpath, content, font_name, font_size):
+        from fpdf import FPDF
         pdf = FPDF()
         pdf.add_page()
         # pdf.add_font('Arial','',r'/System/Library/Fonts/Supplemental/Arial.ttf',uni=True)
@@ -2433,7 +2556,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

         verbose=kwargs.pop("verbose",False)
-        if
+        if run_once_within():
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2465,7 +2588,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose=kwargs.pop("verbose",False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if
+        if run_once_within():
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -2491,9 +2614,10 @@ def fsave(

     def save_ipynb(fpath, data, **kwargs):
         # Split the content by code fences to distinguish between code and markdown
+        import nbformat
         parts = data.split("```")
         cells = []
-
+
         for i, part in enumerate(parts):
             if i % 2 == 0:
                 # Even index: markdown content
@@ -2513,6 +2637,7 @@ def fsave(
         # json.dump(data, file, **kwargs)

     def save_json(fpath_fname, var_dict_or_df):
+        import json
         def _convert_js(data):
             if isinstance(data, pd.DataFrame):
                 return data.to_dict(orient="list")
@@ -2534,10 +2659,12 @@ def fsave(
     # # setss = jsonload("/.json")

     def save_yaml(fpath, data, **kwargs):
+        import yaml
         with open(fpath, "w") as file:
             yaml.dump(data, file, **kwargs)

     def save_xml(fpath, data):
+        from lxml import etree
         root = etree.Element("root")
         if isinstance(data, dict):
             for key, val in data.items():
@@ -2613,15 +2740,91 @@ def fsave(
     elif kind == "ipynb":
         save_ipynb(fpath, content, **kwargs)
     elif kind.lower() in ["parquet","pq","big","par"]:
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_parquet")
+            return None
         compression=kwargs.pop("compression",None)  # Use None for no compression. Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
         # fix the fpath ends
-
-
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'parquet')
         if compression is not None:
             if not fpath.endswith(compression):
                 fpath=fpath+f".{compression}"
         save_parquet(fpath=fpath, data=content,compression=compression,**kwargs)
+    elif kind.lower() in ["pkl","pk","pickle","pick"]:
+        # Pickle: Although not as efficient in terms of I/O speed and storage as Parquet or Feather,
+        # Pickle is convenient if you want to preserve exact Python object types.
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_pickle")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'pkl')
+        compression=kwargs.pop("compression",None)
+        if compression is not None:
+            if not fpath.endswith(compression["method"]):
+                fpath=fpath+f".{compression['method']}"
+        if isinstance(content, pd.DataFrame):
+            content.to_pickle(fpath,**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_pickle(fpath,**kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["fea",'feather','ft','fe','feat','fether']:
+        # Feather: The Feather format, based on Apache Arrow, is designed for fast I/O operations. It's
+        # optimized for data analytics tasks and is especially fast when working with Pandas.
+
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_feather")
+            return None
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'feather')
+        if isinstance(content, pd.DataFrame):
+            content.to_feather(fpath,**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_feather(fpath, **kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
+    elif kind.lower() in ["hd",'hdf','h','h5']:
+        # particularly useful for large datasets and can handle complex data structures
+        verbose=kwargs.pop('verbose',False)
+        if verbose:
+            print(str_space_speed)
+            use_pd("to_hdf")
+        _fpath, _ext = os.path.splitext(fpath)
+        fpath = _fpath+_ext.replace(kind, 'h5')
+        compression=kwargs.pop("compression",None)
+        if compression is not None:
+            if not fpath.endswith(compression):
+                fpath=fpath+f".{compression}"
+        if isinstance(content, pd.DataFrame):
+            content.to_hdf(fpath,key='content',**kwargs)
+        else:
+            try:
+                print("trying to convert it as a DataFrame...")
+                content=pd.DataFrame(content)
+                content.to_hdf(fpath,**kwargs)
+            except Exception as e:
+                raise ValueError(
+                    f"content is not a DataFrame, cannot be saved as a 'pkl' format: {e}"
+                )
     else:
+        from . import netfinder
         try:
             netfinder.downloader(url=content, dir_save=dirname(fpath), kind=kind)
         except:
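The new `fsave` branches normalize the extension from the alias (e.g. "pick" becomes ".pkl") and coerce non-DataFrame content via `pd.DataFrame(...)` before writing. Each alias group maps onto a standard pandas writer; the plain-pandas equivalents, for reference:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df.to_pickle("/tmp/data.pkl")             # kind in {"pkl","pk","pickle","pick"}
    df.to_feather("/tmp/data.feather")        # kind in {"fea","feather",...}; needs pyarrow
    df.to_hdf("/tmp/data.h5", key="content")  # kind in {"hd","hdf","h","h5"}; needs PyTables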
@@ -2744,6 +2947,7 @@ def isa(content, kind):
     elif "color" in kind.lower():  # file
         return is_str_color(content)
     elif "html" in kind.lower():
+        import re
         if content is None or not isinstance(content, str):
             return False
         # Remove leading and trailing whitespace
@@ -2903,6 +3107,7 @@ def listdir(
             display(f.head())
             return f
     else:
+        from box import Box
         if "l" in orient.lower():  # list # default
             res_output = Box(f.to_dict(orient="list"))
             return res_output
@@ -2943,13 +3148,10 @@ def mkdir_nest(fpath: str) -> str:
     Returns:
     - str: The path of the created directory.
     """
-
-
     # Split the full path into directories
     f_slash = "/" if "mac" in get_os().lower() else "\\"
     if os.path.isdir(fpath):
         fpath =fpath+f_slash if not fpath.endswith(f_slash) else fpath
-        print(fpath)
         return fpath
     dir_parts = fpath.split(f_slash)  # Split the path by the OS-specific separator

@@ -3020,7 +3222,7 @@ def mkdir(pardir: str = None, chdir: str | list = None, overwrite=False):
     if len(rootdir) == 1:
         rootdir = rootdir[0]
         rootdir=rootdir+stype if not rootdir.endswith(stype) else rootdir
-
+
     return rootdir


@@ -3032,6 +3234,8 @@ def split_path(fpath):


 def figsave(*args, dpi=300):
+    import matplotlib.pyplot as plt
+    from PIL import Image
     dir_save = None
     fname = None
     img = None
@@ -3047,13 +3251,13 @@ def figsave(*args, dpi=300):

     if dir_save is None:
         dir_save="./"
-
+
     # dir_save=dir_save+f_slash if not dir_save.endswith(f_slash) else dir_save
     dir_par = f_slash.join(dir_save.split(f_slash)[:-1])
     dir_ch = "".join(dir_save.split(f_slash)[-1:])
     if not dir_par.endswith(f_slash):
         dir_par += f_slash
-
+
     if fname is None:
         fname = dir_ch
     mkdir(dir_par)
@@ -3140,6 +3344,7 @@ def figsave(*args, dpi=300):
 def is_str_color(s):
     # Regular expression pattern for hexadecimal color codes
     if isinstance(s,str):
+        import re
         color_code_pattern = r"^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{8})$"
         return re.match(color_code_pattern, s) is not None
     else:
@@ -3166,6 +3371,7 @@ def isnum(s):


 def is_image(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and mime_type.startswith("image"):
         return True
@@ -3174,6 +3380,7 @@ def is_image(fpath):


 def is_document(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and (
         mime_type.startswith("text/")
@@ -3194,6 +3401,7 @@ def is_document(fpath):


 def is_zip(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type == "application/zip":
         return True
@@ -3202,6 +3410,7 @@ def is_zip(fpath):


 def adjust_spines(ax=None, spines=["left", "bottom"], distance=2):
+    import matplotlib.pyplot as plt
     if ax is None:
         ax = plt.gca()
     for loc, spine in ax.spines.items():
@@ -3290,7 +3499,7 @@ def apply_filter(img, *args):
     Returns:
         PIL.Image: The filtered image.
     """
-
+    from PIL import ImageFilter
     def correct_filter_name(filter_name):
         if "bl" in filter_name.lower() and "box" not in filter_name.lower():
             return "BLUR"
@@ -3532,6 +3741,8 @@ def imgsets(img, **kwargs):
         avg_contrast_factor = sum(contrast_factors) / num_channels
         return {"brightness": avg_brightness_factor, "contrast": avg_contrast_factor}

+    import matplotlib.pyplot as plt
+    from PIL import ImageEnhance,ImageOps
     # Load image if input is a file path
     if isinstance(img, str):
         img = load_img(img)
@@ -3595,6 +3806,7 @@ def imgsets(img, **kwargs):
         elif "pad" in k.lower():
             img_update = ImageOps.pad(img_update, size=value)
         elif "rem" in k.lower() or "rm" in k.lower() or "back" in k.lower():
+            from rembg import remove, new_session
             if isinstance(value, bool):
                 session = new_session("isnet-general-use")
                 img_update = remove(img_update, session=session)
@@ -3633,6 +3845,7 @@ def imgsets(img, **kwargs):
             else:
                 img_update = remove(img_update)
         elif "bg" in k.lower() and "color" in k.lower():
+            from rembg import remove
             if isinstance(value, list):
                 value = tuple(value)
             if isinstance(value, tuple):  # replace the background color
@@ -3664,6 +3877,8 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
     Args:
         dir_img_list (list): List of the Directory containing the images.
     """
+    import matplotlib.pyplot as plt
+    from PIL import Image
     num_images = len(dir_img_list)
     if not kind.startswith("."):
         kind = "." + kind
@@ -3700,28 +3915,15 @@ def thumbnail(dir_img_list, figsize=(10, 10), dpi=100, dir_save=None, kind=".png
 # usage:
 # fpath = "/Users/macjianfeng/Dropbox/github/python/py2ls/tests/xample_netfinder/images/"
 # thumbnail(listdir(fpath,'png').fpath.to_list(),dir_save=dirname(fpath))
-def read_mplstyle(style_file):
-    # Load the style file
-    plt.style.use(style_file)
-
-    # Get the current style properties
-    style_dict = plt.rcParams
-
-    # Convert to dictionary
-    style_dict = dict(style_dict)
-    # Print the style dictionary
-    for i, j in style_dict.items():
-        print(f"\n{i}::::{j}")
-    return style_dict


-# #example usage:
-# style_file = "/ std-colors.mplstyle"
-# style_dict = read_mplstyle(style_file)
-

 # search and fine the director of the libary, which installed at local
 def dir_lib(lib_oi):
+    """
+    # example usage:
+    # dir_lib("seaborn")
+    """
     import site

     # Get the site-packages directory
@@ -3739,23 +3941,6 @@ def dir_lib(lib_oi):
     print(f"Cannot find the {lib_oi} in site-packages directory.")
     return dir_list

-
-# example usage:
-# dir_lib("seaborn")
-
-"""
-# n = 7
-# clist = get_color(n, cmap="auto", how="linspace")  # get_color(100)
-# plt.figure(figsize=[8, 5], dpi=100)
-# x = np.linspace(0, 2 * np.pi, 50) * 100
-# y = np.sin(x)
-# for i in range(1, n + 1):
-#     plt.plot(x, y + i, c=clist[i - 1], lw=5, label=str(i))
-# plt.legend()
-# plt.ylim(-2, 20)
-# figsets(plt.gca(), {"style": "whitegrid"}) """
-
-
 class FileInfo:
     def __init__(
         self,
@@ -3832,6 +4017,7 @@ class FileInfo:


 def finfo(fpath):
+    import time
     fname, fmt = os.path.splitext(fpath)
     dir_par = os.path.dirname(fpath) + "/"
     data = {
@@ -3846,6 +4032,7 @@ def finfo(fpath):
     }
     extra_info = {}
     if data["kind"] == ".pdf":
+        from pdf2image import pdfinfo_from_path
         extra_info = pdfinfo_from_path(fpath)

     return FileInfo(
@@ -3860,18 +4047,7 @@ def finfo(fpath):
         extra_info=extra_info,
     )

-
 # ! format excel file
-import pandas as pd
-from datetime import datetime
-from openpyxl import load_workbook
-from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
-from openpyxl.utils import get_column_letter
-from openpyxl.worksheet.datavalidation import DataValidation
-from openpyxl.comments import Comment
-from openpyxl.formatting.rule import ColorScaleRule
-
-
 def hex2argb(hex_color):
     """
     Convert a hex color code to aARGB format required by openpyxl.
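`hex2argb` pads plain RGB codes to the 8-digit AARRGGBB form openpyxl expects, with the alpha channel first. A hedged re-implementation of the visible branches (the source's `9` constants, which appear to account for a leading '#', are normalized here to 8 hex digits):

    def hex2argb_sketch(hex_color: str) -> str:
        h = hex_color.lstrip("#").upper()
        if len(h) == 6:                 # RRGGBB -> fully opaque
            return "FF" + h
        if len(h) == 8:                 # already AARRGGBB
            return h
        return "F" * (8 - len(h)) + h   # pad short codes with opaque F's

    print(hex2argb_sketch("#1E90FF"))  # FF1E90FF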
@@ -3902,341 +4078,7 @@ def hex2argb(hex_color):
         return hex_color[-9:]
     else:
         return "F" * (9 - len(hex_color)) + hex_color
-    raise ValueError(
-        "Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format."
-    )
-
-
-def convert_indices_to_range(row_slice, col_slice):
-    """Convert numerical row and column slices to Excel-style range strings."""
-    start_row = row_slice.start + 1
-    end_row = row_slice.stop if row_slice.stop is not None else None
-    start_col = col_slice.start + 1
-    end_col = col_slice.stop if col_slice.stop is not None else None
-
-    start_col_letter = get_column_letter(start_col)
-    end_col_letter = get_column_letter(end_col) if end_col else None
-    return (
-        f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
-        if end_col_letter
-        else f"{start_col_letter}{start_row}"
-    )
-
-
-def apply_format(ws, cell, cell_range):
-    """Apply cell formatting to a specified range."""
-    cell_font, cell_fill, cell_alignment, border = None, None, None, None
-    kws_cell = ["font", "fill", "alignment", "border"]
-    for K, _ in cell.items():
-        if strcmp(K, kws_cell)[0] == "font":
-            #! font
-            font_color = "000000"
-            font_name = "Arial"
-            font_underline = "none"
-            font_size = 14
-            font_bold = False
-            font_strike = False
-            font_italic = False
-            kws_font = [
-                "name",
-                "size",
-                "bold",
-                "underline",
-                "color",
-                "strike",
-                "italic",
-            ]
-            for k_, v_ in cell.get(K, {}).items():
-                if strcmp(k_, kws_font)[0] == "name":
-                    font_name = v_
-                elif strcmp(k_, kws_font)[0] == "size":
-                    font_size = v_
-                elif strcmp(k_, kws_font)[0] == "bold":
-                    font_bold = v_
-                elif strcmp(k_, kws_font)[0] == "underline":
-                    font_underline = strcmp(v_, ["none", "single", "double"])[0]
-                elif strcmp(k_, kws_font)[0] == "color":
-                    font_color = hex2argb(v_)
-                elif strcmp(k_, kws_font)[0] == "strike":
-                    font_strike = v_
-                elif strcmp(k_, kws_font)[0] == "italic":
-                    font_italic = v_
-
-            cell_font = Font(
-                name=font_name,
-                size=font_size,
-                bold=font_bold,
-                italic=font_italic,
-                underline=font_underline,
-                strike=font_strike,
-                color=font_color,
-            )
-
-        if strcmp(K, kws_cell)[0] == "fill":
-            #! fill
-            kws_fill = ["start_color", "end_color", "fill_type", "color"]
-            kws_fill_type = [
-                "darkVertical",
-                "lightDown",
-                "lightGrid",
-                "solid",
-                "darkDown",
-                "lightGray",
-                "lightUp",
-                "gray0625",
-                "lightVertical",
-                "lightHorizontal",
-                "darkHorizontal",
-                "gray125",
-                "darkUp",
-                "mediumGray",
-                "darkTrellis",
-                "darkGray",
-                "lightTrellis",
-                "darkGrid",
-            ]
-            start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid"  # default
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_fill)[0] == "color":
-                    start_color, end_color = hex2argb(v), hex2argb(v)
-                    break
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_fill)[0] == "start_color":
-                    start_color = hex2argb(v)
-                elif strcmp(k, kws_fill)[0] == "end_color":
-                    end_color = hex2argb(v)
-                elif strcmp(k, kws_fill)[0] == "fill_type":
-                    fill_type = strcmp(v, kws_fill_type)[0]
-            cell_fill = PatternFill(
-                start_color=start_color,
-                end_color=end_color,
-                fill_type=fill_type,
-            )
-
-        if strcmp(K, kws_cell)[0] == "alignment":
-            #! alignment
-            # default
-            align_horizontal = "general"
-            align_vertical = "center"
-            align_rot = 0
-            align_wrap = False
-            align_shrink = False
-            align_indent = 0
-            kws_align = [
-                "horizontal",
-                "ha",
-                "vertical",
-                "va",
-                "text_rotation",
-                "rotat",
-                "rot",
-                "wrap_text",
-                "wrap",
-                "shrink_to_fit",
-                "shrink",
-                "indent",
-            ]
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
-                    align_horizontal = strcmp(
-                        v, ["general", "left", "right", "center"]
-                    )[0]
-                elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
-                    align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
-                elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
-                    align_rot = v
-                elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
-                    align_wrap = v
-                elif strcmp(k, kws_align)[0] in [
-                    "shrink_to_fit",
-                    "shrink",
-                    "wrap_text",
-                    "wrap",
-                ]:
-                    align_shrink = v
-                elif strcmp(k, kws_align)[0] in ["indent"]:
-                    align_indent = v
-            cell_alignment = Alignment(
-                horizontal=align_horizontal,
-                vertical=align_vertical,
-                text_rotation=align_rot,
-                wrap_text=align_wrap,
-                shrink_to_fit=align_shrink,
-                indent=align_indent,
-            )
-
-        if strcmp(K, kws_cell)[0] == "border":
-            #! border
-            kws_border = [
-                "color_left",
-                "color_l",
-                "color_right",
-                "color_r",
-                "color_top",
-                "color_t",
-                "color_bottom",
-                "color_b",
-                "color_diagonal",
-                "color_d",
-                "color_outline",
-                "color_o",
-                "color_vertical",
-                "color_v",
-                "color_horizontal",
-                "color_h",
-                "color",
-                "style_left",
-                "style_l",
-                "style_right",
-                "style_r",
-                "style_top",
-                "style_t",
-                "style_bottom",
-                "style_b",
-                "style_diagonal",
-                "style_d",
-                "style_outline",
-                "style_o",
-                "style_vertical",
-                "style_v",
-                "style_horizontal",
-                "style_h",
-                "style",
-            ]
-            # * border color
-            border_color_l, border_color_r, border_color_t, border_color_b = (
-                "FF000000",
-                "FF000000",
-                "FF000000",
-                "FF000000",
-            )
-            border_color_d, border_color_o, border_color_v, border_color_h = (
-                "FF000000",
-                "FF000000",
-                "FF000000",
-                "FF000000",
-            )
-            # get colors config
-            for k, v in cell.get(K, {}).items():
-                if strcmp(k, kws_border)[0] in ["color"]:
-                    border_color_all = hex2argb(v)
-                    # if "color" is set, all sides are first given the same color;
-                    # side-specific colors can then be defined afterwards
-                    border_color_l, border_color_r, border_color_t, border_color_b = (
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                    )
-                    border_color_d, border_color_o, border_color_v, border_color_h = (
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                        border_color_all,
-                    )
-                elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
-                    border_color_l = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
-                    border_color_r = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
-                    border_color_t = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
-                    border_color_b = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
-                    border_color_d = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
-                    border_color_o = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
-                    border_color_v = hex2argb(v)
-                elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
-                    border_color_h = hex2argb(v)
-            # *border style
-            border_styles = [
-                "thin",
-                "medium",
-                "thick",
-                "dotted",
-                "dashed",
-                "hair",
-                "mediumDashed",
-                "dashDot",
-                "dashDotDot",
-                "slantDashDot",
-                "none",
-            ]
-            border_style_l, border_style_r, border_style_t, border_style_b = (
-                None,
-                None,
-                None,
-                None,
-            )
-            border_style_d, border_style_o, border_style_v, border_style_h = (
-                None,
-                None,
-                None,
-                None,
-            )
-            # get styles config
-            for k, v in cell.get(K, {}).items():
-                # if not "style" in k:
-                #     break
-                if strcmp(k, kws_border)[0] in ["style"]:
-                    border_style_all = strcmp(v, border_styles)[0]
-                    # if "style" is set, all sides are first given the same style;
-                    # side-specific styles can then be defined afterwards
-                    border_style_l, border_style_r, border_style_t, border_style_b = (
-                        border_style_all,
-                        border_style_all,
-                        border_style_all,
border_style_all,
|
4192
|
-
)
|
4193
|
-
border_style_d, border_style_o, border_style_v, border_style_h = (
|
4194
|
-
border_style_all,
|
4195
|
-
border_style_all,
|
4196
|
-
border_style_all,
|
4197
|
-
border_style_all,
|
4198
|
-
)
|
4199
|
-
elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
|
4200
|
-
border_style_l = strcmp(v, border_styles)[0]
|
4201
|
-
elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
|
4202
|
-
border_style_r = strcmp(v, border_styles)[0]
|
4203
|
-
elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
|
4204
|
-
border_style_t = strcmp(v, border_styles)[0]
|
4205
|
-
elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
|
4206
|
-
border_style_b = strcmp(v, border_styles)[0]
|
4207
|
-
elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
|
4208
|
-
border_style_d = strcmp(v, border_styles)[0]
|
4209
|
-
elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
|
4210
|
-
border_style_o = strcmp(v, border_styles)[0]
|
4211
|
-
elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
|
4212
|
-
border_style_v = strcmp(v, border_styles)[0]
|
4213
|
-
elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
|
4214
|
-
border_style_h = strcmp(v, border_styles)[0]
|
4215
|
-
# * apply border config
|
4216
|
-
border = Border(
|
4217
|
-
left=Side(border_style=border_style_l, color=border_color_l),
|
4218
|
-
right=Side(border_style=border_style_r, color=border_color_r),
|
4219
|
-
top=Side(border_style=border_style_t, color=border_color_t),
|
4220
|
-
bottom=Side(border_style=border_style_b, color=border_color_b),
|
4221
|
-
diagonal=Side(border_style=border_style_d, color=border_color_d),
|
4222
|
-
diagonal_direction=0,
|
4223
|
-
outline=Side(border_style=border_style_o, color=border_color_o),
|
4224
|
-
vertical=Side(border_style=border_style_v, color=border_color_v),
|
4225
|
-
horizontal=Side(border_style=border_style_h, color=border_color_h),
|
4226
|
-
)
|
4227
|
-
|
4228
|
-
#! final apply configs
|
4229
|
-
for row in ws[cell_range]:
|
4230
|
-
for cell_ in row:
|
4231
|
-
if cell_font:
|
4232
|
-
cell_.font = cell_font
|
4233
|
-
if cell_fill:
|
4234
|
-
cell_.fill = cell_fill
|
4235
|
-
if cell_alignment:
|
4236
|
-
cell_.alignment = cell_alignment
|
4237
|
-
if border:
|
4238
|
-
cell_.border = border
|
4239
|
-
|
4081
|
+
raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
|
4240
4082
|
|
4241
4083
|
def format_excel(
|
4242
4084
|
df=None,
|
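The `raise ValueError` added in the hunk above tightens input validation in py2ls's hex2argb helper. As a rough standalone sketch of the normalization the error message implies (hex_to_argb is a hypothetical stand-in, not the package's function):

import re

def hex_to_argb(hex_color: str) -> str:
    # Accept RRGGBB, #RRGGBB, or AARRGGBB and normalize to 8-digit ARGB.
    hex_color = hex_color.lstrip("#")
    if not re.fullmatch(r"[0-9A-Fa-f]{6}|[0-9A-Fa-f]{8}", hex_color):
        raise ValueError("Invalid hex color format. Use RRGGBB, #RRGGBB, or aARRGGBB format.")
    if len(hex_color) == 6:
        return "FF" + hex_color.upper()  # assume fully opaque
    return hex_color.upper()            # alpha already present

print(hex_to_argb("#1f77b4"))  # FF1F77B4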
@@ -4257,6 +4099,255 @@ def format_excel(
     conditional_format=None,  # dict
     **kwargs,
 ):
+    import pandas as pd
+    from datetime import datetime
+    from openpyxl import load_workbook
+    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
+    from openpyxl.utils import get_column_letter
+    from openpyxl.worksheet.datavalidation import DataValidation
+    from openpyxl.comments import Comment
+    from openpyxl.formatting.rule import ColorScaleRule
+
+    def convert_indices_to_range(row_slice, col_slice):
+        """Convert numerical row and column slices to Excel-style range strings."""
+        start_row = row_slice.start + 1
+        end_row = row_slice.stop if row_slice.stop is not None else None
+        start_col = col_slice.start + 1
+        end_col = col_slice.stop if col_slice.stop is not None else None
+
+        start_col_letter = get_column_letter(start_col)
+        end_col_letter = get_column_letter(end_col) if end_col else None
+        return (
+            f"{start_col_letter}{start_row}:{end_col_letter}{end_row}"
+            if end_col_letter
+            else f"{start_col_letter}{start_row}"
+        )
+
+    def apply_format(ws, cell, cell_range):
+        """Apply cell formatting to a specified range."""
+        cell_font, cell_fill, cell_alignment, border = None, None, None, None
+        kws_cell = ["font", "fill", "alignment", "border"]
+        for K, _ in cell.items():
+            if strcmp(K, kws_cell)[0] == "font":
+                #! font
+                font_color = "000000"
+                font_name = "Arial"
+                font_underline = "none"
+                font_size = 14
+                font_bold = False
+                font_strike = False
+                font_italic = False
+                kws_font = ["name","size","bold","underline","color","strike","italic"]
+                for k_, v_ in cell.get(K, {}).items():
+                    if strcmp(k_, kws_font)[0] == "name":
+                        font_name = v_
+                    elif strcmp(k_, kws_font)[0] == "size":
+                        font_size = v_
+                    elif strcmp(k_, kws_font)[0] == "bold":
+                        font_bold = v_
+                    elif strcmp(k_, kws_font)[0] == "underline":
+                        font_underline = strcmp(v_, ["none", "single", "double"])[0]
+                    elif strcmp(k_, kws_font)[0] == "color":
+                        font_color = hex2argb(v_)
+                    elif strcmp(k_, kws_font)[0] == "strike":
+                        font_strike = v_
+                    elif strcmp(k_, kws_font)[0] == "italic":
+                        font_italic = v_
+
+                cell_font = Font(
+                    name=font_name,
+                    size=font_size,
+                    bold=font_bold,
+                    italic=font_italic,
+                    underline=font_underline,
+                    strike=font_strike,
+                    color=font_color,
+                )
+
+            if strcmp(K, kws_cell)[0] == "fill":
+                #! fill
+                kws_fill = ["start_color", "end_color", "fill_type", "color"]
+                kws_fill_type = ["darkVertical","lightDown","lightGrid","solid","darkDown","lightGray","lightUp","gray0625","lightVertical","lightHorizontal",
+                                 "darkHorizontal","gray125","darkUp","mediumGray","darkTrellis","darkGray","lightTrellis","darkGrid"]
+                start_color, end_color, fill_type = "FFFFFF", "FFFFFF", "solid"  # default
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_fill)[0] == "color":
+                        start_color, end_color = hex2argb(v), hex2argb(v)
+                        break
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_fill)[0] == "start_color":
+                        start_color = hex2argb(v)
+                    elif strcmp(k, kws_fill)[0] == "end_color":
+                        end_color = hex2argb(v)
+                    elif strcmp(k, kws_fill)[0] == "fill_type":
+                        fill_type = strcmp(v, kws_fill_type)[0]
+                cell_fill = PatternFill(
+                    start_color=start_color,
+                    end_color=end_color,
+                    fill_type=fill_type,
+                )
+
+            if strcmp(K, kws_cell)[0] == "alignment":
+                #! alignment
+                # default
+                align_horizontal = "general"
+                align_vertical = "center"
+                align_rot = 0
+                align_wrap = False
+                align_shrink = False
+                align_indent = 0
+                kws_align = [
+                    "horizontal",
+                    "ha",
+                    "vertical",
+                    "va",
+                    "text_rotation",
+                    "rotat",
+                    "rot",
+                    "wrap_text",
+                    "wrap",
+                    "shrink_to_fit",
+                    "shrink",
+                    "indent",
+                ]
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_align)[0] in ["horizontal", "ha"]:
+                        align_horizontal = strcmp(
+                            v, ["general", "left", "right", "center"]
+                        )[0]
+                    elif strcmp(k, kws_align)[0] in ["vertical", "va"]:
+                        align_vertical = strcmp(v, ["top", "center", "bottom"])[0]
+                    elif strcmp(k, kws_align)[0] in ["text_rotation", "rotat", "rot"]:
+                        align_rot = v
+                    elif strcmp(k, kws_align)[0] in ["wrap_text", "wrap"]:
+                        align_wrap = v
+                    elif strcmp(k, kws_align)[0] in [
+                        "shrink_to_fit",
+                        "shrink",
+                        "wrap_text",
+                        "wrap",
+                    ]:
+                        align_shrink = v
+                    elif strcmp(k, kws_align)[0] in ["indent"]:
+                        align_indent = v
+                cell_alignment = Alignment(
+                    horizontal=align_horizontal,
+                    vertical=align_vertical,
+                    text_rotation=align_rot,
+                    wrap_text=align_wrap,
+                    shrink_to_fit=align_shrink,
+                    indent=align_indent,
+                )
+
+            if strcmp(K, kws_cell)[0] == "border":
+                #! border
+                kws_border = ["color_left","color_l","color_right","color_r","color_top","color_t","color_bottom","color_b",
+                              "color_diagonal","color_d","color_outline","color_o","color_vertical","color_v","color_horizontal",
+                              "color_h","color","style_left","style_l","style_right","style_r","style_top","style_t","style_bottom","style_b",
+                              "style_diagonal","style_d","style_outline","style_o","style_vertical","style_v","style_horizontal",
+                              "style_h","style"]
+                # * border color
+                border_color_l, border_color_r, border_color_t, border_color_b = ("FF000000","FF000000","FF000000","FF000000")
+                border_color_d, border_color_o, border_color_v, border_color_h = ("FF000000","FF000000","FF000000","FF000000")
+                # get colors config
+                for k, v in cell.get(K, {}).items():
+                    if strcmp(k, kws_border)[0] in ["color"]:
+                        border_color_all = hex2argb(v)
+                        # if a global "color" is given, apply it to every border side first
+                        # and only then let side-specific colors override it
+                        border_color_l, border_color_r, border_color_t, border_color_b = (
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                        )
+                        border_color_d, border_color_o, border_color_v, border_color_h = (
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                            border_color_all,
+                        )
+                    elif strcmp(k, kws_border)[0] in ["color_left", "color_l"]:
+                        border_color_l = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_right", "color_r"]:
+                        border_color_r = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_top", "color_t"]:
+                        border_color_t = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_bottom", "color_b"]:
+                        border_color_b = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_diagonal", "color_d"]:
+                        border_color_d = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_outline", "color_o"]:
+                        border_color_o = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_vertical", "color_v"]:
+                        border_color_v = hex2argb(v)
+                    elif strcmp(k, kws_border)[0] in ["color_horizontal", "color_h"]:
+                        border_color_h = hex2argb(v)
+                # *border style
+                border_styles = ["thin","medium","thick","dotted","dashed",
+                                 "hair","mediumDashed","dashDot","dashDotDot","slantDashDot","none"]
+                border_style_l, border_style_r, border_style_t, border_style_b = (None,None,None,None)
+                border_style_d, border_style_o, border_style_v, border_style_h = (None,None,None,None)
+                # get styles config
+                for k, v in cell.get(K, {}).items():
+                    # if not "style" in k:
+                    #     break
+                    if strcmp(k, kws_border)[0] in ["style"]:
+                        border_style_all = strcmp(v, border_styles)[0]
+                        # if a global "style" is given, apply it to every border side first
+                        # and only then let side-specific styles override it
+                        border_style_l, border_style_r, border_style_t, border_style_b = (
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                        )
+                        border_style_d, border_style_o, border_style_v, border_style_h = (
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                            border_style_all,
+                        )
+                    elif strcmp(k, kws_border)[0] in ["style_left", "style_l"]:
+                        border_style_l = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_right", "style_r"]:
+                        border_style_r = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_top", "style_t"]:
+                        border_style_t = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_bottom", "style_b"]:
+                        border_style_b = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_diagonal", "style_d"]:
+                        border_style_d = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_outline", "style_o"]:
+                        border_style_o = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_vertical", "style_v"]:
+                        border_style_v = strcmp(v, border_styles)[0]
+                    elif strcmp(k, kws_border)[0] in ["style_horizontal", "style_h"]:
+                        border_style_h = strcmp(v, border_styles)[0]
+                # * apply border config
+                border = Border(
+                    left=Side(border_style=border_style_l, color=border_color_l),
+                    right=Side(border_style=border_style_r, color=border_color_r),
+                    top=Side(border_style=border_style_t, color=border_color_t),
+                    bottom=Side(border_style=border_style_b, color=border_color_b),
+                    diagonal=Side(border_style=border_style_d, color=border_color_d),
+                    diagonal_direction=0,
+                    outline=Side(border_style=border_style_o, color=border_color_o),
+                    vertical=Side(border_style=border_style_v, color=border_color_v),
+                    horizontal=Side(border_style=border_style_h, color=border_color_h),
+                )
+
+        #! final apply configs
+        for row in ws[cell_range]:
+            for cell_ in row:
+                if cell_font:
+                    cell_.font = cell_font
+                if cell_fill:
+                    cell_.fill = cell_fill
+                if cell_alignment:
+                    cell_.alignment = cell_alignment
+                if border:
+                    cell_.border = border
     if not isinstance(df, pd.DataFrame):
         try:
             print(f"is loading file {os.path.basename(df)}")
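The hunk above moves the openpyxl imports and the apply_format/convert_indices_to_range helpers inside format_excel, so openpyxl is only loaded when the function actually runs. A rough sketch of the nested "cell" config this parser accepts (keys are fuzzy-matched via strcmp) and the plain-openpyxl calls it roughly reduces to; the file name and style values here are invented:

from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment

cell_cfg = {
    "font": {"name": "Arial", "size": 12, "bold": True, "color": "FF0000"},
    "fill": {"color": "FFFF00", "fill_type": "solid"},
    "alignment": {"ha": "center", "va": "center", "wrap": True},
}

wb = Workbook()
ws = wb.active
ws["A1"] = "header"
# Approximately what apply_format does for the config above (colors in ARGB):
ws["A1"].font = Font(name="Arial", size=12, bold=True, color="FFFF0000")
ws["A1"].fill = PatternFill(start_color="FFFFFF00", end_color="FFFFFF00", fill_type="solid")
ws["A1"].alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
wb.save("styled_sketch.xlsx")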
@@ -4602,12 +4693,10 @@ format_excel(
     print(f"Formatted Excel file saved as:\n{filename}")


-from IPython.display import display, HTML, Markdown
-
-
 def preview(var):
     """Master function to preview formatted variables in Jupyter."""
-
+    from bs4 import BeautifulSoup
+    from IPython.display import display, HTML, Markdown
     if isinstance(var, str):
         if isa(var, "html"):
             display(HTML(var))  # Render as HTML
@@ -4624,6 +4713,7 @@ def preview(var):
         display(var)

     elif isinstance(var, list) or isinstance(var, dict):
+        import json
         # Display JSON
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
@@ -4637,6 +4727,7 @@ def preview(var):
         display(Image(filename=var))

     elif isinstance(var, dict):
+        import json
         # Handle dictionary formatting
         json_str = json.dumps(var, indent=4)
         display(Markdown(f"```json\n{json_str}\n```"))
@@ -4644,13 +4735,154 @@ def preview(var):
     else:
         # If the format is not recognized, print a message
         print("Format not recognized or unsupported.")
-
-
 # # Example usages:
 # preview("This is a plain text message.")
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
+
+def _df_outlier(
+    data,
+    columns=None,
+    method=["zscore", "iqr", "percentile", "iforest"],
+    min_outlier_method=3,  # at least this many methods must flag a row as an outlier
+    zscore_threshold=3,
+    iqr_threshold=1.5,
+    lower_percentile=5,
+    upper_percentile=95,
+):
+    from scipy.stats import zscore
+    from sklearn.ensemble import IsolationForest
+    from sklearn.preprocessing import StandardScaler
+
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+    # Separate numeric and non-numeric columns
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
+
+    if columns is not None:
+        numeric_data = numeric_data[columns]
+    elif numeric_data.empty:
+        raise ValueError("Input data must contain numeric columns.")
+
+    outliers_df = pd.DataFrame(index=numeric_data.index)
+    if isinstance(method, str):
+        method = [method]
+
+    # Z-score method
+    if "zscore" in method:
+        z_scores = np.abs(zscore(numeric_data))
+        outliers_df["zscore"] = np.any(z_scores > zscore_threshold, axis=1)
+
+    # IQR method
+    if "iqr" in method:
+        Q1 = numeric_data.quantile(0.25)
+        Q3 = numeric_data.quantile(0.75)
+        IQR = Q3 - Q1
+        lower_bound = Q1 - iqr_threshold * IQR
+        upper_bound = Q3 + iqr_threshold * IQR
+        outliers_df["iqr"] = (
+            (numeric_data < lower_bound) | (numeric_data > upper_bound)
+        ).any(axis=1)
+
+    # Percentile method
+    if "percentile" in method:
+        lower_bound = numeric_data.quantile(lower_percentile / 100)
+        upper_bound = numeric_data.quantile(upper_percentile / 100)
+        outliers_df["percentile"] = (
+            (numeric_data < lower_bound) | (numeric_data > upper_bound)
+        ).any(axis=1)

+    # Isolation Forest method
+    if "iforest" in method:
+        # iforest method cannot handle NaNs, then fillna with mean
+        numeric_data_ = numeric_data.fillna(numeric_data.mean())
+        scaler = StandardScaler()
+        scaled_data = scaler.fit_transform(numeric_data_)
+        iso_forest = IsolationForest(contamination=0.05)
+        outliers_df["iforest"] = iso_forest.fit_predict(scaled_data) == -1
+
+    # Combine all outlier detections
+    if len(method) == 4:  # all methods are used:
+        outliers_df["outlier"] = outliers_df.sum(axis=1) >= min_outlier_method
+    else:
+        outliers_df["outlier"] = outliers_df.any(axis=1)
+
+    # Handling Outliers: Remove or Winsorize or Replace with NaN
+    processed_data = numeric_data.copy()
+
+    processed_data.loc[outliers_df["outlier"]] = np.nan
+
+    return processed_data
+
+
+def df_outlier(
+    data,
+    columns=None,
+    method=["zscore", "iqr", "percentile", "iforest"],
+    min_outlier_method=2,  # at least this many methods must flag a row as an outlier
+    zscore_threshold=3,
+    iqr_threshold=1.5,
+    lower_percentile=5,
+    upper_percentile=95,
+):
+    """
+    Usage:
+    data_out = df_outlier(
+        data,
+        columns=["income"],
+        method="iforest",
+        min_outlier_method=1)
+
+    Advanced outlier detection and handling function.
+
+    Parameters:
+    - data: DataFrame, the input data (numerical).
+    - method: List, the outlier detection method to use. Options: 'zscore', 'iqr', 'percentile', 'iforest'.
+    - zscore_threshold: float, threshold for Z-score outlier detection (default 3).
+    - iqr_threshold: float, threshold for IQR method (default 1.5).
+    - lower_percentile: float, lower percentile for percentile-based outliers (default 5).
+    - upper_percentile: float, upper percentile for percentile-based outliers (default 95).
+    - keep_nan: bool, whether to replace outliers with NaN (default True).
+    - plot: bool, whether to visualize the outliers (default False).
+    - min_outlier_method: int, minimum number of methods that need to flag a row as an outlier (default 2).
+    - inplace: bool, whether to modify the original `data` DataFrame (default False).
+
+    Returns:
+    - processed_data: DataFrame with outliers handled based on method (if winsorize/remove is True).
+    """
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
+
+    _outlier_df_tmp = pd.DataFrame()
+    for col in numeric_data.columns:
+        _outlier_df_tmp = pd.concat(
+            [
+                _outlier_df_tmp,
+                _df_outlier(
+                    data=data,
+                    columns=[col],
+                    method=method,
+                    min_outlier_method=min_outlier_method,  # at least this many methods must flag an outlier
+                    zscore_threshold=zscore_threshold,
+                    iqr_threshold=iqr_threshold,
+                    lower_percentile=lower_percentile,
+                    upper_percentile=upper_percentile,
+                ),
+            ],
+            axis=1,
+            # join="inner",
+        )
+    processed_data = pd.concat([_outlier_df_tmp, non_numeric_data], axis=1)
+    processed_data = processed_data[col_names_org]
+    return processed_data
+
+
+
 def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     """
     Extend a DataFrame by the list elements in the column.
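The new _df_outlier/df_outlier pair added above combines several detectors and only flags a row when enough of them agree. A self-contained sketch of that voting idea (the column name "x" and the toy values are illustrative):

import numpy as np
import pandas as pd
from scipy.stats import zscore

df = pd.DataFrame({"x": [1.0, 2.0, 2.5, 3.0, 120.0]})
z_flag = pd.Series(np.abs(zscore(df["x"])) > 3, index=df.index)
q1, q3 = df["x"].quantile([0.25, 0.75])
iqr = q3 - q1
iqr_flag = (df["x"] < q1 - 1.5 * iqr) | (df["x"] > q3 + 1.5 * iqr)
pct_flag = (df["x"] < df["x"].quantile(0.05)) | (df["x"] > df["x"].quantile(0.95))

min_votes = 2  # mirrors min_outlier_method
votes = z_flag.astype(int) + iqr_flag.astype(int) + pct_flag.astype(int)
print(df.assign(votes=votes, outlier=votes >= min_votes))

Only the 120.0 row collects two votes (IQR and percentile) and is flagged; the extreme-but-plausible 1.0 gets a single percentile vote and survives, which is the point of requiring agreement.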
@@ -5042,6 +5274,7 @@ def df_drop_duplicates(
         return None
     else:
         return result
+#! fillna()
 def df_fillna(
     data: pd.DataFrame,
     method: str = "knn",
@@ -5049,8 +5282,8 @@ def df_fillna(
     constant: float = None,
     n_neighbors: int = 5,  # KNN-specific
     max_iter: int = 10,  # Iterative methods specific
-    inplace: bool =
-    random_state:int =
+    inplace: bool = False,
+    random_state: int = 1
 ) -> pd.DataFrame:
     """
     Fill missing values in a DataFrame using specified imputation method.
@@ -5078,7 +5311,18 @@ def df_fillna(
         inplace (bool): If True, modify the original DataFrame. If False, return a new DataFrame.

     """
-
+    if isinstance(data, pd.Series):
+        data = pd.DataFrame(data)
+    # handle None
+    for col in data.columns:
+        data[col] = data[col].apply(lambda x: np.nan if x is None else x)
+
+    col_names_org = data.columns.tolist()
+    index_names_org = data.index.tolist()
+    # Separate numeric and non-numeric columns
+    numeric_data = data.select_dtypes(include=[np.number])
+    non_numeric_data = data.select_dtypes(exclude=[np.number])
+
     if data.empty:
         raise ValueError("Input DataFrame is empty.")

@@ -5107,15 +5351,6 @@ def df_fillna(
         from sklearn.impute import IterativeImputer

         imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)
-    # elif method == "missforest":
-    #     from missingpy import MissForest
-    #     imputer = MissForest(max_iter=max_iter, random_state=random_state)
-    # elif method == "softimpute":
-    #     from fancyimpute import SoftImpute
-    #     imputer = SoftImpute()
-    # elif method == "svd":
-    #     from fancyimpute import IterativeSVD
-    #     imputer = IterativeSVD(max_iters=max_iter)
     else:  # mean, median, most_frequent
         from sklearn.impute import SimpleImputer
         imputer = SimpleImputer(strategy=method)
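One caveat the retained import above glosses over: scikit-learn still ships IterativeImputer behind an experimental flag, so the enabling import must run first or `from sklearn.impute import IterativeImputer` raises ImportError. A minimal working call:

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must precede the import below)
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0]])
print(IterativeImputer(max_iter=10, random_state=1).fit_transform(X))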
@@ -5123,26 +5358,49 @@ def df_fillna(
     # Fit and transform the data
     if axis == 0:
         # Impute column-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data)
     elif axis == 1:
         # Impute row-wise
-        imputed_data = imputer.fit_transform(
-        imputed_data.shape
+        imputed_data = imputer.fit_transform(numeric_data.T)
     else:
         raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")

-
+    imputed_data = pd.DataFrame(
         imputed_data if axis == 0 else imputed_data.T,
-        index=
-        columns=
+        index=numeric_data.index if axis == 0 else data.columns,
+        columns=numeric_data.columns if axis == 0 else data.index,
     )
+    for col in imputed_data.select_dtypes(include=[np.number]).columns:
+        imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
+
+    # Handle non-numeric data imputation
+    if not non_numeric_data.empty:
+        from sklearn.impute import SimpleImputer
+        if method == "constant":
+            non_numeric_imputer = SimpleImputer(strategy="constant", fill_value=constant)
+        else:
+            non_numeric_imputer = SimpleImputer(strategy="most_frequent")
+
+        # Impute non-numeric columns column-wise (axis=0)
+        imputed_non_numeric = non_numeric_imputer.fit_transform(non_numeric_data)
+
+        # Convert imputed non-numeric array back to DataFrame with original index and column names
+        imputed_non_numeric_df = pd.DataFrame(
+            imputed_non_numeric, index=non_numeric_data.index, columns=non_numeric_data.columns
+        )
+    else:
+        imputed_non_numeric_df = pd.DataFrame(index=data.index)
+
+
+    imputed_data = pd.concat([imputed_data, imputed_non_numeric_df], axis=1).reindex(columns=data.columns)

     if inplace:
-
-
+        # Modify the original DataFrame
+        data[:] = imputed_data[col_names_org]
+        return None
     else:
-
+        # Return the modified DataFrame
+        return imputed_data[col_names_org]
 # # example
 # data = {
 #     "A": [1, 2, np.nan, 4, 5],
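The rewritten body above splits the frame, imputes numeric and non-numeric columns separately, and reassembles everything in the original column order. A compact sketch of that pattern (toy data, KNN for numeric columns, most_frequent for the rest):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": ["x", np.nan, "x"]})
num = df.select_dtypes(include=[np.number])
cat = df.select_dtypes(exclude=[np.number])

num_imp = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(num),
                       index=num.index, columns=num.columns)
cat_imp = pd.DataFrame(SimpleImputer(strategy="most_frequent").fit_transform(cat),
                       index=cat.index, columns=cat.columns)
print(pd.concat([num_imp, cat_imp], axis=1)[df.columns.tolist()])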
@@ -5172,7 +5430,94 @@ def df_fillna(
 # display(df)
 # display(df_fillna(data=df, method=method_name, inplace=False, axis=0))

-
+def df_encoder(
+    data: pd.DataFrame,
+    method: str = "dummy",  # 'dummy', 'onehot', 'ordinal', 'label', 'target', 'binary'
+    columns=None,
+    target_column=None,  # Required for 'target' encoding method
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Methods explained:
+    - 'dummy': pandas' `get_dummies` to create dummy variables for categorical columns, which is another form of one-hot encoding, but with a simpler interface.
+
+    - 'onehot': One-hot encoding is used when there is no inherent order in categories. It creates a binary column for each category and is useful for nominal categorical variables. However, it increases dimensionality significantly if there are many unique categories.
+
+    - 'ordinal': Ordinal encoding is used when there is an inherent order in the categories. It assigns integers to categories based on their order. Use this when the categories have a ranking (e.g., 'low', 'medium', 'high').
+
+    - 'label': Label encoding is used for converting each unique category to a numeric label. It can be useful when working with algorithms that can handle categorical data natively (e.g., decision trees). However, it might introduce unintended ordinal relationships between the categories.
+
+    - 'target': Target encoding is used when you encode a categorical feature based on the mean of the target variable. This is useful when there is a strong correlation between the categorical feature and the target variable. It is often used in predictive modeling to capture relationships that are not directly encoded in the feature.
+
+    - 'binary': Binary encoding is a more efficient alternative to one-hot encoding when dealing with high-cardinality categorical variables. It converts categories into binary numbers and then splits them into multiple columns, reducing dimensionality compared to one-hot encoding.
+    """
+
+    # Select categorical columns
+    categorical_cols = data.select_dtypes(exclude=np.number).columns.tolist()
+    methods = ["dummy", "onehot", "ordinal", "label", "target", "binary"]
+    method = strcmp(method, methods)[0]
+
+    if columns is None:
+        columns = categorical_cols
+
+    # pd.get_dummies()
+    if method == 'dummy':
+        dtype = kwargs.pop("dtype", int)
+        drop_first = kwargs.pop("drop_first", True)
+        try:
+            encoded_df = pd.get_dummies(data[columns], drop_first=drop_first, dtype=dtype, **kwargs)
+            return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+        except Exception as e:
+            # print(f"Warning: no conversion was performed, because: {e}")
+            return data
+    # One-hot encoding
+    elif method == "onehot":
+        from sklearn.preprocessing import OneHotEncoder
+
+        encoder = OneHotEncoder(drop="first", sparse_output=False, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(
+            encoded_data,
+            columns=encoder.get_feature_names_out(columns),
+            index=data.index,
+        )
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Ordinal encoding
+    elif method == "ordinal":
+        from sklearn.preprocessing import OrdinalEncoder
+
+        encoder = OrdinalEncoder(**kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        encoded_df = pd.DataFrame(encoded_data, columns=columns, index=data.index)
+        return pd.concat([data.drop(columns, axis=1), encoded_df], axis=1)
+
+    # Label encoding
+    elif method == "label":
+        from sklearn.preprocessing import LabelEncoder
+
+        encoder = LabelEncoder()
+        encoded_data = data[columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Target encoding (Mean of the target for each category)
+    elif method == "target":
+        if target_column is None:
+            raise ValueError("target_column must be provided for target encoding.")
+        from category_encoders import TargetEncoder
+
+        encoder = TargetEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns], data[target_column])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
+    # Binary encoding (for high-cardinality categorical variables)
+    elif method == "binary":
+        from category_encoders import BinaryEncoder
+
+        encoder = BinaryEncoder(cols=columns, **kwargs)
+        encoded_data = encoder.fit_transform(data[columns])
+        return pd.concat([data.drop(columns, axis=1), encoded_data], axis=1)
+
 def df_scaler(
     data: pd.DataFrame,  # should be numeric dtype
     method="standard",
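To make the df_encoder options added above concrete, here is a small comparison of the 'dummy' and 'ordinal' strategies it wraps (the column name and category order are invented for illustration):

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({"size": ["low", "high", "medium", "low"]})

# 'dummy': one 0/1 column per non-reference category
print(pd.get_dummies(df["size"], drop_first=True, dtype=int))

# 'ordinal': integer codes that respect an explicit ranking
enc = OrdinalEncoder(categories=[["low", "medium", "high"]])
print(enc.fit_transform(df[["size"]]))  # low->0, medium->1, high->2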
@@ -5218,9 +5563,8 @@ def df_scaler(
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
-            columns = data.select_dtypes(include=
+            columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
-        print(f"Scaling columns")

         scaled_data = scaler.fit_transform(data[columns])

@@ -5242,7 +5586,7 @@ def df_scaler(
         # Row-wise scaling
         if columns is None:
             columns = data.index.tolist()
-        numeric_rows = data.loc[columns].select_dtypes(include=
+        numeric_rows = data.loc[columns].select_dtypes(include=np.number)
         if numeric_rows.empty:
             raise ValueError("No numeric rows to scale.")

@@ -5259,7 +5603,31 @@ def df_scaler(
     scaled_df = data.copy()
     scaled_df.loc[numeric_rows.index] = scaled_data
     return scaled_df
+def df_special_characters_cleaner(
+    data: pd.DataFrame, where=["column", "content", "index"]
+) -> pd.DataFrame:
+    """
+    to clean special characters:
+    usage:
+    df_special_characters_cleaner(data=df, where='column')
+    """
+    if not isinstance(where, list):
+        where = [where]
+    where_to_clean = ["column", "content", "index"]
+    where_ = [strcmp(i, where_to_clean)[0] for i in where]
+
+    # 1. Clean column names by replacing special characters with underscores
+    if "column" in where_:
+        data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+
+    # 2. Clean only object-type columns (text columns)
+    if "content" in where_:
+        for col in data.select_dtypes(include=["object"]).columns:
+            data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+    if data.index.dtype == "object" and "index" in where_:
+        data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)

+    return data
 def df_cluster(
     data: pd.DataFrame,
     columns: Optional[list] = None,
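df_special_characters_cleaner above is essentially three vectorized regex replaces; a standalone sketch with made-up labels:

import pandas as pd

df = pd.DataFrame({"price ($)": ["1,200", "3,400"]}, index=["row#1", "row#2"])
df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)   # column names
df.index = df.index.str.replace(r"[^\w\s]", "_", regex=True)       # index labels
for col in df.select_dtypes(include=["object"]).columns:           # cell contents
    df[col] = df[col].str.replace(r"[^\w\s]", "", regex=True)
print(df)  # punctuation in labels becomes "_", punctuation in cells is stripped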
@@ -5268,8 +5636,8 @@ def df_cluster(
     scale: bool = True,
     plot: Union[str, list] = "all",
     inplace: bool = True,
-    ax
-    )
+    ax = None,
+):
     from sklearn.preprocessing import StandardScaler
     from sklearn.cluster import KMeans
     from sklearn.metrics import silhouette_score, silhouette_samples
@@ -5277,7 +5645,6 @@ def df_cluster(
     import numpy as np
     import pandas as pd
     import matplotlib.pyplot as plt
-    import seaborn as sns

     """
     Performs clustering analysis on the provided feature matrix using K-Means.
@@ -5585,94 +5952,61 @@ def df_reducer(
     umap_neighbors: int = 15,  # UMAP-specific
     umap_min_dist: float = 0.1,  # UMAP-specific
     tsne_perplexity: int = 30,  # t-SNE-specific
+    hue: str = None,  # lda-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True,  # replace the original data
     plot_: bool = False,  # plot scatterplot, but no 'hue', so it is meaningless
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    columns : List[str], optional
-        List of column names to reduce. If None, all columns are used.
-
-    method : str, optional, default="umap"
-        Dimensionality reduction method, either "pca" or "umap".
-
-    n_components : int, optional, default=50
-        Number of components for PCA or UMAP.
-
-    umap_neighbors : int, optional, default=15
-        Number of neighbors considered for UMAP embedding.
-
-    umap_min_dist : float, optional, default=0.1
-        Minimum distance between points in UMAP embedding.
-
-    scale : bool, optional, default=True
-        Whether to scale the data using StandardScaler.
-
-    fill_missing : bool, optional, default=True
-        Whether to fill missing values using the mean before applying PCA/UMAP.
+    random_state=1,
+    ax = None,
+    figsize=None,
+    **kwargs
+) -> pd.DataFrame:
+    dict_methods = {
+        #! Linear Dimensionality Reduction: For simplifying data with techniques that assume linearity.
+        "pca": "pca(Principal Component Analysis): \n\tUseful for reducing dimensionality of continuous data while retaining variance. Advantage: Simplifies data, speeds up computation, reduces noise. Limitation: Assumes linear relationships, may lose interpretability in transformed dimensions.",
+        "lda": "lda(Linear Discriminant Analysis):\n\tUseful for supervised dimensionality reduction when class separability is important. Advantage: Enhances separability between classes, can improve classification performance. Limitation: Assumes normal distribution and equal class covariances, linear boundaries only.",
+        "factor": "factor(Factor Analysis):\n\tSuitable for datasets with observed and underlying latent variables. Advantage: Reveals hidden structure in correlated data, dimensionality reduction with interpretable factors. Limitation: Assumes factors are linear combinations, less effective for nonlinear data.",
+        "svd": "svd(Singular Value Decomposition):\n\tSuitable for matrix decomposition, dimensionality reduction in tasks like topic modeling or image compression. Advantage: Efficient, preserves variance, useful in linear transformations. Limitation: Assumes linear relationships, sensitive to noise, may not capture non-linear structure.",
+
+        #! Non-linear Dimensionality Reduction (Manifold Learning)
+        "umap": "umap(Uniform Manifold Approximation and Projection):\n\tBest for high-dimensional data visualization (e.g., embeddings). Advantage: Captures complex structure while preserving both local and global data topology. Limitation: Non-deterministic results can vary, sensitive to parameter tuning.",
+        "tsne": "tsne(t-Distributed Stochastic Neighbor Embedding):\n\tt-SNE excels at preserving local structure (i.e., clusters), but it often loses global relationships, causing clusters to appear in arbitrary proximities to each other. Ideal for clustering and visualizing high-dimensional data, especially for clear cluster separation. Advantage: Captures local relationships effectively. Limitation: Computationally intensive, does not preserve global structure well, requires parameter tuning.",
+        "mds": "mds(Multidimensional Scaling):\n\tAppropriate for visualizing pairwise similarity or distance in data. Advantage: Maintains the perceived similarity or dissimilarity between points. Limitation: Computationally expensive for large datasets, less effective for complex, high-dimensional structures.",
+        "lle": "lle(Locally Linear Embedding):\n\tUseful for non-linear dimensionality reduction when local relationships are important (e.g., manifold learning). Advantage: Preserves local data structure, good for manifold-type data. Limitation: Sensitive to noise and number of neighbors, not effective for global structure.",
+        "kpca": "kpca(Kernel Principal Component Analysis):\n\tGood for non-linear data with complex structure, enhancing separability. Advantage: Extends PCA to capture non-linear relationships. Limitation: Computationally expensive, sensitive to kernel and parameter choice, less interpretable.",
+        "ica": "ica(Independent Component Analysis):\n\tEffective for blind source separation (e.g., EEG, audio signal processing). It is generally categorized under Non-linear Dimensionality Reduction, but it also serves a distinct role in Blind Source Separation. While ICA is commonly used for dimensionality reduction, particularly in contexts where data sources need to be disentangled (e.g., separating mixed signals like EEG or audio data), it focuses on finding statistically independent components rather than maximizing variance (like PCA) or preserving distances (like MDS or UMAP). Advantage: Extracts independent signals/components, useful in mixed signal scenarios. Limitation: Assumes statistical independence, sensitive to noise and algorithm choice.",
+
+        #! Anomaly Detection: Specialized for detecting outliers or unusual patterns
+        "isolation_forest": "Isolation Forest:\n\tDesigned for anomaly detection, especially in high-dimensional data. Advantage: Effective in detecting outliers, efficient for large datasets. Limitation: Sensitive to contamination ratio parameter, not ideal for highly structured or non-anomalous data.",
+    }

-    Returns:
-    --------
-    reduced_df : pd.DataFrame
-        DataFrame with the reduced dimensions.
-    """
-
-    """
-    PCA: explained_variance:
-        indicates the proportion of the dataset's total variance that each principal
-        component (PC) explains. It gives you a sense of how much information
-        (or variance) is captured by each PC
-    Interpretation:
-        - Higher values indicate that the corresponding PC captures more variance.
-        - The sum of the explained variances for all PCs equals 1 (or 100%).
-        - If the first few components explain a high percentage (e.g., 90%),
-        it means you can reduce the dimensionality of the data significantly without losing much information.
-    Use case:
-        You may plot a scree plot, which shows the explained variance for each PC, to help decide
-        how many components to keep for analysis.
-
-    PCA: Singular values:
-        represent the magnitude of variance along each principal component. Mathematically,
-        they are the square roots of the eigenvalues of the covariance matrix.
-    Interpretation:
-        Larger singular values indicate that the associated PC captures more variance.
-        Singular values are related to the scale of the data. If the data are scaled
-        before PCA (e.g., standardized), then the singular values will provide a measure
-        of the spread of data along each PC.
-    Use case:
-        Singular values help quantify the contribution of each principal component in a
-        similar way to the explained variance. They are useful in understanding the overall
-        structure of the data.
-    """
     from sklearn.preprocessing import StandardScaler
     from sklearn.impute import SimpleImputer
-
-
-
-
+    if plot_:
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+    # Check valid method input
+    methods = ["pca", "umap", "tsne", "factor", "isolation_forest", "lda", "kpca", "ica", "mds", "lle", "svd"]
+    method = strcmp(method, methods)[0]
+    print(f"\nprocessing using {dict_methods[method]}:")
+    xlabel, ylabel = None, None
+    if columns is None:
+        columns = data.select_dtypes(include='number').columns.tolist()
+    if hue is None:
+        hue = data.select_dtypes(exclude='number').columns.tolist()
+        if isinstance(hue, list):
+            print("Warning: hue is a list, only select the 1st one")
+            hue = hue[0]
+    if not hue:
+        # Select columns if specified, else use all columns
+        X = data[columns].values if columns else data.values
+    else:
+        # Select columns to reduce and hue for LDA
+        X = data[columns].values if columns else data.drop(columns=[hue]).values
+        y = data[hue].values
+    print(X.shape)
     # Handle missing values
     if fill_missing:
         imputer = SimpleImputer(strategy="mean")
@@ -5683,9 +6017,6 @@ def df_reducer(
         scaler = StandardScaler()
         X = scaler.fit_transform(X)

-    # Check valid method input
-    methods=["pca", "umap","tsne","factor","isolation_forest"]
-    method=strcmp(method, methods)[0]
     # Apply PCA if selected
     if method == "pca":
         from sklearn.decomposition import PCA
@@ -5729,7 +6060,27 @@ def df_reducer(
             pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
         for i in range(n_components):
             pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
+        if hue:
+            pca_df[hue] = y
+    elif method == 'lda':
+        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+        if "hue" not in locals() or hue is None:
+            raise ValueError("LDA requires a 'hue' col parameter to specify class labels.")

+        lda_reducer = LinearDiscriminantAnalysis(n_components=n_components)
+        X_reduced = lda_reducer.fit_transform(X, y)
+
+        # Prepare reduced DataFrame with additional LDA info
+        lda_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"LDA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print(f"LDA completed: Reduced to {n_components} components.")
+            print("Class separability achieved by LDA.")
+        if hue:
+            lda_df[hue] = y
     # Apply UMAP if selected
     elif method == "umap":
         import umap
@@ -5756,32 +6107,36 @@ def df_reducer(
         )
         umap_df["Embedding"] = embedding[:, 0]  # Example of embedding data
         umap_df["Trustworthiness"] = trustworthiness[:, 0]  # Trustworthiness metric
+        if hue:
+            umap_df[hue] = y
     elif method == "tsne":
         from sklearn.manifold import TSNE
-        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=
-        X_reduced = tsne.fit_transform(X)
-
-        # Prepare reduced DataFrame with additional t-SNE info
+        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=random_state)
+        X_reduced = tsne.fit_transform(X)
         tsne_df = pd.DataFrame(
-            X_reduced,
+            X_reduced,
+            index=data.index,
             columns=[f"tSNE_{i+1}" for i in range(n_components)]
         )
         tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
-
+        if hue:
+            tsne_df[hue] = y
     # Apply Factor Analysis if selected
     elif method == "factor":
         from sklearn.decomposition import FactorAnalysis
-        factor = FactorAnalysis(n_components=n_components, random_state=
+        factor = FactorAnalysis(n_components=n_components, random_state=random_state)
         X_reduced = factor.fit_transform(X)
         # Factor Analysis does not directly provide explained variance, but we can approximate it
         fa_variance = factor.noise_variance_
         # Prepare reduced DataFrame with additional Factor Analysis info
         factor_df = pd.DataFrame(
-            X_reduced,
+            X_reduced,
+            index=data.index,
             columns=[f"Factor_{i+1}" for i in range(n_components)]
         )
         factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
-
+        if hue:
+            factor_df[hue] = y
     # Apply Isolation Forest for outlier detection if selected
     elif method == "isolation_forest":
         from sklearn.decomposition import PCA
@@ -5812,48 +6167,100 @@ def df_reducer(
             iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
         for i in range(n_components):
             iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+        if hue:
+            iso_forest_df[hue] = y
+    #* Apply Kernel PCA if selected
+    elif method == "kpca":
+        from sklearn.decomposition import KernelPCA
+        kpca = KernelPCA(n_components=n_components, kernel="rbf", random_state=random_state)
+        X_reduced = kpca.fit_transform(X)
+
+        # Prepare reduced DataFrame with KPCA info
+        kpca_df = pd.DataFrame(
+            X_reduced,
+            index=data.index,
+            columns=[f"KPCA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Kernel PCA completed with RBF kernel.")
+        if hue:
+            kpca_df[hue] = y
+    #* Apply ICA if selected
+    elif method == "ica":
+        from sklearn.decomposition import FastICA
+        ica = FastICA(n_components=n_components, random_state=random_state)
+        X_reduced = ica.fit_transform(X)
+
+        # Prepare reduced DataFrame with ICA info
+        ica_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"ICA_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Independent Component Analysis (ICA) completed.")
+        if hue:
+            ica_df[hue] = y
+    #* Apply MDS if selected
+    elif method == "mds":
+        from sklearn.manifold import MDS
+        mds = MDS(n_components=n_components, random_state=random_state)
+        X_reduced = mds.fit_transform(X)
+
+        # Prepare reduced DataFrame with MDS info
+        mds_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"MDS_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Multidimensional Scaling (MDS) completed.")
+        if hue:
+            mds_df[hue] = y
+    #* Apply Locally Linear Embedding (LLE) if selected
+    elif method == "lle":
+        from sklearn.manifold import LocallyLinearEmbedding
+        lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=umap_neighbors, random_state=random_state)
+        X_reduced = lle.fit_transform(X)
+
+        # Prepare reduced DataFrame with LLE info
+        lle_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"LLE_{i+1}" for i in range(n_components)]
+        )
+        if debug:
+            print("Locally Linear Embedding (LLE) completed.")
+        if hue:
+            lle_df[hue] = y
+    #* Apply Singular Value Decomposition (SVD) if selected
+    elif method == "svd":
+        # Using NumPy's SVD for dimensionality reduction
+        U, s, Vt = np.linalg.svd(X, full_matrices=False)
+        X_reduced = U[:, :n_components] * s[:n_components]
+
+        # Prepare reduced DataFrame with SVD info
+        svd_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"SVD_{i+1}" for i in range(n_components)]
+        )
+        if hue:
+            svd_df[hue] = y
+        if debug:
+            print("Singular Value Decomposition (SVD) completed.")

     # Return reduced data and info as a new DataFrame with the same index
     if method == "pca":
         reduced_df = pca_df
         colname_met = "PC_"
-        if plot_:
-            sns.scatterplot(
-                data=pca_df,
-                x="PC_1",
-                y="PC_2",
-                # hue="condition",
-            )
+        xlabel = f"PC_1 ({pca_df['Explained Variance PC_1'].tolist()[0]})"
+        ylabel = f"PC_2 ({pca_df['Explained Variance PC_2'].tolist()[0]})"
     elif method == "umap":
         reduced_df = umap_df
-        colname_met = "UMAP_"
-        if plot_:
-            sns.scatterplot(
-                data=umap_df,
-                x="UMAP_1",
-                y="UMAP_2",
-                # hue="condition",
-            )
+        colname_met = "UMAP_"
     elif method == "tsne":
         reduced_df = tsne_df
-        colname_met = "
-        if plot_:
-            sns.scatterplot(
-                data=tsne_df,
-                x="tSNE_1",
-                y="tSNE_2",
-                # hue="batch",
-            )
+        colname_met = "tSNE_"
     elif method == "factor":
         reduced_df = factor_df
-        colname_met = "Factor_"
-        if plot_:
-            sns.scatterplot(
-                data=factor_df,
-                x="Factor_1",
-                y="Factor_2",
-                # hue="batch",
-            )
+        colname_met = "Factor_"
     elif method == "isolation_forest":
         reduced_df = iso_forest_df  # Already a DataFrame for outliers
         colname_met = "PC_"
@@ -5872,33 +6279,71 @@ def df_reducer(
             c="r",
             label="outlier", marker="+", s=30,
         )
-
+    elif method == 'lda':
+        reduced_df = lda_df
+        colname_met = "LDA_"
+    elif method == "kpca":
+        reduced_df = kpca_df
+        colname_met = "KPCA_"
+    elif method == "ica":
+        reduced_df = ica_df
+        colname_met = "ICA_"
+    elif method == "mds":
+        reduced_df = mds_df
+        colname_met = "MDS_"
+    elif method == "lle":
+        reduced_df = lle_df
+        colname_met = "LLE_"
+    elif method == "svd":
+        reduced_df = svd_df
+        colname_met = "SVD_"
+    # Quick plots
+    if plot_ and (not method in ["isolation_forest"]):
+        from .plot import plotxy
+        if ax is None:
+            if figsize is None:
+                _, ax = plt.subplots(figsize=cm2inch(8,8))
+            else:
+                _, ax = plt.subplots(figsize=figsize)
+        else:
+            ax.cla()
+        ax = plotxy(data=reduced_df,
+                    x=colname_met + "1",
+                    y=colname_met + "2",
+                    hue=hue,
+                    s=1,
+                    edgecolor='none',
+                    kind='scatter',
+                    figsets=dict(legend=dict(loc='best', markerscale=4),
+                                 xlabel=xlabel if xlabel else None,
+                                 ylabel=ylabel if ylabel else None),
+                    ax=ax,
+                    verbose=False,
+                    **kwargs
+                    )

     if inplace:
         # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+            data.loc[:, f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
         # Add extra info for PCA/UMAP
         if method == "pca":
             for i in range(n_components):
-                data[f"Explained Variance PC_{i+1}"] = reduced_df[f"Explained Variance PC_{i+1}"]
+                data.loc[:, f"Explained Variance PC_{i+1}"] = reduced_df.loc[:, f"Explained Variance PC_{i+1}"]
             for i in range(n_components):
-                data[f"Singular Values PC_{i+1}"] = reduced_df[f"Singular Values PC_{i+1}"]
+                data.loc[:, f"Singular Values PC_{i+1}"] = reduced_df.loc[:, f"Singular Values PC_{i+1}"]
         elif method == "umap":
             for i in range(n_components):
-                data[f"UMAP_{i+1}"]=reduced_df[f"UMAP_{i+1}"]
-            data["Embedding"] = reduced_df["Embedding"]
-            data["Trustworthiness"] = reduced_df["Trustworthiness"]
+                data.loc[:, f"UMAP_{i+1}"] = reduced_df.loc[:, f"UMAP_{i+1}"]
+            data.loc[:, "Embedding"] = reduced_df.loc[:, "Embedding"]
+            data.loc[:, "Trustworthiness"] = reduced_df.loc[:, "Trustworthiness"]
+
         return None  # No return when inplace=True
-

     return reduced_df
-
-
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)

-
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
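The df_reducer changes above add supervised LDA alongside the unsupervised methods, keyed by the new `hue` argument. A hedged sketch of the underlying scikit-learn calls on the iris data (df_reducer itself wraps these plus the extra bookkeeping columns):

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# unsupervised: directions of maximal variance
pca_df = pd.DataFrame(PCA(n_components=2).fit_transform(X),
                      columns=["PC_1", "PC_2"], index=X.index)
# supervised: directions of maximal class separation (needs labels, i.e. `hue`)
lda_df = pd.DataFrame(LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y),
                      columns=["LDA_1", "LDA_2"], index=X.index)
print(pca_df.head(2))
print(lda_df.head(2))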
@@ -5922,7 +6367,7 @@ def plot_cluster(
     """
     import seaborn as sns
     from sklearn.metrics import silhouette_samples
-
+    import matplotlib.pyplot as plt
     if metrics is None:
         metrics = evaluate_cluster(data=data, labels=labels, true_labels=true_labels)
