py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

py2ls/ips.py CHANGED
@@ -16,17 +16,20 @@ import warnings
16
16
 
17
17
  warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
18
18
  warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
19
+ warnings.filterwarnings("ignore")
19
20
 
20
21
 
21
- def run_once_within(duration=60): # default 60s
22
+ def run_once_within(duration=60,reverse=False): # default 60s
22
23
  import time
23
24
 
24
25
  """
26
+ If reverse is True, do not run on the first call within the window; run on subsequent calls instead.
25
27
  usage:
26
28
  if run_once_within():
27
29
  print("This code runs once per minute.")
28
30
  else:
29
31
  print("The code has already been run in the last minute.")
32
+
30
33
  """
31
34
  if not hasattr(run_once_within, "time_last"):
32
35
  run_once_within.time_last = None
@@ -36,9 +39,9 @@ def run_once_within(duration=60): # default 60s
36
39
  time_curr - run_once_within.time_last >= duration
37
40
  ):
38
41
  run_once_within.time_last = time_curr # Update the last execution time
39
- return True
42
+ return False if reverse else True
40
43
  else:
41
- return False
44
+ return True if reverse else False
42
45
 
43
46
 
44
47
  def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
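Note: with reverse=True the return value is inverted — the first call inside the duration window returns False and later calls inside the window return True, which is how the fload hunks below defer the use_pd(...) usage hints to repeat calls. A minimal sketch of the behaviour (assuming run_once_within is importable from py2ls.ips):

from py2ls.ips import run_once_within  # assumed import path

for i in range(3):
    if run_once_within(duration=60, reverse=True):
        print(f"call {i}: runs")      # skipped on the 1st call, printed on later calls
    else:
        print(f"call {i}: skipped")   # only the 1st call within the 60 s window lands here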
@@ -539,8 +542,7 @@ def is_text(s):
539
542
 
540
543
  from typing import Any, Union
541
544
 
542
-
543
- def shared(*args, strict=True, n_shared=2, verbose=True):
545
+ def share(*args, strict=True, n_shared=2, verbose=True):
544
546
  """
545
547
  check the shared elements in two lists.
546
548
  usage:
@@ -585,12 +587,80 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
585
587
  elements2show = (
586
588
  shared_elements if len(shared_elements) < 10 else shared_elements[:5]
587
589
  )
590
+ tail = '' if len(shared_elements) < 10 else '......'
591
+ elements2show = elements2show + [tail] if tail else elements2show
588
592
  print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
589
593
  print("********* checking shared elements *********")
590
594
  return shared_elements
591
595
 
596
+ def shared(*args, n_shared=None, verbose=True,**kwargs):
597
+ """
598
+ check the shared elements in the given lists.
599
+ usage:
600
+ list1 = [1, 2, 3, 4, 5]
601
+ list2 = [4, 5, 6, 7, 8]
602
+ list3 = [5, 6, 9, 10]
603
+ a = shared(list1, list2,list3)
604
+ """
605
+ if verbose:
606
+ print("\n********* checking shared elements *********")
607
+
608
+ if len(args) == 1 and isinstance(args[0], list):
609
+ lists = args[0] # Unpack the single list
610
+ else:
611
+ lists = args # Use the provided arguments as lists
612
+ flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
613
+
614
+ if n_shared is None:
615
+ n_shared = len(flattened_lists)
616
+ strict = True
617
+ else:
618
+ strict = False
619
+ # Ensure all arguments are lists
620
+ if any(not isinstance(lst, list) for lst in flattened_lists):
621
+ print(f"{' ' * 2}All inputs must be lists.")
622
+ return []
623
+ first_list = flattened_lists[0]
624
+ shared_elements = [
625
+ item for item in first_list if all(item in lst for lst in flattened_lists)
626
+ ]
627
+ if strict:
628
+ # Strict mode: require elements to be in all lists
629
+ shared_elements = set(flattened_lists[0])
630
+ for lst in flattened_lists[1:]:
631
+ shared_elements.intersection_update(lst)
632
+ else:
633
+ from collections import Counter
634
+
635
+ all_elements = [item for sublist in flattened_lists for item in sublist]
636
+ element_count = Counter(all_elements)
637
+ # Get elements that appear in at least n_shared lists
638
+ shared_elements = [
639
+ item for item, count in element_count.items() if count >= n_shared
640
+ ]
641
+
642
+ shared_elements = flatten(shared_elements, verbose=verbose)
643
+ if verbose:
644
+ elements2show = (
645
+ shared_elements if len(shared_elements) < 10 else shared_elements[:5]
646
+ )
647
+ print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
648
+ print("********* checking shared elements *********")
649
+ return shared_elements
592
650
 
593
- def not_shared(*args, strict=True, n_shared=2, verbose=False):
651
+ def share_not(*args, n_shared=None, verbose=False):
652
+ """
653
+ To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
654
+ usage:
655
+ list1 = [1, 8, 3, 3, 4, 5]
656
+ list2 = [4, 5, 6, 7, 8]
657
+ share_not(list1, list2)  # output [1,3]
658
+ """
659
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
660
+ list1 = flatten(args[0], verbose=verbose)
661
+ _not_shared = [item for item in list1 if item not in _common]
662
+ return _not_shared
663
+ def not_shared(*args, n_shared=None, verbose=False):
594
664
  """
595
665
  To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
596
666
  usage:
@@ -598,7 +668,7 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
598
668
  list2 = [4, 5, 6, 7, 8]
599
669
  not_shared(list1,list2)# output [1,3]
600
670
  """
601
- _common = shared(*args, strict=strict, n_shared=n_shared, verbose=verbose)
671
+ _common = shared(*args, n_shared=n_shared, verbose=verbose)
602
672
  list1 = flatten(args[0], verbose=verbose)
603
673
  _not_shared = [item for item in list1 if item not in _common]
604
674
  return _not_shared
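Note: shared now defaults to a strict intersection across every input (when n_shared is None) and switches to a counted threshold when n_shared is given; share_not/not_shared keep the order of the first list. A small usage sketch (assumed import path; exact container types depend on flatten):

from py2ls.ips import shared, not_shared  # assumed import path

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

common_all  = shared(list1, list2, list3)               # strict: present in every list (only 5 here)
common_any2 = shared(list1, list2, list3, n_shared=2)   # present in at least 2 of these 3 lists
only_first  = not_shared(list1, list2)                  # elements of list1 missing from list2, order kept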
@@ -1981,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
1981
2051
 
1982
2052
  def load_csv(fpath, **kwargs):
1983
2053
  from pandas.errors import EmptyDataError
1984
-
1985
2054
  engine = kwargs.pop("engine", "pyarrow")# default: None
1986
2055
  sep = kwargs.pop("sep", None)# default: ','
1987
2056
  index_col = kwargs.pop("index_col", None)# default: None
@@ -1992,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
1992
2061
  comment = kwargs.pop("comment", None)# default: None
1993
2062
  fmt = kwargs.pop("fmt", False)# default:
1994
2063
  chunksize = kwargs.pop("chunksize", None)# default: None
2064
+
2065
+ #check filesize
2066
+ f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
2067
+ if f_size>=50: #50 MB
2068
+ if chunksize is None:
2069
+ chunksize = 5000
2070
+ print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
1995
2071
  engine = "c" if chunksize else engine # when chunksize, recommend 'c'
1996
2072
  low_memory = kwargs.pop("low_memory", True)# default: True
1997
2073
  low_memory = (
1998
2074
  False if chunksize else True
1999
2075
  ) # when chunksize, recommend low_memory=False # default:
2000
2076
  verbose = kwargs.pop("verbose", False)
2001
- if run_once_within():
2077
+ if run_once_within(reverse=True):
2002
2078
  use_pd("read_csv", verbose=verbose)
2003
2079
 
2004
2080
  if comment is None:# default: None
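Note: the new size check auto-chunks large reads — a CSV of 50 MB or more gets chunksize=5000 (and with it engine="c" and low_memory=False) unless the caller already supplied a chunksize; how the chunks are combined back into a DataFrame happens further down in load_csv and is not shown in this hunk. Usage sketch (file names are illustrative):

from py2ls.ips import fload  # assumed import path

data_auto  = fload("measurements.csv")                    # >= 50 MB: chunksize=5000 is set and printed
data_fixed = fload("measurements.csv", chunksize=20_000)  # an explicit chunksize is respected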
@@ -2174,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
2174
2250
  def load_excel(fpath, **kwargs):
2175
2251
  engine = kwargs.get("engine", "openpyxl")
2176
2252
  verbose = kwargs.pop("verbose", False)
2177
- if run_once_within():
2253
+ if run_once_within(reverse=True):
2178
2254
  use_pd("read_excel", verbose=verbose)
2179
2255
  df = pd.read_excel(fpath, engine=engine, **kwargs)
2180
2256
  try:
@@ -2204,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
2204
2280
  engine = kwargs.get("engine", "pyarrow")
2205
2281
  verbose = kwargs.pop("verbose", False)
2206
2282
 
2207
- if run_once_within():
2283
+ if run_once_within(reverse=True):
2208
2284
  use_pd("read_parquet", verbose=verbose)
2209
2285
  try:
2210
2286
  df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2381,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
2381
2457
  return load_xml(fpath)
2382
2458
  elif kind in ["csv", "tsv"]:
2383
2459
  # verbose = kwargs.pop("verbose", False)
2384
- if run_once_within():
2460
+ if run_once_within(reverse=True):
2385
2461
  use_pd("read_csv")
2386
2462
  content = load_csv(fpath, **kwargs)
2387
2463
  return content
2388
2464
  elif kind == "pkl":
2389
2465
  verbose = kwargs.pop("verbose", False)
2390
- if run_once_within():
2466
+ if run_once_within(reverse=True):
2391
2467
  use_pd("read_pickle")
2392
2468
  return pd.read_pickle(fpath, **kwargs)
2393
2469
  elif kind in ["ods", "ods", "odt"]:
@@ -2418,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
2418
2494
  return load_ipynb(fpath, **kwargs)
2419
2495
  elif kind in ["parquet", "snappy"]:
2420
2496
  verbose = kwargs.pop("verbose", False)
2421
- if run_once_within():
2497
+ if run_once_within(reverse=True):
2422
2498
  use_pd("read_parquet")
2423
2499
  return load_parquet(fpath, **kwargs)
2424
2500
  elif kind == "feather":
2425
2501
  verbose = kwargs.pop("verbose", False)
2426
- if run_once_within():
2502
+ if run_once_within(reverse=True):
2427
2503
  use_pd("read_feather")
2428
2504
  content = pd.read_feather(fpath, **kwargs)
2429
2505
  return content
@@ -2682,7 +2758,7 @@ def fsave(
2682
2758
  # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
2683
2759
 
2684
2760
  verbose = kwargs.pop("verbose", False)
2685
- if run_once_within():
2761
+ if run_once_within(reverse=True):
2686
2762
  use_pd("to_csv", verbose=verbose)
2687
2763
  kwargs_csv = dict(
2688
2764
  path_or_buf=None,
@@ -2714,7 +2790,7 @@ def fsave(
2714
2790
  def save_xlsx(fpath, data, **kwargs):
2715
2791
  verbose = kwargs.pop("verbose", False)
2716
2792
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2717
- if run_once_within():
2793
+ if run_once_within(reverse=True):
2718
2794
  use_pd("to_excel", verbose=verbose)
2719
2795
  if any(kwargs):
2720
2796
  format_excel(df=data, filename=fpath, **kwargs)
@@ -3497,12 +3573,8 @@ def figsave(*args, dpi=300):
3497
3573
  )
3498
3574
  else:
3499
3575
  plt.savefig(
3500
- fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
3501
- )
3502
- # elif ftype.lower() == "png":
3503
- # plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
3504
- # elif ftype.lower() in ["tiff", "tif"]:
3505
- # plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
3576
+ fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
3577
+ )
3506
3578
  elif ftype.lower() == "emf":
3507
3579
  plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
3508
3580
  elif ftype.lower() == "fig":
@@ -5230,16 +5302,16 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
5230
5302
  data = data.explode(column, ignore_index=True)
5231
5303
  return data
5232
5304
 
5233
- def df_circular(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
5305
+ def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
5234
5306
  """
5235
5307
  Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
5236
5308
  Usage:
5237
5309
  data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
5238
- # df_circular month cyclically
5239
- data = df_circular(data, 'month', 12)
5310
+ # df_cycle month cyclically
5311
+ data = df_cycle(data, 'month', 12)
5240
5312
  """
5241
5313
  if columns is None:
5242
- columns = list(data.columns) # If no columns specified, use all columns
5314
+ columns = list(data.select_dtypes(include=np.number).columns) # If no columns specified, use all columns
5243
5315
  if max_val is None:
5244
5316
  max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
5245
5317
  if isinstance(columns, str):
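Note: df_circular is renamed to df_cycle and now defaults to the numeric columns only. The encoding itself is not part of this hunk; the conventional cyclic (sine/cosine) transform it refers to looks like this generic pandas/numpy sketch, independent of py2ls:

import numpy as np
import pandas as pd

data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
max_val = 12
# classic cyclic encoding: month 12 ends up adjacent to month 1
data["month_sin"] = np.sin(2 * np.pi * data["month"] / max_val)
data["month_cos"] = np.cos(2 * np.pi * data["month"] / max_val)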
@@ -5424,7 +5496,7 @@ def df_astype(
5424
5496
  # print(f"Successfully converted '{column}' to timedelta.")
5425
5497
  elif astype == "circular":
5426
5498
  max_val = kwargs.get('max_val',None)
5427
- data[column]=df_circular(data=data,columns=column,max_val=max_val)
5499
+ data[column]=df_cycle(data=data,columns=column,max_val=max_val)
5428
5500
  else:
5429
5501
  # Convert to other types (e.g., float, int)
5430
5502
  if astype=='int':
@@ -5910,11 +5982,16 @@ def df_encoder(
5910
5982
 
5911
5983
  def df_scaler(
5912
5984
  data: pd.DataFrame, # should be numeric dtype
5985
+ scaler=None,
5913
5986
  method="standard",
5914
5987
  columns=None, # default, select all numeric col/row
5988
+ feature_range=None,# specific for 'minmax'
5989
+ vmin=0,
5990
+ vmax=1,
5915
5991
  inplace=False,
5916
5992
  verbose=False, # show usage
5917
5993
  axis=0, # default: column-wise
5994
+ return_scaler: bool = False,  # if True, return (scaled_df, scaler)
5918
5995
  **kwargs,
5919
5996
  ):
5920
5997
  """
@@ -5932,31 +6009,51 @@ def df_scaler(
5932
6009
  """
5933
6010
  if verbose:
5934
6011
  print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
5935
-
5936
- methods = ["standard", "minmax", "robust"]
5937
- method = strcmp(method, methods)[0]
5938
- if method == "standard":
5939
- from sklearn.preprocessing import StandardScaler
5940
-
5941
- scaler = StandardScaler(**kwargs)
5942
- elif method == "minmax":
5943
- from sklearn.preprocessing import MinMaxScaler
5944
-
5945
- scaler = MinMaxScaler(**kwargs)
5946
- elif method == "robust":
5947
- from sklearn.preprocessing import RobustScaler
5948
-
5949
- scaler = RobustScaler(**kwargs)
5950
- if axis not in [0, 1]:
5951
- raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
5952
-
6012
+ if scaler is None:
6013
+ methods = ["standard", "minmax", "robust","maxabs"]
6014
+ method = strcmp(method, methods)[0]
6015
+ if method == "standard":
6016
+ from sklearn.preprocessing import StandardScaler
6017
+ if verbose:
6018
+ print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
6019
+ print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
6020
+ scaler = StandardScaler(**kwargs)
6021
+ elif method == "minmax":
6022
+ from sklearn.preprocessing import MinMaxScaler
6023
+ if feature_range is None:
6024
+ feature_range=(vmin,vmax)
6025
+ if verbose:
6026
+ print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
6027
+ print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
6028
+ print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
6029
+ scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
6030
+ elif method == "robust":
6031
+ from sklearn.preprocessing import RobustScaler
6032
+ if verbose:
6033
+ print("scales the data based on the median and interquartile range, which is robust to outliers.")
6034
+ print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
6035
+ scaler = RobustScaler(**kwargs)
6036
+ elif method=="maxabs":
6037
+ from sklearn.preprocessing import MaxAbsScaler
6038
+ if verbose:
6039
+ print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
6040
+ print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
6041
+ scaler = MaxAbsScaler(**kwargs)
6042
+ if axis not in [0, 1]:
6043
+ raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
6044
+ if verbose:
6045
+ print(scaler)
5953
6046
  if axis == 0:
5954
6047
  # Column-wise scaling (default)
5955
6048
  if columns is None:
5956
6049
  columns = data.select_dtypes(include=np.number).columns.tolist()
5957
6050
  non_numeric_columns = data.columns.difference(columns)
5958
6051
 
5959
- scaled_data = scaler.fit_transform(data[columns])
6052
+ # scaled_data = scaler.fit_transform(data[columns])
6053
+ if scaler is None or not hasattr(scaler, 'mean_'):
6054
+ scaled_data = scaler.fit_transform(data[columns])
6055
+ else:
6056
+ scaled_data = scaler.transform(data[columns])
5960
6057
 
5961
6058
  if inplace:
5962
6059
  data[columns] = scaled_data
@@ -5970,7 +6067,10 @@ def df_scaler(
5970
6067
  axis=1,
5971
6068
  )
5972
6069
  scaled_df = scaled_df[data.columns] # Maintain column order
5973
- return scaled_df
6070
+ if return_scaler:
6071
+ return scaled_df,scaler
6072
+ else:
6073
+ return scaled_df
5974
6074
 
5975
6075
  elif axis == 1:
5976
6076
  # Row-wise scaling
@@ -5982,9 +6082,10 @@ def df_scaler(
5982
6082
 
5983
6083
  print(f"Scaling rows")
5984
6084
 
5985
- scaled_data = scaler.fit_transform(
5986
- numeric_rows.T
5987
- ).T # Transpose for scaling and then back
6085
+ # scaled_data = scaler.fit_transform(
6086
+ # numeric_rows.T
6087
+ # ).T # Transpose for scaling and then back
6088
+ scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T
5988
6089
 
5989
6090
  if inplace:
5990
6091
  data.loc[numeric_rows.index] = scaled_data
@@ -5992,7 +6093,10 @@ def df_scaler(
5992
6093
  else:
5993
6094
  scaled_df = data.copy()
5994
6095
  scaled_df.loc[numeric_rows.index] = scaled_data
5995
- return scaled_df
6096
+ if return_scaler:
6097
+ return scaled_df,scaler
6098
+ else:
6099
+ return scaled_df
5996
6100
 
5997
6101
 
5998
6102
  def df_special_characters_cleaner(
@@ -6010,15 +6114,20 @@ def df_special_characters_cleaner(
6010
6114
 
6011
6115
  # 1. Clean column names by replacing special characters with underscores
6012
6116
  if "column" in where_:
6013
- data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
6117
+ try:
6118
+ data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
6119
+ except Exception as e:
6120
+ print(e)
6014
6121
 
6015
6122
  # 2. Clean only object-type columns (text columns)
6016
- if "content" in where_:
6017
- for col in data.select_dtypes(include=["object"]).columns:
6018
- data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
6019
- if data.index.dtype == "object" and index in where_:
6020
- data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
6021
-
6123
+ try:
6124
+ if "content" in where_:
6125
+ for col in data.select_dtypes(include=["object"]).columns:
6126
+ data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
6127
+ if data.index.dtype == "object" and index in where_:
6128
+ data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
6129
+ except:
6130
+ pass
6022
6131
  return data
6023
6132
 
6024
6133
 
@@ -6401,6 +6510,9 @@ def df_reducer(
6401
6510
  # "autoencoder","nmf",
6402
6511
  ]
6403
6512
  method = strcmp(method, methods)[0]
6513
+ if run_once_within(reverse=True):
6514
+ print(f"support methods:{methods}")
6515
+
6404
6516
  if verbose:
6405
6517
  print(f"\nprocessing with using {dict_methods[method]}:")
6406
6518
  xlabel, ylabel = None, None
@@ -6408,16 +6520,20 @@ def df_reducer(
6408
6520
  columns = data.select_dtypes(include="number").columns.tolist()
6409
6521
  if hue is None:
6410
6522
  hue = data.select_dtypes(exclude="number").columns.tolist()
6523
+ print(f"auto select the non-number as 'hue':{hue}")
6411
6524
  if isinstance(hue, list):
6412
6525
  print("Warning: hue is a list, only select the 1st one")
6413
6526
  hue = hue[0]
6414
- if not hue:
6527
+ if not any(hue):
6415
6528
  # Select columns if specified, else use all columns
6416
6529
  X = data[columns].values if columns else data.values
6417
6530
  else:
6418
6531
  # Select columns to reduce and hue for LDA
6419
- X = data[columns].values if columns else data.drop(columns=[hue]).values
6420
- y = data[hue].values
6532
+ try:
6533
+ X = data[columns].values if columns else data.drop(columns=[hue]).values
6534
+ y = data[hue].values
6535
+ except:
6536
+ pass
6421
6537
  print(X.shape)
6422
6538
  # Handle missing values
6423
6539
  if fill_missing:
@@ -6884,33 +7000,49 @@ def df_reducer(
6884
7000
  colname_met = "SVD_"
6885
7001
  # Quick plots
6886
7002
  if plot_ and (not method in ["isolation_forest"]):
6887
- from .plot import plotxy
6888
- if ax is None:
6889
- if figsize is None:
6890
- _, ax = plt.subplots(figsize=cm2inch(8, 8))
6891
- else:
6892
- _, ax = plt.subplots(figsize=figsize)
6893
- else:
6894
- ax = ax.cla()
7003
+ from .plot import plotxy,figsets,get_color
7004
+ # if ax is None:
7005
+ # if figsize is None:
7006
+ # _, ax = plt.subplots(figsize=cm2inch(8, 8))
7007
+ # else:
7008
+ # _, ax = plt.subplots(figsize=figsize)
7009
+ # else:
7010
+ # ax = ax.cla()
6895
7011
  xlabel = f"{colname_met}1" if xlabel is None else xlabel
6896
7012
  ylabel = f"{colname_met}2" if ylabel is None else ylabel
7013
+ palette=get_color(len(flatten(data[hue],verbose=0)))
7014
+
7015
+ reduced_df=reduced_df.sort_values(by=hue)
7016
+ print(flatten(reduced_df[hue]))
6897
7017
  ax = plotxy(
6898
7018
  data=reduced_df,
6899
7019
  x=colname_met + "1",
6900
7020
  y=colname_met + "2",
6901
7021
  hue=hue,
6902
- s=size,
7022
+ palette=palette,
7023
+ # size=size,
6903
7024
  edgecolor=edgecolor,
6904
- kind_="scater",
6905
- figsets=dict(
6906
- legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
6907
- xlabel=xlabel if xlabel else None,
6908
- ylabel=ylabel if ylabel else None,
6909
- ),
6910
- ax=ax,
7025
+ kind_=["joint",
7026
+ # "kde",
7027
+ "ell",
7028
+ ],
7029
+ kws_kde=dict(
7030
+ hue=hue,
7031
+ levels=2,
7032
+ common_norm=False,
7033
+ fill=True,
7034
+ alpha=0.05,
7035
+ ),
7036
+ kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
7037
+ kws_ellipse=dict(alpha=0.1,lw=1,label=None),
6911
7038
  verbose=False,
6912
7039
  **kwargs,
6913
7040
  )
7041
+ figsets(
7042
+ legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
7043
+ xlabel=xlabel if xlabel else None,
7044
+ ylabel=ylabel if ylabel else None,
7045
+ )
6914
7046
 
6915
7047
  if inplace:
6916
7048
  # If inplace=True, add components back into the original data
@@ -7387,6 +7519,7 @@ def df_qc(
7387
7519
  from statsmodels.stats.outliers_influence import variance_inflation_factor
7388
7520
  from scipy.stats import skew, kurtosis, entropy
7389
7521
 
7522
+ pd.options.display.max_seq_items = 10
7390
7523
  #! display(data.select_dtypes(include=[np.number]).describe())
7391
7524
  #!skim
7392
7525
  if columns is not None:
@@ -7403,16 +7536,18 @@ def df_qc(
7403
7536
  data = data.copy()
7404
7537
  data.loc[:, data.isna().all()] = 0
7405
7538
  res_qc = {}
7406
- print(f"data.shape:{data.shape}")
7539
+ print(f"data.shape:{data.shape}\n⤵ data.sample(10):")
7540
+ display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
7407
7541
 
7408
7542
  # Missing values
7409
7543
  res_qc["missing_values"] = data.isnull().sum()
7410
- res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
7544
+ res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
7411
7545
  res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
7412
7546
 
7413
7547
  # Data types and unique values
7414
7548
  res_qc["data_types"] = data.dtypes
7415
- res_qc["unique_values"] = data.nunique()
7549
+ res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
7550
+ res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
7416
7551
  res_qc["constant_columns"] = [
7417
7552
  col for col in data.columns if data[col].nunique() <= 1
7418
7553
  ]
@@ -7428,33 +7563,42 @@ def df_qc(
7428
7563
  data_outliers = df_outlier(data)
7429
7564
  outlier_num = data_outliers.isna().sum() - data.isnull().sum()
7430
7565
  res_qc["outlier_num"] = outlier_num[outlier_num > 0]
7431
- outlier_percentage=(outlier_num / len(data_outliers)) * 100
7566
+ outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
7432
7567
  res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
7433
- # Correlation and multicollinearity (VIF)
7434
- if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
7435
- numeric_df = data.select_dtypes(include=[np.number]).dropna()
7436
- corr_matrix = numeric_df.corr()
7437
- high_corr_pairs = [
7438
- (col1, col2)
7439
- for col1 in corr_matrix.columns
7440
- for col2 in corr_matrix.columns
7441
- if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
7442
- ]
7443
- res_qc["high_correlations"] = high_corr_pairs
7444
-
7445
- # VIF for multicollinearity check
7446
- numeric_df = data.select_dtypes(include=[np.number]).dropna()
7447
- vif_data = pd.DataFrame()
7448
- res_qc["vif"]=vif_data
7449
- if numeric_df.shape[1] > 1 and not numeric_df.empty:
7450
- vif_data["feature"] = numeric_df.columns
7451
- vif_data["VIF"] = [
7452
- variance_inflation_factor(numeric_df.values, i)
7453
- for i in range(numeric_df.shape[1])
7568
+ try:
7569
+ # Correlation and multicollinearity (VIF)
7570
+ if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
7571
+ numeric_df = data.select_dtypes(include=[np.number]).dropna()
7572
+ corr_matrix = numeric_df.corr()
7573
+ high_corr_pairs = [
7574
+ (col1, col2)
7575
+ for col1 in corr_matrix.columns
7576
+ for col2 in corr_matrix.columns
7577
+ if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
7454
7578
  ]
7455
- res_qc["vif"] = vif_data[
7456
- vif_data["VIF"] > 5
7457
- ] # Typically VIF > 5 indicates multicollinearity
7579
+ res_qc["high_correlations"] = high_corr_pairs
7580
+
7581
+ # VIF for multicollinearity check
7582
+ numeric_df = data.select_dtypes(include=[np.number]).dropna()
7583
+ if isinstance(numeric_df.columns, pd.MultiIndex):
7584
+ numeric_df.columns = [
7585
+ "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
7586
+ ]
7587
+
7588
+
7589
+ vif_data = pd.DataFrame()
7590
+ res_qc["vif"]=vif_data
7591
+ if numeric_df.shape[1] > 1 and not numeric_df.empty:
7592
+ vif_data["feature"] = numeric_df.columns.tolist()
7593
+ vif_data["VIF"] = [
7594
+ round(variance_inflation_factor(numeric_df.values, i),2)
7595
+ for i in range(numeric_df.shape[1])
7596
+ ]
7597
+ res_qc["vif"] = vif_data[
7598
+ vif_data["VIF"] > 5
7599
+ ] # Typically VIF > 5 indicates multicollinearity
7600
+ except Exception as e:
7601
+ print(e)
7458
7602
  # Skewness and Kurtosis
7459
7603
  skewness = data.skew(numeric_only=True)
7460
7604
  kurtosis_vals = data.kurt(numeric_only=True)
@@ -7467,8 +7611,7 @@ def df_qc(
7467
7611
  col: entropy(data[col].value_counts(normalize=True), base=2)
7468
7612
  for col in categorical_cols
7469
7613
  }
7470
- # number of unique
7471
- res_qc["unique_counts"] = data.nunique()
7614
+
7472
7615
  # dtypes counts
7473
7616
  res_qc['dtype_counts']=data.dtypes.value_counts()
7474
7617
 
@@ -7515,7 +7658,7 @@ def df_qc(
7515
7658
  res_qc["text_length_analysis"] = text_lengths
7516
7659
 
7517
7660
  # Summary statistics
7518
- res_qc["summary_statistics"] = data.describe().T
7661
+ res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
7519
7662
 
7520
7663
  # Automated warnings
7521
7664
  warnings = []
@@ -7537,28 +7680,45 @@ def df_qc(
7537
7680
 
7538
7681
  # Report generation
7539
7682
  if verbose:
7540
- print("=== QC Report Summary ===")
7541
7683
  print("\n⤵ Summary Statistics:")
7542
7684
  display(res_qc["summary_statistics"])
7543
7685
  print("\n⤵ Data Types:")
7544
7686
  display(res_qc["data_types"])
7545
7687
  if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
7546
7688
  print(" ⤵ Missing Values Counts:")
7547
- display(res_qc["missing_values"][res_qc["missing_values"] > 0])
7689
+ display(pd.DataFrame(
7690
+ {
7691
+ "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
7692
+ "missing_percent(%)": res_qc["missing_percentage"][
7693
+ res_qc["missing_percentage"] > 0
7694
+ ],
7695
+ }
7696
+ ).style.background_gradient(cmap="coolwarm", axis=0)
7697
+ )
7548
7698
  # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
7549
7699
  print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
7550
7700
 
7701
+ print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
7702
+ print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
7703
+ print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
7704
+
7551
7705
  if any(res_qc["outlier_num"]):
7552
7706
  print("\n⤵ Outlier Report:")
7553
- display(res_qc["outlier_num"])
7554
- if any(res_qc["unique_values"]):
7555
- print("\n⤵ Unique Values per Column:")
7556
- display(res_qc["unique_values"])
7707
+ display(pd.DataFrame(
7708
+ {
7709
+ "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
7710
+ "outlier_percentage(%)": res_qc["outlier_percentage"][
7711
+ res_qc["outlier_percentage"] > 0
7712
+ ],
7713
+ }
7714
+ ).style.background_gradient(cmap="coolwarm", axis=0)
7715
+ )
7557
7716
 
7558
- print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
7717
+ if any(res_qc["unique_counts"]):
7718
+ print("\n⤵ Unique Values per Column:")
7719
+ display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
7720
+ "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
7559
7721
 
7560
- print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
7561
- print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
7562
7722
 
7563
7723
  if res_qc["empty_columns"]:
7564
7724
  print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7570,7 +7730,7 @@ def df_qc(
7570
7730
 
7571
7731
  if "vif" in res_qc:
7572
7732
  print("\n⤵ Features with High VIF (>|5|):")
7573
- print(res_qc["vif"])
7733
+ display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
7574
7734
 
7575
7735
  if any(res_qc["high_cardinality_categoricals"]):
7576
7736
  print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7589,28 +7749,27 @@ def df_qc(
7589
7749
  print("\nWarnings:")
7590
7750
  for warning in res_qc["warnings"]:
7591
7751
  print(" -", warning)
7752
+
7753
+ pd.reset_option("display.max_seq_items")
7592
7754
  if plot_:
7593
- df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
7594
- if dir_save:
7595
- try:
7596
- figsave(dir_save)
7597
- except Exception as e:
7598
- print(f"⚠️: {e}")
7599
- if output:
7755
+ df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
7756
+ if output or not plot_:
7600
7757
  return res_qc
7601
7758
  return None
7602
7759
 
7603
7760
 
7604
- def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
7761
+ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
7605
7762
  import matplotlib.pyplot as plt
7606
7763
  import seaborn as sns
7607
7764
  from .plot import subplot, figsets, get_color
7765
+ from datetime import datetime
7766
+ now_ = datetime.now().strftime("%y%m%d_%H%M%S")
7608
7767
 
7609
7768
  if columns is not None:
7610
7769
  if isinstance(columns, (list,pd.core.indexes.base.Index)):
7611
7770
  data=data[columns]
7612
7771
  len_total = len(res_qc)
7613
- n_row, n_col = int((len_total + 10) / 3), 3
7772
+ n_row, n_col = int((len_total + 10)), 3
7614
7773
  nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
7615
7774
 
7616
7775
  missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7638,15 +7797,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
7638
7797
  ax=nexttile(),
7639
7798
  )
7640
7799
  figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
7641
-
7642
- #!
7643
- try:
7644
- if data.select_dtypes(include=np.number).shape[1]<=10:
7645
- for col in data.select_dtypes(include=np.number).columns:
7646
- sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
7647
- figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
7648
- except:
7649
- pass
7800
+
7650
7801
  #!
7651
7802
  try:
7652
7803
  for col in data.select_dtypes(include='category').columns:
@@ -7775,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
7775
7926
  title="Dtypes",
7776
7927
  ylabel="#",
7777
7928
  ax=ax_dtype_counts,
7778
- fontsize=8 if len(dtype_counts.index)<=20 else 6,
7929
+ fontsize=8 if len(dtype_counts.index)<=20 else 6,
7779
7930
  )
7931
+ # from .plot import pie
7932
+ # pie()
7780
7933
 
7781
7934
  # High cardinality: Show top categorical columns by unique value count
7782
7935
  high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7857,6 +8010,79 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
7857
8010
  title="Correlation Heatmap",
7858
8011
  ax=ax_heatmap
7859
8012
  )
8013
+ # # save figure
8014
+ # if dir_save:
8015
+ # figsave(dir_save,f"qc_plot_{now_}.pdf")
8016
+
8017
+ if columns is not None:
8018
+ if isinstance(columns, (list,pd.core.indexes.base.Index)):
8019
+ data=data[columns]
8020
+
8021
+ # len_total = len(res_qc)
8022
+ # n_row, n_col = int((len_total + 10) / 3), 3
8023
+ # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
8024
+ #! check distribution
8025
+ data_num = data.select_dtypes(include=np.number)
8026
+ if data_num.shape[1] > max_cols:
8027
+ data_num = data_num.iloc[:,:max_cols]
8028
+
8029
+ data_num = df_scaler(data=data_num, method='standard')
8030
+
8031
+ import scipy.stats as stats
8032
+ for column in data_num.columns:
8033
+ #* Shapiro-Wilk test for normality
8034
+ stat, p_value = stats.shapiro(data_num[column])
8035
+ normality = "norm" if p_value > 0.05 else "not_norm"
8036
+ #* Plot histogram
8037
+ ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
8038
+ x_min, x_max = ax_hist.get_xlim()
8039
+ y_min, y_max = ax_hist.get_ylim()
8040
+ ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
8041
+ f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
8042
+ ha='center', va='top')
8043
+ figsets(title=column,ax=ax_hist)
8044
+ ax_twin=ax_hist.twinx()
8045
+ #* Q-Q plot
8046
+ stats.probplot(data_num[column], dist="norm", plot=ax_twin)
8047
+ figsets(ylabel=f'Q-Q Plot:{column}',title=None)
8048
+ # save figure
8049
+ if dir_save:
8050
+ figsave(dir_save,f"qc_plot_{now_}.pdf")
8051
+
8052
+ def df_corr(df: pd.DataFrame, method="pearson"):
8053
+ """
8054
+ Compute correlation coefficients and p-values for a DataFrame.
8055
+
8056
+ Parameters:
8057
+ - df (pd.DataFrame): Input DataFrame with numeric data.
8058
+ - method (str): Correlation method ("pearson", "spearman", "kendall").
8059
+
8060
+ Returns:
8061
+ - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
8062
+ - pval_matrix (pd.DataFrame): P-value matrix.
8063
+ """
8064
+ from scipy.stats import pearsonr, spearmanr, kendalltau
8065
+
8066
+ methods = ["pearson", "spearman", "kendall"]
8067
+ method = strcmp(method, methods)[0]
8068
+ methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
8069
+
8070
+ cols = df.columns
8071
+ corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
8072
+ pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
8073
+ correlation_func = methods_dict[method]
8074
+
8075
+ for col1 in cols:
8076
+ for col2 in cols:
8077
+ if col1 == col2:
8078
+ corr_matrix.loc[col1, col2] = 1.0
8079
+ pval_matrix.loc[col1, col2] = 0.0
8080
+ else:
8081
+ corr, pval = correlation_func(df[col1], df[col2])
8082
+ corr_matrix.loc[col1, col2] = corr
8083
+ pval_matrix.loc[col1, col2] = pval
8084
+
8085
+ return corr_matrix, pval_matrix
7860
8086
 
7861
8087
  def use_pd(
7862
8088
  func_name="excel",
@@ -7877,3 +8103,135 @@ def use_pd(
7877
8103
  except Exception as e:
7878
8104
  if verbose:
7879
8105
  print(e)
8106
+
8107
+ def get_phone(phone_number: str, region: str = None,verbose=True):
8108
+ """
8109
+ usage:
8110
+ info = get_phone(15237654321, "DE")
8111
+ preview(info)
8112
+
8113
+ Extremely advanced phone number analysis function.
8114
+
8115
+ Args:
8116
+ phone_number (str): The phone number to analyze.
8117
+ region (str): Default None; with None, the number should include its international country code. Otherwise the specified region is used for parsing.
8118
+
8119
+ Returns:
8120
+ dict: Comprehensive information about the phone number.
8121
+ """
8122
+ import phonenumbers
8123
+ from phonenumbers import geocoder, carrier, timezone, number_type
8124
+ from datetime import datetime
8125
+ import pytz
8126
+ from tzlocal import get_localzone
8127
+
8128
+ if not isinstance(phone_number, str):
8129
+ phone_number = str(phone_number)
8130
+ if isinstance(region, str):
8131
+ region = region.upper()
8132
+
8133
+ try:
8134
+ # Parse the phone number
8135
+ parsed_number = phonenumbers.parse(phone_number, region)
8136
+
8137
+ # Validate the phone number
8138
+ valid = phonenumbers.is_valid_number(parsed_number)
8139
+ possible = phonenumbers.is_possible_number(parsed_number)
8140
+
8141
+ if not valid:
8142
+ suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
8143
+ return {
8144
+ "valid": False,
8145
+ "error": "Invalid phone number",
8146
+ "suggested_fix": suggested_fix,
8147
+ }
8148
+
8149
+ # Basic details
8150
+ formatted_international = phonenumbers.format_number(
8151
+ parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
8152
+ )
8153
+ formatted_national = phonenumbers.format_number(
8154
+ parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
8155
+ )
8156
+ formatted_e164 = phonenumbers.format_number(
8157
+ parsed_number, phonenumbers.PhoneNumberFormat.E164
8158
+ )
8159
+ country_code = parsed_number.country_code
8160
+ region_code = geocoder.region_code_for_number(parsed_number)
8161
+ country_name = geocoder.country_name_for_number(parsed_number, "en")
8162
+
8163
+ location = geocoder.description_for_number(parsed_number, "en")
8164
+ carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
8165
+ time_zones = timezone.time_zones_for_number(parsed_number)[0]
8166
+ current_times = datetime.now(pytz.timezone(time_zones)).strftime(
8167
+ "%Y-%m-%d %H:%M:%S %Z"
8168
+ )
8169
+ number_type_str = {
8170
+ phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
8171
+ phonenumbers.PhoneNumberType.MOBILE: "Mobile",
8172
+ phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
8173
+ phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
8174
+ phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
8175
+ phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
8176
+ phonenumbers.PhoneNumberType.VOIP: "VOIP",
8177
+ phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
8178
+ phonenumbers.PhoneNumberType.PAGER: "Pager",
8179
+ phonenumbers.PhoneNumberType.UAN: "UAN",
8180
+ phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
8181
+ }.get(number_type(parsed_number), "Unknown")
8182
+
8183
+ # Advanced Features
8184
+ is_toll_free = (
8185
+ number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
8186
+ )
8187
+ is_premium_rate = (
8188
+ number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
8189
+ )
8190
+
8191
+ # Dialing Information
8192
+ dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
8193
+
8194
+ # Advanced Timezone Handling
8195
+ gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
8196
+ # Get the local timezone (current computer's time)
8197
+ local_timezone = get_localzone()
8198
+ #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
8199
+ local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
8200
+ offset_diff = local_offset - gmt_offsets
8201
+ head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
8202
+ res= {
8203
+ "valid": True,
8204
+ "possible": possible,
8205
+ "formatted": {
8206
+ "international": formatted_international,
8207
+ "national": formatted_national,
8208
+ "e164": formatted_e164,
8209
+ },
8210
+ "country_code": country_code,
8211
+ "country_name": country_name,
8212
+ "region_code": region_code,
8213
+ "location": location if location else "Unknown",
8214
+ "carrier": carrier_name,
8215
+ "time_zone": time_zones,
8216
+ "current_times": current_times,
8217
+ "local_offset":f"{local_offset} utcoffset",
8218
+ "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
8219
+ "number_type": number_type_str,
8220
+ "is_toll_free": is_toll_free,
8221
+ "is_premium_rate": is_premium_rate,
8222
+ "dialing_instructions": dialing_instructions,
8223
+ "suggested_fix": None, # Use phonenumbers.example_number if invalid
8224
+ "logs": {
8225
+ "number_analysis_completed": datetime.now().strftime(
8226
+ "%Y-%m-%d %H:%M:%S"
8227
+ ),
8228
+ "raw_input": phone_number,
8229
+ "parsed_number": str(parsed_number),
8230
+ },
8231
+ }
8232
+
8233
+ except phonenumbers.NumberParseException as e:
8234
+ res= {"valid": False, "error": str(e)}
8235
+ if verbose:
8236
+ preview(res)
8237
+ return res
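
Note: get_phone accepts either a full international number (region=None) or a national number plus a region code; with verbose=True the result is also shown via preview. Usage sketch (the number is made up):

from py2ls.ips import get_phone  # assumed import path

info = get_phone("+49 152 37654321", verbose=False)  # international format, no region needed
if info["valid"]:
    print(info["formatted"]["e164"], info["carrier"], info["time_zone"])

info_de = get_phone(15237654321, region="de", verbose=False)  # national number + region (case-insensitive)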