py2ls 0.2.4.25__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ips.py +399 -91
- py2ls/ml2ls.py +758 -186
- py2ls/netfinder.py +16 -20
- py2ls/plot.py +916 -141
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +15 -13
- py2ls/data/usages_pd copy.json +0 -1105
- {py2ls-0.2.4.25.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -16,6 +16,7 @@ import warnings
 
 warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+warnings.filterwarnings("ignore")
 
 
 def run_once_within(duration=60,reverse=False): # default 60s
@@ -541,8 +542,7 @@ def is_text(s):
 
 from typing import Any, Union
 
-
-def shared(*args, strict=True, n_shared=2, verbose=True):
+def share(*args, strict=True, n_shared=2, verbose=True):
     """
     check the shared elelements in two list.
     usage:
@@ -587,12 +587,68 @@ def shared(*args, strict=True, n_shared=2, verbose=True):
         elements2show = (
             shared_elements if len(shared_elements) < 10 else shared_elements[:5]
         )
+        tail = '' if len(shared_elements) < 10 else '......'
+        elements2show.append(tail)
         print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
         print("********* checking shared elements *********")
     return shared_elements
 
+def shared(*args, n_shared=None, verbose=True,**kwargs):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+
+    if len(args) == 1 and isinstance(args[0], list):
+        lists = args[0]  # Unpack the single list
+    else:
+        lists = args  # Use the provided arguments as lists
+    flattened_lists = [flatten(lst, verbose=verbose) for lst in lists]
+
+    if n_shared is None:
+        n_shared = len(flattened_lists)
+        strict = True
+    else:
+        strict = False
+    # Ensure all arguments are lists
+    if any(not isinstance(lst, list) for lst in flattened_lists):
+        print(f"{' ' * 2}All inputs must be lists.")
+        return []
+    first_list = flattened_lists[0]
+    shared_elements = [
+        item for item in first_list if all(item in lst for lst in flattened_lists)
+    ]
+    if strict:
+        # Strict mode: require elements to be in all lists
+        shared_elements = set(flattened_lists[0])
+        for lst in flattened_lists[1:]:
+            shared_elements.intersection_update(lst)
+    else:
+        from collections import Counter
 
-
+        all_elements = [item for sublist in flattened_lists for item in sublist]
+        element_count = Counter(all_elements)
+        # Get elements that appear in at least n_shared lists
+        shared_elements = [
+            item for item, count in element_count.items() if count >= n_shared
+        ]
+
+    shared_elements = flatten(shared_elements, verbose=verbose)
+    if verbose:
+        elements2show = (
+            shared_elements if len(shared_elements) < 10 else shared_elements[:5]
+        )
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def share_not(*args, n_shared=None, verbose=False):
     """
     To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
     usage:
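Net effect of this hunk: the strict checker keeps its old behavior under the new name share(), while shared() is reimplemented around an n_shared threshold; by default an element must occur in every input list, and n_shared=k relaxes that to "appears in at least k of the lists". A quick sketch of the intended behavior, assuming py2ls 0.2.4.26 is installed (outputs are inferred from the Counter logic above, so treat them as illustrative):

import py2ls
from py2ls.ips import shared

list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list3 = [5, 6, 9, 10]

# Default (strict): the element must occur in all three lists -> only 5
print(shared(list1, list2, list3, verbose=False))

# Relaxed: the element must occur in at least 2 of the 3 lists -> 4, 5, 6
print(shared(list1, list2, list3, n_shared=2, verbose=False))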
@@ -600,7 +656,19 @@ def not_shared(*args, strict=True, n_shared=2, verbose=False):
         list2 = [4, 5, 6, 7, 8]
         not_shared(list1,list2)# output [1,3]
     """
-    _common = shared(*args,
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
+    list1 = flatten(args[0], verbose=verbose)
+    _not_shared = [item for item in list1 if item not in _common]
+    return _not_shared
+def not_shared(*args, n_shared=None, verbose=False):
+    """
+    To find the elements in list1 that are not shared with list2 while maintaining the original order of list1
+    usage:
+        list1 = [1, 8, 3, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        not_shared(list1,list2)# output [1,3]
+    """
+    _common = shared(*args, n_shared=n_shared, verbose=verbose)
     list1 = flatten(args[0], verbose=verbose)
     _not_shared = [item for item in list1 if item not in _common]
     return _not_shared
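share_not() and the re-implemented not_shared() are the order-preserving complement of shared(): they keep the elements of the first list that are absent from the shared set. A usage sketch under the same assumption that py2ls 0.2.4.26 is installed:

from py2ls.ips import not_shared

list1 = [1, 8, 3, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
print(not_shared(list1, list2))  # per the docstring, this yields [1, 3]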
@@ -1983,7 +2051,6 @@ def fload(fpath, kind=None, **kwargs):
 
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-
         engine = kwargs.pop("engine", "pyarrow")# default: None
         sep = kwargs.pop("sep", None)# default: ','
         index_col = kwargs.pop("index_col", None)# default: None
@@ -1994,13 +2061,20 @@ def fload(fpath, kind=None, **kwargs):
         comment = kwargs.pop("comment", None)# default: None
         fmt = kwargs.pop("fmt", False)# default:
         chunksize = kwargs.pop("chunksize", None)# default: None
+
+        #check filesize
+        f_size=round(os.path.getsize(fpath) / 1024 / 1024, 3)
+        if f_size>=50: #50 MB
+            if chunksize is None:
+                chunksize = 5000
+                print(f"file size is {f_size}MB, then set the chunksize with {chunksize}")
         engine = "c" if chunksize else engine # when chunksize, recommend 'c'
         low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
         ) # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv", verbose=verbose)
 
         if comment is None:# default: None
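The new block sizes the file before reading: anything at or above 50 MB gets chunksize=5000, which in turn switches the parser to the "c" engine with low_memory=False. The same heuristic can be reproduced with plain pandas; a minimal sketch, with the path as a placeholder:

import os
import pandas as pd

fpath = "data/big_table.csv"  # placeholder path
f_size = round(os.path.getsize(fpath) / 1024 / 1024, 3)  # file size in MB
chunksize = 5000 if f_size >= 50 else None

if chunksize:
    # Stream the file in 5000-row chunks instead of loading it in one shot
    chunks = pd.read_csv(fpath, engine="c", low_memory=False, chunksize=chunksize)
    df = pd.concat(chunks, ignore_index=True)
else:
    df = pd.read_csv(fpath)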
@@ -2176,7 +2250,7 @@ def fload(fpath, kind=None, **kwargs):
     def load_excel(fpath, **kwargs):
         engine = kwargs.get("engine", "openpyxl")
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_excel", verbose=verbose)
         df = pd.read_excel(fpath, engine=engine, **kwargs)
         try:
@@ -2206,7 +2280,7 @@ def fload(fpath, kind=None, **kwargs):
         engine = kwargs.get("engine", "pyarrow")
         verbose = kwargs.pop("verbose", False)
 
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet", verbose=verbose)
         try:
             df = pd.read_parquet(fpath, engine=engine, **kwargs)
@@ -2383,13 +2457,13 @@ def fload(fpath, kind=None, **kwargs):
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
         # verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
         return content
     elif kind == "pkl":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_pickle")
         return pd.read_pickle(fpath, **kwargs)
     elif kind in ["ods", "ods", "odt"]:
@@ -2420,12 +2494,12 @@ def fload(fpath, kind=None, **kwargs):
         return load_ipynb(fpath, **kwargs)
     elif kind in ["parquet", "snappy"]:
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_parquet")
         return load_parquet(fpath, **kwargs)
     elif kind == "feather":
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("read_feather")
         content = pd.read_feather(fpath, **kwargs)
         return content
@@ -2684,7 +2758,7 @@ def fsave(
         # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
 
         verbose = kwargs.pop("verbose", False)
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_csv", verbose=verbose)
         kwargs_csv = dict(
             path_or_buf=None,
@@ -2716,7 +2790,7 @@ def fsave(
     def save_xlsx(fpath, data, **kwargs):
         verbose = kwargs.pop("verbose", False)
         sheet_name = kwargs.pop("sheet_name", "Sheet1")
-        if run_once_within():
+        if run_once_within(reverse=True):
             use_pd("to_excel", verbose=verbose)
         if any(kwargs):
             format_excel(df=data, filename=fpath, **kwargs)
@@ -5911,6 +5985,9 @@ def df_scaler(
     scaler=None,
     method="standard",
     columns=None, # default, select all numeric col/row
+    feature_range=None,# specific for 'minmax'
+    vmin=0,
+    vmax=1,
     inplace=False,
     verbose=False, # show usage
     axis=0, # defalut column-wise
@@ -5943,11 +6020,13 @@ def df_scaler(
         scaler = StandardScaler(**kwargs)
     elif method == "minmax":
         from sklearn.preprocessing import MinMaxScaler
+        if feature_range is None:
+            feature_range=(vmin,vmax)
         if verbose:
             print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
             print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
             print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
-        scaler = MinMaxScaler(
+        scaler = MinMaxScaler(feature_range=feature_range,**kwargs)
     elif method == "robust":
         from sklearn.preprocessing import RobustScaler
         if verbose:
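df_scaler's minmax branch now accepts the target range either as feature_range=(min, max) or via the new vmin/vmax parameters, which are folded into feature_range before the scaler is built. The equivalent direct scikit-learn call, as a sketch:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({"a": [1.0, 5.0, 10.0], "b": [2.0, 4.0, 8.0]})

vmin, vmax = -1, 1  # df_scaler defaults to vmin=0, vmax=1
scaler = MinMaxScaler(feature_range=(vmin, vmax))
scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print(scaled)  # every column now spans [-1, 1]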
@@ -6035,15 +6114,20 @@ def df_special_characters_cleaner(
 
     # 1. Clean column names by replacing special characters with underscores
     if "column" in where_:
-
+        try:
+            data.columns = data.columns.str.replace(r"[^\w\s]", "_", regex=True)
+        except Exception as e:
+            print(e)
 
     # 2. Clean only object-type columns (text columns)
-
-
-
-
-
-
+    try:
+        if "content" in where_:
+            for col in data.select_dtypes(include=["object"]).columns:
+                data[col] = data[col].str.replace(r"[^\w\s]", "", regex=True)
+        if data.index.dtype == "object" and index in where_:
+            data.index = data.index.str.replace(r"[^\w\s]", "_", regex=True)
+    except:
+        pass
     return data
 
 
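Both branches lean on the same vectorized regex replace: [^\w\s] matches everything except word characters and whitespace, so column names get those characters swapped for underscores while cell text has them stripped. A standalone sketch of the idea:

import pandas as pd

df = pd.DataFrame({"na+me!": ["a&b", "c(d)"], "val#ue": [1, 2]})

# Column names: replace special characters with underscores
df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)

# Object (text) columns: drop special characters entirely
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].str.replace(r"[^\w\s]", "", regex=True)

print(df.columns.tolist())    # ['na_me_', 'val_ue']
print(df["na_me_"].tolist())  # ['ab', 'cd']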
@@ -6426,6 +6510,9 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
+    if run_once_within(reverse=True):
+        print(f"support methods:{methods}")
+
     if verbose:
         print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
@@ -6433,16 +6520,20 @@ def df_reducer(
         columns = data.select_dtypes(include="number").columns.tolist()
     if hue is None:
         hue = data.select_dtypes(exclude="number").columns.tolist()
+        print(f"auto select the non-number as 'hue':{hue}")
     if isinstance(hue, list):
         print("Warning: hue is a list, only select the 1st one")
         hue = hue[0]
-    if not hue:
+    if not any(hue):
         # Select columns if specified, else use all columns
         X = data[columns].values if columns else data.values
     else:
         # Select columns to reduce and hue for LDA
-
-
+        try:
+            X = data[columns].values if columns else data.drop(columns=[hue]).values
+            y = data[hue].values
+        except:
+            pass
     print(X.shape)
     # Handle missing values
     if fill_missing:
@@ -6909,33 +7000,49 @@ def df_reducer(
         colname_met = "SVD_"
     # Quick plots
     if plot_ and (not method in ["isolation_forest"]):
-        from .plot import plotxy
-        if ax is None:
-
-
-
-
-        else:
-
+        from .plot import plotxy,figsets,get_color
+        # if ax is None:
+        #     if figsize is None:
+        #         _, ax = plt.subplots(figsize=cm2inch(8, 8))
+        #     else:
+        #         _, ax = plt.subplots(figsize=figsize)
+        # else:
+        #     ax = ax.cla()
         xlabel = f"{colname_met}1" if xlabel is None else xlabel
         ylabel = f"{colname_met}2" if ylabel is None else ylabel
+        palette=get_color(len(flatten(data[hue],verbose=0)))
+
+        reduced_df=reduced_df.sort_values(by=hue)
+        print(flatten(reduced_df[hue]))
         ax = plotxy(
             data=reduced_df,
             x=colname_met + "1",
             y=colname_met + "2",
             hue=hue,
-
+            palette=palette,
+            # size=size,
             edgecolor=edgecolor,
-            kind_="
-
-
-
-
-
-
+            kind_=["joint",
+                   # "kde",
+                   "ell",
+                   ],
+            kws_kde=dict(
+                hue=hue,
+                levels=2,
+                common_norm=False,
+                fill=True,
+                alpha=0.05,
+            ),
+            kws_joint=dict(kind='scatter',joint_kws=dict(s=size)),
+            kws_ellipse=dict(alpha=0.1,lw=1,label=None),
             verbose=False,
             **kwargs,
         )
+        figsets(
+            legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
+            xlabel=xlabel if xlabel else None,
+            ylabel=ylabel if ylabel else None,
+        )
 
     if inplace:
         # If inplace=True, add components back into the original data
@@ -7412,6 +7519,7 @@ def df_qc(
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
 
+    pd.options.display.max_seq_items = 10
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
@@ -7428,16 +7536,18 @@ def df_qc(
         data = data.copy()
         data.loc[:, data.isna().all()] = 0
     res_qc = {}
-    print(f"data.shape:{data.shape}")
+    print(f"⤵ data.shape:{data.shape}\n⤵ data.sample(10):")
+    display(data.sample(10).style.background_gradient(cmap="coolwarm", axis=1))
 
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
-    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["missing_percentage"] = round((res_qc["missing_values"] / len(data)) * 100,2)
     res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
 
     # Data types and unique values
     res_qc["data_types"] = data.dtypes
-    res_qc["
+    res_qc["unique_counts"] = data.select_dtypes(exclude=np.number).nunique().sort_values()
+    res_qc["unique_values"] = data.select_dtypes(exclude=np.number).apply(lambda x: x.unique())
     res_qc["constant_columns"] = [
         col for col in data.columns if data[col].nunique() <= 1
     ]
@@ -7453,33 +7563,42 @@ def df_qc(
     data_outliers = df_outlier(data)
     outlier_num = data_outliers.isna().sum() - data.isnull().sum()
     res_qc["outlier_num"] = outlier_num[outlier_num > 0]
-    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    outlier_percentage=round((outlier_num / len(data_outliers)) * 100,2)
     res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
-
-
-
-
-
-
-
-
-
-
-    res_qc["high_correlations"] = high_corr_pairs
-
-    # VIF for multicollinearity check
-    numeric_df = data.select_dtypes(include=[np.number]).dropna()
-    vif_data = pd.DataFrame()
-    res_qc["vif"]=vif_data
-    if numeric_df.shape[1] > 1 and not numeric_df.empty:
-        vif_data["feature"] = numeric_df.columns
-        vif_data["VIF"] = [
-            variance_inflation_factor(numeric_df.values, i)
-            for i in range(numeric_df.shape[1])
+    try:
+        # Correlation and multicollinearity (VIF)
+        if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            corr_matrix = numeric_df.corr()
+            high_corr_pairs = [
+                (col1, col2)
+                for col1 in corr_matrix.columns
+                for col2 in corr_matrix.columns
+                if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
             ]
-        res_qc["
-
-
+            res_qc["high_correlations"] = high_corr_pairs
+
+            # VIF for multicollinearity check
+            numeric_df = data.select_dtypes(include=[np.number]).dropna()
+            if isinstance(numeric_df.columns, pd.MultiIndex):
+                numeric_df.columns = [
+                    "_".join(col).strip() if isinstance(col, tuple) else col for col in numeric_df.columns
+                ]
+
+
+            vif_data = pd.DataFrame()
+            res_qc["vif"]=vif_data
+            if numeric_df.shape[1] > 1 and not numeric_df.empty:
+                vif_data["feature"] = numeric_df.columns.tolist()
+                vif_data["VIF"] = [
+                    round(variance_inflation_factor(numeric_df.values, i),2)
+                    for i in range(numeric_df.shape[1])
+                ]
+                res_qc["vif"] = vif_data[
+                    vif_data["VIF"] > 5
+                ] # Typically VIF > 5 indicates multicollinearity
+    except Exception as e:
+        print(e)
     # Skewness and Kurtosis
     skewness = data.skew(numeric_only=True)
     kurtosis_vals = data.kurt(numeric_only=True)
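The correlation/VIF section is now wrapped in try/except, flattens MultiIndex columns first, and keeps only features whose VIF exceeds 5. The underlying computation is the standard statsmodels one; a minimal standalone sketch:

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
x1 = rng.normal(size=100)
df = pd.DataFrame({
    "x1": x1,
    "x2": x1 * 2 + rng.normal(scale=0.1, size=100),  # nearly collinear with x1
    "x3": rng.normal(size=100),
})

vif = pd.DataFrame({
    "feature": df.columns.tolist(),
    "VIF": [round(variance_inflation_factor(df.values, i), 2)
            for i in range(df.shape[1])],
})
print(vif[vif["VIF"] > 5])  # x1 and x2 should both far exceed 5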
@@ -7492,8 +7611,7 @@ def df_qc(
             col: entropy(data[col].value_counts(normalize=True), base=2)
             for col in categorical_cols
         }
-
-    res_qc["unique_counts"] = data.nunique()
+
     # dtypes counts
     res_qc['dtype_counts']=data.dtypes.value_counts()
 
@@ -7540,7 +7658,7 @@ def df_qc(
     res_qc["text_length_analysis"] = text_lengths
 
     # Summary statistics
-    res_qc["summary_statistics"] = data.describe().T
+    res_qc["summary_statistics"] = data.describe().T.style.background_gradient(cmap='coolwarm', axis=0)
 
     # Automated warnings
     warnings = []
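The summary table is now stored as a pandas Styler rather than a plain DataFrame, so notebooks render a per-column heat map for free; the trade-off is that downstream code must unwrap .data to get the numbers back. A minimal sketch of the pattern:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.default_rng(1).normal(size=(20, 3)), columns=list("abc"))

styled = df.describe().T.style.background_gradient(cmap="coolwarm", axis=0)
# In a notebook, `styled` renders as a colored table; the raw frame is still there:
print(styled.data.shape)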
@@ -7562,28 +7680,45 @@ def df_qc(
 
     # Report generation
     if verbose:
-        print("=== QC Report Summary ===")
         print("\n⤵ Summary Statistics:")
         display(res_qc["summary_statistics"])
         print("\n⤵ Data Types:")
         display(res_qc["data_types"])
         if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
             print(" ⤵ Missing Values Counts:")
-            display(
+            display(pd.DataFrame(
+                {
+                    "missing_values": res_qc["missing_values"][res_qc["missing_values"] > 0],
+                    "missing_percent(%)": res_qc["missing_percentage"][
+                        res_qc["missing_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
         # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
         print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
 
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
+
         if any(res_qc["outlier_num"]):
             print("\n⤵ Outlier Report:")
-            display(
-
+            display(pd.DataFrame(
+                {
+                    "outlier_num": res_qc["outlier_num"][res_qc["outlier_num"] > 0],
+                    "outlier_percentage(%)": res_qc["outlier_percentage"][
+                        res_qc["outlier_percentage"] > 0
+                    ],
+                }
+            ).style.background_gradient(cmap="coolwarm", axis=0)
+            )
+
+        if any(res_qc["unique_counts"]):
             print("\n⤵ Unique Values per Column:")
-            display(res_qc["
+            display(pd.DataFrame({"unique_counts":res_qc["unique_counts"],
+                                  "unique_values":res_qc["unique_values"]}).style.background_gradient(cmap="coolwarm", axis=0))
 
-        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
-
-        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
-        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
 
         if res_qc["empty_columns"]:
             print("\n⤵ Empty Columns:", res_qc["empty_columns"])
@@ -7595,7 +7730,7 @@ def df_qc(
 
         if "vif" in res_qc:
             print("\n⤵ Features with High VIF (>|5|):")
-
+            display(res_qc["vif"].style.background_gradient(cmap="coolwarm", axis=0))
 
         if any(res_qc["high_cardinality_categoricals"]):
             print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
@@ -7614,6 +7749,8 @@ def df_qc(
         print("\nWarnings:")
         for warning in res_qc["warnings"]:
             print(" -", warning)
+
+    pd.reset_option("display.max_seq_items")
     if plot_:
         df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
     if output or not plot_:
@@ -7632,7 +7769,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
     if isinstance(columns, (list,pd.core.indexes.base.Index)):
         data=data[columns]
     len_total = len(res_qc)
-    n_row, n_col = int((len_total + 10)
+    n_row, n_col = int((len_total + 10)), 3
     nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
 
     missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
@@ -7789,8 +7926,10 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Dtypes",
         ylabel="#",
         ax=ax_dtype_counts,
-        fontsize=8
+        fontsize=8 if len(dtype_counts.index)<=20 else 6,
     )
+    # from .plot import pie
+    # pie()
 
     # High cardinality: Show top categorical columns by unique value count
     high_cardinality = res_qc["high_cardinality_categoricals"]
@@ -7871,16 +8010,17 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         title="Correlation Heatmap",
         ax=ax_heatmap
     )
-    # save figure
-    if dir_save:
-
+    # # save figure
+    # if dir_save:
+    #     figsave(dir_save,f"qc_plot_{now_}.pdf")
 
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
-
-
-
+
+    # len_total = len(res_qc)
+    # n_row, n_col = int((len_total + 10) / 3), 3
+    # nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
     #! check distribution
     data_num = data.select_dtypes(include=np.number)
     if len(data_num) > max_cols:
@@ -7907,7 +8047,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         figsets(ylabel=f'Q-Q Plot:{column}',title=None)
     # save figure
     if dir_save:
-        figsave(dir_save,f"
+        figsave(dir_save,f"qc_plot_{now_}.pdf")
+
+def df_corr(df: pd.DataFrame, method="pearson"):
+    """
+    Compute correlation coefficients and p-values for a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): Input DataFrame with numeric data.
+    - method (str): Correlation method ("pearson", "spearman", "kendall").
+
+    Returns:
+    - corr_matrix (pd.DataFrame): Correlation coefficient matrix.
+    - pval_matrix (pd.DataFrame): P-value matrix.
+    """
+    from scipy.stats import pearsonr, spearmanr, kendalltau
+
+    methods = ["pearson", "spearman", "kendall"]
+    method = strcmp(method, methods)[0]
+    methods_dict = {"pearson": pearsonr, "spearman": spearmanr, "kendall": kendalltau}
+
+    cols = df.columns
+    corr_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    pval_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    correlation_func = methods_dict[method]
+
+    for col1 in cols:
+        for col2 in cols:
+            if col1 == col2:
+                corr_matrix.loc[col1, col2] = 1.0
+                pval_matrix.loc[col1, col2] = 0.0
+            else:
+                corr, pval = correlation_func(df[col1], df[col2])
+                corr_matrix.loc[col1, col2] = corr
+                pval_matrix.loc[col1, col2] = pval
+
+    return corr_matrix, pval_matrix
+
 def use_pd(
     func_name="excel",
     verbose=True,
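df_corr, new in this release, complements DataFrame.corr() by also returning an element-wise p-value matrix, which pandas alone does not provide. A usage sketch, assuming py2ls 0.2.4.26 is installed:

import numpy as np
import pandas as pd
from py2ls.ips import df_corr

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["a", "b", "c"])

corr_matrix, pval_matrix = df_corr(df, method="pearson")
print(corr_matrix.round(2))
print(pval_matrix.round(3))  # small p-values flag significant correlations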
@@ -7927,3 +8103,135 @@ def use_pd(
     except Exception as e:
         if verbose:
             print(e)
+
+def get_phone(phone_number: str, region: str = None,verbose=True):
+    """
+    usage:
+        info = get_phone(15237654321, "DE")
+        preview(info)
+
+    Extremely advanced phone number analysis function.
+
+    Args:
+        phone_number (str): The phone number to analyze.
+        region (str): None (Default). Tries to work with international numbers including country codes; otherwise, uses the specified region.
+
+    Returns:
+        dict: Comprehensive information about the phone number.
+    """
+    import phonenumbers
+    from phonenumbers import geocoder, carrier, timezone, number_type
+    from datetime import datetime
+    import pytz
+    from tzlocal import get_localzone
+
+    if not isinstance(phone_number, str):
+        phone_number = str(phone_number)
+    if isinstance(region, str):
+        region = region.upper()
+
+    try:
+        # Parse the phone number
+        parsed_number = phonenumbers.parse(phone_number, region)
+
+        # Validate the phone number
+        valid = phonenumbers.is_valid_number(parsed_number)
+        possible = phonenumbers.is_possible_number(parsed_number)
+
+        if not valid:
+            suggested_fix = phonenumbers.example_number(region) if region else "Unknown"
+            return {
+                "valid": False,
+                "error": "Invalid phone number",
+                "suggested_fix": suggested_fix,
+            }
+
+        # Basic details
+        formatted_international = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
+        )
+        formatted_national = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL
+        )
+        formatted_e164 = phonenumbers.format_number(
+            parsed_number, phonenumbers.PhoneNumberFormat.E164
+        )
+        country_code = parsed_number.country_code
+        region_code = geocoder.region_code_for_number(parsed_number)
+        country_name = geocoder.country_name_for_number(parsed_number, "en")
+
+        location = geocoder.description_for_number(parsed_number, "en")
+        carrier_name = carrier.name_for_number(parsed_number, "en") or "Unknown Carrier"
+        time_zones = timezone.time_zones_for_number(parsed_number)[0]
+        current_times = datetime.now(pytz.timezone(time_zones)).strftime(
+            "%Y-%m-%d %H:%M:%S %Z"
+        )
+        number_type_str = {
+            phonenumbers.PhoneNumberType.FIXED_LINE: "Fixed Line",
+            phonenumbers.PhoneNumberType.MOBILE: "Mobile",
+            phonenumbers.PhoneNumberType.FIXED_LINE_OR_MOBILE: "Fixed Line or Mobile",
+            phonenumbers.PhoneNumberType.TOLL_FREE: "Toll Free",
+            phonenumbers.PhoneNumberType.PREMIUM_RATE: "Premium Rate",
+            phonenumbers.PhoneNumberType.SHARED_COST: "Shared Cost",
+            phonenumbers.PhoneNumberType.VOIP: "VOIP",
+            phonenumbers.PhoneNumberType.PERSONAL_NUMBER: "Personal Number",
+            phonenumbers.PhoneNumberType.PAGER: "Pager",
+            phonenumbers.PhoneNumberType.UAN: "UAN",
+            phonenumbers.PhoneNumberType.UNKNOWN: "Unknown",
+        }.get(number_type(parsed_number), "Unknown")
+
+        # Advanced Features
+        is_toll_free = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.TOLL_FREE
+        )
+        is_premium_rate = (
+            number_type(parsed_number) == phonenumbers.PhoneNumberType.PREMIUM_RATE
+        )
+
+        # Dialing Information
+        dialing_instructions = f"Dial {formatted_national} within {country_name}. Dial {formatted_e164} from abroad."
+
+        # Advanced Timezone Handling
+        gmt_offsets = pytz.timezone(time_zones).utcoffset(datetime.now()).total_seconds()/ 3600
+        # Get the local timezone (current computer's time)
+        local_timezone = get_localzone()
+        #local_timezone = pytz.timezone(pytz.country_timezones[region_code][0])
+        local_offset = local_timezone.utcoffset(datetime.now()).total_seconds() / 3600
+        offset_diff = local_offset - gmt_offsets
+        head_time = "earlier" if offset_diff < 0 else "later" if offset_diff > 0 else ""
+        res= {
+            "valid": True,
+            "possible": possible,
+            "formatted": {
+                "international": formatted_international,
+                "national": formatted_national,
+                "e164": formatted_e164,
+            },
+            "country_code": country_code,
+            "country_name": country_name,
+            "region_code": region_code,
+            "location": location if location else "Unknown",
+            "carrier": carrier_name,
+            "time_zone": time_zones,
+            "current_times": current_times,
+            "local_offset":f"{local_offset} utcoffset",
+            "time_zone_diff": f"{head_time} {int(np.abs(offset_diff))} h",
+            "number_type": number_type_str,
+            "is_toll_free": is_toll_free,
+            "is_premium_rate": is_premium_rate,
+            "dialing_instructions": dialing_instructions,
+            "suggested_fix": None,  # Use phonenumbers.example_number if invalid
+            "logs": {
+                "number_analysis_completed": datetime.now().strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                ),
+                "raw_input": phone_number,
+                "parsed_number": str(parsed_number),
+            },
+        }
+
+    except phonenumbers.NumberParseException as e:
+        res= {"valid": False, "error": str(e)}
+    if verbose:
+        preview(res)
+    return res
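get_phone builds entirely on the phonenumbers package (plus pytz and tzlocal for the offset arithmetic), so those must be installed alongside py2ls. The docstring's own example doubles as a smoke test; a sketch:

from py2ls.ips import get_phone

# Region is only needed when the number lacks a country code
info = get_phone("15237654321", region="DE", verbose=False)
if info["valid"]:
    print(info["formatted"]["e164"], info["number_type"], info["time_zone"])
else:
    print(info["error"])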