py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +297 -229
- py2ls/ml2ls.py +996 -155
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +15 -11
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.23.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -4,6 +4,8 @@ import sys, os
 from IPython.display import display
 from typing import List, Optional, Union

+from regex import X
+
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -16,15 +18,17 @@ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)


-def run_once_within(duration=60): # default 60s
+def run_once_within(duration=60,reverse=False): # default 60s
     import time

     """
+    If reverse is True, it does not run on the first call, but does run on the second call.
     usage:
     if run_once_within():
         print("This code runs once per minute.")
     else:
         print("The code has already been run in the last minute.")
+
     """
     if not hasattr(run_once_within, "time_last"):
         run_once_within.time_last = None
@@ -34,9 +38,9 @@ def run_once_within(duration=60): # default 60s
         time_curr - run_once_within.time_last >= duration
     ):
         run_once_within.time_last = time_curr  # Update the last execution time
-        return True
+        return False if reverse else True
     else:
-        return False
+        return True if reverse else False


 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
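
Note on the new reverse flag: with the default reverse=False, run_once_within() returns True only for the first call in each duration window; reverse=True inverts the result, so the gated branch is skipped on the first call and taken on repeat calls inside the window. A minimal usage sketch (assuming run_once_within is imported from py2ls.ips):

    from py2ls.ips import run_once_within

    if run_once_within(60):  # True on the first call per 60-second window
        print("runs at most once per minute")

    if run_once_within(60, reverse=True):  # False on the first call, True on repeats
        print("skipped on the first pass, runs on later calls within the window")

Both calls share the same time_last attribute on the function, so mixing the two styles in one program gates against a single shared timer.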
@@ -1828,16 +1832,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     # Check data types
     data_types = df.dtypes
     # messages.append(f"Data types of columns:\n{data_types}")
-
-    # Check for constant values across any column
-    constant_columns = df.columns[df.nunique() == 1].tolist()
-    if constant_columns:
-        messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
-        is_abnormal = True
-        if verbose:
-            print(f"df.columns[df.nunique() == 1].tolist()")
-    if verbose:
-        print("5", is_abnormal)
+
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1989,30 +1984,29 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError

-        engine = kwargs.pop("engine", "pyarrow")
-        sep = kwargs.pop("sep",
-        index_col = kwargs.pop("index_col", None)
-        memory_map = kwargs.pop("memory_map", False)
-        skipinitialspace = kwargs.pop("skipinitialspace", False)
-        encoding = kwargs.pop("encoding", "utf-8")
-        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
-        comment = kwargs.pop("comment", None)
-        fmt = kwargs.pop("fmt", False)
-        chunksize = kwargs.pop("chunksize", None)
+        engine = kwargs.pop("engine", "pyarrow")# default: None
+        sep = kwargs.pop("sep", None)# default: ','
+        index_col = kwargs.pop("index_col", None)# default: None
+        memory_map = kwargs.pop("memory_map", False)# default: False
+        skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
+        encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
+        comment = kwargs.pop("comment", None)# default: None
+        fmt = kwargs.pop("fmt", False)# default:
+        chunksize = kwargs.pop("chunksize", None)# default: None
         engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
-        low_memory = kwargs.pop("low_memory", True)
+        low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
-        )  # when chunksize, recommend low_memory=False
+        )  # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv", verbose=verbose)

-        if comment is None:
+        if comment is None:# default: None
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
-
         try:
             df = pd.read_csv(
                 fpath,
@@ -2107,8 +2101,8 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 sep2show = sep if sep != "\t" else "\\t"
-
-
+                if verbose:
+                    print(f'trying with: engine=pyarrow, sep="{sep2show}"')
                 try:
                     df = pd.read_csv(
                         fpath,
@@ -2137,8 +2131,9 @@ def fload(fpath, kind=None, **kwargs):
             separators = [",", "\t", ";", "|", " "]
             for sep in separators:
                 try:
-
-
+                    sep2show = sep if sep != "\t" else "\\t"
+                    if verbose:
+                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                     # print(".")
                     df = pd.read_csv(
                         fpath,
@@ -2171,8 +2166,9 @@ def fload(fpath, kind=None, **kwargs):
                     continue
                 else:
                     pass
-
-
+            print(kwargs)
+            # if is_df_abnormal(df,verbose=verbose):
+            #     df=pd.read_csv(fpath,**kwargs)
             display(df.head(2))
             print(f"shape: {df.shape}")
             return df
@@ -2386,7 +2382,7 @@ def fload(fpath, kind=None, **kwargs):
     elif kind == "xml":
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
-        verbose = kwargs.pop("verbose", False)
+        # verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
@@ -3503,12 +3499,8 @@ def figsave(*args, dpi=300):
         )
     else:
         plt.savefig(
-            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
-        )
-    # elif ftype.lower() == "png":
-    #     plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
-    # elif ftype.lower() in ["tiff", "tif"]:
-    #     plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
+            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
+        )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
@@ -5236,15 +5228,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data

+def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
+    """
+    Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
+    Usage:
+        data = pd.DataFrame({'month': [1, 4, 7, 10, 12]}) # Just months as an example
+        # df_cycle month cyclically
+        data = df_cycle(data, 'month', 12)
+    """
+    if columns is None:
+        columns = list(data.select_dtypes(include=np.number).columns) # If no columns specified, use all columns
+    if max_val is None:
+        max_val = np.max(data[columns]) # If no max_val specified, use the maximum value across all columns
+    if isinstance(columns, str):
+        columns = [columns] # If a single column name is provided as a string, convert it to a list
+
+    # Check if inplace is True, so we modify the original dataframe
+    if inplace:
+        # Modify the data in place, no return statement needed
+        for col in columns:
+            data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
+            data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
+    else:
+        # If inplace is False, return the modified dataframe
+        new_data = data.copy()
+        for col in columns:
+            new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
+            new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
+        return new_data
+

 # ! DataFrame
 def df_astype(
     data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
-    astype: str = "datetime",
+    astype: str = None,#"datetime",
     skip_row: Union[str, list] = None,
     fmt: Optional[str] = None,
-    inplace: bool =
+    inplace: bool = False,
     errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
     **kwargs,
 ) -> Optional[pd.DataFrame]:
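
The new df_cycle helper is the standard sin/cos encoding for cyclic features. A self-contained sketch of the transform it applies, using plain pandas/numpy rather than the py2ls API:

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
    max_val = 12
    # Map each month onto the unit circle so December (12) and January (1) end up adjacent.
    data["month_sin"] = np.sin(2 * np.pi * data["month"] / max_val)
    data["month_cos"] = np.cos(2 * np.pi * data["month"] / max_val)
    print(data.round(2))

A plain ordinal encoding puts 12 and 1 at opposite ends of the scale; the two-column circle representation preserves the wrap-around distance, which is why it helps distance-based models and neural networks.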
@@ -5304,6 +5325,7 @@ def df_astype(
         "day",
         "month",
         "year",
+        "circular"
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
@@ -5398,10 +5420,22 @@ def df_astype(
                 kwargs.pop("errors", None)
                 data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
+            elif astype == "circular":
+                max_val = kwargs.get('max_val',None)
+                data[column]=df_cycle(data=data,columns=column,max_val=max_val)
             else:
                 # Convert to other types (e.g., float, int)
-
+                if astype=='int':
+                    data[column] = data[column].astype('float').astype('int')
+                else:
+                    data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
+            # format
+            try:
+                if fmt is not None:
+                    data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
+            except Exception as e:
+                print(f"Error while applying the format: {e}")
         except Exception as e:
             print(f"Error converting '{column}' to {astype}: {e}")
         try:
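
Why the new astype == 'int' branch casts through float first: pandas' astype("int") on strings that contain a decimal point raises, while the two-step cast succeeds. A small sketch:

    import pandas as pd

    s = pd.Series(["3.0", "7.5", "10"])
    # s.astype("int") raises ValueError: invalid literal for int() with base 10: '3.0'
    print(s.astype("float").astype("int").tolist())  # [3, 7, 10] (fractions truncated)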
@@ -5874,11 +5908,13 @@ def df_encoder(

 def df_scaler(
     data: pd.DataFrame, # should be numeric dtype
+    scaler=None,
     method="standard",
     columns=None, # default, select all numeric col/row
     inplace=False,
     verbose=False, # show usage
     axis=0, # defalut column-wise
+    return_scaler:bool=False,# True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -5896,31 +5932,49 @@ def df_scaler(
     """
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if scaler is None:
+        methods = ["standard", "minmax", "robust","maxabs"]
+        method = strcmp(method, methods)[0]
+        if method == "standard":
+            from sklearn.preprocessing import StandardScaler
+            if verbose:
+                print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
+                print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
+            scaler = StandardScaler(**kwargs)
+        elif method == "minmax":
+            from sklearn.preprocessing import MinMaxScaler
+            if verbose:
+                print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
+                print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
+                print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
+            scaler = MinMaxScaler(**kwargs)
+        elif method == "robust":
+            from sklearn.preprocessing import RobustScaler
+            if verbose:
+                print("scales the data based on the median and interquartile range, which is robust to outliers.")
+                print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
+            scaler = RobustScaler(**kwargs)
+        elif method=="maxabs":
+            from sklearn.preprocessing import MaxAbsScaler
+            if verbose:
+                print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
+                print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
+            scaler = MaxAbsScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+    if verbose:
+        print(scaler)
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
             columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)

-        scaled_data = scaler.fit_transform(data[columns])
+        # scaled_data = scaler.fit_transform(data[columns])
+        if scaler is None or not hasattr(scaler, 'mean_'):
+            scaled_data = scaler.fit_transform(data[columns])
+        else:
+            scaled_data = scaler.transform(data[columns])

         if inplace:
             data[columns] = scaled_data
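
The hasattr(scaler, 'mean_') test distinguishes an unfitted scaler (fit it on this data) from an already-fitted one (reuse its parameters), which together with return_scaler=True supports the usual fit-on-train, transform-on-test workflow. A sketch with scikit-learn directly; note that mean_ is the fitted attribute of StandardScaler specifically (MinMaxScaler and RobustScaler expose min_ and center_ instead):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    train = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
    test = pd.DataFrame({"x": [4.0, 5.0]})

    scaler = StandardScaler()
    print(hasattr(scaler, "mean_"))       # False: not fitted yet -> fit_transform
    train_scaled = scaler.fit_transform(train)
    print(hasattr(scaler, "mean_"))       # True: fitted -> transform only
    test_scaled = scaler.transform(test)  # test is scaled with the training mean/std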
@@ -5934,7 +5988,10 @@ def df_scaler(
                 axis=1,
             )
             scaled_df = scaled_df[data.columns]  # Maintain column order
-            return scaled_df
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df


     elif axis == 1:
         # Row-wise scaling
@@ -5946,9 +6003,10 @@ def df_scaler(

             print(f"Scaling rows")

-        scaled_data = scaler.fit_transform(
-            numeric_rows.T
-        ).T  # Transpose for scaling and then back
+        # scaled_data = scaler.fit_transform(
+        #     numeric_rows.T
+        # ).T  # Transpose for scaling and then back
+        scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T

         if inplace:
             data.loc[numeric_rows.index] = scaled_data
@@ -5956,7 +6014,10 @@ def df_scaler(
         else:
             scaled_df = data.copy()
             scaled_df.loc[numeric_rows.index] = scaled_data
-            return scaled_df
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df


 def df_special_characters_cleaner(
@@ -6325,6 +6386,7 @@ def df_reducer(
     random_state=1,
     ax=None,
     figsize=None,
+    verbose=True,
     **kwargs,
 ) -> pd.DataFrame:
     dict_methods = {
@@ -6364,7 +6426,8 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
-    print(f"\nprocessing with using {dict_methods[method]}:")
+    if verbose:
+        print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
     if columns is None:
         columns = data.select_dtypes(include="number").columns.tolist()
|
|
6863
6926
|
hue=hue,
|
6864
6927
|
s=size,
|
6865
6928
|
edgecolor=edgecolor,
|
6866
|
-
|
6929
|
+
kind_="scater",
|
6867
6930
|
figsets=dict(
|
6868
6931
|
legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
|
6869
6932
|
xlabel=xlabel if xlabel else None,
|
@@ -7334,10 +7397,13 @@ def evaluate_cluster(
 def df_qc(
     data: pd.DataFrame,
     columns=None,
-
+    skim=False,
     plot_=True,
     max_cols=20, # only for plots
+    hue=None,
     output=False,
+    verbose=True,
+    dir_save=None
 ):
     """
     Usage example:
@@ -7345,22 +7411,24 @@ def df_qc(
     """
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
-
-
+
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
-
-
-
-
-
+    if skim:
+        try:
+            import skimpy
+            skimpy.skim(data)
+        except:
+            numerical_data = data.select_dtypes(include=[np.number])
+            skimpy.skim(numerical_data)
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     res_qc = {}
+    print(f"data.shape:{data.shape}")

     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
@@ -7403,7 +7471,7 @@ def df_qc(
     numeric_df = data.select_dtypes(include=[np.number]).dropna()
     vif_data = pd.DataFrame()
     res_qc["vif"]=vif_data
-    if numeric_df.shape[1] > 1:
+    if numeric_df.shape[1] > 1 and not numeric_df.empty:
         vif_data["feature"] = numeric_df.columns
         vif_data["VIF"] = [
             variance_inflation_factor(numeric_df.values, i)
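
For reference, variance_inflation_factor(exog, i) regresses column i on the remaining columns and reports 1/(1 - R^2), so nearly collinear features get large values. A minimal sketch of the computation this hunk guards against empty input:

    import numpy as np
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    rng = np.random.default_rng(0)
    x1 = rng.normal(size=100)
    df = pd.DataFrame({
        "x1": x1,
        "x2": 2 * x1 + rng.normal(scale=0.1, size=100),  # nearly collinear with x1
        "x3": rng.normal(size=100),
    })
    vif = pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
    })
    print(vif)  # x1 and x2 show large VIFs; x3 stays near 1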
@@ -7495,72 +7563,70 @@ def df_qc(
     # Report generation
     if verbose:
         print("=== QC Report Summary ===")
-        print("\
-
-        print(
-
-
-
-
-
-
-
-
-
-
-
-
+        print("\n⤵ Summary Statistics:")
+        display(res_qc["summary_statistics"])
+        print("\n⤵ Data Types:")
+        display(res_qc["data_types"])
+        if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
+            print(" ⤵ Missing Values Counts:")
+            display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+        print("\n⤵ Rows with Missing Values:",res_qc["rows_with_missing"])
+
+        if any(res_qc["outlier_num"]):
+            print("\n⤵ Outlier Report:")
+            display(res_qc["outlier_num"])
+        if any(res_qc["unique_values"]):
+            print("\n⤵ Unique Values per Column:")
+            display(res_qc["unique_values"])
+
+        print("\n⤵ Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+
+        print("⤵ Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵ Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None

         if res_qc["empty_columns"]:
-            print("\
-
-        print("\nOutlier Report:")
-        print(res_qc["outlier_num"])
-        print("\nPercentage of Values Replaced per Column:")
-        print(res_qc["outlier_percentage"])
+            print("\n⤵ Empty Columns:", res_qc["empty_columns"])

-
-
-
+        if any(res_qc["high_correlations"]):
+            print("\n⤵ High Correlations (>|0.9|):")
+            for col1, col2 in res_qc["high_correlations"]:
+                print(f"  {col1} and {col2}")

         if "vif" in res_qc:
-            print("\
+            print("\n⤵ Features with High VIF (>|5|):")
             print(res_qc["vif"])

-
-
-
-
-
-
-
-
-
-
-
-
-                    f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
-                )
-
-        print("\nSummary Statistics:")
-        print(res_qc["summary_statistics"])
+        if any(res_qc["high_cardinality_categoricals"]):
+            print("\n⤵ High Cardinality Categorical Columns (>|50 unique|):")
+            print(res_qc["high_cardinality_categoricals"])
+        if any(res_qc["inconsistent_types"]):
+            print("\n⤵ Inconsistent Data Types:")
+            display(res_qc["inconsistent_types"])
+        if any(res_qc["text_length_analysis"]):
+            print("\n⤵ Text Length Analysis:")
+            for col, stats in res_qc["text_length_analysis"].items():
+                print(
+                    f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
+                )

     if res_qc["warnings"]:
         print("\nWarnings:")
         for warning in res_qc["warnings"]:
             print(" -", warning)
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=
-    if output:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
+    if output or not plot_:
         return res_qc
     return None


-def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
+    from datetime import datetime
+    now_ = datetime.now().strftime("%y%m%d_%H%M%S")

     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
@@ -7574,91 +7640,65 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
     )
     if len(missing_data) > max_cols:
         missing_data = missing_data[:max_cols]
-
-
-
-        hue=missing_data.index,
-        palette=get_color(len(missing_data), cmap="
+    ax_missing_data=sns.barplot(
+        y=missing_data.index,
+        x=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(
-
-    ax2 = ax.twinx()
-    # Plot missing value percentages
-    missing_percentage = res_qc["missing_percentage"][
-        res_qc["missing_percentage"] > 0
-    ].sort_values(ascending=False)
-    sns.barplot(
-        x=missing_percentage.index,
-        y=missing_percentage.values,
-        hue=missing_percentage.index,
-        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
-        ax=ax2,#nexttile(),
-    )
-    figsets(xangle=45, ylabel="%",ax=ax2)
-    ax2.tick_params(axis="y", color='r',labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)

     outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
     if len(outlier_num) > max_cols:
         outlier_num = outlier_num[:max_cols]
     ax_outlier_num=sns.barplot(
-
-
+        y=outlier_num.index,
+        x=outlier_num.values,
         hue=outlier_num.index,
         palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(
-
-
-
-
-
-
-
-
-
-
-    )
-    figsets(
-        xangle=45,
-        ylabel="%",
-        xlabel=None,
-        ylim=[0, outlier_percentage.max() + 2],
-        ax=ax_outlier_percentage
-    )
-    ax2.tick_params(axis="y", color='r',labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
+
+    #!
+    try:
+        for col in data.select_dtypes(include='category').columns:
+            sns.countplot(y=data[col],
+                palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
+                ax=nexttile())
+            figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
+    except Exception as e:
+        pass

     # Skewness and Kurtosis Plots
     skewness = res_qc["skewness"].sort_values(ascending=False)
     kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
     if not skewness.empty:
         ax_skewness=sns.barplot(
-
-
+            y=skewness.index,
+            x=skewness.values,
             hue=skewness.index,
             palette=get_color(len(skewness), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Skewed Numeric Columns (Skewness > 1)",
-
+            xlabel="Skewness",ylabel=None,ax=ax_skewness,
+            fontsize=8 if len(skewness)<=20 else 6
         )
     if not kurtosis.empty:
         ax_kurtosis=sns.barplot(
-
-
+            y=kurtosis.index,
+            x=kurtosis.values,
             hue=kurtosis.index,
             palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
-
+            xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
+            fontsize=8 if len(kurtosis)<=20 else 6
         )

     # Entropy for Categorical Variables
@@ -7666,56 +7706,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
         ascending=False
     )
     ax_entropy_data=sns.barplot(
-
+        y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
+        palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
+        ax=nexttile()
     )
     figsets(
-
-        xlabel="Categorical Columns",
+        ylabel="Categorical Columns",
         title="Entropy of Categorical Variables",
-
-        ax=ax_entropy_data
-
-
-
-        data=data[res_qc["distribution_analysis"].index],
-        orient="v",
-        palette="Set3",
-        ax=nexttile(),
-    )
-    figsets(
-        xangle=45,
-        title="Range for Numeric Columns",
-        ylabel="#",
-        ax=ax_iqr
-    )
+        xlabel="Entropy (bits)",
+        ax=ax_entropy_data,
+        fontsize=8 if len(entropy_data)<=20 else 6
+    )
+
     # unique counts
     unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
     ax_unique_counts_=sns.barplot(
-
-
+        y=unique_counts.index,
+        x=unique_counts.values,
         hue=unique_counts.index,
-        palette=get_color(len(unique_counts)
+        palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
         ax=nexttile())
     figsets(
-        xangle=45,
         title="Unique Counts",
-
-
-        ax=ax_unique_counts_
+        ylabel=None,
+        xlabel="#",
+        ax=ax_unique_counts_,
+        fontsize=8 if len(unique_counts)<=20 else 6
     )
     # Binary Checking
-    ax_unique_counts=sns.barplot(
-
-        hue=unique_counts[unique_counts<
-        palette=get_color(len(unique_counts[unique_counts<
+    ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
+        x=unique_counts[unique_counts<8].values,
+        hue=unique_counts[unique_counts<8].index,
+        palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
        ax=nexttile())
-    plt.
+    plt.axvline(x=2, color="r", linestyle="--", lw=2)
     figsets(
-
-        xlabel=None,
+        ylabel=None,
        title="Binary Checking",
-
-        ax=ax_unique_counts
+        xlabel="#",
+        ax=ax_unique_counts,
+        fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
     )

     # dtypes counts
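
The entropy panel now labels its axis in bits. One common definition consistent with that label is Shannon entropy in base 2 over each column's value frequencies; the exact computation inside df_qc is not shown in this diff, but a sketch of the quantity would be:

    import pandas as pd
    from scipy.stats import entropy

    s = pd.Series(["a", "a", "b", "b", "c", "c", "c", "c"])
    p = s.value_counts(normalize=True)  # c: 0.5, a: 0.25, b: 0.25
    print(entropy(p, base=2))           # 1.5 bits: how evenly the categories are used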
@@ -7751,14 +7781,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
             ha="center",
             va="top",
             c="k",
-            fontsize=8,
+            fontsize=8 if len(dtype_counts.index)<=20 else 6,
             rotation=0,
         )
     figsets(
         xlabel=None,
         title="Dtypes",
         ylabel="#",
-        ax=ax_dtype_counts
+        ax=ax_dtype_counts,
+        fontsize=8 if len(dtype_counts.index)<=20 else 6,
     )

     # High cardinality: Show top categorical columns by unique value count
@@ -7772,24 +7803,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)

     if high_cardinality:
         ax_high_cardinality=sns.barplot(
-
-
+            y=list(high_cardinality.keys()),
+            x=list(high_cardinality.values()),
             hue=list(high_cardinality.keys()),
-            palette=
+            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
+            ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="High Cardinality Categorical Columns",
-
-            ax=ax_high_cardinality
+            xlabel="Unique Value Count",
+            ax=ax_high_cardinality,
+            fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
         )
     if res_qc["low_variance_features"]:
         low_variance_data = data[res_qc["low_variance_features"]].copy()
         for col in low_variance_data.columns:
-            sns.histplot(
+            ax_low_variance_features=sns.histplot(
                 low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
             )
-
+            figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
+                fontsize=8 if len(low_variance_data[col])<=20 else 6)

     # VIF plot for multicollinearity detection
     if "vif" in res_qc and not res_qc["vif"].empty:
@@ -7800,23 +7833,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
             x="VIF",
             y="feature",
             hue="VIF",
-            palette=get_color(len(vif_data)
+            palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
             ax=nexttile())
         figsets(
-            xangle=45,
             title="Variance Inflation Factor(VIF)",
-            xlabel="
+            xlabel="VIF",
             ylabel="Features",
             legend=None,
-            ax=ax_vif
+            ax=ax_vif,
+            fontsize=8 if len(vif_data)<=20 else 6
         )

     # Correlation heatmap for numeric columns with high correlation pairs
     if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
-        corr = data.select_dtypes(include=[np.number]).
+        corr = data.select_dtypes(include=[np.number]).corr()
         if corr.shape[1]<=33:
             mask = np.triu(np.ones_like(corr, dtype=bool))
-            # Dynamically scale fontsize based on the number of columns
             num_columns = corr.shape[1]
             fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # Scale between 8 and 12
@@ -7826,7 +7858,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 annot=True,
                 cmap="coolwarm",
                 center=0,
-                fmt=".
+                fmt=".1f",
                 linewidths=0.5,
                 vmin=-1, vmax=1,
                 ax=nexttile(2, 2),
@@ -7839,7 +7871,43 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 title="Correlation Heatmap",
                 ax=ax_heatmap
             )
+    # save figure
+    if dir_save:
+        figsave(dir_save,f"qc_plot_{now_}.pdf")

+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+    #! check distribution
+    data_num = data.select_dtypes(include=np.number)
+    if len(data_num) > max_cols:
+        data_num = data_num.iloc[:,:max_cols]
+
+    data_num = df_scaler(data=data_num, method='standard')
+
+    import scipy.stats as stats
+    for column in data_num.columns:
+        #* Shapiro-Wilk test for normality
+        stat, p_value = stats.shapiro(data_num[column])
+        normality = "norm" if p_value > 0.05 else "not_norm"
+        #* Plot histogram
+        ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
+        x_min, x_max = ax_hist.get_xlim()
+        y_min, y_max = ax_hist.get_ylim()
+        ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
+            f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
+            ha='center', va='top')
+        figsets(title=column,ax=ax_hist)
+        ax_twin=ax_hist.twinx()
+        #* Q-Q plot
+        stats.probplot(data_num[column], dist="norm", plot=ax_twin)
+        figsets(ylabel=f'Q-Q Plot:{column}',title=None)
+    # save figure
+    if dir_save:
+        figsave(dir_save,f"qq_plot_{now_}.pdf")
 def use_pd(
     func_name="excel",
     verbose=True,
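
The distribution check added above pairs a Shapiro-Wilk p-value with a Q-Q plot for each numeric column. A standalone sketch of that combination using only scipy and matplotlib:

    import numpy as np
    import scipy.stats as stats
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(1)
    x = rng.normal(size=200)

    stat, p_value = stats.shapiro(x)  # H0: the sample is normally distributed
    print("norm" if p_value > 0.05 else "not_norm", f"(p={p_value:.3f})")

    fig, ax = plt.subplots()
    stats.probplot(x, dist="norm", plot=ax)  # points near the line -> plausibly normal
    plt.show()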