PyPI - py2ls - Versions diffs - 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl - Mend

py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

py2ls/.DS_Store +0 -0
py2ls/.git/.DS_Store +0 -0
py2ls/.git/index +0 -0
py2ls/.git/objects/.DS_Store +0 -0
py2ls/.git/refs/.DS_Store +0 -0
py2ls/data/.DS_Store +0 -0
py2ls/data/styles/.DS_Store +0 -0
py2ls/ips.py +213 -195
py2ls/ml2ls.py +768 -61
{py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/METADATA +1 -1
{py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/RECORD +12 -9
{py2ls-0.2.4.23.dist-info → py2ls-0.2.4.24.dist-info}/WHEEL +0 -0

py2ls/ips.py CHANGED Viewed

@@ -4,6 +4,8 @@ import sys, os
 from IPython.display import display
 from typing import List, Optional, Union
+from regex import X
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -1828,16 +1830,7 @@ def is_df_abnormal(df: pd.DataFrame, verbose=False) -> bool:
     # Check data types
     data_types = df.dtypes
     # messages.append(f"Data types of columns:\n{data_types}")
-    # Check for constant values across any column
-    constant_columns = df.columns[df.nunique() == 1].tolist()
-    if constant_columns:
-        messages.append(f"Abnormal: Columns with constant values: {constant_columns}")
-        is_abnormal = True
-        if verbose:
-            print(f"df.columns[df.nunique() == 1].tolist()")
-    if verbose:
-        print("5", is_abnormal)
     # Check for an unreasonable number of rows or columns
     if actual_shape[0] < 2 or actual_shape[1] < 2:
         messages.append(
@@ -1989,30 +1982,29 @@ def fload(fpath, kind=None, **kwargs):
     def load_csv(fpath, **kwargs):
         from pandas.errors import EmptyDataError
-        engine = kwargs.pop("engine", "pyarrow")
-        sep = kwargs.pop("sep", "\t")
-        index_col = kwargs.pop("index_col", None)
-        memory_map = kwargs.pop("memory_map", False)
-        skipinitialspace = kwargs.pop("skipinitialspace", False)
-        encoding = kwargs.pop("encoding", "utf-8")
-        on_bad_lines = kwargs.pop("on_bad_lines", "skip")
-        comment = kwargs.pop("comment", None)
-        fmt = kwargs.pop("fmt", False)
-        chunksize = kwargs.pop("chunksize", None)
+        engine = kwargs.pop("engine", "pyarrow")# default: None
+        sep = kwargs.pop("sep", None)# default: ','
+        index_col = kwargs.pop("index_col", None)# default: None
+        memory_map = kwargs.pop("memory_map", False)# default: False
+        skipinitialspace = kwargs.pop("skipinitialspace", False)# default: False
+        encoding = kwargs.pop("encoding", "utf-8")# default: "utf-8"
+        on_bad_lines = kwargs.pop("on_bad_lines", "skip")# default: 'error'
+        comment = kwargs.pop("comment", None)# default: None
+        fmt = kwargs.pop("fmt", False)# default:
+        chunksize = kwargs.pop("chunksize", None)# default: None
         engine = "c" if chunksize else engine  # when chunksize, recommend 'c'
-        low_memory = kwargs.pop("low_memory", True)
+        low_memory = kwargs.pop("low_memory", True)# default: True
         low_memory = (
             False if chunksize else True
-        )  # when chunksize, recommend low_memory=False
+        )  # when chunksize, recommend low_memory=False # default:
         verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv", verbose=verbose)
-        if comment is None:
+        if comment is None:# default: None
             comment = get_comment(
                 fpath, comment=None, encoding="utf-8", lines_to_check=5
             )
         try:
             df = pd.read_csv(
                 fpath,
@@ -2107,8 +2099,8 @@ def fload(fpath, kind=None, **kwargs):
                 separators = [",", "\t", ";", "|", " "]
                 for sep in separators:
                     sep2show = sep if sep != "\t" else "\\t"
-                    # print(f'trying with: engine=pyarrow, sep="{sep2show}"')
-                    # print(".")
+                    if verbose:
+                        print(f'trying with: engine=pyarrow, sep="{sep2show}"')
                     try:
                         df = pd.read_csv(
                             fpath,
@@ -2137,8 +2129,9 @@ def fload(fpath, kind=None, **kwargs):
                             separators = [",", "\t", ";", "|", " "]
                             for sep in separators:
                                 try:
-                                    # sep2show = sep if sep != "\t" else "\\t"
-                                    # print(f"trying with: engine={engine}, sep='{sep2show}'")
+                                    sep2show = sep if sep != "\t" else "\\t"
+                                    if verbose:
+                                        print(f"trying with: engine={engine}, sep='{sep2show}'")
                                     # print(".")
                                     df = pd.read_csv(
                                         fpath,
@@ -2171,8 +2164,9 @@ def fload(fpath, kind=None, **kwargs):
                                     continue
                             else:
                                 pass
-        if is_df_abnormal(df,verbose=verbose):
-            df=pd.read_csv(fpath,**kwargs)
+        print(kwargs)
+        # if is_df_abnormal(df,verbose=verbose):
+        #     df=pd.read_csv(fpath,**kwargs)
         display(df.head(2))
         print(f"shape: {df.shape}")
         return df
@@ -2386,7 +2380,7 @@ def fload(fpath, kind=None, **kwargs):
     elif kind == "xml":
         return load_xml(fpath)
     elif kind in ["csv", "tsv"]:
-        verbose = kwargs.pop("verbose", False)
+        # verbose = kwargs.pop("verbose", False)
         if run_once_within():
             use_pd("read_csv")
         content = load_csv(fpath, **kwargs)
@@ -5236,15 +5230,44 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
+def df_circular(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
+    """
+    Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
+    Usage:
+        data = pd.DataFrame({'month': [1, 4, 7, 10, 12]})  # Just months as an example
+        # df_circular month cyclically
+        data = df_circular(data, 'month', 12)
+    """
+    if columns is None:
+        columns = list(data.columns)  # default: encode every column of data
+    if max_val is None:
+        max_val = np.max(data[columns])  # default divisor; NOTE(review): runs before the str-to-list wrap below, so a list `columns` hands np.max a DataFrame - confirm it reduces to one scalar
+    if isinstance(columns, str):
+        columns = [columns]  # wrap a single column name so the loops below can iterate over it
+    # Each selected column `col` gains `col_sin` and `col_cos`: sin/cos of 2*pi*value/max_val
+    if inplace:
+        # inplace=True: write the new columns onto `data`; this branch has no return statement, so the call evaluates to None
+        for col in columns:
+            data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
+            data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
+    else:
+        # inplace=False: leave `data` untouched, add the columns to a copy and return that copy
+        new_data = data.copy()
+        for col in columns:
+            new_data[col + '_sin'] = np.sin(2 * np.pi * new_data[col] / max_val)
+            new_data[col + '_cos'] = np.cos(2 * np.pi * new_data[col] / max_val)
+        return new_data
 # ! DataFrame
 def df_astype(
     data: pd.DataFrame,
     columns: Optional[Union[str, List[str]]] = None,
-    astype: str = "datetime",
+    astype: str = None,#"datetime",
     skip_row: Union[str, list] = None,
     fmt: Optional[str] = None,
-    inplace: bool = True,
+    inplace: bool = False,
     errors: str = "coerce",  # Can be "ignore", "raise", or "coerce"
     **kwargs,
 ) -> Optional[pd.DataFrame]:
@@ -5304,6 +5327,7 @@ def df_astype(
         "day",
         "month",
         "year",
+        "circular"
     ]
     # If inplace is False, make a copy of the DataFrame
     if not inplace:
@@ -5398,10 +5422,22 @@ def df_astype(
                 kwargs.pop("errors", None)
                 data[column] = pd.to_timedelta(data[column], errors=errors, **kwargs)
                 # print(f"Successfully converted '{column}' to timedelta.")
+            elif astype == "circular":
+                max_val = kwargs.get('max_val',None)
+                data[column]=df_circular(data=data,columns=column,max_val=max_val)
             else:
                 # Convert to other types (e.g., float, int)
-                data[column] = data[column].astype(astype)
+                if astype=='int':
+                    data[column] = data[column].astype('float').astype('int')
+                else:
+                    data[column] = data[column].astype(astype)
                 # print(f"Successfully converted '{column}' to {astype}.")
+            # format
+            try:
+                if fmt is not None:
+                    data[column] = data[column].apply(lambda x: f"{x:{fmt}}")
+            except Exception as e:
+                print(f"设置格式的时候有误: {e}")
         except Exception as e:
             print(f"Error converting '{column}' to {astype}: {e}")
     try:
@@ -6325,6 +6361,7 @@ def df_reducer(
     random_state=1,
     ax=None,
     figsize=None,
+    verbose=True,
     **kwargs,
 ) -> pd.DataFrame:
     dict_methods = {
@@ -6364,7 +6401,8 @@ def df_reducer(
         # "autoencoder","nmf",
     ]
     method = strcmp(method, methods)[0]
-    print(f"\nprocessing with using {dict_methods[method]}:")
+    if verbose:
+        print(f"\nprocessing with using {dict_methods[method]}:")
     xlabel, ylabel = None, None
     if columns is None:
         columns = data.select_dtypes(include="number").columns.tolist()
@@ -6863,7 +6901,7 @@ def df_reducer(
             hue=hue,
             s=size,
             edgecolor=edgecolor,
-            kind="scater",
+            kind_="scater",
             figsets=dict(
                 legend=dict(loc=legend_loc, markerscale=markerscale,bbox_to_anchor=bbox_to_anchor,ncols=ncols,fontsize=8),
                 xlabel=xlabel if xlabel else None,
@@ -7334,10 +7372,13 @@ def evaluate_cluster(
 def df_qc(
     data: pd.DataFrame,
     columns=None,
-    verbose=False,
+    skim=False,
     plot_=True,
     max_cols=20,  # only for plots
+    hue=None,
     output=False,
+    verbose=True,
+    dir_save=None
 ):
     """
     Usage example:
@@ -7345,22 +7386,24 @@ def df_qc(
     """
     from statsmodels.stats.outliers_influence import variance_inflation_factor
     from scipy.stats import skew, kurtosis, entropy
-    import skimpy
     #! display(data.select_dtypes(include=[np.number]).describe())
     #!skim
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
             data=data[columns]
-    try:
-        skimpy.skim(data)
-    except:
-        numerical_data = data.select_dtypes(include=[np.number])
-        skimpy.skim(numerical_data)
+    if skim:
+        try:
+            import skimpy
+            skimpy.skim(data)
+        except:
+            numerical_data = data.select_dtypes(include=[np.number])
+            skimpy.skim(numerical_data)
     # Fill completely NaN columns with a default value (e.g., 0)
     data = data.copy()
     data.loc[:, data.isna().all()] = 0
     res_qc = {}
+    print(f"data.shape:{data.shape}")
     # Missing values
     res_qc["missing_values"] = data.isnull().sum()
@@ -7403,7 +7446,7 @@ def df_qc(
         numeric_df = data.select_dtypes(include=[np.number]).dropna()
         vif_data = pd.DataFrame()
         res_qc["vif"]=vif_data
-        if numeric_df.shape[1] > 1:
+        if numeric_df.shape[1] > 1 and not numeric_df.empty:
             vif_data["feature"] = numeric_df.columns
             vif_data["VIF"] = [
                 variance_inflation_factor(numeric_df.values, i)
@@ -7495,69 +7538,70 @@ def df_qc(
     # Report generation
     if verbose:
         print("=== QC Report Summary ===")
-        print("\nMissing Values (Total and %):")
-        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
-        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
-        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
-        print("\nData Types:")
-        print(res_qc["data_types"])
-        print("\nUnique Values per Column:")
-        print(res_qc["unique_values"])
-        print("\nConstant Columns:", res_qc["constant_columns"])
-        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
-        print("Duplicate Columns:", res_qc["duplicate_columns"])
+        print("\n⤵  Summary Statistics:")
+        display(res_qc["summary_statistics"])
+        print("\n⤵  Data Types:")
+        display(res_qc["data_types"])
+        if any(res_qc["missing_values"][res_qc["missing_values"] > 0]):
+            print(" ⤵  Missing Values Counts:")
+            display(res_qc["missing_values"][res_qc["missing_values"] > 0])
+            # print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+            print("\n⤵  Rows with Missing Values:",res_qc["rows_with_missing"])
+        if any(res_qc["outlier_num"]):
+            print("\n⤵  Outlier Report:")
+            display(res_qc["outlier_num"])
+        if any(res_qc["unique_values"]):
+            print("\n⤵  Unique Values per Column:")
+            display(res_qc["unique_values"])
+        print("\n⤵  Constant Columns:", res_qc["constant_columns"]) if any(res_qc["constant_columns"]) else None
+        print("⤵  Duplicate Rows:", res_qc["duplicate_rows"]) if res_qc["duplicate_rows"] else None
+        print("⤵  Duplicate Columns:", res_qc["duplicate_columns"]) if any(res_qc["duplicate_columns"]) else None
         if res_qc["empty_columns"]:
-            print("\nEmpty Columns:", res_qc["empty_columns"])
-        print("\nOutlier Report:")
-        print(res_qc["outlier_num"])
-        print("\nPercentage of Values Replaced per Column:")
-        print(res_qc["outlier_percentage"])
+            print("\n⤵  Empty Columns:", res_qc["empty_columns"])
-        print("\nHigh Correlations (>|0.9|):")
-        for col1, col2 in res_qc["high_correlations"]:
-            print(f"  {col1} and {col2}")
+        if any(res_qc["high_correlations"]):
+            print("\n⤵  High Correlations (>|0.9|):")
+            for col1, col2 in res_qc["high_correlations"]:
+                print(f"  {col1} and {col2}")
         if "vif" in res_qc:
-            print("\nFeatures with High VIF (>|5|):")
+            print("\n⤵  Features with High VIF (>|5|):")
             print(res_qc["vif"])
-        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
-        print(res_qc["high_cardinality_categoricals"])
-        print("\nInconsistent Data Types:")
-        print(res_qc["inconsistent_types"])
-        print("\nRange Checks for Numeric Columns:")
-        print(res_qc["range_checks"])
-        print("\nText Length Analysis:")
-        for col, stats in res_qc["text_length_analysis"].items():
-            print(
-                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
-            )
-        print("\nSummary Statistics:")
-        print(res_qc["summary_statistics"])
+        if any(res_qc["high_cardinality_categoricals"]):
+            print("\n⤵  High Cardinality Categorical Columns (>|50 unique|):")
+            print(res_qc["high_cardinality_categoricals"])
+        if any(res_qc["inconsistent_types"]):
+            print("\n⤵  Inconsistent Data Types:")
+            display(res_qc["inconsistent_types"])
+        if any(res_qc["text_length_analysis"]):
+            print("\n⤵  Text Length Analysis:")
+            for col, stats in res_qc["text_length_analysis"].items():
+                print(
+                    f"{col}: Avg Length={round(stats['avg_length'],1)}, Length Variance={round(stats['length_variance'],1)}"
+                )
         if res_qc["warnings"]:
             print("\nWarnings:")
             for warning in res_qc["warnings"]:
                 print("  -", warning)
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
+        if dir_save:
+            try:
+                figsave(dir_save)
+            except Exception as e:
+                print(f"⚠️: {e}")
     if output:
         return res_qc
     return None
-def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
@@ -7574,91 +7618,73 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
     )
     if len(missing_data) > max_cols:
         missing_data = missing_data[:max_cols]
-    ax=sns.barplot(
-        x=missing_data.index,
-        y=missing_data.values,
-        hue=missing_data.index,
-        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+    ax_missing_data=sns.barplot(
+        y=missing_data.index,
+        x=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
-    ax2 = ax.twinx()
-    # Plot missing value percentages
-    missing_percentage = res_qc["missing_percentage"][
-        res_qc["missing_percentage"] > 0
-    ].sort_values(ascending=False)
-    sns.barplot(
-        x=missing_percentage.index,
-        y=missing_percentage.values,
-        hue=missing_percentage.index,
-        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
-        ax=ax2,#nexttile(),
-    )
-    figsets(xangle=45, ylabel="%",ax=ax2)
-    ax2.tick_params(axis="y", color='r',labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(title="Missing (#)", xlabel="#",ax=ax_missing_data,ylabel=None,fontsize=8 if len(missing_data)<=20 else 6)
     outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
     if len(outlier_num) > max_cols:
         outlier_num = outlier_num[:max_cols]
     ax_outlier_num=sns.barplot(
-        x=outlier_num.index,
-        y=outlier_num.values,
+        y=outlier_num.index,
+        x=outlier_num.values,
         hue=outlier_num.index,
         palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
         ax=nexttile(),
     )
-    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
-    ax_outlier_percentage = ax_outlier_num.twinx()
-    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
-    if len(outlier_percentage) > max_cols:
-        outlier_percentage = outlier_percentage[:max_cols]
-    ax_outlier_percentage=sns.barplot(
-        x=outlier_percentage.index,
-        y=outlier_percentage.values,
-        hue=outlier_percentage.index,
-        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
-        ax=ax2 #nexttile(),
-    )
-    figsets(
-        xangle=45,
-        ylabel="%",
-        xlabel=None,
-        ylim=[0, outlier_percentage.max() + 2],
-        ax=ax_outlier_percentage
-    )
-    ax2.tick_params(axis="y", color='r',labelcolor='r')
-    ax2.yaxis.label.set_color('r')
+    figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
+    #!
+    try:
+        if data.select_dtypes(include=np.number).shape[1]<=10:
+            for col in data.select_dtypes(include=np.number).columns:
+                sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
+                figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
+    except:
+        pass
+    #!
+    try:
+        for col in data.select_dtypes(include='category').columns:
+            sns.countplot(y=data[col],
+                          palette=get_color(data.select_dtypes(include='category').shape[1], cmap="coolwarm")[::-1],
+                          ax=nexttile())
+            figsets(title=f"Count Plot: {col}", xlabel="Count", ylabel=col)
+    except Exception as e:
+        pass
     # Skewness and Kurtosis Plots
     skewness = res_qc["skewness"].sort_values(ascending=False)
     kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
     if not skewness.empty:
         ax_skewness=sns.barplot(
-            x=skewness.index,
-            y=skewness.values,
+            y=skewness.index,
+            x=skewness.values,
             hue=skewness.index,
             palette=get_color(len(skewness), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Skewed Numeric Columns (Skewness > 1)",
-            ylabel="Skewness",xlabel=None,ax=ax_skewness
+            xlabel="Skewness",ylabel=None,ax=ax_skewness,
+            fontsize=8 if len(skewness)<=20 else 6
         )
     if not kurtosis.empty:
         ax_kurtosis=sns.barplot(
-            x=kurtosis.index,
-            y=kurtosis.values,
+            y=kurtosis.index,
+            x=kurtosis.values,
             hue=kurtosis.index,
             palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
             ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
-            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+            xlabel="Kurtosis",ylabel=None,ax=ax_kurtosis,
+            fontsize=8 if len(kurtosis)<=20 else 6
         )
     # Entropy for Categorical Variables
@@ -7666,56 +7692,46 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
         ascending=False
     )
     ax_entropy_data=sns.barplot(
-        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+        y=entropy_data.index, x=entropy_data.values,hue=entropy_data.index,
+        palette=get_color(len(entropy_data), cmap="coolwarm")[::-1],
+        ax=nexttile()
     )
     figsets(
-            xangle=45,
-            xlabel="Categorical Columns",
+            ylabel="Categorical Columns",
             title="Entropy of Categorical Variables",
-            ylabel="Entropy (bits)",
-            ax=ax_entropy_data
-        )
-    # Distribution Analysis: Boxplot for IQR
-    ax_iqr=sns.boxplot(
-        data=data[res_qc["distribution_analysis"].index],
-        orient="v",
-        palette="Set3",
-        ax=nexttile(),
-    )
-    figsets(
-            xangle=45,
-            title="Range for Numeric Columns",
-            ylabel="#",
-            ax=ax_iqr
-        )
+            xlabel="Entropy (bits)",
+            ax=ax_entropy_data,
+            fontsize=8 if len(entropy_data)<=20 else 6
+        )
     # unique counts
     unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
     ax_unique_counts_=sns.barplot(
-                x=unique_counts.index,
-                y=unique_counts.values,
+                y=unique_counts.index,
+                x=unique_counts.values,
                 hue=unique_counts.index,
-                palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+                palette=get_color(len(unique_counts), cmap="coolwarm")[::-1],
                 ax=nexttile())
     figsets(
-            xangle=45,
             title="Unique Counts",
-            xlabel=None,
-            ylabel="#",
-            ax=ax_unique_counts_
+            ylabel=None,
+            xlabel="#",
+            ax=ax_unique_counts_,
+            fontsize=8 if len(unique_counts)<=20 else 6
         )
     # Binary Checking
-    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
-                y=unique_counts[unique_counts<10].values,
-                hue=unique_counts[unique_counts<10].index,
-                palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+    ax_unique_counts=sns.barplot(y=unique_counts[unique_counts<8].index,
+                x=unique_counts[unique_counts<8].values,
+                hue=unique_counts[unique_counts<8].index,
+                palette=get_color(len(unique_counts[unique_counts<8].index), cmap="coolwarm")[::-1],
                 ax=nexttile())
-    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    plt.axvline(x=2, color="r", linestyle="--", lw=2)
     figsets(
-            xangle=45,
-            xlabel=None,
+            ylabel=None,
             title="Binary Checking",
-            ylabel="#",
-            ax=ax_unique_counts
+            xlabel="#",
+            ax=ax_unique_counts,
+            fontsize=8 if len(unique_counts[unique_counts<10].index)<=20 else 6
         )
     # dtypes counts
@@ -7751,14 +7767,15 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 ha="center",
                 va="top",
                 c="k",
-                fontsize=8,
+                fontsize=8  if len(dtype_counts.index)<=20 else 6,
                 rotation=0,
             )
     figsets(
         xlabel=None,
         title="Dtypes",
         ylabel="#",
-        ax=ax_dtype_counts
+        ax=ax_dtype_counts,
+        fontsize=8  if len(dtype_counts.index)<=20 else 6,
     )
     # High cardinality: Show top categorical columns by unique value count
@@ -7772,24 +7789,26 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
     if high_cardinality:
         ax_high_cardinality=sns.barplot(
-            x=list(high_cardinality.keys()),
-            y=list(high_cardinality.values()),
+            y=list(high_cardinality.keys()),
+            x=list(high_cardinality.values()),
             hue=list(high_cardinality.keys()),
-            palette="Oranges", ax=nexttile()
+            palette=get_color(len(list(high_cardinality.keys())), cmap="coolwarm")[::-1],
+            ax=nexttile(),
         )
         figsets(
-            xangle=45,
             title="High Cardinality Categorical Columns",
-            ylabel="Unique Value Count",
-            ax=ax_high_cardinality
+            xlabel="Unique Value Count",
+            ax=ax_high_cardinality,
+            fontsize=8 if len(list(high_cardinality.keys()))<=20 else 6
         )
     if res_qc["low_variance_features"]:
         low_variance_data = data[res_qc["low_variance_features"]].copy()
         for col in low_variance_data.columns:
-            sns.histplot(
+            ax_low_variance_features=sns.histplot(
                 low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
             )
-            plt.title(f"Low Variance Feature: {col}")
+            figsets(title=f"Low Variance Feature: {col}",ax=ax_low_variance_features,
+            fontsize=8 if len(low_variance_data[col])<=20 else 6)
     # VIF plot for multicollinearity detection
     if "vif" in res_qc and not res_qc["vif"].empty:
@@ -7800,23 +7819,22 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                     x="VIF",
                     y="feature",
                     hue="VIF",
-                    palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+                    palette=get_color(len(vif_data), cmap="coolwarm")[::-1],
                     ax=nexttile())
         figsets(
-            xangle=45,
             title="Variance Inflation Factor(VIF)",
-            xlabel="Variance Inflation Factor(VIF)",
+            xlabel="VIF",
             ylabel="Features",
             legend=None,
-            ax=ax_vif
+            ax=ax_vif,
+            fontsize=8 if len(vif_data)<=20 else 6
         )
     # Correlation heatmap for numeric columns with high correlation pairs
     if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
-        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        corr = data.select_dtypes(include=[np.number]).corr()
         if corr.shape[1]<=33:
             mask = np.triu(np.ones_like(corr, dtype=bool))
-                # Dynamically scale fontsize based on the number of columns
             num_columns = corr.shape[1]
             fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2))  # Scale between 8 and 12
@@ -7826,7 +7844,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20)
                 annot=True,
                 cmap="coolwarm",
                 center=0,
-                fmt=".2f",
+                fmt=".1f",
                 linewidths=0.5,
                 vmin=-1, vmax=1,
                 ax=nexttile(2, 2),

py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl

py2ls 0.2.4.23__py3-none-any.whl → 0.2.4.24__py3-none-any.whl