py2ls 0.2.4.15__py3-none-any.whl → 0.2.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/ips.py +722 -12
- py2ls/ml2ls copy.py +2906 -0
- py2ls/ml2ls.py +345 -12
- py2ls/plot.py +409 -24
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.16.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.16.dist-info}/RECORD +8 -7
- {py2ls-0.2.4.15.dist-info → py2ls-0.2.4.16.dist-info}/WHEEL +0 -0
py2ls/ips.py
CHANGED
@@ -3163,14 +3163,19 @@ def listdir(
     if kind is None:
         ls = os.listdir(rootdir)
         ls = [f for f in ls if not f.startswith(".") and not f.startswith("~")]
-
+        if verbose:
+            if len(ls)>20:
+                print(ls[:20])
+            else:
+                print(ls)
         df_all = pd.DataFrame(
             {
                 "fname": ls,
                 "fpath": [os.path.join(rootdir, i) for i in ls],
             }
         )
-
+        if verbose:
+            display(df_all.head())
         return df_all
     if isinstance(kind, list):
         f_ = []
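A minimal usage sketch of the new verbose preview (the import path and the rootdir/kind/verbose parameters are inferred from this hunk rather than from the package docs, and the display() call assumes an IPython environment):

    from py2ls.ips import listdir

    # With kind=None and verbose=True, the function now prints at most the
    # first 20 directory entries before returning the DataFrame of names and paths.
    df_files = listdir("/tmp", kind=None, verbose=True)
    print(df_files[["fname", "fpath"]].head())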
@@ -3206,6 +3211,7 @@ def listdir(
             "size": [],
             "fname": [],
             "fpath": [],
+            "basename":[],
         }
         for item in ls:
             item_path = os.path.join(rootdir, item)
@@ -3228,6 +3234,7 @@ def listdir(
                 f["length"].append(len(filename))
                 f["path"].append(os.path.join(os.path.dirname(item_path), item))
                 fpath = os.path.join(os.path.dirname(item_path), item)
+                basename=os.path.basename(item_path)
                 f["size"].append(round(os.path.getsize(fpath) / 1024 / 1024, 3))
                 f["created_time"].append(
                     pd.to_datetime(os.path.getctime(item_path), unit="s")
@@ -3240,6 +3247,7 @@ def listdir(
                 )
                 f["fname"].append(filename) # will be removed
                 f["fpath"].append(fpath) # will be removed
+                f['basename'].append(basename)
                 i += 1
 
         f["num"] = i
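The three basename hunks above initialize a "basename" field, compute os.path.basename(item_path) once per file, and append it, so the table returned by listdir gains a basename column next to fname and fpath. The added call is plain standard-library behavior:

    import os

    item_path = "/tmp/reports/summary.final.txt"
    print(os.path.basename(item_path))  # "summary.final.txt", the value stored in the new "basename" column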
@@ -3462,7 +3470,6 @@ def figsave(*args, dpi=300):
         img.save(fname, format=ftype.upper(), dpi=(dpi, dpi))
     elif isinstance(img, np.ndarray):
         import cv2
-
         # Check the shape of the image to determine color mode
         if img.ndim == 2:
             # Grayscale image
@@ -5055,16 +5062,22 @@ def _df_outlier(
     from scipy.stats import zscore
     from sklearn.ensemble import IsolationForest
     from sklearn.preprocessing import StandardScaler
-
+
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
     numeric_data = data.select_dtypes(include=[np.number])
     non_numeric_data = data.select_dtypes(exclude=[np.number])
 
-    if columns is not None:
-        numeric_data = numeric_data[columns]
-    if numeric_data.empty:
+    # if columns is not None:
+    #     numeric_data = numeric_data[columns]
+    if numeric_data.empty:
         raise ValueError("Input data must contain numeric columns.")
 
     outliers_df = pd.DataFrame(index=numeric_data.index)
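The hunk above makes _df_outlier tolerant of all-NaN columns (they are zero-filled up front) and lets a columns argument restrict the frame before outlier detection. The zero-fill idiom on its own, in plain pandas:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [np.nan, np.nan, np.nan]})
    df = df.copy()
    df.loc[:, df.isna().all()] = 0  # only column "b" is entirely NaN, so only "b" becomes 0
    print(df)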
@@ -5626,6 +5639,10 @@ def df_fillna(
     for col in data.columns:
         data[col] = data[col].apply(lambda x: np.nan if x is None else x)
 
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+
     col_names_org = data.columns.tolist()
     index_names_org = data.index.tolist()
     # Separate numeric and non-numeric columns
@@ -5682,11 +5699,11 @@ def df_fillna(
             imputed_data = imputer.fit_transform(numeric_data.T)
         else:
             raise ValueError("Invalid axis. Use 0 for columns or 1 for rows.")
-
+
         imputed_data = pd.DataFrame(
             imputed_data if axis == 0 else imputed_data.T,
-            index=numeric_data.index if axis == 0 else
-            columns=numeric_data.columns if axis == 0 else
+            index=numeric_data.index if axis == 0 else numeric_data.columns,
+            columns=numeric_data.columns if axis == 0 else numeric_data.index,
         )
         for col in imputed_data.select_dtypes(include=[np.number]).columns:
             imputed_data[col] = imputed_data[col].astype(numeric_data[col].dtype)
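This hunk completes the previously truncated ternaries: in the non-default axis branch the rebuilt DataFrame now receives numeric_data.columns as its index labels and numeric_data.index as its column labels. A standalone reminder of why transposition swaps the two label sets (plain pandas, independent of the py2ls imputer wiring):

    import pandas as pd

    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["r1", "r2"], columns=["c1", "c2", "c3"])
    print(df.T.index.tolist())    # ['c1', 'c2', 'c3']  (former columns become the index)
    print(df.T.columns.tolist())  # ['r1', 'r2']        (former index becomes the columns)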
@@ -5826,8 +5843,13 @@ def df_encoder(
         from sklearn.preprocessing import LabelEncoder
 
         encoder = LabelEncoder()
-
-
+        # Apply LabelEncoder only to non-numeric columns
+        non_numeric_columns = [col for col in columns if not pd.api.types.is_numeric_dtype(data[col])]
+
+        if not non_numeric_columns:
+            return data
+        encoded_data = data[non_numeric_columns].apply(lambda col: encoder.fit_transform(col))
+        return pd.concat([data.drop(non_numeric_columns, axis=1), encoded_data], axis=1)
 
     # Target encoding (Mean of the target for each category)
     elif method == "target":
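The label branch of df_encoder now encodes only the non-numeric columns and concatenates them back onto the untouched numeric ones. The same idiom in plain pandas/scikit-learn (a standalone sketch, not the py2ls wrapper itself):

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    data = pd.DataFrame({"city": ["Berlin", "Paris", "Berlin"], "age": [30, 25, 40]})
    columns = ["city", "age"]
    encoder = LabelEncoder()
    non_numeric = [c for c in columns if not pd.api.types.is_numeric_dtype(data[c])]
    encoded = data[non_numeric].apply(lambda col: encoder.fit_transform(col))
    # "city" is label-encoded (Berlin=0, Paris=1); "age" passes through untouched
    print(pd.concat([data.drop(non_numeric, axis=1), encoded], axis=1))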
@@ -6878,7 +6900,188 @@ def df_reducer(
 # example:
 # df_reducer(data=data_log, columns=markers, n_components=2)
 
+def df_format(data, threshold_unique=0.5, verbose=False):
+    """
+    Detect table format: long, wide or uncertain.
+
+    Parameters:
+    - data (pd.DataFrame): DataFrame to check.
+    - threshold_unique (float): Proportion threshold for detecting categorical columns.
+
+    Returns:
+    - "long" if detected as long format,
+    - "wide" if detected as wide format
+    - "uncertain" if ambiguous.
+    """
+    from scipy.stats import entropy
+    from sklearn.cluster import AgglomerativeClustering
+    from sklearn.preprocessing import StandardScaler
+
+    long_score = 0
+    wide_score = 0
+
+    n_rows, n_cols = data.shape
+
+    # Step 1: Row-Column Ratio Heuristic
+    if n_rows > 3 * n_cols:
+        long_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests long format (many rows relative to columns)."
+            )
+    elif n_cols > 3 * n_rows:
+        wide_score += 2
+        if verbose:
+            print(
+                "Row-Column Ratio suggests wide format (many columns relative to rows)."
+            )
+
+    # Step 2: Unique-to-duplicate ratio and entropy for categorical variables
+    unique_counts = data.apply(lambda x: x.nunique())
+    duplicate_ratio = 1 - unique_counts / n_rows
+    if (duplicate_ratio > 0.2).sum() > 0.5 * n_cols:
+        wide_score += 2
+        if verbose:
+            print("High duplicate values in columns suggest wide format.")
+    else:
+        long_score += 1
+        if verbose:
+            print(
+                "Lower duplicate ratio suggests long format (higher row variability)."
+            )
 
+    # Calculate entropy for categorical columns
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    if len(categorical_cols) > 0:
+        for col in categorical_cols:
+            counts = data[col].value_counts(normalize=True)
+            col_entropy = entropy(counts)
+            if col_entropy < 1.5:
+                long_score += 1
+                if verbose:
+                    print(
+                        f"Column '{col}' entropy suggests categorical, supporting long format."
+                    )
+            else:
+                wide_score += 1
+                if verbose:
+                    print(f"Column '{col}' entropy is higher, supporting wide format.")
+
+    # Step 3: Column grouping analysis for patterns in suffixes/prefixes
+    col_names = data.columns.astype(str)
+    suffix_count = sum("_" in col or col[-1].isdigit() for col in col_names)
+    if suffix_count > 0.3 * n_cols:
+        wide_score += 2
+        if verbose:
+            print(
+                "Detected suffix/prefix patterns in column names, suggesting wide format."
+            )
+
+    # Step 4: Entity identifier detection for long format with categorical columns
+    if len(categorical_cols) > 0 and n_rows > n_cols:
+        entity_identifier_count = sum(
+            data.duplicated(subset=categorical_cols, keep=False)
+        )
+        if entity_identifier_count > 0.2 * n_rows:
+            long_score += 2
+            if verbose:
+                print(
+                    "Significant duplicate rows based on categorical columns, suggesting long format."
+                )
+
+    # Step 5: Clustering analysis on numerical columns for correlation in wide format
+    numeric_cols = data.select_dtypes(include="number").columns
+    if len(numeric_cols) > 1:
+        scaled_data = StandardScaler().fit_transform(data[numeric_cols].dropna())
+        clustering = AgglomerativeClustering(n_clusters=2).fit(scaled_data.T)
+        cluster_labels = pd.Series(clustering.labels_)
+        if cluster_labels.nunique() < len(numeric_cols) * 0.5:
+            wide_score += 2
+            if verbose:
+                print("Clustering on columns shows grouping, suggesting wide format.")
+
+    # Step 6: Inter-column correlation analysis
+    if len(numeric_cols) > 1:
+        corr_matrix = data[numeric_cols].corr().abs()
+        avg_corr = (
+            corr_matrix.where(~np.eye(len(corr_matrix), dtype=bool)).mean().mean()
+        )
+        if avg_corr > 0.6:
+            wide_score += 2
+            if verbose:
+                print("High inter-column correlation suggests wide format.")
+
+    # Step 7: Missing value pattern analysis
+    missing_patterns = data.isna().sum(axis=1)
+    if missing_patterns.std() < 2:
+        wide_score += 1
+        if verbose:
+            print(
+                "Low variation in missing patterns across rows, supporting wide format."
+            )
+    elif missing_patterns.mean() < 1:
+        long_score += 1
+        if verbose:
+            print("Lower missing pattern suggests long format (less structured).")
+
+    # Step 8: Multi-level clustering on rows to detect block structure for wide format
+    if len(numeric_cols) > 1 and n_rows > 5:
+        clustering_rows = AgglomerativeClustering(n_clusters=2).fit(scaled_data)
+        if pd.Series(clustering_rows.labels_).nunique() < 2:
+            wide_score += 2
+            if verbose:
+                print("Row clustering reveals homogeneity, suggesting wide format.")
+
+    # Step 9: Sequential name detection for time-series pattern in wide format
+    if any(col.isdigit() or col.startswith("T") for col in col_names):
+        wide_score += 1
+        if verbose:
+            print("Detected time-like sequential column names, supporting wide format.")
+
+    # Step 10: Entropy of numeric columns
+    numeric_entropy = data[numeric_cols].apply(
+        lambda x: entropy(pd.cut(x, bins=10).value_counts(normalize=True))
+    )
+    if numeric_entropy.mean() < 2:
+        wide_score += 2
+        if verbose:
+            print(
+                "Low entropy in numeric columns indicates stability across columns, supporting wide format."
+            )
+
+    # Step 11: Tie-breaking strategy if scores are equal
+    if wide_score == long_score:
+        if n_cols > n_rows:
+            wide_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on column-major structure, favoring wide format."
+                )
+        elif n_rows > n_cols:
+            long_score += 1
+            if verbose:
+                print(
+                    "Tie-breaking based on row-major structure, favoring long format."
+                )
+        else:
+            if verbose:
+                print("Tie-breaking inconclusive; returning 'uncertain'.")
+            return "uncertain"
+
+    # Final decision
+    if wide_score > long_score:
+        if verbose:
+            print("Final decision: Wide format.")
+        return "wide"
+    elif long_score > wide_score:
+        if verbose:
+            print("Final decision: Long format.")
+        return "long"
+    else:
+        if verbose:
+            print("Final decision: Uncertain format.")
+        return "uncertain"
+
 def plot_cluster(
     data: pd.DataFrame,
     labels: np.ndarray,
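The new df_format helper scores a set of shape, duplication, entropy, clustering and naming heuristics and returns "long", "wide" or "uncertain". A usage sketch, assuming the function is importable from py2ls.ips; the printed verdict is only what the heuristics would be expected to favor, not guaranteed:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_format  # assumed import path for the new helper

    # Many rows, a repeated categorical identifier, one value column: a "long"-looking table.
    long_df = pd.DataFrame({
        "subject": np.repeat(["s1", "s2", "s3"], 40),
        "score": np.random.default_rng(0).normal(size=120),
    })
    print(df_format(long_df, verbose=True))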
@@ -7126,7 +7329,514 @@ def evaluate_cluster(
     metrics["V-Measure"] = np.nan
 
     return metrics
+def df_qc(
+    data: pd.DataFrame,
+    columns=None,
+    verbose=False,
+    plot_=True,
+    max_cols=20, # only for plots
+    output=False,
+):
+    """
+    Usage example:
+    df = pd.DataFrame(...) # Your DataFrameres_qc = df_qc(df)
+    """
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    from scipy.stats import skew, kurtosis, entropy
+    import skimpy
+
+    #! display(data.select_dtypes(include=[np.number]).describe())
+    #!skim
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    try:
+        skimpy.skim(data)
+    except:
+        numerical_data = data.select_dtypes(include=[np.number])
+        skimpy.skim(numerical_data)
+    # Fill completely NaN columns with a default value (e.g., 0)
+    data = data.copy()
+    data.loc[:, data.isna().all()] = 0
+    res_qc = {}
+
+    # Missing values
+    res_qc["missing_values"] = data.isnull().sum()
+    res_qc["missing_percentage"] = (res_qc["missing_values"] / len(data)) * 100
+    res_qc["rows_with_missing"] = data.isnull().any(axis=1).sum()
+
+    # Data types and unique values
+    res_qc["data_types"] = data.dtypes
+    res_qc["unique_values"] = data.nunique()
+    res_qc["constant_columns"] = [
+        col for col in data.columns if data[col].nunique() <= 1
+    ]
+
+    # Duplicate rows and columns
+    res_qc["duplicate_rows"] = data.duplicated().sum()
+    res_qc["duplicate_columns"] = data.columns[data.columns.duplicated()].tolist()
+
+    # Empty columns
+    res_qc["empty_columns"] = [col for col in data.columns if data[col].isnull().all()]
+
+    # outliers
+    data_outliers = df_outlier(data)
+    outlier_num = data_outliers.isna().sum() - data.isnull().sum()
+    res_qc["outlier_num"] = outlier_num[outlier_num > 0]
+    outlier_percentage=(outlier_num / len(data_outliers)) * 100
+    res_qc["outlier_percentage"] = outlier_percentage[outlier_percentage>0]
+    # Correlation and multicollinearity (VIF)
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        numeric_df = data.select_dtypes(include=[np.number]).dropna()
+        corr_matrix = numeric_df.corr()
+        high_corr_pairs = [
+            (col1, col2)
+            for col1 in corr_matrix.columns
+            for col2 in corr_matrix.columns
+            if col1 != col2 and abs(corr_matrix[col1][col2]) > 0.9
+        ]
+        res_qc["high_correlations"] = high_corr_pairs
+
+    # VIF for multicollinearity check
+    numeric_df = data.select_dtypes(include=[np.number]).dropna()
+    vif_data = pd.DataFrame()
+    res_qc["vif"]=vif_data
+    if numeric_df.shape[1] > 1:
+        vif_data["feature"] = numeric_df.columns
+        vif_data["VIF"] = [
+            variance_inflation_factor(numeric_df.values, i)
+            for i in range(numeric_df.shape[1])
+        ]
+        res_qc["vif"] = vif_data[
+            vif_data["VIF"] > 5
+        ] # Typically VIF > 5 indicates multicollinearity
+    # Skewness and Kurtosis
+    skewness = data.skew(numeric_only=True)
+    kurtosis_vals = data.kurt(numeric_only=True)
+    res_qc["skewness"] = skewness[abs(skewness) > 1]
+    res_qc["kurtosis"] = kurtosis_vals[abs(kurtosis_vals) > 3]
+
+    # Entropy for categorical columns (higher entropy suggests more disorder)
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    res_qc["entropy_categoricals"] = {
+        col: entropy(data[col].value_counts(normalize=True), base=2)
+        for col in categorical_cols
+    }
+    # number of unique
+    res_qc["unique_counts"] = data.nunique()
+    # dtypes counts
+    res_qc['dtype_counts']=data.dtypes.value_counts()
+
+    # Distribution Analysis (mean, median, mode, std dev, IQR for numeric columns)
+    distribution_stats = data.select_dtypes(include=[np.number]).describe().T
+    iqr = data.select_dtypes(include=[np.number]).apply(
+        lambda x: x.quantile(0.75) - x.quantile(0.25)
+    )
+    distribution_stats["IQR"] = iqr
+    res_qc["distribution_analysis"] = distribution_stats
+
+    # Variance Check: Identify low-variance columns
+    variance_threshold = 0.01
+    low_variance_cols = [
+        col
+        for col in data.select_dtypes(include=[np.number]).columns
+        if data[col].var() < variance_threshold
+    ]
+    res_qc["low_variance_features"] = low_variance_cols
+
+    # Categorical columns and cardinality
+    categorical_cols = data.select_dtypes(include=["object", "category"]).columns
+    high_cardinality = {
+        col: data[col].nunique() for col in categorical_cols if data[col].nunique() > 50
+    }
+    res_qc["high_cardinality_categoricals"] = high_cardinality
+
+    # Feature-type inconsistency (mixed types in columns)
+    inconsistent_types = {}
+    for col in data.columns:
+        unique_types = set(type(val) for val in data[col].dropna())
+        if len(unique_types) > 1:
+            inconsistent_types[col] = unique_types
+    res_qc["inconsistent_types"] = inconsistent_types
+
+
+    # Text length analysis for text fields
+    text_lengths = {}
+    for col in categorical_cols:
+        text_lengths[col] = {
+            "avg_length": data[col].dropna().apply(len).mean(),
+            "length_variance": data[col].dropna().apply(len).var(),
+        }
+    res_qc["text_length_analysis"] = text_lengths
+
+    # Summary statistics
+    res_qc["summary_statistics"] = data.describe().T
+
+    # Automated warnings
+    warnings = []
+    if res_qc["duplicate_rows"] > 0:
+        warnings.append("Warning: Duplicate rows detected.")
+    if len(res_qc["empty_columns"]) > 0:
+        warnings.append("Warning: Columns with only NaN values detected.")
+    if len(res_qc["constant_columns"]) > 0:
+        warnings.append("Warning: Columns with a single constant value detected.")
+    if len(high_corr_pairs) > 0:
+        warnings.append("Warning: Highly correlated columns detected.")
+    if len(res_qc["vif"]) > 0:
+        warnings.append("Warning: Multicollinearity detected in features.")
+    if len(high_cardinality) > 0:
+        warnings.append("Warning: High cardinality in categorical columns.")
+    if len(inconsistent_types) > 0:
+        warnings.append("Warning: Columns with mixed data types detected.")
+    res_qc["warnings"] = warnings
+
+    # Report generation
+    if verbose:
+        print("=== QC Report Summary ===")
+        print("\nMissing Values (Total and %):")
+        print(res_qc["missing_values"][res_qc["missing_values"] > 0])
+        print(res_qc["missing_percentage"][res_qc["missing_percentage"] > 0])
+
+        print("\nRows with Missing Values:", res_qc["rows_with_missing"])
+
+        print("\nData Types:")
+        print(res_qc["data_types"])
+
+        print("\nUnique Values per Column:")
+        print(res_qc["unique_values"])
+
+        print("\nConstant Columns:", res_qc["constant_columns"])
+
+        print("\nDuplicate Rows:", res_qc["duplicate_rows"])
+        print("Duplicate Columns:", res_qc["duplicate_columns"])
+
+        if res_qc["empty_columns"]:
+            print("\nEmpty Columns:", res_qc["empty_columns"])
+
+        print("\nOutlier Report:")
+        print(res_qc["outlier_num"])
+        print("\nPercentage of Values Replaced per Column:")
+        print(res_qc["outlier_percentage"])
+
+        print("\nHigh Correlations (>|0.9|):")
+        for col1, col2 in res_qc["high_correlations"]:
+            print(f" {col1} and {col2}")
+
+        if "vif" in res_qc:
+            print("\nFeatures with High VIF (>|5|):")
+            print(res_qc["vif"])
+
+        print("\nHigh Cardinality Categorical Columns (>|50 unique|):")
+        print(res_qc["high_cardinality_categoricals"])
+
+        print("\nInconsistent Data Types:")
+        print(res_qc["inconsistent_types"])
+
+        print("\nRange Checks for Numeric Columns:")
+        print(res_qc["range_checks"])
+
+        print("\nText Length Analysis:")
+        for col, stats in res_qc["text_length_analysis"].items():
+            print(
+                f"{col}: Avg Length={stats['avg_length']}, Length Variance={stats['length_variance']}"
+            )
+
+        print("\nSummary Statistics:")
+        print(res_qc["summary_statistics"])
 
+        if res_qc["warnings"]:
+            print("\nWarnings:")
+            for warning in res_qc["warnings"]:
+                print(" -", warning)
+    if plot_:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=20)
+    if output:
+        return res_qc
+    return None
+
+
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20):
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from .plot import subplot, figsets, get_color
+
+    if columns is not None:
+        if isinstance(columns, (list,pd.core.indexes.base.Index)):
+            data=data[columns]
+    len_total = len(res_qc)
+    n_row, n_col = int((len_total + 10) / 3), 3
+    nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
+
+    missing_data = res_qc["missing_values"][res_qc["missing_values"] > 0].sort_values(
+        ascending=False
+    )
+    if len(missing_data) > max_cols:
+        missing_data = missing_data[:max_cols]
+    ax=sns.barplot(
+        x=missing_data.index,
+        y=missing_data.values,
+        hue=missing_data.index,
+        palette=get_color(len(missing_data), cmap="Blues")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Missing (#)", ylabel="#",ax=ax)
+
+    ax2 = ax.twinx()
+    # Plot missing value percentages
+    missing_percentage = res_qc["missing_percentage"][
+        res_qc["missing_percentage"] > 0
+    ].sort_values(ascending=False)
+    sns.barplot(
+        x=missing_percentage.index,
+        y=missing_percentage.values,
+        hue=missing_percentage.index,
+        palette=get_color(len(missing_percentage), cmap="Blues")[::-1],
+        ax=ax2,#nexttile(),
+    )
+    figsets(xangle=45, ylabel="%",ax=ax2)
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    outlier_num = res_qc["outlier_num"].sort_values(ascending=False)
+    if len(outlier_num) > max_cols:
+        outlier_num = outlier_num[:max_cols]
+    ax_outlier_num=sns.barplot(
+        x=outlier_num.index,
+        y=outlier_num.values,
+        hue=outlier_num.index,
+        palette=get_color(len(outlier_num), cmap="coolwarm")[::-1],
+        ax=nexttile(),
+    )
+    figsets(xangle=45, title="Outliers (#)", ylabel="#",xlabel=None)
+    ax_outlier_percentage = ax_outlier_num.twinx()
+    outlier_percentage = res_qc["outlier_percentage"].sort_values(ascending=False)
+    if len(outlier_percentage) > max_cols:
+        outlier_percentage = outlier_percentage[:max_cols]
+    ax_outlier_percentage=sns.barplot(
+        x=outlier_percentage.index,
+        y=outlier_percentage.values,
+        hue=outlier_percentage.index,
+        palette=get_color(len(outlier_percentage), cmap="coolwarm")[::-1],
+        ax=ax2 #nexttile(),
+    )
+    figsets(
+        xangle=45,
+        ylabel="%",
+        xlabel=None,
+        ylim=[0, outlier_percentage.max() + 2],
+        ax=ax_outlier_percentage
+    )
+    ax2.tick_params(axis="y", color='r',labelcolor='r')
+    ax2.yaxis.label.set_color('r')
+
+    # Skewness and Kurtosis Plots
+    skewness = res_qc["skewness"].sort_values(ascending=False)
+    kurtosis = res_qc["kurtosis"].sort_values(ascending=False)
+    if not skewness.empty:
+        ax_skewness=sns.barplot(
+            x=skewness.index,
+            y=skewness.values,
+            hue=skewness.index,
+            palette=get_color(len(skewness), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Skewed Numeric Columns (Skewness > 1)",
+            ylabel="Skewness",xlabel=None,ax=ax_skewness
+        )
+    if not kurtosis.empty:
+        ax_kurtosis=sns.barplot(
+            x=kurtosis.index,
+            y=kurtosis.values,
+            hue=kurtosis.index,
+            palette=get_color(len(kurtosis), cmap="coolwarm")[::-1],
+            ax=nexttile(),
+        )
+        figsets(
+            xangle=45,
+            title="Highly Kurtotic Numeric Columns (Kurtosis > 3)",
+            ylabel="Kurtosis",xlabel=None,ax=ax_kurtosis
+        )
+
+    # Entropy for Categorical Variables
+    entropy_data = pd.Series(res_qc["entropy_categoricals"]).sort_values(
+        ascending=False
+    )
+    ax_entropy_data=sns.barplot(
+        x=entropy_data.index, y=entropy_data.values,hue=entropy_data.index, palette="viridis", ax=nexttile()
+    )
+    figsets(
+        xangle=45,
+        xlabel="Categorical Columns",
+        title="Entropy of Categorical Variables",
+        ylabel="Entropy (bits)",
+        ax=ax_entropy_data
+    )
+    # Distribution Analysis: Boxplot for IQR
+    ax_iqr=sns.boxplot(
+        data=data[res_qc["distribution_analysis"].index],
+        orient="v",
+        palette="Set3",
+        ax=nexttile(),
+    )
+    figsets(
+        xangle=45,
+        title="Range for Numeric Columns",
+        ylabel="#",
+        ax=ax_iqr
+    )
+    # unique counts
+    unique_counts=res_qc["unique_counts"].sort_values(ascending=False)
+    ax_unique_counts_=sns.barplot(
+        x=unique_counts.index,
+        y=unique_counts.values,
+        hue=unique_counts.index,
+        palette=get_color(len(unique_counts)+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    figsets(
+        xangle=45,
+        title="Unique Counts",
+        xlabel=None,
+        ylabel="#",
+        ax=ax_unique_counts_
+    )
+    # Binary Checking
+    ax_unique_counts=sns.barplot(x=unique_counts[unique_counts<10].index,
+        y=unique_counts[unique_counts<10].values,
+        hue=unique_counts[unique_counts<10].index,
+        palette=get_color(len(unique_counts[unique_counts<10])+10, cmap="Blues")[::-1],
+        ax=nexttile())
+    plt.axhline(y=2, color="r", linestyle="--", lw=2)
+    figsets(
+        xangle=45,
+        xlabel=None,
+        title="Binary Checking",
+        ylabel="#",
+        ax=ax_unique_counts
+    )
+
+    # dtypes counts
+    dtype_counts = res_qc['dtype_counts']
+    txt = []
+    for tp in dtype_counts.index:
+        txt.append(list(data.select_dtypes(include=tp).columns))
+
+    ax_dtype_counts = sns.barplot(
+        x=dtype_counts.index,
+        y=dtype_counts.values,
+        color="#F3C8B2",
+        ax=nexttile(),
+    )
+    max_columns_per_row = 1 # Maximum number of columns per row
+    for i, tp in enumerate(dtype_counts.index):
+        if i<=20:
+            column_names = txt[i]
+            # Split the column names into multiple lines if too long
+            column_name_str = ", ".join(column_names)
+            if len(column_name_str) > 40: # If column names are too long, split them
+                column_name_str = "\n".join(
+                    [
+                        ", ".join(column_names[j : j + max_columns_per_row])
+                        for j in range(0, len(column_names), max_columns_per_row)
+                    ]
+                )
+            # Place text annotation with line breaks and rotate the text if needed
+            ax_dtype_counts.text(
+                i,
+                dtype_counts.values[i],
+                f"{column_name_str}",
+                ha="center",
+                va="top",
+                c="k",
+                fontsize=8,
+                rotation=0,
+            )
+    figsets(
+        xlabel=None,
+        title="Dtypes",
+        ylabel="#",
+        ax=ax_dtype_counts
+    )
+
+    # High cardinality: Show top categorical columns by unique value count
+    high_cardinality = res_qc["high_cardinality_categoricals"]
+    if high_cardinality and len(high_cardinality) > max_cols:
+        high_cardinality = dict(
+            sorted(high_cardinality.items(), key=lambda x: x[1], reverse=True)[
+                :max_cols
+            ]
+        )
+
+    if high_cardinality:
+        ax_high_cardinality=sns.barplot(
+            x=list(high_cardinality.keys()),
+            y=list(high_cardinality.values()),
+            hue=list(high_cardinality.keys()),
+            palette="Oranges", ax=nexttile()
+        )
+        figsets(
+            xangle=45,
+            title="High Cardinality Categorical Columns",
+            ylabel="Unique Value Count",
+            ax=ax_high_cardinality
+        )
+    if res_qc["low_variance_features"]:
+        low_variance_data = data[res_qc["low_variance_features"]].copy()
+        for col in low_variance_data.columns:
+            sns.histplot(
+                low_variance_data[col], bins=20, kde=True, color="coral", ax=nexttile()
+            )
+            plt.title(f"Low Variance Feature: {col}")
+
+    # VIF plot for multicollinearity detection
+    if "vif" in res_qc and not res_qc["vif"].empty:
+        vif_data = res_qc["vif"].sort_values(by="VIF", ascending=False)
+        if len(vif_data) > max_cols:
+            vif_data = vif_data[:max_cols]
+        ax_vif=sns.barplot(data=vif_data,
+            x="VIF",
+            y="feature",
+            hue="VIF",
+            palette=get_color(len(vif_data)+10, cmap="Blues")[::-1],
+            ax=nexttile())
+        figsets(
+            xangle=45,
+            title="Variance Inflation Factor(VIF)",
+            xlabel="Variance Inflation Factor(VIF)",
+            ylabel="Features",
+            legend=None,
+            ax=ax_vif
+        )
+
+    # Correlation heatmap for numeric columns with high correlation pairs
+    if any(data.dtypes.apply(pd.api.types.is_numeric_dtype)):
+        corr = data.select_dtypes(include=[np.number]).dropna().corr()
+        if corr.shape[1]<=33:
+            mask = np.triu(np.ones_like(corr, dtype=bool))
+            # Dynamically scale fontsize based on the number of columns
+            num_columns = corr.shape[1]
+            fontsize = max(6, min(12, 12 - (num_columns - 10) * 0.2)) # Scale between 8 and 12
+
+            ax_heatmap=sns.heatmap(
+                corr,
+                mask=mask,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                fmt=".2f",
+                linewidths=0.5,
+                vmin=-1, vmax=1,
+                ax=nexttile(2, 2),
+                cbar_kws=dict(shrink=0.2,ticks=np.arange(-1, 2, 1)),
+                annot_kws={"size": fontsize}
+            )
+
+            figsets(
+                xangle=45,
+                title="Correlation Heatmap",
+                ax=ax_heatmap
+            )
 
 def use_pd(
     func_name="excel",
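The new df_qc/df_qc_plots pair aggregates missing-value, duplicate, outlier, correlation/VIF, skewness, entropy and cardinality checks into a single report dict and, optionally, a tiled plot dashboard. A usage sketch, assuming the function is importable from py2ls.ips and that the optional dependencies (skimpy, statsmodels) are installed; plotting is disabled so the sketch stays headless:

    import numpy as np
    import pandas as pd
    from py2ls.ips import df_qc  # assumed import path

    df = pd.DataFrame({
        "a": [1.0, 2.0, np.nan, 4.0],
        "b": [1.0, 1.0, 1.0, 1.0],   # constant column, expected to trigger a warning
        "c": ["x", "y", "x", "z"],
    })
    res_qc = df_qc(df, plot_=False, output=True)  # output=True returns the report dict
    print(res_qc["warnings"])
    print(res_qc["missing_percentage"])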