dragon-ml-toolbox 19.12.2__py3-none-any.whl → 19.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/RECORD +7 -7
- ml_tools/_core/_data_exploration.py +152 -169
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/METADATA CHANGED (+1 -1)

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 19.12.2
+Version: 19.14.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
```
{dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/RECORD CHANGED (+7 -7)

```diff
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-19.12.2.dist-info/licenses/LICENSE,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=…
+dragon_ml_toolbox-19.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-19.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/ETL_cleaning.py,sha256=cKXyRFaaFs_beAGDnQM54xnML671kq-yJEGjHafW-20,351
 ml_tools/ETL_engineering.py,sha256=cwh1FhtNdUHllUDvho-x3SIVj4KwG_rFQR6VYzWUg0U,898
 ml_tools/GUI_tools.py,sha256=O89rG8WQv6GY1DiphQjIsPzXFCQID6te7q_Sgt1iTkQ,294
@@ -88,7 +88,7 @@ ml_tools/_core/_PSO_optimization.py,sha256=W3g5xw2v2eOUQadv8KHFkt5HNm9AiY3ZUk-Te
 ml_tools/_core/_SQL.py,sha256=zX_8EgYfmLmvvrnL851KMkI4w9kdkjHJ997BTvS5aig,11556
 ml_tools/_core/_VIF_factor.py,sha256=BM0mTowBqt45PXFy9oJLhT9C-CTWWo0TQhgCyWYLHtQ,10457
 ml_tools/_core/__init__.py,sha256=d4IG0OxUXj2HffepzQcYixHlZeuuuDMAFa09H_6LtmU,12
-ml_tools/_core/_data_exploration.py,sha256=…
+ml_tools/_core/_data_exploration.py,sha256=tOdtXTCh_xESKqIUuxCCo8fbcPoO9Eu5PwJwyehGKY8,76434
 ml_tools/_core/_ensemble_evaluation.py,sha256=17lWl4bWLT1BAMv_fhGf2D3wy-F4jx0HgnJ79lYkRuE,28419
 ml_tools/_core/_ensemble_inference.py,sha256=9UpARSETzmqPdQmxqizD768tjkqldxHw1ER_hM9Kx9M,8631
 ml_tools/_core/_ensemble_learning.py,sha256=X8ghbjDOLMENCWdISXLhDlHQtR3C6SW1tkTBAcfRRPY,22016
@@ -105,7 +105,7 @@ ml_tools/_core/_schema.py,sha256=TM5WVVMoKOvr_Bc2z34sU_gzKlM465PRKTgdZaEOkGY,140
 ml_tools/_core/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/_core/_serde.py,sha256=tsI4EO2Y7jrBMmbQ1pinDsPOrOg-SaPuB-Dt40q0taE,5609
 ml_tools/_core/_utilities.py,sha256=oU-0hBipE96bXox66NG-hFuEMMNkKa9MkAy1yJGCSIA,22779
-dragon_ml_toolbox-19.12.2.dist-info/METADATA,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/WHEEL,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/top_level.txt,sha256=…
-dragon_ml_toolbox-19.12.2.dist-info/RECORD,,
+dragon_ml_toolbox-19.14.0.dist-info/METADATA,sha256=7QaJsWeT9idUhpAV37t64fsuWNmzXbaQqA-a-yDP2yY,8193
+dragon_ml_toolbox-19.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-19.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-19.14.0.dist-info/RECORD,,
```
ml_tools/_core/_data_exploration.py CHANGED (+152 -169)

```diff
@@ -26,21 +26,21 @@ __all__ = [
     "drop_macro",
     "clean_column_names",
     "plot_value_distributions",
-    "plot_continuous_vs_target",
-    "plot_categorical_vs_target",
-    "encode_categorical_features",
     "split_features_targets",
-    "…",
+    "encode_categorical_features",
     "clip_outliers_single",
     "clip_outliers_multi",
     "drop_outlier_samples",
+    "plot_continuous_vs_target",
+    "plot_categorical_vs_target",
     "plot_correlation_heatmap",
+    "finalize_feature_schema",
     "match_and_filter_columns_by_regex",
     "standardize_percentages",
     "reconstruct_one_hot",
     "reconstruct_binary",
     "reconstruct_multibinary",
-    "…",
+    "split_continuous_binary",
     "apply_feature_schema"
 ]
 
```
```diff
@@ -59,16 +59,18 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     """
     summary = pd.DataFrame({
         'Data Type': df.dtypes,
-        '…
+        'Completeness %': (df.notnull().mean() * 100).round(2),
         'Unique Values': df.nunique(),
-        'Missing %': (df.isnull().mean() * 100).round(…
+        # 'Missing %': (df.isnull().mean() * 100).round(2)
     })
 
     # For numeric columns, add summary statistics
     numeric_cols = df.select_dtypes(include='number').columns
     if not numeric_cols.empty:
-        …
-        …
+        stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])
+
+        summary_numeric = stats.T[
+            ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
         ].round(round_digits)
         summary = summary.join(summary_numeric, how='left')
 
```
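In effect, `summarize_dataframe` now reports completeness instead of missingness. A minimal sketch of the changed column, assuming nothing beyond pandas (the toy DataFrame is invented):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": ["x", "y", None, "y"]})

# 'Completeness %' is simply the complement of the old 'Missing %' column:
completeness = (df.notnull().mean() * 100).round(2)  # a: 75.0, b: 75.0
missing = (df.isnull().mean() * 100).round(2)        # a: 25.0, b: 25.0
```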
```diff
@@ -108,22 +110,17 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram
     for col_name in df_clean.columns:
         column = df_clean[col_name]
 
-        # We can apply this logic to all columns or only focus on numeric ones.
-        # if not is_numeric_dtype(column):
-        #     cols_to_keep.append(col_name)
-        #     continue
-
         # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
         if column.nunique(dropna=True) > 1:
             cols_to_keep.append(col_name)
 
     dropped_columns = original_columns - set(cols_to_keep)
     if verbose:
-        _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
         if dropped_columns:
-            …
-            …
-            …
+            _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
+        else:
+            _LOGGER.info("No constant columns found.")
+
 
     # Return a new DataFrame with only the columns to keep
     df_clean = df_clean[cols_to_keep]
@@ -338,8 +335,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
     cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
 
     if len(cols_to_drop) > 0:
-        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data:")
-        print(list(cols_to_drop))
+        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
 
     result_df = df.drop(columns=cols_to_drop)
     if show_nulls_after:
@@ -369,9 +365,8 @@ def drop_macro(df: pd.DataFrame,
 
     Args:
         df (pd.DataFrame): The input pandas DataFrame to be cleaned.
-        log_directory (Union[str, Path]): Path to the directory where the
-            …
-            will be saved.
+        log_directory (Union[str, Path]): Path to the directory where the missing data reports
+            and plots will be saved inside a "Missing Report" subdirectory.
         targets (list[str]): A list of column names to be treated as target
             variables. This list guides the row-dropping logic.
         skip_targets (bool, optional): If True, the columns listed in `targets`
@@ -387,15 +382,18 @@ def drop_macro(df: pd.DataFrame,
     # make a deep copy to work with
     df_clean = df.copy()
 
+    base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
+    full_path = base_dir_path / "Missing Report"
+
     # Log initial state + Plot
     missing_data_start = show_null_columns(
         df=df_clean,
-        plot_to_dir=…
+        plot_to_dir=full_path,
         plot_filename="Original",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
-                            save_dir=…
+                            save_dir=full_path,
                             filename="Missing_Data_Original")
 
     # Clean cycles for rows and columns
@@ -424,12 +422,12 @@
     # log final state + plot
     missing_data_final = show_null_columns(
         df=df_clean,
-        plot_to_dir=…
+        plot_to_dir=full_path,
         plot_filename="Processed",
         use_all_columns=True
     )
     save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
-                            save_dir=…
+                            save_dir=full_path,
                             filename="Missing_Data_Processed")
 
     # return cleaned dataframe
```
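Both `drop_macro` hunks route their reports through the new `full_path`, so outputs land one directory deeper than before. A sketch of the resulting layout, assuming `make_fullpath` simply resolves and creates the directory (file extensions depend on `save_dataframe_filename`):

```python
from pathlib import Path

log_directory = Path("run_logs")
full_path = log_directory / "Missing Report"  # subdirectory introduced by this diff

# drop_macro now writes here, e.g.:
#   run_logs/Missing Report/Missing_Data_Original.<ext>
#   run_logs/Missing Report/Missing_Data_Processed.<ext>
```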
```diff
@@ -476,9 +474,8 @@ def plot_value_distributions(
     df: pd.DataFrame,
     save_dir: Union[str, Path],
     categorical_columns: Optional[List[str]] = None,
-    …
-    …
-    fill_na_with: str = "Missing"
+    max_categories: int = 100,
+    fill_na_with: str = "MISSING DATA"
 ):
     """
     Plots and saves the value distributions for all columns in a DataFrame,
@@ -491,15 +488,9 @@ def plot_value_distributions(
     Args:
         df (pd.DataFrame): The input DataFrame to analyze.
         save_dir (str | Path): Directory path to save the plots.
-        categorical_columns (List[str] | None): If provided,
-            …
-            …
-        categorical_cardinality_threshold (int): A numeric column will be treated
-            as 'categorical' if its number of unique values is less than or equal to this threshold. (Ignored if `categorical_columns` is set).
-        max_categories (int): The maximum number of unique categories a
-            categorical feature can have to be plotted. Features exceeding this limit will be skipped.
-        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its
-            own category. Defaults to "Missing".
+        categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
+        max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
+        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
 
     Notes:
         - `seaborn.histplot` with KDE is used for continuous features.
```
|
|
|
534
525
|
is_continuous = True
|
|
535
526
|
else:
|
|
536
527
|
# Use auto-detection
|
|
537
|
-
if is_numeric
|
|
528
|
+
if is_numeric:
|
|
538
529
|
is_continuous = True
|
|
539
530
|
|
|
540
531
|
# --- Case 1: Continuous Numeric (Histogram) ---
|
|
@@ -549,7 +540,7 @@ def plot_value_distributions(
|
|
|
549
540
|
save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
|
|
550
541
|
numeric_plots_saved += 1
|
|
551
542
|
|
|
552
|
-
# --- Case 2: Categorical
|
|
543
|
+
# --- Case 2: Categorical (Count Plot) ---
|
|
553
544
|
else:
|
|
554
545
|
# Check max categories
|
|
555
546
|
if n_unique > max_categories:
|
|
@@ -558,7 +549,7 @@ def plot_value_distributions(
|
|
|
558
549
|
|
|
559
550
|
# Adaptive figure size
|
|
560
551
|
fig_width = max(10, n_unique * 0.5)
|
|
561
|
-
plt.figure(figsize=(fig_width,
|
|
552
|
+
plt.figure(figsize=(fig_width, 8))
|
|
562
553
|
|
|
563
554
|
# Make a temporary copy for plotting to handle NaNs
|
|
564
555
|
temp_series = df[col_name].copy()
|
|
@@ -573,7 +564,7 @@ def plot_value_distributions(
|
|
|
573
564
|
|
|
574
565
|
# Get category order by frequency
|
|
575
566
|
order = temp_series.value_counts().index
|
|
576
|
-
sns.countplot(x=temp_series, order=order, palette="
|
|
567
|
+
sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
|
|
577
568
|
|
|
578
569
|
plt.title(f"Distribution of '{col_name}' (Categorical)")
|
|
579
570
|
plt.xlabel(col_name)
|
|
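The countplot change follows the pattern seaborn has recommended since 0.13, where passing `palette` without `hue` is deprecated: assign the plotted variable to `hue` and disable the redundant legend. A standalone sketch:

```python
import pandas as pd
import seaborn as sns

s = pd.Series(["a", "b", "a", "c", "a"], name="category")

# Assigning the x variable to `hue` keeps the per-bar palette without
# triggering the "palette without hue" deprecation warning (seaborn >= 0.13).
sns.countplot(x=s, order=s.value_counts().index, palette="Oranges", hue=s, legend=False)
```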
```diff
@@ -607,68 +598,55 @@ def plot_value_distributions(
 
 
 def plot_continuous_vs_target(
-    …
-    …
+    df_continuous: pd.DataFrame,
+    df_targets: pd.DataFrame,
     save_dir: Union[str, Path],
-    …
+    verbose: int = 1
 ):
     """
-    Plots each continuous feature against each target…
+    Plots each continuous feature from df_continuous against each target in df_targets.
 
-    This function…
-    …
-    regression line, and saves each plot as an individual .svg file.
+    This function creates a scatter plot for each feature-target pair, overlays a
+    simple linear regression line, and saves each plot as an individual .svg file.
 
     Plots are saved in a structured way, with a subdirectory created for
     each target variable.
 
     Args:
-        …
-        …
-        save_dir (str | Path): The base directory where plots will be saved.
-        …
-            DataFrame will be used.
+        df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
+        save_dir (str | Path): The base directory where plots will be saved.
+        verbose (int): Verbosity level for logging warnings.
 
     Notes:
-        - Only numeric features and numeric targets are processed.
-        …
-        - …
-          pairwise for each plot.
+        - Only numeric features and numeric targets are processed.
+        - Rows with NaN in either the feature or the target are dropped pairwise.
+        - Assumes df_continuous and df_targets share the same index.
     """
     # 1. Validate the base save directory
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
-    # 2. …
-    def …
+    # 2. Validation helper
+    def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> List[str]:
         valid_cols = []
-        for col in …
-            if …
-                …
-                …
-                _LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
+        for col in df.columns:
+            if not is_numeric_dtype(df[col]):
+                if verbose > 0:
+                    _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
             else:
                 valid_cols.append(col)
         return valid_cols
 
-    # 3. Validate target columns
-    valid_targets = …
+    # 3. Validate target columns
+    valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
     if not valid_targets:
-        _LOGGER.error("No valid numeric target columns provided…
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
         return
 
-    # 4. …
-    …
-        _LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
-        target_set = set(valid_targets)
-        # Get all columns that are not in the valid_targets set
-        features_to_validate = [col for col in df.columns if col not in target_set]
-    else:
-        features_to_validate = features
-
-    valid_features = _validate_numeric_cols(features_to_validate, "Feature")
-
+    # 4. Validate feature columns
+    valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
     if not valid_features:
-        _LOGGER.error("No valid numeric feature columns…
+        _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
         return
 
     # 5. Main plotting loop
```
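The rewrite is a breaking signature change: instead of one DataFrame plus column-name lists, the function now takes two pre-split DataFrames. A hypothetical call (column names invented):

```python
feature_cols = ["age", "bmi", "children"]   # numeric features (x-axis)
target_cols = ["charges"]                   # numeric targets (y-axis)

plot_continuous_vs_target(
    df_continuous=df[feature_cols],
    df_targets=df[target_cols],
    save_dir="plots/continuous_vs_target",
    verbose=1,  # 0 silences logging; 2 additionally reports skipped feature-target pairs
)
```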
```diff
@@ -680,15 +658,20 @@ def plot_continuous_vs_target(
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        …
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
 
         for feature_name in valid_features:
 
-            # …
-            temp_df = …
+            # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
+            temp_df = pd.concat([
+                df_continuous[feature_name],
+                df_targets[target_name]
+            ], axis=1).dropna()
 
             if temp_df.empty:
-                …
+                if verbose > 1:
+                    _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
                 continue
 
             x = temp_df[feature_name]
```
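The concat-then-dropna idiom aligns the two frames on their shared index before discarding incomplete pairs, which is why the docstring now warns that both DataFrames must share an index. A minimal illustration:

```python
import numpy as np
import pandas as pd

feat = pd.Series([1.0, 2.0, np.nan, 4.0], name="bmi")
tgt = pd.Series([10.0, np.nan, 30.0, 40.0], name="charges")

pair = pd.concat([feat, tgt], axis=1).dropna()
# Rows 1 and 2 are dropped; rows 0 and 3 survive. A NaN in one feature only
# removes the row for that specific feature-target pair, not for other features.
print(pair)
```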
```diff
@@ -696,11 +679,12 @@
 
             # 6. Perform linear fit
             try:
-                # Modern replacement for np.polyfit + np.poly1d
+                # Modern replacement for np.polyfit + np.poly1d
                 p = np.polynomial.Polynomial.fit(x, y, deg=1)
                 plot_regression_line = True
             except (np.linalg.LinAlgError, ValueError):
-                …
+                if verbose > 0:
+                    _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
                 plot_regression_line = False
 
             # 7. Create the plot
```
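For reference, the numpy polynomial API used here fits in a scaled domain for numerical stability and evaluates like a callable; `convert()` maps coefficients back to the original x-domain. A standalone sketch:

```python
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0])
y = np.array([1.0, 3.0, 5.0, 7.0])  # y = 2x + 1

p = np.polynomial.Polynomial.fit(x, y, deg=1)  # replaces np.polyfit + np.poly1d
y_hat = p(x)                                   # evaluate directly, like a function
intercept, slope = p.convert().coef            # ascending degree: [1.0, 2.0]
```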
```diff
@@ -734,87 +718,68 @@ def plot_continuous_vs_target(
 
             # Close the figure to free up memory
             plt.close()
-    …
-    …
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
 
 
 def plot_categorical_vs_target(
-    …
-    …
+    df_categorical: pd.DataFrame,
+    df_targets: pd.DataFrame,
     save_dir: Union[str, Path],
-    …
-    …
-    …
-    …
+    max_categories: int = 50,
+    fill_na_with: str = "MISSING DATA",
+    drop_empty_targets: bool = True,
+    verbose: int = 1
 ):
     """
-    Plots each …
+    Plots each feature in df_categorical against each numeric target in df_targets using box plots.
 
-    …
-    …
-    dependent variable.
-
-    Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
+    Automatically aligns the two DataFrames by index. If a numeric
+    column is passed within df_categorical, it will be cast to object type to treat it as a category.
 
     Args:
-        …
-        …
-        save_dir (str | Path): …
-        …
-        …
-        …
-        …
+        df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
+        save_dir (str | Path): Base directory for saving plots.
+        max_categories (int): The maximum number of unique categories a feature can have to be plotted.
+        fill_na_with (str): String to replace NaN values in categorical columns.
+        drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
+        verbose (int): Verbosity level for logging warnings.
 
     Notes:
-        - …
-        - Features are automatically identified as categorical if they are 'object' dtype.
+        - Assumes df_categorical and df_targets share the same index.
     """
-    # 1. Validate the base save directory
+    # 1. Validate the base save directory
     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-
-    if plot_type not in ["box", "violin"]:
-        _LOGGER.error(f"Invalid plot type '{plot_type}'")
-        raise ValueError()
 
     # 2. Validate target columns (must be numeric)
     valid_targets = []
-    for col in …
-        if …
-            …
-            …
-            _LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
+    for col in df_targets.columns:
+        if not is_numeric_dtype(df_targets[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
         else:
             valid_targets.append(col)
 
     if not valid_targets:
-        _LOGGER.error("No valid numeric target columns provided…
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
         return
 
-    # 3. …
-    …
-    …
-    …
-    …
-        if …
-        …
-        …
-        …
-        …
-        …
-        # Auto-include low-cardinality numeric features - REMOVED
-        # elif is_numeric_dtype(df[col]) and df[col].nunique() <= max_categories:
-        #     _LOGGER.info(f"Treating low-cardinality numeric column '{col}' as categorical.")
-        #     features_to_plot.append(col)
-    else:
-        # Validate user-provided list
-        for col in features:
-            if col not in df.columns:
-                _LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
-            else:
-                features_to_plot.append(col)
+    # 3. Validate feature columns (Flexible: Allow numeric but warn)
+    valid_features = []
+    for col in df_categorical.columns:
+        # If numeric, warn but accept it (will be cast to object later)
+        if is_numeric_dtype(df_categorical[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
+            valid_features.append(col)
+        else:
+            # Assume it is already object/category
+            valid_features.append(col)
 
-    if not …
-        _LOGGER.error("No valid …
+    if not valid_features:
+        _LOGGER.error("No valid feature columns provided in df_categorical.")
         return
 
     # 4. Main plotting loop
```
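As with its continuous counterpart, this function now takes pre-split DataFrames, and the old plot_type option is gone: box plots are always used. A hypothetical call (column names invented):

```python
plot_categorical_vs_target(
    df_categorical=df[["region", "smoker"]],  # object/category columns; numerics are coerced with a warning
    df_targets=df[["charges"]],               # must be numeric
    save_dir="plots/categorical_vs_target",
    max_categories=50,            # features with more unique categories are skipped
    fill_na_with="MISSING DATA",  # feature NaNs become their own x-axis category
    drop_empty_targets=True,      # rows with a NaN target are dropped before plotting
)
```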
```diff
@@ -822,39 +787,53 @@ def plot_categorical_vs_target(
 
     for target_name in valid_targets:
         # Create a sanitized subdirectory for this target
-        safe_target_dir_name = sanitize_filename(f"{target_name}…
+        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
         target_save_dir = base_save_path / safe_target_dir_name
         target_save_dir.mkdir(parents=True, exist_ok=True)
 
-        …
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+        for feature_name in valid_features:
+
+            # Align data using concat to respect indices
+            feature_series = df_categorical[feature_name]
+            target_series = df_targets[target_name]
+
+            # Create a temporary DataFrame for this pair
+            temp_df = pd.concat([feature_series, target_series], axis=1)
+
+            # Optional: Drop rows where the target is NaN
+            if drop_empty_targets:
+                temp_df = temp_df.dropna(subset=[target_name])
+                if temp_df.empty:
+                    if verbose > 1:
+                        _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
+                    continue
 
-        …
+            # Force feature to object if it isn't already (handling the numeric flexibility)
+            if not is_object_dtype(temp_df[feature_name]):
+                temp_df[feature_name] = temp_df[feature_name].astype(object)
+
+            # Handle NaNs in the feature column (treat as a category)
+            if temp_df[feature_name].isnull().any():
+                temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
 
-            # …
-            temp_df = …
+            # Convert to string to ensure consistent plotting and cardinality check
+            temp_df[feature_name] = temp_df[feature_name].astype(str)
 
             # Check cardinality
             n_unique = temp_df[feature_name].nunique()
             if n_unique > max_categories:
-                …
+                if verbose > 1:
+                    _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
                 continue
-
-            # Handle NaNs by replacing them with the specified string
-            if temp_df[feature_name].isnull().any():
-                # Convert to object type first to allow string replacement
-                temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
-
-            # Convert feature to string to ensure correct plotting order
-            temp_df[feature_name] = temp_df[feature_name].astype(str)
 
             # 5. Create the plot
-            # …
-            plt.figure(figsize=(max(10, n_unique * …
+            # Dynamic figure width based on number of categories
+            plt.figure(figsize=(max(10, n_unique * 0.8), 10))
 
-            if plot_type == "box":
-                sns.boxplot(x=feature_name, y=target_name, data=temp_df)
-            elif plot_type == "violin":
-                sns.violinplot(x=feature_name, y=target_name, data=temp_df)
+            sns.boxplot(x=feature_name, y=target_name, data=temp_df)
 
             plt.title(f'{target_name} vs {feature_name}')
             plt.xlabel(feature_name)
@@ -875,8 +854,9 @@ def plot_categorical_vs_target(
                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
 
             plt.close()
-    …
-    …
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
 
 
 def encode_categorical_features(
@@ -982,7 +962,7 @@ def encode_categorical_features(
 
     # Handle the dataset splitting logic
     if split_resulting_dataset:
-        df_categorical = df_encoded[valid_columns]
+        df_categorical = df_encoded[valid_columns]
         df_non_categorical = df.drop(columns=valid_columns)
         return mappings, df_non_categorical, df_categorical
     else:
```
```diff
@@ -1103,7 +1083,10 @@ def plot_correlation_heatmap(df: pd.DataFrame,
         annot=annot_bool,
         cmap='coolwarm',
         fmt=".2f",
-        cbar_kws={"shrink": 0.8}
+        cbar_kws={"shrink": 0.8},
+        vmin=-1,   # Anchors minimum color to -1
+        vmax=1,    # Anchors maximum color to 1
+        center=0   # Ensures 0 corresponds to the neutral color (white)
     )
 
     # add suffix to title
```
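Anchoring the color scale matters because seaborn otherwise rescales the colormap to the observed data range, so a matrix of weak correlations can look deceptively saturated. A standalone sketch:

```python
import numpy as np
import seaborn as sns

corr = np.array([[ 1.0,  0.2, -0.1],
                 [ 0.2,  1.0,  0.3],
                 [-0.1,  0.3,  1.0]])

# vmin/vmax pin the scale to the full correlation range [-1, 1];
# center=0 keeps zero correlation on the neutral midpoint of 'coolwarm'.
sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, center=0, annot=True, fmt=".2f")
```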
Files renamed without content changes:

- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/WHEEL
- {dragon_ml_toolbox-19.12.2.dist-info → dragon_ml_toolbox-19.14.0.dist-info}/top_level.txt