dragon-ml-toolbox 19.13.0__py3-none-any.whl → 19.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 19.13.0
3
+ Version: 19.14.0
4
4
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
5
5
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- dragon_ml_toolbox-19.13.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
- dragon_ml_toolbox-19.13.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
1
+ dragon_ml_toolbox-19.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-19.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
3
3
  ml_tools/ETL_cleaning.py,sha256=cKXyRFaaFs_beAGDnQM54xnML671kq-yJEGjHafW-20,351
4
4
  ml_tools/ETL_engineering.py,sha256=cwh1FhtNdUHllUDvho-x3SIVj4KwG_rFQR6VYzWUg0U,898
5
5
  ml_tools/GUI_tools.py,sha256=O89rG8WQv6GY1DiphQjIsPzXFCQID6te7q_Sgt1iTkQ,294
@@ -88,7 +88,7 @@ ml_tools/_core/_PSO_optimization.py,sha256=W3g5xw2v2eOUQadv8KHFkt5HNm9AiY3ZUk-Te
88
88
  ml_tools/_core/_SQL.py,sha256=zX_8EgYfmLmvvrnL851KMkI4w9kdkjHJ997BTvS5aig,11556
89
89
  ml_tools/_core/_VIF_factor.py,sha256=BM0mTowBqt45PXFy9oJLhT9C-CTWWo0TQhgCyWYLHtQ,10457
90
90
  ml_tools/_core/__init__.py,sha256=d4IG0OxUXj2HffepzQcYixHlZeuuuDMAFa09H_6LtmU,12
91
- ml_tools/_core/_data_exploration.py,sha256=VPSqTo8IPLDOGcVDAcdyxgzO0Fw224pbivzbli_aad0,76159
91
+ ml_tools/_core/_data_exploration.py,sha256=tOdtXTCh_xESKqIUuxCCo8fbcPoO9Eu5PwJwyehGKY8,76434
92
92
  ml_tools/_core/_ensemble_evaluation.py,sha256=17lWl4bWLT1BAMv_fhGf2D3wy-F4jx0HgnJ79lYkRuE,28419
93
93
  ml_tools/_core/_ensemble_inference.py,sha256=9UpARSETzmqPdQmxqizD768tjkqldxHw1ER_hM9Kx9M,8631
94
94
  ml_tools/_core/_ensemble_learning.py,sha256=X8ghbjDOLMENCWdISXLhDlHQtR3C6SW1tkTBAcfRRPY,22016
@@ -105,7 +105,7 @@ ml_tools/_core/_schema.py,sha256=TM5WVVMoKOvr_Bc2z34sU_gzKlM465PRKTgdZaEOkGY,140
105
105
  ml_tools/_core/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
106
106
  ml_tools/_core/_serde.py,sha256=tsI4EO2Y7jrBMmbQ1pinDsPOrOg-SaPuB-Dt40q0taE,5609
107
107
  ml_tools/_core/_utilities.py,sha256=oU-0hBipE96bXox66NG-hFuEMMNkKa9MkAy1yJGCSIA,22779
108
- dragon_ml_toolbox-19.13.0.dist-info/METADATA,sha256=349zn3DuPgY4UmlKJ7YuI1lNhGCXnYFYe4zo63mDkbE,8193
109
- dragon_ml_toolbox-19.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
110
- dragon_ml_toolbox-19.13.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
111
- dragon_ml_toolbox-19.13.0.dist-info/RECORD,,
108
+ dragon_ml_toolbox-19.14.0.dist-info/METADATA,sha256=7QaJsWeT9idUhpAV37t64fsuWNmzXbaQqA-a-yDP2yY,8193
109
+ dragon_ml_toolbox-19.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
110
+ dragon_ml_toolbox-19.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
111
+ dragon_ml_toolbox-19.14.0.dist-info/RECORD,,
@@ -26,13 +26,13 @@ __all__ = [
26
26
  "drop_macro",
27
27
  "clean_column_names",
28
28
  "plot_value_distributions",
29
- "plot_continuous_vs_target",
30
- "plot_categorical_vs_target",
31
29
  "split_features_targets",
32
30
  "encode_categorical_features",
33
31
  "clip_outliers_single",
34
32
  "clip_outliers_multi",
35
33
  "drop_outlier_samples",
34
+ "plot_continuous_vs_target",
35
+ "plot_categorical_vs_target",
36
36
  "plot_correlation_heatmap",
37
37
  "finalize_feature_schema",
38
38
  "match_and_filter_columns_by_regex",
@@ -59,16 +59,18 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
59
59
  """
60
60
  summary = pd.DataFrame({
61
61
  'Data Type': df.dtypes,
62
- 'Non-Null Count': df.notnull().sum(),
62
+ 'Completeness %': (df.notnull().mean() * 100).round(2),
63
63
  'Unique Values': df.nunique(),
64
- 'Missing %': (df.isnull().mean() * 100).round(round_digits)
64
+ # 'Missing %': (df.isnull().mean() * 100).round(2)
65
65
  })
66
66
 
67
67
  # For numeric columns, add summary statistics
68
68
  numeric_cols = df.select_dtypes(include='number').columns
69
69
  if not numeric_cols.empty:
70
- summary_numeric = df[numeric_cols].describe().T[
71
- ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
70
+ stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])
71
+
72
+ summary_numeric = stats.T[
73
+ ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
72
74
  ].round(round_digits)
73
75
  summary = summary.join(summary_numeric, how='left')
74
76
 
@@ -596,68 +598,55 @@ def plot_value_distributions(
596
598
 
597
599
 
598
600
  def plot_continuous_vs_target(
599
- df: pd.DataFrame,
600
- targets: List[str],
601
+ df_continuous: pd.DataFrame,
602
+ df_targets: pd.DataFrame,
601
603
  save_dir: Union[str, Path],
602
- features: Optional[List[str]] = None
604
+ verbose: int = 1
603
605
  ):
604
606
  """
605
- Plots each continuous feature against each target to visualize linear relationships.
607
+ Plots each continuous feature from df_continuous against each target in df_targets.
606
608
 
607
- This function is a common EDA step for regression tasks. It creates a
608
- scatter plot for each feature-target pair, overlays a simple linear
609
- regression line, and saves each plot as an individual .svg file.
609
+ This function creates a scatter plot for each feature-target pair, overlays a
610
+ simple linear regression line, and saves each plot as an individual .svg file.
610
611
 
611
612
  Plots are saved in a structured way, with a subdirectory created for
612
613
  each target variable.
613
614
 
614
615
  Args:
615
- df (pd.DataFrame): The input DataFrame.
616
- targets (List[str]): A list of target column names to plot (y-axis).
617
- save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
618
- features (List[str] | None): A list of feature column names to plot (x-axis). If None, all non-target columns in the
619
- DataFrame will be used.
616
+ df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
617
+ df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
618
+ save_dir (str | Path): The base directory where plots will be saved.
619
+ verbose (int): Verbosity level for logging warnings.
620
620
 
621
621
  Notes:
622
- - Only numeric features and numeric targets are processed. Non-numeric
623
- columns in the lists will be skipped with a warning.
624
- - Rows with NaN in either the feature or the target are dropped
625
- pairwise for each plot.
622
+ - Only numeric features and numeric targets are processed.
623
+ - Rows with NaN in either the feature or the target are dropped pairwise.
624
+ - Assumes df_continuous and df_targets share the same index.
626
625
  """
627
626
  # 1. Validate the base save directory
628
627
  base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
629
628
 
630
- # 2. Validate helper
631
- def _validate_numeric_cols(col_list: List[str], col_type: str) -> List[str]:
629
+ # 2. Validation helper
630
+ def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> List[str]:
632
631
  valid_cols = []
633
- for col in col_list:
634
- if col not in df.columns:
635
- _LOGGER.warning(f"{col_type} column '{col}' not found. Skipping.")
636
- elif not is_numeric_dtype(df[col]):
637
- _LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
632
+ for col in df.columns:
633
+ if not is_numeric_dtype(df[col]):
634
+ if verbose > 0:
635
+ _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
638
636
  else:
639
637
  valid_cols.append(col)
640
638
  return valid_cols
641
639
 
642
- # 3. Validate target columns FIRST
643
- valid_targets = _validate_numeric_cols(targets, "Target")
640
+ # 3. Validate target columns
641
+ valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
644
642
  if not valid_targets:
645
- _LOGGER.error("No valid numeric target columns provided to plot.")
643
+ _LOGGER.error("No valid numeric target columns provided in df_targets.")
646
644
  return
647
645
 
648
- # 4. Determine and validate feature columns
649
- if features is None:
650
- _LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
651
- target_set = set(valid_targets)
652
- # Get all columns that are not in the valid_targets set
653
- features_to_validate = [col for col in df.columns if col not in target_set]
654
- else:
655
- features_to_validate = features
656
-
657
- valid_features = _validate_numeric_cols(features_to_validate, "Feature")
658
-
646
+ # 4. Validate feature columns
647
+ valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
659
648
  if not valid_features:
660
- _LOGGER.error("No valid numeric feature columns found to plot.")
649
+ _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
661
650
  return
662
651
 
663
652
  # 5. Main plotting loop
@@ -669,15 +658,20 @@ def plot_continuous_vs_target(
669
658
  target_save_dir = base_save_path / safe_target_dir_name
670
659
  target_save_dir.mkdir(parents=True, exist_ok=True)
671
660
 
672
- _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
661
+ if verbose > 0:
662
+ _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
673
663
 
674
664
  for feature_name in valid_features:
675
665
 
676
- # Drop NaNs pairwise for this specific plot
677
- temp_df = df[[feature_name, target_name]].dropna()
666
+ # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
667
+ temp_df = pd.concat([
668
+ df_continuous[feature_name],
669
+ df_targets[target_name]
670
+ ], axis=1).dropna()
678
671
 
679
672
  if temp_df.empty:
680
- _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
673
+ if verbose > 1:
674
+ _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
681
675
  continue
682
676
 
683
677
  x = temp_df[feature_name]
@@ -685,11 +679,12 @@ def plot_continuous_vs_target(
685
679
 
686
680
  # 6. Perform linear fit
687
681
  try:
688
- # Modern replacement for np.polyfit + np.poly1d. Compatible with NumPy 1.14+ and NumPy 2.0+
682
+ # Modern replacement for np.polyfit + np.poly1d
689
683
  p = np.polynomial.Polynomial.fit(x, y, deg=1)
690
684
  plot_regression_line = True
691
685
  except (np.linalg.LinAlgError, ValueError):
692
- _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
686
+ if verbose > 0:
687
+ _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
693
688
  plot_regression_line = False
694
689
 
695
690
  # 7. Create the plot
@@ -723,77 +718,68 @@ def plot_continuous_vs_target(
723
718
 
724
719
  # Close the figure to free up memory
725
720
  plt.close()
726
-
727
- _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
721
+
722
+ if verbose > 0:
723
+ _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
728
724
 
729
725
 
730
726
  def plot_categorical_vs_target(
731
- df: pd.DataFrame,
732
- targets: List[str],
727
+ df_categorical: pd.DataFrame,
728
+ df_targets: pd.DataFrame,
733
729
  save_dir: Union[str, Path],
734
- features: Optional[List[str]] = None,
735
730
  max_categories: int = 50,
736
- fill_na_with: str = "MISSING DATA"
731
+ fill_na_with: str = "MISSING DATA",
732
+ drop_empty_targets: bool = True,
733
+ verbose: int = 1
737
734
  ):
738
735
  """
739
- Plots each categorical feature against each numeric target using box plots.
736
+ Plots each feature in df_categorical against each numeric target in df_targets using box plots.
740
737
 
741
- This function is a core EDA step for regression tasks to understand the
742
- relationship between a categorical independent variable and a continuous
743
- dependent variable.
744
-
745
- Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
738
+ Automatically aligns the two DataFrames by index. If a numeric
739
+ column is passed within df_categorical, it will be cast to object type to treat it as a category.
746
740
 
747
741
  Args:
748
- df (pd.DataFrame): The input DataFrame.
749
- targets (List[str]): A list of numeric target column names (y-axis).
750
- save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
751
- features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
752
- max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
753
- fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
742
+ df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
743
+ df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
744
+ save_dir (str | Path): Base directory for saving plots.
745
+ max_categories (int): The maximum number of unique categories a feature can have to be plotted.
746
+ fill_na_with (str): String to replace NaN values in categorical columns.
747
+ drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
748
+ verbose (int): Verbosity level for logging warnings.
754
749
 
755
750
  Notes:
756
- - Only numeric targets are processed.
757
- - Features are automatically identified as categorical if they are 'object' dtype.
751
+ - Assumes df_categorical and df_targets share the same index.
758
752
  """
759
- # 1. Validate the base save directory and inputs
753
+ # 1. Validate the base save directory
760
754
  base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
761
755
 
762
756
  # 2. Validate target columns (must be numeric)
763
757
  valid_targets = []
764
- for col in targets:
765
- if col not in df.columns:
766
- _LOGGER.warning(f"Target column '{col}' not found. Skipping.")
767
- elif not is_numeric_dtype(df[col]):
768
- _LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
758
+ for col in df_targets.columns:
759
+ if not is_numeric_dtype(df_targets[col]):
760
+ if verbose > 0:
761
+ _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
769
762
  else:
770
763
  valid_targets.append(col)
771
764
 
772
765
  if not valid_targets:
773
- _LOGGER.error("No valid numeric target columns provided to plot.")
766
+ _LOGGER.error("No valid numeric target columns provided in df_targets.")
774
767
  return
775
768
 
776
- # 3. Determine and validate feature columns
777
- features_to_plot = []
778
- if features is None:
779
- _LOGGER.info("No 'features' list provided. Auto-detecting categorical features.")
780
- for col in df.columns:
781
- if col in valid_targets:
782
- continue
783
- # Auto-include object dtypes
784
- if is_object_dtype(df[col]):
785
- features_to_plot.append(col)
786
-
787
- else:
788
- # Validate user-provided list
789
- for col in features:
790
- if col not in df.columns:
791
- _LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
792
- else:
793
- features_to_plot.append(col)
769
+ # 3. Validate feature columns (Flexible: Allow numeric but warn)
770
+ valid_features = []
771
+ for col in df_categorical.columns:
772
+ # If numeric, warn but accept it (will be cast to object later)
773
+ if is_numeric_dtype(df_categorical[col]):
774
+ if verbose > 0:
775
+ _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
776
+ valid_features.append(col)
777
+ else:
778
+ # Assume it is already object/category
779
+ valid_features.append(col)
794
780
 
795
- if not features_to_plot:
796
- _LOGGER.error("No valid categorical feature columns found to plot.")
781
+ if not valid_features:
782
+ _LOGGER.error("No valid feature columns provided in df_categorical.")
797
783
  return
798
784
 
799
785
  # 4. Main plotting loop
@@ -805,29 +791,47 @@ def plot_categorical_vs_target(
805
791
  target_save_dir = base_save_path / safe_target_dir_name
806
792
  target_save_dir.mkdir(parents=True, exist_ok=True)
807
793
 
808
- _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
809
- for feature_name in features_to_plot:
794
+ if verbose > 0:
795
+ _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
796
+
797
+ for feature_name in valid_features:
798
+
799
+ # Align data using concat to respect indices
800
+ feature_series = df_categorical[feature_name]
801
+ target_series = df_targets[target_name]
802
+
803
+ # Create a temporary DataFrame for this pair
804
+ temp_df = pd.concat([feature_series, target_series], axis=1)
805
+
806
+ # Optional: Drop rows where the target is NaN
807
+ if drop_empty_targets:
808
+ temp_df = temp_df.dropna(subset=[target_name])
809
+ if temp_df.empty:
810
+ if verbose > 1:
811
+ _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
812
+ continue
813
+
814
+ # Force feature to object if it isn't already (handling the numeric flexibility)
815
+ if not is_object_dtype(temp_df[feature_name]):
816
+ temp_df[feature_name] = temp_df[feature_name].astype(object)
817
+
818
+ # Handle NaNs in the feature column (treat as a category)
819
+ if temp_df[feature_name].isnull().any():
820
+ temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
810
821
 
811
- # Make a temporary copy for plotting to handle NaNs and dtypes
812
- temp_df = df[[feature_name, target_name]].copy()
822
+ # Convert to string to ensure consistent plotting and cardinality check
823
+ temp_df[feature_name] = temp_df[feature_name].astype(str)
813
824
 
814
825
  # Check cardinality
815
826
  n_unique = temp_df[feature_name].nunique()
816
827
  if n_unique > max_categories:
817
- _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique values > {max_categories} max_categories.")
828
+ if verbose > 1:
829
+ _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
818
830
  continue
819
-
820
- # Handle NaNs by replacing them with the specified string
821
- if temp_df[feature_name].isnull().any():
822
- # Convert to object type first to allow string replacement
823
- temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
824
-
825
- # Convert feature to string to ensure correct plotting order
826
- temp_df[feature_name] = temp_df[feature_name].astype(str)
827
831
 
828
832
  # 5. Create the plot
829
- # Increase figure width for categories
830
- plt.figure(figsize=(max(10, n_unique * 1.2), 10))
833
+ # Dynamic figure width based on number of categories
834
+ plt.figure(figsize=(max(10, n_unique * 0.8), 10))
831
835
 
832
836
  sns.boxplot(x=feature_name, y=target_name, data=temp_df)
833
837
 
@@ -850,8 +854,9 @@ def plot_categorical_vs_target(
850
854
  _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
851
855
 
852
856
  plt.close()
853
-
854
- _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
857
+
858
+ if verbose > 0:
859
+ _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
855
860
 
856
861
 
857
862
  def encode_categorical_features(
@@ -1078,7 +1083,10 @@ def plot_correlation_heatmap(df: pd.DataFrame,
1078
1083
  annot=annot_bool,
1079
1084
  cmap='coolwarm',
1080
1085
  fmt=".2f",
1081
- cbar_kws={"shrink": 0.8}
1086
+ cbar_kws={"shrink": 0.8},
1087
+ vmin=-1, # Anchors minimum color to -1
1088
+ vmax=1, # Anchors maximum color to 1
1089
+ center=0 # Ensures 0 corresponds to the neutral color (white)
1082
1090
  )
1083
1091
 
1084
1092
  # add suffix to title