dragon-ml-toolbox 20.12.0__py3-none-any.whl → 20.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dragon_ml_toolbox-{20.12.0 → 20.14.0}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 20.12.0
+Version: 20.14.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-{20.12.0 → 20.14.0}.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-20.12.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-20.12.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
+dragon_ml_toolbox-20.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-20.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
@@ -46,7 +46,7 @@ ml_tools/ML_evaluation/_loss.py,sha256=1a4O25i3Ya_3naNZNL7ELLUL46BY86g1scA7d7q2U
 ml_tools/ML_evaluation/_regression.py,sha256=UZA7_fg85ZKJQWszioWDtmkplSiXeHJk2fBYR5bRXHY,11225
 ml_tools/ML_evaluation/_sequence.py,sha256=gUk9Uvmy7MrXkfrriMnfypkgJU5XERHdqekTa2gBaOM,8004
 ml_tools/ML_evaluation/_vision.py,sha256=abBHQ6Z2GunHNusL3wcLgfI1FVNA6hBUBTq1eOA8FSA,11489
-ml_tools/ML_evaluation_captum/_ML_evaluation_captum.py,sha256=eWxd6HRQSaqFYNek6x1hZBKi8qeHqk4oOkqrjQgKjZk,19611
+ml_tools/ML_evaluation_captum/_ML_evaluation_captum.py,sha256=eCP19o4sxfG0XlAVKiuuIxdtxO5lqCc0SuhWXx6eObY,20079
 ml_tools/ML_evaluation_captum/__init__.py,sha256=DZDoZXexCI49JNl_tTmFfYW4hTUYK5QQLex01wMfhnk,333
 ml_tools/ML_finalize_handler/_ML_finalize_handler.py,sha256=g-vkHJDTGXZsKOUA-Yfg7EuA1SmaHjzesCPiAyRMg2k,7054
 ml_tools/ML_finalize_handler/__init__.py,sha256=VQyLbCQUcliAAFiOAsnPhyJ7UVYgbSqAbAnpqeOnRSg,198
@@ -80,7 +80,7 @@ ml_tools/ML_optimization/__init__.py,sha256=No18Dsw6Q9zPt8B9fpG0bWomuXmwDC7Dioki
 ml_tools/ML_optimization/_multi_dragon.py,sha256=zQhDxFY8FNxUlcbSnHMVArfojzYjgNa21jSE3pJmRW0,38956
 ml_tools/ML_optimization/_single_dragon.py,sha256=jh5-SK6NKAzbheQhquiYoROozk-RzUv1jiFkIzK_AFg,7288
 ml_tools/ML_optimization/_single_manual.py,sha256=h-_k9JmRqPkjTra1nu7AyYbSyWkYZ1R3utiNmW06WFs,21809
-ml_tools/ML_scaler/_ML_scaler.py,sha256=P75X0Sx8N-VxC2Qy8aG7mWaZlkTfjspiZDi1YiMQD1I,8872
+ml_tools/ML_scaler/_ML_scaler.py,sha256=NcwprqrAHMIKpkzMdExk99I2QpfTSbiJH8rDqmOlnkU,8870
 ml_tools/ML_scaler/__init__.py,sha256=SHDNyLsoOLl2OtkIb3pGg-JRs3E2bYJBgnHwH3vw_Tk,172
 ml_tools/ML_trainer/__init__.py,sha256=42kueHa7Z0b_yLbywNCgIxlW6WmgLBqkTFwKH7vFLXw,379
 ml_tools/ML_trainer/_base_trainer.py,sha256=0ATm672NRsjJ6nv_NEl6-OEd9Bst1-s5OPxfG4qe8Lg,18075
@@ -104,10 +104,10 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
 ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
 ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
 ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
-ml_tools/data_exploration/__init__.py,sha256=efUBsruHL56B429tUadl3PdG73zAF639Y430uMQRfko,1917
-ml_tools/data_exploration/_analysis.py,sha256=PJNrEBz5ZZXHoUlQ6fh9Y86nzPQrLpVPv2Ye4NfOxgs,14181
+ml_tools/data_exploration/__init__.py,sha256=XNA8gcRx5ifrv092HA7HSpek8havlk_3RZi9aq9dSjg,1957
+ml_tools/data_exploration/_analysis.py,sha256=JSoFJSkv4-_v9YxxmjHZ_PeFRneDENjSEo2sy_uC4oY,14196
 ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
-ml_tools/data_exploration/_features.py,sha256=Z1noJfDxBzFRfusFp6NlpLF2NItuZuzFHq4ssWFqny4,26273
+ml_tools/data_exploration/_features.py,sha256=twJ6OixU4ItRXA8rPJRfg2N9QVsbn38CFqJiLcXav1A,28664
 ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
 ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
 ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
@@ -119,7 +119,7 @@ ml_tools/ensemble_learning/_ensemble_learning.py,sha256=MHDZBR20_nStlSSeThFI3bSu
 ml_tools/excel_handler/__init__.py,sha256=AaWM3n_dqBhJLTs3OEA57ex5YykKXNOwVCyHlVsdnqI,530
 ml_tools/excel_handler/_excel_handler.py,sha256=TODudmeQgDSdxUKzLfAzizs--VL-g8WxDOfQ4sgxxLs,13965
 ml_tools/keys/__init__.py,sha256=-0c2pmrhyfROc-oQpEjJGLBMhSagA3CyFijQaaqZRqU,399
-ml_tools/keys/_keys.py,sha256=Kr73o9SaH5Y3DT0z0H-1eLwlBplJmjisjoO_EoUNkAg,9388
+ml_tools/keys/_keys.py,sha256=YE_Ux2FYObfWurcQvfCvA3ZehwOvKvtCvIViUuYAYNM,9447
 ml_tools/math_utilities/__init__.py,sha256=K7Obkkc4rPKj4EbRZf1BsXHfiCg7FXYv_aN9Yc2Z_Vg,400
 ml_tools/math_utilities/_math_utilities.py,sha256=BYHIVcM9tuKIhVrkgLLiM5QalJ39zx7dXYy_M9aGgiM,9012
 ml_tools/optimization_tools/__init__.py,sha256=KD8JXpfGuPndO4AHnjJGu6uV1GRwhOfboD0KZV45kzw,658
@@ -143,7 +143,7 @@ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9
 ml_tools/utilities/_translate.py,sha256=U8hRPa3PmTpIf9n9yR3gBGmp_hkcsjQLwjAHSHc0WHs,10325
 ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
 ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
-dragon_ml_toolbox-20.12.0.dist-info/METADATA,sha256=VH-wt974dX5kNOfVShO_N5HJJCIhqP2V_7uihSwlYzE,7889
-dragon_ml_toolbox-20.12.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-dragon_ml_toolbox-20.12.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-20.12.0.dist-info/RECORD,,
+dragon_ml_toolbox-20.14.0.dist-info/METADATA,sha256=32IleSQa7t7E42ZB5rM32Lf1MlSAMtKkU-TFky3VckA,7889
+dragon_ml_toolbox-20.14.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+dragon_ml_toolbox-20.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-20.14.0.dist-info/RECORD,,
ml_tools/ML_evaluation_captum/_ML_evaluation_captum.py CHANGED
@@ -202,6 +202,13 @@ def _process_single_target(ig: 'IntegratedGradients', # type: ignore
     mean_abs_attr = mean_abs_attr[:min_len]
     feature_names = feature_names[:min_len]

+    # Calculate percentages (Before Min-Max scaling to preserve relative importance)
+    total_attr_sum = np.sum(mean_abs_attr)
+    if total_attr_sum > 0:
+        attr_percentages = (mean_abs_attr / total_attr_sum) * 100.0
+    else:
+        attr_percentages = np.zeros_like(mean_abs_attr)
+
     # Min-Max Scaling
     target_min = 0.01
     target_max = 1.0
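The added block converts the raw attributions into percentage shares before the min-max rescale, so each feature's fraction of the total attribution mass survives the later squeeze into [0.01, 1.0]. A minimal standalone sketch of the same arithmetic, using made-up attribution values:

```python
import numpy as np

# Hypothetical mean absolute attributions for three features.
mean_abs_attr = np.array([0.6, 0.3, 0.1])

total_attr_sum = np.sum(mean_abs_attr)
if total_attr_sum > 0:
    # Each feature's share of the total attribution mass.
    attr_percentages = (mean_abs_attr / total_attr_sum) * 100.0
else:
    # Degenerate case: every attribution is zero.
    attr_percentages = np.zeros_like(mean_abs_attr)

print(attr_percentages)  # [60. 30. 10.]
```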
@@ -222,7 +229,8 @@ def _process_single_target(ig: 'IntegratedGradients', # type: ignore
     # --- Save Data to CSV ---
     summary_df = pd.DataFrame({
         CaptumKeys.FEATURE_COLUMN: feature_names,
-        CaptumKeys.IMPORTANCE_COLUMN: mean_abs_attr
+        CaptumKeys.IMPORTANCE_COLUMN: mean_abs_attr,
+        CaptumKeys.PERCENT_COLUMN: attr_percentages
     }).sort_values(CaptumKeys.IMPORTANCE_COLUMN, ascending=False)

     csv_name = f"{CaptumKeys.SAVENAME}{file_suffix}.csv"
@@ -230,11 +238,13 @@ def _process_single_target(ig: 'IntegratedGradients', # type: ignore
     summary_df.to_csv(csv_path, index=False)

     # --- Generate Plot ---
-    plot_df = summary_df.head(20).sort_values(CaptumKeys.IMPORTANCE_COLUMN, ascending=True)
+    plot_df = summary_df.head(20).sort_values(CaptumKeys.PERCENT_COLUMN, ascending=True)
     plt.figure(figsize=(10, 8), dpi=300)
-    plt.barh(plot_df[CaptumKeys.FEATURE_COLUMN], plot_df[CaptumKeys.IMPORTANCE_COLUMN], color='mediumpurple')
-    plt.xlim(0, 1.05) # standardized scale
-    plt.xlabel("Mean Absolute Attribution")
+    plt.barh(plot_df[CaptumKeys.FEATURE_COLUMN], plot_df[CaptumKeys.PERCENT_COLUMN], color='mediumpurple')
+    # plt.xlim(0, 1.05) # standardized scale # Removed to reflect actual percentages
+    plt.xlim(left=0) # start at 0
+    # plt.xlabel("Scaled Mean Absolute Attribution")
+    plt.xlabel("Relative Importance (%)")

     title = "Feature Importance"

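Because the bars now encode percentages rather than values rescaled to [0.01, 1.0], the fixed upper axis limit is dropped and only the lower bound is pinned at zero. A sketch of the new plotting path with placeholder data (column labels mirror the renamed CaptumKeys further below):

```python
import matplotlib.pyplot as plt
import pandas as pd

# Placeholder stand-in for summary_df.head(20).
plot_df = pd.DataFrame({
    "Feature": ["x3", "x1", "x2"],
    "Relative Importance(%)": [10.0, 30.0, 60.0],
}).sort_values("Relative Importance(%)", ascending=True)

plt.figure(figsize=(10, 8), dpi=300)
plt.barh(plot_df["Feature"], plot_df["Relative Importance(%)"], color="mediumpurple")
plt.xlim(left=0)  # pin the lower bound; the upper bound follows the data
plt.xlabel("Relative Importance (%)")
plt.title("Feature Importance")
plt.tight_layout()
plt.show()
```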
ml_tools/ML_scaler/_ML_scaler.py CHANGED
@@ -99,7 +99,7 @@ class DragonScaler:
         std = torch.sqrt(torch.clamp(variance, min=1e-8))

         if verbose >= 2:
-            _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} features (Welford's).")
+            _LOGGER.info(f"Scaler fitted on {n_total} samples for {num_continuous_features} columns (Welford's).")
         return cls(mean=mean_global, std=std, continuous_feature_indices=continuous_feature_indices)

     @classmethod
@@ -121,7 +121,7 @@ class DragonScaler:
         std = torch.where(std == 0, torch.tensor(1.0, device=data.device), std)

         if verbose >= 2:
-            _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} features.")
+            _LOGGER.info(f"Scaler fitted on tensor with {data.shape[0]} samples for {num_features} columns.")

         return cls(mean=mean, std=std, continuous_feature_indices=indices)

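The first log message names Welford's algorithm, the one-pass update that DragonScaler's batch-wise fit is labeled after. A minimal single-feature sketch of that update rule, as an illustration rather than the library's actual implementation:

```python
import torch

def welford_fit(batches):
    """One-pass running mean/variance over an iterable of 1-D tensors (Welford's)."""
    n_total = 0
    mean = torch.tensor(0.0)
    m2 = torch.tensor(0.0)  # running sum of squared deviations from the mean
    for batch in batches:
        for x in batch:
            n_total += 1
            delta = x - mean
            mean = mean + delta / n_total
            m2 = m2 + delta * (x - mean)
    variance = m2 / max(n_total, 1)
    # Same guard as the diff: clamp before sqrt to avoid zero/negative variance.
    std = torch.sqrt(torch.clamp(variance, min=1e-8))
    return mean, std

mean, std = welford_fit([torch.tensor([1.0, 2.0]), torch.tensor([3.0, 4.0])])
print(mean, std)  # tensor(2.5000) tensor(1.1180)
```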
ml_tools/data_exploration/__init__.py CHANGED
@@ -33,6 +33,7 @@ from ._features import (
     reconstruct_one_hot,
     reconstruct_binary,
     reconstruct_multibinary,
+    filter_subset,
 )

 from ._schema_ops import (
@@ -51,6 +52,7 @@ __all__ = [
     "drop_columns_with_missing_data",
     "drop_macro",
     "clean_column_names",
+    "filter_subset",
     "plot_value_distributions",
     "split_features_targets",
     "split_continuous_binary",
ml_tools/data_exploration/_analysis.py CHANGED
@@ -34,7 +34,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     """
     summary = pd.DataFrame({
         'Data Type': df.dtypes,
-        'Completeness %': (df.notnull().mean() * 100).round(2),
+        'Completeness %': (df.notnull().mean() * 100).round(2), # type: ignore
         'Unique Values': df.nunique(),
         # 'Missing %': (df.isnull().mean() * 100).round(2)
     })
ml_tools/data_exploration/_features.py CHANGED
@@ -657,3 +657,66 @@ def reconstruct_multibinary(
     _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")

     return new_df, target_columns
+
+
+def filter_subset(
+    df: pd.DataFrame,
+    filters: Union[dict[str, Any], dict[str, list[Any]]],
+    drop_filter_cols: bool = True,
+    reset_index: bool = True,
+    verbose: int = 3
+) -> pd.DataFrame:
+    """
+    Filters the DataFrame based on a dictionary of column-value conditions.
+
+    Supports:
+    - Single value matching (e.g., {"Color": "Blue"})
+    - Multiple value matching (e.g., {"Color": ["Blue", "Red"]}) -> OR logic within column.
+    - Multiple column filtering (e.g., {"Color": "Blue", "Size": "Large"}) -> AND logic between columns.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        filters (dict[str, Any] | dict[str, list[Any]]): Dictionary where keys are column names and values are the target values (scalar or list).
+        drop_filter_cols (bool): If True, drops the columns used for filtering from the result.
+        reset_index (bool): If True, resets the index of the resulting DataFrame.
+        verbose (int): Verbosity level.
+
+    Returns:
+        pd.DataFrame: The filtered DataFrame.
+    """
+    df_filtered = df.copy()
+
+    # Validate columns exist
+    missing_cols = [col for col in filters.keys() if col not in df.columns]
+    if missing_cols:
+        _LOGGER.error(f"Filter columns not found: {missing_cols}")
+        raise ValueError()
+
+    if verbose >= 2:
+        _LOGGER.info(f"Original shape: {df.shape}")
+
+    for col, value in filters.items():
+        # Handle list of values (OR logic within column)
+        if isinstance(value, list):
+            df_filtered = df_filtered[df_filtered[col].isin(value)]
+        # Handle single value
+        else:
+            # Warn if the value is a floating point due to potential precision issues
+            if isinstance(value, float) and verbose >= 1:
+                _LOGGER.warning(f"Filtering on column '{col}' with float value '{value}'.")
+            df_filtered = df_filtered[df_filtered[col] == value]
+
+    if drop_filter_cols:
+        if verbose >= 3:
+            _LOGGER.info(f"Dropping filter columns: {list(filters.keys())}")
+        df_filtered.drop(columns=list(filters.keys()), inplace=True)
+
+    if reset_index:
+        if verbose >= 3:
+            _LOGGER.info("Resetting index of the filtered DataFrame.")
+        df_filtered.reset_index(drop=True, inplace=True)
+
+    if verbose >= 2:
+        _LOGGER.info(f"Filtered shape: {df_filtered.shape}")
+
+    return df_filtered
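A usage sketch for the new filter_subset, imported from the package path exported in the __init__.py hunk above (the data and column names are illustrative):

```python
import pandas as pd
from ml_tools.data_exploration import filter_subset

df = pd.DataFrame({
    "Color": ["Blue", "Red", "Blue", "Green"],
    "Size": ["Large", "Large", "Small", "Large"],
    "Price": [10.0, 12.5, 9.0, 11.0],
})

# OR within a column, AND across columns:
# keep rows where Color is Blue or Red, AND Size is Large.
subset = filter_subset(df, {"Color": ["Blue", "Red"], "Size": "Large"})

print(subset)
# By default the filter columns are dropped and the index is reset:
#    Price
# 0   10.0
# 1   12.5
```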
ml_tools/keys/_keys.py CHANGED
@@ -99,8 +99,9 @@ class SHAPKeys:

 class CaptumKeys:
     """Keys for Captum functions"""
-    FEATURE_COLUMN = "feature"
-    IMPORTANCE_COLUMN = "importance"
+    FEATURE_COLUMN = "Feature"
+    IMPORTANCE_COLUMN = "Scaled Mean Attribution"
+    PERCENT_COLUMN = "Relative Importance(%)"
     SAVENAME = "captum_summary"
     PLOT_NAME = "captum_importance_plot"
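The renamed constants flow straight into the CSV written by _process_single_target, so a captum_summary file produced by this release would presumably begin with the header row:

```
Feature,Scaled Mean Attribution,Relative Importance(%)
```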