dragon-ml-toolbox 10.1.1__py3-none-any.whl → 14.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of dragon-ml-toolbox might be problematic.

Files changed (48)
  1. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +175 -59
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.1.1.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ from typing import Union, Optional, Literal
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import SHAPKeys


 __all__ = [
@@ -472,7 +473,7 @@ def get_shap_values(
         save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
-    global_save_path = make_fullpath(save_dir, make=True)
+    global_save_path = make_fullpath(save_dir, make=True, enforce="directory")

     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -539,6 +540,15 @@ def get_shap_values(
                     plot_type=plot_type,
                     title=f"{model_name} - {target_name} (Class {class_name})"
                 )
+
+                # Save the summary data for the current class
+                summary_save_path = global_save_path / f"SHAP_{sanitized_target_name}_{class_name}.csv"
+                _save_summary_csv(
+                    shap_values_for_summary=class_shap,
+                    feature_names=feature_names,
+                    save_path=summary_save_path
+                )
+
         else:
             values = shap_values[1] if isinstance(shap_values, list) else shap_values
             for plot_type in ["bar", "dot"]:
@@ -549,6 +559,15 @@ def get_shap_values(
                     plot_type=plot_type,
                     title=f"{model_name} - {target_name}"
                 )
+
+            # Save the summary data for the positive class
+            shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+            summary_save_path = global_save_path / shap_summary_filename
+            _save_summary_csv(
+                shap_values_for_summary=values,
+                feature_names=feature_names,
+                save_path=summary_save_path
+            )

     def _plot_for_regression(shap_values):
         for plot_type in ["bar", "dot"]:
@@ -559,6 +578,34 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data to a CSV file
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=shap_values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
+
+    def _save_summary_csv(shap_values_for_summary: np.ndarray, feature_names: list[str], save_path: Path):
+        """Calculates and saves the SHAP summary data to a CSV file."""
+        mean_abs_shap = np.abs(shap_values_for_summary).mean(axis=0)
+
+        # Create default feature names if none are provided
+        current_feature_names = feature_names
+        if current_feature_names is None:
+            current_feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
+        summary_df = pd.DataFrame({
+            SHAPKeys.FEATURE_COLUMN: feature_names,
+            SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+        }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)
+
+        summary_df.to_csv(save_path, index=False)
+        # print(f"📝 SHAP summary data saved as '{save_path.name}'")
+
+
     #START_O

     explainer = shap.TreeExplainer(model)
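The new `_save_summary_csv` helper writes a ranked table of mean absolute SHAP values per feature, with column names taken from `SHAPKeys.FEATURE_COLUMN` ("feature") and `SHAPKeys.SHAP_VALUE_COLUMN` ("mean_abs_shap_value"). As a rough illustration only (not code from the package, and with a placeholder output directory), the resulting `shap_summary.csv` could be consumed like this:

```python
import pandas as pd

# Hypothetical path: SHAPKeys.SAVENAME ("shap_summary") + ".csv" inside the chosen save_dir
summary_df = pd.read_csv("results/shap_summary.csv")

# Rows are already sorted by descending mean |SHAP|, so the head gives the top features.
top5 = summary_df.head(5)
print(top5["feature"].tolist())
print(top5["mean_abs_shap_value"].round(4).tolist())
```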
@@ -1,18 +1,17 @@
-from ._script_info import _script_info
-from ._logger import _LOGGER
-from .path_manager import make_fullpath, list_files_by_extension
-from .keys import EnsembleKeys
-
 from typing import Union, Literal, Dict, Any, Optional, List
 from pathlib import Path
 import json
-
 import joblib
 import numpy as np
 # Inference models
 import xgboost
 import lightgbm

+from ._script_info import _script_info
+from ._logger import _LOGGER
+from .path_manager import make_fullpath, list_files_by_extension
+from .keys import EnsembleKeys
+

 __all__ = [
     "InferenceHandler",
@@ -219,7 +218,7 @@ def model_report(
     return report_data


-# Local implementation to avoid calling utilities' dependencies
+# Local implementation to avoid calling utilities dependencies
 def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.
@@ -13,7 +13,8 @@ import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.base import clone

-from .utilities import yield_dataframes_from_dir, serialize_object, train_dataset_yielder
+from .utilities import yield_dataframes_from_dir, train_dataset_yielder
+from .serde import serialize_object_filename
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from .keys import EnsembleKeys
@@ -410,7 +411,7 @@ def _save_model(trained_model, model_name: str, target_name:str, feature_names:
                EnsembleKeys.FEATURES: feature_names,
                EnsembleKeys.TARGET: target_name}

-    serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
+    serialize_object_filename(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)


 # TRAIN EVALUATE PIPELINE
@@ -481,7 +482,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["

 ###### 4. Execution ######
 def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
-                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=True,
                           test_size: float=0.2, debug:bool=False, generate_learning_curves: bool = False):
     #Check models
     if isinstance(model_object, RegressionTreeModels):
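Model persistence now goes through `serialize_object_filename` from the new `serde` module (and `run_ensemble_pipeline` defaults to `save_model=True`). `serde.py` itself (+192 lines) is not shown in this diff; based purely on the call site above, a minimal sketch of what such a helper might plausibly do is given below. This is an assumption for illustration, not the released implementation:

```python
from pathlib import Path
from typing import Any, Union
import joblib

def serialize_object_filename(obj: Any, save_dir: Union[str, Path], filename: str,
                              verbose: bool = True, raise_on_error: bool = False) -> None:
    """Hypothetical sketch: joblib-serialize `obj` to save_dir/filename.joblib."""
    try:
        save_path = Path(save_dir) / f"{filename}.joblib"
        save_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(obj, save_path)
        if verbose:
            print(f"Serialized object to '{save_path}'")
    except Exception:
        if raise_on_error:
            raise
```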
ml_tools/handle_excel.py CHANGED
@@ -2,6 +2,7 @@ from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional, Union
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
ml_tools/keys.py CHANGED
@@ -36,6 +36,86 @@ class PyTorchInferenceKeys:
     # For classification tasks
     LABELS = "labels"
     PROBABILITIES = "probabilities"
+    LABEL_NAMES = "label_names"
+
+
+class PytorchModelArchitectureKeys:
+    """Keys for saving and loading model architecture."""
+    MODEL = 'model_class'
+    CONFIG = "config"
+    SAVENAME = "architecture"
+
+
+class PytorchArtifactPathKeys:
+    """Keys for model artifact paths."""
+    FEATURES_PATH = "feature_names_path"
+    TARGETS_PATH = "target_names_path"
+    ARCHITECTURE_PATH = "model_architecture_path"
+    WEIGHTS_PATH = "model_weights_path"
+    SCALER_PATH = "scaler_path"
+
+
+class DatasetKeys:
+    """Keys for saving dataset artifacts. Also used by FeatureSchema"""
+    FEATURE_NAMES = "feature_names"
+    TARGET_NAMES = "target_names"
+    SCALER_PREFIX = "scaler_"
+    # Feature Schema
+    CONTINUOUS_NAMES = "continuous_feature_names"
+    CATEGORICAL_NAMES = "categorical_feature_names"
+
+
+class SHAPKeys:
+    """Keys for SHAP functions"""
+    FEATURE_COLUMN = "feature"
+    SHAP_VALUE_COLUMN = "mean_abs_shap_value"
+    SAVENAME = "shap_summary"
+
+
+class PyTorchCheckpointKeys:
+    """Keys for saving/loading a training checkpoint dictionary."""
+    MODEL_STATE = "model_state_dict"
+    OPTIMIZER_STATE = "optimizer_state_dict"
+    SCHEDULER_STATE = "scheduler_state_dict"
+    EPOCH = "epoch"
+    BEST_SCORE = "best_score"
+
+
+class UtilityKeys:
+    """Keys used for utility modules"""
+    MODEL_PARAMS_FILE = "model_parameters"
+    TOTAL_PARAMS = "Total Parameters"
+    TRAINABLE_PARAMS = "Trainable Parameters"
+    PTH_FILE = "pth report "
+    MODEL_ARCHITECTURE_FILE = "model_architecture_summary"
+
+
+class VisionKeys:
+    """For vision ML metrics"""
+    SEGMENTATION_REPORT = "segmentation_report"
+    SEGMENTATION_HEATMAP = "segmentation_metrics_heatmap"
+    SEGMENTATION_CONFUSION_MATRIX = "segmentation_confusion_matrix"
+    # Object detection
+    OBJECT_DETECTION_REPORT = "object_detection_report"
+
+
+class VisionTransformRecipeKeys:
+    """Defines the key names for the transform recipe JSON file."""
+    TASK = "task"
+    PIPELINE = "pipeline"
+    NAME = "name"
+    KWARGS = "_kwargs"
+    PRE_TRANSFORMS = "pre_transforms"
+    RESIZE_SIZE = "resize_size"
+    CROP_SIZE = "crop_size"
+    MEAN = "mean"
+    STD = "std"
+
+
+class ObjectDetectionKeys:
+    """Used by the object detection dataset"""
+    BOXES = "boxes"
+    LABELS = "labels"


 class _OneHotOtherPlaceholder:
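These key classes centralize the string constants used across the toolbox. As one illustration (assuming the wheel exposes the module as `ml_tools.keys`; the model, optimizer, and scheduler objects are placeholders, and this is not code from the package), a training loop could assemble and restore a checkpoint dictionary with `PyTorchCheckpointKeys`:

```python
import torch
from ml_tools.keys import PyTorchCheckpointKeys

def save_checkpoint(model, optimizer, scheduler, epoch, best_score, path):
    # Dictionary keys come from PyTorchCheckpointKeys; everything else is illustrative.
    checkpoint = {
        PyTorchCheckpointKeys.MODEL_STATE: model.state_dict(),
        PyTorchCheckpointKeys.OPTIMIZER_STATE: optimizer.state_dict(),
        PyTorchCheckpointKeys.SCHEDULER_STATE: scheduler.state_dict(),
        PyTorchCheckpointKeys.EPOCH: epoch,
        PyTorchCheckpointKeys.BEST_SCORE: best_score,
    }
    torch.save(checkpoint, path)

def load_checkpoint(model, optimizer, path):
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint[PyTorchCheckpointKeys.MODEL_STATE])
    optimizer.load_state_dict(checkpoint[PyTorchCheckpointKeys.OPTIMIZER_STATE])
    return checkpoint[PyTorchCheckpointKeys.EPOCH], checkpoint[PyTorchCheckpointKeys.BEST_SCORE]
```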
@@ -0,0 +1,259 @@
+import pandas as pd
+import numpy as np
+import math
+from typing import Union, Sequence, Optional
+
+from ._script_info import _script_info
+from ._logger import _LOGGER
+
+
+__all__ = [
+    "normalize_mixed_list",
+    "threshold_binary_values",
+    "threshold_binary_values_batch",
+    "discretize_categorical_values",
+]
+
+
+def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
+    """
+    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
+    applying heuristic adjustments to correct for potential data entry scale mismatches.
+
+    Parameters:
+        data (list):
+            A list of values that may include strings, floats, integers, or None.
+            None values are treated as 0.0.
+
+        threshold (int, optional):
+            The number of log10 orders of magnitude below the median scale
+            at which a value is considered suspect and is scaled upward accordingly.
+            Default is 2.
+
+    Returns:
+        List[float]: A list of normalized float values summing to 1.0.
+
+    Notes:
+        - Zeros and None values remain zero.
+        - Input strings are automatically cast to floats if possible.
+
+    Example:
+        >>> normalize_mixed_list([1, "0.01", 4, None])
+        [0.2, 0.2, 0.6, 0.0]
+    """
+    # Step 1: Convert all values to float, treat None as 0.0
+    float_list = [float(x) if x is not None else 0.0 for x in data]
+
+    # Raise for negative values
+    if any(x < 0 for x in float_list):
+        _LOGGER.error("Negative values are not allowed in the input list.")
+        raise ValueError()
+
+    # Step 2: Compute log10 of non-zero values
+    nonzero = [x for x in float_list if x > 0]
+    if not nonzero:
+        return [0.0 for _ in float_list]
+
+    log_scales = [math.log10(x) for x in nonzero]
+    log_median = np.median(log_scales)
+
+    # Step 3: Adjust values that are much smaller than median
+    adjusted = []
+    for x in float_list:
+        if x == 0.0:
+            adjusted.append(0.0)
+        else:
+            log_x = math.log10(x)
+            if log_median - log_x > threshold:
+                scale_diff = round(log_median - log_x)
+                adjusted.append(x * (10 ** scale_diff))
+            else:
+                adjusted.append(x)
+
+    # Step 4: Normalize to sum to 1.0
+    total = sum(adjusted)
+    if total == 0:
+        return [0.0 for _ in adjusted]
+
+    return [x / total for x in adjusted]
+
+
+def threshold_binary_values(
+    input_array: Union[Sequence[float], np.ndarray, pd.Series],
+    binary_values: Optional[int] = None
+) -> Union[np.ndarray, pd.Series, list[float], tuple[float]]:
+    """
+    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+    Binary elements are converted to 0 or 1 using a 0.5 threshold.
+
+    Parameters:
+        input_array: 1D sequence, NumPy array, or pandas Series.
+        binary_values (Optional[int]) :
+            - If `None`, all values are treated as binary.
+            - If `int`, only this many last `binary_values` are thresholded.
+
+    Returns:
+        Any:
+            Same type as input
+    """
+    original_type = type(input_array)
+
+    if isinstance(input_array, (pd.Series, np.ndarray)):
+        array = np.asarray(input_array)
+    elif isinstance(input_array, (list, tuple)):
+        array = np.array(input_array)
+    else:
+        _LOGGER.error("Unsupported input type")
+        raise TypeError()
+
+    array = array.flatten()
+    total = array.shape[0]
+
+    bin_count = total if binary_values is None else binary_values
+    if not (0 <= bin_count <= total):
+        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
+        raise ValueError()
+
+    if bin_count == 0:
+        result = array
+    else:
+        cont_part = array[:-bin_count] if bin_count < total else np.array([])
+        bin_part = (array[-bin_count:] > 0.5).astype(int)
+        result = np.concatenate([cont_part, bin_part])
+
+    if original_type is pd.Series:
+        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
+    elif original_type is list:
+        return result.tolist()
+    elif original_type is tuple:
+        return tuple(result)
+    else:
+        return result
+
+
+def threshold_binary_values_batch(
+    input_array: np.ndarray,
+    binary_values: int
+) -> np.ndarray:
+    """
+    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.
+
+    Parameters
+    ----------
+    input_array : np.ndarray
+        2D array with shape (batch_size, n_features).
+    binary_values : int
+        Number of binary features located at the END of each row.
+
+    Returns
+    -------
+    np.ndarray
+        Thresholded array, same shape as input.
+    """
+    if input_array.ndim != 2:
+        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
+        raise AssertionError()
+
+    batch_size, total_features = input_array.shape
+
+    if not (0 <= binary_values <= total_features):
+        _LOGGER.error("'binary_values' out of valid range.")
+        raise AssertionError()
+
+    if binary_values == 0:
+        return input_array.copy()
+
+    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
+    bin_part = input_array[:, -binary_values:] > 0.5
+    bin_part = bin_part.astype(np.int32)
+
+    return np.hstack([cont_part, bin_part])
+
+
+def discretize_categorical_values(
+    input_array: np.ndarray,
+    categorical_info: dict[int, int],
+    start_at_zero: bool = True
+) -> np.ndarray:
+    """
+    Rounds specified columns of a 2D NumPy array to the nearest integer and
+    clamps the result to a valid categorical range.
+
+    If a 1D array is provided, it is treated as a single batch.
+
+    Parameters
+    ----------
+    input_array : np.ndarray
+        1D array (n_features,) or 2D array with shape (batch_size, n_features) containing continuous values.
+    categorical_info : dict[int, int]
+        A dictionary mapping column indices to their cardinality (number of categories).
+        Example: {3: 4} means column 3 will be clamped to its 4 valid categories.
+    start_at_zero : bool
+        If True, categories range from 0 to k-1.
+        If False, categories range from 1 to k.
+
+    Returns
+    -------
+    np.ndarray
+        A new array with the specified columns converted to integer categories.
+        Shape matches the input array's original shape.
+    """
+    # --- Input Validation ---
+    if not isinstance(input_array, np.ndarray):
+        _LOGGER.error(f"Expected np.ndarray, got {type(input_array)}.")
+        raise ValueError()
+
+    if input_array.ndim == 1:
+        # Reshape 1D array (n_features,) to 2D (1, n_features)
+        working_array = input_array.reshape(1, -1)
+        original_was_1d = True
+    elif input_array.ndim == 2:
+        working_array = input_array
+        original_was_1d = False
+    else:
+        _LOGGER.error(f"Expected 1D or 2D array, got {input_array.ndim}D array.")
+        raise ValueError()
+
+    if not isinstance(categorical_info, dict) or not categorical_info:
+        _LOGGER.error(f"'categorical_info' is not a dictionary, or is empty.")
+        raise ValueError()
+
+    _, total_features = working_array.shape
+    for col_idx, cardinality in categorical_info.items():
+        if not isinstance(col_idx, int):
+            _LOGGER.error(f"Column index key {col_idx} is not an integer.")
+            raise TypeError()
+        if not (0 <= col_idx < total_features):
+            _LOGGER.error(f"Column index {col_idx} is out of bounds for an array with {total_features} features.")
+            raise ValueError()
+        if not isinstance(cardinality, int) or cardinality < 2:
+            _LOGGER.error(f"Cardinality for column {col_idx} must be an integer >= 2, but got {cardinality}.")
+            raise ValueError()
+
+    # --- Core Logic ---
+    output_array = working_array.copy()
+
+    for col_idx, cardinality in categorical_info.items():
+        # 1. Round the column values using "round half up"
+        rounded_col = np.floor(output_array[:, col_idx] + 0.5)
+
+        # 2. Determine clamping bounds
+        min_bound = 0 if start_at_zero else 1
+        max_bound = cardinality - 1 if start_at_zero else cardinality
+
+        # 3. Clamp the values and update the output array
+        output_array[:, col_idx] = np.clip(rounded_col, min_bound, max_bound)
+
+    final_output = output_array.astype(np.int32)
+
+    # --- Output Shape Handling ---
+    if original_was_1d:
+        # Squeeze the batch dimension to return a 1D array
+        return final_output.squeeze(axis=0)
+    else:
+        return final_output
+
+
+def info():
+    _script_info(__all__)
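Per the files-changed list, this new-file hunk appears to correspond to ml_tools/math_utilities.py (+259 lines). A short usage sketch of the functions shown above (assuming the module is importable as `ml_tools.math_utilities`; the expected values follow directly from the code in the hunk):

```python
import numpy as np
from ml_tools.math_utilities import (
    normalize_mixed_list,
    threshold_binary_values,
    discretize_categorical_values,
)

# Mixed numeric/string input is cast to float and rescaled to sum to 1.0
print(normalize_mixed_list([2, "2", None, 4]))  # -> [0.25, 0.25, 0.0, 0.5]

# Only the last 2 positions are treated as binary and thresholded at 0.5
print(threshold_binary_values([0.2, 0.7, 0.4, 0.9], binary_values=2))  # -> [0.2, 0.7, 0.0, 1.0]

# Each listed column is rounded ("round half up") and clamped to its cardinality
x = np.array([1.7, 0.2, 2.6])
print(discretize_categorical_values(x, {0: 3, 1: 2, 2: 3}))  # -> [2 0 2]
```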