dragon-ml-toolbox 13.7.0.tar.gz → 13.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic.

Files changed (47)
  1. {dragon_ml_toolbox-13.7.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-13.8.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. dragon_ml_toolbox-13.8.0/ml_tools/ML_utilities.py +479 -0
  4. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/custom_logger.py +26 -8
  5. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/keys.py +8 -0
  6. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/pyproject.toml +1 -1
  7. dragon_ml_toolbox-13.7.0/ml_tools/ML_utilities.py +0 -230
  8. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/LICENSE +0 -0
  9. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/LICENSE-THIRD-PARTY.md +0 -0
  10. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/README.md +0 -0
  11. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  12. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  13. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  14. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  15. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ETL_cleaning.py +0 -0
  16. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ETL_engineering.py +0 -0
  17. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/GUI_tools.py +0 -0
  18. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/MICE_imputation.py +0 -0
  19. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_callbacks.py +0 -0
  20. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_datasetmaster.py +0 -0
  21. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_evaluation.py +0 -0
  22. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_evaluation_multi.py +0 -0
  23. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_inference.py +0 -0
  24. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_models.py +0 -0
  25. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_optimization.py +0 -0
  26. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_scaler.py +0 -0
  27. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ML_trainer.py +0 -0
  28. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/PSO_optimization.py +0 -0
  29. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/RNN_forecast.py +0 -0
  30. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/SQL.py +0 -0
  31. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/VIF_factor.py +0 -0
  32. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/__init__.py +0 -0
  33. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/_logger.py +0 -0
  34. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/_schema.py +0 -0
  35. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/_script_info.py +0 -0
  36. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/constants.py +0 -0
  37. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/data_exploration.py +0 -0
  38. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ensemble_evaluation.py +0 -0
  39. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ensemble_inference.py +0 -0
  40. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/ensemble_learning.py +0 -0
  41. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/handle_excel.py +0 -0
  42. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/math_utilities.py +0 -0
  43. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/optimization_tools.py +0 -0
  44. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/path_manager.py +0 -0
  45. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/serde.py +0 -0
  46. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/utilities.py +0 -0
  47. {dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/setup.cfg +0 -0
{dragon_ml_toolbox-13.7.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-13.8.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 13.7.0
+ Version: 13.8.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
{dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0/dragon_ml_toolbox.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 13.7.0
+ Version: 13.8.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
dragon_ml_toolbox-13.8.0/ml_tools/ML_utilities.py
@@ -0,0 +1,479 @@
+ import pandas as pd
+ from pathlib import Path
+ from typing import Union, Any, Optional, Dict, List, Iterable
+ import torch
+ from torch import nn
+
+
+ from .path_manager import make_fullpath, list_subdirectories, list_files_by_extension
+ from ._script_info import _script_info
+ from ._logger import _LOGGER
+ from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys, UtilityKeys, PyTorchCheckpointKeys
+ from .utilities import load_dataframe
+ from .custom_logger import save_list_strings, custom_logger
+
+
+ __all__ = [
+     "find_model_artifacts",
+     "select_features_by_shap",
+     "get_model_parameters",
+     "inspect_pth_file",
+     "set_parameter_requires_grad"
+ ]
+
+
+ def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, verbose: bool=False) -> list[dict[str,Any]]:
+     """
+     Scans subdirectories to find paths to model weights, target names, feature names, and model architecture. Optionally a scaler path if `load_scaler` is True.
+
+     This function operates on a specific directory structure. It expects the
+     `target_directory` to contain one or more subdirectories, where each
+     subdirectory represents a single trained model result.
+
+     The expected directory structure for each model is as follows:
+     ```
+     target_directory
+     ├── model_1
+     │   ├── *.pth
+     │   ├── scaler_*.pth (Required if `load_scaler` is True)
+     │   ├── feature_names.txt
+     │   ├── target_names.txt
+     │   └── architecture.json
+     └── model_2/
+         └── ...
+     ```
+
+     Args:
+         target_directory (str | Path): The path to the root directory that contains model subdirectories.
+         load_scaler (bool): If True, the function requires and searches for a scaler file (`.pth`) in each model subdirectory.
+         verbose (bool): If True, enables detailed logging during the file paths search process.
+
+     Returns:
+         (list[dict[str, Path]]): A list of dictionaries, where each dictionary
+             corresponds to a model found in a subdirectory. The dictionary
+             maps standardized keys to the absolute paths of the model's
+             artifacts (weights, architecture, features, targets, and scaler).
+             The scaler path will be `None` if `load_scaler` is False.
+     """
+     # validate directory
+     root_path = make_fullpath(target_directory, enforce="directory")
+
+     # store results
+     all_artifacts: list[dict] = list()
+
+     # find model directories
+     result_dirs_dict = list_subdirectories(root_dir=root_path, verbose=verbose)
+     for dir_name, dir_path in result_dirs_dict.items():
+         # find files
+         model_pth_dict = list_files_by_extension(directory=dir_path, extension="pth", verbose=verbose)
+
+         # restriction
+         if load_scaler:
+             if len(model_pth_dict) != 2:
+                 _LOGGER.error(f"Directory {dir_path} should contain exactly 2 '.pth' files: scaler and weights.")
+                 raise IOError()
+         else:
+             if len(model_pth_dict) != 1:
+                 _LOGGER.error(f"Directory {dir_path} should contain exactly 1 '.pth' file: weights.")
+                 raise IOError()
+
+         ##### Scaler and Weights #####
+         scaler_path = None
+         weights_path = None
+
+         # load weights and scaler if present
+         for pth_filename, pth_path in model_pth_dict.items():
+             if load_scaler and pth_filename.lower().startswith(DatasetKeys.SCALER_PREFIX):
+                 scaler_path = pth_path
+             else:
+                 weights_path = pth_path
+
+         # validation
+         if not weights_path:
+             _LOGGER.error(f"Error parsing the model weights path from '{dir_name}'")
+             raise IOError()
+
+         if load_scaler and not scaler_path:
+             _LOGGER.error(f"Error parsing the scaler path from '{dir_name}'")
+             raise IOError()
+
+         ##### Target and Feature names #####
+         target_names_path = None
+         feature_names_path = None
+
+         # load feature and target names
+         model_txt_dict = list_files_by_extension(directory=dir_path, extension="txt", verbose=verbose)
+
+         for txt_filename, txt_path in model_txt_dict.items():
+             if txt_filename == DatasetKeys.FEATURE_NAMES:
+                 feature_names_path = txt_path
+             elif txt_filename == DatasetKeys.TARGET_NAMES:
+                 target_names_path = txt_path
+
+         # validation
+         if not target_names_path or not feature_names_path:
+             _LOGGER.error(f"Error parsing features path or targets path from '{dir_name}'")
+             raise IOError()
+
+         ##### load model architecture path #####
+         architecture_path = None
+
+         model_json_dict = list_files_by_extension(directory=dir_path, extension="json", verbose=verbose)
+
+         for json_filename, json_path in model_json_dict.items():
+             if json_filename == PytorchModelArchitectureKeys.SAVENAME:
+                 architecture_path = json_path
+
+         # validation
+         if not architecture_path:
+             _LOGGER.error(f"Error parsing the model architecture path from '{dir_name}'")
+             raise IOError()
+
+         ##### Paths dictionary #####
+         parsing_dict = {
+             PytorchArtifactPathKeys.WEIGHTS_PATH: weights_path,
+             PytorchArtifactPathKeys.ARCHITECTURE_PATH: architecture_path,
+             PytorchArtifactPathKeys.FEATURES_PATH: feature_names_path,
+             PytorchArtifactPathKeys.TARGETS_PATH: target_names_path,
+             PytorchArtifactPathKeys.SCALER_PATH: scaler_path
+         }
+
+         all_artifacts.append(parsing_dict)
+
+     return all_artifacts
+
+
+ def select_features_by_shap(
+     root_directory: Union[str, Path],
+     shap_threshold: float,
+     log_feature_names_directory: Optional[Union[str, Path]],
+     verbose: bool = True) -> list[str]:
+     """
+     Scans subdirectories to find SHAP summary CSVs, then extracts feature
+     names whose mean absolute SHAP value meets a specified threshold.
+
+     This function is useful for automated feature selection based on feature
+     importance scores aggregated from multiple models.
+
+     Args:
+         root_directory (str | Path):
+             The path to the root directory that contains model subdirectories.
+         shap_threshold (float):
+             The minimum mean absolute SHAP value for a feature to be included
+             in the final list.
+         log_feature_names_directory (str | Path | None):
+             If given, saves the chosen feature names as a .txt file in this directory.
+
+     Returns:
+         list[str]:
+             A single, sorted list of unique feature names that meet the
+             threshold criteria across all found files.
+     """
+     if verbose:
+         _LOGGER.info(f"Starting feature selection with SHAP threshold >= {shap_threshold}")
+     root_path = make_fullpath(root_directory, enforce="directory")
+
+     # --- Step 2: Directory and File Discovery ---
+     subdirectories = list_subdirectories(root_dir=root_path, verbose=False)
+
+     shap_filename = SHAPKeys.SAVENAME + ".csv"
+
+     valid_csv_paths = []
+     for dir_name, dir_path in subdirectories.items():
+         expected_path = dir_path / shap_filename
+         if expected_path.is_file():
+             valid_csv_paths.append(expected_path)
+         else:
+             _LOGGER.warning(f"No '{shap_filename}' found in subdirectory '{dir_name}'.")
+
+     if not valid_csv_paths:
+         _LOGGER.error(f"Process halted: No '{shap_filename}' files were found in any subdirectory.")
+         return []
+
+     if verbose:
+         _LOGGER.info(f"Found {len(valid_csv_paths)} SHAP summary files to process.")
+
+     # --- Step 3: Data Processing and Feature Extraction ---
+     master_feature_set = set()
+     for csv_path in valid_csv_paths:
+         try:
+             df, _ = load_dataframe(csv_path, kind="pandas", verbose=False)
+
+             # Validate required columns
+             required_cols = {SHAPKeys.FEATURE_COLUMN, SHAPKeys.SHAP_VALUE_COLUMN}
+             if not required_cols.issubset(df.columns):
+                 _LOGGER.warning(f"Skipping '{csv_path}': missing required columns.")
+                 continue
+
+             # Filter by threshold and extract features
+             filtered_df = df[df[SHAPKeys.SHAP_VALUE_COLUMN] >= shap_threshold]
+             features = filtered_df[SHAPKeys.FEATURE_COLUMN].tolist()
+             master_feature_set.update(features)
+
+         except (ValueError, pd.errors.EmptyDataError):
+             _LOGGER.warning(f"Skipping '{csv_path}' because it is empty or malformed.")
+             continue
+         except Exception as e:
+             _LOGGER.error(f"An unexpected error occurred while processing '{csv_path}': {e}")
+             continue
+
+     # --- Step 4: Finalize and Return ---
+     final_features = sorted(list(master_feature_set))
+     if verbose:
+         _LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
+
+     if log_feature_names_directory is not None:
+         save_names_path = make_fullpath(log_feature_names_directory, make=True, enforce="directory")
+         save_list_strings(list_strings=final_features,
+                           directory=save_names_path,
+                           filename=DatasetKeys.FEATURE_NAMES,
+                           verbose=verbose)
+
+     return final_features
+
+
+ def get_model_parameters(model: nn.Module, save_dir: Optional[Union[str,Path]]=None) -> Dict[str, int]:
+     """
+     Calculates the total and trainable parameters of a PyTorch model.
+
+     Args:
+         model (nn.Module): The PyTorch model to inspect.
+         save_dir: Optional directory to save the output as a JSON file.
+
+     Returns:
+         Dict[str, int]: A dictionary containing:
+             - "Total Parameters": The total number of parameters.
+             - "Trainable Parameters": The number of trainable parameters (where requires_grad=True).
+     """
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+     report = {
+         UtilityKeys.TOTAL_PARAMS: total_params,
+         UtilityKeys.TRAINABLE_PARAMS: trainable_params
+     }
+
+     if save_dir is not None:
+         output_dir = make_fullpath(save_dir, make=True, enforce="directory")
+         custom_logger(data=report,
+                       save_directory=output_dir,
+                       log_name=UtilityKeys.MODEL_PARAMS_FILE,
+                       dict_as="json")
+
+     return report
+
+
+ def inspect_pth_file(
+     pth_path: Union[str, Path],
+     save_dir: Union[str, Path],
+ ) -> None:
+     """
+     Inspects a .pth file (e.g., checkpoint) and saves a human-readable
+     JSON summary of its contents.
+
+     Args:
+         pth_path (str | Path): The path to the .pth file to inspect.
+         save_dir (str | Path): The directory to save the JSON report.
+
+     Returns:
+         None. The inspection report is saved as a JSON file in `save_dir`.
+
+     Raises:
+         ValueError: If the .pth file is empty or in an unrecognized format.
+     """
+     # --- 1. Validate paths ---
+     pth_file = make_fullpath(pth_path, enforce="file")
+     output_dir = make_fullpath(save_dir, make=True, enforce="directory")
+     pth_name = pth_file.stem
+
+     # --- 2. Load data ---
+     try:
+         # Load onto CPU to avoid GPU memory issues
+         loaded_data = torch.load(pth_file, map_location=torch.device('cpu'))
+     except Exception as e:
+         _LOGGER.error(f"Failed to load .pth file '{pth_file}': {e}")
+         raise
+
+     # --- 3. Initialize Report ---
+     report = {
+         "top_level_type": str(type(loaded_data)),
+         "top_level_summary": {},
+         "model_state_analysis": None,
+         "notes": []
+     }
+
+     # --- 4. Parse loaded data ---
+     if isinstance(loaded_data, dict):
+         # --- Case 1: Loaded data is a dictionary (most common case) ---
+         # "main loop" that iterates over *everything* first.
+         for key, value in loaded_data.items():
+             key_summary = {}
+             val_type = str(type(value))
+             key_summary["type"] = val_type
+
+             if isinstance(value, torch.Tensor):
+                 key_summary["shape"] = list(value.shape)
+                 key_summary["dtype"] = str(value.dtype)
+             elif isinstance(value, dict):
+                 key_summary["key_count"] = len(value)
+                 key_summary["key_preview"] = list(value.keys())[:5]
+             elif isinstance(value, (int, float, str, bool)):
+                 key_summary["value_preview"] = str(value)
+             elif isinstance(value, (list, tuple)):
+                 key_summary["value_preview"] = str(value)[:100]
+
+             report["top_level_summary"][key] = key_summary
+
+         # Now, try to find the model state_dict within the dict
+         if PyTorchCheckpointKeys.MODEL_STATE in loaded_data and isinstance(loaded_data[PyTorchCheckpointKeys.MODEL_STATE], dict):
+             report["notes"].append(f"Found standard checkpoint key: '{PyTorchCheckpointKeys.MODEL_STATE}'. Analyzing as model state_dict.")
+             state_dict = loaded_data[PyTorchCheckpointKeys.MODEL_STATE]
+             report["model_state_analysis"] = _generate_weight_report(state_dict)
+
+         elif all(isinstance(v, torch.Tensor) for v in loaded_data.values()):
+             report["notes"].append("File dictionary contains only tensors. Analyzing entire dictionary as model state_dict.")
+             state_dict = loaded_data
+             report["model_state_analysis"] = _generate_weight_report(state_dict)
+
+         else:
+             report["notes"].append("Could not identify a single model state_dict. See top_level_summary for all contents. No detailed weight analysis will be performed.")
+
+     elif isinstance(loaded_data, nn.Module):
+         # --- Case 2: Loaded data is a full pickled model ---
+         # _LOGGER.warning("Loading a full, pickled nn.Module is not recommended. Inspecting its state_dict().")
+         report["notes"].append("File is a full, pickled nn.Module. This is not recommended. Extracting state_dict() for analysis.")
+         state_dict = loaded_data.state_dict()
+         report["model_state_analysis"] = _generate_weight_report(state_dict)
+
+     else:
+         # --- Case 3: Unrecognized format (e.g., single tensor, list) ---
+         _LOGGER.error(f"Could not parse .pth file. Loaded data is of type {type(loaded_data)}, not a dict or nn.Module.")
+         raise ValueError()
+
+     # --- 5. Save Report ---
+     custom_logger(data=report,
+                   save_directory=output_dir,
+                   log_name=UtilityKeys.PTH_FILE + pth_name,
+                   dict_as="json")
+
+
+ def _generate_weight_report(state_dict: dict) -> dict:
+     """
+     Internal helper to analyze a state_dict and return a structured report.
+
+     Args:
+         state_dict (dict): The model state_dict to analyze.
+
+     Returns:
+         dict: A report containing total parameters and a per-parameter breakdown.
+     """
+     weight_report = {}
+     total_params = 0
+     if not isinstance(state_dict, dict):
+         _LOGGER.warning(f"Attempted to generate weight report on non-dict type: {type(state_dict)}")
+         return {"error": "Input was not a dictionary."}
+
+     for key, tensor in state_dict.items():
+         if not isinstance(tensor, torch.Tensor):
+             _LOGGER.warning(f"Skipping key '{key}' in state_dict: value is not a tensor (type: {type(tensor)}).")
+             weight_report[key] = {
+                 "type": str(type(tensor)),
+                 "value_preview": str(tensor)[:50]  # Show a preview
+             }
+             continue
+         weight_report[key] = {
+             "shape": list(tensor.shape),
+             "dtype": str(tensor.dtype),
+             "requires_grad": tensor.requires_grad,
+             "num_elements": tensor.numel()
+         }
+         total_params += tensor.numel()
+
+     return {
+         "total_parameters": total_params,
+         "parameter_key_count": len(weight_report),
+         "parameters": weight_report
+     }
+
+
+ def set_parameter_requires_grad(
+     model: nn.Module,
+     unfreeze_last_n_params: int,
+ ) -> int:
+     """
+     Freezes or unfreezes parameters in a model based on unfreeze_last_n_params.
+
+     - N = 0: Freezes ALL parameters.
+     - N > 0 and N < total: Freezes ALL parameters, then unfreezes the last N.
+     - N >= total: Unfreezes ALL parameters.
+
+     Note: 'N' refers to individual parameter tensors (e.g., `layer.weight`
+     or `layer.bias`), not modules or layers. For example, to unfreeze
+     the final nn.Linear layer, you would use N=2 (for its weight and bias).
+
+     Args:
+         model (nn.Module): The model to modify.
+         unfreeze_last_n_params (int):
+             The number of parameter tensors to unfreeze, starting from
+             the end of the model.
+
+     Returns:
+         int: The total number of individual parameters (elements) that were set to `requires_grad=True`.
+     """
+     if unfreeze_last_n_params < 0:
+         _LOGGER.error(f"unfreeze_last_n_params must be >= 0, but got {unfreeze_last_n_params}")
+         raise ValueError()
+
+     # --- Step 1: Get all parameter tensors ---
+     all_params = list(model.parameters())
+     total_param_tensors = len(all_params)
+
+     # --- Case 1: N = 0 (Freeze ALL parameters) ---
+     # early exit for the "freeze all" case.
+     if unfreeze_last_n_params == 0:
+         params_frozen = _set_params_grad(all_params, requires_grad=False)
+         _LOGGER.warning(f"Froze all {total_param_tensors} parameter tensors ({params_frozen} total elements).")
+         return 0  # 0 parameters unfrozen
+
+     # --- Case 2: N >= total (Unfreeze ALL parameters) ---
+     if unfreeze_last_n_params >= total_param_tensors:
+         if unfreeze_last_n_params > total_param_tensors:
+             _LOGGER.warning(f"Requested to unfreeze {unfreeze_last_n_params} params, but model only has {total_param_tensors}. Unfreezing all.")
+
+         params_unfrozen = _set_params_grad(all_params, requires_grad=True)
+         _LOGGER.info(f"Unfroze all {total_param_tensors} parameter tensors ({params_unfrozen} total elements) for training.")
+         return params_unfrozen
+
+     # --- Case 3: 0 < N < total (Standard: Freeze all, unfreeze last N) ---
+     # Freeze ALL
+     params_frozen = _set_params_grad(all_params, requires_grad=False)
+     _LOGGER.info(f"Froze {params_frozen} parameters.")
+
+     # Unfreeze the last N
+     params_to_unfreeze = all_params[-unfreeze_last_n_params:]
+
+     # these are all False, so the helper will set them to True
+     params_unfrozen = _set_params_grad(params_to_unfreeze, requires_grad=True)
+
+     _LOGGER.info(f"Unfroze the last {unfreeze_last_n_params} parameter tensors ({params_unfrozen} total elements) for training.")
+
+     return params_unfrozen
+
+
+ def _set_params_grad(
+     params: Iterable[nn.Parameter],
+     requires_grad: bool
+ ) -> int:
+     """
+     A helper function to set the `requires_grad` attribute for an iterable
+     of parameters and return the total number of elements changed.
+     """
+     params_changed = 0
+     for param in params:
+         if param.requires_grad != requires_grad:
+             param.requires_grad = requires_grad
+             params_changed += param.numel()
+     return params_changed
+
+ def info():
+     _script_info(__all__)
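
A minimal usage sketch (not part of the release) of the utilities added to ml_tools/ML_utilities.py in 13.8.0; the toy model and the directory paths below are hypothetical placeholders:

```
from torch import nn
from ml_tools.ML_utilities import (
    get_model_parameters,
    inspect_pth_file,
    set_parameter_requires_grad,
)

# Hypothetical toy model standing in for a real trained network
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))

# Count total vs. trainable parameters; also written to JSON because save_dir is given
param_report = get_model_parameters(model, save_dir="reports")

# Freeze everything except the final Linear layer (its weight and bias are 2 parameter tensors)
unfrozen_elements = set_parameter_requires_grad(model, unfreeze_last_n_params=2)

# Summarize a saved checkpoint into a human-readable JSON report
inspect_pth_file("results/model_1/checkpoint.pth", save_dir="reports")
```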
{dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/custom_logger.py
@@ -1,6 +1,6 @@
  from pathlib import Path
  from datetime import datetime
- from typing import Union, List, Dict, Any
+ from typing import Union, List, Dict, Any, Literal
  import traceback
  import json
  import csv
@@ -29,6 +29,7 @@ def custom_logger(
      ],
      save_directory: Union[str, Path],
      log_name: str,
+     dict_as: Literal['auto', 'json', 'csv'] = 'auto',
  ) -> None:
      """
      Logs various data types to corresponding output formats:
@@ -36,10 +37,10 @@
      - list[Any] → .txt
          Each element is written on a new line.

-     - dict[str, list[Any]] → .csv
+     - dict[str, list[Any]] → .csv (if dict_as='auto' or 'csv')
          Dictionary is treated as tabular data; keys become columns, values become rows.

-     - dict[str, scalar] → .json
+     - dict[str, scalar] → .json (if dict_as='auto' or 'json')
          Dictionary is treated as structured data and serialized as JSON.

      - str → .log
@@ -52,26 +53,43 @@
          data: The data to be logged. Must be one of the supported types.
          save_directory: Directory where the log will be saved. Created if it does not exist.
          log_name: Base name for the log file. Timestamp will be appended automatically.
+         dict_as ('auto'|'json'|'csv'):
+             - 'auto': Guesses format (JSON or CSV) based on dictionary content.
+             - 'json': Forces .json format for any dictionary.
+             - 'csv': Forces .csv format. Will fail if dict values are not all lists.

      Raises:
          ValueError: If the data type is unsupported.
      """
      try:
+         if not isinstance(data, BaseException) and not data:
+             _LOGGER.warning("Empty data received. No log file will be saved.")
+             return
+
          save_path = make_fullpath(save_directory, make=True)

          timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
          log_name = sanitize_filename(log_name)

          base_path = save_path / f"{log_name}_{timestamp}"
-
+
+         # Router
          if isinstance(data, list):
              _log_list_to_txt(data, base_path.with_suffix(".txt"))

          elif isinstance(data, dict):
-             if all(isinstance(v, list) for v in data.values()):
-                 _log_dict_to_csv(data, base_path.with_suffix(".csv"))
-             else:
+             if dict_as == 'json':
                  _log_dict_to_json(data, base_path.with_suffix(".json"))
+
+             elif dict_as == 'csv':
+                 # This will raise a ValueError if data is not all lists
+                 _log_dict_to_csv(data, base_path.with_suffix(".csv"))
+
+             else:  # 'auto' mode
+                 if all(isinstance(v, list) for v in data.values()):
+                     _log_dict_to_csv(data, base_path.with_suffix(".csv"))
+                 else:
+                     _log_dict_to_json(data, base_path.with_suffix(".json"))

          elif isinstance(data, str):
              _log_string_to_log(data, base_path.with_suffix(".log"))
@@ -83,7 +101,7 @@
          _LOGGER.error("Unsupported data type. Must be list, dict, str, or BaseException.")
          raise ValueError()

-         _LOGGER.info(f"Log saved to: '{base_path}'")
+         _LOGGER.info(f"Log saved as: '{base_path.name}'")

      except Exception:
          _LOGGER.exception(f"Log not saved.")
{dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/ml_tools/keys.py
@@ -80,6 +80,14 @@ class PyTorchCheckpointKeys:
      BEST_SCORE = "best_score"


+ class UtilityKeys:
+     """Keys used for utility modules"""
+     MODEL_PARAMS_FILE = "model_parameters"
+     TOTAL_PARAMS = "Total Parameters"
+     TRAINABLE_PARAMS = "Trainable Parameters"
+     PTH_FILE = "pth report "
+
+
  class _OneHotOtherPlaceholder:
      """Used internally by GUI_tools."""
      OTHER_GUI = "OTHER"
{dragon_ml_toolbox-13.7.0 → dragon_ml_toolbox-13.8.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "dragon-ml-toolbox"
- version = "13.7.0"
+ version = "13.8.0"
  description = "A collection of tools for data science and machine learning projects."
  authors = [
      { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }
dragon_ml_toolbox-13.7.0/ml_tools/ML_utilities.py
@@ -1,230 +0,0 @@
- import pandas as pd
- from pathlib import Path
- from typing import Union, Any, Optional
-
- from .path_manager import make_fullpath, list_subdirectories, list_files_by_extension
- from ._script_info import _script_info
- from ._logger import _LOGGER
- from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys
- from .utilities import load_dataframe
- from .custom_logger import save_list_strings
-
-
- __all__ = [
-     "find_model_artifacts",
-     "select_features_by_shap"
- ]
-
-
- def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, verbose: bool=False) -> list[dict[str,Any]]:
-     """
-     Scans subdirectories to find paths to model weights, target names, feature names, and model architecture. Optionally an scaler path if `load_scaler` is True.
-
-     This function operates on a specific directory structure. It expects the
-     `target_directory` to contain one or more subdirectories, where each
-     subdirectory represents a single trained model result.
-
-     The expected directory structure for each model is as follows:
-     ```
-     target_directory
-     ├── model_1
-     │   ├── *.pth
-     │   ├── scaler_*.pth (Required if `load_scaler` is True)
-     │   ├── feature_names.txt
-     │   ├── target_names.txt
-     │   └── architecture.json
-     └── model_2/
-         └── ...
-     ```
-
-     Args:
-         target_directory (str | Path): The path to the root directory that contains model subdirectories.
-         load_scaler (bool): If True, the function requires and searches for a scaler file (`.pth`) in each model subdirectory.
-         verbose (bool): If True, enables detailed logging during the file paths search process.
-
-     Returns:
-         (list[dict[str, Path]]): A list of dictionaries, where each dictionary
-             corresponds to a model found in a subdirectory. The dictionary
-             maps standardized keys to the absolute paths of the model's
-             artifacts (weights, architecture, features, targets, and scaler).
-             The scaler path will be `None` if `load_scaler` is False.
-     """
-     # validate directory
-     root_path = make_fullpath(target_directory, enforce="directory")
-
-     # store results
-     all_artifacts: list[dict] = list()
-
-     # find model directories
-     result_dirs_dict = list_subdirectories(root_dir=root_path, verbose=verbose)
-     for dir_name, dir_path in result_dirs_dict.items():
-         # find files
-         model_pth_dict = list_files_by_extension(directory=dir_path, extension="pth", verbose=verbose)
-
-         # restriction
-         if load_scaler:
-             if len(model_pth_dict) != 2:
-                 _LOGGER.error(f"Directory {dir_path} should contain exactly 2 '.pth' files: scaler and weights.")
-                 raise IOError()
-         else:
-             if len(model_pth_dict) != 1:
-                 _LOGGER.error(f"Directory {dir_path} should contain exactly 1 '.pth' file: weights.")
-                 raise IOError()
-
-         ##### Scaler and Weights #####
-         scaler_path = None
-         weights_path = None
-
-         # load weights and scaler if present
-         for pth_filename, pth_path in model_pth_dict.items():
-             if load_scaler and pth_filename.lower().startswith(DatasetKeys.SCALER_PREFIX):
-                 scaler_path = pth_path
-             else:
-                 weights_path = pth_path
-
-         # validation
-         if not weights_path:
-             _LOGGER.error(f"Error parsing the model weights path from '{dir_name}'")
-             raise IOError()
-
-         if load_scaler and not scaler_path:
-             _LOGGER.error(f"Error parsing the scaler path from '{dir_name}'")
-             raise IOError()
-
-         ##### Target and Feature names #####
-         target_names_path = None
-         feature_names_path = None
-
-         # load feature and target names
-         model_txt_dict = list_files_by_extension(directory=dir_path, extension="txt", verbose=verbose)
-
-         for txt_filename, txt_path in model_txt_dict.items():
-             if txt_filename == DatasetKeys.FEATURE_NAMES:
-                 feature_names_path = txt_path
-             elif txt_filename == DatasetKeys.TARGET_NAMES:
-                 target_names_path = txt_path
-
-         # validation
-         if not target_names_path or not feature_names_path:
-             _LOGGER.error(f"Error parsing features path or targets path from '{dir_name}'")
-             raise IOError()
-
-         ##### load model architecture path #####
-         architecture_path = None
-
-         model_json_dict = list_files_by_extension(directory=dir_path, extension="json", verbose=verbose)
-
-         for json_filename, json_path in model_json_dict.items():
-             if json_filename == PytorchModelArchitectureKeys.SAVENAME:
-                 architecture_path = json_path
-
-         # validation
-         if not architecture_path:
-             _LOGGER.error(f"Error parsing the model architecture path from '{dir_name}'")
-             raise IOError()
-
-         ##### Paths dictionary #####
-         parsing_dict = {
-             PytorchArtifactPathKeys.WEIGHTS_PATH: weights_path,
-             PytorchArtifactPathKeys.ARCHITECTURE_PATH: architecture_path,
-             PytorchArtifactPathKeys.FEATURES_PATH: feature_names_path,
-             PytorchArtifactPathKeys.TARGETS_PATH: target_names_path,
-             PytorchArtifactPathKeys.SCALER_PATH: scaler_path
-         }
-
-         all_artifacts.append(parsing_dict)
-
-     return all_artifacts
-
-
- def select_features_by_shap(
-     root_directory: Union[str, Path],
-     shap_threshold: float,
-     log_feature_names_directory: Optional[Union[str, Path]],
-     verbose: bool = True) -> list[str]:
-     """
-     Scans subdirectories to find SHAP summary CSVs, then extracts feature
-     names whose mean absolute SHAP value meets a specified threshold.
-
-     This function is useful for automated feature selection based on feature
-     importance scores aggregated from multiple models.
-
-     Args:
-         root_directory (str | Path):
-             The path to the root directory that contains model subdirectories.
-         shap_threshold (float):
-             The minimum mean absolute SHAP value for a feature to be included
-             in the final list.
-         log_feature_names_directory (str | Path | None):
-             If given, saves the chosen feature names as a .txt file in this directory.
-
-     Returns:
-         list[str]:
-             A single, sorted list of unique feature names that meet the
-             threshold criteria across all found files.
-     """
-     if verbose:
-         _LOGGER.info(f"Starting feature selection with SHAP threshold >= {shap_threshold}")
-     root_path = make_fullpath(root_directory, enforce="directory")
-
-     # --- Step 2: Directory and File Discovery ---
-     subdirectories = list_subdirectories(root_dir=root_path, verbose=False)
-
-     shap_filename = SHAPKeys.SAVENAME + ".csv"
-
-     valid_csv_paths = []
-     for dir_name, dir_path in subdirectories.items():
-         expected_path = dir_path / shap_filename
-         if expected_path.is_file():
-             valid_csv_paths.append(expected_path)
-         else:
-             _LOGGER.warning(f"No '{shap_filename}' found in subdirectory '{dir_name}'.")
-
-     if not valid_csv_paths:
-         _LOGGER.error(f"Process halted: No '{shap_filename}' files were found in any subdirectory.")
-         return []
-
-     if verbose:
-         _LOGGER.info(f"Found {len(valid_csv_paths)} SHAP summary files to process.")
-
-     # --- Step 3: Data Processing and Feature Extraction ---
-     master_feature_set = set()
-     for csv_path in valid_csv_paths:
-         try:
-             df, _ = load_dataframe(csv_path, kind="pandas", verbose=False)
-
-             # Validate required columns
-             required_cols = {SHAPKeys.FEATURE_COLUMN, SHAPKeys.SHAP_VALUE_COLUMN}
-             if not required_cols.issubset(df.columns):
-                 _LOGGER.warning(f"Skipping '{csv_path}': missing required columns.")
-                 continue
-
-             # Filter by threshold and extract features
-             filtered_df = df[df[SHAPKeys.SHAP_VALUE_COLUMN] >= shap_threshold]
-             features = filtered_df[SHAPKeys.FEATURE_COLUMN].tolist()
-             master_feature_set.update(features)
-
-         except (ValueError, pd.errors.EmptyDataError):
-             _LOGGER.warning(f"Skipping '{csv_path}' because it is empty or malformed.")
-             continue
-         except Exception as e:
-             _LOGGER.error(f"An unexpected error occurred while processing '{csv_path}': {e}")
-             continue
-
-     # --- Step 4: Finalize and Return ---
-     final_features = sorted(list(master_feature_set))
-     if verbose:
-         _LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
-
-     if log_feature_names_directory is not None:
-         save_names_path = make_fullpath(log_feature_names_directory, make=True, enforce="directory")
-         save_list_strings(list_strings=final_features,
-                           directory=save_names_path,
-                           filename=DatasetKeys.FEATURE_NAMES,
-                           verbose=verbose)
-
-     return final_features
-
-
- def info():
-     _script_info(__all__)