dragon-ml-toolbox 11.1.1__py3-none-any.whl → 12.0.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Note: this version of dragon-ml-toolbox has been flagged as a potentially problematic release.
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.0.dist-info}/METADATA +22 -36
- dragon_ml_toolbox-12.0.0.dist-info/RECORD +40 -0
- ml_tools/ETL_cleaning.py +1 -0
- ml_tools/ETL_engineering.py +17 -5
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +5 -2
- ml_tools/ML_callbacks.py +3 -3
- ml_tools/ML_datasetmaster.py +1 -0
- ml_tools/ML_evaluation.py +2 -1
- ml_tools/ML_evaluation_multi.py +1 -0
- ml_tools/ML_inference.py +1 -0
- ml_tools/ML_models.py +3 -1
- ml_tools/ML_optimization.py +2 -1
- ml_tools/ML_scaler.py +3 -0
- ml_tools/ML_utilities.py +219 -0
- ml_tools/PSO_optimization.py +5 -6
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +1 -0
- ml_tools/VIF_factor.py +2 -1
- ml_tools/_logger.py +0 -2
- ml_tools/custom_logger.py +1 -0
- ml_tools/data_exploration.py +16 -10
- ml_tools/ensemble_inference.py +5 -6
- ml_tools/ensemble_learning.py +3 -2
- ml_tools/handle_excel.py +1 -0
- ml_tools/math_utilities.py +235 -0
- ml_tools/path_manager.py +2 -1
- ml_tools/serde.py +103 -0
- ml_tools/utilities.py +19 -453
- dragon_ml_toolbox-11.1.1.dist-info/RECORD +0 -37
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-11.1.1.dist-info → dragon_ml_toolbox-12.0.0.dist-info}/top_level.txt +0 -0
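The totals above show most of ml_tools/utilities.py moving out (+19 -453) while three new modules appear (math_utilities.py, serde.py, ML_utilities.py). A minimal migration sketch for downstream code follows, under the assumption that the helpers removed from utilities.py were relocated into those new modules; the 12.0.0 import paths shown here are an assumption to verify against the released sources, not something this diff confirms.

```python
# Sketch only. ASSUMPTION: the helpers dropped from ml_tools.utilities in 12.0.0
# now live in the new modules added by this release (serde, math_utilities,
# ML_utilities). Verify the real locations before relying on this shim.
try:
    # assumed 12.0.0 locations
    from ml_tools.serde import serialize_object, deserialize_object
    from ml_tools.math_utilities import threshold_binary_values, threshold_binary_values_batch
    from ml_tools.ML_utilities import find_model_artifacts, select_features_by_shap
except ImportError:
    # 11.1.1 locations (everything lived in utilities)
    from ml_tools.utilities import (
        serialize_object,
        deserialize_object,
        threshold_binary_values,
        threshold_binary_values_batch,
        find_model_artifacts,
        select_features_by_shap,
    )
```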
ml_tools/utilities.py
CHANGED
@@ -1,15 +1,12 @@
-import math
 import numpy as np
 import pandas as pd
 import polars as pl
 from pathlib import Path
-from typing import Literal, Union,
-
-from
-from .path_manager import sanitize_filename, make_fullpath, list_csv_paths, list_files_by_extension, list_subdirectories
+from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
+
+from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._script_info import _script_info
 from ._logger import _LOGGER
-from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys


 # Keep track of available tools
@@ -18,16 +15,10 @@ __all__ = [
     "yield_dataframes_from_dir",
     "merge_dataframes",
     "save_dataframe",
-    "
-    "threshold_binary_values",
-    "threshold_binary_values_batch",
-    "serialize_object",
-    "deserialize_object",
+    "save_dataframe_path",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
-    "train_dataset_yielder"
-    "find_model_artifacts",
-    "select_features_by_shap"
+    "train_dataset_yielder"
 ]


@@ -132,6 +123,7 @@ def load_dataframe(

     return df, df_name # type: ignore

+
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
@@ -236,7 +228,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         return

     # Create the directory if it doesn't exist
-    save_path = make_fullpath(save_dir, make=True)
+    save_path = make_fullpath(save_dir, make=True, enforce="directory")

     # Clean the filename
     filename = sanitize_filename(filename)
@@ -258,250 +250,24 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
     _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df.shape}")


-def
+def save_dataframe_path(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
     """
-
-    applying heuristic adjustments to correct for potential data entry scale mismatches.
-
-    Parameters:
-        data (list):
-            A list of values that may include strings, floats, integers, or None.
-            None values are treated as 0.0.
-
-        threshold (int, optional):
-            The number of log10 orders of magnitude below the median scale
-            at which a value is considered suspect and is scaled upward accordingly.
-            Default is 2.
+    Saves a DataFrame to a specified full path.

-
-
-
-    Notes:
-    - Zeros and None values remain zero.
-    - Input strings are automatically cast to floats if possible.
+    This function is a convenience wrapper for `save_dataframe()`. It takes a
+    single `pathlib.Path` object pointing to a `.csv` file.

-
-
-
-    """
-    # Step 1: Convert all values to float, treat None as 0.0
-    float_list = [float(x) if x is not None else 0.0 for x in data]
-
-    # Raise for negative values
-    if any(x < 0 for x in float_list):
-        _LOGGER.error("Negative values are not allowed in the input list.")
-        raise ValueError()
-
-    # Step 2: Compute log10 of non-zero values
-    nonzero = [x for x in float_list if x > 0]
-    if not nonzero:
-        return [0.0 for _ in float_list]
-
-    log_scales = [math.log10(x) for x in nonzero]
-    log_median = np.median(log_scales)
-
-    # Step 3: Adjust values that are much smaller than median
-    adjusted = []
-    for x in float_list:
-        if x == 0.0:
-            adjusted.append(0.0)
-        else:
-            log_x = math.log10(x)
-            if log_median - log_x > threshold:
-                scale_diff = round(log_median - log_x)
-                adjusted.append(x * (10 ** scale_diff))
-            else:
-                adjusted.append(x)
-
-    # Step 4: Normalize to sum to 1.0
-    total = sum(adjusted)
-    if total == 0:
-        return [0.0 for _ in adjusted]
-
-    return [x / total for x in adjusted]
-
-
-def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
-    binary_values: Optional[int] = None
-) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
-    """
-    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
-
-    Binary elements are converted to 0 or 1 using a 0.5 threshold.
-
-    Parameters:
-        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
-        binary_values (Optional[int]) :
-            - If `None`, all values are treated as binary.
-            - If `int`, only this many last `binary_values` are thresholded.
-
-    Returns:
-        Any:
-            Same type as input
+    Args:
+        df (Union[pd.DataFrame, pl.DataFrame]): The pandas or polars DataFrame to save.
+        full_path (Path): The complete file path, including the filename and `.csv` extension, where the DataFrame will be saved.
     """
-
-
-    if isinstance(input_array, pl.Series):
-        array = input_array.to_numpy()
-    elif isinstance(input_array, (pd.Series, np.ndarray)):
-        array = np.asarray(input_array)
-    elif isinstance(input_array, (list, tuple)):
-        array = np.array(input_array)
-    else:
-        _LOGGER.error("Unsupported input type")
-        raise TypeError()
-
-    array = array.flatten()
-    total = array.shape[0]
-
-    bin_count = total if binary_values is None else binary_values
-    if not (0 <= bin_count <= total):
-        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
         raise ValueError()
-
-    if bin_count == 0:
-        result = array
-    else:
-        cont_part = array[:-bin_count] if bin_count < total else np.array([])
-        bin_part = (array[-bin_count:] > 0.5).astype(int)
-        result = np.concatenate([cont_part, bin_part])
-
-    if original_type is pd.Series:
-        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
-    elif original_type is pl.Series:
-        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
-    elif original_type is list:
-        return result.tolist()
-    elif original_type is tuple:
-        return tuple(result)
-    else:
-        return result
-
-
-def threshold_binary_values_batch(
-    input_array: np.ndarray,
-    binary_values: int
-) -> np.ndarray:
-    """
-    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.
-
-    Parameters
-    ----------
-    input_array : np.ndarray
-        2D array with shape (batch_size, n_features).
-    binary_values : int
-        Number of binary features located at the END of each row.
-
-    Returns
-    -------
-    np.ndarray
-        Thresholded array, same shape as input.
-    """
-    if input_array.ndim != 2:
-        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
-        raise AssertionError()
-
-    batch_size, total_features = input_array.shape
-
-    if not (0 <= binary_values <= total_features):
-        _LOGGER.error("'binary_values' out of valid range.")
-        raise AssertionError()
-
-    if binary_values == 0:
-        return input_array.copy()
-
-    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
-    bin_part = input_array[:, -binary_values:] > 0.5
-    bin_part = bin_part.astype(np.int32)
-
-    return np.hstack([cont_part, bin_part])
-
-
-def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
-    """
-    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
-
-    Parameters:
-        obj (Any) : The Python object to serialize.
-        save_dir (str | Path) : Directory path where the serialized object will be saved.
-        filename (str) : Name for the output file, extension will be appended if needed.
-    """
-    try:
-        save_path = make_fullpath(save_dir, make=True)
-        sanitized_name = sanitize_filename(filename)
-        if not sanitized_name.endswith('.joblib'):
-            sanitized_name = sanitized_name + ".joblib"
-        full_path = save_path / sanitized_name
-        joblib.dump(obj, full_path)
-    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
-        _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
-        if raise_on_error:
-            raise e
-        return None
-    else:
-        if verbose:
-            _LOGGER.info(f"Object of type '{type(obj)}' saved to '{full_path}'")
-        return None
-
-# Define a TypeVar to link the expected type to the return type of deserialization
-T = TypeVar('T')
-
-def deserialize_object(
-    filepath: Union[str, Path],
-    expected_type: Optional[Type[T]] = None,
-    verbose: bool = True,
-    raise_on_error: bool = True
-) -> Optional[T]:
-    """
-    Loads a serialized object from a .joblib file.
-
-    Parameters:
-        filepath (str | Path): Full path to the serialized .joblib file.
-        expected_type (Type[T] | None): The expected type of the object.
-            If provided, the function raises a TypeError if the loaded object
-            is not an instance of this type. It correctly handles generics
-            like `list[str]` by checking the base type (e.g., `list`).
-            Defaults to None, which skips the type check.
-        verbose (bool): If True, logs success messages.
-        raise_on_error (bool): If True, raises exceptions on errors. If False, returns None instead.
-
-    Returns:
-        (Any | None): The deserialized Python object, which will match the
-            `expected_type` if provided. Returns None if an error
-            occurs and `raise_on_error` is False.
-    """
-    true_filepath = make_fullpath(filepath)
-
-    try:
-        obj = joblib.load(true_filepath)
-    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
-        if raise_on_error:
-            raise e
-        return None
-    else:
-        # --- Type Validation Step ---
-        if expected_type:
-            # get_origin handles generics (e.g., list[str] -> list)
-            # If it's not a generic, get_origin returns None, so we use the type itself.
-            type_to_check = get_origin(expected_type) or expected_type
-
-            # Can't do an isinstance check on 'Any', skip it.
-            if type_to_check is not Any and not isinstance(obj, type_to_check):
-                error_msg = (
-                    f"Type mismatch: Expected an instance of '{expected_type}', "
-                    f"but found '{type(obj)}' in '{true_filepath}'."
-                )
-                _LOGGER.error(error_msg)
-                if raise_on_error:
-                    raise TypeError()
-                return None
-
-        if verbose:
-            _LOGGER.info(f"Loaded object of type '{type(obj)}' from '{true_filepath}'.")

-
+    save_dataframe(df=df,
+                   save_dir=full_path.parent,
+                   filename=full_path.name)


 def distribute_dataset_by_target(
@@ -616,205 +382,5 @@ def train_dataset_yielder(
         yield (df_features, df_target, feature_names, target_col)


-def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, verbose: bool=False) -> list[dict[str,Any]]:
-    """
-    Scans subdirectories to find paths to model weights, target names, feature names, and model architecture. Optionally an scaler path if `load_scaler` is True.
-
-    This function operates on a specific directory structure. It expects the
-    `target_directory` to contain one or more subdirectories, where each
-    subdirectory represents a single trained model result.
-
-    The expected directory structure for each model is as follows:
-    ```
-    target_directory
-    ├── model_1
-    │   ├── *.pth
-    │   ├── scaler_*.pth (Required if `load_scaler` is True)
-    │   ├── feature_names.txt
-    │   ├── target_names.txt
-    │   └── architecture.json
-    └── model_2/
-        └── ...
-    ```
-
-    Args:
-        target_directory (str | Path): The path to the root directory that contains model subdirectories.
-        load_scaler (bool): If True, the function requires and searches for a scaler file (`.pth`) in each model subdirectory.
-        verbose (bool): If True, enables detailed logging during the file paths search process.
-
-    Returns:
-        (list[dict[str, Path]]): A list of dictionaries, where each dictionary
-            corresponds to a model found in a subdirectory. The dictionary
-            maps standardized keys to the absolute paths of the model's
-            artifacts (weights, architecture, features, targets, and scaler).
-            The scaler path will be `None` if `load_scaler` is False.
-    """
-    # validate directory
-    root_path = make_fullpath(target_directory, enforce="directory")
-
-    # store results
-    all_artifacts: list[dict] = list()
-
-    # find model directories
-    result_dirs_dict = list_subdirectories(root_dir=root_path, verbose=verbose)
-    for dir_name, dir_path in result_dirs_dict.items():
-        # find files
-        model_pth_dict = list_files_by_extension(directory=dir_path, extension="pth", verbose=verbose)
-
-        # restriction
-        if load_scaler:
-            if len(model_pth_dict) != 2:
-                _LOGGER.error(f"Directory {dir_path} should contain exactly 2 '.pth' files: scaler and weights.")
-                raise IOError()
-        else:
-            if len(model_pth_dict) != 1:
-                _LOGGER.error(f"Directory {dir_path} should contain exactly 1 '.pth' file: weights.")
-                raise IOError()
-
-        ##### Scaler and Weights #####
-        scaler_path = None
-        weights_path = None
-
-        # load weights and scaler if present
-        for pth_filename, pth_path in model_pth_dict.items():
-            if load_scaler and pth_filename.lower().startswith(DatasetKeys.SCALER_PREFIX):
-                scaler_path = pth_path
-            else:
-                weights_path = pth_path
-
-        # validation
-        if not weights_path:
-            _LOGGER.error(f"Error parsing the model weights path from '{dir_name}'")
-            raise IOError()
-
-        if load_scaler and not scaler_path:
-            _LOGGER.error(f"Error parsing the scaler path from '{dir_name}'")
-            raise IOError()
-
-        ##### Target and Feature names #####
-        target_names_path = None
-        feature_names_path = None
-
-        # load feature and target names
-        model_txt_dict = list_files_by_extension(directory=dir_path, extension="txt", verbose=verbose)
-
-        for txt_filename, txt_path in model_txt_dict.items():
-            if txt_filename == DatasetKeys.FEATURE_NAMES:
-                feature_names_path = txt_path
-            elif txt_filename == DatasetKeys.TARGET_NAMES:
-                target_names_path = txt_path
-
-        # validation
-        if not target_names_path or not feature_names_path:
-            _LOGGER.error(f"Error parsing features path or targets path from '{dir_name}'")
-            raise IOError()
-
-        ##### load model architecture path #####
-        architecture_path = None
-
-        model_json_dict = list_files_by_extension(directory=dir_path, extension="json", verbose=verbose)
-
-        for json_filename, json_path in model_json_dict.items():
-            if json_filename == PytorchModelArchitectureKeys.SAVENAME:
-                architecture_path = json_path
-
-        # validation
-        if not architecture_path:
-            _LOGGER.error(f"Error parsing the model architecture path from '{dir_name}'")
-            raise IOError()
-
-        ##### Paths dictionary #####
-        parsing_dict = {
-            PytorchArtifactPathKeys.WEIGHTS_PATH: weights_path,
-            PytorchArtifactPathKeys.ARCHITECTURE_PATH: architecture_path,
-            PytorchArtifactPathKeys.FEATURES_PATH: feature_names_path,
-            PytorchArtifactPathKeys.TARGETS_PATH: target_names_path,
-            PytorchArtifactPathKeys.SCALER_PATH: scaler_path
-        }
-
-        all_artifacts.append(parsing_dict)
-
-    return all_artifacts
-
-
-def select_features_by_shap(
-    root_directory: Union[str, Path],
-    shap_threshold: float = 1.0,
-    verbose: bool = True) -> list[str]:
-    """
-    Scans subdirectories to find SHAP summary CSVs, then extracts feature
-    names whose mean absolute SHAP value meets a specified threshold.
-
-    This function is useful for automated feature selection based on feature
-    importance scores aggregated from multiple models.
-
-    Args:
-        root_directory (Union[str, Path]):
-            The path to the root directory that contains model subdirectories.
-        shap_threshold (float):
-            The minimum mean absolute SHAP value for a feature to be included
-            in the final list.
-
-    Returns:
-        list[str]:
-            A single, sorted list of unique feature names that meet the
-            threshold criteria across all found files.
-    """
-    if verbose:
-        _LOGGER.info(f"Starting feature selection with SHAP threshold >= {shap_threshold}")
-    root_path = make_fullpath(root_directory, enforce="directory")
-
-    # --- Step 2: Directory and File Discovery ---
-    subdirectories = list_subdirectories(root_dir=root_path, verbose=False)
-
-    shap_filename = SHAPKeys.SAVENAME + ".csv"
-
-    valid_csv_paths = []
-    for dir_name, dir_path in subdirectories.items():
-        expected_path = dir_path / shap_filename
-        if expected_path.is_file():
-            valid_csv_paths.append(expected_path)
-        else:
-            _LOGGER.warning(f"No '{shap_filename}' found in subdirectory '{dir_name}'.")
-
-    if not valid_csv_paths:
-        _LOGGER.error(f"Process halted: No '{shap_filename}' files were found in any subdirectory.")
-        return []
-
-    if verbose:
-        _LOGGER.info(f"Found {len(valid_csv_paths)} SHAP summary files to process.")
-
-    # --- Step 3: Data Processing and Feature Extraction ---
-    master_feature_set = set()
-    for csv_path in valid_csv_paths:
-        try:
-            df, _ = load_dataframe(csv_path, kind="pandas", verbose=False)
-
-            # Validate required columns
-            required_cols = {SHAPKeys.FEATURE_COLUMN, SHAPKeys.SHAP_VALUE_COLUMN}
-            if not required_cols.issubset(df.columns):
-                _LOGGER.warning(f"Skipping '{csv_path}': missing required columns.")
-                continue
-
-            # Filter by threshold and extract features
-            filtered_df = df[df[SHAPKeys.SHAP_VALUE_COLUMN] >= shap_threshold]
-            features = filtered_df[SHAPKeys.FEATURE_COLUMN].tolist()
-            master_feature_set.update(features)
-
-        except (ValueError, pd.errors.EmptyDataError):
-            _LOGGER.warning(f"Skipping '{csv_path}' because it is empty or malformed.")
-            continue
-        except Exception as e:
-            _LOGGER.error(f"An unexpected error occurred while processing '{csv_path}': {e}")
-            continue
-
-    # --- Step 4: Finalize and Return ---
-    final_features = sorted(list(master_feature_set))
-    if verbose:
-        _LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
-
-    return final_features
-
-
 def info():
     _script_info(__all__)
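For context, a short usage sketch of the new `save_dataframe_path` wrapper added above; the DataFrame contents and output path are made up for illustration.

```python
from pathlib import Path

import pandas as pd

from ml_tools.utilities import save_dataframe_path

# Hypothetical data and output location, only to illustrate the call.
df = pd.DataFrame({"feature": [1.0, 2.0, 3.0], "target": [0, 1, 0]})

# The wrapper requires a pathlib.Path ending in ".csv"; anything else raises
# ValueError. Internally it delegates to
# save_dataframe(df=df, save_dir=full_path.parent, filename=full_path.name).
save_dataframe_path(df, Path("outputs/train_split.csv"))
```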
dragon_ml_toolbox-11.1.1.dist-info/RECORD
REMOVED
@@ -1,37 +0,0 @@
-dragon_ml_toolbox-11.1.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-11.1.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
-ml_tools/ETL_cleaning.py,sha256=-JrYkT8AvkZFK-Agzhp6uVxaZXzFw49t0txjf6Z1Apw,20365
-ml_tools/ETL_engineering.py,sha256=9dmNd2e3fUldwhIggogGKPlxTb02rtb463Kq5QHnqJo,54551
-ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
-ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
-ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
-ml_tools/ML_datasetmaster.py,sha256=vqKZhCXsvN5yeRJdOKqMPh5OhY1xe6xlNjM3WoH5lys,30821
-ml_tools/ML_evaluation.py,sha256=6FB6S-aDDpFzQdrp3flBVECzEsHhMbQknYVGhHooEFs,16207
-ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
-ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
-ml_tools/ML_models.py,sha256=JMFOuw4jtX5RtUFpkQWS8-dzDW0AwqYjbl67XRCVubA,27996
-ml_tools/ML_optimization.py,sha256=mMVLR3Atu2yeS-zzPCnKBO6CwRgJavCXUJT3zvniUjo,18097
-ml_tools/ML_scaler.py,sha256=h2ymq5u953Lx60Qb38Y0mAWj85x9PbnP0xYNQ3pd8-w,7535
-ml_tools/ML_trainer.py,sha256=_g48w5Ak-wQr5fGHdJqlcpnzv3gWyL1ghkOhy9VOZbo,23930
-ml_tools/PSO_optimization.py,sha256=q0VYpssQGbPum7xdnkDXlJQKhZMYZo8acHpKhajPK3c,22954
-ml_tools/RNN_forecast.py,sha256=8rNZr-eWOBXMiDQV22e_tQTPM5LM2IFggEAa1FaoXaI,1965
-ml_tools/SQL.py,sha256=rPeKywvwJ5oHYVUQUovO3OUkXQTxBT9Dvwb6E2ntphY,11233
-ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
-ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
-ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
-ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
-ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
-ml_tools/data_exploration.py,sha256=-aTi5jmv4AepPgi2k_85qEJsSLx5zPOtTbhorqzUvGQ,38542
-ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
-ml_tools/ensemble_inference.py,sha256=Hun_ipIZaaLrHxSo63J6NKS_O1fMWi_6HkuSHs4RywI,9349
-ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
-ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
-ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
-ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
-ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
-ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
-dragon_ml_toolbox-11.1.1.dist-info/METADATA,sha256=Vl_AVzC58IA6OESD3NQTPADls7o_eN5dl-s2qKdWBZI,6657
-dragon_ml_toolbox-11.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-11.1.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-11.1.1.dist-info/RECORD,,
Files without changes: WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt