dragon-ml-toolbox 2.4.0__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff reflects the content of publicly available package versions released to a supported public registry, and is provided for informational purposes only.
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/METADATA +7 -4
- dragon_ml_toolbox-3.1.0.dist-info/RECORD +25 -0
- ml_tools/ETL_engineering.py +49 -19
- ml_tools/GUI_tools.py +24 -25
- ml_tools/MICE_imputation.py +8 -4
- ml_tools/ML_callbacks.py +341 -0
- ml_tools/ML_evaluation.py +255 -0
- ml_tools/ML_trainer.py +344 -0
- ml_tools/ML_tutorial.py +300 -0
- ml_tools/PSO_optimization.py +27 -20
- ml_tools/RNN_forecast.py +49 -0
- ml_tools/VIF_factor.py +6 -5
- ml_tools/data_exploration.py +2 -2
- ml_tools/datasetmaster.py +601 -527
- ml_tools/ensemble_learning.py +12 -9
- ml_tools/handle_excel.py +9 -10
- ml_tools/logger.py +45 -8
- ml_tools/utilities.py +18 -1
- dragon_ml_toolbox-2.4.0.dist-info/RECORD +0 -22
- ml_tools/trainer.py +0 -346
- ml_tools/vision_helpers.py +0 -231
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.1.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0
ml_tools/PSO_optimization.py
CHANGED

```diff
@@ -7,20 +7,23 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import
+from .utilities import (
+    _script_info,
+    list_csv_paths,
+    threshold_binary_values,
+    threshold_binary_values_batch,
+    deserialize_object,
+    list_files_by_extension,
+    save_dataframe,
+    make_fullpath,
+    yield_dataframes_from_dir,
+    sanitize_filename)
 import torch
 from tqdm import trange
-import logging
 import matplotlib.pyplot as plt
 import seaborn as sns
 from collections import defaultdict
-
-# Configure logger
-logging.basicConfig(
-    level=logging.INFO,
-    format="[%(asctime)s] [%(levelname)s] - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S"
-)
+from .logger import _LOGGER
 
 
 __all__ = [
```
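The module-level `logging.basicConfig` call is replaced by a shared `_LOGGER` imported from `ml_tools/logger.py`, which this release also reworks (+45 -8) but whose body is not shown in this diff. A minimal sketch of what such a shared logger module might look like, reusing the format string the old code configured (the names and structure here are assumptions, not the package's actual implementation):

```python
# Hypothetical sketch of ml_tools/logger.py; the real module is not in this diff.
import logging

_LOGGER = logging.getLogger("dragon_ml_toolbox")
if not _LOGGER.handlers:  # avoid attaching duplicate handlers on repeated imports
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter(
        "[%(asctime)s] [%(levelname)s] - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    ))
    _LOGGER.addHandler(_handler)
    _LOGGER.setLevel(logging.INFO)
```

Centralizing the logger this way avoids each module calling `logging.basicConfig`, which mutates the root logger globally.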
```diff
@@ -304,7 +307,7 @@ def run_pso(lower_boundaries: list[float],
     else:
         device = torch.device("cpu")
 
-
+    _LOGGER.info(f"Using device: '{device}'")
 
     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)
```
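Only the `else` branch of the device selection is visible in the hunk; the branches above it fall outside the context window. One plausible shape for the full resolution, assuming the usual CUDA/MPS fallback order (this is a reconstruction, not the actual `run_pso` code):

```python
import torch

# Assumed reconstruction of the branches above the `else` shown in the hunk.
def _resolve_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():  # Apple Silicon
        return torch.device("mps")
    return torch.device("cpu")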
```diff
@@ -352,7 +355,7 @@ def run_pso(lower_boundaries: list[float],
         save_results_path = make_fullpath(save_results_dir, make=True)
         _save_results(features, target, save_dir=save_results_path, target_name=target_name)
 
-    return features, target
+    return features, target # type: ignore
 
 
 def _pso(func: ObjectiveFunction,
```
```diff
@@ -526,19 +529,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         If True, generates comparative plots with distributions colored by their source target.
     """
     mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
-
+    _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
 
+    # Check results_dir
+    results_path = make_fullpath(results_dir)
+    # make output path
     output_path = make_fullpath(save_dir, make=True)
-
+
+    all_csvs = list_csv_paths(results_path)
 
-    if not
-
+    if not all_csvs:
+        _LOGGER.warning("No data found. No plots will be generated.")
         return
 
     # --- MODE 1: Color-coded plots by target ---
     if color_by_target:
         data_to_plot = []
-        for df, df_name in
+        for df, df_name in yield_dataframes_from_dir(results_path):
             # Assumes last col is target, rest are features
             melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
             # Sanitize target name for cleaner legend labels
@@ -547,7 +554,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
 
         long_df = pd.concat(data_to_plot, ignore_index=True)
         features = long_df['feature'].unique()
-
+        _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
 
         for feature_name in features:
             plt.figure(figsize=(12, 7))
@@ -569,12 +576,12 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     # --- MODE 2: Aggregate plot ---
     else:
         feature_distributions = defaultdict(list)
-        for df, _ in
+        for df, _ in yield_dataframes_from_dir(results_path):
             feature_columns = df.iloc[:, :-1]
             for feature_name in feature_columns:
                 feature_distributions[feature_name].extend(df[feature_name].tolist())
 
-
+        _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
         for feature_name, values in feature_distributions.items():
             plt.figure(figsize=(12, 7))
             sns.histplot(x=values, kde=True, bins='auto', stat="density")
@@ -589,7 +596,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
             plt.savefig(plot_filename, bbox_inches='tight')
             plt.close()
 
-
+    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
 
 def info():
```
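Both plotting modes now route file discovery through `list_csv_paths` and `yield_dataframes_from_dir` from `ml_tools/utilities.py`, whose bodies are not part of this diff. A hypothetical minimal version consistent with how they are called here (a CSV path listing plus a `(DataFrame, name)` generator); the real implementations may differ:

```python
# Hypothetical minimal helpers, inferred only from their call sites above.
from pathlib import Path
from typing import Iterator, Tuple, Union
import pandas as pd

def list_csv_paths(directory: Union[str, Path]) -> list[Path]:
    # Sorted so plot aggregation order is deterministic across runs
    return sorted(Path(directory).glob("*.csv"))

def yield_dataframes_from_dir(directory: Union[str, Path]) -> Iterator[Tuple[pd.DataFrame, str]]:
    for csv_path in list_csv_paths(directory):
        yield pd.read_csv(csv_path), csv_path.stem
```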
ml_tools/RNN_forecast.py
ADDED

```diff
@@ -0,0 +1,49 @@
+import torch
+from torch import nn
+import numpy as np
+
+__all__ = [
+    "rnn_forecast"
+]
+
+def rnn_forecast(model: nn.Module, start_sequence: torch.Tensor, steps: int, device: str = 'cpu'):
+    """
+    Runs a sequential forecast for a trained RNN-based model.
+
+    This function iteratively predicts future time steps, where each new prediction
+    is generated by feeding the previous prediction back into the model.
+
+    Args:
+        model (nn.Module): The trained PyTorch RNN model (e.g., LSTM, GRU).
+        start_sequence (torch.Tensor): The initial sequence to start the forecast from.
+                                       Shape should be (sequence_length, num_features).
+        steps (int): The number of future time steps to predict.
+        device (str, optional): The device to run the forecast on ('cpu', 'cuda', 'mps').
+                                Defaults to 'cpu'.
+
+    Returns:
+        np.ndarray: A numpy array containing the forecasted values.
+    """
+    model.eval()
+    model.to(device)
+
+    predictions = []
+    current_sequence = start_sequence.to(device)
+
+    with torch.no_grad():
+        for _ in range(steps):
+            # Get the model's prediction for the current sequence
+            output = model(current_sequence.unsqueeze(0)) # Add batch dimension
+
+            # The prediction is the last element of the output sequence
+            next_pred = output[0, -1, :].view(1, -1)
+
+            # Store the prediction
+            predictions.append(next_pred.cpu().numpy())
+
+            # Update the sequence for the next iteration:
+            # Drop the first element and append the new prediction
+            current_sequence = torch.cat([current_sequence[1:], next_pred], dim=0)
+
+    # Concatenate all predictions and flatten the array for easy use
+    return np.concatenate(predictions).flatten()
```
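Because `rnn_forecast` indexes the model output as `output[0, -1, :]`, it expects `forward` to return a single tensor of shape `(batch, seq_len, num_features)`. A minimal usage sketch with a hypothetical wrapper model (not part of the package) that satisfies this contract:

```python
import torch
from torch import nn
from ml_tools.RNN_forecast import rnn_forecast

# Hypothetical wrapper: nn.LSTM alone returns (output, (h, c)), so a thin
# module is needed to expose a plain (batch, seq_len, num_features) tensor.
class TinyForecaster(nn.Module):
    def __init__(self, num_features: int = 1, hidden_size: int = 16):
        super().__init__()
        self.lstm = nn.LSTM(num_features, hidden_size, batch_first=True)
        self.head = nn.Linear(hidden_size, num_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, _ = self.lstm(x)   # (batch, seq_len, hidden_size)
        return self.head(out)   # (batch, seq_len, num_features)

model = TinyForecaster()        # untrained, for illustration only
start = torch.randn(24, 1)      # (sequence_length, num_features)
forecast = rnn_forecast(model, start_sequence=start, steps=12)
print(forecast.shape)           # (12,)
```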
ml_tools/VIF_factor.py
CHANGED

```diff
@@ -8,6 +8,7 @@ from statsmodels.tools.tools import add_constant
 import warnings
 from pathlib import Path
 from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath
+from .logger import _LOGGER
 
 
 __all__ = [
@@ -54,20 +55,20 @@ def compute_vif(
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features and verbose:
-
+            _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
                 if verbose:
-
+                    _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore and verbose:
-
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
     X = df[sanitized_columns].copy()
@@ -167,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
 
     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
 
     result_df = df.drop(columns=to_drop)
 
     if result_df.empty:
-
+        _LOGGER.warning(f"\t⚠️ All columns were dropped.")
 
     return result_df, to_drop
 
```
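For context on what `compute_vif` is filtering columns for: the variance inflation factor itself comes from statsmodels (the file already imports `add_constant` from `statsmodels.tools.tools`). A standalone sketch of the core computation, independent of the package's exact implementation:

```python
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Standalone sketch of the core VIF computation; compute_vif() in this
# package wraps the same idea with column filtering, logging, and plotting.
def vif_table(df: pd.DataFrame) -> pd.DataFrame:
    numeric = df.select_dtypes(include="number")
    X = add_constant(numeric, has_constant="add")  # intercept for proper OLS fits
    vifs = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    table = pd.DataFrame({"feature": X.columns, "VIF": vifs})
    return table[table["feature"] != "const"].reset_index(drop=True)
```

A VIF above ~10 (the default threshold in `drop_vif_based`) is a common rule of thumb for problematic multicollinearity.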
ml_tools/data_exploration.py
CHANGED

```diff
@@ -587,14 +587,14 @@ def standardize_percentages(
     Standardizes numeric columns containing mixed-format percentages.
 
     This function cleans columns where percentages might be entered as whole
-    numbers (
+    numbers (55) and as proportions (0.55). It assumes values
     between 0 and 1 are proportions and multiplies them by 100.
 
     Args:
         df (pd.Dataframe): The input pandas DataFrame.
         columns (list[str]): A list of column names to standardize.
         treat_one_as_proportion (bool):
-            - If True (default): The value `1` is treated as a proportion and converted to `100
+            - If True (default): The value `1` is treated as a proportion and converted to `100%`.
            - If False: The value `1` is treated as `1%`.
        round_digits (int): The number of decimal places to round the final result to.
 
```
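The docstring fix spells out the ambiguity the function resolves: a value of exactly `1` can mean either a proportion (100%) or a literal 1%. A hedged sketch of per-column logic consistent with the corrected docstring (the function body itself is not shown in this diff):

```python
import pandas as pd

# Sketch of the per-column rule the docstring describes; the actual
# standardize_percentages() implementation is not part of this diff.
def _standardize_column(s: pd.Series, treat_one_as_proportion: bool = True,
                        round_digits: int = 2) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    # Values strictly between 0 and 1 are always proportions; the value 1
    # itself is ambiguous and is resolved by treat_one_as_proportion.
    if treat_one_as_proportion:
        mask = (s > 0) & (s <= 1)
    else:
        mask = (s > 0) & (s < 1)
    return s.mask(mask, s * 100).round(round_digits)
```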