dragon-ml-toolbox 2.4.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/METADATA +7 -4
- dragon_ml_toolbox-3.0.0.dist-info/RECORD +25 -0
- ml_tools/ETL_engineering.py +8 -7
- ml_tools/GUI_tools.py +24 -25
- ml_tools/MICE_imputation.py +8 -4
- ml_tools/ML_callbacks.py +341 -0
- ml_tools/ML_evaluation.py +255 -0
- ml_tools/ML_trainer.py +344 -0
- ml_tools/ML_tutorial.py +300 -0
- ml_tools/PSO_optimization.py +27 -20
- ml_tools/RNN_forecast.py +49 -0
- ml_tools/VIF_factor.py +6 -5
- ml_tools/datasetmaster.py +601 -527
- ml_tools/ensemble_learning.py +12 -9
- ml_tools/handle_excel.py +9 -10
- ml_tools/logger.py +45 -8
- ml_tools/utilities.py +18 -1
- dragon_ml_toolbox-2.4.0.dist-info/RECORD +0 -22
- ml_tools/trainer.py +0 -346
- ml_tools/vision_helpers.py +0 -231
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.4.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/top_level.txt +0 -0
- /ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0
ml_tools/ML_tutorial.py
ADDED
```python
import json
from typing import Literal, Optional, Union
from pathlib import Path
from .logger import _LOGGER
from .utilities import make_fullpath, sanitize_filename


__all__ = [
    "generate_notebook"
]

def _get_notebook_content(kind: str):
    """Helper function to generate the cell content for the notebook."""

    # --- Common Cells ---
    imports_cell = {
        "cell_type": "code",
        "source": [
            "import torch\n",
            "from torch import nn\n",
            "from torch.utils.data import TensorDataset, DataLoader\n",
            "import numpy as np\n",
            "from pathlib import Path\n",
            "\n",
            "# Import from dragon_ml_toolbox\n",
            "from ml_tools.ML_trainer import MyTrainer\n",
            "from ml_tools.ML_callbacks import EarlyStopping, ModelCheckpoint\n",
            "from ml_tools.utilities import LogKeys"
        ]
    }

    device_cell = {
        "cell_type": "code",
        "source": [
            "import torch\n",
            "if torch.cuda.is_available():\n",
            "    device = 'cuda'\n",
            "elif torch.backends.mps.is_available():\n",
            "    device = 'mps'\n",
            "else:\n",
            "    device = 'cpu'\n",
            "\n",
            "print(f'Using device: {device}')"
        ]
    }

    model_definition_cell = {
        "cell_type": "markdown",
        "source": [
            "### 3. Define the Model, Criterion, and Optimizer\n",
            "Next, we define a simple neural network for our task. We also need to choose a loss function (`criterion`) and an `optimizer`."
        ]
    }

    callbacks_cell = {
        "cell_type": "code",
        "source": [
            "# Define callbacks for training\n",
            "model_filepath = 'best_model.pth'\n",
            "monitor_metric = LogKeys.VAL_LOSS\n",
            "\n",
            "model_checkpoint = ModelCheckpoint(\n",
            "    filepath=model_filepath, \n",
            "    save_best_only=True, \n",
            "    monitor=monitor_metric, \n",
            "    mode='min'\n",
            ")\n",
            "\n",
            "early_stopping = EarlyStopping(\n",
            "    patience=10, \n",
            "    monitor=monitor_metric, \n",
            "    mode='min'\n",
            ")"
        ]
    }

    trainer_instantiation_cell = {
        "cell_type": "code",
        "source": [
            "trainer = MyTrainer(\n",
            "    model=model,\n",
            "    train_dataset=train_dataset,\n",
            "    test_dataset=test_dataset,\n",
            f"    kind='{kind}',\n",
            "    criterion=criterion,\n",
            "    optimizer=optimizer,\n",
            "    device=device,\n",
            "    callbacks=[model_checkpoint, early_stopping]\n",
            ")"
        ]
    }

    fit_cell = {
        "cell_type": "code",
        "source": [
            "history = trainer.fit(epochs=100, batch_size=16)"
        ]
    }

    evaluation_cell = {
        "cell_type": "code",
        "source": [
            "save_dir = Path('tutorial_results')\n",
            "\n",
            "# The evaluate method will automatically use the test_loader.\n",
            "# First, we load the best weights saved by ModelCheckpoint.\n",
            "model_path = Path(model_filepath)\n",
            "if model_path.exists():\n",
            "    print(f'Loading best model from {model_path}')\n",
            "    trainer.model.load_state_dict(torch.load(model_path))\n",
            "\n",
            "print('\\n--- Evaluating Model ---')\n",
            "# All evaluation artifacts will be saved in the 'evaluation' subdirectory.\n",
            "trainer.evaluate(save_dir=save_dir / 'evaluation')"
        ]
    }

    explanation_cell = {
        "cell_type": "code",
        "source": [
            "print('\\n--- Explaining Model ---')\n",
            "# We can also generate SHAP plots to explain the model's predictions.\n",
            "# All SHAP artifacts will be saved in the 'explanation' subdirectory.\n",
            "trainer.explain(\n",
            "    background_loader=trainer.train_loader,\n",
            "    explain_loader=trainer.test_loader,\n",
            "    save_dir=save_dir / 'explanation'\n",
            ")"
        ]
    }


    # --- Task-Specific Cells ---
    if kind == 'classification':
        title = "Classification Tutorial"
        data_prep_source = [
            "### 2. Prepare the Data\n",
            "For this example, we'll generate some simple, linearly separable mock data for a binary classification task. We'll then wrap it in PyTorch `TensorDataset` objects."
        ]
        data_creation_source = [
            "from sklearn.datasets import make_classification\n",
            "from sklearn.model_selection import train_test_split\n",
            "\n",
            "X, y = make_classification(n_samples=200, n_features=10, n_informative=5, n_redundant=0, random_state=42)\n",
            "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
            "\n",
            "# Convert to PyTorch tensors\n",
            "X_train = torch.FloatTensor(X_train)\n",
            "y_train = torch.LongTensor(y_train)\n",
            "X_test = torch.FloatTensor(X_test)\n",
            "y_test = torch.LongTensor(y_test)\n",
            "\n",
            "# Create TensorDatasets\n",
            "train_dataset = TensorDataset(X_train, y_train)\n",
            "test_dataset = TensorDataset(X_test, y_test)"
        ]
        model_creation_source = [
            "class SimpleClassifier(nn.Module):\n",
            "    def __init__(self, input_features, num_classes):\n",
            "        super().__init__()\n",
            "        self.layer_1 = nn.Linear(input_features, 32)\n",
            "        self.layer_2 = nn.Linear(32, num_classes)\n",
            "        self.relu = nn.ReLU()\n",
            "    \n",
            "    def forward(self, x):\n",
            "        return self.layer_2(self.relu(self.layer_1(x)))\n",
            "\n",
            "model = SimpleClassifier(input_features=10, num_classes=2)\n",
            "criterion = nn.CrossEntropyLoss()\n",
            "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
        ]

    elif kind == 'regression':
        title = "Regression Tutorial"
        data_prep_source = [
            "### 2. Prepare the Data\n",
            "For this example, we'll generate some simple mock data for a regression task. We'll then wrap it in PyTorch `TensorDataset` objects."
        ]
        data_creation_source = [
            "from sklearn.datasets import make_regression\n",
            "from sklearn.model_selection import train_test_split\n",
            "\n",
            "X, y = make_regression(n_samples=200, n_features=5, noise=15, random_state=42)\n",
            "y = y.reshape(-1, 1) # Reshape for compatibility with MSELoss\n",
            "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
            "\n",
            "# Convert to PyTorch tensors\n",
            "X_train = torch.FloatTensor(X_train)\n",
            "y_train = torch.FloatTensor(y_train)\n",
            "X_test = torch.FloatTensor(X_test)\n",
            "y_test = torch.FloatTensor(y_test)\n",
            "\n",
            "# Create TensorDatasets\n",
            "train_dataset = TensorDataset(X_train, y_train)\n",
            "test_dataset = TensorDataset(X_test, y_test)"
        ]
        model_creation_source = [
            "class SimpleRegressor(nn.Module):\n",
            "    def __init__(self, input_features, output_features):\n",
            "        super().__init__()\n",
            "        self.layer_1 = nn.Linear(input_features, 32)\n",
            "        self.layer_2 = nn.Linear(32, output_features)\n",
            "        self.relu = nn.ReLU()\n",
            "    \n",
            "    def forward(self, x):\n",
            "        return self.layer_2(self.relu(self.layer_1(x)))\n",
            "\n",
            "model = SimpleRegressor(input_features=5, output_features=1)\n",
            "criterion = nn.MSELoss()\n",
            "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)"
        ]
    else:
        raise ValueError("kind must be 'classification' or 'regression'")

    # --- Assemble Notebook ---
    cells = [
        {"cell_type": "markdown", "source": [f"# Dragon ML Toolbox - {title}\n", "This notebook demonstrates how to use the `MyTrainer` class for a complete training and evaluation workflow."]},
        {"cell_type": "markdown", "source": ["### 1. Imports\n", "First, let's import all the necessary components."]},
        imports_cell,
        {"cell_type": "markdown", "source": data_prep_source},
        {"cell_type": "code", "source": data_creation_source},
        model_definition_cell,
        {"cell_type": "code", "source": model_creation_source},
        {"cell_type": "markdown", "source": ["### 4. Configure Callbacks\n", "We'll set up `ModelCheckpoint` to save the best model and `EarlyStopping` to prevent overfitting."]},
        callbacks_cell,
        {"cell_type": "markdown", "source": ["### 5. Initialize the Trainer\n", "First, we'll determine the best device to run on. Then, we can instantiate `MyTrainer` with all our components."]},
        device_cell,
        trainer_instantiation_cell,
        {"cell_type": "markdown", "source": ["### 6. Train the Model\n", "Call the `.fit()` method to start training."]},
        fit_cell,
        {"cell_type": "markdown", "source": ["### 7. Evaluate the Model\n", "Finally, call the `.evaluate()` method to see the performance report and save all plots and metrics."]},
        evaluation_cell,
        {"cell_type": "markdown", "source": ["### 8. Explain the Model\n", "We can also use the `.explain()` method to generate and save SHAP plots for model interpretability."]},
        explanation_cell,
    ]

    # Add execution counts to code cells
    for cell in cells:
        if cell['cell_type'] == 'code':
            cell['execution_count'] = None
            cell['metadata'] = {}
            cell['outputs'] = []

    return cells


def generate_notebook(kind: Literal['classification', 'regression'] = 'classification', filepath: Optional[Union[str, Path]] = None):
    """
    Generates a tutorial Jupyter Notebook (.ipynb) file.

    This function creates a complete, runnable notebook with mock data,
    a simple model, and a full training/evaluation cycle using MyTrainer.

    Args:
        kind (str): The type of tutorial to generate, either 'classification' or 'regression'.
        filepath (str | Path | None): The path to save the notebook file.
            If None, defaults to 'classification_tutorial.ipynb' or
            'regression_tutorial.ipynb' in the current directory.
    """
    if kind not in ["classification", "regression"]:
        raise ValueError("kind must be 'classification' or 'regression'")

    if filepath is None:
        sanitized_filepath = f"{kind}_tutorial.ipynb"
    else:
        sanitized_filepath = sanitize_filename(str(filepath))

    # check suffix
    if not sanitized_filepath.endswith(".ipynb"):
        sanitized_filepath = sanitized_filepath + ".ipynb"

    new_filepath = make_fullpath(sanitized_filepath, make=True)

    _LOGGER.info(f"Generating {kind} tutorial notebook at: {new_filepath}")

    cells = _get_notebook_content(kind)

    notebook = {
        "cells": cells,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "name": "python",
                "version": "3.10.0"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2
    }

    try:
        with open(new_filepath, 'w') as f:
            json.dump(notebook, f, indent=2)
        _LOGGER.info("Notebook generated successfully.")
    except Exception as e:
        _LOGGER.error(f"Error generating notebook: {e}")
```
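The new module exposes a single public function, `generate_notebook`. A minimal usage sketch, assuming `dragon-ml-toolbox` 3.0.0 is installed (the call signature is taken from the definition above):

```python
# Usage sketch for the new ML_tutorial module. The signature comes from
# the generate_notebook definition shown above; nothing else is assumed.
from ml_tools.ML_tutorial import generate_notebook

# Writes 'classification_tutorial.ipynb' to the current directory.
generate_notebook(kind='classification')

# Explicit task and output name; a missing '.ipynb' suffix is appended.
generate_notebook(kind='regression', filepath='regression_demo')
```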
ml_tools/PSO_optimization.py
CHANGED
```diff
@@ -7,20 +7,23 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import
+from .utilities import (
+    _script_info,
+    list_csv_paths,
+    threshold_binary_values,
+    threshold_binary_values_batch,
+    deserialize_object,
+    list_files_by_extension,
+    save_dataframe,
+    make_fullpath,
+    yield_dataframes_from_dir,
+    sanitize_filename)
 import torch
 from tqdm import trange
-import logging
 import matplotlib.pyplot as plt
 import seaborn as sns
 from collections import defaultdict
-
-# Configure logger
-logging.basicConfig(
-    level=logging.INFO,
-    format="[%(asctime)s] [%(levelname)s] - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S"
-)
+from .logger import _LOGGER
 
 
 __all__ = [
@@ -304,7 +307,7 @@ def run_pso(lower_boundaries: list[float],
     else:
         device = torch.device("cpu")
 
-
+    _LOGGER.info(f"Using device: '{device}'")
 
     # set local deep copies to prevent in place list modification
     local_lower_boundaries = deepcopy(lower_boundaries)
@@ -352,7 +355,7 @@ def run_pso(lower_boundaries: list[float],
     save_results_path = make_fullpath(save_results_dir, make=True)
     _save_results(features, target, save_dir=save_results_path, target_name=target_name)
 
-    return features, target
+    return features, target # type: ignore
 
 
 def _pso(func: ObjectiveFunction,
@@ -526,19 +529,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         If True, generates comparative plots with distributions colored by their source target.
     """
     mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
-
+    _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
 
+    # Check results_dir
+    results_path = make_fullpath(results_dir)
+    # make output path
     output_path = make_fullpath(save_dir, make=True)
-
+
+    all_csvs = list_csv_paths(results_path)
 
-    if not
-
+    if not all_csvs:
+        _LOGGER.warning("No data found. No plots will be generated.")
         return
 
     # --- MODE 1: Color-coded plots by target ---
     if color_by_target:
         data_to_plot = []
-        for df, df_name in
+        for df, df_name in yield_dataframes_from_dir(results_path):
             # Assumes last col is target, rest are features
             melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
             # Sanitize target name for cleaner legend labels
@@ -547,7 +554,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
 
         long_df = pd.concat(data_to_plot, ignore_index=True)
         features = long_df['feature'].unique()
-
+        _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
 
        for feature_name in features:
            plt.figure(figsize=(12, 7))
@@ -569,12 +576,12 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     # --- MODE 2: Aggregate plot ---
     else:
         feature_distributions = defaultdict(list)
-        for df, _ in
+        for df, _ in yield_dataframes_from_dir(results_path):
             feature_columns = df.iloc[:, :-1]
             for feature_name in feature_columns:
                 feature_distributions[feature_name].extend(df[feature_name].tolist())
 
-
+        _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
        for feature_name, values in feature_distributions.items():
            plt.figure(figsize=(12, 7))
            sns.histplot(x=values, kde=True, bins='auto', stat="density")
@@ -589,7 +596,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
         plt.savefig(plot_filename, bbox_inches='tight')
         plt.close()
 
-
+    _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")
 
 
 def info():
```
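The recurring theme of this release is visible here: each module's private `logging.basicConfig` setup is replaced by a shared `_LOGGER` imported from `ml_tools.logger`. That module's contents are not shown in this diff (the summary lists it as +45 −8), so the sketch below is only an assumption of what such a shared logger typically looks like, reusing the format string removed above:

```python
# Hypothetical sketch of the shared-logger pattern adopted in 3.0.0.
# ml_tools/logger.py is not shown in this diff, so the logger name and
# configuration below are assumptions, not the package's actual code.
import logging

_LOGGER = logging.getLogger("dragon_ml_toolbox")

if not _LOGGER.handlers:  # configure once, even when imported by many modules
    _handler = logging.StreamHandler()
    _handler.setFormatter(
        logging.Formatter("[%(asctime)s] [%(levelname)s] - %(message)s",
                          datefmt="%Y-%m-%d %H:%M:%S")
    )
    _LOGGER.addHandler(_handler)
    _LOGGER.setLevel(logging.INFO)
```

Centralizing the handler avoids the main pitfall of per-module `logging.basicConfig`: whichever module imports first wins, and the rest silently no-op.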
ml_tools/RNN_forecast.py
ADDED
```python
import torch
from torch import nn
import numpy as np

__all__ = [
    "rnn_forecast"
]

def rnn_forecast(model: nn.Module, start_sequence: torch.Tensor, steps: int, device: str = 'cpu'):
    """
    Runs a sequential forecast for a trained RNN-based model.

    This function iteratively predicts future time steps, where each new prediction
    is generated by feeding the previous prediction back into the model.

    Args:
        model (nn.Module): The trained PyTorch RNN model (e.g., LSTM, GRU).
        start_sequence (torch.Tensor): The initial sequence to start the forecast from.
            Shape should be (sequence_length, num_features).
        steps (int): The number of future time steps to predict.
        device (str, optional): The device to run the forecast on ('cpu', 'cuda', 'mps').
            Defaults to 'cpu'.

    Returns:
        np.ndarray: A numpy array containing the forecasted values.
    """
    model.eval()
    model.to(device)

    predictions = []
    current_sequence = start_sequence.to(device)

    with torch.no_grad():
        for _ in range(steps):
            # Get the model's prediction for the current sequence
            output = model(current_sequence.unsqueeze(0))  # Add batch dimension

            # The prediction is the last element of the output sequence
            next_pred = output[0, -1, :].view(1, -1)

            # Store the prediction
            predictions.append(next_pred.cpu().numpy())

            # Update the sequence for the next iteration:
            # Drop the first element and append the new prediction
            current_sequence = torch.cat([current_sequence[1:], next_pred], dim=0)

    # Concatenate all predictions and flatten the array for easy use
    return np.concatenate(predictions).flatten()
```
ml_tools/VIF_factor.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from statsmodels.tools.tools import add_constant
 import warnings
 from pathlib import Path
 from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath
+from .logger import _LOGGER
 
 
 __all__ = [
@@ -54,20 +55,20 @@ def compute_vif(
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
         if missing_features and verbose:
-
+            _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
                 if verbose:
-
+                    _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)
 
     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
         if missing_ignore and verbose:
-
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
 
     X = df[sanitized_columns].copy()
@@ -167,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
 
     # Identify features to drop
    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
 
     result_df = df.drop(columns=to_drop)
 
     if result_df.empty:
-
+        _LOGGER.warning(f"\t⚠️ All columns were dropped.")
 
     return result_df, to_drop
 
```
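A usage sketch tying the two public functions together. The exact signature of `compute_vif` is not fully visible in this diff, so the call below is an assumption; what is grounded in the hunks above is that `drop_vif_based` consumes a DataFrame with `feature` and `VIF` columns and returns the reduced frame plus the dropped column names:

```python
import numpy as np
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

# Toy frame with a deliberately collinear column pair.
rng = np.random.default_rng(42)
df = pd.DataFrame({'a': rng.normal(size=100), 'c': rng.normal(size=100)})
df['b'] = df['a'] * 2 + rng.normal(scale=0.01, size=100)  # nearly collinear with 'a'

# Assumption: compute_vif accepts the frame directly and returns the
# 'feature'/'VIF' table that drop_vif_based indexes above.
vif_df = compute_vif(df, verbose=True)
reduced_df, dropped = drop_vif_based(df, vif_df, threshold=10)
print(dropped)  # expect the collinear column(s) to be flagged
```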