dragon-ml-toolbox 2.4.0__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff compares the contents of two package versions as published to their public registry. It is provided for informational purposes only.

Potentially problematic release.

@@ -7,20 +7,23 @@ from sklearn.base import ClassifierMixin
  from typing import Literal, Union, Tuple, Dict, Optional
  import pandas as pd
  from copy import deepcopy
- from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath, yield_dataframes_from_dir, sanitize_filename
+ from .utilities import (
+     _script_info,
+     list_csv_paths,
+     threshold_binary_values,
+     threshold_binary_values_batch,
+     deserialize_object,
+     list_files_by_extension,
+     save_dataframe,
+     make_fullpath,
+     yield_dataframes_from_dir,
+     sanitize_filename)
  import torch
  from tqdm import trange
- import logging
  import matplotlib.pyplot as plt
  import seaborn as sns
  from collections import defaultdict
-
- # Configure logger
- logging.basicConfig(
-     level=logging.INFO,
-     format="[%(asctime)s] [%(levelname)s] - %(message)s",
-     datefmt="%Y-%m-%d %H:%M:%S"
- )
+ from .logger import _LOGGER


  __all__ = [
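
The per-module logging.basicConfig(...) block removed above is replaced across the package by a shared _LOGGER imported from a new .logger module. That module's source is not included in this diff; the following is a minimal sketch of what it plausibly contains, reconstructed from the removed configuration (the logger name and handler choice are assumptions):

    # Hypothetical ml_tools/logger.py -- illustration only, not shown in this diff
    import logging

    _LOGGER = logging.getLogger("dragon_ml_toolbox")
    _LOGGER.setLevel(logging.INFO)

    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter(
        fmt="[%(asctime)s] [%(levelname)s] - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    ))
    _LOGGER.addHandler(_handler)

A named logger is also the more library-friendly design: logging.basicConfig mutates the root logger as an import side effect, affecting every consumer of the package.
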
@@ -304,7 +307,7 @@ def run_pso(lower_boundaries: list[float],
      else:
          device = torch.device("cpu")

-     logging.info(f"Using device: '{device}'")
+     _LOGGER.info(f"Using device: '{device}'")

      # set local deep copies to prevent in place list modification
      local_lower_boundaries = deepcopy(lower_boundaries)
@@ -352,7 +355,7 @@ def run_pso(lower_boundaries: list[float],
          save_results_path = make_fullpath(save_results_dir, make=True)
          _save_results(features, target, save_dir=save_results_path, target_name=target_name)

-     return features, target
+     return features, target # type: ignore


  def _pso(func: ObjectiveFunction,
@@ -526,19 +529,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
              If True, generates comparative plots with distributions colored by their source target.
      """
      mode = "Comparative (color-coded)" if color_by_target else "Aggregate"
-     logging.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")
+     _LOGGER.info(f"Starting analysis in '{mode}' mode from results in: '{results_dir}'")

+     # Check results_dir
+     results_path = make_fullpath(results_dir)
+     # make output path
      output_path = make_fullpath(save_dir, make=True)
-     all_files = list(yield_dataframes_from_dir(results_dir))
+
+     all_csvs = list_csv_paths(results_path)

-     if not all_files:
-         logging.warning("No data found. No plots will be generated.")
+     if not all_csvs:
+         _LOGGER.warning("No data found. No plots will be generated.")
          return

      # --- MODE 1: Color-coded plots by target ---
      if color_by_target:
          data_to_plot = []
-         for df, df_name in all_files:
+         for df, df_name in yield_dataframes_from_dir(results_path):
              # Assumes last col is target, rest are features
              melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
              # Sanitize target name for cleaner legend labels
@@ -547,7 +554,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:

          long_df = pd.concat(data_to_plot, ignore_index=True)
          features = long_df['feature'].unique()
-         logging.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
+         _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")

          for feature_name in features:
              plt.figure(figsize=(12, 7))
@@ -569,12 +576,12 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
      # --- MODE 2: Aggregate plot ---
      else:
          feature_distributions = defaultdict(list)
-         for df, _ in all_files:
+         for df, _ in yield_dataframes_from_dir(results_path):
              feature_columns = df.iloc[:, :-1]
              for feature_name in feature_columns:
                  feature_distributions[feature_name].extend(df[feature_name].tolist())

-         logging.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
+         _LOGGER.info(f"Found data for {len(feature_distributions)} features. Generating plots...")
          for feature_name, values in feature_distributions.items():
              plt.figure(figsize=(12, 7))
              sns.histplot(x=values, kde=True, bins='auto', stat="density")
@@ -589,7 +596,7 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
              plt.savefig(plot_filename, bbox_inches='tight')
              plt.close()

-     logging.info(f"✅ All plots saved successfully to: {output_path}")
+     _LOGGER.info(f"✅ All plots saved successfully to: '{output_path}'")


  def info():
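
As the changes above show, plot_optimal_feature_distributions now resolves results_dir through make_fullpath and checks for CSVs with list_csv_paths before plotting in either comparative or aggregate mode. A usage sketch (directory names are placeholders, and the function's import path is not visible in this diff):

    # Illustrative call; import plot_optimal_feature_distributions from
    # wherever it is defined in the installed package.
    plot_optimal_feature_distributions(
        results_dir="pso_results",  # directory of per-target result CSVs
        save_dir="pso_plots",       # output directory, created if missing
        color_by_target=True,       # comparative mode; False aggregates all targets
    )
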
@@ -0,0 +1,49 @@
+ import torch
+ from torch import nn
+ import numpy as np
+
+ __all__ = [
+     "rnn_forecast"
+ ]
+
+ def rnn_forecast(model: nn.Module, start_sequence: torch.Tensor, steps: int, device: str = 'cpu'):
+     """
+     Runs a sequential forecast for a trained RNN-based model.
+
+     This function iteratively predicts future time steps, where each new prediction
+     is generated by feeding the previous prediction back into the model.
+
+     Args:
+         model (nn.Module): The trained PyTorch RNN model (e.g., LSTM, GRU).
+         start_sequence (torch.Tensor): The initial sequence to start the forecast from.
+             Shape should be (sequence_length, num_features).
+         steps (int): The number of future time steps to predict.
+         device (str, optional): The device to run the forecast on ('cpu', 'cuda', 'mps').
+             Defaults to 'cpu'.
+
+     Returns:
+         np.ndarray: A numpy array containing the forecasted values.
+     """
+     model.eval()
+     model.to(device)
+
+     predictions = []
+     current_sequence = start_sequence.to(device)
+
+     with torch.no_grad():
+         for _ in range(steps):
+             # Get the model's prediction for the current sequence
+             output = model(current_sequence.unsqueeze(0)) # Add batch dimension
+
+             # The prediction is the last element of the output sequence
+             next_pred = output[0, -1, :].view(1, -1)
+
+             # Store the prediction
+             predictions.append(next_pred.cpu().numpy())
+
+             # Update the sequence for the next iteration:
+             # Drop the first element and append the new prediction
+             current_sequence = torch.cat([current_sequence[1:], next_pred], dim=0)
+
+     # Concatenate all predictions and flatten the array for easy use
+     return np.concatenate(predictions).flatten()
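
A usage sketch for the new rnn_forecast helper, using a toy single-feature LSTM (the model class, shapes, and commented import path are illustrative assumptions, not from the package):

    import torch
    from torch import nn
    # from ml_tools.RNN_forecast import rnn_forecast  # module path assumed

    class TinyLSTM(nn.Module):
        """Toy model mapping (batch, seq_len, 1) -> (batch, seq_len, 1)."""
        def __init__(self, hidden: int = 16):
            super().__init__()
            self.lstm = nn.LSTM(input_size=1, hidden_size=hidden, batch_first=True)
            self.head = nn.Linear(hidden, 1)

        def forward(self, x):
            out, _ = self.lstm(x)  # (batch, seq_len, hidden)
            return self.head(out)  # (batch, seq_len, 1)

    model = TinyLSTM()         # in practice, a trained model
    seed = torch.randn(24, 1)  # (sequence_length=24, num_features=1)
    forecast = rnn_forecast(model, start_sequence=seed, steps=12)
    print(forecast.shape)      # (12,) -- one value per forecasted step

Note that the feedback loop requires the model's output feature count to match its input feature count, since each prediction is concatenated onto the input window.
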
ml_tools/VIF_factor.py CHANGED
@@ -8,6 +8,7 @@ from statsmodels.tools.tools import add_constant
  import warnings
  from pathlib import Path
  from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath
+ from .logger import _LOGGER


  __all__ = [
@@ -54,20 +55,20 @@ def compute_vif(
          sanitized_columns = df.select_dtypes(include='number').columns.tolist()
          missing_features = set(ground_truth_cols) - set(sanitized_columns)
          if missing_features and verbose:
-             print(f"⚠️ These columns are not Numeric:\n{missing_features}")
+             _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
      else:
          sanitized_columns = list()
          for feature in use_columns:
              if feature not in ground_truth_cols:
                  if verbose:
-                     print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+                     _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
              else:
                  sanitized_columns.append(feature)

      if ignore_columns is not None and use_columns is None:
          missing_ignore = set(ignore_columns) - set(ground_truth_cols)
          if missing_ignore and verbose:
-             print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+             _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
          sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

      X = df[sanitized_columns].copy()
@@ -167,12 +168,12 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10

      # Identify features to drop
      to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-     print(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+     _LOGGER.info(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")

      result_df = df.drop(columns=to_drop)

      if result_df.empty:
-         print(f"\t⚠️ Warning: All columns were dropped.")
+         _LOGGER.warning(f"\t⚠️ All columns were dropped.")

      return result_df, to_drop

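For context, the two functions touched here are typically chained: compute_vif returns a DataFrame with 'feature' and 'VIF' columns, which drop_vif_based uses to prune collinear features. A sketch under the assumption that compute_vif accepts the DataFrame as its first argument with the remaining parameters optional (toy data):

    import pandas as pd
    from ml_tools.VIF_factor import compute_vif, drop_vif_based

    df = pd.DataFrame({
        "a": [1.0, 2.0, 3.0, 4.0, 5.0],
        "b": [2.1, 3.9, 6.2, 8.1, 9.9],  # nearly collinear with "a"
        "c": [0.5, 1.5, 0.9, 1.1, 0.7],
    })
    vif_df = compute_vif(df)                          # per-feature VIF scores
    reduced_df, dropped = drop_vif_based(df, vif_df)  # drops features with VIF > 10
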
@@ -587,14 +587,14 @@ def standardize_percentages(
      Standardizes numeric columns containing mixed-format percentages.

      This function cleans columns where percentages might be entered as whole
-     numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
+     numbers (55) and as proportions (0.55). It assumes values
      between 0 and 1 are proportions and multiplies them by 100.

      Args:
          df (pd.Dataframe): The input pandas DataFrame.
          columns (list[str]): A list of column names to standardize.
          treat_one_as_proportion (bool):
-             - If True (default): The value `1` is treated as a proportion and converted to `100`.
+             - If True (default): The value `1` is treated as a proportion and converted to `100%`.
              - If False: The value `1` is treated as `1%`.
          round_digits (int): The number of decimal places to round the final result to.
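
A worked example of the behavior this docstring describes, assuming standardize_percentages is importable from its defining module (not named in this diff):

    import pandas as pd

    df = pd.DataFrame({"score": [55, 0.55, 1, 87.5]})
    # With treat_one_as_proportion=True (the default), values in (0, 1] are
    # read as proportions and scaled by 100:
    #   55 -> 55.0, 0.55 -> 55.0, 1 -> 100.0, 87.5 -> 87.5
    result = standardize_percentages(df, columns=["score"], treat_one_as_proportion=True)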