dragon-ml-toolbox 6.4.1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

@@ -1,68 +1,30 @@
  import torch
  from torch.utils.data import Dataset, Subset
- from torch import nn
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
  from typing import Literal, Union, Tuple, List, Optional
- from imblearn.combine import SMOTETomek
  from abc import ABC, abstractmethod
  from PIL import Image, ImageOps
  from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
  from pathlib import Path
- from .path_manager import make_fullpath
+ from .path_manager import make_fullpath, sanitize_filename
  from ._logger import _LOGGER
  from ._script_info import _script_info
  from .custom_logger import save_list_strings
+ from .ML_scaler import PytorchScaler
 
-
- # --- public-facing API ---
  __all__ = [
      "DatasetMaker",
-     "SimpleDatasetMaker",
+     "DatasetMakerMulti",
      "VisionDatasetMaker",
      "SequenceMaker",
      "ResizeAspectFill",
  ]
 
 
- # --- Custom Vision Transform Class ---
- class ResizeAspectFill:
-     """
-     Custom transformation to make an image square by padding it to match the
-     longest side, preserving the aspect ratio. The image is finally centered.
-
-     Args:
-         pad_color (Union[str, int]): Color to use for the padding.
-             Defaults to "black".
-     """
-     def __init__(self, pad_color: Union[str, int] = "black") -> None:
-         self.pad_color = pad_color
-
-     def __call__(self, image: Image.Image) -> Image.Image:
-         if not isinstance(image, Image.Image):
-             raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
-
-         w, h = image.size
-         if w == h:
-             return image
-
-         # Determine padding to center the image
-         if w > h:
-             top_padding = (w - h) // 2
-             bottom_padding = w - h - top_padding
-             padding = (0, top_padding, 0, bottom_padding)
-         else:  # h > w
-             left_padding = (h - w) // 2
-             right_padding = h - w - left_padding
-             padding = (left_padding, 0, right_padding, 0)
-
-         return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
  # --- Internal Helper Class ---
  class _PytorchDataset(Dataset):
      """
@@ -71,8 +33,13 @@ class _PytorchDataset(Dataset):
      """
      def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
                   labels: Union[numpy.ndarray, pandas.Series],
-                  features_dtype: torch.dtype = torch.float32,
-                  labels_dtype: torch.dtype = torch.int64):
+                  labels_dtype: torch.dtype,
+                  features_dtype: torch.dtype = torch.float32):
+         """
+         Use integer labels for classification and float labels for regression.
+         """
 
          if isinstance(features, numpy.ndarray):
              self.features = torch.tensor(features, dtype=features_dtype)
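
Note the reordered signature: `labels_dtype` is now a required positional argument. A minimal sketch of the dtype convention the docstring describes, using hypothetical toy arrays and assuming the module's private `_PytorchDataset` is in scope:

```python
import numpy
import torch

X = numpy.random.rand(8, 3)  # hypothetical feature matrix

# Classification: integer class indices (e.g. for nn.CrossEntropyLoss).
clf_ds = _PytorchDataset(X, numpy.array([0, 1, 1, 0, 2, 1, 0, 2]), labels_dtype=torch.int64)

# Regression: float targets (e.g. for nn.MSELoss).
reg_ds = _PytorchDataset(X, numpy.random.rand(8), labels_dtype=torch.float32)
```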
@@ -91,417 +58,247 @@ class _PytorchDataset(Dataset):
          return self.features[index], self.labels[index]
 
 
- # --- Private Base Class ---
- class _BaseMaker(ABC):
+ # --- Abstract Base Class ---
+ class _BaseDatasetMaker(ABC):
      """
-     Abstract Base Class for all dataset makers.
-     Ensures a consistent API across the library.
+     Abstract base class for dataset makers. Contains shared logic for
+     splitting, scaling, and accessing datasets to reduce code duplication.
      """
      def __init__(self):
-         self._train_dataset = None
-         self._test_dataset = None
-         self._val_dataset = None
-
-     @abstractmethod
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """
-         The primary method to retrieve the final, processed PyTorch datasets.
-         Must be implemented by all subclasses.
-         """
-         pass
-
-
- # --- Refactored DatasetMaker ---
- class DatasetMaker(_BaseMaker):
-     """
-     Creates processed PyTorch datasets from a Pandas DataFrame using a fluent, step-by-step interface.
-
-     Recommended pipeline:
-
-     - Full Control (step-by-step):
-         1. Process categorical features `.process_categoricals()`
-         2. Split train-test datasets `.split_data()`
-         3. Normalize continuous features `.normalize_continuous()`; `.denormalize()` becomes available.
-         4. [Optional][Classification only] Balance classes `.balance_data()`
-         5. Get PyTorch datasets: `train, test = .get_datasets()`
-         6. [Optional] Inspect the processed data as DataFrames `X_train, X_test, y_train, y_test = .inspect_dataframes()`
-
-     - Automated (single call):
-     ```python
-     maker = DatasetMaker(df, label_col='target')
-     maker.auto_process()  # uses simplified arguments
-     train_ds, test_ds = maker.get_datasets()
-     ```
-     """
-     def __init__(self, pandas_df: pandas.DataFrame, label_col: str, kind: Literal["regression", "classification"]):
-         super().__init__()
-         if not isinstance(pandas_df, pandas.DataFrame):
-             raise TypeError("Input must be a pandas.DataFrame.")
-         if label_col not in pandas_df.columns:
-             raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
-
-         self.kind = kind
-         self.labels = pandas_df[label_col]
-         self.features = pandas_df.drop(columns=label_col)
-         self.labels_map = None
-         self.scaler = None
-
-         self._feature_names = self.features.columns.tolist()
-         self._target_name = str(self.labels.name)
-
-         self._is_split = False
-         self._is_balanced = False
-         self._is_normalized = False
-         self._is_categoricals_processed = False
-
-         self.features_train = None
-         self.features_test = None
-         self.labels_train = None
-         self.labels_test = None
-
-         self.continuous_columns = None
-
-     def process_categoricals(self, method: Literal["one-hot", "embed"] = "one-hot",
-                              cat_features: Union[list[str], None] = None, **kwargs) -> 'DatasetMaker':
-         """
-         Encodes categorical features using the specified method.
-
-         Args:
-             method (str, optional): 'one-hot' (default) or 'embed'.
-             cat_features (list, optional): A list of categorical column names.
-                 If None, they will be inferred from the DataFrame's dtypes.
-             **kwargs: Additional keyword arguments to pass to the underlying
-                 pandas.get_dummies() or torch.nn.Embedding() functions.
-                 For 'one-hot' encoding, it is often recommended to add
-                 `drop_first=True` to help reduce multicollinearity.
-         """
-         if self._is_split:
-             raise RuntimeError("Categoricals must be processed before splitting data to avoid data leakage.")
-
-         if cat_features is None:
-             cat_columns = self.features.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
-         else:
-             cat_columns = cat_features
-
-         if not cat_columns:
-             _LOGGER.info("No categorical features to process.")
-             self._is_categoricals_processed = True
-             return self
-
-         continuous_df = self.features.drop(columns=cat_columns)
-         # store continuous column names
-         self.continuous_columns = continuous_df.columns.tolist()
-
-         categorical_df = self.features[cat_columns].copy()
-
-         if method == "one-hot":
-             processed_cats = pandas.get_dummies(categorical_df, dtype=numpy.int32, **kwargs)
-         elif method == "embed":
-             processed_cats = self._embed_categorical(categorical_df, **kwargs)
-         else:
-             raise ValueError("`method` must be 'one-hot' or 'embed'.")
-
-         self.features = pandas.concat([continuous_df, processed_cats], axis=1)
-         self._is_categoricals_processed = True
-         _LOGGER.info("Categorical features processed.")
-         return self
-
-     def normalize_continuous(self, method: Literal["standard", "minmax"] = "standard") -> 'DatasetMaker':
-         """Normalizes all numeric features and saves the scaler."""
-         if not self._is_split:
-             raise RuntimeError("Continuous features must be normalized AFTER splitting data. Call .split_data() first.")
-         if self._is_normalized:
-             _LOGGER.warning("⚠️ Data has already been normalized.")
-             return self
+         self._train_ds: Optional[Dataset] = None
+         self._test_ds: Optional[Dataset] = None
+         self.scaler: Optional[PytorchScaler] = None
+         self._id: Optional[str] = None
+         self._feature_names: List[str] = []
+         self._X_train_shape = (0, 0)
+         self._X_test_shape = (0, 0)
+         self._y_train_shape = (0,)
+         self._y_test_shape = (0,)
+
+     def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
+         """Internal helper to fit and apply a PytorchScaler."""
+         continuous_feature_indices: Optional[List[int]] = None
+         if continuous_feature_columns:
+             if all(isinstance(c, str) for c in continuous_feature_columns):
+                 name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
+                 try:
+                     continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns]  # type: ignore
+                 except KeyError as e:
+                     raise ValueError(f"Feature column '{e.args[0]}' not found.")
+             elif all(isinstance(c, int) for c in continuous_feature_columns):
+                 continuous_feature_indices = continuous_feature_columns  # type: ignore
+             else:
+                 raise TypeError("`continuous_feature_columns` must be a list of all strings or all integers.")
+
+         X_train_values = X_train.values
+         X_test_values = X_test.values
+
+         if self.scaler is None and continuous_feature_indices:
+             _LOGGER.info("Fitting a new PytorchScaler on training data.")
+             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype)  # type: ignore
+             self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
+
+         if self.scaler and self.scaler.mean_ is not None:
+             _LOGGER.info("Applying scaler transformation to train and test feature sets.")
+             X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
+             X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
+             return X_train_tensor.numpy(), X_test_tensor.numpy()
+
+         return X_train_values, X_test_values
 
-         # Use continuous features columns
-         self.scaler_columns = self.continuous_columns
-         if not self.scaler_columns:
-             _LOGGER.info("No continuous features to normalize.")
-             self._is_normalized = True
-             return self
+     @property
+     def train_dataset(self) -> Dataset:
+         if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
+         return self._train_ds
 
-         if method == "standard":
-             self.scaler = StandardScaler()
-         elif method == "minmax":
-             self.scaler = MinMaxScaler()
-         else:
-             raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
+     @property
+     def test_dataset(self) -> Dataset:
+         if self._test_ds is None: raise RuntimeError("Dataset not yet created.")
+         return self._test_ds
 
-         # Fit on training data only, then transform both
-         self.features_train[self.scaler_columns] = self.scaler.fit_transform(self.features_train[self.scaler_columns])  # type: ignore
-         self.features_test[self.scaler_columns] = self.scaler.transform(self.features_test[self.scaler_columns])  # type: ignore
-
-         self._is_normalized = True
-         _LOGGER.info(f"Continuous features normalized using {self.scaler.__class__.__name__}. Scaler stored in `self.scaler`.")
-         return self
+     @property
+     def feature_names(self) -> list[str]:
+         return self._feature_names
 
-     def split_data(self, test_size: float = 0.2, stratify: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
-         """Splits the data into training and testing sets."""
-         if self._is_split:
-             _LOGGER.warning("⚠️ Data has already been split.")
-             return self
+     @property
+     def id(self) -> Optional[str]:
+         return self._id
 
-         if self.labels.dtype == 'object' or self.labels.dtype.name == 'category':
-             labels_numeric = self.labels.astype("category")
-             self.labels_map = {code: val for code, val in enumerate(labels_numeric.cat.categories)}
-             self.labels = pandas.Series(labels_numeric.cat.codes, index=self.labels.index)
-             _LOGGER.info("Labels have been encoded. Mapping stored in `self.labels_map`.")
+     @id.setter
+     def id(self, dataset_id: str):
+         if not isinstance(dataset_id, str): raise ValueError("ID must be a string.")
+         self._id = dataset_id
 
-         stratify_array = self.labels if stratify else None
-
-         self.features_train, self.features_test, self.labels_train, self.labels_test = train_test_split(
-             self.features, self.labels, test_size=test_size, random_state=random_state, stratify=stratify_array
-         )
-
-         self._is_split = True
-         _LOGGER.info(f"Data split into training ({len(self.features_train)} samples) and testing ({len(self.features_test)} samples).")
-         return self
+     def dataframes_info(self) -> None:
+         print("--- DataFrame Shapes After Split ---")
+         print(f"  X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
+         print(f"  X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
+         print("------------------------------------")
+
+     def save_feature_names(self, directory: Union[str, Path], verbose: bool = True) -> None:
+         """Saves the list of feature names as a text file."""
+         save_list_strings(list_strings=self._feature_names,
+                           directory=directory,
+                           filename="feature_names",
+                           verbose=verbose)
 
-     def balance_data(self, resampler=None, **kwargs) -> 'DatasetMaker':
-         """
-         Only useful for classification tasks.
-
-         Balances the training data using a specified resampler.
-
-         Defaults to `SMOTETomek`.
+     def save_scaler(self, save_dir: Union[str, Path]):
          """
-         if not self._is_split:
-             raise RuntimeError("❌ Cannot balance data before it has been split. Call .split_data() first.")
-         if self._is_balanced:
-             _LOGGER.warning("⚠️ Training data has already been balanced.")
-             return self
+         Saves the fitted PytorchScaler's state to a .pth file.
 
-         if resampler is None:
-             resampler = SMOTETomek(**kwargs)
-
-         _LOGGER.info(f"Balancing training data with {resampler.__class__.__name__}...")
-         self.features_train, self.labels_train = resampler.fit_resample(self.features_train, self.labels_train)  # type: ignore
+         The filename is automatically generated based on the dataset id.
 
-         self._is_balanced = True
-         _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
-         return self
-
-     def auto_process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
-                      balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
-         """Runs a standard, fully automated preprocessing pipeline."""
-         _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
-         self.process_categoricals(method=cat_method)
-         self.split_data(test_size=test_size, stratify=True, random_state=random_state)
-         self.normalize_continuous(method=normalize_method)
-         if balance:
-             self.balance_data()
-         _LOGGER.info("--- 🤖 Automated Processing Complete ---")
-         return self
-
-     def denormalize(self, data: Union[torch.Tensor, numpy.ndarray, pandas.DataFrame]) -> Union[numpy.ndarray, pandas.DataFrame]:
+         Args:
+             save_dir (str | Path): The directory where the scaler will be saved.
          """
-         Applies inverse transformation to denormalize data, preserving DataFrame
-         structure if provided.
+         if not self.scaler: raise RuntimeError("No scaler was fitted or provided.")
+         if not self.id: raise ValueError("Must set the `id` before saving scaler.")
+         save_path = make_fullpath(save_dir, make=True, enforce="directory")
+         sanitized_id = sanitize_filename(self.id)
+         filename = f"scaler_{sanitized_id}.pth"
+         filepath = save_path / filename
+         self.scaler.save(filepath)
+         _LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
+
+
+ # Single-target dataset
+ class DatasetMaker(_BaseDatasetMaker):
+     """
+     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
+     This class takes a DataFrame, automatically splits it into training and
+     testing sets, and converts them into PyTorch Datasets. It assumes the
+     target variable is the last column. It can also create, apply, and
+     save a PytorchScaler for standardizing continuous features.
+
+     Attributes:
+         `scaler` -> PytorchScaler | None
+         `train_dataset` -> PyTorch Dataset
+         `test_dataset` -> PyTorch Dataset
+         `feature_names` -> list[str]
+         `target_name` -> str
+         `id` -> str
+
+     The ID can be manually set to any string if needed; it is the target name by default.
+     """
+     def __init__(self,
+                  pandas_df: pandas.DataFrame,
+                  kind: Literal["regression", "classification"],
+                  test_size: float = 0.2,
+                  random_state: int = 42,
+                  scaler: Optional[PytorchScaler] = None,
+                  continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+         """
          Args:
-             data: The normalized data to be transformed back to its original scale.
-                 Can be a PyTorch Tensor, NumPy array, or Pandas DataFrame.
-                 If a DataFrame, it must contain the columns that were originally scaled.
-
-         Returns:
-             The denormalized data. Returns a Pandas DataFrame if the input was a
-             DataFrame, otherwise returns a NumPy array.
+             pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+             kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
+             test_size (float): The proportion of the dataset to allocate to the test split.
+             random_state (int): The seed for the random number generator for reproducibility.
+             scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+             continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided, creates a new PytorchScaler.
          """
-         if self.scaler is None:
-             raise RuntimeError("Data was not normalized. Cannot denormalize.")
-
-         if isinstance(data, pandas.DataFrame):
-             # If input is a DataFrame, denormalize in place and return a copy
-             if not all(col in data.columns for col in self.scaler_columns):  # type: ignore
-                 raise ValueError(f"Input DataFrame is missing one or more required columns for denormalization. Required: {self.scaler_columns}")
-
-             output_df = data.copy()
-             output_df[self.scaler_columns] = self.scaler.inverse_transform(data[self.scaler_columns])  # type: ignore
-             return output_df
+         super().__init__()
+         self.scaler = scaler
 
-         # Handle tensor or numpy array input
-         if isinstance(data, torch.Tensor):
-             data_np = data.cpu().numpy()
-         else:  # It's already a numpy array
-             data_np = data
-
-         if data_np.ndim == 1:
-             data_np = data_np.reshape(-1, 1)
-
-         if data_np.shape[1] != len(self.scaler_columns):  # type: ignore
-             raise ValueError(f"Input array has {data_np.shape[1]} columns, but scaler was fitted on {len(self.scaler_columns)} columns.")  # type: ignore
-
-         return self.scaler.inverse_transform(data_np)
+         # --- 1. Identify features and target (single-target logic) ---
+         features = pandas_df.iloc[:, :-1]
+         target = pandas_df.iloc[:, -1]
+         self._feature_names = features.columns.tolist()
+         self._target_name = str(target.name)
+         self._id = self._target_name
 
-     def get_datasets(self) -> Tuple[Dataset, Dataset]:
-         """Primary method to get the final PyTorch Datasets."""
-         if not self._is_split:
-             raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
-
-         label_dtype = torch.float32 if self.kind == "regression" else torch.int64
+         # --- 2. Split ---
+         X_train, X_test, y_train, y_test = train_test_split(
+             features, target, test_size=test_size, random_state=random_state
+         )
+         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
+         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-         self._train_dataset = _PytorchDataset(self.features_train, self.labels_train, labels_dtype=label_dtype)  # type: ignore
-         self._test_dataset = _PytorchDataset(self.features_test, self.labels_test, labels_dtype=label_dtype)  # type: ignore
+         label_dtype = torch.float32 if kind == "regression" else torch.int64
+
+         # --- 3. Scale ---
+         X_train_final, X_test_final = self._prepare_scaler(
+             X_train, y_train, X_test, label_dtype, continuous_feature_columns
+         )
 
-         return self._train_dataset, self._test_dataset
-
-     def inspect_dataframes(self) -> Tuple[pandas.DataFrame, pandas.DataFrame, pandas.Series, pandas.Series]:
-         """Utility method to inspect the processed data as Pandas DataFrames."""
-         if not self._is_split:
-             raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")
-         return self.features_train, self.features_test, self.labels_train, self.labels_test  # type: ignore
-
-     @property
-     def feature_names(self) -> list[str]:
-         """Returns the list of feature column names."""
-         return self._feature_names
+         # --- 4. Create Datasets ---
+         self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
+         self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)
 
      @property
      def target_name(self) -> str:
-         """Returns the name of the target column."""
          return self._target_name
-
-     def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
-         """Saves a list of feature names as a text file"""
-         save_list_strings(list_strings=self._feature_names,
-                           directory=directory,
-                           filename="feature_names",
-                           verbose=verbose)
-
-     @staticmethod
-     def _embed_categorical(cat_df: pandas.DataFrame, random_state: Optional[int] = None, **kwargs) -> pandas.DataFrame:
-         """Internal helper to perform embedding on categorical features."""
-         embedded_tensors = []
-         new_columns = []
-         for col in cat_df.columns:
-             cat_series = cat_df[col].astype("category")
-             num_categories = len(cat_series.cat.categories)
-             embedding_dim = min(50, (num_categories + 1) // 2)
-
-             if random_state:
-                 torch.manual_seed(random_state)
-
-             embedder = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_dim, **kwargs)
-
-             with torch.no_grad():
-                 codes = torch.LongTensor(cat_series.cat.codes.values)
-                 embedded_tensors.append(embedder(codes))
-
-             new_columns.extend([f"{col}_{i+1}" for i in range(embedding_dim)])
-
-         with torch.no_grad():
-             full_tensor = torch.cat(embedded_tensors, dim=1)
-         return pandas.DataFrame(full_tensor.numpy(), columns=new_columns, index=cat_df.index)
 
 
- # Streamlined DatasetMaker version
- class SimpleDatasetMaker:
+ # --- New Multi-Target Class ---
+ class DatasetMakerMulti(_BaseDatasetMaker):
      """
-     A simplified dataset maker for pre-processed, numerical pandas DataFrames.
-
-     This class takes a DataFrame, automatically splits it into training and
-     testing sets, and converts them into PyTorch Datasets. It assumes the
-     target variable is the last column.
+     Dataset maker for pre-processed, numerical pandas DataFrames with multiple target columns.
 
-     Args:
-         pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-         kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-         test_size (float): The proportion of the dataset to allocate to the
-             test split.
-         random_state (int): The seed for the random number generator for
-             reproducibility.
+     This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
      """
-     def __init__(self, pandas_df: pandas.DataFrame, kind: Literal["regression", "classification"], test_size: float = 0.2, random_state: int = 42):
+     def __init__(self,
+                  pandas_df: pandas.DataFrame,
+                  target_columns: List[str],
+                  test_size: float = 0.2,
+                  random_state: int = 42,
+                  scaler: Optional[PytorchScaler] = None,
+                  continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
          """
-         Attributes:
-             `train_dataset` -> PyTorch Dataset
-             `test_dataset` -> PyTorch Dataset
-             `feature_names` -> list[str]
-             `target_name` -> str
-             `id` -> str | None
-
-         The ID can be manually set to any string if needed, it is `None` by default.
+         Args:
+             pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+             target_columns (list[str]): List of target column names.
+             test_size (float): The proportion of the dataset to allocate to the test split.
+             random_state (int): The seed for the random number generator for reproducibility.
+             scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
+             continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided, creates a new PytorchScaler.
          """
-         # Validation
-         if not isinstance(pandas_df, pandas.DataFrame):
-             raise TypeError("Input must be a pandas.DataFrame.")
-         if kind not in ["regression", "classification"]:
-             raise ValueError("`kind` must be 'regression' or 'classification'.")
+         super().__init__()
+         self.scaler = scaler
 
-         # 1. Identify features and target
-         features = pandas_df.iloc[:, :-1]
-         target = pandas_df.iloc[:, -1]
+         self._target_names = target_columns
+         self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
+         features = pandas_df[self._feature_names]
+         target = pandas_df[self._target_names]
 
-         self._feature_names = features.columns.tolist()
-         self._target_name = str(target.name)
-
-         # set id
-         self._id: Optional[str] = None
-
-         # 2. Split the data
          X_train, X_test, y_train, y_test = train_test_split(
              features, target, test_size=test_size, random_state=random_state
          )
-
-         self._X_train_shape = X_train.shape
-         self._X_test_shape = X_test.shape
-         self._y_train_shape = y_train.shape
-         self._y_test_shape = y_test.shape
-
-         # 3. Convert to PyTorch Datasets with the correct label dtype
-         label_dtype = torch.float32 if kind == "regression" else torch.int64
+         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
+         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-         self._train_ds = _PytorchDataset(X_train.values, y_train.values, labels_dtype=label_dtype)
-         self._test_ds = _PytorchDataset(X_test.values, y_test.values, labels_dtype=label_dtype)
+         label_dtype = torch.float32
 
-     @property
-     def train_dataset(self) -> Dataset:
-         """Returns the training PyTorch dataset."""
-         return self._train_ds
+         X_train_final, X_test_final = self._prepare_scaler(
+             X_train, y_train, X_test, label_dtype, continuous_feature_columns
+         )
+
+         self._train_ds = _PytorchDataset(X_train_final, y_train, label_dtype)
+         self._test_ds = _PytorchDataset(X_test_final, y_test, label_dtype)
 
      @property
-     def test_dataset(self) -> Dataset:
-         """Returns the testing PyTorch dataset."""
-         return self._test_ds
+     def target_names(self) -> list[str]:
+         return self._target_names
 
-     @property
-     def feature_names(self) -> list[str]:
-         """Returns the list of feature column names."""
-         return self._feature_names
 
-     @property
-     def target_name(self) -> str:
-         """Returns the name of the target column."""
-         return self._target_name
-
-     @property
-     def id(self) -> Optional[str]:
-         """Returns the object identifier if any."""
-         return self._id
-
-     @id.setter
-     def id(self, dataset_id: str):
-         """Sets the ID value"""
-         if not isinstance(dataset_id, str):
-             raise ValueError(f"Dataset ID '{type(dataset_id)}' is not a string.")
-         self._id = dataset_id
+ # --- Private Base Class ---
+ class _BaseMaker(ABC):
+     """
+     Abstract Base Class for extra dataset makers.
+     """
+     def __init__(self):
+         self._train_dataset = None
+         self._test_dataset = None
+         self._val_dataset = None
 
-     def dataframes_info(self) -> None:
-         """Prints the shape information of the split pandas DataFrames."""
-         print("--- Original DataFrame Shapes After Split ---")
-         print(f"  X_train shape: {self._X_train_shape}")
-         print(f"  y_train shape: {self._y_train_shape}\n")
-         print(f"  X_test shape: {self._X_test_shape}")
-         print(f"  y_test shape: {self._y_test_shape}")
-         print("-------------------------------------------")
-
-     def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
-         """Saves a list of feature names as a text file"""
-         save_list_strings(list_strings=self._feature_names,
-                           directory=directory,
-                           filename="feature_names",
-                           verbose=verbose)
+     @abstractmethod
+     def get_datasets(self) -> Tuple[Dataset, ...]:
+         """
+         The primary method to retrieve the final, processed PyTorch datasets.
+         Must be implemented by all subclasses.
+         """
+         pass
 
 
  # --- VisionDatasetMaker ---
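
Taken together, the hunk above replaces the fluent, multi-step `DatasetMaker` workflow with constructor-driven classes. A usage sketch with a hypothetical DataFrame and output directory; the scaler filename and default `id` follow from `save_scaler` and the single-target constructor shown above:

```python
import pandas

# Hypothetical numerical DataFrame; DatasetMaker treats the last column as the target.
df = pandas.DataFrame({
    "feat_a": [0.1, 0.5, 0.3, 0.9, 0.2, 0.7],
    "feat_b": [1.0, 2.0, 0.5, 1.5, 0.8, 1.2],
    "target": [0, 1, 0, 1, 0, 1],
})

maker = DatasetMaker(
    df,
    kind="classification",
    continuous_feature_columns=["feat_a", "feat_b"],  # fits a new PytorchScaler on the train split
)
train_ds, test_ds = maker.train_dataset, maker.test_dataset
maker.save_feature_names("artifacts")  # writes the feature-name list via save_list_strings
maker.save_scaler("artifacts")         # id defaults to "target", so this saves scaler_target.pth

# Multi-target variant: targets are selected by name, and labels are always float32.
multi = DatasetMakerMulti(df, target_columns=["target"])
```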
@@ -654,6 +451,7 @@ class SequenceMaker(_BaseMaker):
      1. `.split_data()`: Separate time series into training and testing portions.
      2. `.normalize_data()`: Normalize the data. The scaler will be fitted on the training portion.
      3. `.generate_windows()`: Create the windowed sequences from the split and normalized data.
+     4. `.get_datasets()`: Return PyTorch train and test datasets.
      """
      def __init__(self, data: Union[pandas.DataFrame, pandas.Series, numpy.ndarray], sequence_length: int):
          super().__init__()
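
The docstring now documents the full four-step flow. A minimal end-to-end sketch with a hypothetical series; `generate_windows()` is assumed to need no required arguments (this diff does not show its signature), and `normalize_data()` drops its `method` parameter in the next hunk:

```python
import numpy

# Hypothetical univariate series.
series = numpy.sin(numpy.linspace(0, 20, 200))

maker = SequenceMaker(series, sequence_length=12)
maker.split_data(test_size=0.2)   # 1. split first, to avoid leakage
maker.normalize_data()            # 2. fits a PytorchScaler on the training portion only
maker.generate_windows()          # 3. build windowed features and labels
train_ds, test_ds = maker.get_datasets()  # 4. final PyTorch datasets
```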
@@ -679,33 +477,41 @@ class SequenceMaker(_BaseMaker):
          self._is_normalized = False
          self._are_windows_generated = False
 
-     def normalize_data(self, method: Literal["standard", "minmax"] = "minmax") -> 'SequenceMaker':
+     def normalize_data(self) -> 'SequenceMaker':
          """
-         Normalizes the sequence data. Must be called AFTER splitting to prevent data leakage from the test set.
+         Normalizes the sequence data using PytorchScaler. Must be called AFTER
+         splitting to prevent data leakage from the test set.
          """
          if not self._is_split:
              raise RuntimeError("Data must be split BEFORE normalizing. Call .split_data() first.")
-
+
          if self.scaler:
              _LOGGER.warning("⚠️ Data has already been normalized.")
              return self
-
-         if method == "standard":
-             self.scaler = StandardScaler()
-         elif method == "minmax":
-             self.scaler = MinMaxScaler(feature_range=(-1, 1))
-         else:
-             raise ValueError("Normalization `method` must be 'standard' or 'minmax'.")
 
-         # Fit scaler ONLY on the training data
-         self.scaler.fit(self.train_sequence.reshape(-1, 1))  # type: ignore
-
-         # Transform both train and test data using the fitted scaler
-         self.train_sequence = self.scaler.transform(self.train_sequence.reshape(-1, 1)).flatten()  # type: ignore
-         self.test_sequence = self.scaler.transform(self.test_sequence.reshape(-1, 1)).flatten()  # type: ignore
-
+         # 1. PytorchScaler requires a Dataset to fit. Create a temporary one.
+         #    The scaler expects 2D data [n_samples, n_features].
+         train_features = self.train_sequence.reshape(-1, 1)  # type: ignore
+
+         # _PytorchDataset needs labels, so we create dummy ones.
+         dummy_labels = numpy.zeros(len(train_features))
+         temp_train_ds = _PytorchDataset(train_features, dummy_labels, labels_dtype=torch.float32)
+
+         # 2. Fit the PytorchScaler on the temporary training dataset.
+         #    The sequence is a single feature, so its index is [0].
+         _LOGGER.info("Fitting PytorchScaler on the training data...")
+         self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices=[0])
+
+         # 3. Transform sequences using the fitted scaler.
+         #    The transform method requires a tensor, so we convert, transform, and convert back.
+         train_tensor = torch.tensor(self.train_sequence.reshape(-1, 1), dtype=torch.float32)  # type: ignore
+         test_tensor = torch.tensor(self.test_sequence.reshape(-1, 1), dtype=torch.float32)  # type: ignore
+
+         self.train_sequence = self.scaler.transform(train_tensor).numpy().flatten()
+         self.test_sequence = self.scaler.transform(test_tensor).numpy().flatten()
+
          self._is_normalized = True
-         _LOGGER.info(f"Sequence data normalized using {self.scaler.__class__.__name__}. Scaler was fit on the training set only.")
+         _LOGGER.info("Sequence data normalized using PytorchScaler.")
          return self
 
      def split_data(self, test_size: float = 0.2) -> 'SequenceMaker':
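
The inverse operation lives in `denormalize`, rewritten in a later hunk to route through the same PytorchScaler. Continuing the sketch above, with hypothetical model outputs:

```python
import torch

# Hypothetical normalized predictions for five future steps.
preds = torch.randn(5)

# 1-D input is reshaped to [n_samples, 1] internally, inverse-transformed,
# and returned as a flat NumPy array on the original scale.
preds_original_scale = maker.denormalize(preds)
```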
@@ -741,7 +547,7 @@ class SequenceMaker(_BaseMaker):
          _LOGGER.info("Feature and label windows generated for train and test sets.")
          return self
 
-     def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> _PytorchDataset:
+     def _create_windowed_dataset(self, data: numpy.ndarray, use_sequence_labels: bool) -> Dataset:
          """Efficiently creates windowed features and labels using numpy."""
          if len(data) <= self.sequence_length:
              raise ValueError("Data length must be greater than the sequence_length to create at least one window.")
@@ -768,18 +574,25 @@ class SequenceMaker(_BaseMaker):
          strided_y = numpy.lib.stride_tricks.as_strided(y_data, shape=(n_windows, self.sequence_length), strides=(bytes_per_item, bytes_per_item))
 
          return _PytorchDataset(strided_x, strided_y, labels_dtype=torch.float32)
-
+
      def denormalize(self, data: Union[torch.Tensor, numpy.ndarray]) -> numpy.ndarray:
-         """Applies inverse transformation using the stored scaler."""
+         """Applies inverse transformation using the stored PytorchScaler."""
          if self.scaler is None:
              raise RuntimeError("Data was not normalized. Cannot denormalize.")
-
-         if isinstance(data, torch.Tensor):
-             data_np = data.cpu().detach().numpy()
+
+         # Ensure data is a torch.Tensor
+         if isinstance(data, numpy.ndarray):
+             tensor_data = torch.tensor(data, dtype=torch.float32)
          else:
-             data_np = data
-
-         return self.scaler.inverse_transform(data_np.reshape(-1, 1)).flatten()
+             tensor_data = data
+
+         # Reshape for the scaler [n_samples, n_features]
+         if tensor_data.ndim == 1:
+             tensor_data = tensor_data.view(-1, 1)
+
+         # Apply inverse transform and convert back to a flat numpy array
+         original_scale_tensor = self.scaler.inverse_transform(tensor_data)
+         return original_scale_tensor.cpu().numpy().flatten()
 
      def plot(self, predictions: Optional[numpy.ndarray] = None):
          """Plots the original training and testing data, with optional predictions."""
@@ -802,12 +615,46 @@ class SequenceMaker(_BaseMaker):
          plt.legend()
          plt.show()
 
-     def get_datasets(self) -> Tuple[_PytorchDataset, _PytorchDataset]:
+     def get_datasets(self) -> Tuple[Dataset, Dataset]:
          """Returns the final train and test datasets."""
          if not self._are_windows_generated:
              raise RuntimeError("Windows have not been generated. Call .generate_windows() first.")
          return self._train_dataset, self._test_dataset
 
 
+ # --- Custom Vision Transform Class ---
+ class ResizeAspectFill:
+     """
+     Custom transformation to make an image square by padding it to match the
+     longest side, preserving the aspect ratio. The image is finally centered.
+
+     Args:
+         pad_color (Union[str, int]): Color to use for the padding.
+             Defaults to "black".
+     """
+     def __init__(self, pad_color: Union[str, int] = "black") -> None:
+         self.pad_color = pad_color
+
+     def __call__(self, image: Image.Image) -> Image.Image:
+         if not isinstance(image, Image.Image):
+             raise TypeError(f"Expected PIL.Image.Image, got {type(image).__name__}")
+
+         w, h = image.size
+         if w == h:
+             return image
+
+         # Determine padding to center the image
+         if w > h:
+             top_padding = (w - h) // 2
+             bottom_padding = w - h - top_padding
+             padding = (0, top_padding, 0, bottom_padding)
+         else:  # h > w
+             left_padding = (h - w) // 2
+             right_padding = h - w - left_padding
+             padding = (left_padding, 0, right_padding, 0)
+
+         return ImageOps.expand(image, padding, fill=self.pad_color)
+
+
  def info():
      _script_info(__all__)
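
`ResizeAspectFill` itself is unchanged, only relocated to the bottom of the module. For reference, a brief sketch of where it typically sits in a torchvision pipeline (the target size is arbitrary): padding to a square first means the subsequent `Resize` cannot distort the aspect ratio.

```python
from torchvision import transforms

pipeline = transforms.Compose([
    ResizeAspectFill(pad_color="black"),  # pad to a centered square
    transforms.Resize((224, 224)),        # safe now: aspect ratio is already 1:1
    transforms.ToTensor(),
])
```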