dragon-ml-toolbox 13.0.0__py3-none-any.whl → 14.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/METADATA +12 -2
  2. dragon_ml_toolbox-14.7.0.dist-info/RECORD +49 -0
  3. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
  4. ml_tools/MICE_imputation.py +207 -5
  5. ml_tools/ML_configuration.py +108 -0
  6. ml_tools/ML_datasetmaster.py +241 -260
  7. ml_tools/ML_evaluation.py +229 -76
  8. ml_tools/ML_evaluation_multi.py +45 -16
  9. ml_tools/ML_inference.py +0 -1
  10. ml_tools/ML_models.py +135 -55
  11. ml_tools/ML_models_advanced.py +323 -0
  12. ml_tools/ML_optimization.py +49 -36
  13. ml_tools/ML_trainer.py +498 -29
  14. ml_tools/ML_utilities.py +351 -4
  15. ml_tools/ML_vision_datasetmaster.py +1492 -0
  16. ml_tools/ML_vision_evaluation.py +260 -0
  17. ml_tools/ML_vision_inference.py +428 -0
  18. ml_tools/ML_vision_models.py +641 -0
  19. ml_tools/ML_vision_transformers.py +203 -0
  20. ml_tools/PSO_optimization.py +5 -1
  21. ml_tools/_ML_vision_recipe.py +88 -0
  22. ml_tools/__init__.py +1 -0
  23. ml_tools/_schema.py +96 -0
  24. ml_tools/custom_logger.py +37 -14
  25. ml_tools/data_exploration.py +576 -138
  26. ml_tools/ensemble_evaluation.py +53 -10
  27. ml_tools/keys.py +43 -1
  28. ml_tools/math_utilities.py +1 -1
  29. ml_tools/optimization_tools.py +65 -86
  30. ml_tools/serde.py +78 -17
  31. ml_tools/utilities.py +192 -3
  32. dragon_ml_toolbox-13.0.0.dist-info/RECORD +0 -41
  33. ml_tools/ML_simple_optimization.py +0 -413
  34. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/WHEEL +0 -0
  35. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE +0 -0
  36. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/top_level.txt +0 -0
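Most of the churn below is in `ml_tools/ML_datasetmaster.py` (diffed in full underneath): the tabular dataset makers now consume a `FeatureSchema` plus an explicit `scaler` strategy, while the vision tooling moves out to the new `ML_vision_*` modules. A minimal sketch of the new single-target call shape, assuming dragon-ml-toolbox 14.x is installed; the stand-in schema is hypothetical and mocks only the two attributes this diff actually reads (`feature_names`, `continuous_feature_names`) — real code should obtain a `FeatureSchema` from `data_exploration` (see `ml_tools/_schema.py`):

```python
import pandas
from types import SimpleNamespace
from ml_tools.ML_datasetmaster import DatasetMaker

df = pandas.DataFrame({
    "x1": [0.1, 0.2, 0.3, 0.4],
    "x2": [1.0, 2.0, 3.0, 4.0],
    "y":  [0, 1, 0, 1],          # the one non-schema column becomes the target
})

# Stand-in for FeatureSchema (hypothetical; duck-typed for this sketch only).
schema = SimpleNamespace(feature_names=("x1", "x2"),
                         continuous_feature_names=("x1", "x2"))

maker = DatasetMaker(pandas_df=df, schema=schema, kind="classification",
                     scaler="fit", test_size=0.25, random_state=42)
print(maker)  # new __repr__: target, feature count, scaler state, sample counts
```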
ml_tools/ML_datasetmaster.py
@@ -1,13 +1,10 @@
  import torch
- from torch.utils.data import Dataset, Subset
+ from torch.utils.data import Dataset
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
  from typing import Literal, Union, Tuple, List, Optional
  from abc import ABC, abstractmethod
- from PIL import Image, ImageOps
- from torchvision.datasets import ImageFolder
- from torchvision import transforms
  import matplotlib.pyplot as plt
  from pathlib import Path
 
@@ -17,14 +14,13 @@ from ._script_info import _script_info
  from .custom_logger import save_list_strings
  from .ML_scaler import PytorchScaler
  from .keys import DatasetKeys
+ from ._schema import FeatureSchema
 
 
  __all__ = [
      "DatasetMaker",
      "DatasetMakerMulti",
-     "VisionDatasetMaker",
-     "SequenceMaker",
-     "ResizeAspectFill",
+     "SequenceMaker"
  ]
 
 
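The trimmed `__all__` leaves only the tabular makers importable from this module; the removed vision classes presumably resurface in the new `ML_vision_*` files listed above, though this diff does not show that. Post-change, the public surface is:

```python
# Public surface of ml_tools.ML_datasetmaster as of 14.x (per the __all__ above)
from ml_tools.ML_datasetmaster import DatasetMaker, DatasetMakerMulti, SequenceMaker
```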
@@ -35,7 +31,7 @@ class _PytorchDataset(Dataset):
      Converts numpy/pandas data into tensors for model consumption.
      """
      def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                  labels: Union[numpy.ndarray, pandas.Series],
+                  labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                   labels_dtype: torch.dtype,
                   features_dtype: torch.dtype = torch.float32,
                   feature_names: Optional[List[str]] = None,
@@ -48,13 +44,16 @@ class _PytorchDataset(Dataset):
 
          if isinstance(features, numpy.ndarray):
              self.features = torch.tensor(features, dtype=features_dtype)
-         else:
-             self.features = torch.tensor(features.values, dtype=features_dtype)
+         else: # It's a pandas.DataFrame
+             self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
          if isinstance(labels, numpy.ndarray):
              self.labels = torch.tensor(labels, dtype=labels_dtype)
+         elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+             self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
          else:
-             self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+             # Fallback for other types (though your type hints don't cover this)
+             self.labels = torch.tensor(labels, dtype=labels_dtype)
 
          self._feature_names = feature_names
          self._target_names = target_names
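The widened `labels` hint matters because `DatasetMakerMulti` (below) now passes a multi-column DataFrame straight through. A standalone check of the conversion logic above, using only pandas and torch:

```python
import pandas
import torch

labels_series = pandas.Series([0.0, 1.0], name="y")                    # single target
labels_frame = pandas.DataFrame({"y1": [0.0, 1.0], "y2": [1.0, 0.0]})  # multi target

# Both branches normalize to numpy first, so 1-D and 2-D labels share one path.
t_series = torch.tensor(labels_series.to_numpy(), dtype=torch.float32)
t_frame = torch.tensor(labels_frame.to_numpy(), dtype=torch.float32)
print(t_series.shape, t_frame.shape)  # torch.Size([2]) torch.Size([2, 2])
```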
@@ -98,27 +97,34 @@ class _BaseDatasetMaker(ABC):
          self._X_test_shape = (0,0)
          self._y_train_shape = (0,)
          self._y_test_shape = (0,)
-
-     def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
-         """Internal helper to fit and apply a PytorchScaler."""
+
+     def _prepare_scaler(self,
+                         X_train: pandas.DataFrame,
+                         y_train: Union[pandas.Series, pandas.DataFrame],
+                         X_test: pandas.DataFrame,
+                         label_dtype: torch.dtype,
+                         schema: FeatureSchema):
+         """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
          continuous_feature_indices: Optional[List[int]] = None
-         if continuous_feature_columns:
-             if all(isinstance(c, str) for c in continuous_feature_columns):
-                 name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                 try:
-                     continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                 except KeyError as e:
-                     _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                     raise ValueError()
-             elif all(isinstance(c, int) for c in continuous_feature_columns):
-                 continuous_feature_indices = continuous_feature_columns # type: ignore
-             else:
-                 _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                 raise TypeError()
-
-         X_train_values = X_train.values
-         X_test_values = X_test.values
 
+         # Get continuous feature indices *from the schema*
+         if schema.continuous_feature_names:
+             _LOGGER.info("Getting continuous feature indices from schema.")
+             try:
+                 # Convert columns to a standard list for .index()
+                 train_cols_list = X_train.columns.to_list()
+                 # Map names from schema to column indices in the training DataFrame
+                 continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+             except ValueError as e: #
+                 _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                 raise ValueError()
+         else:
+             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
+
+         X_train_values = X_train.to_numpy()
+         X_test_values = X_test.to_numpy()
+
+         # continuous_feature_indices is derived
          if self.scaler is None and continuous_feature_indices:
              _LOGGER.info("Fitting a new PytorchScaler on training data.")
              temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
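`_prepare_scaler` no longer accepts ad-hoc column lists; the scaler indices are derived from `schema.continuous_feature_names`. The name-to-index mapping in miniature (standalone pandas, mirroring the `try`/`except ValueError` above):

```python
import pandas

X_train = pandas.DataFrame({"age": [1, 2], "color": [0, 1], "height": [3.0, 4.0]})
continuous_feature_names = ("age", "height")  # what a FeatureSchema would carry

train_cols_list = X_train.columns.to_list()
indices = [train_cols_list.index(name) for name in continuous_feature_names]
print(indices)  # [0, 2] -- list.index raises ValueError for any name missing from X_train
```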
@@ -225,10 +231,8 @@ class DatasetMaker(_BaseDatasetMaker):
      """
      Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-     This class takes a DataFrame, automatically splits it into training and
-     testing sets, and converts them into PyTorch Datasets. It assumes the
-     target variable is the last column. It can also create, apply, and
-     save a PytorchScaler for standardizing continuous features.
+     This class takes a DataFrame and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+     It can also create and apply a PytorchScaler using the schema.
 
      Attributes:
          `scaler` -> PytorchScaler | None
@@ -242,95 +246,234 @@ class DatasetMaker(_BaseDatasetMaker):
      """
      def __init__(self,
                   pandas_df: pandas.DataFrame,
+                  schema: FeatureSchema,
                   kind: Literal["regression", "classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                   test_size: float = 0.2,
-                  random_state: int = 42,
-                  scaler: Optional[PytorchScaler] = None,
-                  continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                  random_state: int = 42):
          """
          Args:
-             pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-             kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-             test_size (float): The proportion of the dataset to allocate to the test split.
-             random_state (int): The seed for the random number generator for reproducibility.
-             scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-             continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+             pandas_df (pandas.DataFrame):
+                 The pre-processed input DataFrame containing all columns (features and single target).
+             schema (FeatureSchema):
+                 The definitive schema object from data_exploration.
+             kind ("regression" | "classification"):
+                 The type of ML task. This determines the data type of the labels.
+             scaler ("fit" | "none" | PytorchScaler):
+                 Strategy for data scaling:
+                 - "fit": Fit a new PytorchScaler on continuous features.
+                 - "none": Do not scale data (e.g., for TabularTransformer).
+                 - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+             test_size (float):
+                 The proportion of the dataset to allocate to the test split.
+             random_state (int):
+                 The seed for the random number generator for reproducibility.
+
          """
          super().__init__()
-         self.scaler = scaler
 
-         # --- 1. Identify features and target (single-target logic) ---
-         features = pandas_df.iloc[:, :-1]
-         target = pandas_df.iloc[:, -1]
-         self._feature_names = features.columns.tolist()
-         self._target_names = [str(target.name)]
-         self._id = self._target_names[0]
-
-         # --- 2. Split ---
+         _apply_scaling: bool = False
+         if scaler == "fit":
+             self.scaler = None # To be created
+             _apply_scaling = True
+         elif scaler == "none":
+             self.scaler = None
+         elif isinstance(scaler, PytorchScaler):
+             self.scaler = scaler # Use the provided one
+             _apply_scaling = True
+         else:
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             raise ValueError()
+
+         # --- 1. Identify features (from schema) ---
+         self._feature_names = list(schema.feature_names)
+
+         # --- 2. Infer target (by set difference) ---
+         all_cols_set = set(pandas_df.columns)
+         feature_cols_set = set(self._feature_names)
+
+         target_cols_set = all_cols_set - feature_cols_set
+
+         if len(target_cols_set) == 0:
+             _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+             raise ValueError("No target column found in DataFrame.")
+         if len(target_cols_set) > 1:
+             _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+             raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+         target_name = list(target_cols_set)[0]
+         self._target_names = [target_name]
+         self._id = target_name
+
+         # --- 3. Split Data ---
+         features_df = pandas_df[self._feature_names]
+         target_series = pandas_df[target_name]
+
          X_train, X_test, y_train, y_test = train_test_split(
-             features, target, test_size=test_size, random_state=random_state
+             features_df,
+             target_series,
+             test_size=test_size,
+             random_state=random_state
          )
          self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
          self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
          label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-         # --- 3. Scale ---
-         X_train_final, X_test_final = self._prepare_scaler(
-             X_train, y_train, X_test, label_dtype, continuous_feature_columns
-         )
+         # --- 4. Scale (using the schema) ---
+         if _apply_scaling:
+             X_train_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_test, label_dtype, schema
+             )
+         else:
+             _LOGGER.info("Features have not been scaled as specified.")
+             X_train_final = X_train.to_numpy()
+             X_test_final = X_test.to_numpy()
+
+         # --- 5. Create Datasets ---
+         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+
+     def __repr__(self) -> str:
+         s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+         s += f" Target: {self.target_names[0]}\n"
+         s += f" Features: {self.number_of_features}\n"
+         s += f" Scaler: {'Fitted' if self.scaler else 'None'}\n"
 
-         # --- 4. Create Datasets ---
-         self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-         self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+         if self._train_ds:
+             s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+         if self._test_ds:
+             s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore
+
+         return s
 
 
- # --- New Multi-Target Class ---
+ # --- Multi-Target Class ---
  class DatasetMakerMulti(_BaseDatasetMaker):
      """
-     Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+     Dataset maker for pre-processed, numerical pandas DataFrames with
+     multiple target columns.
 
-     This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+     This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+     *target_columns*. It validates that the schema's features and the
+     target columns are mutually exclusive and together account for all
+     columns in the DataFrame.
+
+     Targets dtype is torch.float32.
      """
      def __init__(self,
                   pandas_df: pandas.DataFrame,
                   target_columns: List[str],
+                  schema: FeatureSchema,
+                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                   test_size: float = 0.2,
-                  random_state: int = 42,
-                  scaler: Optional[PytorchScaler] = None,
-                  continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                  random_state: int = 42):
          """
          Args:
-             pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-             target_columns (list[str]): List of target column names.
-             test_size (float): The proportion of the dataset to allocate to the test split.
-             random_state (int): The seed for the random number generator for reproducibility.
-             scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-             continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+             pandas_df (pandas.DataFrame):
+                 The pre-processed input DataFrame with *all* columns
+                 (features and targets).
+             target_columns (list[str]):
+                 List of target column names.
+             schema (FeatureSchema):
+                 The definitive schema object from data_exploration.
+             scaler ("fit" | "none" | PytorchScaler):
+                 Strategy for data scaling:
+                 - "fit": Fit a new PytorchScaler on continuous features.
+                 - "none": Do not scale data (e.g., for TabularTransformer).
+                 - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+             test_size (float):
+                 The proportion of the dataset to allocate to the test split.
+             random_state (int):
+                 The seed for the random number generator for reproducibility.
+
+         ## Note:
+             For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+             This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
          """
          super().__init__()
-         self.scaler = scaler
-
+
+         _apply_scaling: bool = False
+         if scaler == "fit":
+             self.scaler = None
+             _apply_scaling = True
+         elif scaler == "none":
+             self.scaler = None
+         elif isinstance(scaler, PytorchScaler):
+             self.scaler = scaler # Use the provided one
+             _apply_scaling = True
+         else:
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             raise ValueError()
+
+         # --- 1. Get features and targets from schema/args ---
+         self._feature_names = list(schema.feature_names)
          self._target_names = target_columns
-         self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
-         features = pandas_df[self._feature_names]
-         target = pandas_df[self._target_names]
+
+         # --- 2. Validation ---
+         all_cols_set = set(pandas_df.columns)
+         feature_cols_set = set(self._feature_names)
+         target_cols_set = set(self._target_names)
+
+         overlap = feature_cols_set.intersection(target_cols_set)
+         if overlap:
+             _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+             raise ValueError("Features and targets overlap.")
+
+         schema_plus_targets = feature_cols_set.union(target_cols_set)
+         missing_cols = all_cols_set - schema_plus_targets
+         if missing_cols:
+             _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+         extra_cols = schema_plus_targets - all_cols_set
+         if extra_cols:
+             _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+             raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+         # --- 3. Split Data ---
+         features_df = pandas_df[self._feature_names]
+         target_df = pandas_df[self._target_names]
 
          X_train, X_test, y_train, y_test = train_test_split(
-             features, target, test_size=test_size, random_state=random_state
+             features_df,
+             target_df,
+             test_size=test_size,
+             random_state=random_state
          )
          self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
          self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-         label_dtype = torch.float32
+         # Multi-target for regression or multi-binary
+         label_dtype = torch.float32
 
-         X_train_final, X_test_final = self._prepare_scaler(
-             X_train, y_train, X_test, label_dtype, continuous_feature_columns
-         )
+         # --- 4. Scale (using the schema) ---
+         if _apply_scaling:
+             X_train_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_test, label_dtype, schema
+             )
+         else:
+             _LOGGER.info("Features have not been scaled as specified.")
+             X_train_final = X_train.to_numpy()
+             X_test_final = X_test.to_numpy()
 
+         # --- 5. Create Datasets ---
+         # _PytorchDataset now correctly handles y_train (a DataFrame)
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
          self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
+     def __repr__(self) -> str:
+         s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+         s += f" Targets: {self.number_of_targets}\n"
+         s += f" Features: {self.number_of_features}\n"
+         s += f" Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+         if self._train_ds:
+             s += f" Train Samples: {len(self._train_ds)}\n" # type: ignore
+         if self._test_ds:
+             s += f" Test Samples: {len(self._test_ds)}\n" # type: ignore
+
+         return s
+
 
  # --- Private Base Class ---
  class _BaseMaker(ABC):
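Both constructors above now lean on plain set algebra over column names: `DatasetMaker` infers the single target as whatever column the schema does not claim, and `DatasetMakerMulti` checks that features and targets are disjoint and jointly cover the DataFrame. The same logic, reduced to a standalone snippet:

```python
import pandas

df = pandas.DataFrame({"f1": [1], "f2": [2], "t1": [3], "t2": [4]})
feature_cols = {"f1", "f2"}   # from FeatureSchema.feature_names

# Single-target inference (DatasetMaker): exactly one non-feature column may remain.
leftover = set(df.columns) - feature_cols
# here len(leftover) == 2, so DatasetMaker would raise "Ambiguous target"

# Multi-target validation (DatasetMakerMulti):
target_cols = {"t1", "t2"}
assert not (feature_cols & target_cols), "features and targets overlap"
assert not (feature_cols | target_cols) - set(df.columns), "schema/targets name unknown columns"
unclaimed = set(df.columns) - (feature_cols | target_cols)
if unclaimed:  # logged as a warning, not an error, in the class above
    print(f"ignored columns: {unclaimed}")
```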
@@ -351,149 +494,6 @@ class _BaseMaker(ABC):
          pass
 
 
- # --- VisionDatasetMaker ---
- class VisionDatasetMaker(_BaseMaker):
-     """
-     Creates processed PyTorch datasets for computer vision tasks from an
-     image folder directory.
-
-     Uses online augmentations per epoch (image augmentation without creating new files).
-     """
-     def __init__(self, full_dataset: ImageFolder):
-         super().__init__()
-         self.full_dataset = full_dataset
-         self.labels = [s[1] for s in self.full_dataset.samples]
-         self.class_map = full_dataset.class_to_idx
-
-         self._is_split = False
-         self._are_transforms_configured = False
-
-     @classmethod
-     def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-         """Creates a maker instance from a root directory of images."""
-         initial_transform = transforms.Compose([transforms.ToTensor()])
-         full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-         _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-         return cls(full_dataset)
-
-     @staticmethod
-     def inspect_folder(path: Union[str, Path]):
-         """
-         Logs a report of the types, sizes, and channels of image files
-         found in the directory and its subdirectories.
-         """
-         path_obj = make_fullpath(path)
-
-         non_image_files = set()
-         img_types = set()
-         img_sizes = set()
-         img_channels = set()
-         img_counter = 0
-
-         _LOGGER.info(f"Inspecting folder: {path_obj}...")
-         # Use rglob to recursively find all files
-         for filepath in path_obj.rglob('*'):
-             if filepath.is_file():
-                 try:
-                     # Using PIL to open is a more reliable check
-                     with Image.open(filepath) as img:
-                         img_types.add(img.format)
-                         img_sizes.add(img.size)
-                         img_channels.update(img.getbands())
-                         img_counter += 1
-                 except (IOError, SyntaxError):
-                     non_image_files.add(filepath.name)
-
-         if non_image_files:
-             _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-         report = (
-             f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-             f"Total images found: {img_counter}\n"
-             f"Image formats: {img_types or 'None'}\n"
-             f"Image sizes (WxH): {img_sizes or 'None'}\n"
-             f"Image channels (bands): {img_channels or 'None'}\n"
-             f"--------------------------------------"
-         )
-         print(report)
-
-     def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                    stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-         """Splits the dataset into training, validation, and optional test sets."""
-         if self._is_split:
-             _LOGGER.warning("Data has already been split.")
-             return self
-
-         if val_size + test_size >= 1.0:
-             _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-             raise ValueError()
-
-         indices = list(range(len(self.full_dataset)))
-         labels_for_split = self.labels if stratify else None
-
-         train_indices, val_test_indices = train_test_split(
-             indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-         )
-
-         if test_size > 0:
-             val_test_labels = [self.labels[i] for i in val_test_indices]
-             stratify_val_test = val_test_labels if stratify else None
-             val_indices, test_indices = train_test_split(
-                 val_test_indices, test_size=(test_size / (val_size + test_size)),
-                 random_state=random_state, stratify=stratify_val_test
-             )
-             self._test_dataset = Subset(self.full_dataset, test_indices)
-             _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-         else:
-             val_indices = val_test_indices
-
-         self._train_dataset = Subset(self.full_dataset, train_indices)
-         self._val_dataset = Subset(self.full_dataset, val_indices)
-         self._is_split = True
-
-         _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-         return self
-
-     def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                              mean: List[float] = [0.485, 0.456, 0.406],
-                              std: List[float] = [0.229, 0.224, 0.225],
-                              extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-         """Configures and applies the image transformations (augmentations)."""
-         if not self._is_split:
-             _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-             raise RuntimeError()
-
-         base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-         if extra_train_transforms:
-             base_train_transforms.extend(extra_train_transforms)
-
-         final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-         val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-         train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-         self._train_dataset.dataset.transform = train_transform # type: ignore
-         self._val_dataset.dataset.transform = val_transform # type: ignore
-         if self._test_dataset:
-             self._test_dataset.dataset.transform = val_transform # type: ignore
-
-         self._are_transforms_configured = True
-         _LOGGER.info("Image transforms configured and applied.")
-         return self
-
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """Returns the final train, validation, and optional test datasets."""
-         if not self._is_split:
-             _LOGGER.error("Data has not been split. Call .split_data() first.")
-             raise RuntimeError()
-         if not self._are_transforms_configured:
-             _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-         if self._test_dataset:
-             return self._train_dataset, self._val_dataset, self._test_dataset
-         return self._train_dataset, self._val_dataset
-
-
-
  # --- SequenceMaker ---
  class SequenceMaker(_BaseMaker):
      """
@@ -680,41 +680,22 @@ class SequenceMaker(_BaseMaker):
              _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
              raise RuntimeError()
          return self._train_dataset, self._test_dataset
-
-
- # --- Custom Vision Transform Class ---
- class ResizeAspectFill:
-     """
-     Custom transformation to make an image square by padding it to match the
-     longest side, preserving the aspect ratio. The image is finally centered.
-
-     Args:
-         pad_color (Union[str, int]): Color to use for the padding.
-             Defaults to "black".
-     """
-     def __init__(self, pad_color: Union[str, int] = "black") -> None:
-         self.pad_color = pad_color
-
-     def __call__(self, image: Image.Image) -> Image.Image:
-         if not isinstance(image, Image.Image):
-             _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-             raise TypeError()
-
-         w, h = image.size
-         if w == h:
-             return image
-
-         # Determine padding to center the image
-         if w > h:
-             top_padding = (w - h) // 2
-             bottom_padding = w - h - top_padding
-             padding = (0, top_padding, 0, bottom_padding)
-         else: # h > w
-             left_padding = (h - w) // 2
-             right_padding = h - w - left_padding
-             padding = (left_padding, 0, right_padding, 0)
-
-         return ImageOps.expand(image, padding, fill=self.pad_color)
+
+     def __repr__(self) -> str:
+         s = f"<{self.__class__.__name__}>:\n"
+         s += f" Sequence Length (Window): {self.sequence_length}\n"
+         s += f" Total Data Points: {len(self.sequence)}\n"
+         s += " --- Status ---\n"
+         s += f" Split: {self._is_split}\n"
+         s += f" Normalized: {self._is_normalized}\n"
+         s += f" Windows Generated: {self._are_windows_generated}\n"
+
+         if self._are_windows_generated:
+             train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
+             test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
+             s += f" Datasets (Train/Test): {train_len} / {test_len} windows\n"
+
+         return s
 
 
  def info():
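`ResizeAspectFill` leaves this module along with the rest of the vision stack; judging by the file list it most likely re-lands in `ml_tools/ML_vision_transformers.py`, though this diff does not show that. For reference, the removed pad-to-square behavior fits in a few lines of plain PIL:

```python
from PIL import Image, ImageOps

def pad_to_square(image: Image.Image, pad_color="black") -> Image.Image:
    """Pad the short side so the image becomes square, keeping it centered."""
    w, h = image.size
    if w == h:
        return image
    if w > h:
        top = (w - h) // 2
        padding = (0, top, 0, w - h - top)            # (left, top, right, bottom)
    else:
        left = (h - w) // 2
        padding = (left, 0, h - w - left, 0)
    return ImageOps.expand(image, padding, fill=pad_color)

print(pad_to_square(Image.new("RGB", (10, 4))).size)  # (10, 10)
```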