dragon-ml-toolbox 13.2.0__tar.gz → 14.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {dragon_ml_toolbox-13.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-14.1.0}/PKG-INFO +2 -1
  2. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +2 -1
  3. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +7 -0
  4. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
  5. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/MICE_imputation.py +207 -5
  6. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_datasetmaster.py +63 -205
  7. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_evaluation.py +23 -15
  8. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_evaluation_multi.py +5 -6
  9. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_inference.py +0 -1
  10. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_models.py +23 -7
  11. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_trainer.py +463 -20
  12. dragon_ml_toolbox-14.1.0/ml_tools/ML_utilities.py +528 -0
  13. dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_datasetmaster.py +1315 -0
  14. dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_evaluation.py +260 -0
  15. dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_inference.py +428 -0
  16. dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_models.py +627 -0
  17. dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_transformers.py +58 -0
  18. dragon_ml_toolbox-14.1.0/ml_tools/_ML_pytorch_tabular.py +543 -0
  19. dragon_ml_toolbox-14.1.0/ml_tools/_ML_vision_recipe.py +88 -0
  20. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_schema.py +26 -0
  21. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/custom_logger.py +37 -14
  22. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/data_exploration.py +502 -93
  23. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/keys.py +38 -0
  24. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/math_utilities.py +1 -1
  25. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/serde.py +23 -3
  26. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/utilities.py +192 -3
  27. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/pyproject.toml +12 -2
  28. dragon_ml_toolbox-13.2.0/ml_tools/ML_utilities.py +0 -230
  29. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/LICENSE +0 -0
  30. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/LICENSE-THIRD-PARTY.md +0 -0
  31. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/README.md +0 -0
  32. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  33. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  34. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ETL_cleaning.py +0 -0
  35. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ETL_engineering.py +0 -0
  36. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/GUI_tools.py +0 -0
  37. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_callbacks.py +0 -0
  38. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_optimization.py +0 -0
  39. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_scaler.py +0 -0
  40. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/PSO_optimization.py +0 -0
  41. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/RNN_forecast.py +0 -0
  42. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/SQL.py +0 -0
  43. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/VIF_factor.py +0 -0
  44. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/__init__.py +0 -0
  45. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_logger.py +0 -0
  46. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_script_info.py +0 -0
  47. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/constants.py +0 -0
  48. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_evaluation.py +0 -0
  49. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_inference.py +0 -0
  50. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_learning.py +0 -0
  51. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/handle_excel.py +0 -0
  52. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/optimization_tools.py +0 -0
  53. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/path_manager.py +0 -0
  54. {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 13.2.0
+ Version: 14.1.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
@@ -34,6 +34,7 @@ Requires-Dist: Pillow; extra == "ml"
  Requires-Dist: evotorch; extra == "ml"
  Requires-Dist: pyarrow; extra == "ml"
  Requires-Dist: colorlog; extra == "ml"
+ Requires-Dist: torchmetrics; extra == "ml"
  Provides-Extra: mice
  Requires-Dist: numpy<2.0; extra == "mice"
  Requires-Dist: pandas; extra == "mice"
dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 13.2.0
+ Version: 14.1.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
@@ -34,6 +34,7 @@ Requires-Dist: Pillow; extra == "ml"
  Requires-Dist: evotorch; extra == "ml"
  Requires-Dist: pyarrow; extra == "ml"
  Requires-Dist: colorlog; extra == "ml"
+ Requires-Dist: torchmetrics; extra == "ml"
  Provides-Extra: mice
  Requires-Dist: numpy<2.0; extra == "mice"
  Requires-Dist: pandas; extra == "mice"
dragon_ml_toolbox.egg-info/SOURCES.txt
@@ -21,10 +21,17 @@ ml_tools/ML_optimization.py
  ml_tools/ML_scaler.py
  ml_tools/ML_trainer.py
  ml_tools/ML_utilities.py
+ ml_tools/ML_vision_datasetmaster.py
+ ml_tools/ML_vision_evaluation.py
+ ml_tools/ML_vision_inference.py
+ ml_tools/ML_vision_models.py
+ ml_tools/ML_vision_transformers.py
  ml_tools/PSO_optimization.py
  ml_tools/RNN_forecast.py
  ml_tools/SQL.py
  ml_tools/VIF_factor.py
+ ml_tools/_ML_pytorch_tabular.py
+ ml_tools/_ML_vision_recipe.py
  ml_tools/__init__.py
  ml_tools/_logger.py
  ml_tools/_schema.py
dragon_ml_toolbox.egg-info/requires.txt
@@ -21,6 +21,7 @@ Pillow
  evotorch
  pyarrow
  colorlog
+ torchmetrics
  
  [excel]
  pandas
ml_tools/MICE_imputation.py
@@ -7,19 +7,20 @@ from plotnine import ggplot, labs, theme, element_blank # type: ignore
  from typing import Optional, Union
  
  from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
- from .math_utilities import threshold_binary_values
+ from .math_utilities import threshold_binary_values, discretize_categorical_values
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
  from ._logger import _LOGGER
  from ._script_info import _script_info
+ from ._schema import FeatureSchema
  
  
  __all__ = [
+     "MiceImputer",
      "apply_mice",
      "save_imputed_datasets",
-     "get_na_column_names",
      "get_convergence_diagnostic",
      "get_imputed_distributions",
-     "run_mice_pipeline"
+     "run_mice_pipeline",
  ]
  
  
@@ -79,7 +80,7 @@ def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df
  
  
  #Get names of features that had missing values before imputation
- def get_na_column_names(df: pd.DataFrame):
+ def _get_na_column_names(df: pd.DataFrame):
      return [col for col in df.columns if df[col].isna().any()]
  
  
@@ -264,7 +265,7 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]
  
          save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
  
-         imputed_column_names = get_na_column_names(df=df)
+         imputed_column_names = _get_na_column_names(df=df)
  
          get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
  
@@ -278,5 +279,206 @@ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
      return df_feats, df_targets
  
  
+ # modern implementation
+ class MiceImputer:
+     """
+     A modern MICE imputation pipeline that uses a FeatureSchema
+     to correctly discretize categorical features after imputation.
+     """
+     def __init__(self,
+                  schema: FeatureSchema,
+                  iterations: int=20,
+                  resulting_datasets: int = 1,
+                  random_state: int = 101):
+ 
+         self.schema = schema
+         self.random_state = random_state
+         self.iterations = iterations
+         self.resulting_datasets = resulting_datasets
+ 
+         # --- Store schema info ---
+ 
+         # 1. Categorical info
+         if not self.schema.categorical_index_map:
+             _LOGGER.warning("FeatureSchema has no 'categorical_index_map'. No discretization will be applied.")
+             self.cat_info = {}
+         else:
+             self.cat_info = self.schema.categorical_index_map
+ 
+         # 2. Ordered feature names (critical for index mapping)
+         self.ordered_features = list(self.schema.feature_names)
+ 
+         # 3. Names of categorical features
+         self.categorical_features = list(self.schema.categorical_feature_names)
+ 
+         _LOGGER.info(f"MiceImputer initialized. Found {len(self.cat_info)} categorical features to discretize.")
+ 
+     def _post_process(self, imputed_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Applies schema-based discretization to a completed dataframe.
+ 
+         This method works around the behavior of `discretize_categorical_values`
+         (which returns a full int32 array) by:
+         1. Calling it on the full, ordered feature array.
+         2. Extracting *only* the valid discretized categorical columns.
+         3. Updating the original float dataframe with these integer values.
+         """
+         # If no categorical features are defined, return the df as-is.
+         if not self.cat_info:
+             return imputed_df
+ 
+         try:
+             # 1. Ensure DataFrame columns match the schema order
+             # This is critical for the index-based categorical_info
+             df_ordered: pd.DataFrame = imputed_df[self.ordered_features] # type: ignore
+ 
+             # 2. Convert to NumPy array
+             array_ordered = df_ordered.to_numpy()
+ 
+             # 3. Apply discretization utility (which returns a full int32 array)
+             # This array has *correct* categorical values but *truncated* continuous values.
+             discretized_array_int32 = discretize_categorical_values(
+                 array_ordered,
+                 self.cat_info,
+                 start_at_zero=True # Assuming 0-based indexing
+             )
+ 
+             # 4. Create a new DF from the int32 array, keeping the categorical columns.
+             df_discretized_cats = pd.DataFrame(
+                 discretized_array_int32,
+                 columns=self.ordered_features,
+                 index=df_ordered.index # <-- Critical: align index
+             )[self.categorical_features] # <-- Select only cat features
+ 
+             # 5. "Rejoin": Start with a fresh copy of the *original* imputed DF (which has correct continuous floats).
+             final_df = df_ordered.copy()
+ 
+             # 6. Use .update() to "paste" the integer categorical values
+             # over the old float categorical values. Continuous floats are unaffected.
+             final_df.update(df_discretized_cats)
+ 
+             return final_df
+ 
+         except Exception as e:
+             _LOGGER.error(f"Failed during post-processing discretization:\n\tInput DF shape: {imputed_df.shape}\n\tSchema features: {len(self.ordered_features)}\n\tCategorical info keys: {list(self.cat_info.keys())}\n{e}")
+             raise
+ 
+     def _run_mice(self,
+                   df: pd.DataFrame,
+                   df_name: str) -> tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+         """
+         Runs the MICE kernel and applies schema-based post-processing.
+ 
+         Parameters:
+             df (pd.DataFrame): The input dataframe *with NaNs*. Should only contain feature columns.
+             df_name (str): The base name for the dataset.
+ 
+         Returns:
+             tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+                 - The trained MICE kernel
+                 - A list of imputed and processed DataFrames
+                 - A list of names for the new DataFrames
+         """
+         # Ensure input df only contains features from the schema and is in the correct order.
+         try:
+             df_feats = df[self.ordered_features]
+         except KeyError as e:
+             _LOGGER.error(f"Input DataFrame is missing required schema columns: {e}")
+             raise
+ 
+         # 1. Initialize kernel
+         kernel = mf.ImputationKernel(
+             data=df_feats,
+             num_datasets=self.resulting_datasets,
+             random_state=self.random_state
+         )
+ 
+         _LOGGER.info("➡️ Schema-based MICE imputation running...")
+ 
+         # 2. Perform MICE
+         kernel.mice(self.iterations)
+ 
+         # 3. Retrieve, process, and collect datasets
+         imputed_datasets = []
+         for i in range(self.resulting_datasets):
+             # complete_data returns a pd.DataFrame
+             completed_df = kernel.complete_data(dataset=i)
+ 
+             # Apply our new discretization and ordering
+             processed_df = self._post_process(completed_df)
+             imputed_datasets.append(processed_df)
+ 
+         if not imputed_datasets:
+             _LOGGER.error("No imputed datasets were generated.")
+             raise ValueError()
+ 
+         # 4. Generate names
+         if self.resulting_datasets == 1:
+             imputed_dataset_names = [f"{df_name}_MICE"]
+         else:
+             imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(self.resulting_datasets)]
+ 
+         # 5. Validate indexes
+         for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+             assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}"
+             assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}"
+ 
+         _LOGGER.info("Schema-based MICE imputation complete.")
+ 
+         return kernel, imputed_datasets, imputed_dataset_names
+ 
+     def run_pipeline(self,
+                      df_path_or_dir: Union[str,Path],
+                      save_datasets_dir: Union[str,Path],
+                      save_metrics_dir: Union[str,Path],
+                      ):
+         """
+         Runs the complete MICE imputation pipeline.
+ 
+         This method automates the entire workflow:
+         1. Loads data from a CSV file path or a directory with CSV files.
+         2. Separates features and targets based on the `FeatureSchema`.
+         3. Runs the MICE algorithm on the feature set.
+         4. Applies schema-based post-processing to discretize categorical features.
+         5. Saves the final, processed, and imputed dataset(s) (re-joined with targets) to `save_datasets_dir`.
+         6. Generates and saves convergence and distribution plots for all imputed columns to `save_metrics_dir`.
+ 
+         Parameters
+         ----------
+         df_path_or_dir :[str,Path]
+             Path to a single CSV file or a directory containing multiple CSV files to impute.
+         save_datasets_dir : [str,Path]
+             Directory where the final imputed and processed dataset(s) will be saved as CSVs.
+         save_metrics_dir : [str,Path]
+             Directory where convergence and distribution plots will be saved.
+         """
+         # Check paths
+         save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+         save_metrics_path = make_fullpath(save_metrics_dir, make=True)
+ 
+         input_path = make_fullpath(df_path_or_dir)
+         if input_path.is_file():
+             all_file_paths = [input_path]
+         else:
+             all_file_paths = list(list_csv_paths(input_path).values())
+ 
+         for df_path in all_file_paths:
+ 
+             df, df_name = load_dataframe(df_path=df_path, kind="pandas")
+ 
+             df_features: pd.DataFrame = df[self.schema.feature_names] # type: ignore
+             df_targets = df.drop(columns=self.schema.feature_names)
+ 
+             imputed_column_names = _get_na_column_names(df=df_features)
+ 
+             kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_features, df_name=df_name)
+ 
+             save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
+ 
+             get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
+ 
+             get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
+ 
+ 
  def info():
      _script_info(__all__)
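
For orientation, the new class-based workflow mirrors the older module-level run_mice_pipeline but takes its configuration from a FeatureSchema. Below is a minimal usage sketch, not part of the diff: the paths are placeholders, and the schema object is assumed to have been built beforehand (for example with the data_exploration helpers referenced in the docstrings).

    from pathlib import Path
    from ml_tools.MICE_imputation import MiceImputer

    # 'schema' is a FeatureSchema produced elsewhere; its construction is not
    # shown in this diff and is assumed here.
    imputer = MiceImputer(
        schema=schema,
        iterations=20,          # MICE iterations per kernel (diff default)
        resulting_datasets=1,   # number of completed datasets to produce
        random_state=101,
    )

    # Loads the CSV(s), imputes the schema features, re-joins the targets, and
    # writes the imputed datasets plus convergence/distribution plots.
    imputer.run_pipeline(
        df_path_or_dir=Path("data/raw"),         # single CSV or a directory of CSVs
        save_datasets_dir=Path("data/imputed"),
        save_metrics_dir=Path("reports/mice"),
    )
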
ml_tools/ML_datasetmaster.py
@@ -1,13 +1,10 @@
  import torch
- from torch.utils.data import Dataset, Subset
+ from torch.utils.data import Dataset
  import pandas
  import numpy
  from sklearn.model_selection import train_test_split
  from typing import Literal, Union, Tuple, List, Optional
  from abc import ABC, abstractmethod
- from PIL import Image, ImageOps
- from torchvision.datasets import ImageFolder
- from torchvision import transforms
  import matplotlib.pyplot as plt
  from pathlib import Path
  
@@ -23,9 +20,7 @@ from ._schema import FeatureSchema
  __all__ = [
      "DatasetMaker",
      "DatasetMakerMulti",
-     "VisionDatasetMaker",
-     "SequenceMaker",
-     "ResizeAspectFill",
+     "SequenceMaker"
  ]
  
  
@@ -126,8 +121,8 @@ class _BaseDatasetMaker(ABC):
          else:
              _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
  
-         X_train_values = X_train.values
-         X_test_values = X_test.values
+         X_train_values = X_train.to_numpy()
+         X_test_values = X_test.to_numpy()
  
          # continuous_feature_indices is derived
          if self.scaler is None and continuous_feature_indices:
@@ -253,26 +248,42 @@ class DatasetMaker(_BaseDatasetMaker):
                   pandas_df: pandas.DataFrame,
                   schema: FeatureSchema,
                   kind: Literal["regression", "classification"],
+                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                   test_size: float = 0.2,
-                  random_state: int = 42,
-                  scaler: Optional[PytorchScaler] = None):
+                  random_state: int = 42):
          """
          Args:
              pandas_df (pandas.DataFrame):
                  The pre-processed input DataFrame containing all columns. (features and single target).
              schema (FeatureSchema):
                  The definitive schema object from data_exploration.
-             kind (Literal["regression", "classification"]):
+             kind ("regression" | "classification"):
                  The type of ML task. This determines the data type of the labels.
+             scaler ("fit" | "none" | PytorchScaler):
+                 Strategy for data scaling:
+                 - "fit": Fit a new PytorchScaler on continuous features.
+                 - "none": Do not scale data (e.g., for TabularTransformer).
+                 - PytorchScaler instance: Use a pre-fitted scaler to transform data.
              test_size (float):
                  The proportion of the dataset to allocate to the test split.
              random_state (int):
                  The seed for the random number of generator for reproducibility.
-             scaler (PytorchScaler | None):
-                 A pre-fitted PytorchScaler instance, if None a new scaler will be created.
+ 
          """
          super().__init__()
-         self.scaler = scaler
+ 
+         _apply_scaling: bool = False
+         if scaler == "fit":
+             self.scaler = None # To be created
+             _apply_scaling = True
+         elif scaler == "none":
+             self.scaler = None
+         elif isinstance(scaler, PytorchScaler):
+             self.scaler = scaler # Use the provided one
+             _apply_scaling = True
+         else:
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             raise ValueError()
  
          # --- 1. Identify features (from schema) ---
          self._feature_names = list(schema.feature_names)
@@ -310,9 +321,14 @@
          label_dtype = torch.float32 if kind == "regression" else torch.int64
  
          # --- 4. Scale (using the schema) ---
-         X_train_final, X_test_final = self._prepare_scaler(
-             X_train, y_train, X_test, label_dtype, schema
-         )
+         if _apply_scaling:
+             X_train_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_test, label_dtype, schema
+             )
+         else:
+             _LOGGER.info("Features have not been scaled as specified.")
+             X_train_final = X_train.to_numpy()
+             X_test_final = X_test.to_numpy()
  
          # --- 5. Create Datasets ---
          self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
@@ -336,9 +352,9 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                   pandas_df: pandas.DataFrame,
                   target_columns: List[str],
                   schema: FeatureSchema,
+                  scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                   test_size: float = 0.2,
-                  random_state: int = 42,
-                  scaler: Optional[PytorchScaler] = None):
+                  random_state: int = 42):
          """
          Args:
              pandas_df (pandas.DataFrame):
@@ -348,20 +364,35 @@
                  List of target column names.
              schema (FeatureSchema):
                  The definitive schema object from data_exploration.
+             scaler ("fit" | "none" | PytorchScaler):
+                 Strategy for data scaling:
+                 - "fit": Fit a new PytorchScaler on continuous features.
+                 - "none": Do not scale data (e.g., for TabularTransformer).
+                 - PytorchScaler instance: Use a pre-fitted scaler to transform data.
              test_size (float):
                  The proportion of the dataset to allocate to the test split.
              random_state (int):
                  The seed for the random number generator for reproducibility.
-             scaler (PytorchScaler | None):
-                 A pre-fitted PytorchScaler instance.
  
          ## Note:
              For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
              This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
          """
          super().__init__()
-         self.scaler = scaler
- 
+ 
+         _apply_scaling: bool = False
+         if scaler == "fit":
+             self.scaler = None
+             _apply_scaling = True
+         elif scaler == "none":
+             self.scaler = None
+         elif isinstance(scaler, PytorchScaler):
+             self.scaler = scaler # Use the provided one
+             _apply_scaling = True
+         else:
+             _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+             raise ValueError()
+ 
          # --- 1. Get features and targets from schema/args ---
          self._feature_names = list(schema.feature_names)
          self._target_names = target_columns
@@ -403,9 +434,14 @@
          label_dtype = torch.float32
  
          # --- 4. Scale (using the schema) ---
-         X_train_final, X_test_final = self._prepare_scaler(
-             X_train, y_train, X_test, label_dtype, schema
-         )
+         if _apply_scaling:
+             X_train_final, X_test_final = self._prepare_scaler(
+                 X_train, y_train, X_test, label_dtype, schema
+             )
+         else:
+             _LOGGER.info("Features have not been scaled as specified.")
+             X_train_final = X_train.to_numpy()
+             X_test_final = X_test.to_numpy()
  
          # --- 5. Create Datasets ---
          # _PytorchDataset now correctly handles y_train (a DataFrame)
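
A usage note on the change above: scaling is no longer inferred from an optional argument; DatasetMaker and DatasetMakerMulti now require an explicit scaler strategy of "fit", "none", or a pre-fitted PytorchScaler. A small sketch under the assumption that df and schema already exist (placeholder names, not taken from the diff):

    from ml_tools.ML_datasetmaster import DatasetMaker

    maker = DatasetMaker(
        pandas_df=df,          # assumed: pre-processed DataFrame (features + single target)
        schema=schema,         # assumed: FeatureSchema from data_exploration
        kind="classification",
        scaler="fit",          # fit a new PytorchScaler on the continuous features
        test_size=0.2,
        random_state=42,
    )

    # Other accepted values, per the docstring added in this release:
    #   scaler="none"          -> leave features unscaled (e.g. for TabularTransformer)
    #   scaler=fitted_scaler   -> reuse an existing PytorchScaler instance
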
@@ -432,149 +468,6 @@ class _BaseMaker(ABC):
          pass
  
  
- # --- VisionDatasetMaker ---
- class VisionDatasetMaker(_BaseMaker):
-     """
-     Creates processed PyTorch datasets for computer vision tasks from an
-     image folder directory.
- 
-     Uses online augmentations per epoch (image augmentation without creating new files).
-     """
-     def __init__(self, full_dataset: ImageFolder):
-         super().__init__()
-         self.full_dataset = full_dataset
-         self.labels = [s[1] for s in self.full_dataset.samples]
-         self.class_map = full_dataset.class_to_idx
- 
-         self._is_split = False
-         self._are_transforms_configured = False
- 
-     @classmethod
-     def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-         """Creates a maker instance from a root directory of images."""
-         initial_transform = transforms.Compose([transforms.ToTensor()])
-         full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-         _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-         return cls(full_dataset)
- 
-     @staticmethod
-     def inspect_folder(path: Union[str, Path]):
-         """
-         Logs a report of the types, sizes, and channels of image files
-         found in the directory and its subdirectories.
-         """
-         path_obj = make_fullpath(path)
- 
-         non_image_files = set()
-         img_types = set()
-         img_sizes = set()
-         img_channels = set()
-         img_counter = 0
- 
-         _LOGGER.info(f"Inspecting folder: {path_obj}...")
-         # Use rglob to recursively find all files
-         for filepath in path_obj.rglob('*'):
-             if filepath.is_file():
-                 try:
-                     # Using PIL to open is a more reliable check
-                     with Image.open(filepath) as img:
-                         img_types.add(img.format)
-                         img_sizes.add(img.size)
-                         img_channels.update(img.getbands())
-                         img_counter += 1
-                 except (IOError, SyntaxError):
-                     non_image_files.add(filepath.name)
- 
-         if non_image_files:
-             _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
- 
-         report = (
-             f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-             f"Total images found: {img_counter}\n"
-             f"Image formats: {img_types or 'None'}\n"
-             f"Image sizes (WxH): {img_sizes or 'None'}\n"
-             f"Image channels (bands): {img_channels or 'None'}\n"
-             f"--------------------------------------"
-         )
-         print(report)
- 
-     def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                    stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-         """Splits the dataset into training, validation, and optional test sets."""
-         if self._is_split:
-             _LOGGER.warning("Data has already been split.")
-             return self
- 
-         if val_size + test_size >= 1.0:
-             _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-             raise ValueError()
- 
-         indices = list(range(len(self.full_dataset)))
-         labels_for_split = self.labels if stratify else None
- 
-         train_indices, val_test_indices = train_test_split(
-             indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-         )
- 
-         if test_size > 0:
-             val_test_labels = [self.labels[i] for i in val_test_indices]
-             stratify_val_test = val_test_labels if stratify else None
-             val_indices, test_indices = train_test_split(
-                 val_test_indices, test_size=(test_size / (val_size + test_size)),
-                 random_state=random_state, stratify=stratify_val_test
-             )
-             self._test_dataset = Subset(self.full_dataset, test_indices)
-             _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-         else:
-             val_indices = val_test_indices
- 
-         self._train_dataset = Subset(self.full_dataset, train_indices)
-         self._val_dataset = Subset(self.full_dataset, val_indices)
-         self._is_split = True
- 
-         _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-         return self
- 
-     def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                              mean: List[float] = [0.485, 0.456, 0.406],
-                              std: List[float] = [0.229, 0.224, 0.225],
-                              extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-         """Configures and applies the image transformations (augmentations)."""
-         if not self._is_split:
-             _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-             raise RuntimeError()
- 
-         base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-         if extra_train_transforms:
-             base_train_transforms.extend(extra_train_transforms)
- 
-         final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
- 
-         val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-         train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
- 
-         self._train_dataset.dataset.transform = train_transform # type: ignore
-         self._val_dataset.dataset.transform = val_transform # type: ignore
-         if self._test_dataset:
-             self._test_dataset.dataset.transform = val_transform # type: ignore
- 
-         self._are_transforms_configured = True
-         _LOGGER.info("Image transforms configured and applied.")
-         return self
- 
-     def get_datasets(self) -> Tuple[Dataset, ...]:
-         """Returns the final train, validation, and optional test datasets."""
-         if not self._is_split:
-             _LOGGER.error("Data has not been split. Call .split_data() first.")
-             raise RuntimeError()
-         if not self._are_transforms_configured:
-             _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
- 
-         if self._test_dataset:
-             return self._train_dataset, self._val_dataset, self._test_dataset
-         return self._train_dataset, self._val_dataset
- 
- 
  # --- SequenceMaker ---
  class SequenceMaker(_BaseMaker):
      """
@@ -763,40 +656,5 @@ class SequenceMaker(_BaseMaker):
          return self._train_dataset, self._test_dataset
  
  
- # --- Custom Vision Transform Class ---
- class ResizeAspectFill:
-     """
-     Custom transformation to make an image square by padding it to match the
-     longest side, preserving the aspect ratio. The image is finally centered.
- 
-     Args:
-         pad_color (Union[str, int]): Color to use for the padding.
-             Defaults to "black".
-     """
-     def __init__(self, pad_color: Union[str, int] = "black") -> None:
-         self.pad_color = pad_color
- 
-     def __call__(self, image: Image.Image) -> Image.Image:
-         if not isinstance(image, Image.Image):
-             _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-             raise TypeError()
- 
-         w, h = image.size
-         if w == h:
-             return image
- 
-         # Determine padding to center the image
-         if w > h:
-             top_padding = (w - h) // 2
-             bottom_padding = w - h - top_padding
-             padding = (0, top_padding, 0, bottom_padding)
-         else: # h > w
-             left_padding = (h - w) // 2
-             right_padding = h - w - left_padding
-             padding = (left_padding, 0, right_padding, 0)
- 
-         return ImageOps.expand(image, padding, fill=self.pad_color)
- 
- 
  def info():
      _script_info(__all__)
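
Note: the vision-related code removed from ml_tools/ML_datasetmaster.py above (VisionDatasetMaker, ResizeAspectFill, and the PIL/torchvision imports) does not disappear from the package; the files-changed list shows dedicated vision modules added in 14.1.0 (ML_vision_datasetmaster.py, ML_vision_transformers.py, and related files). A hypothetical import sketch, assuming the relocated helpers keep their names; the new modules' contents are not shown in this diff:

    # Assumption only: the exported names of the new vision modules are not
    # visible in this diff.
    from ml_tools.ML_vision_datasetmaster import VisionDatasetMaker
    from ml_tools.ML_vision_transformers import ResizeAspectFill
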