dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic (details are available on the registry page).

Files changed (48)
  1. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +72 -34
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/utilities.py CHANGED
@@ -1,27 +1,25 @@
-import math
 import numpy as np
 import pandas as pd
 import polars as pl
 from pathlib import Path
-from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple, overload
-import joblib
-from joblib.externals.loky.process_executor import TerminatedWorkerError
+from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
+
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from ._schema import FeatureSchema
 
 
 # Keep track of available tools
 __all__ = [
     "load_dataframe",
+    "load_dataframe_greedy",
+    "load_dataframe_with_schema",
     "yield_dataframes_from_dir",
     "merge_dataframes",
+    "save_dataframe_filename",
     "save_dataframe",
-    "normalize_mixed_list",
-    "threshold_binary_values",
-    "threshold_binary_values_batch",
-    "serialize_object",
-    "deserialize_object",
+    "save_dataframe_with_schema",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
@@ -32,6 +30,7 @@ __all__ = [
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -42,7 +41,8 @@ def load_dataframe(
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
-    kind: Literal["polars"],
+    use_columns: Optional[list[str]] = None,
+    kind: Literal["polars"] = "polars",
     all_strings: bool = False,
     verbose: bool = True
 ) -> Tuple[pl.DataFrame, str]:
@@ -50,6 +50,7 @@ def load_dataframe(
 
 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas", "polars"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -58,11 +59,13 @@ def load_dataframe(
     Load a CSV file into a DataFrame and extract its base name.
 
     Can load data as either a pandas or a polars DataFrame. Allows for loading all
-    columns as string types to prevent type inference errors.
+    columns or a subset of columns as string types to prevent type inference errors.
 
     Args:
         df_path (str, Path):
             The path to the CSV file.
+        use_columns (list[str] | None):
+            If provided, only these columns will be loaded from the CSV.
         kind ("pandas", "polars"):
             The type of DataFrame to load. Defaults to "pandas".
         all_strings (bool):
@@ -76,28 +79,44 @@ def load_dataframe(
 
     Raises:
         FileNotFoundError: If the file does not exist at the given path.
-        ValueError: If the DataFrame is empty or an invalid 'kind' is provided.
+        ValueError: If the DataFrame is empty, an invalid 'kind' is provided, or a column in 'use_columns' is not found in the file.
     """
     path = make_fullpath(df_path)
 
     df_name = path.stem
 
-    if kind == "pandas":
-        if all_strings:
-            df = pd.read_csv(path, encoding='utf-8', dtype=str)
-        else:
-            df = pd.read_csv(path, encoding='utf-8')
-
-    elif kind == "polars":
-        if all_strings:
-            df = pl.read_csv(path, infer_schema=False)
+    try:
+        if kind == "pandas":
+            pd_kwargs: dict[str,Any]
+            pd_kwargs = {'encoding': 'utf-8'}
+            if use_columns:
+                pd_kwargs['usecols'] = use_columns
+            if all_strings:
+                pd_kwargs['dtype'] = str
+
+            df = pd.read_csv(path, **pd_kwargs)
+
+        elif kind == "polars":
+            pl_kwargs: dict[str,Any]
+            pl_kwargs = {}
+            pl_kwargs['null_values'] = ["", " "]
+            if use_columns:
+                pl_kwargs['columns'] = use_columns
+
+            if all_strings:
+                pl_kwargs['infer_schema'] = False
+            else:
+                pl_kwargs['infer_schema_length'] = 1000
+
+            df = pl.read_csv(path, **pl_kwargs)
+
         else:
-            # Default behavior: infer the schema.
-            df = pl.read_csv(path, infer_schema_length=1000)
+            _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+            raise ValueError()
 
-    else:
-        _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
-        raise ValueError()
+    except (ValueError, pl.exceptions.ColumnNotFoundError) as e:
+        _LOGGER.error(f"Failed to load '{df_name}'. A specified column may not exist in the file.")
+        raise e
 
     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
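
Taken together, this hunk replaces the hard-coded `read_csv` calls with keyword dictionaries, so a column subset and the all-strings mode can be combined in one call. A minimal usage sketch (the file path and column names below are invented for illustration):

from ml_tools.utilities import load_dataframe

# Load only two columns and force string dtypes to avoid type-inference surprises.
df, df_name = load_dataframe(
    "data/measurements.csv",            # hypothetical path
    use_columns=["sample_id", "dose"],  # hypothetical column names
    kind="pandas",
    all_strings=True,
)
print(df_name, df.shape)
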
@@ -110,6 +129,116 @@ def load_dataframe(
     return df, df_name # type: ignore
 
 
+def load_dataframe_greedy(directory: Union[str, Path],
+                          use_columns: Optional[list[str]] = None,
+                          all_strings: bool = False,
+                          verbose: bool = True) -> pd.DataFrame:
+    """
+    Greedily loads the first found CSV file from a directory into a Pandas DataFrame.
+
+    This function scans the specified directory for any CSV files. It will
+    attempt to load the *first* CSV file it finds using the `load_dataframe`
+    function as a Pandas DataFrame.
+
+    Args:
+        directory (str, Path):
+            The path to the directory to search for a CSV file.
+        use_columns (list[str] | None):
+            A list of column names to load. If None, all columns are loaded.
+        all_strings (bool):
+            If True, loads all columns as string data types.
+
+    Returns:
+        pd.DataFrame:
+            A pandas DataFrame loaded from the first CSV file found.
+
+    Raises:
+        FileNotFoundError:
+            If the specified directory does not exist or the CSV file path
+            found is invalid.
+        ValueError:
+            If the loaded DataFrame is empty or `use_columns` contains
+            invalid column names.
+    """
+    # validate directory
+    dir_path = make_fullpath(directory, enforce="directory")
+
+    # list all csv files and grab one (should be the only one)
+    csv_dict = list_csv_paths(directory=dir_path, verbose=False)
+
+    for df_path in csv_dict.values():
+        df, _df_name = load_dataframe(df_path=df_path,
+                                      use_columns=use_columns,
+                                      kind="pandas",
+                                      all_strings=all_strings,
+                                      verbose=verbose)
+        break
+
+    return df
+
+
+def load_dataframe_with_schema(
+    df_path: Union[str, Path],
+    schema: "FeatureSchema",
+    all_strings: bool = False,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Loads a CSV file into a Pandas DataFrame, strictly validating its
+    feature columns against a FeatureSchema.
+
+    This function wraps `load_dataframe`. After loading, it validates
+    that the first N columns of the DataFrame (where N =
+    len(schema.feature_names)) contain *exactly* the set of features
+    specified in the schema.
+
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
+
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
+
+    Args:
+        df_path (str, Path):
+            The path to the CSV file.
+        schema (FeatureSchema):
+            The schema object to validate against.
+        all_strings (bool):
+            If True, loads all columns as string data types.
+
+    Returns:
+        (Tuple[pd.DataFrame, str]):
+            A tuple containing the loaded, validated (and possibly
+            reordered) pandas DataFrame and the base name of the file.
+
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
+        FileNotFoundError:
+            If the file does not exist at the given path.
+    """
+    # Step 1: Load the dataframe using the original function
+    try:
+        df, df_name = load_dataframe(
+            df_path=df_path,
+            use_columns=None,  # Load all columns for validation
+            kind="pandas",
+            all_strings=all_strings,
+            verbose=True
+        )
+    except Exception as e:
+        _LOGGER.error(f"Failed during initial load for schema validation: {e}")
+        raise e
+
+    # Step 2: Call the helper to validate and reorder
+    df_validated = _validate_and_reorder_schema(df=df, schema=schema)
+
+    return df_validated, df_name
+
+
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
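
The two loaders added in this hunk are easiest to see side by side. A short sketch under stated assumptions: the paths are invented, and the `FeatureSchema` stand-in below only mimics the single attribute (`feature_names`) that this diff shows being read; the real class lives in ml_tools._schema and is not shown here.

from types import SimpleNamespace
from ml_tools.utilities import load_dataframe_greedy, load_dataframe_with_schema

# Grab whatever single CSV lives in a working directory (hypothetical path).
df = load_dataframe_greedy("outputs/latest_run", all_strings=True)

# Duck-typed stand-in for FeatureSchema; only `feature_names` is assumed.
schema = SimpleNamespace(feature_names=("feat_a", "feat_b", "feat_c"))

# Loads, checks that the first 3 columns are exactly feat_a/feat_b/feat_c
# (reordering them if needed), and logs the remaining columns as presumed targets.
df_validated, name = load_dataframe_with_schema("data/training_set.csv", schema=schema)
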
@@ -196,7 +325,7 @@ def merge_dataframes(
     return merged_df
 
 
-def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
+def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
     """
     Saves a pandas or polars DataFrame to a CSV file.
 
@@ -214,7 +343,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         return
 
     # Create the directory if it doesn't exist
-    save_path = make_fullpath(save_dir, make=True)
+    save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
     # Clean the filename
     filename = sanitize_filename(filename)
@@ -225,227 +354,91 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
 
     # --- Type-specific saving logic ---
     if isinstance(df, pd.DataFrame):
-        df.to_csv(output_path, index=False, encoding='utf-8')
+        # Transform "" to np.nan before saving
+        df_to_save = df.replace(r'^\s*$', np.nan, regex=True)
+        # Save
+        df_to_save.to_csv(output_path, index=False, encoding='utf-8')
     elif isinstance(df, pl.DataFrame):
-        df.write_csv(output_path) # Polars defaults to utf8 and no index
+        # Transform empty strings to Null
+        df_to_save = df.with_columns(
+            pl.when(pl.col(pl.Utf8).str.strip() == "")
+            .then(None)
+            .otherwise(pl.col(pl.Utf8))
+        )
+        # Save
+        df_to_save.write_csv(output_path)
     else:
         # This error handles cases where an unsupported type is passed
         _LOGGER.error(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
         raise TypeError()
 
-    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df.shape}")
+    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df_to_save.shape}")
 
 
-def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
+def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
     """
-    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
-    applying heuristic adjustments to correct for potential data entry scale mismatches.
+    Saves a DataFrame to a specified full path.
 
-    Parameters:
-        data (list):
-            A list of values that may include strings, floats, integers, or None.
-            None values are treated as 0.0.
-
-        threshold (int, optional):
-            The number of log10 orders of magnitude below the median scale
-            at which a value is considered suspect and is scaled upward accordingly.
-            Default is 2.
-
-    Returns:
-        List[float]: A list of normalized float values summing to 1.0.
-
-    Notes:
-        - Zeros and None values remain zero.
-        - Input strings are automatically cast to floats if possible.
+    This function is a wrapper for `save_dataframe_filename()`. It takes a
+    single `pathlib.Path` object pointing to a `.csv` file.
 
-    Example:
-        >>> normalize_mixed_list([1, "0.01", 4, None])
-        [0.2, 0.2, 0.6, 0.0]
+    Args:
+        df (Union[pd.DataFrame, pl.DataFrame]): The pandas or polars DataFrame to save.
+        full_path (Path): The complete file path, including the filename and `.csv` extension, where the DataFrame will be saved.
     """
-    # Step 1: Convert all values to float, treat None as 0.0
-    float_list = [float(x) if x is not None else 0.0 for x in data]
-
-    # Raise for negative values
-    if any(x < 0 for x in float_list):
-        _LOGGER.error("Negative values are not allowed in the input list.")
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
         raise ValueError()
-
-    # Step 2: Compute log10 of non-zero values
-    nonzero = [x for x in float_list if x > 0]
-    if not nonzero:
-        return [0.0 for _ in float_list]
-
-    log_scales = [math.log10(x) for x in nonzero]
-    log_median = np.median(log_scales)
-
-    # Step 3: Adjust values that are much smaller than median
-    adjusted = []
-    for x in float_list:
-        if x == 0.0:
-            adjusted.append(0.0)
-        else:
-            log_x = math.log10(x)
-            if log_median - log_x > threshold:
-                scale_diff = round(log_median - log_x)
-                adjusted.append(x * (10 ** scale_diff))
-            else:
-                adjusted.append(x)
-
-    # Step 4: Normalize to sum to 1.0
-    total = sum(adjusted)
-    if total == 0:
-        return [0.0 for _ in adjusted]
-
-    return [x / total for x in adjusted]
-
-
-def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
-    binary_values: Optional[int] = None
-) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
-    """
-    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
-
-    Binary elements are converted to 0 or 1 using a 0.5 threshold.
-
-    Parameters:
-        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
-        binary_values (Optional[int]) :
-            - If `None`, all values are treated as binary.
-            - If `int`, only this many last `binary_values` are thresholded.
-
-    Returns:
-        Any:
-            Same type as input
-    """
-    original_type = type(input_array)
-
-    if isinstance(input_array, pl.Series):
-        array = input_array.to_numpy()
-    elif isinstance(input_array, (pd.Series, np.ndarray)):
-        array = np.asarray(input_array)
-    elif isinstance(input_array, (list, tuple)):
-        array = np.array(input_array)
-    else:
-        _LOGGER.error("Unsupported input type")
-        raise TypeError()
-
-    array = array.flatten()
-    total = array.shape[0]
+
+    save_dataframe_filename(df=df,
+                            save_dir=full_path.parent,
+                            filename=full_path.name)
 
-    bin_count = total if binary_values is None else binary_values
-    if not (0 <= bin_count <= total):
-        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
-        raise ValueError()
 
-    if bin_count == 0:
-        result = array
-    else:
-        cont_part = array[:-bin_count] if bin_count < total else np.array([])
-        bin_part = (array[-bin_count:] > 0.5).astype(int)
-        result = np.concatenate([cont_part, bin_part])
-
-    if original_type is pd.Series:
-        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
-    elif original_type is pl.Series:
-        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
-    elif original_type is list:
-        return result.tolist()
-    elif original_type is tuple:
-        return tuple(result)
-    else:
-        return result
-
-
-def threshold_binary_values_batch(
-    input_array: np.ndarray,
-    binary_values: int
-) -> np.ndarray:
+def save_dataframe_with_schema(
+    df: pd.DataFrame,
+    full_path: Path,
+    schema: "FeatureSchema"
+) -> None:
     """
-    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.
+    Saves a pandas DataFrame to a CSV, strictly enforcing that the
+    first N columns match the FeatureSchema.
 
-    Parameters
-    ----------
-    input_array : np.ndarray
-        2D array with shape (batch_size, n_features).
-    binary_values : int
-        Number of binary features located at the END of each row.
-
-    Returns
-    -------
-    np.ndarray
-        Thresholded array, same shape as input.
-    """
-    if input_array.ndim != 2:
-        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
-        raise AssertionError()
-
-    batch_size, total_features = input_array.shape
+    This function validates that the first N columns of the DataFrame
+    (where N = len(schema.feature_names)) contain *exactly* the set
+    of features specified in the schema.
 
-    if not (0 <= binary_values <= total_features):
-        _LOGGER.error("'binary_values' out of valid range.")
-        raise AssertionError()
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
 
-    if binary_values == 0:
-        return input_array.copy()
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
 
-    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
-    bin_part = input_array[:, -binary_values:] > 0.5
-    bin_part = bin_part.astype(np.int32)
-
-    return np.hstack([cont_part, bin_part])
-
-
-def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
-    """
-    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
-
-    Parameters:
-        obj (Any) : The Python object to serialize.
-        save_dir (str | Path) : Directory path where the serialized object will be saved.
-        filename (str) : Name for the output file, extension will be appended if needed.
-    """
-    try:
-        save_path = make_fullpath(save_dir, make=True)
-        sanitized_name = sanitize_filename(filename)
-        if not sanitized_name.endswith('.joblib'):
-            sanitized_name = sanitized_name + ".joblib"
-        full_path = save_path / sanitized_name
-        joblib.dump(obj, full_path)
-    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
-        _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
-        if raise_on_error:
-            raise e
-        return None
-    else:
-        if verbose:
-            _LOGGER.info(f"Object of type '{type(obj)}' saved to '{full_path}'")
-        return None
-
-
-def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
-    """
-    Loads a serialized object from a .joblib file.
-
-    Parameters:
-        filepath (str | Path): Full path to the serialized .joblib file.
+    Args:
+        df (pd.DataFrame):
+            The DataFrame to save.
+        full_path (Path):
+            The complete file path where the DataFrame will be saved.
+        schema (FeatureSchema):
+            The schema object to validate against.
 
-    Returns:
-        (Any | None): The deserialized Python object, or None if loading fails.
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
     """
-    true_filepath = make_fullpath(filepath)
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
+        raise ValueError()
 
-    try:
-        obj = joblib.load(true_filepath)
-    except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-        _LOGGER.error(f"Failed to deserialize object from '{true_filepath}'.")
-        if raise_on_error:
-            raise e
-        return None
-    else:
-        if verbose:
-            _LOGGER.info(f"Loaded object of type '{type(obj)}'.")
-        return obj
+    # Call the helper to validate and reorder
+    df_to_save = _validate_and_reorder_schema(df=df, schema=schema)
+
+    # Call the original save function
+    save_dataframe(df=df_to_save, full_path=full_path)
 
 
 def distribute_dataset_by_target(
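
In short, the old `save_dataframe` keeps its directory-plus-filename signature under the new name `save_dataframe_filename`, while `save_dataframe` becomes a thin wrapper that accepts a single `.csv` path, and `save_dataframe_with_schema` adds the validation step on top. A minimal sketch of how the two entry points relate (paths and column names are illustrative):

import pandas as pd
from pathlib import Path
from ml_tools.utilities import save_dataframe_filename, save_dataframe

df = pd.DataFrame({"feat_a": [1, 2], "target": [0, 1]})

# Directory + filename variant (the renamed original).
save_dataframe_filename(df, save_dir="outputs", filename="cleaned_data.csv")

# Path-based wrapper; the path must point to a .csv file.
save_dataframe(df, full_path=Path("outputs") / "cleaned_data.csv")
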
@@ -529,7 +522,7 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
                 filename = df_dir.name + '_' + target_name + '_' + df_name
             else:
                 filename = target_name + '_' + df_name
-            save_dataframe(df=df, save_dir=save_dir, filename=filename)
+            save_dataframe_filename(df=df, save_dir=save_dir, filename=filename)
             total_saved += 1
         except Exception as e:
             _LOGGER.error(f"Failed to process file '{df_path}'. Reason: {e}")
@@ -560,5 +553,72 @@ def train_dataset_yielder(
         yield (df_features, df_target, feature_names, target_col)
 
 
+def _validate_and_reorder_schema(
+    df: pd.DataFrame,
+    schema: "FeatureSchema"
+) -> pd.DataFrame:
+    """
+    Internal helper to validate and reorder a DataFrame against a schema.
+
+    Checks for missing, extra, and out-of-order feature columns
+    (the first N columns). Returns a reordered DataFrame if necessary.
+    Logs all actions.
+
+    Raises:
+        ValueError: If validation fails.
+    """
+    # Get schema and DataFrame column info
+    expected_features = list(schema.feature_names)
+    expected_set = set(expected_features)
+    n_features = len(expected_features)
+
+    all_df_columns = df.columns.to_list()
+
+    # --- Strict Validation ---
+
+    # 0. Check if DataFrame is long enough
+    if len(all_df_columns) < n_features:
+        _LOGGER.error(f"DataFrame has only {len(all_df_columns)} columns, but schema requires {n_features} features.")
+        raise ValueError()
+
+    df_feature_cols = all_df_columns[:n_features]
+    df_feature_set = set(df_feature_cols)
+    df_target_cols = all_df_columns[n_features:]
+
+    # 1. Check for missing features
+    missing_from_df = expected_set - df_feature_set
+    if missing_from_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns are missing required schema features: {missing_from_df}")
+        raise ValueError()
+
+    # 2. Check for extra (unexpected) features
+    extra_in_df = df_feature_set - expected_set
+    if extra_in_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns contain unexpected columns: {extra_in_df}")
+        raise ValueError()
+
+    # --- Reordering ---
+
+    df_to_process = df
+
+    # If we pass validation, the sets are equal. Now check order.
+    if df_feature_cols == expected_features:
+        _LOGGER.info("DataFrame feature columns already match schema order.")
+    else:
+        _LOGGER.warning("DataFrame feature columns do not match schema order. Reordering...")
+
+        # Rebuild the DataFrame with the correct feature order + target columns
+        new_order = expected_features + df_target_cols
+        df_to_process = df[new_order]
+
+    # Log the presumed target columns for user verification
+    if not df_target_cols:
+        _LOGGER.warning(f"No target columns were found after index {n_features-1}.")
+    else:
+        _LOGGER.info(f"Presumed Target Columns: {df_target_cols}")
+
+    return df_to_process # type: ignore
+
+
 def info():
     _script_info(__all__)
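
The helper's reorder-or-fail behaviour is easiest to see with a toy frame whose feature columns arrive out of order. A sketch assuming the same duck-typed schema stand-in as above (only `feature_names` is relied on; the path is invented):

import pandas as pd
from pathlib import Path
from types import SimpleNamespace
from ml_tools.utilities import save_dataframe_with_schema

schema = SimpleNamespace(feature_names=("feat_a", "feat_b"))

# Features present but swapped; "target" sits after the first N columns.
df = pd.DataFrame({"feat_b": [3, 4], "feat_a": [1, 2], "target": [0, 1]})

# The first two columns are reordered to (feat_a, feat_b), "target" is logged
# as the presumed target column, and the CSV is written via save_dataframe.
save_dataframe_with_schema(df, Path("outputs") / "train_ready.csv", schema=schema)
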
dragon_ml_toolbox-10.2.0.dist-info/RECORD DELETED
@@ -1,36 +0,0 @@
-dragon_ml_toolbox-10.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-10.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_cleaning.py,sha256=-hxvnJYkGcBAR2eattOcgfGqPxM3TIORC6pCNvwDsf4,19113
-ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
-ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
-ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
-ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
-ml_tools/ML_datasetmaster.py,sha256=CBZFpvm0qiY-8gP89iKTkd7jvU-rGQcJwk-_mBJmRSg,29273
-ml_tools/ML_evaluation.py,sha256=28JJ2M71p4pxniwav2Hv3b1a5dsvaoIYNLm-UJQuXvY,16002
-ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
-ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
-ml_tools/ML_models.py,sha256=Dl2mTMgVCtnNCSRlyqvMnInsKJVldS7vnBPimD-TnHo,27999
-ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
-ml_tools/ML_scaler.py,sha256=O8JzHr2551zPpKRRReEIMvq0lNAAPau6hV59KUMAySg,7420
-ml_tools/ML_trainer.py,sha256=xM-o-gbPhWXm2lOVXbeaTFotgJSDRSHyE7H0-9OOij4,23712
-ml_tools/PSO_optimization.py,sha256=q0VYpssQGbPum7xdnkDXlJQKhZMYZo8acHpKhajPK3c,22954
-ml_tools/RNN_forecast.py,sha256=8rNZr-eWOBXMiDQV22e_tQTPM5LM2IFggEAa1FaoXaI,1965
-ml_tools/SQL.py,sha256=WDgdZUYuLBUpv-4Am9XjVY_Aq_jxBWdLrbcgAIEwefI,10704
-ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
-ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
-ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
-ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
-ml_tools/data_exploration.py,sha256=hKA_3U-piJ8TtDWhzX_T2Awkg-25e0DC5E8qloqPo6w,27206
-ml_tools/ensemble_evaluation.py,sha256=xMEMfXJ5MjTkTfr1LkFOeD7iUtnVDCW3S9lm3zT-6tY,24778
-ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
-ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
-ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
-ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
-ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
-ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
-ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
-dragon_ml_toolbox-10.2.0.dist-info/METADATA,sha256=nJ-15xA7A7FgzYDRSi6xjhBmn32Fz57TEn2Wqg5hZRg,6968
-dragon_ml_toolbox-10.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-10.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-10.2.0.dist-info/RECORD,,