dragon-ml-toolbox 13.6.0__tar.gz → 13.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (46)
  1. {dragon_ml_toolbox-13.6.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-13.7.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/MICE_imputation.py +207 -5
  4. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/utilities.py +178 -0
  5. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/LICENSE +0 -0
  7. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/README.md +0 -0
  9. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ETL_cleaning.py +0 -0
  14. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ETL_engineering.py +0 -0
  15. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/GUI_tools.py +0 -0
  16. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_evaluation_multi.py +0 -0
  20. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_models.py +0 -0
  22. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_optimization.py +0 -0
  23. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_scaler.py +0 -0
  24. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_trainer.py +0 -0
  25. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ML_utilities.py +0 -0
  26. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/PSO_optimization.py +0 -0
  27. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/RNN_forecast.py +0 -0
  28. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/SQL.py +0 -0
  29. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/VIF_factor.py +0 -0
  30. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/__init__.py +0 -0
  31. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/_logger.py +0 -0
  32. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/_schema.py +0 -0
  33. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/_script_info.py +0 -0
  34. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/constants.py +0 -0
  35. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/custom_logger.py +0 -0
  36. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/data_exploration.py +0 -0
  37. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ensemble_evaluation.py +0 -0
  38. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ensemble_inference.py +0 -0
  39. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/ensemble_learning.py +0 -0
  40. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/handle_excel.py +0 -0
  41. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/keys.py +0 -0
  42. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/math_utilities.py +0 -0
  43. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/optimization_tools.py +0 -0
  44. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/path_manager.py +0 -0
  45. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/ml_tools/serde.py +0 -0
  46. {dragon_ml_toolbox-13.6.0 → dragon_ml_toolbox-13.7.0}/setup.cfg +0 -0
dragon_ml_toolbox-13.7.0/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 13.6.0
+ Version: 13.7.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT

dragon_ml_toolbox-13.7.0/dragon_ml_toolbox.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 13.6.0
+ Version: 13.7.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
dragon_ml_toolbox-13.7.0/ml_tools/MICE_imputation.py
@@ -7,19 +7,20 @@ from plotnine import ggplot, labs, theme, element_blank # type: ignore
  from typing import Optional, Union

  from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
- from .math_utilities import threshold_binary_values
+ from .math_utilities import threshold_binary_values, discretize_categorical_values
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
  from ._logger import _LOGGER
  from ._script_info import _script_info
+ from ._schema import FeatureSchema


  __all__ = [
+     "MiceImputer",
      "apply_mice",
      "save_imputed_datasets",
-     "get_na_column_names",
      "get_convergence_diagnostic",
      "get_imputed_distributions",
-     "run_mice_pipeline"
+     "run_mice_pipeline",
  ]


@@ -79,7 +80,7 @@ def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df


  #Get names of features that had missing values before imputation
- def get_na_column_names(df: pd.DataFrame):
+ def _get_na_column_names(df: pd.DataFrame):
      return [col for col in df.columns if df[col].isna().any()]


@@ -264,7 +265,7 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]

          save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

-         imputed_column_names = get_na_column_names(df=df)
+         imputed_column_names = _get_na_column_names(df=df)

          get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)

@@ -278,5 +279,206 @@ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
      return df_feats, df_targets


+ # modern implementation
+ class MiceImputer:
+     """
+     A modern MICE imputation pipeline that uses a FeatureSchema
+     to correctly discretize categorical features after imputation.
+     """
+     def __init__(self,
+                  schema: FeatureSchema,
+                  iterations: int=20,
+                  resulting_datasets: int = 1,
+                  random_state: int = 101):
+
+         self.schema = schema
+         self.random_state = random_state
+         self.iterations = iterations
+         self.resulting_datasets = resulting_datasets
+
+         # --- Store schema info ---
+
+         # 1. Categorical info
+         if not self.schema.categorical_index_map:
+             _LOGGER.warning("FeatureSchema has no 'categorical_index_map'. No discretization will be applied.")
+             self.cat_info = {}
+         else:
+             self.cat_info = self.schema.categorical_index_map
+
+         # 2. Ordered feature names (critical for index mapping)
+         self.ordered_features = list(self.schema.feature_names)
+
+         # 3. Names of categorical features
+         self.categorical_features = list(self.schema.categorical_feature_names)
+
+         _LOGGER.info(f"MiceImputer initialized. Found {len(self.cat_info)} categorical features to discretize.")
+
+     def _post_process(self, imputed_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Applies schema-based discretization to a completed dataframe.
+
+         This method works around the behavior of `discretize_categorical_values`
+         (which returns a full int32 array) by:
+         1. Calling it on the full, ordered feature array.
+         2. Extracting *only* the valid discretized categorical columns.
+         3. Updating the original float dataframe with these integer values.
+         """
+         # If no categorical features are defined, return the df as-is.
+         if not self.cat_info:
+             return imputed_df
+
+         try:
+             # 1. Ensure DataFrame columns match the schema order
+             # This is critical for the index-based categorical_info
+             df_ordered: pd.DataFrame = imputed_df[self.ordered_features] # type: ignore
+
+             # 2. Convert to NumPy array
+             array_ordered = df_ordered.to_numpy()
+
+             # 3. Apply discretization utility (which returns a full int32 array)
+             # This array has *correct* categorical values but *truncated* continuous values.
+             discretized_array_int32 = discretize_categorical_values(
+                 array_ordered,
+                 self.cat_info,
+                 start_at_zero=True # Assuming 0-based indexing
+             )
+
+             # 4. Create a new DF from the int32 array, keeping the categorical columns.
+             df_discretized_cats = pd.DataFrame(
+                 discretized_array_int32,
+                 columns=self.ordered_features,
+                 index=df_ordered.index # <-- Critical: align index
+             )[self.categorical_features] # <-- Select only cat features
+
+             # 5. "Rejoin": Start with a fresh copy of the *original* imputed DF (which has correct continuous floats).
+             final_df = df_ordered.copy()
+
+             # 6. Use .update() to "paste" the integer categorical values
+             # over the old float categorical values. Continuous floats are unaffected.
+             final_df.update(df_discretized_cats)
+
+             return final_df
+
+         except Exception as e:
+             _LOGGER.error(f"Failed during post-processing discretization:\n\tInput DF shape: {imputed_df.shape}\n\tSchema features: {len(self.ordered_features)}\n\tCategorical info keys: {list(self.cat_info.keys())}\n{e}")
+             raise
+
+     def _run_mice(self,
+                   df: pd.DataFrame,
+                   df_name: str) -> tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+         """
+         Runs the MICE kernel and applies schema-based post-processing.
+
+         Parameters:
+             df (pd.DataFrame): The input dataframe *with NaNs*. Should only contain feature columns.
+             df_name (str): The base name for the dataset.
+
+         Returns:
+             tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+                 - The trained MICE kernel
+                 - A list of imputed and processed DataFrames
+                 - A list of names for the new DataFrames
+         """
+         # Ensure input df only contains features from the schema and is in the correct order.
+         try:
+             df_feats = df[self.ordered_features]
+         except KeyError as e:
+             _LOGGER.error(f"Input DataFrame is missing required schema columns: {e}")
+             raise
+
+         # 1. Initialize kernel
+         kernel = mf.ImputationKernel(
+             data=df_feats,
+             num_datasets=self.resulting_datasets,
+             random_state=self.random_state
+         )
+
+         _LOGGER.info("➡️ Schema-based MICE imputation running...")
+
+         # 2. Perform MICE
+         kernel.mice(self.iterations)
+
+         # 3. Retrieve, process, and collect datasets
+         imputed_datasets = []
+         for i in range(self.resulting_datasets):
+             # complete_data returns a pd.DataFrame
+             completed_df = kernel.complete_data(dataset=i)
+
+             # Apply our new discretization and ordering
+             processed_df = self._post_process(completed_df)
+             imputed_datasets.append(processed_df)
+
+         if not imputed_datasets:
+             _LOGGER.error("No imputed datasets were generated.")
+             raise ValueError()
+
+         # 4. Generate names
+         if self.resulting_datasets == 1:
+             imputed_dataset_names = [f"{df_name}_MICE"]
+         else:
+             imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(self.resulting_datasets)]
+
+         # 5. Validate indexes
+         for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+             assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}"
+             assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}"
+
+         _LOGGER.info("Schema-based MICE imputation complete.")
+
+         return kernel, imputed_datasets, imputed_dataset_names
+
+     def run_pipeline(self,
+                      df_path_or_dir: Union[str,Path],
+                      save_datasets_dir: Union[str,Path],
+                      save_metrics_dir: Union[str,Path],
+                      ):
+         """
+         Runs the complete MICE imputation pipeline.
+
+         This method automates the entire workflow:
+         1. Loads data from a CSV file path or a directory with CSV files.
+         2. Separates features and targets based on the `FeatureSchema`.
+         3. Runs the MICE algorithm on the feature set.
+         4. Applies schema-based post-processing to discretize categorical features.
+         5. Saves the final, processed, and imputed dataset(s) (re-joined with targets) to `save_datasets_dir`.
+         6. Generates and saves convergence and distribution plots for all imputed columns to `save_metrics_dir`.
+
+         Parameters
+         ----------
+         df_path_or_dir : [str,Path]
+             Path to a single CSV file or a directory containing multiple CSV files to impute.
+         save_datasets_dir : [str,Path]
+             Directory where the final imputed and processed dataset(s) will be saved as CSVs.
+         save_metrics_dir : [str,Path]
+             Directory where convergence and distribution plots will be saved.
+         """
+         # Check paths
+         save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+         save_metrics_path = make_fullpath(save_metrics_dir, make=True)
+
+         input_path = make_fullpath(df_path_or_dir)
+         if input_path.is_file():
+             all_file_paths = [input_path]
+         else:
+             all_file_paths = list(list_csv_paths(input_path).values())
+
+         for df_path in all_file_paths:
+
+             df, df_name = load_dataframe(df_path=df_path, kind="pandas")
+
+             df_features: pd.DataFrame = df[self.schema.feature_names] # type: ignore
+             df_targets = df.drop(columns=self.schema.feature_names)
+
+             imputed_column_names = _get_na_column_names(df=df_features)
+
+             kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_features, df_name=df_name)
+
+             save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
+
+             get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
+
+             get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
+
+
  def info():
      _script_info(__all__)
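
For orientation, here is a minimal usage sketch of the new MiceImputer class, based only on the signatures visible in this diff. The FeatureSchema construction and all file paths below are placeholders, not part of the release:

    from pathlib import Path
    from ml_tools.MICE_imputation import MiceImputer
    from ml_tools._schema import FeatureSchema

    # Assumption: a FeatureSchema describing the ordered feature columns and
    # the categorical index map has already been built elsewhere.
    schema: FeatureSchema = ...  # placeholder, construction not shown in this diff

    imputer = MiceImputer(
        schema=schema,
        iterations=20,           # MICE iterations per kernel
        resulting_datasets=1,    # number of imputed datasets to produce
        random_state=101,
    )

    # Loads the CSV(s), imputes the feature columns, discretizes categorical
    # features via the schema, then saves the datasets and diagnostic plots.
    imputer.run_pipeline(
        df_path_or_dir=Path("data/raw"),           # placeholder path
        save_datasets_dir=Path("data/imputed"),    # placeholder path
        save_metrics_dir=Path("reports/mice"),     # placeholder path
    )
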
dragon_ml_toolbox-13.7.0/ml_tools/utilities.py
@@ -7,16 +7,19 @@ from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
  from ._script_info import _script_info
  from ._logger import _LOGGER
+ from ._schema import FeatureSchema


  # Keep track of available tools
  __all__ = [
      "load_dataframe",
      "load_dataframe_greedy",
+     "load_dataframe_with_schema",
      "yield_dataframes_from_dir",
      "merge_dataframes",
      "save_dataframe_filename",
      "save_dataframe",
+     "save_dataframe_with_schema",
      "distribute_dataset_by_target",
      "train_dataset_orchestrator",
      "train_dataset_yielder"
@@ -174,6 +177,68 @@ def load_dataframe_greedy(directory: Union[str, Path],
      return df


+ def load_dataframe_with_schema(
+     df_path: Union[str, Path],
+     schema: "FeatureSchema",
+     all_strings: bool = False,
+ ) -> Tuple[pd.DataFrame, str]:
+     """
+     Loads a CSV file into a Pandas DataFrame, strictly validating its
+     feature columns against a FeatureSchema.
+
+     This function wraps `load_dataframe`. After loading, it validates
+     that the first N columns of the DataFrame (where N =
+     len(schema.feature_names)) contain *exactly* the set of features
+     specified in the schema.
+
+     - If the columns are present but out of order, they are reordered.
+     - If any required feature is missing from the first N columns, it fails.
+     - If any extra column is found within the first N columns, it fails.
+
+     Columns *after* the first N are considered target columns and are
+     logged for verification.
+
+     Args:
+         df_path (str, Path):
+             The path to the CSV file.
+         schema (FeatureSchema):
+             The schema object to validate against.
+         all_strings (bool):
+             If True, loads all columns as string data types.
+
+     Returns:
+         (Tuple[pd.DataFrame, str]):
+             A tuple containing the loaded, validated (and possibly
+             reordered) pandas DataFrame and the base name of the file.
+
+     Raises:
+         ValueError:
+             - If the DataFrame is missing columns required by the schema
+               within its first N columns.
+             - If the DataFrame's first N columns contain unexpected
+               columns that are not in the schema.
+         FileNotFoundError:
+             If the file does not exist at the given path.
+     """
+     # Step 1: Load the dataframe using the original function
+     try:
+         df, df_name = load_dataframe(
+             df_path=df_path,
+             use_columns=None, # Load all columns for validation
+             kind="pandas",
+             all_strings=all_strings,
+             verbose=True
+         )
+     except Exception as e:
+         _LOGGER.error(f"Failed during initial load for schema validation: {e}")
+         raise e
+
+     # Step 2: Call the helper to validate and reorder
+     df_validated = _validate_and_reorder_schema(df=df, schema=schema)
+
+     return df_validated, df_name
+
+
  def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
      """
      Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
@@ -330,6 +395,52 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
                              filename=full_path.name)


+ def save_dataframe_with_schema(
+     df: pd.DataFrame,
+     full_path: Path,
+     schema: "FeatureSchema"
+ ) -> None:
+     """
+     Saves a pandas DataFrame to a CSV, strictly enforcing that the
+     first N columns match the FeatureSchema.
+
+     This function validates that the first N columns of the DataFrame
+     (where N = len(schema.feature_names)) contain *exactly* the set
+     of features specified in the schema.
+
+     - If the columns are present but out of order, they are reordered.
+     - If any required feature is missing from the first N columns, it fails.
+     - If any extra column is found within the first N columns, it fails.
+
+     Columns *after* the first N are considered target columns and are
+     logged for verification.
+
+     Args:
+         df (pd.DataFrame):
+             The DataFrame to save.
+         full_path (Path):
+             The complete file path where the DataFrame will be saved.
+         schema (FeatureSchema):
+             The schema object to validate against.
+
+     Raises:
+         ValueError:
+             - If the DataFrame is missing columns required by the schema
+               within its first N columns.
+             - If the DataFrame's first N columns contain unexpected
+               columns that are not in the schema.
+     """
+     if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+         _LOGGER.error('A path object pointing to a .csv file must be provided.')
+         raise ValueError()
+
+     # Call the helper to validate and reorder
+     df_to_save = _validate_and_reorder_schema(df=df, schema=schema)
+
+     # Call the original save function
+     save_dataframe(df=df_to_save, full_path=full_path)
+
+
  def distribute_dataset_by_target(
      df_or_path: Union[pd.DataFrame, str, Path],
      target_columns: list[str],
@@ -442,5 +553,72 @@ def train_dataset_yielder(
          yield (df_features, df_target, feature_names, target_col)


+ def _validate_and_reorder_schema(
+     df: pd.DataFrame,
+     schema: "FeatureSchema"
+ ) -> pd.DataFrame:
+     """
+     Internal helper to validate and reorder a DataFrame against a schema.
+
+     Checks for missing, extra, and out-of-order feature columns
+     (the first N columns). Returns a reordered DataFrame if necessary.
+     Logs all actions.
+
+     Raises:
+         ValueError: If validation fails.
+     """
+     # Get schema and DataFrame column info
+     expected_features = list(schema.feature_names)
+     expected_set = set(expected_features)
+     n_features = len(expected_features)
+
+     all_df_columns = df.columns.to_list()
+
+     # --- Strict Validation ---
+
+     # 0. Check if DataFrame is long enough
+     if len(all_df_columns) < n_features:
+         _LOGGER.error(f"DataFrame has only {len(all_df_columns)} columns, but schema requires {n_features} features.")
+         raise ValueError()
+
+     df_feature_cols = all_df_columns[:n_features]
+     df_feature_set = set(df_feature_cols)
+     df_target_cols = all_df_columns[n_features:]
+
+     # 1. Check for missing features
+     missing_from_df = expected_set - df_feature_set
+     if missing_from_df:
+         _LOGGER.error(f"DataFrame's first {n_features} columns are missing required schema features: {missing_from_df}")
+         raise ValueError()
+
+     # 2. Check for extra (unexpected) features
+     extra_in_df = df_feature_set - expected_set
+     if extra_in_df:
+         _LOGGER.error(f"DataFrame's first {n_features} columns contain unexpected columns: {extra_in_df}")
+         raise ValueError()
+
+     # --- Reordering ---
+
+     df_to_process = df
+
+     # If we pass validation, the sets are equal. Now check order.
+     if df_feature_cols == expected_features:
+         _LOGGER.info("DataFrame feature columns already match schema order.")
+     else:
+         _LOGGER.warning("DataFrame feature columns do not match schema order. Reordering...")
+
+         # Rebuild the DataFrame with the correct feature order + target columns
+         new_order = expected_features + df_target_cols
+         df_to_process = df[new_order]
+
+     # Log the presumed target columns for user verification
+     if not df_target_cols:
+         _LOGGER.warning(f"No target columns were found after index {n_features-1}.")
+     else:
+         _LOGGER.info(f"Presumed Target Columns: {df_target_cols}")
+
+     return df_to_process # type: ignore
+
+
  def info():
      _script_info(__all__)
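
A short sketch of how the two new schema-aware helpers in utilities.py pair up, again based only on the signatures added in this diff; the schema object and the file paths are placeholders:

    from pathlib import Path
    from ml_tools.utilities import load_dataframe_with_schema, save_dataframe_with_schema
    from ml_tools._schema import FeatureSchema

    schema: FeatureSchema = ...  # placeholder, assumed to be built elsewhere

    # Load and validate: the first N columns must match the schema's feature
    # names exactly (reordered if needed); trailing columns are treated as targets.
    df, df_name = load_dataframe_with_schema(
        df_path=Path("data/dataset.csv"),  # placeholder path
        schema=schema,
    )

    # ... any processing on df ...

    # Save with the same validation; full_path must be a Path ending in .csv.
    save_dataframe_with_schema(
        df=df,
        full_path=Path("data/validated") / f"{df_name}.csv",  # placeholder path
        schema=schema,
    )
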
dragon_ml_toolbox-13.7.0/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "dragon-ml-toolbox"
- version = "13.6.0"
+ version = "13.7.0"
  description = "A collection of tools for data science and machine learning projects."
  authors = [
      { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }