dragon-ml-toolbox 13.0.0__py3-none-any.whl → 14.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/METADATA +12 -2
  2. dragon_ml_toolbox-14.7.0.dist-info/RECORD +49 -0
  3. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
  4. ml_tools/MICE_imputation.py +207 -5
  5. ml_tools/ML_configuration.py +108 -0
  6. ml_tools/ML_datasetmaster.py +241 -260
  7. ml_tools/ML_evaluation.py +229 -76
  8. ml_tools/ML_evaluation_multi.py +45 -16
  9. ml_tools/ML_inference.py +0 -1
  10. ml_tools/ML_models.py +135 -55
  11. ml_tools/ML_models_advanced.py +323 -0
  12. ml_tools/ML_optimization.py +49 -36
  13. ml_tools/ML_trainer.py +498 -29
  14. ml_tools/ML_utilities.py +351 -4
  15. ml_tools/ML_vision_datasetmaster.py +1492 -0
  16. ml_tools/ML_vision_evaluation.py +260 -0
  17. ml_tools/ML_vision_inference.py +428 -0
  18. ml_tools/ML_vision_models.py +641 -0
  19. ml_tools/ML_vision_transformers.py +203 -0
  20. ml_tools/PSO_optimization.py +5 -1
  21. ml_tools/_ML_vision_recipe.py +88 -0
  22. ml_tools/__init__.py +1 -0
  23. ml_tools/_schema.py +96 -0
  24. ml_tools/custom_logger.py +37 -14
  25. ml_tools/data_exploration.py +576 -138
  26. ml_tools/ensemble_evaluation.py +53 -10
  27. ml_tools/keys.py +43 -1
  28. ml_tools/math_utilities.py +1 -1
  29. ml_tools/optimization_tools.py +65 -86
  30. ml_tools/serde.py +78 -17
  31. ml_tools/utilities.py +192 -3
  32. dragon_ml_toolbox-13.0.0.dist-info/RECORD +0 -41
  33. ml_tools/ML_simple_optimization.py +0 -413
  34. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/WHEEL +0 -0
  35. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE +0 -0
  36. {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/top_level.txt +0 -0
ml_tools/utilities.py CHANGED
@@ -7,16 +7,19 @@ from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from ._schema import FeatureSchema
 
 
 # Keep track of available tools
 __all__ = [
     "load_dataframe",
     "load_dataframe_greedy",
+    "load_dataframe_with_schema",
     "yield_dataframes_from_dir",
     "merge_dataframes",
     "save_dataframe_filename",
     "save_dataframe",
+    "save_dataframe_with_schema",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
@@ -96,6 +99,7 @@ def load_dataframe(
     elif kind == "polars":
         pl_kwargs: dict[str,Any]
         pl_kwargs = {}
+        pl_kwargs['null_values'] = ["", " "]
        if use_columns:
             pl_kwargs['columns'] = use_columns
 
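The new null_values entry makes the polars path treat empty and single-space cells as missing values at read time. A minimal sketch of that effect, assuming (as the surrounding code suggests) that pl_kwargs is forwarded to pl.read_csv:

import io
import polars as pl

csv_text = io.StringIO("a,b\n1, \n2,x\n")

# Same option the loader now sets: "" and " " parse as null
df = pl.read_csv(csv_text, null_values=["", " "])
print(df["b"].null_count())  # 1 (the single-space cell)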
@@ -173,6 +177,68 @@ def load_dataframe_greedy(directory: Union[str, Path],
     return df
 
 
+def load_dataframe_with_schema(
+    df_path: Union[str, Path],
+    schema: "FeatureSchema",
+    all_strings: bool = False,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Loads a CSV file into a Pandas DataFrame, strictly validating its
+    feature columns against a FeatureSchema.
+
+    This function wraps `load_dataframe`. After loading, it validates
+    that the first N columns of the DataFrame (where N =
+    len(schema.feature_names)) contain *exactly* the set of features
+    specified in the schema.
+
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
+
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
+
+    Args:
+        df_path (str, Path):
+            The path to the CSV file.
+        schema (FeatureSchema):
+            The schema object to validate against.
+        all_strings (bool):
+            If True, loads all columns as string data types.
+
+    Returns:
+        (Tuple[pd.DataFrame, str]):
+            A tuple containing the loaded, validated (and possibly
+            reordered) pandas DataFrame and the base name of the file.
+
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
+        FileNotFoundError:
+            If the file does not exist at the given path.
+    """
+    # Step 1: Load the dataframe using the original function
+    try:
+        df, df_name = load_dataframe(
+            df_path=df_path,
+            use_columns=None,  # Load all columns for validation
+            kind="pandas",
+            all_strings=all_strings,
+            verbose=True
+        )
+    except Exception as e:
+        _LOGGER.error(f"Failed during initial load for schema validation: {e}")
+        raise e
+
+    # Step 2: Call the helper to validate and reorder
+    df_validated = _validate_and_reorder_schema(df=df, schema=schema)
+
+    return df_validated, df_name
+
+
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
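For orientation, a hypothetical usage sketch of load_dataframe_with_schema. Per the helper shown further below, validation only reads schema.feature_names, so a stand-in object with that attribute suffices for illustration; the real FeatureSchema class ships in the new ml_tools/_schema.py and its constructor may differ.

from pathlib import Path
from types import SimpleNamespace

from ml_tools.utilities import load_dataframe_with_schema

# Stand-in schema (hypothetical); only .feature_names is consulted.
schema = SimpleNamespace(feature_names=("age", "height", "weight"))

# data/train.csv is a hypothetical file whose first three columns
# must be exactly {age, height, weight}, in any order.
df, df_name = load_dataframe_with_schema(
    df_path=Path("data/train.csv"),
    schema=schema,  # type: ignore[arg-type]
)
# Features are reordered to schema order if needed; any trailing
# columns are logged as presumed targets.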
@@ -288,15 +354,25 @@ def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Uni
 
     # --- Type-specific saving logic ---
     if isinstance(df, pd.DataFrame):
-        df.to_csv(output_path, index=False, encoding='utf-8')
+        # Transform "" to np.nan before saving
+        df_to_save = df.replace(r'^\s*$', np.nan, regex=True)
+        # Save
+        df_to_save.to_csv(output_path, index=False, encoding='utf-8')
     elif isinstance(df, pl.DataFrame):
-        df.write_csv(output_path) # Polars defaults to utf8 and no index
+        # Transform empty strings to Null
+        df_to_save = df.with_columns(
+            pl.when(pl.col(pl.Utf8).str.strip() == "")
+            .then(None)
+            .otherwise(pl.col(pl.Utf8))
+        )
+        # Save
+        df_to_save.write_csv(output_path)
     else:
         # This error handles cases where an unsupported type is passed
         _LOGGER.error(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
         raise TypeError()
 
-    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df.shape}")
+    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df_to_save.shape}")
 
 
 def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
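The pandas branch now normalizes all-whitespace strings to NaN before writing, so they serialize as empty CSV cells. The transform in isolation:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": ["x", "", "  "], "b": [1, 2, 3]})

# Same regex the new save path applies: any all-whitespace string -> NaN
cleaned = df.replace(r'^\s*$', np.nan, regex=True)
print(cleaned["a"].isna().sum())  # 2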
@@ -319,6 +395,52 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
                             filename=full_path.name)
 
 
+def save_dataframe_with_schema(
+    df: pd.DataFrame,
+    full_path: Path,
+    schema: "FeatureSchema"
+) -> None:
+    """
+    Saves a pandas DataFrame to a CSV, strictly enforcing that the
+    first N columns match the FeatureSchema.
+
+    This function validates that the first N columns of the DataFrame
+    (where N = len(schema.feature_names)) contain *exactly* the set
+    of features specified in the schema.
+
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
+
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
+
+    Args:
+        df (pd.DataFrame):
+            The DataFrame to save.
+        full_path (Path):
+            The complete file path where the DataFrame will be saved.
+        schema (FeatureSchema):
+            The schema object to validate against.
+
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
+    """
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
+        raise ValueError()
+
+    # Call the helper to validate and reorder
+    df_to_save = _validate_and_reorder_schema(df=df, schema=schema)
+
+    # Call the original save function
+    save_dataframe(df=df_to_save, full_path=full_path)
+
+
 def distribute_dataset_by_target(
     df_or_path: Union[pd.DataFrame, str, Path],
     target_columns: list[str],
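As with the loader, a hypothetical sketch of the save-side counterpart; the stand-in schema mirrors the one above and only provides feature_names.

from pathlib import Path
from types import SimpleNamespace

import pandas as pd

from ml_tools.utilities import save_dataframe_with_schema

schema = SimpleNamespace(feature_names=("age", "height"))  # stand-in

# Feature columns out of order on purpose; "label" trails as a target.
df = pd.DataFrame({"height": [170.0], "age": [30], "label": [1]})

# Validates, reorders to (age, height, label), then delegates to
# save_dataframe; a non-.csv path raises ValueError first.
save_dataframe_with_schema(
    df=df,
    full_path=Path("out/data.csv"),
    schema=schema,  # type: ignore[arg-type]
)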
@@ -431,5 +553,72 @@ def train_dataset_yielder(
         yield (df_features, df_target, feature_names, target_col)
 
 
+def _validate_and_reorder_schema(
+    df: pd.DataFrame,
+    schema: "FeatureSchema"
+) -> pd.DataFrame:
+    """
+    Internal helper to validate and reorder a DataFrame against a schema.
+
+    Checks for missing, extra, and out-of-order feature columns
+    (the first N columns). Returns a reordered DataFrame if necessary.
+    Logs all actions.
+
+    Raises:
+        ValueError: If validation fails.
+    """
+    # Get schema and DataFrame column info
+    expected_features = list(schema.feature_names)
+    expected_set = set(expected_features)
+    n_features = len(expected_features)
+
+    all_df_columns = df.columns.to_list()
+
+    # --- Strict Validation ---
+
+    # 0. Check if DataFrame is long enough
+    if len(all_df_columns) < n_features:
+        _LOGGER.error(f"DataFrame has only {len(all_df_columns)} columns, but schema requires {n_features} features.")
+        raise ValueError()
+
+    df_feature_cols = all_df_columns[:n_features]
+    df_feature_set = set(df_feature_cols)
+    df_target_cols = all_df_columns[n_features:]
+
+    # 1. Check for missing features
+    missing_from_df = expected_set - df_feature_set
+    if missing_from_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns are missing required schema features: {missing_from_df}")
+        raise ValueError()
+
+    # 2. Check for extra (unexpected) features
+    extra_in_df = df_feature_set - expected_set
+    if extra_in_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns contain unexpected columns: {extra_in_df}")
+        raise ValueError()
+
+    # --- Reordering ---
+
+    df_to_process = df
+
+    # If we pass validation, the sets are equal. Now check order.
+    if df_feature_cols == expected_features:
+        _LOGGER.info("DataFrame feature columns already match schema order.")
+    else:
+        _LOGGER.warning("DataFrame feature columns do not match schema order. Reordering...")
+
+        # Rebuild the DataFrame with the correct feature order + target columns
+        new_order = expected_features + df_target_cols
+        df_to_process = df[new_order]
+
+    # Log the presumed target columns for user verification
+    if not df_target_cols:
+        _LOGGER.warning(f"No target columns were found after index {n_features-1}.")
+    else:
+        _LOGGER.info(f"Presumed Target Columns: {df_target_cols}")
+
+    return df_to_process  # type: ignore
+
+
 def info():
     _script_info(__all__)
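The core of the helper's check-and-reorder can be reproduced on a toy frame, which may help when reasoning about the first-N-columns contract:

import pandas as pd

expected_features = ["age", "height"]
df = pd.DataFrame({"height": [1.70], "age": [30], "y": [0]})

n = len(expected_features)
head, tail = list(df.columns[:n]), list(df.columns[n:])
assert set(head) == set(expected_features)  # no missing or extra features

# Out of order -> rebuild with schema order first, targets appended
reordered = df[expected_features + tail]
print(list(reordered.columns))  # ['age', 'height', 'y']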
dragon_ml_toolbox-13.0.0.dist-info/RECORD DELETED
@@ -1,41 +0,0 @@
-dragon_ml_toolbox-13.0.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-13.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
-ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
-ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
-ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
-ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
-ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
-ml_tools/ML_datasetmaster.py,sha256=kedCGneR3S2zui0_JFZN6TBL5e69XWkdpkE_QohyqSM,31433
-ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
-ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
-ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
-ml_tools/ML_models.py,sha256=G64NPhYZfYvHTIUwkIrMrNLgfDTKJwqdc8jwesPqB9E,28090
-ml_tools/ML_optimization.py,sha256=es3TlQbY7RYgJMZnznkjYGbUxFnAqzZxE_g3_qLK9Q8,22960
-ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
-ml_tools/ML_simple_optimization.py,sha256=W2mce1XFCuiOHTOjOsCNbETISHn5MwYlYsTIXH5hMMo,18177
-ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
-ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
-ml_tools/PSO_optimization.py,sha256=fVHeemqilBS0zrGV25E5yKwDlGdd2ZKa18d8CZ6Q6Fk,22961
-ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
-ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
-ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
-ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
-ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
-ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
-ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
-ml_tools/data_exploration.py,sha256=haddQFsXAWzuf84NLItcZ4Q7vzN3YWjFoh7lPlWUczo,50679
-ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
-ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
-ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
-ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
-ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
-ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CRw,13521
-ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
-ml_tools/serde.py,sha256=ll2mVC0sO2jIEdG3K6xMcgEN13N4YSb8VjviGvw_ers,4949
-ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
-dragon_ml_toolbox-13.0.0.dist-info/METADATA,sha256=trY1fFyTTXLS6TZdrJXxq4_YMPjEZhKCilzCg6qFxzw,6166
-dragon_ml_toolbox-13.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-13.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-13.0.0.dist-info/RECORD,,