dragon-ml-toolbox 12.13.0__py3-none-any.whl → 14.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/METADATA +11 -2
- dragon_ml_toolbox-14.3.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_callbacks.py +40 -8
- ml_tools/ML_datasetmaster.py +200 -261
- ml_tools/ML_evaluation.py +29 -17
- ml_tools/ML_evaluation_multi.py +13 -10
- ml_tools/ML_inference.py +14 -5
- ml_tools/ML_models.py +135 -55
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +49 -36
- ml_tools/ML_trainer.py +560 -30
- ml_tools/ML_utilities.py +302 -4
- ml_tools/ML_vision_datasetmaster.py +1352 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_schema.py +96 -0
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +576 -138
- ml_tools/keys.py +51 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +78 -17
- ml_tools/utilities.py +192 -3
- dragon_ml_toolbox-12.13.0.dist-info/RECORD +0 -41
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/top_level.txt +0 -0
ml_tools/utilities.py
CHANGED
@@ -7,16 +7,19 @@ from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from ._schema import FeatureSchema
 
 
 # Keep track of available tools
 __all__ = [
     "load_dataframe",
     "load_dataframe_greedy",
+    "load_dataframe_with_schema",
     "yield_dataframes_from_dir",
     "merge_dataframes",
     "save_dataframe_filename",
     "save_dataframe",
+    "save_dataframe_with_schema",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
@@ -96,6 +99,7 @@ def load_dataframe(
     elif kind == "polars":
         pl_kwargs: dict[str,Any]
         pl_kwargs = {}
+        pl_kwargs['null_values'] = ["", " "]
         if use_columns:
             pl_kwargs['columns'] = use_columns
 
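Note: the new `pl_kwargs['null_values'] = ["", " "]` default means the polars path now treats empty and single-space CSV cells as nulls rather than as strings, assuming the loader forwards these kwargs to `polars.read_csv`. A minimal standalone sketch of that `null_values` option (the file name and data are hypothetical):

    import polars as pl

    # Hypothetical CSV: column "b" has an empty cell and a single-space cell,
    # which null_values=["", " "] parses as nulls instead of strings.
    with open("demo.csv", "w") as f:
        f.write("a,b\n1,x\n2,\n3, \n")

    df = pl.read_csv("demo.csv", null_values=["", " "])
    print(df["b"].null_count())  # 2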
@@ -173,6 +177,68 @@ def load_dataframe_greedy(directory: Union[str, Path],
     return df
 
 
+def load_dataframe_with_schema(
+    df_path: Union[str, Path],
+    schema: "FeatureSchema",
+    all_strings: bool = False,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Loads a CSV file into a Pandas DataFrame, strictly validating its
+    feature columns against a FeatureSchema.
+
+    This function wraps `load_dataframe`. After loading, it validates
+    that the first N columns of the DataFrame (where N =
+    len(schema.feature_names)) contain *exactly* the set of features
+    specified in the schema.
+
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
+
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
+
+    Args:
+        df_path (str, Path):
+            The path to the CSV file.
+        schema (FeatureSchema):
+            The schema object to validate against.
+        all_strings (bool):
+            If True, loads all columns as string data types.
+
+    Returns:
+        (Tuple[pd.DataFrame, str]):
+            A tuple containing the loaded, validated (and possibly
+            reordered) pandas DataFrame and the base name of the file.
+
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
+        FileNotFoundError:
+            If the file does not exist at the given path.
+    """
+    # Step 1: Load the dataframe using the original function
+    try:
+        df, df_name = load_dataframe(
+            df_path=df_path,
+            use_columns=None, # Load all columns for validation
+            kind="pandas",
+            all_strings=all_strings,
+            verbose=True
+        )
+    except Exception as e:
+        _LOGGER.error(f"Failed during initial load for schema validation: {e}")
+        raise e
+
+    # Step 2: Call the helper to validate and reorder
+    df_validated = _validate_and_reorder_schema(df=df, schema=schema)
+
+    return df_validated, df_name
+
+
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
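Note: a usage sketch for the new loader. The hunk above shows that validation only reads `schema.feature_names`, so a stand-in object is used here instead of a real `FeatureSchema`; the file name, column names, and values are hypothetical:

    from types import SimpleNamespace

    import pandas as pd

    from ml_tools.utilities import load_dataframe_with_schema

    # Stand-in for FeatureSchema (the real class lives in ml_tools._schema);
    # per _validate_and_reorder_schema, only .feature_names is accessed.
    schema = SimpleNamespace(feature_names=("age", "height", "weight"))

    # Feature columns deliberately out of order; "label" trails as a target.
    pd.DataFrame({"height": [1.7], "age": [30], "weight": [70.0],
                  "label": [1]}).to_csv("demo.csv", index=False)

    df, df_name = load_dataframe_with_schema("demo.csv", schema=schema)  # type: ignore[arg-type]
    print(df.columns.to_list())  # ['age', 'height', 'weight', 'label']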
@@ -288,15 +354,25 @@ def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Uni
 
     # --- Type-specific saving logic ---
     if isinstance(df, pd.DataFrame):
-        df.to_csv(output_path, index=False, encoding='utf-8')
+        # Transform "" to np.nan before saving
+        df_to_save = df.replace(r'^\s*$', np.nan, regex=True)
+        # Save
+        df_to_save.to_csv(output_path, index=False, encoding='utf-8')
     elif isinstance(df, pl.DataFrame):
-        df.write_csv(output_path)
+        # Transform empty strings to Null
+        df_to_save = df.with_columns(
+            pl.when(pl.col(pl.Utf8).str.strip() == "")
+            .then(None)
+            .otherwise(pl.col(pl.Utf8))
+        )
+        # Save
+        df_to_save.write_csv(output_path)
     else:
         # This error handles cases where an unsupported type is passed
         _LOGGER.error(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
         raise TypeError()
 
-    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df.shape}")
+    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df_to_save.shape}")
 
 
 def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
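Note: the pandas branch above normalizes empty and whitespace-only cells to NaN before writing, so they round-trip as genuinely empty CSV fields. The same one-liner in isolation (data hypothetical):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": ["x", "", "   "], "b": [1, 2, 3]})
    # Whitespace-only strings become NaN, which to_csv writes as empty fields.
    df_to_save = df.replace(r'^\s*$', np.nan, regex=True)
    print(df_to_save["a"].isna().tolist())  # [False, True, True]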
@@ -319,6 +395,52 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
         filename=full_path.name)
 
 
+def save_dataframe_with_schema(
+    df: pd.DataFrame,
+    full_path: Path,
+    schema: "FeatureSchema"
+) -> None:
+    """
+    Saves a pandas DataFrame to a CSV, strictly enforcing that the
+    first N columns match the FeatureSchema.
+
+    This function validates that the first N columns of the DataFrame
+    (where N = len(schema.feature_names)) contain *exactly* the set
+    of features specified in the schema.
+
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
+
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
+
+    Args:
+        df (pd.DataFrame):
+            The DataFrame to save.
+        full_path (Path):
+            The complete file path where the DataFrame will be saved.
+        schema (FeatureSchema):
+            The schema object to validate against.
+
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
+    """
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
+        raise ValueError()
+
+    # Call the helper to validate and reorder
+    df_to_save = _validate_and_reorder_schema(df=df, schema=schema)
+
+    # Call the original save function
+    save_dataframe(df=df_to_save, full_path=full_path)
+
+
 def distribute_dataset_by_target(
     df_or_path: Union[pd.DataFrame, str, Path],
     target_columns: list[str],
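Note: a matching save-side sketch, under the same stand-in-schema assumption as above (only `schema.feature_names` is read during validation; names and path hypothetical):

    from pathlib import Path
    from types import SimpleNamespace

    import pandas as pd

    from ml_tools.utilities import save_dataframe_with_schema

    schema = SimpleNamespace(feature_names=("age", "height"))
    df = pd.DataFrame({"height": [1.7], "age": [30], "label": [1]})

    # Reorders the features to ("age", "height") before saving and logs
    # "label" as a presumed target column; a non-.csv path raises ValueError.
    save_dataframe_with_schema(df=df, full_path=Path("out.csv"), schema=schema)  # type: ignore[arg-type]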
@@ -431,5 +553,72 @@ def train_dataset_yielder(
         yield (df_features, df_target, feature_names, target_col)
 
 
+def _validate_and_reorder_schema(
+    df: pd.DataFrame,
+    schema: "FeatureSchema"
+) -> pd.DataFrame:
+    """
+    Internal helper to validate and reorder a DataFrame against a schema.
+
+    Checks for missing, extra, and out-of-order feature columns
+    (the first N columns). Returns a reordered DataFrame if necessary.
+    Logs all actions.
+
+    Raises:
+        ValueError: If validation fails.
+    """
+    # Get schema and DataFrame column info
+    expected_features = list(schema.feature_names)
+    expected_set = set(expected_features)
+    n_features = len(expected_features)
+
+    all_df_columns = df.columns.to_list()
+
+    # --- Strict Validation ---
+
+    # 0. Check if DataFrame is long enough
+    if len(all_df_columns) < n_features:
+        _LOGGER.error(f"DataFrame has only {len(all_df_columns)} columns, but schema requires {n_features} features.")
+        raise ValueError()
+
+    df_feature_cols = all_df_columns[:n_features]
+    df_feature_set = set(df_feature_cols)
+    df_target_cols = all_df_columns[n_features:]
+
+    # 1. Check for missing features
+    missing_from_df = expected_set - df_feature_set
+    if missing_from_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns are missing required schema features: {missing_from_df}")
+        raise ValueError()
+
+    # 2. Check for extra (unexpected) features
+    extra_in_df = df_feature_set - expected_set
+    if extra_in_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns contain unexpected columns: {extra_in_df}")
+        raise ValueError()
+
+    # --- Reordering ---
+
+    df_to_process = df
+
+    # If we pass validation, the sets are equal. Now check order.
+    if df_feature_cols == expected_features:
+        _LOGGER.info("DataFrame feature columns already match schema order.")
+    else:
+        _LOGGER.warning("DataFrame feature columns do not match schema order. Reordering...")
+
+        # Rebuild the DataFrame with the correct feature order + target columns
+        new_order = expected_features + df_target_cols
+        df_to_process = df[new_order]
+
+    # Log the presumed target columns for user verification
+    if not df_target_cols:
+        _LOGGER.warning(f"No target columns were found after index {n_features-1}.")
+    else:
+        _LOGGER.info(f"Presumed Target Columns: {df_target_cols}")
+
+    return df_to_process # type: ignore
+
+
 def info():
     _script_info(__all__)
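Note: the core of `_validate_and_reorder_schema` distilled into a few lines, with hypothetical column names: a set comparison over the first N column names catches missing and unexpected features, then an ordered comparison decides whether to reorder:

    expected = ["age", "height", "weight"]          # schema.feature_names
    cols = ["height", "age", "weight", "label"]     # df.columns

    n = len(expected)
    feature_cols, target_cols = cols[:n], cols[n:]

    assert not set(expected) - set(feature_cols)    # no missing features
    assert not set(feature_cols) - set(expected)    # no unexpected features
    if feature_cols != expected:                    # same set, wrong order
        cols = expected + target_cols               # reorder features first
    print(cols)  # ['age', 'height', 'weight', 'label']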
dragon_ml_toolbox-12.13.0.dist-info/RECORD
DELETED
@@ -1,41 +0,0 @@
-dragon_ml_toolbox-12.13.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-12.13.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
-ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
-ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
-ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
-ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
-ml_tools/ML_callbacks.py,sha256=2ZazJjlbClP-ALc8q0ru2oalkugbhO3TFwPg4RFZpck,14056
-ml_tools/ML_datasetmaster.py,sha256=kedCGneR3S2zui0_JFZN6TBL5e69XWkdpkE_QohyqSM,31433
-ml_tools/ML_evaluation.py,sha256=h7fAtk0lS4gTqQ46fiVjucTvFlX4rsufKnEtate6Nu0,18381
-ml_tools/ML_evaluation_multi.py,sha256=Kn9n5lfxo7A0TvgIDMx8UHZCvzTqv1ViezzwJBF-ypM,15970
-ml_tools/ML_inference.py,sha256=ymFvncFsU10PExq87xnEj541DKV5ck0nMuK8ToJHzVQ,23067
-ml_tools/ML_models.py,sha256=G64NPhYZfYvHTIUwkIrMrNLgfDTKJwqdc8jwesPqB9E,28090
-ml_tools/ML_optimization.py,sha256=es3TlQbY7RYgJMZnznkjYGbUxFnAqzZxE_g3_qLK9Q8,22960
-ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
-ml_tools/ML_simple_optimization.py,sha256=W2mce1XFCuiOHTOjOsCNbETISHn5MwYlYsTIXH5hMMo,18177
-ml_tools/ML_trainer.py,sha256=UmCuKr_GzQGYqhEZ-kaRv9Buj44DsNyuOzmOM7Fw8N0,24569
-ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
-ml_tools/PSO_optimization.py,sha256=fVHeemqilBS0zrGV25E5yKwDlGdd2ZKa18d8CZ6Q6Fk,22961
-ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
-ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
-ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
-ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
-ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
-ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
-ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
-ml_tools/data_exploration.py,sha256=haddQFsXAWzuf84NLItcZ4Q7vzN3YWjFoh7lPlWUczo,50679
-ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
-ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
-ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
-ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
-ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
-ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CRw,13521
-ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
-ml_tools/serde.py,sha256=ll2mVC0sO2jIEdG3K6xMcgEN13N4YSb8VjviGvw_ers,4949
-ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
-dragon_ml_toolbox-12.13.0.dist-info/METADATA,sha256=p3-oOSqq1hhJj13KjIXeFnwBu3UTfBJu5mTDL9MCpdU,6167
-dragon_ml_toolbox-12.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-12.13.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-12.13.0.dist-info/RECORD,,