dragon-ml-toolbox 12.7.0__py3-none-any.whl → 12.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dragon-ml-toolbox might be problematic; see the registry page for more details.
- {dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/RECORD +9 -9
- ml_tools/ML_utilities.py +13 -2
- ml_tools/data_exploration.py +88 -22
- ml_tools/optimization_tools.py +3 -3
- {dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
-dragon_ml_toolbox-12.7.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-12.7.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+dragon_ml_toolbox-12.9.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-12.9.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
 ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
 ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
 ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
@@ -14,7 +14,7 @@ ml_tools/ML_optimization.py,sha256=es3TlQbY7RYgJMZnznkjYGbUxFnAqzZxE_g3_qLK9Q8,2
 ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
 ml_tools/ML_simple_optimization.py,sha256=W2mce1XFCuiOHTOjOsCNbETISHn5MwYlYsTIXH5hMMo,18177
 ml_tools/ML_trainer.py,sha256=_g48w5Ak-wQr5fGHdJqlcpnzv3gWyL1ghkOhy9VOZbo,23930
-ml_tools/ML_utilities.py,sha256=
+ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
 ml_tools/PSO_optimization.py,sha256=fVHeemqilBS0zrGV25E5yKwDlGdd2ZKa18d8CZ6Q6Fk,22961
 ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
 ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
@@ -24,18 +24,18 @@ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/custom_logger.py,sha256=xot-VeZFigKjcVxADgzvI54vZO_MqMMejo7JmDED8Xo,5892
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=bERsPSmV8h5YtJEzkVVYXkOyvdq60g-3cvJRhFjnI_A,50270
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
 ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
 ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
 ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
 ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=
+ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CRw,13521
 ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
 ml_tools/serde.py,sha256=UIshIesHRFmxr8F6B3LxGG8bYc1HHK-nlE3kENSZL18,5288
 ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
-dragon_ml_toolbox-12.7.0.dist-info/METADATA,sha256=
-dragon_ml_toolbox-12.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-12.7.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-12.7.0.dist-info/RECORD,,
+dragon_ml_toolbox-12.9.0.dist-info/METADATA,sha256=AAbJFe1QFwEU3uFZV9mUOhXubpSBSRXAHO6zOtJiX10,6166
+dragon_ml_toolbox-12.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-12.9.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-12.9.0.dist-info/RECORD,,
ml_tools/ML_utilities.py
CHANGED
@@ -1,12 +1,13 @@
 import pandas as pd
 from pathlib import Path
-from typing import Union, Any
+from typing import Union, Any, Optional

 from .path_manager import make_fullpath, list_subdirectories, list_files_by_extension
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .keys import DatasetKeys, PytorchModelArchitectureKeys, PytorchArtifactPathKeys, SHAPKeys
 from .utilities import load_dataframe
+from .custom_logger import save_list_strings


 __all__ = [
@@ -139,6 +140,7 @@ def find_model_artifacts(target_directory: Union[str,Path], load_scaler: bool, v
 def select_features_by_shap(
     root_directory: Union[str, Path],
     shap_threshold: float,
+    log_feature_names_directory: Optional[Union[str, Path]],
     verbose: bool = True) -> list[str]:
     """
     Scans subdirectories to find SHAP summary CSVs, then extracts feature
@@ -148,11 +150,13 @@ def select_features_by_shap(
     importance scores aggregated from multiple models.

     Args:
-        root_directory (
+        root_directory (str | Path):
             The path to the root directory that contains model subdirectories.
         shap_threshold (float):
             The minimum mean absolute SHAP value for a feature to be included
             in the final list.
+        log_feature_names_directory (str | Path | None):
+            If given, saves the chosen feature names as a .txt file in this directory.

     Returns:
         list[str]:
@@ -211,6 +215,13 @@ def select_features_by_shap(
     final_features = sorted(list(master_feature_set))
     if verbose:
         _LOGGER.info(f"Selected {len(final_features)} unique features across all files.")
+
+    if log_feature_names_directory is not None:
+        save_names_path = make_fullpath(log_feature_names_directory, make=True, enforce="directory")
+        save_list_strings(list_strings=final_features,
+                          directory=save_names_path,
+                          filename=DatasetKeys.FEATURE_NAMES,
+                          verbose=verbose)

     return final_features

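Net effect: `select_features_by_shap` gains a required `log_feature_names_directory` argument that, when not `None`, also writes the selected feature names to a .txt file. A minimal usage sketch, assuming the import path shown in the diff; the directory names are hypothetical:

```python
from ml_tools.ML_utilities import select_features_by_shap

# Scan model subdirectories under "shap_runs" for SHAP summary CSVs and keep
# features whose mean absolute SHAP value meets the threshold. The new
# argument has no default, so pass None explicitly to skip the .txt log.
selected = select_features_by_shap(
    root_directory="shap_runs",
    shap_threshold=0.01,
    log_feature_names_directory="shap_runs/selected_features",
    verbose=True,
)
print(f"{len(selected)} features selected")
```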
ml_tools/data_exploration.py
CHANGED
@@ -346,6 +346,7 @@ def encode_categorical_features(
     df: pd.DataFrame,
     columns_to_encode: List[str],
     encode_nulls: bool,
+    null_label: str = "Other",
     split_resulting_dataset: bool = True,
     verbose: bool = True
 ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
@@ -359,12 +360,15 @@ def encode_categorical_features(
     Args:
         df (pd.DataFrame): The input DataFrame.
         columns_to_encode (List[str]): A list of column names to be encoded.
-        encode_nulls (bool):
-
-            If False, Nulls are ignored and categories start from 0.
-
-
-
+        encode_nulls (bool):
+            - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
+            - If False, Nulls are ignored and categories start from 0.
+
+
+        null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
+        split_resulting_dataset (bool):
+            - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
+            - If False, returns a single DataFrame with all columns.
         verbose (bool): If True, prints encoding progress.

     Returns:
@@ -375,6 +379,9 @@ def encode_categorical_features(
         - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).

         - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
+
+    ## **Note:**
+    Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
     """
     df_encoded = df.copy()

@@ -400,8 +407,16 @@ def encode_categorical_features(
             mapped_series = df_encoded[col_name].astype(str).map(mapping)
             df_encoded[col_name] = mapped_series.fillna(0).astype(int)

+            # --- Validate nulls category---
+            # Ensure the key for 0 doesn't collide with a real category.
+            if null_label in mapping.keys():
+                # COLLISION! null_label is a real category
+                original_label = null_label
+                null_label = "__NULL__" # fallback
+                _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
+
             # Create the complete user-facing map including "Other"
-            user_mapping = {**mapping,
+            user_mapping = {**mapping, null_label: 0}
             mappings[col_name] = user_mapping
         else:
             # ignore nulls
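A small sketch of the new `null_label` collision handling, with an invented DataFrame; since "Other" already appears as a real category in the data, the nulls key should fall back to "__NULL__" per the warning branch above:

```python
import pandas as pd
from ml_tools.data_exploration import encode_categorical_features

df = pd.DataFrame({"color": ["red", "blue", None, "Other"]})

# encode_nulls=True maps nulls to 0 under `null_label` ("Other" by default).
# "Other" is a real category here, so the fallback key "__NULL__" is used.
mappings, df_rest, df_encoded = encode_categorical_features(
    df,
    columns_to_encode=["color"],
    encode_nulls=True,
    null_label="Other",
    split_resulting_dataset=True,
)
print(mappings["color"])  # expected to include "__NULL__": 0
```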
@@ -1008,9 +1023,10 @@ def create_transformer_categorical_map(

 def reconstruct_one_hot(
     df: pd.DataFrame,
-
+    features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
     separator: str = '_',
-    drop_original: bool = True
+    drop_original: bool = True,
+    verbose: bool = True
 ) -> pd.DataFrame:
     """
     Reconstructs original categorical columns from a one-hot encoded DataFrame.
@@ -1022,9 +1038,20 @@ def reconstruct_one_hot(
     Args:
         df (pd.DataFrame):
             The input DataFrame with one-hot encoded columns.
-
-        A list
-
+        features_to_reconstruct (List[str | Tuple[str, str | None]]):
+            A list defining the features to reconstruct. This list can contain:
+
+            - A string: (e.g., "Color")
+              This reconstructs the feature 'Color' and assumes all-zero rows represent missing data NaN.
+            - A tuple: (e.g., ("Pet", "Dog"))
+              This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog" (handling 'drop_first=True' scenarios).
+            - A tuple with None: (e.g., ("Size", None))
+              This is explicit and behaves identically to just passing "Size". All-zero rows will be mapped to NaN.
+            Example:
+                [
+                    "Mood",            # All-zeros -> NaN
+                    ("Color", "Red"),  # All-zeros -> "Red"
+                ]
         separator (str):
             The character separating the base name from the categorical value in
             the column names (e.g., '_' in 'B_a').
@@ -1054,10 +1081,39 @@ def reconstruct_one_hot(
     new_df = df.copy()
     all_ohe_cols_to_drop = []
     reconstructed_count = 0
-
-
-
-
+
+    # --- 1. Parse and validate the reconstruction config ---
+    # This normalizes the input into a clean {base_name: baseline_val} dict
+    reconstruction_config: Dict[str, Optional[str]] = {}
+    try:
+        for item in features_to_reconstruct:
+            if isinstance(item, str):
+                # Case 1: "Color"
+                base_name = item
+                baseline_val = None
+            elif isinstance(item, tuple) and len(item) == 2:
+                # Case 2: ("Pet", "dog") or ("Size", None)
+                base_name, baseline_val = item
+                if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
+                    _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
+                    raise ValueError()
+            else:
+                _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
+                raise ValueError()
+
+            if base_name in reconstruction_config and verbose:
+                _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
+
+            reconstruction_config[base_name] = baseline_val
+
+    except Exception as e:
+        _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
+        raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
+
+    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
+
+    # Main logic
+    for base_name, baseline_category in reconstruction_config.items():
         # Regex to find all columns belonging to this base feature.
         pattern = f"^{re.escape(base_name)}{re.escape(separator)}"

@@ -1069,24 +1125,34 @@ def reconstruct_one_hot(
             continue

         # For each row, find the column name with the maximum value (which is 1)
-        reconstructed_series = new_df[ohe_cols].idxmax(axis=1)
+        reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore

         # Extract the categorical value (the suffix) from the column name
         # Use n=1 in split to handle cases where the category itself might contain the separator
         new_column_values = reconstructed_series.str.split(separator, n=1).str[1]

-        # Handle rows where all OHE columns were 0 (e.g., original value was NaN).
-
-
-
+        # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
+        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
+
+        if baseline_category is not None:
+            # A baseline category was provided
+            new_column_values.loc[all_zero_mask] = baseline_category
+        else:
+            # No baseline provided: assign NaN
+            new_column_values.loc[all_zero_mask] = np.nan # type: ignore
+
+        if verbose:
+            print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")

         # Assign the new reconstructed column to the DataFrame
         new_df[base_name] = new_column_values

         all_ohe_cols_to_drop.extend(ohe_cols)
         reconstructed_count += 1
-
+        if verbose:
+            print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")

+    # Cleanup
     if drop_original and all_ohe_cols_to_drop:
         # Drop the original OHE columns, ensuring no duplicates in the drop list
         unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
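Pulling the `reconstruct_one_hot` changes together, a usage sketch with invented columns; the tuple entry maps all-zero rows to a baseline (the `drop_first=True` case), while the bare string maps them to NaN:

```python
import pandas as pd
from ml_tools.data_exploration import reconstruct_one_hot

df = pd.DataFrame({
    "Color_blue":  [1, 0, 0],
    "Color_green": [0, 1, 0],   # third row is all-zero for "Color"
    "Pet_cat":     [0, 1, 0],   # first and third rows are all-zero for "Pet"
})

restored = reconstruct_one_hot(
    df,
    features_to_reconstruct=[
        ("Color", "red"),  # all-zero rows -> baseline "red"
        "Pet",             # all-zero rows -> NaN
    ],
    separator="_",
    drop_original=True,
    verbose=True,
)
print(restored)  # "Color" and "Pet" columns replace the one-hot columns
```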
ml_tools/optimization_tools.py
CHANGED
@@ -98,7 +98,7 @@ def create_optimization_bounds(

     # 3. Populate categorical bounds (Index-based)
     # The indices in categorical_map (e.g., {2: 4}) directly correspond
-    # to the indices in
+    # to the indices in the `feature_names` list.
     for index, cardinality in categorical_map.items():
         if not (0 <= index < total_features):
             _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
@@ -125,8 +125,8 @@ def create_optimization_bounds(
             # Map name to its index in the *feature-only* list
             index = feature_names.index(name)
         except ValueError:
-            _LOGGER.
-
+            _LOGGER.warning(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
+            continue

         if lower_bounds[index] is not None:
             # This index was already set by the categorical map
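The second hunk replaces a hard failure with a skip: a name in `continuous_bounds_map` that is missing from `feature_names` now logs a warning and is ignored. A self-contained sketch of that lookup pattern, with variable names taken from the diff and invented data; this is not the library's full implementation:

```python
feature_names = ["temperature", "pressure", "catalyst"]
continuous_bounds_map = {"temperature": (20.0, 80.0), "pH": (4.0, 9.0)}  # "pH" is unknown

for name, bounds in continuous_bounds_map.items():
    try:
        # Map name to its index in the feature-only list
        index = feature_names.index(name)
    except ValueError:
        # New behavior: warn and continue instead of raising
        print(f"WARNING: feature name '{name}' not found in the feature columns; skipping.")
        continue
    print(f"'{name}' -> index {index}, bounds {bounds}")
```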
{dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/WHEEL
File without changes
{dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/licenses/LICENSE
File without changes
{dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md
File without changes
{dragon_ml_toolbox-12.7.0.dist-info → dragon_ml_toolbox-12.9.0.dist-info}/top_level.txt
File without changes