dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +72 -34
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_evaluation.py
CHANGED

@@ -25,6 +25,7 @@ from typing import Union, Optional, Literal
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import SHAPKeys
 
 
 __all__ = [
@@ -472,7 +473,7 @@ def get_shap_values(
         save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
-    global_save_path = make_fullpath(save_dir, make=True)
+    global_save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -539,6 +540,15 @@ def get_shap_values(
                     plot_type=plot_type,
                     title=f"{model_name} - {target_name} (Class {class_name})"
                 )
+
+            # Save the summary data for the current class
+            summary_save_path = global_save_path / f"SHAP_{sanitized_target_name}_{class_name}.csv"
+            _save_summary_csv(
+                shap_values_for_summary=class_shap,
+                feature_names=feature_names,
+                save_path=summary_save_path
+            )
+
     else:
         values = shap_values[1] if isinstance(shap_values, list) else shap_values
         for plot_type in ["bar", "dot"]:
@@ -549,6 +559,15 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data for the positive class
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
 
     def _plot_for_regression(shap_values):
         for plot_type in ["bar", "dot"]:
@@ -559,6 +578,34 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data to a CSV file
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=shap_values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
+
+    def _save_summary_csv(shap_values_for_summary: np.ndarray, feature_names: list[str], save_path: Path):
+        """Calculates and saves the SHAP summary data to a CSV file."""
+        mean_abs_shap = np.abs(shap_values_for_summary).mean(axis=0)
+
+        # Create default feature names if none are provided
+        current_feature_names = feature_names
+        if current_feature_names is None:
+            current_feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
+        summary_df = pd.DataFrame({
+            SHAPKeys.FEATURE_COLUMN: feature_names,
+            SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+        }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)
+
+        summary_df.to_csv(save_path, index=False)
+        # print(f"📝 SHAP summary data saved as '{save_path.name}'")
+
+
     #START_O
 
     explainer = shap.TreeExplainer(model)
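The new `_save_summary_csv` helper writes a ranked mean-|SHAP| table next to the plots, using the names defined in `SHAPKeys`. A minimal sketch of consuming that output (the `results/` directory is hypothetical; the filename comes from `SHAPKeys.SAVENAME + ".csv"`, i.e. `shap_summary.csv`):

import pandas as pd

# Hypothetical output directory chosen by the caller of get_shap_values().
summary = pd.read_csv("results/shap_summary.csv")

# Columns follow SHAPKeys.FEATURE_COLUMN ("feature") and
# SHAPKeys.SHAP_VALUE_COLUMN ("mean_abs_shap_value"), already sorted descending.
print(summary.head(10).to_string(index=False))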
ml_tools/ensemble_inference.py
CHANGED

@@ -1,18 +1,17 @@
-from ._script_info import _script_info
-from ._logger import _LOGGER
-from .path_manager import make_fullpath, list_files_by_extension
-from .keys import EnsembleKeys
-
 from typing import Union, Literal, Dict, Any, Optional, List
 from pathlib import Path
 import json
-
 import joblib
 import numpy as np
 # Inference models
 import xgboost
 import lightgbm
 
+from ._script_info import _script_info
+from ._logger import _LOGGER
+from .path_manager import make_fullpath, list_files_by_extension
+from .keys import EnsembleKeys
+
 
 __all__ = [
     "InferenceHandler",
@@ -219,7 +218,7 @@ def model_report(
     return report_data
 
 
-# Local implementation to avoid calling utilities
+# Local implementation to avoid calling utilities dependencies
 def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.
ml_tools/ensemble_learning.py
CHANGED

@@ -13,7 +13,8 @@ import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.base import clone
 
-from .utilities import yield_dataframes_from_dir,
+from .utilities import yield_dataframes_from_dir, train_dataset_yielder
+from .serde import serialize_object_filename
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from .keys import EnsembleKeys
@@ -410,7 +411,7 @@ def _save_model(trained_model, model_name: str, target_name:str, feature_names:
                EnsembleKeys.FEATURES: feature_names,
                EnsembleKeys.TARGET: target_name}
 
-
+    serialize_object_filename(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
 
 # TRAIN EVALUATE PIPELINE
@@ -481,7 +482,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
 
 ###### 4. Execution ######
 def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
-                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=True,
                           test_size: float=0.2, debug:bool=False, generate_learning_curves: bool = False):
     #Check models
     if isinstance(model_object, RegressionTreeModels):
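`_save_model` now persists its bundle through `serialize_object_filename` from the new `serde` module, and the matching loader in `ensemble_inference.py` (`_deserialize_object`) is joblib-based. A rough sketch of inspecting such a bundle by hand, with a hypothetical file path and assuming the installed package layout:

import joblib
from ml_tools.keys import EnsembleKeys

# Hypothetical path; _save_model decides the real directory and filename.
bundle = joblib.load("saved_models/xgboost_my_target.joblib")

# The saved dict carries the feature and target names under EnsembleKeys entries.
feature_names = bundle[EnsembleKeys.FEATURES]
target_name = bundle[EnsembleKeys.TARGET]
print(target_name, len(feature_names))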
ml_tools/handle_excel.py
CHANGED

@@ -2,6 +2,7 @@ from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional, Union
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
ml_tools/keys.py
CHANGED

@@ -36,6 +36,86 @@ class PyTorchInferenceKeys:
     # For classification tasks
     LABELS = "labels"
     PROBABILITIES = "probabilities"
+    LABEL_NAMES = "label_names"
+
+
+class PytorchModelArchitectureKeys:
+    """Keys for saving and loading model architecture."""
+    MODEL = 'model_class'
+    CONFIG = "config"
+    SAVENAME = "architecture"
+
+
+class PytorchArtifactPathKeys:
+    """Keys for model artifact paths."""
+    FEATURES_PATH = "feature_names_path"
+    TARGETS_PATH = "target_names_path"
+    ARCHITECTURE_PATH = "model_architecture_path"
+    WEIGHTS_PATH = "model_weights_path"
+    SCALER_PATH = "scaler_path"
+
+
+class DatasetKeys:
+    """Keys for saving dataset artifacts. Also used by FeatureSchema"""
+    FEATURE_NAMES = "feature_names"
+    TARGET_NAMES = "target_names"
+    SCALER_PREFIX = "scaler_"
+    # Feature Schema
+    CONTINUOUS_NAMES = "continuous_feature_names"
+    CATEGORICAL_NAMES = "categorical_feature_names"
+
+
+class SHAPKeys:
+    """Keys for SHAP functions"""
+    FEATURE_COLUMN = "feature"
+    SHAP_VALUE_COLUMN = "mean_abs_shap_value"
+    SAVENAME = "shap_summary"
+
+
+class PyTorchCheckpointKeys:
+    """Keys for saving/loading a training checkpoint dictionary."""
+    MODEL_STATE = "model_state_dict"
+    OPTIMIZER_STATE = "optimizer_state_dict"
+    SCHEDULER_STATE = "scheduler_state_dict"
+    EPOCH = "epoch"
+    BEST_SCORE = "best_score"
+
+
+class UtilityKeys:
+    """Keys used for utility modules"""
+    MODEL_PARAMS_FILE = "model_parameters"
+    TOTAL_PARAMS = "Total Parameters"
+    TRAINABLE_PARAMS = "Trainable Parameters"
+    PTH_FILE = "pth report "
+    MODEL_ARCHITECTURE_FILE = "model_architecture_summary"
+
+
+class VisionKeys:
+    """For vision ML metrics"""
+    SEGMENTATION_REPORT = "segmentation_report"
+    SEGMENTATION_HEATMAP = "segmentation_metrics_heatmap"
+    SEGMENTATION_CONFUSION_MATRIX = "segmentation_confusion_matrix"
+    # Object detection
+    OBJECT_DETECTION_REPORT = "object_detection_report"
+
+
+class VisionTransformRecipeKeys:
+    """Defines the key names for the transform recipe JSON file."""
+    TASK = "task"
+    PIPELINE = "pipeline"
+    NAME = "name"
+    KWARGS = "_kwargs"
+    PRE_TRANSFORMS = "pre_transforms"
+    RESIZE_SIZE = "resize_size"
+    CROP_SIZE = "crop_size"
+    MEAN = "mean"
+    STD = "std"
+
+
+class ObjectDetectionKeys:
+    """Used by the object detection dataset"""
+    BOXES = "boxes"
+    LABELS = "labels"
 
 
 class _OneHotOtherPlaceholder:
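Among the new key classes, `PyTorchCheckpointKeys` standardizes the layout of training checkpoints. A minimal sketch of a checkpoint assembled with those keys (illustrative only; the toolbox's trainer and callbacks own the actual save/load logic):

import torch
from torch import nn
from ml_tools.keys import PyTorchCheckpointKeys  # assumes the installed package layout

# Toy model and optimizer purely for illustration.
model = nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters())

checkpoint = {
    PyTorchCheckpointKeys.MODEL_STATE: model.state_dict(),
    PyTorchCheckpointKeys.OPTIMIZER_STATE: optimizer.state_dict(),
    PyTorchCheckpointKeys.EPOCH: 0,
    PyTorchCheckpointKeys.BEST_SCORE: float("inf"),
}
torch.save(checkpoint, "checkpoint.pth")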
ml_tools/math_utilities.py
ADDED (new file, 259 lines)

import pandas as pd
import numpy as np
import math
from typing import Union, Sequence, Optional

from ._script_info import _script_info
from ._logger import _LOGGER


__all__ = [
    "normalize_mixed_list",
    "threshold_binary_values",
    "threshold_binary_values_batch",
    "discretize_categorical_values",
]


def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
    """
    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
    applying heuristic adjustments to correct for potential data entry scale mismatches.

    Parameters:
        data (list):
            A list of values that may include strings, floats, integers, or None.
            None values are treated as 0.0.

        threshold (int, optional):
            The number of log10 orders of magnitude below the median scale
            at which a value is considered suspect and is scaled upward accordingly.
            Default is 2.

    Returns:
        List[float]: A list of normalized float values summing to 1.0.

    Notes:
        - Zeros and None values remain zero.
        - Input strings are automatically cast to floats if possible.

    Example:
        >>> normalize_mixed_list([1, "0.01", 4, None])
        [0.2, 0.2, 0.6, 0.0]
    """
    # Step 1: Convert all values to float, treat None as 0.0
    float_list = [float(x) if x is not None else 0.0 for x in data]

    # Raise for negative values
    if any(x < 0 for x in float_list):
        _LOGGER.error("Negative values are not allowed in the input list.")
        raise ValueError()

    # Step 2: Compute log10 of non-zero values
    nonzero = [x for x in float_list if x > 0]
    if not nonzero:
        return [0.0 for _ in float_list]

    log_scales = [math.log10(x) for x in nonzero]
    log_median = np.median(log_scales)

    # Step 3: Adjust values that are much smaller than median
    adjusted = []
    for x in float_list:
        if x == 0.0:
            adjusted.append(0.0)
        else:
            log_x = math.log10(x)
            if log_median - log_x > threshold:
                scale_diff = round(log_median - log_x)
                adjusted.append(x * (10 ** scale_diff))
            else:
                adjusted.append(x)

    # Step 4: Normalize to sum to 1.0
    total = sum(adjusted)
    if total == 0:
        return [0.0 for _ in adjusted]

    return [x / total for x in adjusted]


def threshold_binary_values(
    input_array: Union[Sequence[float], np.ndarray, pd.Series],
    binary_values: Optional[int] = None
) -> Union[np.ndarray, pd.Series, list[float], tuple[float]]:
    """
    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.

    Binary elements are converted to 0 or 1 using a 0.5 threshold.

    Parameters:
        input_array: 1D sequence, NumPy array, or pandas Series.
        binary_values (Optional[int]) :
            - If `None`, all values are treated as binary.
            - If `int`, only this many last `binary_values` are thresholded.

    Returns:
        Any:
            Same type as input
    """
    original_type = type(input_array)

    if isinstance(input_array, (pd.Series, np.ndarray)):
        array = np.asarray(input_array)
    elif isinstance(input_array, (list, tuple)):
        array = np.array(input_array)
    else:
        _LOGGER.error("Unsupported input type")
        raise TypeError()

    array = array.flatten()
    total = array.shape[0]

    bin_count = total if binary_values is None else binary_values
    if not (0 <= bin_count <= total):
        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
        raise ValueError()

    if bin_count == 0:
        result = array
    else:
        cont_part = array[:-bin_count] if bin_count < total else np.array([])
        bin_part = (array[-bin_count:] > 0.5).astype(int)
        result = np.concatenate([cont_part, bin_part])

    if original_type is pd.Series:
        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
    elif original_type is list:
        return result.tolist()
    elif original_type is tuple:
        return tuple(result)
    else:
        return result


def threshold_binary_values_batch(
    input_array: np.ndarray,
    binary_values: int
) -> np.ndarray:
    """
    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.

    Parameters
    ----------
    input_array : np.ndarray
        2D array with shape (batch_size, n_features).
    binary_values : int
        Number of binary features located at the END of each row.

    Returns
    -------
    np.ndarray
        Thresholded array, same shape as input.
    """
    if input_array.ndim != 2:
        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
        raise AssertionError()

    batch_size, total_features = input_array.shape

    if not (0 <= binary_values <= total_features):
        _LOGGER.error("'binary_values' out of valid range.")
        raise AssertionError()

    if binary_values == 0:
        return input_array.copy()

    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
    bin_part = input_array[:, -binary_values:] > 0.5
    bin_part = bin_part.astype(np.int32)

    return np.hstack([cont_part, bin_part])


def discretize_categorical_values(
    input_array: np.ndarray,
    categorical_info: dict[int, int],
    start_at_zero: bool = True
) -> np.ndarray:
    """
    Rounds specified columns of a 2D NumPy array to the nearest integer and
    clamps the result to a valid categorical range.

    If a 1D array is provided, it is treated as a single batch.

    Parameters
    ----------
    input_array : np.ndarray
        1D array (n_features,) or 2D array with shape (batch_size, n_features) containing continuous values.
    categorical_info : dict[int, int]
        A dictionary mapping column indices to their cardinality (number of categories).
        Example: {3: 4} means column 3 will be clamped to its 4 valid categories.
    start_at_zero : bool
        If True, categories range from 0 to k-1.
        If False, categories range from 1 to k.

    Returns
    -------
    np.ndarray
        A new array with the specified columns converted to integer categories.
        Shape matches the input array's original shape.
    """
    # --- Input Validation ---
    if not isinstance(input_array, np.ndarray):
        _LOGGER.error(f"Expected np.ndarray, got {type(input_array)}.")
        raise ValueError()

    if input_array.ndim == 1:
        # Reshape 1D array (n_features,) to 2D (1, n_features)
        working_array = input_array.reshape(1, -1)
        original_was_1d = True
    elif input_array.ndim == 2:
        working_array = input_array
        original_was_1d = False
    else:
        _LOGGER.error(f"Expected 1D or 2D array, got {input_array.ndim}D array.")
        raise ValueError()

    if not isinstance(categorical_info, dict) or not categorical_info:
        _LOGGER.error(f"'categorical_info' is not a dictionary, or is empty.")
        raise ValueError()

    _, total_features = working_array.shape
    for col_idx, cardinality in categorical_info.items():
        if not isinstance(col_idx, int):
            _LOGGER.error(f"Column index key {col_idx} is not an integer.")
            raise TypeError()
        if not (0 <= col_idx < total_features):
            _LOGGER.error(f"Column index {col_idx} is out of bounds for an array with {total_features} features.")
            raise ValueError()
        if not isinstance(cardinality, int) or cardinality < 2:
            _LOGGER.error(f"Cardinality for column {col_idx} must be an integer >= 2, but got {cardinality}.")
            raise ValueError()

    # --- Core Logic ---
    output_array = working_array.copy()

    for col_idx, cardinality in categorical_info.items():
        # 1. Round the column values using "round half up"
        rounded_col = np.floor(output_array[:, col_idx] + 0.5)

        # 2. Determine clamping bounds
        min_bound = 0 if start_at_zero else 1
        max_bound = cardinality - 1 if start_at_zero else cardinality

        # 3. Clamp the values and update the output array
        output_array[:, col_idx] = np.clip(rounded_col, min_bound, max_bound)

    final_output = output_array.astype(np.int32)

    # --- Output Shape Handling ---
    if original_was_1d:
        # Squeeze the batch dimension to return a 1D array
        return final_output.squeeze(axis=0)
    else:
        return final_output


def info():
    _script_info(__all__)