machinegnostics-0.0.1-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- __init__.py +0 -0
- machinegnostics/__init__.py +24 -0
- machinegnostics/magcal/__init__.py +37 -0
- machinegnostics/magcal/characteristics.py +460 -0
- machinegnostics/magcal/criteria_eval.py +268 -0
- machinegnostics/magcal/criterion.py +140 -0
- machinegnostics/magcal/data_conversion.py +381 -0
- machinegnostics/magcal/gcor.py +64 -0
- machinegnostics/magcal/gdf/__init__.py +2 -0
- machinegnostics/magcal/gdf/base_df.py +39 -0
- machinegnostics/magcal/gdf/base_distfunc.py +1202 -0
- machinegnostics/magcal/gdf/base_egdf.py +823 -0
- machinegnostics/magcal/gdf/base_eldf.py +830 -0
- machinegnostics/magcal/gdf/base_qgdf.py +1234 -0
- machinegnostics/magcal/gdf/base_qldf.py +1019 -0
- machinegnostics/magcal/gdf/cluster_analysis.py +456 -0
- machinegnostics/magcal/gdf/data_cluster.py +975 -0
- machinegnostics/magcal/gdf/data_intervals.py +853 -0
- machinegnostics/magcal/gdf/data_membership.py +536 -0
- machinegnostics/magcal/gdf/der_egdf.py +243 -0
- machinegnostics/magcal/gdf/distfunc_engine.py +841 -0
- machinegnostics/magcal/gdf/egdf.py +324 -0
- machinegnostics/magcal/gdf/eldf.py +297 -0
- machinegnostics/magcal/gdf/eldf_intv.py +609 -0
- machinegnostics/magcal/gdf/eldf_ma.py +627 -0
- machinegnostics/magcal/gdf/homogeneity.py +1218 -0
- machinegnostics/magcal/gdf/intv_engine.py +1523 -0
- machinegnostics/magcal/gdf/marginal_intv_analysis.py +558 -0
- machinegnostics/magcal/gdf/qgdf.py +289 -0
- machinegnostics/magcal/gdf/qldf.py +296 -0
- machinegnostics/magcal/gdf/scedasticity.py +197 -0
- machinegnostics/magcal/gdf/wedf.py +181 -0
- machinegnostics/magcal/gdf/z0_estimator.py +1047 -0
- machinegnostics/magcal/layer_base.py +42 -0
- machinegnostics/magcal/layer_history_base.py +74 -0
- machinegnostics/magcal/layer_io_process_base.py +238 -0
- machinegnostics/magcal/layer_param_base.py +448 -0
- machinegnostics/magcal/mg_weights.py +36 -0
- machinegnostics/magcal/sample_characteristics.py +532 -0
- machinegnostics/magcal/scale_optimization.py +185 -0
- machinegnostics/magcal/scale_param.py +313 -0
- machinegnostics/magcal/util/__init__.py +0 -0
- machinegnostics/magcal/util/dis_docstring.py +18 -0
- machinegnostics/magcal/util/logging.py +24 -0
- machinegnostics/magcal/util/min_max_float.py +34 -0
- machinegnostics/magnet/__init__.py +0 -0
- machinegnostics/metrics/__init__.py +28 -0
- machinegnostics/metrics/accu.py +61 -0
- machinegnostics/metrics/accuracy.py +67 -0
- machinegnostics/metrics/auto_correlation.py +183 -0
- machinegnostics/metrics/auto_covariance.py +204 -0
- machinegnostics/metrics/cls_report.py +130 -0
- machinegnostics/metrics/conf_matrix.py +93 -0
- machinegnostics/metrics/correlation.py +178 -0
- machinegnostics/metrics/cross_variance.py +167 -0
- machinegnostics/metrics/divi.py +82 -0
- machinegnostics/metrics/evalmet.py +109 -0
- machinegnostics/metrics/f1_score.py +128 -0
- machinegnostics/metrics/gmmfe.py +108 -0
- machinegnostics/metrics/hc.py +141 -0
- machinegnostics/metrics/mae.py +72 -0
- machinegnostics/metrics/mean.py +117 -0
- machinegnostics/metrics/median.py +122 -0
- machinegnostics/metrics/mg_r2.py +167 -0
- machinegnostics/metrics/mse.py +78 -0
- machinegnostics/metrics/precision.py +119 -0
- machinegnostics/metrics/r2.py +122 -0
- machinegnostics/metrics/recall.py +108 -0
- machinegnostics/metrics/rmse.py +77 -0
- machinegnostics/metrics/robr2.py +119 -0
- machinegnostics/metrics/std.py +144 -0
- machinegnostics/metrics/variance.py +101 -0
- machinegnostics/models/__init__.py +2 -0
- machinegnostics/models/classification/__init__.py +1 -0
- machinegnostics/models/classification/layer_history_log_reg.py +121 -0
- machinegnostics/models/classification/layer_io_process_log_reg.py +98 -0
- machinegnostics/models/classification/layer_mlflow_log_reg.py +107 -0
- machinegnostics/models/classification/layer_param_log_reg.py +275 -0
- machinegnostics/models/classification/mg_log_reg.py +273 -0
- machinegnostics/models/cross_validation.py +118 -0
- machinegnostics/models/data_split.py +106 -0
- machinegnostics/models/regression/__init__.py +2 -0
- machinegnostics/models/regression/layer_histroy_rob_reg.py +139 -0
- machinegnostics/models/regression/layer_io_process_rob_rig.py +88 -0
- machinegnostics/models/regression/layer_mlflow_rob_reg.py +134 -0
- machinegnostics/models/regression/layer_param_rob_reg.py +212 -0
- machinegnostics/models/regression/mg_lin_reg.py +253 -0
- machinegnostics/models/regression/mg_poly_reg.py +258 -0
- machinegnostics-0.0.1.dist-info/METADATA +246 -0
- machinegnostics-0.0.1.dist-info/RECORD +93 -0
- machinegnostics-0.0.1.dist-info/WHEEL +5 -0
- machinegnostics-0.0.1.dist-info/licenses/LICENSE +674 -0
- machinegnostics-0.0.1.dist-info/top_level.txt +2 -0
machinegnostics/models/cross_validation.py

@@ -0,0 +1,118 @@
+import numpy as np
+import logging
+from machinegnostics.magcal.util.logging import get_logger
+
+class CrossValidator:
+    """
+    A custom implementation of k-fold cross-validation for evaluating machine learning models.
+
+    Parameters
+    ----------
+    model : object
+        A machine learning model that implements `fit(X, y)` and `predict(X)` methods.
+
+    X : array-like of shape (n_samples, n_features)
+        Feature matrix.
+
+    y : array-like of shape (n_samples,)
+        Target labels.
+
+    k : int, default=5
+        Number of folds to use in cross-validation.
+
+    shuffle : bool, default=True
+        Whether to shuffle the dataset before splitting into folds.
+
+    random_seed : int or None, default=None
+        Seed used to shuffle the data. Ignored if `shuffle=False`.
+
+    verbose : bool, default=False
+        If True, enables detailed logging.
+
+    Attributes
+    ----------
+    folds : list of tuple
+        List of (train_indices, test_indices) for each fold.
+
+    Example
+    -------
+    >>> from machinegnostics.models import CrossValidator
+    >>> from machinegnostics.models import LinearRegressor
+    >>> from sklearn.metrics import mean_squared_error
+    >>> X = np.random.rand(100, 10)
+    >>> y = np.random.rand(100)
+    >>> model = LinearRegressor()
+    >>> cv = CrossValidator(model, X, y, k=5, shuffle=True, random_seed=42)
+    >>> scores = cv.evaluate(mean_squared_error)
+    >>> print("Cross-Validation Scores:", scores)
+    >>> print("Mean Score:", np.mean(scores))
+    """
+
+    def __init__(self, model, X: np.ndarray, y: np.ndarray, k=5, shuffle=True, random_seed=None, verbose: bool = False):
+        self.model = model
+        self.X = np.array(X)
+        self.y = np.array(y)
+        self.k = k
+        self.shuffle = shuffle
+        self.random_seed = random_seed
+        self.verbose = verbose
+
+        self.logger = get_logger('CrossValidator', level=logging.INFO if verbose else logging.WARNING)
+
+    def split(self):
+        """
+        Split the dataset into k folds.
+
+        Returns
+        -------
+        folds : list of tuple
+            A list of (train_indices, test_indices) for each fold.
+        """
+        self.logger.info("Starting k-fold split...")
+        n_samples = len(self.X)
+        indices = np.arange(n_samples)
+
+        if self.shuffle:
+            rng = np.random.default_rng(self.random_seed)
+            rng.shuffle(indices)
+
+        # Distribute samples as evenly as possible: the first n_samples % k folds get one extra sample.
+        fold_sizes = np.full(self.k, n_samples // self.k, dtype=int)
+        fold_sizes[:n_samples % self.k] += 1
+
+        current = 0
+        folds = []
+        for fold_size in fold_sizes:
+            start, stop = current, current + fold_size
+            test_idx = indices[start:stop]
+            train_idx = np.concatenate([indices[:start], indices[stop:]])
+            folds.append((train_idx, test_idx))
+            current = stop
+        self.logger.info("Completed k-fold split.")
+        return folds
+
+    def evaluate(self, scoring_func):
+        """
+        Perform k-fold cross-validation and return the evaluation scores.
+
+        Parameters
+        ----------
+        scoring_func : callable
+            A function that takes `y_true` and `y_pred` and returns a numeric score (e.g., accuracy_score).
+
+        Returns
+        -------
+        scores : list of float
+            Evaluation scores for each fold.
+        """
+        self.logger.info("Starting cross-validation evaluation...")
+        scores = []
+        for train_idx, test_idx in self.split():
+            X_train, y_train = self.X[train_idx], self.y[train_idx]
+            X_test, y_test = self.X[test_idx], self.y[test_idx]
+
+            self.model.fit(X_train, y_train)
+            y_pred = self.model.predict(X_test)
+            score = scoring_func(y_test, y_pred)
+            scores.append(score)
+        self.logger.info("Completed cross-validation evaluation.")
+        return scores
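For reference, a minimal usage sketch of the `CrossValidator` above. The `MeanModel` stub and `mse` scorer are illustrative stand-ins, not part of the package: any object implementing the `fit`/`predict` contract and any `(y_true, y_pred) -> float` callable would do.

    import numpy as np
    from machinegnostics.models import CrossValidator  # import path as given in the docstring above

    class MeanModel:
        """Illustrative stand-in model: predicts the training-target mean."""
        def fit(self, X, y):
            self.mean_ = float(np.mean(y))
        def predict(self, X):
            return np.full(len(X), self.mean_)

    def mse(y_true, y_pred):
        return float(np.mean((y_true - y_pred) ** 2))

    X = np.random.rand(100, 3)
    y = np.random.rand(100)

    cv = CrossValidator(MeanModel(), X, y, k=5, shuffle=True, random_seed=0)
    scores = cv.evaluate(mse)
    print("fold scores:", scores, "mean:", np.mean(scores))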
machinegnostics/models/data_split.py

@@ -0,0 +1,106 @@
+import numpy as np
+import logging
+from machinegnostics.magcal.util.logging import get_logger
+
+def train_test_split(X: np.ndarray, y=None, test_size=0.25, shuffle=True, random_seed=None, verbose: bool = False):
+    """
+    Split arrays or matrices into random train and test subsets.
+
+    Parameters
+    ----------
+    X : array-like (list, tuple, or np.ndarray)
+        Feature data to be split. Must be indexable and of consistent length.
+
+    y : array-like or None, optional (default=None)
+        Target data to be split alongside X. Must be the same length as X.
+
+    test_size : float or int, optional (default=0.25)
+        If float, must be between 0.0 and 1.0 and represents the proportion of the dataset to include in the test split.
+        If int, represents the absolute number of test samples.
+
+    shuffle : bool, optional (default=True)
+        Whether or not to shuffle the data before splitting.
+
+    random_seed : int or None, optional (default=None)
+        Controls the shuffling applied to the data before splitting.
+
+    verbose : bool, optional (default=False)
+        If True, enables detailed logging.
+
+    Returns
+    -------
+    X_train, X_test : np.ndarray
+        Train-test split of X.
+
+    y_train, y_test : np.ndarray or None
+        Train-test split of y. If y is None, these will also be None.
+
+    Raises
+    ------
+    ValueError
+        If X and y have mismatched lengths or test_size is out of range.
+    TypeError
+        If test_size is neither a float nor an int.
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from machinegnostics.models import train_test_split
+    >>> X = np.arange(20).reshape(10, 2)
+    >>> y = np.arange(10)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_seed=42)
+    >>> print("X_train:", X_train)
+    >>> print("X_test:", X_test)
+    >>> print("y_train:", y_train)
+    >>> print("y_test:", y_test)
+    """
+    logger = get_logger('train_test_split', level=logging.INFO if verbose else logging.WARNING)
+    logger.info("Starting train_test_split...")
+
+    # Convert inputs to np arrays
+    X = np.asarray(X)
+    if y is not None:
+        y = np.asarray(y)
+
+    # Validate shapes
+    if y is not None and len(X) != len(y):
+        logger.error(f"X and y must have the same number of samples, got {len(X)} and {len(y)}.")
+        raise ValueError(f"X and y must have the same number of samples, got {len(X)} and {len(y)}.")
+
+    n_samples = len(X)
+
+    # Validate and compute test size
+    if isinstance(test_size, float):
+        if not 0.0 < test_size < 1.0:
+            logger.error("If test_size is a float, it must be between 0.0 and 1.0.")
+            raise ValueError("If test_size is a float, it must be between 0.0 and 1.0.")
+        n_test = int(np.ceil(test_size * n_samples))
+    elif isinstance(test_size, int):
+        if not 0 < test_size < n_samples:
+            logger.error("If test_size is an int, it must be between 1 and len(X) - 1.")
+            raise ValueError("If test_size is an int, it must be between 1 and len(X) - 1.")
+        n_test = test_size
+    else:
+        logger.error("test_size must be either a float or an int.")
+        raise TypeError("test_size must be either a float or an int.")
+
+    n_train = n_samples - n_test
+
+    # Create indices and shuffle
+    indices = np.arange(n_samples)
+    if shuffle:
+        rng = np.random.default_rng(seed=random_seed)
+        rng.shuffle(indices)
+
+    train_idx = indices[:n_train]
+    test_idx = indices[n_train:]
+
+    X_train = X[train_idx]
+    X_test = X[test_idx]
+
+    if y is not None:
+        y_train = y[train_idx]
+        y_test = y[test_idx]
+    else:
+        y_train = y_test = None
+    logger.info("Completed train_test_split.")
+
+    return X_train, X_test, y_train, y_test
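A short sketch of the two `test_size` modes implemented above (proportional float versus absolute int). The assertions follow directly from `n_test = ceil(test_size * n_samples)` and from the unshuffled index order when `shuffle=False`.

    import numpy as np
    from machinegnostics.models import train_test_split  # import path as given in the docstring above

    X = np.arange(20).reshape(10, 2)
    y = np.arange(10)

    # Float mode: n_test = ceil(0.3 * 10) = 3 test samples.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_seed=42)
    assert len(X_te) == 3 and len(X_tr) == 7

    # Int mode without shuffling: the last 2 rows become the test set;
    # with y omitted, y_train and y_test come back as None.
    X_tr, X_te, y_tr, y_te = train_test_split(X, test_size=2, shuffle=False)
    assert (X_te == X[-2:]).all() and y_tr is None and y_te is None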
machinegnostics/models/regression/layer_histroy_rob_reg.py

@@ -0,0 +1,139 @@
+import numpy as np
+from machinegnostics.models.regression.layer_param_rob_reg import ParamRobustRegressorBase
+from dataclasses import dataclass
+
+@dataclass
+class ParamRecord:
+    iteration: int
+    h_loss: float = None
+    weights: np.ndarray = None
+    coefficients: np.ndarray = None
+    degree: int = None
+    rentropy: float = None
+    fi: np.ndarray = None
+    hi: np.ndarray = None
+    fj: np.ndarray = None
+    hj: np.ndarray = None
+    infoi: dict = None
+    infoj: dict = None
+    pi: np.ndarray = None
+    pj: np.ndarray = None
+    ei: float = None
+    ej: float = None
+
+class HistoryRobustRegressor(ParamRobustRegressorBase):
+    """
+    History layer for the Robust Regressor model.
+
+    This class extends ParamRobustRegressorBase to maintain a history
+    of model parameters and gnostic loss values across training iterations.
+
+    Fields recorded in the history at each iteration:
+    - iteration: the iteration number
+    - h_loss: gnostic loss value
+    - weights: model weights
+    - coefficients: model coefficients
+    - degree: degree of the polynomial features used in the model
+    - rentropy: entropy of the model
+    - fi, hi, fj, hj, infoi, infoj, pi, pj, ei, ej: additional gnostic characteristics, if calculated
+    """
+
+    def __init__(self,
+                 degree: int = 1,
+                 max_iter: int = 100,
+                 tol: float = 1e-3,
+                 mg_loss: str = 'hi',
+                 early_stopping: bool = True,
+                 verbose: bool = False,
+                 scale: 'str | int | float' = 'auto',
+                 data_form: str = 'a',
+                 gnostic_characteristics: bool = True,
+                 history: bool = True):
+        super().__init__(
+            degree=degree,
+            max_iter=max_iter,
+            tol=tol,
+            mg_loss=mg_loss,
+            early_stopping=early_stopping,
+            verbose=verbose,
+            scale=scale,
+            data_form=data_form,
+            gnostic_characteristics=gnostic_characteristics
+        )
+
+        self.degree = degree
+        self.max_iter = max_iter
+        self.tol = tol
+        self.mg_loss = mg_loss
+        self.early_stopping = early_stopping
+        self.verbose = verbose
+        self.scale = scale
+        self.data_form = data_form
+        self.gnostic_characteristics = gnostic_characteristics
+        self._history = history
+        self.params = [
+            {
+                'iteration': 0,
+                'loss': None,
+                'weights': None,
+                'coefficients': None,
+                'degree': self.degree,
+                'rentropy': None,
+                'fi': None,
+                'hi': None,
+                'fj': None,
+                'hj': None,
+                'infoi': None,
+                'infoj': None,
+                'pi': None,
+                'pj': None,
+                'ei': None,
+                'ej': None
+            }
+        ]
+
+        self.logger.info("HistoryRobustRegressor initialized.")
+
+    def _fit(self, X: np.ndarray, y: np.ndarray):
+        """
+        Fit the model to the data and record history.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Input features.
+        y : np.ndarray
+            Target values.
+        """
+        self.logger.info("Starting fit process for HistoryRobustRegressor.")
+        # Call the parent fit method to perform fitting
+        super()._fit(X, y)
+
+        # Record the fitted state in history as a dict
+        params_dict = {}
+
+        if self.gnostic_characteristics:
+            params_dict['iteration'] = self._iter + 1
+            params_dict['loss'] = self.loss
+            params_dict['weights'] = self.weights.copy() if self.weights is not None else None
+            params_dict['coefficients'] = self.coefficients.copy() if self.coefficients is not None else None
+            params_dict['degree'] = self.degree
+            params_dict['rentropy'] = self.re
+            params_dict['fi'] = self.fi
+            params_dict['hi'] = self.hi
+            params_dict['fj'] = self.fj
+            params_dict['hj'] = self.hj
+            params_dict['infoi'] = self.infoi
+            params_dict['infoj'] = self.infoj
+            params_dict['pi'] = self.pi
+            params_dict['pj'] = self.pj
+            params_dict['ei'] = self.ei
+            params_dict['ej'] = self.ej
+        else:
+            params_dict['iteration'] = 0
+            params_dict['loss'] = None
+            params_dict['weights'] = self.weights.copy() if self.weights is not None else None
+            params_dict['coefficients'] = self.coefficients.copy() if self.coefficients is not None else None
+            params_dict['degree'] = self.degree
+
+        self.params.append(params_dict)
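`HistoryRobustRegressor` is an internal layer, but the `params` list it appends to holds plain dicts, so inspecting training history after a fit might look like the sketch below. `LinearRegressor` is assumed here to be a public model built on this layer stack (see `mg_lin_reg.py` in the file list); that public API is not confirmed by this diff.

    import numpy as np
    from machinegnostics.models import LinearRegressor  # assumed public entry point; not confirmed by this diff

    X = np.random.rand(50, 2)
    y = X @ np.array([1.5, -2.0]) + 0.1 * np.random.rand(50)

    model = LinearRegressor()
    model.fit(X, y)

    # Each record is a dict keyed like the ParamRecord fields above.
    for rec in model.params:
        print(rec['iteration'], rec['loss'], rec.get('rentropy'))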
machinegnostics/models/regression/layer_io_process_rob_rig.py

@@ -0,0 +1,88 @@
+import numpy as np
+from machinegnostics.magcal import DataProcessLayerBase
+from machinegnostics.models.regression.layer_mlflow_rob_reg import InterfaceRobustRegressor
+from machinegnostics.magcal import disable_parent_docstring
+
+@disable_parent_docstring
+class DataProcessRobustRegressor(DataProcessLayerBase, InterfaceRobustRegressor):
+    """
+    Data processing layer for the Robust Regressor model.
+    Handles data preprocessing specific to the Robust Regressor model.
+    """
+    @disable_parent_docstring
+    def __init__(self,
+                 degree: int = 1,
+                 max_iter: int = 100,
+                 tol: float = 1e-3,
+                 mg_loss: str = 'hi',
+                 early_stopping: bool = True,
+                 verbose: bool = False,
+                 scale: str | int | float = 'auto',
+                 data_form: str = 'a',
+                 gnostic_characteristics: bool = True,
+                 history: bool = True,
+                 **kwargs):
+        super().__init__(
+            degree=degree,
+            max_iter=max_iter,
+            tol=tol,
+            mg_loss=mg_loss,
+            early_stopping=early_stopping,
+            verbose=verbose,
+            scale=scale,
+            data_form=data_form,
+            gnostic_characteristics=gnostic_characteristics,
+            history=history,
+            **kwargs
+        )
+
+        # --- argument checks ---
+        if not isinstance(degree, int) or degree < 1:
+            raise ValueError("Degree must be a positive integer.")
+        if not isinstance(max_iter, int) or max_iter < 1:
+            raise ValueError("max_iter must be a positive integer.")
+        if not isinstance(tol, (float, int)) or tol <= 0:
+            raise ValueError("tol must be a positive float or int.")
+        if mg_loss not in ['hi', 'hj']:
+            raise ValueError("mg_loss must be either 'hi' or 'hj'.")
+        if not isinstance(scale, (str, int, float)):
+            raise ValueError("scale must be a string, int, or float.")
+        if isinstance(scale, (int, float)) and (scale < 0 or scale > 2):
+            raise ValueError("scale must be between 0 and 2 if it is a number.")
+        if data_form not in ['a', 'm']:
+            raise ValueError("data_form must be either 'a' (additive) or 'm' (multiplicative).")
+        self.degree = degree
+        self.max_iter = max_iter
+        self.tol = tol
+        self.mg_loss = mg_loss
+        self.early_stopping = early_stopping
+        self.verbose = verbose
+        self.scale = scale
+        self.data_form = data_form
+        self.gnostic_characteristics = gnostic_characteristics
+        self._history = history
+        self.params = []
+
+        # logger
+        self.logger.info("DataProcessRobustRegressor initialized.")
+
+    @disable_parent_docstring
+    def _fit(self, X: np.ndarray, y: np.ndarray):
+        """
+        Fit the model to the data and preprocess it.
+        """
+        self.logger.info("Starting fit process for DataProcessRobustRegressor.")
+        X, y = self._fit_io(X, y)
+        # Call the fit method from the next class in the MRO
+        return super()._fit(X, y)
+
+    @disable_parent_docstring
+    def _predict(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict using the model after preprocessing the input data.
+        """
+        self.logger.info("Making predictions with DataProcessRobustRegressor.")
+        X = self._predict_io(X)
+        y_pred = super()._predict(X)
+        # y_pred = self._convert_output(y_pred, self.data_form)
+        return y_pred
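A sketch of how the argument checks above surface to callers, assuming the parent layers in the MRO accept these constructor arguments without raising first. The class is internal, so in normal use the same validation would be reached through a public model rather than directly.

    from machinegnostics.models.regression.layer_io_process_rob_rig import DataProcessRobustRegressor

    for bad_kwargs in (
        {'degree': 0},        # degree must be a positive integer
        {'mg_loss': 'mse'},   # only 'hi' or 'hj' are accepted
        {'scale': 3.0},       # a numeric scale must lie in [0, 2]
        {'data_form': 'x'},   # only 'a' (additive) or 'm' (multiplicative)
    ):
        try:
            DataProcessRobustRegressor(**bad_kwargs)
        except ValueError as exc:
            print(type(exc).__name__, '-', exc)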
machinegnostics/models/regression/layer_mlflow_rob_reg.py

@@ -0,0 +1,134 @@
+import numpy as np
+from machinegnostics.models.regression.layer_histroy_rob_reg import HistoryRobustRegressor
+import mlflow
+import os
+import joblib
+
+class InterfaceRobustRegressor(HistoryRobustRegressor, mlflow.pyfunc.PythonModel):
+    """
+    Interface for the Robust Regressor model with MLflow integration.
+
+    This class extends HistoryRobustRegressor to provide an interface for
+    logging and tracking model parameters and performance metrics using MLflow.
+
+    Parameters needed for MLflow tracking:
+    - experiment_name: name of the MLflow experiment
+    - run_name: name of the MLflow run
+    """
+
+    def __init__(self,
+                 degree: int = 1,
+                 max_iter: int = 100,
+                 tol: float = 1e-8,
+                 mg_loss: str = 'hi',
+                 early_stopping: bool = True,
+                 verbose: bool = False,
+                 scale: str | int | float = 'auto',
+                 data_form: str = 'a',
+                 gnostic_characteristics: bool = True,
+                 history: bool = True):
+        super().__init__(
+            degree=degree,
+            max_iter=max_iter,
+            tol=tol,
+            mg_loss=mg_loss,
+            early_stopping=early_stopping,
+            verbose=verbose,
+            scale=scale,
+            data_form=data_form,
+            gnostic_characteristics=gnostic_characteristics,
+            history=history
+        )
+        self.coefficients = None
+        self.weights = None
+        self.degree = degree
+        self.max_iter = max_iter
+        self.tol = tol
+        self.mg_loss = mg_loss
+        self.early_stopping = early_stopping
+        self.verbose = verbose
+        self.scale = scale
+        self.data_form = data_form
+        self.gnostic_characteristics = gnostic_characteristics
+        self._history = history
+        self.params = []
+
+        # logger
+        self.logger.info("InterfaceRobustRegressor initialized.")
+
+    def _fit(self, X: np.ndarray, y: np.ndarray):
+        """
+        Fit the model to the data and log parameters to MLflow.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Input features.
+        y : np.ndarray
+            Target values.
+        """
+        # Call the fit method from HistoryRobustRegressor
+        self.logger.info("Starting fit process for InterfaceRobustRegressor. Logging to MLflow available.")
+        super()._fit(X, y)
+        return self
+
+    def _predict(self, model_input) -> np.ndarray:
+        """
+        Predict target values for input data and log predictions to MLflow.
+
+        Accepts numpy arrays, pandas DataFrames, or pyspark DataFrames.
+
+        Parameters
+        ----------
+        model_input : np.ndarray, pd.DataFrame, or pyspark.sql.DataFrame
+            Input data for prediction.
+
+        Returns
+        -------
+        np.ndarray
+            Predicted target values.
+        """
+        self.logger.info("Making predictions with InterfaceRobustRegressor.")
+        predictions = super()._predict(model_input)
+        return predictions
+
+    def save_model(self, path: str):
+        """
+        Save the trained model to disk using joblib.
+
+        Parameters
+        ----------
+        path : str
+            Directory path where the model will be saved. The directory is
+            created if it does not exist, and a previously saved model is
+            overwritten. The model is written to a file named "model.pkl"
+            inside this directory.
+        """
+        self.logger.info(f"Saving model to {path}.")
+        os.makedirs(path, exist_ok=True)
+        joblib.dump(self, os.path.join(path, "model.pkl"))
+
+    @classmethod
+    def load_model(cls, path: str):
+        """
+        Load a trained model from disk using joblib.
+
+        Parameters
+        ----------
+        path : str
+            Directory path where the model is saved. The model is read from
+            a file named "model.pkl" in this directory.
+
+        Returns
+        -------
+        InterfaceRobustRegressor
+            An instance of the model loaded from the specified path.
+        """
+        return joblib.load(os.path.join(path, "model.pkl"))
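To close, a hedged sketch of the joblib persistence round trip defined by `save_model`/`load_model` above, again assuming a public `LinearRegressor` inherits this interface (not confirmed by this diff):

    import numpy as np
    from machinegnostics.models import LinearRegressor  # assumed public entry point; not confirmed by this diff

    X = np.random.rand(30, 2)
    y = np.random.rand(30)

    model = LinearRegressor()
    model.fit(X, y)

    model.save_model("artifacts/robreg")                # writes artifacts/robreg/model.pkl
    restored = LinearRegressor.load_model("artifacts/robreg")
    print(np.allclose(model.predict(X), restored.predict(X)))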