optiml 1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optiml/__init__.py +0 -0
- optiml/ml/__init__.py +0 -0
- optiml/ml/neural_network/__init__.py +3 -0
- optiml/ml/neural_network/_base.py +475 -0
- optiml/ml/neural_network/activations.py +79 -0
- optiml/ml/neural_network/initializers.py +66 -0
- optiml/ml/neural_network/layers.py +183 -0
- optiml/ml/neural_network/losses.py +178 -0
- optiml/ml/neural_network/regularizers.py +87 -0
- optiml/ml/svm/__init__.py +3 -0
- optiml/ml/svm/_base.py +1442 -0
- optiml/ml/svm/kernels.py +208 -0
- optiml/ml/svm/losses.py +284 -0
- optiml/ml/svm/smo.py +797 -0
- optiml/ml/tests/__init__.py +0 -0
- optiml/ml/tests/_datasets.py +49 -0
- optiml/ml/tests/_utils.py +28 -0
- optiml/ml/tests/test_initializers.py +33 -0
- optiml/ml/tests/test_neural_network.py +86 -0
- optiml/ml/tests/test_svc.py +245 -0
- optiml/ml/tests/test_svr.py +256 -0
- optiml/ml/utils.py +252 -0
- optiml/opti/__init__.py +4 -0
- optiml/opti/_base.py +309 -0
- optiml/opti/constrained/__init__.py +9 -0
- optiml/opti/constrained/_base.py +404 -0
- optiml/opti/constrained/active_set.py +228 -0
- optiml/opti/constrained/frank_wolfe.py +158 -0
- optiml/opti/constrained/interior_point.py +282 -0
- optiml/opti/constrained/projected_gradient.py +138 -0
- optiml/opti/constrained/tests/__init__.py +0 -0
- optiml/opti/constrained/tests/test_active_set.py +16 -0
- optiml/opti/constrained/tests/test_frank_wolfe.py +16 -0
- optiml/opti/constrained/tests/test_interior_point.py +16 -0
- optiml/opti/constrained/tests/test_lagrangian_quadratic.py +26 -0
- optiml/opti/constrained/tests/test_lower_bound.py +29 -0
- optiml/opti/constrained/tests/test_projected_gradient.py +16 -0
- optiml/opti/unconstrained/__init__.py +6 -0
- optiml/opti/unconstrained/_base.py +63 -0
- optiml/opti/unconstrained/line_search/__init__.py +10 -0
- optiml/opti/unconstrained/line_search/_base.py +106 -0
- optiml/opti/unconstrained/line_search/conjugate_gradient.py +255 -0
- optiml/opti/unconstrained/line_search/gradient_descent.py +212 -0
- optiml/opti/unconstrained/line_search/line_search.py +248 -0
- optiml/opti/unconstrained/line_search/newton.py +198 -0
- optiml/opti/unconstrained/line_search/quasi_newton.py +496 -0
- optiml/opti/unconstrained/proximal_bundle.py +219 -0
- optiml/opti/unconstrained/stochastic/__init__.py +12 -0
- optiml/opti/unconstrained/stochastic/_base.py +246 -0
- optiml/opti/unconstrained/stochastic/adadelta.py +133 -0
- optiml/opti/unconstrained/stochastic/adagrad.py +123 -0
- optiml/opti/unconstrained/stochastic/adam.py +179 -0
- optiml/opti/unconstrained/stochastic/adamax.py +178 -0
- optiml/opti/unconstrained/stochastic/amsgrad.py +177 -0
- optiml/opti/unconstrained/stochastic/gradient_descent.py +135 -0
- optiml/opti/unconstrained/stochastic/rmsprop.py +156 -0
- optiml/opti/unconstrained/stochastic/schedules.py +89 -0
- optiml/opti/unconstrained/tests/__init__.py +0 -0
- optiml/opti/unconstrained/tests/test_adadelta.py +20 -0
- optiml/opti/unconstrained/tests/test_adagrad.py +20 -0
- optiml/opti/unconstrained/tests/test_adam.py +42 -0
- optiml/opti/unconstrained/tests/test_adamax.py +41 -0
- optiml/opti/unconstrained/tests/test_amsgrad.py +40 -0
- optiml/opti/unconstrained/tests/test_conjugate_gradient.py +35 -0
- optiml/opti/unconstrained/tests/test_functions.py +34 -0
- optiml/opti/unconstrained/tests/test_gradient_descent.py +51 -0
- optiml/opti/unconstrained/tests/test_newton.py +20 -0
- optiml/opti/unconstrained/tests/test_quasi_newton.py +30 -0
- optiml/opti/unconstrained/tests/test_rmsprop.py +40 -0
- optiml/opti/unconstrained/tests/test_verbose.py +25 -0
- optiml/opti/utils.py +353 -0
- optiml-1.7.dist-info/METADATA +203 -0
- optiml-1.7.dist-info/RECORD +76 -0
- optiml-1.7.dist-info/WHEEL +5 -0
- optiml-1.7.dist-info/licenses/LICENSE +21 -0
- optiml-1.7.dist-info/top_level.txt +1 -0
optiml/__init__.py
ADDED
|
File without changes
|
optiml/ml/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from abc import ABC
|
|
3
|
+
|
|
4
|
+
import autograd.numpy as np
|
|
5
|
+
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
|
|
6
|
+
from sklearn.exceptions import ConvergenceWarning
|
|
7
|
+
from sklearn.metrics import accuracy_score
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
|
|
10
|
+
from .activations import sigmoid, linear, softmax
|
|
11
|
+
from .layers import Layer, ParamLayer
|
|
12
|
+
from .losses import (CategoricalCrossEntropy, SparseCategoricalCrossEntropy,
|
|
13
|
+
MeanSquaredError, BinaryCrossEntropy, mean_squared_error, NeuralNetworkLoss)
|
|
14
|
+
from ...opti import Optimizer
|
|
15
|
+
from ...opti.unconstrained import ProximalBundle
|
|
16
|
+
from ...opti.unconstrained.line_search import LineSearchOptimizer
|
|
17
|
+
from ...opti.unconstrained.stochastic import StochasticOptimizer, StochasticGradientDescent, StochasticMomentumOptimizer
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class NeuralNetwork(BaseEstimator, Layer, ABC):
    """
    Base abstract class for all feed-forward neural network estimators.
    It chains a sequence of layers, performs forward/backward propagation
    and trains the network parameters by minimizing the given loss with
    the chosen optimizer.

    Parameters
    ----------

    layers : tuple of `Layer` instances, default=()
        The ordered sequence of layers composing the network.

    loss : `NeuralNetworkLoss` subclass, default=mean_squared_error
        Specifies the loss function to minimize.

    optimizer : `Optimizer` subclass, default=StochasticGradientDescent
        The solver for optimization. It can be a subclass of the
        `LineSearchOptimizer`, the `ProximalBundle` method or a subclass
        of the `StochasticOptimizer`.

    learning_rate : float, default=0.01
        The initial learning rate used for weight update. It controls the
        step-size in updating the weights. Only used when ``optimizer`` is a
        subclass of `StochasticOptimizer`.

    max_iter : int, default=1000
        Maximum number of iterations. The solver iterates until convergence
        (determined by ``tol``) or this number of iterations. If the optimizer
        is a subclass of `StochasticOptimizer`, this value determines the number
        of epochs, not the number of gradient steps.

    momentum_type : {'none', 'polyak', 'nesterov'}, default='none'
        Momentum type used for weight update. Only used when ``optimizer`` is
        a subclass of `StochasticMomentumOptimizer`.

    momentum : float, default=0.9
        Momentum for weight update. Should be between 0 and 1. Only used when
        ``optimizer`` is a subclass of `StochasticMomentumOptimizer`.

    tol : float, default=1e-4
        Tolerance for stopping criterion.

    validation_split : float, default=0.
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1. Only used when ``optimizer``
        is a subclass of `StochasticOptimizer`.

    batch_size : int, default=None
        Size of mini batches for stochastic optimizers.
        Only used when ``optimizer`` is a subclass of `StochasticOptimizer`.

    max_f_eval : int, default=15000
        Maximum number of loss function calls. Only used when ``optimizer``
        is a subclass of `LineSearchOptimizer`.

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when the
        monitored score/loss does not improve by at least ``tol`` for
        ``patience`` consecutive epochs.
        Only used when ``optimizer`` is a subclass of `StochasticOptimizer`.

    patience : int, default=5
        Maximum number of epochs to not meet ``tol`` improvement.
        Only used when ``optimizer`` is a subclass of `StochasticOptimizer`.

    shuffle : bool, default=True
        Whether to shuffle samples for batch sampling in each iteration. Only
        used when the ``optimizer`` is a subclass of `StochasticOptimizer`.

    random_state : int, RandomState instance or None, default=None
        Controls the pseudo random number generation for the train-validation
        split and for shuffling the data in batch sampling.
        Pass an int for reproducible output across multiple function calls.

    mu : float, default=1
        Mu parameter for the proximal bundle method.
        Only used when ``optimizer`` is `ProximalBundle`. Must be strictly positive.

    master_solver : string, default='clarabel'
        Master solver for the proximal bundle method for the CVXPY interface.
        Only used when ``optimizer`` is `ProximalBundle`.

    master_verbose : bool or int, default=False
        Controls the verbosity of the CVXPY interface.
        Only used when ``optimizer`` is `ProximalBundle`.

    verbose : bool or int, default=False
        Controls the verbosity of progress messages to stdout. Use a boolean value
        to switch on/off or an int value to show progress each ``verbose`` time
        optimization steps.

    Notes
    -----
    ``loss`` and ``optimizer`` are passed as *classes*. ``fit`` replaces
    ``self.loss`` and ``self.optimizer`` with the instances it builds from them.
    """

    def __init__(self,
                 layers=(),
                 loss=mean_squared_error,
                 optimizer=StochasticGradientDescent,
                 learning_rate=0.01,
                 max_iter=1000,
                 momentum_type='none',
                 momentum=0.9,
                 tol=1e-4,
                 validation_split=0.,
                 batch_size=None,
                 max_f_eval=15000,
                 early_stopping=False,
                 patience=5,
                 shuffle=True,
                 random_state=None,
                 mu=1,
                 master_solver='clarabel',
                 master_verbose=False,
                 verbose=False):
        """
        Store the hyper-parameters and, for stochastic optimizers, create the
        per-epoch bookkeeping used by the training callback.

        Raises
        ------
        TypeError
            If ``loss`` is not a subclass of `NeuralNetworkLoss` or ``optimizer``
            is not a subclass of `Optimizer`.
        """
        self.layers = layers
        # both `loss` and `optimizer` must be classes: issubclass() itself raises
        # TypeError when given an instance
        if not issubclass(loss, NeuralNetworkLoss):
            raise TypeError(f'{loss} is not an allowed neural network loss function')
        self.loss = loss
        if not issubclass(optimizer, Optimizer):
            raise TypeError(f'{optimizer} is not an allowed optimization method')
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.momentum_type = momentum_type
        self.momentum = momentum
        self.tol = tol
        self.max_iter = max_iter
        self.batch_size = batch_size
        self.validation_split = validation_split
        self.max_f_eval = max_f_eval
        self.early_stopping = early_stopping
        self.patience = patience
        self.shuffle = shuffle
        self.random_state = random_state
        self.mu = mu
        self.master_solver = master_solver
        self.master_verbose = master_verbose
        self.verbose = verbose
        # histories and early-stopping state exist only for stochastic optimizers
        if issubclass(self.optimizer, StochasticOptimizer):
            self.train_loss_history = []
            self.train_score_history = []
            self._no_improvement_count = 0
            self._avg_epoch_loss = 0
            if self.validation_split:
                # with a validation set the monitored quantity is the validation score
                self.val_loss_history = []
                self.val_score_history = []
                self.best_val_score = -np.inf
            else:
                # without one, the monitored quantity is the training loss
                self.best_loss = np.inf

    def forward(self, X):
        """Feed ``X`` through every layer in order and return the last layer's output."""
        for layer in self.layers:
            X = layer.forward(X)
        return X

    def backward(self, delta):
        """
        Backpropagate ``delta`` through the layers in reverse order.

        For every `ParamLayer` the layer gradients (``grads['dW']`` and, when the
        layer fits an intercept, ``grads['db']``) are summed with the jacobian of
        the layer's regularizer divided by the number of rows of the layer's
        cached input ``layer._X``.

        Returns
        -------
        (coef_grads, inter_grads) : tuple of lists
            Gradients listed in forward (first-to-last layer) order.
        """
        coef_grads = []
        inter_grads = []
        # backpropagate
        for layer in self.layers[::-1]:
            if isinstance(layer, ParamLayer):
                delta, grads = layer.backward(delta)
                coef_grads.append(grads['dW'] + layer.coef_reg.jacobian(layer.coef_) / layer._X.shape[0])
                if layer.fit_intercept:
                    inter_grads.append(grads['db'] + layer.inter_reg.jacobian(layer.inter_) / layer._X.shape[0])
            else:
                # layers without parameters only transform delta
                delta = layer.backward(delta)
        # lists were filled last-to-first: flip them back to layer order
        return coef_grads[::-1], inter_grads[::-1]

    @property
    def coefs_(self):
        """List of the ``coef_`` arrays of all `ParamLayer` layers, in layer order."""
        return [layer.coef_ for layer in self.layers if isinstance(layer, ParamLayer)]

    @property
    def intercepts_(self):
        """List of the ``inter_`` arrays of the `ParamLayer` layers that fit an intercept."""
        return [layer.inter_ for layer in self.layers if isinstance(layer, ParamLayer) and layer.fit_intercept]

    def _pack(self, coefs, intercepts):
        """Flatten all coefficient arrays followed by all intercept arrays into one 1-D vector."""
        return np.hstack([w.ravel() for w in coefs + intercepts])

    def _unpack(self, packed_coef_inter):
        """
        Write a packed parameter vector back into the layers.

        Inverse of ``_pack``: slices ``packed_coef_inter`` with the index tables
        built by ``_store_meta_info`` and assigns each slice to the matching
        layer's ``coef_`` (reshaped) and ``inter_`` (left 1-D).
        """
        coef_idx = 0
        inter_idx = 0
        for layer in self.layers:
            if isinstance(layer, ParamLayer):
                start, end, shape = self.coef_idx[coef_idx]
                layer.coef_ = np.reshape(packed_coef_inter[start:end], shape)
                if layer.fit_intercept:
                    start, end = self.inter_idx[inter_idx]
                    layer.inter_ = packed_coef_inter[start:end]
                    inter_idx += 1
                coef_idx += 1

    def _store_meta_info(self):
        """
        Build ``self.coef_idx`` and ``self.inter_idx``, the offset tables that map
        each layer's parameters to their position in the packed vector
        (all coefficients first, then all intercepts — the same order as ``_pack``).
        """
        # store meta information for the parameters
        self.coef_idx = []
        self.inter_idx = []
        start = 0
        # save sizes and indices of coefs for faster unpacking
        for layer in self.layers:
            if isinstance(layer, ParamLayer):
                end = start + (np.prod(layer.coef_.shape))
                self.coef_idx.append((start, end, layer.coef_.shape))
                start = end
        # save sizes and indices of intercepts for faster unpacking
        for layer in self.layers:
            if isinstance(layer, ParamLayer) and layer.fit_intercept:
                # NOTE(review): this requires a 2-D ``inter_`` (``fan_in`` is unused), but
                # ``_unpack`` stores a 1-D slice, so a second ``fit`` would presumably
                # fail here with IndexError — confirm against layers.py
                fan_in, fan_out = layer.inter_.shape[0], layer.inter_.shape[1]
                end = start + fan_out
                self.inter_idx.append((start, end))
                start = end

    def _store_train_val_info(self, opt, X_batch, y_batch, X_val, y_val):
        """
        Callback handed to the stochastic optimizers by ``fit`` (with
        ``callback_args=(X_val, y_val)``): tracks the sample-weighted average
        training loss and, if a validation split is used, the validation loss.

        Histories are appended only when ``opt.is_batch_end()`` is true.
        NOTE(review): the name says "batch", but the per-epoch average is finalized
        there, so it presumably marks the end of a full pass — confirm in
        `StochasticOptimizer`.
        """
        # weight the batch loss by the batch size so the epoch average is per-sample
        self._avg_epoch_loss += opt.f_x * X_batch.shape[0]
        if opt.is_batch_end():
            self._avg_epoch_loss /= opt.f.X.shape[0]  # n_samples
            self.train_loss_history.append(self._avg_epoch_loss)
            if opt.is_verbose() and opt.epoch != opt.iter:
                print('\tavg_loss: {: 1.4e}'.format(self._avg_epoch_loss), end='')
            # reset the accumulator for the next epoch
            self._avg_epoch_loss = 0.
            if self.validation_split:
                # ``self.loss`` is the loss instance created in ``fit`` by the time
                # the optimizer invokes this callback
                val_loss = self.loss(opt.x, X_val, y_val)
                self.val_loss_history.append(val_loss)
                if opt.is_verbose():
                    print('\tval_loss: {: 1.4e}'.format(val_loss), end='')

    def _update_no_improvement_count(self, opt):
        """
        Early-stopping bookkeeping; does nothing unless ``early_stopping`` is set.

        Monitors the last validation score (when ``validation_split`` is set) or
        the last training loss otherwise. An epoch that fails to improve on the
        best value by at least ``tol`` increments the counter, any other epoch
        resets it. Once the counter reaches ``patience`` the best validation
        parameters (if any) are written to ``opt.x`` and ``StopIteration`` is raised.

        Raises
        ------
        StopIteration
            To interrupt training — presumably caught by the optimizer loop;
            confirm in `StochasticOptimizer`.
        """
        if self.early_stopping:

            if self.validation_split:  # monitor val_score

                if self.val_score_history[-1] < self.best_val_score + self.tol:
                    self._no_improvement_count += 1
                else:
                    self._no_improvement_count = 0
                if self.val_score_history[-1] > self.best_val_score:
                    # remember the best parameters seen so far so they can be restored
                    self.best_val_score = self.val_score_history[-1]
                    self._best_coefs = [coef.copy() for coef in self.coefs_]
                    self._best_intercepts = [inter.copy() for inter in self.intercepts_]

            else:  # monitor train_loss

                if self.train_loss_history[-1] > self.best_loss - self.tol:
                    self._no_improvement_count += 1
                else:
                    self._no_improvement_count = 0
                if self.train_loss_history[-1] < self.best_loss:
                    self.best_loss = self.train_loss_history[-1]

            if self._no_improvement_count >= self.patience:

                if self.validation_split:
                    # roll the optimizer back to the best validation parameters
                    opt.x = self._pack(self._best_coefs, self._best_intercepts)

                if self.verbose:
                    if self.validation_split:
                        print(f'\ntraining stopped since validation score did not improve more than '
                              f'tol={self.tol} for {self.patience} consecutive epochs')
                    else:
                        print('\ntraining stopped since training loss did not improve more than '
                              f'tol={self.tol} for {self.patience} consecutive epochs')

                raise StopIteration

    def fit(self, X, y):
        """
        Train the network on ``(X, y)`` and return ``self``.

        The current layer parameters are packed into one vector, the loss is
        instantiated as ``self.loss(self, X, y)`` and minimized with the optimizer
        selected at construction time; the optimizer's final ``x`` is then
        unpacked back into the layers. Dispatch is on the optimizer class:
        `LineSearchOptimizer`, `ProximalBundle` or `StochasticOptimizer` (which
        optionally holds out a validation split first).

        Raises
        ------
        TypeError
            If the optimizer is none of the supported families.

        Warns
        -----
        ConvergenceWarning
            If a line-search optimizer stops on ``max_iter``/``max_f_eval`` or the
            proximal bundle master problem reports an error.

        NOTE(review): ``self.loss`` and ``self.optimizer`` are overwritten with
        instances below, so calling ``fit`` again on the same object would pass
        an instance to ``issubclass`` and raise ``TypeError`` — confirm whether
        refitting is meant to be supported.
        """

        self._store_meta_info()

        packed_coef_inter = self._pack(self.coefs_, self.intercepts_)

        if issubclass(self.optimizer, LineSearchOptimizer):

            self.loss = self.loss(self, X, y)
            self.optimizer = self.optimizer(f=self.loss,
                                            x=packed_coef_inter,
                                            max_iter=self.max_iter,
                                            max_f_eval=self.max_f_eval,
                                            verbose=self.verbose).minimize()

            # tell the user which budget was exhausted
            if self.optimizer.status == 'stopped':
                if self.optimizer.iter >= self.max_iter:
                    warnings.warn('max_iter reached but the optimization has not converged yet', ConvergenceWarning)
                elif self.optimizer.f_eval >= self.max_f_eval:
                    warnings.warn('max_f_eval reached but the optimization has not converged yet', ConvergenceWarning)

        elif issubclass(self.optimizer, ProximalBundle):

            self.loss = self.loss(self, X, y)
            self.optimizer = self.optimizer(f=self.loss,
                                            x=packed_coef_inter,
                                            mu=self.mu,
                                            max_iter=self.max_iter,
                                            master_solver=self.master_solver,
                                            master_verbose=self.master_verbose,
                                            verbose=self.verbose).minimize()

            if self.optimizer.status == 'error':
                warnings.warn('failure while computing direction for the master problem', ConvergenceWarning)

        elif issubclass(self.optimizer, StochasticOptimizer):

            if self.validation_split:
                # don't stratify in multi-label classification
                should_stratify = isinstance(self, NeuralNetworkClassifier) and self.layers[-1].fan_out == 1
                stratify = y if should_stratify else None
                X, X_val, y, y_val = train_test_split(X, y,
                                                      stratify=stratify,
                                                      test_size=self.validation_split,
                                                      random_state=self.random_state)
            else:
                X_val = None
                y_val = None

            # the loss is built on the training part only (after the split)
            self.loss = self.loss(self, X, y)

            # momentum optimizers take two extra arguments; everything else is shared
            if issubclass(self.optimizer, StochasticMomentumOptimizer):

                self.optimizer = self.optimizer(f=self.loss,
                                                x=packed_coef_inter,
                                                step_size=self.learning_rate,
                                                epochs=self.max_iter,
                                                batch_size=self.batch_size,
                                                momentum_type=self.momentum_type,
                                                momentum=self.momentum,
                                                callback=self._store_train_val_info,
                                                callback_args=(X_val, y_val),
                                                shuffle=self.shuffle,
                                                random_state=self.random_state,
                                                verbose=self.verbose).minimize()

            else:

                self.optimizer = self.optimizer(f=self.loss,
                                                x=packed_coef_inter,
                                                step_size=self.learning_rate,
                                                epochs=self.max_iter,
                                                batch_size=self.batch_size,
                                                callback=self._store_train_val_info,
                                                callback_args=(X_val, y_val),
                                                shuffle=self.shuffle,
                                                random_state=self.random_state,
                                                verbose=self.verbose).minimize()

        else:

            raise TypeError(f'{self.optimizer} is not an allowed optimizer')

        # copy the optimizer's final parameters back into the layers
        self._unpack(self.optimizer.x)

        return self
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
class NeuralNetworkClassifier(ClassifierMixin, NeuralNetwork):
    """
    Feed-forward neural network for classification. The output layer must be
    sigmoid (binary/multi-label) or softmax (multi-class), consistently with
    the chosen loss function.
    """

    def _loss_name(self):
        """
        Return the name of the loss class for use in error messages.

        ``self.loss`` holds the loss *class* until the base ``fit`` replaces it
        with an instance, so ``type(self.loss).__name__`` would report the
        metaclass name while validating; handle both cases explicitly.
        """
        loss_cls = self.loss if isinstance(self.loss, type) else type(self.loss)
        return loss_cls.__name__

    def _store_train_val_info(self, opt, X_batch, y_batch, X_val, y_val):
        """
        Training callback: extends the base loss bookkeeping with accuracy.

        When ``opt.is_batch_end()`` is true it records the accuracy on the batch
        passed to this call (``X_batch``, ``y_batch``) and, if a validation
        split is in use, on the validation set; then it updates the
        early-stopping counter.
        """
        super(NeuralNetworkClassifier, self)._store_train_val_info(opt, X_batch, y_batch, X_val, y_val)
        if opt.is_batch_end():
            acc = self.score(X_batch, y_batch)
            self.train_score_history.append(acc)
            if opt.is_verbose():
                print('\tacc: {:1.4f}'.format(acc), end='')
            if self.validation_split:
                val_acc = self.score(X_val, y_val)
                self.val_score_history.append(val_acc)
                if opt.is_verbose():
                    print('\tval_acc: {:1.4f}'.format(val_acc), end='')
            self._update_no_improvement_count(opt)

    def fit(self, X, y):
        """
        Validate that loss, output activation and output size agree with the
        targets, then delegate training to the base ``fit``.

        A 1-D ``y`` is reshaped to a column. The number of classes is the number
        of columns of ``y`` for `CategoricalCrossEntropy`, otherwise the number
        of distinct values in ``y``.

        Raises
        ------
        ValueError
            If a (sparse) categorical cross-entropy loss is not paired with a
            softmax output layer of ``n_classes`` neurons, or if a mean squared
            error / binary cross-entropy loss is used with more than two
            classes, a non-sigmoid output layer or more than one output neuron.
        """
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        n_classes = y.shape[1] if self.loss == CategoricalCrossEntropy else np.unique(y).size
        if self.loss in (SparseCategoricalCrossEntropy, CategoricalCrossEntropy):
            if self.layers[-1].activation != softmax:
                raise ValueError(f'NeuralNetworkClassifier with {self._loss_name()} loss '
                                 'function only works with softmax output layer')
            if self.layers[-1].fan_out != n_classes:
                raise ValueError('the number of neurons in the output layer must '
                                 f'be equal to the number of classes, i.e., {n_classes}')
        elif self.loss in (MeanSquaredError, BinaryCrossEntropy):
            if n_classes > 2:
                raise ValueError(f'NeuralNetworkClassifier with {self._loss_name()} '
                                 'loss function only works for binary classification')
            if self.layers[-1].activation != sigmoid:
                raise ValueError(f'NeuralNetworkClassifier with {self._loss_name()} '
                                 'loss function only works with sigmoid output layer')
            if self.layers[-1].fan_out != 1:
                raise ValueError(f'NeuralNetworkClassifier with {self._loss_name()} loss '
                                 'function only works with one neuron in the output layer')

        return super(NeuralNetworkClassifier, self).fit(X, y)

    def predict(self, X):
        """
        Predict labels for ``X``.

        With a sigmoid output layer the network output is thresholded at 0.5
        (boolean array); with a softmax output layer the index of the largest
        output along axis 1 is returned; otherwise the raw network output.
        """
        if self.layers[-1].activation == sigmoid:
            return self.forward(X) >= 0.5
        elif self.layers[-1].activation == softmax:
            return np.argmax(self.forward(X), axis=1)
        else:
            return self.forward(X)

    def score(self, X, y, sample_weight=None):
        """
        Return the accuracy of ``predict(X)`` against ``y``.

        When the fitted loss is a `CategoricalCrossEntropy` instance the one-hot
        ``y`` is first converted to class indices with ``argmax``.
        """
        y = np.argmax(y, axis=1) if isinstance(self.loss, CategoricalCrossEntropy) else y
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
class NeuralNetworkRegressor(RegressorMixin, NeuralNetwork):
    """
    Feed-forward neural network for regression. The output layer must be
    linear or, for regression between 0 and 1, sigmoid. The number of output
    neurons must equal the number of targets.
    """

    def _store_train_val_info(self, opt, X_batch, y_batch, X_val, y_val):
        """
        Training callback: extends the base loss bookkeeping with the R2 score.

        When ``opt.is_batch_end()`` is true it records the score on the batch
        passed to this call and, if a validation split is in use, on the
        validation set; then it updates the early-stopping counter.
        """
        super(NeuralNetworkRegressor, self)._store_train_val_info(opt, X_batch, y_batch, X_val, y_val)
        if opt.is_batch_end():
            r2 = self.score(X_batch, y_batch)
            self.train_score_history.append(r2)
            if opt.is_verbose():
                print('\tr2: {: 1.4f}'.format(r2), end='')
            # The validation data and ``val_score_history`` exist only when a
            # validation split was requested, independently of early stopping:
            # gate on ``validation_split`` (as the classifier does), otherwise
            # ``early_stopping=True`` without a split scores ``X_val=None`` and a
            # split without early stopping never records the validation score.
            if self.validation_split:
                val_r2 = self.score(X_val, y_val)
                self.val_score_history.append(val_r2)
                if opt.is_verbose():
                    print('\tval_r2: {: 1.4f}'.format(val_r2), end='')
            self._update_no_improvement_count(opt)

    def fit(self, X, y):
        """
        Validate the output layer against the loss and the targets, then
        delegate training to the base ``fit``.

        A 1-D ``y`` is reshaped to a column.

        Raises
        ------
        ValueError
            If the output activation is neither linear nor sigmoid; if
            `BinaryCrossEntropy` is used without a sigmoid output layer or with
            targets outside ``[0, 1]``; or if the number of output neurons
            differs from the number of target columns.
        """
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        if self.layers[-1].activation not in (linear, sigmoid):
            raise ValueError('NeuralNetworkRegressor only works with linear or '
                             'sigmoid (for regression between 0 and 1) output layer')
        if self.loss == BinaryCrossEntropy:
            if self.layers[-1].activation != sigmoid:
                raise ValueError('NeuralNetworkRegressor with binary_cross_entropy loss function only '
                                 'works with sigmoid output layer for regression between 0 and 1')
            # element-wise range check: a chained ``0 <= y <= 1`` would call bool()
            # on an array and raise "truth value of an array is ambiguous"
            if not ((y >= 0) & (y <= 1)).all():
                raise ValueError('NeuralNetworkRegressor with binary_cross_entropy loss '
                                 'function only works for regression between 0 and 1')
        n_targets = y.shape[1]
        if self.layers[-1].fan_out != n_targets:
            raise ValueError(f'the number of neurons in the output layer must be '
                             f'equal to the number of targets, i.e., {n_targets}')

        return super(NeuralNetworkRegressor, self).fit(X, y)

    def predict(self, X):
        """
        Return the network output for ``X``: flattened to 1-D when the output
        layer has a single neuron, otherwise as produced by ``forward``.
        """
        if self.layers[-1].fan_out == 1:  # one target
            return self.forward(X).ravel()
        else:  # multi target
            return self.forward(X)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from autograd.scipy.special import expit
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Activation(ABC):
    """
    Common interface of the activation functions.

    A concrete activation overrides ``function`` (the activation itself) and
    ``jacobian`` (its element-wise derivative); both raise
    ``NotImplementedError`` here. Calling an instance is a shortcut for
    ``function``.
    """

    def function(self, x):
        """Evaluate the activation at ``x``; must be overridden."""
        raise NotImplementedError

    def jacobian(self, x):
        """Evaluate the element-wise derivative at ``x``; must be overridden."""
        raise NotImplementedError

    def __call__(self, x):
        """Make the instance callable: forwards to ``function``."""
        activated = self.function(x)
        return activated
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Linear(Activation):
    r"""Identity activation: :math:`f(x) = x`, with derivative 1 everywhere."""

    def function(self, x):
        """Return ``x`` untouched."""
        return x

    def jacobian(self, x):
        """Return an array of ones with the shape and dtype of ``x``."""
        unit_slope = np.ones_like(x)
        return unit_slope
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ReLU(Activation):
    r"""Rectified linear unit: :math:`f(x) = \max(0, x)`."""

    def function(self, x):
        """Clamp negative entries of ``x`` to zero."""
        rectified = np.maximum(0., x)
        return rectified

    def jacobian(self, x):
        """Return 1 where ``x`` is strictly positive and 0 elsewhere (including at 0)."""
        is_active = x > 0
        return np.where(is_active, 1., 0.)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Tanh(Activation):
    r"""Hyperbolic tangent: :math:`f(x) = \tanh(x)`, :math:`f'(x) = 1 - \tanh^2(x)`."""

    def function(self, x):
        """Return ``tanh(x)`` element-wise."""
        return np.tanh(x)

    def jacobian(self, x):
        """Return ``1 - tanh(x)**2`` element-wise."""
        activated = self.function(x)
        return 1. - np.square(activated)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class Sigmoid(Activation):
    r"""Logistic sigmoid: :math:`f(x) = \frac{1}{1 + e^{-x}}`, :math:`f'(x) = f(x)(1 - f(x))`."""

    def function(self, x):
        """Return the logistic sigmoid of ``x`` (computed with ``expit``)."""
        return expit(x)

    def jacobian(self, x):
        """Return ``s * (1 - s)`` where ``s`` is the sigmoid of ``x``."""
        s = self.function(x)
        return s * (1. - s)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class SoftMax(Activation):
    r"""Softmax activation: :math:`f(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}`."""

    def function(self, x, axis=-1):
        """
        Return the softmax of ``x`` along ``axis``.

        The maximum along ``axis`` is subtracted before exponentiating, which
        leaves the result unchanged but avoids overflow in ``exp``.
        """
        shifted = x - np.max(x, axis=axis, keepdims=True)
        numerators = np.exp(shifted)
        return numerators / np.sum(numerators, axis=axis, keepdims=True)

    def jacobian(self, x):
        """
        Return an array of ones shaped like ``x``.

        NOTE(review): this is not the softmax Jacobian; presumably the paired
        cross-entropy losses already provide the combined gradient so this
        factor must be neutral — confirm against losses.py.
        """
        return np.ones_like(x)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Ready-to-use instances, one per activation class defined in this module.
# Other modules import these lowercase names (e.g. ``sigmoid``, ``linear`` and
# ``softmax`` in neural_network/_base.py) and compare a layer's activation
# against them; no ``__eq__`` is defined on the classes here, so such a
# comparison only matches these very objects.
linear = Linear()
relu = ReLU()
tanh = Tanh()
sigmoid = Sigmoid()
softmax = SoftMax()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def truncated_normal(shape, mean=0., std=1., random_state=None):
    """
    Draw normal samples and clip them to two standard deviations around ``mean``.

    Parameters
    ----------
    shape : int or tuple of ints
        Shape of the returned array.
    mean : float, default=0.
        Mean of the normal distribution.
    std : float, default=1.
        Standard deviation of the normal distribution.
    random_state : int or None, default=None
        Seed passed to ``np.random.RandomState``.

    Returns
    -------
    ndarray of the given ``shape`` with every value in
    ``[mean - 2 * std, mean + 2 * std]``.

    Notes
    -----
    Out-of-range samples are clipped to the bounds, not redrawn. The interval
    is centred on ``mean``; mirroring ``mean + 2 * std`` around zero (the
    previous behaviour) is only correct for ``mean == 0`` and yields an
    inverted interval for negative means.
    """
    samples = np.random.RandomState(random_state).normal(loc=mean, scale=std, size=shape)
    half_width = 2. * std
    return np.clip(samples, mean - half_width, mean + half_width)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def glorot_normal(shape, random_state=None):
    r"""Glorot (Xavier) normal initializer.

    Samples come from ``truncated_normal`` with zero mean and

    .. math::

        \text{std} = \sqrt{\frac{2}{\text{fan\_in} + \text{fan\_out}}}

    where ``fan_in = shape[0]`` is the number of input units and
    ``fan_out = shape[1]`` the number of output units of the weight tensor."""
    n_in, n_out = shape[0], shape[1]
    scale = np.sqrt(2. / (n_in + n_out))
    return truncated_normal(shape=shape, mean=0., std=scale, random_state=random_state)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def glorot_uniform(shape, random_state=None):
    r"""Glorot (Xavier) uniform initializer.

    Samples are uniform in :math:`[-\text{limit}, \text{limit}]` with

    .. math::

        \text{limit} = \sqrt{\frac{6}{\text{fan\_in} + \text{fan\_out}}}

    where ``fan_in = shape[0]`` is the number of input units and
    ``fan_out = shape[1]`` the number of output units of the weight tensor."""
    n_in, n_out = shape[0], shape[1]
    bound = np.sqrt(6. / (n_in + n_out))
    rng = np.random.RandomState(random_state)
    return rng.uniform(low=-bound, high=bound, size=shape)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def he_normal(shape, random_state=None):
    r"""He normal initializer.

    Samples come from ``truncated_normal`` with zero mean and

    .. math::

        \text{std} = \sqrt{\frac{2}{\text{fan\_in}}}

    where ``fan_in = shape[0]`` is the number of input units of the weight
    tensor. ``shape`` must still have at least two entries."""
    n_in, _n_out = shape[0], shape[1]
    scale = np.sqrt(2. / n_in)
    return truncated_normal(shape=shape, mean=0., std=scale, random_state=random_state)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def he_uniform(shape, random_state=None):
    r"""He uniform variance scaling initializer.

    Samples are uniform in :math:`[-\text{limit}, \text{limit}]` with

    .. math::

        \text{limit} = \sqrt{\frac{6}{\text{fan\_in}}}

    where ``fan_in = shape[0]`` is the number of input units of the weight
    tensor. ``shape`` must still have at least two entries."""
    n_in, _n_out = shape[0], shape[1]
    bound = np.sqrt(6. / n_in)
    rng = np.random.RandomState(random_state)
    return rng.uniform(low=-bound, high=bound, size=shape)
|