optiml 1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optiml/__init__.py +0 -0
- optiml/ml/__init__.py +0 -0
- optiml/ml/neural_network/__init__.py +3 -0
- optiml/ml/neural_network/_base.py +475 -0
- optiml/ml/neural_network/activations.py +79 -0
- optiml/ml/neural_network/initializers.py +66 -0
- optiml/ml/neural_network/layers.py +183 -0
- optiml/ml/neural_network/losses.py +178 -0
- optiml/ml/neural_network/regularizers.py +87 -0
- optiml/ml/svm/__init__.py +3 -0
- optiml/ml/svm/_base.py +1442 -0
- optiml/ml/svm/kernels.py +208 -0
- optiml/ml/svm/losses.py +284 -0
- optiml/ml/svm/smo.py +797 -0
- optiml/ml/tests/__init__.py +0 -0
- optiml/ml/tests/_datasets.py +49 -0
- optiml/ml/tests/_utils.py +28 -0
- optiml/ml/tests/test_initializers.py +33 -0
- optiml/ml/tests/test_neural_network.py +86 -0
- optiml/ml/tests/test_svc.py +245 -0
- optiml/ml/tests/test_svr.py +256 -0
- optiml/ml/utils.py +252 -0
- optiml/opti/__init__.py +4 -0
- optiml/opti/_base.py +309 -0
- optiml/opti/constrained/__init__.py +9 -0
- optiml/opti/constrained/_base.py +404 -0
- optiml/opti/constrained/active_set.py +228 -0
- optiml/opti/constrained/frank_wolfe.py +158 -0
- optiml/opti/constrained/interior_point.py +282 -0
- optiml/opti/constrained/projected_gradient.py +138 -0
- optiml/opti/constrained/tests/__init__.py +0 -0
- optiml/opti/constrained/tests/test_active_set.py +16 -0
- optiml/opti/constrained/tests/test_frank_wolfe.py +16 -0
- optiml/opti/constrained/tests/test_interior_point.py +16 -0
- optiml/opti/constrained/tests/test_lagrangian_quadratic.py +26 -0
- optiml/opti/constrained/tests/test_lower_bound.py +29 -0
- optiml/opti/constrained/tests/test_projected_gradient.py +16 -0
- optiml/opti/unconstrained/__init__.py +6 -0
- optiml/opti/unconstrained/_base.py +63 -0
- optiml/opti/unconstrained/line_search/__init__.py +10 -0
- optiml/opti/unconstrained/line_search/_base.py +106 -0
- optiml/opti/unconstrained/line_search/conjugate_gradient.py +255 -0
- optiml/opti/unconstrained/line_search/gradient_descent.py +212 -0
- optiml/opti/unconstrained/line_search/line_search.py +248 -0
- optiml/opti/unconstrained/line_search/newton.py +198 -0
- optiml/opti/unconstrained/line_search/quasi_newton.py +496 -0
- optiml/opti/unconstrained/proximal_bundle.py +219 -0
- optiml/opti/unconstrained/stochastic/__init__.py +12 -0
- optiml/opti/unconstrained/stochastic/_base.py +246 -0
- optiml/opti/unconstrained/stochastic/adadelta.py +133 -0
- optiml/opti/unconstrained/stochastic/adagrad.py +123 -0
- optiml/opti/unconstrained/stochastic/adam.py +179 -0
- optiml/opti/unconstrained/stochastic/adamax.py +178 -0
- optiml/opti/unconstrained/stochastic/amsgrad.py +177 -0
- optiml/opti/unconstrained/stochastic/gradient_descent.py +135 -0
- optiml/opti/unconstrained/stochastic/rmsprop.py +156 -0
- optiml/opti/unconstrained/stochastic/schedules.py +89 -0
- optiml/opti/unconstrained/tests/__init__.py +0 -0
- optiml/opti/unconstrained/tests/test_adadelta.py +20 -0
- optiml/opti/unconstrained/tests/test_adagrad.py +20 -0
- optiml/opti/unconstrained/tests/test_adam.py +42 -0
- optiml/opti/unconstrained/tests/test_adamax.py +41 -0
- optiml/opti/unconstrained/tests/test_amsgrad.py +40 -0
- optiml/opti/unconstrained/tests/test_conjugate_gradient.py +35 -0
- optiml/opti/unconstrained/tests/test_functions.py +34 -0
- optiml/opti/unconstrained/tests/test_gradient_descent.py +51 -0
- optiml/opti/unconstrained/tests/test_newton.py +20 -0
- optiml/opti/unconstrained/tests/test_quasi_newton.py +30 -0
- optiml/opti/unconstrained/tests/test_rmsprop.py +40 -0
- optiml/opti/unconstrained/tests/test_verbose.py +25 -0
- optiml/opti/utils.py +353 -0
- optiml-1.7.dist-info/METADATA +203 -0
- optiml-1.7.dist-info/RECORD +76 -0
- optiml-1.7.dist-info/WHEEL +5 -0
- optiml-1.7.dist-info/licenses/LICENSE +21 -0
- optiml-1.7.dist-info/top_level.txt +1 -0
optiml/ml/svm/kernels.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.base import BaseEstimator
|
|
5
|
+
from sklearn.metrics.pairwise import check_pairwise_arrays, euclidean_distances, manhattan_distances
|
|
6
|
+
from sklearn.utils.extmath import safe_sparse_dot
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Kernel(BaseEstimator, ABC):
    """
    Abstract base class shared by every kernel function.

    A kernel is a callable object that, given two sets of samples, returns
    the matrix of their pairwise similarities (i.e., the Gram matrix).

    Concrete kernels override ``__call__``; the implementation provided
    here only raises ``NotImplementedError``.
    """

    def __call__(self, X, Y=None):
        """
        Evaluate the kernel (Gram) matrix between X and Y.

        Parameters
        ----------

        X : ndarray of shape (n_samples_X, n_features)
            Left argument of the kernel function.

        Y : ndarray of shape (n_samples_Y, n_features), default=None
            Right argument of the kernel function. If None, ``Y`` is
            set to ``X`` and the kernel matrix between X and itself is computed.

        Returns
        -------

        K : ndarray of shape (n_samples_X, n_samples_Y)
            The computed kernel matrix.

        Raises
        ------

        NotImplementedError
            Always: this base method has to be overridden by subclasses.
        """
        raise NotImplementedError()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LinearKernel(Kernel):
    r"""
    Linear kernel, i.e., the plain inner product between the samples:

    .. math::

        K(X, Y) = \langle X, Y \rangle
    """

    def __call__(self, X, Y=None):
        """
        Compute the linear kernel matrix between X and Y.

        Parameters
        ----------

        X : ndarray of shape (n_samples_X, n_features)
            Left argument of the kernel function.

        Y : ndarray of shape (n_samples_Y, n_features), default=None
            Right argument of the kernel function; it is validated together
            with X by ``check_pairwise_arrays``.

        Returns
        -------

        K : ndarray of shape (n_samples_X, n_samples_Y)
            The dense matrix of the inner products.
        """
        left, right = check_pairwise_arrays(X, Y)
        gram = safe_sparse_dot(left, right.T, dense_output=True)
        return gram
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PolyKernel(Kernel):
    r"""
    Compute the polynomial kernel between X and Y:

    .. math::

        K(X, Y) = (\gamma \langle X, Y \rangle + coef_0)^{degree}

    Parameters
    ----------

    degree : int, default=3
        Degree of the polynomial kernel function. Must be > 0.

    gamma : {'scale', 'auto'} or float, default='scale'
        Kernel coefficient for kernel function.

        - if `gamma='scale'` (default) is passed then it uses
          1 / (n_features * X.var()) as value of gamma, falling back
          to 1.0 when X.var() is zero,
        - if `gamma='auto'`, uses 1 / n_features,
        - if a float is passed, it must be > 0 and it is used as is.

    coef0 : float, default=0.0
        Independent term in kernel function.

    Raises
    ------

    ValueError
        If ``degree`` is not > 0, if ``gamma`` is an unknown string or
        if it is a number which is not > 0.
    """

    def __init__(self, degree=3, gamma='scale', coef0=0.):
        if not degree > 0:
            raise ValueError('degree must be > 0')
        self.degree = degree
        if isinstance(gamma, str):
            if gamma not in ('scale', 'auto'):
                raise ValueError(f'unknown gamma type {gamma}')
        elif not gamma > 0:
            raise ValueError('gamma must be > 0')
        self.gamma = gamma
        self.coef0 = coef0

    def __call__(self, X, Y=None):
        """
        Compute the polynomial kernel matrix between X and Y.

        The data-dependent values of gamma ('scale' and 'auto') are
        resolved from the left argument ``X`` at every call.

        Parameters
        ----------

        X : ndarray of shape (n_samples_X, n_features)
            Left argument of the kernel function.

        Y : ndarray of shape (n_samples_Y, n_features), default=None
            Right argument of the kernel function; it is validated together
            with X by ``check_pairwise_arrays``.

        Returns
        -------

        K : ndarray of shape (n_samples_X, n_samples_Y)
            The computed kernel matrix.
        """
        X, Y = check_pairwise_arrays(X, Y)
        if self.gamma == 'scale':
            # a constant X has zero variance: use 1.0 (as scikit-learn does)
            # instead of dividing by zero and propagating inf/nan in the kernel
            X_var = X.var()
            gamma = 1. / (X.shape[1] * X_var) if X_var != 0 else 1.
        elif self.gamma == 'auto':
            gamma = 1. / X.shape[1]
        else:
            gamma = self.gamma
        return (gamma * safe_sparse_dot(X, Y.T, dense_output=True) + self.coef0) ** self.degree
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class GaussianKernel(Kernel):
    r"""
    Compute the gaussian RBF kernel between X and Y:

    .. math::

        K(X, Y) = e^{-\gamma \lVert X - Y \rVert_2^2}

    Parameters
    ----------

    gamma : {'scale', 'auto'} or float, default='scale'
        Kernel coefficient for kernel function.

        - if `gamma='scale'` (default) is passed then it uses
          1 / (n_features * X.var()) as value of gamma, falling back
          to 1.0 when X.var() is zero,
        - if `gamma='auto'`, uses 1 / n_features,
        - if a float is passed, it must be > 0 and it is used as is.

    Raises
    ------

    ValueError
        If ``gamma`` is an unknown string or a number which is not > 0.
    """

    def __init__(self, gamma='scale'):
        if isinstance(gamma, str):
            if gamma not in ('scale', 'auto'):
                raise ValueError(f'unknown gamma type {gamma}')
        elif not gamma > 0:
            raise ValueError('gamma must be > 0')
        self.gamma = gamma

    def __call__(self, X, Y=None):
        """
        Compute the gaussian RBF kernel matrix between X and Y.

        The data-dependent values of gamma ('scale' and 'auto') are
        resolved from the left argument ``X`` at every call.

        Parameters
        ----------

        X : ndarray of shape (n_samples_X, n_features)
            Left argument of the kernel function.

        Y : ndarray of shape (n_samples_Y, n_features), default=None
            Right argument of the kernel function; it is validated together
            with X by ``check_pairwise_arrays``.

        Returns
        -------

        K : ndarray of shape (n_samples_X, n_samples_Y)
            The computed kernel matrix.
        """
        X, Y = check_pairwise_arrays(X, Y)
        if self.gamma == 'scale':
            # a constant X has zero variance: use 1.0 (as scikit-learn does)
            # instead of dividing by zero, which would give exp(-inf * 0) = nan
            X_var = X.var()
            gamma = 1. / (X.shape[1] * X_var) if X_var != 0 else 1.
        elif self.gamma == 'auto':
            gamma = 1. / X.shape[1]
        else:
            gamma = self.gamma
        return np.exp(-gamma * euclidean_distances(X, Y, squared=True))
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class LaplacianKernel(Kernel):
    r"""
    Compute the laplacian RBF kernel between X and Y:

    .. math::

        K(X, Y) = e^{-\gamma \lVert X - Y \rVert_1}

    Parameters
    ----------

    gamma : {'scale', 'auto'} or float, default='scale'
        Kernel coefficient for kernel function.

        - if `gamma='scale'` (default) is passed then it uses
          1 / (n_features * X.var()) as value of gamma, falling back
          to 1.0 when X.var() is zero,
        - if `gamma='auto'`, uses 1 / n_features,
        - if a float is passed, it must be > 0 and it is used as is.

    Raises
    ------

    ValueError
        If ``gamma`` is an unknown string or a number which is not > 0.
    """

    def __init__(self, gamma='scale'):
        if isinstance(gamma, str):
            if gamma not in ('scale', 'auto'):
                raise ValueError(f'unknown gamma type {gamma}')
        elif not gamma > 0:
            raise ValueError('gamma must be > 0')
        self.gamma = gamma

    def __call__(self, X, Y=None):
        """
        Compute the laplacian RBF kernel matrix between X and Y.

        The data-dependent values of gamma ('scale' and 'auto') are
        resolved from the left argument ``X`` at every call.

        Parameters
        ----------

        X : ndarray of shape (n_samples_X, n_features)
            Left argument of the kernel function.

        Y : ndarray of shape (n_samples_Y, n_features), default=None
            Right argument of the kernel function; it is validated together
            with X by ``check_pairwise_arrays``.

        Returns
        -------

        K : ndarray of shape (n_samples_X, n_samples_Y)
            The computed kernel matrix.
        """
        X, Y = check_pairwise_arrays(X, Y)
        if self.gamma == 'scale':
            # a constant X has zero variance: use 1.0 (as scikit-learn does)
            # instead of dividing by zero, which would give exp(-inf * 0) = nan
            X_var = X.var()
            gamma = 1. / (X.shape[1] * X_var) if X_var != 0 else 1.
        elif self.gamma == 'auto':
            gamma = 1. / X.shape[1]
        else:
            gamma = self.gamma
        return np.exp(-gamma * manhattan_distances(X, Y))
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class SigmoidKernel(Kernel):
    r"""
    Compute the sigmoid kernel between X and Y:

    .. math::

        K(X, Y) = \tanh(\gamma \langle X, Y \rangle + coef_0)

    Parameters
    ----------

    gamma : {'scale', 'auto'} or float, default='scale'
        Kernel coefficient for kernel function.

        - if `gamma='scale'` (default) is passed then it uses
          1 / (n_features * X.var()) as value of gamma, falling back
          to 1.0 when X.var() is zero,
        - if `gamma='auto'`, uses 1 / n_features,
        - if a float is passed, it must be > 0 and it is used as is.

    coef0 : float, default=0.0
        Independent term in kernel function.

    Raises
    ------

    ValueError
        If ``gamma`` is an unknown string or a number which is not > 0.
    """

    def __init__(self, gamma='scale', coef0=0.):
        if isinstance(gamma, str):
            if gamma not in ('scale', 'auto'):
                raise ValueError(f'unknown gamma type {gamma}')
        elif not gamma > 0:
            raise ValueError('gamma must be > 0')
        self.gamma = gamma
        self.coef0 = coef0

    def __call__(self, X, Y=None):
        """
        Compute the sigmoid kernel matrix between X and Y.

        The data-dependent values of gamma ('scale' and 'auto') are
        resolved from the left argument ``X`` at every call.

        Parameters
        ----------

        X : ndarray of shape (n_samples_X, n_features)
            Left argument of the kernel function.

        Y : ndarray of shape (n_samples_Y, n_features), default=None
            Right argument of the kernel function; it is validated together
            with X by ``check_pairwise_arrays``.

        Returns
        -------

        K : ndarray of shape (n_samples_X, n_samples_Y)
            The computed kernel matrix.
        """
        X, Y = check_pairwise_arrays(X, Y)
        if self.gamma == 'scale':
            # a constant X has zero variance: use 1.0 (as scikit-learn does)
            # instead of dividing by zero and propagating inf/nan in the kernel
            X_var = X.var()
            gamma = 1. / (X.shape[1] * X_var) if X_var != 0 else 1.
        elif self.gamma == 'auto':
            gamma = 1. / X.shape[1]
        else:
            gamma = self.gamma
        return np.tanh(gamma * safe_sparse_dot(X, Y.T, dense_output=True) + self.coef0)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# Module-level kernel instances, each created with its default hyper-parameters.
linear = LinearKernel()
|
|
205
|
+
poly = PolyKernel()
|
|
206
|
+
gaussian = GaussianKernel()
|
|
207
|
+
laplacian = LaplacianKernel()
|
|
208
|
+
sigmoid = SigmoidKernel()
|
optiml/ml/svm/losses.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
|
|
3
|
+
import autograd.numpy as np
|
|
4
|
+
import cvxpy as cp
|
|
5
|
+
|
|
6
|
+
from ...opti import OptimizationFunction
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SVMLoss(OptimizationFunction, ABC):
    """
    Abstract base class of the SVM primal loss functions.

    The objective is the regularization term plus the loss term weighted
    by ``C``, both divided by the number of samples, i.e.,
    ``1/(2n) ||theta||^2 + C/n sum(loss)``; its jacobian is provided too.

    Subclasses must implement ``loss``, ``loss_jacobian`` and ``step_size``,
    plus ``_cvxpy_loss`` which is required by ``x_star``.
    """

    def __init__(self, svm, X, y):
        """
        Parameters
        ----------

        svm : `SVM` instance
            The SVM estimator this loss is attached to. Its ``C``
            hyper-parameter weighs the loss term of the objective.

        X : ndarray of shape (n_samples, n_features)
            Training data over which the loss is evaluated.

        y : ndarray of shape (n_samples,)
            Target values associated with ``X``.
        """
        # the number of columns of X is forwarded to the parent constructor
        super().__init__(X.shape[1])
        self.svm = svm
        self.X = X
        self.y = y

    def args(self):
        """Return the training data and the targets as the tuple ``(X, y)``."""
        return self.X, self.y

    def x_star(self):
        """
        Return the minimizer of the primal objective over the whole data.

        The same objective evaluated by ``function``, i.e.,
        1/(2n) ||theta||^2 + C/n sum(loss), is solved directly as a convex
        program with cvxpy rather than being recovered from the dual. The
        solution is computed once and cached in ``self.x_opt``.

        Returns
        -------

        x_opt : ndarray of shape (n_features,)
            The optimal solution found by the solver.

        Raises
        ------

        ValueError
            If no solver ends with an optimal (or optimal inaccurate) status.
        """
        if hasattr(self, 'x_opt'):  # already solved
            return self.x_opt

        n_samples = self.X.shape[0]
        theta = cp.Variable(self.X.shape[1])
        regularization = 1 / (2 * n_samples) * cp.sum_squares(theta)
        data_fit = self.svm.C / n_samples * self._cvxpy_loss(theta)
        problem = cp.Problem(cp.Minimize(regularization + data_fit))

        accepted = (cp.OPTIMAL, cp.OPTIMAL_INACCURATE)
        # try the solvers in order, moving on to the next one whenever the
        # current one raises or does not reach an accepted status
        for solver in (cp.CLARABEL, cp.ECOS, cp.OSQP, cp.SCS):
            try:
                problem.solve(solver=solver)
            except (cp.error.SolverError, cp.error.DCPError, KeyError):
                continue
            if problem.status in accepted:
                break

        if problem.status not in accepted:
            raise ValueError(f'could not compute the optimal solution x_star '
                             f'(solver status: {problem.status})')

        self.x_opt = np.asarray(theta.value, dtype=float)
        return self.x_opt

    def f_star(self):
        """Return the value of the objective at ``x_star()``."""
        return self.function(self.x_star())

    def _cvxpy_loss(self, theta):
        """
        Build the cvxpy expression of the loss term summed over the samples,
        as a function of the optimization variable ``theta``. It is used by
        ``x_star`` to assemble the convex primal program.

        Parameters
        ----------

        theta : cvxpy variable
            The variable of the packed coefficients and intercept.

        Returns
        -------

        expr : cvxpy expression
            The expression of sum(loss(y, X theta)).
        """
        raise NotImplementedError

    def function(self, packed_coef_inter, X_batch=None, y_batch=None):
        """
        Evaluate the primal objective at ``packed_coef_inter``.

        Parameters
        ----------

        packed_coef_inter : ndarray
            The point at which the objective is evaluated.

        X_batch : ndarray, default=None
            Data used for the evaluation; ``self.X`` when None.

        y_batch : ndarray, default=None
            Targets used for the evaluation; ``self.y`` when None.

        Returns
        -------

        value : float
            1/(2n) ||packed_coef_inter||^2 + C/n sum(loss), where n is the
            number of rows of the data actually used.
        """
        X_batch = self.X if X_batch is None else X_batch
        y_batch = self.y if y_batch is None else y_batch

        n_samples = X_batch.shape[0]
        y_pred = np.dot(X_batch, packed_coef_inter)  # svm decision function
        regularization = 1 / (2 * n_samples) * np.linalg.norm(packed_coef_inter) ** 2
        data_fit = self.svm.C / n_samples * np.sum(self.loss(y_pred, y_batch))
        return regularization + data_fit

    def loss(self, y_pred, y_true):
        """Return the per-sample loss; to be implemented by subclasses."""
        raise NotImplementedError

    def jacobian(self, packed_coef_inter, X_batch=None, y_batch=None):
        """
        Evaluate the jacobian of the primal objective at ``packed_coef_inter``.

        Parameters
        ----------

        packed_coef_inter : ndarray
            The point at which the jacobian is evaluated.

        X_batch : ndarray, default=None
            Data used for the evaluation; ``self.X`` when None.

        y_batch : ndarray, default=None
            Targets used for the evaluation; ``self.y`` when None.

        Returns
        -------

        jac : ndarray
            packed_coef_inter / n - C/n * loss_jacobian(...), i.e., the value
            returned by ``loss_jacobian`` enters with a minus sign.
        """
        X_batch = self.X if X_batch is None else X_batch
        y_batch = self.y if y_batch is None else y_batch

        n_samples = X_batch.shape[0]
        regularization_jac = (1 / n_samples) * packed_coef_inter
        data_fit_jac = self.svm.C / n_samples * self.loss_jacobian(
            packed_coef_inter, X_batch, y_batch)
        return regularization_jac - data_fit_jac

    def loss_jacobian(self, packed_coef_inter, X_batch, y_batch):
        """Return the loss term of the jacobian; to be implemented by subclasses."""
        raise NotImplementedError

    def step_size(self, X_batch, y_batch):
        """Provide the step size for the given batch; to be implemented by subclasses."""
        raise NotImplementedError
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class Hinge(SVMLoss):
    r"""
    Hinge loss for classification, defined as:

    .. math::

        L(y_{pred}, y_{true}) = \max(0, 1 - y_{true} \, y_{pred})
    """

    _loss_type = 'classifier'

    def loss(self, y_pred, y_true):
        """Return the element-wise hinge loss max(0, 1 - y_true * y_pred)."""
        margins = y_true * y_pred
        return np.maximum(0, 1 - margins)

    def _cvxpy_loss(self, theta):
        """Return the cvxpy expression of the hinge loss summed over the training samples."""
        margins = cp.multiply(self.y, self.X @ theta)
        return cp.sum(cp.pos(1 - margins))

    def loss_jacobian(self, packed_coef_inter, X_batch, y_batch):
        """
        Return the sum of ``y_i * x_i`` over the samples of the batch whose
        margin ``y_i * y_pred_i`` is strictly lower than 1.
        """
        y_pred = np.dot(X_batch, packed_coef_inter)  # svm decision function
        violating = np.argwhere(y_batch * y_pred < 1.).ravel()
        return np.dot(y_batch[violating], X_batch[violating])

    def step_size(self, X_batch, y_batch):
        """
        Yield the step size 1 / L, with L = C / n_samples * ||X||^2, where
        the norm is ``np.linalg.norm`` of the data matrix.

        When the batch equals the whole training data the value is computed
        from ``self.X`` once and cached in ``self._step_size``; otherwise it
        is computed from the mini batch at every call.
        """
        full_batch = np.array_equal(X_batch, self.X)  # no mini batches
        if full_batch and hasattr(self, '_step_size'):
            yield self._step_size
            return

        data = self.X if full_batch else X_batch
        n_samples = data.shape[0]
        L = self.svm.C / n_samples * np.linalg.norm(data) ** 2
        step = 1 / L
        if full_batch:
            self._step_size = step
        yield step
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class SquaredHinge(Hinge):
    r"""
    Squared hinge loss for classification, defined as:

    .. math::

        L(y_{pred}, y_{true}) = \max(0, 1 - y_{true} \, y_{pred})^2
    """

    def loss(self, y_pred, y_true):
        """Return the element-wise square of the hinge loss."""
        hinge_values = super().loss(y_pred, y_true)
        return np.square(hinge_values)

    def _cvxpy_loss(self, theta):
        """Return the cvxpy expression of the squared hinge loss summed over the training samples."""
        slack = cp.pos(1 - cp.multiply(self.y, self.X @ theta))
        return cp.sum(cp.square(slack))

    def loss_jacobian(self, packed_coef_inter, X_batch, y_batch):
        """
        Return twice the sum of ``max(0, 1 - y_i * y_pred_i) * y_i * x_i``
        over the samples of the batch whose margin ``y_i * y_pred_i`` is
        strictly lower than 1.
        """
        y_pred = np.dot(X_batch, packed_coef_inter)  # svm decision function
        violating = np.argwhere(y_batch * y_pred < 1.).ravel()
        slack = np.maximum(0, 1 - y_batch[violating] * y_pred[violating])
        return 2 * np.dot(slack * y_batch[violating], X_batch[violating])

    def step_size(self, X_batch, y_batch):
        """
        Yield the step size 1 / L, with
        L = mu / n_samples + C / n_samples * ||X||^2 and mu = 1, where the
        norm is ``np.linalg.norm`` of the data matrix.

        When the batch equals the whole training data the value is computed
        from ``self.X`` once and cached in ``self._step_size``; otherwise it
        is computed from the mini batch at every call.
        """
        full_batch = np.array_equal(X_batch, self.X)  # no mini batches
        if full_batch and hasattr(self, '_step_size'):
            yield self._step_size
            return

        data = self.X if full_batch else X_batch
        mu = 1
        n_samples = data.shape[0]
        L = (1 / n_samples * mu +  # Lipschitz constant wrt the regularization term (strictly convex)
             self.svm.C / n_samples * np.linalg.norm(data) ** 2)  # Lipschitz constant wrt the loss term
        step = 1 / L
        if full_batch:
            self._step_size = step
        yield step
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class EpsilonInsensitive(SVMLoss):
    r"""
    Epsilon-insensitive loss for regression, defined as:

    .. math::

        L(y_{pred}, y_{true}) = \max(0, \lvert y_{true} - y_{pred} \rvert - \epsilon)
    """

    _loss_type = 'regressor'

    def __init__(self, svm, X, y, epsilon):
        """
        Parameters
        ----------

        svm : `SVM` instance
            The SVM estimator this loss is attached to.

        X : ndarray of shape (n_samples, n_features)
            Training data over which the loss is evaluated.

        y : ndarray of shape (n_samples,)
            Target values associated with ``X``.

        epsilon : float
            Width of the epsilon-tube within which no penalty is associated
            with points predicted within a distance epsilon from the actual value.
        """
        super().__init__(svm, X, y)
        self.epsilon = epsilon

    def loss(self, y_pred, y_true):
        """Return the element-wise loss max(0, |y_true - y_pred| - epsilon)."""
        abs_residuals = np.abs(y_true - y_pred)
        return np.maximum(0, abs_residuals - self.epsilon)

    def _cvxpy_loss(self, theta):
        """Return the cvxpy expression of the epsilon-insensitive loss summed over the training samples."""
        abs_residuals = cp.abs(self.y - self.X @ theta)
        return cp.sum(cp.pos(abs_residuals - self.epsilon))

    def loss_jacobian(self, packed_coef_inter, X_batch, y_batch):
        """
        Return the sum of ``sign(y_i - y_pred_i) * x_i`` over the samples of
        the batch whose absolute residual is greater than or equal to epsilon.
        """
        y_pred = np.dot(X_batch, packed_coef_inter)  # svm decision function
        residuals = y_batch - y_pred
        outside = np.argwhere(np.abs(residuals) >= self.epsilon).ravel()
        z = residuals[outside]
        return np.dot(np.sign(z), X_batch[outside])  # or np.dot(np.divide(z, np.abs(z)), X_batch[outside])

    def step_size(self, X_batch, y_batch):
        """
        Yield the step size 1 / L, with L = C / n_samples * ||X||^2, where
        the norm is ``np.linalg.norm`` of the data matrix.

        When the batch equals the whole training data the value is computed
        from ``self.X`` once and cached in ``self._step_size``; otherwise it
        is computed from the mini batch at every call.
        """
        full_batch = np.array_equal(X_batch, self.X)  # no mini batches
        if full_batch and hasattr(self, '_step_size'):
            yield self._step_size
            return

        data = self.X if full_batch else X_batch
        n_samples = data.shape[0]
        L = self.svm.C / n_samples * np.linalg.norm(data) ** 2
        step = 1 / L
        if full_batch:
            self._step_size = step
        yield step
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class SquaredEpsilonInsensitive(EpsilonInsensitive):
    r"""
    Squared epsilon-insensitive loss for regression, defined as:

    .. math::

        L(y_{pred}, y_{true}) = \max(0, \lvert y_{true} - y_{pred} \rvert - \epsilon)^2
    """

    def loss(self, y_pred, y_true):
        """Return the element-wise square of the epsilon-insensitive loss."""
        eps_insensitive = super().loss(y_pred, y_true)
        return np.square(eps_insensitive)

    def _cvxpy_loss(self, theta):
        """Return the cvxpy expression of the squared epsilon-insensitive loss summed over the training samples."""
        excess = cp.pos(cp.abs(self.y - self.X @ theta) - self.epsilon)
        return cp.sum(cp.square(excess))

    def loss_jacobian(self, packed_coef_inter, X_batch, y_batch):
        """
        Return twice the sum of ``sign(z_i) * (|z_i| - epsilon) * x_i``, with
        ``z_i = y_i - y_pred_i``, over the samples of the batch whose
        absolute residual is greater than or equal to epsilon.
        """
        y_pred = np.dot(X_batch, packed_coef_inter)  # svm decision function
        residuals = y_batch - y_pred
        outside = np.argwhere(np.abs(residuals) >= self.epsilon).ravel()
        z = residuals[outside]
        weights = np.sign(z) * (np.abs(z) - self.epsilon)
        return 2 * np.dot(weights, X_batch[outside])

    def step_size(self, X_batch, y_batch):
        """
        Yield the step size 1 / L, with
        L = mu / n_samples + C / n_samples * ||X||^2 and mu = 1, where the
        norm is ``np.linalg.norm`` of the data matrix.

        When the batch equals the whole training data the value is computed
        from ``self.X`` once and cached in ``self._step_size``; otherwise it
        is computed from the mini batch at every call.
        """
        full_batch = np.array_equal(X_batch, self.X)  # no mini batches
        if full_batch and hasattr(self, '_step_size'):
            yield self._step_size
            return

        data = self.X if full_batch else X_batch
        mu = 1
        n_samples = data.shape[0]
        L = (1 / n_samples * mu +  # Lipschitz constant wrt the regularization term (strictly convex)
             self.svm.C / n_samples * np.linalg.norm(data) ** 2)  # Lipschitz constant wrt the loss term
        step = 1 / L
        if full_batch:
            self._step_size = step
        yield step
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# Lowercase module-level aliases bound to the loss classes themselves (not to instances).
hinge = Hinge
|
|
282
|
+
squared_hinge = SquaredHinge
|
|
283
|
+
epsilon_insensitive = EpsilonInsensitive
|
|
284
|
+
squared_epsilon_insensitive = SquaredEpsilonInsensitive
|