cvmatrix 2.0.2__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cvmatrix-2.0.2 → cvmatrix-2.1.0}/PKG-INFO +1 -1
- cvmatrix-2.1.0/cvmatrix/__init__.py +1 -0
- {cvmatrix-2.0.2 → cvmatrix-2.1.0}/cvmatrix/cvmatrix.py +448 -141
- {cvmatrix-2.0.2 → cvmatrix-2.1.0}/pyproject.toml +1 -1
- cvmatrix-2.0.2/cvmatrix/__init__.py +0 -1
- {cvmatrix-2.0.2 → cvmatrix-2.1.0}/LICENSE +0 -0
- {cvmatrix-2.0.2 → cvmatrix-2.1.0}/README.md +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.1.0"
|
|
@@ -11,7 +11,7 @@ E-mail: ole.e@di.ku.dk
|
|
|
11
11
|
|
|
12
12
|
from collections import defaultdict
|
|
13
13
|
from collections.abc import Hashable
|
|
14
|
-
from typing import Iterable, Union
|
|
14
|
+
from typing import Iterable, Tuple, Union
|
|
15
15
|
|
|
16
16
|
import numpy as np
|
|
17
17
|
from numpy import typing as npt
|
|
@@ -93,7 +93,7 @@ class CVMatrix:
|
|
|
93
93
|
self.ddof = ddof
|
|
94
94
|
self.dtype = dtype
|
|
95
95
|
self.copy = copy
|
|
96
|
-
self.
|
|
96
|
+
self.resolution = np.finfo(dtype).resolution * 10
|
|
97
97
|
self.X_total = None
|
|
98
98
|
self.Y_total = None
|
|
99
99
|
self.N = None
|
|
@@ -149,10 +149,16 @@ class CVMatrix:
|
|
|
149
149
|
self._init_matrix_products()
|
|
150
150
|
self._init_total_stats()
|
|
151
151
|
|
|
152
|
-
def training_XTX(
|
|
152
|
+
def training_XTX(
|
|
153
|
+
self, fold: Hashable
|
|
154
|
+
) -> Tuple[
|
|
155
|
+
np.ndarray, Tuple[Union[None, np.ndarray], Union[None, np.ndarray], None, None]
|
|
156
|
+
]:
|
|
153
157
|
"""
|
|
154
|
-
|
|
155
|
-
corresponding to every sample except those belonging to the given fold.
|
|
158
|
+
Computes the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`
|
|
159
|
+
corresponding to every sample except those belonging to the given fold. Also
|
|
160
|
+
computes the row of column-wise weighted means for `X` and the row of
|
|
161
|
+
column-wise weighted standard deviations for `X`.
|
|
156
162
|
|
|
157
163
|
Parameters
|
|
158
164
|
----------
|
|
@@ -162,8 +168,13 @@ class CVMatrix:
|
|
|
162
168
|
|
|
163
169
|
Returns
|
|
164
170
|
-------
|
|
165
|
-
|
|
166
|
-
|
|
171
|
+
Tuple of two elements. The first element is an array of shape (K, K)
|
|
172
|
+
corresponding to the training set
|
|
173
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`. The second element is
|
|
174
|
+
a tuple containing the row of column-wise weighted means for `X`, the row
|
|
175
|
+
of column-wise weighted standard deviations for `X`, and two `None`
|
|
176
|
+
corresponding to the non-computed rows of column-wise weighted means and
|
|
177
|
+
standard deviations for `Y`. If a statistic is not computed, it is `None`.
|
|
167
178
|
|
|
168
179
|
Raises
|
|
169
180
|
------
|
|
@@ -174,20 +185,34 @@ class CVMatrix:
|
|
|
174
185
|
See Also
|
|
175
186
|
--------
|
|
176
187
|
training_XTY :
|
|
177
|
-
|
|
178
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`
|
|
188
|
+
Computes the training set
|
|
189
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}` and weighted
|
|
190
|
+
statistics.
|
|
179
191
|
training_XTX_XTY :
|
|
180
|
-
|
|
181
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}
|
|
182
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}
|
|
183
|
-
method is faster than calling
|
|
192
|
+
Computes the training set
|
|
193
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`,
|
|
194
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`, and weighted
|
|
195
|
+
statistics for a given fold. This method is faster than calling
|
|
196
|
+
`training_XTX` and `training_XTY` separately.
|
|
184
197
|
"""
|
|
185
198
|
return self._training_matrices(True, False, fold)
|
|
186
199
|
|
|
187
|
-
def training_XTY(self, fold: Hashable) ->
|
|
200
|
+
def training_XTY(self, fold: Hashable) -> Tuple[
|
|
201
|
+
np.ndarray,
|
|
202
|
+
Tuple[
|
|
203
|
+
Union[None, np.ndarray],
|
|
204
|
+
Union[None, np.ndarray],
|
|
205
|
+
Union[None, np.ndarray],
|
|
206
|
+
Union[None, np.ndarray],
|
|
207
|
+
],
|
|
208
|
+
]:
|
|
188
209
|
"""
|
|
189
|
-
|
|
190
|
-
corresponding to every sample except those belonging to the given fold.
|
|
210
|
+
Computes the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`
|
|
211
|
+
corresponding to every sample except those belonging to the given fold. Also
|
|
212
|
+
computes the row of column-wise weighted means for `X`, the row of column-wise
|
|
213
|
+
weighted standard deviations for `X`, the row of column-wise weighted means for
|
|
214
|
+
`Y`, and the row of column-wise weighted standard deviations for `Y`. If a
|
|
215
|
+
statistic is not computed, it is `None`.
|
|
191
216
|
|
|
192
217
|
Parameters
|
|
193
218
|
----------
|
|
@@ -197,8 +222,13 @@ class CVMatrix:
|
|
|
197
222
|
|
|
198
223
|
Returns
|
|
199
224
|
-------
|
|
200
|
-
|
|
201
|
-
|
|
225
|
+
Tuple of two elements. The first element is an array of shape (K, M)
|
|
226
|
+
corresponding to the training set
|
|
227
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`. The second element
|
|
228
|
+
is a tuple containing the row of column-wise weighted means for `X`, the
|
|
229
|
+
row of column-wise weighted standard deviations for `X`, the row of
|
|
230
|
+
column-wise weighted means for `Y`, and the row of column-wise weighted
|
|
231
|
+
standard deviations for `Y`. If a statistic is not computed, it is `None`.
|
|
202
232
|
|
|
203
233
|
Raises
|
|
204
234
|
------
|
|
@@ -212,20 +242,36 @@ class CVMatrix:
|
|
|
212
242
|
See Also
|
|
213
243
|
--------
|
|
214
244
|
training_XTX :
|
|
215
|
-
|
|
245
|
+
Computes the training set
|
|
246
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}` and weighted statistics
|
|
247
|
+
for a given fold.
|
|
216
248
|
training_XTX_XTY :
|
|
217
|
-
|
|
218
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}
|
|
219
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}
|
|
220
|
-
method is faster than calling
|
|
249
|
+
Computes the training set
|
|
250
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`,
|
|
251
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`, and weighted
|
|
252
|
+
statistics for a given fold. This method is faster than calling
|
|
253
|
+
`training_XTX` and `training_XTY` separately.
|
|
221
254
|
"""
|
|
222
255
|
return self._training_matrices(False, True, fold)
|
|
223
256
|
|
|
224
|
-
def training_XTX_XTY(self, fold: Hashable) ->
|
|
257
|
+
def training_XTX_XTY(self, fold: Hashable) -> Tuple[
|
|
258
|
+
Tuple[np.ndarray, np.ndarray],
|
|
259
|
+
Tuple[
|
|
260
|
+
Union[None, np.ndarray],
|
|
261
|
+
Union[None, np.ndarray],
|
|
262
|
+
Union[None, np.ndarray],
|
|
263
|
+
Union[None, np.ndarray],
|
|
264
|
+
],
|
|
265
|
+
]:
|
|
225
266
|
"""
|
|
226
|
-
|
|
267
|
+
Computes the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`
|
|
227
268
|
and :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}` corresponding to every
|
|
228
|
-
sample except those belonging to the given fold.
|
|
269
|
+
sample except those belonging to the given fold. Also computes the row of
|
|
270
|
+
column-wise weighted means for `X`, the row of column-wise weighted standard
|
|
271
|
+
deviations for `X`, the row of column-wise weighted means for `Y`, and the row
|
|
272
|
+
of column-wise weighted standard deviations for `Y`. If a statistic is not
|
|
273
|
+
computed, it is `None`.
|
|
274
|
+
|
|
229
275
|
|
|
230
276
|
Parameters
|
|
231
277
|
----------
|
|
@@ -236,9 +282,14 @@ class CVMatrix:
|
|
|
236
282
|
|
|
237
283
|
Returns
|
|
238
284
|
-------
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{
|
|
285
|
+
Tuple of two tuples. The first tuple contains arrays of shapes (K, K) and
|
|
286
|
+
(K, M). These are the training set
|
|
287
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}` and
|
|
288
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`. The second tuple
|
|
289
|
+
contains the row of column-wise weighted means for `X`, the row of
|
|
290
|
+
column-wise weighted standard deviations for `X`, the row of column-wise
|
|
291
|
+
weighted means for `Y`, and the row of column-wise weighted standard
|
|
292
|
+
deviations for `Y`. If a statistic is not computed, it is `None`.
|
|
242
293
|
|
|
243
294
|
Raises
|
|
244
295
|
------
|
|
@@ -252,121 +303,204 @@ class CVMatrix:
|
|
|
252
303
|
See Also
|
|
253
304
|
--------
|
|
254
305
|
training_XTX :
|
|
255
|
-
|
|
256
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`
|
|
306
|
+
Computes the training set
|
|
307
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}` and weighted
|
|
308
|
+
statistics.
|
|
257
309
|
training_XTY :
|
|
258
|
-
|
|
259
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`
|
|
310
|
+
Computes the training set
|
|
311
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}` and weighted
|
|
312
|
+
statistics.
|
|
260
313
|
"""
|
|
261
314
|
return self._training_matrices(True, True, fold)
|
|
262
315
|
|
|
263
|
-
def
|
|
264
|
-
|
|
265
|
-
|
|
316
|
+
def training_statistics(self, fold: Hashable) -> Tuple[
|
|
317
|
+
Union[None, np.ndarray],
|
|
318
|
+
Union[None, np.ndarray],
|
|
319
|
+
Union[None, np.ndarray],
|
|
320
|
+
Union[None, np.ndarray],
|
|
321
|
+
]:
|
|
266
322
|
"""
|
|
267
|
-
|
|
268
|
-
and
|
|
269
|
-
|
|
323
|
+
Computes the row of column-wise weighted means and standard deviations for `X`
|
|
324
|
+
and `Y` corresponding to every sample except those belonging to the given fold.
|
|
325
|
+
The statistics that can be computed depend on the arguments provided in the
|
|
326
|
+
constructor: `X` mean can be computed if `center_X`, `scale_X`, or `center_Y`
|
|
327
|
+
is `True`. `X` standard deviation can be computed if `scale_X` is `True`. `Y`
|
|
328
|
+
mean can be computed if `center_X`, `center_Y` or `scale_Y` is `True`, and `Y`
|
|
329
|
+
is provided. `Y` standard deviation can be computed if `scale_Y` is `True` and
|
|
330
|
+
`Y` is provided.
|
|
270
331
|
|
|
271
332
|
Parameters
|
|
272
333
|
----------
|
|
273
|
-
return_XTX : bool
|
|
274
|
-
Whether to return the training set
|
|
275
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`.
|
|
276
|
-
|
|
277
334
|
fold : Hashable
|
|
278
|
-
The fold for which to return the corresponding training
|
|
279
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}` and
|
|
280
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`
|
|
281
|
-
|
|
282
|
-
return_XTY : bool, optional, default=False
|
|
283
|
-
Whether to return the training set
|
|
284
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`.
|
|
335
|
+
The fold for which to return the corresponding training statistics.
|
|
285
336
|
|
|
286
337
|
Returns
|
|
287
338
|
-------
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
339
|
+
Tuple of four elements of Union[None, np.ndarray]
|
|
340
|
+
A tuple containing the row of column-wise weighted means for `X`, the row
|
|
341
|
+
of column-wise weighted standard deviations for `X`, the row of column-wise
|
|
342
|
+
weighted means for `Y`, and the row of column-wise weighted standard
|
|
343
|
+
deviations for `Y`. If a statistic is not computed, it is `None`.
|
|
291
344
|
|
|
292
345
|
Raises
|
|
293
346
|
------
|
|
294
|
-
ValueError
|
|
295
|
-
If both `return_XTX` and `return_XTY` are `False` or if `return_XTY` is
|
|
296
|
-
`True` and `Y` is `None`.
|
|
297
|
-
|
|
298
347
|
ValueError
|
|
299
348
|
If `fold` was not provided as a cross-validation split in the
|
|
300
349
|
`folds` parameter of the constructor.
|
|
301
350
|
"""
|
|
302
|
-
|
|
351
|
+
val_indices = self._get_val_indices(fold)
|
|
352
|
+
X_val, X_val_unweighted, Y_val, Y_val_unweighted = self._get_val_matrices(
|
|
353
|
+
val_indices=val_indices, return_XTY=self.Y_total is not None
|
|
354
|
+
)
|
|
355
|
+
return self._compute_training_stats(
|
|
356
|
+
val_indices=val_indices,
|
|
357
|
+
X_val=X_val,
|
|
358
|
+
X_val_unweighted=X_val_unweighted,
|
|
359
|
+
Y_val=Y_val,
|
|
360
|
+
Y_val_unweighted=Y_val_unweighted,
|
|
361
|
+
return_X_mean=self.center_X or self.scale_X,
|
|
362
|
+
return_X_std=self.scale_X,
|
|
363
|
+
return_Y_mean=(self.center_Y or self.scale_Y) and self.Y_total is not None,
|
|
364
|
+
return_Y_std=self.scale_Y and self.Y_total is not None,
|
|
365
|
+
)[
|
|
366
|
+
:-1
|
|
367
|
+
] # Exclude the sum of training weights from the return tuple
|
|
368
|
+
|
|
369
|
+
def _get_sum_w_train_and_num_nonzero_w_train(
|
|
370
|
+
self, val_indices: npt.NDArray[np.int_]
|
|
371
|
+
) -> Tuple[float, float]:
|
|
372
|
+
"""
|
|
373
|
+
Returns a tuple containing the sum of weights in the training set and the number
|
|
374
|
+
of non-zero weights in the training set. If `w_total` is `None`, it returns the
|
|
375
|
+
size of the training set as both the sum of weights and the number of
|
|
376
|
+
non-zero weights.
|
|
377
|
+
|
|
378
|
+
Returns
|
|
379
|
+
-------
|
|
380
|
+
Tuple of floats
|
|
381
|
+
The sum of weights in the training set and the number of non-zero weights in
|
|
382
|
+
the training set.
|
|
383
|
+
|
|
384
|
+
Raises
|
|
385
|
+
------
|
|
386
|
+
ValueError
|
|
387
|
+
If the number of non-zero weights in the training set is zero, which would
|
|
388
|
+
make it impossible to compute either of training set means or standard
|
|
389
|
+
deviations.
|
|
390
|
+
"""
|
|
391
|
+
if self.w_total is None:
|
|
392
|
+
sum_w_val = np.asarray(val_indices.size, dtype=self.dtype)
|
|
393
|
+
sum_w_train = self.sum_w_total - sum_w_val
|
|
394
|
+
return (sum_w_train, sum_w_train)
|
|
395
|
+
w_val = self.w_total[val_indices]
|
|
396
|
+
sum_w_val = np.sum(w_val)
|
|
397
|
+
sum_w_train = self.sum_w_total - sum_w_val
|
|
398
|
+
num_nonzero_w_val = np.count_nonzero(w_val)
|
|
399
|
+
num_nonzero_w_train = np.asarray(
|
|
400
|
+
self.num_nonzero_w_total - num_nonzero_w_val, dtype=self.dtype
|
|
401
|
+
)
|
|
402
|
+
if num_nonzero_w_train == 0:
|
|
303
403
|
raise ValueError(
|
|
304
|
-
"
|
|
404
|
+
"The number of non-zero weights in the training set must be "
|
|
405
|
+
"greater than zero."
|
|
305
406
|
)
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
407
|
+
return sum_w_train, num_nonzero_w_train
|
|
408
|
+
|
|
409
|
+
def _compute_training_stats(
|
|
410
|
+
self,
|
|
411
|
+
val_indices: npt.NDArray[np.int_],
|
|
412
|
+
X_val: Union[None, np.ndarray],
|
|
413
|
+
X_val_unweighted: Union[None, np.ndarray],
|
|
414
|
+
Y_val: Union[None, np.ndarray],
|
|
415
|
+
Y_val_unweighted: Union[None, np.ndarray],
|
|
416
|
+
return_X_mean: bool,
|
|
417
|
+
return_X_std: bool,
|
|
418
|
+
return_Y_mean: bool,
|
|
419
|
+
return_Y_std: bool,
|
|
420
|
+
) -> Tuple[
|
|
421
|
+
Union[None, np.ndarray],
|
|
422
|
+
Union[None, np.ndarray],
|
|
423
|
+
Union[None, np.ndarray],
|
|
424
|
+
Union[None, np.ndarray],
|
|
425
|
+
Union[None, float],
|
|
426
|
+
]:
|
|
427
|
+
"""
|
|
428
|
+
Computes the training set statistics for the given fold. The statistics include
|
|
429
|
+
the row of column-wise weighted means and standard deviations for `X` and `Y`.
|
|
430
|
+
|
|
431
|
+
Parameters
|
|
432
|
+
----------
|
|
433
|
+
val_indices : Array of shape (N_val,)
|
|
434
|
+
The indices of the validation set samples for the given fold.
|
|
435
|
+
|
|
436
|
+
X_val : None or Array of shape (N_val, K)
|
|
437
|
+
The validation set of weighted predictor variables. If `None`, no
|
|
438
|
+
statistics for `X` are computed. Required if `return_X_mean` or
|
|
439
|
+
`return_X_std` is `True`.
|
|
440
|
+
|
|
441
|
+
X_val_unweighted : None or Array of shape (N_val, K)
|
|
442
|
+
The validation set of unweighted predictor variables. Required if
|
|
443
|
+
`return_X_std` is `True`.
|
|
444
|
+
|
|
445
|
+
Y_val : None or Array of shape (N_val, M)
|
|
446
|
+
The validation set of weighted response variables. If `None`, no statistics
|
|
447
|
+
for `Y` are computed. Required if `return_Y_mean` or `return_Y_std` is
|
|
448
|
+
`True`.
|
|
449
|
+
|
|
450
|
+
Y_val_unweighted : None or Array of shape (N_val, M)
|
|
451
|
+
The validation set of unweighted response variables. Required if
|
|
452
|
+
`return_Y_std` is `True`.
|
|
453
|
+
|
|
454
|
+
return_X_mean : bool
|
|
455
|
+
Whether to compute the row of column-wise weighted means for `X`.
|
|
456
|
+
|
|
457
|
+
return_X_std : bool
|
|
458
|
+
Whether to compute the row of column-wise weighted standard deviations for
|
|
459
|
+
`X`.
|
|
460
|
+
|
|
461
|
+
return_Y_mean : bool
|
|
462
|
+
Whether to compute the row of column-wise weighted means for `Y`.
|
|
463
|
+
|
|
464
|
+
return_Y_std : bool
|
|
465
|
+
Whether to compute the row of column-wise weighted standard deviations for
|
|
466
|
+
`Y`.
|
|
467
|
+
|
|
468
|
+
Returns
|
|
469
|
+
-------
|
|
470
|
+
Tuple of Union[None, np.ndarray]
|
|
471
|
+
A tuple containing the row of column-wise weighted means for `X`, the row
|
|
472
|
+
of column-wise weighted standard deviations for `X`, the row of column-wise
|
|
473
|
+
weighted means for `Y`, the row of column-wise weighted standard deviations
|
|
474
|
+
for `Y`, and the sum of training weights. If a statistic is not computed,
|
|
475
|
+
it is `None`.
|
|
476
|
+
"""
|
|
331
477
|
if (
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
478
|
+
not return_X_mean
|
|
479
|
+
and not return_X_std
|
|
480
|
+
and not return_Y_mean
|
|
481
|
+
and not return_Y_std
|
|
335
482
|
):
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
w_val = self.w_total[val_indices]
|
|
342
|
-
sum_w_val = np.sum(w_val)
|
|
343
|
-
sum_w_train = self.sum_w_total - sum_w_val
|
|
344
|
-
num_nonzero_w_val = np.count_nonzero(w_val)
|
|
345
|
-
num_nonzero_w_train = np.asarray(
|
|
346
|
-
self.num_nonzero_w_total - num_nonzero_w_val, dtype=self.dtype
|
|
347
|
-
)
|
|
348
|
-
if num_nonzero_w_train == 0:
|
|
349
|
-
raise ValueError(
|
|
350
|
-
"The number of non-zero weights in the training set must be "
|
|
351
|
-
"greater than zero."
|
|
352
|
-
)
|
|
353
|
-
if self.center_X or self.scale_X or (return_XTY and self.center_Y):
|
|
483
|
+
return None, None, None, None, None
|
|
484
|
+
sum_w_train, num_nonzero_w_train = (
|
|
485
|
+
self._get_sum_w_train_and_num_nonzero_w_train(val_indices)
|
|
486
|
+
)
|
|
487
|
+
if return_X_mean or return_X_std:
|
|
354
488
|
sum_X_val = np.sum(X_val, axis=0, keepdims=True)
|
|
355
489
|
X_train_mean = self._compute_training_mat_mean(
|
|
356
490
|
sum_X_val,
|
|
357
491
|
self.sum_X_total,
|
|
358
492
|
sum_w_train,
|
|
359
493
|
)
|
|
360
|
-
if
|
|
494
|
+
if return_Y_mean or return_Y_std:
|
|
361
495
|
sum_Y_val = np.sum(Y_val, axis=0, keepdims=True)
|
|
362
496
|
Y_train_mean = self._compute_training_mat_mean(
|
|
363
497
|
sum_Y_val,
|
|
364
498
|
self.sum_Y_total,
|
|
365
499
|
sum_w_train,
|
|
366
500
|
)
|
|
367
|
-
if
|
|
501
|
+
if return_X_std or return_Y_std:
|
|
368
502
|
divisor = self._compute_std_divisor(sum_w_train, num_nonzero_w_train)
|
|
369
|
-
if
|
|
503
|
+
if return_X_std:
|
|
370
504
|
X_train_std = self._compute_training_mat_std(
|
|
371
505
|
sum_X_val,
|
|
372
506
|
X_val,
|
|
@@ -377,7 +511,7 @@ class CVMatrix:
|
|
|
377
511
|
sum_w_train,
|
|
378
512
|
divisor,
|
|
379
513
|
)
|
|
380
|
-
if
|
|
514
|
+
if return_Y_std:
|
|
381
515
|
Y_train_std = self._compute_training_mat_std(
|
|
382
516
|
sum_Y_val,
|
|
383
517
|
Y_val,
|
|
@@ -388,7 +522,135 @@ class CVMatrix:
|
|
|
388
522
|
sum_w_train,
|
|
389
523
|
divisor,
|
|
390
524
|
)
|
|
525
|
+
return (
|
|
526
|
+
X_train_mean if return_X_mean else None,
|
|
527
|
+
X_train_std if return_X_std else None,
|
|
528
|
+
Y_train_mean if return_Y_mean else None,
|
|
529
|
+
Y_train_std if return_Y_std else None,
|
|
530
|
+
sum_w_train,
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
def _training_matrices(
|
|
534
|
+
self, return_XTX: bool, return_XTY: bool, fold: Hashable
|
|
535
|
+
) -> Tuple[
|
|
536
|
+
Union[np.ndarray, Tuple[np.ndarray, np.ndarray]],
|
|
537
|
+
Tuple[
|
|
538
|
+
Union[None, np.ndarray],
|
|
539
|
+
Union[None, np.ndarray],
|
|
540
|
+
Union[None, np.ndarray],
|
|
541
|
+
Union[None, np.ndarray],
|
|
542
|
+
],
|
|
543
|
+
]:
|
|
544
|
+
"""
|
|
545
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`
|
|
546
|
+
and/or :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}` corresponding to
|
|
547
|
+
every sample except those belonging to the given fold.
|
|
548
|
+
|
|
549
|
+
Parameters
|
|
550
|
+
----------
|
|
551
|
+
return_XTX : bool
|
|
552
|
+
Whether to return the training set
|
|
553
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`.
|
|
554
|
+
|
|
555
|
+
fold : Hashable
|
|
556
|
+
The fold for which to return the corresponding training set
|
|
557
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}` and
|
|
558
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`
|
|
559
|
+
|
|
560
|
+
return_XTY : bool
|
|
561
|
+
Whether to return the training set
|
|
562
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`.
|
|
563
|
+
|
|
564
|
+
Returns
|
|
565
|
+
-------
|
|
566
|
+
Tuple of two elements. The first element is an array of shape (K, K) or (K, M)
|
|
567
|
+
or a tuple of arrays of shapes (K, K) and (K, M). These are the training
|
|
568
|
+
set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}` and/or
|
|
569
|
+
training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`. The
|
|
570
|
+
second element is a tuple containing the row of column-wise weighted means
|
|
571
|
+
for `X`, the row of column-wise weighted standard deviations for `X`, the
|
|
572
|
+
row of column-wise weighted means for `Y`, and the row of column-wise
|
|
573
|
+
weighted standard deviations for `Y`. If a statistic is not computed, it is
|
|
574
|
+
`None`.
|
|
575
|
+
|
|
576
|
+
Raises
|
|
577
|
+
------
|
|
578
|
+
ValueError
|
|
579
|
+
If both `return_XTX` and `return_XTY` are `False` or if `return_XTY` is
|
|
580
|
+
`True` and `Y` is `None`.
|
|
581
|
+
|
|
582
|
+
ValueError
|
|
583
|
+
If `fold` was not provided as a cross-validation split in the
|
|
584
|
+
`folds` parameter of the constructor.
|
|
585
|
+
"""
|
|
586
|
+
if not return_XTX and not return_XTY:
|
|
587
|
+
raise ValueError(
|
|
588
|
+
"At least one of `return_XTX` and `return_XTY` must be True."
|
|
589
|
+
)
|
|
590
|
+
if return_XTY and self.Y_total is None:
|
|
591
|
+
raise ValueError("Response variables `Y` are not provided.")
|
|
592
|
+
val_indices = self._get_val_indices(fold)
|
|
593
|
+
X_val, X_val_unweighted, Y_val, Y_val_unweighted = self._get_val_matrices(
|
|
594
|
+
val_indices=val_indices, return_XTY=return_XTY
|
|
595
|
+
)
|
|
596
|
+
X_train_mean, X_train_std, Y_train_mean, Y_train_std, sum_w_train = (
|
|
597
|
+
self._compute_training_stats(
|
|
598
|
+
val_indices=val_indices,
|
|
599
|
+
X_val=(
|
|
600
|
+
X_val
|
|
601
|
+
if self.center_X or self.scale_X or (return_XTY and self.center_Y)
|
|
602
|
+
else None
|
|
603
|
+
),
|
|
604
|
+
X_val_unweighted=X_val_unweighted if self.scale_X else None,
|
|
605
|
+
Y_val=(
|
|
606
|
+
Y_val
|
|
607
|
+
if return_XTY and (self.center_X or self.center_Y or self.scale_Y)
|
|
608
|
+
else None
|
|
609
|
+
),
|
|
610
|
+
Y_val_unweighted=(
|
|
611
|
+
Y_val_unweighted if return_XTY and self.scale_Y else None
|
|
612
|
+
),
|
|
613
|
+
return_X_mean=self.center_X or (return_XTY and self.center_Y),
|
|
614
|
+
return_Y_mean=return_XTY and (self.center_X or self.center_Y),
|
|
615
|
+
return_X_std=self.scale_X,
|
|
616
|
+
return_Y_std=return_XTY and self.scale_Y,
|
|
617
|
+
)
|
|
618
|
+
)
|
|
619
|
+
stats_tuple = (
|
|
620
|
+
X_train_mean,
|
|
621
|
+
X_train_std,
|
|
622
|
+
Y_train_mean,
|
|
623
|
+
Y_train_std,
|
|
624
|
+
)
|
|
391
625
|
if return_XTX and return_XTY:
|
|
626
|
+
return (
|
|
627
|
+
(
|
|
628
|
+
self._training_kernel_matrix(
|
|
629
|
+
self.XTX_total,
|
|
630
|
+
X_val,
|
|
631
|
+
X_val_unweighted,
|
|
632
|
+
X_train_mean,
|
|
633
|
+
X_train_mean,
|
|
634
|
+
X_train_std,
|
|
635
|
+
X_train_std,
|
|
636
|
+
sum_w_train,
|
|
637
|
+
center=self.center_X,
|
|
638
|
+
),
|
|
639
|
+
self._training_kernel_matrix(
|
|
640
|
+
self.XTY_total,
|
|
641
|
+
X_val,
|
|
642
|
+
Y_val_unweighted,
|
|
643
|
+
X_train_mean,
|
|
644
|
+
Y_train_mean,
|
|
645
|
+
X_train_std,
|
|
646
|
+
Y_train_std,
|
|
647
|
+
sum_w_train,
|
|
648
|
+
center=self.center_X or self.center_Y,
|
|
649
|
+
),
|
|
650
|
+
),
|
|
651
|
+
stats_tuple,
|
|
652
|
+
)
|
|
653
|
+
if return_XTX:
|
|
392
654
|
return (
|
|
393
655
|
self._training_kernel_matrix(
|
|
394
656
|
self.XTX_total,
|
|
@@ -401,42 +663,87 @@ class CVMatrix:
|
|
|
401
663
|
sum_w_train,
|
|
402
664
|
center=self.center_X,
|
|
403
665
|
),
|
|
404
|
-
|
|
405
|
-
self.XTY_total,
|
|
406
|
-
X_val,
|
|
407
|
-
Y_val_unweighted,
|
|
408
|
-
X_train_mean,
|
|
409
|
-
Y_train_mean,
|
|
410
|
-
X_train_std,
|
|
411
|
-
Y_train_std,
|
|
412
|
-
sum_w_train,
|
|
413
|
-
center=self.center_X or self.center_Y,
|
|
414
|
-
),
|
|
666
|
+
stats_tuple,
|
|
415
667
|
)
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
self.
|
|
668
|
+
return (
|
|
669
|
+
self._training_kernel_matrix(
|
|
670
|
+
self.XTY_total,
|
|
419
671
|
X_val,
|
|
420
|
-
|
|
421
|
-
X_train_mean,
|
|
672
|
+
Y_val_unweighted,
|
|
422
673
|
X_train_mean,
|
|
674
|
+
Y_train_mean,
|
|
423
675
|
X_train_std,
|
|
424
|
-
|
|
676
|
+
Y_train_std,
|
|
425
677
|
sum_w_train,
|
|
426
|
-
center=self.center_X,
|
|
427
|
-
)
|
|
428
|
-
|
|
429
|
-
self.XTY_total,
|
|
430
|
-
X_val,
|
|
431
|
-
Y_val_unweighted,
|
|
432
|
-
X_train_mean,
|
|
433
|
-
Y_train_mean,
|
|
434
|
-
X_train_std,
|
|
435
|
-
Y_train_std,
|
|
436
|
-
sum_w_train,
|
|
437
|
-
center=self.center_X or self.center_Y,
|
|
678
|
+
center=self.center_X or self.center_Y,
|
|
679
|
+
),
|
|
680
|
+
stats_tuple,
|
|
438
681
|
)
|
|
439
682
|
|
|
683
|
+
def _get_val_indices(self, fold: Hashable) -> npt.NDArray[np.int_]:
|
|
684
|
+
"""
|
|
685
|
+
Returns the indices of the validation set samples for a given fold.
|
|
686
|
+
Parameters
|
|
687
|
+
----------
|
|
688
|
+
fold : Hashable
|
|
689
|
+
The fold for which to return the validation set indices.
|
|
690
|
+
Returns
|
|
691
|
+
-------
|
|
692
|
+
Array of shape (N_val,)
|
|
693
|
+
The indices of the validation set samples for the given fold.
|
|
694
|
+
"""
|
|
695
|
+
try:
|
|
696
|
+
val_indices = self.folds_dict[fold]
|
|
697
|
+
except KeyError as e:
|
|
698
|
+
raise ValueError(f"Fold {fold} not found.") from e
|
|
699
|
+
return val_indices
|
|
700
|
+
|
|
701
|
+
def _get_val_matrices(
|
|
702
|
+
self, val_indices: npt.NDArray[np.int_], return_XTY: bool
|
|
703
|
+
) -> Tuple[
|
|
704
|
+
np.ndarray,
|
|
705
|
+
np.ndarray,
|
|
706
|
+
np.ndarray,
|
|
707
|
+
Union[None, np.ndarray],
|
|
708
|
+
Union[None, np.ndarray],
|
|
709
|
+
]:
|
|
710
|
+
"""
|
|
711
|
+
Returns the validation set matrices for a given fold.
|
|
712
|
+
Parameters
|
|
713
|
+
----------
|
|
714
|
+
val_indices : Array of shape (N_val,)
|
|
715
|
+
The indices of the validation set samples for the given fold.
|
|
716
|
+
return_XTY : bool
|
|
717
|
+
Whether to return the validation set of response variables `Y`. If `False`,
|
|
718
|
+
the returned `Y_val` and `Y_val_unweighted` will be `None`.
|
|
719
|
+
Returns
|
|
720
|
+
-------
|
|
721
|
+
Tuple of arrays of shapes (N_val, K), (N_val, K), (N_val, M), and (N_val, M)
|
|
722
|
+
The validation set of predictor variables `X`, the validation set of
|
|
723
|
+
unweighted predictor variables `X_unweighted`, the validation set of
|
|
724
|
+
response variables `Y`, and the validation set of unweighted response
|
|
725
|
+
variables `Y_unweighted`. If `return_XTY` is `False`, `Y` and
|
|
726
|
+
`Y_unweighted` will be `None`.
|
|
727
|
+
"""
|
|
728
|
+
X_val = self.Xw_total[val_indices]
|
|
729
|
+
if self.w_total is None:
|
|
730
|
+
X_val_unweighted = X_val
|
|
731
|
+
else:
|
|
732
|
+
X_val_unweighted = self.X_total[val_indices]
|
|
733
|
+
if return_XTY:
|
|
734
|
+
if self.w_total is None or not (
|
|
735
|
+
self.center_X or self.center_Y or self.scale_Y
|
|
736
|
+
):
|
|
737
|
+
Y_val = self.Y_total[val_indices]
|
|
738
|
+
Y_val_unweighted = Y_val
|
|
739
|
+
else:
|
|
740
|
+
Y_val = self.Yw_total[val_indices]
|
|
741
|
+
Y_val_unweighted = self.Y_total[val_indices]
|
|
742
|
+
else:
|
|
743
|
+
Y_val = None
|
|
744
|
+
Y_val_unweighted = None
|
|
745
|
+
return X_val, X_val_unweighted, Y_val, Y_val_unweighted
|
|
746
|
+
|
|
440
747
|
def _training_kernel_matrix(
|
|
441
748
|
self,
|
|
442
749
|
total_kernel_mat: np.ndarray,
|
|
@@ -455,8 +762,8 @@ class CVMatrix:
|
|
|
455
762
|
Parameters
|
|
456
763
|
----------
|
|
457
764
|
total_kernel_mat : Array of shape (K, K) or (K, M)
|
|
458
|
-
The total kernel matrix :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`
|
|
459
|
-
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
765
|
+
The total kernel matrix :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{X}`
|
|
766
|
+
or :math:`\mathbf{X}^{\mathbf{T}}\mathbf{W}\mathbf{Y}`.
|
|
460
767
|
|
|
461
768
|
X_val : Array of shape (N_val, K)
|
|
462
769
|
The validation set of predictor variables.
|
|
@@ -619,7 +926,7 @@ class CVMatrix:
|
|
|
619
926
|
) / divisor
|
|
620
927
|
mat_train_var[mat_train_var < 0] = 0
|
|
621
928
|
mat_train_std = np.sqrt(mat_train_var)
|
|
622
|
-
mat_train_std[np.abs(mat_train_std) <= self.
|
|
929
|
+
mat_train_std[np.abs(mat_train_std) <= self.resolution] = 1
|
|
623
930
|
return mat_train_std
|
|
624
931
|
|
|
625
932
|
def _init_mat(self, mat: np.ndarray) -> np.ndarray:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "cvmatrix"
|
|
3
|
-
version = "2.0
|
|
3
|
+
version = "2.1.0"
|
|
4
4
|
description = "Fast computation of possibly weighted and possibly centered/scaled training set kernel matrices in a cross-validation setting."
|
|
5
5
|
authors = ["Sm00thix <oleemail@icloud.com>"]
|
|
6
6
|
maintainers = ["Sm00thix <oleemail@icloud.com>"]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.0.2"
|
|
File without changes
|
|
File without changes
|