cvmatrix 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cvmatrix-1.0.0/LICENSE +190 -0
- cvmatrix-1.0.0/PKG-INFO +100 -0
- cvmatrix-1.0.0/README.md +79 -0
- cvmatrix-1.0.0/cvmatrix/__init__.py +1 -0
- cvmatrix-1.0.0/cvmatrix/cvmatrix.py +619 -0
- cvmatrix-1.0.0/pyproject.toml +18 -0
cvmatrix-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for reasonable and customary use in describing the
|
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
168
|
+
or other liability obligations and/or rights consistent with this
|
|
169
|
+
License. However, in accepting such obligations, You may act only
|
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
174
|
+
of your accepting any such warranty or additional liability.
|
|
175
|
+
|
|
176
|
+
END OF TERMS AND CONDITIONS
|
|
177
|
+
|
|
178
|
+
Copyright 2024 Ole-Christian Galbo Engstrøm
|
|
179
|
+
|
|
180
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
181
|
+
you may not use this file except in compliance with the License.
|
|
182
|
+
You may obtain a copy of the License at
|
|
183
|
+
|
|
184
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
185
|
+
|
|
186
|
+
Unless required by applicable law or agreed to in writing, software
|
|
187
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
188
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
189
|
+
See the License for the specific language governing permissions and
|
|
190
|
+
limitations under the License.
|
cvmatrix-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: cvmatrix
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Fast computation of possibly centered/scaled training set kernel matrices in a cross-validation setting.
|
|
5
|
+
Home-page: https://cvmatrix.readthedocs.io/en/latest/
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Author: Sm00thix
|
|
8
|
+
Author-email: oleemail@icloud.com
|
|
9
|
+
Maintainer: Sm00thix
|
|
10
|
+
Maintainer-email: oleemail@icloud.com
|
|
11
|
+
Requires-Python: >=3.9,<3.13
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Dist: numpy (>=1.26.4,<2.0.0)
|
|
19
|
+
Project-URL: Repository, https://github.com/Sm00thix/CVMatrix
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# CVMatrix
|
|
23
|
+
|
|
24
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
25
|
+
|
|
26
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
27
|
+
|
|
28
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
29
|
+
|
|
30
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
31
|
+
|
|
32
|
+
[](https://cvmatrix.readthedocs.io/en/latest/?badge=latest)
|
|
33
|
+
|
|
34
|
+
[](https://github.com/Sm00thix/CVMatrix/actions/workflows/workflow.yml)
|
|
35
|
+
|
|
36
|
+
This repository contains the source code for the [`cvmatrix`](https://pypi.org/project/cvmatrix/) package which implements the fast algorithms by Engstrøm [[1]](#references) for computation of training set $\mathbf{X}^{\mathbf{T}}\mathbf{X}$ and $\mathbf{X}^{\mathbf{T}}\mathbf{Y}$ in a cross-validation setting. In addition to correctly handling arbitrary row-wise pre-processing, the algorithms efficiently and correctly handle any combination of column-wise centering and scaling of `X` and `Y` based on training set statistics.
|
|
37
|
+
|
|
38
|
+
For an implementation of the fast cross-validation algorithms combined with Improved Kernel Partial Least Squares [[2]](#references), see the Python package [`ikpls`](https://pypi.org/project/ikpls/).
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
- Install the package for Python3 using the following command:
|
|
43
|
+
```shell
|
|
44
|
+
pip3 install cvmatrix
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
- Now you can import the class implementing all the algorithms with:
|
|
48
|
+
```python
|
|
49
|
+
from cvmatrix.cvmatrix import CVMatrix
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
### Use the cvmatrix package for fast computation of training set kernel matrices
|
|
55
|
+
|
|
56
|
+
> ```python
|
|
57
|
+
> import numpy as np
|
|
58
|
+
> from cvmatrix.cvmatrix import CVMatrix
|
|
59
|
+
>
|
|
60
|
+
> N = 100 # Number of samples.
|
|
61
|
+
> K = 50 # Number of features.
|
|
62
|
+
> M = 10 # Number of targets.
|
|
63
|
+
>
|
|
64
|
+
> X = np.random.uniform(size=(N, K)) # Random X data
|
|
65
|
+
> Y = np.random.uniform(size=(N, M)) # Random Y data
|
|
66
|
+
> cv_splits = np.arange(100) % 5 # 5-fold cross-validation
|
|
67
|
+
>
|
|
68
|
+
> # Instantiate CVMatrix
|
|
69
|
+
> cvm = CVMatrix(
|
|
70
|
+
> cv_splits=cv_splits,
|
|
71
|
+
> center_X=True,
|
|
72
|
+
> center_Y=True,
|
|
73
|
+
> scale_X=True,
|
|
74
|
+
> scale_Y=True,
|
|
75
|
+
> )
|
|
76
|
+
> # Fit on X and Y
|
|
77
|
+
> cvm.fit(X=X, Y=Y)
|
|
78
|
+
> # Compute training set XTX and/or XTY for each fold
|
|
79
|
+
> for val_split in np.unique(cv_splits):
|
|
80
|
+
> # Get both XTX and XTY
|
|
81
|
+
> training_XTX, training_XTY = cvm.training_XTX_XTY(val_split)
|
|
82
|
+
> # Get only XTX
|
|
83
|
+
> training_XTX = cvm.training_XTX(val_split)
|
|
84
|
+
> # Get only XTY
|
|
85
|
+
> training_XTY = cvm.training_XTY(val_split)
|
|
86
|
+
|
|
87
|
+
### Examples
|
|
88
|
+
In [examples](https://github.com/Sm00thix/CVMatrix/tree/main/examples) you will find:
|
|
89
|
+
|
|
90
|
+
- [Compute training matrices with CVMatrix](https://github.com/Sm00thix/CVMatrix/tree/main/examples/training_matrices.py)
|
|
91
|
+
|
|
92
|
+
## Contribute
|
|
93
|
+
|
|
94
|
+
To contribute, please read the [Contribution
|
|
95
|
+
Guidelines](https://github.com/Sm00thix/CVMatrix/blob/main/CONTRIBUTING.md).
|
|
96
|
+
|
|
97
|
+
## References
|
|
98
|
+
|
|
99
|
+
1. [Engstrøm, O.-C. G. (2024). Shortcutting Cross-Validation: Efficiently Deriving Column-Wise Centered and Scaled Training Set $\mathbf{X}^\mathbf{T}\mathbf{X}$ and $\mathbf{X}^\mathbf{T}\mathbf{Y}$ Without Full Recomputation of Matrix Products or Statistical Moments](https://arxiv.org/abs/2401.13185)
|
|
100
|
+
2. [Dayal, B. S., & MacGregor, J. F. (1997). Improved PLS algorithms. *Journal of Chemometrics*, 11(1), 73-85.](https://doi.org/10.1002/(SICI)1099-128X(199701)11:1%3C73::AID-CEM435%3E3.0.CO;2-%23?)
|
cvmatrix-1.0.0/README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# CVMatrix
|
|
2
|
+
|
|
3
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
4
|
+
|
|
5
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
6
|
+
|
|
7
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
8
|
+
|
|
9
|
+
[](https://pypi.python.org/pypi/cvmatrix/)
|
|
10
|
+
|
|
11
|
+
[](https://cvmatrix.readthedocs.io/en/latest/?badge=latest)
|
|
12
|
+
|
|
13
|
+
[](https://github.com/Sm00thix/CVMatrix/actions/workflows/workflow.yml)
|
|
14
|
+
|
|
15
|
+
This repository contains the source code for the [`cvmatrix`](https://pypi.org/project/cvmatrix/) package which implements the fast algorithms by Engstrøm [[1]](#references) for computation of training set $\mathbf{X}^{\mathbf{T}}\mathbf{X}$ and $\mathbf{X}^{\mathbf{T}}\mathbf{Y}$ in a cross-validation setting. In addition to correctly handling arbitrary row-wise pre-processing, the algorithms efficiently and correctly handle any combination of column-wise centering and scaling of `X` and `Y` based on training set statistics.
|
|
16
|
+
|
|
17
|
+
For an implementation of the fast cross-validation algorithms combined with Improved Kernel Partial Least Squares [[2]](#references), see the Python package [`ikpls`](https://pypi.org/project/ikpls/).
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
- Install the package for Python3 using the following command:
|
|
22
|
+
```shell
|
|
23
|
+
pip3 install cvmatrix
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
- Now you can import the class implementing all the algorithms with:
|
|
27
|
+
```python
|
|
28
|
+
from cvmatrix.cvmatrix import CVMatrix
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
### Use the cvmatrix package for fast computation of training set kernel matrices
|
|
34
|
+
|
|
35
|
+
> ```python
|
|
36
|
+
> import numpy as np
|
|
37
|
+
> from cvmatrix.cvmatrix import CVMatrix
|
|
38
|
+
>
|
|
39
|
+
> N = 100 # Number of samples.
|
|
40
|
+
> K = 50 # Number of features.
|
|
41
|
+
> M = 10 # Number of targets.
|
|
42
|
+
>
|
|
43
|
+
> X = np.random.uniform(size=(N, K)) # Random X data
|
|
44
|
+
> Y = np.random.uniform(size=(N, M)) # Random Y data
|
|
45
|
+
> cv_splits = np.arange(100) % 5 # 5-fold cross-validation
|
|
46
|
+
>
|
|
47
|
+
> # Instantiate CVMatrix
|
|
48
|
+
> cvm = CVMatrix(
|
|
49
|
+
> cv_splits=cv_splits,
|
|
50
|
+
> center_X=True,
|
|
51
|
+
> center_Y=True,
|
|
52
|
+
> scale_X=True,
|
|
53
|
+
> scale_Y=True,
|
|
54
|
+
> )
|
|
55
|
+
> # Fit on X and Y
|
|
56
|
+
> cvm.fit(X=X, Y=Y)
|
|
57
|
+
> # Compute training set XTX and/or XTY for each fold
|
|
58
|
+
> for val_split in np.unique(cv_splits):
|
|
59
|
+
> # Get both XTX and XTY
|
|
60
|
+
> training_XTX, training_XTY = cvm.training_XTX_XTY(val_split)
|
|
61
|
+
> # Get only XTX
|
|
62
|
+
> training_XTX = cvm.training_XTX(val_split)
|
|
63
|
+
> # Get only XTY
|
|
64
|
+
> training_XTY = cvm.training_XTY(val_split)
|
|
65
|
+
|
|
66
|
+
### Examples
|
|
67
|
+
In [examples](https://github.com/Sm00thix/CVMatrix/tree/main/examples) you will find:
|
|
68
|
+
|
|
69
|
+
- [Compute training matrices with CVMatrix](https://github.com/Sm00thix/CVMatrix/tree/main/examples/training_matrices.py)
|
|
70
|
+
|
|
71
|
+
## Contribute
|
|
72
|
+
|
|
73
|
+
To contribute, please read the [Contribution
|
|
74
|
+
Guidelines](https://github.com/Sm00thix/CVMatrix/blob/main/CONTRIBUTING.md).
|
|
75
|
+
|
|
76
|
+
## References
|
|
77
|
+
|
|
78
|
+
1. [Engstrøm, O.-C. G. (2024). Shortcutting Cross-Validation: Efficiently Deriving Column-Wise Centered and Scaled Training Set $\mathbf{X}^\mathbf{T}\mathbf{X}$ and $\mathbf{X}^\mathbf{T}\mathbf{Y}$ Without Full Recomputation of Matrix Products or Statistical Moments](https://arxiv.org/abs/2401.13185)
|
|
79
|
+
2. [Dayal, B. S., & MacGregor, J. F. (1997). Improved PLS algorithms. *Journal of Chemometrics*, 11(1), 73-85.](https://doi.org/10.1002/(SICI)1099-128X(199701)11:1%3C73::AID-CEM435%3E3.0.CO;2-%23?)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '1.0.0'
|
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Contains the CVMatrix class which implements methods for fast computation of training
|
|
3
|
+
set kernel matrices in cross-validation using the fast algorithms described in the
|
|
4
|
+
paper by O.-C. G. Engstrøm: https://arxiv.org/abs/2401.13185
|
|
5
|
+
|
|
6
|
+
The implementation is written using NumPy.
|
|
7
|
+
|
|
8
|
+
Author: Ole-Christian Galbo Engstrøm
|
|
9
|
+
E-mail: ole.e@di.ku.dk
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import Hashable, Iterable, Union
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
from numpy import typing as npt
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CVMatrix:
|
|
19
|
+
"""
|
|
20
|
+
Implements the fast cross-validation algorithms for kernel matrix-based models such
|
|
21
|
+
as PCA, PCR, PLS, and OLS. The algorithms are based on the paper by O.-C. G.
|
|
22
|
+
Engstrøm: https://arxiv.org/abs/2401.13185
|
|
23
|
+
|
|
24
|
+
Parameters
|
|
25
|
+
----------
|
|
26
|
+
cv_splits : Iterable of Hashable with N elements
|
|
27
|
+
An iterable defining cross-validation splits. Each unique value in
|
|
28
|
+
`cv_splits` corresponds to a different fold.
|
|
29
|
+
|
|
30
|
+
center_X : bool, optional, default=True
|
|
31
|
+
Whether to center `X` before computation of
|
|
32
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
33
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by subtracting its row of column-wise
|
|
34
|
+
means from each row. The row of column-wise means is computed on the training
|
|
35
|
+
set for each fold to avoid data leakage.
|
|
36
|
+
|
|
37
|
+
center_Y : bool, optional, default=True
|
|
38
|
+
Whether to center `Y` before computation of
|
|
39
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by subtracting its row of column-wise
|
|
40
|
+
means from each row. The row of column-wise means is computed on the training
|
|
41
|
+
set for each fold to avoid data leakage. This parameter is ignored if `Y` is
|
|
42
|
+
`None`.
|
|
43
|
+
|
|
44
|
+
scale_X : bool, optional, default=True
|
|
45
|
+
Whether to scale `X` before computation of
|
|
46
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
47
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by dividing each row with the row of
|
|
48
|
+
`X`'s column-wise standard deviations. Bessel's correction for the unbiased
|
|
49
|
+
estimate of the sample standard deviation is used. The row of column-wise
|
|
50
|
+
standard deviations is computed on the training set for each fold to avoid data
|
|
51
|
+
leakage.
|
|
52
|
+
|
|
53
|
+
scale_Y : bool, optional, default=True
|
|
54
|
+
Whether to scale `Y` before computation of
|
|
55
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by dividing each row with the row of
|
|
56
|
+
`Y`'s column-wise standard deviations. Bessel's correction for the unbiased
|
|
57
|
+
estimate of the sample standard deviation is used. The row of column-wise
|
|
58
|
+
standard deviations is computed on the training set for each fold to avoid data
|
|
59
|
+
leakage. This parameter is ignored if `Y` is `None`.
|
|
60
|
+
|
|
61
|
+
dtype : np.floating, optional, default=np.float64
|
|
62
|
+
The data type used for the computations. The default is `np.float64`.
|
|
63
|
+
|
|
64
|
+
copy : bool, optional, default=True
|
|
65
|
+
Whether to make a copy of the input arrays. If `False` and the input arrays are
|
|
66
|
+
already NumPy arrays of type `dtype`, then no copy is made. If `False` and the
|
|
67
|
+
input arrays are not NumPy arrays of type `dtype`, then a copy is made. If
|
|
68
|
+
`True` a copy is always made. If no copy is made, then external modifications
|
|
69
|
+
to `X` or `Y` will result in undefined behavior.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
def __init__(
|
|
73
|
+
self,
|
|
74
|
+
cv_splits: Iterable[Hashable],
|
|
75
|
+
center_X: bool = True,
|
|
76
|
+
center_Y: bool = True,
|
|
77
|
+
scale_X: bool = True,
|
|
78
|
+
scale_Y: bool = True,
|
|
79
|
+
dtype: np.floating = np.float64,
|
|
80
|
+
copy: bool = True,
|
|
81
|
+
) -> None:
|
|
82
|
+
self.center_X = center_X
|
|
83
|
+
self.center_Y = center_Y
|
|
84
|
+
self.scale_X = scale_X
|
|
85
|
+
self.scale_Y = scale_Y
|
|
86
|
+
self.dtype = dtype
|
|
87
|
+
self.copy = copy
|
|
88
|
+
self.X_total = None
|
|
89
|
+
self.Y_total = None
|
|
90
|
+
self.N = None
|
|
91
|
+
self.K = None
|
|
92
|
+
self.M = None
|
|
93
|
+
self.X_total_mean = None
|
|
94
|
+
self.Y_total_mean = None
|
|
95
|
+
self.XTX_total = None
|
|
96
|
+
self.XTY_total = None
|
|
97
|
+
self.sum_X_total = None
|
|
98
|
+
self.sum_Y_total = None
|
|
99
|
+
self.sum_sq_X_total = None
|
|
100
|
+
self.sum_sq_Y_total = None
|
|
101
|
+
self.val_folds_dict = None
|
|
102
|
+
self._init_val_folds_dict(cv_splits)
|
|
103
|
+
|
|
104
|
+
def fit(self, X: npt.ArrayLike, Y: Union[None, npt.ArrayLike] = None) -> None:
|
|
105
|
+
"""
|
|
106
|
+
Loads and stores `X` and `Y` for cross-validation. Computes dataset-wide
|
|
107
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and, if `Y` is not `None`,
|
|
108
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`. If `center_X`, `center_Y`,
|
|
109
|
+
`scale_X`, or `scale_Y` is `True`, the corresponding global statistics are also
|
|
110
|
+
computed.
|
|
111
|
+
|
|
112
|
+
Parameters
|
|
113
|
+
----------
|
|
114
|
+
X : Array-like of shape (N, K) or (N,)
|
|
115
|
+
Predictor variables.
|
|
116
|
+
|
|
117
|
+
Y : None or array-like of shape (N, M) or (N,), optional, default=None
|
|
118
|
+
Response variables. If `None`, subsequent calls to training_XTY and
|
|
119
|
+
training_XTX_XTY will raise a `ValueError`.
|
|
120
|
+
"""
|
|
121
|
+
self.X_total = self._init_mat(X)
|
|
122
|
+
self.N, self.K = self.X_total.shape
|
|
123
|
+
self.XTX_total = self.X_total.T @ self.X_total
|
|
124
|
+
if Y is not None:
|
|
125
|
+
self.Y_total = self._init_mat(Y)
|
|
126
|
+
self.M = self.Y_total.shape[1]
|
|
127
|
+
self.XTY_total = self.X_total.T @ self.Y_total
|
|
128
|
+
self._init_total_stats()
|
|
129
|
+
|
|
130
|
+
def training_XTX(self, val_fold: Hashable) -> np.ndarray:
|
|
131
|
+
"""
|
|
132
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`
|
|
133
|
+
corresponding to every sample except those belonging to the given validation
|
|
134
|
+
fold.
|
|
135
|
+
|
|
136
|
+
Parameters
|
|
137
|
+
----------
|
|
138
|
+
val_fold : Hashable
|
|
139
|
+
The validation fold for which to return the corresponding training set
|
|
140
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`.
|
|
141
|
+
|
|
142
|
+
Returns
|
|
143
|
+
-------
|
|
144
|
+
Array of shape (K, K)
|
|
145
|
+
The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`.
|
|
146
|
+
|
|
147
|
+
Raises
|
|
148
|
+
------
|
|
149
|
+
ValueError
|
|
150
|
+
If `val_fold` was not provided as a cross-validation split in the
|
|
151
|
+
`cv_splits` parameter of the constructor.
|
|
152
|
+
|
|
153
|
+
See Also
|
|
154
|
+
--------
|
|
155
|
+
training_XTY :
|
|
156
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`
|
|
157
|
+
training_XTX_XTY :
|
|
158
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
159
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` for a given validation fold. This
|
|
160
|
+
method is faster than calling `training_XTX` and `training_XTY` separately.
|
|
161
|
+
"""
|
|
162
|
+
return self._training_matrices(True, False, val_fold)
|
|
163
|
+
|
|
164
|
+
def training_XTY(self, val_fold: Hashable) -> np.ndarray:
|
|
165
|
+
"""
|
|
166
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`
|
|
167
|
+
corresponding to every sample except those belonging to the given validation
|
|
168
|
+
fold.
|
|
169
|
+
|
|
170
|
+
Parameters
|
|
171
|
+
----------
|
|
172
|
+
val_fold : Hashable
|
|
173
|
+
The validation fold for which to return the corresponding training set
|
|
174
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
175
|
+
|
|
176
|
+
Returns
|
|
177
|
+
-------
|
|
178
|
+
Array of shape (K, M)
|
|
179
|
+
The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
180
|
+
|
|
181
|
+
Raises
|
|
182
|
+
------
|
|
183
|
+
ValueError
|
|
184
|
+
If `Y` is `None`.
|
|
185
|
+
|
|
186
|
+
ValueError
|
|
187
|
+
If `val_fold` was not provided as a cross-validation split in the
|
|
188
|
+
`cv_splits` parameter of the constructor.
|
|
189
|
+
|
|
190
|
+
See Also
|
|
191
|
+
--------
|
|
192
|
+
training_XTX :
|
|
193
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`
|
|
194
|
+
training_XTX_XTY :
|
|
195
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
196
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` for a given validation fold. This
|
|
197
|
+
method is faster than calling `training_XTX` and `training_XTY` separately.
|
|
198
|
+
"""
|
|
199
|
+
return self._training_matrices(False, True, val_fold)
|
|
200
|
+
|
|
201
|
+
def training_XTX_XTY(self, val_fold: Hashable) -> tuple[np.ndarray, np.ndarray]:
|
|
202
|
+
"""
|
|
203
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
204
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` corresponding to every sample except
|
|
205
|
+
those belonging to the given validation fold.
|
|
206
|
+
|
|
207
|
+
Parameters
|
|
208
|
+
----------
|
|
209
|
+
val_fold : Hashable
|
|
210
|
+
The validation fold for which to return the corresponding training set
|
|
211
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
212
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
213
|
+
|
|
214
|
+
Returns
|
|
215
|
+
-------
|
|
216
|
+
tuple of arrays of shapes (K, K) and (K, M)
|
|
217
|
+
The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
218
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
219
|
+
|
|
220
|
+
Raises
|
|
221
|
+
------
|
|
222
|
+
ValueError
|
|
223
|
+
If `Y` is `None`.
|
|
224
|
+
|
|
225
|
+
ValueError
|
|
226
|
+
If `val_fold` was not provided as a cross-validation split in the
|
|
227
|
+
`cv_splits` parameter of the constructor.
|
|
228
|
+
|
|
229
|
+
See Also
|
|
230
|
+
--------
|
|
231
|
+
training_XTX :
|
|
232
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`
|
|
233
|
+
training_XTY :
|
|
234
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`
|
|
235
|
+
"""
|
|
236
|
+
return self._training_matrices(True, True, val_fold)
|
|
237
|
+
|
|
238
|
+
def _training_matrices(
|
|
239
|
+
self,
|
|
240
|
+
return_XTX: bool,
|
|
241
|
+
return_XTY: bool,
|
|
242
|
+
val_fold: Hashable
|
|
243
|
+
) -> Union[np.ndarray, tuple[np.ndarray, np.ndarray]]:
|
|
244
|
+
"""
|
|
245
|
+
Returns the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and/or
|
|
246
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` corresponding to every sample except
|
|
247
|
+
those belonging to the given validation fold.
|
|
248
|
+
|
|
249
|
+
Parameters
|
|
250
|
+
----------
|
|
251
|
+
return_XTX : bool
|
|
252
|
+
Whether to return the training set
|
|
253
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`.
|
|
254
|
+
|
|
255
|
+
val_fold : Hashable
|
|
256
|
+
The validation fold for which to return the corresponding training set
|
|
257
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
|
|
258
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`
|
|
259
|
+
|
|
260
|
+
return_XTY : bool, optional, default=False
|
|
261
|
+
Whether to return the training set
|
|
262
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
263
|
+
|
|
264
|
+
Returns
|
|
265
|
+
-------
|
|
266
|
+
Array of shape (K, K) or (K, M) or tuple of arrays of shapes (K, K) and (K, M)
|
|
267
|
+
The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and/or
|
|
268
|
+
training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
269
|
+
|
|
270
|
+
Raises
|
|
271
|
+
------
|
|
272
|
+
ValueError
|
|
273
|
+
If both `return_XTX` and `return_XTY` are `False` or if `return_XTY` is
|
|
274
|
+
`True` and `Y` is `None`.
|
|
275
|
+
|
|
276
|
+
ValueError
|
|
277
|
+
If `val_fold` was not provided as a cross-validation split in the
|
|
278
|
+
`cv_splits` parameter of the constructor.
|
|
279
|
+
"""
|
|
280
|
+
if not return_XTX and not return_XTY:
|
|
281
|
+
raise ValueError(
|
|
282
|
+
"At least one of `return_XTX` and `return_XTY` must be True."
|
|
283
|
+
)
|
|
284
|
+
if return_XTY and self.Y_total is None:
|
|
285
|
+
raise ValueError("Response variables `Y` are not provided.")
|
|
286
|
+
X_train_mean = None
|
|
287
|
+
Y_train_mean = None
|
|
288
|
+
X_train_std = None
|
|
289
|
+
Y_train_std = None
|
|
290
|
+
N_train = None
|
|
291
|
+
try:
|
|
292
|
+
val_indices = self.val_folds_dict[val_fold]
|
|
293
|
+
except KeyError as e:
|
|
294
|
+
raise ValueError(f"Validation fold {val_fold} not found.") from e
|
|
295
|
+
X_val = self.X_total[val_indices]
|
|
296
|
+
if return_XTY:
|
|
297
|
+
Y_val = self.Y_total[val_indices]
|
|
298
|
+
if self.center_X or self.center_Y or self.scale_X or self.scale_Y:
|
|
299
|
+
N_val = val_indices.size
|
|
300
|
+
N_train = self.N - N_val
|
|
301
|
+
N_total_over_N_train = self.N / N_train
|
|
302
|
+
N_val_over_N_train = N_val / N_train
|
|
303
|
+
if self.center_X or self.center_Y or self.scale_X:
|
|
304
|
+
X_train_mean = self._compute_training_mat_mean(
|
|
305
|
+
X_val,
|
|
306
|
+
self.X_total_mean,
|
|
307
|
+
N_total_over_N_train,
|
|
308
|
+
N_val_over_N_train
|
|
309
|
+
)
|
|
310
|
+
if return_XTY and (self.center_X or self.center_Y or self.scale_Y):
|
|
311
|
+
Y_train_mean = self._compute_training_mat_mean(
|
|
312
|
+
Y_val,
|
|
313
|
+
self.Y_total_mean,
|
|
314
|
+
N_total_over_N_train,
|
|
315
|
+
N_val_over_N_train
|
|
316
|
+
)
|
|
317
|
+
if self.scale_X:
|
|
318
|
+
X_train_std = self._compute_training_mat_std(
|
|
319
|
+
X_val,
|
|
320
|
+
X_train_mean,
|
|
321
|
+
self.sum_X_total,
|
|
322
|
+
self.sum_sq_X_total,
|
|
323
|
+
N_train
|
|
324
|
+
)
|
|
325
|
+
if self.scale_Y and return_XTY:
|
|
326
|
+
Y_train_std = self._compute_training_mat_std(
|
|
327
|
+
Y_val,
|
|
328
|
+
Y_train_mean,
|
|
329
|
+
self.sum_Y_total,
|
|
330
|
+
self.sum_sq_Y_total,
|
|
331
|
+
N_train
|
|
332
|
+
)
|
|
333
|
+
if return_XTX and return_XTY:
|
|
334
|
+
return (
|
|
335
|
+
self._training_kernel_matrix(
|
|
336
|
+
self.XTX_total,
|
|
337
|
+
X_val,
|
|
338
|
+
X_val,
|
|
339
|
+
X_train_mean,
|
|
340
|
+
X_train_mean,
|
|
341
|
+
X_train_std,
|
|
342
|
+
X_train_std,
|
|
343
|
+
N_train,
|
|
344
|
+
center=self.center_X
|
|
345
|
+
),
|
|
346
|
+
self._training_kernel_matrix(
|
|
347
|
+
self.XTY_total,
|
|
348
|
+
X_val,
|
|
349
|
+
Y_val,
|
|
350
|
+
X_train_mean,
|
|
351
|
+
Y_train_mean,
|
|
352
|
+
X_train_std,
|
|
353
|
+
Y_train_std,
|
|
354
|
+
N_train,
|
|
355
|
+
center=self.center_X or self.center_Y
|
|
356
|
+
)
|
|
357
|
+
)
|
|
358
|
+
if return_XTX:
|
|
359
|
+
return self._training_kernel_matrix(
|
|
360
|
+
self.XTX_total,
|
|
361
|
+
X_val,
|
|
362
|
+
X_val,
|
|
363
|
+
X_train_mean,
|
|
364
|
+
X_train_mean,
|
|
365
|
+
X_train_std,
|
|
366
|
+
X_train_std,
|
|
367
|
+
N_train,
|
|
368
|
+
center=self.center_X
|
|
369
|
+
)
|
|
370
|
+
return self._training_kernel_matrix(
|
|
371
|
+
self.XTY_total,
|
|
372
|
+
X_val,
|
|
373
|
+
Y_val,
|
|
374
|
+
X_train_mean,
|
|
375
|
+
Y_train_mean,
|
|
376
|
+
X_train_std,
|
|
377
|
+
Y_train_std,
|
|
378
|
+
N_train,
|
|
379
|
+
center=self.center_X or self.center_Y
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
def _training_kernel_matrix(
|
|
383
|
+
self,
|
|
384
|
+
total_kernel_mat: np.ndarray,
|
|
385
|
+
X_val: np.ndarray,
|
|
386
|
+
mat2_val: np.ndarray,
|
|
387
|
+
X_train_mean: Union[None, np.ndarray] = None,
|
|
388
|
+
mat2_train_mean: Union[None, np.ndarray] = None,
|
|
389
|
+
X_train_std: Union[None, np.ndarray] = None,
|
|
390
|
+
mat2_train_std: Union[None, np.ndarray] = None,
|
|
391
|
+
N_train: Union[None, int] = None,
|
|
392
|
+
center: bool = False,
|
|
393
|
+
) -> np.ndarray:
|
|
394
|
+
"""
|
|
395
|
+
Computes the training set kernel matrix for a given fold.
|
|
396
|
+
|
|
397
|
+
Parameters
|
|
398
|
+
----------
|
|
399
|
+
total_kernel_mat : Array of shape (N, K) or (N, M)
|
|
400
|
+
The total kernel matrix :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` or
|
|
401
|
+
:math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
|
|
402
|
+
|
|
403
|
+
X_val : Array of shape (N_val, K)
|
|
404
|
+
The validation set of predictor variables.
|
|
405
|
+
|
|
406
|
+
mat2_val : Array of shape (N_val, K) or (N_val, M)
|
|
407
|
+
The validation set of predictor or resoponse variables.
|
|
408
|
+
|
|
409
|
+
X_train_mean : None or array of shape (1, K), optional, default=None
|
|
410
|
+
The row of column-wise means of the training set of predictor variables.
|
|
411
|
+
|
|
412
|
+
mat2_train_mean : None or array of shape (1, K) or (1, M), optional,
|
|
413
|
+
default=None
|
|
414
|
+
The row of column-wise means of the training set of predictor or response
|
|
415
|
+
variables.
|
|
416
|
+
|
|
417
|
+
X_train_std : None or array of shape (1, K), optional, default=None
|
|
418
|
+
The row of column-wise standard deviations of the training set of predictor
|
|
419
|
+
variables.
|
|
420
|
+
|
|
421
|
+
mat2_train_std : None or array of shape (1, K) or (1, M), optional, default=None
|
|
422
|
+
The row of column-wise standard deviations of the training set of predictor
|
|
423
|
+
or response variables.
|
|
424
|
+
|
|
425
|
+
N_train : None or int, optional, default=None
|
|
426
|
+
The size of the training set. Only required if `X_train_mean` or
|
|
427
|
+
`mat2_train_mean` is not `None`.
|
|
428
|
+
|
|
429
|
+
center : bool, optional, default=False
|
|
430
|
+
Whether to center the kernel matrix. If `True`, the kernel matrix is
|
|
431
|
+
centered. Setting this parameter to `True` requires that `X_train_mean` and
|
|
432
|
+
`mat2_train_mean` are not `None`.
|
|
433
|
+
|
|
434
|
+
Returns
|
|
435
|
+
-------
|
|
436
|
+
Array of shape (K, K) or (K, M)
|
|
437
|
+
The training set kernel matrix.
|
|
438
|
+
"""
|
|
439
|
+
XTmat2_train = total_kernel_mat - X_val.T @ mat2_val
|
|
440
|
+
if center:
|
|
441
|
+
XTmat2_train -= N_train * (X_train_mean.T @ mat2_train_mean)
|
|
442
|
+
if X_train_std is not None and mat2_train_std is not None:
|
|
443
|
+
return XTmat2_train / (X_train_std.T @ mat2_train_std)
|
|
444
|
+
if X_train_std is not None:
|
|
445
|
+
return XTmat2_train / X_train_std.T
|
|
446
|
+
if mat2_train_std is not None:
|
|
447
|
+
return XTmat2_train / mat2_train_std
|
|
448
|
+
return XTmat2_train
|
|
449
|
+
|
|
450
|
+
def _compute_training_mat_mean(
|
|
451
|
+
self,
|
|
452
|
+
mat_val: np.ndarray,
|
|
453
|
+
mat_total_mean: np.ndarray,
|
|
454
|
+
N_total_over_N_train: float,
|
|
455
|
+
N_val_over_N_train: float
|
|
456
|
+
) -> np.ndarray:
|
|
457
|
+
"""
|
|
458
|
+
Computes the row of column-wise means of a matrix for a given fold.
|
|
459
|
+
|
|
460
|
+
Parameters
|
|
461
|
+
----------
|
|
462
|
+
mat_val : Array of shape (N_val, K) or (N_val, M)
|
|
463
|
+
The validation set of `X` or `Y`.
|
|
464
|
+
|
|
465
|
+
mat_total_mean : Array of shape (1, K) or (1, M)
|
|
466
|
+
The row of column-wise means of the total matrix.
|
|
467
|
+
|
|
468
|
+
N_total_over_N_train : float
|
|
469
|
+
The ratio of the total number of samples to the number of samples in the
|
|
470
|
+
training set.
|
|
471
|
+
|
|
472
|
+
N_val_over_N_train : float
|
|
473
|
+
The ratio of the number of samples in the validation set to the number of
|
|
474
|
+
samples in the training set.
|
|
475
|
+
|
|
476
|
+
Returns
|
|
477
|
+
-------
|
|
478
|
+
Array of shape (1, K) or (1, M)
|
|
479
|
+
The row of column-wise means of the training set matrix.
|
|
480
|
+
"""
|
|
481
|
+
return (
|
|
482
|
+
N_total_over_N_train * mat_total_mean
|
|
483
|
+
- N_val_over_N_train * mat_val.mean(axis=0, keepdims=True)
|
|
484
|
+
)
|
|
485
|
+
|
|
486
|
+
def _compute_training_mat_std(
|
|
487
|
+
self,
|
|
488
|
+
mat_val: np.ndarray,
|
|
489
|
+
mat_train_mean: np.ndarray,
|
|
490
|
+
sum_mat_total: np.ndarray,
|
|
491
|
+
sum_sq_mat_total: np.ndarray,
|
|
492
|
+
N_train: int
|
|
493
|
+
) -> np.ndarray:
|
|
494
|
+
"""
|
|
495
|
+
Computes the row of column-wise standard deviations of a matrix for a given
|
|
496
|
+
fold.
|
|
497
|
+
|
|
498
|
+
Parameters
|
|
499
|
+
----------
|
|
500
|
+
mat_val : Array of shape (N_val, K) or (N_val, M)
|
|
501
|
+
The validation set of `X` or `Y`.
|
|
502
|
+
|
|
503
|
+
mat_train_mean : Array of shape (1, K) or (1, M)
|
|
504
|
+
The row of column-wise means of the training matrix.
|
|
505
|
+
|
|
506
|
+
sum_mat_total : Array of shape (1, K) or (1, M)
|
|
507
|
+
The row of column-wise sums of the total matrix.
|
|
508
|
+
|
|
509
|
+
sum_sq_mat_total : Array of shape (1, K) or (1, M)
|
|
510
|
+
The row of column-wise sums of squares of the total matrix.
|
|
511
|
+
|
|
512
|
+
N_train : int
|
|
513
|
+
The size of the training set.
|
|
514
|
+
|
|
515
|
+
Returns
|
|
516
|
+
-------
|
|
517
|
+
Array of shape (1, K) or (1, M)
|
|
518
|
+
The row of column-wise standard deviations of the training set matrix.
|
|
519
|
+
"""
|
|
520
|
+
train_sum_mat = sum_mat_total - np.expand_dims(
|
|
521
|
+
np.einsum("ij -> j", mat_val), axis=0
|
|
522
|
+
)
|
|
523
|
+
train_sum_sq_mat = sum_sq_mat_total - np.expand_dims(
|
|
524
|
+
np.einsum("ij,ij -> j", mat_val, mat_val), axis=0
|
|
525
|
+
)
|
|
526
|
+
mat_train_std = np.sqrt(
|
|
527
|
+
1
|
|
528
|
+
/ (N_train - 1)
|
|
529
|
+
* (
|
|
530
|
+
-2 * mat_train_mean * train_sum_mat
|
|
531
|
+
+ N_train
|
|
532
|
+
* np.einsum("ij,ij -> ij", mat_train_mean, mat_train_mean)
|
|
533
|
+
+ train_sum_sq_mat
|
|
534
|
+
)
|
|
535
|
+
)
|
|
536
|
+
mat_train_std[mat_train_std == 0] = 1
|
|
537
|
+
return mat_train_std
|
|
538
|
+
|
|
539
|
+
def _init_mat(self, mat: np.ndarray) -> np.ndarray:
|
|
540
|
+
"""
|
|
541
|
+
Casts the matrix to the dtype specified in the constructor and reshapes it if
|
|
542
|
+
the matrix is one-dimensional.
|
|
543
|
+
|
|
544
|
+
Parameters
|
|
545
|
+
----------
|
|
546
|
+
mat : Array of shape (N, K) or (N, M) or (N,)
|
|
547
|
+
The matrix to be initialized.
|
|
548
|
+
|
|
549
|
+
Returns
|
|
550
|
+
-------
|
|
551
|
+
Array of shape (N, K) or (N, M) or (N, 1)
|
|
552
|
+
The initialized matrix.
|
|
553
|
+
"""
|
|
554
|
+
mat = np.asarray(mat, dtype=self.dtype)
|
|
555
|
+
if self.copy and mat.dtype == self.dtype:
|
|
556
|
+
mat = mat.copy()
|
|
557
|
+
if mat.ndim == 1:
|
|
558
|
+
mat = mat.reshape(-1, 1)
|
|
559
|
+
return mat
|
|
560
|
+
|
|
561
|
+
def _init_total_stats(self) -> None:
|
|
562
|
+
"""
|
|
563
|
+
Initializes the global statistics for `X` and `Y`.
|
|
564
|
+
"""
|
|
565
|
+
if self.center_X or self.center_Y or self.scale_X:
|
|
566
|
+
self.X_total_mean = self.X_total.mean(axis=0, keepdims=True)
|
|
567
|
+
else:
|
|
568
|
+
self.X_total_mean = None
|
|
569
|
+
if (
|
|
570
|
+
(self.center_X or self.center_Y or self.scale_Y)
|
|
571
|
+
and self.Y_total is not None
|
|
572
|
+
):
|
|
573
|
+
self.Y_total_mean = self.Y_total.mean(axis=0, keepdims=True)
|
|
574
|
+
else:
|
|
575
|
+
self.Y_total_mean = None
|
|
576
|
+
if self.scale_X:
|
|
577
|
+
self.sum_X_total = np.expand_dims(
|
|
578
|
+
np.einsum("ij -> j", self.X_total), axis=0
|
|
579
|
+
)
|
|
580
|
+
self.sum_sq_X_total = np.expand_dims(
|
|
581
|
+
np.einsum("ij,ij -> j", self.X_total, self.X_total), axis=0
|
|
582
|
+
)
|
|
583
|
+
else:
|
|
584
|
+
self.sum_X_total = None
|
|
585
|
+
self.sum_sq_X_total = None
|
|
586
|
+
if self.scale_Y and self.Y_total is not None:
|
|
587
|
+
self.sum_Y_total = np.expand_dims(
|
|
588
|
+
np.einsum("ij -> j", self.Y_total), axis=0
|
|
589
|
+
)
|
|
590
|
+
self.sum_sq_Y_total = np.expand_dims(
|
|
591
|
+
np.einsum("ij,ij -> j", self.Y_total, self.Y_total), axis=0
|
|
592
|
+
)
|
|
593
|
+
else:
|
|
594
|
+
self.sum_Y_total = None
|
|
595
|
+
self.sum_sq_Y_total = None
|
|
596
|
+
|
|
597
|
+
def _init_val_folds_dict(
|
|
598
|
+
self, cv_splits: Iterable[Hashable]
|
|
599
|
+
) -> dict[Hashable, npt.NDArray[np.int_]]:
|
|
600
|
+
"""
|
|
601
|
+
Generates a dictionary of validation indices for each fold. The dictionary is
|
|
602
|
+
stored in the `val_folds_dict` attribute. The dictionary is used to quickly
|
|
603
|
+
access the validation indices for each fold.
|
|
604
|
+
|
|
605
|
+
Parameters
|
|
606
|
+
----------
|
|
607
|
+
cv_splits : Iterable of Hashable with N elements
|
|
608
|
+
An iterable defining cross-validation splits. Each unique value in
|
|
609
|
+
`cv_splits` corresponds to a different fold.
|
|
610
|
+
"""
|
|
611
|
+
val_folds_dict = {}
|
|
612
|
+
for i, num in enumerate(cv_splits):
|
|
613
|
+
try:
|
|
614
|
+
val_folds_dict[num].append(i)
|
|
615
|
+
except KeyError:
|
|
616
|
+
val_folds_dict[num] = [i]
|
|
617
|
+
for key in val_folds_dict:
|
|
618
|
+
val_folds_dict[key] = np.asarray(val_folds_dict[key], dtype=int)
|
|
619
|
+
self.val_folds_dict = val_folds_dict
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "cvmatrix"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "Fast computation of possibly centered/scaled training set kernel matrices in a cross-validation setting."
|
|
5
|
+
authors = ["Sm00thix <oleemail@icloud.com>"]
|
|
6
|
+
maintainers = ["Sm00thix <oleemail@icloud.com>"]
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
homepage = "https://cvmatrix.readthedocs.io/en/latest/"
|
|
10
|
+
repository = "https://github.com/Sm00thix/CVMatrix"
|
|
11
|
+
|
|
12
|
+
[tool.poetry.dependencies]
|
|
13
|
+
python = ">=3.9, <3.13"
|
|
14
|
+
numpy = "^1.26.4"
|
|
15
|
+
|
|
16
|
+
[build-system]
|
|
17
|
+
requires = ["poetry-core"]
|
|
18
|
+
build-backend = "poetry.core.masonry.api"
|