cvmatrix 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cvmatrix-1.0.0/LICENSE ADDED
@@ -0,0 +1,190 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ Copyright 2024 Ole-Christian Galbo Engstrøm
179
+
180
+ Licensed under the Apache License, Version 2.0 (the "License");
181
+ you may not use this file except in compliance with the License.
182
+ You may obtain a copy of the License at
183
+
184
+ http://www.apache.org/licenses/LICENSE-2.0
185
+
186
+ Unless required by applicable law or agreed to in writing, software
187
+ distributed under the License is distributed on an "AS IS" BASIS,
188
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189
+ See the License for the specific language governing permissions and
190
+ limitations under the License.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.1
2
+ Name: cvmatrix
3
+ Version: 1.0.0
4
+ Summary: Fast computation of possibly centered/scaled training set kernel matrices in a cross-validation setting.
5
+ Home-page: https://cvmatrix.readthedocs.io/en/latest/
6
+ License: Apache-2.0
7
+ Author: Sm00thix
8
+ Author-email: oleemail@icloud.com
9
+ Maintainer: Sm00thix
10
+ Maintainer-email: oleemail@icloud.com
11
+ Requires-Python: >=3.9,<3.13
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Dist: numpy (>=1.26.4,<2.0.0)
19
+ Project-URL: Repository, https://github.com/Sm00thix/CVMatrix
20
+ Description-Content-Type: text/markdown
21
+
22
+ # CVMatrix
23
+
24
+ [![PyPI Version](https://img.shields.io/pypi/v/cvmatrix.svg)](https://pypi.python.org/pypi/cvmatrix/)
25
+
26
+ [![PyPI - Downloads](https://img.shields.io/pypi/dm/cvmatrix)](https://pypi.python.org/pypi/cvmatrix/)
27
+
28
+ [![Python Versions](https://img.shields.io/pypi/pyversions/cvmatrix.svg)](https://pypi.python.org/pypi/cvmatrix/)
29
+
30
+ [![License](https://img.shields.io/pypi/l/cvmatrix.svg)](https://pypi.python.org/pypi/cvmatrix/)
31
+
32
+ [![Documentation Status](https://readthedocs.org/projects/cvmatrix/badge/?version=latest)](https://cvmatrix.readthedocs.io/en/latest/?badge=latest)
33
+
34
+ [![Build Status](https://github.com/Sm00thix/CVMatrix/actions/workflows/workflow.yml/badge.svg)](https://github.com/Sm00thix/CVMatrix/actions/workflows/workflow.yml)
35
+
36
+ This repository contains the source code for the [`cvmatrix`](https://pypi.org/project/cvmatrix/) package, which implements the fast algorithms by Engstrøm [[1]](#references) for computation of training set $\mathbf{X}^{\mathbf{T}}\mathbf{X}$ and $\mathbf{X}^{\mathbf{T}}\mathbf{Y}$ in a cross-validation setting. In addition to correctly handling arbitrary row-wise pre-processing, the algorithms efficiently and correctly handle any combination of column-wise centering and scaling of `X` and `Y` based on training set statistics.
37
+
38
+ For an implementation of the fast cross-validation algorithms combined with Improved Kernel Partial Least Squares [[2]](#references), see the Python package [`ikpls`](https://pypi.org/project/ikpls/).
39
+
40
+ ## Installation
41
+
42
+ - Install the package for Python3 using the following command:
43
+ ```shell
44
+ pip3 install cvmatrix
45
+ ```
46
+
47
+ - Now you can import the class implementing all the algorithms with:
48
+ ```python
49
+ from cvmatrix.cvmatrix import CVMatrix
50
+ ```
51
+
52
+ ## Quick Start
53
+
54
+ ### Use the cvmatrix package for fast computation of training set kernel matrices
55
+
56
+ > ```python
57
+ > import numpy as np
58
+ > from cvmatrix.cvmatrix import CVMatrix
59
+ >
60
+ > N = 100 # Number of samples.
61
+ > K = 50 # Number of features.
62
+ > M = 10 # Number of targets.
63
+ >
64
+ > X = np.random.uniform(size=(N, K)) # Random X data
65
+ > Y = np.random.uniform(size=(N, M)) # Random Y data
66
+ > cv_splits = np.arange(100) % 5 # 5-fold cross-validation
67
+ >
68
+ > # Instantiate CVMatrix
69
+ > cvm = CVMatrix(
70
+ > cv_splits=cv_splits,
71
+ > center_X=True,
72
+ > center_Y=True,
73
+ > scale_X=True,
74
+ > scale_Y=True,
75
+ > )
76
+ > # Fit on X and Y
77
+ > cvm.fit(X=X, Y=Y)
78
+ > # Compute training set XTX and/or XTY for each fold
79
+ > for val_split in np.unique(cv_splits):
80
+ > # Get both XTX and XTY
81
+ > training_XTX, training_XTY = cvm.training_XTX_XTY(val_split)
82
+ > # Get only XTX
83
+ > training_XTX = cvm.training_XTX(val_split)
84
+ > # Get only XTY
85
+ > training_XTY = cvm.training_XTY(val_split)
86
+
87
+ ### Examples
88
+ In [examples](https://github.com/Sm00thix/CVMatrix/tree/main/examples) you will find:
89
+
90
+ - [Compute training matrices with CVMatrix](https://github.com/Sm00thix/CVMatrix/tree/main/examples/training_matrices.py)
91
+
92
+ ## Contribute
93
+
94
+ To contribute, please read the [Contribution
95
+ Guidelines](https://github.com/Sm00thix/CVMatrix/blob/main/CONTRIBUTING.md).
96
+
97
+ ## References
98
+
99
+ 1. [Engstrøm, O.-C. G. (2024). Shortcutting Cross-Validation: Efficiently Deriving Column-Wise Centered and Scaled Training Set $\mathbf{X}^\mathbf{T}\mathbf{X}$ and $\mathbf{X}^\mathbf{T}\mathbf{Y}$ Without Full Recomputation of Matrix Products or Statistical Moments](https://arxiv.org/abs/2401.13185)
100
+ 2. [Dayal, B. S., & MacGregor, J. F. (1997). Improved PLS algorithms. *Journal of Chemometrics*, 11(1), 73-85.](https://doi.org/10.1002/(SICI)1099-128X(199701)11:1%3C73::AID-CEM435%3E3.0.CO;2-%23?)
@@ -0,0 +1,79 @@
1
+ # CVMatrix
2
+
3
+ [![PyPI Version](https://img.shields.io/pypi/v/cvmatrix.svg)](https://pypi.python.org/pypi/cvmatrix/)
4
+
5
+ [![PyPI - Downloads](https://img.shields.io/pypi/dm/cvmatrix)](https://pypi.python.org/pypi/cvmatrix/)
6
+
7
+ [![Python Versions](https://img.shields.io/pypi/pyversions/cvmatrix.svg)](https://pypi.python.org/pypi/cvmatrix/)
8
+
9
+ [![License](https://img.shields.io/pypi/l/cvmatrix.svg)](https://pypi.python.org/pypi/cvmatrix/)
10
+
11
+ [![Documentation Status](https://readthedocs.org/projects/cvmatrix/badge/?version=latest)](https://cvmatrix.readthedocs.io/en/latest/?badge=latest)
12
+
13
+ [![Build Status](https://github.com/Sm00thix/CVMatrix/actions/workflows/workflow.yml/badge.svg)](https://github.com/Sm00thix/CVMatrix/actions/workflows/workflow.yml)
14
+
15
+ This repository contains the source code for the [`cvmatrix`](https://pypi.org/project/cvmatrix/) package, which implements the fast algorithms by Engstrøm [[1]](#references) for computation of training set $\mathbf{X}^{\mathbf{T}}\mathbf{X}$ and $\mathbf{X}^{\mathbf{T}}\mathbf{Y}$ in a cross-validation setting. In addition to correctly handling arbitrary row-wise pre-processing, the algorithms efficiently and correctly handle any combination of column-wise centering and scaling of `X` and `Y` based on training set statistics.
16
+
17
+ For an implementation of the fast cross-validation algorithms combined with Improved Kernel Partial Least Squares [[2]](#references), see the Python package [`ikpls`](https://pypi.org/project/ikpls/).
18
+
19
+ ## Installation
20
+
21
+ - Install the package for Python3 using the following command:
22
+ ```shell
23
+ pip3 install cvmatrix
24
+ ```
25
+
26
+ - Now you can import the class implementing all the algorithms with:
27
+ ```python
28
+ from cvmatrix.cvmatrix import CVMatrix
29
+ ```
30
+
31
+ ## Quick Start
32
+
33
+ ### Use the cvmatrix package for fast computation of training set kernel matrices
34
+
35
+ > ```python
36
+ > import numpy as np
37
+ > from cvmatrix.cvmatrix import CVMatrix
38
+ >
39
+ > N = 100 # Number of samples.
40
+ > K = 50 # Number of features.
41
+ > M = 10 # Number of targets.
42
+ >
43
+ > X = np.random.uniform(size=(N, K)) # Random X data
44
+ > Y = np.random.uniform(size=(N, M)) # Random Y data
45
+ > cv_splits = np.arange(100) % 5 # 5-fold cross-validation
46
+ >
47
+ > # Instantiate CVMatrix
48
+ > cvm = CVMatrix(
49
+ > cv_splits=cv_splits,
50
+ > center_X=True,
51
+ > center_Y=True,
52
+ > scale_X=True,
53
+ > scale_Y=True,
54
+ > )
55
+ > # Fit on X and Y
56
+ > cvm.fit(X=X, Y=Y)
57
+ > # Compute training set XTX and/or XTY for each fold
58
+ > for val_split in np.unique(cv_splits):
59
+ > # Get both XTX and XTY
60
+ > training_XTX, training_XTY = cvm.training_XTX_XTY(val_split)
61
+ > # Get only XTX
62
+ > training_XTX = cvm.training_XTX(val_split)
63
+ > # Get only XTY
64
+ > training_XTY = cvm.training_XTY(val_split)
65
+
66
+ ### Examples
67
+ In [examples](https://github.com/Sm00thix/CVMatrix/tree/main/examples) you will find:
68
+
69
+ - [Compute training matrices with CVMatrix](https://github.com/Sm00thix/CVMatrix/tree/main/examples/training_matrices.py)
70
+
71
+ ## Contribute
72
+
73
+ To contribute, please read the [Contribution
74
+ Guidelines](https://github.com/Sm00thix/CVMatrix/blob/main/CONTRIBUTING.md).
75
+
76
+ ## References
77
+
78
+ 1. [Engstrøm, O.-C. G. (2024). Shortcutting Cross-Validation: Efficiently Deriving Column-Wise Centered and Scaled Training Set $\mathbf{X}^\mathbf{T}\mathbf{X}$ and $\mathbf{X}^\mathbf{T}\mathbf{Y}$ Without Full Recomputation of Matrix Products or Statistical Moments](https://arxiv.org/abs/2401.13185)
79
+ 2. [Dayal, B. S., & MacGregor, J. F. (1997). Improved PLS algorithms. *Journal of Chemometrics*, 11(1), 73-85.](https://doi.org/10.1002/(SICI)1099-128X(199701)11:1%3C73::AID-CEM435%3E3.0.CO;2-%23?)
@@ -0,0 +1 @@
1
+ __version__ = '1.0.0'
@@ -0,0 +1,619 @@
1
+ """
2
+ Contains the CVMatrix class which implements methods for fast computation of training
3
+ set kernel matrices in cross-validation using the fast algorithms described in the
4
+ paper by O.-C. G. Engstrøm: https://arxiv.org/abs/2401.13185
5
+
6
+ The implementation is written using NumPy.
7
+
8
+ Author: Ole-Christian Galbo Engstrøm
9
+ E-mail: ole.e@di.ku.dk
10
+ """
11
+
12
+ from typing import Hashable, Iterable, Union
13
+
14
+ import numpy as np
15
+ from numpy import typing as npt
16
+
17
+
18
+ class CVMatrix:
19
+ """
20
+ Implements the fast cross-validation algorithms for kernel matrix-based models such
21
+ as PCA, PCR, PLS, and OLS. The algorithms are based on the paper by O.-C. G.
22
+ Engstrøm: https://arxiv.org/abs/2401.13185
23
+
24
+ Parameters
25
+ ----------
26
+ cv_splits : Iterable of Hashable with N elements
27
+ An iterable defining cross-validation splits. Each unique value in
28
+ `cv_splits` corresponds to a different fold.
29
+
30
+ center_X : bool, optional, default=True
31
+ Whether to center `X` before computation of
32
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
33
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by subtracting its row of column-wise
34
+ means from each row. The row of column-wise means is computed on the training
35
+ set for each fold to avoid data leakage.
36
+
37
+ center_Y : bool, optional, default=True
38
+ Whether to center `Y` before computation of
39
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by subtracting its row of column-wise
40
+ means from each row. The row of column-wise means is computed on the training
41
+ set for each fold to avoid data leakage. This parameter is ignored if `Y` is
42
+ `None`.
43
+
44
+ scale_X : bool, optional, default=True
45
+ Whether to scale `X` before computation of
46
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
47
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by dividing each row with the row of
48
+ `X`'s column-wise standard deviations. Bessel's correction for the unbiased
49
+ estimate of the sample standard deviation is used. The row of column-wise
50
+ standard deviations is computed on the training set for each fold to avoid data
51
+ leakage.
52
+
53
+ scale_Y : bool, optional, default=True
54
+ Whether to scale `Y` before computation of
55
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` by dividing each row with the row of
56
+ `Y`'s column-wise standard deviations. Bessel's correction for the unbiased
57
+ estimate of the sample standard deviation is used. The row of column-wise
58
+ standard deviations is computed on the training set for each fold to avoid data
59
+ leakage. This parameter is ignored if `Y` is `None`.
60
+
61
+ dtype : np.floating, optional, default=np.float64
62
+ The data type used for the computations. The default is `np.float64`.
63
+
64
+ copy : bool, optional, default=True
65
+ Whether to make a copy of the input arrays. If `False` and the input arrays are
66
+ already NumPy arrays of type `dtype`, then no copy is made. If `False` and the
67
+ input arrays are not NumPy arrays of type `dtype`, then a copy is made. If
68
+ `True` a copy is always made. If no copy is made, then external modifications
69
+ to `X` or `Y` will result in undefined behavior.
70
+ """
71
+
72
def __init__(
    self,
    cv_splits: Iterable[Hashable],
    center_X: bool = True,
    center_Y: bool = True,
    scale_X: bool = True,
    scale_Y: bool = True,
    dtype: np.floating = np.float64,
    copy: bool = True,
) -> None:
    """
    Store the pre-processing configuration and register the folds.

    All data-dependent attributes are initialised to `None`; the data
    matrices, their dimensions and the dataset-wide products are assigned
    later by `fit`.

    Parameters
    ----------
    cv_splits : Iterable of Hashable
        Fold label of every sample. Forwarded unchanged to
        `_init_val_folds_dict`.

    center_X, center_Y, scale_X, scale_Y : bool, optional, default=True
        Centering/scaling switches, stored as given.

    dtype : np.floating, optional, default=np.float64
        Data type used for the computations, stored as given.

    copy : bool, optional, default=True
        Copy policy for the input arrays, stored as given.
    """
    # Configuration supplied by the caller.
    self.center_X, self.center_Y = center_X, center_Y
    self.scale_X, self.scale_Y = scale_X, scale_Y
    self.dtype, self.copy = dtype, copy

    # Data-dependent state; nothing is known until data has been provided.
    for placeholder in (
        "X_total",
        "Y_total",
        "N",
        "K",
        "M",
        "X_total_mean",
        "Y_total_mean",
        "XTX_total",
        "XTY_total",
        "sum_X_total",
        "sum_Y_total",
        "sum_sq_X_total",
        "sum_sq_Y_total",
        "val_folds_dict",
    ):
        setattr(self, placeholder, None)

    # Must run last: the helper receives a fully initialised instance.
    # NOTE(review): presumably this fills `val_folds_dict` -- confirm
    # against the helper's definition.
    self._init_val_folds_dict(cv_splits)
103
+
104
def fit(self, X: npt.ArrayLike, Y: Union[None, npt.ArrayLike] = None) -> None:
    r"""
    Loads and stores `X` and `Y` for cross-validation. Computes dataset-wide
    :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and, if `Y` is not `None`,
    :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`, then delegates the computation
    of the global statistics to `_init_total_stats`.

    Calling `fit` again replaces the previously stored data. If `Y` is `None`,
    any response data kept from an earlier call is discarded so that the
    instance never combines a new `X` with an old `Y`.

    Parameters
    ----------
    X : Array-like of shape (N, K) or (N,)
        Predictor variables.

    Y : None or array-like of shape (N, M) or (N,), optional, default=None
        Response variables. If `None`, subsequent calls to training_XTY and
        training_XTX_XTY will raise a `ValueError`.
    """
    self.X_total = self._init_mat(X)
    self.N, self.K = self.X_total.shape
    self.XTX_total = self.X_total.T @ self.X_total
    if Y is None:
        # Reset every response-related attribute. Without this, a refit
        # without `Y` would leave the previous `Y_total`/`XTY_total` in place
        # and `training_XTY` would return stale results instead of raising.
        self.Y_total = None
        self.M = None
        self.XTY_total = None
        self.Y_total_mean = None
        self.sum_Y_total = None
        self.sum_sq_Y_total = None
    else:
        self.Y_total = self._init_mat(Y)
        self.M = self.Y_total.shape[1]
        self.XTY_total = self.X_total.T @ self.Y_total
    self._init_total_stats()
129
+
130
def training_XTX(self, val_fold: Hashable) -> np.ndarray:
    r"""
    Compute :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` over the training
    samples, i.e. all samples that are not part of `val_fold`.

    Parameters
    ----------
    val_fold : Hashable
        Label of the fold that is held out for validation.

    Returns
    -------
    Array of shape (K, K)
        The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`.

    Raises
    ------
    ValueError
        If `val_fold` is not one of the folds given through `cv_splits` in
        the constructor.

    See Also
    --------
    training_XTY :
        Training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` only.
    training_XTX_XTY :
        Both matrices in one call; documented as faster than calling
        `training_XTX` and `training_XTY` separately.
    """
    return self._training_matrices(
        return_XTX=True, return_XTY=False, val_fold=val_fold
    )
163
+
164
def training_XTY(self, val_fold: Hashable) -> np.ndarray:
    r"""
    Compute :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` over the training
    samples, i.e. all samples that are not part of `val_fold`.

    Parameters
    ----------
    val_fold : Hashable
        Label of the fold that is held out for validation.

    Returns
    -------
    Array of shape (K, M)
        The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.

    Raises
    ------
    ValueError
        If no `Y` has been stored (`Y` is `None`).

    ValueError
        If `val_fold` is not one of the folds given through `cv_splits` in
        the constructor.

    See Also
    --------
    training_XTX :
        Training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` only.
    training_XTX_XTY :
        Both matrices in one call; documented as faster than calling
        `training_XTX` and `training_XTY` separately.
    """
    return self._training_matrices(
        return_XTX=False, return_XTY=True, val_fold=val_fold
    )
200
+
201
def training_XTX_XTY(self, val_fold: Hashable) -> tuple[np.ndarray, np.ndarray]:
    r"""
    Compute both :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and
    :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` over the training samples, i.e.
    all samples that are not part of `val_fold`.

    Parameters
    ----------
    val_fold : Hashable
        Label of the fold that is held out for validation.

    Returns
    -------
    tuple of arrays of shapes (K, K) and (K, M)
        The training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` followed by
        the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.

    Raises
    ------
    ValueError
        If no `Y` has been stored (`Y` is `None`).

    ValueError
        If `val_fold` is not one of the folds given through `cv_splits` in
        the constructor.

    See Also
    --------
    training_XTX :
        Training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` only.
    training_XTY :
        Training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` only.
    """
    return self._training_matrices(
        return_XTX=True, return_XTY=True, val_fold=val_fold
    )
237
+
238
def _training_matrices(
    self,
    return_XTX: bool,
    return_XTY: bool,
    val_fold: Hashable
) -> Union[np.ndarray, tuple[np.ndarray, np.ndarray]]:
    r"""
    Build the training set :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` and/or
    :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}` for the samples that do not
    belong to `val_fold`.

    Parameters
    ----------
    return_XTX : bool
        Whether to return the training set
        :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}`.

    return_XTY : bool
        Whether to return the training set
        :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.

    val_fold : Hashable
        Label of the fold that is held out for validation.

    Returns
    -------
    Array of shape (K, K) or (K, M) or tuple of arrays of shapes (K, K) and (K, M)
        A single array when exactly one matrix is requested; otherwise the
        tuple `(XTX, XTY)`.

    Raises
    ------
    ValueError
        If both `return_XTX` and `return_XTY` are `False` or if `return_XTY` is
        `True` and `Y` is `None`.

    ValueError
        If `val_fold` was not provided as a cross-validation split in the
        `cv_splits` parameter of the constructor.
    """
    if not (return_XTX or return_XTY):
        raise ValueError(
            "At least one of `return_XTX` and `return_XTY` must be True."
        )
    if return_XTY and self.Y_total is None:
        raise ValueError("Response variables `Y` are not provided.")
    try:
        val_indices = self.val_folds_dict[val_fold]
    except KeyError as e:
        raise ValueError(f"Validation fold {val_fold} not found.") from e

    # Validation rows; Y is only sliced when XTY is actually requested.
    X_val = self.X_total[val_indices]
    Y_val = self.Y_total[val_indices] if return_XTY else None

    # Training statistics stay None unless the configuration requires them.
    X_train_mean = Y_train_mean = None
    X_train_std = Y_train_std = None
    N_train = None

    needs_X_mean = self.center_X or self.center_Y or self.scale_X
    needs_Y_mean = return_XTY and (
        self.center_X or self.center_Y or self.scale_Y
    )
    if needs_X_mean or self.scale_Y:
        # Some centering or scaling is enabled: derive the sample counts
        # and the ratios consumed by the training-mean helper.
        N_val = val_indices.size
        N_train = self.N - N_val
        N_total_over_N_train = self.N / N_train
        N_val_over_N_train = N_val / N_train
        if needs_X_mean:
            X_train_mean = self._compute_training_mat_mean(
                X_val,
                self.X_total_mean,
                N_total_over_N_train,
                N_val_over_N_train,
            )
        if needs_Y_mean:
            Y_train_mean = self._compute_training_mat_mean(
                Y_val,
                self.Y_total_mean,
                N_total_over_N_train,
                N_val_over_N_train,
            )
    if self.scale_X:
        X_train_std = self._compute_training_mat_std(
            X_val,
            X_train_mean,
            self.sum_X_total,
            self.sum_sq_X_total,
            N_train,
        )
    if self.scale_Y and return_XTY:
        Y_train_std = self._compute_training_mat_std(
            Y_val,
            Y_train_mean,
            self.sum_Y_total,
            self.sum_sq_Y_total,
            N_train,
        )

    # Assemble the requested matrices, XTX first and XTY second.
    requested = []
    if return_XTX:
        requested.append(
            self._training_kernel_matrix(
                self.XTX_total,
                X_val,
                X_val,
                X_train_mean,
                X_train_mean,
                X_train_std,
                X_train_std,
                N_train,
                center=self.center_X,
            )
        )
    if return_XTY:
        requested.append(
            self._training_kernel_matrix(
                self.XTY_total,
                X_val,
                Y_val,
                X_train_mean,
                Y_train_mean,
                X_train_std,
                Y_train_std,
                N_train,
                center=self.center_X or self.center_Y,
            )
        )
    if len(requested) == 2:
        return (requested[0], requested[1])
    return requested[0]
381
+
382
+ def _training_kernel_matrix(
383
+ self,
384
+ total_kernel_mat: np.ndarray,
385
+ X_val: np.ndarray,
386
+ mat2_val: np.ndarray,
387
+ X_train_mean: Union[None, np.ndarray] = None,
388
+ mat2_train_mean: Union[None, np.ndarray] = None,
389
+ X_train_std: Union[None, np.ndarray] = None,
390
+ mat2_train_std: Union[None, np.ndarray] = None,
391
+ N_train: Union[None, int] = None,
392
+ center: bool = False,
393
+ ) -> np.ndarray:
394
+ """
395
+ Computes the training set kernel matrix for a given fold.
396
+
397
+ Parameters
398
+ ----------
399
+ total_kernel_mat : Array of shape (K, K) or (K, M)
400
+ The total kernel matrix :math:`\mathbf{X}^{\mathbf{T}}\mathbf{X}` or
401
+ :math:`\mathbf{X}^{\mathbf{T}}\mathbf{Y}`.
402
+
403
+ X_val : Array of shape (N_val, K)
404
+ The validation set of predictor variables.
405
+
406
+ mat2_val : Array of shape (N_val, K) or (N_val, M)
407
+ The validation set of predictor or response variables.
408
+
409
+ X_train_mean : None or array of shape (1, K), optional, default=None
410
+ The row of column-wise means of the training set of predictor variables.
411
+
412
+ mat2_train_mean : None or array of shape (1, K) or (1, M), optional,
413
+ default=None
414
+ The row of column-wise means of the training set of predictor or response
415
+ variables.
416
+
417
+ X_train_std : None or array of shape (1, K), optional, default=None
418
+ The row of column-wise standard deviations of the training set of predictor
419
+ variables.
420
+
421
+ mat2_train_std : None or array of shape (1, K) or (1, M), optional, default=None
422
+ The row of column-wise standard deviations of the training set of predictor
423
+ or response variables.
424
+
425
+ N_train : None or int, optional, default=None
426
+ The size of the training set. Only used if `center` is `True`, in which
427
+ case it must not be `None`.
428
+
429
+ center : bool, optional, default=False
430
+ Whether to center the kernel matrix. If `True`, the kernel matrix is
431
+ centered. Setting this parameter to `True` requires that `X_train_mean` and
432
+ `mat2_train_mean` are not `None`.
433
+
434
+ Returns
435
+ -------
436
+ Array of shape (K, K) or (K, M)
437
+ The training set kernel matrix.
438
+ """
439
+ XTmat2_train = total_kernel_mat - X_val.T @ mat2_val
440
+ if center:
441
+ XTmat2_train -= N_train * (X_train_mean.T @ mat2_train_mean)
442
+ if X_train_std is not None and mat2_train_std is not None:
443
+ return XTmat2_train / (X_train_std.T @ mat2_train_std)
444
+ if X_train_std is not None:
445
+ return XTmat2_train / X_train_std.T
446
+ if mat2_train_std is not None:
447
+ return XTmat2_train / mat2_train_std
448
+ return XTmat2_train
449
+
450
+ def _compute_training_mat_mean(
451
+ self,
452
+ mat_val: np.ndarray,
453
+ mat_total_mean: np.ndarray,
454
+ N_total_over_N_train: float,
455
+ N_val_over_N_train: float
456
+ ) -> np.ndarray:
457
+ """
458
+ Computes the row of column-wise means of a matrix for a given fold.
459
+
460
+ Parameters
461
+ ----------
462
+ mat_val : Array of shape (N_val, K) or (N_val, M)
463
+ The validation set of `X` or `Y`.
464
+
465
+ mat_total_mean : Array of shape (1, K) or (1, M)
466
+ The row of column-wise means of the total matrix.
467
+
468
+ N_total_over_N_train : float
469
+ The ratio of the total number of samples to the number of samples in the
470
+ training set.
471
+
472
+ N_val_over_N_train : float
473
+ The ratio of the number of samples in the validation set to the number of
474
+ samples in the training set.
475
+
476
+ Returns
477
+ -------
478
+ Array of shape (1, K) or (1, M)
479
+ The row of column-wise means of the training set matrix.
480
+ """
481
+ return (
482
+ N_total_over_N_train * mat_total_mean
483
+ - N_val_over_N_train * mat_val.mean(axis=0, keepdims=True)
484
+ )
485
+
486
+ def _compute_training_mat_std(
487
+ self,
488
+ mat_val: np.ndarray,
489
+ mat_train_mean: np.ndarray,
490
+ sum_mat_total: np.ndarray,
491
+ sum_sq_mat_total: np.ndarray,
492
+ N_train: int
493
+ ) -> np.ndarray:
494
+ """
495
+ Computes the row of column-wise standard deviations of a matrix for a given
496
+ fold.
497
+
498
+ Parameters
499
+ ----------
500
+ mat_val : Array of shape (N_val, K) or (N_val, M)
501
+ The validation set of `X` or `Y`.
502
+
503
+ mat_train_mean : Array of shape (1, K) or (1, M)
504
+ The row of column-wise means of the training matrix.
505
+
506
+ sum_mat_total : Array of shape (1, K) or (1, M)
507
+ The row of column-wise sums of the total matrix.
508
+
509
+ sum_sq_mat_total : Array of shape (1, K) or (1, M)
510
+ The row of column-wise sums of squares of the total matrix.
511
+
512
+ N_train : int
513
+ The size of the training set.
514
+
515
+ Returns
516
+ -------
517
+ Array of shape (1, K) or (1, M)
518
+ The row of column-wise standard deviations of the training set matrix.
519
+ """
520
+ train_sum_mat = sum_mat_total - np.expand_dims(
521
+ np.einsum("ij -> j", mat_val), axis=0
522
+ )
523
+ train_sum_sq_mat = sum_sq_mat_total - np.expand_dims(
524
+ np.einsum("ij,ij -> j", mat_val, mat_val), axis=0
525
+ )
526
+ mat_train_std = np.sqrt(
527
+ 1
528
+ / (N_train - 1)
529
+ * (
530
+ -2 * mat_train_mean * train_sum_mat
531
+ + N_train
532
+ * np.einsum("ij,ij -> ij", mat_train_mean, mat_train_mean)
533
+ + train_sum_sq_mat
534
+ )
535
+ )
536
+ mat_train_std[mat_train_std == 0] = 1
537
+ return mat_train_std
538
+
539
+ def _init_mat(self, mat: np.ndarray) -> np.ndarray:
540
+ """
541
+ Casts the matrix to the dtype specified in the constructor and reshapes it if
542
+ the matrix is one-dimensional.
543
+
544
+ Parameters
545
+ ----------
546
+ mat : Array of shape (N, K) or (N, M) or (N,)
547
+ The matrix to be initialized.
548
+
549
+ Returns
550
+ -------
551
+ Array of shape (N, K) or (N, M) or (N, 1)
552
+ The initialized matrix.
553
+ """
554
+ mat = np.asarray(mat, dtype=self.dtype)
555
+ if self.copy and mat.dtype == self.dtype:
556
+ mat = mat.copy()
557
+ if mat.ndim == 1:
558
+ mat = mat.reshape(-1, 1)
559
+ return mat
560
+
561
+ def _init_total_stats(self) -> None:
562
+ """
563
+ Initializes the global statistics for `X` and `Y`.
564
+ """
565
+ if self.center_X or self.center_Y or self.scale_X:
566
+ self.X_total_mean = self.X_total.mean(axis=0, keepdims=True)
567
+ else:
568
+ self.X_total_mean = None
569
+ if (
570
+ (self.center_X or self.center_Y or self.scale_Y)
571
+ and self.Y_total is not None
572
+ ):
573
+ self.Y_total_mean = self.Y_total.mean(axis=0, keepdims=True)
574
+ else:
575
+ self.Y_total_mean = None
576
+ if self.scale_X:
577
+ self.sum_X_total = np.expand_dims(
578
+ np.einsum("ij -> j", self.X_total), axis=0
579
+ )
580
+ self.sum_sq_X_total = np.expand_dims(
581
+ np.einsum("ij,ij -> j", self.X_total, self.X_total), axis=0
582
+ )
583
+ else:
584
+ self.sum_X_total = None
585
+ self.sum_sq_X_total = None
586
+ if self.scale_Y and self.Y_total is not None:
587
+ self.sum_Y_total = np.expand_dims(
588
+ np.einsum("ij -> j", self.Y_total), axis=0
589
+ )
590
+ self.sum_sq_Y_total = np.expand_dims(
591
+ np.einsum("ij,ij -> j", self.Y_total, self.Y_total), axis=0
592
+ )
593
+ else:
594
+ self.sum_Y_total = None
595
+ self.sum_sq_Y_total = None
596
+
597
+ def _init_val_folds_dict(
598
+ self, cv_splits: Iterable[Hashable]
599
+ ) -> dict[Hashable, npt.NDArray[np.int_]]:
600
+ """
601
+ Generates a dictionary of validation indices for each fold. The dictionary is
602
+ stored in the `val_folds_dict` attribute. The dictionary is used to quickly
603
+ access the validation indices for each fold.
604
+
605
+ Parameters
606
+ ----------
607
+ cv_splits : Iterable of Hashable with N elements
608
+ An iterable defining cross-validation splits. Each unique value in
609
+ `cv_splits` corresponds to a different fold.
610
+ """
611
+ val_folds_dict = {}
612
+ for i, num in enumerate(cv_splits):
613
+ try:
614
+ val_folds_dict[num].append(i)
615
+ except KeyError:
616
+ val_folds_dict[num] = [i]
617
+ for key in val_folds_dict:
618
+ val_folds_dict[key] = np.asarray(val_folds_dict[key], dtype=int)
619
+ self.val_folds_dict = val_folds_dict
@@ -0,0 +1,18 @@
1
+ [tool.poetry]
2
+ name = "cvmatrix"
3
+ version = "1.0.0"
4
+ description = "Fast computation of possibly centered/scaled training set kernel matrices in a cross-validation setting."
5
+ authors = ["Sm00thix <oleemail@icloud.com>"]
6
+ maintainers = ["Sm00thix <oleemail@icloud.com>"]
7
+ license = "Apache-2.0"
8
+ readme = "README.md"
9
+ homepage = "https://cvmatrix.readthedocs.io/en/latest/"
10
+ repository = "https://github.com/Sm00thix/CVMatrix"
11
+
12
+ [tool.poetry.dependencies]
13
+ python = ">=3.9, <3.13"
14
+ numpy = "^1.26.4"
15
+
16
+ [build-system]
17
+ requires = ["poetry-core"]
18
+ build-backend = "poetry.core.masonry.api"