twoblock 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- twoblock-0.0.1/LICENSE +21 -0
- twoblock-0.0.1/PKG-INFO +27 -0
- twoblock-0.0.1/README.md +10 -0
- twoblock-0.0.1/setup.cfg +7 -0
- twoblock-0.0.1/setup.py +54 -0
- twoblock-0.0.1/src/twoblock/__init__.py +17 -0
- twoblock-0.0.1/src/twoblock/_preproc_utilities.py +308 -0
- twoblock-0.0.1/src/twoblock/robcent.py +319 -0
- twoblock-0.0.1/src/twoblock/test_twoblock.py +75 -0
- twoblock-0.0.1/src/twoblock/twoblock.py +262 -0
- twoblock-0.0.1/src/twoblock/utils.py +76 -0
- twoblock-0.0.1/src/twoblock.egg-info/PKG-INFO +27 -0
- twoblock-0.0.1/src/twoblock.egg-info/SOURCES.txt +15 -0
- twoblock-0.0.1/src/twoblock.egg-info/dependency_links.txt +1 -0
- twoblock-0.0.1/src/twoblock.egg-info/requires.txt +4 -0
- twoblock-0.0.1/src/twoblock.egg-info/top_level.txt +1 -0
twoblock-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Sven Serneels
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
twoblock-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: twoblock
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Scikit-Learn Compatible Library for Simultaneous Two-Block Sufficient Dimension Reduction Methods
|
|
5
|
+
Home-page: https://github.com/SvenSerneels/twoblock
|
|
6
|
+
Author: Sven Serneels
|
|
7
|
+
Author-email: svenserneels@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.22.0
|
|
14
|
+
Requires-Dist: scipy>=1.8.0
|
|
15
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
16
|
+
Requires-Dist: pandas>=1.4.0
|
|
17
|
+
|
|
18
|
+
# twoblock
|
|
19
|
+
Two-block simultaneous dimension reduction
|
|
20
|
+
|
|
21
|
+
A scikit-learn compatible implementation of simultaneous two-block dimension reduction as proposed in [1].
|
|
22
|
+
|
|
23
|
+
References
|
|
24
|
+
----------
|
|
25
|
+
[1] Cook, R. Dennis, Liliana Forzani, and Lan Liu.
|
|
26
|
+
"Partial least squares for simultaneous reduction of response and predictor
|
|
27
|
+
vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.
|
twoblock-0.0.1/README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# twoblock
|
|
2
|
+
Two-block simultaneous dimension reduction
|
|
3
|
+
|
|
4
|
+
A scikit-learn compatible implementation of simultaneous two-block dimension reduction as proposed in [1].
|
|
5
|
+
|
|
6
|
+
References
|
|
7
|
+
----------
|
|
8
|
+
[1] Cook, R. Dennis, Liliana Forzani, and Lan Liu.
|
|
9
|
+
"Partial least squares for simultaneous reduction of response and predictor
|
|
10
|
+
vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.
|
twoblock-0.0.1/setup.cfg
ADDED
twoblock-0.0.1/setup.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Setuptools build script for the ``twoblock`` package.

Reads version/author metadata from the in-tree package (under ``src``)
and the long description from README.md.

@author: Sven Serneels, Ponalytics
"""

import os
import sys

from setuptools import find_packages, setup

# Make the in-tree package importable so its metadata can be read
# without installing it first.  (Fix: removed unused `import re`;
# "./src" normalized to "src".)
SRC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)
from twoblock import __author__, __license__, __version__  # noqa: E402

readme_file = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "README.md"
)
try:
    from m2r import parse_from_file

    readme = parse_from_file(readme_file)
except ImportError:
    # m2r may not be installed in the build environment; fall back to raw text.
    with open(readme_file) as f:
        readme = f.read()

setup(
    name="twoblock",
    version=__version__,
    author=__author__,
    author_email="svenserneels@gmail.com",
    description=(
        "A Scikit-Learn Compatible Library for Simultaneous Two-Block "
        "Sufficient Dimension Reduction Methods"
    ),
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/SvenSerneels/twoblock",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    packages=find_packages("src"),  # include all packages under src
    package_dir={"": "src"},  # tell setuptools packages live under src
    include_package_data=True,
    install_requires=[
        "numpy>=1.22.0",
        "scipy>=1.8.0",
        "scikit-learn>=1.3.0",
        "pandas>=1.4.0",
    ],
)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""twoblock: simultaneous two-block sufficient dimension reduction.

Package metadata consumed by setup.py, plus the public estimator import.

@author: Sven Serneels, Ponalytics
"""

__author__ = "Sven Serneels"
__license__ = "MIT"
__version__ = "0.0.1"
__date__ = "2024-08-07"

# Fix: the previous `__name__ = "twoblock"` assignment has been removed.
# The import system already sets the module's __name__; reassigning the
# dunder is redundant and a known anti-pattern.

from .twoblock import twoblock
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on Sat Dec 21 10:55:24 2019
|
|
5
|
+
|
|
6
|
+
Set of help functions for robust centring and scaling
|
|
7
|
+
|
|
8
|
+
@author: Sven Serneels, Ponalytics
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as ps
|
|
13
|
+
import scipy.stats as sps
|
|
14
|
+
import scipy.optimize as spo
|
|
15
|
+
import copy
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _handle_zeros_in_scale(scale, copy=True):
|
|
19
|
+
"""
|
|
20
|
+
Makes sure that whenever scale is zero, we handle it correctly.
|
|
21
|
+
This happens in most scalers when we have constant features.
|
|
22
|
+
Taken from ScikitLearn.preprocesssing"""
|
|
23
|
+
|
|
24
|
+
# if we are fitting on 1D arrays, scale might be a scalar
|
|
25
|
+
if np.isscalar(scale):
|
|
26
|
+
if scale == 0.0:
|
|
27
|
+
scale = 1.0
|
|
28
|
+
return scale
|
|
29
|
+
elif isinstance(scale, np.ndarray):
|
|
30
|
+
if copy:
|
|
31
|
+
# New array to avoid side-effects
|
|
32
|
+
scale = scale.copy()
|
|
33
|
+
scale[scale == 0.0] = 1.0
|
|
34
|
+
return scale
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _check_trimming(t):
|
|
38
|
+
|
|
39
|
+
if (t > 0.99) or (t < 0):
|
|
40
|
+
raise (ValueError("Trimming fraction must be in [0,1)"))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def mad(X, c=0.6744897501960817, **kwargs):
    """Column-wise median absolute deviation.

    Parameters
    ----------
    X : array-like data
    c : float, consistency factor (the default makes the MAD consistent
        at the normal distribution)
    **kwargs : accepted so the function can be called generically from
        ``scale_data``.
    """

    deviations = np.abs(X - median(X, axis=0))
    # statsmodels.robust.mad is less flexible toward matrix input and
    # sometimes throws a ValueError in a ufunc, hence this implementation.
    scaled = median(deviations, axis=0) / c
    return np.array(scaled).reshape(-1)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def median(X, **kwargs):
    """Column-wise median, NaN-aware.

    **kwargs are accepted so the function can be called generically from
    ``scale_data``.
    """

    # Use the NaN-ignoring variant only when missings are present.
    estimator = np.nanmedian if np.isnan(X).any() else np.median
    return np.array(estimator(X, axis=0)).reshape(-1)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def mean(X, trimming=0):
    """Column-wise mean or trimmed mean.

    Parameters
    ----------
    X : array-like data
    trimming : float, fraction trimmed from each tail (0 = ordinary mean)
    """

    if trimming == 0:
        # NaN-aware ordinary mean.
        if np.isnan(X).any():
            return np.nanmean(X, axis=0)
        return np.mean(X, axis=0)
    # NOTE: sps.trim_mean returns all NaN if X contains missing values.
    return sps.trim_mean(X, trimming, 0)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def std(X, trimming=0):
    """Column-wise standard deviation or trimmed standard deviation.

    Parameters
    ----------
    X : array-like data
    trimming : float, fraction trimmed from each tail (0 = ordinary std)

    Returns
    -------
    np.ndarray of shape (p,), the scale estimate per column.
    """

    if trimming == 0:
        # NaN-aware population standard deviation.
        if np.isnan(X).any():
            s = np.power(np.nanvar(X, axis=0), 0.5)
        else:
            s = np.power(np.var(X, axis=0), 0.5)
    else:
        # Trimmed variance: trimmed mean of squared deviations from the
        # trimmed mean. Returns all NaN if X contains missing values.
        var = sps.trim_mean(
            np.square(X - sps.trim_mean(X, trimming, 0)), trimming, 0
        )
        s = np.sqrt(var)
    # Fix: reshape in BOTH branches; previously only the untrimmed branch
    # flattened the result, so output shapes were inconsistent.
    return np.array(s).reshape(-1)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _euclidnorm(x):
|
|
109
|
+
"""
|
|
110
|
+
Euclidean norm of a vector
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
if np.isnan(x).any():
|
|
114
|
+
return np.sqrt(np.nansum(np.square(x)))
|
|
115
|
+
else:
|
|
116
|
+
return np.sqrt(np.sum(np.square(x)))
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _diffmat_objective(a, X):
|
|
120
|
+
"""
|
|
121
|
+
Utility to l1median, matrix of differences
|
|
122
|
+
"""
|
|
123
|
+
|
|
124
|
+
(n, p) = X.shape
|
|
125
|
+
return X - np.tile(a, (n, 1))
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _l1m_objective(a, X, *args):
    """Objective for the l1median: total row-wise distance of X from ``a``."""

    distances = np.apply_along_axis(_euclidnorm, 1, _diffmat_objective(a, X))
    if np.isnan(X).any():
        return np.nansum(distances)
    return np.sum(distances)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _l1m_jacobian(a, X):
    """Jacobian of the l1median objective at ``a``."""

    n, p = X.shape
    diffs = _diffmat_objective(a, X)
    norms = np.apply_along_axis(_euclidnorm, 1, diffs)
    # Avoid division by zero for data points that coincide with ``a``.
    norms = _handle_zeros_in_scale(norms)
    unit = diffs / np.tile(np.array(norms).reshape(n, 1), (1, p))
    if np.isnan(X).any():
        return -np.nansum(unit, axis=0)
    return -np.sum(unit, axis=0)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _l1median(X, x0, method="SLSQP", tol=1e-8, options=None, **kwargs):
    """Numerically minimize the l1median objective.

    Parameters
    ----------
    X : np.ndarray (n, p), the data
    x0 : np.ndarray (p,), starting value for the optimization
    method : str, any scipy.optimize.minimize algorithm
    tol : float, convergence tolerance
    options : dict or None, options forwarded to scipy.optimize.minimize.
        Defaults to {"maxiter": 2000}.  (Fix: the default used to be a
        mutable dict literal shared across calls.)

    Returns
    -------
    scipy.optimize.OptimizeResult, with the estimate in ``.x``.
    """

    if options is None:
        options = {"maxiter": 2000}
    return spo.minimize(
        _l1m_objective,
        x0,
        args=(X,),  # fix: args must be a tuple; (X) was just X
        jac=_l1m_jacobian,
        tol=tol,
        options=options,
        method=method,
    )
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def l1median(X, **kwargs):
    """Spatial (l1) median of the rows of X.

    Wrapper that converts matrix input to an array, since some
    scipy.optimize algorithms crash on matrix input.

    Keyword arguments: ``x0`` (starting point for the optimization,
    defaults to the column-wise median), plus anything accepted by
    ``_l1median`` (``method``, ``tol``, ``options``).
    """

    # BUG FIX: previously the local x0 was only assigned when the kwarg was
    # absent, so passing x0 explicitly raised a NameError and would also
    # have duplicated the positional argument in the _l1median call.
    x0 = kwargs.pop("x0", None)

    if isinstance(X, np.matrix):  # idiom fix: isinstance over type ==
        X = np.array(X)

    if len(X.shape) == 2:
        p = X.shape[1]
    else:
        p = 1

    # Univariate case: the spatial median reduces to the ordinary median.
    if p < 2:
        return median(X)

    if x0 is None:
        x0 = median(X)
    return _l1median(X, x0, **kwargs).x
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def kstepLTS(X, maxit=5, tol=1e-10, **kwargs):
    """
    Computes the K-step LTS (least trimmed squares) estimator of location.

    It uses the spatial median as a starting value, and yields an
    estimator with improved statistical efficiency, but at a higher
    computational cost.

    Inputs:
        X: data matrix, shape (n, p)
        maxit: maximum number of iterations
        tol: convergence tolerance on the location update
        **kwargs: accepted so the function can be called generically
    Outputs:
        m2: location estimate, shape (p,)
    """
    n, p = X.shape
    m1 = l1median(X)  # initial estimate: the spatial median
    m2 = copy.deepcopy(m1)
    iteration = 0
    unconverged = True
    while unconverged and (iteration < maxit):
        # Squared Euclidean distance of each row to the current centre
        # (NaN entries are ignored in the sum when present).
        if np.isnan(X).any():
            dists = np.nansum(np.square(X - m1), axis=1)
        else:
            dists = np.sum(np.square(X - m1), axis=1)
        # Distance of the median-closest observation: defines the h-subset
        # (roughly the closest half of the rows to the current centre).
        cutdist = np.sort(dists, axis=0)[int(np.floor((n + 1) / 2)) - 1]
        hsubset = np.where(dists <= cutdist)[0]
        # C-step: re-estimate the centre as the mean of the h-subset.
        m2 = np.array(mean(X[hsubset, :])).reshape((p,))
        # Converged when the largest coordinate-wise update is below tol.
        unconverged = max(abs(m1 - m2)) > tol
        iteration += 1
        m1 = copy.deepcopy(m2)

    return m2
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def scaleTau2(x0, c1=4.5, c2=3, consistency=True, **kwargs):
    """Column-wise tau estimator of scale.

    Parameters
    ----------
    x0 : array or matrix of shape (n, p), the data
    c1 : float, tuning/consistency factor for the initial weighted mean
    c2 : float, tuning/consistency factor for the final rho function
    consistency : bool or str, one of False, True or "finiteSample"
    **kwargs : accepted so the function can be called generically from
        the scalers in ``robcent``.

    Returns
    -------
    np.ndarray of shape (p,), the scale estimate per column.
    """

    x = copy.deepcopy(x0)
    n, p = x.shape
    # NaN-aware summation only when missings are present.
    if np.isnan(x).any():
        summ = np.nansum
    else:
        summ = np.sum
    medx = median(x)
    xc = abs(x - medx)
    sigma0 = median(xc)  # initial scale: columnwise MAD (unnormalized)
    if c1 > 0:
        xc /= sigma0 * c1
        w = 1 - np.square(xc)
        # Biweight-style weights, clipped at zero.
        w = np.square((abs(w) + w) / 2)
        # BUG FIX: sums are taken per column (axis=0); previously they ran
        # over the whole matrix, mixing columns whenever p > 1.
        mu = summ(np.multiply(x, w), axis=0) / summ(w, axis=0)
    else:
        mu = medx
    x -= mu
    x /= sigma0
    rho = np.square(x)
    # BUG FIX: cap rho elementwise at c2**2; the old row-index assignment
    # (rho[np.where(...)[0]] = ...) clobbered entire rows for p > 1.
    rho[rho > c2**2] = c2**2
    if consistency:

        def Erho(b):
            # Expectation of the capped-square rho under the standard normal.
            return (
                2 * ((1 - b**2) * sps.norm.cdf(b) - b * sps.norm.pdf(b) + b**2)
                - 1
            )

        def Es2(c2):
            return Erho(c2 * sps.norm.ppf(3 / 4))

        if consistency == "finiteSample":
            nEs2 = (n - 2) * Es2(c2)
        else:
            nEs2 = n * Es2(c2)
    else:
        nEs2 = n
    return np.array(sigma0 * np.sqrt(summ(rho, axis=0) / nEs2)).reshape((p,))
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def scale_data(X, m, s):
    """Column-wise centring and scaling of X on location m and scale s.

    Parameters
    ----------
    X : array-like of shape (n, p) or (n,), the data
    m : location estimate (length-p vector, or scalar-like when p == 1)
    s : scale estimate (length-p vector, or scalar-like when p == 1)
    """

    shape = X.shape
    p = shape[1] if len(shape) > 1 else 1
    n = shape[0]

    # Guard against constant columns (zero scale).
    s = _handle_zeros_in_scale(s)

    if p == 1:
        return (X - float(m)) / s
    # Stack location/scale to full (n, p) matrices so the arithmetic also
    # works for matrix-like inputs.
    m_full = np.array([m for _ in range(n)])
    s_full = np.array([s for _ in range(n)])
    return (X - m_full) / s_full
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# @author: Sven Serneels, Ponalytics
|
|
4
|
+
# Created on Sun Feb 4 2018
|
|
5
|
+
# Updated on Sun Dec 16 2018
|
|
6
|
+
# Refactored on Sat Dec 21 2019
|
|
7
|
+
# Refactored on Sat Mar 28 2020
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Class for classical and robust centering and scaling of input data for
|
|
11
|
+
# regression and machine learning
|
|
12
|
+
|
|
13
|
+
# Version 2.0: Code entirely restructured compared to version 1.0.
|
|
14
|
+
# Code made consistent with sklearn logic: fit(data,params) yields results.
|
|
15
|
+
# Code makes more efficient use of numpy builtin estimators.
|
|
16
|
+
# Version 3.0:
|
|
17
|
+
# Code now takes strings or functions as input to centring and scaling.
|
|
18
|
+
# Utility functions have been moved to _preproc_utilities.py
|
|
19
|
+
# Code now supplied for l1median centring, with options to use different
|
|
20
|
+
# scipy.optimize optimization algorithms
|
|
21
|
+
# Version 4.0:
|
|
22
|
+
# Made the API compatible for ScikitLearn pipelines. However, some nonstandard
|
|
23
|
+
# functions and output remain for backwards compatibility. Functionality for
|
|
24
|
+
# sparse matrices still has to be implemented.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Ancillary functions in _preproc_utilities.py:
|
|
28
|
+
|
|
29
|
+
# - `scale_data(X,m,s)`: centers and scales X on center m (as vector) and scale s (as vector).
|
|
30
|
+
# - `mean(X,trimming)`: Column-wise mean.
|
|
31
|
+
# - `median(X)`: Column-wise median.
|
|
32
|
+
# - `l1median(X)`: L1 or spatial median. Optional arguments:
|
|
33
|
+
# - `x0`: starting point for optimization, defaults to column wise median
|
|
34
|
+
# - `method`: optimization algorithm, defaults to 'SLSQP'
|
|
35
|
+
# - `tol`: tolerance, defaults to 1e-8
|
|
36
|
+
# - `options`: list of options for `scipy.optimize.minimize`
|
|
37
|
+
# - `kstepLTS(X): k-step LTS estimator of location.
|
|
38
|
+
# - `maxit`: int, number of iterations to compute maximally
|
|
39
|
+
# - `tol`: float, tolerance for convergence
|
|
40
|
+
# - `std(X,trimming)`: Column-wise std.
|
|
41
|
+
# - `mad(X,c)`: Column-wise median absolute deviation, with consistency factor c.
|
|
42
|
+
# - `scaleTau2(x0, c1 = 4.5, c2 = 3, consistency = True)`: Tau estimator of scale
|
|
43
|
+
# with consistency parameters c1 and c2 and option for consistency correction
|
|
44
|
+
# (True, False or 'finiteSample')
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
48
|
+
from sklearn.utils.metaestimators import _BaseComposition
|
|
49
|
+
from sklearn.utils.validation import check_is_fitted
|
|
50
|
+
import numpy as np
|
|
51
|
+
from .utils import (
|
|
52
|
+
convert_X_input,
|
|
53
|
+
convert_y_input,
|
|
54
|
+
_check_input,
|
|
55
|
+
)
|
|
56
|
+
from ._preproc_utilities import *
|
|
57
|
+
from ._preproc_utilities import _check_trimming
|
|
58
|
+
|
|
59
|
+
__all__ = ["VersatileScaler", "robcent", "versatile_scale", "Wrapper", "wrap"]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class VersatileScaler(_BaseComposition, TransformerMixin, BaseEstimator):
    """Center and scale data about classical or robust location/scale estimates.

    Parameters
    ----------
    center : str or callable
        Location estimator. A string must be the name of one of the
        estimator functions imported from ``_preproc_utilities`` (e.g.
        ``'mean'``, ``'median'``, ``'l1median'``, ``'kstepLTS'``) or
        ``'None'`` for no centring.
    scale : str or callable
        Scale estimator (e.g. ``'std'``, ``'mad'``, ``'scaleTau2'``) or
        ``'None'`` for no scaling.
    trimming : float
        Trimming fraction used by estimators that support it.

    Attributes
    ----------
    center_ : location estimate (``col_loc_`` kept as a backwards-compatible
        alias)
    scale_ : scale estimate (``col_sca_`` kept as a backwards-compatible
        alias)
    datas_ : most recently transformed training data

    Remarks
    -------
    The classical estimators 'mean' and 'std' also give access to robust
    trimmed versions through ``trimming``.
    """

    def __init__(self, center="mean", scale="std", trimming=0):
        self.center = center
        self.scale = scale
        self.trimming = trimming

    def fit(self, X, y=None):
        """Estimate location and scale and store them on the estimator.

        Parameters
        ----------
        X : array-like, n x p, the data
        y : ignored, accepted for scikit-learn pipeline compatibility.

        Returns
        -------
        self  (fix: previously returned None, breaking sklearn chaining)
        """

        X = _check_input(X)
        _check_trimming(self.trimming)

        # NOTE(security): string estimator names are resolved with eval();
        # never pass untrusted strings as `center`/`scale`.
        center = eval(self.center) if type(self.center) is str else self.center
        scale = eval(self.scale) if type(self.scale) is str else self.scale

        shape = X.shape
        p = shape[1] if len(shape) > 1 else 1

        if self.center == "None":
            m = np.repeat(0, p)
        else:
            m = center(X, trimming=self.trimming)
        # col_loc_ kept for older-version compatibility; center_ is the
        # sklearn-style name.
        self.col_loc_ = m
        self.center_ = m

        if self.scale == "None":
            s = np.repeat(1, p)
        else:
            s = scale(X, trimming=self.trimming)
        # col_sca_ kept for older-version compatibility.
        self.col_sca_ = s
        self.scale_ = s

        return self

    def transform(self, X):
        """Center and/or scale data with the fitted location and scale."""

        X = _check_input(X)
        check_is_fitted(self, ["center_", "scale_"])

        Xs = scale_data(X, self.center_, self.scale_)
        self.datas_ = Xs

        return Xs

    def predict(self, Xn):
        """Standardize new data on the previously estimated location and
        scale. The number of columns needs to match."""

        Xn = _check_input(Xn)
        Xns = scale_data(Xn, self.col_loc_, self.col_sca_)
        self.datans_ = Xns

        return Xns

    def fit_transform(self, X, y=None):
        """Estimate center and scale for the training data, then scale them.

        ``y`` is ignored (scikit-learn pipeline compatibility).
        """

        self.fit(X)
        self.transform(X)

        return self.datas_

    def inverse_transform(self, Xs=None):
        """Transform scaled data back to their original scale.

        When ``Xs`` is None, the stored training data ``datas_`` are used.
        """

        check_is_fitted(self, ["center_", "scale_"])
        if Xs is not None:
            Xs = _check_input(Xs)
        else:
            Xs = self.datas_

        return np.multiply(Xs, self.scale_) + self.center_
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Backwards-compatible alias: earlier releases exposed this class as `robcent`.
robcent = VersatileScaler
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def versatile_scale(X, center="l1median", scale="mad", trimming=0):
    """Functional one-shot wrapper around ``VersatileScaler``.

    Fits a scaler with the requested estimators on X and returns the
    centred/scaled data directly.
    """

    scaler = VersatileScaler(center=center, scale=scale, trimming=trimming)
    return scaler.fit_transform(X)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class Wrapper(_BaseComposition, TransformerMixin, BaseEstimator):
    """
    Wrapper Perform robustness inducing 'wrapping' transformation using
    optimal plugins and parameters from the literature.

    Parameters
    ----------
    None; the centre ('median'), scale ('mad') and trimming (0) are fixed
    to the plugins recommended in the reference below.

    Attributes
    ----------
    Arguments for methods:
    - `X`: array-like, n x p, the data.

    Fitted attributes: `center_`/`col_loc_` (location), `scale_`/`col_sca_`
    (scale), `dataw_` (wrapped training data), `datanw_` (wrapped new data).

    NOTE(review): `transform` and `predict` call a function `wrap(...)`
    that is listed in this module's `__all__` but is not defined in any of
    the visible modules — presumably it should implement the wrapping psi
    transform of the reference. Confirm where `wrap` is supposed to come
    from; as shipped these methods raise NameError.

    Reference
    ---------
    Jakob Raymaekers & Peter J. Rousseeuw (2021), Fast Robust Correlation for
    High-Dimensional Data, Technometrics, 63:2, 184-198.

    """

    def __init__(self):
        """
        Initialize with the fixed plugin choices: median centring, MAD
        scaling, no trimming.
        """

        self.center = "median"
        self.scale = "mad"
        self.trimming = 0

    def fit(self, X):
        """
        Estimate location and scale, store these in the class object.
        Trimming fraction can be provided as keyword argument.
        """

        X = _check_input(X)

        _check_trimming(self.trimming)

        # Resolve string estimator names to the functions imported from
        # _preproc_utilities (note: uses eval on self.center/self.scale).
        if type(self.center) is str:
            center = eval(self.center)
        else:
            center = self.center

        if type(self.scale) is str:
            scale = eval(self.scale)
        else:
            scale = self.scale

        # Derive (n, p); p falls back to 1 for one-dimensional input.
        n = X.shape
        if len(n) > 1:
            p = n[1]
        else:
            p = 1
        n = n[0]

        if self.center == "None":
            m = np.repeat(0, p)
        else:
            m = center(X, trimming=self.trimming)

        # Keeping col_loc_ for older version compatibility
        setattr(self, "col_loc_", m)
        # sklearn standard
        setattr(self, "center_", m)

        if self.scale == "None":
            s = np.repeat(1, p)
        else:
            s = scale(X, trimming=self.trimming)

        # Keeping col_sca_ for older version compatibility
        setattr(self, "col_sca_", s)
        # sklearn standard
        setattr(self, "scale_", s)

    def transform(self, X):
        """
        Project data points to their wrapped counterparts.
        """

        X = _check_input(X)
        check_is_fitted(self, ["center_", "scale_"])

        # NOTE(review): `wrap` is not defined in the visible modules — see
        # the class docstring.
        Xw = wrap(X, self.center_, self.scale_)
        setattr(self, "dataw_", Xw)

        return Xw

    def predict(self, Xn):
        """
        Wrap new data using previously estimated location and scale.
        Number of columns needs to match.
        """

        Xn = _check_input(Xn)
        Xnw = wrap(Xn, self.col_loc_, self.col_sca_)
        setattr(self, "datanw_", Xnw)

        return Xnw

    def fit_transform(self, X):
        """
        Estimate center and scale for training data, then wrap these data.
        """

        self.fit(X)
        self.transform(X)

        return self.dataw_
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Created on Wed Aug 7 18:17:46 2020
|
|
4
|
+
|
|
5
|
+
@author: Sven Serneels
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import unittest
|
|
9
|
+
from .twoblock import twoblock
|
|
10
|
+
import pandas as ps
|
|
11
|
+
import numpy as np
|
|
12
|
+
from sklearn.metrics import r2_score
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestTwoBlock(unittest.TestCase):
    """Test methods in the twoblock class."""

    @classmethod
    def setUpClass(cls):
        print("...setupClass")

    @classmethod
    def tearDownClass(cls):
        print("...teardownClass")

    # BUG FIX: setUp/tearDown are per-test INSTANCE methods; decorating them
    # with @classmethod made unittest bind them to the class, sharing state
    # across tests instead of rebuilding the fixtures for each test.
    def setUp(self):
        self.Yt = ps.read_csv("./data/cookie_lab_train.csv", index_col=0).T
        self.Xt = ps.read_csv("./data/cookie_nir_train.csv", index_col=0).T
        self.Yv = ps.read_csv("./data/cookie_lab_test.csv", index_col=0).T
        self.Xv = ps.read_csv("./data/cookie_nir_test.csv", index_col=0).T
        self.p = self.Xt.shape[1]
        self.q = self.Yt.shape[1]

    def tearDown(self):
        del self.Xt
        del self.Yt
        del self.Xv
        del self.Yv
        del self.p
        del self.q

    def test_assert(self):
        """Component counts exceeding the data dimensions must be rejected."""

        tb = twoblock(n_components_x=7, n_components_y=200, scale="None")
        self.assertRaises(AssertionError, tb.fit, self.Xt, self.Yt)

        tb = twoblock(n_components_x=700, n_components_y=2, scale="None")
        self.assertRaises(AssertionError, tb.fit, self.Xt, self.Yt)

    def test_fit(self):
        """Tests fit: coefficient shape, prediction shape and R2 quality."""

        tb = twoblock(n_components_x=7, n_components_y=2, scale="None")
        tb.fit(self.Xt, self.Yt)

        self.assertEqual(tb.coef_.shape, (self.p, self.q))  # coefficients

        ypttb = tb.predict(self.Xv)

        self.assertEqual(ypttb.shape, self.Yv.shape)  # predictions

        r2tbt = [
            r2_score(self.Yv.iloc[:, i], ypttb[:, i]) for i in range(self.q)
        ]

        # BUG FIX: assert each response's R2 individually; comparing two
        # lists with assertGreaterEqual only checked them lexicographically
        # (i.e. effectively only the first element).
        for i, r2 in enumerate(r2tbt):
            self.assertGreaterEqual(r2, 0.8, f"low R2 for response {i}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Allow running the test suite directly.
# NOTE(review): this module uses a relative import (`from .twoblock import
# twoblock`), so direct execution only works when invoked as a module,
# e.g. `python -m twoblock.test_twoblock` — confirm the intended usage.
if __name__ == "__main__":
    unittest.main()
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Created on Tue Aug 6 16:39:57 2024
|
|
4
|
+
|
|
5
|
+
@author: SERNEELS
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from sklearn.base import (
|
|
10
|
+
RegressorMixin,
|
|
11
|
+
BaseEstimator,
|
|
12
|
+
TransformerMixin,
|
|
13
|
+
MultiOutputMixin,
|
|
14
|
+
)
|
|
15
|
+
from sklearn.utils.metaestimators import _BaseComposition
|
|
16
|
+
import copy
|
|
17
|
+
import pandas as ps
|
|
18
|
+
from .utils import _check_input, _predict_check_input
|
|
19
|
+
from .robcent import VersatileScaler
|
|
20
|
+
|
|
21
|
+
# Draft version
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class twoblock(
    _BaseComposition,
    BaseEstimator,
    TransformerMixin,
    RegressorMixin,
    MultiOutputMixin,
):
    """
    TWOBLOCK Two-Block Simultaneous Dimension Reduction of Multivariate X and Y
    data blocks

    Parameters
    -----------

    n_components_x : int, min 1. Note that if applied on data,
        n_components_x shall take a value <= min(x_data.shape)

    n_components_y : int, min 1. Note that if applied on data,
        n_components_y shall take a value <= min(y_data.shape)
        If unspecified, set to equal n_components_x

    verbose: Boolean (def true)
        to print intermediate set of columns retained

    centre : str,
        type of centring (`'mean'` [recommended], `'median'` or `'l1median'`),

    scale : str,
        type of scaling ('std','mad' or 'None')

    copy : (def True): boolean,
        whether to copy data into twoblock object.


    Attributes
    ------------
    Attributes always provided:

    - `x_weights_`: X block PLS weighting vectors (usually denoted W)
    - `y_weights_`: Y block PLS weighting vectors (usually denoted V)
    - `x_loadings_`: X block PLS loading vectors (usually denoted P)
    - `y_loadings_`: Y block PLS loading vectors (usually denoted Q)
    - `x_scores_`: X block PLS score vectors (usually denoted T)
    - `y_scores_`: Y block PLS score vectors (usually denoted U)
    - `coef_`: vector of regression coefficients
    - `intercept_`: intercept
    - `coef_scaled_`: vector of scaled regression coefficients (when scaling option used)
    - `intercept_scaled_`: scaled intercept
    - `residuals_`: vector of regression residuals
    - `fitted_`: fitted response
    - `x_loc_`: X block location estimate
    - `y_loc_`: y location estimate
    - `x_sca_`: X block scale estimate
    - `y_sca_`: y scale estimate
    - `n_features_in_`: number of predictor columns seen during fit
    - `centring_`: scaling object used internally (type: `VersatileScaler`)


    Reference
    ---------
    Cook, R. Dennis, Liliana Forzani, and Lan Liu.
    "Partial least squares for simultaneous reduction of response and predictor
    vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.

    """

    def __init__(
        self,
        n_components_x=1,
        n_components_y=None,
        verbose=True,
        centre="mean",
        scale="None",
        copy=True,
    ):
        self.n_components_x = n_components_x
        self.n_components_y = n_components_y
        self.verbose = verbose
        self.centre = centre
        self.scale = scale
        self.copy = copy

    def fit(self, X, Y):
        """
        Fit a Twoblock model.

        Parameters
        ------------

        X : numpy array or Pandas data frame
            Predictor data.

        Y : numpy array or Pandas data frame
            Response data

        Returns
        -------
        twoblock class object containing the estimated parameters.

        Raises
        ------
        ValueError
            if X and Y disagree on the number of cases, or if the requested
            number of components exceeds the covariance rank or case count.

        """

        if isinstance(X, ps.DataFrame):
            X = X.to_numpy()
        (n, p) = X.shape
        if isinstance(Y, (ps.DataFrame, ps.Series)):
            Y = Y.to_numpy()
        X = _check_input(X)
        Y = _check_input(Y)
        ny, q = Y.shape
        if ny != n:
            raise ValueError("Number of cases in X and Y needs to agree")

        Y = Y.astype("float64")

        if self.n_components_y is None:
            self.n_components_y = self.n_components_x

        # Validate with explicit exceptions: `assert` would be stripped
        # when Python runs with -O.
        max_comp_x = min(np.linalg.matrix_rank(np.matmul(X.T, X)), n - 1)
        if self.n_components_x > max_comp_x:
            raise ValueError(
                "Number of components cannot exceed covariance rank or number of cases"
            )
        max_comp_y = min(np.linalg.matrix_rank(np.matmul(Y.T, Y)), n - 1)
        if self.n_components_y > max_comp_y:
            raise ValueError(
                "Number of components cannot exceed covariance rank or number of cases"
            )

        if self.copy:
            # Keep an untouched copy of the training data on the estimator.
            X0 = copy.deepcopy(X)
            Y0 = copy.deepcopy(Y)
            self.X = X0
            self.Y = Y0
        else:
            X0 = X
            Y0 = Y
        # Remember the training dimensionality so that predict() can validate
        # new data even when copy=False (in which case self.X is not stored;
        # the previous implementation crashed with AttributeError there).
        self.n_features_in_ = p

        X0 = X0.astype("float64")
        centring = VersatileScaler(
            center=self.centre, scale=self.scale, trimming=0
        )
        X0 = centring.fit_transform(X0).astype("float64")
        mX = centring.col_loc_
        sX = centring.col_sca_
        # The same scaler object is refit on Y; the X location/scale were
        # saved just above, so `centring_` ends up reflecting the Y fit.
        Y0 = centring.fit_transform(Y0).astype("float64")
        my = centring.col_loc_
        sy = centring.col_sca_

        self.x_scores_ = np.empty((n, self.n_components_x), float)
        self.y_scores_ = np.empty((n, self.n_components_y), float)
        self.x_weights_ = np.empty((p, self.n_components_x), float)
        self.y_weights_ = np.empty((q, self.n_components_y), float)
        self.x_loadings_ = np.empty((p, self.n_components_x), float)
        self.y_loadings_ = np.empty((q, self.n_components_y), float)

        Xh = copy.deepcopy(X0)
        Yh = copy.deepcopy(Y0)

        # X block: NIPALS-style deflation of Xh, cross-covariance taken
        # against the undeflated (centred) Y0.
        for i in range(self.n_components_x):

            sXY = np.dot(Xh.T, Y0) / n
            # Weight = dominant left singular vector of the cross-covariance.
            u, _, _ = np.linalg.svd(sXY)
            x_weights = u[:, 0].reshape((p,))
            x_scores = np.dot(Xh, x_weights)
            x_loadings = np.dot(Xh.T, x_scores) / np.dot(x_scores, x_scores)

            Xh -= np.outer(x_scores, x_loadings)

            self.x_weights_[:, i] = x_weights
            self.x_scores_[:, i] = x_scores
            self.x_loadings_[:, i] = x_loadings

        # Y block: symmetric treatment, deflating Yh against the centred X0.
        for i in range(self.n_components_y):

            sYX = np.dot(Yh.T, X0) / n

            v, _, _ = np.linalg.svd(sYX)
            y_weights = v[:, 0].reshape((q,))
            y_scores = np.dot(Yh, y_weights)
            y_loadings = np.dot(Yh.T, y_scores) / np.dot(y_scores, y_scores)

            Yh -= np.outer(y_scores, y_loadings)

            self.y_weights_[:, i] = y_weights
            self.y_scores_[:, i] = y_scores
            self.y_loadings_[:, i] = y_loadings

        # Scaled-space regression coefficients:
        # B = W (T'T)^{-1} T' (Y0 V) V'  with T = X0 W.
        wtx = np.dot(X0, self.x_weights_)
        wti = np.linalg.inv(np.dot(wtx.T, wtx))
        swg = np.dot(wtx.T, np.dot(Y0, self.y_weights_))
        self.coef_scaled_ = np.matmul(
            np.matmul(self.x_weights_, wti), np.dot(swg, self.y_weights_.T)
        )

        if self.centre == "None" and self.scale == "None":
            B_rescaled = self.coef_scaled_
        else:
            # Rescale coefficient (i, j) by sy_j / sX_i.
            # (sklearn has this line wrong)
            B_rescaled = np.multiply(
                np.outer(sy, np.divide(1, sX)).T, self.coef_scaled_
            )

        Yp_rescaled = np.matmul(X, B_rescaled)
        if self.centre == "None":
            intercept = 0
        elif self.centre == "mean":
            intercept = np.mean(Y - Yp_rescaled, axis=0)
        else:
            intercept = np.median(Y - Yp_rescaled, axis=0)

        Yfit = Yp_rescaled + intercept
        R = Y - Yfit

        self.coef_ = B_rescaled
        self.intercept_ = intercept
        self.fitted_ = Yfit
        self.residuals_ = R
        self.x_loc_ = mX
        self.y_loc_ = my
        self.x_sca_ = sX
        self.y_sca_ = sy
        self.centring_ = centring
        return self

    def predict(self, Xn):
        """
        Predict cases.

        Parameters
        ------------

        Xn : numpy array or data frame
            Input data.

        Returns
        -------
        numpy array of predicted responses.

        """
        n, p, Xn = _predict_check_input(Xn)
        # Compare against the dimensionality stored at fit time; this also
        # works when the model was fitted with copy=False (no self.X stored).
        if p != self.n_features_in_:
            raise ValueError(
                "New data must have same number of columns as the ones the model has been trained with"
            )
        return np.matmul(Xn, self.coef_) + self.intercept_
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on Mon Apr 13 16:08:22 2020
|
|
5
|
+
|
|
6
|
+
@author: sven
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import pandas as ps
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def convert_X_input(X):
    """
    Convert predictor input to a float64 numpy array when given as a pandas
    DataFrame; any other input is returned unchanged.
    """
    # isinstance is the idiomatic (and subclass-safe) type check.
    if isinstance(X, ps.DataFrame):
        X = X.to_numpy().astype("float64")
    return X
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def convert_y_input(y):
    """
    Convert response input to a transposed float64 numpy array when given
    as a pandas DataFrame or Series; any other input is returned unchanged.
    """
    # isinstance is the idiomatic (and subclass-safe) type check.
    if isinstance(y, (ps.DataFrame, ps.Series)):
        y = y.to_numpy().T.astype("float64")
    return y
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def const_xscale(beta, *args):
    """
    Constraint value enforcing unit-covariance X scores.

    The flat vector ``beta`` is reshaped to a (p, h) matrix in Fortran
    order; the function forms ``B' Cov(X) B - I_h`` and returns its
    (i, j) entry.

    args: (X, h, i, j) — data matrix, number of columns of the reshaped
    beta, and the constraint entry to return.
    """
    data, n_cols, row, col = args[0], args[1], args[2], args[3]
    coef = np.reshape(beta, (-1, n_cols), order="F")
    cov_mat = np.cov(data, rowvar=False)
    constraint = np.matmul(np.matmul(coef.T, cov_mat), coef) - np.identity(n_cols)
    return constraint[row, col]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def const_zscale(beta, *args):
    """
    Constraint value enforcing orthonormal directions (identity metric).

    The flat vector ``beta`` is reshaped to a (p, h) matrix in Fortran
    order; with the covariance replaced by the identity, the function
    forms ``B' B - I_h`` and returns its (i, j) entry.

    args: (X, h, i, j) — X is only used for its column count.
    """
    data, n_cols, row, col = args[0], args[1], args[2], args[3]
    coef = np.reshape(beta, (-1, n_cols), order="F")
    metric = np.identity(data.shape[1])
    constraint = np.matmul(np.matmul(coef.T, metric), coef) - np.identity(n_cols)
    return constraint[row, col]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _predict_check_input(Xn):
|
|
50
|
+
if type(Xn) == ps.core.series.Series:
|
|
51
|
+
Xn = Xn.to_numpy()
|
|
52
|
+
if Xn.ndim == 1:
|
|
53
|
+
Xn = Xn.reshape((1, -1))
|
|
54
|
+
if type(Xn) == ps.core.frame.DataFrame:
|
|
55
|
+
Xn = Xn.to_numpy()
|
|
56
|
+
n, p = Xn.shape
|
|
57
|
+
return (n, p, Xn)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _check_input(X):
|
|
61
|
+
|
|
62
|
+
if type(X) in (np.matrix, ps.core.frame.DataFrame, ps.core.series.Series):
|
|
63
|
+
X = np.array(X)
|
|
64
|
+
|
|
65
|
+
if X.dtype == np.dtype("O"):
|
|
66
|
+
X = X.astype("float64")
|
|
67
|
+
|
|
68
|
+
if X.ndim == 1:
|
|
69
|
+
X = X.reshape((1, -1))
|
|
70
|
+
|
|
71
|
+
n, p = X.shape
|
|
72
|
+
|
|
73
|
+
if n == 1:
|
|
74
|
+
if p >= 2:
|
|
75
|
+
X = X.reshape((-1, 1))
|
|
76
|
+
return X
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: twoblock
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Scikit-Learn Compatible Library for Simultaneous Two-Block Sufficient Dimension Reduction Methods
|
|
5
|
+
Home-page: https://github.com/SvenSerneels/twoblock
|
|
6
|
+
Author: Sven Serneels
|
|
7
|
+
Author-email: svenserneels@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.22.0
|
|
14
|
+
Requires-Dist: scipy>=1.8.0
|
|
15
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
16
|
+
Requires-Dist: pandas>=1.4.0
|
|
17
|
+
|
|
18
|
+
# twoblock
|
|
19
|
+
Two-block simultaneous dimension reduction
|
|
20
|
+
|
|
21
|
+
A scikit-learn compatible implementation of simultaneous two-block dimension reduction as proposed in [1].
|
|
22
|
+
|
|
23
|
+
References
|
|
24
|
+
----------
|
|
25
|
+
[1] Cook, R. Dennis, Liliana Forzani, and Lan Liu.
|
|
26
|
+
"Partial least squares for simultaneous reduction of response and predictor
|
|
27
|
+
vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.cfg
|
|
4
|
+
setup.py
|
|
5
|
+
src/twoblock/__init__.py
|
|
6
|
+
src/twoblock/_preproc_utilities.py
|
|
7
|
+
src/twoblock/robcent.py
|
|
8
|
+
src/twoblock/test_twoblock.py
|
|
9
|
+
src/twoblock/twoblock.py
|
|
10
|
+
src/twoblock/utils.py
|
|
11
|
+
src/twoblock.egg-info/PKG-INFO
|
|
12
|
+
src/twoblock.egg-info/SOURCES.txt
|
|
13
|
+
src/twoblock.egg-info/dependency_links.txt
|
|
14
|
+
src/twoblock.egg-info/requires.txt
|
|
15
|
+
src/twoblock.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
twoblock
|