twoblock 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
twoblock-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Sven Serneels
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.1
2
+ Name: twoblock
3
+ Version: 0.0.1
4
+ Summary: A Scikit-Learn Compatible Library for Simultaneous Two-Block Sufficient Dimension Reduction Methods
5
+ Home-page: https://github.com/SvenSerneels/twoblock
6
+ Author: Sven Serneels
7
+ Author-email: svenserneels@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=1.22.0
14
+ Requires-Dist: scipy>=1.8.0
15
+ Requires-Dist: scikit-learn>=1.3.0
16
+ Requires-Dist: pandas>=1.4.0
17
+
18
+ # twoblock
19
+ Two-block simultaneous dimension reduction
20
+
21
+ A scikit-learn compatible implementation of simultaneous two-block dimension reduction as proposed in [1].
22
+
23
+ References
24
+ ----------
25
+ [1] Cook, R. Dennis, Liliana Forzani, and Lan Liu.
26
+ "Partial least squares for simultaneous reduction of response and predictor
27
+ vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.
@@ -0,0 +1,10 @@
1
+ # twoblock
2
+ Two-block simultaneous dimension reduction
3
+
4
+ A scikit-learn compatible implementation of simultaneous two-block dimension reduction as proposed in [1].
5
+
6
+ References
7
+ ----------
8
+ [1] Cook, R. Dennis, Liliana Forzani, and Lan Liu.
9
+ "Partial least squares for simultaneous reduction of response and predictor
10
+ vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.
@@ -0,0 +1,7 @@
1
+ [metadata]
2
+ description-file = README.md
3
+
4
+ [egg_info]
5
+ tag_build =
6
+ tag_date = 0
7
+
@@ -0,0 +1,54 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 22 12:18:53 2018

@author: Sven serneels, Ponalytics
"""

# Package build script: reads the version/author metadata from the in-repo
# package itself and builds the sdist/wheel from the src/ layout.

from setuptools import setup, find_packages
import re  # NOTE(review): imported but unused in this script
import sys
import os

# Make the uninstalled src/ package importable so its metadata
# (__version__ etc.) can be read without installing first.
SRC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "./src")
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)
from twoblock import __version__, __author__, __license__

readme_file = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "README.md"
)
try:
    # Prefer m2r's Markdown-to-reST conversion when available.
    from m2r import parse_from_file

    readme = parse_from_file(readme_file)
except ImportError:
    # m2r may not be installed in user environment
    with open(readme_file) as f:
        readme = f.read()

setup(
    name="twoblock",
    version=__version__,
    author=__author__,
    author_email="svenserneels@gmail.com",
    description="A Scikit-Learn Compatible Library for Simultaneous Two-Block Sufficient Dimension Reduction Methods",
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/SvenSerneels/twoblock",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    packages=find_packages("src"),  # include all packages under src
    package_dir={"": "src"},  # tell distutils packages are under src
    include_package_data=True,
    install_requires=[
        "numpy>=1.22.0",
        "scipy>=1.8.0",
        "scikit-learn>=1.3.0",
        "pandas>=1.4.0",
    ],
)
@@ -0,0 +1,17 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 22 12:17:17 2018

@author: Sven Serneels, Ponalytics
"""

# Package metadata, read by setup.py at build time.
# NOTE(review): rebinding the module-level __name__ is unconventional and
# can confuse tooling (relative imports, pickling); consider removing.
__name__ = "twoblock"
__author__ = "Sven Serneels"
__license__ = "MIT"
__version__ = "0.0.1"
__date__ = "2024-08-07"

# The commented lines can be uncommented if IPOPT has been installed independently.

# Expose the estimator class at package level.
from .twoblock import twoblock
@@ -0,0 +1,308 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Sat Dec 21 10:55:24 2019
5
+
6
+ Set of help functions for robust centring and scaling
7
+
8
+ @author: Sven Serneels, Ponalytics
9
+ """
10
+
11
+ import numpy as np
12
+ import pandas as ps
13
+ import scipy.stats as sps
14
+ import scipy.optimize as spo
15
+ import copy
16
+
17
+
18
+ def _handle_zeros_in_scale(scale, copy=True):
19
+ """
20
+ Makes sure that whenever scale is zero, we handle it correctly.
21
+ This happens in most scalers when we have constant features.
22
+ Taken from ScikitLearn.preprocesssing"""
23
+
24
+ # if we are fitting on 1D arrays, scale might be a scalar
25
+ if np.isscalar(scale):
26
+ if scale == 0.0:
27
+ scale = 1.0
28
+ return scale
29
+ elif isinstance(scale, np.ndarray):
30
+ if copy:
31
+ # New array to avoid side-effects
32
+ scale = scale.copy()
33
+ scale[scale == 0.0] = 1.0
34
+ return scale
35
+
36
+
37
+ def _check_trimming(t):
38
+
39
+ if (t > 0.99) or (t < 0):
40
+ raise (ValueError("Trimming fraction must be in [0,1)"))
41
+
42
+
43
def mad(X, c=0.6744897501960817, **kwargs):
    """
    Column-wise median absolute deviation, rescaled by the consistency
    factor ``c`` (default: standard-normal consistency).  ``**kwargs``
    is accepted so the function can be called generically from
    ``scale_data``.
    """

    deviations = np.abs(X - median(X, axis=0))
    # statsmodels.robust.mad is less flexible toward matrix input and can
    # raise a ufunc ValueError, hence this explicit implementation.
    return np.array(median(deviations, axis=0) / c).reshape(-1)
54
+
55
+
56
def median(X, **kwargs):
    """
    Column-wise median, NaN-aware.  ``**kwargs`` is accepted so the
    function can be called generically from ``scale_data``.
    """

    estimator = np.nanmedian if np.isnan(X).any() else np.median
    return np.array(estimator(X, axis=0)).reshape(-1)
69
+
70
+
71
def mean(X, trimming=0):
    """
    Column-wise (trimmed) mean.  ``trimming`` is the fraction trimmed
    from each tail; 0 gives the ordinary, NaN-aware mean.
    """

    if trimming == 0:
        estimator = np.nanmean if np.isnan(X).any() else np.mean
        return estimator(X, axis=0)
    # NB: scipy's trim_mean returns all-NaN when X contains missings.
    return sps.trim_mean(X, trimming, 0)
86
+
87
+
88
def std(X, trimming=0):
    """
    Column-wise (trimmed) standard deviation.  ``trimming`` is the
    fraction trimmed from each tail; 0 gives the ordinary, NaN-aware
    population standard deviation.
    """

    if trimming == 0:
        variance = np.nanvar(X, axis=0) if np.isnan(X).any() else np.var(X, axis=0)
        return np.array(np.power(variance, 0.5)).reshape(-1)
    # Trimmed variance: trimmed mean of squared deviations about the
    # trimmed mean.
    centred_sq = np.square(X - sps.trim_mean(X, trimming, 0))
    return np.sqrt(sps.trim_mean(centred_sq, trimming, 0))
106
+
107
+
108
+ def _euclidnorm(x):
109
+ """
110
+ Euclidean norm of a vector
111
+ """
112
+
113
+ if np.isnan(x).any():
114
+ return np.sqrt(np.nansum(np.square(x)))
115
+ else:
116
+ return np.sqrt(np.sum(np.square(x)))
117
+
118
+
119
+ def _diffmat_objective(a, X):
120
+ """
121
+ Utility to l1median, matrix of differences
122
+ """
123
+
124
+ (n, p) = X.shape
125
+ return X - np.tile(a, (n, 1))
126
+
127
+
128
def _l1m_objective(a, X, *args):
    """
    Optimization objective for l1median: sum of Euclidean distances of
    the rows of X to the candidate centre ``a`` (NaN-aware).
    """

    row_norms = np.apply_along_axis(_euclidnorm, 1, _diffmat_objective(a, X))
    return np.nansum(row_norms) if np.isnan(X).any() else np.sum(row_norms)
141
+
142
+
143
def _l1m_jacobian(a, X):
    """
    Jacobian of the l1median objective: minus the column sums of the
    unit direction vectors from ``a`` to each row of X.
    """

    (n, p) = X.shape
    dX = _diffmat_objective(a, X)
    # Guard against zero distances before normalising each row.
    dists = _handle_zeros_in_scale(np.apply_along_axis(_euclidnorm, 1, dX))
    dX /= np.tile(np.array(dists).reshape(n, 1), (1, p))
    summ = np.nansum if np.isnan(X).any() else np.sum
    return -summ(dX, axis=0)
157
+
158
+
159
def _l1median(X, x0, method="SLSQP", tol=1e-8, options=None, **kwargs):
    """
    Numerical optimization backend for the l1 (spatial) median.

    Parameters
    ----------
    X : ndarray (n x p), the data
    x0 : ndarray, starting value for the centre
    method : str, scipy.optimize.minimize algorithm
    tol : float, convergence tolerance
    options : dict or None, solver options; defaults to
        {"maxiter": 2000}.  (Bug fix: a mutable default argument is
        avoided.)

    Returns
    -------
    scipy.optimize.OptimizeResult
    """

    if options is None:
        options = {"maxiter": 2000}
    # Bug fix: args must be a tuple.  scipy calls fun(x, *args), so the
    # original args=(X) — which is just X — would unpack the data rows
    # as separate positional arguments.
    return spo.minimize(
        _l1m_objective,
        x0,
        args=(X,),
        jac=_l1m_jacobian,
        tol=tol,
        options=options,
        method=method,
    )
176
+
177
+
178
def l1median(X, **kwargs):
    """
    l1 (spatial) median wrapper.  Converts matrix input to ndarray,
    falls back to the ordinary median for univariate data and otherwise
    delegates to the numerical optimizer.

    Keyword arguments are forwarded to ``_l1median``; ``x0`` (the
    optimizer's starting point) defaults to the column-wise median.
    """

    # Bug fix: the original only bound x0 when it was absent from
    # kwargs, so a user-supplied x0 raised UnboundLocalError and would
    # also have been passed twice.  pop() handles both cases.
    x0 = kwargs.pop("x0", None)
    if x0 is None:
        x0 = median(X)

    if type(X) == np.matrix:
        X = np.array(X)

    p = X.shape[1] if len(X.shape) == 2 else 1

    if p < 2:
        return median(X)
    return _l1median(X, x0, **kwargs).x
199
+
200
+
201
def kstepLTS(X, maxit=5, tol=1e-10, **kwargs):
    """
    Computes the K-step LTS estimator of location
    It uses the spatial median as a starting value, and yields an
    estimator with improved statistical efficiency, but at a higher
    computational cost.
    Inputs:
        X: data matrix
        maxit: maximum number of iterations
        tol: convergence tolerance
    Outputs:
        m2: location estimate
    """
    n, p = X.shape
    m1 = l1median(X)  # initial estimate
    m2 = copy.deepcopy(m1)
    iteration = 0
    unconverged = True
    while unconverged and (iteration < maxit):
        # Squared Euclidean distance of each case to the current centre
        # (NaN-aware when the data contain missings).
        if np.isnan(X).any():
            dists = np.nansum(np.square(X - m1), axis=1)
        else:
            dists = np.sum(np.square(X - m1), axis=1)
        # Keep the h = floor((n+1)/2) cases closest to the current centre.
        cutdist = np.sort(dists, axis=0)[int(np.floor((n + 1) / 2)) - 1]
        hsubset = np.where(dists <= cutdist)[0]
        # Re-estimate the centre as the mean of the retained h-subset.
        m2 = np.array(mean(X[hsubset, :])).reshape((p,))
        # Converged when no coordinate moved more than tol.
        unconverged = max(abs(m1 - m2)) > tol
        iteration += 1
        m1 = copy.deepcopy(m2)

    return m2
232
+
233
+
234
def scaleTau2(x0, c1=4.5, c2=3, consistency=True, **kwargs):
    """
    Tau estimator of scale
    Inputs:
        x0: array or matrix, data
        c1: consistency factor for initial estimate
        c2: consistency factor for final estimate
        consistency: str or bool,
            False, True, or "finiteSample"
    Output:
        the scale estimate (one entry per column)
    """

    x = copy.deepcopy(x0)  # do not mutate the caller's data
    n, p = x.shape
    summ = np.nansum if np.isnan(x).any() else np.sum
    medx = median(x)
    xc = abs(x - medx)
    sigma0 = median(xc)  # initial MAD-type scale
    if c1 > 0:
        # Weighted location estimate with Tukey-type weights.
        xc /= sigma0 * c1
        w = 1 - np.square(xc)
        w = np.square((abs(w) + w) / 2)
        # NOTE(review): sums over ALL entries (no axis), so for p > 1
        # a single pooled location is used — confirm this is intended.
        mu = summ(np.multiply(x, w)) / summ(w)
    else:
        mu = medx
    x -= mu
    x /= sigma0
    rho = np.square(x)
    # Bug fix: the original used rho[np.where(rho > c2**2)[0]] = c2**2,
    # which for matrix input indexes whole ROWS and clips every entry of
    # any row containing one large value.  Clip element-wise instead.
    rho = np.minimum(rho, c2**2)
    if consistency:

        def Erho(b):
            # Expectation of the clipped-square rho under the standard normal.
            return (
                2 * ((1 - b**2) * sps.norm.cdf(b) - b * sps.norm.pdf(b) + b**2)
                - 1
            )

        def Es2(c2):
            return Erho(c2 * sps.norm.ppf(3 / 4))

        if consistency == "finiteSample":
            nEs2 = (n - 2) * Es2(c2)
        else:
            nEs2 = n * Es2(c2)
    else:
        nEs2 = n
    # NOTE(review): summ(rho) also pools over all columns — verify for p > 1.
    return np.array(sigma0 * np.sqrt(summ(rho) / nEs2)).reshape((p,))
285
+
286
+
287
def scale_data(X, m, s):
    """
    Column-wise centring and scaling of X about location estimate ``m``
    and scale estimate ``s`` (zero scales are mapped to one).
    """

    shape = X.shape
    p = shape[1] if len(shape) > 1 else 1
    n = shape[0]

    s = _handle_zeros_in_scale(s)

    if p == 1:
        return (X - float(m)) / s
    centre = np.array([m for _ in range(n)])
    spread = np.array([s for _ in range(n)])
    return (X - centre) / spread
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # @author: Sven Serneels, Ponalytics
4
+ # Created on Sun Feb 4 2018
5
+ # Updated on Sun Dec 16 2018
6
+ # Refactored on Sat Dec 21 2019
7
+ # Refactored on Sat Mar 28 2020
8
+
9
+
10
+ # Class for classical and robust centering and scaling of input data for
11
+ # regression and machine learning
12
+
13
+ # Version 2.0: Code entirely restructured compared to version 1.0.
14
+ # Code made consistent with sklearn logic: fit(data,params) yields results.
15
+ # Code makes more efficient use of numpy builtin estimators.
16
+ # Version 3.0:
17
+ # Code now takes strings or functions as input to centring and scaling.
18
+ # Utility functions have been moved to _preproc_utilities.py
19
+ # Code now supplied for l1median centring, with options to use different
20
+ # scipy.optimize optimization algorithms
21
+ # Version 4.0:
22
+ # Made the API compatible for ScikitLearn pipelines. However, some nonstandard
23
+ # functions and output remain for backwards compatibility. Functionality for
24
+ # sparse matrices still has to be implemented.
25
+
26
+
27
+ # Ancillary functions in _preproc_utilities.py:
28
+
29
+ # - `scale_data(X,m,s)`: centers and scales X on center m (as vector) and scale s (as vector).
30
+ # - `mean(X,trimming)`: Column-wise mean.
31
+ # - `median(X)`: Column-wise median.
32
+ # - `l1median(X)`: L1 or spatial median. Optional arguments:
33
+ # - `x0`: starting point for optimization, defaults to column wise median
34
+ # - `method`: optimization algorithm, defaults to 'SLSQP'
35
+ # - `tol`: tolerance, defaults to 1e-8
36
+ # - `options`: list of options for `scipy.optimize.minimize`
37
+ # - `kstepLTS(X): k-step LTS estimator of location.
38
+ # - `maxit`: int, number of iterations to compute maximally
39
+ # - `tol`: float, tolerance for convergence
40
+ # - `std(X,trimming)`: Column-wise std.
41
+ # - `mad(X,c)`: Column-wise median absolute deviation, with consistency factor c.
42
+ # - `scaleTau2(x0, c1 = 4.5, c2 = 3, consistency = True)`: Tau estimator of scale
43
+ # with consistency parameters c1 and c2 and option for consistency correction
44
+ # (True, False or 'finiteSample')
45
+
46
+
47
+ from sklearn.base import BaseEstimator, TransformerMixin
48
+ from sklearn.utils.metaestimators import _BaseComposition
49
+ from sklearn.utils.validation import check_is_fitted
50
+ import numpy as np
51
+ from .utils import (
52
+ convert_X_input,
53
+ convert_y_input,
54
+ _check_input,
55
+ )
56
+ from ._preproc_utilities import *
57
+ from ._preproc_utilities import _check_trimming
58
+
59
+ __all__ = ["VersatileScaler", "robcent", "versatile_scale", "Wrapper", "wrap"]
60
+
61
+
62
class VersatileScaler(_BaseComposition, TransformerMixin, BaseEstimator):
    """
    VersatileScaler: centre and scale data about classical or robust
    location and scale estimates.

    Parameters
    ----------
    `center`: str or callable, location estimator.  A string must name a
        module-level estimator function (e.g. 'mean', 'median',
        'l1median', 'kstepLTS'), or be 'None' for no centring.
    `scale`: str or callable, scale estimator (e.g. 'std', 'mad',
        'scaleTau2'), or 'None' for no scaling.
    `trimming`: float in [0, 1), trimming fraction used in location and
        scale estimation.

    Attributes
    ----------
    After `fit`:
    - `center_`, `scale_`: estimated column locations and scales
    - `col_loc_`, `col_sca_`: aliases kept for backwards compatibility

    Remarks
    -------
    Options for classical estimators 'mean' and 'std' also give access to
    robust trimmed versions via `trimming`.
    """

    def __init__(self, center="mean", scale="std", trimming=0):
        """Store hyperparameters (validated at fit time, per sklearn)."""

        self.center = center
        self.scale = scale
        self.trimming = trimming

    def _resolve(self, estimator):
        """Map a string option onto the module-level estimator function."""
        if type(estimator) is str:
            # NOTE(review): eval on a caller-supplied string is a code
            # injection risk; a dispatch dict would be safer.  Kept for
            # backwards compatibility with existing string options.
            return eval(estimator)
        return estimator

    def fit(self, X):
        """
        Estimate location and scale and store them on the object.

        Bug fix: returns self (scikit-learn contract); the original
        returned None, which breaks Pipeline usage.
        """

        X = _check_input(X)
        _check_trimming(self.trimming)

        center = self._resolve(self.center)
        scale = self._resolve(self.scale)

        shape = X.shape
        p = shape[1] if len(shape) > 1 else 1

        if self.center == "None":
            m = np.repeat(0, p)
        else:
            m = center(X, trimming=self.trimming)
        self.col_loc_ = m  # older-version compatibility
        self.center_ = m  # sklearn standard

        if self.scale == "None":
            s = np.repeat(1, p)
        else:
            s = scale(X, trimming=self.trimming)
        self.col_sca_ = s  # older-version compatibility
        self.scale_ = s  # sklearn standard
        return self

    def transform(self, X):
        """Centre and/or scale X to the previously fitted estimates."""

        X = _check_input(X)
        check_is_fitted(self, ["center_", "scale_"])

        Xs = scale_data(X, self.center_, self.scale_)
        self.datas_ = Xs

        return Xs

    def predict(self, Xn):
        """
        Standardize new data on previously estimated location and scale.
        The number of columns must match the training data.
        """

        Xn = _check_input(Xn)
        Xns = scale_data(Xn, self.col_loc_, self.col_sca_)
        self.datans_ = Xns

        return Xns

    def fit_transform(self, X):
        """Estimate centre and scale on X and return the scaled data."""

        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, Xs=None):
        """
        Transform scaled data back to the original scale.  When Xs is
        None, the most recently transformed training data are used.
        """

        check_is_fitted(self, ["center_", "scale_"])
        if Xs is not None:
            Xs = _check_input(Xs)
        else:
            Xs = self.datas_

        return np.multiply(Xs, self.scale_) + self.center_
193
+
194
+
195
# For backwards compatibility
robcent = VersatileScaler


def versatile_scale(X, center="l1median", scale="mad", trimming=0):
    """
    Convenience wrapper: fit a VersatileScaler with the given options on
    X and return the scaled data directly.
    """

    scaler = VersatileScaler(center=center, scale=scale, trimming=trimming)
    return scaler.fit_transform(X)
207
+
208
+
209
class Wrapper(_BaseComposition, TransformerMixin, BaseEstimator):
    """
    Wrapper: robustness-inducing 'wrapping' transformation using optimal
    plugins and parameters from the literature (median location, MAD
    scale).

    Attributes
    ----------
    Arguments for methods:
    - `X`: array-like, n x p, the data.

    Reference
    ---------
    Jakob Raymaekers & Peter J. Rousseeuw (2021), Fast Robust Correlation for
    High-Dimensional Data, Technometrics, 63:2, 184-198.
    """

    def __init__(self):
        """Fix the literature-recommended plugins; no free parameters."""

        self.center = "median"
        self.scale = "mad"
        self.trimming = 0

    def fit(self, X):
        """
        Estimate location and scale and store them on the object.

        Bug fix: returns self (scikit-learn contract); the original
        returned None, which breaks Pipeline usage.
        """

        X = _check_input(X)
        _check_trimming(self.trimming)

        # Resolve string options onto module-level estimator functions.
        if type(self.center) is str:
            center = eval(self.center)
        else:
            center = self.center

        if type(self.scale) is str:
            scale = eval(self.scale)
        else:
            scale = self.scale

        shape = X.shape
        p = shape[1] if len(shape) > 1 else 1

        if self.center == "None":
            m = np.repeat(0, p)
        else:
            m = center(X, trimming=self.trimming)
        self.col_loc_ = m  # older-version compatibility
        self.center_ = m  # sklearn standard

        if self.scale == "None":
            s = np.repeat(1, p)
        else:
            s = scale(X, trimming=self.trimming)
        self.col_sca_ = s  # older-version compatibility
        self.scale_ = s  # sklearn standard
        return self

    def transform(self, X):
        """Project data points to their wrapped counterparts."""

        X = _check_input(X)
        check_is_fitted(self, ["center_", "scale_"])

        # NOTE(review): `wrap` is exported in __all__ but not defined in
        # this module; it must be supplied elsewhere for this to work.
        Xw = wrap(X, self.center_, self.scale_)
        self.dataw_ = Xw

        return Xw

    def predict(self, Xn):
        """
        Wrap new data using previously estimated location and scale.
        The number of columns must match the training data.
        """

        Xn = _check_input(Xn)
        Xnw = wrap(Xn, self.col_loc_, self.col_sca_)
        self.datanw_ = Xnw

        return Xnw

    def fit_transform(self, X):
        """Estimate centre and scale on X and return the wrapped data."""

        self.fit(X)
        return self.transform(X)
@@ -0,0 +1,75 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Aug 7 18:17:46 2020
4
+
5
+ @author: Sven Serneels
6
+ """
7
+
8
+ import unittest
9
+ from .twoblock import twoblock
10
+ import pandas as ps
11
+ import numpy as np
12
+ from sklearn.metrics import r2_score
13
+
14
+
15
class TestTwoBlock(unittest.TestCase):
    """Test methods in the twoblock class"""

    @classmethod
    def setUpClass(cls):
        print("...setupClass")

    @classmethod
    def tearDownClass(cls):
        print("...teardownClass")

    def setUp(self):
        # Bug fix: setUp/tearDown are per-test instance methods; the
        # original decorated them with @classmethod, which stored the
        # fixtures on the class and defeated test isolation.
        self.Yt = ps.read_csv("./data/cookie_lab_train.csv", index_col=0).T
        self.Xt = ps.read_csv("./data/cookie_nir_train.csv", index_col=0).T
        self.Yv = ps.read_csv("./data/cookie_lab_test.csv", index_col=0).T
        self.Xv = ps.read_csv("./data/cookie_nir_test.csv", index_col=0).T
        self.p = self.Xt.shape[1]
        self.q = self.Yt.shape[1]

    def tearDown(self):
        del self.Xt
        del self.Yt
        del self.Xv
        del self.Yv
        del self.p
        del self.q

    def test_assert(self):
        """Component counts exceeding the data dimensions must raise."""

        tb = twoblock(n_components_x=7, n_components_y=200, scale="None")
        self.assertRaises(AssertionError, tb.fit, self.Xt, self.Yt)

        tb = twoblock(n_components_x=700, n_components_y=2, scale="None")
        self.assertRaises(AssertionError, tb.fit, self.Xt, self.Yt)

    def test_fit(self):
        """Tests fit function"""

        tb = twoblock(n_components_x=7, n_components_y=2, scale="None")
        tb.fit(self.Xt, self.Yt)

        self.assertEqual(tb.coef_.shape, (self.p, self.q))  # coefficients

        ypttb = tb.predict(self.Xv)

        self.assertEqual(ypttb.shape, self.Yv.shape)  # predictions

        r2tbt = [
            r2_score(self.Yv.iloc[:, i], ypttb[:, i]) for i in range(self.q)
        ]

        # Bug fix: assertGreaterEqual on two lists compares them
        # lexicographically, effectively checking only the first
        # response.  Check each response's R2 explicitly.
        for i, r2 in enumerate(r2tbt):
            self.assertGreaterEqual(r2, 0.8, msg=f"response {i}")
72
+
73
+
74
+ if __name__ == "__main__":
75
+ unittest.main()
@@ -0,0 +1,262 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Aug 6 16:39:57 2024
4
+
5
+ @author: SERNEELS
6
+ """
7
+
8
+ import numpy as np
9
+ from sklearn.base import (
10
+ RegressorMixin,
11
+ BaseEstimator,
12
+ TransformerMixin,
13
+ MultiOutputMixin,
14
+ )
15
+ from sklearn.utils.metaestimators import _BaseComposition
16
+ import copy
17
+ import pandas as ps
18
+ from .utils import _check_input, _predict_check_input
19
+ from .robcent import VersatileScaler
20
+
21
+ # Draft version
22
+
23
+
24
class twoblock(
    _BaseComposition,
    BaseEstimator,
    TransformerMixin,
    RegressorMixin,
    MultiOutputMixin,
):
    """
    TWOBLOCK Two-Block Simultaneous Dimension Reduction of Multivariate X and Y
    data blocks

    Parameters
    -----------

    n_components_x : int, min 1. Note that if applied on data,
        n_components_x shall take a value <= min(x_data.shape)

    n_components_y : int, min 1. Note that if applied on data,
        n_components_y shall take a value <= min(y_data.shape)
        If unspecified, set to equal n_components_x

    verbose: Boolean (def true)
        to print intermediate set of columns retained

    centre : str,
        type of centring (`'mean'` [recommended], `'median'` or `'l1median'`),

    scale : str,
        type of scaling ('std','mad' or 'None')

    copy : (def True): boolean,
        whether to copy data into twoblock object.


    Attributes
    ------------
    Attributes always provided:

    - `x_weights_`: X block PLS weighting vectors (usually denoted W)
    - `y_weights_`: Y block PLS weighting vectors (usually denoted V)
    - `x_loadings_`: X block PLS loading vectors (usually denoted P)
    - `y_loadings_`: Y block PLS loading vectors (usually denoted Q)
    - `x_scores_`: X block PLS score vectors (usually denoted T)
    - `y_scores_`: Y block PLS score vectors (usually denoted U)
    - `coef_`: vector of regression coefficients
    - `intercept_`: intercept
    - `coef_scaled_`: vector of scaled regression coefficients (when scaling option used)
    - `intercept_scaled_`: scaled intercept
    - `residuals_`: vector of regression residuals
    - `fitted_`: fitted response
    - `x_loc_`: X block location estimate
    - `y_loc_`: y location estimate
    - `x_sca_`: X block scale estimate
    - `y_sca_`: y scale estimate
    - `centring_`: scaling object used internally (type: `VersatileScaler`)


    Reference
    ---------
    Cook, R. Dennis, Liliana Forzani, and Lan Liu.
    "Partial least squares for simultaneous reduction of response and predictor
    vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.

    """

    def __init__(
        self,
        n_components_x=1,
        n_components_y=None,
        verbose=True,
        centre="mean",
        scale="None",
        copy=True,
    ):
        # Hyperparameters only; all estimation happens in fit (sklearn
        # convention).
        self.n_components_x = n_components_x
        self.n_components_y = n_components_y
        self.verbose = verbose
        self.centre = centre
        self.scale = scale
        self.copy = copy

    def fit(self, X, Y):
        """
        Fit a Twoblock model.

        Parameters
        ------------

        X : numpy array or Pandas data frame
            Predictor data.

        Y : numpy array or Pandas data frame
            Response data

        Returns
        -------
        twoblock class object containing the estimated parameters.

        """

        # Accept pandas input; work on plain numpy internally.
        if type(X) == ps.core.frame.DataFrame:
            X = X.to_numpy()
        (n, p) = X.shape
        if type(Y) in [ps.core.frame.DataFrame, ps.core.series.Series]:
            Y = Y.to_numpy()
        X = _check_input(X)
        Y = _check_input(Y)
        ny, q = Y.shape
        if ny != n:
            raise (ValueError("Number of cases in X and Y needs to agree"))

        Y = Y.astype("float64")

        # NOTE(review): mutating a constructor parameter inside fit
        # deviates from the sklearn convention of storing derived values
        # in trailing-underscore attributes.
        if self.n_components_y is None:
            self.n_components_y = self.n_components_x

        # Component counts cannot exceed the rank of the Gram matrices,
        # nor n - 1.
        assert self.n_components_x <= min(
            np.linalg.matrix_rank(np.matmul(X.T, X)), n - 1
        ), "Number of components cannot exceed covariance rank or number of cases"

        assert self.n_components_y <= min(
            np.linalg.matrix_rank(np.matmul(Y.T, Y)), n - 1
        ), "Number of components cannot exceed covariance rank or number of cases"

        # Optionally keep untouched copies of the training data; predict
        # relies on self.X being stored (see NOTE in predict).
        if self.copy:
            X0 = copy.deepcopy(X)
            Y0 = copy.deepcopy(Y)
        else:
            X0 = X
            Y0 = Y
        if self.copy:
            self.X = X0
            self.Y = Y0
        X0 = X0.astype("float64")
        # One scaler is fitted first on X, then re-fitted on Y; the
        # per-block locations/scales are captured in mX/sX and my/sy.
        centring = VersatileScaler(
            center=self.centre, scale=self.scale, trimming=0
        )
        X0 = centring.fit_transform(X0).astype("float64")
        mX = centring.col_loc_
        sX = centring.col_sca_
        Y0 = centring.fit_transform(Y0).astype("float64")
        my = centring.col_loc_
        sy = centring.col_sca_

        self.x_scores_ = np.empty((n, self.n_components_x), float)
        self.y_scores_ = np.empty((n, self.n_components_y), float)
        self.x_weights_ = np.empty((p, self.n_components_x), float)
        self.y_weights_ = np.empty((q, self.n_components_y), float)
        self.x_loadings_ = np.empty((p, self.n_components_x), float)
        self.y_loadings_ = np.empty((q, self.n_components_y), float)

        Xh = copy.deepcopy(X0)
        Yh = copy.deepcopy(Y0)

        # X-block loop: each weight vector is the dominant left singular
        # vector of the current cross-covariance with (undeflated) Y0;
        # the X block is deflated after each component.
        for i in range(self.n_components_x):

            sXY = np.dot(Xh.T, Y0) / n
            u, _, _ = np.linalg.svd(sXY)
            x_weights = u[:, 0].reshape((p,))
            x_scores = np.dot(Xh, x_weights)
            x_loadings = np.dot(Xh.T, x_scores) / np.dot(x_scores, x_scores)

            Xh -= np.outer(x_scores, x_loadings)

            self.x_weights_[:, i] = x_weights
            self.x_scores_[:, i] = x_scores
            self.x_loadings_[:, i] = x_loadings

        # Y-block loop, symmetric to the X-block loop.
        for i in range(self.n_components_y):

            sYX = np.dot(Yh.T, X0) / n

            v, _, _ = np.linalg.svd(sYX)
            y_weights = v[:, 0].reshape((q,))
            y_scores = np.dot(Yh, y_weights)
            y_loadings = np.dot(Yh.T, y_scores) / np.dot(y_scores, y_scores)

            Yh -= np.outer(y_scores, y_loadings)

            self.y_weights_[:, i] = y_weights
            self.y_scores_[:, i] = y_scores
            self.y_loadings_[:, i] = y_loadings

        # Regression coefficients in the scaled coordinate system:
        # B = W (T'T)^{-1} T' (Y0 V) V'  with T = X0 W.
        wtx = np.dot(X0, self.x_weights_)
        wti = np.linalg.inv(np.dot(wtx.T, wtx))
        swg = np.dot(wtx.T, np.dot(Y0, self.y_weights_))
        self.coef_scaled_ = np.matmul(
            np.matmul(self.x_weights_, wti), np.dot(swg, self.y_weights_.T)
        )

        # Undo the scaling so coef_ applies to the raw data.
        if self.centre == "None" and self.scale == "None":
            B_rescaled = self.coef_scaled_
        else:
            # sklearn has this line wrong
            B_rescaled = np.multiply(
                np.outer(sy, np.divide(1, sX)).T, self.coef_scaled_
            )

        Yp_rescaled = np.matmul(X, B_rescaled)
        # Intercept estimator matches the centring choice.
        if self.centre == "None":
            intercept = 0
        elif self.centre == "mean":
            intercept = np.mean(Y - Yp_rescaled, axis=0)
        else:
            intercept = np.median(Y - Yp_rescaled, axis=0)

        Yfit = Yp_rescaled + intercept
        R = Y - Yfit

        setattr(self, "coef_", B_rescaled)
        setattr(self, "intercept_", intercept)
        setattr(self, "fitted_", Yfit)
        setattr(self, "residuals_", R)
        setattr(self, "x_loc_", mX)
        setattr(self, "y_loc_", my)
        setattr(self, "x_sca_", sX)
        setattr(self, "y_sca_", sy)
        setattr(self, "centring_", centring)
        return self

    def predict(self, Xn):
        """
        Predict cases.

        Parameters
        ------------

        Xn : numpy array or data frame
            Input data.  Must have the same number of columns as the
            training data.

        """
        # NOTE(review): the column check relies on self.X, which is only
        # stored when copy=True — confirm predict after fit(copy=False).
        n, p, Xn = _predict_check_input(Xn)
        if p != self.X.shape[1]:
            raise (
                ValueError(
                    "New data must have same number of columns as the ones the model has been trained with"
                )
            )
        return np.matmul(Xn, self.coef_) + self.intercept_
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Mon Apr 13 16:08:22 2020
5
+
6
+ @author: sven
7
+ """
8
+
9
+ import pandas as ps
10
+ import numpy as np
11
+
12
+
13
def convert_X_input(X):
    """Return X as a float64 ndarray when given a pandas DataFrame;
    any other input is passed through unchanged."""

    is_frame = type(X) == ps.core.frame.DataFrame
    return X.to_numpy().astype("float64") if is_frame else X
18
+
19
+
20
def convert_y_input(y):
    """Return y as a transposed float64 ndarray when given a pandas
    DataFrame or Series; any other input is passed through unchanged."""

    if type(y) in (ps.core.frame.DataFrame, ps.core.series.Series):
        return y.to_numpy().T.astype("float64")
    return y
25
+
26
+
27
def const_xscale(beta, *args):
    """
    Scaling constraint in the x metric: entry (i, j) of
    beta' Cov(X) beta - I, where ``beta`` is passed flattened
    (column-major) with h columns.  args = (X, h, i, j).
    """
    X, h, i, j = args[:4]
    B = np.reshape(beta, (-1, h), order="F")
    covx = np.cov(X, rowvar=False)
    constraint = np.matmul(np.matmul(B.T, covx), B) - np.identity(h)
    return constraint[i, j]
36
+
37
+
38
def const_zscale(beta, *args):
    """
    Scaling constraint in the identity metric: entry (i, j) of
    beta' beta - I, where ``beta`` is passed flattened (column-major)
    with h columns.  args = (X, h, i, j); X only supplies the dimension.
    """
    X, h, i, j = args[:4]
    B = np.reshape(beta, (-1, h), order="F")
    covx = np.identity(X.shape[1])
    constraint = np.matmul(np.matmul(B.T, covx), B) - np.identity(h)
    return constraint[i, j]
47
+
48
+
49
+ def _predict_check_input(Xn):
50
+ if type(Xn) == ps.core.series.Series:
51
+ Xn = Xn.to_numpy()
52
+ if Xn.ndim == 1:
53
+ Xn = Xn.reshape((1, -1))
54
+ if type(Xn) == ps.core.frame.DataFrame:
55
+ Xn = Xn.to_numpy()
56
+ n, p = Xn.shape
57
+ return (n, p, Xn)
58
+
59
+
60
+ def _check_input(X):
61
+
62
+ if type(X) in (np.matrix, ps.core.frame.DataFrame, ps.core.series.Series):
63
+ X = np.array(X)
64
+
65
+ if X.dtype == np.dtype("O"):
66
+ X = X.astype("float64")
67
+
68
+ if X.ndim == 1:
69
+ X = X.reshape((1, -1))
70
+
71
+ n, p = X.shape
72
+
73
+ if n == 1:
74
+ if p >= 2:
75
+ X = X.reshape((-1, 1))
76
+ return X
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.1
2
+ Name: twoblock
3
+ Version: 0.0.1
4
+ Summary: A Scikit-Learn Compatible Library for Simultaneous Two-Block Sufficient Dimension Reduction Methods
5
+ Home-page: https://github.com/SvenSerneels/twoblock
6
+ Author: Sven Serneels
7
+ Author-email: svenserneels@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=1.22.0
14
+ Requires-Dist: scipy>=1.8.0
15
+ Requires-Dist: scikit-learn>=1.3.0
16
+ Requires-Dist: pandas>=1.4.0
17
+
18
+ # twoblock
19
+ Two-block simultaneous dimension reduction
20
+
21
+ A scikit-learn compatible implementation of simultaneous two-block dimension reduction as proposed in [1].
22
+
23
+ References
24
+ ----------
25
+ [1] Cook, R. Dennis, Liliana Forzani, and Lan Liu.
26
+ "Partial least squares for simultaneous reduction of response and predictor
27
+ vectors in regression." Journal of Multivariate Analysis 196 (2023): 105163.
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ README.md
3
+ setup.cfg
4
+ setup.py
5
+ src/twoblock/__init__.py
6
+ src/twoblock/_preproc_utilities.py
7
+ src/twoblock/robcent.py
8
+ src/twoblock/test_twoblock.py
9
+ src/twoblock/twoblock.py
10
+ src/twoblock/utils.py
11
+ src/twoblock.egg-info/PKG-INFO
12
+ src/twoblock.egg-info/SOURCES.txt
13
+ src/twoblock.egg-info/dependency_links.txt
14
+ src/twoblock.egg-info/requires.txt
15
+ src/twoblock.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ numpy>=1.22.0
2
+ scipy>=1.8.0
3
+ scikit-learn>=1.3.0
4
+ pandas>=1.4.0
@@ -0,0 +1 @@
1
+ twoblock