robust-mixed-dist 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- robust_mixed_dist/__init__.py +0 -0
- robust_mixed_dist/binary.py +110 -0
- robust_mixed_dist/mixed.py +776 -0
- robust_mixed_dist/multiclass.py +57 -0
- robust_mixed_dist/quantitative.py +666 -0
- robust_mixed_dist-0.1.0.dist-info/METADATA +27 -0
- robust_mixed_dist-0.1.0.dist-info/RECORD +10 -0
- robust_mixed_dist-0.1.0.dist-info/WHEEL +5 -0
- robust_mixed_dist-0.1.0.dist-info/licenses/LICENSE +19 -0
- robust_mixed_dist-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,776 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from robust_mixed_dist.quantitative import (euclidean_dist_matrix, euclidean_dist, minkowski_dist_matrix,
|
|
5
|
+
minkowski_dist, canberra_dist_matrix, canberra_dist, pearson_dist_matrix,
|
|
6
|
+
mahalanobis_dist_matrix, mahalanobis_dist, robust_maha_dist_matrix, robust_maha_dist, S_robust)
|
|
7
|
+
from robust_mixed_dist.binary import sokal_dist_matrix, sokal_dist, jaccard_dist_matrix, jaccard_dist
|
|
8
|
+
from robust_mixed_dist.multiclass import hamming_dist_matrix, hamming_dist
|
|
9
|
+
|
|
10
|
+
################################################################################
|
|
11
|
+
|
|
12
|
+
def get_dist_matrix_functions():
    """
    Build the registry mapping distance names to distance-matrix functions.

    Returns:
        dist_matrix: a dict mapping each supported distance name to the function
        that computes the full pairwise distance matrix for a data matrix.
    """
    dist_matrix = {}
    dist_matrix['euclidean'] = euclidean_dist_matrix
    dist_matrix['minkowski'] = minkowski_dist_matrix
    dist_matrix['canberra'] = canberra_dist_matrix
    dist_matrix['pearson'] = pearson_dist_matrix
    dist_matrix['mahalanobis'] = mahalanobis_dist_matrix
    dist_matrix['robust_mahalanobis'] = robust_maha_dist_matrix
    dist_matrix['sokal'] = sokal_dist_matrix
    dist_matrix['jaccard'] = jaccard_dist_matrix
    dist_matrix['hamming'] = hamming_dist_matrix
    # 'matching' is the name documented (and defaulted) for the multi-class
    # distance d3 throughout this module; alias it to the Hamming
    # implementation so the default d3='matching' does not raise a KeyError.
    dist_matrix['matching'] = hamming_dist_matrix

    return dist_matrix
|
|
26
|
+
|
|
27
|
+
################################################################################
|
|
28
|
+
|
|
29
|
+
def get_dist_functions():
    """
    Build the registry mapping distance names to per-pair distance functions.

    Note: 'pearson' has no per-pair counterpart here (only a matrix version
    exists in `get_dist_matrix_functions`).

    Returns:
        dist: a dict mapping each supported distance name to the function that
        computes the distance between two observation vectors.
    """
    dist = {}
    dist['euclidean'] = euclidean_dist
    dist['minkowski'] = minkowski_dist
    dist['canberra'] = canberra_dist
    dist['mahalanobis'] = mahalanobis_dist
    dist['robust_mahalanobis'] = robust_maha_dist
    dist['sokal'] = sokal_dist
    dist['jaccard'] = jaccard_dist
    dist['hamming'] = hamming_dist
    # 'matching' is the name documented (and defaulted) for the multi-class
    # distance d3 throughout this module; alias it to the Hamming
    # implementation so the default d3='matching' does not raise a KeyError.
    dist['matching'] = hamming_dist

    return dist
|
|
42
|
+
|
|
43
|
+
################################################################################
|
|
44
|
+
|
|
45
|
+
def vg(D_2):
    """
    Compute the geometric variability of a squared distance matrix.

    Parameters (inputs)
    ----------
    D_2: a numpy array. It should represent a squared distance matrix.

    Returns (outputs)
    -------
    The geometric variability of the squared distance matrix `D_2`.
    """
    # TO DO: version managing weights
    n_obs = len(D_2)
    return np.sum(D_2) / (2 * n_obs**2)
|
|
61
|
+
|
|
62
|
+
################################################################################
|
|
63
|
+
|
|
64
|
+
def get_dist_matrices(X, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1,
                      robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20, weights=None):
    """
    Calculates the distance matrices that are involved in the Generalized Gower distance.

    Parameters:
        X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
        d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
        d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
        d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
        q: the parameter that defines the Minkowski distance. Must be a positive integer.
        robust_method: the method used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        alpha: trimming/winsorizing proportion for the robust covariance. Only needed when d1 = 'robust_mahalanobis'.
        n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
        weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.

    Returns:
        D1, D2, D3: the distance matrices associated to the quantitative, binary and multi-class variables, respectively.
    """
    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    dist_matrix = get_dist_matrix_functions()

    # The documented multi-class distance name is 'matching', which is
    # implemented as the Hamming distance; normalize the key so the default
    # d3='matching' does not raise a KeyError against the registry.
    if d3 == 'matching' and d3 not in dist_matrix:
        d3 = 'hamming'

    n = len(X)
    X_quant = X[:, 0:p1]
    X_bin = X[:, p1:p1 + p2]
    X_multi = X[:, p1 + p2:p1 + p2 + p3]

    # Define D1 based on d1 and p1
    if p1 > 0:
        if d1 == 'minkowski':
            D1 = dist_matrix[d1](X_quant, q)
        elif d1 == 'robust_mahalanobis':
            S_robust_ = S_robust(X=X_quant, method=robust_method, alpha=alpha,
                                 epsilon=epsilon, n_iters=n_iters,
                                 weights=weights)
            D1 = dist_matrix[d1](X_quant, S_robust=S_robust_)
        else:
            D1 = dist_matrix[d1](X_quant)
    else:
        D1 = np.zeros((n, n))
    # Define D2 based on p2
    D2 = dist_matrix[d2](X_bin) if p2 > 0 else np.zeros((n, n))
    # Define D3 based on p3
    D3 = dist_matrix[d3](X_multi) if p3 > 0 else np.zeros((n, n))

    return D1, D2, D3
|
|
114
|
+
|
|
115
|
+
################################################################################
|
|
116
|
+
|
|
117
|
+
def get_distances(xi, xr, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, S=None, S_robust=None):
    """
    Calculates the distances between observations that are involved in the Generalized Gower distance.

    Parameters:
        xi, xr: a pair of observation vectors (numpy arrays, pandas/polars Series, or one-row data-frames).
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
        d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
        d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
        d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
        q: the parameter that defines the Minkowski distance. Must be a positive integer.
        S: the covariance matrix of the considered data matrix. Only used when d1 = 'mahalanobis'.
        S_robust: the robust covariance matrix of the considered data matrix. Only used when d1 = 'robust_mahalanobis'.

    Returns:
        dist1, dist2, dist3: the distance values associated to the quantitative, binary and multi-class parts, respectively.
    """
    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    dist = get_dist_functions()

    # The documented multi-class distance name is 'matching', implemented as
    # Hamming; normalize the key so the default d3='matching' does not raise
    # a KeyError against the registry.
    if d3 == 'matching' and d3 not in dist:
        d3 = 'hamming'

    xi_quant = xi[0:p1]; xr_quant = xr[0:p1]
    xi_bin = xi[p1:p1 + p2]; xr_bin = xr[p1:p1 + p2]
    xi_multi = xi[p1 + p2:p1 + p2 + p3]; xr_multi = xr[p1 + p2:p1 + p2 + p3]

    if p1 > 0:
        if d1 == 'minkowski':
            dist1 = dist[d1](xi_quant, xr_quant, q=q)
        elif d1 == 'robust_mahalanobis':
            dist1 = dist[d1](xi_quant, xr_quant, S_robust=S_robust)
        elif d1 == 'mahalanobis':
            dist1 = dist[d1](xi_quant, xr_quant, S=S)
        else:
            dist1 = dist[d1](xi_quant, xr_quant)
    else:
        dist1 = 0

    dist2 = dist[d2](xi_bin, xr_bin) if p2 > 0 else 0
    dist3 = dist[d3](xi_multi, xr_multi) if p3 > 0 else 0

    return dist1, dist2, dist3
|
|
166
|
+
|
|
167
|
+
################################################################################
|
|
168
|
+
|
|
169
|
+
def vg_ggower_estimation(X, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching',
                         q=1, robust_method='trimmed', epsilon=0.05, alpha=0.05,
                         n_iters=20, weights=None):
    """
    Calculates the geometric variability of a Generalized Gower distance matrix.

    Parameters:
        X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
        d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
        d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
        d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
        q: the parameter that defines the Minkowski distance. Must be a positive integer.
        robust_method: the method used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        alpha: trimming/winsorizing proportion for the robust covariance. Only needed when d1 = 'robust_mahalanobis'.
        n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
        weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.

    Returns:
        VG1, VG2, VG3: the geometric variabilities of the distance matrices associated to the quantitative, binary and multi-class variables, respectively.
    """
    D1, D2, D3 = get_dist_matrices(X=X, p1=p1, p2=p2, p3=p3, d1=d1, d2=d2, d3=d3,
                                   q=q, robust_method=robust_method, epsilon=epsilon,
                                   alpha=alpha, n_iters=n_iters, weights=weights)

    # Geometric variability is defined on the squared distance matrices.
    return vg(D1**2), vg(D2**2), vg(D3**2)
|
|
199
|
+
|
|
200
|
+
################################################################################
|
|
201
|
+
|
|
202
|
+
def vg_ggower_fast_estimation(X, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching',
                              robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20, q=1,
                              VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
    """
    Calculates a fast estimation of the geometric variability of a squared Generalized Gower distance matrix.

    The estimate is the average of the geometric variabilities computed over
    `VG_n_samples` random subsamples (drawn with replacement) of size
    `VG_sample_size`.

    Parameters:
        X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
        d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
        d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
        d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
        q: the parameter that defines the Minkowski distance. Must be a positive integer.
        robust_method: the method used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        alpha: trimming/winsorizing proportion for the robust covariance. Only needed when d1 = 'robust_mahalanobis'.
        n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
        weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
        VG_sample_size: sample size to be used to make the estimation of the geometric variability.
        VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
        random_state: the random seed used for the (random) sample elements.

    Returns:
        VG1, VG2, VG3: the estimated geometric variabilities of the distance matrices associated to the quantitative, binary and multi-class variables, respectively.
    """
    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    all_indices = np.arange(len(X))
    vg1_estimates, vg2_estimates, vg3_estimates = [], [], []

    for sample_id in range(VG_n_samples):
        # Reseed per sample so each bootstrap draw is individually reproducible.
        np.random.seed(random_state + sample_id)
        chosen = np.random.choice(all_indices, size=VG_sample_size)
        X_sample = X[chosen, :].copy()
        sample_weights = weights[chosen].copy() if weights is not None else None

        vg1, vg2, vg3 = vg_ggower_estimation(X=X_sample, p1=p1, p2=p2, p3=p3, d1=d1, d2=d2, d3=d3,
                                             q=q, robust_method=robust_method, epsilon=epsilon,
                                             alpha=alpha, n_iters=n_iters, weights=sample_weights)
        vg1_estimates.append(vg1)
        vg2_estimates.append(vg2)
        vg3_estimates.append(vg3)

    return np.mean(vg1_estimates), np.mean(vg2_estimates), np.mean(vg3_estimates)
|
|
254
|
+
|
|
255
|
+
################################################################################
|
|
256
|
+
|
|
257
|
+
class GGowerDistMatrix:
    """
    Calculates the Generalized Gower matrix for a data matrix.
    """

    def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20,
                 fast_VG=False, VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
        """
        Constructor method.

        Parameters:
            p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
            d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
            d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
            d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
            q: the parameter that defines the Minkowski distance. Must be a positive integer.
            robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
            epsilon: parameter used by the Delvin transformation. epsilon=0.05 is recommended. Only needed when d1 = 'robust_mahalanobis'.
            n_iters: maximum number of iterations run by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
            weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
            fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
            VG_sample_size: sample size to be used to make the estimation of the geometric variability.
            VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
            random_state: the random seed used for the (random) sample elements.
        """
        self.p1 = p1; self.p2 = p2; self.p3 = p3
        self.d1 = d1; self.d2 = d2; self.d3 = d3
        self.q = q; self.robust_method = robust_method; self.alpha = alpha
        self.epsilon = epsilon; self.n_iters = n_iters
        self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples
        self.random_state = random_state; self.fast_VG = fast_VG; self.weights = weights

    def compute(self, X):
        """
        Compute method.

        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a data matrix.

        Returns:
            D: the Generalized Gower matrix for the data matrix `X`.
        """
        D1, D2, D3 = get_dist_matrices(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
                                       d1=self.d1, d2=self.d2, d3=self.d3,
                                       q=self.q, robust_method=self.robust_method, epsilon=self.epsilon,
                                       alpha=self.alpha, n_iters=self.n_iters, weights=self.weights)

        D1_2 = D1**2; D2_2 = D2**2; D3_2 = D3**2

        if self.fast_VG:
            # Forward q, epsilon and n_iters too, so the fast estimation runs
            # with the same configuration as the full distance computation
            # (previously the estimator silently fell back to its defaults).
            VG1, VG2, VG3 = vg_ggower_fast_estimation(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
                                                      d1=self.d1, d2=self.d2, d3=self.d3, q=self.q,
                                                      robust_method=self.robust_method, alpha=self.alpha,
                                                      epsilon=self.epsilon, n_iters=self.n_iters,
                                                      VG_sample_size=self.VG_sample_size, VG_n_samples=self.VG_n_samples,
                                                      random_state=self.random_state, weights=self.weights)
        else:
            VG1, VG2, VG3 = vg(D1_2), vg(D2_2), vg(D3_2)

        # Standardize each squared-distance block by its geometric variability;
        # a non-positive VG (e.g. a constant block) leaves the block unscaled.
        D1_std = D1_2/VG1 if VG1 > 0 else D1_2
        D2_std = D2_2/VG2 if VG2 > 0 else D2_2
        D3_std = D3_2/VG3 if VG3 > 0 else D3_2
        D_2 = D1_std + D2_std + D3_std
        D = np.sqrt(D_2)

        return D
|
|
324
|
+
|
|
325
|
+
################################################################################
|
|
326
|
+
|
|
327
|
+
class GGowerDist:
    """
    Calculates the Generalized Gower distance for a pair of data observations.
    """

    def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20,
                 VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
        """
        Constructor method.

        Parameters:
            p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
            d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
            d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
            d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
            q: the parameter that defines the Minkowski distance. Must be a positive integer.
            robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
            epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
            weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
            VG_sample_size: sample size to be used to make the estimation of the geometric variability.
            VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
            random_state: the random seed used for the (random) sample elements.
        """
        self.p1 = p1; self.p2 = p2; self.p3 = p3
        self.d1 = d1; self.d2 = d2; self.d3 = d3
        self.q = q; self.robust_method = robust_method; self.alpha = alpha
        self.epsilon = epsilon; self.n_iters = n_iters
        self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples
        self.random_state = random_state; self.weights = weights

    def fit(self, X):
        """
        Fit method that estimates the geometric variabilities and, if needed,
        the (robust) covariance matrix later used by `compute`.

        Must be called before `compute`.

        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a data matrix.

        Returns:
            None. Sets `self.S`, `self.S_robust`, `self.VG1`, `self.VG2`, `self.VG3`.
        """
        p1 = self.p1; p2 = self.p2; p3 = self.p3
        d1 = self.d1; d2 = self.d2; d3 = self.d3
        self.S, self.S_robust = None, None

        if d1 in ['robust_mahalanobis', 'mahalanobis']:

            if isinstance(X, (pl.DataFrame, pd.DataFrame)):
                X = X.to_numpy()

            X_quant = X[:, 0:p1]

            if d1 == 'robust_mahalanobis':
                self.S_robust = S_robust(X=X_quant, method=self.robust_method, alpha=self.alpha,
                                         epsilon=self.epsilon, n_iters=self.n_iters, weights=self.weights)
            elif d1 == 'mahalanobis':
                self.S = np.cov(X_quant, rowvar=False)

        # Forward q as well, so the estimation uses the configured Minkowski
        # parameter (previously the estimator silently used its default q=1).
        self.VG1, self.VG2, self.VG3 = vg_ggower_fast_estimation(X=X, p1=p1, p2=p2, p3=p3, d1=d1, d2=d2, d3=d3,
                                                                 q=self.q, robust_method=self.robust_method,
                                                                 alpha=self.alpha, epsilon=self.epsilon, n_iters=self.n_iters,
                                                                 VG_sample_size=self.VG_sample_size, VG_n_samples=self.VG_n_samples,
                                                                 random_state=self.random_state, weights=self.weights)

    def compute(self, xi, xr):
        """
        Compute method. Requires `fit` to have been called first.

        Parameters:
            xi, xr: a pair of observation vectors. They represent a couple of statistical observations.

        Returns:
            dist: the Generalized Gower distance between the observations `xi` and `xr`.
        """
        dist1, dist2, dist3 = get_distances(xi=xi, xr=xr, p1=self.p1, p2=self.p2, p3=self.p3,
                                            d1=self.d1, d2=self.d2, d3=self.d3,
                                            q=self.q, S=self.S, S_robust=self.S_robust)

        dist1_2 = dist1**2; dist2_2 = dist2**2; dist3_2 = dist3**2
        # Standardize each squared distance by its fitted geometric
        # variability; a non-positive VG leaves the component unscaled.
        dist1_2_std = dist1_2/self.VG1 if self.VG1 > 0 else dist1_2
        dist2_2_std = dist2_2/self.VG2 if self.VG2 > 0 else dist2_2
        dist3_2_std = dist3_2/self.VG3 if self.VG3 > 0 else dist3_2
        dist_2 = dist1_2_std + dist2_2_std + dist3_2_std
        dist = np.sqrt(dist_2)

        return dist
|
|
412
|
+
|
|
413
|
+
################################################################################
|
|
414
|
+
|
|
415
|
+
def ggower_dist(xi, xr, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching',
                q=1, S=None, S_robust=None, VG1=None, VG2=None, VG3=None):
    """
    Calculates the Generalized Gower distance between two observations.

    Parameters:
        xi, xr: a pair of observation vectors. They represent a couple of statistical observations.
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
        d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
        d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
        d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
        q: the parameter that defines the Minkowski distance. Must be a positive integer.
        S: the covariance matrix of the considered data matrix. Only used when d1 = 'mahalanobis'.
        S_robust: the robust covariance matrix of the considered data matrix. Only used when d1 = 'robust_mahalanobis'.
        VG1, VG2, VG3: geometric variabilities used to standardize the squared
            distances. If None or non-positive, the corresponding component is
            left unstandardized.

    Returns:
        dist: the Generalized Gower distance between the observations `xi` and `xr`.
    """
    dist1, dist2, dist3 = get_distances(xi=xi, xr=xr, p1=p1, p2=p2, p3=p3,
                                        d1=d1, d2=d2, d3=d3,
                                        q=q, S=S, S_robust=S_robust)

    dist1_2 = dist1**2; dist2_2 = dist2**2; dist3_2 = dist3**2
    # Guard against the default VG=None: `None > 0` raises TypeError in Python 3.
    dist1_2_std = dist1_2/VG1 if VG1 is not None and VG1 > 0 else dist1_2
    dist2_2_std = dist2_2/VG2 if VG2 is not None and VG2 > 0 else dist2_2
    dist3_2_std = dist3_2/VG3 if VG3 is not None and VG3 > 0 else dist3_2
    dist_2 = dist1_2_std + dist2_2_std + dist3_2_std
    dist = np.sqrt(dist_2)

    return dist
|
|
430
|
+
|
|
431
|
+
################################################################################
|
|
432
|
+
|
|
433
|
+
def simple_gower_dist(xi, xr, X, p1, p2, p3):
    """
    Calculates the Simple Gower distance between two observations.

    Parameters:
        xi, xr: a pair of observation vectors. They represent a couple of statistical observations.
        X: a pandas/polars data-frame or a numpy array. It represents a data matrix (used to compute the per-column ranges of the quantitative block).
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.

    Returns:
        dist: the Simple Gower distance between the observations `xi` and `xr`.
    """
    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()
    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    # Bug fix: this branch previously tested `xi` instead of `xr`, so a
    # Series passed as `xr` was never converted to a numpy array.
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    dist_fns = get_dist_functions()

    X_quant = X[:, 0:p1]
    xi_quant = xi[0:p1]; xr_quant = xr[0:p1]
    xi_bin = xi[p1:p1 + p2]; xr_bin = xr[p1:p1 + p2]
    xi_multi = xi[p1 + p2:p1 + p2 + p3]; xr_multi = xr[p1 + p2:p1 + p2 + p3]

    if p1 > 0:
        R = np.max(X_quant, axis=0) - np.min(X_quant, axis=0)
        # Avoid division by zero for constant columns (consistent with
        # simple_gower_dist_matrix).
        R = np.where(R == 0, 1, R)
        dist1 = np.sum(np.abs(xi_quant - xr_quant)/R)
    else:
        dist1 = 0
    dist2 = dist_fns['jaccard'](xi_bin, xr_bin) if p2 > 0 else 0
    dist3 = dist_fns['hamming'](xi_multi, xr_multi) if p3 > 0 else 0
    dist = dist1 + dist2 + dist3

    return dist
|
|
471
|
+
|
|
472
|
+
################################################################################
|
|
473
|
+
|
|
474
|
+
# NOTE: legacy O(n^2) pairwise implementation of `simple_gower_dist_matrix`,
# deliberately disabled inside a string literal and kept for reference only.
# The vectorized implementation below supersedes it.
'''
def simple_gower_dist_matrix(X, p1, p2, p3):

    if isinstance(X, (pl.DataFrame, pd.DataFrame)) :
        X = X.to_numpy()

    D = np.zeros((len(X), len(X)))

    for i in range(len(X)):
        for r in range(len(X)):
            if i <= r:
                D[i,r] = simple_gower_dist(xi=X[i,:], xr=X[r,:], X=X,
                                           p1=p1, p2=p2, p3=p3)

    D = D + D.T - np.diag(D.diagonal())

    return D
'''
|
|
492
|
+
|
|
493
|
+
def simple_gower_dist_matrix(X, p1, p2, p3):
    """
    Vectorized computation of the Simple Gower distance between all rows of X.

    Parameters:
        X: a numpy array or a pandas/polars data-frame (converted to a numpy array).
        p1: number of quantitative columns.
        p2: number of binary columns.
        p3: number of multi-class (categorical) columns.

    Returns:
        D: an (n x n) matrix with the simple Gower distance between observations.
    """
    # Convert the data-frame if necessary.
    if isinstance(X, (pd.DataFrame, pl.DataFrame)):
        X = X.to_numpy()

    registry = get_dist_matrix_functions()

    n = X.shape[0]
    D = np.zeros((n, n))

    # Quantitative block: Manhattan distance after scaling each column by its range.
    if p1 > 0:
        X_quant = X[:, 0:p1]
        col_range = np.max(X_quant, axis=0) - np.min(X_quant, axis=0)
        col_range[col_range == 0] = 1  # avoid division by zero
        D += registry['minkowski'](X_quant / col_range, q=1)

    # Binary block: Jaccard distance.
    if p2 > 0:
        D += registry['jaccard'](X[:, p1:p1 + p2])

    # Multi-class block: Hamming (simple matching) distance.
    if p3 > 0:
        D += registry['hamming'](X[:, p1 + p2:p1 + p2 + p3])

    return D
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
################################################################################
|
|
543
|
+
|
|
544
|
+
class RelMSDistMatrix:
|
|
545
|
+
"""
|
|
546
|
+
Calculates the Related Metric Scaling matrix for a data matrix.
|
|
547
|
+
"""
|
|
548
|
+
|
|
549
|
+
def __init__(self, p1,p2,p3,d1='euclidean',d2='sokal',d3='matching',q=1, robust_method='trimmed',
|
|
550
|
+
epsilon=0.05, alpha=0.05, n_iters=20, weights=None,
|
|
551
|
+
fast_VG=False, VG_sample_size=300, VG_n_samples=5, random_state=123):
|
|
552
|
+
"""
|
|
553
|
+
Constructor method.
|
|
554
|
+
|
|
555
|
+
Parameters:
|
|
556
|
+
p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
|
|
557
|
+
d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
|
|
558
|
+
d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
|
|
559
|
+
d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
|
|
560
|
+
q: the parameter that defines the Minkowski distance. Must be a positive integer.
|
|
561
|
+
robust_method: the robust_method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
|
|
562
|
+
epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
|
|
563
|
+
n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
|
|
564
|
+
weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
|
|
565
|
+
"""
|
|
566
|
+
self.p1 = p1 ; self.p2 = p2 ; self.p3 = p3
|
|
567
|
+
self.d1 = d1 ; self.d2 = d2 ; self.d3 = d3
|
|
568
|
+
self.q = q ; self.robust_method = robust_method ; self.alpha = alpha ; self.fast_VG = fast_VG;
|
|
569
|
+
self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples; self.random_state = random_state;
|
|
570
|
+
self.epsilon = epsilon ; self.n_iters = n_iters ; self.weights = weights
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def compute(self, X, tol=1e-6, d=2.5, Gs_PSD_transformation=True):
    """
    Compute method: builds the Related Metric Scaling (RelMS) distance matrix for `X`.

    Parameters:
    X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
    tol: a tolerance value to round the close-to-zero eigenvalues of the Gramm matrices.
    Gs_PSD_transformation: controls if a transformation is applied to enforce positive semi-definite Gramm matrices.
    d: a parameter that controls the omega definition involved in the transformation mentioned above.

    Returns:
    D: the Related Metric Scaling matrix for the data matrix `X`.
    """
    # One distance matrix per variable type: quantitative (D1), binary (D2), multi-class (D3).
    D1, D2, D3 = get_dist_matrices(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
                                   d1=self.d1, d2=self.d2, d3=self.d3,
                                   q=self.q, robust_method=self.robust_method, epsilon=self.epsilon,
                                   alpha=self.alpha, n_iters=self.n_iters, weights=self.weights)

    # Element-wise squared distances; the rest of the pipeline works on squared distances.
    D1_2 = D1**2 ; D2_2 = D2**2 ; D3_2 = D3**2

    # Geometric variability of each distance matrix: either a fast sample-based
    # estimate or the exact value computed from the full squared-distance matrix.
    if self.fast_VG == True:
        VG1, VG2, VG3 = vg_ggower_fast_estimation(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
                                                  d1=self.d1, d2=self.d2, d3=self.d3,
                                                  robust_method=self.robust_method, alpha=self.alpha,
                                                  VG_sample_size=self.VG_sample_size, VG_n_samples=self.VG_n_samples,
                                                  random_state=self.random_state, weights=self.weights)
    else:
        VG1, VG2, VG3 = vg(D1_2), vg(D2_2), vg(D3_2)

    # Standardize each squared-distance matrix by its geometric variability so the
    # three variable types are comparable; guard against division by a zero VG.
    D1_std = D1_2/VG1 if VG1 > 0 else D1_2
    D2_std = D2_2/VG2 if VG2 > 0 else D2_2
    D3_std = D3_2/VG3 if VG3 > 0 else D3_2

    n = len(D1)
    ones = np.ones((n, 1))
    ones_T = np.ones((1, n))
    ones_M = np.ones((n, n))
    I = np.identity(n)
    # H is the centering matrix; G_k = -1/2 * H D_k H is the classical MDS
    # double-centering that turns squared distances into Gram (inner-product) matrices.
    H = I - (1/n)*(ones @ ones_T)
    G_1 = -(1/2)*(H @ D1_std @ H)
    G_2 = -(1/2)*(H @ D2_std @ H)
    G_3 = -(1/2)*(H @ D3_std @ H)

    if Gs_PSD_transformation == True :

        # Eigenvalues of each Gram matrix (real parts only; tiny imaginary parts are
        # numerical noise), with near-zero values snapped to 0 before the PSD test.
        v1 = np.real(np.linalg.eigvals(G_1))
        v2 = np.real(np.linalg.eigvals(G_2))
        v3 = np.real(np.linalg.eigvals(G_3))
        v1[np.isclose(v1, 0, atol=tol)] = 0
        v2[np.isclose(v2, 0, atol=tol)] = 0
        v3[np.isclose(v3, 0, atol=tol)] = 0
        G1_PSD = np.all(v1 >= 0)
        G2_PSD = np.all(v2 >= 0)
        G3_PSD = np.all(v3 >= 0)

        if not G1_PSD :

            print('G1 is not PSD, a transformation to force it will be applied.')

            # Additive-constant correction: shifting all off-diagonal squared
            # distances by omega = d*|min eigenvalue| makes the re-centered Gram PSD.
            omega = d * np.abs(np.min(v1))
            D1_std = D1_std + omega*ones_M - omega*I
            G_1 = -(1/2)*(H @ D1_std @ H)

        if not G2_PSD :

            print('G2 is not PSD, a transformation to force it will be applied.')
            omega = d * np.abs(np.min(v2))
            D2_std = D2_std + omega*ones_M - omega*I
            G_2 = -(1/2)*(H @ D2_std @ H)

        if not G3_PSD :

            print('G3 is not PSD, a transformation to force it will be applied.')
            omega = d * np.abs(np.min(v3))
            D3_std = D3_std + omega*ones_M - omega*I
            G_3 = -(1/2)*(H @ D3_std @ H)

    # Matrix square roots via SVD; singular values are clipped at 0 so sqrt is defined
    # even in the presence of small negative numerical round-off.
    U1, S1, V1 = np.linalg.svd(G_1)
    U2, S2, V2 = np.linalg.svd(G_2)
    U3, S3, V3 = np.linalg.svd(G_3)
    S1 = np.clip(S1, 0, None)
    S2 = np.clip(S2, 0, None)
    S3 = np.clip(S3, 0, None)
    sqrtG1 = U1 @ np.diag(np.sqrt(S1)) @ V1
    sqrtG2 = U2 @ np.diag(np.sqrt(S2)) @ V2
    sqrtG3 = U3 @ np.diag(np.sqrt(S3)) @ V3

    # Related Metric Scaling combination: sum of the Grams minus 1/3 of all the
    # pairwise cross-products of their square roots (removes redundant information).
    G = G_1 + G_2 + G_3 - (1/3)*(sqrtG1@sqrtG2 + sqrtG1@sqrtG3 + sqrtG2@sqrtG1 + sqrtG2@sqrtG3 + sqrtG3@sqrtG1 + sqrtG3@sqrtG2)
    # Convert the joint Gram matrix back to squared distances:
    # d_ij^2 = g_ii + g_jj - 2*g_ij, then snap numerical near-zeros to 0 before sqrt.
    g = np.diag(G)
    g = np.reshape(g, (len(g), 1))
    g_T = np.reshape(g, (1, len(g)))
    D_2_ = g @ ones_T + ones @ g_T - 2*G
    D_2_[np.isclose(D_2_, 0, atol=tol)] = 0
    # NOTE(review): entries of D_2_ more negative than tol would produce NaN here —
    # presumably prevented upstream by the PSD transformation; confirm.
    D = np.sqrt(D_2_)

    return D
|
|
669
|
+
|
|
670
|
+
################################################################################
|
|
671
|
+
|
|
672
|
+
def data_preprocessing(X, frac_sample_size, random_state):
    """
    Preprocess data in the way as needed by `FastGG` class: split `X` into a random
    sample (without replacement) and its out-of-sample complement.

    Parameters (inputs)
    ----------
    X: a pandas/polars data-frame or a numpy array (anything exposing `to_numpy` is converted).
    frac_sample_size: the sample size in proportional terms. Must lie in (0, 1].
    random_state: the random seed for the random elements of the function.

    Returns (outputs)
    -------
    X_sample: a numpy array with the sample of `X`.
    X_out_sample: a numpy array with the out of sample of `X` (empty when frac_sample_size == 1).
    sample_index: the index of the sample observations/rows.
    out_sample_index: the index of the out of sample observations/rows (sorted; empty when frac_sample_size == 1).

    Raises
    ------
    ValueError: if frac_sample_size is outside (0, 1].
    """

    if not (0 < frac_sample_size <= 1):
        raise ValueError('frac_sample_size must be in (0,1].')

    # Duck-type the frame check: pandas and polars data-frames both expose
    # `to_numpy`, while plain numpy arrays do not, so this covers the same
    # inputs without requiring either library at call time.
    if hasattr(X, 'to_numpy'):
        X = X.to_numpy()

    n = len(X)

    if frac_sample_size < 1:
        n_sample = int(frac_sample_size*n)
        index = np.arange(0, n)
        # Global seeding kept (rather than default_rng) so existing callers get
        # byte-identical samples for the same random_state.
        np.random.seed(random_state)
        sample_index = np.random.choice(index, size=n_sample, replace=False)
        # Sorted complement of the sample in O(n log n) — equivalent to the
        # ascending membership scan but without the O(n*m) `in` lookups.
        out_sample_index = np.setdiff1d(index, sample_index)
        X_sample = X[sample_index, :]
        X_out_sample = X[out_sample_index, :]
    else:
        # frac_sample_size == 1: everything is "in sample"; complement is empty.
        X_sample = X
        sample_index = np.arange(0, n)
        X_out_sample = np.array([])
        out_sample_index = np.array([])

    return X_sample, X_out_sample, sample_index, out_sample_index
|
|
713
|
+
|
|
714
|
+
################################################################################
|
|
715
|
+
|
|
716
|
+
class FastGGowerDistMatrix:
    """
    Calculates the the Generalized Gower matrix of a sample of a given data matrix.
    """

    def __init__(self, frac_sample_size=0.1, random_state=123, p1=None, p2=None, p3=None,
                 d1='robust_mahalanobis', d2='jaccard', d3='matching',
                 robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, q=1,
                 fast_VG=False, VG_sample_size=1000, VG_n_samples=5, weights=None):
        """
        Constructor method.

        Parameters:
        frac_sample_size: the sample size in proportional terms.
        p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
        d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
        d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
        d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
        q: the parameter that defines the Minkowski distance. Must be a positive integer.
        robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        alpha: a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
        epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
        n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
        fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
        VG_sample_size: sample size to be used to make the estimation of the geometric variability.
        VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
        random_state: the random seed used for the (random) sample elements.
        weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
        """
        # Sampling configuration.
        self.frac_sample_size = frac_sample_size
        self.random_state = random_state
        # Variable-type counts and per-type distances.
        self.p1 = p1
        self.p2 = p2
        self.p3 = p3
        self.d1 = d1
        self.d2 = d2
        self.d3 = d3
        # Distance / robust-estimation hyper-parameters.
        self.robust_method = robust_method
        self.alpha = alpha
        self.epsilon = epsilon
        self.n_iters = n_iters
        self.q = q
        self.weights = weights
        # Fast geometric-variability estimation settings.
        self.fast_VG = fast_VG
        self.VG_sample_size = VG_sample_size
        self.VG_n_samples = VG_n_samples

    def compute(self, X):
        """
        Compute method: computes the Generalized Gower function for the defined sample of data.

        Parameters:
        X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
        """
        # Draw the configured random sample and its complement.
        split = data_preprocessing(X=X, frac_sample_size=self.frac_sample_size,
                                   random_state=self.random_state)
        X_sample, X_out_sample, sample_index, out_sample_index = split

        # Restrict the sample weights to the sampled rows, when weights were given.
        if self.weights is None:
            sample_weights = None
        else:
            sample_weights = self.weights[sample_index]

        # Delegate the actual distance computation to the full-matrix implementation,
        # applied only to the sampled rows.
        gg = GGowerDistMatrix(p1=self.p1, p2=self.p2, p3=self.p3,
                              d1=self.d1, d2=self.d2, d3=self.d3, q=self.q,
                              robust_method=self.robust_method, alpha=self.alpha,
                              epsilon=self.epsilon, n_iters=self.n_iters,
                              fast_VG=self.fast_VG, VG_sample_size=self.VG_sample_size,
                              VG_n_samples=self.VG_n_samples, weights=sample_weights)

        # Expose the results as attributes, mirroring the original interface.
        self.D_GGower = gg.compute(X=X_sample)
        self.sample_index = sample_index
        self.out_sample_index = out_sample_index
        self.X_sample = X_sample
        self.X_out_sample = X_out_sample
|
|
775
|
+
|
|
776
|
+
################################################################################
|