robust-mixed-dist 0.1.0__py3-none-any.whl

@@ -0,0 +1,776 @@
1
+ import polars as pl
2
+ import numpy as np
3
+ import pandas as pd
4
+ from robust_mixed_dist.quantitative import (euclidean_dist_matrix, euclidean_dist, minkowski_dist_matrix,
5
+ minkowski_dist, canberra_dist_matrix, canberra_dist, pearson_dist_matrix,
6
+ mahalanobis_dist_matrix, mahalanobis_dist, robust_maha_dist_matrix, robust_maha_dist, S_robust)
7
+ from robust_mixed_dist.binary import sokal_dist_matrix, sokal_dist, jaccard_dist_matrix, jaccard_dist
8
+ from robust_mixed_dist.multiclass import hamming_dist_matrix, hamming_dist
9
+
10
+ ################################################################################
11
+
12
+ def get_dist_matrix_functions():
13
+
14
+ dist_matrix = {}
15
+ dist_matrix['euclidean'] = euclidean_dist_matrix
16
+ dist_matrix['minkowski'] = minkowski_dist_matrix
17
+ dist_matrix['canberra'] = canberra_dist_matrix
18
+ dist_matrix['pearson'] = pearson_dist_matrix
19
+ dist_matrix['mahalanobis'] = mahalanobis_dist_matrix
20
+ dist_matrix['robust_mahalanobis'] = robust_maha_dist_matrix
21
+ dist_matrix['sokal'] = sokal_dist_matrix
22
+ dist_matrix['jaccard'] = jaccard_dist_matrix
23
+ dist_matrix['hamming'] = hamming_dist_matrix
+ dist_matrix['matching'] = hamming_dist_matrix  # alias: the module's defaults use d3='matching' for the hamming (simple matching) distance
24
+
25
+ return dist_matrix
26
+
27
+ ################################################################################
28
+
29
+ def get_dist_functions():
30
+
31
+ dist = {}
32
+ dist['euclidean'] = euclidean_dist
33
+ dist['minkowski'] = minkowski_dist
34
+ dist['canberra'] = canberra_dist
35
+ dist['mahalanobis'] = mahalanobis_dist
36
+ dist['robust_mahalanobis'] = robust_maha_dist
37
+ dist['sokal'] = sokal_dist
38
+ dist['jaccard'] = jaccard_dist
39
+ dist['hamming'] = hamming_dist
+ dist['matching'] = hamming_dist  # alias: the module's defaults use d3='matching' for the hamming (simple matching) distance
40
+
41
+ return dist
42
+
43
+ ################################################################################
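A minimal sketch of how these registries are used elsewhere in this module: look up a distance-matrix function by name and apply it to a block of columns. The toy array and the expected shape are illustrative assumptions, not verified package output; the functions defined above are assumed to be in scope.

import numpy as np

dist_matrix = get_dist_matrix_functions()
X_quant = np.array([[0.0, 1.0],
                    [1.0, 1.0],
                    [3.0, 0.0]])   # illustrative quantitative block
# 'euclidean' maps to euclidean_dist_matrix (see the dictionary above);
# it is expected to return a 3 x 3 symmetric matrix of pairwise distances.
D = dist_matrix['euclidean'](X_quant)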
44
+
45
+ def vg(D_2):
46
+ """
47
+ Calculates the geometric variability of the squared distance matrix `D_2` passed as input.
48
+
49
+ Parameters (inputs)
50
+ ----------
51
+ D_2: a numpy array. It should represent a squared distance matrix.
52
+
53
+ Returns (outputs)
54
+ -------
55
+ VG: the geometric variability of the squared distance matrix `D_2`.
56
+ """
57
+ n = len(D_2)
58
+ VG = (1/(2*(n**2)))*np.sum(D_2)
59
+ # TO DO: version managing weights
60
+ return VG
61
+
62
+ ################################################################################
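A small worked check of the formula above, VG = sum(D_2) / (2 n^2), on an illustrative 3 x 3 squared-distance matrix:

import numpy as np

# Squared pairwise distances between the points 0, 1 and 3 on the real line.
D_2 = np.array([[0.0, 1.0, 9.0],
                [1.0, 0.0, 4.0],
                [9.0, 4.0, 0.0]])
VG = vg(D_2)   # sum(D_2) = 28, n = 3, so VG = 28 / 18 = 1.555...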
63
+
64
+ def get_dist_matrices(X, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1,
65
+ robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20, weights=None):
66
+ """
67
+ Calculates the distance matrices that are involved in the Generalized Gower distance.
68
+
69
+ Parameters:
70
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
71
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
72
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
73
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
74
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
75
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
76
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
77
+ epsilon: parameter used by the Delvin algorithm when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
78
+ n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
79
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
80
+
81
+ Returns:
82
+ D1, D2, D3: the distance matrices associated with the quantitative, binary and multi-class variables, respectively.
83
+ """
84
+
85
+ if isinstance(X, (pl.DataFrame, pd.DataFrame)):
86
+ X = X.to_numpy()
87
+
88
+ dist_matrix = get_dist_matrix_functions()
89
+
90
+ n = len(X)
91
+ X_quant = X[:, 0:p1]
92
+ X_bin = X[:, (p1):(p1+p2)]
93
+ X_multi = X[:, (p1+p2):(p1+p2+p3)]
94
+
95
+ # Define D1 based on d1 and p1
96
+ if p1 > 0:
97
+ if d1 == 'minkowski':
98
+ D1 = dist_matrix[d1](X_quant, q)
99
+ elif d1 == 'robust_mahalanobis':
100
+ S_robust_ = S_robust(X=X_quant, method=robust_method, alpha=alpha,
101
+ epsilon=epsilon, n_iters=n_iters,
102
+ weights=weights)
103
+ D1 = dist_matrix[d1](X_quant, S_robust=S_robust_)
104
+ else:
105
+ D1 = dist_matrix[d1](X_quant)
106
+ elif p1 == 0:
107
+ D1 = np.zeros((n, n))
108
+ # Define D2 based on p2
109
+ D2 = dist_matrix[d2](X_bin) if p2 > 0 else np.zeros((n, n))
110
+ # Define D3 based on p3
111
+ D3 = dist_matrix[d3](X_multi) if p3 > 0 else np.zeros((n, n))
112
+
113
+ return D1, D2, D3
114
+
115
+ ################################################################################
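A minimal usage sketch for `get_dist_matrices`, assuming a toy matrix with two quantitative, one binary and one multi-class column; `d3='hamming'` is passed explicitly because that is the multi-class key defined in `get_dist_matrix_functions`. The data and the expected shapes are illustrative assumptions.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])   # illustrative mixed-type data: [quantitative | binary | multi-class]
D1, D2, D3 = get_dist_matrices(X, p1=2, p2=1, p3=1,
                               d1='euclidean', d2='sokal', d3='hamming')
# D1, D2 and D3 are each expected to be 4 x 4 distance matrices.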
116
+
117
+ def get_distances(xi, xr, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, S=None, S_robust=None):
118
+ """
119
+ Calculates the distances between observations that are involved in the Generalized Gower distance.
120
+
121
+ Parameters:
122
+ xi, xr: a pair of observation vectors (rows of the data matrix).
123
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
124
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
125
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
126
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
127
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
128
+ S: the covariance matrix of the considered data matrix.
129
+ S_robust: the robust covariance matrix of the considered data matrix.
130
+
131
+ Returns:
132
+ dist1, dist2, dist3: the distance values associated with the quantitative, binary and multi-class parts of the observations, respectively.
133
+ """
134
+
135
+ if isinstance(xi, (pl.DataFrame, pd.DataFrame)) :
136
+ xi = xi.to_numpy().flatten()
137
+ elif isinstance(xi, (pd.Series, pl.Series)) :
138
+ xi = xi.to_numpy()
139
+ if isinstance(xr, (pl.DataFrame, pd.DataFrame)) :
140
+ xr = xr.to_numpy().flatten()
141
+ elif isinstance(xr, (pd.Series, pl.Series)) :
142
+ xr = xr.to_numpy()
143
+
144
+ dist = get_dist_functions()
145
+
146
+ xi_quant = xi[0:p1] ; xr_quant = xr[0:p1] ;
147
+ xi_bin = xi[(p1):(p1+p2)] ; xr_bin = xr[(p1):(p1+p2)]
148
+ xi_multi = xi[(p1+p2):(p1+p2+p3)] ; xr_multi = xr[(p1+p2):(p1+p2+p3)]
149
+
150
+ if p1 > 0:
151
+ if d1 == 'minkowski':
152
+ dist1 = dist[d1](xi_quant, xr_quant, q=q)
153
+ elif d1 == 'robust_mahalanobis':
154
+ dist1 = dist[d1](xi_quant, xr_quant, S_robust=S_robust)
155
+ elif d1 == 'mahalanobis':
156
+ dist1 = dist[d1](xi_quant, xr_quant, S=S)
157
+ else:
158
+ dist1 = dist[d1](xi_quant, xr_quant)
159
+ elif p1 == 0:
160
+ dist1 = 0
161
+
162
+ dist2 = dist[d2](xi_bin, xr_bin) if p2 > 0 else 0
163
+ dist3 = dist[d3](xi_multi, xr_multi) if p3 > 0 else 0
164
+
165
+ return dist1, dist2, dist3
166
+
167
+ ################################################################################
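The pairwise counterpart, sketched for two illustrative observations laid out as [quantitative | binary | multi-class]; again `d3='hamming'` matches the key defined in `get_dist_functions`, and the values are assumptions chosen only for the example.

import numpy as np

xi = np.array([1.0, 2.0, 1, 0])   # 2 quantitative, 1 binary, 1 multi-class value
xr = np.array([2.0, 0.5, 0, 1])
dist1, dist2, dist3 = get_distances(xi, xr, p1=2, p2=1, p3=1,
                                    d1='euclidean', d2='sokal', d3='hamming')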
168
+
169
+ def vg_ggower_estimation(X, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching',
170
+ q=1, robust_method='trimmed', epsilon=0.05, alpha=0.05,
171
+ n_iters=20, weights=None):
172
+ """
173
+ Calculates the geometric variability of a Generalized Gower distance matrix.
174
+
175
+ Parameters:
176
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
177
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
178
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
179
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
180
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
181
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
182
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
183
+ epsilon: parameter used by the Delvin algorithm when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
184
+ n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
185
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
186
+
187
+ Returns:
188
+ VG1, VG2, VG3: the geometric variabilities of the distance matrices associated with the quantitative, binary and multi-class variables, respectively.
189
+ """
190
+
191
+ D1, D2, D3 = get_dist_matrices(X=X, p1=p1, p2=p2, p3=p3, d1=d1, d2=d2, d3=d3,
192
+ q=q, robust_method=robust_method, epsilon=epsilon,
193
+ alpha=alpha, n_iters=n_iters, weights=weights)
194
+
195
+ D1_2, D2_2, D3_2 = D1**2, D2**2, D3**2
196
+ VG1, VG2, VG3 = vg(D1_2), vg(D2_2), vg(D3_2)
197
+
198
+ return VG1, VG2, VG3
199
+
200
+ ################################################################################
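A short sketch of the full (non-sampled) geometric-variability estimation on the same kind of toy matrix used above; the data are illustrative and `d3='hamming'` is again the registry key assumed for the multi-class block.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
VG1, VG2, VG3 = vg_ggower_estimation(X, p1=2, p2=1, p3=1, d3='hamming')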
201
+
202
+ def vg_ggower_fast_estimation(X, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching',
203
+ robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20, q=1,
204
+ VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
205
+ """
206
+ Calculates a fast estimation of the geometric variability of a squared Generalized Gower distance matrix.
207
+
208
+ Parameters:
209
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
210
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
211
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
212
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
213
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
214
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
215
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
216
+ epsilon: parameter used by the Delvin algorithm when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
217
+ n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
218
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
219
+ VG_sample_size: the sample size used to estimate the geometric variability.
220
+ VG_n_samples: the number of samples used to estimate the geometric variability.
221
+ random_state: the random seed used to draw the samples.
222
+
223
+ Returns:
224
+ VG1, VG2, VG3: the geometric variabilities of the distance matrices associated with the quantitative, binary and multi-class variables, respectively.
225
+ """
226
+
227
+ if isinstance(X, (pl.DataFrame, pd.DataFrame)) :
228
+ X = X.to_numpy()
229
+
230
+ n = len(X)
231
+ VG1_list, VG2_list, VG3_list = [], [], []
232
+
233
+ for i in range(0, VG_n_samples) :
234
+
235
+ np.random.seed(random_state + i)
236
+ index = np.arange(0,n)
237
+ sample_index = np.random.choice(index, size=VG_sample_size)
238
+ X_sample = X[sample_index,:].copy()
239
+
240
+ if weights is not None:
241
+ sample_weights = weights[sample_index].copy()
242
+ else:
243
+ sample_weights = None
244
+
245
+ VG1, VG2, VG3 = vg_ggower_estimation(X=X_sample, p1=p1, p2=p2, p3=p3, d1=d1, d2=d2, d3=d3, q=q,
246
+ robust_method=robust_method, epsilon=epsilon, alpha=alpha,
247
+ n_iters=n_iters, weights=sample_weights)
248
+
249
+ VG1_list.append(VG1) ; VG2_list.append(VG2) ; VG3_list.append(VG3)
250
+
251
+ VG1 = np.mean(VG1_list) ; VG2 = np.mean(VG2_list) ; VG3 = np.mean(VG3_list)
252
+
253
+ return VG1, VG2, VG3
254
+
255
+ ################################################################################
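The sampled variant, sketched with deliberately small `VG_sample_size` and `VG_n_samples` so it stays cheap on the toy matrix; note that `np.random.choice` samples with replacement here, so the sample size may exceed the number of rows. Data and parameter values are illustrative assumptions.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
VG1, VG2, VG3 = vg_ggower_fast_estimation(X, p1=2, p2=1, p3=1, d3='hamming',
                                          VG_sample_size=4, VG_n_samples=2,
                                          random_state=0)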
256
+
257
+ class GGowerDistMatrix:
258
+ """
259
+ Calculates the Generalized Gower matrix for a data matrix.
260
+ """
261
+
262
+ def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, robust_method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20,
263
+ fast_VG=False, VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
264
+ """
265
+ Constructor method.
266
+
267
+ Parameters:
268
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
269
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
270
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
271
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['hamming'].
272
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
273
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
274
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
275
+ epsilon: parameter used by the Delvin transformation. epsilon=0.05 is recommended. Only needed when d1 = 'robust_mahalanobis'.
276
+ n_iters: maximum number of iterations run by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
277
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
278
+ fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
279
+ VG_sample_size: the sample size used to estimate the geometric variability.
280
+ VG_n_samples: the number of samples used to estimate the geometric variability.
281
+ random_state: the random seed used to draw the samples.
282
+ """
283
+ self.p1 = p1 ; self.p2 = p2 ; self.p3 = p3
284
+ self.d1 = d1 ; self.d2 = d2 ; self.d3 = d3
285
+ self.q = q ; self.robust_method = robust_method ; self.alpha = alpha ;
286
+ self.epsilon = epsilon ; self.n_iters = n_iters
287
+ self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples
288
+ self.random_state = random_state ; self.fast_VG = fast_VG; self.weights = weights
289
+
290
+ def compute(self, X):
291
+ """
292
+ Compute method.
293
+
294
+ Parameters:
295
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
296
+
297
+ Returns:
298
+ D: the Generalized Gower matrix for the data matrix `X`.
299
+ """
300
+
301
+ D1, D2, D3 = get_dist_matrices(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
302
+ d1=self.d1, d2=self.d2, d3=self.d3,
303
+ q=self.q, robust_method=self.robust_method, epsilon=self.epsilon,
304
+ alpha=self.alpha, n_iters=self.n_iters, weights=self.weights)
305
+
306
+ D1_2 = D1**2 ; D2_2 = D2**2 ; D3_2 = D3**2
307
+
308
+ if self.fast_VG == True:
309
+ VG1, VG2, VG3 = vg_ggower_fast_estimation(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
310
+ d1=self.d1, d2=self.d2, d3=self.d3,
311
+ robust_method=self.robust_method, alpha=self.alpha, epsilon=self.epsilon, n_iters=self.n_iters, q=self.q,
312
+ VG_sample_size=self.VG_sample_size, VG_n_samples=self.VG_n_samples,
313
+ random_state=self.random_state, weights=self.weights)
314
+ else:
315
+ VG1, VG2, VG3 = vg(D1_2), vg(D2_2), vg(D3_2)
316
+
317
+ D1_std = D1_2/VG1 if VG1 > 0 else D1_2
318
+ D2_std = D2_2/VG2 if VG2 > 0 else D2_2
319
+ D3_std = D3_2/VG3 if VG3 > 0 else D3_2
320
+ D_2 = D1_std + D2_std + D3_std
321
+ D = np.sqrt(D_2)
322
+
323
+ return D
324
+
325
+ ################################################################################
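A usage sketch for the class: configure the column counts and distance names once, then compute the full Generalized Gower matrix. The toy data and the expected 4 x 4 shape are assumptions, not verified package output.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
gg = GGowerDistMatrix(p1=2, p2=1, p3=1, d1='euclidean', d2='sokal', d3='hamming')
D = gg.compute(X)   # expected: a 4 x 4 symmetric distance matrix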
326
+
327
+ class GGowerDist:
328
+ """
329
+ Calculates the Generalized Gower distance for a pair of data observations.
330
+ """
331
+
332
+ def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20,
333
+ VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
334
+ """
335
+ Constructor method.
336
+
337
+ Parameters:
338
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
339
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
340
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
341
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
342
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
343
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
344
+ epsilon: parameter used by the Delvin algorithm when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
345
+ n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
346
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
347
+ VG_sample_size: the sample size used to estimate the geometric variability.
348
+ VG_n_samples: the number of samples used to estimate the geometric variability.
349
+ random_state: the random seed used to draw the samples.
350
+ """
351
+ self.p1 = p1 ; self.p2 = p2 ; self.p3 = p3
352
+ self.d1 = d1 ; self.d2 = d2 ; self.d3 = d3
353
+ self.q = q ; self.robust_method = robust_method ; self.alpha = alpha ;
354
+ self.epsilon = epsilon ; self.n_iters = n_iters
355
+ self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples
356
+ self.random_state = random_state; self.weights = weights
357
+
358
+ def fit(self, X) :
359
+ """
360
+ Fit method that computes the geometric variability and covariance matrix to be used in 'compute' method, if needed.
361
+
362
+ Parameters:
363
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
364
+
365
+ Returns:
366
+ None. The geometric variabilities (VG1, VG2, VG3) and, when `d1` requires it, the covariance matrices (S, S_robust) are stored as attributes for use by `compute`.
367
+ """
368
+ p1 = self.p1 ; p2 = self.p2 ; p3 = self.p3
369
+ d1 = self.d1 ; d2 = self.d2 ; d3 = self.d3
370
+ self.S, self.S_robust = None, None
371
+
372
+ if d1 in ['robust_mahalanobis', 'mahalanobis']:
373
+
374
+ if isinstance(X, (pl.DataFrame, pd.DataFrame)) :
375
+ X = X.to_numpy()
376
+
377
+ X_quant = X[:, 0:p1]
378
+
379
+ if d1 == 'robust_mahalanobis':
380
+ self.S_robust = S_robust(X=X_quant, method=self.robust_method, alpha=self.alpha,
381
+ epsilon=self.epsilon, n_iters=self.n_iters, weights=self.weights)
382
+ elif d1 == 'mahalanobis':
383
+ self.S = np.cov(X_quant, rowvar=False)
384
+
385
+ self.VG1, self.VG2, self.VG3 = vg_ggower_fast_estimation(X=X, p1=p1, p2=p2, p3=p3, d1=d1, d2=d2, d3=d3, robust_method=self.robust_method,
386
+ alpha=self.alpha, epsilon=self.epsilon, n_iters=self.n_iters,
387
+ VG_sample_size=self.VG_sample_size, VG_n_samples=self.VG_n_samples,
388
+ random_state=self.random_state, weights=self.weights)
389
+
390
+ def compute(self, xi, xr):
391
+ """
392
+ Compute method.
393
+
394
+ Parameters:
395
+ xi, xr: a pair of quantitative vectors. They represent a couple of statistical observations.
396
+
397
+ Returns:
398
+ dist: the Generalized Gower distance between the observations `xi` and `xr`.
399
+ """
400
+ dist1, dist2, dist3 = get_distances(xi=xi, xr=xr, p1=self.p1, p2=self.p2, p3=self.p3,
401
+ d1=self.d1, d2=self.d2, d3=self.d3,
402
+ q=self.q, S=self.S, S_robust=self.S_robust)
403
+
404
+ dist1_2 = dist1**2 ; dist2_2 = dist2**2 ; dist3_2 = dist3**2
405
+ dist1_2_std = dist1_2/self.VG1 if self.VG1 > 0 else dist1_2
406
+ dist2_2_std = dist2_2/self.VG2 if self.VG2 > 0 else dist2_2
407
+ dist3_2_std = dist3_2/self.VG3 if self.VG3 > 0 else dist3_2
408
+ dist_2 = dist1_2_std + dist2_2_std + dist3_2_std
409
+ dist = np.sqrt(dist_2)
410
+
411
+ return dist
412
+
413
+ ################################################################################
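A fit/compute sketch for the pairwise class; `VG_sample_size` and `VG_n_samples` are shrunk so the internal sampling stays cheap on the toy matrix. Data and the chosen distances are illustrative assumptions.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
gdist = GGowerDist(p1=2, p2=1, p3=1, d1='euclidean', d2='sokal', d3='hamming',
                   VG_sample_size=4, VG_n_samples=2)
gdist.fit(X)                       # estimates VG1, VG2, VG3 (and S / S_robust when needed)
d_01 = gdist.compute(X[0], X[1])   # Generalized Gower distance between rows 0 and 1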
414
+
415
+ def ggower_dist(xi, xr, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching',
416
+ q=1, S=None, S_robust=None, VG1=None, VG2=None, VG3=None):
+ """Functional counterpart of `GGowerDist.compute`: returns the Generalized Gower distance between `xi` and `xr`, given precomputed geometric variabilities VG1, VG2, VG3 (and S / S_robust when `d1` requires them)."""
417
+
418
+ dist1, dist2, dist3 = get_distances(xi=xi, xr=xr, p1=p1, p2=p2, p3=p3,
419
+ d1=d1, d2=d2, d3=d3,
420
+ q=q, S=S, S_robust=S_robust)
421
+
422
+ dist1_2 = dist1**2 ; dist2_2 = dist2**2 ; dist3_2 = dist3**2
423
+ dist1_2_std = dist1_2/VG1 if VG1 > 0 else dist1_2
424
+ dist2_2_std = dist2_2/VG2 if VG2 > 0 else dist2_2
425
+ dist3_2_std = dist3_2/VG3 if VG3 > 0 else dist3_2
426
+ dist_2 = dist1_2_std + dist2_2_std + dist3_2_std
427
+ dist = np.sqrt(dist_2)
428
+
429
+ return dist
430
+
431
+ ################################################################################
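The functional form can be chained with `vg_ggower_estimation` when the geometric variabilities are computed beforehand; a sketch on the same illustrative toy matrix:

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
VG1, VG2, VG3 = vg_ggower_estimation(X, p1=2, p2=1, p3=1, d3='hamming')
d_01 = ggower_dist(X[0], X[1], p1=2, p2=1, p3=1, d3='hamming',
                   VG1=VG1, VG2=VG2, VG3=VG3)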
432
+
433
+ def simple_gower_dist(xi, xr, X, p1, p2, p3) :
434
+ """
435
+ Computes the Simple Gower distance between a pair of observations.
436
+
437
+ Parameters:
438
+ xi, xr: a pair of observation vectors (rows of the data matrix).
439
+ X: a pandas/polars data-frame or a numpy array. It represents a data matrix.
440
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
441
+
442
+ Returns:
443
+ dist: the Simple Gower distance between the observations `xi` and `xr`.
444
+ """
445
+
446
+ if isinstance(X, (pl.DataFrame, pd.DataFrame)) :
447
+ X = X.to_numpy()
448
+ if isinstance(xi, (pl.DataFrame, pd.DataFrame)) :
449
+ xi = xi.to_numpy().flatten()
450
+ elif isinstance(xi, (pd.Series, pl.Series)) :
451
+ xi = xi.to_numpy()
452
+ if isinstance(xr, (pl.DataFrame, pd.DataFrame)) :
453
+ xr = xr.to_numpy().flatten()
454
+ elif isinstance(xr, (pd.Series, pl.Series)) :
455
+ xr = xr.to_numpy()
456
+
457
+ dist = get_dist_functions()
458
+
459
+ X_quant = X[:,0:p1]
460
+ xi_quant = xi[0:p1] ; xr_quant = xr[0:p1] ;
461
+ xi_bin = xi[(p1):(p1+p2)] ; xr_bin = xr[(p1):(p1+p2)]
462
+ xi_multi = xi[(p1+p2):(p1+p2+p3)] ; xr_multi = xr[(p1+p2):(p1+p2+p3)]
463
+ R = np.max(X_quant, axis=0) - np.min(X_quant, axis=0)
+ R[R == 0] = 1 # avoid division by zero for constant columns, as in simple_gower_dist_matrix
464
+
465
+ dist1 = np.sum(np.abs(xi_quant - xr_quant)/R) if p1 > 0 else 0
466
+ dist2 = dist['jaccard'](xi_bin, xr_bin) if p2 > 0 else 0
467
+ dist3 = dist['hamming'](xi_multi, xr_multi) if p3 > 0 else 0
468
+ dist = dist1 + dist2 + dist3
469
+
470
+ return dist
471
+
472
+ ################################################################################
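A sketch for the simple Gower distance between two rows of the toy matrix; `X` is needed here only to obtain the per-column ranges of the quantitative block. Values are illustrative.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
d_01 = simple_gower_dist(X[0], X[1], X=X, p1=2, p2=1, p3=1)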
473
+
474
+ '''
475
+ def simple_gower_dist_matrix(X, p1, p2, p3):
476
+
477
+ if isinstance(X, (pl.DataFrame, pd.DataFrame)) :
478
+ X = X.to_numpy()
479
+
480
+ D = np.zeros((len(X), len(X)))
481
+
482
+ for i in range(len(X)):
483
+ for r in range(len(X)):
484
+ if i <= r:
485
+ D[i,r] = simple_gower_dist(xi=X[i,:], xr=X[r,:], X=X,
486
+ p1=p1, p2=p2, p3=p3)
487
+
488
+ D = D + D.T - np.diag(D.diagonal())
489
+
490
+ return D
491
+ '''
492
+
493
+ def simple_gower_dist_matrix(X, p1, p2, p3):
494
+ """
495
+ Matrix computation of the simple Gower distance between all rows of X.
496
+
497
+ Parameters:
498
+ X: np.ndarray or DataFrame (converted to np.ndarray).
499
+ p1: number of numeric columns.
500
+ p2: number of binary columns.
501
+ p3: number of categorical (multi-class) columns.
502
+
503
+ Returns:
504
+ D: an (n x n) distance matrix with the simple Gower distance between observations.
505
+ """
506
+
507
+ # Convert the DataFrame to a numpy array if necessary
508
+ if isinstance(X, (pd.DataFrame, pl.DataFrame)):
509
+ X = X.to_numpy()
510
+
511
+ dist_matrix = get_dist_matrix_functions()
512
+
513
+ # Split the columns into the quantitative, binary and multi-class blocks
514
+ X_quant = X[:, 0:p1] if p1 > 0 else None
515
+ X_bin = X[:, p1:p1 + p2] if p2 > 0 else None
516
+ X_multi = X[:, p1 + p2:p1 + p2 + p3] if p3 > 0 else None
517
+
518
+ n = X.shape[0]
519
+ D = np.zeros((n, n))
520
+
521
+ # Quantitative distance: range-normalized Manhattan
522
+ if p1 > 0:
523
+ R = np.max(X_quant, axis=0) - np.min(X_quant, axis=0)
524
+ R[R == 0] = 1 # avoid division by zero
525
+ X_quant_norm = X_quant / R
526
+ dist_quant = dist_matrix['minkowski'](X_quant_norm, q=1)
527
+ D += dist_quant
528
+
529
+ # Binary distance: Jaccard
530
+ if p2 > 0:
531
+ dist_bin = dist_matrix['jaccard'](X_bin)
532
+ D += dist_bin
533
+
534
+ # Multi-class distance: Hamming (simple matching)
535
+ if p3 > 0:
536
+ dist_multi = dist_matrix['hamming'](X_multi)
537
+ D += dist_multi
538
+
539
+ return D
540
+
541
+
542
+ ################################################################################
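The vectorized version on the same illustrative toy matrix; it relies only on the registry functions above, so no extra configuration is assumed.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
D_simple = simple_gower_dist_matrix(X, p1=2, p2=1, p3=1)   # expected: a 4 x 4 matrix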
543
+
544
+ class RelMSDistMatrix:
545
+ """
546
+ Calculates the Related Metric Scaling matrix for a data matrix.
547
+ """
548
+
549
+ def __init__(self, p1,p2,p3,d1='euclidean',d2='sokal',d3='matching',q=1, robust_method='trimmed',
550
+ epsilon=0.05, alpha=0.05, n_iters=20, weights=None,
551
+ fast_VG=False, VG_sample_size=300, VG_n_samples=5, random_state=123):
552
+ """
553
+ Constructor method.
554
+
555
+ Parameters:
556
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
557
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
558
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
559
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
560
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
561
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
562
+ epsilon: parameter used by the Delvin algorithm when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
563
+ n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
564
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
565
+ """
566
+ self.p1 = p1 ; self.p2 = p2 ; self.p3 = p3
567
+ self.d1 = d1 ; self.d2 = d2 ; self.d3 = d3
568
+ self.q = q ; self.robust_method = robust_method ; self.alpha = alpha ; self.fast_VG = fast_VG;
569
+ self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples; self.random_state = random_state;
570
+ self.epsilon = epsilon ; self.n_iters = n_iters ; self.weights = weights
571
+
572
+
573
+ def compute(self, X, tol=1e-6, d=2.5, Gs_PSD_transformation=True):
574
+ """
575
+ Compute method.
576
+
577
+ Parameters:
578
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
579
+ tol: a tolerance value used to round the close-to-zero eigenvalues of the Gram matrices.
580
+ Gs_PSD_transformation: controls whether a transformation is applied to enforce positive semi-definite Gram matrices.
581
+ d: a parameter that controls the omega value used in the transformation mentioned above.
582
+
583
+ Returns:
584
+ D: the Related Metric Scaling matrix for the data matrix `X`.
585
+ """
586
+ D1, D2, D3 = get_dist_matrices(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
587
+ d1=self.d1, d2=self.d2, d3=self.d3,
588
+ q=self.q, robust_method=self.robust_method, epsilon=self.epsilon,
589
+ alpha=self.alpha, n_iters=self.n_iters, weights=self.weights)
590
+
591
+ D1_2 = D1**2 ; D2_2 = D2**2 ; D3_2 = D3**2
592
+
593
+ if self.fast_VG == True:
594
+ VG1, VG2, VG3 = vg_ggower_fast_estimation(X=X, p1=self.p1, p2=self.p2, p3=self.p3,
595
+ d1=self.d1, d2=self.d2, d3=self.d3,
596
+ robust_method=self.robust_method, alpha=self.alpha, epsilon=self.epsilon, n_iters=self.n_iters, q=self.q,
597
+ VG_sample_size=self.VG_sample_size, VG_n_samples=self.VG_n_samples,
598
+ random_state=self.random_state, weights=self.weights)
599
+ else:
600
+ VG1, VG2, VG3 = vg(D1_2), vg(D2_2), vg(D3_2)
601
+
602
+ D1_std = D1_2/VG1 if VG1 > 0 else D1_2
603
+ D2_std = D2_2/VG2 if VG2 > 0 else D2_2
604
+ D3_std = D3_2/VG3 if VG3 > 0 else D3_2
605
+
606
+ n = len(D1)
607
+ ones = np.ones((n, 1))
608
+ ones_T = np.ones((1, n))
609
+ ones_M = np.ones((n, n))
610
+ I = np.identity(n)
611
+ H = I - (1/n)*(ones @ ones_T)
612
+ G_1 = -(1/2)*(H @ D1_std @ H)
613
+ G_2 = -(1/2)*(H @ D2_std @ H)
614
+ G_3 = -(1/2)*(H @ D3_std @ H)
615
+
616
+ if Gs_PSD_transformation == True :
617
+
618
+ v1 = np.real(np.linalg.eigvals(G_1))
619
+ v2 = np.real(np.linalg.eigvals(G_2))
620
+ v3 = np.real(np.linalg.eigvals(G_3))
621
+ v1[np.isclose(v1, 0, atol=tol)] = 0
622
+ v2[np.isclose(v2, 0, atol=tol)] = 0
623
+ v3[np.isclose(v3, 0, atol=tol)] = 0
624
+ G1_PSD = np.all(v1 >= 0)
625
+ G2_PSD = np.all(v2 >= 0)
626
+ G3_PSD = np.all(v3 >= 0)
627
+
628
+ if not G1_PSD :
629
+
630
+ print('G1 is not PSD, a transformation to force it will be applied.')
631
+
632
+ omega = d * np.abs(np.min(v1))
633
+ D1_std = D1_std + omega*ones_M - omega*I
634
+ G_1 = -(1/2)*(H @ D1_std @ H)
635
+
636
+ if not G2_PSD :
637
+
638
+ print('G2 is not PSD, a transformation to force it will be applied.')
639
+ omega = d * np.abs(np.min(v2))
640
+ D2_std = D2_std + omega*ones_M - omega*I
641
+ G_2 = -(1/2)*(H @ D2_std @ H)
642
+
643
+ if not G3_PSD :
644
+
645
+ print('G3 is not PSD, a transformation to force it will be applied.')
646
+ omega = d * np.abs(np.min(v3))
647
+ D3_std = D3_std + omega*ones_M - omega*I
648
+ G_3 = -(1/2)*(H @ D3_std @ H)
649
+
650
+ U1, S1, V1 = np.linalg.svd(G_1)
651
+ U2, S2, V2 = np.linalg.svd(G_2)
652
+ U3, S3, V3 = np.linalg.svd(G_3)
653
+ S1 = np.clip(S1, 0, None)
654
+ S2 = np.clip(S2, 0, None)
655
+ S3 = np.clip(S3, 0, None)
656
+ sqrtG1 = U1 @ np.diag(np.sqrt(S1)) @ V1
657
+ sqrtG2 = U2 @ np.diag(np.sqrt(S2)) @ V2
658
+ sqrtG3 = U3 @ np.diag(np.sqrt(S3)) @ V3
659
+
660
+ G = G_1 + G_2 + G_3 - (1/3)*(sqrtG1@sqrtG2 + sqrtG1@sqrtG3 + sqrtG2@sqrtG1 + sqrtG2@sqrtG3 + sqrtG3@sqrtG1 + sqrtG3@sqrtG2)
661
+ g = np.diag(G)
662
+ g = np.reshape(g, (len(g), 1))
663
+ g_T = np.reshape(g, (1, len(g)))
664
+ D_2_ = g @ ones_T + ones @ g_T - 2*G
665
+ D_2_[np.isclose(D_2_, 0, atol=tol)] = 0
666
+ D = np.sqrt(D_2_)
667
+
668
+ return D
669
+
670
+ ################################################################################
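A usage sketch for Related Metric Scaling on the toy matrix; with the default `Gs_PSD_transformation=True`, the Gram matrices are nudged towards positive semi-definiteness before being combined. The data and the expected output shape are assumptions, not verified package output.

import numpy as np

X = np.array([[1.0, 2.0, 1, 0],
              [2.0, 0.5, 0, 1],
              [0.0, 1.5, 1, 2],
              [3.0, 2.5, 1, 0]])
relms = RelMSDistMatrix(p1=2, p2=1, p3=1, d1='euclidean', d2='sokal', d3='hamming')
D_rel = relms.compute(X)   # expected: a 4 x 4 distance matrix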
671
+
672
+ def data_preprocessing(X, frac_sample_size, random_state):
673
+ """
674
+ Preprocesses the data as needed by the `FastGGowerDistMatrix` class.
675
+
676
+ Parameters (inputs)
677
+ ----------
678
+ X: a pandas/polars data-frame.
679
+ frac_sample_size: the sample size in proportional terms.
680
+ random_state: the random seed for the random elements of the function.
681
+
682
+ Returns (outputs)
683
+ -------
684
+ X_sample: a numpy array with the sampled rows of `X`.
685
+ X_out_sample: a numpy array with the out-of-sample rows of `X`.
686
+ sample_index: the indices of the sampled observations/rows.
687
+ out_sample_index: the indices of the out-of-sample observations/rows.
688
+ """
689
+
690
+ if not (0 < frac_sample_size <= 1):
691
+ raise ValueError('frac_sample_size must be in (0,1].')
692
+
693
+ if isinstance(X, (pd.DataFrame, pl.DataFrame)):
694
+ X = X.to_numpy()
695
+
696
+ n = len(X)
697
+
698
+ if frac_sample_size < 1:
699
+ n_sample = int(frac_sample_size*n)
700
+ index = np.arange(0,n)
701
+ np.random.seed(random_state)
702
+ sample_index = np.random.choice(index, size=n_sample, replace=False)
703
+ out_sample_index = np.array([x for x in index if x not in sample_index])
704
+ X_sample = X[sample_index,:]
705
+ X_out_sample = X[out_sample_index,:]
706
+ else:
707
+ X_sample = X
708
+ sample_index = np.arange(0,n)
709
+ X_out_sample = np.array([])
710
+ out_sample_index = np.array([])
711
+
712
+ return X_sample, X_out_sample, sample_index, out_sample_index
713
+
714
+ ################################################################################
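A quick sketch of the sample / out-of-sample split; with 10 illustrative rows and `frac_sample_size=0.5`, five rows are drawn without replacement and the rest form the out-of-sample block.

import numpy as np

X = np.arange(20, dtype=float).reshape(10, 2)   # 10 illustrative rows, 2 columns
X_sample, X_out, idx_in, idx_out = data_preprocessing(X, frac_sample_size=0.5,
                                                      random_state=0)
# X_sample and X_out are each expected to have 5 rows.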
715
+
716
+ class FastGGowerDistMatrix:
717
+ """
718
+ Calculates the Generalized Gower matrix for a sample of a given data matrix.
719
+ """
720
+
721
+ def __init__(self, frac_sample_size=0.1, random_state=123, p1=None, p2=None, p3=None,
722
+ d1='robust_mahalanobis', d2='jaccard', d3='matching',
723
+ robust_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, q=1,
724
+ fast_VG=False, VG_sample_size=1000, VG_n_samples=5, weights=None) :
725
+ """
726
+ Constructor method.
727
+
728
+ Parameters:
729
+ frac_sample_size: the sample size in proportional terms.
730
+ p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be non-negative integers.
731
+ d1: name of the distance to be computed for quantitative variables. Must be a string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis'].
732
+ d2: name of the distance to be computed for binary variables. Must be a string in ['sokal', 'jaccard'].
733
+ d3: name of the distance to be computed for multi-class variables. Must be a string in ['matching'].
734
+ q: the parameter that defines the Minkowski distance. Must be a positive integer.
735
+ robust_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
736
+ alpha: a real number in [0,1] that is used if `robust_method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
737
+ epsilon: parameter used by the Delvin algorithm when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
738
+ n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
739
+ fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
740
+ VG_sample_size: the sample size used to estimate the geometric variability.
741
+ VG_n_samples: the number of samples used to estimate the geometric variability.
742
+ random_state: the random seed used to draw the samples.
743
+ weights: the sample weights. Only used if provided and d1 = 'robust_mahalanobis'.
744
+ """
745
+ self.random_state = random_state; self.frac_sample_size = frac_sample_size; self.p1 = p1; self.p2 = p2; self.p3 = p3;
746
+ self.d1 = d1; self.d2 = d2; self.d3 = d3; self.robust_method = robust_method; self.alpha = alpha; self.epsilon = epsilon;
747
+ self.n_iters = n_iters; self.fast_VG = fast_VG; self.VG_sample_size = VG_sample_size; self.VG_n_samples = VG_n_samples;
748
+ self.q = q; self.weights = weights
749
+
750
+ def compute(self, X):
751
+ """
752
+ Compute method: computes the Generalized Gower distance matrix for the sampled subset of the data and stores the result in the attributes `D_GGower`, `sample_index`, `out_sample_index`, `X_sample` and `X_out_sample`.
753
+
754
+ Parameters:
755
+ X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
756
+ """
757
+
758
+ X_sample, X_out_sample, sample_index, out_sample_index = data_preprocessing(X=X, frac_sample_size=self.frac_sample_size,
759
+ random_state=self.random_state)
760
+
761
+ sample_weights = self.weights[sample_index] if self.weights is not None else None
762
+
763
+ GGower_matrix = GGowerDistMatrix(p1=self.p1, p2=self.p2, p3=self.p3,
764
+ d1=self.d1, d2=self.d2, d3=self.d3, q=self.q,
765
+ robust_method=self.robust_method, alpha=self.alpha,
766
+ epsilon=self.epsilon, n_iters=self.n_iters,
767
+ fast_VG=self.fast_VG, VG_sample_size=self.VG_sample_size,
768
+ VG_n_samples=self.VG_n_samples, weights=sample_weights)
769
+
770
+ self.D_GGower = GGower_matrix.compute(X=X_sample)
771
+ self.sample_index = sample_index
772
+ self.out_sample_index = out_sample_index
773
+ self.X_sample = X_sample
774
+ self.X_out_sample = X_out_sample
775
+
776
+ ################################################################################
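Finally, a sketch of the sampled Generalized Gower matrix; the toy block is tiled to 20 rows so that `frac_sample_size=0.5` selects 10 of them, and `d1='euclidean'` keeps the example independent of the robust covariance machinery. Data, parameters and the expected shape are illustrative assumptions.

import numpy as np

X_block = np.array([[1.0, 2.0, 1, 0],
                    [2.0, 0.5, 0, 1],
                    [0.0, 1.5, 1, 2],
                    [3.0, 2.5, 1, 0]])
X = np.tile(X_block, (5, 1))   # 20 illustrative rows
fast_gg = FastGGowerDistMatrix(frac_sample_size=0.5, p1=2, p2=1, p3=1,
                               d1='euclidean', d2='sokal', d3='hamming')
fast_gg.compute(X)
D_sample = fast_gg.D_GGower   # expected: a 10 x 10 matrix for the sampled rows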