robust-mixed-dist 0.1.0__py3-none-any.whl
- robust_mixed_dist/__init__.py +0 -0
- robust_mixed_dist/binary.py +110 -0
- robust_mixed_dist/mixed.py +776 -0
- robust_mixed_dist/multiclass.py +57 -0
- robust_mixed_dist/quantitative.py +666 -0
- robust_mixed_dist-0.1.0.dist-info/METADATA +27 -0
- robust_mixed_dist-0.1.0.dist-info/RECORD +10 -0
- robust_mixed_dist-0.1.0.dist-info/WHEEL +5 -0
- robust_mixed_dist-0.1.0.dist-info/licenses/LICENSE +19 -0
- robust_mixed_dist-0.1.0.dist-info/top_level.txt +1 -0
robust_mixed_dist/quantitative.py
@@ -0,0 +1,666 @@
import polars as pl
import numpy as np
import pandas as pd
from itertools import product
from scipy.spatial import distance
from scipy.spatial.distance import pdist, squareform, cdist
from scipy import sparse

################################################################################

def euclidean_dist_matrix(X):
    """
    Calculates the Euclidean distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    -------
    M: the Euclidean distance matrix between the rows of `X`.
    """

    # Convert to NumPy array if input is a DataFrame.
    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    # Compute the pairwise distances using pdist and convert to a square form.
    M = squareform(pdist(X, metric='euclidean'))

    return M

################################################################################
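# Usage sketch (illustrative, not part of the package source; assumes the
# definitions above are in scope). The 3x2 demo array has known pairwise
# distances, so the output is easy to check by hand.
X_demo = np.array([[0.0, 0.0],
                   [3.0, 4.0],
                   [6.0, 8.0]])
print(euclidean_dist_matrix(X_demo))
# [[ 0.  5. 10.]
#  [ 5.  0.  5.]
#  [10.  5.  0.]]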
def euclidean_dist(xi, xr):
    """
    Calculates the Euclidean distance between a pair of vectors.

    Parameters (inputs)
    ----------
    xi, xr: a pair of Pandas or Polars Series or DataFrames.
            They represent a pair of statistical observations of quantitative variables.

    Returns (outputs)
    -------
    The Euclidean distance between the observations `xi` and `xr`.
    """

    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    return distance.euclidean(xi, xr)

################################################################################
def minkowski_dist_matrix(X, q):
    """
    Calculates the Minkowski distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.
    q: the parameter that defines the Minkowski metric. Some particular cases: q=1 := Manhattan, q=2 := Euclidean.

    Returns (outputs)
    -------
    M: the Minkowski(`q`) distance matrix between the rows of `X`.
    """

    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    # Compute the pairwise distances using pdist and convert to a square form.
    M = squareform(pdist(X, metric='minkowski', p=q))

    return M

################################################################################

def minkowski_dist(xi, xr, q):
    """
    Calculates the Minkowski distance between a pair of vectors.

    Parameters (inputs)
    ----------
    xi, xr: a pair of quantitative vectors. They represent a pair of statistical observations.
    q: the parameter that defines the Minkowski metric. Some particular cases: q=1 := Manhattan, q=2 := Euclidean.

    Returns (outputs)
    -------
    The Minkowski(`q`) distance between the observations `xi` and `xr`.
    """

    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    return distance.minkowski(xi, xr, q)

################################################################################
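# Usage sketch (illustrative, not part of the package source): q=1 and q=2
# reduce to the Manhattan and Euclidean distances, respectively.
xi_demo = np.array([1.0, 2.0, 3.0])
xr_demo = np.array([4.0, 6.0, 3.0])
print(minkowski_dist(xi_demo, xr_demo, q=1))  # 7.0 = |3| + |4| + |0|
print(minkowski_dist(xi_demo, xr_demo, q=2))  # 5.0 = sqrt(9 + 16)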
def canberra_dist_matrix(X):
    """
    Calculates the Canberra distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a pandas/polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    -------
    M: the Canberra distance matrix between the rows of `X`.
    """

    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    # Compute the pairwise distances using pdist and convert to a square form.
    M = squareform(pdist(X, metric='canberra'))

    return M

################################################################################

def canberra_dist(xi, xr):
    """
    Calculates the Canberra distance between a pair of vectors.

    Parameters (inputs)
    ----------
    xi, xr: a pair of quantitative vectors. They represent a pair of statistical observations.

    Returns (outputs)
    -------
    The Canberra distance between the observations `xi` and `xr`.
    """

    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    return distance.canberra(xi, xr)

################################################################################
def pearson_dist_matrix(X):
    """
    Calculates the Pearson distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a pandas/polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    -------
    M: the Pearson distance matrix between the rows of X.
    """

    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    # The Pearson distance is the Euclidean distance between variance-standardized
    # variables, which is exactly SciPy's 'seuclidean' (standardized Euclidean) metric.
    M = squareform(pdist(X, metric='seuclidean'))

    return M

################################################################################
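# Verification sketch (illustrative, not part of the package source): SciPy's
# 'seuclidean' metric matches the Euclidean distance between variables divided
# by their standard deviation (ddof=1, which is what SciPy uses by default
# when the variance vector is not supplied).
rng_demo = np.random.default_rng(0)
X_demo = rng_demo.normal(size=(5, 3))
d_seuclidean = pdist(X_demo, metric='seuclidean')
d_manual = pdist(X_demo / np.std(X_demo, axis=0, ddof=1), metric='euclidean')
print(np.allclose(d_seuclidean, d_manual))  # True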
def mahalanobis_dist_matrix(X):
    """
    Calculates the classical Mahalanobis distance matrix for a data matrix `X`.

    Parameters
    ----------
    X : pandas.DataFrame, polars.DataFrame, or np.ndarray
        Data matrix of shape (n_samples, n_features).

    Returns
    -------
    D : np.ndarray
        Symmetric matrix (n_samples x n_samples) of Mahalanobis distances.
    """

    # Convert to numpy array if needed
    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    # Center the data
    X_centered = X - np.mean(X, axis=0)

    # Classical covariance matrix
    S = np.cov(X_centered, rowvar=False)

    # Use pseudo-inverse for numerical stability
    S_pinv = np.linalg.pinv(S)

    # Symmetrize just in case
    S_pinv = (S_pinv + S_pinv.T) / 2

    # Compute Mahalanobis distance matrix
    D = cdist(X_centered, X_centered, metric='mahalanobis', VI=S_pinv)

    return D

################################################################################
def mahalanobis_dist(xi, xr, S):
    """
    Calculates the Mahalanobis distance between a pair of vectors.

    Parameters (inputs)
    ----------
    xi, xr: a pair of quantitative vectors. They represent a pair of statistical observations.
    S: the covariance matrix of the data matrix to which `xi` and `xr` belong.

    Returns (outputs)
    -------
    The Mahalanobis distance between the observations `xi` and `xr`.
    """

    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    # scipy.spatial.distance.mahalanobis expects the *inverse* covariance matrix.
    S_inv = np.linalg.inv(S)
    dist = distance.mahalanobis(xi, xr, S_inv)

    return dist

################################################################################
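# Consistency sketch (illustrative, not part of the package source): the
# pairwise function agrees with the corresponding entry of the full matrix.
rng_demo = np.random.default_rng(1)
X_demo = rng_demo.normal(size=(20, 3))
S_demo = np.cov(X_demo, rowvar=False)
D_demo = mahalanobis_dist_matrix(X_demo)
print(np.isclose(mahalanobis_dist(X_demo[0], X_demo[1], S_demo), D_demo[0, 1]))  # True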
def mad(Xj):
    """
    Compute the median absolute deviation of a statistical variable.

    Parameters (inputs)
    ----------
    Xj: a vector representing a quantitative statistical variable.

    Returns (outputs)
    -------
    MAD: median absolute deviation of `Xj`.
    """

    if isinstance(Xj, (pl.Series, pd.Series)):
        Xj = Xj.to_numpy()

    mad_ = np.median(np.abs(Xj - np.median(Xj)))

    return mad_

################################################################################
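# Robustness sketch (illustrative, not part of the package source): unlike the
# standard deviation, the MAD is unaffected by a single gross outlier.
Xj_demo = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
Xj_outlier = np.array([1.0, 2.0, 3.0, 4.0, 500.0])
print(mad(Xj_demo), np.std(Xj_demo))        # 1.0  ~1.41
print(mad(Xj_outlier), np.std(Xj_outlier))  # 1.0  ~199.0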
def Xj_trimmed(Xj, alpha):
    """
    Compute the trimmed version of a statistical variable.

    Parameters (inputs)
    ----------
    Xj : a vector representing a quantitative statistical variable.
    alpha : a real number in [0,1] that defines the trimming level.

    Returns (outputs)
    -------
    result: the `alpha` trimmed version of `Xj`.
    """

    if isinstance(Xj, (pl.Series, pd.Series)):
        Xj = Xj.to_numpy()

    # Keep only the observations between the alpha/2 and 1 - alpha/2 quantiles.
    lower_bound = np.quantile(Xj, q=alpha/2)
    upper_bound = np.quantile(Xj, q=1-alpha/2)
    mask = np.logical_and(Xj >= lower_bound, Xj <= upper_bound)
    Xj_trimmed_ = Xj[mask]

    return Xj_trimmed_

################################################################################
def Xj_winsorized(Xj, alpha):
    """
    Compute the winsorized version of a quantitative variable.

    Parameters
    ----------
    Xj : a vector representing a quantitative statistical variable.
    alpha : a real number in [0,1] that defines the winsorizing level.

    Returns
    -------
    result: the `alpha` winsorized version of Xj.
    """

    if isinstance(Xj, (pl.Series, pd.Series)):
        Xj = Xj.to_numpy()

    # If Xj is a vector of zeros, return Xj.
    if np.all(Xj == 0):
        return Xj

    lower_bound = np.quantile(Xj, q=alpha/2)
    upper_bound = np.quantile(Xj, q=1-alpha/2)

    # Clip the values: values smaller than lower_bound are set to lower_bound,
    # those larger than upper_bound are set to upper_bound,
    # and the ones in the middle are left unchanged.
    Xj_winsorized_ = np.clip(Xj, lower_bound, upper_bound)

    return Xj_winsorized_

################################################################################
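# Usage sketch (illustrative, not part of the package source): trimming drops
# the extreme observations, while winsorizing clips them to the quantile bounds.
Xj_demo = np.array([1.0, 2.0, 3.0, 4.0, 100.0])
print(Xj_trimmed(Xj_demo, alpha=0.4))     # extremes removed -> [2. 3. 4.]
print(Xj_winsorized(Xj_demo, alpha=0.4))  # extremes clipped, length preserved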
def robust_var(Xj, method, alpha=None):
    """
    Compute the robust variance of `Xj` allowing different methods.

    Parameters
    ----------
    Xj : a vector representing a quantitative statistical variable.
    method: the method to be used for computing the robust variance of `Xj`. Must be a string in ['MAD', 'trimmed', 'winsorized'].
    alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'.

    Returns
    -------
    result: the robust variance of `Xj` computed by the method `method`.
    """

    if method == 'MAD':

        return mad(Xj)**2

    if method == 'trimmed':

        return np.var(Xj_trimmed(Xj, alpha))

    if method == 'winsorized':

        return np.var(Xj_winsorized(Xj, alpha))

    # Fail loudly instead of silently returning None for an unknown method.
    raise ValueError("method must be one of 'MAD', 'trimmed' or 'winsorized'")

################################################################################
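# Usage sketch (illustrative, not part of the package source): all three robust
# variances stay much smaller than the classical variance, which the outlier inflates.
Xj_demo = np.array([1.0, 2.0, 3.0, 4.0, 100.0])
print(np.var(Xj_demo))                                      # ~1522, inflated
print(robust_var(Xj_demo, method='MAD'))                    # 1.0
print(robust_var(Xj_demo, method='trimmed', alpha=0.4))
print(robust_var(Xj_demo, method='winsorized', alpha=0.4))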
def robust_corr(Xj, Xr, method, alpha=None):
    """
    Compute the robust correlation between `Xj` and `Xr` by different methods.

    Parameters
    ----------
    Xj, Xr : two vectors representing quantitative statistical variables.
    method: the method to be used for computing the robust variances. Must be a string in ['MAD', 'trimmed', 'winsorized'].
    alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'.

    Returns
    -------
    result: the robust correlation between `Xj` and `Xr` computed by the method `method`.
    """

    if isinstance(Xj, (pl.Series, pd.Series)):
        Xj = Xj.to_numpy()
    if isinstance(Xr, (pl.Series, pd.Series)):
        Xr = Xr.to_numpy()

    # If the robust variance of X_j is zero, the standardized version of X_j is X_j itself.
    if robust_var(Xj, method, alpha) == 0:
        Xj_std = Xj
    # If the robust variance of X_j is non-zero.
    else:
        # Standardize X_j as specified in section 7.2.2.
        Xj_std = Xj / np.sqrt(robust_var(Xj, method, alpha))
    # If the robust variance of X_r is zero, the standardized version of X_r is X_r itself.
    if robust_var(Xr, method, alpha) == 0:
        Xr_std = Xr
    # If the robust variance of X_r is non-zero.
    else:
        # Standardize X_r as specified in section 7.2.2.
        Xr_std = Xr / np.sqrt(robust_var(Xr, method, alpha))

    # Compute the robust correlation as specified in section 7.2.2, avoiding division-by-zero problems.
    robust_var_sum = robust_var(Xj_std + Xr_std, method, alpha)
    robust_var_diff = robust_var(Xj_std - Xr_std, method, alpha)
    if (robust_var_sum + robust_var_diff) == 0:
        corr = robust_var_sum - robust_var_diff
    else:
        corr = (robust_var_sum - robust_var_diff) / (robust_var_sum + robust_var_diff)
    return corr

################################################################################
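# Background note (not part of the package source): for standardized variables
# u and v, the classical identities Var(u+v) - Var(u-v) = 4*Cov(u,v) and
# Var(u+v) + Var(u-v) = 2*(Var(u) + Var(v)) show that the ratio above reduces
# to the ordinary correlation when the robust variance is replaced by the
# classical one; plugging in a robust variance yields a robust analogue.
rng_demo = np.random.default_rng(2)
Xj_demo = rng_demo.normal(size=200)
Xr_demo = 0.8 * Xj_demo + 0.6 * rng_demo.normal(size=200)  # true corr = 0.8
print(np.corrcoef(Xj_demo, Xr_demo)[0, 1])                 # classical, ~0.8
print(robust_corr(Xj_demo, Xr_demo, method='MAD'))         # robust, comparable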
def R_robust(X, method, alpha=None):
    """
    Computes the robust correlation matrix of a given data matrix `X`.

    Parameters
    ----------
    X : a pandas/polars data-frame or a numpy array.
    method : the method used to compute the robust correlation matrix. Must be a string in ['MAD', 'trimmed', 'winsorized'].
    alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'.

    Returns
    -------
    M : the robust correlation matrix for `X`.
    """

    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    p = X.shape[1]
    M = np.zeros((p, p))

    # The matrix is symmetric, so computing only j <= r would halve the work;
    # the full double loop is kept for simplicity.
    for j, r in product(range(0, p), range(0, p)):

        M[j, r] = robust_corr(Xj=X[:, j], Xr=X[:, r], method=method, alpha=alpha)

    return M

################################################################################
def delvin_trans(M, epsilon=0.05):
    """
    Applies the Delvin transformation to the matrix `M` passed as input to make it positive definite or closer to it.

    Parameters (inputs)
    ----------
    M : a pandas/polars data-frame or a numpy array.
    epsilon : parameter involved in the Delvin transformation that must be a small positive number close to zero. epsilon=0.05 is recommended.

    Returns (outputs)
    -------
    M_new : the Delvin transformation of the input matrix `M`.
    """

    if isinstance(M, (pl.DataFrame, pd.DataFrame)):
        M = M.to_numpy()

    # Define the function z (Fisher's z-transformation).
    def z(x):
        return np.arctanh(x)

    # Define the function z^{-1}.
    def z_inv(x):
        # arctanh is the inverse of tanh; therefore, the inverse of arctanh is tanh.
        return np.tanh(x)

    # Define the function g, which shrinks small correlations to zero and pulls
    # the remaining ones slightly towards zero.
    def g(i, j, M):
        if i == j:
            return 1
        else:
            if np.abs(M[i, j]) <= z(epsilon):
                return 0
            elif M[i, j] < -z(epsilon):
                return z_inv(M[i, j] + epsilon)
            elif M[i, j] > z(epsilon):
                return z_inv(M[i, j] - epsilon)

    # Build a matrix whose elements are the result of applying g to M element-wise.
    p = M.shape[1]
    M_new = np.zeros((p, p))

    for i, j in product(range(0, p), range(0, p)):

        M_new[i, j] = g(i, j, M)

    return M_new

################################################################################
def delvin_algorithm(M, epsilon, n_iters):
    """
    Applies the Delvin algorithm on the matrix `M` passed as input
    to make it positive definite by applying the Delvin transformation as many iterations as needed.

    Parameters (inputs)
    ----------
    M: a pandas/polars data-frame or a numpy array.
    epsilon : parameter used by the Delvin transformation. epsilon=0.05 is recommended.
    n_iters : maximum number of iterations run by the algorithm.

    Returns (outputs)
    -------
    M_new : the resulting matrix of applying the Delvin algorithm on `M`.
    i : the number of iterations performed.
    """

    M_new = M.copy()
    # Initialize i=0 to enter the while loop.
    i = 0
    # The loop keeps running while i is less than n_iters.
    while i < n_iters:
        # If M_new is already positive definite (all its eigenvalues are positive), return M_new.
        # Otherwise, apply the Delvin transformation and check positive definiteness again.
        # eigvalsh is used because M_new is symmetric, which guarantees real eigenvalues.
        if np.all(np.linalg.eigvalsh(M_new) > 0):
            return M_new, i
        else:
            M_new = delvin_trans(M=M_new, epsilon=epsilon)
            i = i + 1

    return M_new, i

################################################################################
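# Usage sketch (illustrative, not part of the package source): repair a
# symmetric unit-diagonal matrix that is not positive definite.
R_demo = np.array([[1.0, 0.9, 0.0],
                   [0.9, 1.0, 0.9],
                   [0.0, 0.9, 1.0]])
print(np.linalg.eigvalsh(R_demo))  # smallest eigenvalue is negative (~ -0.27)
R_pd, n_used = delvin_algorithm(R_demo, epsilon=0.05, n_iters=100)
print(np.linalg.eigvalsh(R_pd))    # all eigenvalues now positive
print(n_used)                      # iterations needed (1 here)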
def S_robust(X, method, epsilon, n_iters, alpha=None, weights=None):
    """
    Computes the robust covariance of the data matrix `X` by different methods.

    Parameters (inputs)
    ----------
    X: a pandas/polars data-frame or a numpy array.
    method: the method to be used to compute the robust covariance. Must be a string in ['MAD', 'trimmed', 'winsorized'].
    alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'.
    epsilon : parameter used by the Delvin transformation. epsilon=0.05 is recommended.
    n_iters : maximum number of iterations run by the Delvin algorithm.
    weights: the sample weights. Only used if provided.

    Returns (outputs)
    -------
    S_robust : the robust covariance matrix computed for `X`.
    """

    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    if weights is None:
        # Compute the robust correlation matrix for the data.
        R_robust_ = R_robust(X, method, alpha)
        # Apply the Delvin algorithm to the computed robust correlation matrix.
        R_robust_, i = delvin_algorithm(M=R_robust_, epsilon=epsilon, n_iters=n_iters)
        # Compute the robust covariance matrix from the robust correlation matrix.
        S_robust = np.diag(np.std(X, axis=0)) @ R_robust_ @ np.diag(np.std(X, axis=0))

    else:
        w = np.asarray(weights)
        n = len(X)
        # Jw = Dw^{1/2} (I - 1 w^T) is the weighted centering matrix. np.outer is
        # needed here: with 1-D arrays, `ones_arr @ w.T` would be a scalar dot
        # product rather than the intended n x n outer product.
        Dw_sqrt = sparse.diags(np.sqrt(w))
        I = np.identity(n)
        ones_arr = np.ones(n)
        Jw = Dw_sqrt @ (I - np.outer(ones_arr, w))
        Xw = Jw @ X  # Computational problems when n is too large, since Jw is an n x n matrix.
        # Compute the robust correlation matrix for the weighted data.
        R_robust_ = R_robust(Xw, method, alpha)
        # Apply the Delvin algorithm to the computed robust correlation matrix.
        R_robust_, i = delvin_algorithm(M=R_robust_, epsilon=epsilon, n_iters=n_iters)
        # Compute the robust covariance matrix from the robust correlation matrix.
        S_robust = np.diag(np.std(Xw, axis=0)) @ R_robust_ @ np.diag(np.std(Xw, axis=0))

    return S_robust

################################################################################
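# Usage sketch (illustrative, not part of the package source): the pipeline is
# robust correlations -> Delvin algorithm -> rescale by standard deviations.
rng_demo = np.random.default_rng(3)
X_demo = rng_demo.normal(size=(100, 3))
X_demo[0] = [50.0, -50.0, 50.0]  # contaminate with one gross outlier
S_demo = S_robust(X_demo, method='winsorized', alpha=0.1, epsilon=0.05, n_iters=20)
print(S_demo.shape)                   # (3, 3)
print(np.allclose(S_demo, S_demo.T))  # True: symmetric by construction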
def robust_maha_dist_matrix(X, S_robust):
    """
    Calculates the Robust Mahalanobis distance matrix for a data matrix `X`
    using a robust estimation of the covariance matrix.

    Parameters
    ----------
    X : pandas.DataFrame, polars.DataFrame, or np.ndarray
        The input data matrix with shape (n_samples, n_features).

    S_robust : np.ndarray
        Robust covariance matrix (e.g., from MCD or a trimmed estimator).
        Should be of shape (n_features, n_features).

    Returns
    -------
    D : np.ndarray
        Symmetric matrix (n_samples, n_samples) of Mahalanobis distances.
    """

    # Convert input to NumPy array if needed
    if isinstance(X, (pl.DataFrame, pd.DataFrame)):
        X = X.to_numpy()

    # Center the data (important for Mahalanobis)
    X_centered = X - np.mean(X, axis=0)

    # Use pseudo-inverse for stability
    S_robust_pinv = np.linalg.pinv(S_robust)

    # Force symmetry (sometimes lost by numerical error)
    S_robust_pinv = (S_robust_pinv + S_robust_pinv.T) / 2

    # Compute pairwise Mahalanobis distances with cdist
    D = cdist(X_centered, X_centered, metric='mahalanobis', VI=S_robust_pinv)

    return D

################################################################################
def robust_maha_dist(xi, xr, S_robust):
    """
    Calculates the Robust Mahalanobis distance between a pair of vectors.

    Parameters (inputs)
    ----------
    xi, xr: a pair of quantitative vectors. They represent a pair of statistical observations.
    S_robust: the robust covariance matrix of the data matrix to which `xi` and `xr` belong.

    Returns (outputs)
    -------
    The Robust Mahalanobis distance between the observations `xi` and `xr`.
    """

    if isinstance(xi, (pl.DataFrame, pd.DataFrame)):
        xi = xi.to_numpy().flatten()
    elif isinstance(xi, (pd.Series, pl.Series)):
        xi = xi.to_numpy()
    if isinstance(xr, (pl.DataFrame, pd.DataFrame)):
        xr = xr.to_numpy().flatten()
    elif isinstance(xr, (pd.Series, pl.Series)):
        xr = xr.to_numpy()

    # Stack the two observations as a 2-row matrix; the centering inside
    # robust_maha_dist_matrix cancels out in the pairwise difference, so the
    # off-diagonal entry is exactly the desired distance.
    X = np.array([xi, xr])
    dist_xi_xr = robust_maha_dist_matrix(X, S_robust)
    dist_xi_xr = dist_xi_xr[0, 1]

    return dist_xi_xr

################################################################################
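# End-to-end sketch (illustrative, not part of the package source): robust
# covariance plus robust Mahalanobis distances flag the contaminated row.
rng_demo = np.random.default_rng(4)
X_demo = rng_demo.normal(size=(100, 3))
X_demo[0] = [50.0, -50.0, 50.0]  # one gross outlier
S_demo = S_robust(X_demo, method='MAD', epsilon=0.05, n_iters=20)
D_demo = robust_maha_dist_matrix(X_demo, S_demo)
print(D_demo[0, 1:4])  # the outlying first row is far from the others
print(np.isclose(robust_maha_dist(X_demo[0], X_demo[1], S_demo), D_demo[0, 1]))  # True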
robust_mixed_dist-0.1.0.dist-info/METADATA
@@ -0,0 +1,27 @@
Metadata-Version: 2.4
Name: robust-mixed-dist
Version: 0.1.0
Summary: For more information, check out the official documentation of `robust_mixed_dist` at: https://fabioscielzoortiz.github.io/robust_mixed_dist-docu/intro.html
Home-page: https://github.com/FabioScielzoOrtiz/robust_mixed_dist-package
Author: Fabio Scielzo Ortiz
Author-email: fabio.scielzoortiz@gmail.com
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license-file
Dynamic: requires-python
Dynamic: summary

# robust_mixed_dist

For more information, check out the official documentation of `robust_mixed_dist` at: https://fabioscielzoortiz.github.io/robust_mixed_dist-docu/intro.html
robust_mixed_dist-0.1.0.dist-info/RECORD
@@ -0,0 +1,10 @@
robust_mixed_dist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
robust_mixed_dist/binary.py,sha256=n_RIANha7PDeeP8qKTizJQtA5zTP5KoOjcIY8vDkRjE,3322
robust_mixed_dist/mixed.py,sha256=mtHjh8e3ahxq51X0ri74N1O31OMQMUpmxvzmyJbsJVc,39403
robust_mixed_dist/multiclass.py,sha256=diUMIvP_O3BlOlMxz6Q7HIkmdDu18Pl9bbiszTHjweI,1778
robust_mixed_dist/quantitative.py,sha256=_wiIhyuwEjHW5twlYDCCfK8qXITPT8XPFz1wBQRq7Ho,22397
robust_mixed_dist-0.1.0.dist-info/licenses/LICENSE,sha256=6kbiFSfobTZ7beWiKnHpN902HgBx-Jzgcme0SvKqhKY,1091
robust_mixed_dist-0.1.0.dist-info/METADATA,sha256=D_e66QD2y5LVZvyjvwox9NgKBQjUv3HQtnLb2xepGQA,1004
robust_mixed_dist-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
robust_mixed_dist-0.1.0.dist-info/top_level.txt,sha256=kQcI1A0TrhkUiY8uvP0QHpZMPOwuLq-KojGhJoW9cjs,18
robust_mixed_dist-0.1.0.dist-info/RECORD,,