skfolio 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skfolio/__init__.py +29 -0
- skfolio/cluster/__init__.py +8 -0
- skfolio/cluster/_hierarchical.py +387 -0
- skfolio/datasets/__init__.py +20 -0
- skfolio/datasets/_base.py +389 -0
- skfolio/datasets/data/__init__.py +0 -0
- skfolio/datasets/data/factors_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
- skfolio/datasets/data/sp500_index.csv.gz +0 -0
- skfolio/distance/__init__.py +26 -0
- skfolio/distance/_base.py +55 -0
- skfolio/distance/_distance.py +574 -0
- skfolio/exceptions.py +30 -0
- skfolio/measures/__init__.py +76 -0
- skfolio/measures/_enums.py +355 -0
- skfolio/measures/_measures.py +607 -0
- skfolio/metrics/__init__.py +3 -0
- skfolio/metrics/_scorer.py +121 -0
- skfolio/model_selection/__init__.py +18 -0
- skfolio/model_selection/_combinatorial.py +407 -0
- skfolio/model_selection/_validation.py +194 -0
- skfolio/model_selection/_walk_forward.py +221 -0
- skfolio/moments/__init__.py +41 -0
- skfolio/moments/covariance/__init__.py +29 -0
- skfolio/moments/covariance/_base.py +101 -0
- skfolio/moments/covariance/_covariance.py +1108 -0
- skfolio/moments/expected_returns/__init__.py +21 -0
- skfolio/moments/expected_returns/_base.py +31 -0
- skfolio/moments/expected_returns/_expected_returns.py +415 -0
- skfolio/optimization/__init__.py +36 -0
- skfolio/optimization/_base.py +147 -0
- skfolio/optimization/cluster/__init__.py +13 -0
- skfolio/optimization/cluster/_nco.py +348 -0
- skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
- skfolio/optimization/cluster/hierarchical/_base.py +440 -0
- skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
- skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
- skfolio/optimization/convex/__init__.py +16 -0
- skfolio/optimization/convex/_base.py +1944 -0
- skfolio/optimization/convex/_distributionally_robust.py +392 -0
- skfolio/optimization/convex/_maximum_diversification.py +417 -0
- skfolio/optimization/convex/_mean_risk.py +974 -0
- skfolio/optimization/convex/_risk_budgeting.py +560 -0
- skfolio/optimization/ensemble/__init__.py +6 -0
- skfolio/optimization/ensemble/_base.py +87 -0
- skfolio/optimization/ensemble/_stacking.py +326 -0
- skfolio/optimization/naive/__init__.py +3 -0
- skfolio/optimization/naive/_naive.py +173 -0
- skfolio/population/__init__.py +3 -0
- skfolio/population/_population.py +883 -0
- skfolio/portfolio/__init__.py +13 -0
- skfolio/portfolio/_base.py +1096 -0
- skfolio/portfolio/_multi_period_portfolio.py +610 -0
- skfolio/portfolio/_portfolio.py +842 -0
- skfolio/pre_selection/__init__.py +7 -0
- skfolio/pre_selection/_pre_selection.py +342 -0
- skfolio/preprocessing/__init__.py +3 -0
- skfolio/preprocessing/_returns.py +114 -0
- skfolio/prior/__init__.py +18 -0
- skfolio/prior/_base.py +63 -0
- skfolio/prior/_black_litterman.py +238 -0
- skfolio/prior/_empirical.py +163 -0
- skfolio/prior/_factor_model.py +268 -0
- skfolio/typing.py +50 -0
- skfolio/uncertainty_set/__init__.py +23 -0
- skfolio/uncertainty_set/_base.py +108 -0
- skfolio/uncertainty_set/_bootstrap.py +281 -0
- skfolio/uncertainty_set/_empirical.py +237 -0
- skfolio/utils/__init__.py +0 -0
- skfolio/utils/bootstrap.py +115 -0
- skfolio/utils/equations.py +350 -0
- skfolio/utils/sorting.py +117 -0
- skfolio/utils/stats.py +466 -0
- skfolio/utils/tools.py +567 -0
- skfolio-0.0.1.dist-info/LICENSE +29 -0
- skfolio-0.0.1.dist-info/METADATA +568 -0
- skfolio-0.0.1.dist-info/RECORD +79 -0
- skfolio-0.0.1.dist-info/WHEEL +5 -0
- skfolio-0.0.1.dist-info/top_level.txt +1 -0
skfolio/utils/stats.py
ADDED
@@ -0,0 +1,466 @@
+"""Tools module"""
+
+# Author: Hugo Delatte <delatte.hugo@gmail.com>
+# License: BSD 3 clause
+
+
+from enum import auto
+
+import numpy as np
+import scipy.cluster.hierarchy as sch
+import scipy.optimize as sco
+import scipy.spatial.distance as scd
+import scipy.special as scs
+from scipy.sparse import csr_matrix
+
+from skfolio.utils.tools import AutoEnum
+
+__all__ = [
+    "NBinsMethod",
+    "n_bins_freedman",
+    "n_bins_knuth",
+    "is_cholesky_dec",
+    "assert_is_square",
+    "assert_is_symmetric",
+    "assert_is_distance",
+    "cov_nearest",
+    "cov_to_corr",
+    "corr_to_cov",
+    "commutation_matrix",
+    "compute_optimal_n_clusters",
+    "rand_weights",
+    "rand_weights_dirichlet",
+]
+
+
+class NBinsMethod(AutoEnum):
+    """Enumeration of the number-of-bins methods.
+
+    Attributes
+    ----------
+    FREEDMAN : str
+        Freedman method
+
+    KNUTH : str
+        Knuth method
+    """
+
+    FREEDMAN = auto()
+    KNUTH = auto()
+
+
+def n_bins_freedman(x: np.ndarray) -> int:
+    """Compute the optimal number of histogram bins using the
+    Freedman-Diaconis rule [1]_.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n_observations,)
+        The input array.
+
+    Returns
+    -------
+    n_bins : int
+        The optimal number of bins.
+
+    References
+    ----------
+    .. [1] "On the histogram as a density estimator: L2 theory".
+        Freedman & Diaconis (1981).
+    """
+    if x.ndim != 1:
+        raise ValueError("`x` must be a 1d-array")
+    n = len(x)
+    p_25, p_75 = np.percentile(x, [25, 75])
+    d = 2 * (p_75 - p_25) / (n ** (1 / 3))
+    if d == 0:
+        return 5
+    n_bins = max(1, np.ceil((np.max(x) - np.min(x)) / d))
+    return int(round(n_bins))
+
+
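A minimal usage sketch of the rule above (assuming skfolio is installed so that skfolio.utils.stats is importable):

    import numpy as np

    from skfolio.utils.stats import n_bins_freedman

    rng = np.random.default_rng(0)
    x = rng.standard_normal(1_000)  # 1d sample, as the function requires

    # Bin width d = 2 * IQR / n**(1/3); the bin count is range / d.
    print(n_bins_freedman(x))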
+def n_bins_knuth(x: np.ndarray) -> int:
+    """Compute the optimal number of histogram bins using Knuth's rule [1]_.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n_observations,)
+        The input array.
+
+    Returns
+    -------
+    n_bins : int
+        The optimal number of bins.
+
+    References
+    ----------
+    .. [1] "Optimal Data-Based Binning for Histograms".
+        Knuth.
+    """
+    x = np.sort(x)
+    n = len(x)
+
+    def func(y: float):
+        y = int(y)
+        if y <= 0:
+            return np.inf
+        bin_edges = np.linspace(x[0], x[-1], int(y) + 1)
+        hist, _ = np.histogram(x, bin_edges)
+        # Negative log-posterior of the bin count (to be minimized)
+        return -(
+            n * np.log(y)
+            + scs.gammaln(0.5 * y)
+            - y * scs.gammaln(0.5)
+            - scs.gammaln(n + 0.5 * y)
+            + np.sum(scs.gammaln(hist + 0.5))
+        )
+
+    n_bins_init = n_bins_freedman(x)
+    n_bins = sco.fmin(func, n_bins_init, disp=0)[0]
+    return int(round(n_bins))
+
+
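Knuth's rule maximizes a Bayesian log-posterior over the bin count, seeded with the Freedman-Diaconis estimate; a sketch comparing the two (same assumption that skfolio is importable):

    import numpy as np

    from skfolio.utils.stats import n_bins_freedman, n_bins_knuth

    rng = np.random.default_rng(42)
    x = rng.standard_normal(500)

    # Both return an integer bin count; the optimizer in n_bins_knuth
    # starts from the Freedman-Diaconis estimate.
    print(n_bins_freedman(x), n_bins_knuth(x))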
+def rand_weights_dirichlet(n: int) -> np.ndarray:
+    """Produce n random weights that sum to one, drawn from a Dirichlet
+    distribution (uniform distribution over the simplex).
+
+    Parameters
+    ----------
+    n : int
+        Number of weights.
+
+    Returns
+    -------
+    weights : ndarray of shape (n,)
+        The vector of weights.
+    """
+    return np.random.dirichlet(np.ones(n))
+
+
+def rand_weights(n: int, zeros: int = 0) -> np.ndarray:
+    """Produce n random weights that sum to one, drawn from a uniform
+    distribution (non-uniform distribution over the simplex).
+
+    Parameters
+    ----------
+    n : int
+        Number of weights.
+
+    zeros : int, default=0
+        The number of weights to randomly set to zero.
+
+    Returns
+    -------
+    weights : ndarray of shape (n,)
+        The vector of weights.
+    """
+    k = np.random.rand(n)
+    if zeros > 0:
+        zeros_idx = np.random.choice(n, zeros, replace=False)
+        k[zeros_idx] = 0
+    return k / sum(k)
+
+
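The two samplers differ in their distribution over the simplex; a sketch (both rely on NumPy's global random state):

    import numpy as np

    from skfolio.utils.stats import rand_weights, rand_weights_dirichlet

    np.random.seed(0)  # both helpers draw from NumPy's global random state

    w1 = rand_weights_dirichlet(5)  # uniform over the simplex
    w2 = rand_weights(5, zeros=2)   # normalized uniforms, two set to zero

    assert np.isclose(w1.sum(), 1.0) and np.isclose(w2.sum(), 1.0)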
+def is_cholesky_dec(x: np.ndarray) -> bool:
+    """Return True if a Cholesky decomposition can be computed.
+
+    The matrix must be Hermitian (symmetric if real-valued) and positive-definite.
+    No checking is performed to verify whether the matrix is Hermitian or not.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n, n)
+        The matrix.
+
+    Returns
+    -------
+    value : bool
+        True if a Cholesky decomposition can be applied to the matrix, False
+        otherwise.
+    """
+    # Around 100 times faster than checking for positive eigenvalues with
+    # np.linalg.eigh
+    try:
+        np.linalg.cholesky(x)
+        return True
+    except np.linalg.LinAlgError:
+        return False
+
+
+def is_positive_definite(x: np.ndarray) -> bool:
+    """Return True if the matrix is positive definite.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n, n)
+        The matrix.
+
+    Returns
+    -------
+    value : bool
+        True if the matrix is positive definite, False otherwise.
+    """
+    return np.all(np.linalg.eigvals(x) > 0)
+
+
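A sketch of how the two checks behave on a well-conditioned SPD matrix versus a "correlation" matrix with a negative eigenvalue (the kind of input cov_nearest below is meant to repair):

    import numpy as np

    from skfolio.utils.stats import is_cholesky_dec, is_positive_definite

    spd = np.array([[2.0, 0.5],
                    [0.5, 1.0]])  # well-conditioned SPD matrix

    # A "correlation" matrix with one negative eigenvalue (not PSD):
    not_psd = np.array([[1.0, 0.9, 0.2],
                        [0.9, 1.0, 0.9],
                        [0.2, 0.9, 1.0]])

    print(is_cholesky_dec(spd), is_positive_definite(spd))          # True True
    print(is_cholesky_dec(not_psd), is_positive_definite(not_psd))  # False False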
+def assert_is_square(x: np.ndarray) -> None:
+    """Raise an error if the matrix is not square.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n, n)
+        The matrix.
+
+    Raises
+    ------
+    ValueError
+        If the matrix is not square.
+    """
+    if x.ndim != 2 or x.shape[0] != x.shape[1]:
+        raise ValueError("The matrix must be square")
+
+
+def assert_is_symmetric(x: np.ndarray) -> None:
+    """Raise an error if the matrix is not symmetric.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n, n)
+        The matrix.
+
+    Raises
+    ------
+    ValueError
+        If the matrix is not symmetric.
+    """
+    assert_is_square(x)
+    if not np.allclose(x, x.T):
+        raise ValueError("The matrix must be symmetric")
+
+
+def assert_is_distance(x: np.ndarray) -> None:
+    """Raise an error if the matrix is not a distance matrix.
+
+    Parameters
+    ----------
+    x : ndarray of shape (n, n)
+        The matrix.
+
+    Raises
+    ------
+    ValueError
+        If the matrix is not a distance matrix.
+    """
+    assert_is_symmetric(x)
+    if not np.allclose(np.diag(x), np.zeros(x.shape[0]), atol=1e-5):
+        raise ValueError(
+            "The distance matrix must have diagonal elements close to zero"
+        )
+
+
+def cov_to_corr(cov: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """Convert a covariance matrix to a correlation matrix.
+
+    Parameters
+    ----------
+    cov : ndarray of shape (n, n)
+        Covariance matrix.
+
+    Returns
+    -------
+    corr, std : tuple[ndarray of shape (n, n), ndarray of shape (n,)]
+        Correlation matrix and standard-deviation vector.
+    """
+    if cov.ndim != 2:
+        raise ValueError(f"`cov` must be a 2D array, got a {cov.ndim}D array")
+    std = np.sqrt(np.diag(cov))
+    corr = cov / std / std[:, None]
+    return corr, std
+
+
+def corr_to_cov(corr: np.ndarray, std: np.ndarray) -> np.ndarray:
+    """Convert a correlation matrix to a covariance matrix given its
+    standard-deviation vector.
+
+    Parameters
+    ----------
+    corr : ndarray of shape (n, n)
+        Correlation matrix.
+
+    std : ndarray of shape (n,)
+        Standard-deviation vector.
+
+    Returns
+    -------
+    cov : ndarray of shape (n, n)
+        Covariance matrix.
+    """
+    if std.ndim != 1:
+        raise ValueError(f"`std` must be a 1D array, got a {std.ndim}D array")
+    if corr.ndim != 2:
+        raise ValueError(f"`corr` must be a 2D array, got a {corr.ndim}D array")
+    cov = corr * std * std[:, None]
+    return cov
+
+
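The two conversions are exact inverses when the same standard-deviation vector is reused; a sketch:

    import numpy as np

    from skfolio.utils.stats import corr_to_cov, cov_to_corr

    cov = np.array([[0.04, 0.006],
                    [0.006, 0.09]])  # 20% and 30% volatilities

    corr, std = cov_to_corr(cov)
    assert np.allclose(std, [0.2, 0.3])
    assert np.allclose(corr_to_cov(corr, std), cov)  # round trip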
+_CLIPPING_VALUE = 1e-13
+
+
+def cov_nearest(cov: np.ndarray, higham: bool = False, higham_max_iteration: int = 100):
+    """Compute the nearest covariance matrix that is positive definite and for
+    which a Cholesky decomposition can be computed. The variances are left
+    unchanged.
+
+    First, the covariance matrix is converted to a correlation matrix.
+    Then, the nearest correlation matrix is found and converted back to a
+    covariance matrix using the initial standard deviations.
+
+    Cholesky decomposition can fail for a symmetric positive definite (SPD)
+    matrix due to floating-point error and, conversely, can succeed for a
+    non-SPD matrix. Therefore, we need to test for both. We always start by
+    testing for Cholesky decomposition, which is significantly faster than
+    checking for positive eigenvalues.
+
+    Parameters
+    ----------
+    cov : ndarray of shape (n, n)
+        Covariance matrix.
+
+    higham : bool, default=False
+        If this is set to True, the Higham (2002) algorithm [1]_ is used,
+        otherwise the eigenvalues are clipped at a threshold just above zero
+        (1e-13). The default (`False`) is to use the clipping method, as the
+        Higham algorithm can be slow for large datasets.
+
+    higham_max_iteration : int, default=100
+        Maximum number of iterations of the Higham (2002) algorithm.
+        The default value is `100`.
+
+    Returns
+    -------
+    cov : ndarray
+        The nearest covariance matrix.
+
+    References
+    ----------
+    .. [1] "Computing the nearest correlation matrix - a problem from finance".
+        IMA Journal of Numerical Analysis.
+        Higham (2002).
+    """
+    assert_is_square(cov)
+    assert_is_symmetric(cov)
+
+    # Around 100 times faster than checking eigenvalues with np.linalg.eigh
+    if is_cholesky_dec(cov) and is_positive_definite(cov):
+        return cov
+
+    corr, std = cov_to_corr(cov)
+
+    if higham:
+        eps = np.finfo(np.float64).eps * 5
+        diff = np.zeros(corr.shape)
+        x = corr.copy()
+        for _ in range(higham_max_iteration):
+            x_adj = x - diff
+            eig_vals, eig_vecs = np.linalg.eigh(x_adj)
+            x = eig_vecs * np.maximum(eig_vals, eps) @ eig_vecs.T
+            diff = x - x_adj
+            np.fill_diagonal(x, 1)
+            cov = corr_to_cov(x, std)
+            if is_cholesky_dec(cov) and is_positive_definite(cov):
+                break
+        else:
+            raise ValueError("Unable to find the nearest positive definite matrix")
+    else:
+        eig_vals, eig_vecs = np.linalg.eigh(corr)
+        # Clipping the eigenvalues with a value smaller than 1e-13 can cause
+        # scipy to consider the matrix non-PSD in some corner cases
+        # (see test/test_stats.py)
+        x = eig_vecs * np.maximum(eig_vals, _CLIPPING_VALUE) @ eig_vecs.T
+        x, _ = cov_to_corr(x)
+        cov = corr_to_cov(x, std)
+
+    return cov
+
+
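A sketch of the default clipping path on the non-PSD matrix from the earlier example; the repaired matrix keeps the original variances while its smallest eigenvalue moves from clearly negative to approximately zero:

    import numpy as np

    from skfolio.utils.stats import cov_nearest

    corr = np.array([[1.0, 0.9, 0.2],
                     [0.9, 1.0, 0.9],
                     [0.2, 0.9, 1.0]])  # one negative eigenvalue
    cov = 0.01 * corr                   # 10% volatility on each asset

    near = cov_nearest(cov)             # default: eigenvalue clipping
    print(np.linalg.eigvalsh(near).min())            # ~0 (was ~ -1.7e-3)
    print(np.allclose(np.diag(near), np.diag(cov)))  # True: variances kept

    near_higham = cov_nearest(cov, higham=True)      # slower, more accurate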
+def commutation_matrix(x: np.ndarray) -> csr_matrix:
+    """Compute the commutation matrix.
+
+    Parameters
+    ----------
+    x : ndarray of shape (m, n)
+        The matrix.
+
+    Returns
+    -------
+    K : sparse matrix of shape (m * n, m * n)
+        The commutation matrix.
+    """
+    (m, n) = x.shape
+    row = np.arange(m * n)
+    col = row.reshape((m, n), order="F").ravel()
+    data = np.ones(m * n, dtype=np.int8)
+    k = csr_matrix((data, (row, col)), shape=(m * n, m * n))
+    return k
+
+
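The returned sparse permutation matrix K satisfies K vec(X) = vec(Xᵀ) with column-major vec; a sketch verifying this property:

    import numpy as np

    from skfolio.utils.stats import commutation_matrix

    x = np.arange(6.0).reshape(2, 3)
    k = commutation_matrix(x)  # sparse (6, 6) permutation matrix

    # Defining property: K @ vec(X) == vec(X.T), with column-major vec.
    vec = x.ravel(order="F")
    assert np.array_equal(k @ vec, x.T.ravel(order="F"))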
+def compute_optimal_n_clusters(distance: np.ndarray, linkage_matrix: np.ndarray) -> int:
+    r"""Compute the optimal number of clusters based on the Two-Order
+    Difference to Gap Statistic [1]_.
+
+    The Two-Order Difference to Gap Statistic was developed to improve the
+    performance and stability of Tibshirani's Gap statistic.
+    It applies the two-order difference of the within-cluster dispersion to
+    replace the reference null distribution in the Gap statistic.
+
+    The number of clusters :math:`k` is determined by:
+
+    .. math:: \begin{cases}
+                \begin{aligned}
+                &\max_{k} & & W_{k+2} + W_{k} - 2 W_{k+1} \\
+                &\text{s.t.} & & 1 \le k \le \max\bigl(8, \sqrt{n}\bigr) \\
+                \end{aligned}
+              \end{cases}
+
+    with :math:`n` the sample size and :math:`W_{k}` the within-cluster
+    dispersion defined as:
+
+    .. math:: W_{k} = \sum_{i=1}^{k} \frac{D_{i}}{2|C_{i}|}
+
+    where :math:`|C_{i}|` is the cardinality of cluster :math:`i` and
+    :math:`D_{i}` its density defined as:
+
+    .. math:: D_{i} = \sum_{u \in C_{i}} \sum_{v \in C_{i}} d(u,v)
+
+    with :math:`d(u,v)` the distance between :math:`u` and :math:`v`.
+
+    Parameters
+    ----------
+    distance : ndarray of shape (n, n)
+        Distance matrix.
+
+    linkage_matrix : ndarray of shape (n - 1, 4)
+        Linkage matrix.
+
+    Returns
+    -------
+    value : int
+        Optimal number of clusters.
+
+    References
+    ----------
+    .. [1] "Application of two-order difference to gap statistic".
+        Yue, Wang & Wei (2009).
+    """
+    cut_tree = sch.cut_tree(linkage_matrix)
+    n = cut_tree.shape[1]
+    max_clusters = max(8, round(np.sqrt(n)))
+    dispersion = []
+    for k in range(max_clusters):
+        level = cut_tree[:, n - k - 1]
+        cluster_density = []
+        for i in range(np.max(level) + 1):
+            cluster_idx = np.argwhere(level == i).flatten()
+            cluster_dists = scd.squareform(
+                distance[cluster_idx, :][:, cluster_idx], checks=False
+            )
+            if cluster_dists.shape[0] != 0:
+                cluster_density.append(np.nan_to_num(cluster_dists.mean()))
+        dispersion.append(np.sum(cluster_density))
+    dispersion = np.array(dispersion)
+    gaps = np.roll(dispersion, -2) + dispersion - 2 * np.roll(dispersion, -1)
+    gaps = gaps[:-2]
+    # k=0 represents one cluster
+    k = np.argmax(gaps) + 2
+    return k
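A sketch on synthetic data, assuming the usual SciPy hierarchical-clustering pipeline; two well-separated groups should yield two clusters:

    import numpy as np
    import scipy.cluster.hierarchy as sch
    import scipy.spatial.distance as scd

    from skfolio.utils.stats import compute_optimal_n_clusters

    rng = np.random.default_rng(0)
    points = np.vstack([rng.normal(0, 0.1, (10, 2)),   # blob at 0
                        rng.normal(5, 0.1, (10, 2))])  # blob at 5

    condensed = scd.pdist(points)
    distance = scd.squareform(condensed)               # (n, n) matrix
    linkage_matrix = sch.linkage(condensed, method="single")

    print(compute_optimal_n_clusters(distance, linkage_matrix))  # expected: 2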