skfolio-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skfolio/__init__.py +29 -0
  2. skfolio/cluster/__init__.py +8 -0
  3. skfolio/cluster/_hierarchical.py +387 -0
  4. skfolio/datasets/__init__.py +20 -0
  5. skfolio/datasets/_base.py +389 -0
  6. skfolio/datasets/data/__init__.py +0 -0
  7. skfolio/datasets/data/factors_dataset.csv.gz +0 -0
  8. skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
  9. skfolio/datasets/data/sp500_index.csv.gz +0 -0
  10. skfolio/distance/__init__.py +26 -0
  11. skfolio/distance/_base.py +55 -0
  12. skfolio/distance/_distance.py +574 -0
  13. skfolio/exceptions.py +30 -0
  14. skfolio/measures/__init__.py +76 -0
  15. skfolio/measures/_enums.py +355 -0
  16. skfolio/measures/_measures.py +607 -0
  17. skfolio/metrics/__init__.py +3 -0
  18. skfolio/metrics/_scorer.py +121 -0
  19. skfolio/model_selection/__init__.py +18 -0
  20. skfolio/model_selection/_combinatorial.py +407 -0
  21. skfolio/model_selection/_validation.py +194 -0
  22. skfolio/model_selection/_walk_forward.py +221 -0
  23. skfolio/moments/__init__.py +41 -0
  24. skfolio/moments/covariance/__init__.py +29 -0
  25. skfolio/moments/covariance/_base.py +101 -0
  26. skfolio/moments/covariance/_covariance.py +1108 -0
  27. skfolio/moments/expected_returns/__init__.py +21 -0
  28. skfolio/moments/expected_returns/_base.py +31 -0
  29. skfolio/moments/expected_returns/_expected_returns.py +415 -0
  30. skfolio/optimization/__init__.py +36 -0
  31. skfolio/optimization/_base.py +147 -0
  32. skfolio/optimization/cluster/__init__.py +13 -0
  33. skfolio/optimization/cluster/_nco.py +348 -0
  34. skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
  35. skfolio/optimization/cluster/hierarchical/_base.py +440 -0
  36. skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
  37. skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
  38. skfolio/optimization/convex/__init__.py +16 -0
  39. skfolio/optimization/convex/_base.py +1944 -0
  40. skfolio/optimization/convex/_distributionally_robust.py +392 -0
  41. skfolio/optimization/convex/_maximum_diversification.py +417 -0
  42. skfolio/optimization/convex/_mean_risk.py +974 -0
  43. skfolio/optimization/convex/_risk_budgeting.py +560 -0
  44. skfolio/optimization/ensemble/__init__.py +6 -0
  45. skfolio/optimization/ensemble/_base.py +87 -0
  46. skfolio/optimization/ensemble/_stacking.py +326 -0
  47. skfolio/optimization/naive/__init__.py +3 -0
  48. skfolio/optimization/naive/_naive.py +173 -0
  49. skfolio/population/__init__.py +3 -0
  50. skfolio/population/_population.py +883 -0
  51. skfolio/portfolio/__init__.py +13 -0
  52. skfolio/portfolio/_base.py +1096 -0
  53. skfolio/portfolio/_multi_period_portfolio.py +610 -0
  54. skfolio/portfolio/_portfolio.py +842 -0
  55. skfolio/pre_selection/__init__.py +7 -0
  56. skfolio/pre_selection/_pre_selection.py +342 -0
  57. skfolio/preprocessing/__init__.py +3 -0
  58. skfolio/preprocessing/_returns.py +114 -0
  59. skfolio/prior/__init__.py +18 -0
  60. skfolio/prior/_base.py +63 -0
  61. skfolio/prior/_black_litterman.py +238 -0
  62. skfolio/prior/_empirical.py +163 -0
  63. skfolio/prior/_factor_model.py +268 -0
  64. skfolio/typing.py +50 -0
  65. skfolio/uncertainty_set/__init__.py +23 -0
  66. skfolio/uncertainty_set/_base.py +108 -0
  67. skfolio/uncertainty_set/_bootstrap.py +281 -0
  68. skfolio/uncertainty_set/_empirical.py +237 -0
  69. skfolio/utils/__init__.py +0 -0
  70. skfolio/utils/bootstrap.py +115 -0
  71. skfolio/utils/equations.py +350 -0
  72. skfolio/utils/sorting.py +117 -0
  73. skfolio/utils/stats.py +466 -0
  74. skfolio/utils/tools.py +567 -0
  75. skfolio-0.0.1.dist-info/LICENSE +29 -0
  76. skfolio-0.0.1.dist-info/METADATA +568 -0
  77. skfolio-0.0.1.dist-info/RECORD +79 -0
  78. skfolio-0.0.1.dist-info/WHEEL +5 -0
  79. skfolio-0.0.1.dist-info/top_level.txt +1 -0
skfolio/utils/stats.py ADDED
@@ -0,0 +1,466 @@
+ """Stats module."""
+
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
+ # License: BSD 3 clause
+
+
+ from enum import auto
+
+ import numpy as np
+ import scipy.cluster.hierarchy as sch
+ import scipy.optimize as sco
+ import scipy.spatial.distance as scd
+ import scipy.special as scs
+ from scipy.sparse import csr_matrix
+
+ from skfolio.utils.tools import AutoEnum
+
+ __all__ = [
+     "NBinsMethod",
+     "n_bins_freedman",
+     "n_bins_knuth",
+     "is_cholesky_dec",
+     "is_positive_definite",
+     "assert_is_square",
+     "assert_is_symmetric",
+     "assert_is_distance",
+     "cov_nearest",
+     "cov_to_corr",
+     "corr_to_cov",
+     "commutation_matrix",
+     "compute_optimal_n_clusters",
+     "rand_weights",
+     "rand_weights_dirichlet",
+ ]
+
+
+ class NBinsMethod(AutoEnum):
+     """Enumeration of the number-of-bins estimation methods.
+
+     Parameters
+     ----------
+     FREEDMAN : str
+         Freedman-Diaconis method.
+
+     KNUTH : str
+         Knuth method.
+     """
+
+     FREEDMAN = auto()
+     KNUTH = auto()
+
+
+ def n_bins_freedman(x: np.ndarray) -> int:
+     """Compute the optimal number of histogram bins using the Freedman-Diaconis
+     rule [1]_.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n_observations,)
+         The input array.
+
+     Returns
+     -------
+     n_bins : int
+         The optimal number of bins.
+
+     References
+     ----------
+     .. [1] "On the histogram as a density estimator: L2 theory".
+         Freedman & Diaconis (1981).
+     """
+     if x.ndim != 1:
+         raise ValueError("`x` must be a 1D array")
+     n = len(x)
+     p_25, p_75 = np.percentile(x, [25, 75])
+     d = 2 * (p_75 - p_25) / (n ** (1 / 3))
+     if d == 0:
+         return 5
+     n_bins = max(1, np.ceil((np.max(x) - np.min(x)) / d))
+     return int(round(n_bins))
+
+
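A minimal usage sketch for `n_bins_freedman`, assuming the skfolio 0.0.1 wheel above is installed; the exact count depends on the sample:

    import numpy as np
    from skfolio.utils.stats import n_bins_freedman

    rng = np.random.default_rng(0)
    x = rng.standard_normal(1_000)
    n_bins = n_bins_freedman(x)  # roughly 20-25 bins for 1,000 normal draws
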
+ def n_bins_knuth(x: np.ndarray) -> int:
+     """Compute the optimal number of histogram bins using Knuth's rule [1]_.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n_observations,)
+         The input array.
+
+     Returns
+     -------
+     n_bins : int
+         The optimal number of bins.
+
+     References
+     ----------
+     .. [1] "Optimal Data-Based Binning for Histograms".
+         Knuth (2006).
+     """
+     x = np.sort(x)
+     n = len(x)
+
+     def func(y: float):
+         y = int(y)
+         if y <= 0:
+             return np.inf
+         bin_edges = np.linspace(x[0], x[-1], y + 1)
+         hist, _ = np.histogram(x, bin_edges)
+         return -(
+             n * np.log(y)
+             + scs.gammaln(0.5 * y)
+             - y * scs.gammaln(0.5)
+             - scs.gammaln(n + 0.5 * y)
+             + np.sum(scs.gammaln(hist + 0.5))
+         )
+
+     n_bins_init = n_bins_freedman(x)
+     n_bins = sco.fmin(func, n_bins_init, disp=0)[0]
+     return int(round(n_bins))
+
+
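The same check with Knuth's rule, which starts from the Freedman-Diaconis estimate and refines it by maximizing the Bayesian log-posterior above (a sketch under the same assumptions):

    import numpy as np
    from skfolio.utils.stats import n_bins_knuth

    rng = np.random.default_rng(0)
    x = rng.standard_normal(1_000)
    n_bins = n_bins_knuth(x)  # typically close to the Freedman-Diaconis count
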
+ def rand_weights_dirichlet(n: int) -> np.ndarray:
+     """Produce n random weights that sum to one, drawn from a Dirichlet
+     distribution (uniform distribution over the simplex).
+
+     Parameters
+     ----------
+     n : int
+         Number of weights.
+
+     Returns
+     -------
+     weights : ndarray of shape (n,)
+         The vector of weights.
+     """
+     return np.random.dirichlet(np.ones(n))
+
+
+ def rand_weights(n: int, zeros: int = 0) -> np.ndarray:
+     """Produce n random weights that sum to one, drawn from a uniform
+     distribution then normalized (which is not uniform over the simplex).
+
+     Parameters
+     ----------
+     n : int
+         Number of weights.
+
+     zeros : int, default=0
+         The number of weights to randomly set to zero.
+
+     Returns
+     -------
+     weights : ndarray of shape (n,)
+         The vector of weights.
+     """
+     k = np.random.rand(n)
+     if zeros > 0:
+         zeros_idx = np.random.choice(n, zeros, replace=False)
+         k[zeros_idx] = 0
+     return k / k.sum()
+
+
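A sketch contrasting the two weight generators; both return vectors summing to one, but only the Dirichlet draw is uniform over the simplex:

    import numpy as np
    from skfolio.utils.stats import rand_weights, rand_weights_dirichlet

    w1 = rand_weights_dirichlet(5)      # uniform over the 5-simplex
    w2 = rand_weights(10, zeros=3)      # 3 of the 10 weights forced to zero
    assert np.isclose(w1.sum(), 1.0)
    assert np.isclose(w2.sum(), 1.0)
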
+ def is_cholesky_dec(x: np.ndarray) -> bool:
+     """Return True if the Cholesky decomposition can be computed.
+     The matrix must be Hermitian (symmetric if real-valued) and positive-definite.
+     No checking is performed to verify whether the matrix is Hermitian or not.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n, n)
+         The matrix.
+
+     Returns
+     -------
+     value : bool
+         True if the Cholesky decomposition can be applied to the matrix,
+         False otherwise.
+     """
+     # Around 100 times faster than checking for positive eigenvalues with np.linalg.eigh
+     try:
+         np.linalg.cholesky(x)
+         return True
+     except np.linalg.LinAlgError:
+         return False
+
+
+ def is_positive_definite(x: np.ndarray) -> bool:
+     """Return True if the matrix is positive definite.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n, n)
+         The matrix.
+
+     Returns
+     -------
+     value : bool
+         True if the matrix is positive definite, False otherwise.
+     """
+     return np.all(np.linalg.eigvals(x) > 0)
+
+
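A small illustration of the two checks on a positive definite matrix and on a symmetric but indefinite one:

    import numpy as np
    from skfolio.utils.stats import is_cholesky_dec, is_positive_definite

    a = np.array([[2.0, 0.5], [0.5, 1.0]])  # positive definite
    b = np.array([[1.0, 2.0], [2.0, 1.0]])  # symmetric, eigenvalues 3 and -1
    assert is_cholesky_dec(a) and is_positive_definite(a)
    assert not is_cholesky_dec(b) and not is_positive_definite(b)
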
+ def assert_is_square(x: np.ndarray) -> None:
+     """Raise an error if the matrix is not square.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n, n)
+         The matrix.
+
+     Raises
+     ------
+     ValueError
+         If the matrix is not square.
+     """
+     if x.ndim != 2 or x.shape[0] != x.shape[1]:
+         raise ValueError("The matrix must be square")
+
+
+ def assert_is_symmetric(x: np.ndarray) -> None:
+     """Raise an error if the matrix is not symmetric.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n, n)
+         The matrix.
+
+     Raises
+     ------
+     ValueError
+         If the matrix is not symmetric.
+     """
+     assert_is_square(x)
+     if not np.allclose(x, x.T):
+         raise ValueError("The matrix must be symmetric")
+
+
+ def assert_is_distance(x: np.ndarray) -> None:
+     """Raise an error if the matrix is not a distance matrix.
+
+     Parameters
+     ----------
+     x : ndarray of shape (n, n)
+         The matrix.
+
+     Raises
+     ------
+     ValueError
+         If the matrix is not a distance matrix.
+     """
+     assert_is_symmetric(x)
+     if not np.allclose(np.diag(x), np.zeros(x.shape[0]), atol=1e-5):
+         raise ValueError(
+             "The distance matrix must have diagonal elements close to zero"
+         )
+
+
+ def cov_to_corr(cov: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+     """Convert a covariance matrix to a correlation matrix.
+
+     Parameters
+     ----------
+     cov : ndarray of shape (n, n)
+         Covariance matrix.
+
+     Returns
+     -------
+     corr, std : tuple[ndarray of shape (n, n), ndarray of shape (n,)]
+         Correlation matrix and standard-deviation vector.
+     """
+     if cov.ndim != 2:
+         raise ValueError(f"`cov` must be a 2D array, got a {cov.ndim}D array")
+     std = np.sqrt(np.diag(cov))
+     corr = cov / std / std[:, None]
+     return corr, std
+
+
+ def corr_to_cov(corr: np.ndarray, std: np.ndarray) -> np.ndarray:
+     """Convert a correlation matrix to a covariance matrix given its
+     standard-deviation vector.
+
+     Parameters
+     ----------
+     corr : ndarray of shape (n, n)
+         Correlation matrix.
+
+     std : ndarray of shape (n,)
+         Standard-deviation vector.
+
+     Returns
+     -------
+     cov : ndarray of shape (n, n)
+         Covariance matrix.
+     """
+     if std.ndim != 1:
+         raise ValueError(f"`std` must be a 1D array, got a {std.ndim}D array")
+     if corr.ndim != 2:
+         raise ValueError(f"`corr` must be a 2D array, got a {corr.ndim}D array")
+     cov = corr * std * std[:, None]
+     return cov
+
+
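The two conversions are inverses of each other, which a round trip makes explicit:

    import numpy as np
    from skfolio.utils.stats import corr_to_cov, cov_to_corr

    cov = np.array([[4.0, 1.2], [1.2, 9.0]])
    corr, std = cov_to_corr(cov)   # corr = [[1.0, 0.2], [0.2, 1.0]], std = [2.0, 3.0]
    assert np.allclose(corr_to_cov(corr, std), cov)
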
+ _CLIPPING_VALUE = 1e-13
+
+
+ def cov_nearest(
+     cov: np.ndarray, higham: bool = False, higham_max_iteration: int = 100
+ ) -> np.ndarray:
+     """Compute the nearest covariance matrix that is positive definite and for
+     which a Cholesky decomposition can be computed. The variance is left unchanged.
+
+     First, it converts the covariance matrix to a correlation matrix.
+     Then, it finds the nearest correlation matrix and converts it back to a
+     covariance matrix using the initial standard deviation.
+
+     A Cholesky decomposition can fail for a symmetric positive definite (SPD)
+     matrix due to floating point errors and, conversely, can succeed for a
+     non-SPD matrix. Therefore, we need to test for both. We always start by
+     testing for the Cholesky decomposition, which is significantly faster than
+     checking for positive eigenvalues.
+
+     Parameters
+     ----------
+     cov : ndarray of shape (n, n)
+         Covariance matrix.
+
+     higham : bool, default=False
+         If this is set to True, the Higham (2002) algorithm [1]_ is used,
+         otherwise the negative eigenvalues are clipped at a small positive
+         threshold (1e-13). The default (`False`) is to use the clipping method,
+         as the Higham algorithm can be slow for large datasets.
+
+     higham_max_iteration : int, default=100
+         Maximum number of iterations of the Higham (2002) algorithm.
+         The default value is `100`.
+
+     Returns
+     -------
+     cov : ndarray
+         The nearest covariance matrix.
+
+     References
+     ----------
+     .. [1] "Computing the nearest correlation matrix - a problem from finance".
+         IMA Journal of Numerical Analysis.
+         Higham (2002).
+     """
+     assert_is_square(cov)
+     assert_is_symmetric(cov)
+
+     # Around 100 times faster than checking eigenvalues with np.linalg.eigh
+     if is_cholesky_dec(cov) and is_positive_definite(cov):
+         return cov
+
+     corr, std = cov_to_corr(cov)
+
+     if higham:
+         eps = np.finfo(np.float64).eps * 5
+         diff = np.zeros(corr.shape)
+         x = corr.copy()
+         for _ in range(higham_max_iteration):
+             x_adj = x - diff
+             eig_vals, eig_vecs = np.linalg.eigh(x_adj)
+             x = eig_vecs * np.maximum(eig_vals, eps) @ eig_vecs.T
+             diff = x - x_adj
+             np.fill_diagonal(x, 1)
+             cov = corr_to_cov(x, std)
+             if is_cholesky_dec(cov) and is_positive_definite(cov):
+                 break
+         else:
+             raise ValueError("Unable to find the nearest positive definite matrix")
+     else:
+         eig_vals, eig_vecs = np.linalg.eigh(corr)
+         # Clipping the eigenvalues with a value smaller than 1e-13 can cause
+         # scipy to consider the matrix non-PSD in some corner cases
+         # (see test/test_stats.py)
+         x = eig_vecs * np.maximum(eig_vals, _CLIPPING_VALUE) @ eig_vecs.T
+         x, _ = cov_to_corr(x)
+         cov = corr_to_cov(x, std)
+
+     return cov
+
+
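A sketch of `cov_nearest` repairing an indefinite matrix while preserving the variances; the toy matrix below is symmetric with one negative eigenvalue:

    import numpy as np
    from skfolio.utils.stats import cov_nearest, is_positive_definite

    cov = np.array([[1.0, 0.9, 0.7],
                    [0.9, 1.0, 0.3],
                    [0.7, 0.3, 1.0]])  # det < 0, hence not positive definite
    fixed = cov_nearest(cov)           # default: eigenvalue clipping
    print(is_positive_definite(fixed))                # True
    print(np.allclose(np.diag(fixed), np.diag(cov)))  # True: variances unchanged
    fixed_h = cov_nearest(cov, higham=True)           # iterative Higham variant
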
+ def commutation_matrix(x: np.ndarray) -> csr_matrix:
+     """Compute the commutation matrix K such that K @ vec(x) = vec(x.T),
+     with vec the column-major vectorization.
+
+     Parameters
+     ----------
+     x : ndarray of shape (m, n)
+         The matrix.
+
+     Returns
+     -------
+     K : sparse matrix of shape (m * n, m * n)
+         The commutation matrix.
+     """
+     (m, n) = x.shape
+     row = np.arange(m * n)
+     col = row.reshape((m, n), order="F").ravel()
+     data = np.ones(m * n, dtype=np.int8)
+     k = csr_matrix((data, (row, col)), shape=(m * n, m * n))
+     return k
+
+
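The defining property K @ vec(x) = vec(x.T) can be verified directly, using column-major (`order="F"`) vectorization:

    import numpy as np
    from skfolio.utils.stats import commutation_matrix

    a = np.arange(6.0).reshape(2, 3)
    k = commutation_matrix(a)  # sparse matrix of shape (6, 6)
    assert np.allclose(k @ a.ravel(order="F"), a.T.ravel(order="F"))
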
+ def compute_optimal_n_clusters(distance: np.ndarray, linkage_matrix: np.ndarray) -> int:
+     r"""Compute the optimal number of clusters based on the Two-Order Difference
+     to Gap Statistic [1]_.
+
+     The Two-Order Difference to Gap Statistic was developed to improve the
+     performance and stability of Tibshirani's Gap statistic.
+     It applies the two-order difference of the within-cluster dispersion to
+     replace the reference null distribution in the Gap statistic.
+
+     The number of clusters :math:`k` is determined by:
+
+     .. math:: \begin{cases}
+             \begin{aligned}
+             &\max_{k} & & W_{k+2} + W_{k} - 2 W_{k+1} \\
+             &\text{s.t.} & & 1 \le k \le \max\bigl(8, \sqrt{n}\bigr) \\
+             \end{aligned}
+         \end{cases}
+
+     with :math:`n` the sample size and :math:`W_{k}` the within-cluster
+     dispersion defined as:
+
+     .. math:: W_{k} = \sum_{i=1}^{k} \frac{D_{i}}{2|C_{i}|}
+
+     where :math:`|C_{i}|` is the cardinality of cluster :math:`i` and
+     :math:`D_{i}` its density defined as:
+
+     .. math:: D_{i} = \sum_{u \in C_{i}} \sum_{v \in C_{i}} d(u,v)
+
+     with :math:`d(u,v)` the distance between :math:`u` and :math:`v`.
+
+     Parameters
+     ----------
+     distance : ndarray of shape (n, n)
+         Distance matrix.
+
+     linkage_matrix : ndarray of shape (n - 1, 4)
+         Linkage matrix.
+
+     Returns
+     -------
+     value : int
+         Optimal number of clusters.
+
+     References
+     ----------
+     .. [1] "Application of two-order difference to gap statistic".
+         Yue, Wang & Wei (2009).
+     """
+     cut_tree = sch.cut_tree(linkage_matrix)
+     n = cut_tree.shape[1]
+     max_clusters = max(8, round(np.sqrt(n)))
+     dispersion = []
+     for k in range(max_clusters):
+         level = cut_tree[:, n - k - 1]
+         cluster_density = []
+         for i in range(np.max(level) + 1):
+             cluster_idx = np.argwhere(level == i).flatten()
+             cluster_dists = scd.squareform(
+                 distance[cluster_idx, :][:, cluster_idx], checks=False
+             )
+             if cluster_dists.shape[0] != 0:
+                 cluster_density.append(np.nan_to_num(cluster_dists.mean()))
+         dispersion.append(np.sum(cluster_density))
+     dispersion = np.array(dispersion)
+     gaps = np.roll(dispersion, -2) + dispersion - 2 * np.roll(dispersion, -1)
+     gaps = gaps[:-2]
+     # k=0 represents one cluster
+     k = np.argmax(gaps) + 2
+     return k
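
A usage sketch on hypothetical toy data with two well-separated blobs, where the expected answer is 2; `pdist`, `squareform` and `linkage` come from scipy:

    import numpy as np
    import scipy.cluster.hierarchy as sch
    import scipy.spatial.distance as scd
    from skfolio.utils.stats import compute_optimal_n_clusters

    rng = np.random.default_rng(42)
    points = np.vstack([rng.normal(0.0, 0.1, size=(20, 2)),
                        rng.normal(5.0, 0.1, size=(20, 2))])
    condensed = scd.pdist(points)         # condensed pairwise distances
    distance = scd.squareform(condensed)  # square distance matrix
    linkage_matrix = sch.linkage(condensed, method="ward")
    print(compute_optimal_n_clusters(distance, linkage_matrix))  # expected: 2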