riskfolio-lib 7.1.0 (cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
"""
Copyright (c) 2020-2025, Dany Cajas
All rights reserved.
This work is licensed under BSD 3-Clause "New" or "Revised" License.
License available at https://github.com/dcajasn/Riskfolio-Lib/blob/master/LICENSE.txt
"""

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as st
import scipy.cluster.hierarchy as hr
from scipy import linalg as LA
from statsmodels.stats.correlation_tools import cov_nearest
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import minimize
from sklearn.metrics import mutual_info_score
from sklearn.neighbors import KernelDensity
from sklearn.metrics import silhouette_samples
from astropy.stats import knuth_bin_width, freedman_bin_width, scott_bin_width
from itertools import product
import riskfolio.external.cppfunctions as cf
import riskfolio.src.GerberStatistic as gs
import re


__all__ = [
    "is_pos_def",
    "cov2corr",
    "corr2cov",
    "cov_fix",
    "cov_returns",
    "block_vec_pq",
    "dcorr",
    "dcorr_matrix",
    "numBins",
    "mutual_info_matrix",
    "var_info_matrix",
    "ltdi_matrix",
    "two_diff_gap_stat",
    "std_silhouette_score",
    "codep_dist",
    "fitKDE",
    "mpPDF",
    "errPDFs",
    "findMaxEval",
    "getPCA",
    "denoisedCorr",
    "shrinkCorr",
    "denoiseCov",
    "round_values",
    "weights_discretizetion",
    "color_list",
]


###############################################################################
# Additional Matrix Functions
###############################################################################


def is_pos_def(cov, threshold=1e-8):
    r"""
    Indicate if a matrix is positive (semi)definite.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    threshold : float, optional
        Minimum eigenvalue above which the matrix is considered positive
        (semi)definite. The default value is 1e-8.

    Returns
    -------
    value : bool
        True if the matrix is positive (semi)definite.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    cov_ = np.array(cov, ndmin=2)
    w = LA.eigh(cov_, lower=True, check_finite=True, eigvals_only=True)
    value = np.all(w >= threshold)

    return value


def cov2corr(cov):
    r"""
    Generate a correlation matrix from a covariance matrix cov.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.

    Returns
    -------
    corr : ndarray
        A correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    flag = False
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns.tolist()
        flag = True

    cov1 = np.array(cov, ndmin=2)
    std = np.sqrt(np.diag(cov1))
    corr = np.clip(cov1 / np.outer(std, std), a_min=-1.0, a_max=1.0)

    if flag:
        corr = pd.DataFrame(corr, index=cols, columns=cols)

    return corr


def corr2cov(corr, std):
    r"""
    Generate a covariance matrix from a correlation matrix corr and a standard
    deviation vector std.

    Parameters
    ----------
    corr : DataFrame of shape (n_assets, n_assets)
        Correlation matrix, where n_assets is the number of assets.
    std : 1darray
        Assets standard deviation vector of size n_features, where
        n_features is the number of features.

    Returns
    -------
    cov : ndarray
        A covariance matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    flag = False
    if isinstance(corr, pd.DataFrame):
        cols = corr.columns.tolist()
        flag = True

    cov = corr * np.outer(std, std)

    if flag:
        cov = pd.DataFrame(cov, index=cols, columns=cols)

    return cov
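

# Minimal usage sketch (synthetic data): round-trip between covariance and
# correlation, then check positive (semi)definiteness with is_pos_def.
def _example_cov2corr_roundtrip():
    cov = pd.DataFrame(
        [[0.04, 0.006], [0.006, 0.09]], index=["A", "B"], columns=["A", "B"]
    )
    corr = cov2corr(cov)  # correlation matrix implied by cov
    std = np.sqrt(np.diag(cov))  # standard deviations from the diagonal
    cov_back = corr2cov(corr, std)  # reconstruct the covariance matrix
    assert np.allclose(cov, cov_back)
    return is_pos_def(cov)  # True for this matrix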


def cov_fix(cov, method="clipped", threshold=1e-8):
    r"""
    Fix a covariance matrix to a positive definite matrix.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    method : str
        The default value is 'clipped', see more in `cov_nearest <https://www.statsmodels.org/stable/generated/statsmodels.stats.correlation_tools.cov_nearest.html>`_.
    threshold : float
        Clipping threshold for the smallest eigenvalue.

    Returns
    -------
    cov_ : ndarray or DataFrame
        A positive definite covariance matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns.tolist()
        flag = True

    cov_ = np.array(cov, ndmin=2)
    cov_ = cov_nearest(cov_, method=method, threshold=threshold)
    cov_ = np.array(cov_, ndmin=2)

    if flag:
        cov_ = pd.DataFrame(cov_, index=cols, columns=cols)

    return cov_
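

# Minimal usage sketch (synthetic data): a covariance matrix built from a
# duplicated column is singular; cov_fix repairs it so is_pos_def holds.
def _example_cov_fix():
    x = np.random.default_rng(0).standard_normal((100, 2))
    x = np.hstack([x, x[:, [0]]])  # third column duplicates the first
    cov = np.cov(x.T)  # singular covariance matrix
    fixed = cov_fix(cov, method="clipped", threshold=1e-6)
    return is_pos_def(cov), is_pos_def(fixed)  # typically (False, True)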


def cov_returns(cov, seed=0):
    r"""
    Generate a matrix of returns that have a covariance matrix cov.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    seed : int, optional
        Seed of the random number generator. The default value is 0.

    Returns
    -------
    a : ndarray
        A matrix of returns that have a covariance matrix cov.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    rs = np.random.RandomState(seed)
    n = len(cov)
    a = np.array(rs.randn(n + 10, n), ndmin=2)

    for i in range(0, 5):
        cov_ = np.cov(a.T)
        L = np.array(np.linalg.cholesky(cov_), ndmin=2)
        a = a @ np.linalg.inv(L).T
        cov_ = np.cov(a.T)
        desv_ = np.sqrt(np.array(np.diag(cov_), ndmin=2))
        a = (np.array(a) - np.mean(a, axis=0)) / np.array(desv_)

    L1 = np.array(np.linalg.cholesky(cov), ndmin=2)
    a = a @ L1.T

    return a
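

# Minimal usage sketch: the sample produced by cov_returns reproduces the
# requested covariance matrix up to floating-point error.
def _example_cov_returns():
    cov = np.array([[0.04, 0.006], [0.006, 0.09]])
    a = cov_returns(cov, seed=0)
    assert np.allclose(np.cov(a.T), cov)
    return a.shape  # (n_assets + 10, n_assets) = (12, 2)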


def block_vec_pq(A, p, q):
    r"""
    Calculates the block vectorization operator as shown in :cite:`d-VanLoan1993`
    and :cite:`d-Ojeda2015`.

    Parameters
    ----------
    A : ndarray
        Matrix that will be block vectorized.
    p : int
        Order p of the block vectorization operator.
    q : int
        Order q of the block vectorization operator.

    Returns
    -------
    bvec_A : ndarray
        The block vectorized matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    if isinstance(A, pd.DataFrame):
        A_ = A.to_numpy()
    elif isinstance(A, np.ndarray):
        A_ = A.copy()
    else:
        raise ValueError("A must be a 2darray or DataFrame.")

    mp, nq = A_.shape
    if mp % p == 0 and nq % q == 0:
        m = int(mp / p)
        n = int(nq / q)
        bvec_A = np.empty((0, p * q))
        for j in range(n):
            Aj = np.empty((0, p * q))
            for i in range(m):
                Aij = (
                    A_[i * p : (i + 1) * p, j * q : (j + 1) * q]
                    .reshape(-1, 1, order="F")
                    .T
                )
                Aj = np.vstack([Aj, Aij])
            bvec_A = np.vstack([bvec_A, Aj])
    else:
        raise ValueError(
            "Dimensions p and q give non-integer values for dimensions m and n."
        )

    return bvec_A
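

# Minimal usage sketch: block-vectorize a 4x4 matrix with p = q = 2. Each row
# of the result is one 2x2 block flattened in column-major (Fortran) order,
# e.g. the top-left block [[0, 1], [4, 5]] becomes [0, 4, 1, 5].
def _example_block_vec_pq():
    A = np.arange(16).reshape(4, 4)
    bvec = block_vec_pq(A, p=2, q=2)
    return bvec  # shape (4, 4): m * n = 4 blocks, each of length p * q = 4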


###############################################################################
# Additional Codependence Functions
###############################################################################


def dcorr(X, Y):
    r"""
    Calculate the distance correlation between two variables :cite:`d-Szekely`.

    Parameters
    ----------
    X : 1d-array
        Returns series, must be of shape n_samples x 1.
    Y : 1d-array
        Returns series, must be of shape n_samples x 1.

    Returns
    -------
    value : float
        The distance correlation between variables X and Y.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)

    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]

    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]

    if Y.shape[0] != X.shape[0]:
        raise ValueError("Number of samples must match")

    value = cf.d_corr(X, Y)

    return value


def dcorr_matrix(X):
    r"""
    Calculate the distance correlation matrix of n variables.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.

    Returns
    -------
    corr : ndarray
        The distance correlation matrix of shape n_features x n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    corr = cf.d_corr_matrix(X1)

    if flag:
        corr = pd.DataFrame(corr, index=cols, columns=cols)
    else:
        corr = pd.DataFrame(corr)

    return corr
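

# Minimal usage sketch (synthetic data): distance correlation picks up the
# quadratic dependence between x and y that Pearson correlation misses.
# Requires the compiled riskfolio.external.cppfunctions extension.
def _example_dcorr():
    rng = np.random.default_rng(0)
    x = rng.standard_normal(500)
    y = x**2 + 0.1 * rng.standard_normal(500)
    return dcorr(x, y), np.corrcoef(x, y)[0, 1]  # dcorr well above |pearson|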


def numBins(n_samples, corr=None):
    r"""
    Calculate the optimal number of bins for discretization of mutual
    information and variation of information.

    Parameters
    ----------
    n_samples : integer
        Number of samples.
    corr : float, optional
        Correlation coefficient of variables. The default value is None.

    Returns
    -------
    bins : int
        The optimal number of bins.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    # univariate case
    if corr is None:
        z = (
            8 + 324 * n_samples + 12 * (36 * n_samples + 729 * n_samples**2) ** 0.5
        ) ** (1 / 3)
        b = np.round(z / 6 + 2 / (3 * z) + 1 / 3)
    # bivariate case
    else:
        b = np.round(2**-0.5 * (1 + (1 + 24 * n_samples / (1 - corr**2)) ** 0.5) ** 0.5)

    bins = np.int32(b)

    return bins
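

# Worked example of the Hacine-Gharbi and Ravier rules implemented above: the
# univariate bin count grows with the cube root of the sample size, and the
# bivariate count increases as |corr| approaches 1.
def _example_numBins():
    # For these inputs the rules give 15, 9 and 13 bins respectively.
    return numBins(1000), numBins(1000, corr=0.0), numBins(1000, corr=0.9)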


def mutual_info_matrix(X, bins_info="KN", normalize=True):
    r"""
    Calculate the mutual information matrix of n variables.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    bins_info : int or str
        Number of bins used to calculate mutual information. The default
        value is 'KN'. Possible values are:

        - 'KN': Knuth's choice method. See more in `knuth_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html>`_.
        - 'FD': Freedman–Diaconis' choice method. See more in `freedman_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.freedman_bin_width.html>`_.
        - 'SC': Scott's choice method. See more in `scott_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.scott_bin_width.html>`_.
        - 'HGR': Hacine-Gharbi and Ravier's choice method.
        - int: integer value chosen by the user.

    normalize : bool
        Whether to normalize the mutual information. The default value is True.

    Returns
    -------
    mat : ndarray
        The mutual information matrix of shape n_features x n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    m = X1.shape[0]
    n = X1.shape[1]
    mat = np.zeros((n, n))
    indices = np.triu_indices(n)

    for i, j in zip(indices[0], indices[1]):
        if bins_info == "KN":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / knuth_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / knuth_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "FD":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / freedman_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / freedman_bin_width(
                    X1[:, j]
                )
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "SC":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / scott_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / scott_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "HGR":
            corr = np.corrcoef(X1[:, i], X1[:, j])[0, 1]
            if corr == 1:
                bins = numBins(m, None)
            else:
                bins = numBins(m, corr)
        elif isinstance(bins_info, np.int32) or isinstance(bins_info, int):
            bins = bins_info

        cXY = np.histogram2d(X1[:, i], X1[:, j], bins)[0]
        hX = st.entropy(np.histogram(X1[:, i], bins)[0])  # marginal
        hY = st.entropy(np.histogram(X1[:, j], bins)[0])  # marginal
        iXY = mutual_info_score(None, None, contingency=cXY)  # mutual information
        if normalize == True:
            iXY = iXY / np.min([hX, hY])  # normalized mutual information
        # hXY = hX + hY - iXY  # joint
        # hX_Y = hXY - hY  # conditional
        # hY_X = hXY - hX  # conditional

        mat[i, j] = iXY
        mat[j, i] = mat[i, j]

    mat = np.clip(np.round(mat, 8), a_min=0.0, a_max=np.inf)

    if flag:
        mat = pd.DataFrame(mat, index=cols, columns=cols)

    return mat


def var_info_matrix(X, bins_info="KN", normalize=True):
    r"""
    Calculate the variation of information matrix of n variables.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    bins_info : int or str
        Number of bins used to calculate variation of information. The default
        value is 'KN'. Possible values are:

        - 'KN': Knuth's choice method. See more in `knuth_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html>`_.
        - 'FD': Freedman–Diaconis' choice method. See more in `freedman_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.freedman_bin_width.html>`_.
        - 'SC': Scott's choice method. See more in `scott_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.scott_bin_width.html>`_.
        - 'HGR': Hacine-Gharbi and Ravier's choice method.
        - int: integer value chosen by the user.

    normalize : bool
        Whether to normalize the variation of information. The default value
        is True.

    Returns
    -------
    mat : ndarray
        The variation of information matrix of shape n_features x n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    m = X1.shape[0]
    n = X1.shape[1]
    mat = np.zeros((n, n))
    indices = np.triu_indices(n)

    for i, j in zip(indices[0], indices[1]):
        if bins_info == "KN":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / knuth_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / knuth_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "FD":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / freedman_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / freedman_bin_width(
                    X1[:, j]
                )
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "SC":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / scott_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / scott_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "HGR":
            corr = np.corrcoef(X1[:, i], X1[:, j])[0, 1]
            if corr == 1:
                bins = numBins(m, None)
            else:
                bins = numBins(m, corr)
        elif isinstance(bins_info, np.int32) or isinstance(bins_info, int):
            bins = bins_info

        cXY = np.histogram2d(X1[:, i], X1[:, j], bins)[0]
        hX = st.entropy(np.histogram(X1[:, i], bins)[0])  # marginal
        hY = st.entropy(np.histogram(X1[:, j], bins)[0])  # marginal
        iXY = mutual_info_score(None, None, contingency=cXY)  # mutual information
        vXY = hX + hY - 2 * iXY  # variation of information
        if normalize == True:
            hXY = hX + hY - iXY  # joint
            vXY = vXY / hXY  # normalized variation of information

        mat[i, j] = vXY
        mat[j, i] = mat[i, j]

    mat = np.clip(np.round(mat, 8), a_min=0.0, a_max=np.inf)

    if flag:
        mat = pd.DataFrame(mat, index=cols, columns=cols)

    return mat
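

# Minimal usage sketch (synthetic data): with normalize=True the diagonal of
# the mutual information matrix is 1 while the diagonal of the variation of
# information matrix is 0, so the latter behaves like a distance.
def _example_info_matrices():
    rng = np.random.default_rng(0)
    returns = pd.DataFrame(
        rng.standard_normal((500, 4)), columns=["A", "B", "C", "D"]
    )
    mi = mutual_info_matrix(returns, bins_info="HGR")
    vi = var_info_matrix(returns, bins_info="HGR")
    return mi, vi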


def ltdi_matrix(X, alpha=0.05):
    r"""
    Calculate the lower tail dependence index matrix using the empirical
    approach.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    alpha : float, optional
        Significance level for lower tail dependence index.
        The default is 0.05.

    Returns
    -------
    corr : ndarray
        The lower tail dependence index matrix of shape n_features x
        n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    m = X1.shape[0]
    n = X1.shape[1]
    k = np.int32(np.ceil(m * alpha))
    mat = np.ones((n, n))

    if k > 0:
        indices = np.triu_indices(n)

        for i, j in zip(indices[0], indices[1]):
            u = np.sort(X1[:, i])[k - 1]
            v = np.sort(X1[:, j])[k - 1]
            ltd = (
                np.sum(np.where(np.logical_and(X1[:, i] <= u, X1[:, j] <= v), 1, 0)) / k
            )

            mat[i, j] = ltd
            mat[j, i] = mat[i, j]

        for i in range(0, n):
            u = np.sort(X1[:, i])[k - 1]
            v = np.sort(X1[:, i])[k - 1]
            ltd = (
                np.sum(np.where(np.logical_and(X1[:, i] <= u, X1[:, i] <= v), 1, 0)) / k
            )

            mat[i, i] = ltd

    mat = np.clip(np.round(mat, 8), a_min=1.0e-8, a_max=1)

    if flag:
        mat = pd.DataFrame(mat, index=cols, columns=cols)
    else:
        mat = pd.DataFrame(mat)

    return mat
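

# Minimal usage sketch (synthetic data): with alpha=0.05 and 500 observations
# each pairwise index counts joint exceedances below the empirical 5%
# quantiles (k = 25 tail observations); diagonal entries equal 1.
def _example_ltdi_matrix():
    rng = np.random.default_rng(0)
    returns = pd.DataFrame(rng.standard_normal((500, 3)), columns=list("ABC"))
    return ltdi_matrix(returns, alpha=0.05)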


def two_diff_gap_stat(dist, clustering, max_k=10):
    r"""
    Calculate the optimal number of clusters based on the two difference gap
    statistic :cite:`d-twogap`.

    Parameters
    ----------
    dist : DataFrame of shape (n_assets, n_assets)
        Distance matrix based on the codependence matrix.
    clustering : ndarray
        The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
    max_k : int, optional
        Max number of clusters used by the two difference gap statistic
        to find the optimal number of clusters. The default is 10.

    Returns
    -------
    k : int
        The optimal number of clusters based on the two difference gap statistic.
    clustering_inds : 1d-array
        Cluster labels of each asset for the optimal number of clusters.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    # Check if linkage matrix is monotonic
    if hr.is_monotonic(clustering):
        flag = True
    # cluster levels from 1 to N-1 clusters
    cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
    level_k = cluster_lvls.columns.tolist()
    cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
    cluster_lvls.columns = level_k
    # Fix for nonmonotonic linkage matrices
    if flag is False:
        for i in cluster_lvls.columns:
            unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
            cluster_lvls[i] = indices
        cluster_lvls = cluster_lvls.T.drop_duplicates().T
        level_k = cluster_lvls.columns.tolist()
    cluster_k = cluster_lvls.nunique(axis=0).tolist()
    W_list = []
    n = dist.shape[0]

    # get within-cluster dissimilarity for each k
    for k in cluster_k:
        if k == 1:
            W_list.append(-np.inf)
        elif k > min(max_k, np.sqrt(n)) + 2:
            break
        else:
            level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
            D_list = []  # within-cluster distance list

            for i in range(np.max(level.unique()) + 1):
                cluster = level.loc[level == i]
                # Based on correlation distance
                cluster_dist = dist.loc[cluster.index, cluster.index]  # get distance
                cluster_pdist = squareform(cluster_dist, checks=False)
                if cluster_pdist.shape[0] != 0:
                    D = np.nan_to_num(cluster_pdist.std())
                    D_list.append(D)  # append to list

            W_k = np.sum(D_list)
            W_list.append(W_k)

    W_list = pd.Series(W_list)
    gaps = W_list.shift(-2) + W_list - 2 * W_list.shift(-1)
    k_index = int(gaps.idxmax())
    k = cluster_k[k_index]
    node_k = level_k[k_index]

    if flag:
        clustering_inds = cluster_lvls[node_k].tolist()
    else:
        clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
        j = len(np.unique(clustering_inds))
        while k != j:
            j += 1
            clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
            k = len(np.unique(clustering_inds))
        unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
        clustering_inds = indices

    return k, clustering_inds
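

# Minimal usage sketch (synthetic data): build a Pearson-based distance
# matrix, cluster it with Ward linkage, and select the number of clusters
# with the two difference gap statistic.
def _example_two_diff_gap_stat():
    rng = np.random.default_rng(0)
    returns = pd.DataFrame(
        rng.standard_normal((500, 8)), columns=[f"A{i}" for i in range(8)]
    )
    codep = returns.corr()
    dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    p_dist = squareform(dist, checks=False)
    clustering = hr.linkage(p_dist, method="ward", optimal_ordering=True)
    return two_diff_gap_stat(dist, clustering, max_k=5)  # (k, labels)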


def std_silhouette_score(dist, clustering, max_k=10):
    r"""
    Calculate the optimal number of clusters based on the standardized
    silhouette score index :cite:`d-Prado2`.

    Parameters
    ----------
    dist : DataFrame of shape (n_assets, n_assets)
        Distance matrix based on the codependence matrix.
    clustering : ndarray
        The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
    max_k : int, optional
        Max number of clusters used by the standardized silhouette score
        to find the optimal number of clusters. The default is 10.

    Returns
    -------
    k : int
        The optimal number of clusters based on the standardized silhouette score.
    clustering_inds : 1d-array
        Cluster labels of each asset for the optimal number of clusters.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    # Check if linkage matrix is monotonic
    if hr.is_monotonic(clustering):
        flag = True
    # cluster levels from 1 to N-1 clusters
    cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
    level_k = cluster_lvls.columns.tolist()
    cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
    cluster_lvls.columns = level_k
    # Fix for nonmonotonic linkage matrices
    if flag is False:
        for i in cluster_lvls.columns:
            unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
            cluster_lvls[i] = indices
        cluster_lvls = cluster_lvls.T.drop_duplicates().T
        level_k = cluster_lvls.columns.tolist()
    cluster_k = cluster_lvls.nunique(axis=0).tolist()
    scores_list = []
    n = dist.shape[0]

    # get the standardized silhouette score for each k
    for k in cluster_k:
        if k == 1:
            scores_list.append(-np.inf)
        elif k > min(max_k, np.sqrt(n)):
            break
        else:
            level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
            b = silhouette_samples(dist, level)
            scores_list.append(b.mean() / b.std())

    scores_list = pd.Series(scores_list)
    k_index = int(scores_list.idxmax())
    k = cluster_k[k_index]
    node_k = level_k[k_index]
    if flag:
        clustering_inds = cluster_lvls[node_k].tolist()
    else:
        clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
        j = len(np.unique(clustering_inds))
        while k != j:
            j += 1
            clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
            k = len(np.unique(clustering_inds))
        unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
        clustering_inds = indices

    return k, clustering_inds


def codep_dist(
    returns,
    custom_cov=None,
    codependence="pearson",
    bins_info="KN",
    alpha_tail=0.05,
    gs_threshold=0.5,
):
    r"""
    Calculate the codependence and distance matrix according to the selected
    method.

    Parameters
    ----------
    returns : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    custom_cov : DataFrame or None, optional
        Custom covariance matrix, used when codependence parameter has value
        'custom_cov'. The default is None.
    codependence : str, can be {'pearson', 'spearman', 'kendall', 'gerber1', 'gerber2', 'abs_pearson', 'abs_spearman', 'abs_kendall', 'distance', 'mutual_info', 'tail' or 'custom_cov'}
        The codependence or similarity matrix used to build the distance
        metric and clusters. The default is 'pearson'. Possible values are:

        - 'pearson': pearson correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{pearson}_{i,j})}`.
        - 'spearman': spearman correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{spearman}_{i,j})}`.
        - 'kendall': kendall correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{kendall}_{i,j})}`.
        - 'gerber1': Gerber statistic 1 correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{gerber1}_{i,j})}`.
        - 'gerber2': Gerber statistic 2 correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{gerber2}_{i,j})}`.
        - 'abs_pearson': absolute value pearson correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-|\rho_{i,j}|)}`.
        - 'abs_spearman': absolute value spearman correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-|\rho_{i,j}|)}`.
        - 'abs_kendall': absolute value kendall correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-|\rho^{kendall}_{i,j}|)}`.
        - 'distance': distance correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-\rho^{distance}_{i,j})}`.
        - 'mutual_info': mutual information matrix. Distance used is the variation of information matrix.
        - 'tail': lower tail dependence index matrix. Dissimilarity formula: :math:`D_{i,j} = -\log{\lambda_{i,j}}`.
        - 'custom_cov': use custom correlation matrix based on the custom_cov parameter. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{pearson}_{i,j})}`.

    bins_info : int or str
        Number of bins used to calculate variation of information. The default
        value is 'KN'. Possible values are:

        - 'KN': Knuth's choice method. See more in `knuth_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html>`_.
        - 'FD': Freedman–Diaconis' choice method. See more in `freedman_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.freedman_bin_width.html>`_.
        - 'SC': Scott's choice method. See more in `scott_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.scott_bin_width.html>`_.
        - 'HGR': Hacine-Gharbi and Ravier's choice method.
        - int: integer value chosen by the user.

    alpha_tail : float, optional
        Significance level for lower tail dependence index. The default is 0.05.
    gs_threshold : float, optional
        Gerber statistic threshold. The default is 0.5.

    Returns
    -------
    codep : DataFrame
        Codependence matrix.
    dist : DataFrame
        Distance matrix.

    Raises
    ------
    ValueError
        When the value cannot be calculated.

    """
    if codependence in {"pearson", "spearman", "kendall"}:
        codep = returns.corr(method=codependence)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    elif codependence == "gerber1":
        codep = gs.gerber_cov_stat1(returns, threshold=gs_threshold)
        codep = cov2corr(codep)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    elif codependence == "gerber2":
        codep = gs.gerber_cov_stat2(returns, threshold=gs_threshold)
        codep = cov2corr(codep)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    elif codependence in {"abs_pearson", "abs_spearman", "abs_kendall"}:
        codep = np.abs(returns.corr(method=codependence[4:]))
        dist = np.sqrt(np.clip((1 - codep), a_min=0.0, a_max=1.0))
    elif codependence in {"distance"}:
        codep = dcorr_matrix(returns).astype(float)
        dist = np.sqrt(np.clip((1 - codep), a_min=0.0, a_max=1.0))
    elif codependence in {"mutual_info"}:
        codep = mutual_info_matrix(returns, bins_info).astype(float)
        dist = var_info_matrix(returns, bins_info).astype(float)
    elif codependence in {"tail"}:
        codep = ltdi_matrix(returns, alpha_tail).astype(float)
        dist = -np.log(codep)
    elif codependence in {"custom_cov"}:
        codep = cov2corr(custom_cov).astype(float)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))

    return codep, dist
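

# Minimal usage sketch (synthetic data): codep_dist returns the codependence
# matrix together with the distance matrix derived from it, here with the
# default Pearson rule D = sqrt(0.5 * (1 - rho)).
def _example_codep_dist():
    rng = np.random.default_rng(0)
    returns = pd.DataFrame(
        rng.standard_normal((500, 5)), columns=list("ABCDE")
    )
    codep, dist = codep_dist(returns, codependence="pearson")
    assert np.allclose(dist, np.sqrt((1 - codep) / 2))
    return codep, dist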


###############################################################################
# Denoising Functions Based on Lopez de Prado Book
###############################################################################


def fitKDE(obs, bWidth=0.01, kernel="gaussian", x=None):
    """
    Fit a kernel to a series of observations obs, and derive the probability
    of the observations. x is the array of values on which the fitted KDE
    will be evaluated; the result is the empirical Probability Density
    Function (PDF). For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    obs : ndarray
        Observations to fit; commonly the diagonal of the eigenvalue matrix.
    bWidth : float, optional
        The bandwidth of the kernel. The default value is 0.01.
    kernel : string, optional
        The kernel to use. The default value is 'gaussian'. For more information see: `kernel-density <https://scikit-learn.org/stable/modules/density.html#kernel-density>`_.
        Possible values are:

        - 'gaussian': gaussian kernel.
        - 'tophat': tophat kernel.
        - 'epanechnikov': epanechnikov kernel.
        - 'exponential': exponential kernel.
        - 'linear': linear kernel.
        - 'cosine': cosine kernel.

    x : ndarray, optional
        The array of values on which the fitted KDE will be evaluated.

    Returns
    -------
    pdf : pd.Series
        Empirical PDF.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    if len(obs.shape) == 1:
        obs = obs.reshape(-1, 1)

    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)

    if x is None:
        x = np.unique(obs).reshape(-1, 1)

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())

    return pdf


def mpPDF(var, q, pts):
    r"""
    Creates a Marchenko-Pastur Probability Density Function (PDF). For more
    information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    var : float
        Variance.
    q : float
        T/N where T is the number of rows and N the number of columns.
    pts : int
        Number of points used to construct the PDF.

    Returns
    -------
    pdf : pd.Series
        Marchenko-Pastur PDF.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    if isinstance(var, np.ndarray):
        if var.shape == (1,):
            var = var[0]

    eMin, eMax = var * (1 - (1.0 / q) ** 0.5) ** 2, var * (1 + (1.0 / q) ** 0.5) ** 2
    eVal = np.linspace(eMin, eMax, pts)
    pdf = q / (2 * np.pi * var * eVal) * ((eMax - eVal) * (eVal - eMin)) ** 0.5
    pdf = pd.Series(pdf, index=eVal)

    return pdf


def errPDFs(var, eVal, q, bWidth=0.01, pts=1000):
    r"""
    Fit error of the empirical PDF against the Marchenko-Pastur PDF. For more
    information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    var : float
        Variance.
    eVal : ndarray
        Eigenvalues to fit.
    q : float
        T/N where T is the number of rows and N the number of columns.
    bWidth : float, optional
        The bandwidth of the kernel. The default value is 0.01.
    pts : int
        Number of points used to construct the PDF. The default value is 1000.

    Returns
    -------
    sse : float
        Sum of squared errors between the empirical and theoretical PDFs.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    # Fit error
    pdf0 = mpPDF(var, q, pts)  # theoretical pdf
    pdf1 = fitKDE(eVal, bWidth, x=pdf0.index.values)  # empirical pdf
    sse = np.sum((pdf1 - pdf0) ** 2)

    return sse


def findMaxEval(eVal, q, bWidth=0.01):
    r"""
    Find the maximum random eigenvalue by fitting the Marchenko-Pastur
    distribution; every eigenvalue larger than this is a signal eigenvalue.
    For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    eVal : ndarray
        Eigenvalues to fit.
    q : float
        T/N where T is the number of rows and N the number of columns.
    bWidth : float, optional
        The bandwidth of the kernel. The default value is 0.01.

    Returns
    -------
    value : tuple (float, float)
        The first value is the maximum random eigenvalue and the second is
        the variance attributed to noise; (1 - var) is one way to measure
        the signal-to-noise ratio.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    out = minimize(
        lambda *x: errPDFs(*x), 0.5, args=(eVal, q, bWidth), bounds=((1e-5, 1 - 1e-5),)
    )

    if out["success"]:
        var = out["x"][0]
    else:
        var = 1

    eMax = var * (1 + (1.0 / q) ** 0.5) ** 2

    return eMax, var


def getPCA(matrix):
    r"""
    Gets the eigenvalues and eigenvectors of a Hermitian matrix.
    For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    matrix : ndarray or pd.DataFrame
        Correlation matrix.

    Returns
    -------
    value : tuple (ndarray, ndarray)
        The first element is the diagonal matrix of eigenvalues and the
        second the matrix of eigenvectors, sorted in descending order of
        eigenvalue.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    # Get eVal,eVec from a Hermitian matrix
    eVal, eVec = np.linalg.eigh(matrix)
    indices = eVal.argsort()[::-1]  # arguments for sorting eVal desc
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)

    return eVal, eVec
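

# Minimal usage sketch (pure-noise data): for i.i.d. returns the fitted
# variance is close to 1 and the cutoff eMax is close to the Marchenko-Pastur
# upper edge (1 + sqrt(N / T)) ** 2, about 1.73 for T = 1000, N = 100.
def _example_findMaxEval():
    rng = np.random.default_rng(0)
    T, N = 1000, 100
    corr = np.corrcoef(rng.standard_normal((T, N)).T)
    eVal, eVec = getPCA(corr)
    eMax, var = findMaxEval(np.diag(eVal), q=T / N, bWidth=0.01)
    return eMax, var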


def denoisedCorr(eVal, eVec, nFacts, kind="fixed"):
    r"""
    Remove noise from the correlation matrix by fixing random eigenvalues
    (fixed method) or setting them to zero (spectral method). For more
    information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    eVal : 1darray
        Eigenvalues.
    eVec : ndarray
        Eigenvectors.
    nFacts : int
        The number of factors.
    kind : str, optional
        The denoise method. The default value is 'fixed'. Possible values are:

        - 'fixed': replaces the eigenvalues below the maximum Marchenko-Pastur limit with their average.
        - 'spectral': sets the eigenvalues below the maximum Marchenko-Pastur limit to zero.

    Returns
    -------
    corr : ndarray
        Denoised correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    eVal_ = np.diag(eVal).copy()

    if kind == "fixed":
        eVal_[nFacts:] = eVal_[nFacts:].sum() / float(eVal_.shape[0] - nFacts)
    elif kind == "spectral":
        eVal_[nFacts:] = 0

    eVal_ = np.diag(eVal_)
    corr = np.dot(eVec, eVal_).dot(eVec.T)
    corr = cov2corr(corr)

    return corr


def shrinkCorr(eVal, eVec, nFacts, alpha=0):
    r"""
    Remove noise from the correlation matrix using target shrinkage. For more
    information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    eVal : 1darray
        Eigenvalues.
    eVec : ndarray
        Eigenvectors.
    nFacts : int
        The number of factors.
    alpha : float, optional
        Shrinkage factor. The default value is 0.

    Returns
    -------
    corr : ndarray
        Denoised correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    eVal_L = eVal[:nFacts, :nFacts]
    eVec_L = eVec[:, :nFacts]
    eVal_R = eVal[nFacts:, nFacts:]
    eVec_R = eVec[:, nFacts:]
    corr0 = np.dot(eVec_L, eVal_L).dot(eVec_L.T)
    corr1 = np.dot(eVec_R, eVal_R).dot(eVec_R.T)
    corr2 = corr0 + alpha * corr1 + (1 - alpha) * np.diag(np.diag(corr1))

    return corr2


def denoiseCov(cov, q, kind="fixed", bWidth=0.01, detone=False, mkt_comp=1, alpha=0.1):
    r"""
    Remove noise from cov by fixing random eigenvalues of its correlation
    matrix. For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    q : float
        T/N where T is the number of rows and N the number of columns.
    kind : str, optional
        The denoise method. The default value is 'fixed'. Possible values are:

        - 'fixed': replaces the eigenvalues below the maximum Marchenko-Pastur limit with their average.
        - 'spectral': sets the eigenvalues below the maximum Marchenko-Pastur limit to zero.
        - 'shrink': uses the target shrinkage method.

    bWidth : float, optional
        The bandwidth of the kernel. The default value is 0.01.
    detone : bool, optional
        Whether to remove the first mkt_comp principal components of the
        correlation matrix (detoning). The detoned correlation matrix is
        singular, so it cannot be inverted. The default value is False.
    mkt_comp : int, optional
        Number of first principal components that will be removed by the
        detone method. The default value is 1.
    alpha : float, optional
        Shrinkage factor. The default value is 0.1.

    Returns
    -------
    cov_ : ndarray or pd.DataFrame
        Denoised covariance matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    flag = False
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns.tolist()
        flag = True

    corr = cov2corr(cov)
    std = np.diag(cov) ** 0.5
    eVal, eVec = getPCA(corr)
    eMax, var = findMaxEval(np.diag(eVal), q, bWidth)
    nFacts = eVal.shape[0] - np.diag(eVal)[::-1].searchsorted(eMax)

    if kind in ["fixed", "spectral"]:
        corr = denoisedCorr(eVal, eVec, nFacts, kind=kind)
    elif kind == "shrink":
        corr = shrinkCorr(eVal, eVec, nFacts, alpha=alpha)

    if detone == True:
        eVal_ = eVal[:mkt_comp, :mkt_comp]
        eVec_ = eVec[:, :mkt_comp]
        corr_ = np.dot(eVec_, eVal_).dot(eVec_.T)
        corr = corr - corr_

    cov_ = corr2cov(corr, std)

    if flag:
        cov_ = pd.DataFrame(cov_, index=cols, columns=cols)

    return cov_
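

# Minimal end-to-end sketch (synthetic data): denoise the covariance matrix
# of a noisy panel with T observations of N assets, where q = T / N.
def _example_denoiseCov():
    rng = np.random.default_rng(0)
    T, N = 1000, 50
    X = rng.standard_normal((T, N))
    cov = pd.DataFrame(np.cov(X.T))
    cov_d = denoiseCov(cov, q=T / N, kind="fixed", bWidth=0.01)
    return is_pos_def(cov_d)  # the denoised matrix remains positive definite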


###############################################################################
# Other Additional Functions
###############################################################################


def round_values(data, decimals=4, wider=False):
    r"""
    This function helps us round values toward zero or away from zero.

    Parameters
    ----------
    data : np.ndarray, pd.Series or pd.DataFrame
        Data that are going to be rounded.
    decimals : integer
        Number of decimals used to round.
    wider : bool
        False to round toward zero, True to round away from zero.

    Returns
    -------
    value : np.ndarray, pd.Series or pd.DataFrame
        Data rounded using selected method.

    Raises
    ------
    ValueError
        When the value cannot be calculated.

    """

    if wider == True:
        value = np.where(
            data >= 0,
            np.ceil(data * 10**decimals) / 10**decimals,
            np.floor(data * 10**decimals) / 10**decimals,
        )
    elif wider == False:
        value = np.where(
            data >= 0,
            np.floor(data * 10**decimals) / 10**decimals,
            np.ceil(data * 10**decimals) / 10**decimals,
        )

    if isinstance(data, pd.DataFrame):
        value = pd.DataFrame(value, columns=data.columns, index=data.index)
    if isinstance(data, pd.Series):
        value = pd.Series(value, index=data.index)

    return value
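

# Minimal usage sketch: wider=False truncates toward zero while wider=True
# rounds away from zero, for positive and negative values alike.
def _example_round_values():
    data = pd.Series([0.12345, -0.12345])
    narrow = round_values(data, decimals=4, wider=False)  # 0.1234, -0.1234
    wide = round_values(data, decimals=4, wider=True)  # 0.1235, -0.1235
    return narrow, wide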


def weights_discretizetion(
    weights, prices, capital=1000000, w_decimal=6, ascending=False
):
    r"""
    This function helps us find the number of shares that must be bought or
    sold to achieve portfolio weights according to the prices of assets and
    the invested capital.

    Parameters
    ----------
    weights : pd.Series or pd.DataFrame
        Vector of weights of size n_assets x 1.
    prices : pd.Series or pd.DataFrame
        Vector of prices of size n_assets x 1.
    capital : float, optional
        Capital invested. The default value is 1000000.
    w_decimal : int, optional
        Number of decimals used to round the portfolio weights. The default
        value is 6.
    ascending : bool, optional
        If True assigns excess capital to assets with lower weights, else,
        to assets with higher weights. The default value is False.

    Returns
    -------
    n_shares : pd.DataFrame
        Number of shares that must be bought or sold to achieve portfolio
        weights.

    Raises
    ------
    ValueError
        When the value cannot be calculated.

    """

    if isinstance(weights, pd.Series):
        w = weights.to_frame().copy()
    elif isinstance(weights, pd.DataFrame):
        if weights.shape[0] == 1:
            w = weights.T.copy()
        elif weights.shape[1] == 1:
            w = weights.copy()
        else:
            raise ValueError("weights must have size n_assets x 1")
    else:
        raise ValueError("weights must be a Series or DataFrame")

    if isinstance(prices, pd.Series):
        p = prices.to_frame().copy()
    elif isinstance(prices, pd.DataFrame):
        if prices.shape[0] == 1:
            p = prices.T.copy()
        elif prices.shape[1] == 1:
            p = prices.copy()
        else:
            raise ValueError("prices must have size n_assets x 1")
    else:
        raise ValueError("prices must be a Series or DataFrame")

    w.columns = [0]
    p.columns = [0]

    total = w.sum().item()
    w = round_values(w, decimals=w_decimal, wider=False)
    w.loc[w.idxmin().tolist()] = w.loc[w.idxmin().tolist()] + (total - w.sum()).item()

    n_shares = round_values(capital * w / p, decimals=0, wider=False)

    excedent = [capital + 1, capital]
    i = 1
    while excedent[i] < excedent[i - 1]:
        new_capital = (n_shares.T @ p).iloc[0, 0]
        excedent.append(capital - new_capital)
        new_shares = round_values(excedent[-1] * w / p, 0)
        n_shares += new_shares
        i += 1

    n_shares_1 = capital * w / p

    excedent = capital - (n_shares.T @ p).iloc[0, 0]

    d_shares = np.abs(n_shares_1) - np.abs(n_shares)
    d_shares = np.where(d_shares > 0, n_shares_1 - n_shares, 0)
    d_shares = round_values(d_shares, decimals=0, wider=True)
    d_shares = pd.DataFrame(d_shares, columns=w.columns, index=w.index)

    order = w.sort_values(by=0, ascending=ascending).index.tolist()
    d_list = d_shares[d_shares[0] == 1].index.tolist()

    for i in order:
        if i in d_list:
            new_shares = round_values(excedent / p.loc[i, 0], 0).item()
            if new_shares > 0:
                n_shares.loc[i] += new_shares
                excedent = capital - (n_shares.T @ p).iloc[0, 0]

    return n_shares
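

# Minimal usage sketch (hypothetical prices): convert target weights into
# whole-share positions for a given capital; the residual is the cash left
# over after discretization.
def _example_weights_discretizetion():
    weights = pd.Series([0.25, 0.40, 0.35], index=["AAA", "BBB", "CCC"])
    prices = pd.Series([12.75, 104.20, 37.50], index=["AAA", "BBB", "CCC"])
    shares = weights_discretizetion(weights, prices, capital=100000)
    residual = 100000 - (shares.T @ prices.to_frame(0)).iloc[0, 0]
    return shares, residual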


def color_list(k):
    r"""
    This function creates a list of colors.

    Parameters
    ----------
    k : int
        Number of colors.

    Returns
    -------
    colors : list
        A list of colors.
    """

    colors = []

    if k <= 10:
        for i in range(10):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab10").colors[i]))
    elif k <= 20:
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20").colors[i]))
    elif k <= 40:
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20").colors[i]))
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20b").colors[i]))
    else:
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20").colors[i]))
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20b").colors[i]))
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20c").colors[i]))
    if k / 60 > 1:
        colors = colors * int(np.ceil(k / 60))

    return colors
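

# Minimal usage sketch: below 11 colors the tab10 palette is returned; the
# combined tab20/tab20b/tab20c palette of 60 colors is tiled once k exceeds
# 60, so color_list(75) yields 120 entries.
def _example_color_list():
    return len(color_list(8)), len(color_list(75))  # (10, 120)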