riskfolio-lib 7.2.0__cp313-cp313-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- riskfolio/__init__.py +14 -0
- riskfolio/external/__init__.py +10 -0
- riskfolio/external/cppfunctions.py +376 -0
- riskfolio/external/functions.cpython-313-darwin.so +0 -0
- riskfolio/src/AuxFunctions.py +1488 -0
- riskfolio/src/ConstraintsFunctions.py +2210 -0
- riskfolio/src/DBHT.py +1089 -0
- riskfolio/src/GerberStatistic.py +240 -0
- riskfolio/src/HCPortfolio.py +1102 -0
- riskfolio/src/OwaWeights.py +433 -0
- riskfolio/src/ParamsEstimation.py +1989 -0
- riskfolio/src/PlotFunctions.py +5052 -0
- riskfolio/src/Portfolio.py +6164 -0
- riskfolio/src/Reports.py +692 -0
- riskfolio/src/RiskFunctions.py +3195 -0
- riskfolio/src/__init__.py +20 -0
- riskfolio/version.py +4 -0
- riskfolio_lib-7.2.0.dist-info/LICENSE.txt +27 -0
- riskfolio_lib-7.2.0.dist-info/METADATA +386 -0
- riskfolio_lib-7.2.0.dist-info/RECORD +22 -0
- riskfolio_lib-7.2.0.dist-info/WHEEL +6 -0
- riskfolio_lib-7.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1488 @@
"""""" #

"""
Copyright (c) 2020-2026, Dany Cajas
All rights reserved.
This work is licensed under BSD 3-Clause "New" or "Revised" License.
License available at https://github.com/dcajasn/Riskfolio-Lib/blob/master/LICENSE.txt
"""

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as st
import scipy.cluster.hierarchy as hr
from scipy import linalg as LA
from statsmodels.stats.correlation_tools import cov_nearest
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import minimize
from sklearn.metrics import mutual_info_score
from sklearn.neighbors import KernelDensity
from sklearn.metrics import silhouette_samples
from astropy.stats import knuth_bin_width, freedman_bin_width, scott_bin_width
from itertools import product
import riskfolio.external.cppfunctions as cf
import riskfolio.src.GerberStatistic as gs
import re


__all__ = [
    "is_pos_def",
    "cov2corr",
    "corr2cov",
    "cov_fix",
    "cov_returns",
    "block_vec_pq",
    "dcorr",
    "dcorr_matrix",
    "numBins",
    "mutual_info_matrix",
    "var_info_matrix",
    "ltdi_matrix",
    "two_diff_gap_stat",
    "std_silhouette_score",
    "codep_dist",
    "fitKDE",
    "mpPDF",
    "errPDFs",
    "findMaxEval",
    "getPCA",
    "denoisedCorr",
    "shrinkCorr",
    "denoiseCov",
    "round_values",
    "weights_discretizetion",
    "color_list",
]

###############################################################################
# Additional Matrix Functions
###############################################################################


def is_pos_def(cov, threshold=1e-8):
    r"""
    Indicate if a matrix is positive (semi)definite.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.

    Returns
    -------
    value : bool
        True if matrix is positive (semi)definite.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    cov_ = np.array(cov, ndmin=2)
    w = LA.eigh(cov_, lower=True, check_finite=True, eigvals_only=True)
    value = np.all(w >= threshold)

    return value


def cov2corr(cov):
    r"""
    Generate a correlation matrix from a covariance matrix cov.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.

    Returns
    -------
    corr : ndarray
        A correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    flag = False
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns.tolist()
        flag = True

    cov1 = np.array(cov, ndmin=2)
    std = np.sqrt(np.diag(cov1))
    corr = np.clip(cov1 / np.outer(std, std), a_min=-1.0, a_max=1.0)

    if flag:
        corr = pd.DataFrame(corr, index=cols, columns=cols)

    return corr


def corr2cov(corr, std):
    r"""
    Generate a covariance matrix from a correlation matrix corr and a standard
    deviation vector std.

    Parameters
    ----------
    corr : DataFrame of shape (n_assets, n_assets)
        Correlation matrix, where n_assets is the number of assets.
    std : 1darray
        Assets standard deviation vector of size n_features, where
        n_features is the number of features.

    Returns
    -------
    cov : ndarray
        A covariance matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    flag = False
    if isinstance(corr, pd.DataFrame):
        cols = corr.columns.tolist()
        flag = True

    cov = corr * np.outer(std, std)

    if flag:
        cov = pd.DataFrame(cov, index=cols, columns=cols)

    return cov


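# Editor's note: a minimal round-trip sketch for cov2corr/corr2cov, not part
# of the package source. The 2x2 covariance below is a made-up example: with
# standard deviations 0.2 and 0.3, the off-diagonal 0.006 implies a
# correlation of 0.006 / (0.2 * 0.3) = 0.1. Guarded so it only runs as a script.
if __name__ == "__main__":
    _cov = pd.DataFrame(
        [[0.04, 0.006], [0.006, 0.09]], index=["A", "B"], columns=["A", "B"]
    )
    _corr = cov2corr(_cov)  # off-diagonals become 0.1
    _back = corr2cov(_corr, np.sqrt(np.diag(_cov)))  # rebuild the covariance
    assert np.allclose(_cov, _back)  # the round trip recovers cov exactly

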
def cov_fix(cov, method="clipped", threshold=1e-8):
    r"""
    Fix a covariance matrix to a positive definite matrix.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    method : str
        The default value is 'clipped', see more in `cov_nearest <https://www.statsmodels.org/stable/generated/statsmodels.stats.correlation_tools.cov_nearest.html>`_.
    threshold : float
        Clipping threshold for the smallest eigenvalue.

    Returns
    -------
    cov_ : ndarray or DataFrame
        A positive definite covariance matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns.tolist()
        flag = True

    cov_ = np.array(cov, ndmin=2)
    cov_ = cov_nearest(cov_, method=method, threshold=threshold)
    cov_ = np.array(cov_, ndmin=2)

    if flag:
        cov_ = pd.DataFrame(cov_, index=cols, columns=cols)

    return cov_


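# Editor's note: a small sketch, not part of the package source, showing
# cov_fix repairing an indefinite "covariance" matrix. The matrix below is a
# made-up illustration that is deliberately non-PSD (its pairwise 0.99
# entries are jointly infeasible; one eigenvalue is about -0.98).
if __name__ == "__main__":
    _bad = np.array([[1.0, 0.99, -0.99], [0.99, 1.0, 0.99], [-0.99, 0.99, 1.0]])
    assert not is_pos_def(_bad)  # an eigenvalue sits below the threshold
    _fixed = cov_fix(_bad, method="clipped", threshold=1e-6)
    assert is_pos_def(_fixed)  # eigenvalues are clipped well above 1e-8

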
def cov_returns(cov, seed=0):
    r"""
    Generate a matrix of returns that have a covariance matrix cov.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    seed : int, optional
        Seed of the random number generator. The default value is 0.

    Returns
    -------
    a : ndarray
        A matrix of returns that have a covariance matrix cov.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    rs = np.random.RandomState(seed)
    n = len(cov)
    a = np.array(rs.randn(n + 10, n), ndmin=2)

    for i in range(0, 5):
        cov_ = np.cov(a.T)
        L = np.array(np.linalg.cholesky(cov_), ndmin=2)
        a = a @ np.linalg.inv(L).T
        cov_ = np.cov(a.T)
        desv_ = np.sqrt(np.array(np.diag(cov_), ndmin=2))
        a = (np.array(a) - np.mean(a, axis=0)) / np.array(desv_)

    L1 = np.array(np.linalg.cholesky(cov), ndmin=2)
    a = a @ L1.T

    return a


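# Editor's note: a quick verification sketch for cov_returns, not part of the
# package source. The whitening loop leaves the series with an identity sample
# covariance, so after the final Cholesky step the sample covariance should
# match the target matrix up to floating-point error.
if __name__ == "__main__":
    _target = np.array([[0.04, 0.006], [0.006, 0.09]])
    _rets = cov_returns(_target, seed=0)  # shape (n + 10, n) = (12, 2)
    assert np.allclose(np.cov(_rets.T), _target)

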
def block_vec_pq(A, p, q):
    r"""
    Calculates the block vectorization operator as shown in :cite:`d-VanLoan1993`
    and :cite:`d-Ojeda2015`.

    Parameters
    ----------
    A : ndarray
        Matrix that will be block vectorized.
    p : int
        Order p of the block vectorization operator.
    q : int
        Order q of the block vectorization operator.

    Returns
    -------
    bvec_A : ndarray
        The block vectorized matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    if isinstance(A, pd.DataFrame):
        A_ = A.to_numpy()
    elif isinstance(A, np.ndarray):
        A_ = A.copy()
    else:
        raise ValueError("A must be a 2darray or DataFrame.")

    mp, nq = A_.shape
    if mp % p == 0 and nq % q == 0:
        m = int(mp / p)
        n = int(nq / q)
        bvec_A = np.empty((0, p * q))
        for j in range(n):
            Aj = np.empty((0, p * q))
            for i in range(m):
                Aij = (
                    A_[i * p : (i + 1) * p, j * q : (j + 1) * q]
                    .reshape(-1, 1, order="F")
                    .T
                )
                Aj = np.vstack([Aj, Aij])
            bvec_A = np.vstack([bvec_A, Aj])
    else:
        raise ValueError(
            "Dimensions p and q give non integer values for dimensions m and n."
        )

    return bvec_A


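# Editor's note: a made-up illustration of block_vec_pq, not part of the
# package source. Splitting a 4x4 matrix into 2x2 blocks yields one row per
# block, with each block flattened column-major (order="F").
if __name__ == "__main__":
    _A = np.arange(16).reshape(4, 4)
    _bvec = block_vec_pq(_A, 2, 2)
    # The top-left block [[0, 1], [4, 5]] becomes the row [0, 4, 1, 5].
    assert _bvec.shape == (4, 4)
    assert np.array_equal(_bvec[0], [0, 4, 1, 5])

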
###############################################################################
# Additional Codependence Functions
###############################################################################


def dcorr(X, Y):
    r"""
    Calculate the distance correlation between two variables :cite:`d-Szekely`.

    Parameters
    ----------
    X : 1d-array
        Returns series, must be of shape n_samples x 1.
    Y : 1d-array
        Returns series, must be of shape n_samples x 1.

    Returns
    -------
    value : float
        The distance correlation between variables X and Y.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)

    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]

    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]

    if Y.shape[0] != X.shape[0]:
        raise ValueError("Number of samples must match")

    value = cf.d_corr(X, Y)

    return value


def dcorr_matrix(X):
    r"""
    Calculate the distance correlation matrix of n variables.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.

    Returns
    -------
    corr : DataFrame
        The distance correlation matrix of shape n_features x n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    corr = cf.d_corr_matrix(X1)

    if flag:
        corr = pd.DataFrame(corr, index=cols, columns=cols)
    else:
        corr = pd.DataFrame(corr)

    return corr


def numBins(n_samples, corr=None):
    r"""
    Calculate the optimal number of bins for discretization of mutual
    information and variation of information.

    Parameters
    ----------
    n_samples : integer
        Number of samples.
    corr : float, optional
        Correlation coefficient of variables. The default value is None.

    Returns
    -------
    bins : int
        The optimal number of bins.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    # univariate case
    if corr is None:
        z = (
            8 + 324 * n_samples + 12 * (36 * n_samples + 729 * n_samples**2) ** 0.5
        ) ** (1 / 3)
        b = np.round(z / 6 + 2 / (3 * z) + 1 / 3)
    # bivariate case
    else:
        b = np.round(2**-0.5 * (1 + (1 + 24 * n_samples / (1 - corr**2)) ** 0.5) ** 0.5)

    bins = np.int32(b)

    return bins


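# Editor's note: a worked check of the bin formulas above, not part of the
# package source. For n_samples = 1000 the univariate rule gives
# z ~ 86.54, hence round(z/6 + 2/(3z) + 1/3) = 15 bins; the bivariate rule
# with corr = 0 gives round(2**-0.5 * (1 + 24001**0.5)**0.5) = 9 bins.
if __name__ == "__main__":
    assert numBins(1000) == 15
    assert numBins(1000, corr=0.0) == 9

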
def mutual_info_matrix(X, bins_info="KN", normalize=True):
    r"""
    Calculate the mutual information matrix of n variables.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    bins_info : int or str
        Number of bins used to calculate mutual information. The default
        value is 'KN'. Possible values are:

        - 'KN': Knuth's choice method. See more in `knuth_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html>`_.
        - 'FD': Freedman–Diaconis' choice method. See more in `freedman_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.freedman_bin_width.html>`_.
        - 'SC': Scott's choice method. See more in `scott_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.scott_bin_width.html>`_.
        - 'HGR': Hacine-Gharbi and Ravier's choice method.
        - int: integer value chosen by the user.

    normalize : bool
        Whether to normalize the mutual information. The default value is True.

    Returns
    -------
    corr : ndarray
        The mutual information matrix of shape n_features x n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    m = X1.shape[0]
    n = X1.shape[1]
    mat = np.zeros((n, n))
    indices = np.triu_indices(n)

    for i, j in zip(indices[0], indices[1]):
        if bins_info == "KN":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / knuth_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / knuth_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "FD":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / freedman_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / freedman_bin_width(
                    X1[:, j]
                )
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "SC":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / scott_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / scott_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "HGR":
            corr = np.corrcoef(X1[:, i], X1[:, j])[0, 1]
            if corr == 1:
                bins = numBins(m, None)
            else:
                bins = numBins(m, corr)
        elif isinstance(bins_info, np.int32) or isinstance(bins_info, int):
            bins = bins_info

        cXY = np.histogram2d(X1[:, i], X1[:, j], bins)[0]
        hX = st.entropy(np.histogram(X1[:, i], bins)[0])  # marginal
        hY = st.entropy(np.histogram(X1[:, j], bins)[0])  # marginal
        iXY = mutual_info_score(None, None, contingency=cXY)  # mutual information
        if normalize == True:
            iXY = iXY / np.min([hX, hY])  # normalized mutual information
        # hXY = hX + hY - iXY  # joint
        # hX_Y = hXY - hY  # conditional
        # hY_X = hXY - hX  # conditional

        mat[i, j] = iXY
        mat[j, i] = mat[i, j]

    mat = np.clip(np.round(mat, 8), a_min=0.0, a_max=np.inf)

    if flag:
        mat = pd.DataFrame(mat, index=cols, columns=cols)

    return mat


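# Editor's note: a small usage sketch for mutual_info_matrix, not part of the
# package source, on synthetic returns where asset "B" is a noisy copy of "A"
# and "C" is independent. A user-chosen bin count avoids the data-driven bin
# rules here.
if __name__ == "__main__":
    _rs = np.random.RandomState(0)
    _a = _rs.randn(500)
    _X = pd.DataFrame({"A": _a, "B": _a + 0.1 * _rs.randn(500), "C": _rs.randn(500)})
    _mi = mutual_info_matrix(_X, bins_info=10)
    assert _mi.loc["A", "B"] > _mi.loc["A", "C"]  # dependence shows up as higher MI

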
def var_info_matrix(X, bins_info="KN", normalize=True):
    r"""
    Calculate the variation of information matrix of n variables.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    bins_info : int or str
        Number of bins used to calculate variation of information. The default
        value is 'KN'. Possible values are:

        - 'KN': Knuth's choice method. See more in `knuth_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html>`_.
        - 'FD': Freedman–Diaconis' choice method. See more in `freedman_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.freedman_bin_width.html>`_.
        - 'SC': Scott's choice method. See more in `scott_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.scott_bin_width.html>`_.
        - 'HGR': Hacine-Gharbi and Ravier's choice method.
        - int: integer value chosen by the user.

    normalize : bool
        Whether to normalize the variation of information. The default value
        is True.

    Returns
    -------
    corr : ndarray
        The variation of information matrix of shape n_features x n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    m = X1.shape[0]
    n = X1.shape[1]
    mat = np.zeros((n, n))
    indices = np.triu_indices(n)

    for i, j in zip(indices[0], indices[1]):
        if bins_info == "KN":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / knuth_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / knuth_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "FD":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / freedman_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / freedman_bin_width(
                    X1[:, j]
                )
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "SC":
            k1 = (np.max(X1[:, i]) - np.min(X1[:, i])) / scott_bin_width(X1[:, i])
            bins = np.int32(np.round(k1))
            if i != j:
                k2 = (np.max(X1[:, j]) - np.min(X1[:, j])) / scott_bin_width(X1[:, j])
                bins = np.int32(np.round(np.maximum(k1, k2)))
        elif bins_info == "HGR":
            corr = np.corrcoef(X1[:, i], X1[:, j])[0, 1]
            if corr == 1:
                bins = numBins(m, None)
            else:
                bins = numBins(m, corr)
        elif isinstance(bins_info, np.int32) or isinstance(bins_info, int):
            bins = bins_info

        cXY = np.histogram2d(X1[:, i], X1[:, j], bins)[0]
        hX = st.entropy(np.histogram(X1[:, i], bins)[0])  # marginal
        hY = st.entropy(np.histogram(X1[:, j], bins)[0])  # marginal
        iXY = mutual_info_score(None, None, contingency=cXY)  # mutual information
        vXY = hX + hY - 2 * iXY  # variation of information
        if normalize == True:
            hXY = hX + hY - iXY  # joint
            vXY = vXY / hXY  # normalized variation of information

        mat[i, j] = vXY
        mat[j, i] = mat[i, j]

    mat = np.clip(np.round(mat, 8), a_min=0.0, a_max=np.inf)

    if flag:
        mat = pd.DataFrame(mat, index=cols, columns=cols)

    return mat


def ltdi_matrix(X, alpha=0.05):
    r"""
    Calculate the lower tail dependence index matrix using the empirical
    approach.

    Parameters
    ----------
    X : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    alpha : float, optional
        Significance level for lower tail dependence index.
        The default is 0.05.

    Returns
    -------
    corr : ndarray
        The lower tail dependence index matrix of shape n_features x
        n_features.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    flag = False
    if isinstance(X, pd.DataFrame):
        cols = X.columns.tolist()
        X1 = X.to_numpy()
        flag = True
    else:
        X1 = X.copy()

    m = X1.shape[0]
    n = X1.shape[1]
    k = np.int32(np.ceil(m * alpha))
    mat = np.ones((n, n))

    if k > 0:
        indices = np.triu_indices(n)

        for i, j in zip(indices[0], indices[1]):
            u = np.sort(X1[:, i])[k - 1]
            v = np.sort(X1[:, j])[k - 1]
            ltd = (
                np.sum(np.where(np.logical_and(X1[:, i] <= u, X1[:, j] <= v), 1, 0)) / k
            )

            mat[i, j] = ltd
            mat[j, i] = mat[i, j]

        for i in range(0, n):
            u = np.sort(X1[:, i])[k - 1]
            v = np.sort(X1[:, i])[k - 1]
            ltd = (
                np.sum(np.where(np.logical_and(X1[:, i] <= u, X1[:, i] <= v), 1, 0)) / k
            )

            mat[i, i] = ltd

        mat = np.clip(np.round(mat, 8), a_min=1.0e-8, a_max=1)

    if flag:
        mat = pd.DataFrame(mat, index=cols, columns=cols)
    else:
        mat = pd.DataFrame(mat)

    return mat


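# Editor's note: a usage sketch for ltdi_matrix, not part of the package
# source. With alpha = 0.05 and 500 synthetic observations, each pairwise
# index counts joint exceedances below the 5% quantiles, so every entry lies
# in [1e-8, 1] and the matrix is symmetric.
if __name__ == "__main__":
    _rs = np.random.RandomState(0)
    _X = pd.DataFrame(_rs.randn(500, 3), columns=["A", "B", "C"])
    _ltdi = ltdi_matrix(_X, alpha=0.05)
    assert _ltdi.shape == (3, 3)
    assert ((_ltdi.values >= 1e-8) & (_ltdi.values <= 1)).all()

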
def two_diff_gap_stat(dist, clustering, max_k=10):
    r"""
    Calculate the optimal number of clusters based on the two difference gap
    statistic :cite:`d-twogap`.

    Parameters
    ----------
    dist : DataFrame of shape (n_assets, n_assets)
        A distance measure based on the codependence matrix.
    clustering : ndarray
        The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
    max_k : int, optional
        Max number of clusters used by the two difference gap statistic
        to find the optimal number of clusters. The default is 10.

    Returns
    -------
    k : int
        The optimal number of clusters based on the two difference gap
        statistic.
    clustering_inds : 1darray
        The cluster index assigned to each asset.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    # Check if linkage matrix is monotonic
    if hr.is_monotonic(clustering):
        flag = True
    # cluster levels from 1 to N-1 clusters
    cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
    level_k = cluster_lvls.columns.tolist()
    cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
    cluster_lvls.columns = level_k
    # Fix for nonmonotonic linkage matrices
    if flag is False:
        for i in cluster_lvls.columns:
            unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
            cluster_lvls[i] = indices
        cluster_lvls = cluster_lvls.T.drop_duplicates().T
        level_k = cluster_lvls.columns.tolist()
    cluster_k = cluster_lvls.nunique(axis=0).tolist()
    W_list = []
    n = dist.shape[0]

    # get within-cluster dissimilarity for each k
    for k in cluster_k:
        if k == 1:
            W_list.append(-np.inf)
        elif k > min(max_k, np.sqrt(n)) + 2:
            break
        else:
            level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
            D_list = []  # within-cluster distance list

            for i in range(np.max(level.unique()) + 1):
                cluster = level.loc[level == i]
                # Based on correlation distance
                cluster_dist = dist.loc[cluster.index, cluster.index]  # get distance
                cluster_pdist = squareform(cluster_dist, checks=False)
                if cluster_pdist.shape[0] != 0:
                    D = np.nan_to_num(cluster_pdist.std())
                    D_list.append(D)  # append to list

            W_k = np.sum(D_list)
            W_list.append(W_k)

    W_list = pd.Series(W_list)
    gaps = W_list.shift(-2) + W_list - 2 * W_list.shift(-1)
    k_index = int(gaps.idxmax())
    k = cluster_k[k_index]
    node_k = level_k[k_index]

    if flag:
        clustering_inds = cluster_lvls[node_k].tolist()
    else:
        clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
        j = len(np.unique(clustering_inds))
        while k != j:
            j += 1
            clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
            k = len(np.unique(clustering_inds))
    unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
    clustering_inds = indices

    return k, clustering_inds


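# Editor's note: a usage sketch for two_diff_gap_stat, not part of the package
# source. A Pearson-based distance matrix and a Ward linkage are assumed
# inputs; hr is scipy.cluster.hierarchy imported at the top of this module.
if __name__ == "__main__":
    _rs = np.random.RandomState(0)
    _rets = pd.DataFrame(_rs.randn(300, 8), columns=list("ABCDEFGH"))
    _codep = _rets.corr()
    _dist = np.sqrt(np.clip((1 - _codep) / 2, a_min=0.0, a_max=1.0))
    _link = hr.linkage(squareform(_dist.values, checks=False), method="ward")
    _k, _inds = two_diff_gap_stat(_dist, _link, max_k=5)
    assert 1 <= _k <= 5 and len(_inds) == 8

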
def std_silhouette_score(dist, clustering, max_k=10):
    r"""
    Calculate the optimal number of clusters based on the standardized
    silhouette score index :cite:`d-Prado2`.

    Parameters
    ----------
    dist : DataFrame of shape (n_assets, n_assets)
        A distance measure based on the codependence matrix.
    clustering : ndarray
        The hierarchical clustering encoded as a linkage matrix, see `linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html?highlight=linkage#scipy.cluster.hierarchy.linkage>`_ for more details.
    max_k : int, optional
        Max number of clusters used by the standardized silhouette score
        to find the optimal number of clusters. The default is 10.

    Returns
    -------
    k : int
        The optimal number of clusters based on the standardized silhouette
        score.
    clustering_inds : 1darray
        The cluster index assigned to each asset.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """
    flag = False
    # Check if linkage matrix is monotonic
    if hr.is_monotonic(clustering):
        flag = True
    # cluster levels from 1 to N-1 clusters
    cluster_lvls = pd.DataFrame(hr.cut_tree(clustering), index=dist.columns)
    level_k = cluster_lvls.columns.tolist()
    cluster_lvls = cluster_lvls.iloc[:, ::-1]  # reverse order to start with 1 cluster
    cluster_lvls.columns = level_k
    # Fix for nonmonotonic linkage matrices
    if flag is False:
        for i in cluster_lvls.columns:
            unique_vals, indices = np.unique(cluster_lvls[i], return_inverse=True)
            cluster_lvls[i] = indices
        cluster_lvls = cluster_lvls.T.drop_duplicates().T
        level_k = cluster_lvls.columns.tolist()
    cluster_k = cluster_lvls.nunique(axis=0).tolist()
    scores_list = []
    n = dist.shape[0]

    # get within-cluster dissimilarity for each k
    for k in cluster_k:
        if k == 1:
            scores_list.append(-np.inf)
        elif k > min(max_k, np.sqrt(n)):
            break
        else:
            level = cluster_lvls[level_k[cluster_k.index(k)]]  # get k clusters
            b = silhouette_samples(dist, level)
            scores_list.append(b.mean() / b.std())

    scores_list = pd.Series(scores_list)
    k_index = int(scores_list.idxmax())
    k = cluster_k[k_index]
    node_k = level_k[k_index]
    if flag:
        clustering_inds = cluster_lvls[node_k].tolist()
    else:
        clustering_inds = hr.fcluster(clustering, k, criterion="maxclust")
        j = len(np.unique(clustering_inds))
        while k != j:
            j += 1
            clustering_inds = hr.fcluster(clustering, j, criterion="maxclust")
            k = len(np.unique(clustering_inds))
    unique_vals, indices = np.unique(clustering_inds, return_inverse=True)
    clustering_inds = indices

    return k, clustering_inds


def codep_dist(
    returns,
    custom_cov=None,
    codependence="pearson",
    bins_info="KN",
    alpha_tail=0.05,
    gs_threshold=0.5,
):
    r"""
    Calculate the codependence and distance matrix according to the selected
    method.

    Parameters
    ----------
    returns : DataFrame of shape (n_samples, n_assets)
        Assets returns DataFrame, where n_samples is the number of
        observations and n_assets is the number of assets.
    custom_cov : DataFrame or None, optional
        Custom covariance matrix, used when codependence parameter has value
        'custom_cov'. The default is None.
    codependence : str, can be {'pearson', 'spearman', 'kendall', 'gerber1', 'gerber2', 'abs_pearson', 'abs_spearman', 'abs_kendall', 'distance', 'mutual_info', 'tail' or 'custom_cov'}
        The codependence or similarity matrix used to build the distance
        metric and clusters. The default is 'pearson'. Possible values are:

        - 'pearson': pearson correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{pearson}_{i,j})}`.
        - 'spearman': spearman correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{spearman}_{i,j})}`.
        - 'kendall': kendall correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{kendall}_{i,j})}`.
        - 'gerber1': Gerber statistic 1 correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{gerber1}_{i,j})}`.
        - 'gerber2': Gerber statistic 2 correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{gerber2}_{i,j})}`.
        - 'abs_pearson': absolute value pearson correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-|\rho_{i,j}|)}`.
        - 'abs_spearman': absolute value spearman correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-|\rho_{i,j}|)}`.
        - 'abs_kendall': absolute value kendall correlation matrix. Distance formula: :math:`D_{i,j} = \sqrt{(1-|\rho^{kendall}_{i,j}|)}`.
        - 'distance': distance correlation matrix. Distance formula :math:`D_{i,j} = \sqrt{(1-\rho^{distance}_{i,j})}`.
        - 'mutual_info': mutual information matrix. Distance used is the variation of information matrix.
        - 'tail': lower tail dependence index matrix. Dissimilarity formula :math:`D_{i,j} = -\log{\lambda_{i,j}}`.
        - 'custom_cov': use custom correlation matrix based on the custom_cov parameter. Distance formula: :math:`D_{i,j} = \sqrt{0.5(1-\rho^{pearson}_{i,j})}`.

    bins_info : int or str
        Number of bins used to calculate variation of information. The default
        value is 'KN'. Possible values are:

        - 'KN': Knuth's choice method. See more in `knuth_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.knuth_bin_width.html>`_.
        - 'FD': Freedman–Diaconis' choice method. See more in `freedman_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.freedman_bin_width.html>`_.
        - 'SC': Scott's choice method. See more in `scott_bin_width <https://docs.astropy.org/en/stable/api/astropy.stats.scott_bin_width.html>`_.
        - 'HGR': Hacine-Gharbi and Ravier's choice method.
        - int: integer value chosen by the user.

    alpha_tail : float, optional
        Significance level for lower tail dependence index. The default is 0.05.
    gs_threshold : float, optional
        Gerber statistic threshold. The default is 0.5.

    Returns
    -------
    codep : DataFrame
        Codependence matrix.
    dist : DataFrame
        Distance matrix.

    Raises
    ------
    ValueError
        When the value cannot be calculated.

    """
    if codependence in {"pearson", "spearman", "kendall"}:
        codep = returns.corr(method=codependence)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    elif codependence == "gerber1":
        codep = gs.gerber_cov_stat1(returns, threshold=gs_threshold)
        codep = cov2corr(codep)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    elif codependence == "gerber2":
        codep = gs.gerber_cov_stat2(returns, threshold=gs_threshold)
        codep = cov2corr(codep)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))
    elif codependence in {"abs_pearson", "abs_spearman", "abs_kendall"}:
        codep = np.abs(returns.corr(method=codependence[4:]))
        dist = np.sqrt(np.clip((1 - codep), a_min=0.0, a_max=1.0))
    elif codependence in {"distance"}:
        codep = dcorr_matrix(returns).astype(float)
        dist = np.sqrt(np.clip((1 - codep), a_min=0.0, a_max=1.0))
    elif codependence in {"mutual_info"}:
        codep = mutual_info_matrix(returns, bins_info).astype(float)
        dist = var_info_matrix(returns, bins_info).astype(float)
    elif codependence in {"tail"}:
        codep = ltdi_matrix(returns, alpha_tail).astype(float)
        dist = -np.log(codep)
    elif codependence in {"custom_cov"}:
        codep = cov2corr(custom_cov).astype(float)
        dist = np.sqrt(np.clip((1 - codep) / 2, a_min=0.0, a_max=1.0))

    return codep, dist


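# Editor's note: a usage sketch for codep_dist, not part of the package
# source. With the default Pearson codependence, dist is the standard
# sqrt((1 - rho) / 2) transform, so perfectly correlated assets are at
# distance 0 and perfectly anti-correlated assets at distance 1.
if __name__ == "__main__":
    _rs = np.random.RandomState(0)
    _rets = pd.DataFrame(_rs.randn(250, 4), columns=list("ABCD"))
    _codep, _dist = codep_dist(_rets, codependence="pearson")
    assert np.allclose(_dist, np.sqrt((1 - _codep) / 2))

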
###############################################################################
# Denoising Functions Based on Lopez de Prado Book
###############################################################################


def fitKDE(obs, bWidth=0.01, kernel="gaussian", x=None):
    """
    Fit a kernel to a series of observations obs and derive the probability
    of the observations. x is the array of values on which the fitted KDE
    will be evaluated; the result is the empirical Probability Density
    Function (PDF). For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    obs : ndarray
        Observations to fit. Commonly it is the diagonal of eigenvalues.
    bWidth : float, optional
        The bandwidth of the kernel. The default value is 0.01.
    kernel : string, optional
        The kernel to use. The default value is 'gaussian'. For more information see: `kernel-density <https://scikit-learn.org/stable/modules/density.html#kernel-density>`_.
        Possible values are:

        - 'gaussian': gaussian kernel.
        - 'tophat': tophat kernel.
        - 'epanechnikov': epanechnikov kernel.
        - 'exponential': exponential kernel.
        - 'linear': linear kernel.
        - 'cosine': cosine kernel.

    x : ndarray, optional
        The array of values on which the fitted KDE will be evaluated.

    Returns
    -------
    pdf : pd.Series
        Empirical PDF.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    if len(obs.shape) == 1:
        obs = obs.reshape(-1, 1)

    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)

    if x is None:
        x = np.unique(obs).reshape(-1, 1)

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())

    return pdf


def mpPDF(var, q, pts):
    r"""
    Creates a Marchenko-Pastur Probability Density Function (PDF). For more
    information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    var : float
        Variance.
    q : float
        T/N where T is the number of rows and N the number of columns.
    pts : int
        Number of points used to construct the PDF.

    Returns
    -------
    pdf : pd.Series
        Marchenko-Pastur PDF.

    Raises
    ------
    ValueError when the value cannot be calculated.

    """

    if isinstance(var, np.ndarray):
        if var.shape == (1,):
            var = var[0]

    eMin, eMax = var * (1 - (1.0 / q) ** 0.5) ** 2, var * (1 + (1.0 / q) ** 0.5) ** 2
    eVal = np.linspace(eMin, eMax, pts)
    pdf = q / (2 * np.pi * var * eVal) * ((eMax - eVal) * (eVal - eMin)) ** 0.5
    pdf = pd.Series(pdf, index=eVal)

    return pdf


def errPDFs(var, eVal, q, bWidth=0.01, pts=1000):
    r"""
    Fit error of the empirical PDF against the Marchenko-Pastur PDF. For more
    information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    var : float
        Variance.
    eVal : ndarray
        Eigenvalues to fit.
    q : float
        T/N where T is the number of rows and N the number of columns.
    bWidth : float, optional
        The bandwidth of the kernel. The default value is 0.01.
    pts : int
        Number of points used to construct the PDF. The default value is 1000.

    Returns
    -------
    sse : float
        Sum of squared errors.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    # Fit error
    pdf0 = mpPDF(var, q, pts)  # theoretical pdf
    pdf1 = fitKDE(eVal, bWidth, x=pdf0.index.values)  # empirical pdf
    sse = np.sum((pdf1 - pdf0) ** 2)

    return sse


def findMaxEval(eVal, q, bWidth=0.01):
    r"""
    Find the maximum random eigenvalue by fitting the Marchenko-Pastur
    distribution; every eigenvalue larger than this is a signal eigenvalue.
    For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    eVal : ndarray
        Eigenvalues to fit.
    q : float
        T/N where T is the number of rows and N the number of columns.
    bWidth : float, optional
        The bandwidth of the kernel.

    Returns
    -------
    (eMax, var) : tuple (float, float)
        The first value is the maximum random eigenvalue and the second is
        the variance attributed to noise; (1 - var) is one way to measure
        the signal-to-noise ratio.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    out = minimize(
        lambda *x: errPDFs(*x), 0.5, args=(eVal, q, bWidth), bounds=((1e-5, 1 - 1e-5),)
    )

    if out["success"]:
        var = out["x"][0]
    else:
        var = 1

    eMax = var * (1 + (1.0 / q) ** 0.5) ** 2

    return eMax, var


def getPCA(matrix):
    r"""
    Gets the eigenvalues and eigenvectors of a Hermitian matrix.
    For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    matrix : ndarray or pd.DataFrame
        Correlation matrix.

    Returns
    -------
    (eVal, eVec) : tuple (ndarray, ndarray)
        The first value is the diagonal matrix of eigenvalues (sorted in
        descending order) and the second is the matrix of eigenvectors of
        the correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    # Get eVal,eVec from a Hermitian matrix
    eVal, eVec = np.linalg.eigh(matrix)
    indices = eVal.argsort()[::-1]  # arguments for sorting eVal desc
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)

    return eVal, eVec


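# Editor's note: a sketch of the Marchenko-Pastur cutoff step, not part of
# the package source. For T = 1000 i.i.d. observations of N = 50 assets,
# q = 20 and nearly all correlation eigenvalues should fall below eMax, so
# very few "signal" factors survive on pure noise.
if __name__ == "__main__":
    _rs = np.random.RandomState(0)
    _rets = _rs.randn(1000, 50)
    _corr = np.corrcoef(_rets.T)
    _eVal, _eVec = getPCA(_corr)
    _eMax, _var = findMaxEval(np.diag(_eVal), q=1000 / 50, bWidth=0.01)
    _nFacts = _eVal.shape[0] - np.diag(_eVal)[::-1].searchsorted(_eMax)
    print("eMax:", round(_eMax, 4), "factors kept:", _nFacts)

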
def denoisedCorr(eVal, eVec, nFacts, kind="fixed"):
    r"""
    Remove noise from the correlation matrix by fixing its random eigenvalues
    or with the spectral method. For more information see chapter 2 of
    :cite:`d-MLforAM`.

    Parameters
    ----------
    eVal : 1darray
        Eigenvalues.
    eVec : ndarray
        Eigenvectors.
    nFacts : float
        The number of factors.
    kind : str, optional
        The denoise method. The default value is 'fixed'. Possible values are:

        - 'fixed': replaces the eigenvalues below the maximum Marchenko-Pastur limit with their average.
        - 'spectral': sets the eigenvalues below the maximum Marchenko-Pastur limit to zero.

    Returns
    -------
    corr : ndarray
        Denoised correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    eVal_ = np.diag(eVal).copy()

    if kind == "fixed":
        eVal_[nFacts:] = eVal_[nFacts:].sum() / float(eVal_.shape[0] - nFacts)
    elif kind == "spectral":
        eVal_[nFacts:] = 0

    eVal_ = np.diag(eVal_)
    corr = np.dot(eVec, eVal_).dot(eVec.T)
    corr = cov2corr(corr)

    return corr


def shrinkCorr(eVal, eVec, nFacts, alpha=0):
    r"""
    Remove noise from correlation using target shrinkage. For more information
    see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    eVal : 1darray
        Eigenvalues.
    eVec : ndarray
        Eigenvectors.
    nFacts : float
        The number of factors.
    alpha : float, optional
        Shrinkage factor.

    Returns
    -------
    corr : ndarray
        Denoised correlation matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    eVal_L = eVal[:nFacts, :nFacts]
    eVec_L = eVec[:, :nFacts]
    eVal_R = eVal[nFacts:, nFacts:]
    eVec_R = eVec[:, nFacts:]
    corr0 = np.dot(eVec_L, eVal_L).dot(eVec_L.T)
    corr1 = np.dot(eVec_R, eVal_R).dot(eVec_R.T)
    corr2 = corr0 + alpha * corr1 + (1 - alpha) * np.diag(np.diag(corr1))

    return corr2


def denoiseCov(cov, q, kind="fixed", bWidth=0.01, detone=False, mkt_comp=1, alpha=0.1):
    r"""
    Remove noise from cov by fixing the random eigenvalues of its correlation
    matrix. For more information see chapter 2 of :cite:`d-MLforAM`.

    Parameters
    ----------
    cov : DataFrame of shape (n_assets, n_assets)
        Covariance matrix, where n_assets is the number of assets.
    q : float
        T/N where T is the number of rows and N the number of columns.
    bWidth : float
        The bandwidth of the kernel.
    kind : str, optional
        The denoise method. The default value is 'fixed'. Possible values are:

        - 'fixed': replaces the eigenvalues below the maximum Marchenko-Pastur limit with their average.
        - 'spectral': sets the eigenvalues below the maximum Marchenko-Pastur limit to zero.
        - 'shrink': uses the target shrinkage method.

    detone : bool, optional
        Whether to remove the first mkt_comp components of the correlation
        matrix. The detoned correlation matrix is singular, so it cannot be
        inverted.
    mkt_comp : int, optional
        Number of first components that will be removed using the detone
        method.
    alpha : float, optional
        Shrinkage factor.

    Returns
    -------
    cov_ : ndarray or pd.DataFrame
        Denoised covariance matrix.

    Raises
    ------
    ValueError when the value cannot be calculated.
    """

    flag = False
    if isinstance(cov, pd.DataFrame):
        cols = cov.columns.tolist()
        flag = True

    corr = cov2corr(cov)
    std = np.diag(cov) ** 0.5
    eVal, eVec = getPCA(corr)
    eMax, var = findMaxEval(np.diag(eVal), q, bWidth)
    nFacts = eVal.shape[0] - np.diag(eVal)[::-1].searchsorted(eMax)

    if kind in ["fixed", "spectral"]:
        corr = denoisedCorr(eVal, eVec, nFacts, kind=kind)
    elif kind == "shrink":
        corr = shrinkCorr(eVal, eVec, nFacts, alpha=alpha)

    if detone == True:
        eVal_ = eVal[:mkt_comp, :mkt_comp]
        eVec_ = eVec[:, :mkt_comp]
        corr_ = np.dot(eVec_, eVal_).dot(eVec_.T)
        corr = corr - corr_

    cov_ = corr2cov(corr, std)

    if flag:
        cov_ = pd.DataFrame(cov_, index=cols, columns=cols)

    return cov_


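# Editor's note: an end-to-end usage sketch for denoiseCov, not part of the
# package source. q is T/N for the sample used to estimate the covariance;
# because the denoised correlation has unit diagonal, the denoised matrix
# keeps the original variances on its diagonal.
if __name__ == "__main__":
    _rs = np.random.RandomState(0)
    _T, _N = 1000, 20
    _rets = pd.DataFrame(_rs.randn(_T, _N))
    _cov = _rets.cov()
    _cov_dn = denoiseCov(_cov, q=_T / _N, kind="fixed", bWidth=0.01)
    assert np.allclose(np.diag(_cov_dn), np.diag(_cov))  # variances preserved

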
###############################################################################
# Other Additional Functions
###############################################################################


def round_values(data, decimals=4, wider=False):
    r"""
    This function helps us round values toward zero or away from zero.

    Parameters
    ----------
    data : np.ndarray, pd.Series or pd.DataFrame
        Data that are going to be rounded.
    decimals : integer
        Number of decimals to round.
    wider : bool
        False to round toward zero, True to round away from zero.

    Returns
    -------
    value : np.ndarray, pd.Series or pd.DataFrame
        Data rounded using the selected method.

    Raises
    ------
    ValueError
        When the value cannot be calculated.

    """

    if wider == True:
        value = np.where(
            data >= 0,
            np.ceil(data * 10**decimals) / 10**decimals,
            np.floor(data * 10**decimals) / 10**decimals,
        )
    elif wider == False:
        value = np.where(
            data >= 0,
            np.floor(data * 10**decimals) / 10**decimals,
            np.ceil(data * 10**decimals) / 10**decimals,
        )

    if isinstance(data, pd.DataFrame):
        value = pd.DataFrame(value, columns=data.columns, index=data.index)
    if isinstance(data, pd.Series):
        value = pd.Series(value, index=data.index)

    return value


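# Editor's note: a worked example for round_values, not part of the package
# source. With decimals=2, rounding toward zero truncates magnitudes
# (1.239 -> 1.23, -1.239 -> -1.23) while wider=True rounds away from zero
# (1.231 -> 1.24, -1.231 -> -1.24).
if __name__ == "__main__":
    assert np.allclose(
        round_values(np.array([1.239, -1.239]), decimals=2, wider=False),
        [1.23, -1.23],
    )
    assert np.allclose(
        round_values(np.array([1.231, -1.231]), decimals=2, wider=True),
        [1.24, -1.24],
    )

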
def weights_discretizetion(
    weights, prices, capital=1000000, w_decimal=6, ascending=False
):
    r"""
    This function helps us find the number of shares that must be bought or
    sold to achieve portfolio weights according to the prices of assets and
    the invested capital.

    Parameters
    ----------
    weights : pd.Series or pd.DataFrame
        Vector of weights of size n_assets x 1.
    prices : pd.Series or pd.DataFrame
        Vector of prices of size n_assets x 1.
    capital : float, optional
        Capital invested. The default value is 1000000.
    w_decimal : int, optional
        Number of decimals used to round the portfolio weights. The default
        value is 6.
    ascending : bool, optional
        If True assigns excess capital to assets with lower weights, else,
        to assets with higher weights. The default value is False.

    Returns
    -------
    n_shares : pd.DataFrame
        Number of shares that must be bought or sold to achieve portfolio
        weights.

    Raises
    ------
    ValueError
        When the value cannot be calculated.

    """

    if isinstance(weights, pd.Series):
        w = weights.to_frame().copy()
    elif isinstance(weights, pd.DataFrame):
        if weights.shape[0] == 1:
            w = weights.T.copy()
        elif weights.shape[1] == 1:
            w = weights.copy()
        else:
            raise ValueError("weights must have size n_assets x 1")
    else:
        raise ValueError("weights must be DataFrame")

    if isinstance(prices, pd.Series):
        p = prices.to_frame().copy()
    elif isinstance(prices, pd.DataFrame):
        if prices.shape[0] == 1:
            p = prices.T.copy()
        elif prices.shape[1] == 1:
            p = prices.copy()
        else:
            raise ValueError("prices must have size n_assets x 1")
    else:
        raise ValueError("prices must be DataFrame")

    w.columns = [0]
    p.columns = [0]

    total = w.sum().item()
    w = round_values(w, decimals=w_decimal, wider=False)
    w.loc[w.idxmin().tolist()] = w.loc[w.idxmin().tolist()] + (total - w.sum()).item()

    n_shares = round_values(capital * w / p, decimals=0, wider=False)

    excedent = [capital + 1, capital]
    i = 1
    while excedent[i] < excedent[i - 1]:
        new_capital = (n_shares.T @ p).iloc[0, 0]
        excedent.append(capital - new_capital)
        new_shares = round_values(excedent[-1] * w / p, 0)
        n_shares += new_shares
        i += 1

    n_shares_1 = capital * w / p

    excedent = capital - (n_shares.T @ p).iloc[0, 0]

    d_shares = np.abs(n_shares_1) - np.abs(n_shares)
    d_shares = np.where(d_shares > 0, n_shares_1 - n_shares, 0)
    d_shares = round_values(d_shares, decimals=0, wider=True)
    d_shares = pd.DataFrame(d_shares, columns=w.columns, index=w.index)

    order = w.sort_values(by=0, ascending=ascending).index.tolist()
    d_list = d_shares[d_shares[0] == 1].index.tolist()

    for i in order:
        if i in d_list:
            new_shares = round_values(excedent / p.loc[i, 0], 0).item()
            if new_shares > 0:
                n_shares.loc[i] += new_shares
                excedent = capital - (n_shares.T @ p).iloc[0, 0]

    return n_shares


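# Editor's note: a usage sketch for weights_discretizetion, not part of the
# package source (the function name's spelling follows the library). The
# weights and prices below are made up; the resulting share counts should
# cost at most the invested capital.
if __name__ == "__main__":
    _w = pd.Series([0.5, 0.3, 0.2], index=["A", "B", "C"])
    _p = pd.Series([125.3, 47.8, 312.1], index=["A", "B", "C"])
    _shares = weights_discretizetion(_w, _p, capital=100000)
    _cost = (_shares[0] * _p).sum()
    assert _cost <= 100000

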
def color_list(k):
    r"""
    This function creates a list of colors.

    Parameters
    ----------
    k : int
        Number of colors.

    Returns
    -------
    colors : list
        A list of colors.
    """

    colors = []

    if k <= 10:
        for i in range(10):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab10").colors[i]))
    elif k <= 20:
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20").colors[i]))
    elif k <= 40:
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20").colors[i]))
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20b").colors[i]))
    else:
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20").colors[i]))
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20b").colors[i]))
        for i in range(20):
            colors.append(mpl.colors.rgb2hex(plt.get_cmap("tab20c").colors[i]))
        if k / 60 > 1:
            colors = colors * int(np.ceil(k / 60))

    return colors