skfolio 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. skfolio/__init__.py +29 -0
  2. skfolio/cluster/__init__.py +8 -0
  3. skfolio/cluster/_hierarchical.py +387 -0
  4. skfolio/datasets/__init__.py +20 -0
  5. skfolio/datasets/_base.py +389 -0
  6. skfolio/datasets/data/__init__.py +0 -0
  7. skfolio/datasets/data/factors_dataset.csv.gz +0 -0
  8. skfolio/datasets/data/sp500_dataset.csv.gz +0 -0
  9. skfolio/datasets/data/sp500_index.csv.gz +0 -0
  10. skfolio/distance/__init__.py +26 -0
  11. skfolio/distance/_base.py +55 -0
  12. skfolio/distance/_distance.py +574 -0
  13. skfolio/exceptions.py +30 -0
  14. skfolio/measures/__init__.py +76 -0
  15. skfolio/measures/_enums.py +355 -0
  16. skfolio/measures/_measures.py +607 -0
  17. skfolio/metrics/__init__.py +3 -0
  18. skfolio/metrics/_scorer.py +121 -0
  19. skfolio/model_selection/__init__.py +18 -0
  20. skfolio/model_selection/_combinatorial.py +407 -0
  21. skfolio/model_selection/_validation.py +194 -0
  22. skfolio/model_selection/_walk_forward.py +221 -0
  23. skfolio/moments/__init__.py +41 -0
  24. skfolio/moments/covariance/__init__.py +29 -0
  25. skfolio/moments/covariance/_base.py +101 -0
  26. skfolio/moments/covariance/_covariance.py +1108 -0
  27. skfolio/moments/expected_returns/__init__.py +21 -0
  28. skfolio/moments/expected_returns/_base.py +31 -0
  29. skfolio/moments/expected_returns/_expected_returns.py +415 -0
  30. skfolio/optimization/__init__.py +36 -0
  31. skfolio/optimization/_base.py +147 -0
  32. skfolio/optimization/cluster/__init__.py +13 -0
  33. skfolio/optimization/cluster/_nco.py +348 -0
  34. skfolio/optimization/cluster/hierarchical/__init__.py +13 -0
  35. skfolio/optimization/cluster/hierarchical/_base.py +440 -0
  36. skfolio/optimization/cluster/hierarchical/_herc.py +406 -0
  37. skfolio/optimization/cluster/hierarchical/_hrp.py +368 -0
  38. skfolio/optimization/convex/__init__.py +16 -0
  39. skfolio/optimization/convex/_base.py +1944 -0
  40. skfolio/optimization/convex/_distributionally_robust.py +392 -0
  41. skfolio/optimization/convex/_maximum_diversification.py +417 -0
  42. skfolio/optimization/convex/_mean_risk.py +974 -0
  43. skfolio/optimization/convex/_risk_budgeting.py +560 -0
  44. skfolio/optimization/ensemble/__init__.py +6 -0
  45. skfolio/optimization/ensemble/_base.py +87 -0
  46. skfolio/optimization/ensemble/_stacking.py +326 -0
  47. skfolio/optimization/naive/__init__.py +3 -0
  48. skfolio/optimization/naive/_naive.py +173 -0
  49. skfolio/population/__init__.py +3 -0
  50. skfolio/population/_population.py +883 -0
  51. skfolio/portfolio/__init__.py +13 -0
  52. skfolio/portfolio/_base.py +1096 -0
  53. skfolio/portfolio/_multi_period_portfolio.py +610 -0
  54. skfolio/portfolio/_portfolio.py +842 -0
  55. skfolio/pre_selection/__init__.py +7 -0
  56. skfolio/pre_selection/_pre_selection.py +342 -0
  57. skfolio/preprocessing/__init__.py +3 -0
  58. skfolio/preprocessing/_returns.py +114 -0
  59. skfolio/prior/__init__.py +18 -0
  60. skfolio/prior/_base.py +63 -0
  61. skfolio/prior/_black_litterman.py +238 -0
  62. skfolio/prior/_empirical.py +163 -0
  63. skfolio/prior/_factor_model.py +268 -0
  64. skfolio/typing.py +50 -0
  65. skfolio/uncertainty_set/__init__.py +23 -0
  66. skfolio/uncertainty_set/_base.py +108 -0
  67. skfolio/uncertainty_set/_bootstrap.py +281 -0
  68. skfolio/uncertainty_set/_empirical.py +237 -0
  69. skfolio/utils/__init__.py +0 -0
  70. skfolio/utils/bootstrap.py +115 -0
  71. skfolio/utils/equations.py +350 -0
  72. skfolio/utils/sorting.py +117 -0
  73. skfolio/utils/stats.py +466 -0
  74. skfolio/utils/tools.py +567 -0
  75. skfolio-0.0.1.dist-info/LICENSE +29 -0
  76. skfolio-0.0.1.dist-info/METADATA +568 -0
  77. skfolio-0.0.1.dist-info/RECORD +79 -0
  78. skfolio-0.0.1.dist-info/WHEEL +5 -0
  79. skfolio-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,389 @@
1
+ """Datasets module."""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+
6
+ import gzip
7
+ import os
8
+ import shutil
9
+ import urllib.request as ur
10
+ from importlib import resources
11
+ from pathlib import Path
12
+
13
+ import joblib
14
+ import pandas as pd
15
+
16
+ DATA_MODULE = "skfolio.datasets.data"
17
+
18
+
19
+ def get_data_home(data_home: str | Path | None = None) -> str:
20
+ """Return the path of the skfolio data directory.
21
+
22
+ This folder is used by some large dataset loaders to avoid downloading the
23
+ data several times.
24
+
25
+ By default, the data directory is set to a folder named 'skfolio_data' in the
26
+ user home folder.
27
+
28
+ Alternatively, it can be set by the 'SKFOLIO_DATA' environment
29
+ variable or programmatically by giving an explicit folder path. The '~'
30
+ symbol is expanded to the user home folder.
31
+
32
+ If the folder does not already exist, it is automatically created.
33
+
34
+ Parameters
35
+ ----------
36
+ data_home : str, optional
37
+ The path to skfolio data directory. If `None`, the default path
38
+ is `~/skfolio_data`.
39
+
40
+ Returns
41
+ -------
42
+ data_home: str or path-like, optional
43
+ The path to skfolio data directory.
44
+ """
45
+ if data_home is None:
46
+ data_home = os.environ.get("SKFOLIO_DATA", os.path.join("~", "skfolio_data"))
47
+ data_home = os.path.expanduser(data_home)
48
+ os.makedirs(data_home, exist_ok=True)
49
+ return data_home
50
+
51
+
52
def clear_data_home(data_home: str | Path | None = None) -> None:
    """Remove the data home cache folder together with everything it contains.

    Parameters
    ----------
    data_home : str or path-like, optional
        The path to the skfolio data directory. If `None`, the location is
        resolved by `get_data_home` (by default `~/skfolio_data`).
    """
    # `get_data_home` resolves the location (creating the folder if it is
    # missing), so the removal below always has an existing target.
    shutil.rmtree(get_data_home(data_home))
63
+
64
+
65
def load_gzip_compressed_csv_data(
    data_filename: str,
    data_module: str = DATA_MODULE,
    encoding: str = "utf-8",
    datetime_index: bool = True,
) -> pd.DataFrame:
    """Load a gzip-compressed csv file shipped as a package resource.

    1) Locate and open the resource in binary mode with
       `importlib.resources.files(data_module).joinpath(data_filename)`
    2) Decompress it as text with `gzip.open`
    3) Parse the decompressed data with `pd.read_csv`

    Parameters
    ----------
    data_filename : str
        Name of the gzip-compressed csv file (`'*.csv.gz'`) to be loaded from
        `data_module/data_filename`. For example `'sp500_dataset.csv.gz'`.

    data_module : str or module, default='skfolio.datasets.data'
        Module where the data lives. The default is `'skfolio.datasets.data'`.

    encoding : str, default="utf-8"
        Name of the encoding that the gzip-decompressed file will be
        decoded with. The default is 'utf-8'.

    datetime_index : bool, default=True
        If this is set to True, the DataFrame index is converted to datetime
        with format="%Y-%m-%d".
        The default is `True`.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        DataFrame with each row representing one observation and each column
        representing the asset price of a given observation.
    """
    path = resources.files(data_module).joinpath(data_filename)
    with path.open("rb") as raw_file:
        # Keep the decompressing wrapper in its own `with` so it is closed
        # deterministically; closing it does not close `raw_file`, which the
        # outer `with` takes care of.
        with gzip.open(raw_file, mode="rt", encoding=encoding) as text_file:
            df = pd.read_csv(text_file, sep=",", index_col=0)
    if datetime_index:
        df.index = pd.to_datetime(df.index, format="%Y-%m-%d")
    return df
108
+
109
+
110
def download_dataset(
    data_filename: str,
    data_home: str | Path | None = None,
    download_if_missing: bool = True,
) -> pd.DataFrame:
    """Download and save locally a dataset from the remote GitHub dataset folder.

    The dataset is cached in `data_home` as `<data_filename>.pkz`. When that
    cache file already exists it is loaded and returned without any download.

    Parameters
    ----------
    data_filename : str
        Base name of the dataset, without extension. The gzip-compressed csv
        file `<data_filename>.csv.gz` is fetched from the remote GitHub
        dataset folder.

    data_home : str or path-like, optional
        Specify another download and cache folder for the datasets. By default,
        all skfolio data is stored in `~/skfolio_data` sub-folders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.
        The default is `True`.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        DataFrame with each row representing one observation and each column
        representing the asset price of a given observation.

    Raises
    ------
    OSError
        If the dataset is not cached locally and `download_if_missing` is
        False.
    """
    url = (
        "https://github.com/HugoDelatte/portfolio-optimization/raw/main/datasets/"
        f"{data_filename}.csv.gz"
    )

    data_home = get_data_home(data_home=data_home)
    filepath = os.path.join(data_home, f"{data_filename}.pkz")

    if os.path.exists(filepath):
        # NOTE: `joblib.load` unpickles the cache file, so the cache folder
        # must only contain files written by this function.
        return joblib.load(filepath)

    if not download_if_missing:
        raise OSError("Data not found and `download_if_missing` is False")

    archive_path = os.path.join(data_home, os.path.basename(url))
    try:
        ur.urlretrieve(url, archive_path)
        # Read the archive straight from the filesystem: the package-resource
        # loader resolves names relative to a package, so a relative
        # `data_home` would be looked up in the wrong place.
        df = pd.read_csv(archive_path, sep=",", index_col=0, compression="gzip")
        df.index = pd.to_datetime(df.index, format="%Y-%m-%d")
    finally:
        # Never leave a partial or unparsable archive behind in the cache.
        if os.path.exists(archive_path):
            os.remove(archive_path)
    joblib.dump(df, filepath, compress=6)
    return df
158
+
159
+
160
def load_sp500_dataset() -> pd.DataFrame:
    """Load the prices of 20 assets from the S&P 500 Index composition.

    The dataset holds the daily prices of 20 assets from the S&P 500
    composition, from 1990-01-02 up to 2022-12-28. It is read from the
    packaged file `sp500_dataset.csv.gz`.

    The data comes from the Yahoo public API.
    The price is the adjusted close, i.e. the closing price after adjustments
    for all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     8313
    Assets           20
    ==============   ==================

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_sp500_dataset
    >>> prices = load_sp500_dataset()
    >>> prices.head()
                    AAPL     AMD       BAC  ...       UNH       WMT      XOM
    1990-01-02  0.332589  4.1250  11.65625  ...  0.382813  5.890625  12.5000
    1990-01-03  0.334821  4.0000  11.75000  ...  0.375000  5.890625  12.3750
    1990-01-04  0.335938  3.9375  11.50000  ...  0.371094  5.859375  12.2500
    1990-01-05  0.337054  3.8125  11.25000  ...  0.355469  5.796875  12.1875
    1990-01-08  0.339286  3.8125  11.31250  ...  0.347656  5.875000  12.3750
    """
    return load_gzip_compressed_csv_data("sp500_dataset.csv.gz")
197
+
198
+
199
def load_sp500_index() -> pd.DataFrame:
    """Load the prices of the S&P 500 Index.

    The dataset holds the daily prices of the S&P 500 Index, from 1990-01-02
    up to 2022-12-28. It is read from the packaged file `sp500_index.csv.gz`.

    The data comes from the Yahoo public API.
    The price is the adjusted close, i.e. the closing price after adjustments
    for all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     8313
    Assets           1
    ==============   ==================

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_sp500_index
    >>> prices = load_sp500_index()
    >>> prices.head()
                 SP500
    Date
    1990-01-02  359.69
    1990-01-03  358.76
    1990-01-04  355.67
    1990-01-05  352.20
    1990-01-08  353.79
    """
    return load_gzip_compressed_csv_data("sp500_index.csv.gz")
237
+
238
+
239
def load_factors_dataset() -> pd.DataFrame:
    """Load the prices of 5 factor ETFs.

    The dataset holds the daily prices of 5 ETFs representing common factors,
    from 2014-01-02 up to 2022-12-28. It is read from the packaged file
    `factors_dataset.csv.gz`.

    The factors are:

        * "MTUM": Momentum
        * "QUAL": Quality
        * "SIZE": Size
        * "VLUE": Value
        * "USMV": Low volatility

    The data comes from the Yahoo public API.
    The price is the adjusted close, i.e. the closing price after adjustments
    for all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     2264
    Assets           5
    ==============   ==================

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_factors_dataset
    >>> prices = load_factors_dataset()
    >>> prices.head()
                  MTUM    QUAL    SIZE    USMV    VLUE
    Date
    2014-01-02  52.704  48.351  48.986  29.338  47.054
    2014-01-03  52.792  48.256  48.722  29.330  46.999
    2014-01-06  52.677  48.067  48.722  29.263  46.991
    2014-01-07  53.112  48.455  48.731  29.430  47.253
    2014-01-08  53.502  48.437  48.731  29.422  47.253
    """
    return load_gzip_compressed_csv_data("factors_dataset.csv.gz")
285
+
286
+
287
def load_ftse100_dataset(data_home=None, download_if_missing=True) -> pd.DataFrame:
    """Load the prices of 64 assets from the FTSE 100 Index composition.

    The dataset holds the daily prices of 64 assets from the FTSE 100 Index,
    from 2000-01-04 up to 2023-05-31. It is obtained through
    `download_dataset` under the name `ftse100_dataset`.

    The data comes from the Yahoo public API.
    The price is the adjusted close, i.e. the closing price after adjustments
    for all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.
    The data contains NaN.

    ==============   ==================
    Observations     5960
    Assets           64
    ==============   ==================

    Parameters
    ----------
    data_home : str, optional
        Specify another download and cache folder for the datasets.
        By default, all skfolio data is stored in `~/skfolio_data` subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_ftse100_dataset
    >>> prices = load_ftse100_dataset()
    >>> prices.head()
                  AAL.L    ABF.L   AHT.L  ANTO.L  ...   VOD.L   WEIR.L    WPP.L    WTB.L
    Date                                          ...
    2000-01-04  535.354  205.926  97.590  40.313  ...  72.562  115.240  512.249  382.907
    2000-01-05  540.039  209.185  96.729  40.313  ...  69.042  118.483  462.080  381.972
    2000-01-06  553.289  229.048  95.581  40.452  ...  66.950  124.220  458.119  386.337
    2000-01-07  572.829  222.220  95.581  40.452  ...  70.716  121.725  475.283  405.046
    2000-01-10  578.852  224.548  92.711  40.685  ...  74.285  121.476  498.254  392.885
    """
    return download_dataset(
        "ftse100_dataset",
        data_home=data_home,
        download_if_missing=download_if_missing,
    )
338
+
339
+
340
def load_nasdaq_dataset(data_home=None, download_if_missing=True) -> pd.DataFrame:
    """Load the prices of 1455 assets from the NASDAQ Composite Index.

    The dataset holds the daily prices of 1455 assets from the NASDAQ
    Composite, from 2018-01-02 up to 2023-05-31. It is obtained through
    `download_dataset` under the name `nasdaq_dataset`.

    The data comes from the Yahoo public API.
    The price is the adjusted close, i.e. the closing price after adjustments
    for all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     1362
    Assets           1455
    ==============   ==================

    Parameters
    ----------
    data_home : str, optional
        Specify another download and cache folder for the datasets.
        By default, all skfolio data is stored in `~/skfolio_data` subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_nasdaq_dataset
    >>> prices = load_nasdaq_dataset()
    >>> prices.head()
                   AAL   AAOI    AAON    AAPL  ...  ZVRA   ZYME    ZYNE   ZYXI
    Date                                       ...
    2018-01-02  51.648  37.91  35.621  41.310  ...  66.4  7.933  12.995  2.922
    2018-01-03  51.014  37.89  36.247  41.303  ...  72.8  7.965  13.460  2.913
    2018-01-04  51.336  38.38  36.103  41.495  ...  78.4  8.430  12.700  2.869
    2018-01-05  51.316  38.89  36.681  41.967  ...  77.6  8.400  12.495  2.780
    2018-01-08  50.809  38.37  36.103  41.811  ...  82.4  8.310  12.550  2.825
    """
    return download_dataset(
        "nasdaq_dataset",
        data_home=data_home,
        download_if_missing=download_if_missing,
    )
File without changes
Binary file
@@ -0,0 +1,26 @@
1
+ """Distance Estimators."""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+
6
+ from skfolio.distance._base import BaseDistance
7
+ from skfolio.distance._distance import (
8
+ CovarianceDistance,
9
+ DistanceCorrelation,
10
+ KendallDistance,
11
+ MutualInformation,
12
+ NBinsMethod,
13
+ PearsonDistance,
14
+ SpearmanDistance,
15
+ )
16
+
17
+ __all__ = [
18
+ "BaseDistance",
19
+ "PearsonDistance",
20
+ "KendallDistance",
21
+ "SpearmanDistance",
22
+ "CovarianceDistance",
23
+ "DistanceCorrelation",
24
+ "MutualInformation",
25
+ "NBinsMethod",
26
+ ]
@@ -0,0 +1,55 @@
1
+ """Base Distance Estimators"""
2
+
3
+ # Author: Hugo Delatte <delatte.hugo@gmail.com>
4
+ # License: BSD 3 clause
5
+
6
+ from abc import ABC, abstractmethod
7
+
8
+ import numpy as np
9
+ import numpy.typing as npt
10
+ import sklearn.base as skb
11
+
12
+
13
class BaseDistance(skb.BaseEstimator, ABC):
    """Abstract base class shared by every distance estimator in skfolio.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).

    Attributes
    ----------
    codependence_ : ndarray of shape (n_assets, n_assets)
        Codependence matrix.

    distance_ : ndarray of shape (n_assets, n_assets)
        Distance matrix.
    """

    # Declared for typing only; no value is assigned at class level.
    codependence_: np.ndarray
    distance_: np.ndarray

    @abstractmethod
    def __init__(self):
        # Abstract so that the base class cannot be instantiated directly;
        # concrete estimators provide their own constructor.
        ...

    @abstractmethod
    def fit(self, X: npt.ArrayLike, y=None) -> "BaseDistance":
        """Fit the Distance estimator.

        Concrete estimators must override this method.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : BaseDistance
            Fitted estimator.
        """