skfolio 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skfolio/datasets/_base.py CHANGED
@@ -250,7 +250,7 @@ def load_factors_dataset() -> pd.DataFrame:
     The factors are:
 
         * "MTUM": Momentum
-        * "QUAL": Quanlity
+        * "QUAL": Quality
         * "SIZE": Size
         * "VLUE": Value
         * "USMV": low volatility
@@ -347,7 +347,7 @@ def entropic_risk_measure(
     """Compute the entropic risk measure.
 
     The entropic risk measure is a risk measure which depends on the risk aversion
-    defined by the investor (theat) through the exponential utility function at a given
+    defined by the investor (theta) through the exponential utility function at a given
     confidence level (beta).
 
     Parameters
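For readers without the full docstring: the quantity documented here is, under the standard definition, ERM(X) = theta * ln((1/beta) * E[exp(-X/theta)]) for returns X, risk aversion theta, and confidence level beta. A minimal NumPy sketch of that formula (an independent illustration, not skfolio's implementation):

    import numpy as np

    def erm_sketch(returns: np.ndarray, theta: float = 1.0, beta: float = 1.0) -> float:
        # theta * ln( (1/beta) * E[ exp(-returns / theta) ] )
        return theta * np.log(np.mean(np.exp(-returns / theta)) / beta)

    print(erm_sketch(np.array([0.02, -0.01, 0.005, -0.03]), theta=0.5, beta=0.95))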
@@ -377,7 +377,7 @@ class CombinatorialPurgedCV(BaseCombinatorialCV):
         ]
         values = self.index_train_test_.T
         values = np.insert(values, 0, np.arange(n_samples), axis=0)
-        fill_color = np.select(cond, ["green", "blue", "red"]).T
+        fill_color = np.select(cond, ["green", "blue", "red"], default="green").T
         fill_color = fill_color.astype(object)
         fill_color = np.insert(
             fill_color, 0, np.array(["darkblue" for _ in range(n_samples)]), axis=0
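The `default="green"` addition above matters because `np.select` falls back to `0` for positions matched by no condition. A standalone sketch of that behavior:

    import numpy as np

    x = np.array([1, 2, 3, 4])
    cond = [x < 2, x > 3]
    # Unmatched positions (here x == 2 and x == 3) take `default`, which is 0
    # when unspecified -- not a valid fill color.
    print(np.select(cond, [10, 20]))                           # [10  0  0 20]
    print(np.select(cond, ["green", "red"], default="green"))  # all valid colors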
@@ -8,50 +8,109 @@
 # scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier
 # Grisel Licensed under BSD 3 clause.
 
+import datetime as dt
 from collections.abc import Iterator
 
 import numpy as np
 import numpy.typing as npt
+import pandas as pd
 import sklearn.model_selection as sks
 import sklearn.utils as sku
 
 
 class WalkForward(sks.BaseCrossValidator):
-    """Walk Forward cross-validator.
+    """Walk Forward Cross-Validator.
 
-    Provides train/test indices to split time series data samples in a walk forward
+    Provides train/test indices to split time series data samples using a walk-forward
     logic.
 
-    In each split, test indices must be higher than before, and thus shuffling
-    in cross validator is inappropriate.
+    In each split, test indices must be higher than the previous ones; therefore,
+    shuffling in cross-validator is inappropriate.
 
     Compared to `sklearn.model_selection.TimeSeriesSplit`, you control the train/test
-    folds by providing a number of training and test samples instead of a number of
-    split making it more suitable for portfolio cross-validation.
+    folds by specifying the number of training and test samples instead of the number
+    of splits, making it more suitable for portfolio cross-validation.
+
+    If your data is a DataFrame indexed with a DatetimeIndex, you can split the data
+    using specific datetime frequencies and offsets.
 
     Parameters
     ----------
     test_size : int
-        Number of observations in each test set.
+        Length of each test set.
+        If `freq` is `None` (default), it represents the number of observations.
+        Otherwise, it represents the number of periods defined by `freq`.
+
+    train_size : int | pandas.offsets.DateOffset | datetime.timedelta
+        Length of each training set.
+        If `freq` is `None` (default), it represents the number of observations.
+        Otherwise, for integers, it represents the number of periods defined by `freq`;
+        for pandas DateOffset or datetime timedelta it represents the date offset
+        applied to the start of each period.
+
+    freq : str | pandas.offsets.DateOffset, optional
+        If provided, it must be a frequency string or a pandas DateOffset, and the
+        returns `X` must be a DataFrame with an index of type `DatetimeIndex`.
+        For a list of pandas frequencies and offsets, see `here <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases>`_.
+        The default (`None`) means `test_size` and `train_size` represent the number
+        of observations.
+
+        Below are some common examples:
+
+        * Rebalancing : Monthly on the first day
+        * Test Duration : 1 month
+        * Train Duration : 6 months
+
+        >>> cv = WalkForward(test_size=1, train_size=6, freq="MS")
+
+        * Rebalancing : Quarterly on the first day
+        * Test Duration : 1 quarter
+        * Train Duration : 2 months
+
+        >>> cv = WalkForward(test_size=1, train_size=pd.DateOffset(months=2), freq="QS")
+
+        * Rebalancing : Monthly on the third Friday
+        * Test Duration : 1 month
+        * Train Duration : 6 weeks
+
+        >>> cv = WalkForward(test_size=1, train_size=pd.offsets.Week(6), freq="WOM-3FRI")
+
+        * Rebalancing : Semi-annually on the last day
+        * Test Duration : 6 months
+        * Train Duration : 1 year
+
+        >>> cv = WalkForward(test_size=1, train_size=2, freq=pd.offsets.SemiMonthEnd())
+
+        * Rebalancing : Every 2 months on the second day
+        * Test Duration : 2 months
+        * Train Duration : 6 months
 
-    train_size : int
-        Number of observations in each training set.
+        >>> cv = WalkForward(test_size=2, train_size=6, freq="MS", freq_offset=dt.timedelta(days=2))
+
+    freq_offset : pandas DateOffset | datetime timedelta, optional
+        Only used if `freq` is provided. Offsets the `freq` by a pandas DateOffset or a
+        datetime timedelta offset.
+
+    previous : bool, default=False
+        Only used if `freq` is provided. If set to `True`, and if the period start
+        or period end is not in the `DatetimeIndex`, the previous observation is used;
+        otherwise, the next observation is used (default).
 
     expend_train : bool, default=False
-        If this is set to True, each subsequent training set after the first one will
+        If set to `True`, each subsequent training set after the first one will
         use all past observations.
-        The default is `False`
+        The default is `False`.
 
     reduce_test : bool, default=False
-        If this is set to True, the last train/test split will be returned even if the
-        test set is partial (if it contains less observations than `test_size`),
-        otherwise it will be ignored.
-        The default is `False`
+        If set to `True`, the last train/test split will be returned even if the
+        test set is partial (i.e., it contains fewer observations than `test_size`),
+        otherwise, it will be ignored.
+        The default is `False`.
 
     purged_size : int, default=0
-        Number of observations to exclude from the end of each train set before the
-        test set.
-        The default value is `0`
+        The number of observations to exclude from the end of each training set before
+        the test set.
+        The default value is `0`.
 
     Examples
     --------
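Building on the docstring above, a hedged end-to-end sketch of the new frequency-based splitting (assuming `WalkForward` remains importable from `skfolio.model_selection`, as in 0.3.x):

    import numpy as np
    import pandas as pd
    from skfolio.model_selection import WalkForward

    # Two years of business-day returns with a DatetimeIndex.
    index = pd.date_range("2022-01-03", "2023-12-29", freq="B")
    X = pd.DataFrame(np.random.default_rng(0).normal(size=(len(index), 3)), index=index)

    # Monthly rebalancing: 6 months of training, 1 month of testing per split.
    cv = WalkForward(test_size=1, train_size=6, freq="MS")
    train, test = next(cv.split(X))
    print(index[train[0]], "->", index[train[-1]])  # first training window
    print(index[test[0]], "->", index[test[-1]])    # first test month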
@@ -124,13 +183,19 @@ class WalkForward(sks.BaseCrossValidator):
     def __init__(
         self,
         test_size: int,
-        train_size: int,
+        train_size: int | pd.offsets.BaseOffset | dt.timedelta,
+        freq: str | pd.offsets.BaseOffset | None = None,
+        freq_offset: pd.offsets.BaseOffset | dt.timedelta | None = None,
+        previous: bool = False,
         expend_train: bool = False,
         reduce_test: bool = False,
         purged_size: int = 0,
     ):
         self.test_size = test_size
         self.train_size = train_size
+        self.freq = freq
+        self.freq_offset = freq_offset
+        self.previous = previous
         self.expend_train = expend_train
         self.reduce_test = reduce_test
         self.purged_size = purged_size
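Following the scikit-learn convention of the `sks.BaseCrossValidator` base class, `__init__` only stores the parameters; validation is deferred to `split`, as the next hunk shows. A small sketch of the consequence (the error message is the one added in the hunk below):

    import numpy as np
    from skfolio.model_selection import WalkForward

    cv = WalkForward(test_size=1, train_size=6, freq="MS")  # no validation yet
    try:
        next(cv.split(np.zeros((100, 2))))  # a bare ndarray has no DatetimeIndex
    except ValueError as err:
        print(err)  # "X must be a DataFrame with an index of type DatetimeIndex"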
@@ -161,40 +226,51 @@ class WalkForward(sks.BaseCrossValidator):
         """
         X, y = sku.indexable(X, y)
         n_samples = X.shape[0]
-        # Make sure we have enough samples for the given split parameters
-        if self.train_size + self.purged_size >= n_samples:
-            raise ValueError(
-                "The sum of `train_size` with `purged_size` "
-                f"({self.train_size + self.purged_size}) cannot be greater than the"
-                f" number of samples ({n_samples})."
-            )
 
-        indices = np.arange(n_samples)
+        if not isinstance(self.test_size, int):
+            raise ValueError("`test_size` must be an integer")
 
-        test_start = self.train_size + self.purged_size
-        while True:
-            if test_start >= n_samples:
-                return
-            test_end = test_start + self.test_size
-            train_end = test_start - self.purged_size
-            if self.expend_train:
-                train_start = 0
-            else:
-                train_start = train_end - self.train_size
-
-            if test_end > n_samples:
-                if not self.reduce_test:
-                    return
-                yield (
-                    indices[train_start:train_end],
-                    indices[test_start:],
-                )
-            else:
-                yield (
-                    indices[train_start:train_end],
-                    indices[test_start:test_end],
-                )
-            test_start = test_end
+        if self.freq is None:
+            if not isinstance(self.train_size, int):
+                raise ValueError("When `freq` is None, `train_size` must be an integer")
+            return _split_without_period(
+                n_samples=n_samples,
+                train_size=self.train_size,
+                test_size=self.test_size,
+                purged_size=self.purged_size,
+                expend_train=self.expend_train,
+                reduce_test=self.reduce_test,
+            )
+
+        if not hasattr(X, "index") or not isinstance(X.index, pd.DatetimeIndex):
+            raise ValueError(
+                "X must be a DataFrame with an index of type DatetimeIndex"
+            )
+        if isinstance(self.train_size, int):
+            return _split_from_period_without_train_offset(
+                n_samples=n_samples,
+                train_size=self.train_size,
+                test_size=self.test_size,
+                freq=self.freq,
+                freq_offset=self.freq_offset,
+                previous=self.previous,
+                purged_size=self.purged_size,
+                expend_train=self.expend_train,
+                reduce_test=self.reduce_test,
+                ts_index=X.index,
+            )
+        return _split_from_period_with_train_offset(
+            n_samples=n_samples,
+            train_size=self.train_size,
+            test_size=self.test_size,
+            freq=self.freq,
+            freq_offset=self.freq_offset,
+            previous=self.previous,
+            purged_size=self.purged_size,
+            expend_train=self.expend_train,
+            reduce_test=self.reduce_test,
+            ts_index=X.index,
+        )
 
     def get_n_splits(self, X=None, y=None, groups=None) -> int:
         """Returns the number of splitting iterations in the cross-validator
@@ -224,3 +300,141 @@ class WalkForward(sks.BaseCrossValidator):
         if self.reduce_test and n % self.test_size != 0:
             return n // self.test_size + 1
         return n // self.test_size
+
+
+def _split_without_period(
+    n_samples: int,
+    train_size: int,
+    test_size: int,
+    purged_size: int,
+    expend_train: bool,
+    reduce_test: bool,
+) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+    if train_size + purged_size >= n_samples:
+        raise ValueError(
+            "The sum of `train_size` with `purged_size` "
+            f"({train_size + purged_size}) cannot be greater than the"
+            f" number of samples ({n_samples})."
+        )
+
+    indices = np.arange(n_samples)
+
+    test_start = train_size + purged_size
+    while True:
+        if test_start >= n_samples:
+            return
+        test_end = test_start + test_size
+        train_end = test_start - purged_size
+        if expend_train:
+            train_start = 0
+        else:
+            train_start = train_end - train_size
+
+        if test_end > n_samples:
+            if not reduce_test:
+                return
+            test_indices = indices[test_start:]
+        else:
+            test_indices = indices[test_start:test_end]
+        train_indices = indices[train_start:train_end]
+        yield train_indices, test_indices
+
+        test_start = test_end
+
+
+def _split_from_period_without_train_offset(
+    n_samples: int,
+    train_size: int,
+    test_size: int,
+    freq: str,
+    freq_offset: pd.offsets.BaseOffset | dt.timedelta | None,
+    previous: bool,
+    purged_size: int,
+    expend_train: bool,
+    reduce_test: bool,
+    ts_index,
+) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+    start = ts_index[0]
+    end = ts_index[-1]
+    if freq_offset is not None:
+        start = min(start, start - freq_offset)
+
+    date_range = pd.date_range(start=start, end=end, freq=freq)
+    if freq_offset is not None:
+        date_range += freq_offset
+
+    idx = ts_index.get_indexer(date_range, method="ffill" if previous else "bfill")
+    n = len(idx)
+    i = 0
+    while True:
+        if i + train_size >= n:
+            return
+
+        if i + train_size + test_size >= n:
+            if not reduce_test:
+                return
+            test_indices = np.arange(idx[i + train_size], n_samples)
+        else:
+            test_indices = np.arange(
+                idx[i + train_size], idx[i + train_size + test_size]
+            )
+        if expend_train:
+            train_start = 0
+        else:
+            train_start = idx[i]
+        train_indices = np.arange(train_start, idx[i + train_size] - purged_size)
+        yield train_indices, test_indices
+
+        i += test_size
+
+
+def _split_from_period_with_train_offset(
+    n_samples: int,
+    train_size: pd.offsets.BaseOffset | dt.timedelta,
+    test_size: int,
+    freq: str,
+    freq_offset: pd.offsets.BaseOffset | dt.timedelta | None,
+    previous: bool,
+    purged_size: int,
+    expend_train: bool,
+    reduce_test: bool,
+    ts_index,
+) -> Iterator[tuple[np.ndarray, np.ndarray]]:
+    start = ts_index[0]
+    end = ts_index[-1]
+    if freq_offset is not None:
+        start = min(start, start - freq_offset)
+
+    date_range = pd.date_range(start=start, end=end, freq=freq)
+    if freq_offset is not None:
+        date_range += freq_offset
+
+    idx = ts_index.get_indexer(date_range, method="ffill" if previous else "bfill")
+    train_idx = ts_index.get_indexer(date_range - train_size, method="ffill")
+
+    n = len(idx)
+
+    if np.all(train_idx == -1):
+        return
+
+    i = np.argmax(train_idx > -1)
+    while True:
+        if i >= n:
+            return
+
+        if i + test_size >= n:
+            if not reduce_test:
+                return
+            test_indices = np.arange(idx[i], n_samples)
+        else:
+            test_indices = np.arange(idx[i], idx[i + test_size] - purged_size)
+
+        if expend_train:
+            train_start = 0
+        else:
+            train_start = train_idx[i]
+        train_indices = np.arange(train_start, idx[i])
+        yield train_indices, test_indices
+
+        i += test_size
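Both period-based helpers share the same core: materialize the period boundaries with `pd.date_range`, then map each boundary to a row position with `DatetimeIndex.get_indexer`, rolling to the next observation (`bfill`, the default `previous=False`) or the previous one (`ffill`) when a boundary is not a trading day. A standalone sketch of just that mapping:

    import pandas as pd

    index = pd.date_range("2023-01-02", "2023-06-30", freq="B")  # business days

    # Month-start boundaries; 2023-04-01 falls on a Saturday, so the two
    # methods map it to different rows.
    boundaries = pd.date_range(start=index[0], end=index[-1], freq="MS")
    next_obs = index.get_indexer(boundaries, method="bfill")  # previous=False
    prev_obs = index.get_indexer(boundaries, method="ffill")  # previous=True
    for b, nx, pv in zip(boundaries, next_obs, prev_obs):
        print(b.date(), index[nx].date(), index[pv].date())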