bbstrader 0.2.93__py3-none-any.whl → 0.2.95__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bbstrader might be problematic.

Files changed (35)
  1. bbstrader/__init__.py +20 -20
  2. bbstrader/__main__.py +50 -50
  3. bbstrader/btengine/__init__.py +54 -54
  4. bbstrader/btengine/scripts.py +157 -157
  5. bbstrader/compat.py +19 -19
  6. bbstrader/config.py +137 -137
  7. bbstrader/core/data.py +22 -22
  8. bbstrader/core/utils.py +146 -146
  9. bbstrader/metatrader/__init__.py +6 -6
  10. bbstrader/metatrader/account.py +1516 -1516
  11. bbstrader/metatrader/copier.py +750 -745
  12. bbstrader/metatrader/rates.py +584 -584
  13. bbstrader/metatrader/risk.py +749 -748
  14. bbstrader/metatrader/scripts.py +81 -81
  15. bbstrader/metatrader/trade.py +1836 -1836
  16. bbstrader/metatrader/utils.py +645 -645
  17. bbstrader/models/__init__.py +10 -10
  18. bbstrader/models/factors.py +312 -312
  19. bbstrader/models/ml.py +1272 -1272
  20. bbstrader/models/optimization.py +182 -182
  21. bbstrader/models/portfolio.py +223 -223
  22. bbstrader/models/risk.py +398 -398
  23. bbstrader/trading/__init__.py +11 -11
  24. bbstrader/trading/execution.py +846 -846
  25. bbstrader/trading/script.py +155 -155
  26. bbstrader/trading/scripts.py +69 -69
  27. bbstrader/trading/strategies.py +860 -860
  28. bbstrader/tseries.py +1842 -1842
  29. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/LICENSE +21 -21
  30. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/METADATA +188 -187
  31. bbstrader-0.2.95.dist-info/RECORD +44 -0
  32. bbstrader-0.2.93.dist-info/RECORD +0 -44
  33. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/WHEEL +0 -0
  34. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/entry_points.txt +0 -0
  35. {bbstrader-0.2.93.dist-info → bbstrader-0.2.95.dist-info}/top_level.txt +0 -0
bbstrader/models/ml.py CHANGED
@@ -1,1272 +1,1272 @@
1
- import os
2
- import warnings
3
- from datetime import datetime
4
- from itertools import product
5
- from time import time
6
-
7
- import lightgbm as lgb
8
- import matplotlib.pyplot as plt
9
- import numpy as np
10
- import pandas as pd
11
- import seaborn as sns
12
-
13
- import yfinance as yf
14
- from alphalens import performance as perf
15
- from alphalens import plotting
16
- from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
17
- from alphalens.utils import (
18
- get_clean_factor_and_forward_returns,
19
- rate_of_return,
20
- std_conversion,
21
- )
22
- from scipy.stats import spearmanr
23
- from sklearn.preprocessing import LabelEncoder, StandardScaler
24
- import pandas_ta as ta
25
-
26
- warnings.filterwarnings("ignore")
27
-
28
-
29
- __all__ = ["OneStepTimeSeriesSplit", "MultipleTimeSeriesCV", "LightGBModel"]
30
-
31
-
32
- class OneStepTimeSeriesSplit:
33
- __author__ = "Stefan Jansen"
34
- """Generates tuples of train_idx, test_idx pairs
35
- Assumes the index contains a level labeled 'date'"""
36
-
37
- def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
38
- self.n_splits = n_splits
39
- self.test_period_length = test_period_length
40
- self.shuffle = shuffle
41
-
42
- @staticmethod
43
- def chunks(l, n): # noqa: E741
44
- for i in range(0, len(l), n):
45
- yield l[i : i + n]
46
-
47
- def split(self, X: pd.DataFrame, y=None, groups=None):
48
- unique_dates = (
49
- X.index.get_level_values("date")
50
- .unique()
51
- .sort_values(ascending=False)[: self.n_splits * self.test_period_length]
52
- )
53
-
54
- dates = X.reset_index()[["date"]]
55
- for test_date in self.chunks(unique_dates, self.test_period_length):
56
- train_idx = dates[dates.date < min(test_date)].index
57
- test_idx = dates[dates.date.isin(test_date)].index
58
- if self.shuffle:
59
- np.random.shuffle(list(train_idx))
60
- yield train_idx, test_idx
61
-
62
- def get_n_splits(self, X, y, groups=None):
63
- return self.n_splits
64
-
65
-
66
- class MultipleTimeSeriesCV:
67
- __author__ = "Stefan Jansen"
68
- """
69
- Generates tuples of train_idx, test_idx pairs
70
- Assumes the MultiIndex contains levels 'symbol' and 'date'
71
- purges overlapping outcomes
72
- """
73
-
74
- def __init__(
75
- self,
76
- n_splits=3,
77
- train_period_length=126,
78
- test_period_length=21,
79
- lookahead=None,
80
- date_idx="date",
81
- shuffle=False,
82
- ):
83
- self.n_splits = n_splits
84
- self.lookahead = lookahead
85
- self.test_length = test_period_length
86
- self.train_length = train_period_length
87
- self.shuffle = shuffle
88
- self.date_idx = date_idx
89
-
90
- def split(self, X: pd.DataFrame, y=None, groups=None):
91
- unique_dates = X.index.get_level_values(self.date_idx).unique()
92
- days = sorted(unique_dates, reverse=True)
93
- split_idx = []
94
- for i in range(self.n_splits):
95
- test_end_idx = i * self.test_length
96
- test_start_idx = test_end_idx + self.test_length
97
- train_end_idx = test_start_idx + self.lookahead - 1
98
- train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
99
- split_idx.append(
100
- [train_start_idx, train_end_idx, test_start_idx, test_end_idx]
101
- )
102
-
103
- dates = X.reset_index()[[self.date_idx]]
104
- for train_start, train_end, test_start, test_end in split_idx:
105
- train_idx = dates[
106
- (dates[self.date_idx] > days[train_start])
107
- & (dates[self.date_idx] <= days[train_end])
108
- ].index
109
- test_idx = dates[
110
- (dates[self.date_idx] > days[test_start])
111
- & (dates[self.date_idx] <= days[test_end])
112
- ].index
113
- if self.shuffle:
114
- np.random.shuffle(list(train_idx))
115
- yield train_idx.to_numpy(), test_idx.to_numpy()
116
-
117
- def get_n_splits(self, X, y, groups=None):
118
- return self.n_splits
119
-
120
-
121
- class LightGBModel(object):
122
- """
123
- ``LightGBModel`` encapsulates a complete workflow for training and evaluating
124
- a ``LightGBM (Light Gradient Boosting Machine)`` model for predicting stock returns.
125
- It includes data acquisition, feature engineering, model tuning, and performance
126
- evaluation using information ``coefficient (IC)`` and Alphalens analysis.
127
-
128
- Key Features
129
- ------------
130
- - ``HDF5 Storage``: Utilizes ``pandas.HDFStore`` for efficient storage and retrieval
131
- of large datasets, which is essential for backtesting on financial time series data.
132
-
133
- - ``Time-Series Cross-Validation``: Employs a custom cross-validation strategy that
134
- respects the time series nature of the data, avoiding data leakage.
135
-
136
- - ``Hyperparameter Tuning``: Includes automated hyperparameter tuning using a randomized
137
- grid search for optimization.
138
-
139
- - ``Information Coefficient (IC)``: Uses IC as a core performance metric that quantifies
140
- the predictive power of the model, which is a standard measure for ranking models in finance.
141
-
142
- - ``Alphalens Integration``: Provides a comprehensive framework for validating model
143
- performance using Alphalens, allowing for in-depth performance analysis, like backtesting
144
- and return decomposition.
145
-
146
- Use Case
147
- --------
148
- This class is designed for quantitative finance and algorithmic trading use cases where
149
- the goal is to build a predictive model for stock returns based on historical data and
150
- technical indicators. It follows a complete cycle from data acquisition to model validation
151
- and provides the infrastructure needed for deployment of this model in a trading strategy.
152
-
153
- Notes
154
- -----
155
- The implementation is inspired by the book "Machine Learning for Algorithmic Trading"
156
- by Stefan Jansen.
157
-
158
- References
159
- ----------
160
- Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
161
- Chapter 12, Boosting Your Trading Strategy.
162
- """
163
-
164
- def __init__(
165
- self,
166
- data: pd.DataFrame = None,
167
- datastore: pd.HDFStore = "lgbdata.h5",
168
- trainstore: pd.HDFStore = "lgbtrain.h5",
169
- outstore: pd.HDFStore = "lgbout.h5",
170
- ):
171
- """
172
- Args:
173
- data (pd.DataFrame): The input data for the model. It should be a DataFrame with a MultiIndex containing
174
- 'symbol' and 'date' levels. If not provided, the data can be downloaded using the `download_boosting_data` method.
175
- datastore (str): The path to the HDF5 file for storing the model data.
176
- trainstore (str): The path to the HDF5 file for storing the training data.
177
- outstore (str): The path to the HDF5 file for storing the output data.
178
- """
179
- self.datastore = datastore
180
- self.trainstore = trainstore
181
- self.outstore = outstore
182
- if data is not None:
183
- data.reset_index().to_hdf(path_or_buf=self.datastore, key="model_data")
184
-
185
- def _compute_bb(self, close):
186
- # Compute Bollinger Bands using pandas_ta
187
- bb = ta.bbands(close, length=20)
188
- return pd.DataFrame(
189
- {"bb_high": bb["BBU_20_2.0"], "bb_low": bb["BBL_20_2.0"]}, index=close.index
190
- )
191
-
192
- def _compute_atr(self, stock_data):
193
- # Compute ATR using pandas_ta
194
- atr = ta.atr(stock_data.high, stock_data.low, stock_data.close, length=14)
195
- return (atr - atr.mean()) / atr.std()
196
-
197
- def _compute_macd(self, close):
198
- # Compute MACD using pandas_ta
199
- macd = ta.macd(close)["MACD_12_26_9"]
200
- return (macd - macd.mean()) / macd.std()
201
-
202
- def _add_technical_indicators(self, prices: pd.DataFrame):
203
- prices = prices.copy()
204
-
205
- # Add RSI and normalize
206
- prices["rsi"] = (
207
- prices.groupby(level="symbol")
208
- .close.apply(lambda x: ta.rsi(x, length=14))
209
- .reset_index(level=0, drop=True)
210
- )
211
-
212
- # Add Bollinger Bands
213
- bb = prices.groupby(level="symbol").close.apply(self._compute_bb)
214
- bb = bb.reset_index(level=1, drop=True)
215
- prices = prices.join(bb)
216
-
217
- prices["bb_high"] = (
218
- prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
219
- )
220
- prices["bb_low"] = (
221
- prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)
222
- )
223
-
224
- # Add ATR and normalize
225
- prices["ATR"] = prices.groupby(level="symbol", group_keys=False).apply(
226
- lambda x: self._compute_atr(x)
227
- )
228
-
229
- # Add MACD and normalize
230
- prices["MACD"] = prices.groupby(level="symbol", group_keys=False).close.apply(
231
- self._compute_macd
232
- )
233
-
234
- return prices
235
-
236
- def download_boosting_data(self, tickers, start, end=None):
237
- data = []
238
- for ticker in tickers:
239
- try:
240
- prices = yf.download(
241
- ticker,
242
- start=start,
243
- end=end,
244
- progress=False,
245
- multi_level_index=False,
246
- auto_adjust=True,
247
- )
248
- prices["symbol"] = ticker
249
- data.append(prices)
250
- except: # noqa: E722
251
- continue
252
- data = pd.concat(data)
253
- data = (
254
- data.rename(columns={s: s.lower().replace(" ", "_") for s in data.columns})
255
- .drop(columns=["adj_close"])
256
- .set_index("symbol", append=True)
257
- .swaplevel()
258
- .sort_index()
259
- .dropna()
260
- )
261
- return data
262
-
263
- def download_metadata(self, tickers):
264
- def clean_text_column(series: pd.Series) -> pd.Series:
265
- return (
266
- series.str.lower()
267
- # use regex=False for literal string replacements
268
- .str.replace("-", "", regex=False)
269
- .str.replace("&", "and", regex=False)
270
- .str.replace(" ", "_", regex=False)
271
- .str.replace("__", "_", regex=False)
272
- )
273
-
274
- metadata = [
275
- "industry",
276
- "sector",
277
- "exchange",
278
- "symbol",
279
- "heldPercentInsiders",
280
- "heldPercentInstitutions",
281
- "overallRisk",
282
- "shortRatio",
283
- "dividendYield",
284
- "beta",
285
- "regularMarketVolume",
286
- "averageVolume",
287
- "averageVolume10days",
288
- "bid",
289
- "ask",
290
- "bidSize",
291
- "askSize",
292
- "marketCap",
293
- ]
294
-
295
- columns = {
296
- "industry": "industry",
297
- "sector": "sector",
298
- "exchange": "exchange",
299
- "symbol": "symbol",
300
- "heldPercentInsiders": "insiders",
301
- "heldPercentInstitutions": "institutions",
302
- "overallRisk": "risk",
303
- "shortRatio": "short_ratio",
304
- "dividendYield": "dyield",
305
- "beta": "beta",
306
- "regularMarketVolume": "regvolume",
307
- "averageVolume": "avgvolume",
308
- "averageVolume10days": "avgvolume10",
309
- "bid": "bid",
310
- "ask": "ask",
311
- "bidSize": "bidsize",
312
- "askSize": "asksize",
313
- "marketCap": "marketcap",
314
- }
315
- data = []
316
- for symbol in tickers:
317
- try:
318
- symbol_info = yf.Ticker(symbol).info
319
- except: # noqa: E722
320
- continue
321
- infos = {}
322
- for info in metadata:
323
- infos[info] = symbol_info.get(info)
324
- data.append(infos)
325
- metadata = pd.DataFrame(data)
326
- metadata = metadata.rename(columns=columns)
327
- metadata.dyield = metadata.dyield.fillna(0)
328
- metadata.sector = clean_text_column(metadata.sector)
329
- metadata.industry = clean_text_column(metadata.industry)
330
- metadata = metadata.set_index("symbol")
331
- return metadata
332
-
333
- def _select_nlargest_liquidity_stocks(
334
- self,
335
- df: pd.DataFrame,
336
- n: int,
337
- volume_features,
338
- bid_ask_features,
339
- market_cap_feature,
340
- ):
341
- df = df.copy()
342
- scaler = StandardScaler()
343
-
344
- # Normalize features
345
- df[volume_features] = scaler.fit_transform(df[volume_features])
346
- df["bid_ask_spread"] = df["ask"] - df["bid"]
347
- df["bid_ask_spread"] = scaler.fit_transform(df[["bid_ask_spread"]])
348
- df[market_cap_feature] = scaler.fit_transform(df[market_cap_feature])
349
-
350
- # Calculate Liquidity Score
351
- # Assign weights to each component (these weights can be adjusted based on importance)
352
- weights = {"volume": 0.4, "bid_ask_spread": 0.2, "marketCap": 0.4}
353
-
354
- # Calculate the liquidity score by combining the normalized features
355
- df["liquidity_score"] = (
356
- weights["volume"] * df[volume_features].mean(axis=1)
357
- + weights["bid_ask_spread"] * df["bid_ask_spread"]
358
- + weights["marketCap"] * df[market_cap_feature[0]]
359
- )
360
- df_sorted = df.sort_values(by="liquidity_score", ascending=False)
361
-
362
- return df_sorted.nlargest(n, "liquidity_score").index
363
-
364
- def _encode_metadata(self, df: pd.DataFrame):
365
- df = df.copy()
366
- # Binning each numerical feature into categories
367
- df["insiders"] = pd.qcut(
368
- df["insiders"], q=4, labels=["Very Low", "Low", "High", "Very High"]
369
- )
370
- df["institutions"] = pd.qcut(
371
- df["institutions"], q=4, labels=["Very Low", "Low", "High", "Very High"]
372
- )
373
- df["risk"] = pd.cut(
374
- df["risk"],
375
- bins=[-float("inf"), 3, 5, 7, float("inf")],
376
- labels=["Low", "Medium", "High", "Very High"],
377
- )
378
- df["short_ratio"] = pd.qcut(
379
- df["short_ratio"], q=4, labels=["Very Low", "Low", "High", "Very High"]
380
- )
381
- df["dyield"] = pd.cut(
382
- df["dyield"],
383
- bins=[-float("inf"), 0.002, 0.005, 0.01, float("inf")],
384
- labels=["Very Low", "Low", "High", "Very High"],
385
- )
386
- df["beta"] = pd.cut(
387
- df["beta"],
388
- bins=[-float("inf"), 0.8, 1.0, 1.2, float("inf")],
389
- labels=["Low", "Moderate", "High", "Very High"],
390
- )
391
-
392
- # Encode binned features
393
- binned_features = [
394
- "insiders",
395
- "institutions",
396
- "risk",
397
- "short_ratio",
398
- "dyield",
399
- "beta",
400
- "sector",
401
- "industry",
402
- "exchange",
403
- ]
404
- label_encoders = {}
405
-
406
- for col in binned_features:
407
- le = LabelEncoder()
408
- df[col] = le.fit_transform(df[col])
409
- label_encoders[col] = le
410
- return df, label_encoders
411
-
412
- def prepare_boosting_data(
413
- self,
414
- prices: pd.DataFrame,
415
- metadata: pd.DataFrame = None,
416
- min_years=7,
417
- universe=500,
418
- ):
419
- if metadata is None:
420
- mcap = False
421
- tickers = prices.index.get_level_values("symbol").unique()
422
- metadata = self.download_metadata(tickers)
423
- else:
424
- mcap = True
425
- YEAR = 252
426
- idx = pd.IndexSlice
427
- percentiles = [0.001, 0.01, 0.02, 0.03, 0.04, 0.05]
428
- percentiles += [1 - p for p in percentiles[::-1]]
429
- T = [1, 5, 10, 21, 42, 63]
430
-
431
- prices.volume /= 1e3 # make vol figures a bit smaller
432
- prices.index.names = ["symbol", "date"]
433
- metadata.index.name = "symbol"
434
- prices.reset_index().to_hdf(path_or_buf=self.datastore, key="stock_data")
435
- metadata.reset_index().to_hdf(path_or_buf=self.datastore, key="stock_metadata")
436
-
437
- # Remove stocks with insufficient observations
438
- min_obs = min_years * YEAR
439
- nobs = prices.groupby(level="symbol").size()
440
- keep = nobs[nobs > min_obs].index
441
- prices = prices.loc[idx[keep, :], :]
442
-
443
- # # Remove duplicate symbols
444
- prices = prices[~prices.index.duplicated()]
445
-
446
- # Align price and meta data
447
- metadata = metadata[~metadata.index.duplicated() & metadata.sector.notnull()]
448
- metadata.sector = metadata.sector.str.lower().str.replace(" ", "_")
449
- shared = (
450
- prices.index.get_level_values("symbol")
451
- .unique()
452
- .intersection(metadata.index)
453
- )
454
- metadata = metadata.loc[shared, :]
455
- prices = prices.loc[idx[shared, :], :]
456
-
457
- # Limit universe
458
- if mcap:
459
- universe = metadata.marketcap.nlargest(universe).index
460
- else:
461
- volume_features = ["regvolume", "avgvolume", "avgvolume10"]
462
- bid_ask_features = ["bid", "ask", "bidsize", "asksize"]
463
- market_cap_feature = ["marketcap"]
464
- to_drop = volume_features + bid_ask_features + market_cap_feature
465
- universe = self._select_nlargest_liquidity_stocks(
466
- metadata,
467
- universe,
468
- volume_features,
469
- bid_ask_features,
470
- market_cap_feature,
471
- )
472
- metadata = metadata.drop(to_drop, axis=1)
473
- prices = prices.loc[idx[universe, :], :]
474
- metadata = metadata.loc[universe]
475
- metadata = self._encode_metadata(metadata)[0]
476
-
477
- prices["dollar_vol"] = prices[["close", "volume"]].prod(1).div(1e3)
478
- # compute dollar volume to determine universe
479
- dollar_vol_ma = (
480
- prices.dollar_vol.unstack("symbol")
481
- .rolling(window=21, min_periods=1) # 1 trading month
482
- .mean()
483
- )
484
-
485
- # Rank stocks by moving average
486
- prices["dollar_vol_rank"] = (
487
- dollar_vol_ma.rank(axis=1, ascending=False).stack("symbol").swaplevel()
488
- )
489
- # Add some Basic Factors
490
- prices = self._add_technical_indicators(prices)
491
- # Combine Price and Meta Data
492
- prices = prices.join(metadata)
493
-
494
- # Compute Returns
495
- by_sym = prices.groupby(level="symbol").close
496
- for t in T:
497
- prices[f"r{t:02}"] = by_sym.pct_change(t)
498
- # Daily historical return deciles
499
- for t in T:
500
- # Reset the index to apply qcut by date without grouping errors
501
- prices[f"r{t:02}dec"] = (
502
- prices.reset_index(level="date")
503
- .groupby("date")[f"r{t:02}"]
504
- .apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates="drop"))
505
- .values
506
- )
507
- # Daily sector return deciles
508
- for t in T:
509
- prices[f"r{t:02}q_sector"] = prices.groupby(["date", "sector"])[
510
- f"r{t:02}"
511
- ].transform(lambda x: pd.qcut(x, q=5, labels=False, duplicates="drop"))
512
- # Compute Forward Returns
513
- for t in [1, 5, 21]:
514
- prices[f"r{t:02}_fwd"] = prices.groupby(level="symbol")[f"r{t:02}"].shift(
515
- -t
516
- )
517
-
518
- # Remove outliers
519
- outliers = prices[prices.r01 > 1].index.get_level_values("symbol").unique()
520
- prices = prices.drop(outliers, level="symbol")
521
- # Create time and sector dummy variables
522
- prices["year"] = prices.index.get_level_values("date").year
523
- prices["month"] = prices.index.get_level_values("date").month
524
- prices["weekday"] = prices.index.get_level_values("date").weekday
525
- # Store Model Data
526
- prices = prices.drop(["open", "close", "low", "high", "volume"], axis=1)
527
- if "adj_close" in prices.columns:
528
- prices = prices.drop("adj_close", axis=1)
529
- prices.reset_index().to_hdf(path_or_buf=self.datastore, key="model_data")
530
- return prices.sort_index()
531
-
532
- def tickers(self):
533
- return pd.read_hdf(self.outstore, "lgb/tickers").tolist()
534
-
535
- def load_model_data(self, key="model_data"):
536
- return (
537
- pd.read_hdf(self.datastore, key=key)
538
- .set_index(["symbol", "date"])
539
- .sort_index()
540
- )
541
-
542
- def format_time(self, t):
543
- """Return a formatted time string 'HH:MM:SS
544
- based on a numeric time() value"""
545
- m, s = divmod(t, 60)
546
- h, m = divmod(m, 60)
547
- return f"{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}"
548
-
549
- def fit(self, data: pd.DataFrame, verbose=True):
550
- def get_fi(model):
551
- """Return normalized feature importance as pd.Series"""
552
- fi = model.feature_importance(importance_type="gain")
553
- return pd.Series(fi / fi.sum(), index=model.feature_name())
554
-
555
- def ic_lgbm(preds, train_data):
556
- """Custom IC eval metric for lightgbm"""
557
- is_higher_better = True
558
- return "ic", spearmanr(preds, train_data.get_label())[0], is_higher_better
559
-
560
- data = data.dropna()
561
- # Hyperparameter options
562
- YEAR = 252
563
- base_params = dict(boosting="gbdt", objective="regression", verbose=-1)
564
-
565
- # constraints on structure (depth) of each tree
566
- max_depths = [2, 3, 5, 7]
567
- num_leaves_opts = [2**i for i in max_depths]
568
- min_data_in_leaf_opts = [250, 500, 1000]
569
-
570
- # weight of each new tree in the ensemble
571
- learning_rate_ops = [0.01, 0.1, 0.3]
572
-
573
- # random feature selection
574
- feature_fraction_opts = [0.3, 0.6, 0.95]
575
-
576
- param_names = [
577
- "learning_rate",
578
- "num_leaves",
579
- "feature_fraction",
580
- "min_data_in_leaf",
581
- ]
582
-
583
- cv_params = list(
584
- product(
585
- learning_rate_ops,
586
- num_leaves_opts,
587
- feature_fraction_opts,
588
- min_data_in_leaf_opts,
589
- )
590
- )
591
- n_params = len(cv_params)
592
- print(f"# Parameters: {n_params}")
593
-
594
- # Train/Test Period Lengths
595
- lookaheads = [1, 5, 21]
596
- train_lengths = [int(4.5 * 252), 252]
597
- test_lengths = [63]
598
- test_params = list(product(lookaheads, train_lengths, test_lengths))
599
- n = len(test_params)
600
- test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
601
- test_params = [test_params[i] for i in test_param_sample]
602
- print("Train configs:", len(test_params))
603
-
604
- # Categorical Variables
605
- categoricals = ["year", "weekday", "month"]
606
- for feature in categoricals:
607
- data[feature] = pd.factorize(data[feature], sort=True)[0]
608
-
609
- # ### Run Cross-Validation
610
- labels = sorted(data.filter(like="fwd").columns)
611
- features = data.columns.difference(labels).tolist()
612
- label_dict = dict(zip(lookaheads, labels))
613
- num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
614
- num_boost_round = num_iterations[-1]
615
-
616
- metric_cols = (
617
- param_names
618
- + [
619
- "t",
620
- "daily_ic_mean",
621
- "daily_ic_mean_n",
622
- "daily_ic_median",
623
- "daily_ic_median_n",
624
- ]
625
- + [str(n) for n in num_iterations]
626
- )
627
-
628
- for lookahead, train_length, test_length in test_params:
629
- # randomized grid search
630
- cvp = np.random.choice(
631
- list(range(n_params)), size=int(n_params / 2), replace=False
632
- )
633
- cv_params_ = [cv_params[i] for i in cvp]
634
-
635
- # set up cross-validation
636
- n_splits = int(2 * YEAR / test_length)
637
- print(
638
- f"Lookahead: {lookahead:2.0f} | "
639
- f"Train: {train_length:3.0f} | "
640
- f"Test: {test_length:2.0f} | "
641
- f"Params: {len(cv_params_):3.0f} | "
642
- f"Train configs: {len(test_params)}"
643
- )
644
-
645
- # time-series cross-validation
646
- cv = MultipleTimeSeriesCV(
647
- n_splits=n_splits,
648
- lookahead=lookahead,
649
- test_period_length=test_length,
650
- train_period_length=train_length,
651
- )
652
-
653
- label = label_dict[lookahead]
654
- outcome_data = data.loc[:, features + [label]].dropna()
655
-
656
- # binary dataset
657
- lgb_data = lgb.Dataset(
658
- data=outcome_data.drop(label, axis=1),
659
- label=outcome_data[label],
660
- categorical_feature=categoricals,
661
- free_raw_data=False,
662
- )
663
- T = 0
664
- predictions, metrics = [], []
665
-
666
- # iterate over (shuffled) hyperparameter combinations
667
- for p, param_vals in enumerate(cv_params_):
668
- key = f"{lookahead}/{train_length}/{test_length}/" + "/".join(
669
- [str(p) for p in param_vals]
670
- )
671
- params = dict(zip(param_names, param_vals))
672
- params.update(base_params)
673
-
674
- start = time()
675
- cv_preds = []
676
-
677
- # iterate over folds
678
- for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
679
- # select train subset
680
- lgb_train = lgb_data.subset(
681
- used_indices=train_idx.tolist(), params=params
682
- ).construct()
683
-
684
- # train model for num_boost_round
685
- model = lgb.train(
686
- params=params,
687
- train_set=lgb_train,
688
- num_boost_round=num_boost_round,
689
- )
690
- # log feature importance
691
- if i == 0:
692
- fi = get_fi(model).to_frame()
693
- else:
694
- fi[i] = get_fi(model)
695
-
696
- # capture predictions
697
- test_set = outcome_data.iloc[test_idx, :]
698
- X_test = test_set.loc[:, model.feature_name()]
699
- y_test = test_set.loc[:, label]
700
- y_pred = {
701
- str(n): model.predict(X_test, num_iteration=n)
702
- for n in num_iterations
703
- }
704
-
705
- # record predictions for each fold
706
- cv_preds.append(
707
- y_test.to_frame("y_test").assign(**y_pred).assign(i=i)
708
- )
709
-
710
- # combine fold results
711
- cv_preds = pd.concat(cv_preds).assign(**params)
712
- predictions.append(cv_preds)
713
-
714
- # compute IC per day
715
- by_day = cv_preds.groupby(level="date")
716
- ic_by_day = pd.concat(
717
- [
718
- by_day.apply(
719
- lambda x: spearmanr(x.y_test, x[str(n)])[0]
720
- ).to_frame(n)
721
- for n in num_iterations
722
- ],
723
- axis=1,
724
- )
725
- daily_ic_mean = ic_by_day.mean()
726
- daily_ic_mean_n = daily_ic_mean.idxmax()
727
- daily_ic_median = ic_by_day.median()
728
- daily_ic_median_n = daily_ic_median.idxmax()
729
-
730
- # compute IC across all predictions
731
- ic = [
732
- spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
733
- for n in num_iterations
734
- ]
735
- t = time() - start
736
- T += t
737
-
738
- # collect metrics
739
- metrics = pd.Series(
740
- list(param_vals)
741
- + [
742
- t,
743
- daily_ic_mean.max(),
744
- daily_ic_mean_n,
745
- daily_ic_median.max(),
746
- daily_ic_median_n,
747
- ]
748
- + ic,
749
- index=metric_cols,
750
- )
751
- if verbose:
752
- msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
753
- msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
754
- msg += f" {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}"
755
- print(msg)
756
-
757
- # persist results for given CV run and hyperparameter combination
758
- metrics.to_hdf(path_or_buf=self.trainstore, key="metrics/" + key)
759
- ic_by_day.assign(**params).to_hdf(
760
- path_or_buf=self.trainstore, key="daily_ic/" + key
761
- )
762
- fi.T.describe().T.assign(**params).to_hdf(
763
- path_or_buf=self.trainstore, key="fi/" + key
764
- )
765
- cv_preds.to_hdf(
766
- path_or_buf=self.trainstore, key="predictions/" + key, append=True
767
- )
768
-
769
- def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
770
- with pd.HDFStore(self.trainstore) as store:
771
- for i, key in enumerate(
772
- [k[1:] for k in store.keys() if k[1:].startswith("metrics")]
773
- ):
774
- _, t, train_length, test_length = key.split("/")[:4]
775
- attrs = {
776
- "lookahead": t,
777
- "train_length": train_length,
778
- "test_length": test_length,
779
- }
780
- s = store[key].to_dict()
781
- s.update(attrs)
782
- if i == 0:
783
- lgb_metrics = pd.Series(s).to_frame(i)
784
- else:
785
- lgb_metrics[i] = pd.Series(s)
786
-
787
- id_vars = scope_params + lgb_train_params + daily_ic_metrics
788
- lgb_metrics = (
789
- pd.melt(
790
- lgb_metrics.T.drop("t", axis=1),
791
- id_vars=id_vars,
792
- value_name="ic",
793
- var_name="boost_rounds",
794
- )
795
- .dropna()
796
- .apply(pd.to_numeric)
797
- )
798
- return lgb_metrics
799
-
800
- def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
801
- lgb_ic = []
802
- with pd.HDFStore(self.trainstore) as store:
803
- keys = [k[1:] for k in store.keys()]
804
- for key in keys:
805
- _, t, train_length, test_length = key.split("/")[:4]
806
- if key.startswith("daily_ic"):
807
- df = (
808
- store[key]
809
- .drop(["boosting", "objective", "verbose"], axis=1)
810
- .assign(
811
- lookahead=t,
812
- train_length=train_length,
813
- test_length=test_length,
814
- )
815
- )
816
- lgb_ic.append(df)
817
- lgb_ic = pd.concat(lgb_ic).reset_index()
818
- lgb_ic = pd.melt(
819
- lgb_ic, id_vars=id_vars, value_name="ic", var_name="boost_rounds"
820
- ).dropna()
821
- lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
822
- return lgb_ic
823
-
824
- def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
825
- param_cols = scope_params[1:] + lgb_train_params + ["boost_rounds"]
826
- df = data[data.lookahead == t].sort_values("ic", ascending=False).iloc[best]
827
- return df.loc[param_cols]
828
-
829
- def _get_lgb_key(self, t, p):
830
- key = f"{t}/{int(p.train_length)}/{int(p.test_length)}/{p.learning_rate}/"
831
- return (
832
- key + f"{int(p.num_leaves)}/{p.feature_fraction}/{int(p.min_data_in_leaf)}"
833
- )
834
-
835
- def _select_ic(self, params, ic_data, lookahead):
836
- return ic_data.loc[
837
- (ic_data.lookahead == lookahead)
838
- & (ic_data.train_length == params.train_length)
839
- & (ic_data.test_length == params.test_length)
840
- & (ic_data.learning_rate == params.learning_rate)
841
- & (ic_data.num_leaves == params.num_leaves)
842
- & (ic_data.feature_fraction == params.feature_fraction)
843
- & (ic_data.boost_rounds == params.boost_rounds),
844
- ["date", "ic"],
845
- ].set_index("date")
846
-
847
- def get_trade_prices(self, tickers, start, end):
848
- idx = pd.IndexSlice
849
- with pd.HDFStore(self.datastore) as store:
850
- data = store.select("stock_data")
851
- data = data.set_index(["symbol", "date"]).sort_index()
852
- data = data[~data.index.duplicated()]
853
- return (
854
- data.loc[idx[tickers, start:end], "open"]
855
- .unstack("symbol")
856
- .sort_index()
857
- .shift(-1)
858
- .tz_convert("UTC")
859
- )
860
-
861
- def plot_ic(self, lgb_ic, lgb_daily_ic, scope_params, lgb_train_params):
862
- fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
863
- axes = axes.flatten()
864
- for i, t in enumerate([1, 21]):
865
- params = self._get_lgb_params(
866
- lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0
867
- )
868
- data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
869
- rolling = data.rolling(63).ic.mean().dropna()
870
- avg = data.ic.mean()
871
- med = data.ic.median()
872
- rolling.plot(
873
- ax=axes[i],
874
- title=f"Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}",
875
- )
876
- axes[i].axhline(avg, c="darkred", lw=1)
877
- axes[i].axhline(0, ls="--", c="k", lw=1)
878
-
879
- fig.suptitle("3-Month Rolling Information Coefficient", fontsize=16)
880
- fig.tight_layout()
881
- fig.subplots_adjust(top=0.92)
882
-
883
- def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
884
- # Visualization
885
- sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)
886
-
887
- sns.catplot(
888
- x="lookahead",
889
- y="ic",
890
- col="train_length",
891
- row="test_length",
892
- data=lgb_metrics,
893
- kind="box",
894
- )
895
- sns.catplot(
896
- x="boost_rounds",
897
- y="ic",
898
- col="train_length",
899
- row="test_length",
900
- data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
901
- kind="box",
902
- )
903
-
904
- def get_best_predictions(
905
- self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10
906
- ):
907
- for best in range(topn):
908
- best_params = self._get_lgb_params(
909
- lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best
910
- )
911
- key = self._get_lgb_key(lookahead, best_params)
912
- rounds = str(int(best_params.boost_rounds))
913
- if best == 0:
914
- best_predictions = pd.read_hdf(self.trainstore, "predictions/" + key)
915
- best_predictions = best_predictions[rounds].to_frame(best)
916
- else:
917
- best_predictions[best] = pd.read_hdf(
918
- self.trainstore, "predictions/" + key
919
- )[rounds]
920
- best_predictions = best_predictions.sort_index()
921
- best_predictions.reset_index().to_hdf(
922
- path_or_buf=self.outstore, key=f"lgb/train/{lookahead:02}"
923
- )
924
- return best_predictions
925
-
926
- def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
927
- # Compute Alphalens metrics
928
- mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
929
- factor_data,
930
- by_date=True,
931
- by_group=False,
932
- demeaned=True,
933
- group_adjust=False,
934
- )
935
- factor_returns = perf.factor_returns(factor_data)
936
- mean_quant_ret, std_quantile = perf.mean_return_by_quantile(
937
- factor_data, by_group=False, demeaned=True
938
- )
939
-
940
- mean_quant_rateret = mean_quant_ret.apply(
941
- rate_of_return, axis=0, base_period=mean_quant_ret.columns[0]
942
- )
943
-
944
- mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
945
- factor_data,
946
- by_date=True,
947
- by_group=False,
948
- demeaned=True,
949
- group_adjust=False,
950
- )
951
-
952
- mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
953
- rate_of_return,
954
- base_period=mean_quant_ret_bydate.columns[0],
955
- )
956
-
957
- compstd_quant_daily = std_quant_daily.apply(
958
- std_conversion, base_period=std_quant_daily.columns[0]
959
- )
960
-
961
- alpha_beta = perf.factor_alpha_beta(factor_data, demeaned=True)
962
-
963
- mean_ret_spread_quant, std_spread_quant = perf.compute_mean_returns_spread(
964
- mean_quant_rateret_bydate,
965
- factor_data["factor_quantile"].max(),
966
- factor_data["factor_quantile"].min(),
967
- std_err=compstd_quant_daily,
968
- )
969
- if verbose:
970
- print(
971
- mean_ret_spread_quant.mean()
972
- .mul(10000)
973
- .to_frame("Mean Period Wise Spread (bps)")
974
- .join(alpha_beta.T)
975
- .T
976
- )
977
-
978
- fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
979
-
980
- plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
981
- plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
982
- axes[0].set_xlabel("Quantile")
983
-
984
- plotting.plot_cumulative_returns_by_quantile(
985
- mean_quant_ret_bydate["1D"],
986
- freq=pd.tseries.offsets.BDay(),
987
- period="1D",
988
- ax=axes[1],
989
- )
990
- axes[1].set_title("Cumulative Return by Quantile (1D Period)")
991
-
992
- title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
993
- plotting.plot_cumulative_returns(
994
- factor_returns["1D"],
995
- period="1D",
996
- freq=pd.tseries.offsets.BDay(),
997
- title=title,
998
- ax=axes[2],
999
- )
1000
-
1001
- fig.suptitle("Alphalens - Validation Set Performance", fontsize=14)
1002
- fig.tight_layout()
1003
- fig.subplots_adjust(top=0.85)
1004
-
1005
- # Summary Tearsheet
1006
- create_summary_tear_sheet(factor_data)
1007
- create_full_tear_sheet(factor_data)
1008
-
1009
- def evaluate(self, remove_instore=False, lookahead=1, verbose=True):
1010
- scope_params = ["lookahead", "train_length", "test_length"]
1011
- daily_ic_metrics = [
1012
- "daily_ic_mean",
1013
- "daily_ic_mean_n",
1014
- "daily_ic_median",
1015
- "daily_ic_median_n",
1016
- ]
1017
- lgb_train_params = [
1018
- "learning_rate",
1019
- "num_leaves",
1020
- "feature_fraction",
1021
- "min_data_in_leaf",
1022
- ]
1023
-
1024
- lgb_metrics = self._get_lgb_metrics(
1025
- scope_params, lgb_train_params, daily_ic_metrics
1026
- )
1027
- # Summary Metrics by Fold
1028
- lgb_metrics.to_hdf(path_or_buf=self.outstore, key="lgb/metrics")
1029
-
1030
- # Information Coefficient by Day
1031
- int_cols = ["lookahead", "train_length", "test_length", "boost_rounds"]
1032
- id_vars = ["date"] + scope_params + lgb_train_params
1033
- lgb_ic = self._get_lgb_ic(int_cols, scope_params, lgb_train_params, id_vars)
1034
- lgb_ic.to_hdf(path_or_buf=self.outstore, key="lgb/ic")
1035
- lgb_daily_ic = (
1036
- lgb_ic.groupby(id_vars[1:] + ["boost_rounds"])
1037
- .ic.mean()
1038
- .to_frame("ic")
1039
- .reset_index()
1040
- )
1041
- lgb_daily_ic.to_hdf(path_or_buf=self.outstore, key="lgb/daily_ic")
1042
-
1043
- # Cross-validation Result: Best Hyperparameters
1044
- if verbose:
1045
- print(
1046
- lgb_daily_ic.groupby("lookahead", group_keys=False).apply(
1047
- lambda x: x.nlargest(3, "ic")
1048
- )
1049
- )
1050
- lgb_metrics.groupby("lookahead", group_keys=False).apply(
1051
- lambda x: x.nlargest(3, "ic")
1052
- )
1053
- lgb_metrics.groupby("lookahead", group_keys=False).apply(
1054
- lambda x: x.nlargest(3, "ic")
1055
- ).to_hdf(path_or_buf=self.outstore, key="lgb/best_model")
1056
- if verbose:
1057
- print(
1058
- lgb_metrics.groupby("lookahead", group_keys=False).apply(
1059
- lambda x: x.nlargest(3, "daily_ic_mean")
1060
- )
1061
- )
1062
-
1063
- # Visualization
1064
- if verbose:
1065
- self.plot_metrics(lgb_metrics, lgb_daily_ic, t=lookahead)
1066
-
1067
- # AlphaLens Analysis - Validation Performance
1068
- lgb_daily_ic = pd.read_hdf(self.outstore, "lgb/daily_ic")
1069
- best_params = self._get_lgb_params(
1070
- lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=0
1071
- )
1072
- best_params.to_hdf(path_or_buf=self.outstore, key="lgb/best_params")
1073
-
1074
- if verbose:
1075
- self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
1076
-
1077
- # Get Predictions for Validation Period
1078
- best_predictions = self.get_best_predictions(
1079
- lgb_daily_ic, scope_params, lgb_train_params, lookahead=lookahead, topn=10
1080
- )
1081
- test_tickers = best_predictions.index.get_level_values("symbol").unique()
1082
- start = best_predictions.index.get_level_values("date").min()
1083
- end = best_predictions.index.get_level_values("date").max()
1084
- trade_prices = self.get_trade_prices(test_tickers, start, end)
1085
- pd.Series(test_tickers).to_hdf(path_or_buf=self.outstore, key="lgb/tickers")
1086
- # We average the top five models and provide the corresponding prices to Alphalens,
1087
- # in order to compute the mean period-wise
1088
- # return earned on an equal-weighted portfolio invested in the daily factor quintiles
1089
- # for various holding periods:
1090
- factor = (
1091
- best_predictions.iloc[:, :5]
1092
- .mean(1)
1093
- .dropna()
1094
- .tz_convert("UTC", level="date")
1095
- .swaplevel()
1096
- )
1097
- # Create AlphaLens Inputs
1098
- if verbose:
1099
- factor_data = get_clean_factor_and_forward_returns(
1100
- factor=factor,
1101
- prices=trade_prices,
1102
- quantiles=5,
1103
- periods=(1, 5, 10, 21),
1104
- max_loss=1,
1105
- )
1106
- self.apply_alphalen_analysis(factor_data, tearsheet=True, verbose=True)
1107
- # Delete the temporary files
1108
- if remove_instore:
1109
- os.remove(self.trainstore)
1110
-
1111
- def make_predictions(
1112
- self, data: pd.DataFrame, mode="test", lookahead=1, verbose=True
1113
- ):
1114
- data = data.copy()
1115
- YEAR = 252
1116
- scope_params = ["lookahead", "train_length", "test_length"]
1117
- lgb_train_params = [
1118
- "learning_rate",
1119
- "num_leaves",
1120
- "feature_fraction",
1121
- "min_data_in_leaf",
1122
- ]
1123
-
1124
- base_params = dict(boosting="gbdt", objective="regression", verbose=-1)
1125
-
1126
- categoricals = ["year", "month", "weekday"]
1127
- labels = sorted(data.filter(like="_fwd").columns)
1128
- features = data.columns.difference(labels).tolist()
1129
- label = f"r{lookahead:02}_fwd"
1130
- for feature in categoricals:
1131
- data[feature] = pd.factorize(data[feature], sort=True)[0]
1132
-
1133
- if mode == "test":
1134
- data = data.dropna().sort_index()
1135
- elif mode == "live":
1136
- data[labels] = data[labels].fillna(0)
1137
- data = data.sort_index().dropna()
1138
-
1139
- lgb_data = lgb.Dataset(
1140
- data=data[features],
1141
- label=data[label],
1142
- categorical_feature=categoricals,
1143
- free_raw_data=False,
1144
- )
1145
- # Generate predictions
1146
- lgb_daily_ic = pd.read_hdf(self.outstore, "lgb/daily_ic")
1147
-
1148
- for position in range(10):
1149
- params = self._get_lgb_params(
1150
- lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position
1151
- )
1152
-
1153
- params = params.to_dict()
1154
-
1155
- for p in ["min_data_in_leaf", "num_leaves"]:
1156
- params[p] = int(params[p])
1157
- train_length = int(params.pop("train_length"))
1158
- test_length = int(params.pop("test_length"))
1159
- num_boost_round = int(params.pop("boost_rounds"))
1160
- params.update(base_params)
1161
- if verbose:
1162
- print(f"\nPosition: {position:02}")
1163
-
1164
- # 1-year out-of-sample period
1165
- n_splits = int(YEAR / test_length)
1166
- cv = MultipleTimeSeriesCV(
1167
- n_splits=n_splits,
1168
- test_period_length=test_length,
1169
- lookahead=lookahead,
1170
- train_period_length=train_length,
1171
- )
1172
-
1173
- predictions = []
1174
- for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
1175
- if verbose:
1176
- print(i, end=" ", flush=True)
1177
- lgb_train = lgb_data.subset(
1178
- used_indices=train_idx.tolist(), params=params
1179
- ).construct()
1180
-
1181
- model = lgb.train(
1182
- params=params,
1183
- train_set=lgb_train,
1184
- num_boost_round=num_boost_round,
1185
- )
1186
-
1187
- test_set = data.iloc[test_idx, :]
1188
- y_test = test_set.loc[:, label].to_frame("y_test")
1189
- y_pred = model.predict(test_set.loc[:, model.feature_name()])
1190
- predictions.append(y_test.assign(prediction=y_pred))
1191
-
1192
- if position == 0:
1193
- test_predictions = pd.concat(predictions).rename(
1194
- columns={"prediction": position}
1195
- )
1196
- else:
1197
- test_predictions[position] = pd.concat(predictions).prediction
1198
-
1199
- by_day = test_predictions.groupby(level="date")
1200
- for position in range(10):
1201
- if position == 0:
1202
- ic_by_day = by_day.apply(
1203
- lambda x: spearmanr(x.y_test, x[position])[0]
1204
- ).to_frame()
1205
- else:
1206
- ic_by_day[position] = by_day.apply(
1207
- lambda x: spearmanr(x.y_test, x[position])[0]
1208
- )
1209
- if verbose:
1210
- print(ic_by_day.describe())
1211
- test_predictions.reset_index().to_hdf(
1212
- path_or_buf=self.outstore, key=f"lgb/test/{lookahead:02}"
1213
- )
1214
- return test_predictions
1215
-
1216
- def load_predictions(self, predictions=None, lookahead=1):
1217
- if predictions is None:
1218
- predictions = pd.concat(
1219
- [
1220
- pd.read_hdf(self.outstore, f"lgb/train/{lookahead:02}"),
1221
- pd.read_hdf(self.outstore, f"lgb/test/{lookahead:02}").drop(
1222
- "y_test", axis=1
1223
- ),
1224
- ]
1225
- )
1226
- predictions = predictions.set_index(["symbol", "date"])
1227
-
1228
- predictions = (
1229
- predictions.loc[~predictions.index.duplicated()]
1230
- .iloc[:, :10]
1231
- .mean(1)
1232
- .sort_index()
1233
- .dropna()
1234
- .to_frame("prediction")
1235
- )
1236
- tickers = predictions.index.get_level_values("symbol").unique().tolist()
1237
- try:
1238
- return (predictions.unstack("symbol").prediction.tz_convert("UTC")), tickers
1239
- except TypeError:
1240
- return (predictions.unstack("symbol").prediction.tz_localize("UTC")), tickers
1241
-
1242
- def assert_last_date(self, predictions: pd.DataFrame):
1243
- """
1244
- Useful in Live Trading to ensure that the last date in the predictions
1245
- is the previous day, so it predicts today's returns.
1246
- """
1247
- last_date = predictions.index.get_level_values("date").max()
1248
- try:
1249
- if last_date.tzinfo is None:
1250
- last_date = last_date.tz_localize("UTC")
1251
- else:
1252
- last_date = last_date.tz_convert("UTC")
1253
- last_date = last_date.normalize()
1254
- except Exception as e:
1255
- print(f"Error getting last date: {e}")
1256
- try:
1257
- days = 3 if datetime.now().strftime("%A") == "Monday" else 1
1258
- td = (
1259
- last_date
1260
- - (pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)).normalize()
1261
- )
1262
- assert (
1263
- td.days == days or last_date == (pd.Timestamp.now(tz="UTC")).normalize()
1264
- )
1265
- return True
1266
- except AssertionError:
1267
- return False
1268
-
1269
- def clean_stores(self, *stores):
1270
- for store in stores:
1271
- if os.path.exists(store):
1272
- os.remove(store)
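
Since MultipleTimeSeriesCV drives all of the walk-forward training in this module, a compact illustration of how it slices a (symbol, date) panel may be useful. The following is a minimal sketch on synthetic data, not an example from the bbstrader documentation; the panel contents and the split lengths are placeholder choices.

```python
# Minimal sketch of MultipleTimeSeriesCV on synthetic data (illustrative only;
# the frame shape, column names and split settings are placeholders).
import numpy as np
import pandas as pd

from bbstrader.models.ml import MultipleTimeSeriesCV

# Synthetic (symbol, date) panel: two symbols, ~400 business days.
dates = pd.bdate_range("2022-01-03", periods=400)
index = pd.MultiIndex.from_product([["AAA", "BBB"], dates], names=["symbol", "date"])
panel = pd.DataFrame(
    {"feature": np.random.randn(len(index)), "r01_fwd": np.random.randn(len(index))},
    index=index,
)

# `lookahead` must be set to the forward-return horizon; it controls how much
# of the training window is purged to avoid overlap with the test window.
cv = MultipleTimeSeriesCV(
    n_splits=3,
    train_period_length=126,  # ~6 months of trading days
    test_period_length=21,    # ~1 month of trading days
    lookahead=1,
)

for fold, (train_idx, test_idx) in enumerate(cv.split(X=panel)):
    train_dates = panel.iloc[train_idx].index.get_level_values("date")
    test_dates = panel.iloc[test_idx].index.get_level_values("date")
    print(
        f"fold {fold}: train {train_dates.min():%Y-%m-%d} -> {train_dates.max():%Y-%m-%d}"
        f" | test {test_dates.min():%Y-%m-%d} -> {test_dates.max():%Y-%m-%d}"
    )
```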
1
+ import os
2
+ import warnings
3
+ from datetime import datetime
4
+ from itertools import product
5
+ from time import time
6
+
7
+ import lightgbm as lgb
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import pandas as pd
11
+ import seaborn as sns
12
+
13
+ import yfinance as yf
14
+ from alphalens import performance as perf
15
+ from alphalens import plotting
16
+ from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
17
+ from alphalens.utils import (
18
+ get_clean_factor_and_forward_returns,
19
+ rate_of_return,
20
+ std_conversion,
21
+ )
22
+ from scipy.stats import spearmanr
23
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
24
+ import pandas_ta as ta
25
+
26
+ warnings.filterwarnings("ignore")
27
+
28
+
29
+ __all__ = ["OneStepTimeSeriesSplit", "MultipleTimeSeriesCV", "LightGBModel"]
30
+
31
+
32
+ class OneStepTimeSeriesSplit:
33
+ __author__ = "Stefan Jansen"
34
+ """Generates tuples of train_idx, test_idx pairs
35
+ Assumes the index contains a level labeled 'date'"""
36
+
37
+ def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
38
+ self.n_splits = n_splits
39
+ self.test_period_length = test_period_length
40
+ self.shuffle = shuffle
41
+
42
+ @staticmethod
43
+ def chunks(l, n): # noqa: E741
44
+ for i in range(0, len(l), n):
45
+ yield l[i : i + n]
46
+
47
+ def split(self, X: pd.DataFrame, y=None, groups=None):
48
+ unique_dates = (
49
+ X.index.get_level_values("date")
50
+ .unique()
51
+ .sort_values(ascending=False)[: self.n_splits * self.test_period_length]
52
+ )
53
+
54
+ dates = X.reset_index()[["date"]]
55
+ for test_date in self.chunks(unique_dates, self.test_period_length):
56
+ train_idx = dates[dates.date < min(test_date)].index
57
+ test_idx = dates[dates.date.isin(test_date)].index
58
+ if self.shuffle:
59
+ np.random.shuffle(list(train_idx))
60
+ yield train_idx, test_idx
61
+
62
+ def get_n_splits(self, X, y, groups=None):
63
+ return self.n_splits
64
+
65
+
66
+ class MultipleTimeSeriesCV:
67
+ __author__ = "Stefan Jansen"
68
+ """
69
+ Generates tuples of train_idx, test_idx pairs
70
+ Assumes the MultiIndex contains levels 'symbol' and 'date'
71
+ purges overlapping outcomes
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ n_splits=3,
77
+ train_period_length=126,
78
+ test_period_length=21,
79
+ lookahead=None,
80
+ date_idx="date",
81
+ shuffle=False,
82
+ ):
83
+ self.n_splits = n_splits
84
+ self.lookahead = lookahead
85
+ self.test_length = test_period_length
86
+ self.train_length = train_period_length
87
+ self.shuffle = shuffle
88
+ self.date_idx = date_idx
89
+
90
+ def split(self, X: pd.DataFrame, y=None, groups=None):
91
+ unique_dates = X.index.get_level_values(self.date_idx).unique()
92
+ days = sorted(unique_dates, reverse=True)
93
+ split_idx = []
94
+ for i in range(self.n_splits):
95
+ test_end_idx = i * self.test_length
96
+ test_start_idx = test_end_idx + self.test_length
97
+ train_end_idx = test_start_idx + self.lookahead - 1
98
+ train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
99
+ split_idx.append(
100
+ [train_start_idx, train_end_idx, test_start_idx, test_end_idx]
101
+ )
102
+
103
+ dates = X.reset_index()[[self.date_idx]]
104
+ for train_start, train_end, test_start, test_end in split_idx:
105
+ train_idx = dates[
106
+ (dates[self.date_idx] > days[train_start])
107
+ & (dates[self.date_idx] <= days[train_end])
108
+ ].index
109
+ test_idx = dates[
110
+ (dates[self.date_idx] > days[test_start])
111
+ & (dates[self.date_idx] <= days[test_end])
112
+ ].index
113
+ if self.shuffle:
114
+ np.random.shuffle(list(train_idx))
115
+ yield train_idx.to_numpy(), test_idx.to_numpy()
116
+
117
+ def get_n_splits(self, X, y, groups=None):
118
+ return self.n_splits
119
+
120
+
121
+ class LightGBModel(object):
122
+ """
123
+ ``LightGBModel`` encapsulates a complete workflow for training and evaluating
124
+ a ``LightGBM (Light Gradient Boosting Machine)`` model for predicting stock returns.
125
+ It includes data acquisition, feature engineering, model tuning, and performance
126
+ evaluation using information ``coefficient (IC)`` and Alphalens analysis.
127
+
128
+ Key Features
129
+ ------------
130
+ - ``HDF5 Storage``: Utilizes ``pandas.HDFStore`` for efficient storage and retrieval
131
+ of large datasets, which is essential for backtesting on financial time series data.
132
+
133
+ - ``Time-Series Cross-Validation``: Employs a custom cross-validation strategy that
134
+ respects the time series nature of the data, avoiding data leakage.
135
+
136
+ - ``Hyperparameter Tuning``: Includes automated hyperparameter tuning using a randomized
137
+ grid search for optimization.
138
+
139
+ - ``Information Coefficient (IC)``: Uses IC as a core performance metric that quantifies
140
+ the predictive power of the model, which is a standard measure for ranking models in finance.
141
+
142
+ - ``Alphalens Integration``: Provides a comprehensive framework for validating model
143
+ performance using Alphalens, allowing for in-depth performance analysis, like backtesting
144
+ and return decomposition.
145
+
146
+ Use Case
147
+ --------
148
+ This class is designed for quantitative finance and algorithmic trading use cases where
149
+ the goal is to build a predictive model for stock returns based on historical data and
150
+ technical indicators. It follows a complete cycle from data acquisition to model validation
151
+ and provides the infrastructure needed for deployment of this model in a trading strategy.
152
+
153
+ Notes
154
+ -----
155
+ The implementation is inspired by the book "Machine Learning for Algorithmic Trading"
156
+ by Stefan Jansen.
157
+
158
+ References
159
+ ----------
160
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
161
+ Chapter 12, Boosting Your Trading Strategy.
162
+ """
163
+
164
+ def __init__(
165
+ self,
166
+ data: pd.DataFrame = None,
167
+ datastore: pd.HDFStore = "lgbdata.h5",
168
+ trainstore: pd.HDFStore = "lgbtrain.h5",
169
+ outstore: pd.HDFStore = "lgbout.h5",
170
+ ):
171
+ """
172
+ Args:
173
+ data (pd.DataFrame): The input data for the model. It should be a DataFrame with a MultiIndex containing
174
+ 'symbol' and 'date' levels. If not provided, the data can be downloaded using the `download_boosting_data` method.
175
+ datastore (str): The path to the HDF5 file for storing the model data.
176
+ trainstore (str): The path to the HDF5 file for storing the training data.
177
+ outstore (str): The path to the HDF5 file for storing the output data.
178
+ """
179
+ self.datastore = datastore
180
+ self.trainstore = trainstore
181
+ self.outstore = outstore
182
+ if data is not None:
183
+ data.reset_index().to_hdf(path_or_buf=self.datastore, key="model_data")
184
+
185
+ def _compute_bb(self, close):
186
+ # Compute Bollinger Bands using pandas_ta
187
+ bb = ta.bbands(close, length=20)
188
+ return pd.DataFrame(
189
+ {"bb_high": bb["BBU_20_2.0"], "bb_low": bb["BBL_20_2.0"]}, index=close.index
190
+ )
191
+
192
+ def _compute_atr(self, stock_data):
193
+ # Compute ATR using pandas_ta
194
+ atr = ta.atr(stock_data.high, stock_data.low, stock_data.close, length=14)
195
+ return (atr - atr.mean()) / atr.std()
196
+
197
+ def _compute_macd(self, close):
198
+ # Compute MACD using pandas_ta
199
+ macd = ta.macd(close)["MACD_12_26_9"]
200
+ return (macd - macd.mean()) / macd.std()
201
+
202
+ def _add_technical_indicators(self, prices: pd.DataFrame):
203
+ prices = prices.copy()
204
+
205
+ # Add RSI and normalize
206
+ prices["rsi"] = (
207
+ prices.groupby(level="symbol")
208
+ .close.apply(lambda x: ta.rsi(x, length=14))
209
+ .reset_index(level=0, drop=True)
210
+ )
211
+
212
+ # Add Bollinger Bands
213
+ bb = prices.groupby(level="symbol").close.apply(self._compute_bb)
214
+ bb = bb.reset_index(level=1, drop=True)
215
+ prices = prices.join(bb)
216
+
217
+ prices["bb_high"] = (
218
+ prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
219
+ )
220
+ prices["bb_low"] = (
221
+ prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)
222
+ )
223
+
224
+ # Add ATR and normalize
225
+ prices["ATR"] = prices.groupby(level="symbol", group_keys=False).apply(
226
+ lambda x: self._compute_atr(x)
227
+ )
228
+
229
+ # Add MACD and normalize
230
+ prices["MACD"] = prices.groupby(level="symbol", group_keys=False).close.apply(
231
+ self._compute_macd
232
+ )
233
+
234
+ return prices
235
+
236
+ def download_boosting_data(self, tickers, start, end=None):
237
+ data = []
238
+ for ticker in tickers:
239
+ try:
240
+ prices = yf.download(
241
+ ticker,
242
+ start=start,
243
+ end=end,
244
+ progress=False,
245
+ multi_level_index=False,
246
+ auto_adjust=True,
247
+ )
248
+ prices["symbol"] = ticker
249
+ data.append(prices)
250
+ except: # noqa: E722
251
+ continue
252
+ data = pd.concat(data)
253
+ data = (
254
+ data.rename(columns={s: s.lower().replace(" ", "_") for s in data.columns})
255
+ .drop(columns=["adj_close"])
256
+ .set_index("symbol", append=True)
257
+ .swaplevel()
258
+ .sort_index()
259
+ .dropna()
260
+ )
261
+ return data
262
+
263
+ def download_metadata(self, tickers):
264
+ def clean_text_column(series: pd.Series) -> pd.Series:
265
+ return (
266
+ series.str.lower()
267
+ # use regex=False for literal string replacements
268
+ .str.replace("-", "", regex=False)
269
+ .str.replace("&", "and", regex=False)
270
+ .str.replace(" ", "_", regex=False)
271
+ .str.replace("__", "_", regex=False)
272
+ )
273
+
274
+ metadata = [
275
+ "industry",
276
+ "sector",
277
+ "exchange",
278
+ "symbol",
279
+ "heldPercentInsiders",
280
+ "heldPercentInstitutions",
281
+ "overallRisk",
282
+ "shortRatio",
283
+ "dividendYield",
284
+ "beta",
285
+ "regularMarketVolume",
286
+ "averageVolume",
287
+ "averageVolume10days",
288
+ "bid",
289
+ "ask",
290
+ "bidSize",
291
+ "askSize",
292
+ "marketCap",
293
+ ]
294
+
295
+ columns = {
296
+ "industry": "industry",
297
+ "sector": "sector",
298
+ "exchange": "exchange",
299
+ "symbol": "symbol",
300
+ "heldPercentInsiders": "insiders",
301
+ "heldPercentInstitutions": "institutions",
302
+ "overallRisk": "risk",
303
+ "shortRatio": "short_ratio",
304
+ "dividendYield": "dyield",
305
+ "beta": "beta",
306
+ "regularMarketVolume": "regvolume",
307
+ "averageVolume": "avgvolume",
308
+ "averageVolume10days": "avgvolume10",
309
+ "bid": "bid",
310
+ "ask": "ask",
311
+ "bidSize": "bidsize",
312
+ "askSize": "asksize",
313
+ "marketCap": "marketcap",
314
+ }
315
+ data = []
316
+ for symbol in tickers:
317
+ try:
318
+ symbol_info = yf.Ticker(symbol).info
319
+ except: # noqa: E722
320
+ continue
321
+ infos = {}
322
+ for info in metadata:
323
+ infos[info] = symbol_info.get(info)
324
+ data.append(infos)
325
+ metadata = pd.DataFrame(data)
326
+ metadata = metadata.rename(columns=columns)
327
+ metadata.dyield = metadata.dyield.fillna(0)
328
+ metadata.sector = clean_text_column(metadata.sector)
329
+ metadata.industry = clean_text_column(metadata.industry)
330
+ metadata = metadata.set_index("symbol")
331
+ return metadata
332
+
333
+ def _select_nlargest_liquidity_stocks(
334
+ self,
335
+ df: pd.DataFrame,
336
+ n: int,
337
+ volume_features,
338
+ bid_ask_features,
339
+ market_cap_feature,
340
+ ):
341
+ df = df.copy()
342
+ scaler = StandardScaler()
343
+
344
+ # Normalize features
345
+ df[volume_features] = scaler.fit_transform(df[volume_features])
346
+ df["bid_ask_spread"] = df["ask"] - df["bid"]
347
+ df["bid_ask_spread"] = scaler.fit_transform(df[["bid_ask_spread"]])
348
+ df[market_cap_feature] = scaler.fit_transform(df[market_cap_feature])
349
+
350
+ # Calculate Liquidity Score
351
+ # Assign weights to each component (these weights can be adjusted based on importance)
352
+ weights = {"volume": 0.4, "bid_ask_spread": 0.2, "marketCap": 0.4}
353
+
354
+ # Calculate the liquidity score by combining the normalized features
355
+ df["liquidity_score"] = (
356
+ weights["volume"] * df[volume_features].mean(axis=1)
357
+ + weights["bid_ask_spread"] * df["bid_ask_spread"]
358
+ + weights["marketCap"] * df[market_cap_feature[0]]
359
+ )
360
+ df_sorted = df.sort_values(by="liquidity_score", ascending=False)
361
+
362
+ return df_sorted.nlargest(n, "liquidity_score").index
363
+
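+ # Worked example of the score above (illustrative standardised values): with a mean
+ # volume z-score of 1.2, a spread z-score of -0.5 and a market-cap z-score of 0.8,
+ # the liquidity score is 0.4 * 1.2 + 0.2 * (-0.5) + 0.4 * 0.8 = 0.70.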
364
+ def _encode_metadata(self, df: pd.DataFrame):
365
+ df = df.copy()
366
+ # Binning each numerical feature into categories
367
+ df["insiders"] = pd.qcut(
368
+ df["insiders"], q=4, labels=["Very Low", "Low", "High", "Very High"]
369
+ )
370
+ df["institutions"] = pd.qcut(
371
+ df["institutions"], q=4, labels=["Very Low", "Low", "High", "Very High"]
372
+ )
373
+ df["risk"] = pd.cut(
374
+ df["risk"],
375
+ bins=[-float("inf"), 3, 5, 7, float("inf")],
376
+ labels=["Low", "Medium", "High", "Very High"],
377
+ )
378
+ df["short_ratio"] = pd.qcut(
379
+ df["short_ratio"], q=4, labels=["Very Low", "Low", "High", "Very High"]
380
+ )
381
+ df["dyield"] = pd.cut(
382
+ df["dyield"],
383
+ bins=[-float("inf"), 0.002, 0.005, 0.01, float("inf")],
384
+ labels=["Very Low", "Low", "High", "Very High"],
385
+ )
386
+ df["beta"] = pd.cut(
387
+ df["beta"],
388
+ bins=[-float("inf"), 0.8, 1.0, 1.2, float("inf")],
389
+ labels=["Low", "Moderate", "High", "Very High"],
390
+ )
391
+
392
+ # Encode binned features
393
+ binned_features = [
394
+ "insiders",
395
+ "institutions",
396
+ "risk",
397
+ "short_ratio",
398
+ "dyield",
399
+ "beta",
400
+ "sector",
401
+ "industry",
402
+ "exchange",
403
+ ]
404
+ label_encoders = {}
405
+
406
+ for col in binned_features:
407
+ le = LabelEncoder()
408
+ df[col] = le.fit_transform(df[col])
409
+ label_encoders[col] = le
410
+ return df, label_encoders
411
+
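+ # Example of the binning above: beta = 0.9 falls into the (0.8, 1.0] bin and is
+ # labelled "Moderate", which LabelEncoder then maps to an integer code (the exact
+ # code depends on the labels present in the column).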
412
+ def prepare_boosting_data(
413
+ self,
414
+ prices: pd.DataFrame,
415
+ metadata: pd.DataFrame = None,
416
+ min_years=7,
417
+ universe=500,
418
+ ):
419
+ if metadata is None:
420
+ mcap = False
421
+ tickers = prices.index.get_level_values("symbol").unique()
422
+ metadata = self.download_metadata(tickers)
423
+ else:
424
+ mcap = True
425
+ YEAR = 252
426
+ idx = pd.IndexSlice
427
+ percentiles = [0.001, 0.01, 0.02, 0.03, 0.04, 0.05]
428
+ percentiles += [1 - p for p in percentiles[::-1]]
429
+ T = [1, 5, 10, 21, 42, 63]
430
+
431
+ prices.volume /= 1e3  # scale volume to thousands of shares
432
+ prices.index.names = ["symbol", "date"]
433
+ metadata.index.name = "symbol"
434
+ prices.reset_index().to_hdf(path_or_buf=self.datastore, key="stock_data")
435
+ metadata.reset_index().to_hdf(path_or_buf=self.datastore, key="stock_metadata")
436
+
437
+ # Remove stocks with insufficient observations
438
+ min_obs = min_years * YEAR
439
+ nobs = prices.groupby(level="symbol").size()
440
+ keep = nobs[nobs > min_obs].index
441
+ prices = prices.loc[idx[keep, :], :]
442
+
443
+ # Remove duplicate symbols
444
+ prices = prices[~prices.index.duplicated()]
445
+
446
+ # Align price and meta data
447
+ metadata = metadata[~metadata.index.duplicated() & metadata.sector.notnull()]
448
+ metadata.sector = metadata.sector.str.lower().str.replace(" ", "_")
449
+ shared = (
450
+ prices.index.get_level_values("symbol")
451
+ .unique()
452
+ .intersection(metadata.index)
453
+ )
454
+ metadata = metadata.loc[shared, :]
455
+ prices = prices.loc[idx[shared, :], :]
456
+
457
+ # Limit universe
458
+ if mcap:
459
+ universe = metadata.marketcap.nlargest(universe).index
460
+ else:
461
+ volume_features = ["regvolume", "avgvolume", "avgvolume10"]
462
+ bid_ask_features = ["bid", "ask", "bidsize", "asksize"]
463
+ market_cap_feature = ["marketcap"]
464
+ to_drop = volume_features + bid_ask_features + market_cap_feature
465
+ universe = self._select_nlargest_liquidity_stocks(
466
+ metadata,
467
+ universe,
468
+ volume_features,
469
+ bid_ask_features,
470
+ market_cap_feature,
471
+ )
472
+ metadata = metadata.drop(to_drop, axis=1)
473
+ prices = prices.loc[idx[universe, :], :]
474
+ metadata = metadata.loc[universe]
475
+ metadata = self._encode_metadata(metadata)[0]
476
+
477
+ prices["dollar_vol"] = prices[["close", "volume"]].prod(1).div(1e3)
478
+ # 21-day moving average of dollar volume, used to rank the trading universe
479
+ dollar_vol_ma = (
480
+ prices.dollar_vol.unstack("symbol")
481
+ .rolling(window=21, min_periods=1) # 1 trading month
482
+ .mean()
483
+ )
484
+
485
+ # Rank stocks by moving average
486
+ prices["dollar_vol_rank"] = (
487
+ dollar_vol_ma.rank(axis=1, ascending=False).stack("symbol").swaplevel()
488
+ )
489
+ # Add some Basic Factors
490
+ prices = self._add_technical_indicators(prices)
491
+ # Combine Price and Meta Data
492
+ prices = prices.join(metadata)
493
+
494
+ # Compute Returns
495
+ by_sym = prices.groupby(level="symbol").close
496
+ for t in T:
497
+ prices[f"r{t:02}"] = by_sym.pct_change(t)
498
+ # Daily historical return deciles
499
+ for t in T:
500
+ # Reset the index to apply qcut by date without grouping errors
501
+ prices[f"r{t:02}dec"] = (
502
+ prices.reset_index(level="date")
503
+ .groupby("date")[f"r{t:02}"]
504
+ .apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates="drop"))
505
+ .values
506
+ )
507
+ # Daily sector return deciles
508
+ for t in T:
509
+ prices[f"r{t:02}q_sector"] = prices.groupby(["date", "sector"])[
510
+ f"r{t:02}"
511
+ ].transform(lambda x: pd.qcut(x, q=5, labels=False, duplicates="drop"))
512
+ # Compute Forward Returns
513
+ for t in [1, 5, 21]:
514
+ prices[f"r{t:02}_fwd"] = prices.groupby(level="symbol")[f"r{t:02}"].shift(
515
+ -t
516
+ )
517
+
518
+ # Remove outliers
519
+ outliers = prices[prices.r01 > 1].index.get_level_values("symbol").unique()
520
+ prices = prices.drop(outliers, level="symbol")
521
+ # Create time and sector dummy variables
522
+ prices["year"] = prices.index.get_level_values("date").year
523
+ prices["month"] = prices.index.get_level_values("date").month
524
+ prices["weekday"] = prices.index.get_level_values("date").weekday
525
+ # Store Model Data
526
+ prices = prices.drop(["open", "close", "low", "high", "volume"], axis=1)
527
+ if "adj_close" in prices.columns:
528
+ prices = prices.drop("adj_close", axis=1)
529
+ prices.reset_index().to_hdf(path_or_buf=self.datastore, key="model_data")
530
+ return prices.sort_index()
531
+
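+ # Illustrative end-to-end preparation (all names are placeholders):
+ #   prices = model.download_boosting_data(tickers, start="2015-01-01")
+ #   features = model.prepare_boosting_data(prices, metadata=None, min_years=7, universe=500)
+ # Besides returning the feature frame, this persists the `stock_data`, `stock_metadata`
+ # and `model_data` keys to `self.datastore` for later use by `load_model_data` and
+ # `get_trade_prices`.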
532
+ def tickers(self):
533
+ return pd.read_hdf(self.outstore, "lgb/tickers").tolist()
534
+
535
+ def load_model_data(self, key="model_data"):
536
+ return (
537
+ pd.read_hdf(self.datastore, key=key)
538
+ .set_index(["symbol", "date"])
539
+ .sort_index()
540
+ )
541
+
542
+ def format_time(self, t):
543
+ """Return a formatted time string 'HH:MM:SS
544
+ based on a numeric time() value"""
545
+ m, s = divmod(t, 60)
546
+ h, m = divmod(m, 60)
547
+ return f"{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}"
548
+
549
+ def fit(self, data: pd.DataFrame, verbose=True):
550
+ def get_fi(model):
551
+ """Return normalized feature importance as pd.Series"""
552
+ fi = model.feature_importance(importance_type="gain")
553
+ return pd.Series(fi / fi.sum(), index=model.feature_name())
554
+
555
+ def ic_lgbm(preds, train_data):
556
+ """Custom IC eval metric for lightgbm"""
557
+ is_higher_better = True
558
+ return "ic", spearmanr(preds, train_data.get_label())[0], is_higher_better
559
+
560
+ data = data.dropna()
561
+ # Hyperparameter options
562
+ YEAR = 252
563
+ base_params = dict(boosting="gbdt", objective="regression", verbose=-1)
564
+
565
+ # constraints on structure (depth) of each tree
566
+ max_depths = [2, 3, 5, 7]
567
+ num_leaves_opts = [2**i for i in max_depths]
568
+ min_data_in_leaf_opts = [250, 500, 1000]
569
+
570
+ # weight of each new tree in the ensemble
571
+ learning_rate_ops = [0.01, 0.1, 0.3]
572
+
573
+ # random feature selection
574
+ feature_fraction_opts = [0.3, 0.6, 0.95]
575
+
576
+ param_names = [
577
+ "learning_rate",
578
+ "num_leaves",
579
+ "feature_fraction",
580
+ "min_data_in_leaf",
581
+ ]
582
+
583
+ cv_params = list(
584
+ product(
585
+ learning_rate_ops,
586
+ num_leaves_opts,
587
+ feature_fraction_opts,
588
+ min_data_in_leaf_opts,
589
+ )
590
+ )
591
+ n_params = len(cv_params)
592
+ print(f"# Parameters: {n_params}")
593
+
594
+ # Train/Test Period Lengths
595
+ lookaheads = [1, 5, 21]
596
+ train_lengths = [int(4.5 * 252), 252]
597
+ test_lengths = [63]
598
+ test_params = list(product(lookaheads, train_lengths, test_lengths))
599
+ n = len(test_params)
600
+ test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
601
+ test_params = [test_params[i] for i in test_param_sample]
602
+ print("Train configs:", len(test_params))
603
+
604
+ # Categorical Variables
605
+ categoricals = ["year", "weekday", "month"]
606
+ for feature in categoricals:
607
+ data[feature] = pd.factorize(data[feature], sort=True)[0]
608
+
609
+ # ### Run Cross-Validation
610
+ labels = sorted(data.filter(like="fwd").columns)
611
+ features = data.columns.difference(labels).tolist()
612
+ label_dict = dict(zip(lookaheads, labels))
613
+ num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
614
+ num_boost_round = num_iterations[-1]
615
+
616
+ metric_cols = (
617
+ param_names
618
+ + [
619
+ "t",
620
+ "daily_ic_mean",
621
+ "daily_ic_mean_n",
622
+ "daily_ic_median",
623
+ "daily_ic_median_n",
624
+ ]
625
+ + [str(n) for n in num_iterations]
626
+ )
627
+
628
+ for lookahead, train_length, test_length in test_params:
629
+ # randomized grid search
630
+ cvp = np.random.choice(
631
+ list(range(n_params)), size=int(n_params / 2), replace=False
632
+ )
633
+ cv_params_ = [cv_params[i] for i in cvp]
634
+
635
+ # set up cross-validation
636
+ n_splits = int(2 * YEAR / test_length)
637
+ print(
638
+ f"Lookahead: {lookahead:2.0f} | "
639
+ f"Train: {train_length:3.0f} | "
640
+ f"Test: {test_length:2.0f} | "
641
+ f"Params: {len(cv_params_):3.0f} | "
642
+ f"Train configs: {len(test_params)}"
643
+ )
644
+
645
+ # time-series cross-validation
646
+ cv = MultipleTimeSeriesCV(
647
+ n_splits=n_splits,
648
+ lookahead=lookahead,
649
+ test_period_length=test_length,
650
+ train_period_length=train_length,
651
+ )
652
+
653
+ label = label_dict[lookahead]
654
+ outcome_data = data.loc[:, features + [label]].dropna()
655
+
656
+ # LightGBM Dataset (free_raw_data=False keeps the raw data so folds can be subset below)
657
+ lgb_data = lgb.Dataset(
658
+ data=outcome_data.drop(label, axis=1),
659
+ label=outcome_data[label],
660
+ categorical_feature=categoricals,
661
+ free_raw_data=False,
662
+ )
663
+ T = 0
664
+ predictions, metrics = [], []
665
+
666
+ # iterate over (shuffled) hyperparameter combinations
667
+ for p, param_vals in enumerate(cv_params_):
668
+ key = f"{lookahead}/{train_length}/{test_length}/" + "/".join(
669
+ [str(p) for p in param_vals]
670
+ )
671
+ params = dict(zip(param_names, param_vals))
672
+ params.update(base_params)
673
+
674
+ start = time()
675
+ cv_preds = []
676
+
677
+ # iterate over folds
678
+ for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
679
+ # select train subset
680
+ lgb_train = lgb_data.subset(
681
+ used_indices=train_idx.tolist(), params=params
682
+ ).construct()
683
+
684
+ # train model for num_boost_round
685
+ model = lgb.train(
686
+ params=params,
687
+ train_set=lgb_train,
688
+ num_boost_round=num_boost_round,
689
+ )
690
+ # log feature importance
691
+ if i == 0:
692
+ fi = get_fi(model).to_frame()
693
+ else:
694
+ fi[i] = get_fi(model)
695
+
696
+ # capture predictions
697
+ test_set = outcome_data.iloc[test_idx, :]
698
+ X_test = test_set.loc[:, model.feature_name()]
699
+ y_test = test_set.loc[:, label]
700
+ y_pred = {
701
+ str(n): model.predict(X_test, num_iteration=n)
702
+ for n in num_iterations
703
+ }
704
+
705
+ # record predictions for each fold
706
+ cv_preds.append(
707
+ y_test.to_frame("y_test").assign(**y_pred).assign(i=i)
708
+ )
709
+
710
+ # combine fold results
711
+ cv_preds = pd.concat(cv_preds).assign(**params)
712
+ predictions.append(cv_preds)
713
+
714
+ # compute IC per day
715
+ by_day = cv_preds.groupby(level="date")
716
+ ic_by_day = pd.concat(
717
+ [
718
+ by_day.apply(
719
+ lambda x: spearmanr(x.y_test, x[str(n)])[0]
720
+ ).to_frame(n)
721
+ for n in num_iterations
722
+ ],
723
+ axis=1,
724
+ )
725
+ daily_ic_mean = ic_by_day.mean()
726
+ daily_ic_mean_n = daily_ic_mean.idxmax()
727
+ daily_ic_median = ic_by_day.median()
728
+ daily_ic_median_n = daily_ic_median.idxmax()
729
+
730
+ # compute IC across all predictions
731
+ ic = [
732
+ spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
733
+ for n in num_iterations
734
+ ]
735
+ t = time() - start
736
+ T += t
737
+
738
+ # collect metrics
739
+ metrics = pd.Series(
740
+ list(param_vals)
741
+ + [
742
+ t,
743
+ daily_ic_mean.max(),
744
+ daily_ic_mean_n,
745
+ daily_ic_median.max(),
746
+ daily_ic_median_n,
747
+ ]
748
+ + ic,
749
+ index=metric_cols,
750
+ )
751
+ if verbose:
752
+ msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
753
+ msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
754
+ msg += f" {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}"
755
+ print(msg)
756
+
757
+ # persist results for given CV run and hyperparameter combination
758
+ metrics.to_hdf(path_or_buf=self.trainstore, key="metrics/" + key)
759
+ ic_by_day.assign(**params).to_hdf(
760
+ path_or_buf=self.trainstore, key="daily_ic/" + key
761
+ )
762
+ fi.T.describe().T.assign(**params).to_hdf(
763
+ path_or_buf=self.trainstore, key="fi/" + key
764
+ )
765
+ cv_preds.to_hdf(
766
+ path_or_buf=self.trainstore, key="predictions/" + key, append=True
767
+ )
768
+
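+ # Example (names are placeholders): `model.fit(model.load_model_data(), verbose=True)`.
+ # Each lookahead/hyperparameter combination writes its results to `self.trainstore`
+ # under the `metrics/`, `daily_ic/`, `fi/` and `predictions/` keys, which `evaluate`
+ # reads back afterwards.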
769
+ def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
770
+ with pd.HDFStore(self.trainstore) as store:
771
+ for i, key in enumerate(
772
+ [k[1:] for k in store.keys() if k[1:].startswith("metrics")]
773
+ ):
774
+ _, t, train_length, test_length = key.split("/")[:4]
775
+ attrs = {
776
+ "lookahead": t,
777
+ "train_length": train_length,
778
+ "test_length": test_length,
779
+ }
780
+ s = store[key].to_dict()
781
+ s.update(attrs)
782
+ if i == 0:
783
+ lgb_metrics = pd.Series(s).to_frame(i)
784
+ else:
785
+ lgb_metrics[i] = pd.Series(s)
786
+
787
+ id_vars = scope_params + lgb_train_params + daily_ic_metrics
788
+ lgb_metrics = (
789
+ pd.melt(
790
+ lgb_metrics.T.drop("t", axis=1),
791
+ id_vars=id_vars,
792
+ value_name="ic",
793
+ var_name="boost_rounds",
794
+ )
795
+ .dropna()
796
+ .apply(pd.to_numeric)
797
+ )
798
+ return lgb_metrics
799
+
800
+ def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
801
+ lgb_ic = []
802
+ with pd.HDFStore(self.trainstore) as store:
803
+ keys = [k[1:] for k in store.keys()]
804
+ for key in keys:
805
+ _, t, train_length, test_length = key.split("/")[:4]
806
+ if key.startswith("daily_ic"):
807
+ df = (
808
+ store[key]
809
+ .drop(["boosting", "objective", "verbose"], axis=1)
810
+ .assign(
811
+ lookahead=t,
812
+ train_length=train_length,
813
+ test_length=test_length,
814
+ )
815
+ )
816
+ lgb_ic.append(df)
817
+ lgb_ic = pd.concat(lgb_ic).reset_index()
818
+ lgb_ic = pd.melt(
819
+ lgb_ic, id_vars=id_vars, value_name="ic", var_name="boost_rounds"
820
+ ).dropna()
821
+ lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
822
+ return lgb_ic
823
+
824
+ def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
825
+ param_cols = scope_params[1:] + lgb_train_params + ["boost_rounds"]
826
+ df = data[data.lookahead == t].sort_values("ic", ascending=False).iloc[best]
827
+ return df.loc[param_cols]
828
+
829
+ def _get_lgb_key(self, t, p):
830
+ key = f"{t}/{int(p.train_length)}/{int(p.test_length)}/{p.learning_rate}/"
831
+ return (
832
+ key + f"{int(p.num_leaves)}/{p.feature_fraction}/{int(p.min_data_in_leaf)}"
833
+ )
834
+
835
+ def _select_ic(self, params, ic_data, lookahead):
836
+ return ic_data.loc[
837
+ (ic_data.lookahead == lookahead)
838
+ & (ic_data.train_length == params.train_length)
839
+ & (ic_data.test_length == params.test_length)
840
+ & (ic_data.learning_rate == params.learning_rate)
841
+ & (ic_data.num_leaves == params.num_leaves)
842
+ & (ic_data.feature_fraction == params.feature_fraction)
843
+ & (ic_data.boost_rounds == params.boost_rounds),
844
+ ["date", "ic"],
845
+ ].set_index("date")
846
+
847
+ def get_trade_prices(self, tickers, start, end):
848
+ idx = pd.IndexSlice
849
+ with pd.HDFStore(self.datastore) as store:
850
+ data = store.select("stock_data")
851
+ data = data.set_index(["symbol", "date"]).sort_index()
852
+ data = data[~data.index.duplicated()]
853
+ return (
854
+ data.loc[idx[tickers, start:end], "open"]
855
+ .unstack("symbol")
856
+ .sort_index()
857
+ .shift(-1)
858
+ .tz_convert("UTC")
859
+ )
860
+
861
+ def plot_ic(self, lgb_ic, lgb_daily_ic, scope_params, lgb_train_params):
862
+ fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
863
+ axes = axes.flatten()
864
+ for i, t in enumerate([1, 21]):
865
+ params = self._get_lgb_params(
866
+ lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0
867
+ )
868
+ data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
869
+ rolling = data.rolling(63).ic.mean().dropna()
870
+ avg = data.ic.mean()
871
+ med = data.ic.median()
872
+ rolling.plot(
873
+ ax=axes[i],
874
+ title=f"Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}",
875
+ )
876
+ axes[i].axhline(avg, c="darkred", lw=1)
877
+ axes[i].axhline(0, ls="--", c="k", lw=1)
878
+
879
+ fig.suptitle("3-Month Rolling Information Coefficient", fontsize=16)
880
+ fig.tight_layout()
881
+ fig.subplots_adjust(top=0.92)
882
+
883
+ def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
884
+ # Visualization
885
+ sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)
886
+
887
+ sns.catplot(
888
+ x="lookahead",
889
+ y="ic",
890
+ col="train_length",
891
+ row="test_length",
892
+ data=lgb_metrics,
893
+ kind="box",
894
+ )
895
+ sns.catplot(
896
+ x="boost_rounds",
897
+ y="ic",
898
+ col="train_length",
899
+ row="test_length",
900
+ data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
901
+ kind="box",
902
+ )
903
+
904
+ def get_best_predictions(
905
+ self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10
906
+ ):
907
+ for best in range(topn):
908
+ best_params = self._get_lgb_params(
909
+ lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best
910
+ )
911
+ key = self._get_lgb_key(lookahead, best_params)
912
+ rounds = str(int(best_params.boost_rounds))
913
+ if best == 0:
914
+ best_predictions = pd.read_hdf(self.trainstore, "predictions/" + key)
915
+ best_predictions = best_predictions[rounds].to_frame(best)
916
+ else:
917
+ best_predictions[best] = pd.read_hdf(
918
+ self.trainstore, "predictions/" + key
919
+ )[rounds]
920
+ best_predictions = best_predictions.sort_index()
921
+ best_predictions.reset_index().to_hdf(
922
+ path_or_buf=self.outstore, key=f"lgb/train/{lookahead:02}"
923
+ )
924
+ return best_predictions
925
+
926
+ def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
927
+ # Compute Alphalens metrics
928
+ mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
929
+ factor_data,
930
+ by_date=True,
931
+ by_group=False,
932
+ demeaned=True,
933
+ group_adjust=False,
934
+ )
935
+ factor_returns = perf.factor_returns(factor_data)
936
+ mean_quant_ret, std_quantile = perf.mean_return_by_quantile(
937
+ factor_data, by_group=False, demeaned=True
938
+ )
939
+
940
+ mean_quant_rateret = mean_quant_ret.apply(
941
+ rate_of_return, axis=0, base_period=mean_quant_ret.columns[0]
942
+ )
943
+
944
+ mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
953
+ rate_of_return,
954
+ base_period=mean_quant_ret_bydate.columns[0],
955
+ )
956
+
957
+ compstd_quant_daily = std_quant_daily.apply(
958
+ std_conversion, base_period=std_quant_daily.columns[0]
959
+ )
960
+
961
+ alpha_beta = perf.factor_alpha_beta(factor_data, demeaned=True)
962
+
963
+ mean_ret_spread_quant, std_spread_quant = perf.compute_mean_returns_spread(
964
+ mean_quant_rateret_bydate,
965
+ factor_data["factor_quantile"].max(),
966
+ factor_data["factor_quantile"].min(),
967
+ std_err=compstd_quant_daily,
968
+ )
969
+ if verbose:
970
+ print(
971
+ mean_ret_spread_quant.mean()
972
+ .mul(10000)
973
+ .to_frame("Mean Period Wise Spread (bps)")
974
+ .join(alpha_beta.T)
975
+ .T
976
+ )
977
+
978
+ fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
979
+
980
+ plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
981
+ plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
982
+ axes[0].set_xlabel("Quantile")
983
+
984
+ plotting.plot_cumulative_returns_by_quantile(
985
+ mean_quant_ret_bydate["1D"],
986
+ freq=pd.tseries.offsets.BDay(),
987
+ period="1D",
988
+ ax=axes[1],
989
+ )
990
+ axes[1].set_title("Cumulative Return by Quantile (1D Period)")
991
+
992
+ title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
993
+ plotting.plot_cumulative_returns(
994
+ factor_returns["1D"],
995
+ period="1D",
996
+ freq=pd.tseries.offsets.BDay(),
997
+ title=title,
998
+ ax=axes[2],
999
+ )
1000
+
1001
+ fig.suptitle("Alphalens - Validation Set Performance", fontsize=14)
1002
+ fig.tight_layout()
1003
+ fig.subplots_adjust(top=0.85)
1004
+
1005
+ # Summary and full tearsheets (only when requested)
+ if tearsheet:
1006
+ create_summary_tear_sheet(factor_data)
1007
+ create_full_tear_sheet(factor_data)
1008
+
1009
+ def evaluate(self, remove_instore=False, lookahead=1, verbose=True):
1010
+ scope_params = ["lookahead", "train_length", "test_length"]
1011
+ daily_ic_metrics = [
1012
+ "daily_ic_mean",
1013
+ "daily_ic_mean_n",
1014
+ "daily_ic_median",
1015
+ "daily_ic_median_n",
1016
+ ]
1017
+ lgb_train_params = [
1018
+ "learning_rate",
1019
+ "num_leaves",
1020
+ "feature_fraction",
1021
+ "min_data_in_leaf",
1022
+ ]
1023
+
1024
+ lgb_metrics = self._get_lgb_metrics(
1025
+ scope_params, lgb_train_params, daily_ic_metrics
1026
+ )
1027
+ # Summary Metrics by Fold
1028
+ lgb_metrics.to_hdf(path_or_buf=self.outstore, key="lgb/metrics")
1029
+
1030
+ # Information Coefficient by Day
1031
+ int_cols = ["lookahead", "train_length", "test_length", "boost_rounds"]
1032
+ id_vars = ["date"] + scope_params + lgb_train_params
1033
+ lgb_ic = self._get_lgb_ic(int_cols, scope_params, lgb_train_params, id_vars)
1034
+ lgb_ic.to_hdf(path_or_buf=self.outstore, key="lgb/ic")
1035
+ lgb_daily_ic = (
1036
+ lgb_ic.groupby(id_vars[1:] + ["boost_rounds"])
1037
+ .ic.mean()
1038
+ .to_frame("ic")
1039
+ .reset_index()
1040
+ )
1041
+ lgb_daily_ic.to_hdf(path_or_buf=self.outstore, key="lgb/daily_ic")
1042
+
1043
+ # Cross-validation Result: Best Hyperparameters
1044
+ if verbose:
1045
+ print(
1046
+ lgb_daily_ic.groupby("lookahead", group_keys=False).apply(
1047
+ lambda x: x.nlargest(3, "ic")
1048
+ )
1049
+ )
1050
+ lgb_metrics.groupby("lookahead", group_keys=False).apply(
1054
+ lambda x: x.nlargest(3, "ic")
1055
+ ).to_hdf(path_or_buf=self.outstore, key="lgb/best_model")
1056
+ if verbose:
1057
+ print(
1058
+ lgb_metrics.groupby("lookahead", group_keys=False).apply(
1059
+ lambda x: x.nlargest(3, "daily_ic_mean")
1060
+ )
1061
+ )
1062
+
1063
+ # Visualization
1064
+ if verbose:
1065
+ self.plot_metrics(lgb_metrics, lgb_daily_ic, t=lookahead)
1066
+
1067
+ # AlphaLens Analysis - Validation Performance
1068
+ lgb_daily_ic = pd.read_hdf(self.outstore, "lgb/daily_ic")
1069
+ best_params = self._get_lgb_params(
1070
+ lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=0
1071
+ )
1072
+ best_params.to_hdf(path_or_buf=self.outstore, key="lgb/best_params")
1073
+
1074
+ if verbose:
1075
+ self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
1076
+
1077
+ # Get Predictions for Validation Period
1078
+ best_predictions = self.get_best_predictions(
1079
+ lgb_daily_ic, scope_params, lgb_train_params, lookahead=lookahead, topn=10
1080
+ )
1081
+ test_tickers = best_predictions.index.get_level_values("symbol").unique()
1082
+ start = best_predictions.index.get_level_values("date").min()
1083
+ end = best_predictions.index.get_level_values("date").max()
1084
+ trade_prices = self.get_trade_prices(test_tickers, start, end)
1085
+ pd.Series(test_tickers).to_hdf(path_or_buf=self.outstore, key="lgb/tickers")
1086
+ # We average the top five models and provide the corresponding prices to Alphalens,
1087
+ # in order to compute the mean period-wise
1088
+ # return earned on an equal-weighted portfolio invested in the daily factor quintiles
1089
+ # for various holding periods:
1090
+ factor = (
1091
+ best_predictions.iloc[:, :5]
1092
+ .mean(1)
1093
+ .dropna()
1094
+ .tz_convert("UTC", level="date")
1095
+ .swaplevel()
1096
+ )
1097
+ # Create AlphaLens Inputs
1098
+ if verbose:
1099
+ factor_data = get_clean_factor_and_forward_returns(
1100
+ factor=factor,
1101
+ prices=trade_prices,
1102
+ quantiles=5,
1103
+ periods=(1, 5, 10, 21),
1104
+ max_loss=1,
1105
+ )
1106
+ self.apply_alphalen_analysis(factor_data, tearsheet=True, verbose=True)
1107
+ # Delete the temporary files
1108
+ if remove_instore:
1109
+ os.remove(self.trainstore)
1110
+
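+ # Typical usage (placeholders): `model.evaluate(remove_instore=False, lookahead=1)`.
+ # This summarises the cross-validation results, stores the best parameters and the
+ # validation predictions in `self.outstore`, and (when verbose) runs the Alphalens
+ # analysis on the averaged top-five model predictions.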
1111
+ def make_predictions(
1112
+ self, data: pd.DataFrame, mode="test", lookahead=1, verbose=True
1113
+ ):
1114
+ data = data.copy()
1115
+ YEAR = 252
1116
+ scope_params = ["lookahead", "train_length", "test_length"]
1117
+ lgb_train_params = [
1118
+ "learning_rate",
1119
+ "num_leaves",
1120
+ "feature_fraction",
1121
+ "min_data_in_leaf",
1122
+ ]
1123
+
1124
+ base_params = dict(boosting="gbdt", objective="regression", verbose=-1)
1125
+
1126
+ categoricals = ["year", "month", "weekday"]
1127
+ labels = sorted(data.filter(like="_fwd").columns)
1128
+ features = data.columns.difference(labels).tolist()
1129
+ label = f"r{lookahead:02}_fwd"
1130
+ for feature in categoricals:
1131
+ data[feature] = pd.factorize(data[feature], sort=True)[0]
1132
+
1133
+ if mode == "test":
1134
+ data = data.dropna().sort_index()
1135
+ elif mode == "live":
1136
+ data[labels] = data[labels].fillna(0)
1137
+ data = data.sort_index().dropna()
1138
+
1139
+ lgb_data = lgb.Dataset(
1140
+ data=data[features],
1141
+ label=data[label],
1142
+ categorical_feature=categoricals,
1143
+ free_raw_data=False,
1144
+ )
1145
+ # Generate predictions
1146
+ lgb_daily_ic = pd.read_hdf(self.outstore, "lgb/daily_ic")
1147
+
1148
+ for position in range(10):
1149
+ params = self._get_lgb_params(
1150
+ lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position
1151
+ )
1152
+
1153
+ params = params.to_dict()
1154
+
1155
+ for p in ["min_data_in_leaf", "num_leaves"]:
1156
+ params[p] = int(params[p])
1157
+ train_length = int(params.pop("train_length"))
1158
+ test_length = int(params.pop("test_length"))
1159
+ num_boost_round = int(params.pop("boost_rounds"))
1160
+ params.update(base_params)
1161
+ if verbose:
1162
+ print(f"\nPosition: {position:02}")
1163
+
1164
+ # 1-year out-of-sample period
1165
+ n_splits = int(YEAR / test_length)
1166
+ cv = MultipleTimeSeriesCV(
1167
+ n_splits=n_splits,
1168
+ test_period_length=test_length,
1169
+ lookahead=lookahead,
1170
+ train_period_length=train_length,
1171
+ )
1172
+
1173
+ predictions = []
1174
+ for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
1175
+ if verbose:
1176
+ print(i, end=" ", flush=True)
1177
+ lgb_train = lgb_data.subset(
1178
+ used_indices=train_idx.tolist(), params=params
1179
+ ).construct()
1180
+
1181
+ model = lgb.train(
1182
+ params=params,
1183
+ train_set=lgb_train,
1184
+ num_boost_round=num_boost_round,
1185
+ )
1186
+
1187
+ test_set = data.iloc[test_idx, :]
1188
+ y_test = test_set.loc[:, label].to_frame("y_test")
1189
+ y_pred = model.predict(test_set.loc[:, model.feature_name()])
1190
+ predictions.append(y_test.assign(prediction=y_pred))
1191
+
1192
+ if position == 0:
1193
+ test_predictions = pd.concat(predictions).rename(
1194
+ columns={"prediction": position}
1195
+ )
1196
+ else:
1197
+ test_predictions[position] = pd.concat(predictions).prediction
1198
+
1199
+ by_day = test_predictions.groupby(level="date")
1200
+ for position in range(10):
1201
+ if position == 0:
1202
+ ic_by_day = by_day.apply(
1203
+ lambda x: spearmanr(x.y_test, x[position])[0]
1204
+ ).to_frame()
1205
+ else:
1206
+ ic_by_day[position] = by_day.apply(
1207
+ lambda x: spearmanr(x.y_test, x[position])[0]
1208
+ )
1209
+ if verbose:
1210
+ print(ic_by_day.describe())
1211
+ test_predictions.reset_index().to_hdf(
1212
+ path_or_buf=self.outstore, key=f"lgb/test/{lookahead:02}"
1213
+ )
1214
+ return test_predictions
1215
+
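+ # Sketch of usage (placeholders): out-of-sample predictions for a 1-day horizon,
+ #   preds = model.make_predictions(model.load_model_data(), mode="test", lookahead=1)
+ # With mode="live" the forward-return labels are zero-filled so that the most recent
+ # rows are kept and scored instead of being dropped.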
1216
+ def load_predictions(self, predictions=None, lookahead=1):
1217
+ if predictions is None:
1218
+ predictions = pd.concat(
1219
+ [
1220
+ pd.read_hdf(self.outstore, f"lgb/train/{lookahead:02}"),
1221
+ pd.read_hdf(self.outstore, f"lgb/test/{lookahead:02}").drop(
1222
+ "y_test", axis=1
1223
+ ),
1224
+ ]
1225
+ )
1226
+ predictions = predictions.set_index(["symbol", "date"])
1227
+
1228
+ predictions = (
1229
+ predictions.loc[~predictions.index.duplicated()]
1230
+ .iloc[:, :10]
1231
+ .mean(1)
1232
+ .sort_index()
1233
+ .dropna()
1234
+ .to_frame("prediction")
1235
+ )
1236
+ tickers = predictions.index.get_level_values("symbol").unique().tolist()
1237
+ try:
1238
+ return (predictions.unstack("symbol").prediction.tz_convert("UTC")), tickers
1239
+ except TypeError:
1240
+ return (predictions.unstack("symbol").prediction.tz_localize("UTC")), tickers
1241
+
1242
+ def assert_last_date(self, predictions: pd.DataFrame):
1243
+ """
1244
+ Useful in live trading to ensure that the last date in the predictions
1245
+ is the previous day, so it predicts today's returns.
1246
+ """
1247
+ last_date = predictions.index.get_level_values("date").max()
1248
+ try:
1249
+ if last_date.tzinfo is None:
1250
+ last_date = last_date.tz_localize("UTC")
1251
+ else:
1252
+ last_date = last_date.tz_convert("UTC")
1253
+ last_date = last_date.normalize()
1254
+ except Exception as e:
1255
+ print(f"Error getting last date: {e}")
1256
+ try:
1257
+ days = 3 if datetime.now().strftime("%A") == "Monday" else 1
1258
+ td = (
1259
+ last_date
1260
+ - (pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)).normalize()
1261
+ )
1262
+ assert (
1263
+ td.days == days or last_date == (pd.Timestamp.now(tz="UTC")).normalize()
1264
+ )
1265
+ return True
1266
+ except AssertionError:
1267
+ return False
1268
+
1269
+ def clean_stores(self, *stores):
1270
+ for store in stores:
1271
+ if os.path.exists(store):
1272
+ os.remove(store)
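+ # A hedged end-to-end sketch of how the pieces above fit together; the constructor
+ # arguments and the ticker list are placeholders, not part of this module:
+ #   model = LightGBModel(...)                      # paths to data/train/out stores
+ #   prices = model.download_boosting_data(tickers, start="2015-01-01")
+ #   data = model.prepare_boosting_data(prices)
+ #   model.fit(data)
+ #   model.evaluate(lookahead=1)
+ #   preds = model.make_predictions(data, mode="test", lookahead=1)
+ #   predictions, universe = model.load_predictions(lookahead=1)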