bbstrader 0.2.4__py3-none-any.whl


bbstrader/models/ml.py ADDED
@@ -0,0 +1,1264 @@
1
+ import os
2
+ import warnings
3
+ from datetime import datetime
4
+ from itertools import product
5
+ from time import time
6
+
7
+ import lightgbm as lgb
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import pandas as pd
11
+ import seaborn as sns
12
+
13
+ import yfinance as yf
14
+ from alphalens import performance as perf
15
+ from alphalens import plotting
16
+ from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
17
+ from alphalens.utils import (
18
+ get_clean_factor_and_forward_returns,
19
+ rate_of_return,
20
+ std_conversion,
21
+ )
22
+ from scipy.stats import spearmanr
23
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
24
+ import pandas_ta as ta
25
+
26
+ warnings.filterwarnings("ignore")
27
+
28
+
29
+ __all__ = ["OneStepTimeSeriesSplit", "MultipleTimeSeriesCV", "LightGBModel"]
30
+
31
+
32
+ class OneStepTimeSeriesSplit:
+ """Generates tuples of (train_idx, test_idx) pairs.
+ Assumes the index contains a level labeled 'date'."""
+
+ __author__ = "Stefan Jansen"
36
+
37
+ def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
38
+ self.n_splits = n_splits
39
+ self.test_period_length = test_period_length
40
+ self.shuffle = shuffle
41
+
42
+ @staticmethod
43
+ def chunks(l, n): # noqa: E741
44
+ for i in range(0, len(l), n):
45
+ yield l[i : i + n]
46
+
47
+ def split(self, X: pd.DataFrame, y=None, groups=None):
48
+ unique_dates = (
49
+ X.index.get_level_values("date")
50
+ .unique()
51
+ .sort_values(ascending=False)[: self.n_splits * self.test_period_length]
52
+ )
53
+
54
+ dates = X.reset_index()[["date"]]
55
+ for test_date in self.chunks(unique_dates, self.test_period_length):
56
+ train_idx = dates[dates.date < min(test_date)].index
57
+ test_idx = dates[dates.date.isin(test_date)].index
58
+ if self.shuffle:
+ # shuffle the index positions themselves; shuffling a temporary list() copy has no effect
+ train_idx = np.random.permutation(train_idx)
+ yield train_idx, test_idx
61
+
62
+ def get_n_splits(self, X, y, groups=None):
63
+ return self.n_splits
64
+
65
+
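+ # Usage sketch for the splitter above (illustrative; `features` and `labels` are
+ # placeholder names for a DataFrame/Series whose index has a 'date' level):
+ #
+ #   cv = OneStepTimeSeriesSplit(n_splits=3, test_period_length=21)
+ #   for train_idx, test_idx in cv.split(features):
+ #       X_train, y_train = features.iloc[train_idx], labels.iloc[train_idx]
+ #       X_test, y_test = features.iloc[test_idx], labels.iloc[test_idx]
+ #       # fit and score a model on each fold here
+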
66
+ class MultipleTimeSeriesCV:
+ """
+ Generates tuples of (train_idx, test_idx) pairs.
+ Assumes the MultiIndex contains levels 'symbol' and 'date' and
+ purges overlapping outcomes using the ``lookahead`` gap.
+ """
+
+ __author__ = "Stefan Jansen"
73
+
74
+ def __init__(
75
+ self,
76
+ n_splits=3,
77
+ train_period_length=126,
78
+ test_period_length=21,
79
+ lookahead=None,
80
+ date_idx="date",
81
+ shuffle=False,
82
+ ):
83
+ self.n_splits = n_splits
84
+ self.lookahead = lookahead
85
+ self.test_length = test_period_length
86
+ self.train_length = train_period_length
87
+ self.shuffle = shuffle
88
+ self.date_idx = date_idx
89
+
90
+ def split(self, X: pd.DataFrame, y=None, groups=None):
91
+ unique_dates = X.index.get_level_values(self.date_idx).unique()
92
+ days = sorted(unique_dates, reverse=True)
93
+ split_idx = []
94
+ for i in range(self.n_splits):
95
+ test_end_idx = i * self.test_length
96
+ test_start_idx = test_end_idx + self.test_length
97
+ train_end_idx = test_start_idx + self.lookahead - 1
98
+ train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
99
+ split_idx.append(
100
+ [train_start_idx, train_end_idx, test_start_idx, test_end_idx]
101
+ )
102
+
103
+ dates = X.reset_index()[[self.date_idx]]
104
+ for train_start, train_end, test_start, test_end in split_idx:
105
+ train_idx = dates[
106
+ (dates[self.date_idx] > days[train_start])
107
+ & (dates[self.date_idx] <= days[train_end])
108
+ ].index
109
+ test_idx = dates[
110
+ (dates[self.date_idx] > days[test_start])
111
+ & (dates[self.date_idx] <= days[test_end])
112
+ ].index
113
+ train_idx = train_idx.to_numpy()
+ if self.shuffle:
+ # shuffle in place; shuffling a temporary list() copy has no effect
+ np.random.shuffle(train_idx)
+ yield train_idx, test_idx.to_numpy()
116
+
117
+ def get_n_splits(self, X, y, groups=None):
118
+ return self.n_splits
119
+
120
+
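+ # Example of the windowing implemented above (illustrative): with
+ # train_period_length=252, test_period_length=63 and lookahead=1, the most recent
+ # 63 trading days form the first test window, the preceding 252 days form its
+ # training set (lookahead - 1 days are dropped in between to purge labels that
+ # overlap the test window), and each further split shifts both windows back by
+ # 63 days.
+ #
+ #   cv = MultipleTimeSeriesCV(n_splits=4, train_period_length=252,
+ #                             test_period_length=63, lookahead=1)
+ #   for train_idx, test_idx in cv.split(data):
+ #       print(len(train_idx), len(test_idx))
+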
121
+ class LightGBModel(object):
122
+ """
123
+ ``LightGBModel`` encapsulates a complete workflow for training and evaluating
+ a ``LightGBM`` (Light Gradient Boosting Machine) model for predicting stock returns.
+ It covers data acquisition, feature engineering, model tuning, and performance
+ evaluation using the ``information coefficient (IC)`` and Alphalens analysis.
127
+
128
+ Key Features
129
+ ------------
130
+ - ``HDF5 Storage``: Utilizes ``pandas.HDFStore`` for efficient storage and retrieval
131
+ of large datasets, which is essential for backtesting on financial time series data.
132
+
133
+ - ``Time-Series Cross-Validation``: Employs a custom cross-validation strategy that
134
+ respects the time series nature of the data, avoiding data leakage.
135
+
136
+ - ``Hyperparameter Tuning``: Includes automated hyperparameter tuning using a randomized
137
+ grid search for optimization.
138
+
139
+ - ``Information Coefficient (IC)``: Uses IC as a core performance metric that quantifies
140
+ the predictive power of the model, which is a standard measure for ranking models in finance.
141
+
142
+ - ``Alphalens Integration``: Provides a comprehensive framework for validating model
143
+ performance using Alphalens, allowing for in-depth performance analysis, like backtesting
144
+ and return decomposition.
145
+
146
+ Use Case
147
+ --------
148
+ This class is designed for quantitative finance and algorithmic trading use cases where
149
+ the goal is to build a predictive model for stock returns based on historical data and
150
+ technical indicators. It follows a complete cycle from data acquisition to model validation
151
+ and provides the infrastructure needed for deployment of this model in a trading strategy.
152
+
153
+ Notes
154
+ -----
155
+ The implementation is inspired by the book "Machine Learning for Algorithmic Trading"
156
+ by Stefan Jansen.
157
+
158
+ References
159
+ ----------
160
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
161
+ Chapter 12, Boosting Your Trading Strategy.
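+
+ Example
+ -------
+ A minimal end-to-end sketch (tickers, dates and universe size are illustrative)::
+
+     model = LightGBModel()
+     prices = model.download_boosting_data(["AAPL", "MSFT", "NVDA"], start="2015-01-01")
+     data = model.prepare_boosting_data(prices, min_years=7, universe=3)
+     model.fit(data)
+     model.evaluate(lookahead=1)
+     predictions = model.make_predictions(data, mode="test", lookahead=1)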
162
+ """
163
+
164
+ def __init__(
165
+ self,
166
+ data: pd.DataFrame = None,
167
+ datastore: str = "lgbdata.h5",
+ trainstore: str = "lgbtrain.h5",
+ outstore: str = "lgbout.h5",
170
+ ):
171
+ """
172
+ Args:
173
+ data (pd.DataFrame): The input data for the model. It should be a DataFrame with a MultiIndex containing
174
+ 'symbol' and 'date' levels. If not provided, the data can be downloaded using the `download_boosting_data` method.
175
+ datastore (str): The path to the HDF5 file for storing the model data.
176
+ trainstore (str): The path to the HDF5 file for storing the training data.
177
+ outstore (str): The path to the HDF5 file for storing the output data.
178
+ """
179
+ self.datastore = datastore
180
+ self.trainstore = trainstore
181
+ self.outstore = outstore
182
+ if data is not None:
183
+ data.reset_index().to_hdf(path_or_buf=self.datastore, key="model_data")
184
+
185
+ def _compute_bb(self, close):
186
+ # Compute Bollinger Bands using pandas_ta
187
+ bb = ta.bbands(close, length=20)
188
+ return pd.DataFrame(
189
+ {"bb_high": bb["BBU_20_2.0"], "bb_low": bb["BBL_20_2.0"]}, index=close.index
190
+ )
191
+
192
+ def _compute_atr(self, stock_data):
193
+ # Compute ATR using pandas_ta
194
+ atr = ta.atr(stock_data.high, stock_data.low, stock_data.close, length=14)
195
+ return (atr - atr.mean()) / atr.std()
196
+
197
+ def _compute_macd(self, close):
198
+ # Compute MACD using pandas_ta
199
+ macd = ta.macd(close)["MACD_12_26_9"]
200
+ return (macd - macd.mean()) / macd.std()
201
+
202
+ def _add_technical_indicators(self, prices: pd.DataFrame):
203
+ prices = prices.copy()
204
+
205
+ # Add RSI and normalize
206
+ prices["rsi"] = (
207
+ prices.groupby(level="symbol")
208
+ .close.apply(lambda x: ta.rsi(x, length=14))
209
+ .reset_index(level=0, drop=True)
210
+ )
211
+
212
+ # Add Bollinger Bands
213
+ bb = prices.groupby(level="symbol").close.apply(self._compute_bb)
214
+ bb = bb.reset_index(level=1, drop=True)
215
+ prices = prices.join(bb)
216
+
217
+ prices["bb_high"] = (
218
+ prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
219
+ )
220
+ prices["bb_low"] = (
221
+ prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)
222
+ )
223
+
224
+ # Add ATR and normalize
225
+ prices["ATR"] = prices.groupby(level="symbol", group_keys=False).apply(
226
+ lambda x: self._compute_atr(x)
227
+ )
228
+
229
+ # Add MACD and normalize
230
+ prices["MACD"] = prices.groupby(level="symbol", group_keys=False).close.apply(
231
+ self._compute_macd
232
+ )
233
+
234
+ return prices
235
+
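+ # The block above leaves the features roughly scale-free: RSI from pandas_ta,
+ # log1p-scaled distances to the upper/lower Bollinger bands, and per-symbol
+ # z-scored ATR and MACD. A quick sanity check (illustrative, assuming `prices`
+ # has a (symbol, date) MultiIndex with open/high/low/close/volume columns):
+ #
+ #   feats = model._add_technical_indicators(prices)
+ #   print(feats[["rsi", "bb_high", "bb_low", "ATR", "MACD"]].describe())
+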
236
+ def download_boosting_data(self, tickers, start, end=None):
237
+ data = []
238
+ for ticker in tickers:
239
+ try:
240
+ prices = yf.download(
241
+ ticker,
242
+ start=start,
243
+ end=end,
244
+ progress=False,
245
+ multi_level_index=False,
246
+ )
247
+ prices["symbol"] = ticker
248
+ data.append(prices)
249
+ except Exception: # skip tickers that fail to download
+ continue
251
+ data = pd.concat(data)
252
+ data = (
253
+ data.rename(columns={s: s.lower().replace(" ", "_") for s in data.columns})
254
+ .drop(columns=["adj_close"])
255
+ .set_index("symbol", append=True)
256
+ .swaplevel()
257
+ .sort_index()
258
+ .dropna()
259
+ )
260
+ return data
261
+
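+ # download_boosting_data returns daily OHLCV history stacked into a single frame
+ # indexed by (symbol, date), with lower-cased column names and failed tickers
+ # silently skipped. Illustrative call:
+ #
+ #   prices = model.download_boosting_data(["AAPL", "MSFT"], start="2015-01-01")
+ #   prices.loc["AAPL"].tail()   # columns include open, high, low, close, volume
+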
262
+ def download_metadata(self, tickers):
263
+ def clean_text_column(series: pd.Series) -> pd.Series:
264
+ return (
265
+ series.str.lower()
266
+ # use regex=False for literal string replacements
267
+ .str.replace("-", "", regex=False)
268
+ .str.replace("&", "and", regex=False)
269
+ .str.replace(" ", "_", regex=False)
270
+ .str.replace("__", "_", regex=False)
271
+ )
272
+
273
+ metadata = [
274
+ "industry",
275
+ "sector",
276
+ "exchange",
277
+ "symbol",
278
+ "heldPercentInsiders",
279
+ "heldPercentInstitutions",
280
+ "overallRisk",
281
+ "shortRatio",
282
+ "dividendYield",
283
+ "beta",
284
+ "regularMarketVolume",
285
+ "averageVolume",
286
+ "averageVolume10days",
287
+ "bid",
288
+ "ask",
289
+ "bidSize",
290
+ "askSize",
291
+ "marketCap",
292
+ ]
293
+
294
+ columns = {
295
+ "industry": "industry",
296
+ "sector": "sector",
297
+ "exchange": "exchange",
298
+ "symbol": "symbol",
299
+ "heldPercentInsiders": "insiders",
300
+ "heldPercentInstitutions": "institutions",
301
+ "overallRisk": "risk",
302
+ "shortRatio": "short_ratio",
303
+ "dividendYield": "dyield",
304
+ "beta": "beta",
305
+ "regularMarketVolume": "regvolume",
306
+ "averageVolume": "avgvolume",
307
+ "averageVolume10days": "avgvolume10",
308
+ "bid": "bid",
309
+ "ask": "ask",
310
+ "bidSize": "bidsize",
311
+ "askSize": "asksize",
312
+ "marketCap": "marketcap",
313
+ }
314
+ data = []
315
+ for symbol in tickers:
316
+ try:
317
+ symbol_info = yf.Ticker(symbol).info
318
+ except Exception: # skip tickers whose metadata cannot be fetched
+ continue
320
+ infos = {}
321
+ for info in metadata:
322
+ infos[info] = symbol_info.get(info)
323
+ data.append(infos)
324
+ metadata = pd.DataFrame(data)
325
+ metadata = metadata.rename(columns=columns)
326
+ metadata.dyield = metadata.dyield.fillna(0)
327
+ metadata.sector = clean_text_column(metadata.sector)
328
+ metadata.industry = clean_text_column(metadata.industry)
329
+ metadata = metadata.set_index("symbol")
330
+ return metadata
331
+
332
+ def _select_nlargest_liquidity_stocks(
333
+ self,
334
+ df: pd.DataFrame,
335
+ n: int,
336
+ volume_features,
337
+ bid_ask_features,
338
+ market_cap_feature,
339
+ ):
340
+ df = df.copy()
341
+ scaler = StandardScaler()
342
+
343
+ # Normalize features
344
+ df[volume_features] = scaler.fit_transform(df[volume_features])
345
+ df["bid_ask_spread"] = df["ask"] - df["bid"]
346
+ df["bid_ask_spread"] = scaler.fit_transform(df[["bid_ask_spread"]])
347
+ df[market_cap_feature] = scaler.fit_transform(df[market_cap_feature])
348
+
349
+ # Calculate Liquidity Score
350
+ # Assign weights to each component (these weights can be adjusted based on importance)
351
+ weights = {"volume": 0.4, "bid_ask_spread": 0.2, "marketCap": 0.4}
352
+
353
+ # Calculate the liquidity score by combining the normalized features
354
+ df["liquidity_score"] = (
355
+ weights["volume"] * df[volume_features].mean(axis=1)
356
+ + weights["bid_ask_spread"] * df["bid_ask_spread"]
357
+ + weights["marketCap"] * df[market_cap_feature[0]]
358
+ )
359
+ df_sorted = df.sort_values(by="liquidity_score", ascending=False)
360
+
361
+ return df_sorted.nlargest(n, "liquidity_score").index
362
+
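+ # The liquidity score above is a weighted sum of standardized inputs:
+ #   0.4 * mean(z-scored volume features) + 0.2 * z-scored (ask - bid) + 0.4 * z-scored market cap
+ # Illustrative call (column names follow the renaming done in download_metadata):
+ #
+ #   top = model._select_nlargest_liquidity_stocks(
+ #       metadata, 100,
+ #       volume_features=["regvolume", "avgvolume", "avgvolume10"],
+ #       bid_ask_features=["bid", "ask", "bidsize", "asksize"],
+ #       market_cap_feature=["marketcap"],
+ #   )
+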
363
+ def _encode_metadata(self, df: pd.DataFrame):
364
+ df = df.copy()
365
+ # Binning each numerical feature into categories
366
+ df["insiders"] = pd.qcut(
367
+ df["insiders"], q=4, labels=["Very Low", "Low", "High", "Very High"]
368
+ )
369
+ df["institutions"] = pd.qcut(
370
+ df["institutions"], q=4, labels=["Very Low", "Low", "High", "Very High"]
371
+ )
372
+ df["risk"] = pd.cut(
373
+ df["risk"],
374
+ bins=[-float("inf"), 3, 5, 7, float("inf")],
375
+ labels=["Low", "Medium", "High", "Very High"],
376
+ )
377
+ df["short_ratio"] = pd.qcut(
378
+ df["short_ratio"], q=4, labels=["Very Low", "Low", "High", "Very High"]
379
+ )
380
+ df["dyield"] = pd.cut(
381
+ df["dyield"],
382
+ bins=[-float("inf"), 0.002, 0.005, 0.01, float("inf")],
383
+ labels=["Very Low", "Low", "High", "Very High"],
384
+ )
385
+ df["beta"] = pd.cut(
386
+ df["beta"],
387
+ bins=[-float("inf"), 0.8, 1.0, 1.2, float("inf")],
388
+ labels=["Low", "Moderate", "High", "Very High"],
389
+ )
390
+
391
+ # Encode binned features
392
+ binned_features = [
393
+ "insiders",
394
+ "institutions",
395
+ "risk",
396
+ "short_ratio",
397
+ "dyield",
398
+ "beta",
399
+ "sector",
400
+ "industry",
401
+ "exchange",
402
+ ]
403
+ label_encoders = {}
404
+
405
+ for col in binned_features:
406
+ le = LabelEncoder()
407
+ df[col] = le.fit_transform(df[col])
408
+ label_encoders[col] = le
409
+ return df, label_encoders
410
+
411
+ def prepare_boosting_data(
412
+ self,
413
+ prices: pd.DataFrame,
414
+ metadata: pd.DataFrame = None,
415
+ min_years=7,
416
+ universe=500,
417
+ ):
418
+ if metadata is None:
419
+ mcap = False
420
+ tickers = prices.index.get_level_values("symbol").unique()
421
+ metadata = self.download_metadata(tickers)
422
+ else:
423
+ mcap = True
424
+ YEAR = 252
425
+ idx = pd.IndexSlice
426
+ percentiles = [0.001, 0.01, 0.02, 0.03, 0.04, 0.05]
427
+ percentiles += [1 - p for p in percentiles[::-1]]
428
+ T = [1, 5, 10, 21, 42, 63]
429
+
430
+ prices.volume /= 1e3 # make vol figures a bit smaller
431
+ prices.index.names = ["symbol", "date"]
432
+ metadata.index.name = "symbol"
433
+ prices.reset_index().to_hdf(path_or_buf=self.datastore, key="stock_data")
434
+ metadata.reset_index().to_hdf(path_or_buf=self.datastore, key="stock_metadata")
435
+
436
+ # Remove stocks with insufficient observations
437
+ min_obs = min_years * YEAR
438
+ nobs = prices.groupby(level="symbol").size()
439
+ keep = nobs[nobs > min_obs].index
440
+ prices = prices.loc[idx[keep, :], :]
441
+
442
+ # Remove duplicate (symbol, date) rows
+ prices = prices[~prices.index.duplicated()]
444
+
445
+ # Align price and meta data
446
+ metadata = metadata[~metadata.index.duplicated() & metadata.sector.notnull()]
447
+ metadata.sector = metadata.sector.str.lower().str.replace(" ", "_")
448
+ shared = (
449
+ prices.index.get_level_values("symbol")
450
+ .unique()
451
+ .intersection(metadata.index)
452
+ )
453
+ metadata = metadata.loc[shared, :]
454
+ prices = prices.loc[idx[shared, :], :]
455
+
456
+ # Limit universe
457
+ if mcap:
458
+ universe = metadata.marketcap.nlargest(universe).index
459
+ else:
460
+ volume_features = ["regvolume", "avgvolume", "avgvolume10"]
461
+ bid_ask_features = ["bid", "ask", "bidsize", "asksize"]
462
+ market_cap_feature = ["marketcap"]
463
+ to_drop = volume_features + bid_ask_features + market_cap_feature
464
+ universe = self._select_nlargest_liquidity_stocks(
465
+ metadata,
466
+ universe,
467
+ volume_features,
468
+ bid_ask_features,
469
+ market_cap_feature,
470
+ )
471
+ metadata = metadata.drop(to_drop, axis=1)
472
+ prices = prices.loc[idx[universe, :], :]
473
+ metadata = metadata.loc[universe]
474
+ metadata = self._encode_metadata(metadata)[0]
475
+
476
+ prices["dollar_vol"] = prices[["close", "volume"]].prod(1).div(1e3)
477
+ # compute dollar volume to determine universe
478
+ dollar_vol_ma = (
479
+ prices.dollar_vol.unstack("symbol")
480
+ .rolling(window=21, min_periods=1) # 1 trading month
481
+ .mean()
482
+ )
483
+
484
+ # Rank stocks by moving average
485
+ prices["dollar_vol_rank"] = (
486
+ dollar_vol_ma.rank(axis=1, ascending=False).stack("symbol").swaplevel()
487
+ )
488
+ # Add some Basic Factors
489
+ prices = self._add_technical_indicators(prices)
490
+ # Combine Price and Meta Data
491
+ prices = prices.join(metadata)
492
+
493
+ # Compute Returns
494
+ by_sym = prices.groupby(level="symbol").close
495
+ for t in T:
496
+ prices[f"r{t:02}"] = by_sym.pct_change(t)
497
+ # Daily historical return deciles
498
+ for t in T:
499
+ # Reset the index to apply qcut by date without grouping errors
500
+ prices[f"r{t:02}dec"] = (
501
+ prices.reset_index(level="date")
502
+ .groupby("date")[f"r{t:02}"]
503
+ .apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates="drop"))
504
+ .values
505
+ )
506
+ # Daily sector return deciles
507
+ for t in T:
508
+ prices[f"r{t:02}q_sector"] = prices.groupby(["date", "sector"])[
509
+ f"r{t:02}"
510
+ ].transform(lambda x: pd.qcut(x, q=5, labels=False, duplicates="drop"))
511
+ # Compute Forward Returns
512
+ for t in [1, 5, 21]:
513
+ prices[f"r{t:02}_fwd"] = prices.groupby(level="symbol")[f"r{t:02}"].shift(
514
+ -t
515
+ )
516
+
517
+ # Remove outliers
518
+ outliers = prices[prices.r01 > 1].index.get_level_values("symbol").unique()
519
+ prices = prices.drop(outliers, level="symbol")
520
+ # Create time and sector dummy variables
521
+ prices["year"] = prices.index.get_level_values("date").year
522
+ prices["month"] = prices.index.get_level_values("date").month
523
+ prices["weekday"] = prices.index.get_level_values("date").weekday
524
+ # Store Model Data
525
+ prices = prices.drop(["open", "close", "low", "high", "volume"], axis=1)
526
+ if "adj_close" in prices.columns:
527
+ prices = prices.drop("adj_close", axis=1)
528
+ prices.reset_index().to_hdf(path_or_buf=self.datastore, key="model_data")
529
+ return prices.sort_index()
530
+
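+ # prepare_boosting_data writes the engineered dataset to `datastore` under the key
+ # "model_data": lagged returns r01..r63 with daily deciles and sector quintiles,
+ # forward returns r01_fwd/r05_fwd/r21_fwd, the technical indicators, encoded
+ # metadata, dollar-volume rank and year/month/weekday columns. Illustrative call:
+ #
+ #   data = model.prepare_boosting_data(prices, metadata=None, min_years=7, universe=100)
+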
531
+ def tickers(self):
532
+ return pd.read_hdf(self.outstore, "lgb/tickers").tolist()
533
+
534
+ def load_model_data(self, key="model_data"):
535
+ return (
536
+ pd.read_hdf(self.datastore, key=key)
537
+ .set_index(["symbol", "date"])
538
+ .sort_index()
539
+ )
540
+
541
+ def format_time(self, t):
542
+ """Return a formatted time string 'HH:MM:SS
543
+ based on a numeric time() value"""
544
+ m, s = divmod(t, 60)
545
+ h, m = divmod(m, 60)
546
+ return f"{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}"
547
+
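+ # Example (illustrative): format_time(3725.0) returns "01:02:05".
+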
548
+ def fit(self, data: pd.DataFrame, verbose=True):
549
+ def get_fi(model):
550
+ """Return normalized feature importance as pd.Series"""
551
+ fi = model.feature_importance(importance_type="gain")
552
+ return pd.Series(fi / fi.sum(), index=model.feature_name())
553
+
554
+ def ic_lgbm(preds, train_data):
555
+ """Custom IC eval metric for lightgbm"""
556
+ is_higher_better = True
557
+ return "ic", spearmanr(preds, train_data.get_label())[0], is_higher_better
558
+
559
+ data = data.dropna()
560
+ # Hyperparameter options
561
+ YEAR = 252
562
+ base_params = dict(boosting="gbdt", objective="regression", verbose=-1)
563
+
564
+ # constraints on structure (depth) of each tree
565
+ max_depths = [2, 3, 5, 7]
566
+ num_leaves_opts = [2**i for i in max_depths]
567
+ min_data_in_leaf_opts = [250, 500, 1000]
568
+
569
+ # weight of each new tree in the ensemble
570
+ learning_rate_ops = [0.01, 0.1, 0.3]
571
+
572
+ # random feature selection
573
+ feature_fraction_opts = [0.3, 0.6, 0.95]
574
+
575
+ param_names = [
576
+ "learning_rate",
577
+ "num_leaves",
578
+ "feature_fraction",
579
+ "min_data_in_leaf",
580
+ ]
581
+
582
+ cv_params = list(
583
+ product(
584
+ learning_rate_ops,
585
+ num_leaves_opts,
586
+ feature_fraction_opts,
587
+ min_data_in_leaf_opts,
588
+ )
589
+ )
590
+ n_params = len(cv_params)
591
+ print(f"# Parameters: {n_params}")
592
+
593
+ # Train/Test Period Lengths
594
+ lookaheads = [1, 5, 21]
595
+ train_lengths = [int(4.5 * 252), 252]
596
+ test_lengths = [63]
597
+ test_params = list(product(lookaheads, train_lengths, test_lengths))
598
+ n = len(test_params)
599
+ test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
600
+ test_params = [test_params[i] for i in test_param_sample]
601
+ print("Train configs:", len(test_params))
602
+
603
+ # Categorical Variables
604
+ categoricals = ["year", "weekday", "month"]
605
+ for feature in categoricals:
606
+ data[feature] = pd.factorize(data[feature], sort=True)[0]
607
+
608
+ # ### Run Cross-Validation
609
+ labels = sorted(data.filter(like="fwd").columns)
610
+ features = data.columns.difference(labels).tolist()
611
+ label_dict = dict(zip(lookaheads, labels))
612
+ num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
613
+ num_boost_round = num_iterations[-1]
614
+
615
+ metric_cols = (
616
+ param_names
617
+ + [
618
+ "t",
619
+ "daily_ic_mean",
620
+ "daily_ic_mean_n",
621
+ "daily_ic_median",
622
+ "daily_ic_median_n",
623
+ ]
624
+ + [str(n) for n in num_iterations]
625
+ )
626
+
627
+ for lookahead, train_length, test_length in test_params:
628
+ # randomized grid search
629
+ cvp = np.random.choice(
630
+ list(range(n_params)), size=int(n_params / 2), replace=False
631
+ )
632
+ cv_params_ = [cv_params[i] for i in cvp]
633
+
634
+ # set up cross-validation
635
+ n_splits = int(2 * YEAR / test_length)
636
+ if verbose:
637
+ print(
638
+ f"Lookahead: {lookahead:2.0f} | "
639
+ f"Train: {train_length:3.0f} | "
640
+ f"Test: {test_length:2.0f} | "
641
+ f"Params: {len(cv_params_):3.0f} | "
642
+ f"Train configs: {len(test_params)}"
643
+ )
644
+
645
+ # time-series cross-validation
646
+ cv = MultipleTimeSeriesCV(
647
+ n_splits=n_splits,
648
+ lookahead=lookahead,
649
+ test_period_length=test_length,
650
+ train_period_length=train_length,
651
+ )
652
+
653
+ label = label_dict[lookahead]
654
+ outcome_data = data.loc[:, features + [label]].dropna()
655
+
656
+ # binary dataset
657
+ lgb_data = lgb.Dataset(
658
+ data=outcome_data.drop(label, axis=1),
659
+ label=outcome_data[label],
660
+ categorical_feature=categoricals,
661
+ free_raw_data=False,
662
+ )
663
+ T = 0
+ predictions = []
665
+
666
+ # iterate over (shuffled) hyperparameter combinations
667
+ for p, param_vals in enumerate(cv_params_):
668
+ key = f"{lookahead}/{train_length}/{test_length}/" + "/".join(
669
+ [str(p) for p in param_vals]
670
+ )
671
+ params = dict(zip(param_names, param_vals))
672
+ params.update(base_params)
673
+
674
+ start = time()
675
+ cv_preds = []
676
+
677
+ # iterate over folds
678
+ for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
679
+ # select train subset
680
+ lgb_train = lgb_data.subset(
681
+ used_indices=train_idx.tolist(), params=params
682
+ ).construct()
683
+
684
+ # train model for num_boost_round
685
+ model = lgb.train(
686
+ params=params,
687
+ train_set=lgb_train,
688
+ num_boost_round=num_boost_round,
689
+ )
690
+ # log feature importance
691
+ if i == 0:
692
+ fi = get_fi(model).to_frame()
693
+ else:
694
+ fi[i] = get_fi(model)
695
+
696
+ # capture predictions
697
+ test_set = outcome_data.iloc[test_idx, :]
698
+ X_test = test_set.loc[:, model.feature_name()]
699
+ y_test = test_set.loc[:, label]
700
+ y_pred = {
701
+ str(n): model.predict(X_test, num_iteration=n)
702
+ for n in num_iterations
703
+ }
704
+
705
+ # record predictions for each fold
706
+ cv_preds.append(
707
+ y_test.to_frame("y_test").assign(**y_pred).assign(i=i)
708
+ )
709
+
710
+ # combine fold results
711
+ cv_preds = pd.concat(cv_preds).assign(**params)
712
+ predictions.append(cv_preds)
713
+
714
+ # compute IC per day
715
+ by_day = cv_preds.groupby(level="date")
716
+ ic_by_day = pd.concat(
717
+ [
718
+ by_day.apply(
719
+ lambda x: spearmanr(x.y_test, x[str(n)])[0]
720
+ ).to_frame(n)
721
+ for n in num_iterations
722
+ ],
723
+ axis=1,
724
+ )
725
+ daily_ic_mean = ic_by_day.mean()
726
+ daily_ic_mean_n = daily_ic_mean.idxmax()
727
+ daily_ic_median = ic_by_day.median()
728
+ daily_ic_median_n = daily_ic_median.idxmax()
729
+
730
+ # compute IC across all predictions
731
+ ic = [
732
+ spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
733
+ for n in num_iterations
734
+ ]
735
+ t = time() - start
736
+ T += t
737
+
738
+ # collect metrics
739
+ metrics = pd.Series(
740
+ list(param_vals)
741
+ + [
742
+ t,
743
+ daily_ic_mean.max(),
744
+ daily_ic_mean_n,
745
+ daily_ic_median.max(),
746
+ daily_ic_median_n,
747
+ ]
748
+ + ic,
749
+ index=metric_cols,
750
+ )
751
+ if verbose:
752
+ msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
753
+ msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
754
+ msg += f" {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}"
755
+ print(msg)
756
+
757
+ # persist results for given CV run and hyperparameter combination
758
+ metrics.to_hdf(path_or_buf=self.trainstore, key="metrics/" + key)
759
+ ic_by_day.assign(**params).to_hdf(
760
+ path_or_buf=self.trainstore, key="daily_ic/" + key
761
+ )
762
+ fi.T.describe().T.assign(**params).to_hdf(
763
+ path_or_buf=self.trainstore, key="fi/" + key
764
+ )
765
+ cv_preds.to_hdf(
766
+ path_or_buf=self.trainstore, key="predictions/" + key, append=True
767
+ )
768
+
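+ # fit() persists one set of results per hyperparameter combination and CV
+ # configuration in `trainstore`, under keys of the form
+ #   metrics/<lookahead>/<train_length>/<test_length>/<learning_rate>/<num_leaves>/<feature_fraction>/<min_data_in_leaf>
+ # with matching "daily_ic/...", "fi/..." and "predictions/..." entries that
+ # evaluate() reads back. Illustrative inspection:
+ #
+ #   with pd.HDFStore(model.trainstore) as store:
+ #       print(store.keys()[:5])
+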
769
+ def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
770
+ with pd.HDFStore(self.trainstore) as store:
771
+ for i, key in enumerate(
772
+ [k[1:] for k in store.keys() if k[1:].startswith("metrics")]
773
+ ):
774
+ _, t, train_length, test_length = key.split("/")[:4]
775
+ attrs = {
776
+ "lookahead": t,
777
+ "train_length": train_length,
778
+ "test_length": test_length,
779
+ }
780
+ s = store[key].to_dict()
781
+ s.update(attrs)
782
+ if i == 0:
783
+ lgb_metrics = pd.Series(s).to_frame(i)
784
+ else:
785
+ lgb_metrics[i] = pd.Series(s)
786
+
787
+ id_vars = scope_params + lgb_train_params + daily_ic_metrics
788
+ lgb_metrics = (
789
+ pd.melt(
790
+ lgb_metrics.T.drop("t", axis=1),
791
+ id_vars=id_vars,
792
+ value_name="ic",
793
+ var_name="boost_rounds",
794
+ )
795
+ .dropna()
796
+ .apply(pd.to_numeric)
797
+ )
798
+ return lgb_metrics
799
+
800
+ def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
801
+ lgb_ic = []
802
+ with pd.HDFStore(self.trainstore) as store:
803
+ keys = [k[1:] for k in store.keys()]
804
+ for key in keys:
805
+ _, t, train_length, test_length = key.split("/")[:4]
806
+ if key.startswith("daily_ic"):
807
+ df = (
808
+ store[key]
809
+ .drop(["boosting", "objective", "verbose"], axis=1)
810
+ .assign(
811
+ lookahead=t,
812
+ train_length=train_length,
813
+ test_length=test_length,
814
+ )
815
+ )
816
+ lgb_ic.append(df)
817
+ lgb_ic = pd.concat(lgb_ic).reset_index()
818
+ lgb_ic = pd.melt(
819
+ lgb_ic, id_vars=id_vars, value_name="ic", var_name="boost_rounds"
820
+ ).dropna()
821
+ lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
822
+ return lgb_ic
823
+
824
+ def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
825
+ param_cols = scope_params[1:] + lgb_train_params + ["boost_rounds"]
826
+ df = data[data.lookahead == t].sort_values("ic", ascending=False).iloc[best]
827
+ return df.loc[param_cols]
828
+
829
+ def _get_lgb_key(self, t, p):
830
+ key = f"{t}/{int(p.train_length)}/{int(p.test_length)}/{p.learning_rate}/"
831
+ return (
832
+ key + f"{int(p.num_leaves)}/{p.feature_fraction}/{int(p.min_data_in_leaf)}"
833
+ )
834
+
835
+ def _select_ic(self, params, ic_data, lookahead):
836
+ return ic_data.loc[
837
+ (ic_data.lookahead == lookahead)
838
+ & (ic_data.train_length == params.train_length)
839
+ & (ic_data.test_length == params.test_length)
840
+ & (ic_data.learning_rate == params.learning_rate)
841
+ & (ic_data.num_leaves == params.num_leaves)
842
+ & (ic_data.feature_fraction == params.feature_fraction)
843
+ & (ic_data.boost_rounds == params.boost_rounds),
844
+ ["date", "ic"],
845
+ ].set_index("date")
846
+
847
+ def get_trade_prices(self, tickers, start, end):
848
+ idx = pd.IndexSlice
849
+ with pd.HDFStore(self.datastore) as store:
850
+ data = store.select("stock_data")
851
+ data = data.set_index(["symbol", "date"]).sort_index()
852
+ data = data[~data.index.duplicated()]
853
+ return (
854
+ data.loc[idx[tickers, start:end], "open"]
855
+ .unstack("symbol")
856
+ .sort_index()
857
+ .shift(-1)
858
+ .tz_convert("UTC")
859
+ )
860
+
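+ # get_trade_prices returns a date x symbol matrix of next-day open prices (note the
+ # shift(-1)), converted to UTC, which is the pricing input Alphalens expects.
+ # Illustrative call (dates are placeholders):
+ #
+ #   trade_prices = model.get_trade_prices(["AAPL", "MSFT"], "2020-01-01", "2021-01-01")
+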
861
+ def plot_ic(self, lgb_ic, lgb_daily_ic, scope_params, lgb_train_params):
862
+ fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
863
+ axes = axes.flatten()
864
+ for i, t in enumerate([1, 21]):
865
+ params = self._get_lgb_params(
866
+ lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0
867
+ )
868
+ data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
869
+ rolling = data.rolling(63).ic.mean().dropna()
870
+ avg = data.ic.mean()
871
+ med = data.ic.median()
872
+ rolling.plot(
873
+ ax=axes[i],
874
+ title=f"Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}",
875
+ )
876
+ axes[i].axhline(avg, c="darkred", lw=1)
877
+ axes[i].axhline(0, ls="--", c="k", lw=1)
878
+
879
+ fig.suptitle("3-Month Rolling Information Coefficient", fontsize=16)
880
+ fig.tight_layout()
881
+ fig.subplots_adjust(top=0.92)
882
+
883
+ def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
884
+ # Visualization
885
+ sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)
886
+
887
+ sns.catplot(
888
+ x="lookahead",
889
+ y="ic",
890
+ col="train_length",
891
+ row="test_length",
892
+ data=lgb_metrics,
893
+ kind="box",
894
+ )
895
+ sns.catplot(
896
+ x="boost_rounds",
897
+ y="ic",
898
+ col="train_length",
899
+ row="test_length",
900
+ data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
901
+ kind="box",
902
+ )
903
+
904
+ def get_best_predictions(
905
+ self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10
906
+ ):
907
+ for best in range(topn):
908
+ best_params = self._get_lgb_params(
909
+ lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best
910
+ )
911
+ key = self._get_lgb_key(lookahead, best_params)
912
+ rounds = str(int(best_params.boost_rounds))
913
+ if best == 0:
914
+ best_predictions = pd.read_hdf(self.trainstore, "predictions/" + key)
915
+ best_predictions = best_predictions[rounds].to_frame(best)
916
+ else:
917
+ best_predictions[best] = pd.read_hdf(
918
+ self.trainstore, "predictions/" + key
919
+ )[rounds]
920
+ best_predictions = best_predictions.sort_index()
921
+ best_predictions.reset_index().to_hdf(
922
+ path_or_buf=self.outstore, key=f"lgb/train/{lookahead:02}"
923
+ )
924
+ return best_predictions
925
+
926
+ def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
927
+ # Compute Alphalens metrics
928
+ mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
929
+ factor_data,
930
+ by_date=True,
931
+ by_group=False,
932
+ demeaned=True,
933
+ group_adjust=False,
934
+ )
935
+ factor_returns = perf.factor_returns(factor_data)
936
+ mean_quant_ret, std_quantile = perf.mean_return_by_quantile(
937
+ factor_data, by_group=False, demeaned=True
938
+ )
939
+
940
+ mean_quant_rateret = mean_quant_ret.apply(
941
+ rate_of_return, axis=0, base_period=mean_quant_ret.columns[0]
942
+ )
943
+
944
+ mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
945
+ factor_data,
946
+ by_date=True,
947
+ by_group=False,
948
+ demeaned=True,
949
+ group_adjust=False,
950
+ )
951
+
952
+ mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
953
+ rate_of_return,
954
+ base_period=mean_quant_ret_bydate.columns[0],
955
+ )
956
+
957
+ compstd_quant_daily = std_quant_daily.apply(
958
+ std_conversion, base_period=std_quant_daily.columns[0]
959
+ )
960
+
961
+ alpha_beta = perf.factor_alpha_beta(factor_data, demeaned=True)
962
+
963
+ mean_ret_spread_quant, std_spread_quant = perf.compute_mean_returns_spread(
964
+ mean_quant_rateret_bydate,
965
+ factor_data["factor_quantile"].max(),
966
+ factor_data["factor_quantile"].min(),
967
+ std_err=compstd_quant_daily,
968
+ )
969
+ if verbose:
970
+ print(
971
+ mean_ret_spread_quant.mean()
972
+ .mul(10000)
973
+ .to_frame("Mean Period Wise Spread (bps)")
974
+ .join(alpha_beta.T)
975
+ .T
976
+ )
977
+
978
+ fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
979
+
980
+ plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
981
+ plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
982
+ axes[0].set_xlabel("Quantile")
983
+
984
+ plotting.plot_cumulative_returns_by_quantile(
985
+ mean_quant_ret_bydate["1D"],
986
+ freq=pd.tseries.offsets.BDay(),
987
+ period="1D",
988
+ ax=axes[1],
989
+ )
990
+ axes[1].set_title("Cumulative Return by Quantile (1D Period)")
991
+
992
+ title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
993
+ plotting.plot_cumulative_returns(
994
+ factor_returns["1D"],
995
+ period="1D",
996
+ freq=pd.tseries.offsets.BDay(),
997
+ title=title,
998
+ ax=axes[2],
999
+ )
1000
+
1001
+ fig.suptitle("Alphalens - Validation Set Performance", fontsize=14)
1002
+ fig.tight_layout()
1003
+ fig.subplots_adjust(top=0.85)
1004
+
1005
+ # Summary and full tear sheets (generated only when requested via `tearsheet`)
+ if tearsheet:
+ create_summary_tear_sheet(factor_data)
+ create_full_tear_sheet(factor_data)
1008
+
1009
+ def evaluate(self, remove_instore=False, lookahead=1, verbose=True):
1010
+ scope_params = ["lookahead", "train_length", "test_length"]
1011
+ daily_ic_metrics = [
1012
+ "daily_ic_mean",
1013
+ "daily_ic_mean_n",
1014
+ "daily_ic_median",
1015
+ "daily_ic_median_n",
1016
+ ]
1017
+ lgb_train_params = [
1018
+ "learning_rate",
1019
+ "num_leaves",
1020
+ "feature_fraction",
1021
+ "min_data_in_leaf",
1022
+ ]
1023
+
1024
+ lgb_metrics = self._get_lgb_metrics(
1025
+ scope_params, lgb_train_params, daily_ic_metrics
1026
+ )
1027
+ # Summary Metrics by Fold
1028
+ lgb_metrics.to_hdf(path_or_buf=self.outstore, key="lgb/metrics")
1029
+
1030
+ # Information Coefficient by Day
1031
+ int_cols = ["lookahead", "train_length", "test_length", "boost_rounds"]
1032
+ id_vars = ["date"] + scope_params + lgb_train_params
1033
+ lgb_ic = self._get_lgb_ic(int_cols, scope_params, lgb_train_params, id_vars)
1034
+ lgb_ic.to_hdf(path_or_buf=self.outstore, key="lgb/ic")
1035
+ lgb_daily_ic = (
1036
+ lgb_ic.groupby(id_vars[1:] + ["boost_rounds"])
1037
+ .ic.mean()
1038
+ .to_frame("ic")
1039
+ .reset_index()
1040
+ )
1041
+ lgb_daily_ic.to_hdf(path_or_buf=self.outstore, key="lgb/daily_ic")
1042
+
1043
+ # Cross-validation Result: Best Hyperparameters
1044
+ if verbose:
1045
+ print(
1046
+ lgb_daily_ic.groupby("lookahead", group_keys=False).apply(
1047
+ lambda x: x.nlargest(3, "ic")
1048
+ )
1049
+ )
1050
+ lgb_metrics.groupby("lookahead", group_keys=False).apply(
1051
+ lambda x: x.nlargest(3, "ic")
1052
+ )
1053
+ lgb_metrics.groupby("lookahead", group_keys=False).apply(
1054
+ lambda x: x.nlargest(3, "ic")
1055
+ ).to_hdf(path_or_buf=self.outstore, key="lgb/best_model")
1056
+ if verbose:
1057
+ print(
1058
+ lgb_metrics.groupby("lookahead", group_keys=False).apply(
1059
+ lambda x: x.nlargest(3, "daily_ic_mean")
1060
+ )
1061
+ )
1062
+
1063
+ # Visualization
1064
+ if verbose:
1065
+ self.plot_metrics(lgb_metrics, lgb_daily_ic, t=lookahead)
1066
+
1067
+ # AlphaLens Analysis - Validation Performance
1068
+ lgb_daily_ic = pd.read_hdf(self.outstore, "lgb/daily_ic")
1069
+ best_params = self._get_lgb_params(
1070
+ lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=0
1071
+ )
1072
+ best_params.to_hdf(path_or_buf=self.outstore, key="lgb/best_params")
1073
+
1074
+ if verbose:
1075
+ self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
1076
+
1077
+ # Get Predictions for Validation Period
1078
+ best_predictions = self.get_best_predictions(
1079
+ lgb_daily_ic, scope_params, lgb_train_params, lookahead=lookahead, topn=10
1080
+ )
1081
+ test_tickers = best_predictions.index.get_level_values("symbol").unique()
1082
+ start = best_predictions.index.get_level_values("date").min()
1083
+ end = best_predictions.index.get_level_values("date").max()
1084
+ trade_prices = self.get_trade_prices(test_tickers, start, end)
1085
+ pd.Series(test_tickers).to_hdf(path_or_buf=self.outstore, key="lgb/tickers")
1086
+ # We average the top five models and provide the corresponding prices to Alphalens,
1087
+ # in order to compute the mean period-wise
1088
+ # return earned on an equal-weighted portfolio invested in the daily factor quintiles
1089
+ # for various holding periods:
1090
+ factor = (
1091
+ best_predictions.iloc[:, :5]
1092
+ .mean(1)
1093
+ .dropna()
1094
+ .tz_convert("UTC", level="date")
1095
+ .swaplevel()
1096
+ )
1097
+ # Create AlphaLens Inputs
1098
+ if verbose:
1099
+ factor_data = get_clean_factor_and_forward_returns(
1100
+ factor=factor,
1101
+ prices=trade_prices,
1102
+ quantiles=5,
1103
+ periods=(1, 5, 10, 21),
1104
+ max_loss=1,
1105
+ )
1106
+ self.apply_alphalen_analysis(factor_data, tearsheet=True, verbose=True)
1107
+ # Delete the temporary files
1108
+ if remove_instore:
1109
+ os.remove(self.trainstore)
1110
+
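+ # evaluate() averages the validation predictions of the five best models into a
+ # single factor, aligns it with next-day open prices from get_trade_prices, and
+ # hands both to Alphalens (quantiles=5, holding periods of 1/5/10/21 days) when
+ # verbose=True. Illustrative call:
+ #
+ #   model.evaluate(remove_instore=False, lookahead=1, verbose=True)
+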
1111
+ def make_predictions(
1112
+ self, data: pd.DataFrame, mode="test", lookahead=1, verbose=True
1113
+ ):
1114
+ data = data.copy()
1115
+ YEAR = 252
1116
+ scope_params = ["lookahead", "train_length", "test_length"]
1117
+ lgb_train_params = [
1118
+ "learning_rate",
1119
+ "num_leaves",
1120
+ "feature_fraction",
1121
+ "min_data_in_leaf",
1122
+ ]
1123
+
1124
+ base_params = dict(boosting="gbdt", objective="regression", verbose=-1)
1125
+
1126
+ categoricals = ["year", "month", "weekday"]
1127
+ labels = sorted(data.filter(like="_fwd").columns)
1128
+ features = data.columns.difference(labels).tolist()
1129
+ label = f"r{lookahead:02}_fwd"
1130
+ for feature in categoricals:
1131
+ data[feature] = pd.factorize(data[feature], sort=True)[0]
1132
+
1133
+ if mode == "test":
1134
+ data = data.dropna().sort_index()
1135
+ elif mode == "live":
1136
+ data[labels] = data[labels].fillna(0)
1137
+ data = data.sort_index().dropna()
1138
+
1139
+ lgb_data = lgb.Dataset(
1140
+ data=data[features],
1141
+ label=data[label],
1142
+ categorical_feature=categoricals,
1143
+ free_raw_data=False,
1144
+ )
1145
+ # Generate predictions
1146
+ lgb_daily_ic = pd.read_hdf(self.outstore, "lgb/daily_ic")
1147
+
1148
+ for position in range(10):
1149
+ params = self._get_lgb_params(
1150
+ lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position
1151
+ )
1152
+
1153
+ params = params.to_dict()
1154
+
1155
+ for p in ["min_data_in_leaf", "num_leaves"]:
1156
+ params[p] = int(params[p])
1157
+ train_length = int(params.pop("train_length"))
1158
+ test_length = int(params.pop("test_length"))
1159
+ num_boost_round = int(params.pop("boost_rounds"))
1160
+ params.update(base_params)
1161
+ if verbose:
1162
+ print(f"\nPosition: {position:02}")
1163
+
1164
+ # 1-year out-of-sample period
1165
+ n_splits = int(YEAR / test_length)
1166
+ cv = MultipleTimeSeriesCV(
1167
+ n_splits=n_splits,
1168
+ test_period_length=test_length,
1169
+ lookahead=lookahead,
1170
+ train_period_length=train_length,
1171
+ )
1172
+
1173
+ predictions = []
1174
+ for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
1175
+ if verbose:
1176
+ print(i, end=" ", flush=True)
1177
+ lgb_train = lgb_data.subset(
1178
+ used_indices=train_idx.tolist(), params=params
1179
+ ).construct()
1180
+
1181
+ model = lgb.train(
1182
+ params=params,
1183
+ train_set=lgb_train,
1184
+ num_boost_round=num_boost_round,
1185
+ )
1186
+
1187
+ test_set = data.iloc[test_idx, :]
1188
+ y_test = test_set.loc[:, label].to_frame("y_test")
1189
+ y_pred = model.predict(test_set.loc[:, model.feature_name()])
1190
+ predictions.append(y_test.assign(prediction=y_pred))
1191
+
1192
+ if position == 0:
1193
+ test_predictions = pd.concat(predictions).rename(
1194
+ columns={"prediction": position}
1195
+ )
1196
+ else:
1197
+ test_predictions[position] = pd.concat(predictions).prediction
1198
+
1199
+ by_day = test_predictions.groupby(level="date")
1200
+ for position in range(10):
1201
+ if position == 0:
1202
+ ic_by_day = by_day.apply(
1203
+ lambda x: spearmanr(x.y_test, x[position])[0]
1204
+ ).to_frame()
1205
+ else:
1206
+ ic_by_day[position] = by_day.apply(
1207
+ lambda x: spearmanr(x.y_test, x[position])[0]
1208
+ )
1209
+ if verbose:
1210
+ print(ic_by_day.describe())
1211
+ test_predictions.reset_index().to_hdf(
1212
+ path_or_buf=self.outstore, key=f"lgb/test/{lookahead:02}"
1213
+ )
1214
+ return test_predictions
1215
+
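+ # make_predictions refits the ten best CV configurations on rolling out-of-sample
+ # windows and stores their predictions as columns 0..9 under "lgb/test/<lookahead>";
+ # load_predictions then averages those columns into a single signal. Illustrative:
+ #
+ #   preds = model.make_predictions(data, mode="test", lookahead=1)
+ #   wide, tickers = model.load_predictions(lookahead=1)
+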
1216
+ def load_predictions(self, predictions=None, lookahead=1):
1217
+ if predictions is None:
1218
+ predictions = pd.concat(
1219
+ [
1220
+ pd.read_hdf(self.outstore, f"lgb/train/{lookahead:02}"),
1221
+ pd.read_hdf(self.outstore, f"lgb/test/{lookahead:02}").drop(
1222
+ "y_test", axis=1
1223
+ ),
1224
+ ]
1225
+ )
1226
+ predictions = predictions.set_index(["symbol", "date"])
1227
+
1228
+ predictions = (
1229
+ predictions.loc[~predictions.index.duplicated()]
1230
+ .iloc[:, :10]
1231
+ .mean(1)
1232
+ .sort_index()
1233
+ .dropna()
1234
+ .to_frame("prediction")
1235
+ )
1236
+ tickers = predictions.index.get_level_values("symbol").unique().tolist()
1237
+ return (predictions.unstack("symbol").prediction.tz_convert("UTC")), tickers
1238
+
1239
+ def assert_last_date(self, predictions: pd.DataFrame):
1240
+ """
1241
+ Usefull in Live Trading to ensure that the last date in the predictions
1242
+ is the previous day, so it predicts today's returns.
1243
+ """
1244
+ last_date = predictions.index.get_level_values("date").max()
1245
+ if last_date.tzinfo is None:
1246
+ last_date = last_date.tz_localize("UTC")
1247
+ last_date = last_date.normalize()
1248
+ try:
1249
+ days = 3 if datetime.now().strftime("%A") == "Monday" else 1
1250
+ td = (
1251
+ last_date
1252
+ - (pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)).normalize()
1253
+ )
1254
+ assert (
1255
+ td.days == days or last_date == (pd.Timestamp.now(tz="UTC")).normalize()
1256
+ )
1257
+ return True
1258
+ except AssertionError:
1259
+ return False
1260
+
1261
+ def clean_stores(self, *stores):
1262
+ for store in stores:
1263
+ if os.path.exists(store):
1264
+ os.remove(store)
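+
+ # Live-trading sketch (illustrative): reload the averaged predictions and check
+ # that they are fresh before acting on them; the top-N selection below is a
+ # hypothetical way to turn the signal into a long basket.
+ #
+ #   wide, tickers = model.load_predictions(lookahead=1)
+ #   if model.assert_last_date(wide):
+ #       longs = wide.iloc[-1].nlargest(10).index.tolist()
+ #   model.clean_stores(model.trainstore)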