bbstrader 0.1.93__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


bbstrader/models/ml.py CHANGED
@@ -0,0 +1,1026 @@
1
+ from pathlib import Path
2
+ import sys, os
3
+ import numpy as np
4
+ import pandas as pd
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ import talib
8
+ from time import time
9
+ from tqdm import tqdm
10
+ from talib import RSI, BBANDS, MACD, ATR
11
+ import yfinance as yf
12
+ from scipy.stats import spearmanr
13
+ from itertools import product
14
+ import lightgbm as lgb
15
+ from collections import defaultdict
16
+ from alphalens.tears import (create_summary_tear_sheet,
17
+ create_full_tear_sheet)
18
+ from alphalens import plotting
19
+ from alphalens import performance as perf
20
+ from alphalens.utils import get_clean_factor_and_forward_returns, rate_of_return, std_conversion
21
+ from sklearn.preprocessing import StandardScaler
22
+ from sklearn.preprocessing import LabelEncoder
23
+
24
+ import warnings
25
+ warnings.filterwarnings('ignore')
26
+
27
+
28
+ __all__ = [
29
+ 'OneStepTimeSeriesSplit',
30
+ 'MultipleTimeSeriesCV',
31
+ 'LightGBModel'
32
+ ]
33
+
34
+ class OneStepTimeSeriesSplit:
35
+ """Generates tuples of train_idx, test_idx pairs.
36
+ Assumes the index contains a level labeled 'date'."""
37
+ __author__ = "Stefan Jansen"
38
+
39
+ def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
40
+ self.n_splits = n_splits
41
+ self.test_period_length = test_period_length
42
+ self.shuffle = shuffle
43
+
44
+ @staticmethod
45
+ def chunks(l, n):
46
+ for i in range(0, len(l), n):
47
+ yield l[i:i + n]
48
+
49
+ def split(self, X: pd.DataFrame, y=None, groups=None):
50
+ unique_dates = (X.index
51
+ .get_level_values('date')
52
+ .unique()
53
+ .sort_values(ascending=False)
54
+ [:self.n_splits*self.test_period_length])
55
+
56
+ dates = X.reset_index()[['date']]
57
+ for test_date in self.chunks(unique_dates, self.test_period_length):
58
+ train_idx = dates[dates.date < min(test_date)].index
59
+ test_idx = dates[dates.date.isin(test_date)].index
60
+ if self.shuffle:
61
+ train_idx = np.random.permutation(train_idx)  # shuffle the training indices before yielding
62
+ yield train_idx, test_idx
63
+
64
+ def get_n_splits(self, X, y, groups=None):
65
+ return self.n_splits
66
+
67
+
68
+ class MultipleTimeSeriesCV:
69
+ """
70
+ Generates tuples of train_idx, test_idx pairs.
71
+ Assumes the MultiIndex contains levels 'symbol' and 'date' and
72
+ purges overlapping outcomes using the ``lookahead`` gap.
73
+ """
74
+ __author__ = "Stefan Jansen"
75
+
76
+ def __init__(self,
77
+ n_splits=3,
78
+ train_period_length=126,
79
+ test_period_length=21,
80
+ lookahead=None,
81
+ date_idx='date',
82
+ shuffle=False):
83
+
84
+ self.n_splits = n_splits
85
+ self.lookahead = lookahead
86
+ self.test_length = test_period_length
87
+ self.train_length = train_period_length
88
+ self.shuffle = shuffle
89
+ self.date_idx = date_idx
90
+
91
+ def split(self, X: pd.DataFrame, y=None, groups=None):
92
+ unique_dates = X.index.get_level_values(self.date_idx).unique()
93
+ days = sorted(unique_dates, reverse=True)
94
+ split_idx = []
95
+ for i in range(self.n_splits):
96
+ test_end_idx = i * self.test_length
97
+ test_start_idx = test_end_idx + self.test_length
98
+ train_end_idx = test_start_idx + self.lookahead - 1
99
+ train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
100
+ split_idx.append([train_start_idx, train_end_idx,
101
+ test_start_idx, test_end_idx])
102
+
103
+ dates = X.reset_index()[[self.date_idx]]
104
+ for train_start, train_end, test_start, test_end in split_idx:
105
+
106
+ train_idx = dates[(dates[self.date_idx] > days[train_start])
107
+ & (dates[self.date_idx] <= days[train_end])].index
108
+ test_idx = dates[(dates[self.date_idx] > days[test_start])
109
+ & (dates[self.date_idx] <= days[test_end])].index
110
+ if self.shuffle:
111
+ train_idx = pd.Index(np.random.permutation(train_idx))  # shuffle while keeping an Index
112
+ yield train_idx.to_numpy(), test_idx.to_numpy()
113
+
114
+ def get_n_splits(self, X, y, groups=None):
115
+ return self.n_splits
116
+
117
+
118
+ class LightGBModel(object):
119
+ """
120
+ ``LightGBModel`` encapsulates a complete workflow for training and evaluating
121
+ a ``LightGBM (Light Gradient Boosting Machine)`` model for predicting stock returns.
122
+ It includes data acquisition, feature engineering, model tuning, and performance
123
+ evaluation using the information coefficient (IC) and Alphalens analysis.
124
+
125
+ Key Features
126
+ ------------
127
+ - ``HDF5 Storage``: Utilizes ``pandas.HDFStore`` for efficient storage and retrieval
128
+ of large datasets, which is essential for backtesting on financial time series data.
129
+
130
+ - ``Time-Series Cross-Validation``: Employs a custom cross-validation strategy that
131
+ respects the time series nature of the data, avoiding data leakage.
132
+
133
+ - ``Hyperparameter Tuning``: Includes automated hyperparameter tuning using a randomized
134
+ grid search for optimization.
135
+
136
+ - ``Information Coefficient (IC)``: Uses the IC, a standard measure for ranking models
137
+ in finance, as the core performance metric quantifying the model's predictive power.
138
+
139
+ - ``Alphalens Integration``: Provides a comprehensive framework for validating model
140
+ performance using Alphalens, allowing for in-depth performance analysis, like backtesting
141
+ and return decomposition.
142
+
143
+ Use Case
144
+ --------
145
+ This class is designed for quantitative finance and algorithmic trading use cases where
146
+ the goal is to build a predictive model for stock returns based on historical data and
147
+ technical indicators. It follows a complete cycle from data acquisition to model validation
148
+ and provides the infrastructure needed for deployment of this model in a trading strategy.
149
+
150
+ Notes
151
+ -----
152
+ The implementation is inspired by the book "Machine Learning for Algorithmic Trading"
153
+ by Stefan Jansen.
154
+
155
+ References
156
+ ----------
157
+ Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
158
+ Chapter 12, Boosting Your Trading Strategy.
159
+ """
160
+
161
+ def __init__(self,
162
+ data: pd.DataFrame = None,
163
+ datastore: str = 'lgbdata.h5',
164
+ trainstore: str = 'lgbtrain.h5',
165
+ outstore: str = 'lgbout.h5'
166
+ ):
167
+ """
168
+ Args:
169
+ data (pd.DataFrame): The input data for the model. It should be a DataFrame with a MultiIndex containing
170
+ 'symbol' and 'date' levels. If not provided, the data can be downloaded using the `download_boosting_data` method.
171
+ datastore (str): The path to the HDF5 file for storing the model data.
172
+ trainstore (str): The path to the HDF5 file for storing the training data.
173
+ outstore (str): The path to the HDF5 file for storing the output data.
174
+ """
175
+ self.datastore = datastore
176
+ self.trainstore = trainstore
177
+ self.outstore = outstore
178
+ if data is not None:
179
+ data.reset_index().to_hdf(self.datastore, 'model_data')
180
+
181
+ def _compute_bb(self, close):
182
+ high, mid, low = BBANDS(close, timeperiod=20)
183
+ return pd.DataFrame({'bb_high': high, 'bb_low': low}, index=close.index)
184
+
185
+ def _compute_atr(self, stock_data):
186
+ df = ATR(stock_data.high, stock_data.low,
187
+ stock_data.close, timeperiod=14)
188
+ return df.sub(df.mean()).div(df.std())
189
+
190
+ def _compute_macd(self, close):
191
+ macd = MACD(close)[0]
192
+ return (macd - np.mean(macd))/np.std(macd)
193
+
194
+ def _add_technical_indicators(self, prices: pd.DataFrame):
195
+ prices = prices.copy()
196
+ prices['rsi'] = prices.groupby(level='symbol').close.apply(lambda x: RSI(x).reset_index(level=0, drop=True))
197
+ bb = prices.groupby(level=0).close.apply(self._compute_bb).reset_index(level=1, drop=True)
198
+ prices = prices.join(bb)
199
+ prices['bb_high'] = prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
200
+ prices['bb_low'] = prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)
201
+ prices['NATR'] = prices.groupby(level='symbol',
202
+ group_keys=False).apply(lambda x:
203
+ talib.NATR(x.high, x.low, x.close))
204
+
205
+ prices['ATR'] = (prices.groupby('symbol', group_keys=False)
206
+ .apply(self._compute_atr))
207
+ prices['PPO'] = prices.groupby(level='symbol').close.apply(lambda x: talib.PPO(x).reset_index(level=0, drop=True))
208
+ prices['MACD'] = (prices
209
+ .groupby('symbol', group_keys=False)
210
+ .close
211
+ .apply(self._compute_macd))
212
+ return prices
213
+
214
+ def download_boosting_data(self, tickers, start, end=None):
215
+ data = []
216
+ for ticker in tickers:
217
+ try:
218
+ prices = yf.download(ticker, start=start, end=end, progress=False, multi_level_index=False)
219
+ prices['symbol'] = ticker
220
+ data.append(prices)
221
+ except Exception:
222
+ continue
223
+ data = pd.concat(data)
224
+ data = (data
225
+ .rename(columns={s: s.lower().replace(' ', '_') for s in data.columns})
226
+ .drop(columns=['adj_close'])
227
+ .set_index('symbol', append=True).swaplevel()
228
+ .sort_index()
229
+ .dropna())
230
+ return data
231
+
232
+ def download_metadata(self, tickers):
233
+
234
+ def clean_text_column(series: pd.Series) -> pd.Series:
235
+ return (
236
+ series.str.lower()
237
+ .str.replace('-', '', regex=False) # use regex=False for literal string replacements
238
+ .str.replace('&', 'and', regex=False)
239
+ .str.replace(' ', '_', regex=False)
240
+ .str.replace('__', '_', regex=False)
241
+ )
242
+
243
+ metadata = ['industry', 'sector', 'exchange', 'symbol',
244
+ 'heldPercentInsiders', 'heldPercentInstitutions',
245
+ 'overallRisk', 'shortRatio', 'dividendYield', 'beta',
246
+ 'regularMarketVolume', 'averageVolume', 'averageVolume10days',
247
+ 'bid', 'ask', 'bidSize', 'askSize','marketCap']
248
+
249
+ columns = {
250
+ 'industry' : 'industry',
251
+ 'sector' : 'sector',
252
+ 'exchange' : 'exchange',
253
+ 'symbol' : 'symbol',
254
+ 'heldPercentInsiders' : 'insiders',
255
+ 'heldPercentInstitutions': 'institutions',
256
+ 'overallRisk' : 'risk',
257
+ 'shortRatio' : 'short_ratio',
258
+ 'dividendYield' : 'dyield',
259
+ 'beta' : 'beta',
260
+ 'regularMarketVolume' : 'regvolume',
261
+ 'averageVolume' : 'avgvolume',
262
+ 'averageVolume10days' : 'avgvolume10',
263
+ 'bid' : 'bid',
264
+ 'ask' : 'ask',
265
+ 'bidSize' : 'bidsize',
266
+ 'askSize' : 'asksize',
267
+ 'marketCap' : 'marketcap'
268
+ }
269
+ data = []
270
+ for symbol in tickers:
271
+ try:
272
+ symbol_info = yf.Ticker(symbol).info
273
+ except Exception:
274
+ continue
275
+ infos = {}
276
+ for info in metadata:
277
+ infos[info] = symbol_info.get(info)
278
+ data.append(infos)
279
+ metadata = pd.DataFrame(data)
280
+ metadata = metadata.rename(columns=columns)
281
+ metadata.dyield = metadata.dyield.fillna(0)
282
+ metadata.sector = clean_text_column(metadata.sector)
283
+ metadata.industry = clean_text_column(metadata.industry)
284
+ metadata = metadata.set_index('symbol')
285
+ return metadata
286
+
287
+ def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
288
+ volume_features, bid_ask_features, market_cap_feature):
289
+ df = df.copy()
290
+ scaler = StandardScaler()
291
+
292
+ # Normalize features
293
+ df[volume_features] = scaler.fit_transform(df[volume_features])
294
+ df['bid_ask_spread'] = df['ask'] - df['bid']
295
+ df['bid_ask_spread'] = scaler.fit_transform(df[['bid_ask_spread']])
296
+ df[market_cap_feature] = scaler.fit_transform(df[market_cap_feature])
297
+
298
+ # Calculate Liquidity Score
299
+ # Assign weights to each component (these weights can be adjusted based on importance)
300
+ weights = {
301
+ 'volume': 0.4,
302
+ 'bid_ask_spread': 0.2,
303
+ 'marketCap': 0.4
304
+ }
305
+
306
+ # Calculate the liquidity score by combining the normalized features
307
+ df['liquidity_score'] = (weights['volume'] * df[volume_features].mean(axis=1) +
308
+ weights['bid_ask_spread'] * df['bid_ask_spread'] +
309
+ weights['marketCap'] * df[market_cap_feature[0]])
310
+ df_sorted = df.sort_values(by='liquidity_score', ascending=False)
311
+
312
+ return df_sorted.nlargest(n, 'liquidity_score').index
313
+
314
+ def _encode_metadata(self, df: pd.DataFrame):
315
+ df = df.copy()
316
+ # Binning each numerical feature into categories
317
+ df['insiders'] = pd.qcut(
318
+ df['insiders'], q=4,
319
+ labels=['Very Low', 'Low', 'High', 'Very High']
320
+ )
321
+ df['institutions'] = pd.qcut(
322
+ df['institutions'], q=4,
323
+ labels=['Very Low', 'Low', 'High', 'Very High']
324
+ )
325
+ df['risk'] = pd.cut(
326
+ df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
327
+ labels=['Low', 'Medium', 'High', 'Very High']
328
+ )
329
+ df['short_ratio'] = pd.qcut(
330
+ df['short_ratio'], q=4,
331
+ labels=['Very Low', 'Low', 'High', 'Very High']
332
+ )
333
+ df['dyield'] = pd.cut(
334
+ df['dyield'],
335
+ bins=[-float('inf'), 0.002, 0.005, 0.01, float('inf')],
336
+ labels=['Very Low', 'Low', 'High', 'Very High']
337
+ )
338
+ df['beta'] = pd.cut(
339
+ df['beta'],
340
+ bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
341
+ labels=['Low', 'Moderate', 'High', 'Very High']
342
+ )
343
+
344
+ # Encode binned features
345
+ binned_features = [
346
+ 'insiders', 'institutions',
347
+ 'risk', 'short_ratio', 'dyield',
348
+ 'beta', 'sector', 'industry', 'exchange',
349
+ ]
350
+ label_encoders = {}
351
+
352
+ for col in binned_features:
353
+ le = LabelEncoder()
354
+ df[col] = le.fit_transform(df[col])
355
+ label_encoders[col] = le
356
+ return df, label_encoders
357
+
358
+ def prepare_boosting_data(self,
359
+ prices: pd.DataFrame,
360
+ metadata: pd.DataFrame = None,
361
+ min_years=7,
362
+ universe=500
363
+ ):
364
+ if metadata is None:
365
+ mcap = False
366
+ tickers = prices.index.get_level_values('symbol').unique()
367
+ metadata = self.download_metadata(tickers)
368
+ else:
369
+ mcap = True
370
+ YEAR = 252
371
+ idx = pd.IndexSlice
372
+ percentiles = [.001, .01, .02, .03, .04, .05]
373
+ percentiles += [1-p for p in percentiles[::-1]]
374
+ T = [1, 5, 10, 21, 42, 63]
375
+
376
+ prices.volume /= 1e3 # make vol figures a bit smaller
377
+ prices.index.names = ['symbol', 'date']
378
+ metadata.index.name = 'symbol'
379
+ prices.reset_index().to_hdf(self.datastore, 'stock_data')
380
+ metadata.reset_index().to_hdf(self.datastore, 'stock_metadata')
381
+
382
+ # Remove stocks with insufficient observations
383
+ min_obs = min_years * YEAR
384
+ nobs = prices.groupby(level='symbol').size()
385
+ keep = nobs[nobs > min_obs].index
386
+ prices = prices.loc[idx[keep, :], :]
387
+
388
+ # Remove duplicate (symbol, date) index entries
389
+ prices = prices[~prices.index.duplicated()]
390
+
391
+ # Align price and meta data
392
+ metadata = metadata[~metadata.index.duplicated() & metadata.sector.notnull()]
393
+ metadata.sector = metadata.sector.str.lower().str.replace(' ', '_')
394
+ shared = (prices.index.get_level_values('symbol').unique()
395
+ .intersection(metadata.index))
396
+ metadata = metadata.loc[shared, :]
397
+ prices = prices.loc[idx[shared, :], :]
398
+
399
+ # Limit universe
400
+ if mcap:
401
+ universe = metadata.marketcap.nlargest(universe).index
402
+ else:
403
+ volume_features = ['regvolume', 'avgvolume', 'avgvolume10']
404
+ bid_ask_features = ['bid', 'ask', 'bidsize', 'asksize']
405
+ market_cap_feature = ['marketcap']
406
+ to_drop = volume_features + bid_ask_features + market_cap_feature
407
+ universe = self._select_nlargest_liquidity_stocks(
408
+ metadata, universe, volume_features, bid_ask_features, market_cap_feature
409
+ )
410
+ metadata = metadata.drop(to_drop, axis=1)
411
+ prices = prices.loc[idx[universe, :], :]
412
+ metadata = metadata.loc[universe]
413
+ metadata = self._encode_metadata(metadata)[0]
414
+
415
+ prices['dollar_vol'] = prices[['close', 'volume']].prod(1).div(1e3)
416
+ # compute dollar volume to determine universe
417
+ dollar_vol_ma = (prices
418
+ .dollar_vol
419
+ .unstack('symbol')
420
+ .rolling(window=21, min_periods=1) # 1 trading month
421
+ .mean())
422
+
423
+ # Rank stocks by moving average
424
+ prices['dollar_vol_rank'] = (dollar_vol_ma
425
+ .rank(axis=1, ascending=False)
426
+ .stack('symbol')
427
+ .swaplevel())
428
+ # Add some Basic Factors
429
+ prices = self._add_technical_indicators(prices)
430
+ # Combine Price and Meta Data
431
+ prices = prices.join(metadata)
432
+
433
+ # Compute Returns
434
+ by_sym = prices.groupby(level='symbol').close
435
+ for t in T:
436
+ prices[f'r{t:02}'] = by_sym.pct_change(t)
437
+ # Daily historical return deciles
438
+ for t in T:
439
+ # Reset the index to apply qcut by date without grouping errors
440
+ prices[f'r{t:02}dec'] = (prices.reset_index(level='date')
441
+ .groupby('date')[f'r{t:02}']
442
+ .apply(lambda x: pd.qcut(x,
443
+ q=10,
444
+ labels=False,
445
+ duplicates='drop'))
446
+ .values)
447
+ # Daily sector return deciles
448
+ for t in T:
449
+ prices[f'r{t:02}q_sector'] = (
450
+ prices
451
+ .groupby(['date', 'sector'])[f'r{t:02}']
452
+ .transform(lambda x: pd.qcut(
453
+ x,
454
+ q=5,
455
+ labels=False,
456
+ duplicates='drop'))
457
+ )
458
+ # Compute Forward Returns
459
+ for t in [1, 5, 21]:
460
+ prices[f'r{t:02}_fwd'] = prices.groupby(
461
+ level='symbol')[f'r{t:02}'].shift(-t)
462
+
463
+ # Remove outliers
464
+ outliers = prices[prices.r01 > 1].index.get_level_values('symbol').unique()
465
+ prices = prices.drop(outliers, level='symbol')
466
+ # Create time and sector dummy variables
467
+ prices['year'] = prices.index.get_level_values('date').year
468
+ prices['month'] = prices.index.get_level_values('date').month
469
+ prices['weekday'] = prices.index.get_level_values('date').weekday
470
+ # Store Model Data
471
+ prices = prices.drop(['open', 'close', 'low', 'high', 'volume'], axis=1)
472
+ if 'adj_close' in prices.columns:
473
+ prices = prices.drop('adj_close', axis=1)
474
+ prices.reset_index().dropna().to_hdf(self.datastore, 'model_data')
475
+ return prices.dropna()
476
+
477
+ def tickers(self):
478
+ return pd.read_hdf(self.outstore, 'lgb/tickers').tolist()
479
+
480
+ def load_model_data(self):
481
+ return pd.read_hdf(self.datastore, 'model_data').set_index(['symbol', 'date']).sort_index()
482
+
483
+ def format_time(self, t):
484
+
485
+ """Return a formatted time string 'HH:MM:SS
486
+ based on a numeric time() value"""
487
+ m, s = divmod(t, 60)
488
+ h, m = divmod(m, 60)
489
+ return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'
490
+
491
+ def fit(self, data: pd.DataFrame, verbose=True):
492
+ def get_fi(model):
493
+ """Return normalized feature importance as pd.Series"""
494
+ fi = model.feature_importance(importance_type='gain')
495
+ return (pd.Series(fi / fi.sum(),
496
+ index=model.feature_name()))
497
+
498
+ def ic_lgbm(preds, train_data):
499
+ """Custom IC eval metric for lightgbm"""
500
+ is_higher_better = True
501
+ return 'ic', spearmanr(preds, train_data.get_label())[0], is_higher_better
502
+ # Hyperparameter options
503
+ YEAR = 252
504
+ base_params = dict(boosting='gbdt',
505
+ objective='regression',
506
+ verbose=-1)
507
+
508
+ # constraints on structure (depth) of each tree
509
+ max_depths = [2, 3, 5, 7]
510
+ num_leaves_opts = [2 ** i for i in max_depths]
511
+ min_data_in_leaf_opts = [250, 500, 1000]
512
+
513
+ # weight of each new tree in the ensemble
514
+ learning_rate_ops = [.01, .1, .3]
515
+
516
+ # random feature selection
517
+ feature_fraction_opts = [.3, .6, .95]
518
+
519
+ param_names = ['learning_rate', 'num_leaves',
520
+ 'feature_fraction', 'min_data_in_leaf']
521
+
522
+ cv_params = list(product(learning_rate_ops,
523
+ num_leaves_opts,
524
+ feature_fraction_opts,
525
+ min_data_in_leaf_opts))
526
+ n_params = len(cv_params)
527
+ print(f'# Parameters: {n_params}')
528
+
529
+ # Train/Test Period Lengths
530
+ lookaheads = [1, 5, 21]
531
+ train_lengths = [int(4.5 * 252), 252]
532
+ test_lengths = [63]
533
+ test_params = list(product(lookaheads, train_lengths, test_lengths))
534
+ n = len(test_params)
535
+ test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
536
+ test_params = [test_params[i] for i in test_param_sample]
537
+ print('Train configs:', len(test_params))
538
+
539
+ ### Categorical Variables
540
+ categoricals = ['year', 'weekday', 'month']
541
+ for feature in categoricals:
542
+ data[feature] = pd.factorize(data[feature], sort=True)[0]
543
+
544
+
545
+ # Run Cross-Validation
546
+ labels = sorted(data.filter(like='fwd').columns)
547
+ features = data.columns.difference(labels).tolist()
548
+ label_dict = dict(zip(lookaheads, labels))
549
+ num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
550
+ num_boost_round = num_iterations[-1]
551
+
552
+ metric_cols = (param_names + ['t', 'daily_ic_mean', 'daily_ic_mean_n',
553
+ 'daily_ic_median', 'daily_ic_median_n'] +
554
+ [str(n) for n in num_iterations])
555
+
556
+ for lookahead, train_length, test_length in test_params:
557
+ # randomized grid search
558
+ cvp = np.random.choice(list(range(n_params)),
559
+ size=int(n_params / 2),
560
+ replace=False)
561
+ cv_params_ = [cv_params[i] for i in cvp]
562
+
563
+ # set up cross-validation
564
+ n_splits = int(2 * YEAR / test_length)
565
+ if verbose:
566
+ print(f'Lookahead: {lookahead:2.0f} | '
567
+ f'Train: {train_length:3.0f} | '
568
+ f'Test: {test_length:2.0f} | '
569
+ f'Params: {len(cv_params_):3.0f} | '
570
+ f'Train configs: {len(test_params)}')
571
+
572
+ # time-series cross-validation
573
+ cv = MultipleTimeSeriesCV(n_splits=n_splits,
574
+ lookahead=lookahead,
575
+ test_period_length=test_length,
576
+ train_period_length=train_length)
577
+
578
+ label = label_dict[lookahead]
579
+ outcome_data = data.loc[:, features + [label]].dropna()
580
+
581
+ # binary dataset
582
+ lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
583
+ label=outcome_data[label],
584
+ categorical_feature=categoricals,
585
+ free_raw_data=False)
586
+ T = 0
587
+ predictions, metrics, feature_importance, daily_ic = [], [], [], []
588
+
589
+ # iterate over (shuffled) hyperparameter combinations
590
+ for p, param_vals in enumerate(cv_params_):
591
+ key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join([str(p) for p in param_vals])
592
+ params = dict(zip(param_names, param_vals))
593
+ params.update(base_params)
594
+
595
+ start = time()
596
+ cv_preds, nrounds = [], []
597
+ ic_cv = defaultdict(list)
598
+
599
+ # iterate over folds
600
+ for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
601
+
602
+ # select train subset
603
+ lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
604
+ params=params).construct()
605
+
606
+ # train model for num_boost_round
607
+ model = lgb.train(params=params,
608
+ train_set=lgb_train,
609
+ num_boost_round=num_boost_round,
610
+ )
611
+ # log feature importance
612
+ if i == 0:
613
+ fi = get_fi(model).to_frame()
614
+ else:
615
+ fi[i] = get_fi(model)
616
+
617
+ # capture predictions
618
+ test_set = outcome_data.iloc[test_idx, :]
619
+ X_test = test_set.loc[:, model.feature_name()]
620
+ y_test = test_set.loc[:, label]
621
+ y_pred = {str(n): model.predict(X_test, num_iteration=n) for n in num_iterations}
622
+
623
+ # record predictions for each fold
624
+ cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))
625
+
626
+ # combine fold results
627
+ cv_preds = pd.concat(cv_preds).assign(**params)
628
+ predictions.append(cv_preds)
629
+
630
+ # compute IC per day
631
+ by_day = cv_preds.groupby(level='date')
632
+ ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
633
+ for n in num_iterations], axis=1)
634
+ daily_ic_mean = ic_by_day.mean()
635
+ daily_ic_mean_n = daily_ic_mean.idxmax()
636
+ daily_ic_median = ic_by_day.median()
637
+ daily_ic_median_n = daily_ic_median.idxmax()
638
+
639
+ # compute IC across all predictions
640
+ ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
641
+ t = time() - start
642
+ T += t
643
+
644
+ # collect metrics
645
+ metrics = pd.Series(list(param_vals) +
646
+ [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,
647
+ index=metric_cols)
648
+ if verbose:
649
+ msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
650
+ msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
651
+ msg += f' {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}'
652
+ print(msg)
653
+
654
+ # persist results for given CV run and hyperparameter combination
655
+ metrics.to_hdf(self.trainstore, 'metrics/' + key)
656
+ ic_by_day.assign(**params).to_hdf(self.trainstore, 'daily_ic/' + key)
657
+ fi.T.describe().T.assign(**params).to_hdf(self.trainstore, 'fi/' + key)
658
+ cv_preds.to_hdf(self.trainstore, 'predictions/' + key, append=True)
659
+
660
+ def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
661
+ with pd.HDFStore(self.trainstore) as store:
662
+ for i, key in enumerate(
663
+ [k[1:] for k in store.keys() if k[1:].startswith('metrics')]):
664
+ _, t, train_length, test_length = key.split('/')[:4]
665
+ attrs = {
666
+ 'lookahead': t,
667
+ 'train_length': train_length,
668
+ 'test_length': test_length
669
+ }
670
+ s = store[key].to_dict()
671
+ s.update(attrs)
672
+ if i == 0:
673
+ lgb_metrics = pd.Series(s).to_frame(i)
674
+ else:
675
+ lgb_metrics[i] = pd.Series(s)
676
+
677
+ id_vars = scope_params + lgb_train_params + daily_ic_metrics
678
+ lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
679
+ id_vars=id_vars,
680
+ value_name='ic',
681
+ var_name='boost_rounds').dropna().apply(pd.to_numeric)
682
+ return lgb_metrics
683
+
684
+ def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
685
+ lgb_ic = []
686
+ with pd.HDFStore(self.trainstore) as store:
687
+ keys = [k[1:] for k in store.keys()]
688
+ for key in keys:
689
+ _, t, train_length, test_length = key.split('/')[:4]
690
+ if key.startswith('daily_ic'):
691
+ df = (store[key]
692
+ .drop(['boosting', 'objective', 'verbose'], axis=1)
693
+ .assign(lookahead=t,
694
+ train_length=train_length,
695
+ test_length=test_length))
696
+ lgb_ic.append(df)
697
+ lgb_ic = pd.concat(lgb_ic).reset_index()
698
+ lgb_ic = pd.melt(lgb_ic,
699
+ id_vars=id_vars,
700
+ value_name='ic',
701
+ var_name='boost_rounds').dropna()
702
+ lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
703
+ return lgb_ic
704
+
705
+ def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
706
+ param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
707
+ df = data[data.lookahead==t].sort_values('ic', ascending=False).iloc[best]
708
+ return df.loc[param_cols]
709
+
710
+ def _get_lgb_key(self, t, p):
711
+ key = f'{t}/{int(p.train_length)}/{int(p.test_length)}/{p.learning_rate}/'
712
+ return key + f'{int(p.num_leaves)}/{p.feature_fraction}/{int(p.min_data_in_leaf)}'
713
+
714
+ def _select_ic(self, params, ic_data, lookahead):
715
+ return ic_data.loc[(ic_data.lookahead == lookahead) &
716
+ (ic_data.train_length == params.train_length) &
717
+ (ic_data.test_length == params.test_length) &
718
+ (ic_data.learning_rate == params.learning_rate) &
719
+ (ic_data.num_leaves == params.num_leaves) &
720
+ (ic_data.feature_fraction == params.feature_fraction) &
721
+ (ic_data.boost_rounds == params.boost_rounds), ['date', 'ic']].set_index('date')
722
+
723
+ def get_trade_prices(self, tickers, start, end):
724
+ idx = pd.IndexSlice
725
+ with pd.HDFStore(self.datastore) as store:
726
+ data = store.select('stock_data')
727
+ data = data.set_index(['symbol', 'date']).sort_index()
728
+ data = data[~data.index.duplicated()]
729
+ return (data.loc[idx[tickers, start: end], 'open']
730
+ .unstack('symbol')
731
+ .sort_index()
732
+ .shift(-1)
733
+ .tz_convert('UTC'))
734
+
735
+ def plot_ic(self, lgb_ic, lgb_daily_ic, scope_params, lgb_train_params):
736
+ fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
737
+ axes = axes.flatten()
738
+ for i, t in enumerate([1, 21]):
739
+ params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=t,best=0)
740
+ data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
741
+ rolling = data.rolling(63).ic.mean().dropna()
742
+ avg = data.ic.mean()
743
+ med = data.ic.median()
744
+ rolling.plot(ax=axes[i], title=f'Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}')
745
+ axes[i].axhline(avg, c='darkred', lw=1)
746
+ axes[i].axhline(0, ls='--', c='k', lw=1)
747
+
748
+ fig.suptitle('3-Month Rolling Information Coefficient', fontsize=16)
749
+ fig.tight_layout()
750
+ fig.subplots_adjust(top=0.92);
751
+
752
+ def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
753
+ ### Visualization
754
+ sns.jointplot(x=lgb_metrics.daily_ic_mean,y=lgb_metrics.ic);
755
+
756
+ g = sns.catplot(x='lookahead', y='ic',
757
+ col='train_length', row='test_length',
758
+ data=lgb_metrics,
759
+ kind='box');
760
+ g=sns.catplot(x='boost_rounds',
761
+ y='ic',
762
+ col='train_length',
763
+ row='test_length',
764
+ data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
765
+ kind='box');
766
+
767
+ def get_best_predictions(self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10):
768
+ for best in range(topn):
769
+ best_params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best)
770
+ key = self._get_lgb_key(lookahead, best_params)
771
+ rounds = str(int(best_params.boost_rounds))
772
+ if best == 0:
773
+ best_predictions = pd.read_hdf(self.trainstore, 'predictions/' + key)
774
+ best_predictions = best_predictions[rounds].to_frame(best)
775
+ else:
776
+ best_predictions[best] = pd.read_hdf(self.trainstore, 'predictions/' + key)[rounds]
777
+ best_predictions = best_predictions.sort_index()
778
+ best_predictions.reset_index().to_hdf(self.outstore, f'lgb/train/{lookahead:02}')
779
+ return best_predictions
780
+
781
+ def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
782
+ #### Compute Alphalens metrics
783
+ mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
784
+ factor_data,
785
+ by_date=True,
786
+ by_group=False,
787
+ demeaned=True,
788
+ group_adjust=False,
789
+ )
790
+ factor_returns = perf.factor_returns(factor_data)
791
+ mean_quant_ret, std_quantile = perf.mean_return_by_quantile(factor_data,
792
+ by_group=False,
793
+ demeaned=True)
794
+
797
+ mean_quant_rateret = mean_quant_ret.apply(rate_of_return, axis=0,
798
+ base_period=mean_quant_ret.columns[0])
799
+
808
+ mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
809
+ rate_of_return,
810
+ base_period=mean_quant_ret_bydate.columns[0],
811
+ )
812
+
813
+ compstd_quant_daily = std_quant_daily.apply(std_conversion,
814
+ base_period=std_quant_daily.columns[0])
815
+
816
+ alpha_beta = perf.factor_alpha_beta(factor_data,
817
+ demeaned=True)
818
+
819
+ mean_ret_spread_quant, std_spread_quant = perf.compute_mean_returns_spread(
820
+ mean_quant_rateret_bydate,
821
+ factor_data["factor_quantile"].max(),
822
+ factor_data["factor_quantile"].min(),
823
+ std_err=compstd_quant_daily,
824
+ )
825
+ if verbose:
826
+ print(mean_ret_spread_quant.mean().mul(10000).to_frame('Mean Period Wise Spread (bps)').join(alpha_beta.T).T)
827
+
828
+ fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
829
+
830
+
831
+ plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
832
+ plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
833
+ axes[0].set_xlabel('Quantile')
834
+
835
+ plotting.plot_cumulative_returns_by_quantile(mean_quant_ret_bydate['1D'],
836
+ freq=pd.tseries.offsets.BDay(),
837
+ period='1D',
838
+ ax=axes[1])
839
+ axes[1].set_title('Cumulative Return by Quantile (1D Period)')
840
+
841
+ title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
842
+ plotting.plot_cumulative_returns(factor_returns['1D'],
843
+ period='1D',
844
+ freq=pd.tseries.offsets.BDay(),
845
+ title=title,
846
+ ax=axes[2])
847
+
848
+ fig.suptitle('Alphalens - Validation Set Performance', fontsize=14)
849
+ fig.tight_layout()
850
+ fig.subplots_adjust(top=.85);
851
+
852
+ #### Summary Tearsheet
853
+ create_summary_tear_sheet(factor_data)
854
+ create_full_tear_sheet(factor_data)
855
+
856
+ def evaluate(self, remove_instore=False, lookahead=1):
857
+ scope_params = ['lookahead', 'train_length', 'test_length']
858
+ daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
859
+ lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']
860
+
861
+ lgb_metrics = self._get_lgb_metrics(scope_params, lgb_train_params, daily_ic_metrics)
862
+ #### Summary Metrics by Fold
863
+ lgb_metrics.to_hdf(self.outstore, 'lgb/metrics')
864
+
865
+ #### Information Coefficient by Day
866
+ int_cols = ['lookahead', 'train_length', 'test_length', 'boost_rounds']
867
+ id_vars = ['date'] + scope_params + lgb_train_params
868
+ lgb_ic = self._get_lgb_ic(int_cols, scope_params, lgb_train_params, id_vars)
869
+ lgb_ic.to_hdf(self.outstore, 'lgb/ic')
870
+ lgb_daily_ic = lgb_ic.groupby(id_vars[1:] + ['boost_rounds']).ic.mean().to_frame('ic').reset_index()
871
+ lgb_daily_ic.to_hdf(self.outstore, 'lgb/daily_ic')
872
+
873
+ ## Cross-validation Result: Best Hyperparameters
874
+ group_cols = scope_params + lgb_train_params + ['boost_rounds']
875
+ print(lgb_daily_ic.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'ic')))
876
+ # Persist the three best configurations per lookahead horizon
877
+ lgb_metrics.groupby('lookahead', group_keys=False
878
+ ).apply(lambda x: x.nlargest(3, 'ic')).to_hdf(self.outstore, 'lgb/best_model')
879
+ print(lgb_metrics.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'daily_ic_mean')))
880
+
881
+ ### Visualization
882
+ self.plot_metrics(lgb_metrics, lgb_daily_ic, t=1)
883
+
884
+ ## AlphaLens Analysis - Validation Performance
885
+ lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
886
+ best_params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=5, best=0)
887
+ best_params.to_hdf(self.outstore, 'lgb/best_params')
888
+
889
+ self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
890
+
891
+ #### Get Predictions for Validation Period
892
+ best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
893
+ lookahead=lookahead, topn=10)
894
+ test_tickers = best_predictions.index.get_level_values('symbol').unique()
895
+ start = best_predictions.index.get_level_values('date').min()
896
+ end = best_predictions.index.get_level_values('date').max()
897
+ trade_prices = self.get_trade_prices(test_tickers, start, end)
898
+ trade_prices.to_hdf(self.outstore, 'trade_prices/model_selection')
899
+ pd.Series(test_tickers).to_hdf(self.outstore, 'lgb/tickers')
900
+ # We average the top five models and provide the corresponding prices to Alphalens, in order to compute the mean period-wise
901
+ # return earned on an equal-weighted portfolio invested in the daily factor quintiles for various holding periods:
902
+ factor = best_predictions.iloc[:, :5].mean(1).dropna().tz_convert('UTC', level='date').swaplevel()
903
+ # Create Alphalens inputs
904
+ factor_data = get_clean_factor_and_forward_returns(factor=factor,
905
+ prices=trade_prices,
906
+ quantiles=5,
907
+ periods=(1, 5, 10, 21),
908
+ max_loss=1)
909
+ self.apply_alphalen_analysis(factor_data, tearsheet=True, verbose=True)
910
+ # Delete the temporary files
911
+ if remove_instore:
912
+ os.remove(self.trainstore)
913
+
914
+ def make_predictions(self, data: pd.DataFrame, lookahead=1, verbose=True):
915
+ YEAR = 252
916
+ idx = pd.IndexSlice
917
+ scope_params = ['lookahead', 'train_length', 'test_length']
918
+ daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
919
+ lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']
920
+
921
+ base_params = dict(boosting='gbdt',
922
+ objective='regression',
923
+ verbose=-1)
924
+
925
+ categoricals = ['year', 'month', 'sector', 'weekday']
926
+ data = data.sort_index()
927
+ labels = sorted(data.filter(like='_fwd').columns)
928
+ features = data.columns.difference(labels).tolist()
929
+ label = f'r{lookahead:02}_fwd'
930
+ for feature in categoricals:
931
+ data[feature] = pd.factorize(data[feature], sort=True)[0]
932
+
933
+ lgb_data = lgb.Dataset(data=data[features],
934
+ label=data[label],
935
+ categorical_feature=categoricals,
936
+ free_raw_data=False)
937
+ ### Generate predictions
938
+ lgb_ic = pd.read_hdf(self.outstore, 'lgb/ic')
939
+ lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
940
+
941
+ for position in range(10):
942
+ params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position)
943
+
944
+ params = params.to_dict()
945
+
946
+ for p in ['min_data_in_leaf', 'num_leaves']:
947
+ params[p] = int(params[p])
948
+ train_length = int(params.pop('train_length'))
949
+ test_length = int(params.pop('test_length'))
950
+ num_boost_round = int(params.pop('boost_rounds'))
951
+ params.update(base_params)
952
+
953
+ print(f'\nPosition: {position:02}')
954
+
955
+ # 1-year out-of-sample period
956
+ n_splits = int(YEAR / test_length)
957
+ cv = MultipleTimeSeriesCV(n_splits=n_splits,
958
+ test_period_length=test_length,
959
+ lookahead=lookahead,
960
+ train_period_length=train_length)
961
+
962
+ predictions = []
963
+ start = time()
964
+ for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
965
+ print(i, end=' ', flush=True)
966
+ lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
967
+ params=params).construct()
968
+
969
+ model = lgb.train(params=params,
970
+ train_set=lgb_train,
971
+ num_boost_round=num_boost_round,
972
+ )
973
+
974
+ test_set = data.iloc[test_idx, :]
975
+ y_test = test_set.loc[:, label].to_frame('y_test')
976
+ y_pred = model.predict(test_set.loc[:, model.feature_name()])
977
+ predictions.append(y_test.assign(prediction=y_pred))
978
+
979
+ if position == 0:
980
+ test_predictions = (pd.concat(predictions)
981
+ .rename(columns={'prediction': position}))
982
+ else:
983
+ test_predictions[position] = pd.concat(predictions).prediction
984
+
985
+ by_day = test_predictions.groupby(level='date')
986
+ for position in range(10):
987
+ if position == 0:
988
+ ic_by_day = by_day.apply(lambda x: spearmanr(
989
+ x.y_test, x[position])[0]).to_frame()
990
+ else:
991
+ ic_by_day[position] = by_day.apply(
992
+ lambda x: spearmanr(x.y_test, x[position])[0])
993
+ if verbose:
994
+ print(ic_by_day.describe())
995
+ test_predictions.reset_index().to_hdf(self.outstore, f'lgb/test/{lookahead:02}')
996
+ return test_predictions
997
+
998
+ def load_predictions(self, predictions=None, lookahead=1):
999
+ if predictions is None:
1000
+ predictions = pd.concat([
1001
+ pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
1002
+ pd.read_hdf(self.outstore, f'lgb/test/{lookahead:02}').drop('y_test', axis=1)
1003
+ ])
1004
+ predictions = predictions.set_index(['symbol', 'date'])
1005
+
1006
+ predictions = (predictions.loc[~predictions.index.duplicated()]
1007
+ .iloc[:, :10]
1008
+ .mean(1)
1009
+ .sort_index()
1010
+ .dropna()
1011
+ .to_frame('prediction'))
1012
+ tickers = predictions.index.get_level_values('symbol').unique().tolist()
1013
+ return (predictions
1014
+ .unstack('symbol')
1015
+ .prediction
1016
+ .tz_convert('UTC')), tickers
1017
+
1018
+ def assert_last_date(self, predictions: pd.DataFrame):
1019
+ """
1020
+ Useful in live trading to ensure that the last date in the predictions
1021
+ is the previous day, so the model predicts today's returns.
1022
+ """
1023
+ last_date = predictions.index.get_level_values('date').max()
1024
+ if last_date.tzinfo is None:
1025
+ last_date = last_date.tz_localize('UTC')
1026
+ assert last_date == (pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=1)).normalize()
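
The splitter classes added above can also be driven on their own. The snippet below is a minimal sketch, not part of the package: the synthetic panel, symbols, and split lengths are hypothetical, and only the class name and constructor arguments come from the file shown in this diff.

import numpy as np
import pandas as pd

from bbstrader.models.ml import MultipleTimeSeriesCV  # module path as added in this diff

# Synthetic (symbol, date) panel purely for illustration.
dates = pd.bdate_range('2020-01-01', periods=756)   # roughly three years of business days
symbols = ['AAA', 'BBB', 'CCC']                     # hypothetical tickers
index = pd.MultiIndex.from_product([symbols, dates], names=['symbol', 'date'])
panel = pd.DataFrame({'feature': np.random.randn(len(index)),
                      'r01_fwd': np.random.randn(len(index))}, index=index)

# One year of training data per fold, 63-day test windows, 1-day purge gap (lookahead).
cv = MultipleTimeSeriesCV(n_splits=4,
                          train_period_length=252,
                          test_period_length=63,
                          lookahead=1)

for fold, (train_idx, test_idx) in enumerate(cv.split(panel)):
    # The splitter yields positional row indices; select the folds with .iloc.
    train, test = panel.iloc[train_idx], panel.iloc[test_idx]
    print(f'fold {fold}: train rows={len(train)}, test rows={len(test)}')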
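
For orientation, here is a minimal end-to-end usage sketch of the LightGBModel workflow described in the class docstring (download, prepare, fit, evaluate, predict). It is a sketch rather than part of the package: the ticker list, start date, and universe size are placeholders, and the calls simply chain the public methods defined in this file with the default HDF5 store paths shown above.

from bbstrader.models.ml import LightGBModel  # module path as added in this diff

# Hypothetical universe and sample period -- replace with your own.
tickers = ['AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL']
start = '2012-01-01'

model = LightGBModel(datastore='lgbdata.h5',
                     trainstore='lgbtrain.h5',
                     outstore='lgbout.h5')

# 1) Download OHLCV data and ticker metadata from yfinance.
prices = model.download_boosting_data(tickers, start=start)
metadata = model.download_metadata(tickers)

# 2) Engineer technical and metadata features, then persist the model dataset.
data = model.prepare_boosting_data(prices, metadata, min_years=7, universe=len(tickers))

# 3) Randomized hyperparameter search with purged time-series cross-validation.
model.fit(data, verbose=True)

# 4) Aggregate CV metrics, rank configurations by IC and run the Alphalens analysis.
model.evaluate(lookahead=1)

# 5) Out-of-sample predictions from the best configurations, then load the averaged signal.
model.make_predictions(model.load_model_data(), lookahead=1)
predictions, traded_tickers = model.load_predictions(lookahead=1)
print(predictions.tail())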