bbstrader 0.1.94__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bbstrader might be problematic.

Files changed (38)
  1. bbstrader/__init__.py +9 -9
  2. bbstrader/btengine/__init__.py +7 -7
  3. bbstrader/btengine/backtest.py +30 -26
  4. bbstrader/btengine/data.py +100 -79
  5. bbstrader/btengine/event.py +2 -1
  6. bbstrader/btengine/execution.py +18 -16
  7. bbstrader/btengine/performance.py +11 -7
  8. bbstrader/btengine/portfolio.py +35 -36
  9. bbstrader/btengine/strategy.py +119 -94
  10. bbstrader/config.py +14 -8
  11. bbstrader/core/__init__.py +0 -0
  12. bbstrader/core/data.py +22 -0
  13. bbstrader/core/utils.py +57 -0
  14. bbstrader/ibkr/__init__.py +0 -0
  15. bbstrader/ibkr/utils.py +0 -0
  16. bbstrader/metatrader/__init__.py +5 -5
  17. bbstrader/metatrader/account.py +117 -121
  18. bbstrader/metatrader/rates.py +83 -80
  19. bbstrader/metatrader/risk.py +23 -37
  20. bbstrader/metatrader/trade.py +169 -140
  21. bbstrader/metatrader/utils.py +3 -3
  22. bbstrader/models/__init__.py +5 -5
  23. bbstrader/models/factors.py +280 -0
  24. bbstrader/models/ml.py +1092 -0
  25. bbstrader/models/optimization.py +31 -28
  26. bbstrader/models/{portfolios.py → portfolio.py} +64 -46
  27. bbstrader/models/risk.py +15 -9
  28. bbstrader/trading/__init__.py +2 -2
  29. bbstrader/trading/execution.py +252 -164
  30. bbstrader/trading/scripts.py +8 -4
  31. bbstrader/trading/strategies.py +79 -66
  32. bbstrader/tseries.py +482 -107
  33. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/LICENSE +1 -1
  34. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/METADATA +6 -1
  35. bbstrader-0.2.1.dist-info/RECORD +37 -0
  36. bbstrader-0.1.94.dist-info/RECORD +0 -32
  37. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/WHEEL +0 -0
  38. {bbstrader-0.1.94.dist-info → bbstrader-0.2.1.dist-info}/top_level.txt +0 -0
bbstrader/models/ml.py CHANGED
@@ -0,0 +1,1092 @@
+ import os
+ import warnings
+ from datetime import datetime
+ from itertools import product
+ from time import time
+
+ import lightgbm as lgb
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ import talib
+ import yfinance as yf
+ from alphalens import performance as perf
+ from alphalens import plotting
+ from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
+ from alphalens.utils import (
+     get_clean_factor_and_forward_returns,
+     rate_of_return,
+     std_conversion,
+ )
+ from scipy.stats import spearmanr
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ from talib import ATR, BBANDS, MACD, RSI
+
+ warnings.filterwarnings('ignore')
+
+
+ __all__ = [
+     'OneStepTimeSeriesSplit',
+     'MultipleTimeSeriesCV',
+     'LightGBModel'
+ ]
+
+
+ class OneStepTimeSeriesSplit:
+     __author__ = "Stefan Jansen"
+     """Generates tuples of train_idx, test_idx pairs
+     Assumes the index contains a level labeled 'date'"""
+
+     def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
+         self.n_splits = n_splits
+         self.test_period_length = test_period_length
+         self.shuffle = shuffle
+
+     @staticmethod
+     def chunks(l, n):  # noqa: E741
+         for i in range(0, len(l), n):
+             yield l[i:i + n]
+
+     def split(self, X: pd.DataFrame, y=None, groups=None):
+         unique_dates = (X.index
+                         .get_level_values('date')
+                         .unique()
+                         .sort_values(ascending=False)
+                         [:self.n_splits*self.test_period_length])
+
+         dates = X.reset_index()[['date']]
+         for test_date in self.chunks(unique_dates, self.test_period_length):
+             train_idx = dates[dates.date < min(test_date)].index
+             test_idx = dates[dates.date.isin(test_date)].index
+             if self.shuffle:
+                 # shuffle the training indices before yielding
+                 train_idx = np.random.permutation(train_idx)
+             yield train_idx, test_idx
+
+     def get_n_splits(self, X, y, groups=None):
+         return self.n_splits
+
+
+ class MultipleTimeSeriesCV:
+     __author__ = "Stefan Jansen"
+     """
+     Generates tuples of train_idx, test_idx pairs
+     Assumes the MultiIndex contains levels 'symbol' and 'date'
+     purges overlapping outcomes
+     """
+
+     def __init__(self,
+                  n_splits=3,
+                  train_period_length=126,
+                  test_period_length=21,
+                  lookahead=None,
+                  date_idx='date',
+                  shuffle=False):
+         self.n_splits = n_splits
+         self.lookahead = lookahead
+         self.test_length = test_period_length
+         self.train_length = train_period_length
+         self.shuffle = shuffle
+         self.date_idx = date_idx
+
+     def split(self, X: pd.DataFrame, y=None, groups=None):
+         unique_dates = X.index.get_level_values(self.date_idx).unique()
+         days = sorted(unique_dates, reverse=True)
+         split_idx = []
+         for i in range(self.n_splits):
+             test_end_idx = i * self.test_length
+             test_start_idx = test_end_idx + self.test_length
+             train_end_idx = test_start_idx + self.lookahead - 1
+             train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
+             split_idx.append([train_start_idx, train_end_idx,
+                               test_start_idx, test_end_idx])
+
+         dates = X.reset_index()[[self.date_idx]]
+         for train_start, train_end, test_start, test_end in split_idx:
+
+             train_idx = dates[(dates[self.date_idx] > days[train_start])
+                               & (dates[self.date_idx] <= days[train_end])].index
+             test_idx = dates[(dates[self.date_idx] > days[test_start])
+                              & (dates[self.date_idx] <= days[test_end])].index
+             train_idx = train_idx.to_numpy()
+             if self.shuffle:
+                 # shuffle the training indices in place
+                 np.random.shuffle(train_idx)
+             yield train_idx, test_idx.to_numpy()
+
+     def get_n_splits(self, X, y, groups=None):
+         return self.n_splits
+
+
+ class LightGBModel(object):
+     """
+     ``LightGBModel`` encapsulates a complete workflow for training and evaluating
+     a ``LightGBM (Light Gradient Boosting Machine)`` model for predicting stock returns.
+     It includes data acquisition, feature engineering, model tuning, and performance
+     evaluation using the information coefficient (IC) and Alphalens analysis.
+
+     Key Features
+     ------------
+     - ``HDF5 Storage``: Uses ``pandas.HDFStore`` for efficient storage and retrieval
+       of large datasets, which is essential for backtesting on financial time series data.
+
+     - ``Time-Series Cross-Validation``: Employs a custom cross-validation strategy that
+       respects the time-series nature of the data, avoiding data leakage.
+
+     - ``Hyperparameter Tuning``: Includes automated hyperparameter tuning using a
+       randomized grid search.
+
+     - ``Information Coefficient (IC)``: Uses the IC as the core performance metric; it
+       quantifies the predictive power of the model and is a standard measure for ranking
+       models in finance.
+
+     - ``Alphalens Integration``: Provides a comprehensive framework for validating model
+       performance with Alphalens, allowing in-depth analysis such as backtesting and
+       return decomposition.
+
+     Use Case
+     --------
+     This class is designed for quantitative finance and algorithmic trading use cases where
+     the goal is to build a predictive model for stock returns based on historical data and
+     technical indicators. It covers the complete cycle from data acquisition to model
+     validation and provides the infrastructure needed to deploy the model in a trading
+     strategy.
+
+     Notes
+     -----
+     The implementation is inspired by the book "Machine Learning for Algorithmic Trading"
+     by Stefan Jansen.
+
+     References
+     ----------
+     Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
+     Chapter 12, Boosting Your Trading Strategy.
+     """
+
+     def __init__(self,
+                  data: pd.DataFrame = None,
+                  datastore: str = 'lgbdata.h5',
+                  trainstore: str = 'lgbtrain.h5',
+                  outstore: str = 'lgbout.h5'
+                  ):
+         """
+         Args:
+             data (pd.DataFrame): The input data for the model. It should be a DataFrame with a
+                 MultiIndex containing 'symbol' and 'date' levels. If not provided, the data can
+                 be downloaded using the `download_boosting_data` method.
+             datastore (str): The path to the HDF5 file for storing the model data.
+             trainstore (str): The path to the HDF5 file for storing the training data.
+             outstore (str): The path to the HDF5 file for storing the output data.
+         """
+         self.datastore = datastore
+         self.trainstore = trainstore
+         self.outstore = outstore
+         if data is not None:
+             data.reset_index().to_hdf(self.datastore, 'model_data')
+
+     def _compute_bb(self, close):
+         high, mid, low = BBANDS(close, timeperiod=20)
+         return pd.DataFrame({'bb_high': high, 'bb_low': low}, index=close.index)
+
+     def _compute_atr(self, stock_data):
+         df = ATR(stock_data.high, stock_data.low,
+                  stock_data.close, timeperiod=14)
+         return df.sub(df.mean()).div(df.std())
+
+     def _compute_macd(self, close):
+         macd = MACD(close)[0]
+         return (macd - np.mean(macd))/np.std(macd)
+
+     def _add_technical_indicators(self, prices: pd.DataFrame):
+         prices = prices.copy()
+         prices['rsi'] = prices.groupby(level='symbol').close.apply(
+             lambda x: RSI(x).reset_index(level=0, drop=True))
+         bb = prices.groupby(level=0).close.apply(
+             self._compute_bb).reset_index(level=1, drop=True)
+         prices = prices.join(bb)
+         prices['bb_high'] = prices.bb_high.sub(
+             prices.close).div(prices.bb_high).apply(np.log1p)
+         prices['bb_low'] = prices.close.sub(
+             prices.bb_low).div(prices.close).apply(np.log1p)
+         prices['NATR'] = prices.groupby(level='symbol', group_keys=False).apply(
+             lambda x: talib.NATR(x.high, x.low, x.close))
+
+         prices['ATR'] = (prices.groupby('symbol', group_keys=False)
+                          .apply(self._compute_atr))
+         prices['PPO'] = prices.groupby(level='symbol').close.apply(
+             lambda x: talib.PPO(x).reset_index(level=0, drop=True))
+         prices['MACD'] = (prices
+                           .groupby('symbol', group_keys=False)
+                           .close
+                           .apply(self._compute_macd))
+         return prices
+
+     def download_boosting_data(self, tickers, start, end=None):
+         data = []
+         for ticker in tickers:
+             try:
+                 prices = yf.download(
+                     ticker, start=start, end=end, progress=False, multi_level_index=False)
+                 prices['symbol'] = ticker
+                 data.append(prices)
+             except:  # noqa: E722
+                 continue
+         data = pd.concat(data)
+         data = (data
+                 .rename(columns={s: s.lower().replace(' ', '_') for s in data.columns})
+                 .drop(columns=['adj_close'])
+                 .set_index('symbol', append=True).swaplevel()
+                 .sort_index()
+                 .dropna())
+         return data
+
+     def download_metadata(self, tickers):
+
+         def clean_text_column(series: pd.Series) -> pd.Series:
+             return (
+                 series.str.lower()
+                 # use regex=False for literal string replacements
+                 .str.replace('-', '', regex=False)
+                 .str.replace('&', 'and', regex=False)
+                 .str.replace(' ', '_', regex=False)
+                 .str.replace('__', '_', regex=False)
+             )
+
+         metadata = ['industry', 'sector', 'exchange', 'symbol',
+                     'heldPercentInsiders', 'heldPercentInstitutions',
+                     'overallRisk', 'shortRatio', 'dividendYield', 'beta',
+                     'regularMarketVolume', 'averageVolume', 'averageVolume10days',
+                     'bid', 'ask', 'bidSize', 'askSize', 'marketCap']
+
+         columns = {
+             'industry': 'industry',
+             'sector': 'sector',
+             'exchange': 'exchange',
+             'symbol': 'symbol',
+             'heldPercentInsiders': 'insiders',
+             'heldPercentInstitutions': 'institutions',
+             'overallRisk': 'risk',
+             'shortRatio': 'short_ratio',
+             'dividendYield': 'dyield',
+             'beta': 'beta',
+             'regularMarketVolume': 'regvolume',
+             'averageVolume': 'avgvolume',
+             'averageVolume10days': 'avgvolume10',
+             'bid': 'bid',
+             'ask': 'ask',
+             'bidSize': 'bidsize',
+             'askSize': 'asksize',
+             'marketCap': 'marketcap'
+         }
+         data = []
+         for symbol in tickers:
+             try:
+                 symbol_info = yf.Ticker(symbol).info
+             except:  # noqa: E722
+                 continue
+             infos = {}
+             for info in metadata:
+                 infos[info] = symbol_info.get(info)
+             data.append(infos)
+         metadata = pd.DataFrame(data)
+         metadata = metadata.rename(columns=columns)
+         metadata.dyield = metadata.dyield.fillna(0)
+         metadata.sector = clean_text_column(metadata.sector)
+         metadata.industry = clean_text_column(metadata.industry)
+         metadata = metadata.set_index('symbol')
+         return metadata
+
+     def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
+                                           volume_features, bid_ask_features, market_cap_feature):
+         df = df.copy()
+         scaler = StandardScaler()
+
+         # Normalize features
+         df[volume_features] = scaler.fit_transform(df[volume_features])
+         df['bid_ask_spread'] = df['ask'] - df['bid']
+         df['bid_ask_spread'] = scaler.fit_transform(df[['bid_ask_spread']])
+         df[market_cap_feature] = scaler.fit_transform(df[market_cap_feature])
+
+         # Calculate Liquidity Score
+         # Assign weights to each component (these weights can be adjusted based on importance)
+         weights = {
+             'volume': 0.4,
+             'bid_ask_spread': 0.2,
+             'marketCap': 0.4
+         }
+
+         # Calculate the liquidity score by combining the normalized features
+         df['liquidity_score'] = (weights['volume'] * df[volume_features].mean(axis=1) +
+                                  weights['bid_ask_spread'] * df['bid_ask_spread'] +
+                                  weights['marketCap'] * df[market_cap_feature[0]])
+         df_sorted = df.sort_values(by='liquidity_score', ascending=False)
+
+         return df_sorted.nlargest(n, 'liquidity_score').index
+
+     def _encode_metadata(self, df: pd.DataFrame):
+         df = df.copy()
+         # Binning each numerical feature into categories
+         df['insiders'] = pd.qcut(
+             df['insiders'], q=4,
+             labels=['Very Low', 'Low', 'High', 'Very High']
+         )
+         df['institutions'] = pd.qcut(
+             df['institutions'], q=4,
+             labels=['Very Low', 'Low', 'High', 'Very High']
+         )
+         df['risk'] = pd.cut(
+             df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
+             labels=['Low', 'Medium', 'High', 'Very High']
+         )
+         df['short_ratio'] = pd.qcut(
+             df['short_ratio'], q=4,
+             labels=['Very Low', 'Low', 'High', 'Very High']
+         )
+         df['dyield'] = pd.cut(
+             df['dyield'],
+             bins=[-float('inf'), 0.002, 0.005, 0.01, float('inf')],
+             labels=['Very Low', 'Low', 'High', 'Very High']
+         )
+         df['beta'] = pd.cut(
+             df['beta'],
+             bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
+             labels=['Low', 'Moderate', 'High', 'Very High']
+         )
+
+         # Encode binned features
+         binned_features = [
+             'insiders', 'institutions',
+             'risk', 'short_ratio', 'dyield',
+             'beta', 'sector', 'industry', 'exchange',
+         ]
+         label_encoders = {}
+
+         for col in binned_features:
+             le = LabelEncoder()
+             df[col] = le.fit_transform(df[col])
+             label_encoders[col] = le
+         return df, label_encoders
+
+     def prepare_boosting_data(self,
+                               prices: pd.DataFrame,
+                               metadata: pd.DataFrame = None,
+                               min_years=7,
+                               universe=500
+                               ):
+         if metadata is None:
+             mcap = False
+             tickers = prices.index.get_level_values('symbol').unique()
+             metadata = self.download_metadata(tickers)
+         else:
+             mcap = True
+         YEAR = 252
+         idx = pd.IndexSlice
+         percentiles = [.001, .01, .02, .03, .04, .05]
+         percentiles += [1-p for p in percentiles[::-1]]
+         T = [1, 5, 10, 21, 42, 63]
+
+         prices.volume /= 1e3  # make vol figures a bit smaller
+         prices.index.names = ['symbol', 'date']
+         metadata.index.name = 'symbol'
+         prices.reset_index().to_hdf(self.datastore, 'stock_data')
+         metadata.reset_index().to_hdf(self.datastore, 'stock_metadata')
+
+         # Remove stocks with insufficient observations
+         min_obs = min_years * YEAR
+         nobs = prices.groupby(level='symbol').size()
+         keep = nobs[nobs > min_obs].index
+         prices = prices.loc[idx[keep, :], :]
+
+         # Remove duplicate symbols
+         prices = prices[~prices.index.duplicated()]
+
+         # Align price and meta data
+         metadata = metadata[~metadata.index.duplicated() &
+                             metadata.sector.notnull()]
+         metadata.sector = metadata.sector.str.lower().str.replace(' ', '_')
+         shared = (prices.index.get_level_values('symbol').unique()
+                   .intersection(metadata.index))
+         metadata = metadata.loc[shared, :]
+         prices = prices.loc[idx[shared, :], :]
+
+         # Limit universe
+         if mcap:
+             universe = metadata.marketcap.nlargest(universe).index
+         else:
+             volume_features = ['regvolume', 'avgvolume', 'avgvolume10']
+             bid_ask_features = ['bid', 'ask', 'bidsize', 'asksize']
+             market_cap_feature = ['marketcap']
+             to_drop = volume_features + bid_ask_features + market_cap_feature
+             universe = self._select_nlargest_liquidity_stocks(
+                 metadata, universe, volume_features, bid_ask_features, market_cap_feature
+             )
+             metadata = metadata.drop(to_drop, axis=1)
+         prices = prices.loc[idx[universe, :], :]
+         metadata = metadata.loc[universe]
+         metadata = self._encode_metadata(metadata)[0]
+
+         prices['dollar_vol'] = prices[['close', 'volume']].prod(1).div(1e3)
+         # compute dollar volume to determine universe
+         dollar_vol_ma = (prices
+                          .dollar_vol
+                          .unstack('symbol')
+                          .rolling(window=21, min_periods=1)  # 1 trading month
+                          .mean())
+
+         # Rank stocks by moving average
+         prices['dollar_vol_rank'] = (dollar_vol_ma
+                                      .rank(axis=1, ascending=False)
+                                      .stack('symbol')
+                                      .swaplevel())
+         # Add some Basic Factors
+         prices = self._add_technical_indicators(prices)
+         # Combine Price and Meta Data
+         prices = prices.join(metadata)
+
+         # Compute Returns
+         by_sym = prices.groupby(level='symbol').close
+         for t in T:
+             prices[f'r{t:02}'] = by_sym.pct_change(t)
+         # Daily historical return deciles
+         for t in T:
+             # Reset the index to apply qcut by date without grouping errors
+             prices[f'r{t:02}dec'] = (prices.reset_index(level='date')
+                                      .groupby('date')[f'r{t:02}']
+                                      .apply(lambda x: pd.qcut(x,
+                                                               q=10,
+                                                               labels=False,
+                                                               duplicates='drop'))
+                                      .values)
+         # Daily sector return deciles
+         for t in T:
+             prices[f'r{t:02}q_sector'] = (
+                 prices
+                 .groupby(['date', 'sector'])[f'r{t:02}']
+                 .transform(lambda x: pd.qcut(
+                     x,
+                     q=5,
+                     labels=False,
+                     duplicates='drop'))
+             )
+         # Compute Forward Returns
+         for t in [1, 5, 21]:
+             prices[f'r{t:02}_fwd'] = prices.groupby(
+                 level='symbol')[f'r{t:02}'].shift(-t)
+
+         # Remove outliers
+         outliers = prices[prices.r01 > 1].index.get_level_values(
+             'symbol').unique()
+         prices = prices.drop(outliers, level='symbol')
+         # Create time and sector dummy variables
+         prices['year'] = prices.index.get_level_values('date').year
+         prices['month'] = prices.index.get_level_values('date').month
+         prices['weekday'] = prices.index.get_level_values('date').weekday
+         # Store Model Data
+         prices = prices.drop(
+             ['open', 'close', 'low', 'high', 'volume'], axis=1)
+         if 'adj_close' in prices.columns:
+             prices = prices.drop('adj_close', axis=1)
+         prices.reset_index().to_hdf(self.datastore, 'model_data')
+         return prices.sort_index()
+
+     def tickers(self):
+         return pd.read_hdf(self.outstore, 'lgb/tickers').tolist()
+
+     def load_model_data(self, key='model_data'):
+         return pd.read_hdf(self.datastore, key=key).set_index(['symbol', 'date']).sort_index()
+
+     def format_time(self, t):
+         """Return a formatted time string 'HH:MM:SS'
+         based on a numeric time() value"""
+         m, s = divmod(t, 60)
+         h, m = divmod(m, 60)
+         return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'
+
+     def fit(self, data: pd.DataFrame, verbose=True):
+         def get_fi(model):
+             """Return normalized feature importance as pd.Series"""
+             fi = model.feature_importance(importance_type='gain')
+             return (pd.Series(fi / fi.sum(),
+                               index=model.feature_name()))
+
+         def ic_lgbm(preds, train_data):
+             """Custom IC eval metric for lightgbm"""
+             is_higher_better = True
+             return 'ic', spearmanr(preds, train_data.get_label())[0], is_higher_better
+
+         data = data.dropna()
+         # Hyperparameter options
+         YEAR = 252
+         base_params = dict(boosting='gbdt',
+                            objective='regression',
+                            verbose=-1)
+
+         # constraints on structure (depth) of each tree
+         max_depths = [2, 3, 5, 7]
+         num_leaves_opts = [2 ** i for i in max_depths]
+         min_data_in_leaf_opts = [250, 500, 1000]
+
+         # weight of each new tree in the ensemble
+         learning_rate_ops = [.01, .1, .3]
+
+         # random feature selection
+         feature_fraction_opts = [.3, .6, .95]
+
+         param_names = ['learning_rate', 'num_leaves',
+                        'feature_fraction', 'min_data_in_leaf']
+
+         cv_params = list(product(learning_rate_ops,
+                                  num_leaves_opts,
+                                  feature_fraction_opts,
+                                  min_data_in_leaf_opts))
+         n_params = len(cv_params)
+         print(f'# Parameters: {n_params}')
+
+         # Train/Test Period Lengths
+         lookaheads = [1, 5, 21]
+         train_lengths = [int(4.5 * 252), 252]
+         test_lengths = [63]
+         test_params = list(product(lookaheads, train_lengths, test_lengths))
+         n = len(test_params)
+         test_param_sample = np.random.choice(
+             list(range(n)), size=int(n), replace=False)
+         test_params = [test_params[i] for i in test_param_sample]
+         print('Train configs:', len(test_params))
+
+         # Categorical Variables
+         categoricals = ['year', 'weekday', 'month']
+         for feature in categoricals:
+             data[feature] = pd.factorize(data[feature], sort=True)[0]
+
+         # Run Cross-Validation
+         labels = sorted(data.filter(like='fwd').columns)
+         features = data.columns.difference(labels).tolist()
+         label_dict = dict(zip(lookaheads, labels))
+         num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
+         num_boost_round = num_iterations[-1]
+
+         metric_cols = (param_names + ['t', 'daily_ic_mean', 'daily_ic_mean_n',
+                                       'daily_ic_median', 'daily_ic_median_n'] +
+                        [str(n) for n in num_iterations])
+
+         for lookahead, train_length, test_length in test_params:
+             # randomized grid search
+             cvp = np.random.choice(list(range(n_params)),
+                                    size=int(n_params / 2),
+                                    replace=False)
+             cv_params_ = [cv_params[i] for i in cvp]
+
+             # set up cross-validation
+             n_splits = int(2 * YEAR / test_length)
+             if verbose:
+                 print(f'Lookahead: {lookahead:2.0f} | '
+                       f'Train: {train_length:3.0f} | '
+                       f'Test: {test_length:2.0f} | '
+                       f'Params: {len(cv_params_):3.0f} | '
+                       f'Train configs: {len(test_params)}')
+
+             # time-series cross-validation
+             cv = MultipleTimeSeriesCV(n_splits=n_splits,
+                                       lookahead=lookahead,
+                                       test_period_length=test_length,
+                                       train_period_length=train_length)
+
+             label = label_dict[lookahead]
+             outcome_data = data.loc[:, features + [label]].dropna()
+
+             # binary dataset
+             lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
+                                    label=outcome_data[label],
+                                    categorical_feature=categoricals,
+                                    free_raw_data=False)
+             T = 0
+             predictions, metrics = [], []
+
+             # iterate over (shuffled) hyperparameter combinations
+             for p, param_vals in enumerate(cv_params_):
+                 key = f'{lookahead}/{train_length}/{test_length}/' + \
+                     '/'.join([str(p) for p in param_vals])
+                 params = dict(zip(param_names, param_vals))
+                 params.update(base_params)
+
+                 start = time()
+                 cv_preds = []
+
+                 # iterate over folds
+                 for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
+
+                     # select train subset
+                     lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
+                                                 params=params).construct()
+
+                     # train model for num_boost_round
+                     model = lgb.train(params=params,
+                                       train_set=lgb_train,
+                                       num_boost_round=num_boost_round,
+                                       )
+                     # log feature importance
+                     if i == 0:
+                         fi = get_fi(model).to_frame()
+                     else:
+                         fi[i] = get_fi(model)
+
+                     # capture predictions
+                     test_set = outcome_data.iloc[test_idx, :]
+                     X_test = test_set.loc[:, model.feature_name()]
+                     y_test = test_set.loc[:, label]
+                     y_pred = {str(n): model.predict(X_test, num_iteration=n)
+                               for n in num_iterations}
+
+                     # record predictions for each fold
+                     cv_preds.append(y_test.to_frame(
+                         'y_test').assign(**y_pred).assign(i=i))
+
+                 # combine fold results
+                 cv_preds = pd.concat(cv_preds).assign(**params)
+                 predictions.append(cv_preds)
+
+                 # compute IC per day
+                 by_day = cv_preds.groupby(level='date')
+                 ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
+                                        for n in num_iterations], axis=1)
+                 daily_ic_mean = ic_by_day.mean()
+                 daily_ic_mean_n = daily_ic_mean.idxmax()
+                 daily_ic_median = ic_by_day.median()
+                 daily_ic_median_n = daily_ic_median.idxmax()
+
+                 # compute IC across all predictions
+                 ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
+                       for n in num_iterations]
+                 t = time() - start
+                 T += t
+
+                 # collect metrics
+                 metrics = pd.Series(list(param_vals) +
+                                     [t, daily_ic_mean.max(), daily_ic_mean_n,
+                                      daily_ic_median.max(), daily_ic_median_n] + ic,
+                                     index=metric_cols)
+                 if verbose:
+                     msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
+                     msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
+                     msg += f' {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}'
+                     print(msg)
+
+                 # persist results for given CV run and hyperparameter combination
+                 metrics.to_hdf(self.trainstore, 'metrics/' + key)
+                 ic_by_day.assign(
+                     **params).to_hdf(self.trainstore, 'daily_ic/' + key)
+                 fi.T.describe().T.assign(**params).to_hdf(self.trainstore, 'fi/' + key)
+                 cv_preds.to_hdf(self.trainstore,
+                                 'predictions/' + key, append=True)
+
+     def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
+         with pd.HDFStore(self.trainstore) as store:
+             for i, key in enumerate(
+                     [k[1:] for k in store.keys() if k[1:].startswith('metrics')]):
+                 _, t, train_length, test_length = key.split('/')[:4]
+                 attrs = {
+                     'lookahead': t,
+                     'train_length': train_length,
+                     'test_length': test_length
+                 }
+                 s = store[key].to_dict()
+                 s.update(attrs)
+                 if i == 0:
+                     lgb_metrics = pd.Series(s).to_frame(i)
+                 else:
+                     lgb_metrics[i] = pd.Series(s)
+
+         id_vars = scope_params + lgb_train_params + daily_ic_metrics
+         lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
+                               id_vars=id_vars,
+                               value_name='ic',
+                               var_name='boost_rounds').dropna().apply(pd.to_numeric)
+         return lgb_metrics
+
+     def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
+         lgb_ic = []
+         with pd.HDFStore(self.trainstore) as store:
+             keys = [k[1:] for k in store.keys()]
+             for key in keys:
+                 _, t, train_length, test_length = key.split('/')[:4]
+                 if key.startswith('daily_ic'):
+                     df = (store[key]
+                           .drop(['boosting', 'objective', 'verbose'], axis=1)
+                           .assign(lookahead=t,
+                                   train_length=train_length,
+                                   test_length=test_length))
+                     lgb_ic.append(df)
+         lgb_ic = pd.concat(lgb_ic).reset_index()
+         lgb_ic = pd.melt(lgb_ic,
+                          id_vars=id_vars,
+                          value_name='ic',
+                          var_name='boost_rounds').dropna()
+         lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
+         return lgb_ic
+
+     def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
+         param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
+         df = data[data.lookahead == t].sort_values(
+             'ic', ascending=False).iloc[best]
+         return df.loc[param_cols]
+
+     def _get_lgb_key(self, t, p):
+         key = f'{t}/{int(p.train_length)}/{int(p.test_length)}/{p.learning_rate}/'
+         return key + f'{int(p.num_leaves)}/{p.feature_fraction}/{int(p.min_data_in_leaf)}'
+
+     def _select_ic(self, params, ic_data, lookahead):
+         return ic_data.loc[(ic_data.lookahead == lookahead) &
+                            (ic_data.train_length == params.train_length) &
+                            (ic_data.test_length == params.test_length) &
+                            (ic_data.learning_rate == params.learning_rate) &
+                            (ic_data.num_leaves == params.num_leaves) &
+                            (ic_data.feature_fraction == params.feature_fraction) &
+                            (ic_data.boost_rounds == params.boost_rounds), ['date', 'ic']].set_index('date')
+
+     def get_trade_prices(self, tickers, start, end):
+         idx = pd.IndexSlice
+         with pd.HDFStore(self.datastore) as store:
+             data = store.select('stock_data')
+         data = data.set_index(['symbol', 'date']).sort_index()
+         data = data[~data.index.duplicated()]
+         return (data.loc[idx[tickers, start: end], 'open']
+                 .unstack('symbol')
+                 .sort_index()
+                 .shift(-1)
+                 .tz_convert('UTC'))
+
+     def plot_ic(self, lgb_ic, lgb_daily_ic, scope_params, lgb_train_params):
+         fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
+         axes = axes.flatten()
+         for i, t in enumerate([1, 21]):
+             params = self._get_lgb_params(
+                 lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0)
+             data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
+             rolling = data.rolling(63).ic.mean().dropna()
+             avg = data.ic.mean()
+             med = data.ic.median()
+             rolling.plot(
+                 ax=axes[i], title=f'Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}')
+             axes[i].axhline(avg, c='darkred', lw=1)
+             axes[i].axhline(0, ls='--', c='k', lw=1)
+
+         fig.suptitle('3-Month Rolling Information Coefficient', fontsize=16)
+         fig.tight_layout()
+         fig.subplots_adjust(top=0.92)
+
+     def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
+         # Visualization
+         sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)
+
+         sns.catplot(x='lookahead', y='ic',
+                     col='train_length', row='test_length',
+                     data=lgb_metrics,
+                     kind='box')
+         sns.catplot(x='boost_rounds',
+                     y='ic',
+                     col='train_length',
+                     row='test_length',
+                     data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
+                     kind='box')
+
+     def get_best_predictions(self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10):
+         for best in range(topn):
+             best_params = self._get_lgb_params(
+                 lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best)
+             key = self._get_lgb_key(lookahead, best_params)
+             rounds = str(int(best_params.boost_rounds))
+             if best == 0:
+                 best_predictions = pd.read_hdf(
+                     self.trainstore, 'predictions/' + key)
+                 best_predictions = best_predictions[rounds].to_frame(best)
+             else:
+                 best_predictions[best] = pd.read_hdf(
+                     self.trainstore, 'predictions/' + key)[rounds]
+         best_predictions = best_predictions.sort_index()
+         best_predictions.reset_index().to_hdf(
+             self.outstore, f'lgb/train/{lookahead:02}')
+         return best_predictions
+
+     def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
+         # Compute Alphalens metrics
+         mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
+             factor_data,
+             by_date=True,
+             by_group=False,
+             demeaned=True,
+             group_adjust=False,
+         )
+         factor_returns = perf.factor_returns(factor_data)
+         mean_quant_ret, std_quantile = perf.mean_return_by_quantile(factor_data,
+                                                                     by_group=False,
+                                                                     demeaned=True)
+
+         mean_quant_rateret = mean_quant_ret.apply(rate_of_return, axis=0,
+                                                   base_period=mean_quant_ret.columns[0])
+
+         mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
+             factor_data,
+             by_date=True,
+             by_group=False,
+             demeaned=True,
+             group_adjust=False,
+         )
+
+         mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
+             rate_of_return,
+             base_period=mean_quant_ret_bydate.columns[0],
+         )
+
+         compstd_quant_daily = std_quant_daily.apply(std_conversion,
+                                                     base_period=std_quant_daily.columns[0])
+
+         alpha_beta = perf.factor_alpha_beta(factor_data,
+                                             demeaned=True)
+
+         mean_ret_spread_quant, std_spread_quant = perf.compute_mean_returns_spread(
+             mean_quant_rateret_bydate,
+             factor_data["factor_quantile"].max(),
+             factor_data["factor_quantile"].min(),
+             std_err=compstd_quant_daily,
+         )
+         if verbose:
+             print(mean_ret_spread_quant.mean().mul(10000).to_frame(
+                 'Mean Period Wise Spread (bps)').join(alpha_beta.T).T)
+
+         fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
+
+         plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
+         plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
+         axes[0].set_xlabel('Quantile')
+
+         plotting.plot_cumulative_returns_by_quantile(mean_quant_ret_bydate['1D'],
+                                                      freq=pd.tseries.offsets.BDay(),
+                                                      period='1D',
+                                                      ax=axes[1])
+         axes[1].set_title('Cumulative Return by Quantile (1D Period)')
+
+         title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
+         plotting.plot_cumulative_returns(factor_returns['1D'],
+                                          period='1D',
+                                          freq=pd.tseries.offsets.BDay(),
+                                          title=title,
+                                          ax=axes[2])
+
+         fig.suptitle('Alphalens - Validation Set Performance', fontsize=14)
+         fig.tight_layout()
+         fig.subplots_adjust(top=.85)
+
+         # Summary Tearsheet
+         create_summary_tear_sheet(factor_data)
+         create_full_tear_sheet(factor_data)
+
+     def evaluate(self, remove_instore=False, lookahead=1, verbose=True):
+         scope_params = ['lookahead', 'train_length', 'test_length']
+         daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n',
+                             'daily_ic_median', 'daily_ic_median_n']
+         lgb_train_params = ['learning_rate', 'num_leaves',
+                             'feature_fraction', 'min_data_in_leaf']
+
+         lgb_metrics = self._get_lgb_metrics(
+             scope_params, lgb_train_params, daily_ic_metrics)
+         # Summary Metrics by Fold
+         lgb_metrics.to_hdf(self.outstore, 'lgb/metrics')
+
+         # Information Coefficient by Day
+         int_cols = ['lookahead', 'train_length', 'test_length', 'boost_rounds']
+         id_vars = ['date'] + scope_params + lgb_train_params
+         lgb_ic = self._get_lgb_ic(
+             int_cols, scope_params, lgb_train_params, id_vars)
+         lgb_ic.to_hdf(self.outstore, 'lgb/ic')
+         lgb_daily_ic = lgb_ic.groupby(
+             id_vars[1:] + ['boost_rounds']).ic.mean().to_frame('ic').reset_index()
+         lgb_daily_ic.to_hdf(self.outstore, 'lgb/daily_ic')
+
+         # Cross-validation Result: Best Hyperparameters
+         if verbose:
+             print(lgb_daily_ic.groupby('lookahead', group_keys=False).apply(
+                 lambda x: x.nlargest(3, 'ic')))
+             lgb_metrics.groupby('lookahead', group_keys=False).apply(
+                 lambda x: x.nlargest(3, 'ic'))
+         lgb_metrics.groupby('lookahead', group_keys=False
+                             ).apply(lambda x: x.nlargest(3, 'ic')).to_hdf(self.outstore, 'lgb/best_model')
+         if verbose:
+             print(lgb_metrics.groupby('lookahead', group_keys=False).apply(
+                 lambda x: x.nlargest(3, 'daily_ic_mean')))
+
+         # Visualization
+         if verbose:
+             self.plot_metrics(lgb_metrics, lgb_daily_ic, t=lookahead)
+
+         # AlphaLens Analysis - Validation Performance
+         lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
+         best_params = self._get_lgb_params(
+             lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=0)
+         best_params.to_hdf(self.outstore, 'lgb/best_params')
+
+         if verbose:
+             self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
+
+         # Get Predictions for Validation Period
+         best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
+                                                      lookahead=lookahead, topn=10)
+         test_tickers = best_predictions.index.get_level_values(
+             'symbol').unique()
+         start = best_predictions.index.get_level_values('date').min()
+         end = best_predictions.index.get_level_values('date').max()
+         trade_prices = self.get_trade_prices(test_tickers, start, end)
+         pd.Series(test_tickers).to_hdf(self.outstore, 'lgb/tickers')
+         # We average the top five models and provide the corresponding prices to Alphalens,
+         # in order to compute the mean period-wise return earned on an equal-weighted
+         # portfolio invested in the daily factor quintiles for various holding periods:
+         factor = best_predictions.iloc[:, :5].mean(
+             1).dropna().tz_convert('UTC', level='date').swaplevel()
+         # Create AlphaLens Inputs
+         if verbose:
+             factor_data = get_clean_factor_and_forward_returns(factor=factor,
+                                                                prices=trade_prices,
+                                                                quantiles=5,
+                                                                periods=(1, 5, 10, 21),
+                                                                max_loss=1)
+             self.apply_alphalen_analysis(
+                 factor_data, tearsheet=True, verbose=True)
+         # Delete the temporary files
+         if remove_instore:
+             os.remove(self.trainstore)
+
+     def make_predictions(self, data: pd.DataFrame, mode='test', lookahead=1, verbose=True):
+         data = data.copy()
+         YEAR = 252
+         scope_params = ['lookahead', 'train_length', 'test_length']
+         lgb_train_params = ['learning_rate', 'num_leaves',
+                             'feature_fraction', 'min_data_in_leaf']
+
+         base_params = dict(boosting='gbdt',
+                            objective='regression',
+                            verbose=-1)
+
+         categoricals = ['year', 'month', 'weekday']
+         labels = sorted(data.filter(like='_fwd').columns)
+         features = data.columns.difference(labels).tolist()
+         label = f'r{lookahead:02}_fwd'
+         for feature in categoricals:
+             data[feature] = pd.factorize(data[feature], sort=True)[0]
+
+         if mode == 'test':
+             data = data.dropna().sort_index()
+         elif mode == 'live':
+             data[labels] = data[labels].fillna(0)
+             data = data.sort_index().dropna()
+
+         lgb_data = lgb.Dataset(data=data[features],
+                                label=data[label],
+                                categorical_feature=categoricals,
+                                free_raw_data=False)
+         # Generate predictions
+         lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
+
+         for position in range(10):
+             params = self._get_lgb_params(
+                 lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position)
+
+             params = params.to_dict()
+
+             for p in ['min_data_in_leaf', 'num_leaves']:
+                 params[p] = int(params[p])
+             train_length = int(params.pop('train_length'))
+             test_length = int(params.pop('test_length'))
+             num_boost_round = int(params.pop('boost_rounds'))
+             params.update(base_params)
+             if verbose:
+                 print(f'\nPosition: {position:02}')
+
+             # 1-year out-of-sample period
+             n_splits = int(YEAR / test_length)
+             cv = MultipleTimeSeriesCV(n_splits=n_splits,
+                                       test_period_length=test_length,
+                                       lookahead=lookahead,
+                                       train_period_length=train_length)
+
+             predictions = []
+             for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
+                 if verbose:
+                     print(i, end=' ', flush=True)
+                 lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
+                                             params=params).construct()
+
+                 model = lgb.train(params=params,
+                                   train_set=lgb_train,
+                                   num_boost_round=num_boost_round,
+                                   )
+
+                 test_set = data.iloc[test_idx, :]
+                 y_test = test_set.loc[:, label].to_frame('y_test')
+                 y_pred = model.predict(test_set.loc[:, model.feature_name()])
+                 predictions.append(y_test.assign(prediction=y_pred))
+
+             if position == 0:
+                 test_predictions = (pd.concat(predictions)
+                                     .rename(columns={'prediction': position}))
+             else:
+                 test_predictions[position] = pd.concat(predictions).prediction
+
+         by_day = test_predictions.groupby(level='date')
+         for position in range(10):
+             if position == 0:
+                 ic_by_day = by_day.apply(lambda x: spearmanr(
+                     x.y_test, x[position])[0]).to_frame()
+             else:
+                 ic_by_day[position] = by_day.apply(
+                     lambda x: spearmanr(x.y_test, x[position])[0])
+         if verbose:
+             print(ic_by_day.describe())
+         test_predictions.reset_index().to_hdf(
+             self.outstore, f'lgb/test/{lookahead:02}')
+         return test_predictions
+
+     def load_predictions(self, predictions=None, lookahead=1):
+         if predictions is None:
+             predictions = pd.concat([
+                 pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
+                 pd.read_hdf(self.outstore,
+                             f'lgb/test/{lookahead:02}').drop('y_test', axis=1)
+             ])
+             predictions = predictions.set_index(['symbol', 'date'])
+
+         predictions = (predictions.loc[~predictions.index.duplicated()]
+                        .iloc[:, :10]
+                        .mean(1)
+                        .sort_index()
+                        .dropna()
+                        .to_frame('prediction'))
+         tickers = predictions.index.get_level_values(
+             'symbol').unique().tolist()
+         return (predictions
+                 .unstack('symbol')
+                 .prediction
+                 .tz_convert('UTC')), tickers
+
+     def assert_last_date(self, predictions: pd.DataFrame):
+         """
+         Useful in live trading to ensure that the last date in the predictions
+         is the previous trading day, so the model predicts today's returns.
+         """
+         last_date = predictions.index.get_level_values('date').max()
+         if last_date.tzinfo is None:
+             last_date = last_date.tz_localize('UTC')
+         try:
+             if datetime.now().strftime('%A') == 'Monday':
+                 assert last_date == (pd.Timestamp.now(
+                     tz='UTC') - pd.Timedelta(days=3)).normalize()
+             else:
+                 assert (
+                     last_date == (pd.Timestamp.now(tz='UTC')
+                                   - pd.Timedelta(days=1)).normalize()
+                     or last_date == (pd.Timestamp.now(tz='UTC')).normalize()
+                 )
+             return True
+         except AssertionError:
+             return False
+
+     def clean_stores(self, *stores):
+         for store in stores:
+             if os.path.exists(store):
+                 os.remove(store)
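
For readers evaluating this new module, here is a minimal usage sketch of the LightGBModel workflow, based only on the method signatures shown in the diff above. It is not part of the released file; the ticker list, start date, universe size, and HDF5 store names are illustrative assumptions, and a reasonably large stock universe is assumed so that the quantile-based encodings have enough data.

    from bbstrader.models.ml import LightGBModel

    # illustrative store paths; adjust to your environment
    model = LightGBModel(datastore='lgbdata.h5',
                         trainstore='lgbtrain.h5',
                         outstore='lgbout.h5')

    tickers = [...]  # e.g. a few hundred liquid symbols (hypothetical universe)
    prices = model.download_boosting_data(tickers, start='2015-01-01')
    metadata = model.download_metadata(tickers)

    # build the feature set: returns, return deciles, technical indicators, encoded metadata
    features = model.prepare_boosting_data(prices, metadata=metadata,
                                           min_years=7, universe=100)

    model.fit(features, verbose=True)          # randomized grid search with time-series CV
    model.evaluate(lookahead=1, verbose=True)  # IC summary and Alphalens tear sheets
    preds = model.make_predictions(features, mode='test', lookahead=1)
    factor, traded_tickers = model.load_predictions(lookahead=1)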