bbstrader 0.1.93__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bbstrader/__init__.py +2 -2
- bbstrader/btengine/data.py +241 -40
- bbstrader/btengine/strategy.py +12 -8
- bbstrader/config.py +4 -0
- bbstrader/core/__init__.py +0 -0
- bbstrader/core/data.py +23 -0
- bbstrader/core/utils.py +0 -0
- bbstrader/ibkr/__init__.py +0 -0
- bbstrader/metatrader/account.py +66 -12
- bbstrader/metatrader/rates.py +24 -20
- bbstrader/metatrader/risk.py +6 -3
- bbstrader/metatrader/trade.py +31 -13
- bbstrader/models/__init__.py +1 -1
- bbstrader/models/factors.py +275 -0
- bbstrader/models/ml.py +1026 -0
- bbstrader/models/optimization.py +17 -16
- bbstrader/models/{portfolios.py → portfolio.py} +20 -11
- bbstrader/models/risk.py +10 -2
- bbstrader/trading/execution.py +67 -35
- bbstrader/trading/strategies.py +5 -5
- bbstrader/tseries.py +412 -63
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/METADATA +9 -3
- bbstrader-0.2.0.dist-info/RECORD +36 -0
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/WHEEL +1 -1
- bbstrader-0.1.93.dist-info/RECORD +0 -32
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/LICENSE +0 -0
- {bbstrader-0.1.93.dist-info → bbstrader-0.2.0.dist-info}/top_level.txt +0 -0
bbstrader/models/ml.py
CHANGED
@@ -0,0 +1,1026 @@
from pathlib import Path
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import talib
from time import time
from tqdm import tqdm
from talib import RSI, BBANDS, MACD, ATR
import yfinance as yf
from scipy.stats import spearmanr
from itertools import product
import lightgbm as lgb
from collections import defaultdict
from alphalens.tears import (create_summary_tear_sheet,
                             create_full_tear_sheet)
from alphalens import plotting
from alphalens import performance as perf
from alphalens.utils import (get_clean_factor_and_forward_returns,
                             rate_of_return, std_conversion)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')


__all__ = [
    'OneStepTimeSeriesSplit',
    'MultipleTimeSeriesCV',
    'LightGBModel'
]


class OneStepTimeSeriesSplit:
    """Generates tuples of train_idx, test_idx pairs.
    Assumes the index contains a level labeled 'date'."""

    __author__ = "Stefan Jansen"

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle

    @staticmethod
    def chunks(lst, n):
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def split(self, X: pd.DataFrame, y=None, groups=None):
        unique_dates = (X.index
                        .get_level_values('date')
                        .unique()
                        .sort_values(ascending=False)
                        [:self.n_splits * self.test_period_length])

        dates = X.reset_index()[['date']]
        for test_date in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates.date < min(test_date)].index
            test_idx = dates[dates.date.isin(test_date)].index
            if self.shuffle:
                # shuffle a mutable copy so the permutation actually takes effect
                train_idx = train_idx.to_numpy()
                np.random.shuffle(train_idx)
            yield train_idx, test_idx

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits


class MultipleTimeSeriesCV:
    """
    Generates tuples of train_idx, test_idx pairs.
    Assumes the MultiIndex contains levels 'symbol' and 'date'
    and purges overlapping outcomes.
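
    Example (a minimal sketch; assumes ``prices`` is a DataFrame indexed by a
    (symbol, date) MultiIndex)::

        cv = MultipleTimeSeriesCV(n_splits=12, train_period_length=252,
                                  test_period_length=21, lookahead=1)
        for train_idx, test_idx in cv.split(prices):
            X_train, X_test = prices.iloc[train_idx], prices.iloc[test_idx]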
    """

    __author__ = "Stefan Jansen"

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 date_idx='date',
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx

    def split(self, X: pd.DataFrame, y=None, groups=None):
        unique_dates = X.index.get_level_values(self.date_idx).unique()
        days = sorted(unique_dates, reverse=True)
        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            # shift the train window back by `lookahead` days to purge outcomes
            # that overlap with the test period
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        dates = X.reset_index()[[self.date_idx]]
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[(dates[self.date_idx] > days[train_start])
                              & (dates[self.date_idx] <= days[train_end])].index
            test_idx = dates[(dates[self.date_idx] > days[test_start])
                             & (dates[self.date_idx] <= days[test_end])].index
            train_idx = train_idx.to_numpy()
            if self.shuffle:
                np.random.shuffle(train_idx)
            yield train_idx, test_idx.to_numpy()

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits


class LightGBModel(object):
    """
    ``LightGBModel`` encapsulates a complete workflow for training and evaluating
    a ``LightGBM (Light Gradient Boosting Machine)`` model for predicting stock returns.
    It includes data acquisition, feature engineering, model tuning, and performance
    evaluation using the ``information coefficient (IC)`` and Alphalens analysis.

    Key Features
    ------------
    - ``HDF5 Storage``: Uses ``pandas.HDFStore`` for efficient storage and retrieval
      of large datasets, which is essential for backtesting on financial time series data.

    - ``Time-Series Cross-Validation``: Employs a custom cross-validation strategy that
      respects the time-series nature of the data, avoiding data leakage.

    - ``Hyperparameter Tuning``: Includes automated hyperparameter tuning based on a
      randomized grid search.

    - ``Information Coefficient (IC)``: Uses the IC as the core performance metric; it
      quantifies the predictive power of the model and is a standard measure for
      ranking models in finance.

    - ``Alphalens Integration``: Provides a comprehensive framework for validating model
      performance with Alphalens, allowing in-depth analysis such as backtesting and
      return decomposition.

    Use Case
    --------
    This class is designed for quantitative finance and algorithmic trading use cases where
    the goal is to build a predictive model for stock returns based on historical data and
    technical indicators. It follows a complete cycle from data acquisition to model validation
    and provides the infrastructure needed to deploy the model in a trading strategy.

    Notes
    -----
    The implementation is inspired by the book "Machine Learning for Algorithmic Trading"
    by Stefan Jansen.

    References
    ----------
    Stefan Jansen (2020). Machine Learning for Algorithmic Trading - Second Edition.
    Chapter 12, Boosting Your Trading Strategy.
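
    Examples
    --------
    A minimal sketch of the full cycle (the tickers, start date, and universe
    size below are placeholders)::

        model = LightGBModel()
        prices = model.download_boosting_data(['AAPL', 'MSFT'], start='2010-01-01')
        data = model.prepare_boosting_data(prices, min_years=7, universe=100)
        model.fit(data)
        model.evaluate(lookahead=1)
        predictions = model.make_predictions(data, lookahead=1)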
    """

    def __init__(self,
                 data: pd.DataFrame = None,
                 datastore: str = 'lgbdata.h5',
                 trainstore: str = 'lgbtrain.h5',
                 outstore: str = 'lgbout.h5'
                 ):
        """
        Args:
            data (pd.DataFrame): The input data for the model. It should be a DataFrame
                with a MultiIndex containing 'symbol' and 'date' levels. If not provided,
                the data can be downloaded using the `download_boosting_data` method.
            datastore (str): The path to the HDF5 file for storing the model data.
            trainstore (str): The path to the HDF5 file for storing the training data.
            outstore (str): The path to the HDF5 file for storing the output data.
        """
        self.datastore = datastore
        self.trainstore = trainstore
        self.outstore = outstore
        if data is not None:
            data.reset_index().to_hdf(self.datastore, 'model_data')

    def _compute_bb(self, close):
        high, mid, low = BBANDS(close, timeperiod=20)
        return pd.DataFrame({'bb_high': high, 'bb_low': low}, index=close.index)

    def _compute_atr(self, stock_data):
        df = ATR(stock_data.high, stock_data.low,
                 stock_data.close, timeperiod=14)
        return df.sub(df.mean()).div(df.std())

    def _compute_macd(self, close):
        macd = MACD(close)[0]
        return (macd - np.mean(macd)) / np.std(macd)

    def _add_technical_indicators(self, prices: pd.DataFrame):
        prices = prices.copy()
        prices['rsi'] = prices.groupby(level='symbol').close.apply(
            lambda x: RSI(x).reset_index(level=0, drop=True))
        bb = prices.groupby(level=0).close.apply(self._compute_bb).reset_index(level=1, drop=True)
        prices = prices.join(bb)
        prices['bb_high'] = prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
        prices['bb_low'] = prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)
        prices['NATR'] = prices.groupby(level='symbol', group_keys=False).apply(
            lambda x: talib.NATR(x.high, x.low, x.close))
        prices['ATR'] = (prices.groupby('symbol', group_keys=False)
                         .apply(self._compute_atr))
        prices['PPO'] = prices.groupby(level='symbol').close.apply(
            lambda x: talib.PPO(x).reset_index(level=0, drop=True))
        prices['MACD'] = (prices
                          .groupby('symbol', group_keys=False)
                          .close
                          .apply(self._compute_macd))
        return prices

    def download_boosting_data(self, tickers, start, end=None):
        data = []
        for ticker in tickers:
            try:
                prices = yf.download(ticker, start=start, end=end,
                                     progress=False, multi_level_index=False)
                prices['symbol'] = ticker
                data.append(prices)
            except Exception:
                continue
        data = pd.concat(data)
        data = (data
                .rename(columns={s: s.lower().replace(' ', '_') for s in data.columns})
                .drop(columns=['adj_close'])
                .set_index('symbol', append=True).swaplevel()
                .sort_index()
                .dropna())
        return data

    def download_metadata(self, tickers):

        def clean_text_column(series: pd.Series) -> pd.Series:
            return (
                series.str.lower()
                .str.replace('-', '', regex=False)  # regex=False for literal replacements
                .str.replace('&', 'and', regex=False)
                .str.replace(' ', '_', regex=False)
                .str.replace('__', '_', regex=False)
            )

        metadata = ['industry', 'sector', 'exchange', 'symbol',
                    'heldPercentInsiders', 'heldPercentInstitutions',
                    'overallRisk', 'shortRatio', 'dividendYield', 'beta',
                    'regularMarketVolume', 'averageVolume', 'averageVolume10days',
                    'bid', 'ask', 'bidSize', 'askSize', 'marketCap']

        columns = {
            'industry':                'industry',
            'sector':                  'sector',
            'exchange':                'exchange',
            'symbol':                  'symbol',
            'heldPercentInsiders':     'insiders',
            'heldPercentInstitutions': 'institutions',
            'overallRisk':             'risk',
            'shortRatio':              'short_ratio',
            'dividendYield':           'dyield',
            'beta':                    'beta',
            'regularMarketVolume':     'regvolume',
            'averageVolume':           'avgvolume',
            'averageVolume10days':     'avgvolume10',
            'bid':                     'bid',
            'ask':                     'ask',
            'bidSize':                 'bidsize',
            'askSize':                 'asksize',
            'marketCap':               'marketcap'
        }
        data = []
        for symbol in tickers:
            try:
                symbol_info = yf.Ticker(symbol).info
            except Exception:
                continue
            infos = {}
            for info in metadata:
                infos[info] = symbol_info.get(info)
            data.append(infos)
        metadata = pd.DataFrame(data)
        metadata = metadata.rename(columns=columns)
        metadata.dyield = metadata.dyield.fillna(0)
        metadata.sector = clean_text_column(metadata.sector)
        metadata.industry = clean_text_column(metadata.industry)
        metadata = metadata.set_index('symbol')
        return metadata

    def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
                                          volume_features, bid_ask_features,
                                          market_cap_feature):
        df = df.copy()
        scaler = StandardScaler()

        # Normalize features
        df[volume_features] = scaler.fit_transform(df[volume_features])
        df['bid_ask_spread'] = df['ask'] - df['bid']
        df['bid_ask_spread'] = scaler.fit_transform(df[['bid_ask_spread']])
        df[market_cap_feature] = scaler.fit_transform(df[market_cap_feature])

        # Liquidity score: assign weights to each component
        # (these weights can be adjusted based on importance)
        weights = {
            'volume': 0.4,
            'bid_ask_spread': 0.2,
            'marketCap': 0.4
        }

        # Combine the normalized features into a single liquidity score
        df['liquidity_score'] = (weights['volume'] * df[volume_features].mean(axis=1) +
                                 weights['bid_ask_spread'] * df['bid_ask_spread'] +
                                 weights['marketCap'] * df[market_cap_feature[0]])
        return df.nlargest(n, 'liquidity_score').index

    def _encode_metadata(self, df: pd.DataFrame):
        df = df.copy()
        # Bin each numerical feature into categories
        df['insiders'] = pd.qcut(
            df['insiders'], q=4,
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['institutions'] = pd.qcut(
            df['institutions'], q=4,
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['risk'] = pd.cut(
            df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
            labels=['Low', 'Medium', 'High', 'Very High']
        )
        df['short_ratio'] = pd.qcut(
            df['short_ratio'], q=4,
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['dyield'] = pd.cut(
            df['dyield'],
            bins=[-float('inf'), 0.002, 0.005, 0.01, float('inf')],
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['beta'] = pd.cut(
            df['beta'],
            bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
            labels=['Low', 'Moderate', 'High', 'Very High']
        )

        # Encode the binned features
        binned_features = [
            'insiders', 'institutions',
            'risk', 'short_ratio', 'dyield',
            'beta', 'sector', 'industry', 'exchange',
        ]
        label_encoders = {}
        for col in binned_features:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
        return df, label_encoders

    def prepare_boosting_data(self,
                              prices: pd.DataFrame,
                              metadata: pd.DataFrame = None,
                              min_years=7,
                              universe=500
                              ):
        if metadata is None:
            mcap = False
            tickers = prices.index.get_level_values('symbol').unique()
            metadata = self.download_metadata(tickers)
        else:
            mcap = True
        YEAR = 252
        idx = pd.IndexSlice
        percentiles = [.001, .01, .02, .03, .04, .05]
        percentiles += [1 - p for p in percentiles[::-1]]
        T = [1, 5, 10, 21, 42, 63]

        prices.volume /= 1e3  # make volume figures a bit smaller
        prices.index.names = ['symbol', 'date']
        metadata.index.name = 'symbol'
        prices.reset_index().to_hdf(self.datastore, 'stock_data')
        metadata.reset_index().to_hdf(self.datastore, 'stock_metadata')

        # Remove stocks with insufficient observations
        min_obs = min_years * YEAR
        nobs = prices.groupby(level='symbol').size()
        keep = nobs[nobs > min_obs].index
        prices = prices.loc[idx[keep, :], :]

        # Remove duplicate symbols
        prices = prices[~prices.index.duplicated()]

        # Align price and meta data
        metadata = metadata[~metadata.index.duplicated() & metadata.sector.notnull()]
        metadata.sector = metadata.sector.str.lower().str.replace(' ', '_')
        shared = (prices.index.get_level_values('symbol').unique()
                  .intersection(metadata.index))
        metadata = metadata.loc[shared, :]
        prices = prices.loc[idx[shared, :], :]

        # Limit the universe
        if mcap:
            universe = metadata.marketcap.nlargest(universe).index
        else:
            volume_features = ['regvolume', 'avgvolume', 'avgvolume10']
            bid_ask_features = ['bid', 'ask', 'bidsize', 'asksize']
            market_cap_feature = ['marketcap']
            to_drop = volume_features + bid_ask_features + market_cap_feature
            universe = self._select_nlargest_liquidity_stocks(
                metadata, universe, volume_features, bid_ask_features, market_cap_feature
            )
            metadata = metadata.drop(to_drop, axis=1)
        prices = prices.loc[idx[universe, :], :]
        metadata = metadata.loc[universe]
        metadata = self._encode_metadata(metadata)[0]

        # Compute dollar volume to determine the universe
        prices['dollar_vol'] = prices[['close', 'volume']].prod(1).div(1e3)
        dollar_vol_ma = (prices
                         .dollar_vol
                         .unstack('symbol')
                         .rolling(window=21, min_periods=1)  # 1 trading month
                         .mean())

        # Rank stocks by moving average
        prices['dollar_vol_rank'] = (dollar_vol_ma
                                     .rank(axis=1, ascending=False)
                                     .stack('symbol')
                                     .swaplevel())
        # Add some basic factors
        prices = self._add_technical_indicators(prices)
        # Combine price and meta data
        prices = prices.join(metadata)

        # Compute returns
        by_sym = prices.groupby(level='symbol').close
        for t in T:
            prices[f'r{t:02}'] = by_sym.pct_change(t)
        # Daily historical return deciles
        for t in T:
            # Reset the index to apply qcut by date without grouping errors
            prices[f'r{t:02}dec'] = (prices.reset_index(level='date')
                                     .groupby('date')[f'r{t:02}']
                                     .apply(lambda x: pd.qcut(x,
                                                              q=10,
                                                              labels=False,
                                                              duplicates='drop'))
                                     .values)
        # Daily sector return quintiles
        for t in T:
            prices[f'r{t:02}q_sector'] = (
                prices
                .groupby(['date', 'sector'])[f'r{t:02}']
                .transform(lambda x: pd.qcut(
                    x,
                    q=5,
                    labels=False,
                    duplicates='drop'))
            )
        # Compute forward returns
        for t in [1, 5, 21]:
            prices[f'r{t:02}_fwd'] = prices.groupby(
                level='symbol')[f'r{t:02}'].shift(-t)

        # Remove outliers
        outliers = prices[prices.r01 > 1].index.get_level_values('symbol').unique()
        prices = prices.drop(outliers, level='symbol')
        # Create time dummy variables
        prices['year'] = prices.index.get_level_values('date').year
        prices['month'] = prices.index.get_level_values('date').month
        prices['weekday'] = prices.index.get_level_values('date').weekday
        # Store the model data
        prices = prices.drop(['open', 'close', 'low', 'high', 'volume'], axis=1)
        if 'adj_close' in prices.columns:
            prices = prices.drop('adj_close', axis=1)
        prices.reset_index().dropna().to_hdf(self.datastore, 'model_data')
        return prices.dropna()

    def tickers(self):
        return pd.read_hdf(self.outstore, 'lgb/tickers').tolist()

    def load_model_data(self):
        return pd.read_hdf(self.datastore, 'model_data').set_index(['symbol', 'date']).sort_index()

    def format_time(self, t):
        """Return a formatted time string 'HH:MM:SS'
        based on a numeric time() value"""
        m, s = divmod(t, 60)
        h, m = divmod(m, 60)
        return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'

    def fit(self, data: pd.DataFrame, verbose=True):
        def get_fi(model):
            """Return normalized feature importance as pd.Series"""
            fi = model.feature_importance(importance_type='gain')
            return pd.Series(fi / fi.sum(), index=model.feature_name())

        def ic_lgbm(preds, train_data):
            """Custom IC eval metric for lightgbm"""
            is_higher_better = True
            return 'ic', spearmanr(preds, train_data.get_label())[0], is_higher_better

        # Hyperparameter options
        YEAR = 252
        base_params = dict(boosting='gbdt',
                           objective='regression',
                           verbose=-1)

        # Constraints on the structure (depth) of each tree
        max_depths = [2, 3, 5, 7]
        num_leaves_opts = [2 ** i for i in max_depths]
        min_data_in_leaf_opts = [250, 500, 1000]

        # Weight of each new tree in the ensemble
        learning_rate_ops = [.01, .1, .3]

        # Random feature selection
        feature_fraction_opts = [.3, .6, .95]

        param_names = ['learning_rate', 'num_leaves',
                       'feature_fraction', 'min_data_in_leaf']

        cv_params = list(product(learning_rate_ops,
                                 num_leaves_opts,
                                 feature_fraction_opts,
                                 min_data_in_leaf_opts))
        n_params = len(cv_params)
        print(f'# Parameters: {n_params}')

        # Train/test period lengths
        lookaheads = [1, 5, 21]
        train_lengths = [int(4.5 * 252), 252]
        test_lengths = [63]
        test_params = list(product(lookaheads, train_lengths, test_lengths))
        n = len(test_params)
        test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
        test_params = [test_params[i] for i in test_param_sample]
        print('Train configs:', len(test_params))

        # Categorical variables
        categoricals = ['year', 'weekday', 'month']
        for feature in categoricals:
            data[feature] = pd.factorize(data[feature], sort=True)[0]

        # Run cross-validation
        labels = sorted(data.filter(like='fwd').columns)
        features = data.columns.difference(labels).tolist()
        label_dict = dict(zip(lookaheads, labels))
        num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
        num_boost_round = num_iterations[-1]

        metric_cols = (param_names + ['t', 'daily_ic_mean', 'daily_ic_mean_n',
                                      'daily_ic_median', 'daily_ic_median_n'] +
                       [str(n) for n in num_iterations])

        for lookahead, train_length, test_length in test_params:
            # Randomized grid search
            cvp = np.random.choice(list(range(n_params)),
                                   size=int(n_params / 2),
                                   replace=False)
            cv_params_ = [cv_params[i] for i in cvp]

            # Set up cross-validation
            n_splits = int(2 * YEAR / test_length)
            if verbose:
                print(f'Lookahead: {lookahead:2.0f} | '
                      f'Train: {train_length:3.0f} | '
                      f'Test: {test_length:2.0f} | '
                      f'Params: {len(cv_params_):3.0f} | '
                      f'Train configs: {len(test_params)}')

            # Time-series cross-validation
            cv = MultipleTimeSeriesCV(n_splits=n_splits,
                                      lookahead=lookahead,
                                      test_period_length=test_length,
                                      train_period_length=train_length)

            label = label_dict[lookahead]
            outcome_data = data.loc[:, features + [label]].dropna()

            # Binary dataset
            lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
                                   label=outcome_data[label],
                                   categorical_feature=categoricals,
                                   free_raw_data=False)
            T = 0
            predictions = []

            # Iterate over (shuffled) hyperparameter combinations
            for p, param_vals in enumerate(cv_params_):
                key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join(
                    [str(p) for p in param_vals])
                params = dict(zip(param_names, param_vals))
                params.update(base_params)

                start = time()
                cv_preds, nrounds = [], []
                ic_cv = defaultdict(list)

                # Iterate over folds
                for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):

                    # Select the train subset
                    lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                                params=params).construct()

                    # Train the model for num_boost_round
                    model = lgb.train(params=params,
                                      train_set=lgb_train,
                                      num_boost_round=num_boost_round,
                                      )
                    # Log feature importance
                    if i == 0:
                        fi = get_fi(model).to_frame()
                    else:
                        fi[i] = get_fi(model)

                    # Capture predictions
                    test_set = outcome_data.iloc[test_idx, :]
                    X_test = test_set.loc[:, model.feature_name()]
                    y_test = test_set.loc[:, label]
                    y_pred = {str(n): model.predict(X_test, num_iteration=n)
                              for n in num_iterations}

                    # Record predictions for each fold
                    cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))

                # Combine fold results
                cv_preds = pd.concat(cv_preds).assign(**params)
                predictions.append(cv_preds)

                # Compute the IC per day
                by_day = cv_preds.groupby(level='date')
                ic_by_day = pd.concat([by_day.apply(
                    lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
                    for n in num_iterations], axis=1)
                daily_ic_mean = ic_by_day.mean()
                daily_ic_mean_n = daily_ic_mean.idxmax()
                daily_ic_median = ic_by_day.median()
                daily_ic_median_n = daily_ic_median.idxmax()

                # Compute the IC across all predictions
                ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
                      for n in num_iterations]
                t = time() - start
                T += t

                # Collect metrics
                metrics = pd.Series(list(param_vals) +
                                    [t, daily_ic_mean.max(), daily_ic_mean_n,
                                     daily_ic_median.max(), daily_ic_median_n] + ic,
                                    index=metric_cols)
                if verbose:
                    msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
                    msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
                    msg += f' {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}'
                    print(msg)

                # Persist results for the given CV run and hyperparameter combination
                metrics.to_hdf(self.trainstore, 'metrics/' + key)
                ic_by_day.assign(**params).to_hdf(self.trainstore, 'daily_ic/' + key)
                fi.T.describe().T.assign(**params).to_hdf(self.trainstore, 'fi/' + key)
                cv_preds.to_hdf(self.trainstore, 'predictions/' + key, append=True)
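
    # Note: `ic_lgbm` above sketches a custom Spearman-IC eval metric; it could be
    # passed to `lgb.train` via `feval=ic_lgbm` together with a validation set
    # (`valid_sets`), but the training loop here runs for a fixed number of rounds
    # and scores the saved predictions afterwards instead.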

    def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
        with pd.HDFStore(self.trainstore) as store:
            for i, key in enumerate(
                    [k[1:] for k in store.keys() if k[1:].startswith('metrics')]):
                _, t, train_length, test_length = key.split('/')[:4]
                attrs = {
                    'lookahead': t,
                    'train_length': train_length,
                    'test_length': test_length
                }
                s = store[key].to_dict()
                s.update(attrs)
                if i == 0:
                    lgb_metrics = pd.Series(s).to_frame(i)
                else:
                    lgb_metrics[i] = pd.Series(s)

        id_vars = scope_params + lgb_train_params + daily_ic_metrics
        lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
                              id_vars=id_vars,
                              value_name='ic',
                              var_name='boost_rounds').dropna().apply(pd.to_numeric)
        return lgb_metrics

    def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
        lgb_ic = []
        with pd.HDFStore(self.trainstore) as store:
            keys = [k[1:] for k in store.keys()]
            for key in keys:
                _, t, train_length, test_length = key.split('/')[:4]
                if key.startswith('daily_ic'):
                    df = (store[key]
                          .drop(['boosting', 'objective', 'verbose'], axis=1)
                          .assign(lookahead=t,
                                  train_length=train_length,
                                  test_length=test_length))
                    lgb_ic.append(df)
        lgb_ic = pd.concat(lgb_ic).reset_index()
        lgb_ic = pd.melt(lgb_ic,
                         id_vars=id_vars,
                         value_name='ic',
                         var_name='boost_rounds').dropna()
        lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
        return lgb_ic

    def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
        param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
        df = data[data.lookahead == t].sort_values('ic', ascending=False).iloc[best]
        return df.loc[param_cols]

    def _get_lgb_key(self, t, p):
        key = f'{t}/{int(p.train_length)}/{int(p.test_length)}/{p.learning_rate}/'
        return key + f'{int(p.num_leaves)}/{p.feature_fraction}/{int(p.min_data_in_leaf)}'

    def _select_ic(self, params, ic_data, lookahead):
        return ic_data.loc[(ic_data.lookahead == lookahead) &
                           (ic_data.train_length == params.train_length) &
                           (ic_data.test_length == params.test_length) &
                           (ic_data.learning_rate == params.learning_rate) &
                           (ic_data.num_leaves == params.num_leaves) &
                           (ic_data.feature_fraction == params.feature_fraction) &
                           (ic_data.boost_rounds == params.boost_rounds),
                           ['date', 'ic']].set_index('date')

    def get_trade_prices(self, tickers, start, end):
        idx = pd.IndexSlice
        with pd.HDFStore(self.datastore) as store:
            data = store.select('stock_data')
            data = data.set_index(['symbol', 'date']).sort_index()
            data = data[~data.index.duplicated()]
        return (data.loc[idx[tickers, start: end], 'open']
                .unstack('symbol')
                .sort_index()
                .shift(-1)
                .tz_convert('UTC'))

    def plot_ic(self, lgb_ic, lgb_daily_ic, scope_params, lgb_train_params):
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
        axes = axes.flatten()
        for i, t in enumerate([1, 21]):
            params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0)
            data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
            rolling = data.rolling(63).ic.mean().dropna()
            avg = data.ic.mean()
            med = data.ic.median()
            rolling.plot(ax=axes[i],
                         title=f'Horizon: {t} Day(s) | IC: Mean={avg * 100:.2f} Median={med * 100:.2f}')
            axes[i].axhline(avg, c='darkred', lw=1)
            axes[i].axhline(0, ls='--', c='k', lw=1)

        fig.suptitle('3-Month Rolling Information Coefficient', fontsize=16)
        fig.tight_layout()
        fig.subplots_adjust(top=0.92)

    def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
        # Visualization
        sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)

        sns.catplot(x='lookahead', y='ic',
                    col='train_length', row='test_length',
                    data=lgb_metrics,
                    kind='box')
        sns.catplot(x='boost_rounds',
                    y='ic',
                    col='train_length',
                    row='test_length',
                    data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
                    kind='box')

    def get_best_predictions(self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10):
        for best in range(topn):
            best_params = self._get_lgb_params(lgb_daily_ic, scope_params,
                                               lgb_train_params, t=lookahead, best=best)
            key = self._get_lgb_key(lookahead, best_params)
            rounds = str(int(best_params.boost_rounds))
            if best == 0:
                best_predictions = pd.read_hdf(self.trainstore, 'predictions/' + key)
                best_predictions = best_predictions[rounds].to_frame(best)
            else:
                best_predictions[best] = pd.read_hdf(self.trainstore, 'predictions/' + key)[rounds]
        best_predictions = best_predictions.sort_index()
        best_predictions.reset_index().to_hdf(self.outstore, f'lgb/train/{lookahead:02}')
        return best_predictions

    def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
        # Compute Alphalens metrics
        factor_returns = perf.factor_returns(factor_data)
        mean_quant_ret, std_quantile = perf.mean_return_by_quantile(factor_data,
                                                                    by_group=False,
                                                                    demeaned=True)

        mean_quant_rateret = mean_quant_ret.apply(rate_of_return, axis=0,
                                                  base_period=mean_quant_ret.columns[0])

        mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
            factor_data,
            by_date=True,
            by_group=False,
            demeaned=True,
            group_adjust=False,
        )

        mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
            rate_of_return,
            base_period=mean_quant_ret_bydate.columns[0],
        )

        compstd_quant_daily = std_quant_daily.apply(std_conversion,
                                                    base_period=std_quant_daily.columns[0])

        alpha_beta = perf.factor_alpha_beta(factor_data,
                                            demeaned=True)

        mean_ret_spread_quant, std_spread_quant = perf.compute_mean_returns_spread(
            mean_quant_rateret_bydate,
            factor_data["factor_quantile"].max(),
            factor_data["factor_quantile"].min(),
            std_err=compstd_quant_daily,
        )
        if verbose:
            print(mean_ret_spread_quant.mean().mul(10000)
                  .to_frame('Mean Period Wise Spread (bps)').join(alpha_beta.T).T)

        fig, axes = plt.subplots(ncols=3, figsize=(18, 4))

        plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
        plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
        axes[0].set_xlabel('Quantile')

        plotting.plot_cumulative_returns_by_quantile(mean_quant_ret_bydate['1D'],
                                                     freq=pd.tseries.offsets.BDay(),
                                                     period='1D',
                                                     ax=axes[1])
        axes[1].set_title('Cumulative Return by Quantile (1D Period)')

        title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
        plotting.plot_cumulative_returns(factor_returns['1D'],
                                         period='1D',
                                         freq=pd.tseries.offsets.BDay(),
                                         title=title,
                                         ax=axes[2])

        fig.suptitle('Alphalens - Validation Set Performance', fontsize=14)
        fig.tight_layout()
        fig.subplots_adjust(top=.85)

        # Summary and full tear sheets
        if tearsheet:
            create_summary_tear_sheet(factor_data)
            create_full_tear_sheet(factor_data)

    def evaluate(self, remove_instore=False, lookahead=1):
        scope_params = ['lookahead', 'train_length', 'test_length']
        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
        lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']

        # Summary metrics by fold
        lgb_metrics = self._get_lgb_metrics(scope_params, lgb_train_params, daily_ic_metrics)
        lgb_metrics.to_hdf(self.outstore, 'lgb/metrics')

        # Information coefficient by day
        int_cols = ['lookahead', 'train_length', 'test_length', 'boost_rounds']
        id_vars = ['date'] + scope_params + lgb_train_params
        lgb_ic = self._get_lgb_ic(int_cols, scope_params, lgb_train_params, id_vars)
        lgb_ic.to_hdf(self.outstore, 'lgb/ic')
        lgb_daily_ic = lgb_ic.groupby(id_vars[1:] + ['boost_rounds']).ic.mean().to_frame('ic').reset_index()
        lgb_daily_ic.to_hdf(self.outstore, 'lgb/daily_ic')

        # Cross-validation result: best hyperparameters
        print(lgb_daily_ic.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'ic')))
        lgb_metrics.groupby('lookahead', group_keys=False
                            ).apply(lambda x: x.nlargest(3, 'ic')).to_hdf(self.outstore, 'lgb/best_model')
        print(lgb_metrics.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'daily_ic_mean')))

        # Visualization
        self.plot_metrics(lgb_metrics, lgb_daily_ic, t=1)

        # Alphalens analysis - validation performance
        lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
        best_params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=5, best=0)
        best_params.to_hdf(self.outstore, 'lgb/best_params')

        self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)

        # Get predictions for the validation period
        best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
                                                     lookahead=lookahead, topn=10)
        test_tickers = best_predictions.index.get_level_values('symbol').unique()
        start = best_predictions.index.get_level_values('date').min()
        end = best_predictions.index.get_level_values('date').max()
        trade_prices = self.get_trade_prices(test_tickers, start, end)
        trade_prices.to_hdf(self.outstore, 'trade_prices/model_selection')
        pd.Series(test_tickers).to_hdf(self.outstore, 'lgb/tickers')
        # We average the top five models and provide the corresponding prices to Alphalens
        # to compute the mean period-wise return earned on an equal-weighted portfolio
        # invested in the daily factor quintiles for various holding periods:
        factor = best_predictions.iloc[:, :5].mean(1).dropna().tz_convert('UTC', level='date').swaplevel()
        # Create Alphalens inputs
        factor_data = get_clean_factor_and_forward_returns(factor=factor,
                                                           prices=trade_prices,
                                                           quantiles=5,
                                                           periods=(1, 5, 10, 21),
                                                           max_loss=1)
        self.apply_alphalen_analysis(factor_data, tearsheet=True, verbose=True)
        # Delete the temporary files
        if remove_instore:
            os.remove(self.trainstore)

    def make_predictions(self, data: pd.DataFrame, lookahead=1, verbose=True):
        YEAR = 252
        idx = pd.IndexSlice
        scope_params = ['lookahead', 'train_length', 'test_length']
        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
        lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']

        base_params = dict(boosting='gbdt',
                           objective='regression',
                           verbose=-1)

        categoricals = ['year', 'month', 'sector', 'weekday']
        data = data.sort_index()
        labels = sorted(data.filter(like='_fwd').columns)
        features = data.columns.difference(labels).tolist()
        label = f'r{lookahead:02}_fwd'
        for feature in categoricals:
            data[feature] = pd.factorize(data[feature], sort=True)[0]

        lgb_data = lgb.Dataset(data=data[features],
                               label=data[label],
                               categorical_feature=categoricals,
                               free_raw_data=False)
        # Generate predictions
        lgb_ic = pd.read_hdf(self.outstore, 'lgb/ic')
        lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')

        for position in range(10):
            params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params,
                                          t=lookahead, best=position)
            params = params.to_dict()
            for p in ['min_data_in_leaf', 'num_leaves']:
                params[p] = int(params[p])
            train_length = int(params.pop('train_length'))
            test_length = int(params.pop('test_length'))
            num_boost_round = int(params.pop('boost_rounds'))
            params.update(base_params)

            print(f'\nPosition: {position:02}')

            # 1-year out-of-sample period
            n_splits = int(YEAR / test_length)
            cv = MultipleTimeSeriesCV(n_splits=n_splits,
                                      test_period_length=test_length,
                                      lookahead=lookahead,
                                      train_period_length=train_length)

            predictions = []
            start = time()
            for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
                print(i, end=' ', flush=True)
                lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                            params=params).construct()

                model = lgb.train(params=params,
                                  train_set=lgb_train,
                                  num_boost_round=num_boost_round,
                                  )

                test_set = data.iloc[test_idx, :]
                y_test = test_set.loc[:, label].to_frame('y_test')
                y_pred = model.predict(test_set.loc[:, model.feature_name()])
                predictions.append(y_test.assign(prediction=y_pred))

            if position == 0:
                test_predictions = (pd.concat(predictions)
                                    .rename(columns={'prediction': position}))
            else:
                test_predictions[position] = pd.concat(predictions).prediction

        by_day = test_predictions.groupby(level='date')
        for position in range(10):
            if position == 0:
                ic_by_day = by_day.apply(lambda x: spearmanr(
                    x.y_test, x[position])[0]).to_frame()
            else:
                ic_by_day[position] = by_day.apply(
                    lambda x: spearmanr(x.y_test, x[position])[0])
        if verbose:
            print(ic_by_day.describe())
        test_predictions.reset_index().to_hdf(self.outstore, f'lgb/test/{lookahead:02}')
        return test_predictions

    def load_predictions(self, predictions=None, lookahead=1):
        if predictions is None:
            predictions = pd.concat([
                pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
                pd.read_hdf(self.outstore, f'lgb/test/{lookahead:02}').drop('y_test', axis=1)
            ])
        predictions = predictions.set_index(['symbol', 'date'])

        predictions = (predictions.loc[~predictions.index.duplicated()]
                       .iloc[:, :10]
                       .mean(1)
                       .sort_index()
                       .dropna()
                       .to_frame('prediction'))
        tickers = predictions.index.get_level_values('symbol').unique().tolist()
        return (predictions
                .unstack('symbol')
                .prediction
                .tz_convert('UTC')), tickers

    def assert_last_date(self, predictions: pd.DataFrame):
        """
        Useful in live trading to ensure that the last date in the predictions
        is the previous day, so the model predicts today's returns.
        """
        last_date = predictions.index.get_level_values('date').max()
        if last_date.tzinfo is None:
            last_date = last_date.tz_localize('UTC')
        assert last_date == (pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=1)).normalize()