bbstrader 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of bbstrader might be problematic.
- bbstrader/__init__.py +7 -7
- bbstrader/btengine/__init__.py +7 -7
- bbstrader/btengine/backtest.py +30 -26
- bbstrader/btengine/data.py +92 -81
- bbstrader/btengine/event.py +2 -1
- bbstrader/btengine/execution.py +18 -16
- bbstrader/btengine/performance.py +11 -7
- bbstrader/btengine/portfolio.py +35 -36
- bbstrader/btengine/strategy.py +113 -92
- bbstrader/config.py +12 -10
- bbstrader/core/data.py +4 -5
- bbstrader/core/utils.py +57 -0
- bbstrader/ibkr/utils.py +0 -0
- bbstrader/metatrader/__init__.py +5 -5
- bbstrader/metatrader/account.py +117 -121
- bbstrader/metatrader/rates.py +81 -78
- bbstrader/metatrader/risk.py +23 -37
- bbstrader/metatrader/trade.py +154 -138
- bbstrader/metatrader/utils.py +3 -3
- bbstrader/models/__init__.py +5 -5
- bbstrader/models/factors.py +17 -12
- bbstrader/models/ml.py +371 -305
- bbstrader/models/optimization.py +14 -12
- bbstrader/models/portfolio.py +44 -35
- bbstrader/models/risk.py +15 -9
- bbstrader/trading/__init__.py +2 -2
- bbstrader/trading/execution.py +245 -179
- bbstrader/trading/scripts.py +8 -4
- bbstrader/trading/strategies.py +78 -65
- bbstrader/tseries.py +124 -98
- {bbstrader-0.2.0.dist-info → bbstrader-0.2.1.dist-info}/LICENSE +1 -1
- {bbstrader-0.2.0.dist-info → bbstrader-0.2.1.dist-info}/METADATA +2 -1
- bbstrader-0.2.1.dist-info/RECORD +37 -0
- bbstrader-0.2.0.dist-info/RECORD +0 -36
- {bbstrader-0.2.0.dist-info → bbstrader-0.2.1.dist-info}/WHEEL +0 -0
- {bbstrader-0.2.0.dist-info → bbstrader-0.2.1.dist-info}/top_level.txt +0 -0
bbstrader/models/ml.py
CHANGED
@@ -1,36 +1,38 @@
-
-import
+import os
+import warnings
+from datetime import datetime
+from itertools import product
+from time import time
+
+import lightgbm as lgb
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
 import seaborn as sns
 import talib
-from time import time
-from tqdm import tqdm
-from talib import RSI, BBANDS, MACD, ATR
 import yfinance as yf
-from scipy.stats import spearmanr
-from itertools import product
-import lightgbm as lgb
-from collections import defaultdict
-from alphalens.tears import (create_summary_tear_sheet,
-                             create_full_tear_sheet)
-from alphalens import plotting
 from alphalens import performance as perf
-from alphalens
-from
-from
+from alphalens import plotting
+from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
+from alphalens.utils import (
+    get_clean_factor_and_forward_returns,
+    rate_of_return,
+    std_conversion,
+)
+from scipy.stats import spearmanr
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from talib import ATR, BBANDS, MACD, RSI
 
-import warnings
 warnings.filterwarnings('ignore')
 
 
 __all__ = [
-    'OneStepTimeSeriesSplit',
-    'MultipleTimeSeriesCV',
+    'OneStepTimeSeriesSplit',
+    'MultipleTimeSeriesCV',
     'LightGBModel'
 ]
 
+
 class OneStepTimeSeriesSplit:
     __author__ = "Stefan Jansen"
     """Generates tuples of train_idx, test_idx pairs
@@ -42,7 +44,7 @@ class OneStepTimeSeriesSplit:
         self.shuffle = shuffle
 
     @staticmethod
-    def chunks(l, n):
+    def chunks(l, n):  # noqa: E741
        for i in range(0, len(l), n):
            yield l[i:i + n]
 
@@ -63,7 +65,7 @@ class OneStepTimeSeriesSplit:
 
    def get_n_splits(self, X, y, groups=None):
        return self.n_splits
-
+
 
 class MultipleTimeSeriesCV:
    __author__ = "Stefan Jansen"
@@ -80,7 +82,7 @@ class MultipleTimeSeriesCV:
                 lookahead=None,
                 date_idx='date',
                 shuffle=False):
-
+
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
@@ -113,7 +115,7 @@ class MultipleTimeSeriesCV:
 
    def get_n_splits(self, X, y, groups=None):
        return self.n_splits
-
+
 
 class LightGBModel(object):
    """
@@ -158,12 +160,12 @@ class LightGBModel(object):
    Chapter 12, Boosting Your Trading Strategy.
    """
 
-    def __init__(self,
-                 data: pd.DataFrame=None,
-                 datastore: pd.HDFStore='lgbdata.h5',
-                 trainstore: pd.HDFStore='lgbtrain.h5',
-                 outstore: pd.HDFStore='lgbout.h5'
-
+    def __init__(self,
+                 data: pd.DataFrame = None,
+                 datastore: pd.HDFStore = 'lgbdata.h5',
+                 trainstore: pd.HDFStore = 'lgbtrain.h5',
+                 outstore: pd.HDFStore = 'lgbout.h5'
+                 ):
        """
        Args:
            data (pd.DataFrame): The input data for the model. It should be a DataFrame with a MultiIndex containing
@@ -183,42 +185,48 @@ class LightGBModel(object):
        return pd.DataFrame({'bb_high': high, 'bb_low': low}, index=close.index)
 
    def _compute_atr(self, stock_data):
-        df = ATR(stock_data.high, stock_data.low,
-
+        df = ATR(stock_data.high, stock_data.low,
+                 stock_data.close, timeperiod=14)
        return df.sub(df.mean()).div(df.std())
-
+
    def _compute_macd(self, close):
        macd = MACD(close)[0]
        return (macd - np.mean(macd))/np.std(macd)
-
+
    def _add_technical_indicators(self, prices: pd.DataFrame):
        prices = prices.copy()
-        prices['rsi'] = prices.groupby(level='symbol').close.apply(
-
+        prices['rsi'] = prices.groupby(level='symbol').close.apply(
+            lambda x: RSI(x).reset_index(level=0, drop=True))
+        bb = prices.groupby(level=0).close.apply(
+            self._compute_bb).reset_index(level=1, drop=True)
        prices = prices.join(bb)
-        prices['bb_high'] = prices.bb_high.sub(
-
-        prices['
-
-
-
+        prices['bb_high'] = prices.bb_high.sub(
+            prices.close).div(prices.bb_high).apply(np.log1p)
+        prices['bb_low'] = prices.close.sub(
+            prices.bb_low).div(prices.close).apply(np.log1p)
+        prices['NATR'] = prices.groupby(level='symbol',
+                                        group_keys=False).apply(lambda x:
+                                                                talib.NATR(x.high, x.low, x.close))
+
        prices['ATR'] = (prices.groupby('symbol', group_keys=False)
-
-        prices['PPO'] = prices.groupby(level='symbol').close.apply(
+                         .apply(self._compute_atr))
+        prices['PPO'] = prices.groupby(level='symbol').close.apply(
+            lambda x: talib.PPO(x).reset_index(level=0, drop=True))
        prices['MACD'] = (prices
-
-
-
+                          .groupby('symbol', group_keys=False)
+                          .close
+                          .apply(self._compute_macd))
        return prices
-
+
    def download_boosting_data(self, tickers, start, end=None):
        data = []
        for ticker in tickers:
            try:
-                prices = yf.download(
+                prices = yf.download(
+                    ticker, start=start, end=end, progress=False, multi_level_index=False)
                prices['symbol'] = ticker
                data.append(prices)
-            except:
+            except:  # noqa: E722
                continue
        data = pd.concat(data)
        data = (data
@@ -228,49 +236,50 @@
                .sort_index()
                .dropna())
        return data
-
+
    def download_metadata(self, tickers):
-
+
        def clean_text_column(series: pd.Series) -> pd.Series:
            return (
                series.str.lower()
-
+                # use regex=False for literal string replacements
+                .str.replace('-', '', regex=False)
                .str.replace('&', 'and', regex=False)
                .str.replace(' ', '_', regex=False)
                .str.replace('__', '_', regex=False)
            )
-
+
        metadata = ['industry', 'sector', 'exchange', 'symbol',
-                    'heldPercentInsiders', 'heldPercentInstitutions',
+                    'heldPercentInsiders', 'heldPercentInstitutions',
                    'overallRisk', 'shortRatio', 'dividendYield', 'beta',
                    'regularMarketVolume', 'averageVolume', 'averageVolume10days',
-                    'bid', 'ask', 'bidSize', 'askSize','marketCap']
-
+                    'bid', 'ask', 'bidSize', 'askSize', 'marketCap']
+
        columns = {
-            'industry'
-            'sector'
-            'exchange'
-            'symbol'
-            'heldPercentInsiders'
+            'industry': 'industry',
+            'sector': 'sector',
+            'exchange': 'exchange',
+            'symbol': 'symbol',
+            'heldPercentInsiders': 'insiders',
            'heldPercentInstitutions': 'institutions',
-            'overallRisk'
-            'shortRatio'
-            'dividendYield'
-            'beta'
-            'regularMarketVolume'
-            'averageVolume'
-            'averageVolume10days'
-            'bid'
-            'ask'
-            'bidSize'
-            'askSize'
-            'marketCap'
+            'overallRisk': 'risk',
+            'shortRatio': 'short_ratio',
+            'dividendYield': 'dyield',
+            'beta': 'beta',
+            'regularMarketVolume': 'regvolume',
+            'averageVolume': 'avgvolume',
+            'averageVolume10days': 'avgvolume10',
+            'bid': 'bid',
+            'ask': 'ask',
+            'bidSize': 'bidsize',
+            'askSize': 'asksize',
+            'marketCap': 'marketcap'
        }
        data = []
        for symbol in tickers:
            try:
                symbol_info = yf.Ticker(symbol).info
-            except:
+            except:  # noqa: E722
                continue
            infos = {}
            for info in metadata:
@@ -284,8 +293,8 @@ class LightGBModel(object):
        metadata = metadata.set_index('symbol')
        return metadata
 
-    def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
-
+    def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
+                                          volume_features, bid_ask_features, market_cap_feature):
        df = df.copy()
        scaler = StandardScaler()
 
@@ -305,46 +314,46 @@
 
        # Calculate the liquidity score by combining the normalized features
        df['liquidity_score'] = (weights['volume'] * df[volume_features].mean(axis=1) +
-
-
+                                 weights['bid_ask_spread'] * df['bid_ask_spread'] +
+                                 weights['marketCap'] * df[market_cap_feature[0]])
        df_sorted = df.sort_values(by='liquidity_score', ascending=False)
 
        return df_sorted.nlargest(n, 'liquidity_score').index
-
+
    def _encode_metadata(self, df: pd.DataFrame):
        df = df.copy()
        # Binning each numerical feature into categories
        df['insiders'] = pd.qcut(
-            df['insiders'], q=4,
+            df['insiders'], q=4,
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['institutions'] = pd.qcut(
-            df['institutions'], q=4,
+            df['institutions'], q=4,
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['risk'] = pd.cut(
-            df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
+            df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
            labels=['Low', 'Medium', 'High', 'Very High']
        )
        df['short_ratio'] = pd.qcut(
-            df['short_ratio'], q=4,
+            df['short_ratio'], q=4,
            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['dyield'] = pd.cut(
-            df['dyield'],
+            df['dyield'],
            bins=[-float('inf'), 0.002, 0.005, 0.01, float('inf')],
-
+            labels=['Very Low', 'Low', 'High', 'Very High']
        )
        df['beta'] = pd.cut(
-            df['beta'],
-            bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
+            df['beta'],
+            bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
            labels=['Low', 'Moderate', 'High', 'Very High']
        )
 
        # Encode binned features
        binned_features = [
-            'insiders', 'institutions',
-            'risk', 'short_ratio', 'dyield',
+            'insiders', 'institutions',
+            'risk', 'short_ratio', 'dyield',
            'beta', 'sector', 'industry', 'exchange',
        ]
        label_encoders = {}
@@ -355,10 +364,10 @@ class LightGBModel(object):
            label_encoders[col] = le
        return df, label_encoders
 
-    def prepare_boosting_data(self,
-                              prices: pd.DataFrame,
-                              metadata: pd.DataFrame = None,
-                              min_years=7,
+    def prepare_boosting_data(self,
+                              prices: pd.DataFrame,
+                              metadata: pd.DataFrame = None,
+                              min_years=7,
                              universe=500
                              ):
        if metadata is None:
@@ -389,10 +398,11 @@
        prices = prices[~prices.index.duplicated()]
 
        # Align price and meta data
-        metadata = metadata[~metadata.index.duplicated() &
+        metadata = metadata[~metadata.index.duplicated() &
+                            metadata.sector.notnull()]
        metadata.sector = metadata.sector.str.lower().str.replace(' ', '_')
        shared = (prices.index.get_level_values('symbol').unique()
-
+                  .intersection(metadata.index))
        metadata = metadata.loc[shared, :]
        prices = prices.loc[idx[shared, :], :]
 
@@ -415,16 +425,16 @@
        prices['dollar_vol'] = prices[['close', 'volume']].prod(1).div(1e3)
        # compute dollar volume to determine universe
        dollar_vol_ma = (prices
-
-
-
-
+                         .dollar_vol
+                         .unstack('symbol')
+                         .rolling(window=21, min_periods=1)  # 1 trading month
+                         .mean())
 
        # Rank stocks by moving average
        prices['dollar_vol_rank'] = (dollar_vol_ma
-
-
-
+                                     .rank(axis=1, ascending=False)
+                                     .stack('symbol')
+                                     .swaplevel())
        # Add some Basic Factors
        prices = self._add_technical_indicators(prices)
        # Combine Price and Meta Data
@@ -438,12 +448,12 @@
        for t in T:
            # Reset the index to apply qcut by date without grouping errors
            prices[f'r{t:02}dec'] = (prices.reset_index(level='date')
-
-
-
-
-
-
+                                     .groupby('date')[f'r{t:02}']
+                                     .apply(lambda x: pd.qcut(x,
+                                                              q=10,
+                                                              labels=False,
+                                                              duplicates='drop'))
+                                     .values)
        # Daily sector return deciles
        for t in T:
            prices[f'r{t:02}q_sector'] = (
@@ -461,50 +471,52 @@
                level='symbol')[f'r{t:02}'].shift(-t)
 
        # Remove outliers
-        outliers = prices[prices.r01 > 1].index.get_level_values(
+        outliers = prices[prices.r01 > 1].index.get_level_values(
+            'symbol').unique()
        prices = prices.drop(outliers, level='symbol')
        # Create time and sector dummy variables
        prices['year'] = prices.index.get_level_values('date').year
        prices['month'] = prices.index.get_level_values('date').month
        prices['weekday'] = prices.index.get_level_values('date').weekday
        # Store Model Data
-        prices = prices.drop(
+        prices = prices.drop(
+            ['open', 'close', 'low', 'high', 'volume'], axis=1)
        if 'adj_close' in prices.columns:
            prices = prices.drop('adj_close', axis=1)
-        prices.reset_index().
-        return prices.
+        prices.reset_index().to_hdf(self.datastore, 'model_data')
+        return prices.sort_index()
 
    def tickers(self):
        return pd.read_hdf(self.outstore, 'lgb/tickers').tolist()
-
-    def load_model_data(self):
-        return pd.read_hdf(self.datastore, 'model_data').set_index(['symbol', 'date']).sort_index()
-
-    def format_time(self, t):
 
+    def load_model_data(self, key='model_data'):
+        return pd.read_hdf(self.datastore, key=key).set_index(['symbol', 'date']).sort_index()
+
+    def format_time(self, t):
        """Return a formatted time string 'HH:MM:SS
        based on a numeric time() value"""
        m, s = divmod(t, 60)
        h, m = divmod(m, 60)
        return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'
-
+
    def fit(self, data: pd.DataFrame, verbose=True):
        def get_fi(model):
            """Return normalized feature importance as pd.Series"""
            fi = model.feature_importance(importance_type='gain')
            return (pd.Series(fi / fi.sum(),
-
-
+                              index=model.feature_name()))
+
        def ic_lgbm(preds, train_data):
            """Custom IC eval metric for lightgbm"""
            is_higher_better = True
            return 'ic', spearmanr(preds, train_data.get_label())[0], is_higher_better
+        data = data.dropna()
        # Hyperparameter options
        YEAR = 252
        base_params = dict(boosting='gbdt',
-
-
-
+                           objective='regression',
+                           verbose=-1)
+
        # constraints on structure (depth) of each tree
        max_depths = [2, 3, 5, 7]
        num_leaves_opts = [2 ** i for i in max_depths]
@@ -517,12 +529,12 @@
        feature_fraction_opts = [.3, .6, .95]
 
        param_names = ['learning_rate', 'num_leaves',
-
-
+                       'feature_fraction', 'min_data_in_leaf']
+
        cv_params = list(product(learning_rate_ops,
-
-
-
+                                 num_leaves_opts,
+                                 feature_fraction_opts,
+                                 min_data_in_leaf_opts))
        n_params = len(cv_params)
        print(f'# Parameters: {n_params}')
 
@@ -532,15 +544,15 @@
        test_lengths = [63]
        test_params = list(product(lookaheads, train_lengths, test_lengths))
        n = len(test_params)
-        test_param_sample = np.random.choice(
+        test_param_sample = np.random.choice(
+            list(range(n)), size=int(n), replace=False)
        test_params = [test_params[i] for i in test_param_sample]
        print('Train configs:', len(test_params))
 
-
+        # Categorical Variables
        categoricals = ['year', 'weekday', 'month']
        for feature in categoricals:
            data[feature] = pd.factorize(data[feature], sort=True)[0]
-
 
        # ### Run Cross-Validation
        labels = sorted(data.filter(like='fwd').columns)
@@ -550,64 +562,64 @@
        num_boost_round = num_iterations[-1]
 
        metric_cols = (param_names + ['t', 'daily_ic_mean', 'daily_ic_mean_n',
-
-
-
+                                      'daily_ic_median', 'daily_ic_median_n'] +
+                       [str(n) for n in num_iterations])
+
        for lookahead, train_length, test_length in test_params:
            # randomized grid search
            cvp = np.random.choice(list(range(n_params)),
-
-
+                                   size=int(n_params / 2),
+                                   replace=False)
            cv_params_ = [cv_params[i] for i in cvp]
 
            # set up cross-validation
            n_splits = int(2 * YEAR / test_length)
            if verbose:
                print(f'Lookahead: {lookahead:2.0f} | '
-
-
-
-
+                      f'Train: {train_length:3.0f} | '
+                      f'Test: {test_length:2.0f} | '
+                      f'Params: {len(cv_params_):3.0f} | '
+                      f'Train configs: {len(test_params)}')
 
            # time-series cross-validation
            cv = MultipleTimeSeriesCV(n_splits=n_splits,
-
-
-
+                                      lookahead=lookahead,
+                                      test_period_length=test_length,
+                                      train_period_length=train_length)
 
            label = label_dict[lookahead]
            outcome_data = data.loc[:, features + [label]].dropna()
-
+
            # binary dataset
            lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
-
-
-
+                                   label=outcome_data[label],
+                                   categorical_feature=categoricals,
+                                   free_raw_data=False)
            T = 0
-            predictions, metrics
-
+            predictions, metrics = [], []
+
            # iterate over (shuffled) hyperparameter combinations
            for p, param_vals in enumerate(cv_params_):
-                key = f'{lookahead}/{train_length}/{test_length}/' +
+                key = f'{lookahead}/{train_length}/{test_length}/' + \
+                    '/'.join([str(p) for p in param_vals])
                params = dict(zip(param_names, param_vals))
                params.update(base_params)
 
                start = time()
-                cv_preds
-
-
+                cv_preds = []
+
                # iterate over folds
                for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
-
+
                    # select train subset
                    lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
-
-
+                                                params=params).construct()
+
                    # train model for num_boost_round
                    model = lgb.train(params=params,
-
-
-
+                                      train_set=lgb_train,
+                                      num_boost_round=num_boost_round,
+                                      )
                    # log feature importance
                    if i == 0:
                        fi = get_fi(model).to_frame()
@@ -618,32 +630,36 @@
                    test_set = outcome_data.iloc[test_idx, :]
                    X_test = test_set.loc[:, model.feature_name()]
                    y_test = test_set.loc[:, label]
-                    y_pred = {str(n): model.predict(X_test, num_iteration=n)
-
+                    y_pred = {str(n): model.predict(X_test, num_iteration=n)
+                              for n in num_iterations}
+
                    # record predictions for each fold
-                    cv_preds.append(y_test.to_frame(
-
+                    cv_preds.append(y_test.to_frame(
+                        'y_test').assign(**y_pred).assign(i=i))
+
                # combine fold results
                cv_preds = pd.concat(cv_preds).assign(**params)
                predictions.append(cv_preds)
-
+
                # compute IC per day
                by_day = cv_preds.groupby(level='date')
                ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
-
+                                       for n in num_iterations], axis=1)
                daily_ic_mean = ic_by_day.mean()
                daily_ic_mean_n = daily_ic_mean.idxmax()
                daily_ic_median = ic_by_day.median()
                daily_ic_median_n = daily_ic_median.idxmax()
-
+
                # compute IC across all predictions
-                ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
+                ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
+                      for n in num_iterations]
                t = time() - start
                T += t
-
+
                # collect metrics
                metrics = pd.Series(list(param_vals) +
-                                    [t, daily_ic_mean.max(), daily_ic_mean_n,
+                                    [t, daily_ic_mean.max(), daily_ic_mean_n,
+                                     daily_ic_median.max(), daily_ic_median_n] + ic,
                                    index=metric_cols)
                if verbose:
                    msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
@@ -653,14 +669,16 @@
 
                # persist results for given CV run and hyperparameter combination
                metrics.to_hdf(self.trainstore, 'metrics/' + key)
-                ic_by_day.assign(
+                ic_by_day.assign(
+                    **params).to_hdf(self.trainstore, 'daily_ic/' + key)
                fi.T.describe().T.assign(**params).to_hdf(self.trainstore, 'fi/' + key)
-                cv_preds.to_hdf(self.trainstore,
+                cv_preds.to_hdf(self.trainstore,
+                                'predictions/' + key, append=True)
 
    def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
        with pd.HDFStore(self.trainstore) as store:
            for i, key in enumerate(
-
+                    [k[1:] for k in store.keys() if k[1:].startswith('metrics')]):
                _, t, train_length, test_length = key.split('/')[:4]
                attrs = {
                    'lookahead': t,
@@ -675,10 +693,10 @@
            lgb_metrics[i] = pd.Series(s)
 
        id_vars = scope_params + lgb_train_params + daily_ic_metrics
-        lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
-
-
-
+        lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
+                              id_vars=id_vars,
+                              value_name='ic',
+                              var_name='boost_rounds').dropna().apply(pd.to_numeric)
        return lgb_metrics
 
    def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
@@ -689,22 +707,23 @@
            _, t, train_length, test_length = key.split('/')[:4]
            if key.startswith('daily_ic'):
                df = (store[key]
-
-
-
-
+                      .drop(['boosting', 'objective', 'verbose'], axis=1)
+                      .assign(lookahead=t,
+                              train_length=train_length,
+                              test_length=test_length))
                lgb_ic.append(df)
        lgb_ic = pd.concat(lgb_ic).reset_index()
-        lgb_ic = pd.melt(lgb_ic,
-
-
-
+        lgb_ic = pd.melt(lgb_ic,
+                         id_vars=id_vars,
+                         value_name='ic',
+                         var_name='boost_rounds').dropna()
        lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
        return lgb_ic
 
    def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
        param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
-        df = data[data.lookahead==t].sort_values(
+        df = data[data.lookahead == t].sort_values(
+            'ic', ascending=False).iloc[best]
        return df.loc[param_cols]
 
    def _get_lgb_key(self, t, p):
@@ -713,12 +732,12 @@
 
    def _select_ic(self, params, ic_data, lookahead):
        return ic_data.loc[(ic_data.lookahead == lookahead) &
-
-
-
-
-
-
+                           (ic_data.train_length == params.train_length) &
+                           (ic_data.test_length == params.test_length) &
+                           (ic_data.learning_rate == params.learning_rate) &
+                           (ic_data.num_leaves == params.num_leaves) &
+                           (ic_data.feature_fraction == params.feature_fraction) &
+                           (ic_data.boost_rounds == params.boost_rounds), ['date', 'ic']].set_index('date')
 
    def get_trade_prices(self, tickers, start, end):
        idx = pd.IndexSlice
@@ -736,73 +755,77 @@
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
        axes = axes.flatten()
        for i, t in enumerate([1, 21]):
-            params = self._get_lgb_params(
+            params = self._get_lgb_params(
+                lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0)
            data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
            rolling = data.rolling(63).ic.mean().dropna()
            avg = data.ic.mean()
            med = data.ic.median()
-            rolling.plot(
+            rolling.plot(
+                ax=axes[i], title=f'Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}')
            axes[i].axhline(avg, c='darkred', lw=1)
            axes[i].axhline(0, ls='--', c='k', lw=1)
 
        fig.suptitle('3-Month Rolling Information Coefficient', fontsize=16)
        fig.tight_layout()
-        fig.subplots_adjust(top=0.92)
+        fig.subplots_adjust(top=0.92)
 
    def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
-
-        sns.jointplot(x=lgb_metrics.daily_ic_mean,y=lgb_metrics.ic)
+        # Visualization
+        sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)
 
-
+        sns.catplot(x='lookahead', y='ic',
                    col='train_length', row='test_length',
                    data=lgb_metrics,
-                    kind='box')
-
+                    kind='box')
+        sns.catplot(x='boost_rounds',
                    y='ic',
                    col='train_length',
                    row='test_length',
                    data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
-                    kind='box')
+                    kind='box')
 
    def get_best_predictions(self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10):
        for best in range(topn):
-            best_params = self._get_lgb_params(
+            best_params = self._get_lgb_params(
+                lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best)
            key = self._get_lgb_key(lookahead, best_params)
            rounds = str(int(best_params.boost_rounds))
            if best == 0:
-                best_predictions = pd.read_hdf(
+                best_predictions = pd.read_hdf(
+                    self.trainstore, 'predictions/' + key)
                best_predictions = best_predictions[rounds].to_frame(best)
            else:
-                best_predictions[best] = pd.read_hdf(
+                best_predictions[best] = pd.read_hdf(
+                    self.trainstore, 'predictions/' + key)[rounds]
        best_predictions = best_predictions.sort_index()
-        best_predictions.reset_index().to_hdf(
+        best_predictions.reset_index().to_hdf(
+            self.outstore, f'lgb/train/{lookahead:02}')
        return best_predictions
 
    def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
-
+        # Compute Alphalens metrics
        mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
-
-
-
-
-
+            factor_data,
+            by_date=True,
+            by_group=False,
+            demeaned=True,
+            group_adjust=False,
        )
        factor_returns = perf.factor_returns(factor_data)
        mean_quant_ret, std_quantile = perf.mean_return_by_quantile(factor_data,
-
-
-
-
+                                                                    by_group=False,
+                                                                    demeaned=True)
 
        mean_quant_rateret = mean_quant_ret.apply(rate_of_return, axis=0,
-
-
+                                                  base_period=mean_quant_ret.columns[0])
+
        mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
-
-
-
-
-
+            factor_data,
+            by_date=True,
+            by_group=False,
+            demeaned=True,
+            group_adjust=False,
        )
 
        mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
@@ -823,123 +846,146 @@
            std_err=compstd_quant_daily,
        )
        if verbose:
-            print(mean_ret_spread_quant.mean().mul(10000).to_frame(
+            print(mean_ret_spread_quant.mean().mul(10000).to_frame(
+                'Mean Period Wise Spread (bps)').join(alpha_beta.T).T)
 
        fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
 
-
        plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
        plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
        axes[0].set_xlabel('Quantile')
 
        plotting.plot_cumulative_returns_by_quantile(mean_quant_ret_bydate['1D'],
-
-
-
+                                                     freq=pd.tseries.offsets.BDay(),
+                                                     period='1D',
+                                                     ax=axes[1])
        axes[1].set_title('Cumulative Return by Quantile (1D Period)')
 
        title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
        plotting.plot_cumulative_returns(factor_returns['1D'],
-
-
-
-
+                                         period='1D',
+                                         freq=pd.tseries.offsets.BDay(),
+                                         title=title,
+                                         ax=axes[2])
 
        fig.suptitle('Alphalens - Validation Set Performance', fontsize=14)
        fig.tight_layout()
-        fig.subplots_adjust(top=.85)
+        fig.subplots_adjust(top=.85)
 
-
+        # Summary Tearsheet
        create_summary_tear_sheet(factor_data)
        create_full_tear_sheet(factor_data)
 
-    def evaluate(self, remove_instore=False, lookahead=1):
+    def evaluate(self, remove_instore=False, lookahead=1, verbose=True):
        scope_params = ['lookahead', 'train_length', 'test_length']
-        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n',
-
-
-
-
+        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n',
+                            'daily_ic_median', 'daily_ic_median_n']
+        lgb_train_params = ['learning_rate', 'num_leaves',
+                            'feature_fraction', 'min_data_in_leaf']
+
+        lgb_metrics = self._get_lgb_metrics(
+            scope_params, lgb_train_params, daily_ic_metrics)
+        # Summary Metrics by Fold
        lgb_metrics.to_hdf(self.outstore, 'lgb/metrics')
-
-
+
+        # Information Coefficient by Day
        int_cols = ['lookahead', 'train_length', 'test_length', 'boost_rounds']
        id_vars = ['date'] + scope_params + lgb_train_params
-        lgb_ic = self._get_lgb_ic(
+        lgb_ic = self._get_lgb_ic(
+            int_cols, scope_params, lgb_train_params, id_vars)
        lgb_ic.to_hdf(self.outstore, 'lgb/ic')
-        lgb_daily_ic = lgb_ic.groupby(
+        lgb_daily_ic = lgb_ic.groupby(
+            id_vars[1:] + ['boost_rounds']).ic.mean().to_frame('ic').reset_index()
        lgb_daily_ic.to_hdf(self.outstore, 'lgb/daily_ic')
 
-
-
-
-
+        # Cross-validation Result: Best Hyperparameters
+        if verbose:
+            print(lgb_daily_ic.groupby('lookahead', group_keys=False).apply(
+                lambda x: x.nlargest(3, 'ic')))
+        lgb_metrics.groupby('lookahead', group_keys=False).apply(
+            lambda x: x.nlargest(3, 'ic'))
        lgb_metrics.groupby('lookahead', group_keys=False
                            ).apply(lambda x: x.nlargest(3, 'ic')).to_hdf(self.outstore, 'lgb/best_model')
-
+        if verbose:
+            print(lgb_metrics.groupby('lookahead', group_keys=False).apply(
+                lambda x: x.nlargest(3, 'daily_ic_mean')))
 
-
-
+        # Visualization
+        if verbose:
+            self.plot_metrics(lgb_metrics, lgb_daily_ic, t=lookahead)
 
-
+        # AlphaLens Analysis - Validation Performance
        lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
-        best_params = self._get_lgb_params(
+        best_params = self._get_lgb_params(
+            lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=0)
        best_params.to_hdf(self.outstore, 'lgb/best_params')
 
-
+        if verbose:
+            self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
 
-
-        best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
-
-        test_tickers = best_predictions.index.get_level_values(
+        # Get Predictions for Validation Period
+        best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
+                                                     lookahead=lookahead, topn=10)
+        test_tickers = best_predictions.index.get_level_values(
+            'symbol').unique()
        start = best_predictions.index.get_level_values('date').min()
        end = best_predictions.index.get_level_values('date').max()
        trade_prices = self.get_trade_prices(test_tickers, start, end)
-        trade_prices.to_hdf(self.outstore, 'trade_prices/model_selection')
        pd.Series(test_tickers).to_hdf(self.outstore, 'lgb/tickers')
-        #We average the top five models and provide the corresponding prices to Alphalens,
-        #
-
-
-
-
-
-
-
-
+        # We average the top five models and provide the corresponding prices to Alphalens,
+        # in order to compute the mean period-wise
+        # return earned on an equal-weighted portfolio invested in the daily factor quintiles
+        # for various holding periods:
+        factor = best_predictions.iloc[:, :5].mean(
+            1).dropna().tz_convert('UTC', level='date').swaplevel()
+        # Create AlphaLens Inputs
+        if verbose:
+            factor_data = get_clean_factor_and_forward_returns(factor=factor,
+                                                               prices=trade_prices,
+                                                               quantiles=5,
+                                                               periods=(
+                                                                   1, 5, 10, 21),
+                                                               max_loss=1)
+            self.apply_alphalen_analysis(
+                factor_data, tearsheet=True, verbose=True)
        # Delete the temporary files
        if remove_instore:
            os.remove(self.trainstore)
-
-    def make_predictions(self, data: pd.DataFrame, lookahead=1, verbose=True):
+
+    def make_predictions(self, data: pd.DataFrame, mode='test', lookahead=1, verbose=True):
+        data = data.copy()
        YEAR = 252
-        idx = pd.IndexSlice
        scope_params = ['lookahead', 'train_length', 'test_length']
-
-
+        lgb_train_params = ['learning_rate', 'num_leaves',
+                            'feature_fraction', 'min_data_in_leaf']
 
        base_params = dict(boosting='gbdt',
-
-
+                           objective='regression',
+                           verbose=-1)
 
-        categoricals = ['year', 'month', '
-        data = data.sort_index()
+        categoricals = ['year', 'month', 'weekday']
        labels = sorted(data.filter(like='_fwd').columns)
        features = data.columns.difference(labels).tolist()
        label = f'r{lookahead:02}_fwd'
        for feature in categoricals:
            data[feature] = pd.factorize(data[feature], sort=True)[0]
-
+
+        if mode == 'test':
+            data = data.dropna().sort_index()
+        elif mode == 'live':
+            data[labels] = data[labels].fillna(0)
+            data = data.sort_index().dropna()
+
        lgb_data = lgb.Dataset(data=data[features],
-
-
-
-
-        lgb_ic = pd.read_hdf(self.outstore, 'lgb/ic')
+                               label=data[label],
+                               categorical_feature=categoricals,
+                               free_raw_data=False)
+        # Generate predictions
        lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
 
        for position in range(10):
-            params = self._get_lgb_params(
+            params = self._get_lgb_params(
+                lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position)
 
            params = params.to_dict()
 
@@ -949,27 +995,27 @@
            test_length = int(params.pop('test_length'))
            num_boost_round = int(params.pop('boost_rounds'))
            params.update(base_params)
-
-
+            if verbose:
+                print(f'\nPosition: {position:02}')
 
            # 1-year out-of-sample period
            n_splits = int(YEAR / test_length)
            cv = MultipleTimeSeriesCV(n_splits=n_splits,
-
-
-
+                                      test_period_length=test_length,
+                                      lookahead=lookahead,
+                                      train_period_length=train_length)
 
            predictions = []
-            start = time()
            for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
-
+                if verbose:
+                    print(i, end=' ', flush=True)
                lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                            params=params).construct()
 
                model = lgb.train(params=params,
-
-
-
+                                  train_set=lgb_train,
+                                  num_boost_round=num_boost_round,
+                                  )
 
                test_set = data.iloc[test_idx, :]
                y_test = test_set.loc[:, label].to_frame('y_test')
@@ -992,29 +1038,32 @@
            lambda x: spearmanr(x.y_test, x[position])[0])
        if verbose:
            print(ic_by_day.describe())
-        test_predictions.reset_index().to_hdf(
+        test_predictions.reset_index().to_hdf(
+            self.outstore, f'lgb/test/{lookahead:02}')
        return test_predictions
 
    def load_predictions(self, predictions=None, lookahead=1):
        if predictions is None:
            predictions = pd.concat([
-                pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
-                pd.read_hdf(self.outstore,
+                pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
+                pd.read_hdf(self.outstore,
+                            f'lgb/test/{lookahead:02}').drop('y_test', axis=1)
            ])
        predictions = predictions.set_index(['symbol', 'date'])
 
        predictions = (predictions.loc[~predictions.index.duplicated()]
-
-
-
-
-
-        tickers = predictions.index.get_level_values(
+                       .iloc[:, :10]
+                       .mean(1)
+                       .sort_index()
+                       .dropna()
+                       .to_frame('prediction'))
+        tickers = predictions.index.get_level_values(
+            'symbol').unique().tolist()
        return (predictions
                .unstack('symbol')
                .prediction
-                .tz_convert
-
+                .tz_convert('UTC')), tickers
+
    def assert_last_date(self, predictions: pd.DataFrame):
        """
        Usefull in Live Trading to ensure that the last date in the predictions
@@ -1023,4 +1072,21 @@
        last_date = predictions.index.get_level_values('date').max()
        if last_date.tzinfo is None:
            last_date = last_date.tz_localize('UTC')
-
+        try:
+            if datetime.now().strftime('%A') == 'Monday':
+                assert last_date == (pd.Timestamp.now(
+                    tz='UTC') - pd.Timedelta(days=3)).normalize()
+            else:
+                assert (
+                    last_date == (pd.Timestamp.now(tz='UTC')
+                                  - pd.Timedelta(days=1)).normalize()
+                    or last_date == (pd.Timestamp.now(tz='UTC')).normalize()
+                )
+            return True
+        except AssertionError:
+            return False
+
+    def clean_stores(self, *stores):
+        for store in stores:
+            if os.path.exists(store):
+                os.remove(store)