bbstrader 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of bbstrader might be problematic.

bbstrader/models/ml.py CHANGED
@@ -1,36 +1,38 @@
-from pathlib import Path
-import sys, os
+import os
+import warnings
+from datetime import datetime
+from itertools import product
+from time import time
+
+import lightgbm as lgb
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
 import seaborn as sns
 import talib
-from time import time
-from tqdm import tqdm
-from talib import RSI, BBANDS, MACD, ATR
 import yfinance as yf
-from scipy.stats import spearmanr
-from itertools import product
-import lightgbm as lgb
-from collections import defaultdict
-from alphalens.tears import (create_summary_tear_sheet,
-                             create_full_tear_sheet)
-from alphalens import plotting
 from alphalens import performance as perf
-from alphalens.utils import get_clean_factor_and_forward_returns, rate_of_return, std_conversion
-from sklearn.preprocessing import StandardScaler
-from sklearn.preprocessing import LabelEncoder
+from alphalens import plotting
+from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
+from alphalens.utils import (
+    get_clean_factor_and_forward_returns,
+    rate_of_return,
+    std_conversion,
+)
+from scipy.stats import spearmanr
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from talib import ATR, BBANDS, MACD, RSI
 
-import warnings
 warnings.filterwarnings('ignore')
 
 
 __all__ = [
-    'OneStepTimeSeriesSplit',
-    'MultipleTimeSeriesCV',
+    'OneStepTimeSeriesSplit',
+    'MultipleTimeSeriesCV',
     'LightGBModel'
 ]
 
+
 class OneStepTimeSeriesSplit:
     __author__ = "Stefan Jansen"
     """Generates tuples of train_idx, test_idx pairs
@@ -42,7 +44,7 @@ class OneStepTimeSeriesSplit:
         self.shuffle = shuffle
 
     @staticmethod
-    def chunks(l, n):
+    def chunks(l, n):  # noqa: E741
         for i in range(0, len(l), n):
             yield l[i:i + n]
 
@@ -63,7 +65,7 @@ class OneStepTimeSeriesSplit:
 
     def get_n_splits(self, X, y, groups=None):
         return self.n_splits
-
+
 
 class MultipleTimeSeriesCV:
     __author__ = "Stefan Jansen"
@@ -80,7 +82,7 @@ class MultipleTimeSeriesCV:
                  lookahead=None,
                  date_idx='date',
                  shuffle=False):
-
+
         self.n_splits = n_splits
         self.lookahead = lookahead
         self.test_length = test_period_length
@@ -113,7 +115,7 @@ class MultipleTimeSeriesCV:
 
     def get_n_splits(self, X, y, groups=None):
         return self.n_splits
-
+
 
 class LightGBModel(object):
     """
@@ -158,12 +160,12 @@ class LightGBModel(object):
     Chapter 12, Boosting Your Trading Strategy.
     """
 
-    def __init__(self,
-                 data: pd.DataFrame=None,
-                 datastore: pd.HDFStore='lgbdata.h5',
-                 trainstore: pd.HDFStore='lgbtrain.h5',
-                 outstore: pd.HDFStore='lgbout.h5'
-                 ):
+    def __init__(self,
+                 data: pd.DataFrame = None,
+                 datastore: pd.HDFStore = 'lgbdata.h5',
+                 trainstore: pd.HDFStore = 'lgbtrain.h5',
+                 outstore: pd.HDFStore = 'lgbout.h5'
+                 ):
         """
         Args:
             data (pd.DataFrame): The input data for the model. It should be a DataFrame with a MultiIndex containing
@@ -183,42 +185,48 @@ class LightGBModel(object):
         return pd.DataFrame({'bb_high': high, 'bb_low': low}, index=close.index)
 
     def _compute_atr(self, stock_data):
-        df = ATR(stock_data.high, stock_data.low,
-                 stock_data.close, timeperiod=14)
+        df = ATR(stock_data.high, stock_data.low,
+                 stock_data.close, timeperiod=14)
         return df.sub(df.mean()).div(df.std())
-
+
     def _compute_macd(self, close):
         macd = MACD(close)[0]
         return (macd - np.mean(macd))/np.std(macd)
-
+
     def _add_technical_indicators(self, prices: pd.DataFrame):
         prices = prices.copy()
-        prices['rsi'] = prices.groupby(level='symbol').close.apply(lambda x: RSI(x).reset_index(level=0, drop=True))
-        bb = prices.groupby(level=0).close.apply(self._compute_bb).reset_index(level=1, drop=True)
+        prices['rsi'] = prices.groupby(level='symbol').close.apply(
+            lambda x: RSI(x).reset_index(level=0, drop=True))
+        bb = prices.groupby(level=0).close.apply(
+            self._compute_bb).reset_index(level=1, drop=True)
         prices = prices.join(bb)
-        prices['bb_high'] = prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
-        prices['bb_low'] = prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)
-        prices['NATR'] = prices.groupby(level='symbol',
-                                        group_keys=False).apply(lambda x:
-                                                                talib.NATR(x.high, x.low, x.close))
-
+        prices['bb_high'] = prices.bb_high.sub(
+            prices.close).div(prices.bb_high).apply(np.log1p)
+        prices['bb_low'] = prices.close.sub(
+            prices.bb_low).div(prices.close).apply(np.log1p)
+        prices['NATR'] = prices.groupby(level='symbol',
+                                        group_keys=False).apply(lambda x:
+                                                                talib.NATR(x.high, x.low, x.close))
+
         prices['ATR'] = (prices.groupby('symbol', group_keys=False)
-                         .apply(self._compute_atr))
-        prices['PPO'] = prices.groupby(level='symbol').close.apply(lambda x: talib.PPO(x).reset_index(level=0, drop=True))
+                         .apply(self._compute_atr))
+        prices['PPO'] = prices.groupby(level='symbol').close.apply(
+            lambda x: talib.PPO(x).reset_index(level=0, drop=True))
         prices['MACD'] = (prices
-                          .groupby('symbol', group_keys=False)
-                          .close
-                          .apply(self._compute_macd))
+                          .groupby('symbol', group_keys=False)
+                          .close
+                          .apply(self._compute_macd))
         return prices
-
+
     def download_boosting_data(self, tickers, start, end=None):
         data = []
         for ticker in tickers:
             try:
-                prices = yf.download(ticker, start=start, end=end, progress=False, multi_level_index=False)
+                prices = yf.download(
+                    ticker, start=start, end=end, progress=False, multi_level_index=False)
                 prices['symbol'] = ticker
                 data.append(prices)
-            except:
+            except:  # noqa: E722
                 continue
         data = pd.concat(data)
         data = (data
@@ -228,49 +236,50 @@ class LightGBModel(object):
                 .sort_index()
                 .dropna())
         return data
-
+
     def download_metadata(self, tickers):
-
+
         def clean_text_column(series: pd.Series) -> pd.Series:
             return (
                 series.str.lower()
-                .str.replace('-', '', regex=False)  # use regex=False for literal string replacements
+                # use regex=False for literal string replacements
+                .str.replace('-', '', regex=False)
                 .str.replace('&', 'and', regex=False)
                 .str.replace(' ', '_', regex=False)
                 .str.replace('__', '_', regex=False)
             )
-
+
         metadata = ['industry', 'sector', 'exchange', 'symbol',
-                    'heldPercentInsiders', 'heldPercentInstitutions',
+                    'heldPercentInsiders', 'heldPercentInstitutions',
                     'overallRisk', 'shortRatio', 'dividendYield', 'beta',
                     'regularMarketVolume', 'averageVolume', 'averageVolume10days',
-                    'bid', 'ask', 'bidSize', 'askSize','marketCap']
-
+                    'bid', 'ask', 'bidSize', 'askSize', 'marketCap']
+
         columns = {
-            'industry' : 'industry',
-            'sector' : 'sector',
-            'exchange' : 'exchange',
-            'symbol' : 'symbol',
-            'heldPercentInsiders' : 'insiders',
+            'industry': 'industry',
+            'sector': 'sector',
+            'exchange': 'exchange',
+            'symbol': 'symbol',
+            'heldPercentInsiders': 'insiders',
             'heldPercentInstitutions': 'institutions',
-            'overallRisk' : 'risk',
-            'shortRatio' : 'short_ratio',
-            'dividendYield' : 'dyield',
-            'beta' : 'beta',
-            'regularMarketVolume' : 'regvolume',
-            'averageVolume' : 'avgvolume',
-            'averageVolume10days' : 'avgvolume10',
-            'bid' : 'bid',
-            'ask' : 'ask',
-            'bidSize' : 'bidsize',
-            'askSize' : 'asksize',
-            'marketCap' : 'marketcap'
+            'overallRisk': 'risk',
+            'shortRatio': 'short_ratio',
+            'dividendYield': 'dyield',
+            'beta': 'beta',
+            'regularMarketVolume': 'regvolume',
+            'averageVolume': 'avgvolume',
+            'averageVolume10days': 'avgvolume10',
+            'bid': 'bid',
+            'ask': 'ask',
+            'bidSize': 'bidsize',
+            'askSize': 'asksize',
+            'marketCap': 'marketcap'
         }
         data = []
         for symbol in tickers:
             try:
                 symbol_info = yf.Ticker(symbol).info
-            except:
+            except:  # noqa: E722
                 continue
             infos = {}
             for info in metadata:
@@ -284,8 +293,8 @@ class LightGBModel(object):
         metadata = metadata.set_index('symbol')
         return metadata
 
-    def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
-                                          volume_features, bid_ask_features, market_cap_feature):
+    def _select_nlargest_liquidity_stocks(self, df: pd.DataFrame, n: int,
+                                          volume_features, bid_ask_features, market_cap_feature):
         df = df.copy()
         scaler = StandardScaler()
 
@@ -305,46 +314,46 @@ class LightGBModel(object):
 
         # Calculate the liquidity score by combining the normalized features
         df['liquidity_score'] = (weights['volume'] * df[volume_features].mean(axis=1) +
-                                 weights['bid_ask_spread'] * df['bid_ask_spread'] +
-                                 weights['marketCap'] * df[market_cap_feature[0]])
+                                 weights['bid_ask_spread'] * df['bid_ask_spread'] +
+                                 weights['marketCap'] * df[market_cap_feature[0]])
         df_sorted = df.sort_values(by='liquidity_score', ascending=False)
 
         return df_sorted.nlargest(n, 'liquidity_score').index
-
+
     def _encode_metadata(self, df: pd.DataFrame):
         df = df.copy()
         # Binning each numerical feature into categories
         df['insiders'] = pd.qcut(
-            df['insiders'], q=4,
+            df['insiders'], q=4,
             labels=['Very Low', 'Low', 'High', 'Very High']
         )
         df['institutions'] = pd.qcut(
-            df['institutions'], q=4,
+            df['institutions'], q=4,
             labels=['Very Low', 'Low', 'High', 'Very High']
         )
         df['risk'] = pd.cut(
-            df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
+            df['risk'], bins=[-float('inf'), 3, 5, 7, float('inf')],
             labels=['Low', 'Medium', 'High', 'Very High']
         )
         df['short_ratio'] = pd.qcut(
-            df['short_ratio'], q=4,
+            df['short_ratio'], q=4,
             labels=['Very Low', 'Low', 'High', 'Very High']
         )
         df['dyield'] = pd.cut(
-            df['dyield'],
+            df['dyield'],
             bins=[-float('inf'), 0.002, 0.005, 0.01, float('inf')],
-            labels=['Very Low', 'Low', 'High', 'Very High']
+            labels=['Very Low', 'Low', 'High', 'Very High']
         )
         df['beta'] = pd.cut(
-            df['beta'],
-            bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
+            df['beta'],
+            bins=[-float('inf'), 0.8, 1.0, 1.2, float('inf')],
             labels=['Low', 'Moderate', 'High', 'Very High']
         )
 
         # Encode binned features
         binned_features = [
-            'insiders', 'institutions',
-            'risk', 'short_ratio', 'dyield',
+            'insiders', 'institutions',
+            'risk', 'short_ratio', 'dyield',
             'beta', 'sector', 'industry', 'exchange',
         ]
         label_encoders = {}
@@ -355,10 +364,10 @@ class LightGBModel(object):
             label_encoders[col] = le
         return df, label_encoders
 
-    def prepare_boosting_data(self,
-                              prices: pd.DataFrame,
-                              metadata: pd.DataFrame = None,
-                              min_years=7,
+    def prepare_boosting_data(self,
+                              prices: pd.DataFrame,
+                              metadata: pd.DataFrame = None,
+                              min_years=7,
                               universe=500
                               ):
         if metadata is None:
@@ -389,10 +398,11 @@ class LightGBModel(object):
         prices = prices[~prices.index.duplicated()]
 
         # Align price and meta data
-        metadata = metadata[~metadata.index.duplicated() & metadata.sector.notnull()]
+        metadata = metadata[~metadata.index.duplicated() &
+                            metadata.sector.notnull()]
         metadata.sector = metadata.sector.str.lower().str.replace(' ', '_')
         shared = (prices.index.get_level_values('symbol').unique()
-                  .intersection(metadata.index))
+                  .intersection(metadata.index))
         metadata = metadata.loc[shared, :]
         prices = prices.loc[idx[shared, :], :]
 
@@ -415,16 +425,16 @@ class LightGBModel(object):
         prices['dollar_vol'] = prices[['close', 'volume']].prod(1).div(1e3)
         # compute dollar volume to determine universe
         dollar_vol_ma = (prices
-                         .dollar_vol
-                         .unstack('symbol')
-                         .rolling(window=21, min_periods=1)  # 1 trading month
-                         .mean())
+                         .dollar_vol
+                         .unstack('symbol')
+                         .rolling(window=21, min_periods=1)  # 1 trading month
+                         .mean())
 
         # Rank stocks by moving average
         prices['dollar_vol_rank'] = (dollar_vol_ma
-                                     .rank(axis=1, ascending=False)
-                                     .stack('symbol')
-                                     .swaplevel())
+                                     .rank(axis=1, ascending=False)
+                                     .stack('symbol')
+                                     .swaplevel())
         # Add some Basic Factors
         prices = self._add_technical_indicators(prices)
         # Combine Price and Meta Data
@@ -438,12 +448,12 @@ class LightGBModel(object):
         for t in T:
             # Reset the index to apply qcut by date without grouping errors
             prices[f'r{t:02}dec'] = (prices.reset_index(level='date')
-                                     .groupby('date')[f'r{t:02}']
-                                     .apply(lambda x: pd.qcut(x,
-                                                              q=10,
-                                                              labels=False,
-                                                              duplicates='drop'))
-                                     .values)
+                                     .groupby('date')[f'r{t:02}']
+                                     .apply(lambda x: pd.qcut(x,
+                                                              q=10,
+                                                              labels=False,
+                                                              duplicates='drop'))
+                                     .values)
         # Daily sector return deciles
         for t in T:
             prices[f'r{t:02}q_sector'] = (
@@ -461,50 +471,52 @@ class LightGBModel(object):
                 level='symbol')[f'r{t:02}'].shift(-t)
 
         # Remove outliers
-        outliers = prices[prices.r01 > 1].index.get_level_values('symbol').unique()
+        outliers = prices[prices.r01 > 1].index.get_level_values(
+            'symbol').unique()
         prices = prices.drop(outliers, level='symbol')
         # Create time and sector dummy variables
         prices['year'] = prices.index.get_level_values('date').year
         prices['month'] = prices.index.get_level_values('date').month
         prices['weekday'] = prices.index.get_level_values('date').weekday
         # Store Model Data
-        prices = prices.drop(['open', 'close', 'low', 'high', 'volume'], axis=1)
+        prices = prices.drop(
+            ['open', 'close', 'low', 'high', 'volume'], axis=1)
         if 'adj_close' in prices.columns:
             prices = prices.drop('adj_close', axis=1)
-        prices.reset_index().dropna().to_hdf(self.datastore, 'model_data')
-        return prices.dropna()
+        prices.reset_index().to_hdf(self.datastore, 'model_data')
+        return prices.sort_index()
 
     def tickers(self):
         return pd.read_hdf(self.outstore, 'lgb/tickers').tolist()
-
-    def load_model_data(self):
-        return pd.read_hdf(self.datastore, 'model_data').set_index(['symbol', 'date']).sort_index()
-
-    def format_time(self, t):
 
+    def load_model_data(self, key='model_data'):
+        return pd.read_hdf(self.datastore, key=key).set_index(['symbol', 'date']).sort_index()
+
+    def format_time(self, t):
         """Return a formatted time string 'HH:MM:SS
         based on a numeric time() value"""
         m, s = divmod(t, 60)
         h, m = divmod(m, 60)
         return f'{h:0>2.0f}:{m:0>2.0f}:{s:0>2.0f}'
-
+
     def fit(self, data: pd.DataFrame, verbose=True):
         def get_fi(model):
             """Return normalized feature importance as pd.Series"""
             fi = model.feature_importance(importance_type='gain')
             return (pd.Series(fi / fi.sum(),
-                              index=model.feature_name()))
-
+                              index=model.feature_name()))
+
         def ic_lgbm(preds, train_data):
             """Custom IC eval metric for lightgbm"""
             is_higher_better = True
             return 'ic', spearmanr(preds, train_data.get_label())[0], is_higher_better
+        data = data.dropna()
         # Hyperparameter options
         YEAR = 252
         base_params = dict(boosting='gbdt',
-                           objective='regression',
-                           verbose=-1)
-
+                           objective='regression',
+                           verbose=-1)
+
         # constraints on structure (depth) of each tree
         max_depths = [2, 3, 5, 7]
         num_leaves_opts = [2 ** i for i in max_depths]
@@ -517,12 +529,12 @@ class LightGBModel(object):
         feature_fraction_opts = [.3, .6, .95]
 
         param_names = ['learning_rate', 'num_leaves',
-                       'feature_fraction', 'min_data_in_leaf']
-
+                       'feature_fraction', 'min_data_in_leaf']
+
         cv_params = list(product(learning_rate_ops,
-                                 num_leaves_opts,
-                                 feature_fraction_opts,
-                                 min_data_in_leaf_opts))
+                                 num_leaves_opts,
+                                 feature_fraction_opts,
+                                 min_data_in_leaf_opts))
         n_params = len(cv_params)
         print(f'# Parameters: {n_params}')
 
@@ -532,15 +544,15 @@ class LightGBModel(object):
         test_lengths = [63]
         test_params = list(product(lookaheads, train_lengths, test_lengths))
         n = len(test_params)
-        test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
+        test_param_sample = np.random.choice(
+            list(range(n)), size=int(n), replace=False)
         test_params = [test_params[i] for i in test_param_sample]
         print('Train configs:', len(test_params))
 
-        ### Categorical Variables
+        # Categorical Variables
         categoricals = ['year', 'weekday', 'month']
         for feature in categoricals:
             data[feature] = pd.factorize(data[feature], sort=True)[0]
-
 
         # ### Run Cross-Validation
         labels = sorted(data.filter(like='fwd').columns)
@@ -550,64 +562,64 @@ class LightGBModel(object):
         num_boost_round = num_iterations[-1]
 
         metric_cols = (param_names + ['t', 'daily_ic_mean', 'daily_ic_mean_n',
-                                      'daily_ic_median', 'daily_ic_median_n'] +
-                       [str(n) for n in num_iterations])
-
+                                      'daily_ic_median', 'daily_ic_median_n'] +
+                       [str(n) for n in num_iterations])
+
         for lookahead, train_length, test_length in test_params:
             # randomized grid search
             cvp = np.random.choice(list(range(n_params)),
-                                   size=int(n_params / 2),
-                                   replace=False)
+                                   size=int(n_params / 2),
+                                   replace=False)
             cv_params_ = [cv_params[i] for i in cvp]
 
             # set up cross-validation
             n_splits = int(2 * YEAR / test_length)
             if verbose:
                 print(f'Lookahead: {lookahead:2.0f} | '
-                      f'Train: {train_length:3.0f} | '
-                      f'Test: {test_length:2.0f} | '
-                      f'Params: {len(cv_params_):3.0f} | '
-                      f'Train configs: {len(test_params)}')
+                      f'Train: {train_length:3.0f} | '
+                      f'Test: {test_length:2.0f} | '
+                      f'Params: {len(cv_params_):3.0f} | '
+                      f'Train configs: {len(test_params)}')
 
             # time-series cross-validation
             cv = MultipleTimeSeriesCV(n_splits=n_splits,
-                                      lookahead=lookahead,
-                                      test_period_length=test_length,
-                                      train_period_length=train_length)
+                                      lookahead=lookahead,
+                                      test_period_length=test_length,
+                                      train_period_length=train_length)
 
             label = label_dict[lookahead]
             outcome_data = data.loc[:, features + [label]].dropna()
-
+
             # binary dataset
             lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
-                                   label=outcome_data[label],
-                                   categorical_feature=categoricals,
-                                   free_raw_data=False)
+                                   label=outcome_data[label],
+                                   categorical_feature=categoricals,
+                                   free_raw_data=False)
             T = 0
-            predictions, metrics, feature_importance, daily_ic = [], [], [], []
-
+            predictions, metrics = [], []
+
             # iterate over (shuffled) hyperparameter combinations
             for p, param_vals in enumerate(cv_params_):
-                key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join([str(p) for p in param_vals])
+                key = f'{lookahead}/{train_length}/{test_length}/' + \
+                    '/'.join([str(p) for p in param_vals])
                 params = dict(zip(param_names, param_vals))
                 params.update(base_params)
 
                 start = time()
-                cv_preds, nrounds = [], []
-                ic_cv = defaultdict(list)
-
+                cv_preds = []
+
                 # iterate over folds
                 for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
-
+
                     # select train subset
                     lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
-                                                params=params).construct()
-
+                                                params=params).construct()
+
                     # train model for num_boost_round
                     model = lgb.train(params=params,
-                                      train_set=lgb_train,
-                                      num_boost_round=num_boost_round,
-                                      )
+                                      train_set=lgb_train,
+                                      num_boost_round=num_boost_round,
+                                      )
                     # log feature importance
                     if i == 0:
                         fi = get_fi(model).to_frame()
@@ -618,32 +630,36 @@ class LightGBModel(object):
                     test_set = outcome_data.iloc[test_idx, :]
                     X_test = test_set.loc[:, model.feature_name()]
                     y_test = test_set.loc[:, label]
-                    y_pred = {str(n): model.predict(X_test, num_iteration=n) for n in num_iterations}
-
+                    y_pred = {str(n): model.predict(X_test, num_iteration=n)
+                              for n in num_iterations}
+
                     # record predictions for each fold
-                    cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))
-
+                    cv_preds.append(y_test.to_frame(
+                        'y_test').assign(**y_pred).assign(i=i))
+
                 # combine fold results
                 cv_preds = pd.concat(cv_preds).assign(**params)
                 predictions.append(cv_preds)
-
+
                 # compute IC per day
                 by_day = cv_preds.groupby(level='date')
                 ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
-                                       for n in num_iterations], axis=1)
+                                       for n in num_iterations], axis=1)
                 daily_ic_mean = ic_by_day.mean()
                 daily_ic_mean_n = daily_ic_mean.idxmax()
                 daily_ic_median = ic_by_day.median()
                 daily_ic_median_n = daily_ic_median.idxmax()
-
+
                 # compute IC across all predictions
-                ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
+                ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
+                      for n in num_iterations]
                 t = time() - start
                 T += t
-
+
                 # collect metrics
                 metrics = pd.Series(list(param_vals) +
-                                    [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,
+                                    [t, daily_ic_mean.max(), daily_ic_mean_n,
+                                     daily_ic_median.max(), daily_ic_median_n] + ic,
                                     index=metric_cols)
                 if verbose:
                     msg = f'\t{p:3.0f} | {self.format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
@@ -653,14 +669,16 @@ class LightGBModel(object):
 
                 # persist results for given CV run and hyperparameter combination
                 metrics.to_hdf(self.trainstore, 'metrics/' + key)
-                ic_by_day.assign(**params).to_hdf(self.trainstore, 'daily_ic/' + key)
+                ic_by_day.assign(
+                    **params).to_hdf(self.trainstore, 'daily_ic/' + key)
                 fi.T.describe().T.assign(**params).to_hdf(self.trainstore, 'fi/' + key)
-                cv_preds.to_hdf(self.trainstore, 'predictions/' + key, append=True)
+                cv_preds.to_hdf(self.trainstore,
+                                'predictions/' + key, append=True)
 
     def _get_lgb_metrics(self, scope_params, lgb_train_params, daily_ic_metrics):
         with pd.HDFStore(self.trainstore) as store:
             for i, key in enumerate(
-                [k[1:] for k in store.keys() if k[1:].startswith('metrics')]):
+                    [k[1:] for k in store.keys() if k[1:].startswith('metrics')]):
                 _, t, train_length, test_length = key.split('/')[:4]
                 attrs = {
                     'lookahead': t,
@@ -675,10 +693,10 @@ class LightGBModel(object):
                 lgb_metrics[i] = pd.Series(s)
 
         id_vars = scope_params + lgb_train_params + daily_ic_metrics
-        lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
-                              id_vars=id_vars,
-                              value_name='ic',
-                              var_name='boost_rounds').dropna().apply(pd.to_numeric)
+        lgb_metrics = pd.melt(lgb_metrics.T.drop('t', axis=1),
+                              id_vars=id_vars,
+                              value_name='ic',
+                              var_name='boost_rounds').dropna().apply(pd.to_numeric)
         return lgb_metrics
 
     def _get_lgb_ic(self, int_cols, scope_params, lgb_train_params, id_vars):
@@ -689,22 +707,23 @@ class LightGBModel(object):
                 _, t, train_length, test_length = key.split('/')[:4]
                 if key.startswith('daily_ic'):
                     df = (store[key]
-                          .drop(['boosting', 'objective', 'verbose'], axis=1)
-                          .assign(lookahead=t,
-                                  train_length=train_length,
-                                  test_length=test_length))
+                          .drop(['boosting', 'objective', 'verbose'], axis=1)
+                          .assign(lookahead=t,
+                                  train_length=train_length,
+                                  test_length=test_length))
                     lgb_ic.append(df)
             lgb_ic = pd.concat(lgb_ic).reset_index()
-            lgb_ic = pd.melt(lgb_ic,
-                             id_vars=id_vars,
-                             value_name='ic',
-                             var_name='boost_rounds').dropna()
+            lgb_ic = pd.melt(lgb_ic,
+                             id_vars=id_vars,
+                             value_name='ic',
+                             var_name='boost_rounds').dropna()
         lgb_ic.loc[:, int_cols] = lgb_ic.loc[:, int_cols].astype(int)
         return lgb_ic
 
     def _get_lgb_params(self, data, scope_params, lgb_train_params, t=5, best=0):
         param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
-        df = data[data.lookahead==t].sort_values('ic', ascending=False).iloc[best]
+        df = data[data.lookahead == t].sort_values(
+            'ic', ascending=False).iloc[best]
         return df.loc[param_cols]
 
     def _get_lgb_key(self, t, p):
@@ -713,12 +732,12 @@ class LightGBModel(object):
 
     def _select_ic(self, params, ic_data, lookahead):
         return ic_data.loc[(ic_data.lookahead == lookahead) &
-                           (ic_data.train_length == params.train_length) &
-                           (ic_data.test_length == params.test_length) &
-                           (ic_data.learning_rate == params.learning_rate) &
-                           (ic_data.num_leaves == params.num_leaves) &
-                           (ic_data.feature_fraction == params.feature_fraction) &
-                           (ic_data.boost_rounds == params.boost_rounds), ['date', 'ic']].set_index('date')
+                           (ic_data.train_length == params.train_length) &
+                           (ic_data.test_length == params.test_length) &
+                           (ic_data.learning_rate == params.learning_rate) &
+                           (ic_data.num_leaves == params.num_leaves) &
+                           (ic_data.feature_fraction == params.feature_fraction) &
+                           (ic_data.boost_rounds == params.boost_rounds), ['date', 'ic']].set_index('date')
 
     def get_trade_prices(self, tickers, start, end):
         idx = pd.IndexSlice
@@ -736,73 +755,77 @@ class LightGBModel(object):
         fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
         axes = axes.flatten()
         for i, t in enumerate([1, 21]):
-            params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=t,best=0)
+            params = self._get_lgb_params(
+                lgb_daily_ic, scope_params, lgb_train_params, t=t, best=0)
             data = self._select_ic(params, lgb_ic, lookahead=t).sort_index()
             rolling = data.rolling(63).ic.mean().dropna()
             avg = data.ic.mean()
             med = data.ic.median()
-            rolling.plot(ax=axes[i], title=f'Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}')
+            rolling.plot(
+                ax=axes[i], title=f'Horizon: {t} Day(s) | IC: Mean={avg*100:.2f} Median={med*100:.2f}')
             axes[i].axhline(avg, c='darkred', lw=1)
             axes[i].axhline(0, ls='--', c='k', lw=1)
 
         fig.suptitle('3-Month Rolling Information Coefficient', fontsize=16)
         fig.tight_layout()
-        fig.subplots_adjust(top=0.92);
+        fig.subplots_adjust(top=0.92)
 
     def plot_metrics(self, lgb_metrics, lgb_daily_ic, t=1):
-        ### Visualization
-        sns.jointplot(x=lgb_metrics.daily_ic_mean,y=lgb_metrics.ic);
+        # Visualization
+        sns.jointplot(x=lgb_metrics.daily_ic_mean, y=lgb_metrics.ic)
 
-        g = sns.catplot(x='lookahead', y='ic',
+        sns.catplot(x='lookahead', y='ic',
                     col='train_length', row='test_length',
                     data=lgb_metrics,
-                    kind='box');
-        g=sns.catplot(x='boost_rounds',
+                    kind='box')
+        sns.catplot(x='boost_rounds',
                     y='ic',
                     col='train_length',
                     row='test_length',
                     data=lgb_daily_ic[lgb_daily_ic.lookahead == t],
-                    kind='box');
+                    kind='box')
 
     def get_best_predictions(self, lgb_daily_ic, scope_params, lgb_train_params, lookahead=1, topn=10):
         for best in range(topn):
-            best_params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best)
+            best_params = self._get_lgb_params(
+                lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=best)
             key = self._get_lgb_key(lookahead, best_params)
             rounds = str(int(best_params.boost_rounds))
             if best == 0:
-                best_predictions = pd.read_hdf(self.trainstore, 'predictions/' + key)
+                best_predictions = pd.read_hdf(
+                    self.trainstore, 'predictions/' + key)
                 best_predictions = best_predictions[rounds].to_frame(best)
             else:
-                best_predictions[best] = pd.read_hdf(self.trainstore, 'predictions/' + key)[rounds]
+                best_predictions[best] = pd.read_hdf(
+                    self.trainstore, 'predictions/' + key)[rounds]
         best_predictions = best_predictions.sort_index()
-        best_predictions.reset_index().to_hdf(self.outstore, f'lgb/train/{lookahead:02}')
+        best_predictions.reset_index().to_hdf(
+            self.outstore, f'lgb/train/{lookahead:02}')
         return best_predictions
 
     def apply_alphalen_analysis(self, factor_data, tearsheet=True, verbose=True):
-        #### Compute Alphalens metrics
+        # Compute Alphalens metrics
         mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
-            factor_data,
-            by_date=True,
-            by_group=False,
-            demeaned=True,
-            group_adjust=False,
+            factor_data,
+            by_date=True,
+            by_group=False,
+            demeaned=True,
+            group_adjust=False,
         )
         factor_returns = perf.factor_returns(factor_data)
         mean_quant_ret, std_quantile = perf.mean_return_by_quantile(factor_data,
-                                                                    by_group=False,
-                                                                    demeaned=True)
-
-
+                                                                    by_group=False,
+                                                                    demeaned=True)
 
         mean_quant_rateret = mean_quant_ret.apply(rate_of_return, axis=0,
-                                                  base_period=mean_quant_ret.columns[0])
-
+                                                  base_period=mean_quant_ret.columns[0])
+
         mean_quant_ret_bydate, std_quant_daily = perf.mean_return_by_quantile(
-            factor_data,
-            by_date=True,
-            by_group=False,
-            demeaned=True,
-            group_adjust=False,
+            factor_data,
+            by_date=True,
+            by_group=False,
+            demeaned=True,
+            group_adjust=False,
         )
 
         mean_quant_rateret_bydate = mean_quant_ret_bydate.apply(
@@ -823,123 +846,146 @@ class LightGBModel(object):
             std_err=compstd_quant_daily,
         )
         if verbose:
-            print(mean_ret_spread_quant.mean().mul(10000).to_frame('Mean Period Wise Spread (bps)').join(alpha_beta.T).T)
+            print(mean_ret_spread_quant.mean().mul(10000).to_frame(
+                'Mean Period Wise Spread (bps)').join(alpha_beta.T).T)
 
         fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
 
-
         plotting.plot_quantile_returns_bar(mean_quant_rateret, ax=axes[0])
         plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=0)
         axes[0].set_xlabel('Quantile')
 
         plotting.plot_cumulative_returns_by_quantile(mean_quant_ret_bydate['1D'],
-                                                     freq=pd.tseries.offsets.BDay(),
-                                                     period='1D',
-                                                     ax=axes[1])
+                                                     freq=pd.tseries.offsets.BDay(),
+                                                     period='1D',
+                                                     ax=axes[1])
         axes[1].set_title('Cumulative Return by Quantile (1D Period)')
 
         title = "Cumulative Return - Factor-Weighted Long/Short PF (1D Period)"
         plotting.plot_cumulative_returns(factor_returns['1D'],
-                                         period='1D',
-                                         freq=pd.tseries.offsets.BDay(),
-                                         title=title,
-                                         ax=axes[2])
+                                         period='1D',
+                                         freq=pd.tseries.offsets.BDay(),
+                                         title=title,
+                                         ax=axes[2])
 
         fig.suptitle('Alphalens - Validation Set Performance', fontsize=14)
         fig.tight_layout()
-        fig.subplots_adjust(top=.85);
+        fig.subplots_adjust(top=.85)
 
-        #### Summary Tearsheet
+        # Summary Tearsheet
         create_summary_tear_sheet(factor_data)
         create_full_tear_sheet(factor_data)
 
-    def evaluate(self, remove_instore=False, lookahead=1):
+    def evaluate(self, remove_instore=False, lookahead=1, verbose=True):
         scope_params = ['lookahead', 'train_length', 'test_length']
-        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
-        lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']
-
-        lgb_metrics = self._get_lgb_metrics(scope_params, lgb_train_params, daily_ic_metrics)
-        #### Summary Metrics by Fold
+        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n',
+                            'daily_ic_median', 'daily_ic_median_n']
+        lgb_train_params = ['learning_rate', 'num_leaves',
+                            'feature_fraction', 'min_data_in_leaf']
+
+        lgb_metrics = self._get_lgb_metrics(
+            scope_params, lgb_train_params, daily_ic_metrics)
+        # Summary Metrics by Fold
         lgb_metrics.to_hdf(self.outstore, 'lgb/metrics')
-
-        #### Information Coefficient by Day
+
+        # Information Coefficient by Day
         int_cols = ['lookahead', 'train_length', 'test_length', 'boost_rounds']
         id_vars = ['date'] + scope_params + lgb_train_params
-        lgb_ic = self._get_lgb_ic(int_cols, scope_params, lgb_train_params, id_vars)
+        lgb_ic = self._get_lgb_ic(
+            int_cols, scope_params, lgb_train_params, id_vars)
         lgb_ic.to_hdf(self.outstore, 'lgb/ic')
-        lgb_daily_ic = lgb_ic.groupby(id_vars[1:] + ['boost_rounds']).ic.mean().to_frame('ic').reset_index()
+        lgb_daily_ic = lgb_ic.groupby(
+            id_vars[1:] + ['boost_rounds']).ic.mean().to_frame('ic').reset_index()
         lgb_daily_ic.to_hdf(self.outstore, 'lgb/daily_ic')
 
-        ## Cross-validation Result: Best Hyperparameters
-        group_cols = scope_params + lgb_train_params + ['boost_rounds']
-        print(lgb_daily_ic.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'ic')))
-        lgb_metrics.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'ic'))
+        # Cross-validation Result: Best Hyperparameters
+        if verbose:
+            print(lgb_daily_ic.groupby('lookahead', group_keys=False).apply(
+                lambda x: x.nlargest(3, 'ic')))
+            lgb_metrics.groupby('lookahead', group_keys=False).apply(
+                lambda x: x.nlargest(3, 'ic'))
         lgb_metrics.groupby('lookahead', group_keys=False
                             ).apply(lambda x: x.nlargest(3, 'ic')).to_hdf(self.outstore, 'lgb/best_model')
-        print(lgb_metrics.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'daily_ic_mean')))
+        if verbose:
+            print(lgb_metrics.groupby('lookahead', group_keys=False).apply(
+                lambda x: x.nlargest(3, 'daily_ic_mean')))
 
-        ### Visualization
-        self.plot_metrics(lgb_metrics, lgb_daily_ic, t=1)
+        # Visualization
+        if verbose:
+            self.plot_metrics(lgb_metrics, lgb_daily_ic, t=lookahead)
 
-        ## AlphaLens Analysis - Validation Performance
+        # AlphaLens Analysis - Validation Performance
         lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
-        best_params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=5, best=0)
+        best_params = self._get_lgb_params(
+            lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=0)
         best_params.to_hdf(self.outstore, 'lgb/best_params')
 
-        self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
+        if verbose:
+            self.plot_ic(lgb_ic, lgb_daily_ic, scope_params, lgb_train_params)
 
-        #### Get Predictions for Validation Period
-        best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
-                                                     lookahead=lookahead, topn=10)
-        test_tickers = best_predictions.index.get_level_values('symbol').unique()
+        # Get Predictions for Validation Period
+        best_predictions = self.get_best_predictions(lgb_daily_ic, scope_params, lgb_train_params,
+                                                     lookahead=lookahead, topn=10)
+        test_tickers = best_predictions.index.get_level_values(
+            'symbol').unique()
         start = best_predictions.index.get_level_values('date').min()
         end = best_predictions.index.get_level_values('date').max()
         trade_prices = self.get_trade_prices(test_tickers, start, end)
-        trade_prices.to_hdf(self.outstore, 'trade_prices/model_selection')
        pd.Series(test_tickers).to_hdf(self.outstore, 'lgb/tickers')
-        #We average the top five models and provide the corresponding prices to Alphalens, in order to compute the mean period-wise
-        #return earned on an equal-weighted portfolio invested in the daily factor quintiles for various holding periods:
-        factor = best_predictions.iloc[:, :5].mean(1).dropna().tz_convert ('UTC', level='date').swaplevel()
-        ### #### Create AlphaLens Inputs
-        factor_data = get_clean_factor_and_forward_returns(factor=factor,
-                                                           prices=trade_prices,
-                                                           quantiles=5,
-                                                           periods=(1, 5, 10, 21),
-                                                           max_loss=1)
-        self.apply_alphalen_analysis(factor_data, tearsheet=True, verbose=True)
+        # We average the top five models and provide the corresponding prices to Alphalens,
+        # in order to compute the mean period-wise
+        # return earned on an equal-weighted portfolio invested in the daily factor quintiles
+        # for various holding periods:
+        factor = best_predictions.iloc[:, :5].mean(
+            1).dropna().tz_convert('UTC', level='date').swaplevel()
+        # Create AlphaLens Inputs
+        if verbose:
+            factor_data = get_clean_factor_and_forward_returns(factor=factor,
+                                                               prices=trade_prices,
+                                                               quantiles=5,
+                                                               periods=(
+                                                                   1, 5, 10, 21),
+                                                               max_loss=1)
+            self.apply_alphalen_analysis(
+                factor_data, tearsheet=True, verbose=True)
         # Delete the temporary files
         if remove_instore:
             os.remove(self.trainstore)
-
-    def make_predictions(self, data: pd.DataFrame, lookahead=1, verbose=True):
+
+    def make_predictions(self, data: pd.DataFrame, mode='test', lookahead=1, verbose=True):
+        data = data.copy()
         YEAR = 252
-        idx = pd.IndexSlice
         scope_params = ['lookahead', 'train_length', 'test_length']
-        daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
-        lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']
+        lgb_train_params = ['learning_rate', 'num_leaves',
+                            'feature_fraction', 'min_data_in_leaf']
 
         base_params = dict(boosting='gbdt',
-                           objective='regression',
-                           verbose=-1)
+                           objective='regression',
+                           verbose=-1)
 
-        categoricals = ['year', 'month', 'sector', 'weekday']
-        data = data.sort_index()
+        categoricals = ['year', 'month', 'weekday']
         labels = sorted(data.filter(like='_fwd').columns)
         features = data.columns.difference(labels).tolist()
         label = f'r{lookahead:02}_fwd'
         for feature in categoricals:
             data[feature] = pd.factorize(data[feature], sort=True)[0]
-
+
+        if mode == 'test':
+            data = data.dropna().sort_index()
+        elif mode == 'live':
+            data[labels] = data[labels].fillna(0)
+            data = data.sort_index().dropna()
+
         lgb_data = lgb.Dataset(data=data[features],
-                               label=data[label],
-                               categorical_feature=categoricals,
-                               free_raw_data=False)
-        ### Generate predictions
-        lgb_ic = pd.read_hdf(self.outstore, 'lgb/ic')
+                               label=data[label],
+                               categorical_feature=categoricals,
+                               free_raw_data=False)
+        # Generate predictions
         lgb_daily_ic = pd.read_hdf(self.outstore, 'lgb/daily_ic')
 
         for position in range(10):
-            params = self._get_lgb_params(lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position)
+            params = self._get_lgb_params(
+                lgb_daily_ic, scope_params, lgb_train_params, t=lookahead, best=position)
 
             params = params.to_dict()
 
@@ -949,27 +995,27 @@ class LightGBModel(object):
             test_length = int(params.pop('test_length'))
             num_boost_round = int(params.pop('boost_rounds'))
             params.update(base_params)
-
-            print(f'\nPosition: {position:02}')
+            if verbose:
+                print(f'\nPosition: {position:02}')
 
             # 1-year out-of-sample period
             n_splits = int(YEAR / test_length)
             cv = MultipleTimeSeriesCV(n_splits=n_splits,
-                                      test_period_length=test_length,
-                                      lookahead=lookahead,
-                                      train_period_length=train_length)
+                                      test_period_length=test_length,
+                                      lookahead=lookahead,
+                                      train_period_length=train_length)
 
             predictions = []
-            start = time()
             for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
-                print(i, end=' ', flush=True)
+                if verbose:
+                    print(i, end=' ', flush=True)
                 lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                             params=params).construct()
 
                 model = lgb.train(params=params,
-                                  train_set=lgb_train,
-                                  num_boost_round=num_boost_round,
-                                  )
+                                  train_set=lgb_train,
+                                  num_boost_round=num_boost_round,
+                                  )
 
                 test_set = data.iloc[test_idx, :]
                 y_test = test_set.loc[:, label].to_frame('y_test')
@@ -992,29 +1038,32 @@ class LightGBModel(object):
                 lambda x: spearmanr(x.y_test, x[position])[0])
             if verbose:
                 print(ic_by_day.describe())
-            test_predictions.reset_index().to_hdf(self.outstore, f'lgb/test/{lookahead:02}')
+            test_predictions.reset_index().to_hdf(
+                self.outstore, f'lgb/test/{lookahead:02}')
         return test_predictions
 
     def load_predictions(self, predictions=None, lookahead=1):
         if predictions is None:
             predictions = pd.concat([
-                pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
-                pd.read_hdf(self.outstore, f'lgb/test/{lookahead:02}').drop('y_test', axis=1)
+                pd.read_hdf(self.outstore, f'lgb/train/{lookahead:02}'),
+                pd.read_hdf(self.outstore,
+                            f'lgb/test/{lookahead:02}').drop('y_test', axis=1)
             ])
         predictions = predictions.set_index(['symbol', 'date'])
 
         predictions = (predictions.loc[~predictions.index.duplicated()]
-                       .iloc[:, :10]
-                       .mean(1)
-                       .sort_index()
-                       .dropna()
-                       .to_frame('prediction'))
-        tickers = predictions.index.get_level_values('symbol').unique().tolist()
+                       .iloc[:, :10]
+                       .mean(1)
+                       .sort_index()
+                       .dropna()
+                       .to_frame('prediction'))
+        tickers = predictions.index.get_level_values(
+            'symbol').unique().tolist()
         return (predictions
                 .unstack('symbol')
                 .prediction
-                .tz_convert ('UTC')), tickers
-
+                .tz_convert('UTC')), tickers
+
     def assert_last_date(self, predictions: pd.DataFrame):
         """
         Usefull in Live Trading to ensure that the last date in the predictions
@@ -1023,4 +1072,21 @@ class LightGBModel(object):
         last_date = predictions.index.get_level_values('date').max()
         if last_date.tzinfo is None:
             last_date = last_date.tz_localize('UTC')
-        assert last_date == (pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=1)).normalize()
+        try:
+            if datetime.now().strftime('%A') == 'Monday':
+                assert last_date == (pd.Timestamp.now(
+                    tz='UTC') - pd.Timedelta(days=3)).normalize()
+            else:
+                assert (
+                    last_date == (pd.Timestamp.now(tz='UTC')
+                                  - pd.Timedelta(days=1)).normalize()
+                    or last_date == (pd.Timestamp.now(tz='UTC')).normalize()
+                )
+            return True
+        except AssertionError:
+            return False
+
+    def clean_stores(self, *stores):
+        for store in stores:
+            if os.path.exists(store):
+                os.remove(store)
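
Taken together, the 0.2.1 changes adjust the public surface of LightGBModel: prepare_boosting_data now returns the sorted frame and no longer drops NaNs when persisting (fit drops them itself), load_model_data gains a key argument, evaluate gains a verbose flag, make_predictions gains a mode switch ('test' drops rows with missing labels, 'live' fills the forward-return labels with 0), assert_last_date returns a bool (with a Monday-aware check) instead of raising, and a new clean_stores helper deletes the HDF5 store files. A minimal usage sketch against this new surface follows; the ticker list and start date are illustrative assumptions, not part of the diff:

from bbstrader.models.ml import LightGBModel

model = LightGBModel()  # default stores: lgbdata.h5, lgbtrain.h5, lgbout.h5

# Hypothetical universe and date range, chosen only for illustration.
tickers = ['AAPL', 'MSFT', 'NVDA']
prices = model.download_boosting_data(tickers, start='2015-01-01')
metadata = model.download_metadata(tickers)
data = model.prepare_boosting_data(prices, metadata)  # 0.2.1: sorted, NaNs kept

model.fit(data)                             # 0.2.1: fit() drops NaNs itself
model.evaluate(lookahead=1, verbose=False)  # 0.2.1: printing/plotting optional

# 0.2.1: mode='live' keeps rows whose forward-return labels are still unknown
test_preds = model.make_predictions(data, mode='live', lookahead=1, verbose=False)

predictions, traded_tickers = model.load_predictions(lookahead=1)
if not model.assert_last_date(predictions):  # 0.2.1: returns bool, Monday-aware
    print('Predictions are stale; refresh the data before trading.')

model.clean_stores('lgbtrain.h5')            # 0.2.1: new cleanup helper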