virgo-modules 0.0.72__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
 import yfinance as yf
 import pandas as pd
 import numpy as np
-import json
+import gc

 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM

 from plotly.colors import DEFAULT_PLOTLY_COLORS

-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from feature_engine.imputation import MeanMedianImputer

@@ -48,88 +47,38 @@ from feature_engine.timeseries.forecasting import LagFeatures
 from feature_engine.imputation import MeanMedianImputer
 from feature_engine.discretisation import EqualWidthDiscretiser

+from sklearn.linear_model import HuberRegressor
+
 from .aws_utils import upload_file_to_aws

 import logging

-class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
-    def __init__(self, features, prefix = ''):
-        self.features = features
-        self.prefix = prefix
+from virgo_modules.src.hmm_utils import trainer_hmm
+from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
+from virgo_modules.src.transformer_utils import FeaturesEntropy, VirgoWinsorizerFeature # imported because some models read this module; otherwise mlflow.load() crashed

-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.features:
-            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
-        return X
-
-class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
-    def __init__(self, feature_configs):
-        self.feature_configs = feature_configs
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.feature_configs:
-            lower = self.feature_configs[feature]['min']
-            upper = self.feature_configs[feature]['max']
-            X[feature] = np.where( lower > X[feature], lower, X[feature])
-            X[feature] = np.where( upper < X[feature], upper, X[feature])
-        return X
-
-class FeatureSelector(BaseEstimator, TransformerMixin):
-    def __init__(self, columns):
-        self.columns = columns
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X[self.columns]
-
-def sharpe_ratio(return_series):
-    N = 255 # Trading days in the year (change to 365 for crypto)
-    rf = 0.005 # Half a percent risk free rare
-    mean = return_series.mean() * N -rf
-    sigma = return_series.std() * np.sqrt(N)
-    sharpe = round(mean / sigma, 3)
-    return sharpe
-
-class signal_combiner(BaseEstimator, TransformerMixin):
-    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
-        self.columns = columns
-        self.drop = drop
-        self.prefix_up = prefix_up
-        self.prefix_low = prefix_low
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for column in self.columns:
-            X['CombSignal_'+column] = np.where(
-                X[self.prefix_up + column] == 1,
-                1,
-                np.where(
-                    X[self.prefix_low + column] == 1,
-                    1,
-                    0
-                )
-            )
-            if self.drop:
-                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
-        return X
-
 def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
-
+
+    '''
+    create a scikit-learn pipeline object using different configurations and feature engineering blocks with a given flow
+
+    Parameters:
+        features_to_drop (list): list of features to drop
+        lag_dict (dict): feature dictionary with configurations to apply lags
+        combine_signals (list): list of columns/signals to combine
+        discretize_columns (list): list of features to discretize; the number of bins is fixed
+        correlation (float): correlation score threshold for feature selection
+
+    Returns:
+        pipe (obj): pipeline object
+    '''
+
     lag_pipe_sec = [(f'lags_{key}', LagFeatures(variables = key, periods = lag_dict[key])) for key in lag_dict] if lag_dict else []
     drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
     merge = [('signal_combiner', signal_combiner(combine_signals))] if combine_signals else []
     discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = 20 ))] if discretize_columns else []
     drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation))] if correlation else []
-
+
     pipe = Pipeline(
         [('selector', FeatureSelector(features_base))] + \
         [('encoding',OneHotEncoder(top_categories=None, variables=['hmm_feature']))] + \
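
Note: a minimal usage sketch of data_processing_pipeline as defined above. The column names, lag settings, and the sample data are illustrative assumptions, not package defaults; the function itself is imported from this module.

    import numpy as np
    import pandas as pd

    # hypothetical input frame; 'hmm_feature' must be categorical for the OneHotEncoder step
    raw_df = pd.DataFrame({
        'RSI': np.random.uniform(20, 80, 100),
        'ROC': np.random.normal(0, 1, 100),
        'hmm_feature': np.random.choice(['0', '1'], 100),
    })
    pipe = data_processing_pipeline(
        features_base=['RSI', 'ROC', 'hmm_feature'],  # columns kept by FeatureSelector
        lag_dict={'RSI': [1, 5]},                     # LagFeatures lags RSI by 1 and 5 steps
        correlation=0.77,                             # DropCorrelatedFeatures threshold
    )
    features = pipe.fit_transform(raw_df)
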
@@ -142,64 +91,172 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
         )
     return pipe

-def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
-    ## legnths
-    cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
-    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
-    ## means
-    def quantile2(x):
-        return x.quantile(0.25)
-    def quantile3(x):
-        return x.quantile(0.75)
-
-    cluster_returns = data.groupby('hmm_feature').agg(
-        n_uniques = ('chain_id','nunique'),
-        n_obs = ('Date','count'),
-        cluster_ret_q25 = ('chain_return',quantile2),
-        cluster_ret_median = ('chain_return','median'),
-        cluster_ret_q75 = ('chain_return',quantile3),
-    )
-    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
-    cluster_returns['perc_dispute'] = np.where(
-        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
-        1,0
-    )
-    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
-    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
-    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
-    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
-    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
-    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
-    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
-    ## relevance or importance
-    # naive aproach
-    cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
-    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
-    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
-    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
-    mean_relevance = cluster_returns['abs_t_accpted'].mean()
-    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
-
-    return mean_relevance, cluster_returns, number_relevant_states
+class stock_eda_panel(object):

+    """
+    Class that initially gets stock data, then applies feature engineering, enrichment, analysis, plotting, model training, etc.
+
+    Attributes
+    ----------
+    stock_code : str
+        symbol of the asset
+    n_days : str
+        number of days to extract data
+    data_window : str
+        large window to extract data. A large window is required to extract more data, e.g. '5y', '10y', '15y'
+    df : pd.DataFrame
+        pandas dataframe of the asset data with features
+    strategy_log : pd.DataFrame
+        pandas dataframe that has the results of different tested strategies (result from strategy simulator hmm)
+    best_strategy : list
+        features of the best performing strategy (result from strategy simulator hmm)
+    top_10_strategy : dict
+        top 10 best performing strategies (result from strategy simulator hmm)
+    settings : dict
+        configuration dictionary of the features and other parameters
+
+    Methods
+    -------
+    augmented_dickey_fuller_statistics(time_series=pd.Series, label=str):
+        perform the Dickey-Fuller or stationarity test for a given time series
+        It will print the p-value of the feature
+    get_data():
+        get asset data, performing some data normalization or formatting (in the case of dates)
+    plot_series_returns(roll_mean_lags1=int, roll_mean_lags2=int):
+        display a plot of the time series with rolling means and rolling standard deviations of daily closing prices
+    seasonal_plot():
+        display the time series split by year
+    plot_price_signal(feature=str, feature_2=str, opacity=float):
+        display bottom and roof signals over the closing prices
+    volatility_analysis(lags=int, trad_days=int, window_log_return=int, plot=boolean, save_features=boolean):
+        this method performs log return and volatility analysis of the closing prices
+    find_lag(feature=str, lag_list=list, column_target=str, posterior_lag=int, test_size=int):
+        displays correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
+    outlier_plot(zlim=float, plot=boolean, save_features=boolean):
+        perform outlier analysis of the log returns. It also performs a normality test of returns
+    analysis_roll_mean_log_returns(lags=int, plot=boolean):
+        perform analysis of lags of the rolling mean log return
+    compute_clip_bands(feature_name=str, threshold=float):
+        compute outlier detection for a given signal. Note that this follows a mean-reversion procedure and the feature has to be stationary. The resulting bottom and roof signals are attached to the dataframe
+    extract_sec_data(symbol=str, base_columns=list(str), rename_columns=dict):
+        extract new asset data and merge it into the main asset data
+    lag_log_return(lags=int, feature=str, feature_name=str):
+        compute the log return given some lags
+    produce_log_volatility(trad_days=int, feature=str, feature_name=str):
+        compute volatility
+    signal_plotter(feature_name=str):
+        display an analysis plot of a feature with high and low signals
+    log_features_standard(feature_name=str):
+        save the resulting feature names in a standard structure
+    relative_spread_MA(ma1=int, ma2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative moving average features, one for the short term and another for the long/mid term
+    pair_feature(pair_symbol=str, plot=boolean):
+        initialize pair feature data extraction and analysis
+    calculate_cointegration(series_1=pd.series, series_2=pd.series):
+        calculate the cointegration score for two time series
+    bidirect_count_feature(rolling_window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform negative and positive return counting in a given rolling time window
+    get_relative_range_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the relative spread of opening and closing prices
+    rsi_feature_improved(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the relative strength index
+    days_features_bands(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute mean returns for a given day of the week in a window scope per day
+    analysis_smooth_volume(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute a feature of trading volumes
+    roc_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the price rate of change
+    stoch_feature(window=int, smooth1=int, smooth2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the stochastic oscillator RSI feature
+    stochastic_feature(window=int, smooth=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the stochastic oscillator feature
+    william_feature(lbp=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the fast stochastic oscillator or Williams indicator
+    vortex_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform the vortex oscillator
+    minmax_pricefeature(type_func=str, window=int, distance=boolean, save_features=boolean):
+        get the relative price/distance feature with respect to the min/max price in a given window
+    pair_index_feature(pair_symbol=str, feature_label=str, window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform an additional asset ROC feature; a new feature is then created in the main dataframe
+    produce_order_features(feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
+    compute_last_signal(feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
+    create_hmm_derived_features():
+        create features derived from hmm state features. Features are the index of the state, the duration of the state, and the chain return
+    cluster_hmm_analysis(n_clusters=int, features_hmm=list, test_data_size=int, seed=int, lag_returns_state=int, plot=boolean, save_features=boolean, model=obj):
+        create or use an hmm model
+    sharpe_ratio(return_series=pd.Series, n_trad_days=int, rf=float):
+        perform the Sharpe ratio of a given time series return
+    treat_signal_strategy(test_data=pd.DataFrame, strategy=list):
+        helper method that treats signals and converts signals to 1 or 0
+    stategy_simulator(features=list, hmm_feature=boolean):
+        execute a strategy and get some performance metrics like Sharpe ratio and return
+    viz_strategy(strategy):
+        display an analysis plot of a given strategy
+    deep_dive_analysis_hmm(test_data_size=int, split=str):
+        display an analysis plot of the hmm model
+    get_targets(steps=int):
+        produce the regression target return taking future prices
+    get_categorical_targets(horizon=int, flor_loss=float, top_gain=float):
+        produce binary target returns taking future prices. It produces two targets, one for high returns and another for low returns
+    get_configurations(test_data_size=int, val_data_size=int, model_type=str):
+        produce the configuration dictionary that was saved in the feature generation methods if save_features was activated
+    """

-class stock_eda_panel(object):
-
     def __init__(self, stock_code, n_days, data_window = '5y'):
+
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        stock_code (str): symbol of the asset
+        n_days (str): number of days to extract data
+        data_window (str): large window to extract data. A large window is required to extract more data, e.g. '5y', '10y', '15y'
+
+        Returns
+        -------
+        None
+        """
+
         self.stock_code = stock_code
         self.n_days = n_days
         self.today = datetime.date.today()
         self.features = list()
         self.signals = list()
         self.data_window = data_window
-
+
     def augmented_dickey_fuller_statistics(self,time_series, label):
+        """
+        Perform the Dickey-Fuller or stationarity test for a given time series
+        It will print the p-value of the feature
+
+        Parameters
+        ----------
+        time_series (pd.Series): pandas series of the time series
+        label (pd.Series): feature name
+
+        Returns
+        -------
+        None
+        """
         result = adfuller(time_series.dropna().values)
         print('p-value: {} for the series {}'.format(round(result[1],6), label))
-
+
     def get_data(self):
+        """
+        Get asset data, performing some data normalization or formatting (in the case of dates)
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')

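Note: the 0.0.72 module-level sharpe_ratio helper (removed above) hard-coded 255 trading days and a 0.5% risk-free rate; per the new class docstring, the 0.8.4 method takes these as parameters. A standalone sketch of the same formula, restated from the removed code:

    import numpy as np
    import pandas as pd

    def sharpe_ratio(return_series: pd.Series, n_trad_days: int = 255, rf: float = 0.005) -> float:
        # annualized excess return over annualized volatility, as in the removed helper
        mean = return_series.mean() * n_trad_days - rf
        sigma = return_series.std() * np.sqrt(n_trad_days)
        return round(mean / sigma, 3)
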
@@ -210,7 +267,7 @@ class stock_eda_panel(object):
         df.reset_index(inplace=True)
         df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
         df['Date'] = pd.to_datetime(df['Date'])
-
+
         df = df[df.Date >= begin_date_str ]
         self.settings_general = {
             'n_days':self.n_days,
@@ -219,44 +276,56 @@ class stock_eda_panel(object):
             'execution_date': self.today.strftime('%Y-%m-%d')
         }
         self.df = df
-
+
         ### cleaning volume
         ### volume clearning
         self.df['Volume'] = np.where(self.df['Volume'] <= 10, np.nan, self.df['Volume'])
         self.df['Volume'] = self.df['Volume'].fillna(method='bfill')
-
+
         ## filling
-
+
         base_columns_unit_test = ['Open','High','Low','Close','Volume']
         self.df[base_columns_unit_test] = self.df[base_columns_unit_test].fillna(method='ffill')
-
+
         ## cleaning nulls
-
+
         xs = self.df[base_columns_unit_test].isnull().sum()/self.df[base_columns_unit_test].count()
         reject_columns = list(xs[xs > 0.5].index.values)
-
+
         if len(reject_columns) > 0:
             logging.warning("the following columns have many nulls and are drop: {}".format(reject_columns))
             self.df = self.df.drop(columns = reject_columns)
-
-
+
     def plot_series_returns(self,roll_mean_lags1,roll_mean_lags2):
-
+
+        """
+        Display a plot of the time series with rolling means and rolling standard deviations of daily closing prices
+
+        Parameters
+        ----------
+        roll_mean_lags1 (int): short term window
+        roll_mean_lags2 (int): mid/long term window
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
         ### getting rolling mean
         df["Close_roll_mean"] = (
             df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(roll_mean_lags1, min_periods=1).mean())
         )
-
+
         df["Close_roll_mean_2"] = (
             df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(roll_mean_lags2, min_periods=1).mean())
         )
-
+
         ### getting rolling stdv
         df["Close_roll_std"] = (
             df.sort_values("Date")["Close"]
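
Note: get_data() applies three cleaning rules visible in the hunk above. A standalone sketch restating them (the 50% null cutoff and the <= 10 volume floor come straight from the diff):

    import numpy as np
    import pandas as pd

    def clean_price_frame(df: pd.DataFrame) -> pd.DataFrame:
        cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        # implausibly small volumes become NaN, then are backfilled
        df['Volume'] = np.where(df['Volume'] <= 10, np.nan, df['Volume'])
        df['Volume'] = df['Volume'].bfill()
        # OHLCV gaps are forward-filled
        df[cols] = df[cols].ffill()
        # columns that are still more than 50% null get dropped
        null_ratio = df[cols].isnull().sum() / df[cols].count()
        return df.drop(columns=list(null_ratio[null_ratio > 0.5].index))
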
@@ -273,7 +342,7 @@ class stock_eda_panel(object):
         ))

         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close, marker_color = 'blue', name='Price'),row=1, col=1)
-
+
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean, marker_color = 'black', name='roll mean' ),row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean_2, marker_color = 'grey', name='roll mean 2' ),row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.lower, marker_color = 'pink',legendgroup='bound', name='bound' ),row=1, col=1)
@@ -281,8 +350,21 @@ class stock_eda_panel(object):

         fig.update_layout(height=500, width=1200, title_text=f"stock {self.stock_code} vizualization")
         fig.show()
-
+
     def seasonal_plot(self):
+
+        """
+        Display time series split by year
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         years = list(df['Date'].dt.year.unique())
         years.sort()
@@ -302,10 +384,24 @@ class stock_eda_panel(object):

         fig.update_layout(height=500, width=1400, title_text=f"stock {self.stock_code} seasonal vizualization")
         fig.show()
-
+
     def plot_price_signal(self, feature, feature_2 = '', opacity = 0.3):
-
-        signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
+
+        """
+        Display bottom and roof signals over the closing prices
+
+        Parameters
+        ----------
+        feature (str): name of the main feature to plot
+        feature_2 (str): name of the alternative feature to plot
+        opacity (float): opacity degree of the signal points
+
+        Returns
+        -------
+        None
+        """
+
+        signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
         signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
         norm_list = [f'norm_{feature}', f'z_{feature}', feature]

@@ -315,14 +411,14 @@ class stock_eda_panel(object):
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[norm_feat],legendgroup="up", mode='lines',name = norm_feat, marker_color = 'blue'),col = 1, row = 1)
                 break
-
-
+
+
         fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df['Close'], mode='lines',name = 'history', marker_color = 'grey'),col = 1, row = 2)
-
+
         if feature == 'MA_spread':
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma1_column],legendgroup="ma", mode='lines',name = self.ma1_column, marker_color = 'black'),col = 1, row = 2)
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma2_column],legendgroup="ma", mode='lines',name = self.ma2_column, marker_color = 'grey'),col = 1, row = 2)
-
+
         for norm_feat in norm_list:
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=np.where(self.df[norm_feat] > 0, self.df['Close'], np.nan),legendgroup="up", mode='markers',name = 'up', marker_color = 'green',opacity = opacity),col = 1, row = 2)
@@ -338,8 +434,25 @@ class stock_eda_panel(object):

         fig.update_layout(height=900, width=1200)
         fig.show()
-
+
     def volatility_analysis(self, lags, trad_days, window_log_return, plot = False, save_features = False):
+
+        """
+        This method performs log return and volatility analysis of the closing prices
+
+        Parameters
+        ----------
+        lags (int): number of lags to apply to the closing prices
+        trad_days (int): number of trading days to annualize returns or volatility
+        window_log_return (int): window for rolling returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         df['log_return'] = np.log(df.Close/df.Close.shift(lags))
         df['sqr_log_return'] = np.square(df.log_return)
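
Note: a minimal standalone sketch of the quantities volatility_analysis() builds. The window sizes are illustrative; the annualization factor of 252 matches the one used later in produce_log_volatility():

    import numpy as np
    import pandas as pd

    close = pd.Series([100.0, 101.5, 99.8, 102.2, 103.0, 101.9])
    log_return = np.log(close / close.shift(1))               # df['log_return'] with lags=1
    sqr_log_return = np.square(log_return)                    # df['sqr_log_return']
    roll_mean = log_return.rolling(3, min_periods=1).mean()   # df['roll_mean_log_return']
    volatility = log_return.rolling(3).std() * np.sqrt(252)   # annualized rolling volatility
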
@@ -349,13 +462,13 @@ class stock_eda_panel(object):
             df.sort_values("Date")["log_return"]
             .transform(lambda x: x.rolling(window_log_return, min_periods=1).mean())
         )
-
+
         if save_features:
             self.features.append('volatility_log_return')
             self.features.append('roll_mean_log_return')
             self.features.append('log_return')
             self.settings_volatility = {'lags':lags, 'trad_days':trad_days, 'window_log_return':window_log_return}
-
+
         if plot:
             fig = make_subplots(rows=3, cols=1,vertical_spacing = 0.02,shared_xaxes=True,
                 specs=[
@@ -395,10 +508,25 @@ class stock_eda_panel(object):

         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
-
-
+
     def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):

+        """
+        Displays correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
+
+        Parameters
+        ----------
+        feature (str): feature name to apply lags to
+        lag_list (list): list of lags, each lag as an integer
+        column_target (str): target to correlate against, e.g. return or mean return
+        posterior_lag (int): for the target, posterior window shift to calculate a window return
+        test_size (int): size of the test data. The remainder is used as training data. This parameter is meant to avoid overfitting and leakage
+
+        Returns
+        -------
+        None
+        """
+
         results = dict()
         df = self.df.iloc[:-test_size,:][['Date','Close','roll_mean_log_return','log_return',feature]].sort_values('Date').copy()
         for i,lag in enumerate(lag_list):
@@ -413,7 +541,7 @@ class stock_eda_panel(object):
                 'lag':lag,
                 'pearsonr_log_return':r_log[0],
                 'spearman_log_return': sp_log[0],
-            }
+            }
         del df
         results_df = pd.DataFrame(results).T

@@ -426,10 +554,23 @@ class stock_eda_panel(object):
             plt.legend()
             plt.axhline(y=0, color='grey', linestyle='--')
             plt.show()
-
-
+
     def outlier_plot(self, zlim, plot = False, save_features = False):
-
+
+        """
+        Perform outlier analysis of the log returns. It also performs a normality test of returns
+
+        Parameters
+        ----------
+        zlim (float): alpha or z threshold for normalized returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         mean = self.df.log_return.mean()
         std = self.df.log_return.std()
         self.df['z_log_return'] = (self.df.log_return - mean)/std
@@ -440,7 +581,7 @@ class stock_eda_panel(object):
         self.df['up_outlier'] = zlim*self.df['z_std_log_return'] + mean_
         self.df['low_outlier'] = -zlim*self.df['z_std_log_return'] + mean_

-        self.df['signal_low_osutlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
+        self.df['signal_low_outlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
         self.df['signal_up_outlier'] = np.where( (self.df['z_log_return'] > self.df['up_outlier'] ), 1, 0)
         if save_features:
             self.signals.append('signal_low_outlier')
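
Note: 0.8.4 also fixes the 'signal_low_osutlier' column-name typo above. A sketch of the outlier-band logic, standalone; the rolling window size is an assumption, since the package's own setting is outside this hunk:

    import pandas as pd

    def outlier_signals(log_return: pd.Series, zlim: float = 2.0, window: int = 50):
        # standardize returns, then build +/- zlim bands from a rolling std of the z-scores
        z = (log_return - log_return.mean()) / log_return.std()
        roll_std = z.rolling(window).std()
        up_band = zlim * roll_std + z.mean()
        low_band = -zlim * roll_std + z.mean()
        return (z > up_band).astype(int), (z < low_band).astype(int)
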
@@ -451,7 +592,7 @@ class stock_eda_panel(object):
             sigma = self.df['z_log_return'].std()
             x = np.linspace(self.df['z_log_return'].min(),self.df['z_log_return'].max(), 15000)
             y = stats.norm.pdf(x, loc = mu, scale = sigma)
-
+
             fig, axs = plt.subplots(2, 1,figsize=(15,8))

             axs[0].hist(self.df['z_log_return'],density = True,bins = 100 , label = 'Returns distribution')
@@ -460,7 +601,7 @@ class stock_eda_panel(object):
             axs[0].axvline(l2, color='green', linestyle='--')
             axs[0].axvline(-l2, color='green', linestyle='--')
             axs[0].plot(x,y, linewidth = 3, color = 'r', label = 'Normal Dist Curve')
-
+
             axs[1].plot(self.df['Date'],self.df['z_log_return'])
             axs[1].plot(self.df['Date'],self.df['low_outlier'], linestyle='--')
             axs[1].plot(self.df['Date'],self.df['up_outlier'], linestyle='--')
@@ -469,18 +610,31 @@ class stock_eda_panel(object):
             plt.show()

         z_stat, p_stat = stats.normaltest(self.df['z_log_return'].dropna())
-        p_stat = round(p_stat, 7)
+        p_stat = round(p_stat, 7)
         print('---------------------- returns normality tests ----------------------------')
         if p_stat < 0.05:
             print(f'pvalue: {p_stat} then, returns do not follow a normal distribution')
         else:
             print(f'pvalue: {p_stat} then, returns follow a normal distribution')
-
+
     def analysis_roll_mean_log_returns(self, lags, plot = False):

+        """
+        Perform analysis of lags of the rolling mean log return
+
+        Parameters
+        ----------
+        lags (int): lags to apply to the rolling log return
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.df['lag'] = self.df.roll_mean_log_return.shift(lags)
         self.df['Diff'] = self.df['roll_mean_log_return'] - self.df['lag']
-
+
         if plot:

             fig, axs = plt.subplots(1, 3,figsize=(19,4))
@@ -493,7 +647,20 @@ class stock_eda_panel(object):
             plt.show()

     def compute_clip_bands(self,feature_name,threshold):
-
+
+        """
+        Compute outlier detection for a given signal. Note that this follows a mean-reversion procedure and the feature has to be stationary. The resulting bottom and roof signals are attached to the dataframe
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+        threshold (float): alpha or z threshold for the normalized feature
+
+        Returns
+        -------
+        None
+        """
+
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/self.df[feature_name].std()
         mean_ = self.df[f'norm_{feature_name}'].mean()

@@ -506,84 +673,140 @@ class stock_eda_panel(object):
         self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)

+    def extract_sec_data(self, symbol, base_columns, rename_columns=None):
+        """
+        Extract new asset data and merge it into the main asset data
+
+        Parameters
+        ----------
+        symbol (str): symbol to extract data for
+        base_columns (list): list of columns to persist
+        rename_columns (dict): map of the new column names using pd.DataFrame.rename()
+
+        Returns
+        -------
+        None
+        """
+        begin_date = self.today - relativedelta(days = self.n_days)
+        begin_date_str = begin_date.strftime('%Y-%m-%d')
+
+        stock = yf.Ticker(symbol)
+        df = stock.history(period=self.data_window)
+        df = df.sort_values('Date')
+        df.reset_index(inplace=True)
+        df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
+        df['Date'] = pd.to_datetime(df['Date'])
+        df = df[df.Date >= begin_date_str ]
+        df = df[base_columns]
+        if rename_columns:
+            df = df.rename(columns=rename_columns)
+        right_df = df.copy()
+
+        dates_vector = self.df.Date.to_frame()
+        right_df = dates_vector.merge(right_df, on ='Date',how = 'left')
+        right_df = right_df.fillna(method = 'bfill')
+        right_df = right_df.fillna(method = 'ffill')
+
+        self.df = self.df.merge(right_df, on ='Date',how = 'left')
+        self.df = self.df.sort_values("Date")
+        del right_df
+        gc.collect()
+
+    def lag_log_return(self, lags, feature, feature_name=False):
+        """
+        Compute the log return given some lags
+
+        Parameters
+        ----------
+        lags (int): lag to apply to the log return
+        feature (str): feature to apply the log return to
+        feature_name (str): name for the resulting feature
+
+        Returns
+        -------
+        None
+        """
+
+        feature_name = feature_name if feature_name else f"{feature}_log_return"
+        self.df[feature_name] = np.log(self.df[feature]/self.df[feature].shift(lags))
+
+    def produce_log_volatility(self, trad_days, feature, feature_name=False):
+        """
+        Compute annualized volatility over a rolling window
+
+        Parameters
+        ----------
+        trad_days (int): window size used to calculate the standard deviation
+        feature (str): feature to apply the computation to
+        feature_name (str): resulting feature name
+
+        Returns
+        -------
+        None
+        """
+        feature_name = feature_name if feature_name else f"{feature}_log_return_{trad_days}"
+        self.df[feature_name] = self.df.sort_values("Date")[feature].rolling(window = trad_days).std()*np.sqrt(252)
+
     def signal_plotter(self, feature_name):
+
+        """
+        Display an analysis plot of a feature with high and low signals
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
+
         fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
+
         axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'norm_{feature_name}'])
-
+
         plot_acf(self.df[feature_name].dropna(),lags=25,ax = axs[1])
         axs[1].set_title(f'acf {feature_name}')
-
+
         plot_pacf(self.df[feature_name].dropna(),lags=25,ax = axs[2])
         axs[2].set_title(f'pacf {feature_name}')
-
+
         fig.show()

     def log_features_standard(self, feature_name):
+        """
+        Save the resulting feature names in a standard structure
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
         self.features.append(feature_name)
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
-
-    #######################
-    #### to be deprecated ####
-    def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
-
-        self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
-        self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
-
-        self.ma1_column = f'MA_{ma1}'
-        self.ma2_column = f'MA_{ma2}'
-        self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
-
-        self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
-        mean_ = self.df['norm_MA_spread'].mean()
-        self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
-
-        self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
-        self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
-
-        self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
-        self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
-        ### ploting purposes
-        self.df[f"Roll_mean_{ma1}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
-        )
-        self.df[f"Roll_mean_{ma2}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
-        )
-
-
-        print('--------------------------------------------------------------------')
-        if save_features:
-            self.features.append('MA_spread')
-            self.signals.append('signal_low_MA_spread')
-            self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
-        if plot:
-
-            fig, axs = plt.subplots(1, 3,figsize=(21,4))
-
-            axs[0].plot(self.df['Date'],self.df['norm_MA_spread'])
-            axs[0].plot(self.df['Date'],self.df['upper_MA_spread'], linestyle='--')
-            axs[0].plot(self.df['Date'],self.df['lower_MA_spread'], linestyle='--')
-            axs[0].set_title('MA_spread series')

-            plot_acf(self.df['MA_spread'].dropna(),lags=25, ax=axs[1])
-            axs[1].set_title('acf MA_spread series')
-
-            plot_pacf(self.df['MA_spread'].dropna(),lags=25, ax=axs[2])
-            axs[2].set_title('acf MA_spread series')
-            plt.show()
-    ##################################################
-
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
-
+        """
+        Perform relative moving average features, one for the short term and another for the long/mid term
+
+        Parameters
+        ----------
+        ma1 (int): short term moving average window
+        ma2 (int): long/mid term moving average window
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'rel_MA_spread'

         self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
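
Note: the new extract_sec_data() method aligns a secondary asset onto the main asset's trading dates before merging. A standalone sketch of that alignment step, restated from the added code above:

    import pandas as pd

    def align_to_dates(main_df: pd.DataFrame, other_df: pd.DataFrame) -> pd.DataFrame:
        # reindex the secondary asset onto the main asset's dates, then backfill and
        # forward-fill so holidays or missing sessions do not introduce NaNs
        dates = main_df[['Date']]
        right = dates.merge(other_df, on='Date', how='left').bfill().ffill()
        return main_df.merge(right, on='Date', how='left').sort_values('Date')
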
@@ -605,16 +828,27 @@ class stock_eda_panel(object):
             .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
         )

-        print('--------------------------------------------------------------------')
         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
+            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}

         if plot:
-
             self.signal_plotter(feature_name)
-
+
     def pair_feature(self, pair_symbol, plot = False):
+        """
+        Initialize pair feature data extraction and analysis
+
+        Parameters
+        ----------
+        pair_symbol (str): symbol of the pair asset to extract
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.pair_symbol = pair_symbol
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
@@ -627,7 +861,7 @@ class stock_eda_panel(object):
         df['Date'] = pd.to_datetime(df['Date'])
         df = df[df.Date >= begin_date_str ]
         self.pair_df = df
-
+
         #### converting the same index ####
         dates_vector = self.df.Date.to_frame()
         self.pair_df = dates_vector.merge(self.pair_df, on ='Date',how = 'left')
@@ -653,8 +887,40 @@ class stock_eda_panel(object):
             plt.plot(self.df['Date'],asset_2_values,label = asset_2)
             plt.legend()
             plt.show()
-
+
+    def smooth_logrets_interaction_term(self, feature_interact_with, resulting_feature_name="persisted_clip_diff_smooths", rollmean_window = 5, ext_threhold=0.015, persist_days = 3, save_features=False):
+        """
+        Create an interaction term that compares the distance between the asset's rolling-window mean return and the market's rolling-window mean return,
+        then keeps the outliers or high values (using abs) and persists that value for some days.
+        Goal: persist big differences between market and asset returns
+
+        feature_interact_with: name of the market return
+        rollmean_window: rolling window or smoothing number of days
+        ext_threhold: threshold
+        persist_days: number of days to persist the signal
+        """
+        self.df["smooth_log_return"] = self.df['log_return'].rolling(rollmean_window).mean().values
+        self.df["smooth_market_log_return"] = self.df[feature_interact_with].rolling(rollmean_window).mean().values
+        self.df["diff_smooths"] = self.df["smooth_market_log_return"]-self.df["smooth_log_return"]
+        self.df["clip_diff_smooths"] = np.where(np.abs(self.df["diff_smooths"]) > ext_threhold, self.df["diff_smooths"] , 0)
+        self.df[resulting_feature_name] = self.df['clip_diff_smooths'].rolling(persist_days).mean().values
+        self.df = self.df.drop(columns=["smooth_log_return","smooth_market_log_return","diff_smooths","clip_diff_smooths"])
+
     def calculate_cointegration(self,series_1, series_2):
+        """
+        Calculate the cointegration score for two time series
+
+        Parameters
+        ----------
+        series_1 (pd.series): time series
+        series_2 (pd.series): time series
+
+        Returns
+        -------
+        coint_flag (boolean): 1 if the cointegration p-value is below 0.05 and coint_t is below the critical value
+        hedge_value (float): beta from the regression model
+        """
+
         coint_flag = 0
         coint_res = coint(series_1, series_2)
         coint_t = coint_res[0]
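
Note: a standalone sketch of the cointegration check, under stated assumptions. The Engle-Granger test and the flag rule mirror the method above; the OLS hedge-ratio regression is an assumption about the elided step, since the diff cuts off before it:

    import statsmodels.api as sm
    from statsmodels.tsa.stattools import coint

    def cointegration_check(series_1, series_2):
        # flag mirrors the method: p-value < 0.05 and t-stat below the 5% critical value
        coint_t, p_value, critical_values = coint(series_1, series_2)
        hedge_value = sm.OLS(series_1, series_2).fit().params.iloc[0]  # assumed OLS spec
        coint_flag = 1 if p_value < 0.05 and coint_t < critical_values[1] else 0
        return coint_flag, hedge_value
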
@@ -666,9 +932,22 @@ class stock_eda_panel(object):
         coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0

         return coint_flag, hedge_value
-
-    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):

+    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
+        """
+        Display analysis of the pair feature and save the results if needed
+
+        Parameters
+        ----------
+        window (int): window to apply to the rolling spread between the pair and the main asset
+        z_threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         spread_series = pd.Series(self.df.pair_spread)
         mean = spread_series.rolling(center = False, window = window).mean()
         std = spread_series.rolling(center = False, window = window).std()
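
Note: the rolling z-score and the symmetric entry signals built here can be restated standalone; the window and threshold values below are illustrative:

    import pandas as pd

    def pair_z_signals(spread: pd.Series, window: int = 30, z_threshold: float = 2.0):
        # rolling z-score of the pair spread, with signals outside +/- z_threshold
        mean = spread.rolling(window=window).mean()
        std = spread.rolling(window=window).std()
        z = (spread - mean) / std
        return (z < -z_threshold).astype(int), (z > z_threshold).astype(int)
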
@@ -677,11 +956,11 @@ class stock_eda_panel(object):
         self.df['pair_z_score'] = z_score
         self.df['signal_low_pair_z_score'] = np.where(self.df['pair_z_score'] < -z_threshold, 1, 0)
         self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
-
+
         if save_features:
             self.log_features_standard('pair_z_score')
-            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
-
+            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
+
         if plot:
             pvalue = round(adfuller(z_score.dropna().values)[1],4)
             print(f'p value of the rolling z-score is {pvalue}')
@@ -695,7 +974,7 @@ class stock_eda_panel(object):
             axs[0,0].axhline(y=0, color='blue', linestyle='-.')
             axs[0,0].plot(self.df.pair_z_score)
             axs[0,0].set_title('z score from the spread')
-
+
             axs[0,1].plot(self.df['Date'],self.df['pair_spread'])
             axs[0,1].plot(self.df['Date'],np.where(self.df['signal_low_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'red')
             axs[0,1].plot(self.df['Date'],np.where(self.df['signal_up_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'green')
@@ -704,44 +983,27 @@ class stock_eda_panel(object):
             plot_acf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,0])
             axs[1,0].set_title('acf pair_z_score')
-
+
             plot_pacf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,1])
             axs[1,1].set_title('pacf pair_z_score')
-
-            plt.show()
-
-    #######################
-    #### to be deprecated ####
-    def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):

-        # negative countiing and rolling countingng
-        self.df['RetClose'] = self.df['Close'].pct_change()
-        self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
-        self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
-
-        mean = self.df['roll_pos_counting'].mean()
-        std = self.df['roll_pos_counting'].std()
-        self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std
-
-        self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
-        self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
-        if save_features:
-            self.features.append('roll_pos_counting')
-            self.signals.append('signal_up_roll_pos_counting')
-            self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
-        if plot:
-            fig = plt.figure(figsize = (10,4))
-            plt.plot(self.df['Date'],self.df.norm_counting)
-            plt.axhline(y=threshold, color='grey', linestyle='--')
-            plt.axhline(y=-threshold, color='grey', linestyle='--')
             plt.show()
-    #######################
-
+
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
+        """
+        Perform negative and positive return counting in a given rolling time window
+
+        Parameters
+        ----------
+        rolling_window (int): window applied to positive and negative returns
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'bidirect_counting'
         # negative countiing and rolling countingng
         self.df['RetClose'] = self.df['Close'].pct_change()
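
Note: a sketch of the counting idea behind bidirect_count_feature(), based on the removed single-direction helper above (yesterday's return sign, summed over a rolling window, to avoid lookahead). The exact bidirectional netting in 0.8.4 may differ from this simplification:

    import pandas as pd

    def rolling_up_count(close: pd.Series, window: int = 20) -> pd.Series:
        # count how many of the last `window` sessions closed up, using the lagged sign
        up = (close.pct_change().shift(1) > 0).astype(int)
        return up.rolling(window=window).sum()
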
@@ -757,7 +1019,7 @@ class stock_eda_panel(object):

         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
+            self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}

         if plot:
             fig = plt.figure(figsize = (10,4))
@@ -766,47 +1028,21 @@ class stock_eda_panel(object):
             plt.plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
             plt.show()

-    #######################
-    #### to be deprecated ####
-    def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
-
-        self.df["Range"] = self.df["High"] / self.df["Low"] - 1
-        self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
-        self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
-        self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
-
-        mean_ = self.df['norm_dist_range'].mean()
-        self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
-        self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
-
-        self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
-        self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
-        if save_features:
-            self.features.append('dist_range')
-            self.signals.append('signal_up_dist_range')
-            self.signals.append('signal_low_dist_range')
-            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
-
-        if plot:
-            fig, axs = plt.subplots(2, 2,figsize=(17,11))
-
-            axs[0,0].plot(self.df['Range'])
-            axs[0,0].set_title('range')
-
-            axs[0,1].plot(self.df['Avg_range'])
-            axs[0,1].set_title('Avg_range')
-
-            axs[1,0].plot(self.df['up_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['low_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['norm_dist_range'])
-            axs[1,0].set_title('norm_dist_range')
-    #######################
-
     def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
-
+        """
+        Perform the relative spread of opening and closing prices
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'CO_Range'
         self.df[feature_name] = self.df["Close"] / self.df["Open"]-1
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/ self.df[feature_name].std()
@@ -822,7 +1058,7 @@ class stock_eda_panel(object):

         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_relative_price_range = {'window':window, 'threshold':threshold}
+            self.settings_relative_price_range = {'window':window, 'threshold':threshold}

         if plot:
             fig, axs = plt.subplots(1, 2,figsize=(14,5))
@@ -835,46 +1071,24 @@ class stock_eda_panel(object):
             axs[1].plot(self.df[f'norm_{feature_name}'])
             axs[1].set_title(f'norm_{feature_name}')

-    #######################
-    #### to be deprecated ####
-    def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
-
-        rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df['RSI'] = rsi
-        self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
-
-        mean = self.df['RSI_ret'].mean()
-        std = self.df['RSI_ret'].std()
-        self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
-        self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
-        self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
-
-        if save_features:
-            self.features.append('RSI_ret')
-            self.signals.append('signal_up_RSI_ret')
-            self.signals.append('signal_low_RSI_ret')
-            self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df.norm_RSI_ret)
-            axs[0].axhline(y=threshold, color='grey', linestyle='--')
-            axs[0].axhline(y=-threshold, color='grey', linestyle='--')
-
-            plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf RSI_ret')
-
-            plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf RSI_ret')
-
-            fig.show()
-    #######################
-
     def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
+        """
+        Perform the relative strength index
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'RSI'
         rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df[feature_name] = rsi
+        self.df[feature_name] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
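
Note: the 0.8.4 change above hardens indicator outputs before band computation: infinities from degenerate windows become 0 and gaps are forward-filled. A standalone sketch; RSIIndicator is assumed to come from the ta package, which matches the call signature in the diff:

    import numpy as np
    import pandas as pd
    from ta.momentum import RSIIndicator

    def hardened_rsi(close: pd.Series, window: int = 14) -> pd.Series:
        rsi = RSIIndicator(close=close, window=window).rsi()
        # same cleanup as rsi_feature_improved / roc_feature in 0.8.4
        return rsi.replace([np.inf, -np.inf], 0).ffill()
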
@@ -883,54 +1097,22 @@ class stock_eda_panel(object):

         if plot:
             self.signal_plotter(feature_name)
-
-    #######################
-    #### to be deprecated ####
-    def days_features(self, window_day, limit, plot = False, save_features = False):
-
-        self.df['dow'] = self.df.Date.dt.dayofweek
-        self.df['dow'] = self.df['dow'].astype('str')
-
-        self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
-
-        mean = self.df['target_mean_input'].mean()
-        std = self.df['target_mean_input'].std()
-
-        self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
-        mean_ = self.df['norm_dow_input'].mean()
-        self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()

-        self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
-        self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
-
-        self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
-        self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
-
-        if save_features:
-
-            self.features.append('target_mean_input')
-            self.signals.append('signal_up_target_mean_input')
-            self.signals.append('signal_low_target_mean_input')
-            self.settings_days_features = {'window_day':window_day, 'limit':limit}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df['norm_dow_input'])
-            axs[0].plot(self.df['up_dow_input'], linestyle='--')
-            axs[0].plot(self.df['low_dow_input'], linestyle='--')
-
-            plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf day feature')
-
-            plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf day feature')
-
-            fig.show()
-    #######################
-
     def days_features_bands(self, window, threshold, plot = False, save_features = False):
-
+        """
+        Compute mean returns for a given day of the week in a window scope per day
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         self.df['dow'] = self.df.Date.dt.dayofweek
         self.df['dow'] = self.df['dow'].astype('str')

@@ -947,65 +1129,22 @@ class stock_eda_panel(object):
947
1129
 
948
1130
  if plot:
949
1131
  self.signal_plotter(feature_name)
950
-
951
- #######################
952
- #### to be deprecated ####
953
- def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
954
-
955
- self.df['log_Volume'] = np.log(self.df['Volume'])
956
- self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
957
-
958
- self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
959
- mean_ = self.df['norm_ret_log_Volume'].mean()
960
- self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
961
-
962
- self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
963
- self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
964
-
965
- self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
966
- self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
967
-
968
- if save_features:
969
- self.features.append('ret_log_Volume')
970
- self.signals.append('signal_up_ret_log_Volume')
971
- self.signals.append('signal_low_ret_log_Volume')
972
- self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
973
- if plot:
974
- fig, axs = plt.subplots(3, 2,figsize=(11,13))
975
- axs[0,0].plot(self.df.Date, self.df.Volume)
976
- axs[0,0].set_title('Volume')
977
- axs[0,1].plot(self.df.Date, self.df.log_Volume)
978
- axs[0,1].set_title('log Volume')
979
-
980
- plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
981
- axs[1,0].set_title('acf log_Volume')
982
- plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
983
- axs[1,1].set_title('pacf log_Volume')
984
-
985
- plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
986
- axs[2,0].set_title('acf ret_log_Volume')
987
- plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
988
- axs[2,1].set_title('pacf ret_log_Volume')
989
1132
 
990
- plt.show()
991
-
992
- print('--------------------------------------------------------------')
993
-
994
- fig, axs = plt.subplots(1, 2,figsize=(10,4))
995
-
996
- axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
997
- axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
998
- axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
999
- axs[0].set_title('norm_ret_log_Volume')
1000
-
1001
- axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
1002
- axs[1].set_title('std_norm_ret_log_Volume')
1003
-
1004
- plt.show()
1005
- #######################
1006
-
1007
1133
  def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
1008
-
1134
+ """
1135
+ compute a feature from trading volumes
1136
+
1137
+ Parameters
1138
+ ----------
1139
+ window (int): window to apply to the feature
1140
+ threshold (float): alpha or z threshold for the normalized feature
1141
+ plot (boolean): True to display plot
1142
+ save_features (boolean): True to save feature configuration and feature names
1143
+
1144
+ Returns
1145
+ -------
1146
+ None
1147
+ """
1009
1148
  feature_name = 'smooth_Volume'
1010
1149
  self.df[feature_name] = np.log(self.df['Volume'])
1011
1150
  # self.df[feature_name] = self.df['log_Volume'].rolling(window).mean()
@@ -1039,7 +1178,7 @@ class stock_eda_panel(object):
1039
1178
 
1040
1179
  fig, axs = plt.subplots(1,2,figsize=(10,4))
1041
1180
 
1042
- axs[0].plot(self.df[f'{feature_name}'])
1181
+ axs[0].plot(self.df[f'{feature_name}'])
1043
1182
  axs[0].set_title(f'{feature_name}')
1044
1183
 
1045
1184
  axs[1].plot(self.df[f'z_{feature_name}'], linestyle='--')
@@ -1048,9 +1187,23 @@ class stock_eda_panel(object):
1048
1187
  plt.show()
1049
1188
 
1050
1189
  def roc_feature(self, window, threshold, plot = False, save_features = False):
1190
+ """
1191
+ compute the price rate of change (ROC) feature
1192
+
1193
+ Parameters
1194
+ ----------
1195
+ window (int): window to apply to the feature
1196
+ threshold (float): alpha or z threshold for the normalized feature
1197
+ plot (boolean): True to display plot
1198
+ save_features (boolean): True to save feature configuration and feature names
1199
+
1200
+ Returns
1201
+ -------
1202
+ None
1203
+ """
1051
1204
  feature_name = 'ROC'
1052
1205
  roc = ROCIndicator(close = self.df['Close'], window = window).roc()
1053
- self.df[feature_name] = roc
1206
+ self.df[feature_name] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1054
1207
  self.compute_clip_bands(feature_name,threshold)
1055
1208
 
1056
1209
  if save_features:
@@ -1058,11 +1211,27 @@ class stock_eda_panel(object):
1058
1211
  self.settings_roc_feature = {'window':window, 'threshold':threshold}
1059
1212
  if plot:
1060
1213
  self.signal_plotter(feature_name)
1061
-
1214
+
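All indicator features in this release share the same sanitization before banding: infinities are zeroed and remaining gaps are forward-filled. A sketch of the pattern, assuming the ta package is installed:

    import numpy as np
    import pandas as pd
    from ta.momentum import ROCIndicator

    def clean_roc(close: pd.Series, window: int) -> pd.Series:
        roc = ROCIndicator(close=close, window=window).roc()
        # zero out infinities, then carry the last valid value forward
        return roc.replace([np.inf, -np.inf], 0).fillna(method='ffill')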
1062
1215
  def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
1216
+ """
1217
+ compute the stochastic RSI oscillator feature
1218
+
1219
+ Parameters
1220
+ ----------
1221
+ window (int): window to apply to the feature
1222
+ smooth1 (int): smoothing parameter 1
1223
+ smooth2 (int): smoothing parameter 2
1224
+ threshold (float): alpha or z threshold for the normalized feature
1225
+ plot (boolean): True to display plot
1226
+ save_features (boolean): True to save feature configuration and feature names
1227
+
1228
+ Returns
1229
+ -------
1230
+ None
1231
+ """
1063
1232
  feature_name = 'STOCH'
1064
1233
  stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
1065
- self.df[feature_name] = stoch
1234
+ self.df[feature_name] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1066
1235
  self.compute_clip_bands(feature_name,threshold)
1067
1236
 
1068
1237
  if save_features:
@@ -1072,9 +1241,24 @@ class stock_eda_panel(object):
1072
1241
  self.signal_plotter(feature_name)
1073
1242
 
1074
1243
  def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
1244
+ """
1245
+ compute the stochastic oscillator feature
1246
+
1247
+ Parameters
1248
+ ----------
1249
+ window (int): window to apply to the feature
1250
+ smooth (int): smoothing parameter
1251
+ threshold (float): alpha or z threshold for the normalized feature
1252
+ plot (boolean): True to display plot
1253
+ save_features (boolean): True to save feature configuration and feature names
1254
+
1255
+ Returns
1256
+ -------
1257
+ None
1258
+ """
1075
1259
  feature_name = 'STOCHOSC'
1076
1260
  stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
1077
- self.df[feature_name] = stochast
1261
+ self.df[feature_name] = stochast.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1078
1262
  self.compute_clip_bands(feature_name,threshold)
1079
1263
 
1080
1264
  if save_features:
@@ -1084,9 +1268,23 @@ class stock_eda_panel(object):
1084
1268
  self.signal_plotter(feature_name)
1085
1269
 
1086
1270
  def william_feature(self, lbp, threshold, plot = False, save_features = False):
1271
+ """
1272
+ compute the fast stochastic oscillator (Williams %R) indicator
1273
+
1274
+ Parameters
1275
+ ----------
1276
+ lbp (int): look back parameter
1277
+ threshold (float): alpha or z threshold for the normalized feature
1278
+ plot (boolean): True to display plot
1279
+ save_features (boolean): True to save feature configuration and feature names
1280
+
1281
+ Returns
1282
+ -------
1283
+ None
1284
+ """
1087
1285
  feature_name = 'WILL'
1088
- will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
1089
- self.df[feature_name] = will
1286
+ will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
1287
+ self.df[feature_name] = will.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1090
1288
  self.compute_clip_bands(feature_name,threshold)
1091
1289
 
1092
1290
  if save_features:
@@ -1096,9 +1294,23 @@ class stock_eda_panel(object):
1096
1294
  self.signal_plotter(feature_name)
1097
1295
 
1098
1296
  def vortex_feature(self, window, threshold, plot = False, save_features = False):
1297
+ """
1298
+ compute the vortex oscillator feature
1299
+
1300
+ Parameters
1301
+ ----------
1302
+ window (int): window to apply to the feature
1303
+ threshold (float): alpha or z threshold for the normalized feature
1304
+ plot (boolean): True to display plot
1305
+ save_features (boolean): True to save feature configuration and feature names
1306
+
1307
+ Returns
1308
+ -------
1309
+ None
1310
+ """
1099
1311
  feature_name = 'VORTEX'
1100
1312
  vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
1101
- self.df[feature_name] = vortex
1313
+ self.df[feature_name] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1102
1314
  self.compute_clip_bands(feature_name,threshold)
1103
1315
 
1104
1316
  if save_features:
@@ -1107,11 +1319,93 @@ class stock_eda_panel(object):
1107
1319
  if plot:
1108
1320
  self.signal_plotter(feature_name)
1109
1321
 
1110
- def pair_index_feature(self, pair_symbol, feature_label, window, threshold, plot = False, save_features = False):
1322
+ def minmax_pricefeature(self, type_func, window, distance = False, plot = False, save_features = False):
1323
+ """
1324
+ compute the relative price or distance with respect to the min/max price within a given time window
1325
+
1326
+ Parameters
1327
+ ----------
1328
+ type_func (str): either min or max
1329
+ window (int): window scope
1330
+ distance (boolean): if True, compute the distance feature; otherwise the relative price feature
+ plot (boolean): True to display plot
1331
+ save_features (boolean): True to save feature configuration and feature names
1332
+
1333
+ Returns
1334
+ -------
1335
+ None
1336
+ """
1337
+ if type_func == 'min':
1338
+ self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].min(axis = 1)
1339
+ elif type_func == 'max':
1340
+ self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].max(axis = 1)
1341
+
1342
+ init_shape = self.df.shape[0]
1343
+ df_date = self.df[['Date','Price_ref']].rename(columns = {'Date':'Date_ref'}).copy()
1344
+
1345
+ self.df = self.df.rename(columns = {'Price_ref':'Price_to_use'})
1346
+
1347
+ if type_func == 'min':
1348
+ self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).min()))
1349
+ elif type_func == 'max':
1350
+ self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).max()))
1351
+
1352
+
1353
+ self.df = self.df.merge(df_date, left_on = 'window_price', right_on = 'Price_ref', how = 'left')
1354
+ self.df['date_span'] = self.df['Date'] - self.df['Date_ref']
1355
+
1356
+ self.df['RN'] = self.df.sort_values(['date_span'], ascending=False).groupby(['Date']).cumcount() + 1
1357
+ self.df = self.df[self.df['RN'] == 1]
1358
+
1359
+ if distance:
1360
+ self.df[f'{type_func}_distance_to_price'] = pd.to_numeric(self.df['date_span'].dt.days, downcast='integer')
1361
+
1362
+ if not distance:
1363
+ if type_func == 'min':
1364
+ self.df[f'{type_func}_relprice'] = self.df['Price_to_use']/self.df['window_price']-1
1365
+
1366
+ if type_func == 'max':
1367
+ self.df[f'{type_func}_relprice'] = self.df['window_price']/self.df['Price_to_use']-1
1368
+
1369
+ self.df = self.df.drop(columns = ['RN', 'date_span', 'Price_to_use', 'window_price', 'Date_ref','Price_ref'])
1370
+
1371
+ end_shape = self.df.shape[0]
1372
+
1373
+ if init_shape != end_shape:
1374
+ raise Exception("shapes are not the same")
1375
+
1376
+ if save_features:
1377
+ if distance:
1378
+ self.features.append(f'{type_func}_distance_to_price')
1379
+ name_attr = f'{type_func}_distance'
1380
+ if not distance:
1381
+ self.features.append(f'{type_func}_relprice')
1382
+ name_attr = f'{type_func}_relprice'
1383
+
1384
+ setattr(self,f'settings_{name_attr}_pricefeature' , {'type_func': type_func, 'window': window, 'distance': distance})
1385
+
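For reference, with type_func='max' and distance=False the feature reduces to the ratio of the rolling maximum to the current price. A standalone sketch of that relative variant (hypothetical df with OHLC columns; the method itself also handles the distance variant and the date bookkeeping):

    import pandas as pd

    def max_relprice(df: pd.DataFrame, window: int) -> pd.Series:
        price = df[['Open', 'High', 'Low', 'Close']].max(axis=1)
        rolling_max = price.rolling(window, min_periods=1).max()
        # positive when the current price sits below the window maximum
        return rolling_max / price - 1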
1386
+ def pair_index_feature(self, pair_symbol, feature_label,threshold, window = None,ta_method='ROC',param_set=False,plot = False, save_features = False):
1387
+ """
1388
+ compute a technical indicator feature for an additional (pair) asset; the resulting feature is added to the main dataframe
1389
+
1390
+ Parameters
1391
+ ----------
1392
+ pair_symbol (str): symbol of the asset to extract the data
1393
+ feature_label (str): name of the resulting feature
1394
+ window (int): default window to apply to the feature (this parameter is going to be deprecated)
1395
+ threshold (float): alpha or z threshold for the normalized feature
1396
+ param_set (dict): parameter set in case ta_method is other than ROC
1397
+ ta_method (str): method to use; available: RSI, ROC, VORTEX, STOCH
1398
+ plot (boolean): True to display plot
1399
+ save_features (boolean): True to save feature configuration and feature names
1400
+
1401
+ Returns
1402
+ -------
1403
+ None
1404
+ """
1111
1405
  self.pair_index = pair_symbol
1112
1406
  begin_date = self.today - relativedelta(days = self.n_days)
1113
1407
  begin_date_str = begin_date.strftime('%Y-%m-%d')
1114
-
1408
+
1115
1409
  if feature_label in self.df.columns:
1116
1410
  self.df = self.df.drop(columns = [feature_label])
1117
1411
 
@@ -1123,14 +1417,27 @@ class stock_eda_panel(object):
1123
1417
  df['Date'] = pd.to_datetime(df['Date'])
1124
1418
  df = df[df.Date >= begin_date_str ]
1125
1419
  self.pair_index_df = df
1126
-
1420
+
1127
1421
  #### converting the same index ####
1128
1422
  dates_vector = self.df.Date.to_frame()
1129
1423
  self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
1130
1424
  self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
1131
1425
  self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
1132
-
1133
- self.pair_index_df[feature_label] = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
1426
+
1427
+ if ta_method == 'ROC':
1428
+ window = window if window else param_set.get('window')
1429
+ roc = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
1430
+ self.pair_index_df[feature_label] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1431
+ elif ta_method == 'RSI':
1432
+ rsi = RSIIndicator(close = self.pair_index_df['Close'], **param_set).rsi()
1433
+ self.pair_index_df[feature_label] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1434
+ elif ta_method == 'VORTEX':
1435
+ vortex = VortexIndicator(close = self.pair_index_df['Close'], high = self.pair_index_df['High'], low = self.pair_index_df['Low'], **param_set).vortex_indicator_diff()
1436
+ self.pair_index_df[feature_label] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1437
+ elif ta_method == 'STOCH':
1438
+ stoch = StochRSIIndicator(close = self.pair_index_df['Close'], **param_set).stochrsi()
1439
+ self.pair_index_df[feature_label] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1440
+
1134
1441
  df_to_merge = self.pair_index_df[['Date',feature_label]]
1135
1442
  self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
1136
1443
 
@@ -1140,7 +1447,7 @@ class stock_eda_panel(object):
1140
1447
  if save_features:
1141
1448
  self.log_features_standard(feature_label)
1142
1449
  parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
1143
- try:
1450
+ try:
1144
1451
  len(self.settings_pair_index_feature)
1145
1452
  print('existing')
1146
1453
  self.settings_pair_index_feature.append(parameters)
@@ -1153,10 +1460,21 @@ class stock_eda_panel(object):
1153
1460
  self.signal_plotter(feature_label)
1154
1461
 
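Usage sketch for the new dispatch: parameters for non-ROC indicators travel in param_set and are forwarded as keyword arguments to the indicator constructor. The symbol and values below are hypothetical, and panel is assumed to be an already-built stock_eda_panel:

    panel.pair_index_feature(
        pair_symbol='SPY',            # hypothetical pair asset
        feature_label='spy_rsi',
        threshold=1.5,
        ta_method='RSI',
        param_set={'window': 14},     # forwarded to RSIIndicator(**param_set)
        save_features=True,
    )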
1155
1462
  def produce_order_features(self, feature_name, save_features = False):
1463
+ """
1464
+ create a feature that indexes consecutive high and low signal values; this is useful for measuring the duration/persistence of a signal
1156
1465
 
1466
+ Parameters
1467
+ ----------
1468
+ feature_name (str): name of the feature
1469
+ save_features (boolean): True to save feature configuration and feature names
1470
+
1471
+ Returns
1472
+ -------
1473
+ None
1474
+ """
1157
1475
  signal_feature_name = f'discrete_signal_{feature_name}'
1158
1476
  order_feature_name = f'order_signal_{feature_name}'
1159
-
1477
+
1160
1478
  self.df[signal_feature_name] = np.where(
1161
1479
  self.df[f'signal_up_{feature_name}'] == 1,1,
1162
1480
  np.where(
@@ -1173,14 +1491,107 @@ class stock_eda_panel(object):
1173
1491
  self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1174
1492
  self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
1175
1493
  self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
1176
-
1494
+
1177
1495
  ## saving features
1178
1496
  if save_features:
1179
1497
  self.signals.append(signal_feature_name)
1180
1498
  self.signals.append(order_feature_name)
1181
-
1499
+
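The order feature is essentially a per-chain rank: rows where the discrete signal keeps the same value share a chain, and the counter restarts at each change. The same idea in a compact standalone form (hypothetical series sig of -1/0/1 values):

    import pandas as pd

    def order_in_chain(sig: pd.Series) -> pd.Series:
        # a new chain starts whenever the signal value changes
        chain_id = (sig != sig.shift(1)).cumsum()
        order = sig.groupby(chain_id).cumcount() + 1
        # keep the counter only while a signal is active, signed like the signal
        return order * sig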
1500
+ def get_order_feature_nosignal(self,feature_name, save_features=False):
1501
+ """
1502
+ create a feature that counts the number of steps after the end of a signal
1503
+
1504
+ Parameters
1505
+ ----------
1506
+ feature_name (str): name of the feature
1507
+ save_features (boolean): True to save feature configuration and feature names
1508
+
1509
+ Returns
1510
+ -------
1511
+ None
1512
+ """
1513
+ order_feature_name = f'order_signal_{feature_name}'
1514
+ ns_order_feature_name = f'ns_order_{feature_name}'
1515
+ self.df = self.df.sort_values('Date')
1516
+ self.df['lag_'] = self.df[order_feature_name].shift(1)
1517
+ self.df['flag'] = np.where((self.df[order_feature_name] == 0) & (self.df['lag_']!=0),1,np.nan)
1518
+ self.df = self.df.drop(columns=['lag_'])
1519
+ self.df['order_'] = self.df.sort_values('Date').groupby(['flag']).cumcount() + 1
1520
+ self.df['order_'] = self.df['order_'].fillna(method='ffill')
1521
+ self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
1522
+ self.df = self.df.drop(columns=['flag'])
1523
+ self.df['order_'] = self.df.sort_values('Date').groupby(['order_']).cumcount() + 1
1524
+ norm_list = [f'norm_{feature_name}', f'z_{feature_name}', feature_name]
1525
+ for norm_feature in norm_list:
1526
+ try:
1527
+ self.df['order_'] = np.sign(self.df[norm_feature])*self.df['order_']
1528
+ break
1529
+ except:
1530
+ pass
1531
+ self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
1532
+ self.df = self.df.rename(columns={'order_':ns_order_feature_name})
1533
+ if save_features:
1534
+ self.signals.append(ns_order_feature_name)
1535
+
1536
+ def compute_last_signal(self,feature, save_features = False):
1537
+ """
1538
+ create two new features observed while a signal is active: the duration of the previous signal chain, and the duration of the last chain with the same sign
1539
+
1540
+ Parameters
1541
+ ----------
1542
+ feature (str): name of the feature
1543
+ save_features (boolean): True to save feature configuration and feature names
1544
+
1545
+ Returns
1546
+ -------
1547
+ None
1548
+ """
1549
+ def create_last_signal(df, feature, prefix, type ='0'):
1550
+ if type == '0':
1551
+ condition = df[f'order_signal_{feature}'] != 0
1552
+ elif type == '+':
1553
+ condition = df[f'order_signal_{feature}'] > 0
1554
+ elif type == '-':
1555
+ condition = df[f'order_signal_{feature}'] < 0
1556
+ df[f'last_maxorder_{feature}'] = np.where(condition, df[f'order_signal_{feature}'],np.nan)
1557
+ df['tmp_chain_index'] = df[f'last_maxorder_{feature}'].shift(-1)
1558
+ df['last'] = np.where((df[f'last_maxorder_{feature}'] != 0) & (df['tmp_chain_index'].isna()),df[f'last_maxorder_{feature}'], np.nan )
1559
+ df['last'] = df['last'].shift(1)
1560
+ df[f'last_maxorder_{feature}'] = df['last'].fillna(method = 'ffill')
1561
+ df = df.drop(columns = ['tmp_chain_index','last'])
1562
+ df[f'last_maxorder_{feature}'] = np.where(df[f'order_signal_{feature}'] != 0,df[f'last_maxorder_{feature}'],np.nan)
1563
+ df[f'last_maxorder_{feature}'] = df[f'last_maxorder_{feature}'].fillna(0)
1564
+ df = df.rename(columns = {f'last_maxorder_{feature}':f'{prefix}_{feature}'})
1565
+ return df
1566
+ prefix0, prefix1, prefix2 = 'ldur', 'pos', 'neg'
1567
+ self.df = create_last_signal(self.df, feature, prefix0, type ='0')
1568
+ self.df = create_last_signal(self.df, feature, prefix1, type ='+')
1569
+ self.df = create_last_signal(self.df, feature, prefix2, type ='-')
1570
+
1571
+ self.df[f'sldur_{feature}'] = np.where(
1572
+ self.df[f'order_signal_{feature}'] > 0, self.df[f'{prefix1}_{feature}'],
1573
+ np.where(
1574
+ self.df[f'order_signal_{feature}'] < 0, self.df[f'{prefix2}_{feature}'],
1575
+ 0
1576
+ )
1577
+ )
1578
+ self.df = self.df.drop(columns = [f'{prefix1}_{feature}',f'{prefix2}_{feature}'])
1579
+ if save_features:
1580
+ self.signals.append(f'sldur_{feature}')
1581
+ self.signals.append(f'ldur_{feature}')
1582
+
1182
1583
  def create_hmm_derived_features(self, lag_returns):
1584
+ """
1585
+ create features derived from hmm states features. Features are the index of the state, the duration of the state, chain raturn
1586
+
1587
+ Parameters
1588
+ ----------
1589
+ lag_returns (int): lag paramter (not used)
1183
1590
 
1591
+ Returns
1592
+ -------
1593
+ None
1594
+ """
1184
1595
  self.df = self.df.sort_values('Date')
1185
1596
  ## indexing chains
1186
1597
  self.df['lag_hmm_feature'] = self.df['hmm_feature'].shift(1)
@@ -1189,31 +1600,44 @@ class stock_eda_panel(object):
1189
1600
  self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
1190
1601
  self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
1191
1602
  self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1192
-
1193
- ### returns using the first element in a chain
1194
- self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
1195
- self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
1196
- self.df['chain_return'] = (self.df['Close']/self.df['first'] -1) * 100
1197
1603
 
1198
- self.df = self.df.drop(columns = ['breack','first'])
1604
+ ### returns using the windowsseeds
1605
+ self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
1606
+ self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
1607
+ self.df = self.df.drop(columns = ['breack'])
1199
1608
 
1200
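The chain return is now a lagged within-chain return instead of a return against the first element of the chain. The core of the new computation, isolated (hypothetical frame df with Date, Close and chain_id columns, lag_returns an int):

    lag_close = df.sort_values('Date').groupby('chain_id')['Close'].shift(lag_returns)
    df['chain_return'] = (df['Close'] / lag_close - 1) * 100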
- def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
1609
+ def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
1610
+ """
1611
+ create or use a hmm model
1612
+
1613
+ Parameters
1614
+ ----------
1615
+ n_clusters (int): number of clusters or states to calculate
1616
+ features_hmm (list): features to be considered in hmm model when training
1617
+ test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
1618
+ seed (int): seed for the model inizialization
1619
+ lag_returns_state (int) : lags for returns of the state
1620
+ corr_threshold (float): correlation threshold for initial feature selection
1621
+ plot (boolean): True to display hmm states analysis
1622
+ save_features (boolean): True to save features and configurations
1623
+ model (obj): if provided, no model will be trainend and the provided model will be used to get hmm features
1624
+
1625
+ Returns
1626
+ -------
1627
+ None
1628
+ """
1201
1629
  if not model:
1202
-
1630
+
1203
1631
  df_new = self.df
1204
- pipeline_hmm = Pipeline([
1205
- ('selector', FeatureSelector(columns=features_hmm)),
1206
- ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
1207
- ('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
1208
- ])
1209
1632
  data_train = df_new.iloc[:-test_data_size,:]
1210
1633
  data_test = df_new.iloc[-test_data_size:,:]
1211
1634
 
1212
- pipeline_hmm.fit(data_train)
1213
-
1635
+ th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
1636
+ th.train()
1637
+ pipeline_hmm = th.hmm_model
1214
1638
  self.model_hmm = pipeline_hmm
1215
1639
  self.test_data_hmm = data_test
1216
-
1640
+
1217
1641
  ### first feature: the hidden state
1218
1642
  self.df['hmm_feature'] = self.model_hmm.predict(self.df)
1219
1643
  self.create_hmm_derived_features(lag_returns = lag_returns_state)
@@ -1230,15 +1654,15 @@ class stock_eda_panel(object):
1230
1654
  hidden_states = pipeline_hmm.predict(data_test)
1231
1655
  data_test['HMM'] = hidden_states
1232
1656
  data_test['HMM_state'] = data_test['HMM'].map(map_)
1233
-
1657
+
1234
1658
  if model:
1235
1659
  self.df['hmm_feature'] = model.predict(self.df)
1236
1660
  self.create_hmm_derived_features(lag_returns = lag_returns_state)
1237
-
1661
+
1238
1662
  if save_features:
1239
1663
  self.features.append('hmm_feature')
1240
1664
  self.features.append('hmm_chain_order')
1241
- self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
1665
+ self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }
1242
1666
 
1243
1667
  if plot:
1244
1668
 
@@ -1263,14 +1687,38 @@ class stock_eda_panel(object):
1263
1687
  fig.show()
1264
1688
 
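Training is now delegated to trainer_hmm, which also drops highly correlated inputs before fitting. A hedged call sketch on an existing panel (feature names hypothetical):

    panel.cluster_hmm_analysis(
        n_clusters=4,
        features_hmm=['ROC', 'STOCH', 'VORTEX'],
        test_data_size=250,
        seed=7,
        corr_threshold=0.75,
        save_features=True,
    )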
1265
1689
  def sharpe_ratio(self, return_series, n_trad_days = 255, rf = 0.01):
1690
+ """
1691
+ compute the Sharpe ratio of a given return time series
1692
+
1693
+ Parameters
1694
+ ----------
1695
+ return_series (pd.Series): time series of the returns
1696
+ n_trad_days (int): trading days used to annualize returns
1697
+ rf (float): annual risk-free rate
1698
+
1699
+ Returns
1700
+ -------
1701
+ sharpe_ratio (float): sharpe ratio
1702
+ """
1266
1703
  nsqrt = np.sqrt(n_trad_days)
1267
1704
  mean = return_series.mean() * n_trad_days
1268
1705
  sigma = return_series.std() * nsqrt
1269
1706
  sharpe_ratio = round((mean-rf)/sigma,2)
1270
1707
  return sharpe_ratio
1271
-
1708
+
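Worked example of the formula sharpe = (mean(r) * N - rf) / (std(r) * sqrt(N)): with N = 255 and rf = 0.01, a daily mean return of 0.0008 and a daily standard deviation of 0.012 give an annualized mean of 0.204, sigma = 0.012 * sqrt(255) ≈ 0.1916, so the ratio is (0.204 - 0.01) / 0.1916 ≈ 1.01.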
1272
1709
  def treat_signal_strategy(self,test_data, strategy):
1273
-
1710
+ """
1711
+ helper method that processes signals and converts them to 1 or 0
1712
+
1713
+ Parameters
1714
+ ----------
1715
+ test_data (pd.DataFrame): test data
1716
+ strategy (list): features to get the strategy
1717
+
1718
+ Returns
1719
+ -------
1720
+ test_data (pd.DataFrame): test data with extra columns that are the strategy (main_signal)
1721
+ """
1274
1722
  hmm_states_list = [x for x in strategy if 'hmm_state_' in x]
1275
1723
  other_features = [x for x in strategy if x not in hmm_states_list]
1276
1724
 
@@ -1299,10 +1747,21 @@ class stock_eda_panel(object):
1299
1747
  elif len(hmm_states_list) == 0 and len(other_features) > 0:
1300
1748
  test_data['main_signal'] = np.where((test_data['features_signal'] == 1) & (test_data['hmm_signal'] == 0),1,0)
1301
1749
 
1302
- return test_data
1750
+ return test_data
1303
1751
 
1304
1752
  def stategy_simulator(self, features, hmm_feature = True):
1753
+ """
1754
+ execute strategies and compute performance metrics such as Sharpe ratio and return. This method creates several new attributes
1305
1755
 
1756
+ Parameters
1757
+ ----------
1758
+ features (list): list of features to be tested as strategies
1759
+ hmm_feature (boolean): include hmm feature
1760
+
1761
+ Returns
1762
+ -------
1763
+ None
1764
+ """
1306
1765
  columns_ = ['Date', 'Close','Open'] + features + ['HMM']
1307
1766
  states = list(self.df.hmm_feature.unique())
1308
1767
  states.sort()
@@ -1372,8 +1831,19 @@ class stock_eda_panel(object):
1372
1831
  self.strategy_log = df_returns_log
1373
1832
  self.best_strategy = df_returns_log.iloc[0,:].strategy
1374
1833
  self.top_10_strategy = list(df_returns_log.iloc[0:10,:].strategy.values)
1375
-
1834
+
1376
1835
  def viz_strategy(self, strategy):
1836
+ """
1837
+ display analysis plots of a given strategy
1838
+
1839
+ Parameters
1840
+ ----------
1841
+ strategy (list): list of features of the strategy
1842
+
1843
+ Returns
1844
+ -------
1845
+ None
1846
+ """
1377
1847
  test_data = self.test_data_strategy
1378
1848
 
1379
1849
  test_data = self.treat_signal_strategy(test_data, strategy)
@@ -1406,62 +1876,26 @@ class stock_eda_panel(object):
1406
1876
  plt.legend()
1407
1877
  plt.show()
1408
1878
 
1409
- ### deprecated ############################
1410
- def create_strategy(self, favourable_states):
1411
-
1412
- test_data = self.test_data_hmm
1413
- # add MA signal
1414
- test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
1415
- test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
1416
-
1417
- # add hnn signal
1418
-
1419
- test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
1420
-
1421
- ## combined signals
1422
- test_data['main_signal'] = 0
1423
- test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
1424
- test_data['main_signal'] = test_data['main_signal'].shift(1)
1425
-
1426
- ## benchmark return
1427
- test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
1428
- test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
1429
- test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
1430
-
1431
- ## strategy return
1432
- # test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
1433
- test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
1434
- test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
1435
- test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
1436
- test_data.dropna(inplace = True)
1437
-
1438
- bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
1439
- strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
1440
-
1441
- bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
1442
- strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
1443
-
1444
- print(f'returns benchmark {bench_rets}%')
1445
- print(f'returns strategy {strat_rets}%')
1446
- print('-----------------------------')
1447
- print(f'sharpe benchmark {bench_sharpe}')
1448
- print(f'sharpe strategy {strat_sharpe}')
1449
-
1450
- fig = plt.figure(figsize = (10,4))
1451
- plt.plot(test_data['bench_prod_exp'])
1452
- plt.plot(test_data['strat_prod_exp'])
1453
- self.settings_hmm_states = {'favourable_states':favourable_states}
1454
- ################################################
1455
-
1456
1879
  def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
1457
-
1880
+ """
1881
+ display analysis plots for the hmm model
1882
+
1883
+ Parameters
1884
+ ----------
1885
+ test_data_size (int): test data size; the remainder is the train data
1886
+ split (str): split type to assess; options: 'train' or 'test'
1887
+
1888
+ Returns
1889
+ -------
1890
+ None
1891
+ """
1458
1892
  if split == 'train':
1459
1893
  df = self.df.iloc[:-test_data_size,:]
1460
1894
  elif split == 'test':
1461
1895
  df = self.df.iloc[-test_data_size:,:]
1462
1896
 
1463
1897
  ## returns plot
1464
- fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
1898
+ fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
1465
1899
  height=400, width=1000, title = 'returns chain hmm feature')
1466
1900
  fig.add_shape(type='line',x0=-0.5,y0=0,x1=max(df.hmm_feature)+0.5,y1=0,line=dict(color='grey',width=1),xref='x',yref='y')
1467
1901
  fig.show()
@@ -1490,6 +1924,17 @@ class stock_eda_panel(object):
1490
1924
  del df
1491
1925
 
1492
1926
  def get_targets(self, steps):
1927
+ """
1928
+ produce regression return targets using future prices
1929
+
1930
+ Parameters
1931
+ ----------
1932
+ steps (int): number of lags and steps for future returns
1933
+
1934
+ Returns
1935
+ -------
1936
+ None
1937
+ """
1493
1938
  self.targets = list()
1494
1939
  self.target = list()
1495
1940
  columns = list()
@@ -1501,9 +1946,23 @@ class stock_eda_panel(object):
1501
1946
  self.df[f'mean_target'] = self.df[columns].mean(axis=1)
1502
1947
  self.target.append(f'mean_target')
1503
1948
  self.settings_target_lasts = {'steps':steps, 'type':'regression'}
1504
-
1505
- def get_categorical_targets(self, horizon, flor_loss, top_gain):
1506
-
1949
+
1950
+ def get_categorical_targets(self, horizon, flor_loss, top_gain, min_pos=1 , min_negs=1):
1951
+ """
1952
+ produce binary return targets using future prices. It produces two targets: one for high returns and another for low returns
1953
+
1954
+ Parameters
1955
+ ----------
1956
+ horizon (int): number of lags and steps for future returns
1957
+ flor_loss (float): min loss return
1958
+ top_gain (float): max gain return
1959
+ min_pos (int): minimum number of positives to count in a window for target_up
1960
+ min_negs (int): minimum number of negatives to count in a window for target_down
1961
+
1962
+ Returns
1963
+ -------
1964
+ None
1965
+ """
1507
1966
  self.target = list()
1508
1967
  self.targets = list()
1509
1968
  columns = list()
@@ -1516,7 +1975,7 @@ class stock_eda_panel(object):
1516
1975
  self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] >= top_gain,1,0)
1517
1976
  columns.append(f'target_{i}')
1518
1977
  self.df[f'target_up'] = self.df[columns].sum(axis=1)
1519
- self.df[f'target_up'] = np.where(self.df[f'target_up'] >=1,1,0 )
1978
+ self.df[f'target_up'] = np.where(self.df[f'target_up'] >=min_pos,1,0 )
1520
1979
  self.df = self.df.drop(columns = columns)
1521
1980
 
1522
1981
  for i in range(1,horizon+1):
@@ -1526,7 +1985,7 @@ class stock_eda_panel(object):
1526
1985
  self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] <= flor_loss,1,0)
1527
1986
  columns.append(f'target_{i}')
1528
1987
  self.df[f'target_down'] = self.df[columns].sum(axis=1)
1529
- self.df[f'target_down'] = np.where(self.df[f'target_down'] >= 1,1,0 )
1988
+ self.df[f'target_down'] = np.where(self.df[f'target_down'] >= min_negs,1,0 )
1530
1989
  self.df = self.df.drop(columns = columns)
1531
1990
 
1532
1991
  self.targets.append('target_up')
@@ -1535,7 +1994,19 @@ class stock_eda_panel(object):
1535
1994
  self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
1536
1995
 
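With min_pos and min_negs, target_up fires only when at least min_pos of the next horizon forward returns reach top_gain (symmetrically for target_down). A standalone sketch of target_up, under the assumption that forward returns are simple percentage returns:

    import pandas as pd

    def target_up(close: pd.Series, horizon: int, top_gain: float, min_pos: int) -> pd.Series:
        hits = pd.concat(
            [(close.shift(-i) / close - 1 >= top_gain).astype(int) for i in range(1, horizon + 1)],
            axis=1,
        )
        return (hits.sum(axis=1) >= min_pos).astype(int)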
1537
1996
  def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
1538
-
1997
+ """
1998
+ produce the configuration dictionary from the settings saved by the feature generation methods when save_features was activated
1999
+
2000
+ Parameters
2001
+ ----------
2002
+ test_data_size (int): test data size
2003
+ val_data_size (int): validation data size
2004
+ model_type (str): model type, options: 'Forecaster','Classifier'
2005
+
2006
+ Returns
2007
+ -------
2008
+ None
2009
+ """
1539
2010
  self.settings = {
1540
2011
  'features':list(set(self.features)),
1541
2012
  'signals' :list(set(self.signals)),
@@ -1547,19 +2018,21 @@ class stock_eda_panel(object):
1547
2018
  'outlier': self.settings_outlier,
1548
2019
  }
1549
2020
  }
1550
-
2021
+
1551
2022
  if model_type in ['Forecaster','Classifier']:
1552
-
2023
+
1553
2024
  target_list = list(set(self.targets))
1554
2025
  target_list.sort()
1555
2026
  self.settings['model_type'] = model_type
1556
2027
  self.settings['target'] = list(set(self.target))
1557
2028
  self.settings['targets'] = target_list
1558
-
2029
+
1559
2030
  ## for now this is hard coded
1560
2031
  feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
1561
2032
  'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
1562
- 'william_feature', 'vortex_feature', 'pair_index_feature','hmm']
2033
+ 'william_feature', 'vortex_feature', 'pair_index_feature','hmm',
2034
+ 'min_distance_pricefeature', 'min_relprice_pricefeature', 'max_distance_pricefeature','max_relprice_pricefeature'
2035
+ ]
1563
2036
 
1564
2037
  for feature in feature_list:
1565
2038
  try:
@@ -1570,7 +2043,7 @@ class stock_eda_panel(object):
1570
2043
  self.settings['settings']['target_lasts'] = self.settings_target_lasts
1571
2044
  except:
1572
2045
  pass
1573
-
2046
+
1574
2047
  try:
1575
2048
  self.settings['settings']['strategies'] = {
1576
2049
  'best_strategy':self.best_strategy,
@@ -1580,512 +2053,280 @@ class stock_eda_panel(object):
1580
2053
  pass
1581
2054
 
1582
2055
  class produce_model:
2056
+ """
2057
+ Class that produces a machine learning model in a scikit-learn pipeline wrapper.
2058
+
2059
+ Attributes
2060
+ ----------
2061
+ data : pd.DataFrame
2062
+ input data
2063
+ X_train : pd.DataFrame
2064
+ y_train : pd.Series
2065
+ X_test : pd.DataFrame
2066
+ y_test : pd.Series
2067
+ X_val : pd.DataFrame
2068
+ y_val : pd.Series
2069
+ pipeline : obj
2070
+ trained pipeline that includes a ml model
2071
+ features_to_model: list
2072
+ features in end step of the pipeline
2073
+
2074
+ Methods
2075
+ -------
2076
+ preprocess(test_data_size=int, target=str, val_data_size=int):
2077
+ prepare data; split into train, test, and validation sets and into X and y
2078
+ get_sample(x=pd.DataFrame, sample=int, max_=int):
2079
+ sample data
+ train_model(pipe=obj, model=obj, cv_=obj):
+ train pipeline
2080
+ """
1583
2081
  def __init__(self,data):
2082
+ """
2083
+ Initialize object
2084
+
2085
+ Parameters
2086
+ ----------
2087
+ data (pd.DataFrame): data
2088
+
2089
+ Returns
2090
+ -------
2091
+ None
2092
+ """
1584
2093
  self.data = data.copy()
1585
-
2094
+
1586
2095
  def preprocess(self, test_data_size, target, val_data_size = False):
1587
-
2096
+ """
2097
+ prepare data; split into train, test, and validation sets and into X and y
2098
+
2099
+ Parameters
2100
+ ----------
2101
+ test_data_size (int): test data size
2102
+ target (str): target column
2103
+ val_data_size (int): validation data size
2104
+
2105
+ Returns
2106
+ -------
2107
+ None
2108
+ """
1588
2109
  train_data, test_data = self.data.iloc[:-test_data_size,:].dropna() , self.data.iloc[-test_data_size:,:].dropna()
1589
-
2110
+
1590
2111
  if val_data_size:
1591
2112
  train_data, val_data = train_data.iloc[:-val_data_size,:], train_data.iloc[-val_data_size:,:]
1592
-
2113
+
1593
2114
  self.test_data = test_data
1594
-
2115
+
1595
2116
  X_train, y_train = train_data.iloc[0:,1:], train_data[target]
1596
2117
  X_test, y_test = test_data.iloc[0:,1:], test_data[target]
1597
2118
  self.X_train = X_train
1598
2119
  self.y_train = y_train
1599
2120
  self.X_test = X_test
1600
2121
  self.y_test = y_test
1601
-
2122
+
1602
2123
  if val_data_size:
1603
2124
  X_val, y_val = val_data.iloc[0:,1:], val_data[target]
1604
2125
  self.X_val = X_val
1605
2126
  self.y_val = y_val
1606
-
2127
+
1607
2128
  def get_sample(self, x, sample, max_=900):
2129
+ """
2130
+ sample data
2131
+
2132
+ Parameters
2133
+ ----------
2134
+ x (pd.DataFrame): input data
2135
+ sample (int): sample size
2136
+ max_ (int): max sample
2137
+
2138
+ Returns
2139
+ -------
2140
+ sample (float): sample size
2141
+ """
1608
2142
  length = len(x)
1609
2143
  if length > max_:
1610
2144
  return 1.0
1611
2145
  else:
1612
2146
  return sample
1613
-
2147
+
1614
2148
  def train_model(self, pipe, model, cv_ = False):
2149
+ """
2150
+ train pipeline
2151
+
2152
+ Parameters
2153
+ ----------
2154
+ pipe (obj): pipeline object
2155
+ model (obj): model object
2156
+ cv_ (obj): cross validation procedure
2157
+
2158
+ Returns
2159
+ -------
2160
+ None
2161
+ """
1615
2162
  self.model = model
1616
2163
  self.pipe_transform = pipe
1617
2164
  self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
1618
- self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
1619
2165
  self.pipeline.fit(self.X_train, self.y_train)
1620
-
1621
-
1622
- class hmm_feature_selector():
1623
-
1624
- def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
1625
- self.data = data.copy()
1626
- self.n_clusters = n_clusters
1627
- self.init_features_hmm = init_features_hmm
1628
- self.test_data_size = test_data_size
1629
- self.select_n_features = select_n_features
1630
- self.n_trials = n_trials
1631
- self.limit_search= limit_search
1632
- self.default_benchmark_sd = default_benchmark_sd
1633
- self.t_threshold = t_threshold
1634
-
1635
- def split_data(self):
1636
-
1637
- self.data_train = self.data.iloc[:-self.test_data_size,:]
1638
- self.data_test = self.data.iloc[-self.test_data_size:,:]
1639
-
1640
- def train_model(self,features_hmm):
1641
- pipeline_hmm = Pipeline([
1642
- ('selector', FeatureSelector(columns=features_hmm)),
1643
- ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
1644
- ('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
1645
- ])
1646
-
1647
- self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
1648
- self.features_used_in_model = features_hmm
1649
-
1650
- def feature_list_generator(self):
1651
-
1652
- feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
1653
- feature_combinations = list(map(list, feature_combinations))
1654
-
1655
- self.feature_combinations = feature_combinations
1656
-
1657
- def get_error(self):
1658
-
1659
- self.data_train_ = self.data_train.copy()
1660
-
1661
- self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
1662
- self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
1663
-
1664
- ## indexing chains
1665
- self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
1666
- self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
1667
- self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
1668
- self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
1669
- self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
1670
- self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1671
-
1672
- ### returns using the first element in a chain
1673
- self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
1674
- self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
1675
- self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
1676
-
1677
- self.data_train_ = self.data_train_.drop(columns = ['first'])
1678
-
1679
- mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
1680
- self.mean_relevance = mean_relevance
1681
-
1682
- def execute_selector(self):
1683
-
1684
- self.split_data()
1685
- self.feature_list_generator()
1686
- maxi = -1
1687
- print(f'it is expected {len(self.feature_combinations)} combinations')
1688
- feature_results = dict()
1689
-
1690
- if self.limit_search:
1691
- print(f' taking just {self.limit_search} combinations')
1692
- maxi = self.limit_search
1693
-
1694
- for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
1695
-
1696
- feature_results[f'group_{i}'] = {
1697
- 'features':list(features_hmm),
1698
- 'relevances':list()
1699
- }
1700
-
1701
- for _ in range(self.n_trials):
1702
- try:
1703
- self.train_model(features_hmm)
1704
- self.get_error()
1705
- feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
1706
- except:
1707
- print('error')
1708
- feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
1709
- self.feature_results = feature_results
1710
- self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
1711
-
1712
- class signal_analyser_object:
1713
-
1714
- def __init__(self, data,symbol_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
2166
+ self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
2167
+
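The feature-name capture moved from a separate fit_transform on the raw transformer to slicing the already-fitted pipeline, which avoids fitting the transformers twice. A minimal sketch of the idiom, assuming pipe, model and the training frames exist and the transformers return DataFrames (as the feature_engine steps here do):

    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([('pipe_transform', pipe), ('model', model)])
    pipeline.fit(X_train, y_train)
    # pipeline[:-1] is the fitted transformer stack; transform without refitting
    feature_names = pipeline[:-1].transform(X_train).columns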
2168
+ class analyse_index(stock_eda_panel):
2169
+ """
2170
+ class that analyses an asset against one or more indexes, computing betas and correlation scores
2171
+
2172
+ Attributes
2173
+ ----------
2174
+ index_data : pd.DataFrame
2175
+ index data with a Date column
2176
+ indexes: list
2177
+ list of indexes
2178
+ asset : str
2179
+ name of the asset
2180
+ n_obs : int
2181
+ number of rows to extract
2182
+ lag : int
2183
+ lag to apply
2184
+ data_window : str
2185
+ data window, e.g. '5y', '10y', '15y'
2186
+ show_plot : bool
2187
+ If True, show plots
2188
+ save_path : str
2189
+ local path for saving e.g r'C:/path/to/the/file/'
2190
+ save_aws : str
2191
+ remote key in s3 bucket path e.g. 'path/to/file/'
2192
+ aws_credentials : dict
2193
+ dict with the aws credentials
2194
+ merger_df : pd.DataFrame
2195
+ dataframe with the index and asset data
2196
+ states_result : dict
2197
+ betas and correlation score results
2198
+
2199
+ Methods
2200
+ -------
2201
+ process_data():
2202
+ using stock_eda_panel, get data and merge data
2203
+ plot_betas(sample_size=int, offset=int, subsample_ts=int):
2204
+ display beta analysis plot
2205
+ get_betas(subsample_ts=int)
2206
+ get general beta and last sample beta, correlation score is included too
2207
+ """
2208
+ def __init__(self, index_data, asset, n_obs, lag, data_window = '5y', show_plot = False, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
1715
2209
  """
1716
- data: pandas df
1717
- symbol_name: str name of the asset
1718
- show_plot: bool
1719
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
1720
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
1721
- aws_credentials: dict
1722
- return_fig: boolean return the image function as result
2210
+ Initialize object
2211
+
2212
+ Parameters
2213
+ ----------
2214
+ index_data (pd.DataFrame or str): index data dataframe or index string
2215
+ asset (str): name of the asset
2216
+ n_obs (int): number of rows to extract
2217
+ lag (int): lag to apply
2218
+ data_window (str): data window, e.g. '5y', '10y', '15y'
2219
+ show_plot (bool): If True, show plots
2220
+ save_path (str): local path for saving e.g r'C:/path/to/the/file/'
2221
+ save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
2222
+ aws_credentials (dict): dict with the aws credentials
+ return_fig (boolean): if True, return the figure object as result
2223
+
2224
+ Returns
2225
+ -------
2226
+ None
1723
2227
  """
1724
- self.data = data.copy()
1725
- self.ticket_name = symbol_name
1726
- self.show_plot = show_plot
1727
- self.save_path = save_path
1728
- self.save_aws = save_aws
1729
- self.aws_credentials = aws_credentials
1730
- self.return_fig = return_fig
1731
-
1732
- def signal_analyser(self, test_size, feature_name, days_list, threshold = 0.05,verbose = False, signal_position = False):
1733
- data = self.data
1734
- self.feature_name = feature_name
1735
- up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1736
- features_base = ['Date', up_signal, low_signal, 'Close']
1737
-
1738
- df = data[features_base].sort_values('Date').iloc[0:-test_size,:]
1739
- returns_list = list()
1740
-
1741
- for days in days_list:
1742
-
1743
- feature_ = f'return_{days}d'
1744
- df[feature_] = (df['Close'].shift(-days)/df['Close']-1)*100
1745
- returns_list.append(feature_)
1746
-
1747
- df['signal_type'] = np.where(
1748
- df[up_signal] == 1,
1749
- 'up',
1750
- np.where(
1751
- df[low_signal] == 1,
1752
- 'down',
1753
- None
1754
- )
1755
- )
1756
- df = df[~df.signal_type.isna()]
1757
- # df['Date'] = df.index
1758
- df['lag_Date'] = df['Date'].shift(1)
1759
- df['span'] = (pd.to_datetime(df['Date']) - pd.to_datetime(df['lag_Date'])).dt.days - 1
1760
- df['break'] = np.where(df['span'] > 3, 1, 0)
1761
- df['break'] = np.where(df['span'].isna(), 1, df['break'])
1762
2228
 
1763
- df['chain_id'] = df.sort_values(['Date']).groupby(['break']).cumcount() + 1
1764
- df['chain_id'] = np.where(df['break'] == 1, df['chain_id'], np.nan )
1765
- df['chain_id'] = df['chain_id'].fillna(method = 'ffill')
1766
-
1767
- df['internal_rn'] = df.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1768
- df['inv_internal_rn'] = df.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1769
-
1770
- df['first_in_chain'] = np.where(df['internal_rn'] == 1, True, False)
1771
- df['last_in_chain'] = np.where(df['inv_internal_rn'] == 1, True, False)
1772
-
1773
- df = df.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1774
- self.df_signal = df
1775
2229
 
1776
- n_signals_up = len(list(df[df.signal_type == 'up'].chain_id.unique()))
1777
- n_signals_down = len(list(df[df.signal_type == 'down'].chain_id.unique()))
1778
- p_scores = list()
1779
- medians_down = list()
1780
- validations = list()
1781
- if not signal_position: ### for now it is based on the last signal on a chain
1782
- df_melt = df[df.last_in_chain == True].melt(id_vars=['signal_type'], value_vars=returns_list, var_name='time', value_name='value')
1783
- df_melt = df_melt.dropna()
1784
-
1785
- for evalx in returns_list:
1786
-
1787
- sample1 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'up')].value.values
1788
- sample2 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'down')].value.values
1789
- pvalue = stats.ttest_ind(sample1, sample2).pvalue
1790
- median_down = np.median(sample2)
1791
- median_up = np.median(sample1)
1792
- validations.append(median_up < 0)
1793
- validations.append(median_down > 0)
1794
- p_scores.append(pvalue)
1795
- medians_down.append(median_down)
1796
- self.df_melt = df_melt
1797
- null_ho_eval = threshold > np.mean(p_scores)
1798
- mean_median_return = np.median(medians_down) ## end metric
1799
- median_signal_type_eval = validations.count(validations[0]) == len(validations)
1800
-
1801
- if verbose:
1802
- print('number of signal up:',n_signals_up)
1803
- print('number of signal down:',n_signals_down)
1804
- print('reject ho: ', null_ho_eval)
1805
- print('mean median:', mean_median_return)
1806
- print('all validations: ', median_signal_type_eval)
1807
-
1808
- # if median_signal_type_eval == True and null_ho_eval == True:
1809
- if null_ho_eval == True:
1810
- if verbose:
1811
- print('success evals')
1812
- self.mean_median_return = mean_median_return
2230
+ if type(index_data) != str:
2231
+ index_data['Date'] = pd.to_datetime(index_data['Date'])
2232
+ self.index_data = index_data
2233
+ self.indexes = [ x for x in list(index_data.columns) if x != 'Date']
1813
2234
  else:
1814
- self.mean_median_return = np.nan
1815
-
1816
- df2 = df.copy()
1817
- df2 = df2[df2.last_in_chain == True]
1818
-
1819
-
1820
- df2['lagdate'] = df2.Date.shift(1)
1821
- df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lagdate'])).dt.days
1822
-
1823
- fig, axs = plt.subplots(1, 3, figsize = (15,5))
1824
-
1825
- sns.boxplot(data=df2, y="span",ax = axs[0])
1826
- axs[0].set_title('span between last signals')
1827
- del df2
1828
- sns.boxplot(data=df[df.last_in_chain == True], y="internal_rn",ax = axs[1])
1829
- axs[1].set_title('signal duration distribution')
1830
- sns.boxplot(data=df_melt, x="time", y="value", hue="signal_type",ax = axs[2])
1831
- axs[2].axhline(y=0, color='grey', linestyle='--')
1832
- axs[2].set_title('signal type expected returns distribution at different time lapses')
1833
-
1834
- if self.show_plot:
1835
- plt.show()
1836
-
1837
- if self.save_path:
1838
- result_plot_name = f'signals_strategy_distribution_{feature_name}.png'
1839
- fig.savefig(self.save_path+result_plot_name)
1840
- # pickle.dump(axs, open(self.save_path+result_plot_name, 'wb'))
1841
-
1842
- if self.save_path and self.save_aws:
1843
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name, input_path = self.save_path+result_plot_name)
1844
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1845
- if not self.show_plot:
1846
- plt.close()
1847
-
1848
- del df
1849
-
1850
- if self.return_fig:
1851
- return fig
1852
-
1853
- def create_backtest_signal(self,days_strategy, test_size, feature_name, high_exit = False, low_exit = False):
1854
- asset_1 = 'Close'
1855
- up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1856
- df1 = self.data.iloc[-test_size:,:].copy()
1857
- df2 = df1.copy()
1858
- df2['signal_type'] = np.where(
1859
- df2[up_signal] == 1,
1860
- 'up',
1861
- np.where(
1862
- df2[low_signal] == 1,
1863
- 'down',
1864
- None
1865
- )
1866
- )
1867
- df2 = df2[~df2.signal_type.isna()]
1868
- # df2['Date_'] = df2.index
1869
- df2['lag_Date'] = df2['Date'].shift(1)
1870
- df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lag_Date'])).dt.days - 1
1871
- df2['break'] = np.where(df2['span'] > 3, 1, 0)
1872
- df2['break'] = np.where(df2['span'].isna(), 1, df2['break'])
1873
-
1874
- df2['chain_id'] = df2.sort_values(['Date']).groupby(['break']).cumcount() + 1
1875
- df2['chain_id'] = np.where(df2['break'] == 1, df2['chain_id'], np.nan )
1876
- df2['chain_id'] = df2['chain_id'].fillna(method = 'ffill')
1877
-
1878
- df2['internal_rn'] = df2.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1879
- df2['inv_internal_rn'] = df2.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1880
-
1881
- df2['first_in_chain'] = np.where(df2['internal_rn'] == 1, True, False)
1882
- df2['last_in_chain'] = np.where(df2['inv_internal_rn'] == 1, True, False)
1883
-
1884
- df2 = df2.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1885
-
1886
- df2 = df2[(df2.last_in_chain == True) & (df2.signal_type == 'down')][['last_in_chain']]
1887
- dft = df1.merge(df2,how = 'left',left_index=True, right_index=True )
1888
-
1889
- dft['chain_id'] = dft.sort_values(['Date']).groupby(['last_in_chain']).cumcount() + 1
1890
- dft['chain_id'] = np.where(dft['last_in_chain'] == True, dft['chain_id'], np.nan )
1891
- dft['chain_id'] = dft['chain_id'].fillna(method = 'ffill')
1892
-
1893
- dft['internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1894
- dft['flag'] = np.where(dft['internal_rn'] < days_strategy, 1,0)
1895
-
1896
- dft['lrets_bench'] = np.log(dft[asset_1]/dft[asset_1].shift(1))
1897
- dft['bench_prod'] = dft['lrets_bench'].cumsum()
1898
- dft['bench_prod_exp'] = np.exp(dft['bench_prod']) - 1
1899
-
1900
- if high_exit and low_exit:
1901
- dft['open_strat'] = np.where(dft.last_in_chain == True, dft.Open, np.nan)
1902
- dft['open_strat'] = dft['open_strat'].fillna(method = 'ffill')
1903
- dft['open_strat'] = np.where(dft.flag == 1, dft.open_strat, np.nan)
1904
- dft['high_strat_ret'] = (dft['High']/dft['open_strat']-1)*100
1905
- dft['low_strat_ret'] = (dft['Low']/dft['open_strat']-1)*100
1906
- dft['high_exit'] = np.where(((dft['high_strat_ret'] >= high_exit) | (dft['internal_rn'] == days_strategy)), 1, np.nan)
1907
- dft['low_exit'] = np.where((dft['low_strat_ret'] <= low_exit), -1, np.nan)
1908
-
1909
- dft["exit_type"] = dft[["high_exit", "low_exit"]].max(axis=1)
1910
- dft['exit_type'] = np.where(dft["exit_type"] == 1, 1, np.where(dft["exit_type"] == -1,-1,np.nan))
1911
- dft['exit'] = np.where(dft['exit_type'].isnull(), np.nan, 1)
1912
- dft['exit_order'] = dft.sort_values(['Date']).groupby(['chain_id','exit']).cumcount() + 1
1913
- dft['exit'] = np.where(dft['exit_order'] == 1, True, np.nan)
1914
- dft = dft.drop(columns = ['exit_order'])
1915
- ## if last signal is near
1916
- max_id = dft.chain_id.max()
1917
- dft['max_internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).internal_rn.transform('max')
1918
- dft['exit'] = np.where((dft.chain_id == max_id) & (dft.max_internal_rn < days_strategy) & (dft.max_internal_rn == dft.internal_rn), 1, dft['exit'])
1919
-
1920
- dft['exit_step'] = np.where(dft.exit == 1, dft.internal_rn, np.nan)
1921
- dft['exit_step'] = dft.sort_values(['Date']).groupby(['chain_id']).exit_step.transform('max')
1922
-
1923
- dft['flag'] = np.where(dft.internal_rn <= dft.exit_step, 1, 0)
1924
- dft = dft.drop(columns = ['open_strat', 'high_strat_ret', 'low_strat_ret','exit_step', 'exit','exit_type','high_exit','low_exit', 'max_internal_rn'])
1925
-
1926
- dft['lrets_strat'] = np.log(dft[asset_1].shift(-1)/dft[asset_1]) * dft['flag']
1927
- dft['lrets_strat'] = np.where(dft['lrets_strat'].isna(),-0.0,dft['lrets_strat'])
1928
- dft['lrets_prod'] = dft['lrets_strat'].cumsum()
1929
- dft['strat_prod_exp'] = np.exp(dft['lrets_prod']) - 1
1930
-
1931
- bench_rets = round(dft['bench_prod_exp'].values[-1]*100,1)
1932
- strat_rets = round(dft['strat_prod_exp'].values[-1]*100,1)
1933
-
1934
- bench_sr = round(sharpe_ratio(dft.bench_prod_exp.dropna()),1)
1935
- strat_sr = round(sharpe_ratio(dft.strat_prod_exp.dropna()),1)
1936
-
1937
- message1 = f'{bench_rets}%'
1938
- message2 = f'{strat_rets}%'
1939
-
1940
- messages = {
1941
- 'benchmark return:':message1,
1942
- 'benchmark sharpe ratio:': bench_sr,
1943
- 'strategy return:':message2,
1944
- 'strategy sharpe ratio:': strat_sr,
1945
- }
1946
- if self.show_plot:
1947
- print('----------------------------')
1948
- print(messages)
1949
- print('----------------------------')
1950
-
1951
- fig = plt.figure(1)
1952
- plt.plot(dft.bench_prod_exp.values, label = 'benchmark')
1953
- plt.scatter(range(len(dft)),np.where(dft[low_signal] == 1,dft.bench_prod_exp.values,np.nan),color = 'red', label = 'signal')
1954
- plt.plot(dft.strat_prod_exp.values, label = 'strategy')
1955
- plt.legend()
1956
- plt.title('strategy and cumulative returns based on signal strategy')
1957
- if self.show_plot:
1958
- plt.plot()
2235
+ self.indexes = [index_data] if isinstance(index_data, str) else [c for c in index_data.columns if c != 'Date']
1959
2236
 
1960
- if self.save_path:
1961
- result_json_name = f'signals_strategy_return_{feature_name}.json'
1962
- result_plot_name = f'signals_strategy_return_{feature_name}.png'
1963
-
1964
- plt.savefig(self.save_path+result_plot_name)
1965
- # pickle.dump(fig, open(self.save_path+result_plot_name, 'wb'))
1966
-
1967
- with open(self.save_path+result_json_name, "w") as outfile:
1968
- json.dump(messages, outfile)
1969
-
1970
- if self.save_path and self.save_aws:
1971
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
1972
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name,input_path = self.save_path+result_plot_name)
1973
-
1974
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
1975
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1976
-
1977
- if not self.show_plot:
1978
- plt.close()
1979
-
1980
- del df1,df2,dft
1981
-
1982
- if self.return_fig:
1983
- return fig, messages
1984
-
1985
- def execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object, plot = False, backtest= False, exit_params = {}):
1986
-
1987
- method(**configuration)
1988
- signal_assess = signal_analyser_object(object_stock.df,object_stock.stock_code,show_plot = plot)
1989
- signal_assess.signal_analyser(test_size = test_data_size, feature_name = feature_name, days_list = days_list, threshold = 1)
1990
-
1991
- if backtest:
1992
- print('-----------------------back test ---------------------------')
1993
- signal_assess.create_backtest_signal(backtest, test_data_size, feature_name, **exit_params )
1994
-
1995
- return signal_assess.mean_median_return
1996
-
1997
- def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to_test, method, object_stock, signal_analyser_object, plot = True):
1998
-
1999
- results = list()
2000
- for key in arguments_to_test.keys():
2001
- configuration = arguments_to_test.get(key)
2002
- mean_median_return = execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object)
2003
- results.append(mean_median_return)
2004
-
2005
- df_result = pd.DataFrame({'keys':arguments_to_test.keys(),'results':results})
2006
- if plot:
2007
- plt.plot(df_result['keys'], df_result['results'])
2008
- plt.scatter(df_result['keys'], df_result['results'])
2009
- plt.title('simulation between configurations')
2010
- plt.ylabel('median expected return')
2011
- plt.show()
2012
-
2013
- best_result = df_result.sort_values('results',ascending = False)['keys'].values[0]
2014
- return best_result
2015
-
2016
- class analyse_index(stock_eda_panel):
2017
- def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2018
-
2019
- """
2020
- data: pandas df
2021
- index: str name of the index
2022
- asset: str name of the asset
2023
- n_obs: int
2024
- lag: int
2025
- data_window: str eg 5y 10y 15y
2026
- show_plot: bool
2027
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
2028
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2029
- aws_credentials: dict
2030
- """
2031
-
2032
- self.index = index
2237
+ self.index_data = index_data
2033
2238
  self.asset = asset
2034
2239
  self.n_obs = n_obs
2035
2240
  self.data_window = data_window
2036
2241
  self.lag = lag
2037
-
2242
+
2038
2243
  self.show_plot = show_plot
2244
+ self.return_fig = return_fig
2039
2245
  self.save_path = save_path
2040
2246
  self.save_aws = save_aws
2041
-
2042
- def process_data(self):
2043
-
2044
- index = stock_eda_panel(self.index, self.n_obs, self.data_window)
2045
- index.get_data()
2046
- index.df['shift'] = index.df.Close.shift(self.lag)
2047
- index.df['index_return'] = index.df.Close/index.df['shift'] - 1
2048
2247
 
2049
- asset = stock_eda_panel(self.asset, self.n_obs, self.data_window)
2248
+ def process_data(self):
2249
+ """
2250
+ Using stock_eda_panel, download the asset (and the index, when given as a ticker) and merge them on Date.
2251
+
2252
+ Parameters
2253
+ ----------
2254
+ None
2255
+
2256
+ Returns
2257
+ -------
2258
+ None; the merged frame is stored in self.merger_df
2259
+ """
2260
+ asset = stock_eda_panel(self.asset, self.n_obs, data_window=self.data_window)
2050
2261
  asset.get_data()
2051
- asset.df['shift'] = asset.df.Close.shift(self.lag)
2052
- asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
2262
+ df = asset.df[['Date','Close']]
2053
2263
 
2054
- df1 = index.df[['Date','index_return']]
2055
- df2 = asset.df[['Date','asset_return','Close']]
2056
- merger = df1.merge(df2, on = 'Date', how = 'inner')
2057
- merger.dropna(inplace = True)
2058
- self.merger_df = merger
2059
-
2060
- def plot_betas(self,sample_size, offset, subsample_ts =False):
2061
-
2062
- ### extracting data
2264
+ if not isinstance(self.index_data, str):
2265
+ df_merge = df.merge(self.index_data, on = ['Date'], how = 'left').sort_values('Date')
2266
+
2267
+ else:
2268
+ indx = stock_eda_panel(self.index_data, self.n_obs, data_window=self.data_window)
2269
+ indx.get_data()
2270
+ indx_df = indx.df[['Date','Close']].rename(columns = {'Close':self.index_data})
2271
+ df_merge = df.merge(indx_df, on = ['Date'], how = 'left').sort_values('Date')
2272
+
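+ # percentage change over the configured lag: Close_pct for the asset and one {index}_pct column per index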
2273
+ for colx in ['Close'] + self.indexes:
2274
+ df_merge[f'{colx}_pct'] = df_merge[colx]/df_merge[colx].shift(self.lag) - 1
2275
+
2276
+ df_merge.dropna(inplace = True)
2277
+ self.merger_df = df_merge.rename(columns = {'Close_pct': 'asset_return'})
2063
2278
 
2064
- self.process_data()
2065
-
2066
- ### ploting analysis
2279
+ def plot_betas(self, sample_size, offset, subsample_ts = False, index = False):
2280
+ """
2281
+ Display the beta analysis plot: a full-sample scatter with a robust fit and rolling-window betas over the price series. Assumes process_data has been called.
2282
+
2283
+ Parameters
2284
+ ----------
2285
+ sample_size (int): number of days or window size to calculate beta
2286
+ offset (int): step in days between consecutive windows
2287
+ subsample_ts (int): restrict the analysis to the most recent subsample_ts rows
+ index (str): index to plot against when index_data is a DataFrame; defaults to the first configured index
2288
+
2289
+ Returns
2290
+ -------
2291
+ None
2292
+ """
2293
+ if isinstance(self.index_data, str) and index is not False:
2294
+ raise Exception("index argument is not needed when index_data is a ticker string")
2295
+ if index is False:
2296
+ index = self.indexes[0]
2297
+
2298
+ index_pct = f'{index}_pct'
2299
+ ### plotting analysis
2067
2300
  figure, ax = plt.subplot_mosaic(
2068
2301
  [["scatter_total", "scatter_sample",'ts','ts']],
2069
2302
  layout="constrained",
2070
2303
  figsize=(18, 5)
2071
2304
  )
2072
-
2073
- ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df.index_return)
2074
- b, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2305
+
2306
+ ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df[index_pct])
2307
+
2308
+ huber_regr = HuberRegressor(fit_intercept = True)
2309
+ huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values)
2310
+ b, a = huber_regr.coef_[0], huber_regr.intercept_
2311
+
2312
+ # robust replacement for np.polyfit: slope b and intercept a of index return on asset return
2075
2313
  ax['scatter_total'].plot(self.merger_df.asset_return, b*self.merger_df.asset_return+a, color='red')
2076
2314
 
2077
2315
  ax['ts'].plot(self.merger_df.Date, self.merger_df.Close, color = 'grey', alpha = 0.3)
2078
-
2316
+
2079
2317
  if subsample_ts:
2080
2318
  self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2081
-
2319
+
2082
2320
  for i in range(0,len(self.merger_df)-sample_size,offset):
2083
2321
 
2084
2322
  merger_ = self.merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
2085
- x = merger_.index_return
2323
+ x = merger_[index_pct]
2086
2324
  y = merger_.asset_return
2087
- b, a = np.polyfit(x,y, 1)
2088
-
2325
+ # windowed robust fit: beta of asset return on this window's index return
2326
+ huber_regr = HuberRegressor(fit_intercept = True)
2327
+ huber_regr.fit(x.values.reshape(-1,1), y.values)
2328
+ b, a = huber_regr.coef_[0], huber_regr.intercept_
2329
+
2089
2330
  normalize = mcolors.Normalize(vmin=-1, vmax=1)
2090
2331
  colormap = cm.jet
2091
2332
 
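For orientation, a minimal usage sketch of the reworked analyse_index flow. It assumes the constructor shape implied by the hunk above (index_data accepting either an index ticker string or a prepared DataFrame with a Date column plus one close column per index); the module path, tickers, and parameter values are illustrative assumptions, not part of this diff.

    from virgo_modules.src.ticketer_source import analyse_index  # assumed module path

    # index_data as a ticker string: the class downloads the index itself
    analyser = analyse_index(index_data = 'SPY', asset = 'AAPL', n_obs = 1500, lag = 3,
                             data_window = '5y', show_plot = True)
    analyser.process_data()    # builds self.merger_df with asset_return and SPY_pct
    analyser.plot_betas(sample_size = 120, offset = 20, subsample_ts = 500)

    # index_data as a prepared DataFrame: pass Date plus one close column per index
    # and pick the one to plot via the new index argument, e.g.
    # analyser = analyse_index(index_data = my_index_df, asset = 'AAPL', n_obs = 1500, lag = 3)
    # analyser.plot_betas(sample_size = 120, offset = 20, index = 'NDX')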
@@ -2098,12 +2339,13 @@ class analyse_index(stock_eda_panel):
2098
2339
 
2099
2340
  scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
2100
2341
  scalarmappaple.set_array(x)
2101
-
2102
- plt.title(f'{self.asset} using index: {self.index}')
2342
+
2343
+ plt.title(f'{self.asset} using index: {index}')
2103
2344
  plt.colorbar(scalarmappaple)
2104
-
2345
+
2105
2346
  if self.show_plot:
2106
2347
  plt.show()
2348
+
2107
2349
  if self.save_path:
2108
2350
  result_plot_name = f'market_best_fit.png'
2109
2351
  figure.savefig(self.save_path+result_plot_name)
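The switch from np.polyfit to HuberRegressor throughout this class replaces the least-squares fit with a robust loss that down-weights outlier return days. A self-contained sketch of the pattern on synthetic data (all names and values illustrative):

    import numpy as np
    from sklearn.linear_model import HuberRegressor

    rng = np.random.default_rng(0)
    x = rng.normal(size = 300)                         # stand-in for daily index returns
    y = 1.2 * x + rng.normal(scale = 0.2, size = 300)  # asset returns with true beta 1.2
    y[:5] += 8                                         # a handful of outlier days

    b_ols, a_ols = np.polyfit(x, y, 1)                 # least squares, pulled by the outliers

    huber_regr = HuberRegressor(fit_intercept = True)
    huber_regr.fit(x.reshape(-1, 1), y)                # sklearn expects X as (n, 1) and y as (n,)
    b_rob, a_rob = huber_regr.coef_[0], huber_regr.intercept_
    print(f'OLS beta: {b_ols:.2f}, robust beta: {b_rob:.2f}')  # the Huber slope stays closer to 1.2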
@@ -2111,80 +2353,50 @@ class analyse_index(stock_eda_panel):
2111
2353
  if self.save_path and self.save_aws:
2112
2354
  # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.asset}/'+result_plot_name,input_path = self.save_path+result_plot_name)
2113
2355
  upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
2356
+
2114
2357
  if not self.show_plot:
2115
- plt.close()
2116
-
2358
+ plt.close()
2359
+
2360
+ if self.return_fig:
2361
+ return figure
2362
+
2117
2363
  def get_betas(self,subsample_ts=False):
2118
-
2119
- self.process_data()
2120
- general_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2121
- general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
2122
-
2123
- self.process_data()
2124
- if subsample_ts:
2125
- self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2126
- sample_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2127
- sample_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
2128
-
2129
- result = {
2130
- 'general_beta':general_beta,
2131
- 'general_r':general_r,
2132
- 'sample_beta':sample_beta,
2133
- 'sample_r':sample_r
2134
- }
2135
-
2136
- self.states_result = result
2137
-
2138
- class evaluate_markets(analyse_index):
2139
- def __init__(self, stock_code, indexes):
2140
- self.stock_code = stock_code
2141
- self.indexes = indexes
2142
- def evaluate_best_market_fit(self,sample_size, offset,lag= 3, n_obs = 3500, verbose = False, plot_best = False):
2143
-
2144
- results_dicts = dict()
2364
+ """
2365
+ Compute the full-sample beta per configured index and, optionally, the beta over the most recent subsample; Pearson correlations are included. Assumes process_data has been called.
2366
+
2367
+ Parameters
2368
+ ----------
2369
+ subsample_ts (int): restrict the sample beta to the most recent subsample_ts rows
2370
+
2371
+ Returns
2372
+ -------
2373
+ None; one result dict per index is stored in self.states_result
2374
+ """
2375
+ result = list()
2145
2376
  for index in self.indexes:
2146
- betex = analyse_index(index = index,asset = self.stock_code,n_obs = n_obs, lag = lag)
2147
- betex.get_betas(sample_size)
2148
- results_dicts[index] = betex.states_result
2149
- pd_result = pd.DataFrame(results_dicts).T
2150
- pd_result['gen_r2'] = pd_result.general_r ** 2
2151
- pd_result['sampl_r2'] = pd_result.sample_r ** 2
2152
- self.stat_results = pd_result
2153
-
2154
- best_result = pd_result.sort_values('gen_r2',ascending = False).head(2).sort_values('sampl_r2',ascending = False).head(1)
2155
- best_fit_index = best_result.index.values[0]
2156
-
2157
- self.stat_results = self.stat_results.drop(columns = ['gen_r2','sampl_r2'])
2158
-
2159
- if verbose:
2160
- print(best_result)
2161
- if plot_best:
2162
- betex = analyse_index(index = best_fit_index,asset = self.stock_code, n_obs = n_obs, lag = lag)
2163
- betex.plot_betas(sample_size = sample_size, offset = offset, subsample_ts = False)
2164
2377
 
2165
- self.best_result = best_result
2166
-
2167
- def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2168
- """
2169
- data_market: pandas df
2170
- ticket_name: str name of the asset
2171
- show_plot: bool
2172
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
2173
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2174
- aws_credentials: dict
2175
- """
2176
- all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
2177
- all_betas['gen_r2'] = all_betas.general_r ** 2
2178
- all_betas['sampl_r2'] = all_betas.sample_r ** 2
2179
- selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
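+ # full-sample robust fit: slope of index return on asset return, plus Pearson r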
2378
+ index_pct = f'{index}_pct'
2379
+ huber_regr = HuberRegressor(fit_intercept = True)
2380
+ huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values)
2381
+ general_beta, a = huber_regr.coef_[0], huber_regr.intercept_
2382
+ general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df[index_pct])[0]
2383
+
2384
+ dict_res = {
2385
+ 'index':index,
2386
+ 'general_beta':general_beta,
2387
+ 'general_r':general_r,
2388
+ }
2180
2389
 
2181
- if show_plot:
2182
- print(selection)
2183
- if save_path:
2184
- result_plot_name = f'market_best_fit.csv'
2185
- selection.to_csv(save_path+result_plot_name)
2186
-
2187
- if save_path and save_aws:
2188
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{ticket_name}/'+result_plot_name,input_path = save_path+result_plot_name)
2189
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + result_plot_name, input_path = save_path + result_plot_name, aws_credentials = aws_credentials)
2190
- return selection
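+ # optionally repeat the fit on the most recent subsample_ts rows only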
2390
+ if subsample_ts:
2391
+ tmp_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2392
+ huber_regr = HuberRegressor(fit_intercept = True)
2393
+ huber_regr.fit(tmp_df.asset_return.values.reshape(-1,1), tmp_df[index_pct].values)
2394
+ sample_beta, a = huber_regr.coef_[0], huber_regr.intercept_
2395
+ sample_r = stats.mstats.pearsonr(tmp_df.asset_return, tmp_df[index_pct])[0]
2396
+ dict_res['sample_beta'] = sample_beta
2397
+ dict_res['sample_r'] = sample_r
2398
+
2399
+ result.append(dict_res)
2400
+
2401
+ self.states_result = result
2402
+
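Since get_betas now stores one result dict per configured index in self.states_result, the index-selection heuristic of the removed evaluate_markets class can be reproduced downstream. A minimal sketch, assuming an analyser instance as above on which process_data() and get_betas(subsample_ts = ...) have been run so that sample_r is present:

    import pandas as pd

    stats_df = pd.DataFrame(analyser.states_result)
    stats_df['gen_r2'] = stats_df.general_r ** 2
    stats_df['sampl_r2'] = stats_df.sample_r ** 2

    # same heuristic as the removed code: take the two best overall fits,
    # then keep the one that fits the most recent window best
    best = (stats_df.sort_values('gen_r2', ascending = False).head(2)
                    .sort_values('sampl_r2', ascending = False).head(1))
    best_fit_index = best['index'].values[0]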