virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  import yfinance as yf
2
2
  import pandas as pd
3
3
  import numpy as np
4
- import json
4
+ import gc
5
5
 
6
6
  import matplotlib.pyplot as plt
7
7
  import matplotlib.gridspec as gridspec
@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM
36
36
 
37
37
  from plotly.colors import DEFAULT_PLOTLY_COLORS
38
38
 
39
- from sklearn.base import BaseEstimator, TransformerMixin
40
39
  from sklearn.pipeline import Pipeline
41
40
  from feature_engine.imputation import MeanMedianImputer
42
41
 
@@ -48,88 +47,38 @@ from feature_engine.timeseries.forecasting import LagFeatures
48
47
  from feature_engine.imputation import MeanMedianImputer
49
48
  from feature_engine.discretisation import EqualWidthDiscretiser
50
49
 
50
+ from sklearn.linear_model import HuberRegressor
51
+
51
52
  from .aws_utils import upload_file_to_aws
52
53
 
53
54
  import logging
54
55
 
55
- class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
56
- def __init__(self, features, prefix = ''):
57
- self.features = features
58
- self.prefix = prefix
56
+ from virgo_modules.src.hmm_utils import trainer_hmm
57
+ from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
58
+ from virgo_modules.src.transformer_utils import FeaturesEntropy, VirgoWinsorizerFeature # imported because some saved models reference this module; without it mlflow.load() crashes
59
59
 
60
- def fit(self, X, y=None):
61
- return self
62
-
63
- def transform(self, X, y=None):
64
- for feature in self.features:
65
- X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
66
- return X
67
-
68
- class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
69
- def __init__(self, feature_configs):
70
- self.feature_configs = feature_configs
71
- def fit(self, X, y=None):
72
- return self
73
-
74
- def transform(self, X, y=None):
75
- for feature in self.feature_configs:
76
- lower = self.feature_configs[feature]['min']
77
- upper = self.feature_configs[feature]['max']
78
- X[feature] = np.where( lower > X[feature], lower, X[feature])
79
- X[feature] = np.where( upper < X[feature], upper, X[feature])
80
- return X
81
-
82
- class FeatureSelector(BaseEstimator, TransformerMixin):
83
- def __init__(self, columns):
84
- self.columns = columns
85
-
86
- def fit(self, X, y=None):
87
- return self
88
-
89
- def transform(self, X, y=None):
90
- return X[self.columns]
91
-
92
- def sharpe_ratio(return_series):
93
- N = 255 # Trading days in the year (change to 365 for crypto)
94
- rf = 0.005 # Half a percent risk free rare
95
- mean = return_series.mean() * N -rf
96
- sigma = return_series.std() * np.sqrt(N)
97
- sharpe = round(mean / sigma, 3)
98
- return sharpe
99
-
100
- class signal_combiner(BaseEstimator, TransformerMixin):
101
- def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
102
- self.columns = columns
103
- self.drop = drop
104
- self.prefix_up = prefix_up
105
- self.prefix_low = prefix_low
106
-
107
- def fit(self, X, y=None):
108
- return self
109
-
110
- def transform(self, X, y=None):
111
- for column in self.columns:
112
- X['CombSignal_'+column] = np.where(
113
- X[self.prefix_up + column] == 1,
114
- 1,
115
- np.where(
116
- X[self.prefix_low + column] == 1,
117
- 1,
118
- 0
119
- )
120
- )
121
- if self.drop:
122
- X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
123
- return X
124
-
125
60
  def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
126
-
61
+
62
+ '''
63
+ create a scikit-learn pipeline object from different configurations and feature engineering blocks, applied in a fixed order
64
+
65
+ Parameters:
66
+ features_base (list): list of base feature columns selected at the start of the pipeline
+ features_to_drop (list): list of features to drop
67
+ lag_dict (dict): feature dictionary with configurations to apply lags
68
+ combine_signals (list): list of columns/signals to combine
69
+ discretize_columns (list): list of features to discretize; the number of bins is fixed at 20
70
+ correlation (float): correlation score threshold for dropping correlated features
71
+
72
+ Returns:
73
+ pipe (obj): pipeline object
74
+ '''
75
+
127
76
  lag_pipe_sec = [(f'lags_{key}', LagFeatures(variables = key, periods = lag_dict[key])) for key in lag_dict] if lag_dict else []
128
77
  drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
129
78
  merge = [('signal_combiner', signal_combiner(combine_signals))] if combine_signals else []
130
79
  discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = 20 ))] if discretize_columns else []
131
80
  drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation))] if correlation else []
132
-
81
+
133
82
  pipe = Pipeline(
134
83
  [('selector', FeatureSelector(features_base))] + \
135
84
  [('encoding',OneHotEncoder(top_categories=None, variables=['hmm_feature']))] + \
@@ -142,64 +91,178 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
142
91
  )
143
92
  return pipe
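The factory above only wires the configured blocks together; the following is a minimal usage sketch, not taken from the package, and the column names (RSI, ROC, signal_up_RSI, ...) are purely illustrative. The input frame must already contain every selected column, including 'hmm_feature', which is always one-hot encoded.

    features = ['hmm_feature', 'RSI', 'ROC', 'signal_up_RSI', 'signal_low_RSI']
    pipe = data_processing_pipeline(
        features_base=features,
        lag_dict={'RSI': [1, 2]},   # lag RSI by 1 and 2 steps
        combine_signals=['RSI'],    # merge signal_up_RSI / signal_low_RSI into one column
        correlation=0.9,            # drop features correlated above 0.9
    )
    transformed = pipe.fit_transform(train_df)  # train_df holds the engineered columns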
144
93
 
145
- def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
146
- ## legnths
147
- cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
148
- cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
149
- ## means
150
- def quantile2(x):
151
- return x.quantile(0.25)
152
- def quantile3(x):
153
- return x.quantile(0.75)
154
-
155
- cluster_returns = data.groupby('hmm_feature').agg(
156
- n_uniques = ('chain_id','nunique'),
157
- n_obs = ('Date','count'),
158
- cluster_ret_q25 = ('chain_return',quantile2),
159
- cluster_ret_median = ('chain_return','median'),
160
- cluster_ret_q75 = ('chain_return',quantile3),
161
- )
162
- cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
163
- cluster_returns['perc_dispute'] = np.where(
164
- np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
165
- 1,0
166
- )
167
- cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
168
- cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
169
- cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
170
- cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
171
- cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
172
- cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
173
- cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
174
-
175
- ## relevance or importance
176
- # naive aproach
177
- cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
178
- cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
179
- cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
180
- cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
181
-
182
- mean_relevance = cluster_returns['abs_t_accpted'].mean()
183
- number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
184
-
185
- return mean_relevance, cluster_returns, number_relevant_states
94
+ class stock_eda_panel(object):
186
95
 
96
+ """
97
+ Class that initially gets stock data and then applies feature engineering, enrichment, analysis, plotting, model training, etc.
98
+
99
+ Attributes
100
+ ----------
101
+ stock_code : str
102
+ symbol of the asset
103
+ n_days : int
104
+ number of days to extract data
105
+ data_window : str
106
+ large window used to extract data. A large window is required to extract more history, e.g. '5y', '10y', '15y'
107
+ df : pd.DataFrame
108
+ Pandas dataframe of the asset data with features
109
+ strategy_log: pd.DataFrame
110
+ Pandas dataframe that has the results of different tested strategies (result from strategy simulator hmm)
111
+ best_strategy: list
112
+ features of the best performing strategy (result from strategy simulator hmm)
113
+ top_10_strategy: dict
114
+ top 10 best performing strategies (result from strategy simulator hmm)
115
+ settings: dict
116
+ configuration dictionary of the features and other parameters
117
+
118
+ Methods
119
+ -------
120
+ augmented_dickey_fuller_statistics(time_series=pd.Series, label=str):
121
+ Perform the Dickey-Fuller stationarity test for a given time series
122
+ It will print the p-value of the series
123
+ get_data():
124
+ Get asset data, performing some data normalization and formatting (in the case of dates)
125
+ plot_series_returns(roll_mean_lags1=int, roll_mean_lags2=int)
126
+ Display a plot of the time series with rolling means and rolling standard deviations of daily closing prices
127
+ seasonal_plot():
128
+ Display time series split by year
129
+ plot_price_signal(feature=str, feature_2=str, opacity=float):
130
+ Display bottom and roof signals over the closing prices
131
+ volatility_analysis(lags=int, trad_days=int, window_log_return=int, plot=boolean, save_features=boolean):
132
+ this method performs log-return and volatility analysis of the closing prices
133
+ find_lag(feature=str, lag_list=list, column_target=str,posterior_lag=int, test_size=int):
134
+ displays correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
135
+ outlier_plot(zlim=float, plot=boolean, save_features=boolean):
136
+ perform outlier analysis of the log returns. It also performs a normality test of the returns
137
+ analysis_roll_mean_log_returns(lags=int, plot=boolean):
138
+ perform analysis of lags of the mean rolling log return
139
+ compute_clip_bands(feature_name=str,threshold=float):
140
+ compute outlier detection for a given signal. Note that this follows a mean-reversion procedure and the feature has to be stationary. The resulting bottom and roof signals are attached to the dataframe
141
+ extract_sec_data(symbol=str, base_columns=list(str), rename_columns=dict):
142
+ extract new asset data and merge it to the main asset data
143
+ lag_log_return(lags=int, feature=str, feature_name=str):
144
+ compute log return given some lags
145
+ produce_log_volatility(trad_days=int, feature=str, feature_name=str):
146
+ compute volatility
147
+ signal_plotter(feature_name=str):
148
+ display analysis plot of a feature with high and low signals
149
+ log_features_standard(feature_name=str):
150
+ save resulting feature names in a standard structure
151
+ relative_spread_MA(ma1=int, ma2=int, threshold=float, plot=boolean, save_features=boolean):
152
+ compute relative moving-average features, one for the short term and another for the long/mid term
153
+ pair_feature(pair_symbol=str, plot=boolean):
154
+ initialize pair feature data extraction and analysis
155
+ calculate_cointegration(series_1=pd.series, series_2=pd.series):
156
+ calculate cointegration score for two time series
157
+ bidirect_count_feature(rolling_window=int, threshold=float, plot=boolean, save_features=boolean):
158
+ perform negative and positive return counting in a given rolling time window
159
+ get_relative_range_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
160
+ perform relative spread of opening and closing price
161
+ rsi_feature_improved(window=int, threshold=float, plot=boolean, save_features=boolean):
162
+ perform relative strength index
163
+ days_features_bands(window=int, threshold=float, plot=boolean, save_features=boolean):
164
+ compute rolling mean returns for each day of the week over a given window
165
+ analysis_smooth_volume(window=int, threshold=float, plot=boolean, save_features=boolean):
166
+ compute feature of trading volumes
167
+ roc_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
168
+ perform price rate of change
169
+ stoch_feature(window=int, smooth1=int, smooth2=int, threshold=float, plot=boolean, save_features=boolean):
170
+ perform stochastic oscillator RSI feature
171
+ stochastic_feature(window=int, smooth=int, threshold=float, plot=boolean, save_features=boolean):
172
+ perform stochastic oscillator feature
173
+ william_feature(lbp=int, threshold=float, plot=boolean, save_features=boolean):
174
+ perform fast stochastic oscillator or Williams %R indicator
175
+ vortex_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
176
+ perform vortex oscillator
177
+ expected_return(trad_days:int, feature:str, feature_name:str):
178
+ compute expected log return based on an inverse shift of historical data
179
+ rolling_feature(feature: str, window:int, function:callable):
180
+ perform rolling (non expanding) window operation for a given feature
181
+ time_distance(feature_base:str,feature_window:str, result_feature_name:str, max_window:int):
182
+ compute the time distance to a given window feature
183
+ minmax_pricefeature(type_func=str, window=int, distance=boolean, save_features=boolean):
184
+ get relative price/ distance feature with respect to the min/max price in a given window
185
+ pair_index_feature(pair_symbol=str, feature_label=str, window=int, threshold=float, plot=boolean, save_features=boolean):
186
+ perform additional asset ROC feature, then a new feature is created in the main dataframe
187
+ produce_order_features(feature_name=str, save_features=boolean):
188
+ perform a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
189
+ compute_last_signal (feature_name=str, save_features=boolean):
190
+ perform a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
191
+ create_hmm_derived_features():
192
+ create features derived from hmm state features. Features are the index of the state, the duration of the state and the chain return
193
+ cluster_hmm_analysis(n_clusters=int,features_hmm=list, test_data_size=int, seed=int, lag_returns_state=int, plot=boolean, save_features=boolean, model=obj):
194
+ create or use an hmm model
195
+ sharpe_ratio(return_series=pd.Series, n_trad_days=int, rf=float):
196
+ perform sharpe ratio of a given time series return
197
+ treat_signal_strategy(test_data=pd.DataFrame, strategy=list):
198
+ helper method that treats signals and converts signals to 1 or 0
199
+ stategy_simulator(features=list, hmm_feature=boolean):
200
+ execute strategy and get some performance metrics like sharpe ratio, return
201
+ viz_strategy(strategy):
202
+ display analysis plot of a given strategy
203
+ deep_dive_analysis_hmm(test_data_size=int, split=str):
204
+ display analysis plot hmm model
205
+ get_targets(steps=int):
206
+ produce regression target return taking future prices
207
+ get_categorical_targets(horizon=int, flor_loss=float, top_gain=float):
208
+ produce binary target return taking future prices. it produce two targets, one for high returns and another for low returns
209
+ get_configurations(test_data_size=int, val_data_size=int, model_type=str):
210
+ produce a configuration dictionary from the settings saved by the feature generation methods when save_features was activated
211
+ """
187
212
 
188
- class stock_eda_panel(object):
189
-
190
213
  def __init__(self, stock_code, n_days, data_window = '5y'):
214
+
215
+ """
216
+ Initialize object
217
+
218
+ Parameters
219
+ ----------
220
+ stock_code (str): symbol of the asset
221
+ n_days (int): number of days to extract data
222
+ data_window (str): large window used to extract data. A large window is required to extract more history, e.g. '5y', '10y', '15y'
223
+
224
+ Returns
225
+ -------
226
+ None
227
+ """
228
+
191
229
  self.stock_code = stock_code
192
230
  self.n_days = n_days
193
231
  self.today = datetime.date.today()
194
232
  self.features = list()
195
233
  self.signals = list()
196
234
  self.data_window = data_window
197
-
235
+
198
236
  def augmented_dickey_fuller_statistics(self,time_series, label):
237
+ """
238
+ Perform the Dickey-Fuller stationarity test for a given time series
239
+ It will print the p-value of the series
240
+
241
+ Parameters
242
+ ----------
243
+ time_series (pd.Series): pandas series of the time series
244
+ label (str): feature name
245
+
246
+ Returns
247
+ -------
248
+ None
249
+ """
199
250
  result = adfuller(time_series.dropna().values)
200
251
  print('p-value: {} for the series {}'.format(round(result[1],6), label))
201
-
252
+
202
253
  def get_data(self):
254
+ """
255
+ Get asset data, performing some data normalization and formatting (in the case of dates)
256
+
257
+ Parameters
258
+ ----------
259
+ None
260
+
261
+ Returns
262
+ -------
263
+ None
264
+ """
265
+
203
266
  begin_date = self.today - relativedelta(days = self.n_days)
204
267
  begin_date_str = begin_date.strftime('%Y-%m-%d')
205
268
 
@@ -210,7 +273,7 @@ class stock_eda_panel(object):
210
273
  df.reset_index(inplace=True)
211
274
  df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
212
275
  df['Date'] = pd.to_datetime(df['Date'])
213
-
276
+
214
277
  df = df[df.Date >= begin_date_str ]
215
278
  self.settings_general = {
216
279
  'n_days':self.n_days,
@@ -219,44 +282,56 @@ class stock_eda_panel(object):
219
282
  'execution_date': self.today.strftime('%Y-%m-%d')
220
283
  }
221
284
  self.df = df
222
-
285
+
223
286
  ### cleaning volume
224
287
  ### volume clearning
225
288
  self.df['Volume'] = np.where(self.df['Volume'] <= 10, np.nan, self.df['Volume'])
226
289
  self.df['Volume'] = self.df['Volume'].fillna(method='bfill')
227
-
290
+
228
291
  ## filling
229
-
292
+
230
293
  base_columns_unit_test = ['Open','High','Low','Close','Volume']
231
294
  self.df[base_columns_unit_test] = self.df[base_columns_unit_test].fillna(method='ffill')
232
-
295
+
233
296
  ## cleaning nulls
234
-
297
+
235
298
  xs = self.df[base_columns_unit_test].isnull().sum()/self.df[base_columns_unit_test].count()
236
299
  reject_columns = list(xs[xs > 0.5].index.values)
237
-
300
+
238
301
  if len(reject_columns) > 0:
239
302
  logging.warning("the following columns have many nulls and are drop: {}".format(reject_columns))
240
303
  self.df = self.df.drop(columns = reject_columns)
241
-
242
-
304
+
243
305
  def plot_series_returns(self,roll_mean_lags1,roll_mean_lags2):
244
-
306
+
307
+ """
308
+ Display a plot of the time series with rolling means and rolling standard deviations of daily closing prices
309
+
310
+ Parameters
311
+ ----------
312
+ roll_mean_lags1 (int): short term window
313
+ roll_mean_lags2 (int): mid/long term window
314
+
315
+ Returns
316
+ -------
317
+ None
318
+ """
319
+
245
320
  df = self.df
246
321
  begin_date = self.today - relativedelta(days = self.n_days)
247
322
  begin_date_str = begin_date.strftime('%Y-%m-%d')
248
-
323
+
249
324
  ### getting rolling mean
250
325
  df["Close_roll_mean"] = (
251
326
  df.sort_values("Date")["Close"]
252
327
  .transform(lambda x: x.rolling(roll_mean_lags1, min_periods=1).mean())
253
328
  )
254
-
329
+
255
330
  df["Close_roll_mean_2"] = (
256
331
  df.sort_values("Date")["Close"]
257
332
  .transform(lambda x: x.rolling(roll_mean_lags2, min_periods=1).mean())
258
333
  )
259
-
334
+
260
335
  ### getting rolling stdv
261
336
  df["Close_roll_std"] = (
262
337
  df.sort_values("Date")["Close"]
@@ -273,7 +348,7 @@ class stock_eda_panel(object):
273
348
  ))
274
349
 
275
350
  fig.add_trace(go.Scatter(x=df['Date'], y=df.Close, marker_color = 'blue', name='Price'),row=1, col=1)
276
-
351
+
277
352
  fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean, marker_color = 'black', name='roll mean' ),row=1, col=1)
278
353
  fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean_2, marker_color = 'grey', name='roll mean 2' ),row=1, col=1)
279
354
  fig.add_trace(go.Scatter(x=df['Date'], y=df.lower, marker_color = 'pink',legendgroup='bound', name='bound' ),row=1, col=1)
@@ -281,8 +356,21 @@ class stock_eda_panel(object):
281
356
 
282
357
  fig.update_layout(height=500, width=1200, title_text=f"stock {self.stock_code} vizualization")
283
358
  fig.show()
284
-
359
+
285
360
  def seasonal_plot(self):
361
+
362
+ """
363
+ Display time series split by year
364
+
365
+ Parameters
366
+ ----------
367
+ None
368
+
369
+ Returns
370
+ -------
371
+ None
372
+ """
373
+
286
374
  df = self.df
287
375
  years = list(df['Date'].dt.year.unique())
288
376
  years.sort()
@@ -302,10 +390,24 @@ class stock_eda_panel(object):
302
390
 
303
391
  fig.update_layout(height=500, width=1400, title_text=f"stock {self.stock_code} seasonal vizualization")
304
392
  fig.show()
305
-
393
+
306
394
  def plot_price_signal(self, feature, feature_2 = '', opacity = 0.3):
307
-
308
- signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
395
+
396
+ """
397
+ Display bottom and roof signals over the closing prices
398
+
399
+ Parameters
400
+ ----------
401
+ feature (str): name of the main feature to plot
402
+ feature_2 (str): name of the alternative feature to plot
403
+ opacity (float): opacity degree of the signal points
404
+
405
+ Returns
406
+ -------
407
+ None
408
+ """
409
+
410
+ signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
309
411
  signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
310
412
  norm_list = [f'norm_{feature}', f'z_{feature}', feature]
311
413
 
@@ -315,14 +417,14 @@ class stock_eda_panel(object):
315
417
  if norm_feat in self.df.columns:
316
418
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[norm_feat],legendgroup="up", mode='lines',name = norm_feat, marker_color = 'blue'),col = 1, row = 1)
317
419
  break
318
-
319
-
420
+
421
+
320
422
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df['Close'], mode='lines',name = 'history', marker_color = 'grey'),col = 1, row = 2)
321
-
423
+
322
424
  if feature == 'MA_spread':
323
425
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma1_column],legendgroup="ma", mode='lines',name = self.ma1_column, marker_color = 'black'),col = 1, row = 2)
324
426
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma2_column],legendgroup="ma", mode='lines',name = self.ma2_column, marker_color = 'grey'),col = 1, row = 2)
325
-
427
+
326
428
  for norm_feat in norm_list:
327
429
  if norm_feat in self.df.columns:
328
430
  fig.add_trace(go.Scatter(x=self.df['Date'], y=np.where(self.df[norm_feat] > 0, self.df['Close'], np.nan),legendgroup="up", mode='markers',name = 'up', marker_color = 'green',opacity = opacity),col = 1, row = 2)
@@ -338,8 +440,25 @@ class stock_eda_panel(object):
338
440
 
339
441
  fig.update_layout(height=900, width=1200)
340
442
  fig.show()
341
-
443
+
342
444
  def volatility_analysis(self, lags, trad_days, window_log_return, plot = False, save_features = False):
445
+
446
+ """
447
+ this method performs log-return and volatility analysis of the closing prices
448
+
449
+ Parameters
450
+ ----------
451
+ lags (int): number of lags to apply to the closing prices
452
+ trad_days (int): number of trading days to annualize returns or volatility
453
+ window_log_return (int): window for rolling returns
454
+ plot (boolean): True to display plot
455
+ save_features (boolean): True to save feature configuration and feature names
456
+
457
+ Returns
458
+ -------
459
+ None
460
+ """
461
+
343
462
  df = self.df
344
463
  df['log_return'] = np.log(df.Close/df.Close.shift(lags))
345
464
  df['sqr_log_return'] = np.square(df.log_return)
@@ -349,13 +468,13 @@ class stock_eda_panel(object):
349
468
  df.sort_values("Date")["log_return"]
350
469
  .transform(lambda x: x.rolling(window_log_return, min_periods=1).mean())
351
470
  )
352
-
471
+
353
472
  if save_features:
354
473
  self.features.append('volatility_log_return')
355
474
  self.features.append('roll_mean_log_return')
356
475
  self.features.append('log_return')
357
476
  self.settings_volatility = {'lags':lags, 'trad_days':trad_days, 'window_log_return':window_log_return}
358
-
477
+
359
478
  if plot:
360
479
  fig = make_subplots(rows=3, cols=1,vertical_spacing = 0.02,shared_xaxes=True,
361
480
  specs=[
@@ -395,10 +514,25 @@ class stock_eda_panel(object):
395
514
 
396
515
  self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
397
516
  self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
398
-
399
-
517
+
400
518
  def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):
401
519
 
520
+ """
521
+ displays correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
522
+
523
+ Parameters
524
+ ----------
525
+ feature (str): feature name to apply lags
526
+ lag_list (list): list of lags, each lag as integer
527
+ column_target (str): target to correlate against, e.g. return or mean return
528
+ posterior_lag (int): for the target, posterior window shift to calculate a window return
529
+ test_size (int): size of the test data. The remainder is used as training data. This parameter is meant to avoid overfitting and leakage
530
+
531
+ Returns
532
+ -------
533
+ None
534
+ """
535
+
402
536
  results = dict()
403
537
  df = self.df.iloc[:-test_size,:][['Date','Close','roll_mean_log_return','log_return',feature]].sort_values('Date').copy()
404
538
  for i,lag in enumerate(lag_list):
@@ -413,7 +547,7 @@ class stock_eda_panel(object):
413
547
  'lag':lag,
414
548
  'pearsonr_log_return':r_log[0],
415
549
  'spearman_log_return': sp_log[0],
416
- }
550
+ }
417
551
  del df
418
552
  results_df = pd.DataFrame(results).T
419
553
 
@@ -426,10 +560,23 @@ class stock_eda_panel(object):
426
560
  plt.legend()
427
561
  plt.axhline(y=0, color='grey', linestyle='--')
428
562
  plt.show()
429
-
430
-
563
+
431
564
  def outlier_plot(self, zlim, plot = False, save_features = False):
432
-
565
+
566
+ """
567
+ perform outlier analysis of the log returns. It also performs a normality test of the returns
568
+
569
+ Parameters
570
+ ----------
571
+ zlim (float): alpha or z threshold for normalized returns
572
+ plot (boolean): True to display plot
573
+ save_features (boolean): True to save feature configuration and feature names
574
+
575
+ Returns
576
+ -------
577
+ None
578
+ """
579
+
433
580
  mean = self.df.log_return.mean()
434
581
  std = self.df.log_return.std()
435
582
  self.df['z_log_return'] = (self.df.log_return - mean)/std
@@ -440,7 +587,7 @@ class stock_eda_panel(object):
440
587
  self.df['up_outlier'] = zlim*self.df['z_std_log_return'] + mean_
441
588
  self.df['low_outlier'] = -zlim*self.df['z_std_log_return'] + mean_
442
589
 
443
- self.df['signal_low_osutlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
590
+ self.df['signal_low_outlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
444
591
  self.df['signal_up_outlier'] = np.where( (self.df['z_log_return'] > self.df['up_outlier'] ), 1, 0)
445
592
  if save_features:
446
593
  self.signals.append('signal_low_outlier')
@@ -451,7 +598,7 @@ class stock_eda_panel(object):
451
598
  sigma = self.df['z_log_return'].std()
452
599
  x = np.linspace(self.df['z_log_return'].min(),self.df['z_log_return'].max(), 15000)
453
600
  y = stats.norm.pdf(x, loc = mu, scale = sigma)
454
-
601
+
455
602
  fig, axs = plt.subplots(2, 1,figsize=(15,8))
456
603
 
457
604
  axs[0].hist(self.df['z_log_return'],density = True,bins = 100 , label = 'Returns distribution')
@@ -460,7 +607,7 @@ class stock_eda_panel(object):
460
607
  axs[0].axvline(l2, color='green', linestyle='--')
461
608
  axs[0].axvline(-l2, color='green', linestyle='--')
462
609
  axs[0].plot(x,y, linewidth = 3, color = 'r', label = 'Normal Dist Curve')
463
-
610
+
464
611
  axs[1].plot(self.df['Date'],self.df['z_log_return'])
465
612
  axs[1].plot(self.df['Date'],self.df['low_outlier'], linestyle='--')
466
613
  axs[1].plot(self.df['Date'],self.df['up_outlier'], linestyle='--')
@@ -469,18 +616,31 @@ class stock_eda_panel(object):
469
616
  plt.show()
470
617
 
471
618
  z_stat, p_stat = stats.normaltest(self.df['z_log_return'].dropna())
472
- p_stat = round(p_stat, 7)
619
+ p_stat = round(p_stat, 7)
473
620
  print('---------------------- returns normality tests ----------------------------')
474
621
  if p_stat < 0.05:
475
622
  print(f'pvalue: {p_stat} then, returns do not follow a normal distribution')
476
623
  else:
477
624
  print(f'pvalue: {p_stat} then, returns follow a normal distribution')
478
-
625
+
479
626
  def analysis_roll_mean_log_returns(self, lags, plot = False):
480
627
 
628
+ """
629
+ perform analysis of lags of the mean rolling log return
630
+
631
+ Parameters
632
+ ----------
633
+ lags (int): lags to apply to the roll log return
634
+ plot (boolean): True to display plot
635
+
636
+ Returns
637
+ -------
638
+ None
639
+ """
640
+
481
641
  self.df['lag'] = self.df.roll_mean_log_return.shift(lags)
482
642
  self.df['Diff'] = self.df['roll_mean_log_return'] - self.df['lag']
483
-
643
+
484
644
  if plot:
485
645
 
486
646
  fig, axs = plt.subplots(1, 3,figsize=(19,4))
@@ -493,7 +653,20 @@ class stock_eda_panel(object):
493
653
  plt.show()
494
654
 
495
655
  def compute_clip_bands(self,feature_name,threshold):
496
-
656
+
657
+ """
658
+ compute outlier detection for a given signal. Note that this follows a mean-reversion procedure and the feature has to be stationary. The resulting bottom and roof signals are attached to the dataframe
659
+
660
+ Parameters
661
+ ----------
662
+ feature_name (str): feature name
663
+ threshold (float): alpha or z threshold for normalized returns
664
+
665
+ Returns
666
+ -------
667
+ None
668
+ """
669
+
497
670
  self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/self.df[feature_name].std()
498
671
  mean_ = self.df[f'norm_{feature_name}'].mean()
499
672
 
@@ -506,84 +679,140 @@ class stock_eda_panel(object):
506
679
  self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
507
680
  self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
508
681
 
682
+ def extract_sec_data(self, symbol, base_columns, rename_columns=None):
683
+ """
684
+ extract new asset data and merge it into the main asset data
685
+
686
+ Parameters
687
+ ----------
688
+ symbol (str): symbol to extract data
689
+ base_columns (list): list of columns to persist
690
+ rename_columns (dict): map of the new column names using pd.DataFrame.rename()
691
+
692
+ Returns
693
+ -------
694
+ None
695
+ """
696
+ begin_date = self.today - relativedelta(days = self.n_days)
697
+ begin_date_str = begin_date.strftime('%Y-%m-%d')
698
+
699
+ stock = yf.Ticker(symbol)
700
+ df = stock.history(period=self.data_window)
701
+ df = df.sort_values('Date')
702
+ df.reset_index(inplace=True)
703
+ df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
704
+ df['Date'] = pd.to_datetime(df['Date'])
705
+ df = df[df.Date >= begin_date_str ]
706
+ df = df[base_columns]
707
+ if rename_columns:
708
+ df = df.rename(columns=rename_columns)
709
+ right_df = df.copy()
710
+
711
+ dates_vector = self.df.Date.to_frame()
712
+ right_df = dates_vector.merge(right_df, on ='Date',how = 'left')
713
+ right_df = right_df.fillna(method = 'bfill')
714
+ right_df = right_df.fillna(method = 'ffill')
715
+
716
+ self.df = self.df.merge(right_df, on ='Date',how = 'left')
717
+ self.df = self.df.sort_values("Date")
718
+ del right_df
719
+ gc.collect()
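A short usage sketch for the new extract_sec_data helper; the symbol and renamed column are illustrative and not prescribed by the package. Note that base_columns must include 'Date', since the merge is performed on it.

    # pull S&P 500 closes, align them to the asset's dates and merge them in
    panel.extract_sec_data(
        symbol='^GSPC',
        base_columns=['Date', 'Close'],
        rename_columns={'Close': 'market_close'},  # avoid clashing with the asset's own Close
    )
    panel.lag_log_return(lags=1, feature='market_close',
                         feature_name='market_log_return')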
720
+
721
+ def lag_log_return(self, lags, feature, feature_name=False):
722
+ """
723
+ compute log return given some lags
724
+
725
+ Parameters
726
+ ----------
727
+ lags (int): lag to apply log return
728
+ feature (str): feature to apply log return
729
+ feature_name (str): name for the resulting feature
730
+
731
+ Returns
732
+ -------
733
+ None
734
+ """
735
+
736
+ feature_name = feature_name if feature_name else f"{feature}_log_return"
737
+ self.df[feature_name] = np.log(self.df[feature]/self.df[feature].shift(lags))
738
+
739
+ def produce_log_volatility(self, trad_days, feature, feature_name=False):
740
+ """
741
+ compute annualized rolling volatility for a given feature
742
+
743
+ Parameters
744
+ ----------
745
+ trad_days (int): window size used to calculate the rolling standard deviation
746
+ feature (str): feature to apply computation
747
+ feature_name (str): resulting feature name
748
+
749
+ Returns
750
+ -------
751
+ None
752
+ """
753
+ feature_name = feature_name if feature_name else f"{feature}_log_return_{trad_days}"
754
+ self.df[feature_name] = self.df.sort_values("Date")[feature].rolling(window = trad_days).std()*np.sqrt(252)
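produce_log_volatility annualizes a rolling standard deviation with a sqrt(252) factor; a one-line sketch building on the hypothetical market_log_return column from the previous example:

    # 21-day rolling, annualized volatility of the market log returns
    panel.produce_log_volatility(trad_days=21, feature='market_log_return',
                                 feature_name='market_vol_21d')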
755
+
509
756
  def signal_plotter(self, feature_name):
757
+
758
+ """
759
+ display analysis plot of a feature with high and low signals
760
+
761
+ Parameters
762
+ ----------
763
+ feature_name (str): feature name
764
+
765
+ Returns
766
+ -------
767
+ None
768
+ """
769
+
510
770
  fig, axs = plt.subplots(1, 3,figsize=(17,5))
511
-
771
+
512
772
  axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
513
773
  axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
514
774
  axs[0].plot(self.df[f'norm_{feature_name}'])
515
-
775
+
516
776
  plot_acf(self.df[feature_name].dropna(),lags=25,ax = axs[1])
517
777
  axs[1].set_title(f'acf {feature_name}')
518
-
778
+
519
779
  plot_pacf(self.df[feature_name].dropna(),lags=25,ax = axs[2])
520
780
  axs[2].set_title(f'pacf {feature_name}')
521
-
781
+
522
782
  fig.show()
523
783
 
524
784
  def log_features_standard(self, feature_name):
785
+ """
786
+ save resulting feature names in a standard structure
787
+
788
+ Parameters
789
+ ----------
790
+ feature_name (str): feature name
791
+
792
+ Returns
793
+ -------
794
+ None
795
+ """
525
796
  self.features.append(feature_name)
526
797
  self.signals.append(f'signal_up_{feature_name}')
527
798
  self.signals.append(f'signal_low_{feature_name}')
528
-
529
- #######################
530
- #### to be deprecated ####
531
- def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
532
-
533
- self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
534
- self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
535
-
536
- self.ma1_column = f'MA_{ma1}'
537
- self.ma2_column = f'MA_{ma2}'
538
- self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
539
-
540
- self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
541
- mean_ = self.df['norm_MA_spread'].mean()
542
- self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
543
-
544
- self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
545
- self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
546
-
547
- self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
548
- self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
549
-
550
- ### ploting purposes
551
- self.df[f"Roll_mean_{ma1}"] = (
552
- self.df.sort_values("Date")["Close"]
553
- .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
554
- )
555
- self.df[f"Roll_mean_{ma2}"] = (
556
- self.df.sort_values("Date")["Close"]
557
- .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
558
- )
559
-
560
-
561
- print('--------------------------------------------------------------------')
562
- if save_features:
563
- self.features.append('MA_spread')
564
- self.signals.append('signal_low_MA_spread')
565
- self.signals.append('signal_up_MA_spread')
566
- self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
567
-
568
- if plot:
569
-
570
- fig, axs = plt.subplots(1, 3,figsize=(21,4))
571
-
572
- axs[0].plot(self.df['Date'],self.df['norm_MA_spread'])
573
- axs[0].plot(self.df['Date'],self.df['upper_MA_spread'], linestyle='--')
574
- axs[0].plot(self.df['Date'],self.df['lower_MA_spread'], linestyle='--')
575
- axs[0].set_title('MA_spread series')
576
799
 
577
- plot_acf(self.df['MA_spread'].dropna(),lags=25, ax=axs[1])
578
- axs[1].set_title('acf MA_spread series')
579
-
580
- plot_pacf(self.df['MA_spread'].dropna(),lags=25, ax=axs[2])
581
- axs[2].set_title('acf MA_spread series')
582
- plt.show()
583
- ##################################################
584
-
585
800
  def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
586
-
801
+ """
802
+ compute relative moving-average features, one for the short term and another for the long/mid term
803
+
804
+ Parameters
805
+ ----------
806
+ ma1 (int): short term moving average window
807
+ ma2 (int): long/mid term moving average window
808
+ threshold (float): alpha or z threshold for the normalized feature
809
+ plot (boolean): True to display plot
810
+ save_features (boolean): True to save feature configuration and feature names
811
+
812
+ Returns
813
+ -------
814
+ None
815
+ """
587
816
  feature_name = 'rel_MA_spread'
588
817
 
589
818
  self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
@@ -605,16 +834,27 @@ class stock_eda_panel(object):
605
834
  .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
606
835
  )
607
836
 
608
- print('--------------------------------------------------------------------')
609
837
  if save_features:
610
838
  self.log_features_standard(feature_name)
611
- self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
839
+ self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
612
840
 
613
841
  if plot:
614
-
615
842
  self.signal_plotter(feature_name)
616
-
843
+
617
844
  def pair_feature(self, pair_symbol, plot = False):
845
+ """
846
+ initialize pair feature data extraction and analysis
847
+
848
+ Parameters
849
+ ----------
850
+ pair_symbol (str): symbol of the pair asset to extract
851
+ plot (boolean): True to display plot
852
+
853
+ Returns
854
+ -------
855
+ None
856
+ """
857
+
618
858
  self.pair_symbol = pair_symbol
619
859
  begin_date = self.today - relativedelta(days = self.n_days)
620
860
  begin_date_str = begin_date.strftime('%Y-%m-%d')
@@ -627,7 +867,7 @@ class stock_eda_panel(object):
627
867
  df['Date'] = pd.to_datetime(df['Date'])
628
868
  df = df[df.Date >= begin_date_str ]
629
869
  self.pair_df = df
630
-
870
+
631
871
  #### converting the same index ####
632
872
  dates_vector = self.df.Date.to_frame()
633
873
  self.pair_df = dates_vector.merge(self.pair_df, on ='Date',how = 'left')
@@ -653,8 +893,40 @@ class stock_eda_panel(object):
653
893
  plt.plot(self.df['Date'],asset_2_values,label = asset_2)
654
894
  plt.legend()
655
895
  plt.show()
656
-
896
+
897
+ def smooth_logrets_interaction_term(self, feature_interact_with, resulting_feature_name="persisted_clip_diff_smooths", rollmean_window = 5, ext_threhold=0.015, persist_days = 3, save_features=False):
898
+ """
899
+ create an interaction term that compares the distance between the asset rolling-window mean return and the market rolling-window mean return.
900
+ then keep the outliers (high absolute values) and let this value persist for some days.
901
+ Goal: persist big differences between market and asset returns.
902
+
903
+ feature_interact_with: name of the market return column
904
+ rollmean_window: rolling window or smoothing number of days
905
+ ext_threhold: absolute-difference threshold above which the value is kept
906
+ persist_days: number of days to persist the signal
907
+ """
908
+ self.df["smooth_log_return"] = self.df['log_return'].rolling(rollmean_window).mean().values
909
+ self.df["smooth_market_log_return"] = self.df[feature_interact_with].rolling(rollmean_window).mean().values
910
+ self.df["diff_smooths"] = self.df["smooth_market_log_return"]-self.df["smooth_log_return"]
911
+ self.df["clip_diff_smooths"] = np.where(np.abs(self.df["diff_smooths"]) > ext_threhold, self.df["diff_smooths"] , 0)
912
+ self.df[resulting_feature_name] = self.df['clip_diff_smooths'].rolling(persist_days).mean().values
913
+ self.df = self.df.drop(columns=["smooth_log_return","smooth_market_log_return","diff_smooths","clip_diff_smooths"])
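A hedged usage sketch of smooth_logrets_interaction_term; it assumes panel.df already has a 'log_return' column (from volatility_analysis) and a hypothetical market return column such as the one built in the earlier extract_sec_data sketch:

    panel.smooth_logrets_interaction_term(
        feature_interact_with='market_log_return',  # hypothetical market return column
        rollmean_window=5,                          # 5-day smoothing of both return series
        ext_threhold=0.015,                         # keep differences larger than 1.5%
        persist_days=3,                             # carry the signal forward for 3 days
    )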
914
+
657
915
  def calculate_cointegration(self,series_1, series_2):
916
+ """
917
+ calculate cointegration score for two time series
918
+
919
+ Parameters
920
+ ----------
921
+ series_1 (pd.series): time series
922
+ series_2 (pd.series): time series
923
+
924
+ Returns
925
+ -------
926
+ coint_flag (boolean): 1 if the p_value is lower than 0.05 and the cointegration t-statistic is below the critical value
927
+ hedge_value (float): beta from the regression model
928
+ """
929
+
658
930
  coint_flag = 0
659
931
  coint_res = coint(series_1, series_2)
660
932
  coint_t = coint_res[0]
@@ -666,9 +938,22 @@ class stock_eda_panel(object):
666
938
  coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0
667
939
 
668
940
  return coint_flag, hedge_value
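calculate_cointegration wraps statsmodels' Engle-Granger coint test plus a regression-based hedge ratio; a small sketch of calling it directly, assuming both series are already aligned and free of NaNs (pair_df is available after pair_feature()):

    flag, hedge = panel.calculate_cointegration(
        panel.df['Close'],       # asset closing prices
        panel.pair_df['Close'],  # pair asset closing prices, aligned in pair_feature()
    )
    if flag:
        print(f'series appear cointegrated; hedge ratio ~ {hedge:.3f}')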
669
-
670
- def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
671
941
 
942
+ def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
943
+ """
944
+ display analysis of the pair feature and save results if needed
945
+
946
+ Parameters
947
+ ----------
948
+ window (int): window to apply to the rolling spread between pair and main asset
949
+ z_threshold (float): alpha or z threshold for the normalized feature
950
+ plot (boolean): True to display plot
951
+ save_features (boolean): True to save feature configuration and feature names
952
+
953
+ Returns
954
+ -------
955
+ None
956
+ """
672
957
  spread_series = pd.Series(self.df.pair_spread)
673
958
  mean = spread_series.rolling(center = False, window = window).mean()
674
959
  std = spread_series.rolling(center = False, window = window).std()
@@ -677,11 +962,11 @@ class stock_eda_panel(object):
677
962
  self.df['pair_z_score'] = z_score
678
963
  self.df['signal_low_pair_z_score'] = np.where(self.df['pair_z_score'] < -z_threshold, 1, 0)
679
964
  self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
680
-
965
+
681
966
  if save_features:
682
967
  self.log_features_standard('pair_z_score')
683
- self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
684
-
968
+ self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
969
+
685
970
  if plot:
686
971
  pvalue = round(adfuller(z_score.dropna().values)[1],4)
687
972
  print(f'p value of the rolling z-score is {pvalue}')
@@ -695,7 +980,7 @@ class stock_eda_panel(object):
695
980
  axs[0,0].axhline(y=0, color='blue', linestyle='-.')
696
981
  axs[0,0].plot(self.df.pair_z_score)
697
982
  axs[0,0].set_title('z score from the spread')
698
-
983
+
699
984
  axs[0,1].plot(self.df['Date'],self.df['pair_spread'])
700
985
  axs[0,1].plot(self.df['Date'],np.where(self.df['signal_low_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'red')
701
986
  axs[0,1].plot(self.df['Date'],np.where(self.df['signal_up_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'green')
@@ -704,44 +989,27 @@ class stock_eda_panel(object):
704
989
 
705
990
  plot_acf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,0])
706
991
  axs[1,0].set_title('acf pair_z_score')
707
-
992
+
708
993
  plot_pacf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,1])
709
994
  axs[1,1].set_title('pacf pair_z_score')
710
-
711
- plt.show()
712
-
713
- #######################
714
- #### to be deprecated ####
715
- def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
716
-
717
- # negative countiing and rolling countingng
718
- self.df['RetClose'] = self.df['Close'].pct_change()
719
- self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
720
- self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
721
-
722
- mean = self.df['roll_pos_counting'].mean()
723
- std = self.df['roll_pos_counting'].std()
724
- self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std
725
995
 
726
- self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
727
- self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
728
-
729
- if save_features:
730
- self.features.append('roll_pos_counting')
731
- self.signals.append('signal_up_roll_pos_counting')
732
- self.signals.append('signal_low_roll_pos_counting')
733
- self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
734
-
735
- if plot:
736
- fig = plt.figure(figsize = (10,4))
737
- plt.plot(self.df['Date'],self.df.norm_counting)
738
- plt.axhline(y=threshold, color='grey', linestyle='--')
739
- plt.axhline(y=-threshold, color='grey', linestyle='--')
740
996
  plt.show()
741
- #######################
742
-
997
+
743
998
  def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
744
-
999
+ """
1000
+ perform negative and positive return counting in a given rolling time window
1001
+
1002
+ Parameters
1003
+ ----------
1004
+ rolling_window (int): window to apply to positive and negative returns
1005
+ threshold (float): alpha or z thrsholds for the normalized feature
1006
+ plot (boolean): True to display plot
1007
+ save_features (boolean): True to save feature configuration and feature names
1008
+
1009
+ Returns
1010
+ -------
1011
+ None
1012
+ """
745
1013
  feature_name = 'bidirect_counting'
746
1014
  # negative countiing and rolling countingng
747
1015
  self.df['RetClose'] = self.df['Close'].pct_change()
@@ -757,7 +1025,7 @@ class stock_eda_panel(object):
757
1025
 
758
1026
  if save_features:
759
1027
  self.log_features_standard(feature_name)
760
- self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
1028
+ self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
761
1029
 
762
1030
  if plot:
763
1031
  fig = plt.figure(figsize = (10,4))
@@ -766,47 +1034,21 @@ class stock_eda_panel(object):
766
1034
  plt.plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
767
1035
  plt.show()
768
1036
 
769
- #######################
770
- #### to be deprecated ####
771
- def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
772
-
773
- self.df["Range"] = self.df["High"] / self.df["Low"] - 1
774
- self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
775
- self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
776
- self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
777
-
778
- mean_ = self.df['norm_dist_range'].mean()
779
- self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
780
-
781
- self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
782
- self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
783
-
784
- self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
785
- self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
786
-
787
- if save_features:
788
- self.features.append('dist_range')
789
- self.signals.append('signal_up_dist_range')
790
- self.signals.append('signal_low_dist_range')
791
- self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
792
-
793
- if plot:
794
- fig, axs = plt.subplots(2, 2,figsize=(17,11))
795
-
796
- axs[0,0].plot(self.df['Range'])
797
- axs[0,0].set_title('range')
798
-
799
- axs[0,1].plot(self.df['Avg_range'])
800
- axs[0,1].set_title('Avg_range')
801
-
802
- axs[1,0].plot(self.df['up_bound_norm_dist_range'],color = 'grey', linestyle='--')
803
- axs[1,0].plot(self.df['low_bound_norm_dist_range'],color = 'grey', linestyle='--')
804
- axs[1,0].plot(self.df['norm_dist_range'])
805
- axs[1,0].set_title('norm_dist_range')
806
- #######################
807
-
808
1037
  def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
809
-
1038
+ """
1039
+ perform relative spread of opening and closing price
1040
+
1041
+ Parameters
1042
+ ----------
1043
+ window (int): window to apply to the feature
1044
+ threshold (float): alpha or z threshold for the normalized feature
1045
+ plot (boolean): True to display plot
1046
+ save_features (boolean): True to save feature configuration and feature names
1047
+
1048
+ Returns
1049
+ -------
1050
+ None
1051
+ """
810
1052
  feature_name = 'CO_Range'
811
1053
  self.df[feature_name] = self.df["Close"] / self.df["Open"]-1
812
1054
  self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/ self.df[feature_name].std()
@@ -822,7 +1064,7 @@ class stock_eda_panel(object):
822
1064
 
823
1065
  if save_features:
824
1066
  self.log_features_standard(feature_name)
825
- self.settings_relative_price_range = {'window':window, 'threshold':threshold}
1067
+ self.settings_relative_price_range = {'window':window, 'threshold':threshold}
826
1068
 
827
1069
  if plot:
828
1070
  fig, axs = plt.subplots(1, 2,figsize=(14,5))
@@ -835,46 +1077,24 @@ class stock_eda_panel(object):
835
1077
  axs[1].plot(self.df[f'norm_{feature_name}'])
836
1078
  axs[1].set_title(f'norm_{feature_name}')
837
1079
 
838
- #######################
839
- #### to be deprecated ####
840
- def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
841
-
842
- rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
843
- self.df['RSI'] = rsi
844
- self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
845
-
846
- mean = self.df['RSI_ret'].mean()
847
- std = self.df['RSI_ret'].std()
848
- self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
849
- self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
850
- self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
851
-
852
- if save_features:
853
- self.features.append('RSI_ret')
854
- self.signals.append('signal_up_RSI_ret')
855
- self.signals.append('signal_low_RSI_ret')
856
- self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
857
-
858
- if plot:
859
- fig, axs = plt.subplots(1, 3,figsize=(17,5))
860
-
861
- axs[0].plot(self.df.norm_RSI_ret)
862
- axs[0].axhline(y=threshold, color='grey', linestyle='--')
863
- axs[0].axhline(y=-threshold, color='grey', linestyle='--')
864
-
865
- plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
866
- axs[1].set_title('acf RSI_ret')
867
-
868
- plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
869
- axs[2].set_title('pacf RSI_ret')
870
-
871
- fig.show()
872
- #######################
873
-
874
1080
  def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
1081
+ """
1082
+ perform relative strength index
1083
+
1084
+ Parameters
1085
+ ----------
1086
+ window (int): window to apply to the feature
1087
+ threshold (float): alpha or z thrsholds for the normalized feature
1088
+ plot (boolean): True to display plot
1089
+ save_features (boolean): True to save feature configuration and feature names
1090
+
1091
+ Returns
1092
+ -------
1093
+ None
1094
+ """
875
1095
  feature_name = 'RSI'
876
1096
  rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
877
- self.df[feature_name] = rsi
1097
+ self.df[feature_name] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
878
1098
  self.compute_clip_bands(feature_name,threshold)
879
1099
 
880
1100
  if save_features:
@@ -883,54 +1103,22 @@ class stock_eda_panel(object):
883
1103
 
884
1104
  if plot:
885
1105
  self.signal_plotter(feature_name)
886
-
887
- #######################
888
- #### to be deprecated ####
889
- def days_features(self, window_day, limit, plot = False, save_features = False):
890
-
891
- self.df['dow'] = self.df.Date.dt.dayofweek
892
- self.df['dow'] = self.df['dow'].astype('str')
893
-
894
- self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
895
-
896
- mean = self.df['target_mean_input'].mean()
897
- std = self.df['target_mean_input'].std()
898
-
899
- self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
900
- mean_ = self.df['norm_dow_input'].mean()
901
- self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()
902
-
903
- self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
904
- self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
905
-
906
- self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
907
- self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
908
1106
 
909
- if save_features:
910
-
911
- self.features.append('target_mean_input')
912
- self.signals.append('signal_up_target_mean_input')
913
- self.signals.append('signal_low_target_mean_input')
914
- self.settings_days_features = {'window_day':window_day, 'limit':limit}
915
-
916
- if plot:
917
- fig, axs = plt.subplots(1, 3,figsize=(17,5))
918
-
919
- axs[0].plot(self.df['norm_dow_input'])
920
- axs[0].plot(self.df['up_dow_input'], linestyle='--')
921
- axs[0].plot(self.df['low_dow_input'], linestyle='--')
922
-
923
- plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
924
- axs[1].set_title('acf day feature')
925
-
926
- plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
927
- axs[2].set_title('pacf day feature')
928
-
929
- fig.show()
930
- #######################
931
-
932
1107
  def days_features_bands(self, window, threshold, plot = False, save_features = False):
933
-
1108
+ """
1109
+ compute rolling mean returns for each day of the week over a given window
1110
+
1111
+ Parameters
1112
+ ----------
1113
+ window (int): window to apply to the feature
1114
+ threshold (float): alpha or z threshold for the normalized feature
1115
+ plot (boolean): True to display plot
1116
+ save_features (boolean): True to save feature configuration and feature names
1117
+
1118
+ Returns
1119
+ -------
1120
+ None
1121
+ """
934
1122
  self.df['dow'] = self.df.Date.dt.dayofweek
935
1123
  self.df['dow'] = self.df['dow'].astype('str')
936
1124
 
@@ -947,65 +1135,22 @@ class stock_eda_panel(object):
947
1135
 
948
1136
  if plot:
949
1137
  self.signal_plotter(feature_name)
950
-
951
- #######################
952
- #### to be deprecated ####
953
- def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
954
-
955
- self.df['log_Volume'] = np.log(self.df['Volume'])
956
- self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
957
-
958
- self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
959
- mean_ = self.df['norm_ret_log_Volume'].mean()
960
- self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
961
1138
 
962
- self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
963
- self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
964
-
965
- self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
966
- self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
967
-
968
- if save_features:
969
- self.features.append('ret_log_Volume')
970
- self.signals.append('signal_up_ret_log_Volume')
971
- self.signals.append('signal_low_ret_log_Volume')
972
- self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
973
- if plot:
974
- fig, axs = plt.subplots(3, 2,figsize=(11,13))
975
- axs[0,0].plot(self.df.Date, self.df.Volume)
976
- axs[0,0].set_title('Volume')
977
- axs[0,1].plot(self.df.Date, self.df.log_Volume)
978
- axs[0,1].set_title('log Volume')
979
-
980
- plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
981
- axs[1,0].set_title('acf log_Volume')
982
- plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
983
- axs[1,1].set_title('pacf log_Volume')
984
-
985
- plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
986
- axs[2,0].set_title('acf ret_log_Volume')
987
- plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
988
- axs[2,1].set_title('pacf ret_log_Volume')
989
-
990
- plt.show()
991
-
992
- print('--------------------------------------------------------------')
993
-
994
- fig, axs = plt.subplots(1, 2,figsize=(10,4))
995
-
996
- axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
997
- axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
998
- axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
999
- axs[0].set_title('norm_ret_log_Volume')
1000
-
1001
- axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
1002
- axs[1].set_title('std_norm_ret_log_Volume')
1003
-
1004
- plt.show()
1005
- #######################
1006
-
1007
1139
  def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
1008
-
1140
+ """
1141
+ compute a smoothed feature of trading volumes
1142
+
1143
+ Parameters
1144
+ ----------
1145
+ window (int): window to apply to the feature
1146
+ threshold (float): alpha or z threshold for the normalized feature
1147
+ plot (boolean): True to display plot
1148
+ save_features (boolean): True to save feature configuration and feature names
1149
+
1150
+ Returns
1151
+ -------
1152
+ None
1153
+ """
1009
1154
  feature_name = 'smooth_Volume'
1010
1155
  self.df[feature_name] = np.log(self.df['Volume'])
1011
1156
  # self.df[feature_name] = self.df['log_Volume'].rolling(window).mean()
@@ -1039,7 +1184,7 @@ class stock_eda_panel(object):
1039
1184
 
1040
1185
  fig, axs = plt.subplots(1,2,figsize=(10,4))
1041
1186
 
1042
- axs[0].plot(self.df[f'{feature_name}'])
1187
+ axs[0].plot(self.df[f'{feature_name}'])
1043
1188
  axs[0].set_title(f'{feature_name}')
1044
1189
 
1045
1190
  axs[1].plot(self.df[f'z_{feature_name}'], linestyle='--')
@@ -1048,9 +1193,23 @@ class stock_eda_panel(object):
1048
1193
  plt.show()
1049
1194
 
1050
1195
  def roc_feature(self, window, threshold, plot = False, save_features = False):
1196
+ """
1197
+ compute the price rate of change (ROC) feature
1198
+
1199
+ Parameters
1200
+ ----------
1201
+ window (int): window to apply to the feature
1202
+ threshold (float): alpha or z threshold for the normalized feature
1203
+ plot (boolean): True to display plot
1204
+ save_features (boolean): True to save feature configuration and feature names
1205
+
1206
+ Returns
1207
+ -------
1208
+ None
1209
+ """
1051
1210
  feature_name = 'ROC'
1052
1211
  roc = ROCIndicator(close = self.df['Close'], window = window).roc()
1053
- self.df[feature_name] = roc
1212
+ self.df[feature_name] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1054
1213
  self.compute_clip_bands(feature_name,threshold)
1055
1214
 
1056
1215
  if save_features:
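Note that in the new version every ta indicator is sanitized the same way before the clip bands are computed: infinities become 0 and missing values are forward-filled. A small standalone illustration of that cleanup (not the package API):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.2, np.inf, np.nan, -np.inf, 0.8])
    clean = s.replace([np.inf, -np.inf], 0).fillna(method="ffill")   # -> 1.2, 0.0, 0.0, 0.0, 0.8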
@@ -1058,11 +1217,27 @@ class stock_eda_panel(object):
1058
1217
  self.settings_roc_feature = {'window':window, 'threshold':threshold}
1059
1218
  if plot:
1060
1219
  self.signal_plotter(feature_name)
1061
-
1220
+
1062
1221
  def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
1222
+ """
1223
+ compute the stochastic RSI oscillator feature
1224
+
1225
+ Parameters
1226
+ ----------
1227
+ window (int): window to apply to the feature
1228
+ smooth1 (int): smoothing parameter 1
1229
+ smooth2 (int): smoothing parameter 2
1230
+ threshold (float): alpha or z threshold for the normalized feature
1231
+ plot (boolean): True to display plot
1232
+ save_features (boolean): True to save feature configuration and feature names
1233
+
1234
+ Returns
1235
+ -------
1236
+ None
1237
+ """
1063
1238
  feature_name = 'STOCH'
1064
1239
  stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
1065
- self.df[feature_name] = stoch
1240
+ self.df[feature_name] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1066
1241
  self.compute_clip_bands(feature_name,threshold)
1067
1242
 
1068
1243
  if save_features:
@@ -1072,9 +1247,24 @@ class stock_eda_panel(object):
1072
1247
  self.signal_plotter(feature_name)
1073
1248
 
1074
1249
  def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
1250
+ """
1251
+ compute the stochastic oscillator feature
1252
+
1253
+ Parameters
1254
+ ----------
1255
+ window (int): window to apply to the feature
1256
+ smooth (int): smoothing parameter
1257
+ threshold (float): alpha or z threshold for the normalized feature
1258
+ plot (boolean): True to display plot
1259
+ save_features (boolean): True to save feature configuration and feature names
1260
+
1261
+ Returns
1262
+ -------
1263
+ None
1264
+ """
1075
1265
  feature_name = 'STOCHOSC'
1076
1266
  stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
1077
- self.df[feature_name] = stochast
1267
+ self.df[feature_name] = stochast.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1078
1268
  self.compute_clip_bands(feature_name,threshold)
1079
1269
 
1080
1270
  if save_features:
@@ -1084,9 +1274,23 @@ class stock_eda_panel(object):
1084
1274
  self.signal_plotter(feature_name)
1085
1275
 
1086
1276
  def william_feature(self, lbp, threshold, plot = False, save_features = False):
1277
+ """
1278
+ compute the fast stochastic oscillator (Williams %R) indicator
1279
+
1280
+ Parameters
1281
+ ----------
1282
+ lbp (int): look back parameter
1283
+ threshold (float): alpha or z threshold for the normalized feature
1284
+ plot (boolean): True to display plot
1285
+ save_features (boolean): True to save feature configuration and feature names
1286
+
1287
+ Returns
1288
+ -------
1289
+ None
1290
+ """
1087
1291
  feature_name = 'WILL'
1088
- will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
1089
- self.df[feature_name] = will
1292
+ will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
1293
+ self.df[feature_name] = will.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1090
1294
  self.compute_clip_bands(feature_name,threshold)
1091
1295
 
1092
1296
  if save_features:
@@ -1096,9 +1300,23 @@ class stock_eda_panel(object):
1096
1300
  self.signal_plotter(feature_name)
1097
1301
 
1098
1302
  def vortex_feature(self, window, threshold, plot = False, save_features = False):
1303
+ """
1304
+ compute the vortex indicator feature
1305
+
1306
+ Parameters
1307
+ ----------
1308
+ window (int): window to apply to the feature
1309
+ threshold (float): alpha or z threshold for the normalized feature
1310
+ plot (boolean): True to display plot
1311
+ save_features (boolean): True to save feature configuration and feature names
1312
+
1313
+ Returns
1314
+ -------
1315
+ None
1316
+ """
1099
1317
  feature_name = 'VORTEX'
1100
1318
  vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
1101
- self.df[feature_name] = vortex
1319
+ self.df[feature_name] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1102
1320
  self.compute_clip_bands(feature_name,threshold)
1103
1321
 
1104
1322
  if save_features:
@@ -1107,11 +1325,160 @@ class stock_eda_panel(object):
1107
1325
  if plot:
1108
1326
  self.signal_plotter(feature_name)
1109
1327
 
1110
- def pair_index_feature(self, pair_symbol, feature_label, window, threshold, plot = False, save_features = False):
1328
+ def minmax_pricefeature(self, type_func, window, distance = False, plot = False, save_features = False):
1329
+ """
1330
+ compute the relative price or time distance with respect to the min/max price within a given window
1331
+
1332
+ Parameters
1333
+ ----------
1334
+ type_func (str): either min or max
1335
+ window (int): window scope
1336
+ distance (boolean): if True, compute the time-distance feature; otherwise the relative-price feature
1337
+ save_features (boolean): True to save feature configuration and feature names
1338
+
1339
+ Returns
1340
+ -------
1341
+ None
1342
+ """
1343
+ if type_func == 'min':
1344
+ self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].min(axis = 1)
1345
+ elif type_func == 'max':
1346
+ self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].max(axis = 1)
1347
+
1348
+ init_shape = self.df.shape[0]
1349
+ df_date = self.df[['Date','Price_ref']].rename(columns = {'Date':'Date_ref'}).copy()
1350
+
1351
+ self.df = self.df.rename(columns = {'Price_ref':'Price_to_use'})
1352
+
1353
+ if type_func == 'min':
1354
+ self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).min()))
1355
+ elif type_func == 'max':
1356
+ self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).max()))
1357
+
1358
+
1359
+ self.df = self.df.merge(df_date, left_on = 'window_price', right_on = 'Price_ref', how = 'left')
1360
+ self.df['date_span'] = self.df['Date'] - self.df['Date_ref']
1361
+
1362
+ self.df['RN'] = self.df.sort_values(['date_span'], ascending=False).groupby(['Date']).cumcount() + 1
1363
+ self.df = self.df[self.df['RN'] == 1]
1364
+
1365
+ if distance:
1366
+ self.df[f'{type_func}_distance_to_price'] = pd.to_numeric(self.df['date_span'].dt.days, downcast='integer')
1367
+
1368
+ if not distance:
1369
+ if type_func == 'min':
1370
+ self.df[f'{type_func}_relprice'] = self.df['Price_to_use']/self.df['window_price']-1
1371
+
1372
+ if type_func == 'max':
1373
+ self.df[f'{type_func}_relprice'] = self.df['window_price']/self.df['Price_to_use']-1
1374
+
1375
+ self.df = self.df.drop(columns = ['RN', 'date_span', 'Price_to_use', 'window_price', 'Date_ref','Price_ref'])
1376
+
1377
+ end_shape = self.df.shape[0]
1378
+
1379
+ if init_shape != end_shape:
1380
+ raise Exception("shapes are not the same")
1381
+
1382
+ if save_features:
1383
+ if distance:
1384
+ self.features.append(f'{type_func}_distance_to_price')
1385
+ name_attr = f'{type_func}_distance'
1386
+ if not distance:
1387
+ self.features.append(f'{type_func}_relprice')
1388
+ name_attr = f'{type_func}_relprice'
1389
+
1390
+ setattr(self,f'settings_{name_attr}_pricefeature' , {'type_func': type_func, 'window': window, 'distance': distance})
1391
+
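As a rough illustration of the relative-price variant above (distance=False, type_func='max'): the feature measures how far the current price sits below the rolling maximum. A minimal pandas sketch with made-up prices:

    import numpy as np
    import pandas as pd

    close = pd.Series(100 + np.random.default_rng(1).normal(0, 1, 30).cumsum())
    window = 10
    rolling_max = close.rolling(window, min_periods=1).max()
    max_relprice = rolling_max / close - 1    # 0 when the current price is the window maximum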
1392
+ def expected_return(self, trad_days, feature, feature_name=False):
1393
+ """
1394
+ compute the expected return as the maximum forward return over the next trad_days steps
1395
+
1396
+ Parameters
1397
+ ----------
1398
+ trad_days (int): forward horizon (number of steps) used to compute expected returns
1399
+ feature (str): feature (column) to compute the expected return on
1400
+ feature_name (str): resulting feature name
1401
+
1402
+ Returns
1403
+ -------
1404
+ None
1405
+ """
1406
+ feature_name = feature_name if feature_name else f"{feature}_log_return_{trad_days}"
1407
+ tmp_names = list()
1408
+ for ind in range(1,trad_days+1):
1409
+ tmp_name = f"expected_{ind}"
1410
+ self.df[tmp_name] = self.df[feature].shift(-ind)/self.df[feature]-1
1411
+ tmp_names.append(tmp_name)
1412
+ self.df[feature_name] = self.df[tmp_names].max(axis=1)
1413
+ self.df = self.df.drop(columns = tmp_names)
1414
+
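A minimal sketch of the construction above: the expected return is the best (maximum) simple forward return achievable within the next trad_days steps (illustrative data only):

    import pandas as pd

    close = pd.Series([100.0, 102.0, 101.0, 105.0, 104.0, 103.0])
    trad_days = 3
    fwd = pd.concat([close.shift(-i) / close - 1 for i in range(1, trad_days + 1)], axis=1)
    expected = fwd.max(axis=1)    # max forward return per row; NaN at the very end of the series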
1415
+ def rolling_feature(self, feature, window, function):
1416
+ """
1417
+ apply a rolling (non-expanding) window operation to a given feature
1418
+
1419
+ Parameters
1420
+ ----------
1421
+ feature (str): feature to apply the window operation to
1422
+ window (int): window size
1423
+ function (str): name of the pandas rolling method to apply, e.g. 'min', 'max', 'mean'
1424
+
1425
+ Returns
1426
+ -------
1427
+ None
1428
+ """
1429
+ feature_name = f"{feature}_{window}_{function}"
1430
+ self.df[feature_name] = getattr(self.df.sort_values("Date")[feature].rolling(window), function)()
1431
+
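The getattr call above just dispatches to a pandas rolling method by name, so function must be a valid Rolling method such as 'mean', 'min', 'max' or 'std'. A minimal illustration (not the package API):

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0])
    window, function = 2, "mean"
    out = getattr(s.rolling(window), function)()   # equivalent to s.rolling(2).mean()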
1432
+ def time_distance(self, feature_base,feature_window, result_feature_name, max_window=None):
1433
+ """
1434
+ compute the time distance (in days) to a given window feature
1435
+
1436
+ Parameters
1437
+ ----------
1438
+ feature_base (str): name of the underlying feature
1439
+ feature_window (str): name of the window feature
1440
+ result_feature_name (str): resulting feature name
1441
+ max_window (int): cap (clip) the time-distance feature at this value
1442
+
1443
+ Returns
1444
+ -------
1445
+ None
1446
+ """
1447
+ self.df["Date_pivot"] = np.nan
1448
+ self.df["Date_pivot"] = self.df["Date_pivot"].case_when([
1449
+ (self.df[feature_base] == self.df[feature_window], self.df["Date"]),
1450
+
1451
+ ])
1452
+ self.df["Date_pivot"] = self.df.sort_values("Date")["Date_pivot"].fillna(method="ffill")
1453
+ self.df[result_feature_name] = self.df["Date"] - self.df["Date_pivot"]
1454
+ self.df[result_feature_name] = self.df[result_feature_name].dt.days
1455
+ if max_window:
1456
+ self.df[result_feature_name] = self.df[result_feature_name].clip(0,max_window)
1457
+ self.df = self.df.drop(columns = ["Date_pivot"])
1458
+
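Series.case_when used above is only available in pandas 2.2+ (an assumption about the intended dependency); the same pivot-date logic can be written with where/ffill on older versions. A minimal sketch of the days-since-pivot idea with made-up data:

    import pandas as pd

    dates = pd.Series(pd.date_range("2024-01-01", periods=6, freq="D"))
    base = pd.Series([1.0, 3.0, 2.0, 5.0, 4.0, 4.5])
    window_feat = base.cummax()                        # stand-in for a precomputed window feature
    pivot = dates.where(base == window_feat).ffill()   # date of the most recent match
    days_since = (dates - pivot).dt.days               # time distance to that pivot, in days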
1459
+ def pair_index_feature(self, pair_symbol, feature_label,threshold, window = None,ta_method='ROC',param_set=False,plot = False, save_features = False):
1460
+ """
1461
+ compute a technical feature (ROC by default) on an additional asset and merge it into the main dataframe
1462
+
1463
+ Parameters
1464
+ ----------
1465
+ pair_symbol (str): symbol of the asset to extract the data
1466
+ feature_label (str): name of the resulting feature
1467
+ window (int): default window to apply to the feature (this parameter is going to be deprecated)
1468
+ threshold (float): alpha or z threshold for the normalized feature
1469
+ param_set (dict): parameter set in case ta_method is other than ROC
1470
+ ta_method (str): method to use, available RSI, ROC, VORTEX, STOCH
1471
+ plot (boolean): True to display plot
1472
+ save_features (boolean): True to save feature configuration and feature names
1473
+
1474
+ Returns
1475
+ -------
1476
+ None
1477
+ """
1111
1478
  self.pair_index = pair_symbol
1112
1479
  begin_date = self.today - relativedelta(days = self.n_days)
1113
1480
  begin_date_str = begin_date.strftime('%Y-%m-%d')
1114
-
1481
+
1115
1482
  if feature_label in self.df.columns:
1116
1483
  self.df = self.df.drop(columns = [feature_label])
1117
1484
 
@@ -1123,14 +1490,27 @@ class stock_eda_panel(object):
1123
1490
  df['Date'] = pd.to_datetime(df['Date'])
1124
1491
  df = df[df.Date >= begin_date_str ]
1125
1492
  self.pair_index_df = df
1126
-
1493
+
1127
1494
  #### converting the same index ####
1128
1495
  dates_vector = self.df.Date.to_frame()
1129
1496
  self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
1130
1497
  self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
1131
1498
  self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
1132
-
1133
- self.pair_index_df[feature_label] = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
1499
+
1500
+ if ta_method == 'ROC':
1501
+ window = window if window else param_set.get('window')
1502
+ roc = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
1503
+ self.pair_index_df[feature_label] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1504
+ elif ta_method == 'RSI':
1505
+ rsi = RSIIndicator(close = self.pair_index_df['Close'], **param_set).rsi()
1506
+ self.pair_index_df[feature_label] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1507
+ elif ta_method == 'VORTEX':
1508
+ vortex = VortexIndicator(close = self.pair_index_df['Close'], high = self.pair_index_df['High'], low = self.pair_index_df['Low'], **param_set).vortex_indicator_diff()
1509
+ self.pair_index_df[feature_label] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1510
+ elif ta_method == 'STOCH':
1511
+ stoch = StochRSIIndicator(close = self.pair_index_df['Close'], **param_set).stochrsi()
1512
+ self.pair_index_df[feature_label] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
1513
+
1134
1514
  df_to_merge = self.pair_index_df[['Date',feature_label]]
1135
1515
  self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
1136
1516
 
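A hypothetical call showing the new dispatch (argument values are made up; only the ROC branch still honours the standalone window argument, the other methods read their parameters from param_set):

    # assuming `panel` is an already populated stock_eda_panel instance
    panel.pair_index_feature(
        pair_symbol="^GSPC",          # hypothetical reference index
        feature_label="sp500_rsi",
        threshold=1.5,
        ta_method="RSI",
        param_set={"window": 14},
        save_features=True,
    )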
@@ -1140,7 +1520,7 @@ class stock_eda_panel(object):
1140
1520
  if save_features:
1141
1521
  self.log_features_standard(feature_label)
1142
1522
  parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
1143
- try:
1523
+ try:
1144
1524
  len(self.settings_pair_index_feature)
1145
1525
  print('existing')
1146
1526
  self.settings_pair_index_feature.append(parameters)
@@ -1153,10 +1533,21 @@ class stock_eda_panel(object):
1153
1533
  self.signal_plotter(feature_label)
1154
1534
 
1155
1535
  def produce_order_features(self, feature_name, save_features = False):
1536
+ """
1537
+ compute a feature that indexes high and low signal values within a chain. This is useful to know the duration/persistence of a signal
1156
1538
 
1539
+ Parameters
1540
+ ----------
1541
+ feature_name (str): name of the feature
1542
+ save_features (boolean): True to save feature configuration and feature names
1543
+
1544
+ Returns
1545
+ -------
1546
+ None
1547
+ """
1157
1548
  signal_feature_name = f'discrete_signal_{feature_name}'
1158
1549
  order_feature_name = f'order_signal_{feature_name}'
1159
-
1550
+
1160
1551
  self.df[signal_feature_name] = np.where(
1161
1552
  self.df[f'signal_up_{feature_name}'] == 1,1,
1162
1553
  np.where(
@@ -1173,14 +1564,107 @@ class stock_eda_panel(object):
1173
1564
  self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1174
1565
  self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
1175
1566
  self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
1176
-
1567
+
1177
1568
  ## saving features
1178
1569
  if save_features:
1179
1570
  self.signals.append(signal_feature_name)
1180
1571
  self.signals.append(order_feature_name)
1181
-
1572
+
1573
+ def get_order_feature_nosignal(self,feature_name, save_features=False):
1574
+ """
1575
+ compute a feature that counts the number of steps elapsed after the end of a signal
1576
+
1577
+ Parameters
1578
+ ----------
1579
+ feature_name (str): name of the feature
1580
+ save_features (boolean): True to save feature configuration and feature names
1581
+
1582
+ Returns
1583
+ -------
1584
+ None
1585
+ """
1586
+ order_feature_name = f'order_signal_{feature_name}'
1587
+ ns_order_feature_name = f'ns_order_{feature_name}'
1588
+ self.df = self.df.sort_values('Date')
1589
+ self.df['lag_'] = self.df[order_feature_name].shift(1)
1590
+ self.df['flag'] = np.where((self.df[order_feature_name] == 0) & (self.df['lag_']!=0),1,np.nan)
1591
+ self.df = self.df.drop(columns=['lag_'])
1592
+ self.df['order_'] = self.df.sort_values('Date').groupby(['flag']).cumcount() + 1
1593
+ self.df['order_'] = self.df['order_'].fillna(method='ffill')
1594
+ self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
1595
+ self.df = self.df.drop(columns=['flag'])
1596
+ self.df['order_'] = self.df.sort_values('Date').groupby(['order_']).cumcount() + 1
1597
+ norm_list = [f'norm_{feature_name}', f'z_{feature_name}', feature_name]
1598
+ for norm_feature in norm_list:
1599
+ try:
1600
+ self.df['order_'] = np.sign(self.df[norm_feature])*self.df['order_']
1601
+ break
1602
+ except:
1603
+ pass
1604
+ self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
1605
+ self.df = self.df.rename(columns={'order_':ns_order_feature_name})
1606
+ if save_features:
1607
+ self.signals.append(ns_order_feature_name)
1608
+
1609
+ def compute_last_signal(self,feature, save_features = False):
1610
+ """
1611
+ compute two new features when a signal is observed: the duration of the previous signal chain, and the duration of the last chain with the same sign
1612
+
1613
+ Parameters
1614
+ ----------
1615
+ feature (str): name of the feature
1616
+ save_features (boolean): True to save feature configuration and feature names
1617
+
1618
+ Returns
1619
+ -------
1620
+ None
1621
+ """
1622
+ def create_last_signal(df, feature, prefix, type ='0'):
1623
+ if type == '0':
1624
+ condition = df[f'order_signal_{feature}'] != 0
1625
+ elif type == '+':
1626
+ condition = df[f'order_signal_{feature}'] > 0
1627
+ elif type == '-':
1628
+ condition = df[f'order_signal_{feature}'] < 0
1629
+ df[f'last_maxorder_{feature}'] = np.where(condition, df[f'order_signal_{feature}'],np.nan)
1630
+ df['tmp_chain_index'] = df[f'last_maxorder_{feature}'].shift(-1)
1631
+ df['last'] = np.where((df[f'last_maxorder_{feature}'] != 0) & (df['tmp_chain_index'].isna()),df[f'last_maxorder_{feature}'], np.nan )
1632
+ df['last'] = df['last'].shift(1)
1633
+ df[f'last_maxorder_{feature}'] = df['last'].fillna(method = 'ffill')
1634
+ df = df.drop(columns = ['tmp_chain_index','last'])
1635
+ df[f'last_maxorder_{feature}'] = np.where(df[f'order_signal_{feature}'] != 0,df[f'last_maxorder_{feature}'],np.nan)
1636
+ df[f'last_maxorder_{feature}'] = df[f'last_maxorder_{feature}'].fillna(0)
1637
+ df = df.rename(columns = {f'last_maxorder_{feature}':f'{prefix}_{feature}'})
1638
+ return df
1639
+ prefix0, prefix1, prefix2 = 'ldur', 'pos', 'neg'
1640
+ self.df = create_last_signal(self.df, feature, prefix0, type ='0')
1641
+ self.df = create_last_signal(self.df, feature, prefix1, type ='+')
1642
+ self.df = create_last_signal(self.df, feature, prefix2, type ='-')
1643
+
1644
+ self.df[f'sldur_{feature}'] = np.where(
1645
+ self.df[f'order_signal_{feature}'] > 0, self.df[f'{prefix1}_{feature}'],
1646
+ np.where(
1647
+ self.df[f'order_signal_{feature}'] < 0, self.df[f'{prefix2}_{feature}'],
1648
+ 0
1649
+ )
1650
+ )
1651
+ self.df = self.df.drop(columns = [f'{prefix1}_{feature}',f'{prefix2}_{feature}'])
1652
+ if save_features:
1653
+ self.signals.append(f'sldur_{feature}')
1654
+ self.signals.append(f'ldur_{feature}')
1655
+
1182
1656
  def create_hmm_derived_features(self, lag_returns):
1657
+ """
1658
+ create features derived from the hmm state feature: the state index, the duration of the state chain and the chain return
1659
+
1660
+ Parameters
1661
+ ----------
1662
+ lag_returns (int): lag applied to the close price when computing the chain return
1183
1663
 
1664
+ Returns
1665
+ -------
1666
+ None
1667
+ """
1184
1668
  self.df = self.df.sort_values('Date')
1185
1669
  ## indexing chains
1186
1670
  self.df['lag_hmm_feature'] = self.df['hmm_feature'].shift(1)
@@ -1189,31 +1673,44 @@ class stock_eda_panel(object):
1189
1673
  self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
1190
1674
  self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
1191
1675
  self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1192
-
1193
- ### returns using the first element in a chain
1194
- self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
1195
- self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
1196
- self.df['chain_return'] = (self.df['Close']/self.df['first'] -1) * 100
1197
1676
 
1198
- self.df = self.df.drop(columns = ['breack','first'])
1677
+ ### returns using a lagged close within each chain
1678
+ self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
1679
+ self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
1680
+ self.df = self.df.drop(columns = ['breack'])
1199
1681
 
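Note on the change above: the chain return is now measured against a close price lagged by lag_returns within each state chain, instead of against the first close of the chain. A minimal groupby sketch (illustrative only):

    import pandas as pd

    df = pd.DataFrame({"chain_id": [1, 1, 1, 2, 2],
                       "Close":    [100.0, 102.0, 103.0, 98.0, 99.0]})
    lag_returns = 1
    lag_close = df.groupby("chain_id")["Close"].shift(lag_returns)
    chain_return = (df["Close"] / lag_close - 1) * 100   # NaN while a chain is shorter than the lag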
1200
- def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
1682
+ def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
1683
+ """
1684
+ create a new hmm model or use a provided one to derive state features
1685
+
1686
+ Parameters
1687
+ ----------
1688
+ n_clusters (int): number of clusters or states to calculate
1689
+ features_hmm (list): features to be considered in hmm model when training
1690
+ test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
1691
+ seed (int): seed for the model initialization
1692
+ lag_returns_state (int) : lags for returns of the state
1693
+ corr_threshold (float): correlation threshold for initial feature selection
1694
+ plot (boolean): True to display hmm states analysis
1695
+ save_features (boolean): True to save features and configurations
1696
+ model (obj): if provided, no model will be trained and the provided model will be used to get hmm features
1697
+
1698
+ Returns
1699
+ -------
1700
+ None
1701
+ """
1201
1702
  if not model:
1202
-
1703
+
1203
1704
  df_new = self.df
1204
- pipeline_hmm = Pipeline([
1205
- ('selector', FeatureSelector(columns=features_hmm)),
1206
- ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
1207
- ('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
1208
- ])
1209
1705
  data_train = df_new.iloc[:-test_data_size,:]
1210
1706
  data_test = df_new.iloc[-test_data_size:,:]
1211
1707
 
1212
- pipeline_hmm.fit(data_train)
1213
-
1708
+ th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
1709
+ th.train()
1710
+ pipeline_hmm = th.hmm_model
1214
1711
  self.model_hmm = pipeline_hmm
1215
1712
  self.test_data_hmm = data_test
1216
-
1713
+
1217
1714
  ### first feature: the hidden state
1218
1715
  self.df['hmm_feature'] = self.model_hmm.predict(self.df)
1219
1716
  self.create_hmm_derived_features(lag_returns = lag_returns_state)
@@ -1230,15 +1727,15 @@ class stock_eda_panel(object):
1230
1727
  hidden_states = pipeline_hmm.predict(data_test)
1231
1728
  data_test['HMM'] = hidden_states
1232
1729
  data_test['HMM_state'] = data_test['HMM'].map(map_)
1233
-
1730
+
1234
1731
  if model:
1235
1732
  self.df['hmm_feature'] = model.predict(self.df)
1236
1733
  self.create_hmm_derived_features(lag_returns = lag_returns_state)
1237
-
1734
+
1238
1735
  if save_features:
1239
1736
  self.features.append('hmm_feature')
1240
1737
  self.features.append('hmm_chain_order')
1241
- self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
1738
+ self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }
1242
1739
 
1243
1740
  if plot:
1244
1741
 
@@ -1263,14 +1760,38 @@ class stock_eda_panel(object):
1263
1760
  fig.show()
1264
1761
 
1265
1762
  def sharpe_ratio(self, return_series, n_trad_days = 255, rf = 0.01):
1763
+ """
1764
+ compute the sharpe ratio of a given return series
1765
+
1766
+ Parameters
1767
+ ----------
1768
+ return_series (pd.series): time series of the returns
1769
+ n_trad_days (int): trading days to annualize returns
1770
+ rf (float): annual risk-free rate
1771
+
1772
+ Returns
1773
+ -------
1774
+ sharpe_ratio (float): sharpe ratio
1775
+ """
1266
1776
  nsqrt = np.sqrt(n_trad_days)
1267
1777
  mean = return_series.mean() * n_trad_days
1268
1778
  sigma = return_series.std() * nsqrt
1269
1779
  sharpe_ratio = round((mean-rf)/sigma,2)
1270
1780
  return sharpe_ratio
1271
-
1781
+
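A quick numeric check of the formula above, sharpe = (mean * N - rf) / (std * sqrt(N)), on synthetic returns (illustrative only):

    import numpy as np
    import pandas as pd

    rets = pd.Series(np.random.default_rng(0).normal(0.0005, 0.01, 500))
    n_trad_days, rf = 255, 0.01
    sharpe = round((rets.mean() * n_trad_days - rf) / (rets.std() * np.sqrt(n_trad_days)), 2)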
1272
1782
  def treat_signal_strategy(self,test_data, strategy):
1273
-
1783
+ """
1784
+ helper method that combines the strategy signals and converts them to 1 or 0
1785
+
1786
+ Parameters
1787
+ ----------
1788
+ test_data (pd.DataFrame): test data
1789
+ strategy (list): features to get the strategy
1790
+
1791
+ Returns
1792
+ -------
1793
+ test_data (pd.DataFrame): test data with extra columns that are the strategy (main_signal)
1794
+ """
1274
1795
  hmm_states_list = [x for x in strategy if 'hmm_state_' in x]
1275
1796
  other_features = [x for x in strategy if x not in hmm_states_list]
1276
1797
 
@@ -1299,10 +1820,21 @@ class stock_eda_panel(object):
1299
1820
  elif len(hmm_states_list) == 0 and len(other_features) > 0:
1300
1821
  test_data['main_signal'] = np.where((test_data['features_signal'] == 1) & (test_data['hmm_signal'] == 0),1,0)
1301
1822
 
1302
- return test_data
1823
+ return test_data
1303
1824
 
1304
1825
  def stategy_simulator(self, features, hmm_feature = True):
1826
+ """
1827
+ execute strategies and compute performance metrics such as sharpe ratio and return. This method creates new attributes on the object
1305
1828
 
1829
+ Parameters
1830
+ ----------
1831
+ features (list): list of features to be tested as strategies
1832
+ hmm_feature (boolean): include hmm feature
1833
+
1834
+ Returns
1835
+ -------
1836
+ None
1837
+ """
1306
1838
  columns_ = ['Date', 'Close','Open'] + features + ['HMM']
1307
1839
  states = list(self.df.hmm_feature.unique())
1308
1840
  states.sort()
@@ -1372,8 +1904,19 @@ class stock_eda_panel(object):
1372
1904
  self.strategy_log = df_returns_log
1373
1905
  self.best_strategy = df_returns_log.iloc[0,:].strategy
1374
1906
  self.top_10_strategy = list(df_returns_log.iloc[0:10,:].strategy.values)
1375
-
1907
+
1376
1908
  def viz_strategy(self, strategy):
1909
+ """
1910
+ display analysis plot of a given strategy
1911
+
1912
+ Parameters
1913
+ ----------
1914
+ strategy (list): list of features of the strategy
1915
+
1916
+ Returns
1917
+ -------
1918
+ None
1919
+ """
1377
1920
  test_data = self.test_data_strategy
1378
1921
 
1379
1922
  test_data = self.treat_signal_strategy(test_data, strategy)
@@ -1406,62 +1949,26 @@ class stock_eda_panel(object):
1406
1949
  plt.legend()
1407
1950
  plt.show()
1408
1951
 
1409
- ### deprecated ############################
1410
- def create_strategy(self, favourable_states):
1411
-
1412
- test_data = self.test_data_hmm
1413
- # add MA signal
1414
- test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
1415
- test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
1416
-
1417
- # add hnn signal
1418
-
1419
- test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
1420
-
1421
- ## combined signals
1422
- test_data['main_signal'] = 0
1423
- test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
1424
- test_data['main_signal'] = test_data['main_signal'].shift(1)
1425
-
1426
- ## benchmark return
1427
- test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
1428
- test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
1429
- test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
1430
-
1431
- ## strategy return
1432
- # test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
1433
- test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
1434
- test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
1435
- test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
1436
- test_data.dropna(inplace = True)
1437
-
1438
- bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
1439
- strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
1440
-
1441
- bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
1442
- strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
1443
-
1444
- print(f'returns benchmark {bench_rets}%')
1445
- print(f'returns strategy {strat_rets}%')
1446
- print('-----------------------------')
1447
- print(f'sharpe benchmark {bench_sharpe}')
1448
- print(f'sharpe strategy {strat_sharpe}')
1449
-
1450
- fig = plt.figure(figsize = (10,4))
1451
- plt.plot(test_data['bench_prod_exp'])
1452
- plt.plot(test_data['strat_prod_exp'])
1453
- self.settings_hmm_states = {'favourable_states':favourable_states}
1454
- ################################################
1455
-
1456
1952
  def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
1457
-
1953
+ """
1954
+ display analysis plots for the hmm model
1955
+
1956
+ Parameters
1957
+ ----------
1958
+ test_data_size (int): test data size, the remaining is the train data
1959
+ split (str): options (train or test). Split type to assess
1960
+
1961
+ Returns
1962
+ -------
1963
+ None
1964
+ """
1458
1965
  if split == 'train':
1459
1966
  df = self.df.iloc[:-test_data_size,:]
1460
1967
  elif split == 'test':
1461
1968
  df = self.df.iloc[-test_data_size:,:]
1462
1969
 
1463
1970
  ## returns plot
1464
- fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
1971
+ fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
1465
1972
  height=400, width=1000, title = 'returns chain hmm feature')
1466
1973
  fig.add_shape(type='line',x0=-0.5,y0=0,x1=max(df.hmm_feature)+0.5,y1=0,line=dict(color='grey',width=1),xref='x',yref='y')
1467
1974
  fig.show()
@@ -1490,6 +1997,17 @@ class stock_eda_panel(object):
1490
1997
  del df
1491
1998
 
1492
1999
  def get_targets(self, steps):
2000
+ """
2001
+ produce regression return targets from future prices
2002
+
2003
+ Parameters
2004
+ ----------
2005
+ steps (int): number of lags and steps for future returns
2006
+
2007
+ Returns
2008
+ -------
2009
+ None
2010
+ """
1493
2011
  self.targets = list()
1494
2012
  self.target = list()
1495
2013
  columns = list()
@@ -1501,9 +2019,23 @@ class stock_eda_panel(object):
1501
2019
  self.df[f'mean_target'] = self.df[columns].mean(axis=1)
1502
2020
  self.target.append(f'mean_target')
1503
2021
  self.settings_target_lasts = {'steps':steps, 'type':'regression'}
1504
-
1505
- def get_categorical_targets(self, horizon, flor_loss, top_gain):
1506
-
2022
+
2023
+ def get_categorical_targets(self, horizon, flor_loss, top_gain, min_pos=1 , min_negs=1):
2024
+ """
2025
+ produce binary return targets from future prices. It produces two targets, one for high returns and another for low returns
2026
+
2027
+ Parameters
2028
+ ----------
2029
+ horizon (int): number of lags and steps for future returns
2030
+ flor_loss (float): min loss return
2031
+ top_gain (float): max gain return
2032
+ min_pos (int): minimum number of positives to count in a window for target_up
2033
+ min_negs (int): minimum number of negatives to count in a window for target_down
2034
+
2035
+ Returns
2036
+ -------
2037
+ None
2038
+ """
1507
2039
  self.target = list()
1508
2040
  self.targets = list()
1509
2041
  columns = list()
@@ -1516,7 +2048,7 @@ class stock_eda_panel(object):
1516
2048
  self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] >= top_gain,1,0)
1517
2049
  columns.append(f'target_{i}')
1518
2050
  self.df[f'target_up'] = self.df[columns].sum(axis=1)
1519
- self.df[f'target_up'] = np.where(self.df[f'target_up'] >=1,1,0 )
2051
+ self.df[f'target_up'] = np.where(self.df[f'target_up'] >=min_pos,1,0 )
1520
2052
  self.df = self.df.drop(columns = columns)
1521
2053
 
1522
2054
  for i in range(1,horizon+1):
@@ -1526,7 +2058,7 @@ class stock_eda_panel(object):
1526
2058
  self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] <= flor_loss,1,0)
1527
2059
  columns.append(f'target_{i}')
1528
2060
  self.df[f'target_down'] = self.df[columns].sum(axis=1)
1529
- self.df[f'target_down'] = np.where(self.df[f'target_down'] >= 1,1,0 )
2061
+ self.df[f'target_down'] = np.where(self.df[f'target_down'] >= min_negs,1,0 )
1530
2062
  self.df = self.df.drop(columns = columns)
1531
2063
 
1532
2064
  self.targets.append('target_up')
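A compact sketch of the target_up construction with the new min_pos argument, assuming simple forward returns close.shift(-i)/close - 1 (the exact return definition lives outside this hunk):

    import pandas as pd

    close = pd.Series([100.0, 101.0, 99.0, 104.0, 103.0, 98.0, 100.0])
    horizon, top_gain, min_pos = 3, 0.02, 2
    hits = pd.concat(
        [(close.shift(-i) / close - 1 >= top_gain).astype(int) for i in range(1, horizon + 1)],
        axis=1,
    ).sum(axis=1)
    target_up = (hits >= min_pos).astype(int)   # 1 only when at least min_pos future steps reach top_gain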
@@ -1535,7 +2067,19 @@ class stock_eda_panel(object):
1535
2067
  self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
1536
2068
 
1537
2069
  def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
1538
-
2070
+ """
2071
+ produce the configuration dictionary from the settings saved by the feature generation methods when save_features was activated
2072
+
2073
+ Parameters
2074
+ ----------
2075
+ test_data_size (int): test data size
2076
+ val_data_size (int): validation data size
2077
+ model_type (str): model type, options: 'Forecaster','Classifier'
2078
+
2079
+ Returns
2080
+ -------
2081
+ None
2082
+ """
1539
2083
  self.settings = {
1540
2084
  'features':list(set(self.features)),
1541
2085
  'signals' :list(set(self.signals)),
@@ -1547,19 +2091,21 @@ class stock_eda_panel(object):
1547
2091
  'outlier': self.settings_outlier,
1548
2092
  }
1549
2093
  }
1550
-
2094
+
1551
2095
  if model_type in ['Forecaster','Classifier']:
1552
-
2096
+
1553
2097
  target_list = list(set(self.targets))
1554
2098
  target_list.sort()
1555
2099
  self.settings['model_type'] = model_type
1556
2100
  self.settings['target'] = list(set(self.target))
1557
2101
  self.settings['targets'] = target_list
1558
-
2102
+
1559
2103
  ## for now this is hard coded
1560
2104
  feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
1561
2105
  'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
1562
- 'william_feature', 'vortex_feature', 'pair_index_feature','hmm']
2106
+ 'william_feature', 'vortex_feature', 'pair_index_feature','hmm',
2107
+ 'min_distance_pricefeature', 'min_relprice_pricefeature', 'max_distance_pricefeature','max_relprice_pricefeature'
2108
+ ]
1563
2109
 
1564
2110
  for feature in feature_list:
1565
2111
  try:
@@ -1570,7 +2116,7 @@ class stock_eda_panel(object):
1570
2116
  self.settings['settings']['target_lasts'] = self.settings_target_lasts
1571
2117
  except:
1572
2118
  pass
1573
-
2119
+
1574
2120
  try:
1575
2121
  self.settings['settings']['strategies'] = {
1576
2122
  'best_strategy':self.best_strategy,
@@ -1580,512 +2126,280 @@ class stock_eda_panel(object):
1580
2126
  pass
1581
2127
 
1582
2128
  class produce_model:
2129
+ """
2130
+ Class that produces a machine learning model in a scikit-learn pipeline wrapper.
2131
+
2132
+ Attributes
2133
+ ----------
2134
+ data : pd.DataFrame
2135
+ input dataset used to build the model
2136
+ X_train : pd.DataFrame
2137
+ y_train : pd.Series
2138
+ X_test : pd.DataFrame
2139
+ y_test : pd.Series
2140
+ X_val : pd.DataFrame
2141
+ y_val : pd.Series
2142
+ pipeline : obj
2143
+ trained pipeline that includes a ml model
2144
+ features_to_model: list
2145
+ features in end step of the pipeline
2146
+
2147
+ Methods
2148
+ -------
2149
+ preprocess(test_data_size=int, target=str, val_data_size=int):
2150
+ prepare data, split train, test, validation data and X and Y
2151
+ get_sample(x=pd.DataFrame, sample=int, max_=int):
2152
+ sample data
2153
+ """
1583
2154
  def __init__(self,data):
2155
+ """
2156
+ Initialize object
2157
+
2158
+ Parameters
2159
+ ----------
2160
+ data (pd.DataFrame): data
2161
+
2162
+ Returns
2163
+ -------
2164
+ None
2165
+ """
1584
2166
  self.data = data.copy()
1585
-
2167
+
1586
2168
  def preprocess(self, test_data_size, target, val_data_size = False):
1587
-
2169
+ """
2170
+ prepare data, split train, test, validation data and X and Y
2171
+
2172
+ Parameters
2173
+ ----------
2174
+ test_data_size (int): test data size
2175
+ target (str): target column
2176
+ val_data_size (int): validation data size
2177
+
2178
+ Returns
2179
+ -------
2180
+ None
2181
+ """
1588
2182
  train_data, test_data = self.data.iloc[:-test_data_size,:].dropna() , self.data.iloc[-test_data_size:,:].dropna()
1589
-
2183
+
1590
2184
  if val_data_size:
1591
2185
  train_data, val_data = train_data.iloc[:-val_data_size,:], train_data.iloc[-val_data_size:,:]
1592
-
2186
+
1593
2187
  self.test_data = test_data
1594
-
2188
+
1595
2189
  X_train, y_train = train_data.iloc[0:,1:], train_data[target]
1596
2190
  X_test, y_test = test_data.iloc[0:,1:], test_data[target]
1597
2191
  self.X_train = X_train
1598
2192
  self.y_train = y_train
1599
2193
  self.X_test = X_test
1600
2194
  self.y_test = y_test
1601
-
2195
+
1602
2196
  if val_data_size:
1603
2197
  X_val, y_val = val_data.iloc[0:,1:], val_data[target]
1604
2198
  self.X_val = X_val
1605
2199
  self.y_val = y_val
1606
-
2200
+
1607
2201
  def get_sample(self, x, sample, max_=900):
2202
+ """
2203
+ sample data
2204
+
2205
+ Parameters
2206
+ ----------
2207
+ x (pd.DataFrame): input data
2208
+ sample (int): sample size
2209
+ max_ (int): max sample
2210
+
2211
+ Returns
2212
+ -------
2213
+ sample (float): sample size
2214
+ """
1608
2215
  length = len(x)
1609
2216
  if length > max_:
1610
2217
  return 1.0
1611
2218
  else:
1612
2219
  return sample
1613
-
2220
+
1614
2221
  def train_model(self, pipe, model, cv_ = False):
2222
+ """
2223
+ train pipeline
2224
+
2225
+ Parameters
2226
+ ----------
2227
+ pipe (obj): pipeline object
2228
+ model (obj): model object
2229
+ cv_ (obj): cross validation procedure
2230
+
2231
+ Returns
2232
+ -------
2233
+ None
2234
+ """
1615
2235
  self.model = model
1616
2236
  self.pipe_transform = pipe
1617
2237
  self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
1618
- self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
1619
2238
  self.pipeline.fit(self.X_train, self.y_train)
1620
-
1621
-
1622
- class hmm_feature_selector():
1623
-
1624
- def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
1625
- self.data = data.copy()
1626
- self.n_clusters = n_clusters
1627
- self.init_features_hmm = init_features_hmm
1628
- self.test_data_size = test_data_size
1629
- self.select_n_features = select_n_features
1630
- self.n_trials = n_trials
1631
- self.limit_search= limit_search
1632
- self.default_benchmark_sd = default_benchmark_sd
1633
- self.t_threshold = t_threshold
1634
-
1635
- def split_data(self):
1636
-
1637
- self.data_train = self.data.iloc[:-self.test_data_size,:]
1638
- self.data_test = self.data.iloc[-self.test_data_size:,:]
1639
-
1640
- def train_model(self,features_hmm):
1641
- pipeline_hmm = Pipeline([
1642
- ('selector', FeatureSelector(columns=features_hmm)),
1643
- ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
1644
- ('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
1645
- ])
1646
-
1647
- self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
1648
- self.features_used_in_model = features_hmm
1649
-
1650
- def feature_list_generator(self):
1651
-
1652
- feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
1653
- feature_combinations = list(map(list, feature_combinations))
1654
-
1655
- self.feature_combinations = feature_combinations
1656
-
1657
- def get_error(self):
1658
-
1659
- self.data_train_ = self.data_train.copy()
1660
-
1661
- self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
1662
- self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
1663
-
1664
- ## indexing chains
1665
- self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
1666
- self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
1667
- self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
1668
- self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
1669
- self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
1670
- self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1671
-
1672
- ### returns using the first element in a chain
1673
- self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
1674
- self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
1675
- self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
1676
-
1677
- self.data_train_ = self.data_train_.drop(columns = ['first'])
1678
-
1679
- mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
1680
- self.mean_relevance = mean_relevance
1681
-
1682
- def execute_selector(self):
1683
-
1684
- self.split_data()
1685
- self.feature_list_generator()
1686
- maxi = -1
1687
- print(f'it is expected {len(self.feature_combinations)} combinations')
1688
- feature_results = dict()
1689
-
1690
- if self.limit_search:
1691
- print(f' taking just {self.limit_search} combinations')
1692
- maxi = self.limit_search
1693
-
1694
- for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
1695
-
1696
- feature_results[f'group_{i}'] = {
1697
- 'features':list(features_hmm),
1698
- 'relevances':list()
1699
- }
1700
-
1701
- for _ in range(self.n_trials):
1702
- try:
1703
- self.train_model(features_hmm)
1704
- self.get_error()
1705
- feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
1706
- except:
1707
- print('error')
1708
- feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
1709
- self.feature_results = feature_results
1710
- self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
1711
-
1712
- class signal_analyser_object:
1713
-
1714
- def __init__(self, data,symbol_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
2239
+ self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
2240
+
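The features_to_model line above relies on scikit-learn Pipeline slicing: pipeline[:-1] returns the pipeline without its final estimator, so transform applies only the preprocessing steps. A minimal self-contained illustration:

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    X = np.random.default_rng(0).normal(size=(20, 3))
    y = np.r_[np.zeros(10), np.ones(10)]
    pipe = Pipeline([("scale", StandardScaler()), ("model", LogisticRegression())])
    pipe.fit(X, y)
    X_pre = pipe[:-1].transform(X)   # preprocessing only, estimator step skipped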
2241
+ class analyse_index(stock_eda_panel):
2242
+ """
2243
+ class that compares an asset against one or more market indexes, computing betas and correlation scores
2244
+
2245
+ Attributes
2246
+ ----------
2247
+ data_index : pd.DataFrame
2248
+ dataframe with the index data
2249
+ indexes: list
2250
+ list of indexes
2251
+ asset : str
2252
+ name of the asset
2253
+ n_obs : int
2254
+ number of rows to extract
2255
+ lag : int
2256
+ lag to apply
2257
+ data_window : str
2258
+ 5y 10y 15y
2259
+ show_plot : bool
2260
+ If True, show plots
2261
+ save_path : str
2262
+ local path for saving e.g r'C:/path/to/the/file/'
2263
+ save_aws : str
2264
+ remote key in s3 bucket path e.g. 'path/to/file/'
2265
+ aws_credentials : dict
2266
+ dict with the aws credentials
2267
+ merger_df : pd.DataFrame
2268
+ dataframe with the index and asset data
2269
+ states_result : dict
2270
+ betas and correlation score results
2271
+
2272
+ Methods
2273
+ -------
2274
+ process_data():
2275
+ using stock_eda_panel, get data and merge data
2276
+ plot_betas(sample_size=int, offset=int, subsample_ts=int):
2277
+ display beta analysis plot
2278
+ get_betas(subsample_ts=int)
2279
+ get general beta and last sample beta, correlation score is included too
2280
+ """
2281
+ def __init__(self, index_data, asset, n_obs, lag, data_window = '5y', show_plot = False, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
1715
2282
  """
1716
- data: pandas df
1717
- symbol_name: str name of the asset
1718
- show_plot: bool
1719
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
1720
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
1721
- aws_credentials: dict
1722
- return_fig: boolean return the image function as result
2283
+ Initialize object
2284
+
2285
+ Parameters
2286
+ ----------
2287
+ index_data (pd.DataFrame or str): index data dataframe or index string
2288
+ asset (str): name of the asset
2289
+ n_obs (int): number of rows to extract
2290
+ lag (int): lag to apply
2291
+ data_window (str): 5y 10y 15y
2292
+ show_plot (bool): If True, show plots
2293
+ save_path (str): local path for saving e.g r'C:/path/to/the/file/'
2294
+ save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
2295
+ aws_credentials (dict): dict with the aws credentials
2296
+
2297
+ Returns
2298
+ -------
2299
+ None
1723
2300
  """
1724
- self.data = data.copy()
1725
- self.ticket_name = symbol_name
1726
- self.show_plot = show_plot
1727
- self.save_path = save_path
1728
- self.save_aws = save_aws
1729
- self.aws_credentials = aws_credentials
1730
- self.return_fig = return_fig
1731
-
1732
- def signal_analyser(self, test_size, feature_name, days_list, threshold = 0.05,verbose = False, signal_position = False):
1733
- data = self.data
1734
- self.feature_name = feature_name
1735
- up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1736
- features_base = ['Date', up_signal, low_signal, 'Close']
1737
-
1738
- df = data[features_base].sort_values('Date').iloc[0:-test_size,:]
1739
- returns_list = list()
1740
-
1741
- for days in days_list:
1742
-
1743
- feature_ = f'return_{days}d'
1744
- df[feature_] = (df['Close'].shift(-days)/df['Close']-1)*100
1745
- returns_list.append(feature_)
1746
2301
 
1747
- df['signal_type'] = np.where(
1748
- df[up_signal] == 1,
1749
- 'up',
1750
- np.where(
1751
- df[low_signal] == 1,
1752
- 'down',
1753
- None
1754
- )
1755
- )
1756
- df = df[~df.signal_type.isna()]
1757
- # df['Date'] = df.index
1758
- df['lag_Date'] = df['Date'].shift(1)
1759
- df['span'] = (pd.to_datetime(df['Date']) - pd.to_datetime(df['lag_Date'])).dt.days - 1
1760
- df['break'] = np.where(df['span'] > 3, 1, 0)
1761
- df['break'] = np.where(df['span'].isna(), 1, df['break'])
1762
-
1763
- df['chain_id'] = df.sort_values(['Date']).groupby(['break']).cumcount() + 1
1764
- df['chain_id'] = np.where(df['break'] == 1, df['chain_id'], np.nan )
1765
- df['chain_id'] = df['chain_id'].fillna(method = 'ffill')
1766
-
1767
- df['internal_rn'] = df.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1768
- df['inv_internal_rn'] = df.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1769
-
1770
- df['first_in_chain'] = np.where(df['internal_rn'] == 1, True, False)
1771
- df['last_in_chain'] = np.where(df['inv_internal_rn'] == 1, True, False)
1772
-
1773
- df = df.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1774
- self.df_signal = df
1775
2302
 
1776
- n_signals_up = len(list(df[df.signal_type == 'up'].chain_id.unique()))
1777
- n_signals_down = len(list(df[df.signal_type == 'down'].chain_id.unique()))
1778
- p_scores = list()
1779
- medians_down = list()
1780
- validations = list()
1781
- if not signal_position: ### for now it is based on the last signal on a chain
1782
- df_melt = df[df.last_in_chain == True].melt(id_vars=['signal_type'], value_vars=returns_list, var_name='time', value_name='value')
1783
- df_melt = df_melt.dropna()
1784
-
1785
- for evalx in returns_list:
1786
-
1787
- sample1 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'up')].value.values
1788
- sample2 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'down')].value.values
1789
- pvalue = stats.ttest_ind(sample1, sample2).pvalue
1790
- median_down = np.median(sample2)
1791
- median_up = np.median(sample1)
1792
- validations.append(median_up < 0)
1793
- validations.append(median_down > 0)
1794
- p_scores.append(pvalue)
1795
- medians_down.append(median_down)
1796
- self.df_melt = df_melt
1797
- null_ho_eval = threshold > np.mean(p_scores)
1798
- mean_median_return = np.median(medians_down) ## end metric
1799
- median_signal_type_eval = validations.count(validations[0]) == len(validations)
1800
-
1801
- if verbose:
1802
- print('number of signal up:',n_signals_up)
1803
- print('number of signal down:',n_signals_down)
1804
- print('reject ho: ', null_ho_eval)
1805
- print('mean median:', mean_median_return)
1806
- print('all validations: ', median_signal_type_eval)
1807
-
1808
- # if median_signal_type_eval == True and null_ho_eval == True:
1809
- if null_ho_eval == True:
1810
- if verbose:
1811
- print('success evals')
1812
- self.mean_median_return = mean_median_return
2303
+ if type(index_data) != str:
2304
+ index_data['Date'] = pd.to_datetime(index_data['Date'])
2305
+ self.index_data = index_data
2306
+ self.indexes = [ x for x in list(index_data.columns) if x != 'Date']
1813
2307
  else:
1814
- self.mean_median_return = np.nan
1815
-
1816
- df2 = df.copy()
1817
- df2 = df2[df2.last_in_chain == True]
1818
-
1819
-
1820
- df2['lagdate'] = df2.Date.shift(1)
1821
- df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lagdate'])).dt.days
1822
-
1823
- fig, axs = plt.subplots(1, 3, figsize = (15,5))
1824
-
1825
- sns.boxplot(data=df2, y="span",ax = axs[0])
1826
- axs[0].set_title('span between last signals')
1827
- del df2
1828
- sns.boxplot(data=df[df.last_in_chain == True], y="internal_rn",ax = axs[1])
1829
- axs[1].set_title('signal duration distribution')
1830
- sns.boxplot(data=df_melt, x="time", y="value", hue="signal_type",ax = axs[2])
1831
- axs[2].axhline(y=0, color='grey', linestyle='--')
1832
- axs[2].set_title('signal type expected returns distribution at different time lapses')
1833
-
1834
- if self.show_plot:
1835
- plt.show()
2308
+ self.indexes = [index_data]
1836
2309
 
1837
- if self.save_path:
1838
- result_plot_name = f'signals_strategy_distribution_{feature_name}.png'
1839
- fig.savefig(self.save_path+result_plot_name)
1840
- # pickle.dump(axs, open(self.save_path+result_plot_name, 'wb'))
1841
-
1842
- if self.save_path and self.save_aws:
1843
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name, input_path = self.save_path+result_plot_name)
1844
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1845
- if not self.show_plot:
1846
- plt.close()
1847
-
1848
- del df
1849
-
1850
- if self.return_fig:
1851
- return fig
1852
-
1853
- def create_backtest_signal(self,days_strategy, test_size, feature_name, high_exit = False, low_exit = False):
1854
- asset_1 = 'Close'
1855
- up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1856
- df1 = self.data.iloc[-test_size:,:].copy()
1857
- df2 = df1.copy()
1858
- df2['signal_type'] = np.where(
1859
- df2[up_signal] == 1,
1860
- 'up',
1861
- np.where(
1862
- df2[low_signal] == 1,
1863
- 'down',
1864
- None
1865
- )
1866
- )
1867
- df2 = df2[~df2.signal_type.isna()]
1868
- # df2['Date_'] = df2.index
1869
- df2['lag_Date'] = df2['Date'].shift(1)
1870
- df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lag_Date'])).dt.days - 1
1871
- df2['break'] = np.where(df2['span'] > 3, 1, 0)
1872
- df2['break'] = np.where(df2['span'].isna(), 1, df2['break'])
1873
-
1874
- df2['chain_id'] = df2.sort_values(['Date']).groupby(['break']).cumcount() + 1
1875
- df2['chain_id'] = np.where(df2['break'] == 1, df2['chain_id'], np.nan )
1876
- df2['chain_id'] = df2['chain_id'].fillna(method = 'ffill')
1877
-
1878
- df2['internal_rn'] = df2.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1879
- df2['inv_internal_rn'] = df2.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1880
-
1881
- df2['first_in_chain'] = np.where(df2['internal_rn'] == 1, True, False)
1882
- df2['last_in_chain'] = np.where(df2['inv_internal_rn'] == 1, True, False)
1883
-
1884
- df2 = df2.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1885
-
1886
- df2 = df2[(df2.last_in_chain == True) & (df2.signal_type == 'down')][['last_in_chain']]
1887
- dft = df1.merge(df2,how = 'left',left_index=True, right_index=True )
1888
-
1889
- dft['chain_id'] = dft.sort_values(['Date']).groupby(['last_in_chain']).cumcount() + 1
1890
- dft['chain_id'] = np.where(dft['last_in_chain'] == True, dft['chain_id'], np.nan )
1891
- dft['chain_id'] = dft['chain_id'].fillna(method = 'ffill')
1892
-
1893
- dft['internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1894
- dft['flag'] = np.where(dft['internal_rn'] < days_strategy, 1,0)
1895
-
1896
- dft['lrets_bench'] = np.log(dft[asset_1]/dft[asset_1].shift(1))
1897
- dft['bench_prod'] = dft['lrets_bench'].cumsum()
1898
- dft['bench_prod_exp'] = np.exp(dft['bench_prod']) - 1
1899
-
1900
- if high_exit and low_exit:
1901
- dft['open_strat'] = np.where(dft.last_in_chain == True, dft.Open, np.nan)
1902
- dft['open_strat'] = dft['open_strat'].fillna(method = 'ffill')
1903
- dft['open_strat'] = np.where(dft.flag == 1, dft.open_strat, np.nan)
1904
- dft['high_strat_ret'] = (dft['High']/dft['open_strat']-1)*100
1905
- dft['low_strat_ret'] = (dft['Low']/dft['open_strat']-1)*100
1906
- dft['high_exit'] = np.where(((dft['high_strat_ret'] >= high_exit) | (dft['internal_rn'] == days_strategy)), 1, np.nan)
1907
- dft['low_exit'] = np.where((dft['low_strat_ret'] <= low_exit), -1, np.nan)
1908
-
1909
- dft["exit_type"] = dft[["high_exit", "low_exit"]].max(axis=1)
1910
- dft['exit_type'] = np.where(dft["exit_type"] == 1, 1, np.where(dft["exit_type"] == -1,-1,np.nan))
1911
- dft['exit'] = np.where(dft['exit_type'].isnull(), np.nan, 1)
1912
- dft['exit_order'] = dft.sort_values(['Date']).groupby(['chain_id','exit']).cumcount() + 1
1913
- dft['exit'] = np.where(dft['exit_order'] == 1, True, np.nan)
1914
- dft = dft.drop(columns = ['exit_order'])
1915
- ## if last signal is near
1916
- max_id = dft.chain_id.max()
1917
- dft['max_internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).internal_rn.transform('max')
1918
- dft['exit'] = np.where((dft.chain_id == max_id) & (dft.max_internal_rn < days_strategy) & (dft.max_internal_rn == dft.internal_rn), 1, dft['exit'])
1919
-
1920
- dft['exit_step'] = np.where(dft.exit == 1, dft.internal_rn, np.nan)
1921
- dft['exit_step'] = dft.sort_values(['Date']).groupby(['chain_id']).exit_step.transform('max')
1922
-
1923
- dft['flag'] = np.where(dft.internal_rn <= dft.exit_step, 1, 0)
1924
- dft = dft.drop(columns = ['open_strat', 'high_strat_ret', 'low_strat_ret','exit_step', 'exit','exit_type','high_exit','low_exit', 'max_internal_rn'])
1925
-
1926
- dft['lrets_strat'] = np.log(dft[asset_1].shift(-1)/dft[asset_1]) * dft['flag']
1927
- dft['lrets_strat'] = np.where(dft['lrets_strat'].isna(),-0.0,dft['lrets_strat'])
1928
- dft['lrets_prod'] = dft['lrets_strat'].cumsum()
1929
- dft['strat_prod_exp'] = np.exp(dft['lrets_prod']) - 1
1930
-
1931
- bench_rets = round(dft['bench_prod_exp'].values[-1]*100,1)
1932
- strat_rets = round(dft['strat_prod_exp'].values[-1]*100,1)
1933
-
1934
- bench_sr = round(sharpe_ratio(dft.bench_prod_exp.dropna()),1)
1935
- strat_sr = round(sharpe_ratio(dft.strat_prod_exp.dropna()),1)
1936
-
1937
- message1 = f'{bench_rets}%'
1938
- message2 = f'{strat_rets}%'
1939
-
1940
- messages = {
1941
- 'benchmark return:':message1,
1942
- 'benchmark sharpe ratio:': bench_sr,
1943
- 'strategy return:':message2,
1944
- 'strategy sharpe ratio:': strat_sr,
1945
- }
1946
- if self.show_plot:
1947
- print('----------------------------')
1948
- print(messages)
1949
- print('----------------------------')
1950
-
1951
- fig = plt.figure(1)
1952
- plt.plot(dft.bench_prod_exp.values, label = 'benchmark')
1953
- plt.scatter(range(len(dft)),np.where(dft[low_signal] == 1,dft.bench_prod_exp.values,np.nan),color = 'red', label = 'signal')
1954
- plt.plot(dft.strat_prod_exp.values, label = 'strategy')
1955
- plt.legend()
1956
- plt.title('strategy and cumulative returns based on signal strategy')
1957
- if self.show_plot:
1958
- plt.plot()
1959
-
1960
- if self.save_path:
1961
- result_json_name = f'signals_strategy_return_{feature_name}.json'
1962
- result_plot_name = f'signals_strategy_return_{feature_name}.png'
1963
-
1964
- plt.savefig(self.save_path+result_plot_name)
1965
- # pickle.dump(fig, open(self.save_path+result_plot_name, 'wb'))
1966
-
1967
- with open(self.save_path+result_json_name, "w") as outfile:
1968
- json.dump(messages, outfile)
1969
-
1970
- if self.save_path and self.save_aws:
1971
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
1972
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name,input_path = self.save_path+result_plot_name)
1973
-
1974
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
1975
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1976
-
1977
- if not self.show_plot:
1978
- plt.close()
1979
-
1980
- del df1,df2,dft
1981
-
1982
- if self.return_fig:
1983
- return fig, messages
1984
-
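
The benchmark and strategy figures printed above come from cumulative log returns: the benchmark is always invested, the strategy only earns the next day's return on rows where the flag is 1, and both curves are converted back to simple returns with exp(cumsum) - 1. A self-contained sketch of that bookkeeping, assuming a daily Close series and a 0/1 flag column (names illustrative):

    import numpy as np
    import pandas as pd

    def backtest_curves(close, flag):
        # close: daily closing prices; flag: 1 while the strategy holds the asset, 0 otherwise
        daily_lret = np.log(close / close.shift(1)).fillna(0.0)
        fwd_lret = np.log(close.shift(-1) / close).fillna(0.0)   # next-day return of a position opened today
        curves = pd.DataFrame(index=close.index)
        curves["bench"] = np.exp(daily_lret.cumsum()) - 1        # buy-and-hold equity curve
        curves["strategy"] = np.exp((fwd_lret * flag).cumsum()) - 1
        return curves

The Sharpe ratios reported next to the returns are then taken on these cumulative series via the package's sharpe_ratio helper.
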
1985
- def execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object, plot = False, backtest= False, exit_params = {}):
1986
-
1987
- method(**configuration)
1988
- signal_assess = signal_analyser_object(object_stock.df,object_stock.stock_code,show_plot = plot)
1989
- signal_assess.signal_analyser(test_size = test_data_size, feature_name = feature_name, days_list = days_list, threshold = 1)
1990
-
1991
- if backtest:
1992
- print('-----------------------back test ---------------------------')
1993
- signal_assess.create_backtest_signal(backtest, test_data_size, feature_name, **exit_params )
1994
-
1995
- return signal_assess.mean_median_return
1996
-
1997
- def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to_test, method, object_stock, signal_analyser_object, plot = True):
1998
-
1999
- results = list()
2000
- for key in arguments_to_test.keys():
2001
- configuration = arguments_to_test.get(key)
2002
- mean_median_return = execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object)
2003
- results.append(mean_median_return)
2004
-
2005
- df_result = pd.DataFrame({'keys':arguments_to_test.keys(),'results':results})
2006
- if plot:
2007
- plt.plot(df_result['keys'], df_result['results'])
2008
- plt.scatter(df_result['keys'], df_result['results'])
2009
- plt.title('simulation between configurations')
2010
- plt.ylabel('median expected return')
2011
- plt.show()
2012
-
2013
- best_result = df_result.sort_values('results',ascending = False)['keys'].values[0]
2014
- return best_result
2015
-
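
iterate_signal_analyser reduces to a simple pattern: score every configuration, plot the scores, and return the key with the best median expected return. The same pattern restated as a generic sketch, in which the evaluate callable stands in for execute_signal_analyser (all names here are illustrative):

    import pandas as pd
    import matplotlib.pyplot as plt

    def pick_best_configuration(arguments_to_test, evaluate, plot=True):
        # arguments_to_test: {label: kwargs}; evaluate: callable returning one score per configuration
        results = {key: evaluate(**config) for key, config in arguments_to_test.items()}
        df_result = pd.DataFrame({"keys": list(results), "results": list(results.values())})
        if plot:
            plt.plot(df_result["keys"], df_result["results"], marker="o")
            plt.title("simulation between configurations")
            plt.ylabel("median expected return")
            plt.show()
        return df_result.sort_values("results", ascending=False)["keys"].iloc[0]
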
2016
- class analyse_index(stock_eda_panel):
2017
- def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2018
-
2019
- """
2020
- data: pandas df
2021
- index: str name of the index
2022
- asset: str name of the asset
2023
- n_obs: int
2024
- lag: int
2025
- data_window: str eg 5y 10y 15y
2026
- show_plot: bool
2027
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
2028
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2029
- aws_credentials: dict
2030
- """
2031
-
2032
- self.index = index
2310
+ self.index_data = index_data
2033
2311
  self.asset = asset
2034
2312
  self.n_obs = n_obs
2035
2313
  self.data_window = data_window
2036
2314
  self.lag = lag
2037
-
2315
+
2038
2316
  self.show_plot = show_plot
2317
+ self.return_fig = return_fig
2039
2318
  self.save_path = save_path
2040
2319
  self.save_aws = save_aws
2041
-
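
For orientation, a hypothetical instantiation of the reworked class, assuming the new constructor keeps the removed signature but takes index_data (a ticker string, or a pre-merged DataFrame with a Date column and one column per index) in place of index, and adds a return_fig flag; the tickers below are illustrative only:

    # index passed as a ticker string; a DataFrame of pre-downloaded index prices works as well
    analyser = analyse_index(
        index_data="^GSPC",
        asset="AAPL",
        n_obs=1500,
        lag=3,
        data_window="5y",
        show_plot=False,
        return_fig=True,
    )
    analyser.process_data()   # populates analyser.merger_df with asset_return and ^GSPC_pct columns
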
2042
- def process_data(self):
2043
-
2044
- index = stock_eda_panel(self.index, self.n_obs, self.data_window)
2045
- index.get_data()
2046
- index.df['shift'] = index.df.Close.shift(self.lag)
2047
- index.df['index_return'] = index.df.Close/index.df['shift'] - 1
2048
2320
 
2049
- asset = stock_eda_panel(self.asset, self.n_obs, self.data_window)
2321
+ def process_data(self):
2322
+ """
2323
+ fetch the asset prices via stock_eda_panel and merge them with the index data (a pre-built DataFrame, or a ticker fetched the same way), storing the result in self.merger_df
2324
+
2325
+ Parameters
2326
+ ----------
2327
+ None
2328
+
2329
+ Returns
2330
+ -------
2331
+ None
2332
+ """
2333
+ asset = stock_eda_panel(self.asset, self.n_obs, data_window=self.data_window)
2050
2334
  asset.get_data()
2051
- asset.df['shift'] = asset.df.Close.shift(self.lag)
2052
- asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
2335
+ df = asset.df[['Date','Close']]
2053
2336
 
2054
- df1 = index.df[['Date','index_return']]
2055
- df2 = asset.df[['Date','asset_return','Close']]
2056
- merger = df1.merge(df2, on = 'Date', how = 'inner')
2057
- merger.dropna(inplace = True)
2058
- self.merger_df = merger
2059
-
2060
- def plot_betas(self,sample_size, offset, subsample_ts =False):
2061
-
2062
- ### extracting data
2337
+ if type(self.index_data) != str:
2338
+ df_merge = df.merge(self.index_data, on = ['Date'], how = 'left').sort_values('Date')
2339
+
2340
+ else:
2341
+ indx = stock_eda_panel(self.index_data, self.n_obs, data_window=self.data_window)
2342
+ indx.get_data()
2343
+ indx_df = indx.df[['Date','Close']].rename(columns = {'Close':self.index_data})
2344
+ df_merge = df.merge(indx_df, on = ['Date'], how = 'left').sort_values('Date')
2345
+
2346
+ for colx in ['Close'] + self.indexes:
2347
+ df_merge[f'{colx}_pct'] = df_merge[colx]/df_merge[colx].shift(self.lag) - 1
2348
+
2349
+ df_merge.dropna(inplace = True)
2350
+ self.merger_df = df_merge.rename(columns = {'Close_pct': 'asset_return'})
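
Outside the class, the same merge-and-return step can be reproduced in a few lines. The sketch below assumes two plain price frames with Date and Close columns and mirrors the shift-based percentage change used above (function and argument names are illustrative):

    import pandas as pd

    def merge_with_index(asset_df, index_df, index_name, lag):
        # asset_df / index_df: columns ['Date', 'Close']; the index close is renamed to its ticker
        idx = index_df[["Date", "Close"]].rename(columns={"Close": index_name})
        df = asset_df[["Date", "Close"]].merge(idx, on="Date", how="left").sort_values("Date")
        for col in ["Close", index_name]:
            df[f"{col}_pct"] = df[col] / df[col].shift(lag) - 1   # lag-period percentage change
        return df.dropna().rename(columns={"Close_pct": "asset_return"})
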
2063
2351
 
2064
- self.process_data()
2065
-
2066
- ### ploting analysis
2352
+ def plot_betas(self,sample_size, offset, subsample_ts =False, index = False):
2353
+ """
2354
+ display the beta analysis plot: scatter fits over the full sample and the recent sample, plus rolling-window betas along the price series
2355
+
2356
+ Parameters
2357
+ ----------
2358
+ sample_size (int): window size, in observations, used to compute each rolling beta
2359
+ offset (int): step between the starts of consecutive rolling windows (windows overlap when offset < sample_size)
2360
+ subsample_ts (int): if set, restrict the analysis to the most recent subsample_ts observations
2361
+
2362
+ Returns
2363
+ -------
2364
+ None
2365
+ """
2366
+ if (type(self.index_data) == str) & (index != False):
2367
+ raise Exception("No need of index argument")
2368
+ else:
2369
+ index = self.indexes[0]
2370
+
2371
+ index_pct = f'{index}_pct'
2372
+ ### plotting analysis
2067
2373
  figure, ax = plt.subplot_mosaic(
2068
2374
  [["scatter_total", "scatter_sample",'ts','ts']],
2069
2375
  layout="constrained",
2070
2376
  figsize=(18, 5)
2071
2377
  )
2072
-
2073
- ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df.index_return)
2074
- b, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2378
+
2379
+ ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df[index_pct])
2380
+
2381
+ huber_regr = HuberRegressor(fit_intercept = True)
2382
+ huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values.reshape(-1,1))
2383
+ b, a = huber_regr.coef_[0], huber_regr.intercept_
2384
+
2385
+ # b, a = np.polyfit(self.merger_df.asset_return, self.merger_df[index_pct], 1)
2075
2386
  ax['scatter_total'].plot(self.merger_df.asset_return, b*self.merger_df.asset_return+a, color='red')
2076
2387
 
2077
2388
  ax['ts'].plot(self.merger_df.Date, self.merger_df.Close, color = 'grey', alpha = 0.3)
2078
-
2389
+
2079
2390
  if subsample_ts:
2080
2391
  self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2081
-
2392
+
2082
2393
  for i in range(0,len(self.merger_df)-sample_size,offset):
2083
2394
 
2084
2395
  merger_ = self.merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
2085
- x = merger_.index_return
2396
+ x = merger_[index_pct]
2086
2397
  y = merger_.asset_return
2087
- b, a = np.polyfit(x,y, 1)
2088
-
2398
+ # b, a = np.polyfit(x,y, 1)
2399
+ huber_regr = HuberRegressor(fit_intercept = True)
2400
+ huber_regr.fit(x.values.reshape(-1,1), y.values.reshape(-1,1))
2401
+ b, a = huber_regr.coef_[0], huber_regr.intercept_
2402
+
2089
2403
  normalize = mcolors.Normalize(vmin=-1, vmax=1)
2090
2404
  colormap = cm.jet
2091
2405
 
@@ -2098,12 +2412,13 @@ class analyse_index(stock_eda_panel):
2098
2412
 
2099
2413
  scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
2100
2414
  scalarmappaple.set_array(x)
2101
-
2102
- plt.title(f'{self.asset} using index: {self.index}')
2415
+
2416
+ plt.title(f'{self.asset} using index: {index}')
2103
2417
  plt.colorbar(scalarmappaple)
2104
-
2418
+
2105
2419
  if self.show_plot:
2106
2420
  plt.show()
2421
+
2107
2422
  if self.save_path:
2108
2423
  result_plot_name = f'market_best_fit.png'
2109
2424
  figure.savefig(self.save_path+result_plot_name)
@@ -2111,80 +2426,50 @@ class analyse_index(stock_eda_panel):
2111
2426
  if self.save_path and self.save_aws:
2112
2427
  # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.asset}/'+result_plot_name,input_path = self.save_path+result_plot_name)
2113
2428
  upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
2429
+
2114
2430
  if not self.show_plot:
2115
- plt.close()
2116
-
2431
+ plt.close()
2432
+
2433
+ if self.return_fig:
2434
+ return figure
2435
+
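
The recurring fit in plot_betas is the substantive change in this release: the ordinary least-squares line previously obtained with np.polyfit is replaced by a Huber-loss fit, which keeps a quadratic penalty for small residuals but only a linear one for large ones, so a handful of extreme return observations barely moves the estimated slope. Isolated, the pattern looks roughly like this (a sketch, not the package's own helper):

    import numpy as np
    from sklearn.linear_model import HuberRegressor

    def robust_slope_intercept(x, y):
        # drop-in replacement for b, a = np.polyfit(x, y, 1), but robust to outliers
        model = HuberRegressor(fit_intercept=True)
        model.fit(np.asarray(x).reshape(-1, 1), np.asarray(y).ravel())
        return float(model.coef_[0]), float(model.intercept_)
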
2117
2436
  def get_betas(self,subsample_ts=False):
2118
-
2119
- self.process_data()
2120
- general_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2121
- general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
2122
-
2123
- self.process_data()
2124
- if subsample_ts:
2125
- self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2126
- sample_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2127
- sample_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
2128
-
2129
- result = {
2130
- 'general_beta':general_beta,
2131
- 'general_r':general_r,
2132
- 'sample_beta':sample_beta,
2133
- 'sample_r':sample_r
2134
- }
2135
-
2136
- self.states_result = result
2137
-
2138
- class evaluate_markets(analyse_index):
2139
- def __init__(self, stock_code, indexes):
2140
- self.stock_code = stock_code
2141
- self.indexes = indexes
2142
- def evaluate_best_market_fit(self,sample_size, offset,lag= 3, n_obs = 3500, verbose = False, plot_best = False):
2143
-
2144
- results_dicts = dict()
2437
+ """
2438
+ compute, for every index, the overall beta and (optionally) the beta over the most recent subsample, along with the Pearson correlation; results are stored in self.states_result
2439
+
2440
+ Parameters
2441
+ ----------
2442
+ subsample_ts (int): if set, also compute a beta and correlation over the most recent subsample_ts observations
2443
+
2444
+ Returns
2445
+ -------
2446
+ None
2447
+ """
2448
+ result = list()
2145
2449
  for index in self.indexes:
2146
- betex = analyse_index(index = index,asset = self.stock_code,n_obs = n_obs, lag = lag)
2147
- betex.get_betas(sample_size)
2148
- results_dicts[index] = betex.states_result
2149
- pd_result = pd.DataFrame(results_dicts).T
2150
- pd_result['gen_r2'] = pd_result.general_r ** 2
2151
- pd_result['sampl_r2'] = pd_result.sample_r ** 2
2152
- self.stat_results = pd_result
2153
-
2154
- best_result = pd_result.sort_values('gen_r2',ascending = False).head(2).sort_values('sampl_r2',ascending = False).head(1)
2155
- best_fit_index = best_result.index.values[0]
2156
-
2157
- self.stat_results = self.stat_results.drop(columns = ['gen_r2','sampl_r2'])
2158
-
2159
- if verbose:
2160
- print(best_result)
2161
- if plot_best:
2162
- betex = analyse_index(index = best_fit_index,asset = self.stock_code, n_obs = n_obs, lag = lag)
2163
- betex.plot_betas(sample_size = sample_size, offset = offset, subsample_ts = False)
2164
2450
 
2165
- self.best_result = best_result
2166
-
2167
- def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2168
- """
2169
- data_market: pandas df
2170
- ticket_name: str name of the asset
2171
- show_plot: bool
2172
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
2173
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2174
- aws_credentials: dict
2175
- """
2176
- all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
2177
- all_betas['gen_r2'] = all_betas.general_r ** 2
2178
- all_betas['sampl_r2'] = all_betas.sample_r ** 2
2179
- selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
2451
+ index_pct = f'{index}_pct'
2452
+ huber_regr = HuberRegressor(fit_intercept = True)
2453
+ huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values.reshape(-1,1))
2454
+ general_beta, a = huber_regr.coef_[0], huber_regr.intercept_
2455
+ general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df[index])[0]
2456
+
2457
+ dict_res = {
2458
+ 'index':index,
2459
+ 'general_beta':general_beta,
2460
+ 'general_r':general_r,
2461
+ }
2180
2462
 
2181
- if show_plot:
2182
- print(selection)
2183
- if save_path:
2184
- result_plot_name = f'market_best_fit.csv'
2185
- selection.to_csv(save_path+result_plot_name)
2186
-
2187
- if save_path and save_aws:
2188
- # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{ticket_name}/'+result_plot_name,input_path = save_path+result_plot_name)
2189
- upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + result_plot_name, input_path = save_path + result_plot_name, aws_credentials = aws_credentials)
2190
- return selection
2463
+ if subsample_ts:
2464
+ tmp_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2465
+ huber_regr = HuberRegressor(fit_intercept = True)
2466
+ huber_regr.fit(tmp_df.asset_return.values.reshape(-1,1), tmp_df[index_pct].values.reshape(-1,1))
2467
+ sample_beta, a = huber_regr.coef_[0], huber_regr.intercept_
2468
+ sample_r = stats.mstats.pearsonr(tmp_df.asset_return, tmp_df[index])[0]
2469
+ dict_res['sample_beta'] = sample_beta
2470
+ dict_res['sample_r'] = sample_r
2471
+
2472
+ result.append(dict_res)
2473
+
2474
+ self.states_result = result
2475
+
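
states_result is now a list with one dictionary per index rather than a single dict, so the ranking the removed evaluate_markets class used to perform (squared correlation first) can be re-created in a couple of lines; analyser below stands for an analyse_index instance on which process_data and get_betas have already been called:

    import pandas as pd

    betas = pd.DataFrame(analyser.states_result)
    betas["gen_r2"] = betas["general_r"] ** 2
    best_fit = betas.sort_values("gen_r2", ascending=False).iloc[0]
    print(best_fit["index"], best_fit["general_beta"])
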