virgo-modules 0.0.72__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virgo_modules/src/aws_utils.py +35 -3
- virgo_modules/src/backtester.py +474 -0
- virgo_modules/src/edge_utils/__init__.py +0 -0
- virgo_modules/src/edge_utils/conformal_utils.py +106 -0
- virgo_modules/src/edge_utils/edge_utils.py +502 -0
- virgo_modules/src/edge_utils/feature_selection.py +66 -0
- virgo_modules/src/edge_utils/shap_utils.py +54 -0
- virgo_modules/src/edge_utils/stack_model.py +94 -0
- virgo_modules/src/hmm_utils.py +494 -0
- virgo_modules/src/market/__init__.py +0 -0
- virgo_modules/src/market/market_tools.py +189 -0
- virgo_modules/src/re_utils.py +628 -85
- virgo_modules/src/ticketer_source.py +1278 -1066
- virgo_modules/src/transformer_utils.py +401 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.8.4.dist-info}/METADATA +16 -22
- virgo_modules-0.8.4.dist-info/RECORD +22 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.8.4.dist-info}/WHEEL +1 -1
- virgo_modules/src/edge_utils.py +0 -178
- virgo_modules-0.0.72.dist-info/RECORD +0 -12
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.8.4.dist-info/licenses}/LICENSE +0 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.8.4.dist-info}/top_level.txt +0 -0
```diff
@@ -1,7 +1,7 @@
 import yfinance as yf
 import pandas as pd
 import numpy as np
-import
+import gc
 
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
```
```diff
@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM
 
 from plotly.colors import DEFAULT_PLOTLY_COLORS
 
-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from feature_engine.imputation import MeanMedianImputer
 
```
```diff
@@ -48,88 +47,38 @@ from feature_engine.timeseries.forecasting import LagFeatures
 from feature_engine.imputation import MeanMedianImputer
 from feature_engine.discretisation import EqualWidthDiscretiser
 
+from sklearn.linear_model import HuberRegressor
+
 from .aws_utils import upload_file_to_aws
 
 import logging
 
-
-
-
-        self.prefix = prefix
+from virgo_modules.src.hmm_utils import trainer_hmm
+from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
+from virgo_modules.src.transformer_utils import FeaturesEntropy, VirgoWinsorizerFeature # imported because some models read this module; otherwise mlflow.load() crashed
 
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.features:
-            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
-        return X
-
-class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
-    def __init__(self, feature_configs):
-        self.feature_configs = feature_configs
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.feature_configs:
-            lower = self.feature_configs[feature]['min']
-            upper = self.feature_configs[feature]['max']
-            X[feature] = np.where(lower > X[feature], lower, X[feature])
-            X[feature] = np.where(upper < X[feature], upper, X[feature])
-        return X
-
-class FeatureSelector(BaseEstimator, TransformerMixin):
-    def __init__(self, columns):
-        self.columns = columns
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X[self.columns]
-
-def sharpe_ratio(return_series):
-    N = 255 # Trading days in the year (change to 365 for crypto)
-    rf = 0.005 # Half a percent risk-free rate
-    mean = return_series.mean() * N - rf
-    sigma = return_series.std() * np.sqrt(N)
-    sharpe = round(mean / sigma, 3)
-    return sharpe
-
-class signal_combiner(BaseEstimator, TransformerMixin):
-    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
-        self.columns = columns
-        self.drop = drop
-        self.prefix_up = prefix_up
-        self.prefix_low = prefix_low
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for column in self.columns:
-            X['CombSignal_' + column] = np.where(
-                X[self.prefix_up + column] == 1,
-                1,
-                np.where(
-                    X[self.prefix_low + column] == 1,
-                    1,
-                    0
-                )
-            )
-            if self.drop:
-                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
-        return X
-
 def data_processing_pipeline(features_base, features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
-
+
+    '''
+    create a scikit-learn pipeline object using different configurations and feature engineering blocks with a given flow
+
+    Parameters:
+        features_to_drop (list): list of features to drop
+        lag_dict (dict): feature dictionary with configurations to apply lags
+        combine_signals (list): list of columns/signals to combine
+        discretize_columns (list): list of features to discretize, bins is fixed
+        correlation (float): correlation score threshold for feature selection
+
+    Returns:
+        pipe (obj): pipeline object
+    '''
+
     lag_pipe_sec = [(f'lags_{key}', LagFeatures(variables = key, periods = lag_dict[key])) for key in lag_dict] if lag_dict else []
     drop_pipe = [('drop_features', DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
     merge = [('signal_combiner', signal_combiner(combine_signals))] if combine_signals else []
     discretize = [('discretize', EqualWidthDiscretiser(discretize_columns, bins = 20))] if discretize_columns else []
     drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation))] if correlation else []
-
+
     pipe = Pipeline(
         [('selector', FeatureSelector(features_base))] + \
         [('encoding', OneHotEncoder(top_categories=None, variables=['hmm_feature']))] + \
```
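The transformers removed in this hunk (`signal_combiner`, `FeatureSelector`, `VirgoWinsorizerFeature`) were not deleted outright: they moved to `virgo_modules/src/transformer_utils.py`, which the new imports above pull back in. As a minimal sketch (toy dataframe; column names taken from the removed code), this is the OR-combination that `signal_combiner.transform` applies to each paired signal column:

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({'signal_up_rsi':  [1, 0, 0, 0],
                  'signal_low_rsi': [0, 0, 1, 0]})

# OR-combine the up/low signal pair into one column, then drop the originals
column = 'rsi'
X['CombSignal_' + column] = np.where(X['signal_up_' + column] == 1, 1,
                            np.where(X['signal_low_' + column] == 1, 1, 0))
X = X.drop(columns=['signal_up_' + column, 'signal_low_' + column])
print(X)  # CombSignal_rsi == [1, 0, 1, 0]
```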
```diff
@@ -142,64 +91,172 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
     )
     return pipe
 
-
-    ## lengths
-    cluster_lengths = data.groupby(['hmm_feature','chain_id'], as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
-    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
-    ## means
-    def quantile2(x):
-        return x.quantile(0.25)
-    def quantile3(x):
-        return x.quantile(0.75)
-
-    cluster_returns = data.groupby('hmm_feature').agg(
-        n_uniques = ('chain_id','nunique'),
-        n_obs = ('Date','count'),
-        cluster_ret_q25 = ('chain_return', quantile2),
-        cluster_ret_median = ('chain_return','median'),
-        cluster_ret_q75 = ('chain_return', quantile3),
-    )
-    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
-    cluster_returns['perc_dispute'] = np.where(
-        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
-        1, 0
-    )
-    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
-    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
-    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
-    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
-    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1, cluster_returns['min_perc'], 0)
-    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
-    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
-    ## relevance or importance
-    # naive approach
-    cluster_returns['relevance'] = cluster_returns['abs_median'] + (0.5 - cluster_returns['min_overlap'])
-    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
-    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
-    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
-    mean_relevance = cluster_returns['abs_t_accpted'].mean()
-    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
-
-    return mean_relevance, cluster_returns, number_relevant_states
+class stock_eda_panel(object):
 
+    """
+    Class that initially gets stock data, then applies feature engineering, enrichment, analysis, plotting, model training, etc.
+
+    Attributes
+    ----------
+    stock_code : str
+        symbol of the asset
+    n_days : str
+        number of days to extract data
+    data_window : str
+        large window to extract data. A large window is required to extract more data, e.g. '5y', '10y', '15y'
+    df : pd.DataFrame
+        pandas dataframe of the asset data with features
+    strategy_log : pd.DataFrame
+        pandas dataframe with the results of the different tested strategies (result from the hmm strategy simulator)
+    best_strategy : list
+        features of the best performing strategy (result from the hmm strategy simulator)
+    top_10_strategy : dict
+        top 10 best performing strategies (result from the hmm strategy simulator)
+    settings : dict
+        configuration dictionary of the features and other parameters
+
+    Methods
+    -------
+    augmented_dickey_fuller_statistics(time_series=pd.Series, label=str):
+        perform the Dickey-Fuller (stationarity) test for a given time series and print the p-value of the feature
+    get_data():
+        get asset data, performing some data normalization and formatting (in the case of dates)
+    plot_series_returns(roll_mean_lags1=int, roll_mean_lags2=int):
+        display the time series with rolling means and rolling standard deviations of daily closing prices
+    seasonal_plot():
+        display the time series split by year
+    plot_price_signal(feature=str, feature_2=str, opacity=float):
+        display bottom and roof signals over the closing prices
+    volatility_analysis(lags=int, trad_days=int, window_log_return=int, plot=boolean, save_features=boolean):
+        perform log-return and volatility analysis of the closing prices
+    find_lag(feature=str, lag_list=list, column_target=str, posterior_lag=int, test_size=int):
+        display correlation curves (Spearman and Pearson) of a given feature at different time lags with respect to a given target
+    outlier_plot(zlim=float, plot=boolean, save_features=boolean):
+        perform outlier analysis of the log returns; also performs a normality test of returns
+    analysis_roll_mean_log_returns(lags=int, plot=boolean):
+        perform analysis of lags of the rolling mean log return
+    compute_clip_bands(feature_name=str, threshold=float):
+        compute outlier detection for a given signal. Note that this follows a mean-reversion procedure and the feature has to be stationary; the resulting bottom and roof signals are attached to the dataframe
+    extract_sec_data(symbol=str, base_columns=list(str), rename_columns=dict):
+        extract new asset data and merge it into the main asset data
+    lag_log_return(lags=int, feature=str, feature_name=str):
+        compute log return given some lags
+    produce_log_volatility(trad_days=int, feature=str, feature_name=str):
+        compute volatility
+    signal_plotter(feature_name=str):
+        display an analysis plot of a feature with high and low signals
+    log_features_standard(feature_name=str):
+        save resulting feature names in a standard structure
+    relative_spread_MA(ma1=int, ma2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative moving average features, one for short term and another for long/mid term
+    pair_feature(pair_symbol=str, plot=boolean):
+        initialize pair feature data extraction and analysis
+    calculate_cointegration(series_1=pd.Series, series_2=pd.Series):
+        calculate the cointegration score for two time series
+    bidirect_count_feature(rolling_window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform negative and positive return counting in a given rolling time window
+    get_relative_range_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative spread of opening and closing price
+    rsi_feature_improved(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative strength index
+    days_features_bands(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute mean returns for a given day of the week in a window scope per day
+    analysis_smooth_volume(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute a feature of trading volumes
+    roc_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform price rate of change
+    stoch_feature(window=int, smooth1=int, smooth2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform stochastic oscillator RSI feature
+    stochastic_feature(window=int, smooth=int, threshold=float, plot=boolean, save_features=boolean):
+        perform stochastic oscillator feature
+    william_feature(lbp=int, threshold=float, plot=boolean, save_features=boolean):
+        perform fast stochastic oscillator, or Williams indicator
+    vortex_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform vortex oscillator
+    minmax_pricefeature(type_func=str, window=int, distance=boolean, save_features=boolean):
+        get relative price/distance feature with respect to the min/max price in a given window
+    pair_index_feature(pair_symbol=str, feature_label=str, window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform an additional asset ROC feature; a new feature is created in the main dataframe
+    produce_order_features(feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index; this is useful to know the duration/persistence of a signal
+    compute_last_signal(feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index; this is useful to know the duration/persistence of a signal
+    create_hmm_derived_features():
+        create features derived from hmm state features: the index of the state, the duration of the state, and the chain return
+    cluster_hmm_analysis(n_clusters=int, features_hmm=list, test_data_size=int, seed=int, lag_returns_state=int, plot=boolean, save_features=boolean, model=obj):
+        create or use an hmm model
+    sharpe_ratio(return_series=pd.Series, n_trad_days=int, rf=float):
+        compute the Sharpe ratio of a given time series of returns
+    treat_signal_strategy(test_data=pd.DataFrame, strategy=list):
+        helper method that treats signals and converts them to 1 or 0
+    stategy_simulator(features=list, hmm_feature=boolean):
+        execute a strategy and get some performance metrics like Sharpe ratio and return
+    viz_strategy(strategy):
+        display an analysis plot of a given strategy
+    deep_dive_analysis_hmm(test_data_size=int, split=str):
+        display an analysis plot of the hmm model
+    get_targets(steps=int):
+        produce a regression target return taking future prices
+    get_categorical_targets(horizon=int, flor_loss=float, top_gain=float):
+        produce binary target returns taking future prices; it produces two targets, one for high returns and another for low returns
+    get_configurations(test_data_size=int, val_data_size=int, model_type=str):
+        produce the configuration dictionary saved by the feature generation methods when save_features was activated
+    """
 
-class stock_eda_panel(object):
-
     def __init__(self, stock_code, n_days, data_window = '5y'):
+
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        stock_code (str): symbol of the asset
+        n_days (str): number of days to extract data
+        data_window (str): large window to extract data. A large window is required to extract more data, e.g. '5y', '10y', '15y'
+
+        Returns
+        -------
+        None
+        """
+
         self.stock_code = stock_code
         self.n_days = n_days
         self.today = datetime.date.today()
         self.features = list()
         self.signals = list()
         self.data_window = data_window
-
+
     def augmented_dickey_fuller_statistics(self, time_series, label):
+        """
+        Perform the Dickey-Fuller (stationarity) test for a given time series
+        and print the p-value of the feature
+
+        Parameters
+        ----------
+        time_series (pd.Series): pandas series of the time series
+        label (str): feature name
+
+        Returns
+        -------
+        None
+        """
         result = adfuller(time_series.dropna().values)
         print('p-value: {} for the series {}'.format(round(result[1], 6), label))
-
+
     def get_data(self):
+        """
+        Get asset data, performing some data normalization and formatting (in the case of dates)
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
 
```
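The relevance helper removed above scored each HMM state with a naive t-statistic built from the median chain return, the IQR, and a benchmark variance term. A minimal standalone sketch of that arithmetic; `default_benchmark_sd` and `t_threshold` are free parameters in the removed code, shown here with illustrative values:

```python
import numpy as np

# Illustrative values; in the removed helper these come from groupby aggregates.
cluster_ret_median = 0.012   # median chain return of one HMM state
iqr = 0.03                   # q75 - q25 of the chain returns
n_obs = 120                  # observations in the state
default_benchmark_sd = 0.02  # benchmark variance term (free parameter)
t_threshold = 2.0            # acceptance threshold (free parameter)

# t_calc = (median - 0) / sqrt(iqr/n + benchmark_sd/n), as in the removed code
t_calc = (cluster_ret_median - 0) / (iqr / n_obs + default_benchmark_sd / n_obs) ** 0.5
print(round(t_calc, 3), abs(t_calc) > t_threshold)
```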
```diff
@@ -210,7 +267,7 @@ class stock_eda_panel(object):
         df.reset_index(inplace=True)
         df['Date'] = pd.to_datetime(df['Date'], format='mixed', utc=True).dt.date
         df['Date'] = pd.to_datetime(df['Date'])
-
+
         df = df[df.Date >= begin_date_str]
         self.settings_general = {
             'n_days':self.n_days,
@@ -219,44 +276,56 @@
             'execution_date': self.today.strftime('%Y-%m-%d')
         }
         self.df = df
-
+
         ### cleaning volume
         ### volume cleaning
         self.df['Volume'] = np.where(self.df['Volume'] <= 10, np.nan, self.df['Volume'])
         self.df['Volume'] = self.df['Volume'].fillna(method='bfill')
-
+
         ## filling
-
+
         base_columns_unit_test = ['Open','High','Low','Close','Volume']
         self.df[base_columns_unit_test] = self.df[base_columns_unit_test].fillna(method='ffill')
-
+
         ## cleaning nulls
-
+
         xs = self.df[base_columns_unit_test].isnull().sum()/self.df[base_columns_unit_test].count()
         reject_columns = list(xs[xs > 0.5].index.values)
-
+
         if len(reject_columns) > 0:
            logging.warning("the following columns have many nulls and are drop: {}".format(reject_columns))
            self.df = self.df.drop(columns = reject_columns)
-
-
+
     def plot_series_returns(self, roll_mean_lags1, roll_mean_lags2):
-
+
+        """
+        Display the time series with rolling means and rolling standard deviations of daily closing prices
+
+        Parameters
+        ----------
+        roll_mean_lags1 (int): short term window
+        roll_mean_lags2 (int): mid/long term window
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
         ### getting rolling mean
         df["Close_roll_mean"] = (
             df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(roll_mean_lags1, min_periods=1).mean())
         )
-
+
         df["Close_roll_mean_2"] = (
             df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(roll_mean_lags2, min_periods=1).mean())
         )
-
+
         ### getting rolling stdv
         df["Close_roll_std"] = (
             df.sort_values("Date")["Close"]
```
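`get_data` above rejects any OHLCV column whose null share (nulls over non-null count) exceeds 0.5. A minimal sketch of the same rule on a toy frame:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'Open': [1.0, np.nan, np.nan, np.nan],
                   'Close': [1.0, 1.1, 1.2, 1.3]})

# share of nulls relative to non-null counts, as in get_data
xs = df.isnull().sum() / df.count()
reject_columns = list(xs[xs > 0.5].index.values)
df = df.drop(columns=reject_columns)  # drops 'Open' here
print(reject_columns)
```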
```diff
@@ -273,7 +342,7 @@
         ))
 
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close, marker_color = 'blue', name='Price'), row=1, col=1)
-
+
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean, marker_color = 'black', name='roll mean'), row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean_2, marker_color = 'grey', name='roll mean 2'), row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.lower, marker_color = 'pink', legendgroup='bound', name='bound'), row=1, col=1)
@@ -281,8 +350,21 @@
 
         fig.update_layout(height=500, width=1200, title_text=f"stock {self.stock_code} vizualization")
         fig.show()
-
+
     def seasonal_plot(self):
+
+        """
+        Display the time series split by year
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         years = list(df['Date'].dt.year.unique())
         years.sort()
```
```diff
@@ -302,10 +384,24 @@
 
         fig.update_layout(height=500, width=1400, title_text=f"stock {self.stock_code} seasonal vizualization")
         fig.show()
-
+
     def plot_price_signal(self, feature, feature_2 = '', opacity = 0.3):
-
-
+
+        """
+        Display bottom and roof signals over the closing prices
+
+        Parameters
+        ----------
+        feature (str): name of the main feature to plot
+        feature_2 (str): name of the alternative feature to plot
+        opacity (float): opacity degree of the signal points
+
+        Returns
+        -------
+        None
+        """
+
+        signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
         signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
         norm_list = [f'norm_{feature}', f'z_{feature}', feature]
 
@@ -315,14 +411,14 @@
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[norm_feat], legendgroup="up", mode='lines', name = norm_feat, marker_color = 'blue'), col = 1, row = 1)
                 break
-
-
+
+
         fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df['Close'], mode='lines', name = 'history', marker_color = 'grey'), col = 1, row = 2)
-
+
         if feature == 'MA_spread':
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma1_column], legendgroup="ma", mode='lines', name = self.ma1_column, marker_color = 'black'), col = 1, row = 2)
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma2_column], legendgroup="ma", mode='lines', name = self.ma2_column, marker_color = 'grey'), col = 1, row = 2)
-
+
         for norm_feat in norm_list:
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=np.where(self.df[norm_feat] > 0, self.df['Close'], np.nan), legendgroup="up", mode='markers', name = 'up', marker_color = 'green', opacity = opacity), col = 1, row = 2)
```
```diff
@@ -338,8 +434,25 @@
 
         fig.update_layout(height=900, width=1200)
         fig.show()
-
+
     def volatility_analysis(self, lags, trad_days, window_log_return, plot = False, save_features = False):
+
+        """
+        Perform log-return and volatility analysis of the closing prices
+
+        Parameters
+        ----------
+        lags (int): number of lags to apply to the closing prices
+        trad_days (int): number of trading days to annualize returns or volatility
+        window_log_return (int): window for rolling returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         df['log_return'] = np.log(df.Close/df.Close.shift(lags))
         df['sqr_log_return'] = np.square(df.log_return)
```
```diff
@@ -349,13 +462,13 @@
             df.sort_values("Date")["log_return"]
             .transform(lambda x: x.rolling(window_log_return, min_periods=1).mean())
         )
-
+
         if save_features:
             self.features.append('volatility_log_return')
             self.features.append('roll_mean_log_return')
             self.features.append('log_return')
             self.settings_volatility = {'lags':lags, 'trad_days':trad_days, 'window_log_return':window_log_return}
-
+
         if plot:
             fig = make_subplots(rows=3, cols=1, vertical_spacing = 0.02, shared_xaxes=True,
                 specs=[
```
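For context on `volatility_analysis`, a minimal sketch of the log-return arithmetic it applies, with the rolling standard deviation annualized by sqrt(252) the way `produce_log_volatility` does later in this diff (synthetic prices):

```python
import numpy as np
import pandas as pd

close = pd.Series([100.0, 101.0, 99.5, 102.0, 103.5])

# one-lag log return, as in volatility_analysis
log_return = np.log(close / close.shift(1))

# rolling standard deviation annualized by sqrt(252)
volatility = log_return.rolling(window=3).std() * np.sqrt(252)
print(volatility.round(3))
```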
```diff
@@ -395,10 +508,25 @@
 
         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
-
-
+
     def find_lag(self, feature, lag_list, column_target = 'log_return', posterior_lag = 4, test_size = 350):
 
+        """
+        Display correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
+
+        Parameters
+        ----------
+        feature (str): feature name to apply lags
+        lag_list (list): list of lags, each lag as an integer
+        column_target (str): target to get correlation, e.g. return or mean return
+        posterior_lag (int): for the target, posterior window shift to calculate a window return
+        test_size (int): size of the test data; the remainder is used as training data. This parameter is meant to avoid overfitting and leakage
+
+        Returns
+        -------
+        None
+        """
+
         results = dict()
         df = self.df.iloc[:-test_size,:][['Date','Close','roll_mean_log_return','log_return',feature]].sort_values('Date').copy()
         for i, lag in enumerate(lag_list):
@@ -413,7 +541,7 @@
                 'lag':lag,
                 'pearsonr_log_return':r_log[0],
                 'spearman_log_return': sp_log[0],
-             }
+            }
         del df
         results_df = pd.DataFrame(results).T
 
```
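`find_lag` compares a lagged feature against a return target under both correlation measures. A minimal sketch of that loop with `scipy.stats` (synthetic data; column names are illustrative):

```python
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
df = pd.DataFrame({'feature': rng.normal(size=300)})
df['log_return'] = 0.3 * df['feature'] + rng.normal(scale=0.5, size=300)

for lag in [1, 2, 5]:
    # shift the feature back by `lag` days, then correlate with the target
    pair = pd.concat([df['feature'].shift(lag), df['log_return']], axis=1).dropna()
    r, _ = stats.pearsonr(pair.iloc[:, 0], pair.iloc[:, 1])
    sp, _ = stats.spearmanr(pair.iloc[:, 0], pair.iloc[:, 1])
    print(lag, round(r, 3), round(sp, 3))
```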
```diff
@@ -426,10 +554,23 @@
             plt.legend()
             plt.axhline(y=0, color='grey', linestyle='--')
             plt.show()
-
-
+
     def outlier_plot(self, zlim, plot = False, save_features = False):
-
+
+        """
+        Perform outlier analysis of the log returns; also performs a normality test of returns
+
+        Parameters
+        ----------
+        zlim (float): alpha or z threshold for normalized returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         mean = self.df.log_return.mean()
         std = self.df.log_return.std()
         self.df['z_log_return'] = (self.df.log_return - mean)/std
```
```diff
@@ -440,7 +581,7 @@
         self.df['up_outlier'] = zlim*self.df['z_std_log_return'] + mean_
         self.df['low_outlier'] = -zlim*self.df['z_std_log_return'] + mean_
 
-        self.df['
+        self.df['signal_low_outlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
         self.df['signal_up_outlier'] = np.where( (self.df['z_log_return'] > self.df['up_outlier'] ), 1, 0)
         if save_features:
             self.signals.append('signal_low_outlier')
@@ -451,7 +592,7 @@
         sigma = self.df['z_log_return'].std()
         x = np.linspace(self.df['z_log_return'].min(), self.df['z_log_return'].max(), 15000)
         y = stats.norm.pdf(x, loc = mu, scale = sigma)
-
+
         fig, axs = plt.subplots(2, 1, figsize=(15,8))
 
         axs[0].hist(self.df['z_log_return'], density = True, bins = 100, label = 'Returns distribution')
@@ -460,7 +601,7 @@
         axs[0].axvline(l2, color='green', linestyle='--')
         axs[0].axvline(-l2, color='green', linestyle='--')
         axs[0].plot(x, y, linewidth = 3, color = 'r', label = 'Normal Dist Curve')
-
+
         axs[1].plot(self.df['Date'], self.df['z_log_return'])
         axs[1].plot(self.df['Date'], self.df['low_outlier'], linestyle='--')
         axs[1].plot(self.df['Date'], self.df['up_outlier'], linestyle='--')
```
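`outlier_plot` ends with `scipy.stats.normaltest` and rejects normality when p < 0.05, as the hunk below shows. A minimal sketch of that decision on synthetic heavy-tailed returns:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
z_returns = rng.standard_t(df=3, size=1000)  # heavy-tailed, like real returns

z_stat, p_stat = stats.normaltest(z_returns)
if p_stat < 0.05:
    print(f'pvalue: {round(p_stat, 7)} -> returns do not look normally distributed')
else:
    print(f'pvalue: {round(p_stat, 7)} -> returns look normally distributed')
```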
```diff
@@ -469,18 +610,31 @@
             plt.show()
 
         z_stat, p_stat = stats.normaltest(self.df['z_log_return'].dropna())
-         p_stat = round(p_stat, 7)
+        p_stat = round(p_stat, 7)
         print('---------------------- returns normality tests ----------------------------')
         if p_stat < 0.05:
             print(f'pvalue: {p_stat} then, returns do not follow a normal distribution')
         else:
             print(f'pvalue: {p_stat} then, returns follow a normal distribution')
-
+
     def analysis_roll_mean_log_returns(self, lags, plot = False):
 
+        """
+        Perform analysis of lags of the rolling mean log return
+
+        Parameters
+        ----------
+        lags (int): lags to apply to the rolling log return
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.df['lag'] = self.df.roll_mean_log_return.shift(lags)
         self.df['Diff'] = self.df['roll_mean_log_return'] - self.df['lag']
-
+
         if plot:
 
             fig, axs = plt.subplots(1, 3, figsize=(19,4))
@@ -493,7 +647,20 @@
             plt.show()
 
     def compute_clip_bands(self, feature_name, threshold):
-
+
+        """
+        Compute outlier detection for a given signal. Note that this follows a mean-reversion procedure and the feature has to be stationary; the resulting bottom and roof signals are attached to the dataframe
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+        threshold (float): alpha or z threshold for normalized returns
+
+        Returns
+        -------
+        None
+        """
+
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/self.df[feature_name].std()
         mean_ = self.df[f'norm_{feature_name}'].mean()
 
```
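`compute_clip_bands` normalizes a stationary feature and flags points outside its bands; the method derives the bands from a rolling standard deviation, while this minimal sketch uses fixed ±threshold bands to keep the idea visible:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
feature = pd.Series(rng.normal(size=500))

# z-normalize the feature, as compute_clip_bands does
norm = (feature - feature.mean()) / feature.std()
threshold = 1.95

signal_up = np.where(norm > threshold, 1, 0)    # roof signal
signal_low = np.where(norm < -threshold, 1, 0)  # bottom signal
print(signal_up.sum(), signal_low.sum())
```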
```diff
@@ -506,84 +673,140 @@
         self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
 
+    def extract_sec_data(self, symbol, base_columns, rename_columns=None):
+        """
+        Extract new asset data and merge it into the main asset data
+
+        Parameters
+        ----------
+        symbol (str): symbol to extract data
+        base_columns (list): list of columns to persist
+        rename_columns (dict): map of the new column names using pd.DataFrame.rename()
+
+        Returns
+        -------
+        None
+        """
+        begin_date = self.today - relativedelta(days = self.n_days)
+        begin_date_str = begin_date.strftime('%Y-%m-%d')
+
+        stock = yf.Ticker(symbol)
+        df = stock.history(period=self.data_window)
+        df = df.sort_values('Date')
+        df.reset_index(inplace=True)
+        df['Date'] = pd.to_datetime(df['Date'], format='mixed', utc=True).dt.date
+        df['Date'] = pd.to_datetime(df['Date'])
+        df = df[df.Date >= begin_date_str]
+        df = df[base_columns]
+        if rename_columns:
+            df = df.rename(columns=rename_columns)
+        right_df = df.copy()
+
+        dates_vector = self.df.Date.to_frame()
+        right_df = dates_vector.merge(right_df, on ='Date', how = 'left')
+        right_df = right_df.fillna(method = 'bfill')
+        right_df = right_df.fillna(method = 'ffill')
+
+        self.df = self.df.merge(right_df, on ='Date', how = 'left')
+        self.df = self.df.sort_values("Date")
+        del right_df
+        gc.collect()
+
+    def lag_log_return(self, lags, feature, feature_name=False):
+        """
+        Compute log return given some lags
+
+        Parameters
+        ----------
+        lags (int): lag to apply to the log return
+        feature (str): feature to apply the log return to
+        feature_name (str): resulting feature name
+
+        Returns
+        -------
+        None
+        """
+
+        feature_name = feature_name if feature_name else f"{feature}_log_return"
+        self.df[feature_name] = np.log(self.df[feature]/self.df[feature].shift(lags))
+
+    def produce_log_volatility(self, trad_days, feature, feature_name=False):
+        """
+        Compute volatility of a feature over a rolling window
+
+        Parameters
+        ----------
+        trad_days (int): window used to calculate the standard deviation
+        feature (str): feature to apply the computation to
+        feature_name (str): resulting feature name
+
+        Returns
+        -------
+        None
+        """
+        feature_name = feature_name if feature_name else f"{feature}_log_return_{trad_days}"
+        self.df[feature_name] = self.df.sort_values("Date")[feature].rolling(window = trad_days).std()*np.sqrt(252)
+
     def signal_plotter(self, feature_name):
+
+        """
+        Display an analysis plot of a feature with high and low signals
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
+
         fig, axs = plt.subplots(1, 3, figsize=(17,5))
-
+
         axs[0].plot(self.df[f'upper_{feature_name}'], color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'lower_{feature_name}'], color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'norm_{feature_name}'])
-
+
         plot_acf(self.df[feature_name].dropna(), lags=25, ax = axs[1])
         axs[1].set_title(f'acf {feature_name}')
-
+
         plot_pacf(self.df[feature_name].dropna(), lags=25, ax = axs[2])
         axs[2].set_title(f'pacf {feature_name}')
-
+
         fig.show()
 
     def log_features_standard(self, feature_name):
+        """
+        Save resulting feature names in a standard structure
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
         self.features.append(feature_name)
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
-
-    #######################
-    #### to be deprecated ####
-    def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
-
-        self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
-        self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
-
-        self.ma1_column = f'MA_{ma1}'
-        self.ma2_column = f'MA_{ma2}'
-        self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
-
-        self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
-        mean_ = self.df['norm_MA_spread'].mean()
-        self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
-
-        self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
-        self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
-
-        self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
-        self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
-        ### plotting purposes
-        self.df[f"Roll_mean_{ma1}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
-        )
-        self.df[f"Roll_mean_{ma2}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
-        )
-
-
-        print('--------------------------------------------------------------------')
-        if save_features:
-            self.features.append('MA_spread')
-            self.signals.append('signal_low_MA_spread')
-            self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
-        if plot:
-
-            fig, axs = plt.subplots(1, 3, figsize=(21,4))
-
-            axs[0].plot(self.df['Date'], self.df['norm_MA_spread'])
-            axs[0].plot(self.df['Date'], self.df['upper_MA_spread'], linestyle='--')
-            axs[0].plot(self.df['Date'], self.df['lower_MA_spread'], linestyle='--')
-            axs[0].set_title('MA_spread series')
 
-            plot_acf(self.df['MA_spread'].dropna(), lags=25, ax=axs[1])
-            axs[1].set_title('acf MA_spread series')
-
-            plot_pacf(self.df['MA_spread'].dropna(), lags=25, ax=axs[2])
-            axs[2].set_title('acf MA_spread series')
-            plt.show()
-    ##################################################
-
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
-
+        """
+        Perform relative moving average features, one for short term and another for long/mid term
+
+        Parameters
+        ----------
+        ma1 (int): short term moving average window
+        ma2 (int): long/mid term moving average window
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'rel_MA_spread'
 
         self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
```
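The new `extract_sec_data` aligns a secondary symbol onto the main frame's trading dates before merging, back- and forward-filling the gaps. A minimal sketch of that align-and-fill pattern, with toy frames standing in for the yfinance download:

```python
import pandas as pd

main = pd.DataFrame({'Date': pd.date_range('2024-01-01', periods=5, freq='D'),
                     'Close': [10, 11, 12, 13, 14]})
sec = pd.DataFrame({'Date': pd.to_datetime(['2024-01-02', '2024-01-04']),
                    'sec_close': [100.0, 104.0]})

# reindex the secondary series onto the main date vector, then fill gaps
dates_vector = main.Date.to_frame()
right_df = dates_vector.merge(sec, on='Date', how='left')
right_df = right_df.bfill().ffill()  # fillna(method=...) is deprecated in newer pandas

main = main.merge(right_df, on='Date', how='left')
print(main)
```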
```diff
@@ -605,16 +828,27 @@
             .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
         )
 
-        print('--------------------------------------------------------------------')
         if save_features:
             self.log_features_standard(feature_name)
-             self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
+            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
 
         if plot:
-
             self.signal_plotter(feature_name)
-
+
     def pair_feature(self, pair_symbol, plot = False):
+        """
+        Initialize pair feature data extraction and analysis
+
+        Parameters
+        ----------
+        pair_symbol (str): symbol of the pair asset to extract
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.pair_symbol = pair_symbol
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
@@ -627,7 +861,7 @@
         df['Date'] = pd.to_datetime(df['Date'])
         df = df[df.Date >= begin_date_str]
         self.pair_df = df
-
+
         #### converting the same index ####
         dates_vector = self.df.Date.to_frame()
         self.pair_df = dates_vector.merge(self.pair_df, on ='Date', how = 'left')
```
```diff
@@ -653,8 +887,40 @@
             plt.plot(self.df['Date'], asset_2_values, label = asset_2)
             plt.legend()
             plt.show()
-
+
+    def smooth_logrets_interaction_term(self, feature_interact_with, resulting_feature_name="persisted_clip_diff_smooths", rollmean_window = 5, ext_threhold=0.015, persist_days = 3, save_features=False):
+        """
+        Create an interaction term that compares the distance between the asset rolling-window mean and the market rolling-window mean,
+        then keeps the outliers (high absolute values) and persists them for some days.
+        Goal: persist big differences between market and asset returns.
+
+        feature_interact_with: name of the market return
+        rollmean_window: rolling window or smoothing number of days
+        ext_threhold: threshold
+        persist_days: number of days to persist the signal
+        """
+        self.df["smooth_log_return"] = self.df['log_return'].rolling(rollmean_window).mean().values
+        self.df["smooth_market_log_return"] = self.df[feature_interact_with].rolling(rollmean_window).mean().values
+        self.df["diff_smooths"] = self.df["smooth_market_log_return"] - self.df["smooth_log_return"]
+        self.df["clip_diff_smooths"] = np.where(np.abs(self.df["diff_smooths"]) > ext_threhold, self.df["diff_smooths"], 0)
+        self.df[resulting_feature_name] = self.df['clip_diff_smooths'].rolling(persist_days).mean().values
+        self.df = self.df.drop(columns=["smooth_log_return","smooth_market_log_return","diff_smooths","clip_diff_smooths"])
+
     def calculate_cointegration(self, series_1, series_2):
+        """
+        Calculate the cointegration score for two time series
+
+        Parameters
+        ----------
+        series_1 (pd.Series): time series
+        series_2 (pd.Series): time series
+
+        Returns
+        -------
+        coint_flag (boolean): 1 if the p-value is below 0.05 and the cointegration t-statistic is below the critical value
+        hedge_value (float): beta from the regression model
+        """
+
         coint_flag = 0
         coint_res = coint(series_1, series_2)
         coint_t = coint_res[0]
```
```diff
@@ -666,9 +932,22 @@
         coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0
 
         return coint_flag, hedge_value
-
-    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
 
+    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
+        """
+        Display analysis of the pair feature and save the results if needed
+
+        Parameters
+        ----------
+        window (int): window to apply to the rolling spread between the pair and the main asset
+        z_threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         spread_series = pd.Series(self.df.pair_spread)
         mean = spread_series.rolling(center = False, window = window).mean()
         std = spread_series.rolling(center = False, window = window).std()
```
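`calculate_cointegration` wraps the Engle-Granger test from `statsmodels` and flags a pair only when the p-value and the t-statistic both clear their thresholds. A minimal sketch of the same check on synthetic cointegrated series:

```python
import numpy as np
from statsmodels.tsa.stattools import coint

rng = np.random.default_rng(3)
common = np.cumsum(rng.normal(size=500))            # shared random walk
series_1 = common + rng.normal(scale=0.5, size=500)
series_2 = 0.8 * common + rng.normal(scale=0.5, size=500)

coint_t, p_value, critical_values = coint(series_1, series_2)
critical_value = critical_values[1]  # 5% critical value
coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0
print(coint_flag)
```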
```diff
@@ -677,11 +956,11 @@
         self.df['pair_z_score'] = z_score
         self.df['signal_low_pair_z_score'] = np.where(self.df['pair_z_score'] < -z_threshold, 1, 0)
         self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
-
+
         if save_features:
             self.log_features_standard('pair_z_score')
-             self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
-
+            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
+
         if plot:
             pvalue = round(adfuller(z_score.dropna().values)[1], 4)
             print(f'p value of the rolling z-score is {pvalue}')
@@ -695,7 +974,7 @@
             axs[0,0].axhline(y=0, color='blue', linestyle='-.')
             axs[0,0].plot(self.df.pair_z_score)
             axs[0,0].set_title('z score from the spread')
-
+
             axs[0,1].plot(self.df['Date'], self.df['pair_spread'])
             axs[0,1].plot(self.df['Date'], np.where(self.df['signal_low_pair_z_score'] == 1, self.df['pair_spread'], np.nan), 'o-r', color = 'red')
             axs[0,1].plot(self.df['Date'], np.where(self.df['signal_up_pair_z_score'] == 1, self.df['pair_spread'], np.nan), 'o-r', color = 'green')
```
```diff
@@ -704,44 +983,27 @@
 
             plot_acf(self.df['pair_z_score'].dropna(), lags=25, ax=axs[1,0])
             axs[1,0].set_title('acf pair_z_score')
-
+
             plot_pacf(self.df['pair_z_score'].dropna(), lags=25, ax=axs[1,1])
             axs[1,1].set_title('pacf pair_z_score')
-
-            plt.show()
-
-    #######################
-    #### to be deprecated ####
-    def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
 
-        # negative counting and rolling counting
-        self.df['RetClose'] = self.df['Close'].pct_change()
-        self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
-        self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
-
-        mean = self.df['roll_pos_counting'].mean()
-        std = self.df['roll_pos_counting'].std()
-        self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std
-
-        self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
-        self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
-        if save_features:
-            self.features.append('roll_pos_counting')
-            self.signals.append('signal_up_roll_pos_counting')
-            self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
-        if plot:
-            fig = plt.figure(figsize = (10,4))
-            plt.plot(self.df['Date'], self.df.norm_counting)
-            plt.axhline(y=threshold, color='grey', linestyle='--')
-            plt.axhline(y=-threshold, color='grey', linestyle='--')
             plt.show()
-
-
+
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
+        """
+        Perform negative and positive return counting in a given rolling time window
+
+        Parameters
+        ----------
+        rolling_window (int): window to apply to positive and negative returns
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'bidirect_counting'
         # negative counting and rolling counting
         self.df['RetClose'] = self.df['Close'].pct_change()
@@ -757,7 +1019,7 @@
 
         if save_features:
             self.log_features_standard(feature_name)
-             self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
+            self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
 
         if plot:
             fig = plt.figure(figsize = (10,4))
```
```diff
@@ -766,47 +1028,21 @@
             plt.plot(self.df['Date'], self.df[f'lower_{feature_name}'], linestyle='--')
             plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
-
-        self.df["Range"] = self.df["High"] / self.df["Low"] - 1
-        self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
-        self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
-        self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
-
-        mean_ = self.df['norm_dist_range'].mean()
-        self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
-        self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
-
-        self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
-        self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
-        if save_features:
-            self.features.append('dist_range')
-            self.signals.append('signal_up_dist_range')
-            self.signals.append('signal_low_dist_range')
-            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
-
-        if plot:
-            fig, axs = plt.subplots(2, 2, figsize=(17,11))
-
-            axs[0,0].plot(self.df['Range'])
-            axs[0,0].set_title('range')
-
-            axs[0,1].plot(self.df['Avg_range'])
-            axs[0,1].set_title('Avg_range')
-
-            axs[1,0].plot(self.df['up_bound_norm_dist_range'], color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['low_bound_norm_dist_range'], color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['norm_dist_range'])
-            axs[1,0].set_title('norm_dist_range')
-    #######################
-
     def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
-
+        """
+        Perform relative spread of opening and closing price
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z threshold for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'CO_Range'
         self.df[feature_name] = self.df["Close"] / self.df["Open"] - 1
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/ self.df[feature_name].std()
```
@@ -822,7 +1058,7 @@ class stock_eda_panel(object):

         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_relative_price_range = {'window':window, 'threshold':threshold}
+            self.settings_relative_price_range = {'window':window, 'threshold':threshold}

         if plot:
             fig, axs = plt.subplots(1, 2,figsize=(14,5))
@@ -835,46 +1071,24 @@ class stock_eda_panel(object):
             axs[1].plot(self.df[f'norm_{feature_name}'])
             axs[1].set_title(f'norm_{feature_name}')

-    #######################
-    #### to be deprecated ####
-    def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
-
-        rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df['RSI'] = rsi
-        self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
-
-        mean = self.df['RSI_ret'].mean()
-        std = self.df['RSI_ret'].std()
-        self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
-        self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
-        self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
-
-        if save_features:
-            self.features.append('RSI_ret')
-            self.signals.append('signal_up_RSI_ret')
-            self.signals.append('signal_low_RSI_ret')
-            self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df.norm_RSI_ret)
-            axs[0].axhline(y=threshold, color='grey', linestyle='--')
-            axs[0].axhline(y=-threshold, color='grey', linestyle='--')
-
-            plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf RSI_ret')
-
-            plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf RSI_ret')
-
-            fig.show()
-    #######################
-
     def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
+        """
+        perform relative strength index
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'RSI'
         rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df[feature_name] = rsi
+        self.df[feature_name] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
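
The recurring change in this version is the indicator cleanup chain `replace([np.inf, -np.inf], 0).fillna(method = 'ffill')`, applied to every `ta` indicator series before clip bands are computed. A standalone sketch of what that chain does (the series is invented; `fillna(method="ffill")` matches the pandas API the diff targets, though newer pandas prefers `ffill()`):

    import numpy as np
    import pandas as pd

    raw = pd.Series([np.nan, 1.2, np.inf, -np.inf, 0.8, np.nan])
    clean = raw.replace([np.inf, -np.inf], 0).fillna(method="ffill")
    # leading NaN stays (nothing to fill from), infinities become 0, trailing NaN carries 0.8
    print(clean.tolist())  # [nan, 1.2, 0.0, 0.0, 0.8, 0.8]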
@@ -883,54 +1097,22 @@ class stock_eda_panel(object):

         if plot:
             self.signal_plotter(feature_name)
-
-    #######################
-    #### to be deprecated ####
-    def days_features(self, window_day, limit, plot = False, save_features = False):
-
-        self.df['dow'] = self.df.Date.dt.dayofweek
-        self.df['dow'] = self.df['dow'].astype('str')
-
-        self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
-
-        mean = self.df['target_mean_input'].mean()
-        std = self.df['target_mean_input'].std()
-
-        self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
-        mean_ = self.df['norm_dow_input'].mean()
-        self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()

-        self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
-        self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
-
-        self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
-        self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
-
-        if save_features:
-
-            self.features.append('target_mean_input')
-            self.signals.append('signal_up_target_mean_input')
-            self.signals.append('signal_low_target_mean_input')
-            self.settings_days_features = {'window_day':window_day, 'limit':limit}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df['norm_dow_input'])
-            axs[0].plot(self.df['up_dow_input'], linestyle='--')
-            axs[0].plot(self.df['low_dow_input'], linestyle='--')
-
-            plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf day feature')
-
-            plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf day feature')
-
-            fig.show()
-    #######################
-
     def days_features_bands(self, window, threshold, plot = False, save_features = False):
-
+        """
+        compute mean returns for a given day of the week in a window scope per day
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         self.df['dow'] = self.df.Date.dt.dayofweek
         self.df['dow'] = self.df['dow'].astype('str')

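
The day-of-week feature keys a rolling mean of returns on the weekday via a grouped transform, the same idiom the removed method used. A minimal sketch of that grouped rolling pattern on synthetic data (column names are illustrative, not the package's):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "Date": pd.date_range("2024-01-01", periods=60, freq="B"),
        "log_return": np.random.default_rng(0).normal(0, 0.01, 60),
    })
    df["dow"] = df["Date"].dt.dayofweek.astype(str)

    # rolling mean of returns computed separately for each weekday
    df["dow_mean"] = (
        df.sort_values("Date")
          .groupby("dow")["log_return"]
          .transform(lambda x: x.rolling(10, min_periods=1).mean())
    )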
@@ -947,65 +1129,22 @@ class stock_eda_panel(object):

         if plot:
             self.signal_plotter(feature_name)
-
-    #######################
-    #### to be deprecated ####
-    def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
-
-        self.df['log_Volume'] = np.log(self.df['Volume'])
-        self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
-
-        self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
-        mean_ = self.df['norm_ret_log_Volume'].mean()
-        self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
-        self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
-
-        self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
-        self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
-
-        if save_features:
-            self.features.append('ret_log_Volume')
-            self.signals.append('signal_up_ret_log_Volume')
-            self.signals.append('signal_low_ret_log_Volume')
-            self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
-        if plot:
-            fig, axs = plt.subplots(3, 2,figsize=(11,13))
-            axs[0,0].plot(self.df.Date, self.df.Volume)
-            axs[0,0].set_title('Volume')
-            axs[0,1].plot(self.df.Date, self.df.log_Volume)
-            axs[0,1].set_title('log Volume')
-
-            plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
-            axs[1,0].set_title('acf log_Volume')
-            plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
-            axs[1,1].set_title('pacf log_Volume')
-
-            plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
-            axs[2,0].set_title('acf ret_log_Volume')
-            plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
-            axs[2,1].set_title('pacf ret_log_Volume')

-            plt.show()
-
-            print('--------------------------------------------------------------')
-
-            fig, axs = plt.subplots(1, 2,figsize=(10,4))
-
-            axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
-            axs[0].set_title('norm_ret_log_Volume')
-
-            axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
-            axs[1].set_title('std_norm_ret_log_Volume')
-
-            plt.show()
-    #######################
-
     def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
-
+        """
+        compute feature of trading volumes
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'smooth_Volume'
         self.df[feature_name] = np.log(self.df['Volume'])
         # self.df[feature_name] = self.df['log_Volume'].rolling(window).mean()
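
The surviving volume method keeps only the log transform before band computation. Log-compressing volume is what keeps z-band thresholds comparable across regimes; a toy illustration (values invented):

    import numpy as np
    import pandas as pd

    volume = pd.Series([1_000_000, 1_200_000, 15_000_000, 900_000])
    log_volume = np.log(volume)
    # the spike still stands out, but on a scale-free axis
    z = (log_volume - log_volume.mean()) / log_volume.std()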
@@ -1039,7 +1178,7 @@ class stock_eda_panel(object):

             fig, axs = plt.subplots(1,2,figsize=(10,4))

-            axs[0].plot(self.df[f'{feature_name}'])
+            axs[0].plot(self.df[f'{feature_name}'])
             axs[0].set_title(f'{feature_name}')

             axs[1].plot(self.df[f'z_{feature_name}'], linestyle='--')
@@ -1048,9 +1187,23 @@ class stock_eda_panel(object):
             plt.show()

     def roc_feature(self, window, threshold, plot = False, save_features = False):
+        """
+        perform price rate of change
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'ROC'
         roc = ROCIndicator(close = self.df['Close'], window = window).roc()
-        self.df[feature_name] = roc
+        self.df[feature_name] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
@@ -1058,11 +1211,27 @@ class stock_eda_panel(object):
         self.settings_roc_feature = {'window':window, 'threshold':threshold}
         if plot:
             self.signal_plotter(feature_name)
-
+
     def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
+        """
+        perform stochastic oscillator RSI feature
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        smooth1 (int): smoothing parameter 1
+        smooth2 (int): smoothing parameter 2
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'STOCH'
         stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
-        self.df[feature_name] = stoch
+        self.df[feature_name] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
@@ -1072,9 +1241,24 @@ class stock_eda_panel(object):
             self.signal_plotter(feature_name)

     def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
+        """
+        perform stochastic oscillator feature
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        smooth (int): smoothing parameter
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'STOCHOSC'
         stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
-        self.df[feature_name] = stochast
+        self.df[feature_name] = stochast.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
@@ -1084,9 +1268,23 @@ class stock_eda_panel(object):
             self.signal_plotter(feature_name)

     def william_feature(self, lbp, threshold, plot = False, save_features = False):
+        """
+        perform fast stochastic oscillator or Williams %R indicator
+
+        Parameters
+        ----------
+        lbp (int): look back parameter
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'WILL'
-        will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
-        self.df[feature_name] = will
+        will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
+        self.df[feature_name] = will.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
@@ -1096,9 +1294,23 @@ class stock_eda_panel(object):
             self.signal_plotter(feature_name)

     def vortex_feature(self, window, threshold, plot = False, save_features = False):
+        """
+        perform vortex oscillator
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thresholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'VORTEX'
         vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
-        self.df[feature_name] = vortex
+        self.df[feature_name] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
         self.compute_clip_bands(feature_name,threshold)

         if save_features:
@@ -1107,11 +1319,93 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)

-    def
+    def minmax_pricefeature(self, type_func, window, distance = False, plot = False, save_features = False):
+        """
+        perform relative price/distance with respect to the min/max price in a given time scope
+
+        Parameters
+        ----------
+        type_func (str): either min or max
+        window (int): window scope
+        distance (boolean): if true, get distance feature else relative feature
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+        if type_func == 'min':
+            self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].min(axis = 1)
+        elif type_func == 'max':
+            self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].max(axis = 1)
+
+        init_shape = self.df.shape[0]
+        df_date = self.df[['Date','Price_ref']].rename(columns = {'Date':'Date_ref'}).copy()
+
+        self.df = self.df.rename(columns = {'Price_ref':'Price_to_use'})
+
+        if type_func == 'min':
+            self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).min()))
+        elif type_func == 'max':
+            self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).max()))
+
+
+        self.df = self.df.merge(df_date, left_on = 'window_price', right_on = 'Price_ref', how = 'left')
+        self.df['date_span'] = self.df['Date'] - self.df['Date_ref']
+
+        self.df['RN'] = self.df.sort_values(['date_span'], ascending=False).groupby(['Date']).cumcount() + 1
+        self.df = self.df[self.df['RN'] == 1]
+
+        if distance:
+            self.df[f'{type_func}_distance_to_price'] = pd.to_numeric(self.df['date_span'].dt.days, downcast='integer')
+
+        if not distance:
+            if type_func == 'min':
+                self.df[f'{type_func}_relprice'] = self.df['Price_to_use']/self.df['window_price']-1
+
+            if type_func == 'max':
+                self.df[f'{type_func}_relprice'] = self.df['window_price']/self.df['Price_to_use']-1
+
+        self.df = self.df.drop(columns = ['RN', 'date_span', 'Price_to_use', 'window_price', 'Date_ref','Price_ref'])
+
+        end_shape = self.df.shape[0]
+
+        if init_shape != end_shape:
+            raise Exception("shapes are not the same")
+
+        if save_features:
+            if distance:
+                self.features.append(f'{type_func}_distance_to_price')
+                name_attr = f'{type_func}_distance'
+            if not distance:
+                self.features.append(f'{type_func}_relprice')
+                name_attr = f'{type_func}_relprice'
+
+            setattr(self,f'settings_{name_attr}_pricefeature' , {'type_func': type_func, 'window': window, 'distance': distance})
+
+    def pair_index_feature(self, pair_symbol, feature_label,threshold, window = None,ta_method='ROC',param_set=False,plot = False, save_features = False):
+        """
+        perform additional asset ROC feature, then a new feature is created in the main dataframe
+
+        Parameters
+        ----------
+        pair_symbol (str): symbol of the asset to extract the data
+        feature_label (str): name of the resulting feature
+        window (int): window to apply to the feature as default (this parameter is going to be deprecated)
+        threshold (float): alpha or z thresholds for the normalized feature
+        param_set (dict): parameter set in case ta_method is other than ROC
+        ta_method (str): method to use, available RSI, ROC, VORTEX, STOCH
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         self.pair_index = pair_symbol
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
         if feature_label in self.df.columns:
             self.df = self.df.drop(columns = [feature_label])

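
The new minmax_pricefeature derives two feature families: the relative gap to the rolling extreme, and the age in days of that extreme. The date-merge bookkeeping above exists only to recover when the extreme occurred; the relative-price part alone reduces to a rolling min/max, sketched here on invented data:

    import pandas as pd

    close = pd.Series([10.0, 11.0, 9.5, 12.0, 11.5])
    window_min = close.rolling(5, min_periods=1).min()

    # relative distance above the rolling minimum, as in the 'min' branch above
    min_relprice = close / window_min - 1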
@@ -1123,14 +1417,27 @@ class stock_eda_panel(object):
         df['Date'] = pd.to_datetime(df['Date'])
         df = df[df.Date >= begin_date_str ]
         self.pair_index_df = df
-
+
         #### converting the same index ####
         dates_vector = self.df.Date.to_frame()
         self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
         self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
         self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
-
-
+
+        if ta_method == 'ROC':
+            window = window if window else param_set.get('window')
+            roc = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
+            self.pair_index_df[feature_label] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
+        elif ta_method == 'RSI':
+            rsi = RSIIndicator(close = self.pair_index_df['Close'], **param_set).rsi()
+            self.pair_index_df[feature_label] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
+        elif ta_method == 'VORTEX':
+            vortex = VortexIndicator(close = self.pair_index_df['Close'], high = self.pair_index_df['High'], low = self.pair_index_df['Low'], **param_set).vortex_indicator_diff()
+            self.pair_index_df[feature_label] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
+        elif ta_method == 'STOCH':
+            stoch = StochRSIIndicator(close = self.pair_index_df['Close'], **param_set).stochrsi()
+            self.pair_index_df[feature_label] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
+
         df_to_merge = self.pair_index_df[['Date',feature_label]]
         self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')

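
With the new ta_method/param_set dispatch, the companion-asset feature is no longer hard-wired to ROC. A hypothetical call (symbol and parameter values invented; assumes `panel` is a stock_eda_panel instance with data already loaded) would look like:

    # hypothetical usage of the dispatch added above
    panel.pair_index_feature(
        pair_symbol="SPY",
        feature_label="spy_rsi",
        threshold=1.5,
        ta_method="RSI",
        param_set={"window": 14},   # forwarded as **param_set to RSIIndicator
        save_features=True,
    )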
@@ -1140,7 +1447,7 @@ class stock_eda_panel(object):
         if save_features:
             self.log_features_standard(feature_label)
             parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
-            try:
+            try:
                 len(self.settings_pair_index_feature)
                 print('existing')
                 self.settings_pair_index_feature.append(parameters)
@@ -1153,10 +1460,21 @@ class stock_eda_panel(object):
             self.signal_plotter(feature_label)

     def produce_order_features(self, feature_name, save_features = False):
+        """
+        perform a feature that captures high and low values in an index. this is useful to know duration/persistence of a signal

+        Parameters
+        ----------
+        feature_name (str): name of the feature
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         signal_feature_name = f'discrete_signal_{feature_name}'
         order_feature_name = f'order_signal_{feature_name}'
-
+
         self.df[signal_feature_name] = np.where(
             self.df[f'signal_up_{feature_name}'] == 1,1,
             np.where(
@@ -1173,14 +1491,107 @@ class stock_eda_panel(object):
         self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
         self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
         self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
-
+
         ## saving features
         if save_features:
             self.signals.append(signal_feature_name)
             self.signals.append(order_feature_name)
-
+
+    def get_order_feature_nosignal(self,feature_name, save_features=False):
+        """
+        perform a feature that captures number of steps after the end of a signal
+
+        Parameters
+        ----------
+        feature_name (str): name of the feature
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+        order_feature_name = f'order_signal_{feature_name}'
+        ns_order_feature_name = f'ns_order_{feature_name}'
+        self.df = self.df.sort_values('Date')
+        self.df['lag_'] = self.df[order_feature_name].shift(1)
+        self.df['flag'] = np.where((self.df[order_feature_name] == 0) & (self.df['lag_']!=0),1,np.nan)
+        self.df = self.df.drop(columns=['lag_'])
+        self.df['order_'] = self.df.sort_values('Date').groupby(['flag']).cumcount() + 1
+        self.df['order_'] = self.df['order_'].fillna(method='ffill')
+        self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
+        self.df = self.df.drop(columns=['flag'])
+        self.df['order_'] = self.df.sort_values('Date').groupby(['order_']).cumcount() + 1
+        norm_list = [f'norm_{feature_name}', f'z_{feature_name}', feature_name]
+        for norm_feature in norm_list:
+            try:
+                self.df['order_'] = np.sign(self.df[norm_feature])*self.df['order_']
+                break
+            except:
+                pass
+        self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
+        self.df = self.df.rename(columns={'order_':ns_order_feature_name})
+        if save_features:
+            self.signals.append(ns_order_feature_name)
+
+    def compute_last_signal(self,feature, save_features = False):
+        """
+        perform two new features when signal is observed, one for the last duration of the previous chain, second for the last duration of the same sign signal
+
+        Parameters
+        ----------
+        feature_name (str): name of the feature
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+        def create_last_signal(df, feature, prefix, type ='0'):
+            if type == '0':
+                condition = df[f'order_signal_{feature}'] != 0
+            elif type == '+':
+                condition = df[f'order_signal_{feature}'] > 0
+            elif type == '-':
+                condition = df[f'order_signal_{feature}'] < 0
+            df[f'last_maxorder_{feature}'] = np.where(condition, df[f'order_signal_{feature}'],np.nan)
+            df['tmp_chain_index'] = df[f'last_maxorder_{feature}'].shift(-1)
+            df['last'] = np.where((df[f'last_maxorder_{feature}'] != 0) & (df['tmp_chain_index'].isna()),df[f'last_maxorder_{feature}'], np.nan )
+            df['last'] = df['last'].shift(1)
+            df[f'last_maxorder_{feature}'] = df['last'].fillna(method = 'ffill')
+            df = df.drop(columns = ['tmp_chain_index','last'])
+            df[f'last_maxorder_{feature}'] = np.where(df[f'order_signal_{feature}'] != 0,df[f'last_maxorder_{feature}'],np.nan)
+            df[f'last_maxorder_{feature}'] = df[f'last_maxorder_{feature}'].fillna(0)
+            df = df.rename(columns = {f'last_maxorder_{feature}':f'{prefix}_{feature}'})
+            return df
+        prefix0, prefix1, prefix2 = 'ldur', 'pos', 'neg'
+        self.df = create_last_signal(self.df, feature, prefix0, type ='0')
+        self.df = create_last_signal(self.df, feature, prefix1, type ='+')
+        self.df = create_last_signal(self.df, feature, prefix2, type ='-')
+
+        self.df[f'sldur_{feature}'] = np.where(
+            self.df[f'order_signal_{feature}'] > 0, self.df[f'{prefix1}_{feature}'],
+            np.where(
+                self.df[f'order_signal_{feature}'] < 0, self.df[f'{prefix2}_{feature}'],
+                0
+            )
+        )
+        self.df = self.df.drop(columns = [f'{prefix1}_{feature}',f'{prefix2}_{feature}'])
+        if save_features:
+            self.signals.append(f'sldur_{feature}')
+            self.signals.append(f'ldur_{feature}')
+
     def create_hmm_derived_features(self, lag_returns):
+        """
+        create features derived from hmm states features. Features are the index of the state, the duration of the state, chain return
+
+        Parameters
+        ----------
+        lag_returns (int): lag parameter (not used)

+        Returns
+        -------
+        None
+        """
         self.df = self.df.sort_values('Date')
         ## indexing chains
         self.df['lag_hmm_feature'] = self.df['hmm_feature'].shift(1)
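
Both the order features and the hmm chain features rely on the same run-length trick: flag where the value changes, mint an id per consecutive run, then rank within each run. The diff mints ids with rank plus forward-fill; cumulative sum over the break flags is the compact equivalent, sketched here:

    import numpy as np
    import pandas as pd

    s = pd.Series([0, 0, 1, 1, 1, 0, 2, 2])
    brk = (s != s.shift(1)).astype(int)                   # 1 where a new run starts
    chain_id = brk.cumsum()                               # unique id per consecutive run
    order_in_chain = s.groupby(chain_id).cumcount() + 1   # 1, 2, ... inside each run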
@@ -1189,31 +1600,44 @@ class stock_eda_panel(object):
         self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
         self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
         self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
-
-        ### returns using the first element in a chain
-        self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
-        self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
-        self.df['chain_return'] = (self.df['Close']/self.df['first'] -1) * 100

-
+        ### returns using the window seeds
+        self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+        self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
+        self.df = self.df.drop(columns = ['breack'])

-    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
+    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
+        """
+        create or use a hmm model
+
+        Parameters
+        ----------
+        n_clusters (int): number of clusters or states to calculate
+        features_hmm (list): features to be considered in hmm model when training
+        test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
+        seed (int): seed for the model initialization
+        lag_returns_state (int): lags for returns of the state
+        corr_threshold (float): correlation threshold for initial feature selection
+        plot (boolean): True to display hmm states analysis
+        save_features (boolean): True to save features and configurations
+        model (obj): if provided, no model will be trained and the provided model will be used to get hmm features
+
+        Returns
+        -------
+        None
+        """
         if not model:
-
+
             df_new = self.df
-            pipeline_hmm = Pipeline([
-                ('selector', FeatureSelector(columns=features_hmm)),
-                ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-                ('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
-            ])
             data_train = df_new.iloc[:-test_data_size,:]
             data_test = df_new.iloc[-test_data_size:,:]

-
-
+            th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
+            th.train()
+            pipeline_hmm = th.hmm_model
             self.model_hmm = pipeline_hmm
             self.test_data_hmm = data_test
-
+
             ### first feature: the hidden state
             self.df['hmm_feature'] = self.model_hmm.predict(self.df)
             self.create_hmm_derived_features(lag_returns = lag_returns_state)
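
The inline Pipeline of selector, imputer, and GaussianHMM is gone; training is delegated to trainer_hmm from virgo_modules.src.hmm_utils, which also applies a correlation-based feature pre-filter. A sketch of the delegation as the hunk wires it (data_train and features_hmm are placeholders; argument names are copied from the diff, including the package's 'corr_thrshold' spelling):

    # sketch of the new training path, mirroring the hunk above
    from virgo_modules.src.hmm_utils import trainer_hmm

    th = trainer_hmm(data_train, features_hmm, n_clusters=4,
                     corr_thrshold=0.75, seed=42)  # kwarg spelled 'corr_thrshold' in the package
    th.train()
    pipeline_hmm = th.hmm_model                    # fitted pipeline, exposes .predict()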
@@ -1230,15 +1654,15 @@ class stock_eda_panel(object):
             hidden_states = pipeline_hmm.predict(data_test)
             data_test['HMM'] = hidden_states
             data_test['HMM_state'] = data_test['HMM'].map(map_)
-
+
         if model:
             self.df['hmm_feature'] = model.predict(self.df)
             self.create_hmm_derived_features(lag_returns = lag_returns_state)
-
+
         if save_features:
             self.features.append('hmm_feature')
             self.features.append('hmm_chain_order')
-            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
+            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }

         if plot:

@@ -1263,14 +1687,38 @@ class stock_eda_panel(object):
             fig.show()

     def sharpe_ratio(self, return_series, n_trad_days = 255, rf = 0.01):
+        """
+        perform sharpe ratio of a given time series return
+
+        Parameters
+        ----------
+        return_series (pd.Series): time series of the returns
+        n_trad_days (int): trading days to annualize returns
+        rf (float): annual risk-free rate
+
+        Returns
+        -------
+        sharpe_ratio (float): sharpe ratio
+        """
         nsqrt = np.sqrt(n_trad_days)
         mean = return_series.mean() * n_trad_days
         sigma = return_series.std() * nsqrt
         sharpe_ratio = round((mean-rf)/sigma,2)
         return sharpe_ratio
-
+
     def treat_signal_strategy(self,test_data, strategy):
-
+        """
+        helper method that treats signals and converts signals to 1 or 0
+
+        Parameters
+        ----------
+        test_data (pd.DataFrame): test data
+        strategy (list): features to get the strategy
+
+        Returns
+        -------
+        test_data (pd.DataFrame): test data with extra columns that are the strategy (main_signal)
+        """
         hmm_states_list = [x for x in strategy if 'hmm_state_' in x]
         other_features = [x for x in strategy if x not in hmm_states_list]

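
The docstring makes the annualization explicit: the mean daily return is scaled by n_trad_days and the volatility by its square root, so the ratio is (mu * N - rf) / (sigma * sqrt(N)). A quick numeric check with invented daily return statistics:

    import numpy as np

    daily_mu, daily_sigma = 0.0005, 0.01   # invented daily return stats
    n, rf = 255, 0.01

    sharpe = (daily_mu * n - rf) / (daily_sigma * np.sqrt(n))
    print(round(sharpe, 2))  # ~0.74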
@@ -1299,10 +1747,21 @@ class stock_eda_panel(object):
         elif len(hmm_states_list) == 0 and len(other_features) > 0:
             test_data['main_signal'] = np.where((test_data['features_signal'] == 1) & (test_data['hmm_signal'] == 0),1,0)

-        return test_data
+        return test_data

     def stategy_simulator(self, features, hmm_feature = True):
+        """
+        execute strategy and get some performance metrics like sharpe ratio and return. This method creates some new attributes

+        Parameters
+        ----------
+        features (list): list of features to be tested as strategies
+        hmm_feature (boolean): include hmm feature
+
+        Returns
+        -------
+        None
+        """
         columns_ = ['Date', 'Close','Open'] + features + ['HMM']
         states = list(self.df.hmm_feature.unique())
         states.sort()
@@ -1372,8 +1831,19 @@ class stock_eda_panel(object):
         self.strategy_log = df_returns_log
         self.best_strategy = df_returns_log.iloc[0,:].strategy
         self.top_10_strategy = list(df_returns_log.iloc[0:10,:].strategy.values)
-
+
     def viz_strategy(self, strategy):
+        """
+        display analysis plot of a given strategy
+
+        Parameters
+        ----------
+        strategy (list): list of features of the strategy
+
+        Returns
+        -------
+        None
+        """
         test_data = self.test_data_strategy

         test_data = self.treat_signal_strategy(test_data, strategy)
@@ -1406,62 +1876,26 @@ class stock_eda_panel(object):
         plt.legend()
         plt.show()

-    ### deprecated ############################
-    def create_strategy(self, favourable_states):
-
-        test_data = self.test_data_hmm
-        # add MA signal
-        test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
-        test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
-
-        # add hmm signal
-
-        test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
-
-        ## combined signals
-        test_data['main_signal'] = 0
-        test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
-        test_data['main_signal'] = test_data['main_signal'].shift(1)
-
-        ## benchmark return
-        test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
-        test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
-        test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
-
-        ## strategy return
-        # test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
-        test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
-        test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
-        test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
-        test_data.dropna(inplace = True)
-
-        bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
-        strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
-
-        bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
-        strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
-
-        print(f'returns benchmark {bench_rets}%')
-        print(f'returns strategy {strat_rets}%')
-        print('-----------------------------')
-        print(f'sharpe benchmark {bench_sharpe}')
-        print(f'sharpe strategy {strat_sharpe}')
-
-        fig = plt.figure(figsize = (10,4))
-        plt.plot(test_data['bench_prod_exp'])
-        plt.plot(test_data['strat_prod_exp'])
-        self.settings_hmm_states = {'favourable_states':favourable_states}
-    ################################################
-
     def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
-
+        """
+        display analysis plots of the hmm model
+
+        Parameters
+        ----------
+        test_data_size (int): test data size, the remaining is the train data
+        split (str): options (train or test). Split type to assess
+
+        Returns
+        -------
+        None
+        """
         if split == 'train':
             df = self.df.iloc[:-test_data_size,:]
         elif split == 'test':
             df = self.df.iloc[-test_data_size:,:]

         ## returns plot
-        fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
+        fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
             height=400, width=1000, title = 'returns chain hmm feature')
         fig.add_shape(type='line',x0=-0.5,y0=0,x1=max(df.hmm_feature)+0.5,y1=0,line=dict(color='grey',width=1),xref='x',yref='y')
         fig.show()
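
The removed create_strategy used the standard log-return bookkeeping that the surviving simulator keeps: summing log returns and exponentiating recovers the compounded simple return. A two-line check on invented prices:

    import numpy as np
    import pandas as pd

    close = pd.Series([100.0, 103.0, 101.0, 106.0])
    lrets = np.log(close / close.shift(1))
    cum_simple = np.exp(lrets.cumsum()) - 1   # equals close / close[0] - 1
    assert abs(cum_simple.iloc[-1] - (106.0 / 100.0 - 1)) < 1e-12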
@@ -1490,6 +1924,17 @@ class stock_eda_panel(object):
             del df

     def get_targets(self, steps):
+        """
+        produce regression target return taking future prices
+
+        Parameters
+        ----------
+        steps (int): number of lags and steps for future returns
+
+        Returns
+        -------
+        None
+        """
         self.targets = list()
         self.target = list()
         columns = list()
@@ -1501,9 +1946,23 @@ class stock_eda_panel(object):
         self.df[f'mean_target'] = self.df[columns].mean(axis=1)
         self.target.append(f'mean_target')
         self.settings_target_lasts = {'steps':steps, 'type':'regression'}
-
-    def get_categorical_targets(self, horizon, flor_loss, top_gain):
-
+
+    def get_categorical_targets(self, horizon, flor_loss, top_gain, min_pos=1 , min_negs=1):
+        """
+        produce binary target return taking future prices. it produces two targets, one for high returns and another for low returns
+
+        Parameters
+        ----------
+        horizon (int): number of lags and steps for future returns
+        flor_loss (float): min loss return
+        top_gain (float): max gain return
+        min_pos (int): minimum number of positives to count in a window for target_up
+        min_negs (int): minimum number of negatives to count in a window for target_down
+
+        Returns
+        -------
+        None
+        """
         self.target = list()
         self.targets = list()
         columns = list()
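
The new min_pos/min_negs parameters generalize the old hard-coded cutoff: target_up fires only if at least min_pos of the horizon future steps clear top_gain. A sketch of that counting logic on a toy window (thresholds invented):

    import numpy as np
    import pandas as pd

    close = pd.Series([100.0, 101.0, 104.0, 99.0, 105.0, 103.0])
    horizon, top_gain, min_pos = 3, 0.02, 2

    # one indicator column per future step, then count hits per row
    hits = pd.concat(
        [(close.shift(-i) / close - 1 >= top_gain).astype(int) for i in range(1, horizon + 1)],
        axis=1,
    ).sum(axis=1)
    target_up = np.where(hits >= min_pos, 1, 0)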
@@ -1516,7 +1975,7 @@ class stock_eda_panel(object):
             self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] >= top_gain,1,0)
             columns.append(f'target_{i}')
         self.df[f'target_up'] = self.df[columns].sum(axis=1)
-        self.df[f'target_up'] = np.where(self.df[f'target_up'] >=
+        self.df[f'target_up'] = np.where(self.df[f'target_up'] >=min_pos,1,0 )
         self.df = self.df.drop(columns = columns)

         for i in range(1,horizon+1):
@@ -1526,7 +1985,7 @@ class stock_eda_panel(object):
             self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] <= flor_loss,1,0)
             columns.append(f'target_{i}')
         self.df[f'target_down'] = self.df[columns].sum(axis=1)
-        self.df[f'target_down'] = np.where(self.df[f'target_down'] >=
+        self.df[f'target_down'] = np.where(self.df[f'target_down'] >= min_negs,1,0 )
         self.df = self.df.drop(columns = columns)

         self.targets.append('target_up')
@@ -1535,7 +1994,19 @@ class stock_eda_panel(object):
         self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}

     def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
-
+        """
+        produce the configuration dictionary with the settings saved by the feature generation methods when save_features was activated
+
+        Parameters
+        ----------
+        test_data_size (int): test data size
+        val_data_size (int): validation data size
+        model_type (str): model type, options: 'Forecaster','Classifier'
+
+        Returns
+        -------
+        None
+        """
         self.settings = {
             'features':list(set(self.features)),
             'signals' :list(set(self.signals)),
@@ -1547,19 +2018,21 @@ class stock_eda_panel(object):
                 'outlier': self.settings_outlier,
             }
         }
-
+
         if model_type in ['Forecaster','Classifier']:
-
+
             target_list = list(set(self.targets))
             target_list.sort()
             self.settings['model_type'] = model_type
             self.settings['target'] = list(set(self.target))
             self.settings['targets'] = target_list
-
+
         ## for now this is hard coded
         feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
                 'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
-                'william_feature', 'vortex_feature', 'pair_index_feature','hmm'
+                'william_feature', 'vortex_feature', 'pair_index_feature','hmm',
+                'min_distance_pricefeature', 'min_relprice_pricefeature', 'max_distance_pricefeature','max_relprice_pricefeature'
+                ]

         for feature in feature_list:
             try:
@@ -1570,7 +2043,7 @@ class stock_eda_panel(object):
                 self.settings['settings']['target_lasts'] = self.settings_target_lasts
             except:
                 pass
-
+
         try:
             self.settings['settings']['strategies'] = {
                 'best_strategy':self.best_strategy,
@@ -1580,512 +2053,280 @@ class stock_eda_panel(object):
|
|
|
1580
2053
|
pass
|
|
1581
2054
|
|
|
1582
2055
|
class produce_model:
|
|
2056
|
+
"""
|
|
2057
|
+
Class that produces a machine learning model in a scikit-learn pipeline wrapper.
|
|
2058
|
+
|
|
2059
|
+
Attributes
|
|
2060
|
+
----------
|
|
2061
|
+
data : pd.DataFrame
|
|
2062
|
+
symbol of the asset
|
|
2063
|
+
X_train : pd.DataFrame
|
|
2064
|
+
y_train : pd.Series
|
|
2065
|
+
X_test : pd.DataFrame
|
|
2066
|
+
y_test : pd.Series
|
|
2067
|
+
X_val : pd.DataFrame
|
|
2068
|
+
y_val : pd.Series
|
|
2069
|
+
pipeline : obj
|
|
2070
|
+
trained pipeline that includes a ml model
|
|
2071
|
+
features_to_model: list
|
|
2072
|
+
features in end step of the pipeline
|
|
2073
|
+
|
|
2074
|
+
Methods
|
|
2075
|
+
-------
|
|
2076
|
+
preprocess(test_data_size=int, target=str, val_data_size=int):
|
|
2077
|
+
prepare data, split train, test, validation data and X and Y
|
|
2078
|
+
get_sample(x=pd.DataFrame, sample=int, max_=int):
|
|
2079
|
+
sample data
|
|
2080
|
+
"""
|
|
1583
2081
|
def __init__(self,data):
|
|
2082
|
+
"""
|
|
2083
|
+
Initialize object
|
|
2084
|
+
|
|
2085
|
+
Parameters
|
|
2086
|
+
----------
|
|
2087
|
+
data (pd.DataFrame): data
|
|
2088
|
+
|
|
2089
|
+
Returns
|
|
2090
|
+
-------
|
|
2091
|
+
None
|
|
2092
|
+
"""
|
|
1584
2093
|
self.data = data.copy()
|
|
1585
|
-
|
|
2094
|
+
|
|
1586
2095
|
def preprocess(self, test_data_size, target, val_data_size = False):
|
|
1587
|
-
|
|
2096
|
+
"""
|
|
2097
|
+
prepare data, split train, test, validation data and X and Y
|
|
2098
|
+
|
|
2099
|
+
Parameters
|
|
2100
|
+
----------
|
|
2101
|
+
test_data_size (int): test data size
|
|
2102
|
+
target (str): target column
|
|
2103
|
+
val_data_size (int): validation data size
|
|
2104
|
+
|
|
2105
|
+
Returns
|
|
2106
|
+
-------
|
|
2107
|
+
None
|
|
2108
|
+
"""
|
|
1588
2109
|
train_data, test_data = self.data.iloc[:-test_data_size,:].dropna() , self.data.iloc[-test_data_size:,:].dropna()
|
|
1589
|
-
|
|
2110
|
+
|
|
1590
2111
|
if val_data_size:
|
|
1591
2112
|
train_data, val_data = train_data.iloc[:-val_data_size,:], train_data.iloc[-val_data_size:,:]
|
|
1592
|
-
|
|
2113
|
+
|
|
1593
2114
|
self.test_data = test_data
|
|
1594
|
-
|
|
2115
|
+
|
|
1595
2116
|
X_train, y_train = train_data.iloc[0:,1:], train_data[target]
|
|
1596
2117
|
X_test, y_test = test_data.iloc[0:,1:], test_data[target]
|
|
1597
2118
|
self.X_train = X_train
|
|
1598
2119
|
self.y_train = y_train
|
|
1599
2120
|
self.X_test = X_test
|
|
1600
2121
|
self.y_test = y_test
|
|
1601
|
-
|
|
2122
|
+
|
|
1602
2123
|
if val_data_size:
|
|
1603
2124
|
X_val, y_val = val_data.iloc[0:,1:], val_data[target]
|
|
1604
2125
|
self.X_val = X_val
|
|
1605
2126
|
self.y_val = y_val
|
|
1606
|
-
|
|
2127
|
+
|
|
1607
2128
|
def get_sample(self, x, sample, max_=900):
|
|
2129
|
+
"""
|
|
2130
|
+
sample data
|
|
2131
|
+
|
|
2132
|
+
Parameters
|
|
2133
|
+
----------
|
|
2134
|
+
x (pd.DataFrame): input data
|
|
2135
|
+
sample (int): sample size
|
|
2136
|
+
max_ (int): max sample
|
|
2137
|
+
|
|
2138
|
+
Returns
|
|
2139
|
+
-------
|
|
2140
|
+
sample (float): sample size
|
|
2141
|
+
"""
|
|
1608
2142
|
length = len(x)
|
|
1609
2143
|
if length > max_:
|
|
1610
2144
|
return 1.0
|
|
1611
2145
|
else:
|
|
1612
2146
|
return sample
|
|
1613
|
-
|
|
2147
|
+
|
|
1614
2148
|
def train_model(self, pipe, model, cv_ = False):
|
|
2149
|
+
"""
|
|
2150
|
+
train pipeline
|
|
2151
|
+
|
|
2152
|
+
Parameters
|
|
2153
|
+
----------
|
|
2154
|
+
pipe (obj): pipeline object
|
|
2155
|
+
model (obj): model object
|
|
2156
|
+
cv_ (obj): cross validation procedure
|
|
2157
|
+
|
|
2158
|
+
Returns
|
|
2159
|
+
-------
|
|
2160
|
+
sample (float): sample size
|
|
2161
|
+
"""
|
|
1615
2162
|
self.model = model
|
|
1616
2163
|
self.pipe_transform = pipe
|
|
1617
2164
|
self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
|
|
1618
|
-
self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
|
|
1619
2165
|
self.pipeline.fit(self.X_train, self.y_train)
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
class
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
## indexing chains
|
|
1665
|
-
self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
|
|
1666
|
-
self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
|
|
1667
|
-
self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
|
|
1668
|
-
self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
|
|
1669
|
-
self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
|
|
1670
|
-
self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1671
|
-
|
|
1672
|
-
### returns using the first element in a chain
|
|
1673
|
-
self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
|
|
1674
|
-
self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
|
|
1675
|
-
self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
|
|
1676
|
-
|
|
1677
|
-
self.data_train_ = self.data_train_.drop(columns = ['first'])
|
|
1678
|
-
|
|
1679
|
-
mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
|
|
1680
|
-
self.mean_relevance = mean_relevance
|
|
1681
|
-
|
|
1682
|
-
def execute_selector(self):
|
|
1683
|
-
|
|
1684
|
-
self.split_data()
|
|
1685
|
-
self.feature_list_generator()
|
|
1686
|
-
maxi = -1
|
|
1687
|
-
print(f'it is expected {len(self.feature_combinations)} combinations')
|
|
1688
|
-
feature_results = dict()
|
|
1689
|
-
|
|
1690
|
-
if self.limit_search:
|
|
1691
|
-
print(f' taking just {self.limit_search} combinations')
|
|
1692
|
-
maxi = self.limit_search
|
|
1693
|
-
|
|
1694
|
-
for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
|
|
1695
|
-
|
|
1696
|
-
feature_results[f'group_{i}'] = {
|
|
1697
|
-
'features':list(features_hmm),
|
|
1698
|
-
'relevances':list()
|
|
1699
|
-
}
|
|
1700
|
-
|
|
1701
|
-
for _ in range(self.n_trials):
|
|
1702
|
-
try:
|
|
1703
|
-
self.train_model(features_hmm)
|
|
1704
|
-
self.get_error()
|
|
1705
|
-
feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
|
|
1706
|
-
except:
|
|
1707
|
-
print('error')
|
|
1708
|
-
feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
|
|
1709
|
-
self.feature_results = feature_results
|
|
1710
|
-
self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
|
|
1711
|
-
|
|
1712
|
-
class signal_analyser_object:
|
|
1713
|
-
|
|
1714
|
-
def __init__(self, data,symbol_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
|
|
+        self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
+
+class analyse_index(stock_eda_panel):
+    """
+    class that analyses an asset against one or more market indexes through (robust) beta regression
+
+    Attributes
+    ----------
+    index_data : pd.DataFrame or str
+        index data, or the name of the index
+    indexes : list
+        list of indexes
+    asset : str
+        name of the asset
+    n_obs : int
+        number of rows to extract
+    lag : int
+        lag to apply
+    data_window : str
+        5y 10y 15y
+    show_plot : bool
+        If True, show plots
+    save_path : str
+        local path for saving e.g r'C:/path/to/the/file/'
+    save_aws : str
+        remote key in s3 bucket path e.g. 'path/to/file/'
+    aws_credentials : dict
+        dict with the aws credentials
+    merger_df : pd.DataFrame
+        dataframe with the index and asset data
+    states_result : dict
+        betas and correlation score results
+
+    Methods
+    -------
+    process_data():
+        using stock_eda_panel, get data and merge data
+    plot_betas(sample_size=int, offset=int, subsample_ts=int):
+        display beta analysis plot
+    get_betas(subsample_ts=int):
+        get general beta and last sample beta, correlation score is included too
+    """
+    def __init__(self, index_data, asset, n_obs, lag, data_window = '5y', show_plot = False, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
         """
-
-
-
-
-
-
-
+        Initialize object
+
+        Parameters
+        ----------
+        index_data (pd.DataFrame or str): index data dataframe or index string
+        asset (str): name of the asset
+        n_obs (int): number of rows to extract
+        lag (int): lag to apply
+        data_window (str): 5y 10y 15y
+        show_plot (bool): If True, show plots
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credentials (dict): dict with the aws credentials
+
+        Returns
+        -------
+        None
         """
-        self.data = data.copy()
-        self.ticket_name = symbol_name
-        self.show_plot = show_plot
-        self.save_path = save_path
-        self.save_aws = save_aws
-        self.aws_credentials = aws_credentials
-        self.return_fig = return_fig
-
-    def signal_analyser(self, test_size, feature_name, days_list, threshold = 0.05,verbose = False, signal_position = False):
-        data = self.data
-        self.feature_name = feature_name
-        up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
-        features_base = ['Date', up_signal, low_signal, 'Close']
-
-        df = data[features_base].sort_values('Date').iloc[0:-test_size,:]
-        returns_list = list()
-
-        for days in days_list:
-
-            feature_ = f'return_{days}d'
-            df[feature_] = (df['Close'].shift(-days)/df['Close']-1)*100
-            returns_list.append(feature_)
-
-        df['signal_type'] = np.where(
-            df[up_signal] == 1,
-            'up',
-            np.where(
-                df[low_signal] == 1,
-                'down',
-                None
-            )
-        )
-        df = df[~df.signal_type.isna()]
-        # df['Date'] = df.index
-        df['lag_Date'] = df['Date'].shift(1)
-        df['span'] = (pd.to_datetime(df['Date']) - pd.to_datetime(df['lag_Date'])).dt.days - 1
-        df['break'] = np.where(df['span'] > 3, 1, 0)
-        df['break'] = np.where(df['span'].isna(), 1, df['break'])

-        df['chain_id'] = df.sort_values(['Date']).groupby(['break']).cumcount() + 1
-        df['chain_id'] = np.where(df['break'] == 1, df['chain_id'], np.nan )
-        df['chain_id'] = df['chain_id'].fillna(method = 'ffill')
-
-        df['internal_rn'] = df.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
-        df['inv_internal_rn'] = df.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
-
-        df['first_in_chain'] = np.where(df['internal_rn'] == 1, True, False)
-        df['last_in_chain'] = np.where(df['inv_internal_rn'] == 1, True, False)
-
-        df = df.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
-        self.df_signal = df

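`signal_analyser` (and `create_backtest_signal` below) group consecutive signal days into chains: a `break` is flagged whenever more than three calendar days are skipped, and every row is then numbered inside its chain. A compact sketch of the same chaining idiom on toy dates (the removed code reaches its chain ids through a cumcount-plus-ffill instead of a plain cumulative sum):

    import pandas as pd

    # toy signal dates: two bursts separated by a long gap
    dates = pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03',
                            '2024-02-01', '2024-02-02'])
    df = pd.DataFrame({'Date': dates}).sort_values('Date')

    gap = df['Date'].diff().dt.days - 1                  # days skipped since previous signal
    df['break'] = (gap.isna() | (gap > 3)).astype(int)   # a new chain starts here
    df['chain_id'] = df['break'].cumsum()                # 1, 1, 1, 2, 2
    df['internal_rn'] = df.groupby('chain_id').cumcount() + 1
    df['last_in_chain'] = df.groupby('chain_id')['internal_rn'].transform('max') == df['internal_rn']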
-
-
-
-
-        validations = list()
-        if not signal_position: ### for now it is based on the last signal on a chain
-            df_melt = df[df.last_in_chain == True].melt(id_vars=['signal_type'], value_vars=returns_list, var_name='time', value_name='value')
-            df_melt = df_melt.dropna()
-
-            for evalx in returns_list:
-
-                sample1 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'up')].value.values
-                sample2 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'down')].value.values
-                pvalue = stats.ttest_ind(sample1, sample2).pvalue
-                median_down = np.median(sample2)
-                median_up = np.median(sample1)
-                validations.append(median_up < 0)
-                validations.append(median_down > 0)
-                p_scores.append(pvalue)
-                medians_down.append(median_down)
-            self.df_melt = df_melt
-        null_ho_eval = threshold > np.mean(p_scores)
-        mean_median_return = np.median(medians_down) ## end metric
-        median_signal_type_eval = validations.count(validations[0]) == len(validations)
-
-        if verbose:
-            print('number of signal up:',n_signals_up)
-            print('number of signal down:',n_signals_down)
-            print('reject ho: ', null_ho_eval)
-            print('mean median:', mean_median_return)
-            print('all validations: ', median_signal_type_eval)
-
-        # if median_signal_type_eval == True and null_ho_eval == True:
-        if null_ho_eval == True:
-            if verbose:
-                print('success evals')
-            self.mean_median_return = mean_median_return
+        if type(index_data) != str:
+            index_data['Date'] = pd.to_datetime(index_data['Date'])
+            self.index_data = index_data
+            self.indexes = [ x for x in list(index_data.columns) if x != 'Date']
         else:
-            self.
-
-        df2 = df.copy()
-        df2 = df2[df2.last_in_chain == True]
-
-
-        df2['lagdate'] = df2.Date.shift(1)
-        df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lagdate'])).dt.days
-
-        fig, axs = plt.subplots(1, 3, figsize = (15,5))
-
-        sns.boxplot(data=df2, y="span",ax = axs[0])
-        axs[0].set_title('span between last signals')
-        del df2
-        sns.boxplot(data=df[df.last_in_chain == True], y="internal_rn",ax = axs[1])
-        axs[1].set_title('signal duration distribution')
-        sns.boxplot(data=df_melt, x="time", y="value", hue="signal_type",ax = axs[2])
-        axs[2].axhline(y=0, color='grey', linestyle='--')
-        axs[2].set_title('signal type expected returns distribution at different time lapses')
-
-        if self.show_plot:
-            plt.show()
-
-        if self.save_path:
-            result_plot_name = f'signals_strategy_distribution_{feature_name}.png'
-            fig.savefig(self.save_path+result_plot_name)
-            # pickle.dump(axs, open(self.save_path+result_plot_name, 'wb'))
-
-        if self.save_path and self.save_aws:
-            # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name, input_path = self.save_path+result_plot_name)
-            upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
-        if not self.show_plot:
-            plt.close()
-
-        del df
-
-        if self.return_fig:
-            return fig
-
-    def create_backtest_signal(self,days_strategy, test_size, feature_name, high_exit = False, low_exit = False):
-        asset_1 = 'Close'
-        up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
-        df1 = self.data.iloc[-test_size:,:].copy()
-        df2 = df1.copy()
-        df2['signal_type'] = np.where(
-            df2[up_signal] == 1,
-            'up',
-            np.where(
-                df2[low_signal] == 1,
-                'down',
-                None
-            )
-        )
-        df2 = df2[~df2.signal_type.isna()]
-        # df2['Date_'] = df2.index
-        df2['lag_Date'] = df2['Date'].shift(1)
-        df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lag_Date'])).dt.days - 1
-        df2['break'] = np.where(df2['span'] > 3, 1, 0)
-        df2['break'] = np.where(df2['span'].isna(), 1, df2['break'])
-
-        df2['chain_id'] = df2.sort_values(['Date']).groupby(['break']).cumcount() + 1
-        df2['chain_id'] = np.where(df2['break'] == 1, df2['chain_id'], np.nan )
-        df2['chain_id'] = df2['chain_id'].fillna(method = 'ffill')
-
-        df2['internal_rn'] = df2.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
-        df2['inv_internal_rn'] = df2.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
-
-        df2['first_in_chain'] = np.where(df2['internal_rn'] == 1, True, False)
-        df2['last_in_chain'] = np.where(df2['inv_internal_rn'] == 1, True, False)
-
-        df2 = df2.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
-
-        df2 = df2[(df2.last_in_chain == True) & (df2.signal_type == 'down')][['last_in_chain']]
-        dft = df1.merge(df2,how = 'left',left_index=True, right_index=True )
-
-        dft['chain_id'] = dft.sort_values(['Date']).groupby(['last_in_chain']).cumcount() + 1
-        dft['chain_id'] = np.where(dft['last_in_chain'] == True, dft['chain_id'], np.nan )
-        dft['chain_id'] = dft['chain_id'].fillna(method = 'ffill')
-
-        dft['internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
-        dft['flag'] = np.where(dft['internal_rn'] < days_strategy, 1,0)
-
-        dft['lrets_bench'] = np.log(dft[asset_1]/dft[asset_1].shift(1))
-        dft['bench_prod'] = dft['lrets_bench'].cumsum()
-        dft['bench_prod_exp'] = np.exp(dft['bench_prod']) - 1
-
-        if high_exit and low_exit:
-            dft['open_strat'] = np.where(dft.last_in_chain == True, dft.Open, np.nan)
-            dft['open_strat'] = dft['open_strat'].fillna(method = 'ffill')
-            dft['open_strat'] = np.where(dft.flag == 1, dft.open_strat, np.nan)
-            dft['high_strat_ret'] = (dft['High']/dft['open_strat']-1)*100
-            dft['low_strat_ret'] = (dft['Low']/dft['open_strat']-1)*100
-            dft['high_exit'] = np.where(((dft['high_strat_ret'] >= high_exit) | (dft['internal_rn'] == days_strategy)), 1, np.nan)
-            dft['low_exit'] = np.where((dft['low_strat_ret'] <= low_exit), -1, np.nan)
-
-            dft["exit_type"] = dft[["high_exit", "low_exit"]].max(axis=1)
-            dft['exit_type'] = np.where(dft["exit_type"] == 1, 1, np.where(dft["exit_type"] == -1,-1,np.nan))
-            dft['exit'] = np.where(dft['exit_type'].isnull(), np.nan, 1)
-            dft['exit_order'] = dft.sort_values(['Date']).groupby(['chain_id','exit']).cumcount() + 1
-            dft['exit'] = np.where(dft['exit_order'] == 1, True, np.nan)
-            dft = dft.drop(columns = ['exit_order'])
-            ## if last signal is near
-            max_id = dft.chain_id.max()
-            dft['max_internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).internal_rn.transform('max')
-            dft['exit'] = np.where((dft.chain_id == max_id) & (dft.max_internal_rn < days_strategy) & (dft.max_internal_rn == dft.internal_rn), 1, dft['exit'])
-
-            dft['exit_step'] = np.where(dft.exit == 1, dft.internal_rn, np.nan)
-            dft['exit_step'] = dft.sort_values(['Date']).groupby(['chain_id']).exit_step.transform('max')
-
-            dft['flag'] = np.where(dft.internal_rn <= dft.exit_step, 1, 0)
-            dft = dft.drop(columns = ['open_strat', 'high_strat_ret', 'low_strat_ret','exit_step', 'exit','exit_type','high_exit','low_exit', 'max_internal_rn'])
-
-        dft['lrets_strat'] = np.log(dft[asset_1].shift(-1)/dft[asset_1]) * dft['flag']
-        dft['lrets_strat'] = np.where(dft['lrets_strat'].isna(),-0.0,dft['lrets_strat'])
-        dft['lrets_prod'] = dft['lrets_strat'].cumsum()
-        dft['strat_prod_exp'] = np.exp(dft['lrets_prod']) - 1
-
-        bench_rets = round(dft['bench_prod_exp'].values[-1]*100,1)
-        strat_rets = round(dft['strat_prod_exp'].values[-1]*100,1)
-
-        bench_sr = round(sharpe_ratio(dft.bench_prod_exp.dropna()),1)
-        strat_sr = round(sharpe_ratio(dft.strat_prod_exp.dropna()),1)
-
-        message1 = f'{bench_rets}%'
-        message2 = f'{strat_rets}%'
-
-        messages = {
-            'benchmark return:':message1,
-            'benchmark sharpe ratio:': bench_sr,
-            'strategy return:':message2,
-            'strategy sharpe ratio:': strat_sr,
-        }
-        if self.show_plot:
-            print('----------------------------')
-            print(messages)
-            print('----------------------------')
-
-        fig = plt.figure(1)
-        plt.plot(dft.bench_prod_exp.values, label = 'benchmark')
-        plt.scatter(range(len(dft)),np.where(dft[low_signal] == 1,dft.bench_prod_exp.values,np.nan),color = 'red', label = 'signal')
-        plt.plot(dft.strat_prod_exp.values, label = 'strategy')
-        plt.legend()
-        plt.title('strategy and cumulative returns based on signal strategy')
-        if self.show_plot:
-            plt.plot()
+            self.indexes = [index_data]

-
-        result_json_name = f'signals_strategy_return_{feature_name}.json'
-        result_plot_name = f'signals_strategy_return_{feature_name}.png'
-
-        plt.savefig(self.save_path+result_plot_name)
-        # pickle.dump(fig, open(self.save_path+result_plot_name, 'wb'))
-
-        with open(self.save_path+result_json_name, "w") as outfile:
-            json.dump(messages, outfile)
-
-        if self.save_path and self.save_aws:
-            # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
-            # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name,input_path = self.save_path+result_plot_name)
-
-            upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
-            upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
-
-        if not self.show_plot:
-            plt.close()
-
-        del df1,df2,dft
-
-        if self.return_fig:
-            return fig, messages
-
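The removed backtester books performance in log returns: the benchmark compounds every daily log return, while the strategy compounds only the days flagged as in-position, and both are mapped back to simple returns with exp(...) - 1. A minimal sketch of that bookkeeping (toy prices and a hypothetical `flag` holding mask, not the package's API):

    import numpy as np
    import pandas as pd

    prices = pd.Series([100.0, 101.0, 99.5, 102.0, 103.5])   # toy close prices
    flag = pd.Series([0, 1, 1, 1, 0])                        # 1 = position held that day

    lrets = np.log(prices / prices.shift(1))                 # daily log returns
    bench = np.exp(lrets.cumsum()) - 1                       # buy-and-hold cumulative return
    strat_lrets = (np.log(prices.shift(-1) / prices) * flag).fillna(0.0)
    strat = np.exp(strat_lrets.cumsum()) - 1                 # strategy cumulative return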
-def execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object, plot = False, backtest= False, exit_params = {}):
-
-    method(**configuration)
-    signal_assess = signal_analyser_object(object_stock.df,object_stock.stock_code,show_plot = plot)
-    signal_assess.signal_analyser(test_size = test_data_size, feature_name = feature_name, days_list = days_list, threshold = 1)
-
-    if backtest:
-        print('-----------------------back test ---------------------------')
-        signal_assess.create_backtest_signal(backtest, test_data_size, feature_name, **exit_params )
-
-    return signal_assess.mean_median_return
-
-def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to_test, method, object_stock, signal_analyser_object, plot = True):
-
-    results = list()
-    for key in arguments_to_test.keys():
-        configuration = arguments_to_test.get(key)
-        mean_median_return = execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object)
-        results.append(mean_median_return)
-
-    df_result = pd.DataFrame({'keys':arguments_to_test.keys(),'results':results})
-    if plot:
-        plt.plot(df_result['keys'], df_result['results'])
-        plt.scatter(df_result['keys'], df_result['results'])
-        plt.title('simulation between configurations')
-        plt.ylabel('median expected return')
-        plt.show()
-
-    best_result = df_result.sort_values('results',ascending = False)['keys'].values[0]
-    return best_result
-
-class analyse_index(stock_eda_panel):
-    def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
-
-        """
-        data: pandas df
-        index: str name of the index
-        asset: str name of the asset
-        n_obs: int
-        lag: int
-        data_window: str eg 5y 10y 15y
-        show_plot: bool
-        save_path: str local path for saving e.g r'C:/path/to/the/file/'
-        save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-        aws_credentials: dict
-        """
-
-        self.index = index
+        self.index_data = index_data
         self.asset = asset
         self.n_obs = n_obs
         self.data_window = data_window
         self.lag = lag
-
+
         self.show_plot = show_plot
+        self.return_fig = return_fig
         self.save_path = save_path
         self.save_aws = save_aws
-
-    def process_data(self):
-
-        index = stock_eda_panel(self.index, self.n_obs, self.data_window)
-        index.get_data()
-        index.df['shift'] = index.df.Close.shift(self.lag)
-        index.df['index_return'] = index.df.Close/index.df['shift'] - 1

-
+    def process_data(self):
+        """
+        using stock_eda_panel, get data and merge data
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        asset = stock_eda_panel(self.asset, self.n_obs, data_window=self.data_window)
         asset.get_data()
-
-        asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
+        df = asset.df[['Date','Close']]

-
-
-
-
-
-
-
-
-
+        if type(self.index_data) != str:
+            df_merge = df.merge(self.index_data, on = ['Date'], how = 'left').sort_values('Date')
+
+        else:
+            indx = stock_eda_panel(self.index_data, self.n_obs, data_window=self.data_window)
+            indx.get_data()
+            indx_df = indx.df[['Date','Close']].rename(columns = {'Close':self.index_data})
+            df_merge = df.merge(indx_df, on = ['Date'], how = 'left').sort_values('Date')
+
+        for colx in ['Close'] + self.indexes:
+            df_merge[f'{colx}_pct'] = df_merge[colx]/df_merge[colx].shift(self.lag) - 1
+
+        df_merge.dropna(inplace = True)
+        self.merger_df = df_merge.rename(columns = {'Close_pct': 'asset_return'})

-
-
-
+    def plot_betas(self,sample_size, offset, subsample_ts =False, index = False):
+        """
+        display beta analysis plot
+
+        Parameters
+        ----------
+        sample_size (int): number of days or window size to calculate beta
+        offset (int): overlap between windows
+        subsample_ts (int): subsample size of data
+
+        Returns
+        -------
+        None
+        """
+        if (type(self.index_data) == str) & (index != False):
+            raise Exception("No need of index argument")
+        else:
+            index = self.indexes[0]
+
+        index_pct = f'{index}_pct'
+        ### plotting analysis
         figure, ax = plt.subplot_mosaic(
             [["scatter_total", "scatter_sample",'ts','ts']],
             layout="constrained",
             figsize=(18, 5)
         )
-
-        ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df
-
+
+        ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df[index_pct])
+
+        huber_regr = HuberRegressor(fit_intercept = True)
+        huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values.reshape(-1,1))
+        b, a = huber_regr.coef_[0], huber_regr.intercept_
+
+        # b, a = np.polyfit(self.merger_df.asset_return, self.merger_df[index_pct], 1)
         ax['scatter_total'].plot(self.merger_df.asset_return, b*self.merger_df.asset_return+a, color='red')

         ax['ts'].plot(self.merger_df.Date, self.merger_df.Close, color = 'grey', alpha = 0.3)
-
+
         if subsample_ts:
             self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
-
+
         for i in range(0,len(self.merger_df)-sample_size,offset):

             merger_ = self.merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
-            x = merger_
+            x = merger_[index_pct]
             y = merger_.asset_return
-            b, a = np.polyfit(x,y, 1)
-
+            # b, a = np.polyfit(x,y, 1)
+            huber_regr = HuberRegressor(fit_intercept = True)
+            huber_regr.fit(x.values.reshape(-1,1), y.values.reshape(-1,1))
+            b, a = huber_regr.coef_[0], huber_regr.intercept_
+
             normalize = mcolors.Normalize(vmin=-1, vmax=1)
             colormap = cm.jet

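Both beta fits above use sklearn's `HuberRegressor` in place of the commented-out `np.polyfit`, so a handful of extreme days pulls the fitted slope far less. A minimal sketch of the difference on synthetic returns (illustrative data; shown in the conventional x = index, y = asset orientation, which is the reverse of the argument order used in `plot_betas`):

    import numpy as np
    from sklearn.linear_model import HuberRegressor

    rng = np.random.default_rng(0)
    x = rng.normal(0.0, 0.01, 500)              # index returns
    y = 1.2 * x + rng.normal(0.0, 0.002, 500)   # asset returns, true beta = 1.2
    y[:5] = 0.15                                # a few outlier days

    b_ols = np.polyfit(x, y, 1)[0]              # OLS slope, dragged by the outliers

    huber = HuberRegressor(fit_intercept=True)
    huber.fit(x.reshape(-1, 1), y)
    b_huber = huber.coef_[0]                    # robust slope, close to 1.2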
@@ -2098,12 +2339,13 @@ class analyse_index(stock_eda_panel):

             scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
             scalarmappaple.set_array(x)
-
-        plt.title(f'{self.asset} using index: {
+
+        plt.title(f'{self.asset} using index: {index}')
         plt.colorbar(scalarmappaple)
-
+
         if self.show_plot:
             plt.show()
+
         if self.save_path:
             result_plot_name = f'market_best_fit.png'
             figure.savefig(self.save_path+result_plot_name)
@@ -2111,80 +2353,50 @@ class analyse_index(stock_eda_panel):
         if self.save_path and self.save_aws:
             # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.asset}/'+result_plot_name,input_path = self.save_path+result_plot_name)
             upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
+
         if not self.show_plot:
-            plt.close()
-
+            plt.close()
+
+        if self.return_fig:
+            return figure
+
     def get_betas(self,subsample_ts=False):
-
-
-
-
-
-
-
-
-
-
-
-        result = {
-            'general_beta':general_beta,
-            'general_r':general_r,
-            'sample_beta':sample_beta,
-            'sample_r':sample_r
-        }
-
-        self.states_result = result
-
-class evaluate_markets(analyse_index):
-    def __init__(self, stock_code, indexes):
-        self.stock_code = stock_code
-        self.indexes = indexes
-    def evaluate_best_market_fit(self,sample_size, offset,lag= 3, n_obs = 3500, verbose = False, plot_best = False):
-
-        results_dicts = dict()
+        """
+        get general beta and last sample beta, correlation score is included too
+
+        Parameters
+        ----------
+        subsample_ts (int): subsample size of data
+
+        Returns
+        -------
+        None
+        """
+        result = list()
         for index in self.indexes:
-            betex = analyse_index(index = index,asset = self.stock_code,n_obs = n_obs, lag = lag)
-            betex.get_betas(sample_size)
-            results_dicts[index] = betex.states_result
-        pd_result = pd.DataFrame(results_dicts).T
-        pd_result['gen_r2'] = pd_result.general_r ** 2
-        pd_result['sampl_r2'] = pd_result.sample_r ** 2
-        self.stat_results = pd_result
-
-        best_result = pd_result.sort_values('gen_r2',ascending = False).head(2).sort_values('sampl_r2',ascending = False).head(1)
-        best_fit_index = best_result.index.values[0]
-
-        self.stat_results = self.stat_results.drop(columns = ['gen_r2','sampl_r2'])
-
-        if verbose:
-            print(best_result)
-        if plot_best:
-            betex = analyse_index(index = best_fit_index,asset = self.stock_code, n_obs = n_obs, lag = lag)
-            betex.plot_betas(sample_size = sample_size, offset = offset, subsample_ts = False)

-
-
-
-
-
-
-
-
-
-
-
-    all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
-    all_betas['gen_r2'] = all_betas.general_r ** 2
-    all_betas['sampl_r2'] = all_betas.sample_r ** 2
-    selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
+            index_pct = f'{index}_pct'
+            huber_regr = HuberRegressor(fit_intercept = True)
+            huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values.reshape(-1,1))
+            general_beta, a = huber_regr.coef_[0], huber_regr.intercept_
+            general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df[index])[0]
+
+            dict_res = {
+                'index':index,
+                'general_beta':general_beta,
+                'general_r':general_r,
+            }

-
-
-
-
-
-
-
-
-
-
+            if subsample_ts:
+                tmp_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
+                huber_regr = HuberRegressor(fit_intercept = True)
+                huber_regr.fit(tmp_df.asset_return.values.reshape(-1,1), tmp_df[index_pct].values.reshape(-1,1))
+                sample_beta, a = huber_regr.coef_[0], huber_regr.intercept_
+                sample_r = stats.mstats.pearsonr(tmp_df.asset_return, tmp_df[index])[0]
+                dict_res['sample_beta'] = sample_beta
+                dict_res['sample_r'] = sample_r
+
+            result.append(dict_res)
+
+        self.states_result = result
+
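The resulting workflow for the class is: build the object with an index (a ticker string or a pre-merged DataFrame of index closes), call `process_data()` to assemble `merger_df`, then `get_betas()` to populate `states_result`. A hedged usage sketch (tickers illustrative; module path assumed from this diff's context):

    import pandas as pd
    from virgo_modules.src.ticketer_source import analyse_index  # path assumed, not confirmed by this diff

    beta_check = analyse_index(index_data='^GSPC', asset='AAPL', n_obs=1500, lag=3)  # illustrative tickers
    beta_check.process_data()                 # builds beta_check.merger_df
    beta_check.get_betas(subsample_ts=250)    # populates beta_check.states_result

    # rank candidate indexes by squared correlation, mirroring the removed evaluate_markets helper
    ranked = pd.DataFrame(beta_check.states_result)
    ranked['gen_r2'] = ranked.general_r ** 2
    print(ranked.sort_values('gen_r2', ascending=False).head(1))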