virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virgo_modules/__init__.py +1 -0
- virgo_modules/src/aws_utils.py +35 -3
- virgo_modules/src/backtester.py +474 -0
- virgo_modules/src/edge_utils/__init__.py +0 -0
- virgo_modules/src/edge_utils/conformal_utils.py +106 -0
- virgo_modules/src/edge_utils/edge_utils.py +502 -0
- virgo_modules/src/edge_utils/feature_selection.py +66 -0
- virgo_modules/src/edge_utils/shap_utils.py +54 -0
- virgo_modules/src/edge_utils/stack_model.py +94 -0
- virgo_modules/src/hmm_utils.py +494 -0
- virgo_modules/src/market/__init__.py +0 -0
- virgo_modules/src/market/market_tools.py +189 -0
- virgo_modules/src/markowitz/__init__.py +0 -0
- virgo_modules/src/markowitz/markowitz_utils.py +44 -0
- virgo_modules/src/re_utils.py +628 -85
- virgo_modules/src/ticketer_source.py +1351 -1066
- virgo_modules/src/transformer_utils.py +401 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/METADATA +16 -22
- virgo_modules-0.9.0.dist-info/RECORD +24 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/WHEEL +1 -1
- virgo_modules/src/edge_utils.py +0 -178
- virgo_modules-0.0.72.dist-info/RECORD +0 -12
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info/licenses}/LICENSE +0 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 import yfinance as yf
 import pandas as pd
 import numpy as np
-import
+import gc

 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM

 from plotly.colors import DEFAULT_PLOTLY_COLORS

-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from feature_engine.imputation import MeanMedianImputer

@@ -48,88 +47,38 @@ from feature_engine.timeseries.forecasting import LagFeatures
 from feature_engine.imputation import MeanMedianImputer
 from feature_engine.discretisation import EqualWidthDiscretiser

+from sklearn.linear_model import HuberRegressor
+
 from .aws_utils import upload_file_to_aws

 import logging

-
-
-
-        self.prefix = prefix
+from virgo_modules.src.hmm_utils import trainer_hmm
+from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
+from virgo_modules.src.transformer_utils import FeaturesEntropy, VirgoWinsorizerFeature # imported bcs some models read this module otherwise it crashed mlflow.load()

-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.features:
-            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
-        return X
-
-class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
-    def __init__(self, feature_configs):
-        self.feature_configs = feature_configs
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.feature_configs:
-            lower = self.feature_configs[feature]['min']
-            upper = self.feature_configs[feature]['max']
-            X[feature] = np.where( lower > X[feature], lower, X[feature])
-            X[feature] = np.where( upper < X[feature], upper, X[feature])
-        return X
-
-class FeatureSelector(BaseEstimator, TransformerMixin):
-    def __init__(self, columns):
-        self.columns = columns
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X[self.columns]
-
-def sharpe_ratio(return_series):
-    N = 255 # Trading days in the year (change to 365 for crypto)
-    rf = 0.005 # Half a percent risk free rare
-    mean = return_series.mean() * N -rf
-    sigma = return_series.std() * np.sqrt(N)
-    sharpe = round(mean / sigma, 3)
-    return sharpe
-
-class signal_combiner(BaseEstimator, TransformerMixin):
-    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
-        self.columns = columns
-        self.drop = drop
-        self.prefix_up = prefix_up
-        self.prefix_low = prefix_low
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for column in self.columns:
-            X['CombSignal_'+column] = np.where(
-                X[self.prefix_up + column] == 1,
-                1,
-                np.where(
-                    X[self.prefix_low + column] == 1,
-                    1,
-                    0
-                )
-            )
-            if self.drop:
-                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
-        return X
-
 def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
-
+
+    '''
+    create a scikit learn pipeline object using different configurations and feature engineering blocks with a given flow
+
+    Parameters:
+    features_to_drop (list): list of features to drop
+    lag_dict (dict): feature dictionary with configurations to apply lags
+    combine_signals (list): list of columns/signals to combine
+    discretize_columns (list): list of features to discretize, bins is fixed
+    correlation (float): correaltion score threshold for feature selection
+
+    Returns:
+    pipe (obj): pipeline object
+    '''
+
     lag_pipe_sec = [(f'lags_{key}', LagFeatures(variables = key, periods = lag_dict[key])) for key in lag_dict] if lag_dict else []
     drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
     merge = [('signal_combiner', signal_combiner(combine_signals))] if combine_signals else []
     discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = 20 ))] if discretize_columns else []
     drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation))] if correlation else []
-
+
     pipe = Pipeline(
         [('selector', FeatureSelector(features_base))] + \
        [('encoding',OneHotEncoder(top_categories=None, variables=['hmm_feature']))] + \
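The transformers deleted here (`signal_combiner`, `FeatureSelector`, `VirgoWinsorizerFeature`) are not dropped: the new imports above re-export them from `virgo_modules.src.transformer_utils`, and the module-level `sharpe_ratio` helper moves into `stock_eda_panel` as a method with `n_trad_days` and `rf` parameters (see the class docstring below). For reference, a minimal runnable sketch of the annualized Sharpe formula the removed helper implemented; the sample return series is illustrative only:

```python
import numpy as np
import pandas as pd

def sharpe_ratio(return_series: pd.Series, n_trad_days: int = 255, rf: float = 0.005) -> float:
    # Annualized excess return over annualized volatility, as in the removed helper.
    mean = return_series.mean() * n_trad_days - rf
    sigma = return_series.std() * np.sqrt(n_trad_days)
    return round(mean / sigma, 3)

daily_returns = pd.Series(np.random.default_rng(0).normal(0.0005, 0.01, 500))
print(sharpe_ratio(daily_returns))
```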
@@ -142,64 +91,178 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
     )
     return pipe

-
-    ## legnths
-    cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
-    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
-    ## means
-    def quantile2(x):
-        return x.quantile(0.25)
-    def quantile3(x):
-        return x.quantile(0.75)
-
-    cluster_returns = data.groupby('hmm_feature').agg(
-        n_uniques = ('chain_id','nunique'),
-        n_obs = ('Date','count'),
-        cluster_ret_q25 = ('chain_return',quantile2),
-        cluster_ret_median = ('chain_return','median'),
-        cluster_ret_q75 = ('chain_return',quantile3),
-    )
-    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
-    cluster_returns['perc_dispute'] = np.where(
-        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
-        1,0
-    )
-    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
-    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
-    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
-    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
-    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
-    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
-    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
-    ## relevance or importance
-    # naive aproach
-    cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
-    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
-    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
-    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
-    mean_relevance = cluster_returns['abs_t_accpted'].mean()
-    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
-
-    return mean_relevance, cluster_returns, number_relevant_states
+class stock_eda_panel(object):

+    """
+    Class that initialy gets stock data then apply feature enginering, enrichment, analysis, plotting, model training etc.
+
+    Attributes
+    ----------
+    stock_code : str
+        symbol of the asset
+    n_days : str
+        number of days to extract data
+    data_window : str
+        large window to extract data. Large window is required o extract more data. e.g. '5y', '10y', '15'
+    df : pd.DataFrame
+        Pandas dataframe of the asset data with features
+    strategy_log: pd.DataFrame
+        Pandas dataframe that has the results of different tested strategies (result from strategy simulator hmm)
+    best_strategy: list
+        features of the best performing strategy (result from strategy simulator hmm)
+    top_10_strategy: dict
+        top 10 best performing strategies (result from strategy simulator hmm)
+    settings: dict
+        configuration dictionary of the features and other parameters
+
+    Methods
+    -------
+    augmented_dickey_fuller_statistics(time_series=pd.Series, label=str):
+        Perform dickey fuller or stationary test for a given time series
+        It will print p value of the features
+    get_data():
+        Get asset data performing some data normalization or formating (in case of dates)
+    plot_series_returns(roll_mean_lags1=int, roll_mean_lags2=int)
+        Display plot that time series with mean rolling windows and rolling standard deviations of daily closing prices
+    seasonal_plot():
+        Display time series split by year
+    plot_price_signal(feature=str, feature_2=str, opacity=float):
+        Display botton and roof signals over the closing prices
+    volatility_analysis(lags=int, trad_days=int, window_log_return=int, plot=boolean, save_features=boolean):
+        this method performs log return and volatilyty analysis of the closing prices
+    find_lag(feature=str, lag_list=list, column_target=str,posterior_lag=int, test_size=int):
+        displays correlation curves, using spearman and pearson correlation, of a given feature at different time lags with respecto to a given target
+    outlier_plot(zlim=float, plot=boolean, save_features=boolean):
+        perform outlier analysis of the log returns. It also permors normality test of returns
+    analysis_roll_mean_log_returns(lags=int, plot=boolean):
+        perform analysis of lags of the mean rolling log return
+    compute_clip_bands(feature_name=str,threshold=float):
+        compute outlier detection for a given signal, Note that this follows mean reversion procedure and feature has to be stationary. Also botton and roof resulting signals is attached to the dataframe
+    extract_sec_data(symbol=str, base_columns=list(str), rename_columns=dict):
+        extract new asset data and merge it to the main asset data
+    lag_log_return(lags=int, feature=str, feature_name=str):
+        compute log return given some lags
+    produce_log_volatility(trad_days=int, feature=str, feature_name=str):
+        compute volatility
+    signal_plotter(feature_name=str):
+        display analysis plot of a feature with high and low signals
+    log_features_standard(feature_name=str):
+        save resulting feature names in an standard structure
+    relative_spread_MA(ma1=int, ma2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative moving average features, one for short term and another for long/mid term
+    pair_feature(pair_symbol=str, plot=boolean):
+        initialize pair feature data extraction and analysis
+    calculate_cointegration(series_1=pd.series, series_2=pd.series):
+        calculate cointegration score for two time series
+    bidirect_count_feature(rolling_window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform negative and positive return counting in a given rolling time window
+    get_relative_range_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative spread of opening and closing price
+    rsi_feature_improved(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative strength index
+    days_features_bands(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute mean returns for a given day of the week in a window scope per day
+    analysis_smooth_volume(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute feature of thrading volumes
+    roc_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform price rate of change
+    stoch_feature(window=int, smooth1=int, smooth2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform stochastic oscilator RSI feature
+    stochastic_feature(window=int, smooth=int, threshold=float, plot=boolean, save_features=boolean):
+        perform stochastic oscilator feature
+    william_feature(lbp=int, threshold=float, plot=boolean, save_features=boolean):
+        perfom fast stochastic oscilator or william indicator
+    vortex_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform vortex oscilator
+    expected_return(trad_days:int, feature:str, feature_name:str):
+        perform expected log return based on inversed shift of historical data and applying
+    rolling_feature(feature: str, window:int, function:callable):
+        perform rolling (non expanding) window operation for a given feature
+    time_distance(feature_base:str,feature_window:str, result_feature_name:str, max_window:int):
+        perform distancce time to a given window feature
+    minmax_pricefeature(type_func=str, window=int, distance=bolean, save_features=boolean)
+        get relative price/ distance feature with respect to the min/max price in a given window
+    pair_index_feature(pair_symbol=str, feature_label=str, window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform additional asset ROC feature, then a new feature is created in the main dataframe
+    produce_order_features(feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index. this is usefull to know duration/persistence of a signal
+    compute_last_signal (feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index. this is usefull to know duration/persistence of a signal
+    create_hmm_derived_features():
+        create features derived from hmm states features. Features are the index of the state, the duration of the state, chain raturn
+    cluster_hmm_analysis(n_clusters=int,features_hmm=list, test_data_size=int, seed=int, lag_returns_state=int, plot=boolean, save_features=boolean, model=obj):
+        create or use a hmm model
+    sharpe_ratio(return_series=pd.Series, n_trad_days=int, rf=float):
+        perform sharpe ratio of a given time series return
+    treat_signal_strategy(test_data=pd.DataFrame, strategy=list):
+        helper method that treats signals and converts signals to 1 or 0
+    stategy_simulator(features=list, hmm_feature=boolean):
+        execute strategy and get some performance metrics like sharpe ratio, return
+    viz_strategy(strategy):
+        display analysis plot of a given strategy
+    deep_dive_analysis_hmm(test_data_size=int, split=str):
+        display analysis plot hmm model
+    get_targets(steps=int):
+        produce regression target return taking future prices
+    get_categorical_targets(horizon=int, flor_loss=float, top_gain=float):
+        produce binary target return taking future prices. it produce two targets, one for high returns and another for low returns
+    get_configurations(test_data_size=int, val_data_size=int, model_type=str):
+        produce configuration dictionary that were saved in the feature generation methods if save_features was activated
+    """

-class stock_eda_panel(object):
-
     def __init__(self, stock_code, n_days, data_window = '5y'):
+
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        stock_code (str): symbol of the asset
+        n_days (str): number of days to extract data
+        data_window (str): large window to extract data. Large window is required o extract more data. e.g. '5y', '10y', '15'
+
+        Returns
+        -------
+        None
+        """
+
         self.stock_code = stock_code
         self.n_days = n_days
         self.today = datetime.date.today()
         self.features = list()
         self.signals = list()
         self.data_window = data_window
-
+
     def augmented_dickey_fuller_statistics(self,time_series, label):
+        """
+        Perform dickey fuller or stationary test for a given time series
+        It will print p value of the features
+
+        Parameters
+        ----------
+        time_series (pd.Series): pandas series of the time series
+        label (pd.Series): feature name
+
+        Returns
+        -------
+        None
+        """
         result = adfuller(time_series.dropna().values)
         print('p-value: {} for the series {}'.format(round(result[1],6), label))
-
+
     def get_data(self):
+        """
+        Get asset data performing some data normalization or formating (in case of dates)
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')

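With the class surface now documented, a minimal usage sketch of the constructor and the two methods in this hunk; the import path follows the package's own `virgo_modules.src.*` imports above, while the symbol and window values are illustrative, not taken from the package:

```python
from virgo_modules.src.ticketer_source import stock_eda_panel

# 'AAPL' and 1500 are placeholder arguments for this sketch.
panel = stock_eda_panel(stock_code='AAPL', n_days=1500, data_window='10y')
panel.get_data()  # downloads OHLCV via yfinance and runs the cleaning shown below
panel.augmented_dickey_fuller_statistics(panel.df['Close'], 'Close')  # prints the ADF p-value
```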
@@ -210,7 +273,7 @@ class stock_eda_panel(object):
         df.reset_index(inplace=True)
         df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
         df['Date'] = pd.to_datetime(df['Date'])
-
+
         df = df[df.Date >= begin_date_str ]
         self.settings_general = {
             'n_days':self.n_days,
@@ -219,44 +282,56 @@ class stock_eda_panel(object):
             'execution_date': self.today.strftime('%Y-%m-%d')
         }
         self.df = df
-
+
         ### cleaning volume
         ### volume clearning
         self.df['Volume'] = np.where(self.df['Volume'] <= 10, np.nan, self.df['Volume'])
         self.df['Volume'] = self.df['Volume'].fillna(method='bfill')
-
+
         ## filling
-
+
         base_columns_unit_test = ['Open','High','Low','Close','Volume']
         self.df[base_columns_unit_test] = self.df[base_columns_unit_test].fillna(method='ffill')
-
+
         ## cleaning nulls
-
+
         xs = self.df[base_columns_unit_test].isnull().sum()/self.df[base_columns_unit_test].count()
         reject_columns = list(xs[xs > 0.5].index.values)
-
+
         if len(reject_columns) > 0:
             logging.warning("the following columns have many nulls and are drop: {}".format(reject_columns))
             self.df = self.df.drop(columns = reject_columns)
-
-
+
     def plot_series_returns(self,roll_mean_lags1,roll_mean_lags2):
-
+
+        """
+        Display plot that time series with mean rolling windows and rolling standard deviations of daily closing prices
+
+        Parameters
+        ----------
+        roll_mean_lags1 (int): short term window
+        roll_mean_lags2 (int): mid/long term window
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
         ### getting rolling mean
         df["Close_roll_mean"] = (
             df.sort_values("Date")["Close"]
            .transform(lambda x: x.rolling(roll_mean_lags1, min_periods=1).mean())
         )
-
+
         df["Close_roll_mean_2"] = (
             df.sort_values("Date")["Close"]
            .transform(lambda x: x.rolling(roll_mean_lags2, min_periods=1).mean())
         )
-
+
         ### getting rolling stdv
         df["Close_roll_std"] = (
            df.sort_values("Date")["Close"]
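A standalone sketch of `get_data`'s cleaning rules on a toy frame, mirroring the hunk above: volumes at or below 10 become NaN and are backfilled, OHLCV columns are forward-filled, and any column whose null ratio exceeds 0.5 is rejected. Toy values are illustrative:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Close':  [10.0, np.nan, 10.5, 10.7],
    'Volume': [5, 120, 0, 130],  # <= 10 treated as a bad print
})
df['Volume'] = np.where(df['Volume'] <= 10, np.nan, df['Volume'])
df['Volume'] = df['Volume'].bfill()  # fillna(method='bfill'), as in the diff, is deprecated in newer pandas
df[['Close', 'Volume']] = df[['Close', 'Volume']].ffill()

null_ratio = df.isnull().sum() / df.count()  # nulls per non-null count, as in the diff
reject_columns = list(null_ratio[null_ratio > 0.5].index)
print(df, reject_columns)
```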
@@ -273,7 +348,7 @@ class stock_eda_panel(object):
         ))

         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close, marker_color = 'blue', name='Price'),row=1, col=1)
-
+
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean, marker_color = 'black', name='roll mean' ),row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean_2, marker_color = 'grey', name='roll mean 2' ),row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.lower, marker_color = 'pink',legendgroup='bound', name='bound' ),row=1, col=1)
@@ -281,8 +356,21 @@ class stock_eda_panel(object):

         fig.update_layout(height=500, width=1200, title_text=f"stock {self.stock_code} vizualization")
         fig.show()
-
+
     def seasonal_plot(self):
+
+        """
+        Display time series split by year
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         years = list(df['Date'].dt.year.unique())
         years.sort()
@@ -302,10 +390,24 @@ class stock_eda_panel(object):

         fig.update_layout(height=500, width=1400, title_text=f"stock {self.stock_code} seasonal vizualization")
         fig.show()
-
+
     def plot_price_signal(self, feature, feature_2 = '', opacity = 0.3):
-
-
+
+        """
+        Display botton and roof signals over the closing prices
+
+        Parameters
+        ----------
+        feature (str): name of the main feature to plot
+        feature_2 (str): name of the alternative feature to plot
+        opacity (float): opacity degree of the signals points
+
+        Returns
+        -------
+        None
+        """
+
+        signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
         signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
         norm_list = [f'norm_{feature}', f'z_{feature}', feature]

@@ -315,14 +417,14 @@ class stock_eda_panel(object):
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[norm_feat],legendgroup="up", mode='lines',name = norm_feat, marker_color = 'blue'),col = 1, row = 1)
                 break
-
-
+
+
         fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df['Close'], mode='lines',name = 'history', marker_color = 'grey'),col = 1, row = 2)
-
+
         if feature == 'MA_spread':
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma1_column],legendgroup="ma", mode='lines',name = self.ma1_column, marker_color = 'black'),col = 1, row = 2)
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma2_column],legendgroup="ma", mode='lines',name = self.ma2_column, marker_color = 'grey'),col = 1, row = 2)
-
+
         for norm_feat in norm_list:
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=np.where(self.df[norm_feat] > 0, self.df['Close'], np.nan),legendgroup="up", mode='markers',name = 'up', marker_color = 'green',opacity = opacity),col = 1, row = 2)
@@ -338,8 +440,25 @@ class stock_eda_panel(object):

         fig.update_layout(height=900, width=1200)
         fig.show()
-
+
     def volatility_analysis(self, lags, trad_days, window_log_return, plot = False, save_features = False):
+
+        """
+        this method performs log return and volatilyty analysis of the closing prices
+
+        Parameters
+        ----------
+        lags (int): number of lags to apply to the closing prices
+        trad_days (int): number of trading days to anualize returns or volatility
+        window_log_return (int): window for rolling returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         df['log_return'] = np.log(df.Close/df.Close.shift(lags))
         df['sqr_log_return'] = np.square(df.log_return)
@@ -349,13 +468,13 @@ class stock_eda_panel(object):
             df.sort_values("Date")["log_return"]
            .transform(lambda x: x.rolling(window_log_return, min_periods=1).mean())
         )
-
+
         if save_features:
             self.features.append('volatility_log_return')
             self.features.append('roll_mean_log_return')
             self.features.append('log_return')
             self.settings_volatility = {'lags':lags, 'trad_days':trad_days, 'window_log_return':window_log_return}
-
+
         if plot:
             fig = make_subplots(rows=3, cols=1,vertical_spacing = 0.02,shared_xaxes=True,
                 specs=[
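A sketch of the return block in `volatility_analysis`: k-lag log returns, their square, a rolling mean, and an annualized rolling standard deviation with sqrt scaling (the scaling appears verbatim in `produce_log_volatility` later in this diff). Column names follow the diff; the toy price series and parameter values are illustrative:

```python
import numpy as np
import pandas as pd

close = pd.Series([100, 101, 99.5, 102, 103, 101.5], dtype=float)
lags, trad_days, window_log_return = 1, 252, 3

log_return = np.log(close / close.shift(lags))            # k-lag log return
sqr_log_return = np.square(log_return)                    # squared return (volatility proxy)
roll_mean_log_return = log_return.rolling(window_log_return, min_periods=1).mean()
volatility_log_return = log_return.rolling(window_log_return).std() * np.sqrt(trad_days)
print(volatility_log_return.round(4))
```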
@@ -395,10 +514,25 @@ class stock_eda_panel(object):

         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
-
-
+
     def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):

+        """
+        displays correlation curves, using spearman and pearson correlation, of a given feature at different time lags with respecto to a given target
+
+        Parameters
+        ----------
+        feature (str): feature name to apply lags
+        lag_list (list): list of lags, each lag as integer
+        column_target (str): target to get correlation, e.g return or mean reaturn
+        posterior_lag (int): for the target, posterior window shift to calculate a window return
+        test_size (int): data size of the test data. The remaining is going to be used as training data. This parameters is ment to avoid overfiting and leackage
+
+        Returns
+        -------
+        None
+        """
+
         results = dict()
         df = self.df.iloc[:-test_size,:][['Date','Close','roll_mean_log_return','log_return',feature]].sort_values('Date').copy()
         for i,lag in enumerate(lag_list):
@@ -413,7 +547,7 @@ class stock_eda_panel(object):
                 'lag':lag,
                 'pearsonr_log_return':r_log[0],
                 'spearman_log_return': sp_log[0],
-            }
+                }
         del df
         results_df = pd.DataFrame(results).T

@@ -426,10 +560,23 @@ class stock_eda_panel(object):
             plt.legend()
             plt.axhline(y=0, color='grey', linestyle='--')
             plt.show()
-
-
+
     def outlier_plot(self, zlim, plot = False, save_features = False):
-
+
+        """
+        perform outlier analysis of the log returns. It also permors normality test of returns
+
+        Parameters
+        ----------
+        zlim (float): alpha or z thrsholds for normalized returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         mean = self.df.log_return.mean()
         std = self.df.log_return.std()
         self.df['z_log_return'] = (self.df.log_return - mean)/std
@@ -440,7 +587,7 @@ class stock_eda_panel(object):
         self.df['up_outlier'] = zlim*self.df['z_std_log_return'] + mean_
         self.df['low_outlier'] = -zlim*self.df['z_std_log_return'] + mean_

-        self.df['
+        self.df['signal_low_outlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
         self.df['signal_up_outlier'] = np.where( (self.df['z_log_return'] > self.df['up_outlier'] ), 1, 0)
         if save_features:
             self.signals.append('signal_low_outlier')
@@ -451,7 +598,7 @@ class stock_eda_panel(object):
             sigma = self.df['z_log_return'].std()
             x = np.linspace(self.df['z_log_return'].min(),self.df['z_log_return'].max(), 15000)
             y = stats.norm.pdf(x, loc = mu, scale = sigma)
-
+
             fig, axs = plt.subplots(2, 1,figsize=(15,8))

             axs[0].hist(self.df['z_log_return'],density = True,bins = 100 , label = 'Returns distribution')
@@ -460,7 +607,7 @@ class stock_eda_panel(object):
             axs[0].axvline(l2, color='green', linestyle='--')
             axs[0].axvline(-l2, color='green', linestyle='--')
             axs[0].plot(x,y, linewidth = 3, color = 'r', label = 'Normal Dist Curve')
-
+
             axs[1].plot(self.df['Date'],self.df['z_log_return'])
             axs[1].plot(self.df['Date'],self.df['low_outlier'], linestyle='--')
             axs[1].plot(self.df['Date'],self.df['up_outlier'], linestyle='--')
@@ -469,18 +616,31 @@ class stock_eda_panel(object):
             plt.show()

         z_stat, p_stat = stats.normaltest(self.df['z_log_return'].dropna())
-        p_stat = round(p_stat, 7)
+        p_stat = round(p_stat, 7)
         print('---------------------- returns normality tests ----------------------------')
         if p_stat < 0.05:
             print(f'pvalue: {p_stat} then, returns do not follow a normal distribution')
         else:
             print(f'pvalue: {p_stat} then, returns follow a normal distribution')
-
+
     def analysis_roll_mean_log_returns(self, lags, plot = False):

+        """
+        perform analysis of lags of the mean rolling log return
+
+        Parameters
+        ----------
+        lags (int): lags to apply to the roll log return
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.df['lag'] = self.df.roll_mean_log_return.shift(lags)
         self.df['Diff'] = self.df['roll_mean_log_return'] - self.df['lag']
-
+
         if plot:

             fig, axs = plt.subplots(1, 3,figsize=(19,4))
@@ -493,7 +653,20 @@ class stock_eda_panel(object):
             plt.show()

     def compute_clip_bands(self,feature_name,threshold):
-
+
+        """
+        compute outlier detection for a given signal, Note that this follows mean reversion procedure and feature has to be stationary. Also botton and roof resulting signals is attached to the dataframe
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+        threshold (float): alpha or z thrsholds for normalized returns
+
+        Returns
+        -------
+        None
+        """
+
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/self.df[feature_name].std()
         mean_ = self.df[f'norm_{feature_name}'].mean()

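A sketch of the mean-reversion pattern behind `compute_clip_bands`: z-score the (stationary) feature, build upper/lower bands, and flag band breaches as `signal_up_`/`signal_low_` columns. The band construction between the normalization and the signal lines is elided from this hunk, so the rolling-std band width here is an assumption modeled on the neighboring methods:

```python
import numpy as np
import pandas as pd

def compute_clip_bands(df: pd.DataFrame, feature_name: str, threshold: float) -> pd.DataFrame:
    norm = (df[feature_name] - df[feature_name].mean()) / df[feature_name].std()
    df[f'norm_{feature_name}'] = norm
    roll_std = norm.rolling(50, min_periods=1).std()  # assumed window; the exact lines are elided
    df[f'upper_{feature_name}'] = threshold * roll_std + norm.mean()
    df[f'lower_{feature_name}'] = -threshold * roll_std + norm.mean()
    df[f'signal_low_{feature_name}'] = np.where(norm < df[f'lower_{feature_name}'], 1, 0)
    df[f'signal_up_{feature_name}'] = np.where(norm > df[f'upper_{feature_name}'], 1, 0)
    return df
```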
@@ -506,84 +679,140 @@ class stock_eda_panel(object):
         self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)

+    def extract_sec_data(self, symbol, base_columns, rename_columns=None):
+        """
+        extract new asset data and merge it to the main asset data
+
+        Parameters
+        ----------
+        symbol (str): symbol to extract data
+        base_columns (list): list of columns to persist
+        rename_columns (dict): map of the new column names using pd.DataFrame.rename()
+
+        Returns
+        -------
+        None
+        """
+        begin_date = self.today - relativedelta(days = self.n_days)
+        begin_date_str = begin_date.strftime('%Y-%m-%d')
+
+        stock = yf.Ticker(symbol)
+        df = stock.history(period=self.data_window)
+        df = df.sort_values('Date')
+        df.reset_index(inplace=True)
+        df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
+        df['Date'] = pd.to_datetime(df['Date'])
+        df = df[df.Date >= begin_date_str ]
+        df = df[base_columns]
+        if rename_columns:
+            df = df.rename(columns=rename_columns)
+        right_df = df.copy()
+
+        dates_vector = self.df.Date.to_frame()
+        right_df = dates_vector.merge(right_df, on ='Date',how = 'left')
+        right_df = right_df.fillna(method = 'bfill')
+        right_df = right_df.fillna(method = 'ffill')
+
+        self.df = self.df.merge(right_df, on ='Date',how = 'left')
+        self.df = self.df.sort_values("Date")
+        del right_df
+        gc.collect()
+
+    def lag_log_return(self, lags, feature, feature_name=False):
+        """
+        compute log return given some lags
+
+        Parameters
+        ----------
+        lags (int): lag to apply log return
+        feature (str): feature to apply log return
+        feature_name (str): rename resuling name
+
+        Returns
+        -------
+        None
+        """
+
+        feature_name = feature_name if feature_name else f"{feature}_log_return"
+        self.df[feature_name] = np.log(self.df[feature]/self.df[feature].shift(lags))
+
+    def produce_log_volatility(self, trad_days, feature, feature_name=False):
+        """
+        compute log return given some lags
+
+        Parameters
+        ----------
+        trad_days (int): window function to calculate standard deviation
+        feature (str): feature to apply computation
+        feature_name (str): resulting feature name
+
+        Returns
+        -------
+        None
+        """
+        feature_name = feature_name if feature_name else f"{feature}_log_return_{trad_days}"
+        self.df[feature_name] = self.df.sort_values("Date")[feature].rolling(window = trad_days).std()*np.sqrt(252)
+
     def signal_plotter(self, feature_name):
+
+        """
+        display analysis plot of a feature with high and low signals
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
+
         fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
+
         axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'norm_{feature_name}'])
-
+
         plot_acf(self.df[feature_name].dropna(),lags=25,ax = axs[1])
         axs[1].set_title(f'acf {feature_name}')
-
+
         plot_pacf(self.df[feature_name].dropna(),lags=25,ax = axs[2])
         axs[2].set_title(f'pacf {feature_name}')
-
+
         fig.show()

     def log_features_standard(self, feature_name):
+        """
+        save resulting feature names in an standard structure
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
         self.features.append(feature_name)
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
-
-    #######################
-    #### to be deprecated ####
-    def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
-
-        self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
-        self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
-
-        self.ma1_column = f'MA_{ma1}'
-        self.ma2_column = f'MA_{ma2}'
-        self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
-
-        self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
-        mean_ = self.df['norm_MA_spread'].mean()
-        self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
-
-        self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
-        self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
-
-        self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
-        self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
-        ### ploting purposes
-        self.df[f"Roll_mean_{ma1}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
-        )
-        self.df[f"Roll_mean_{ma2}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
-        )
-
-
-        print('--------------------------------------------------------------------')
-        if save_features:
-            self.features.append('MA_spread')
-            self.signals.append('signal_low_MA_spread')
-            self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
-        if plot:
-
-            fig, axs = plt.subplots(1, 3,figsize=(21,4))
-
-            axs[0].plot(self.df['Date'],self.df['norm_MA_spread'])
-            axs[0].plot(self.df['Date'],self.df['upper_MA_spread'], linestyle='--')
-            axs[0].plot(self.df['Date'],self.df['lower_MA_spread'], linestyle='--')
-            axs[0].set_title('MA_spread series')

-            plot_acf(self.df['MA_spread'].dropna(),lags=25, ax=axs[1])
-            axs[1].set_title('acf MA_spread series')
-
-            plot_pacf(self.df['MA_spread'].dropna(),lags=25, ax=axs[2])
-            axs[2].set_title('acf MA_spread series')
-            plt.show()
-    ##################################################
-
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
-
+        """
+        perform relative moving average features, one for short term and another for long/mid term
+
+        Parameters
+        ----------
+        ma1 (int): short term moving average window
+        ma2 (int): long/mid term moving average window
+        threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'rel_MA_spread'

         self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
@@ -605,16 +834,27 @@ class stock_eda_panel(object):
            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
         )

-        print('--------------------------------------------------------------------')
         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
+            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}

         if plot:
-
             self.signal_plotter(feature_name)
-
+
     def pair_feature(self, pair_symbol, plot = False):
+        """
+        initialize pair feature data extraction and analysis
+
+        Parameters
+        ----------
+        pair_symbol (str): symbol of the pair asset to extract
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.pair_symbol = pair_symbol
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
@@ -627,7 +867,7 @@ class stock_eda_panel(object):
         df['Date'] = pd.to_datetime(df['Date'])
         df = df[df.Date >= begin_date_str ]
         self.pair_df = df
-
+
         #### converting the same index ####
         dates_vector = self.df.Date.to_frame()
         self.pair_df = dates_vector.merge(self.pair_df, on ='Date',how = 'left')
@@ -653,8 +893,40 @@ class stock_eda_panel(object):
             plt.plot(self.df['Date'],asset_2_values,label = asset_2)
             plt.legend()
             plt.show()
-
+
+    def smooth_logrets_interaction_term(self, feature_interact_with, resulting_feature_name="persisted_clip_diff_smooths", rollmean_window = 5, ext_threhold=0.015, persist_days = 3, save_features=False):
+        """
+        create an interaction term that is going to compare the distance of asset wolling window mean and market rolling window mean.
+        then get the outliers or high values using abs and this value persist for some days
+        goal persist big differences of market and asset returns
+
+        feature_interact_with: name of the market return
+        rollmean_window: rolling window or smoothing number of days
+        ext_threhold: threshold
+        persist_days: number of days to persis the signal
+        """
+        self.df["smooth_log_return"] = self.df['log_return'].rolling(rollmean_window).mean().values
+        self.df["smooth_market_log_return"] = self.df[feature_interact_with].rolling(rollmean_window).mean().values
+        self.df["diff_smooths"] = self.df["smooth_market_log_return"]-self.df["smooth_log_return"]
+        self.df["clip_diff_smooths"] = np.where(np.abs(self.df["diff_smooths"]) > ext_threhold, self.df["diff_smooths"] , 0)
+        self.df[resulting_feature_name] = self.df['clip_diff_smooths'].rolling(persist_days).mean().values
+        self.df = self.df.drop(columns=["smooth_log_return","smooth_market_log_return","diff_smooths","clip_diff_smooths"])
+
     def calculate_cointegration(self,series_1, series_2):
+        """
+        calculate cointegration score for two time series
+
+        Parameters
+        ----------
+        series_1 (pd.series): time series
+        series_2 (pd.series): time series
+
+        Returns
+        -------
+        coint_flag (boolean): 1 if the p_value cointegration_t are lower than 0.05 and critical value
+        hedge_value (float): beta from the regression model
+        """
+
         coint_flag = 0
         coint_res = coint(series_1, series_2)
         coint_t = coint_res[0]
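A sketch of the cointegration check documented above: statsmodels' Engle-Granger `coint()` (the call shown in the hunk) yields the t-statistic, p-value, and critical values, and the method flags the pair when both the p-value and t-statistic clear their thresholds. The OLS fit for the hedge ratio is an assumption for the elided middle of the method; the toy series are constructed to cointegrate:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint

rng = np.random.default_rng(1)
s1 = pd.Series(np.cumsum(rng.normal(0, 1, 300)))
s2 = 0.8 * s1 + rng.normal(0, 1, 300)  # cointegrated by construction

coint_t, p_value, critical_values = coint(s1, s2)
hedge_value = sm.OLS(s1, sm.add_constant(s2)).fit().params.iloc[1]  # regression beta
coint_flag = 1 if p_value < 0.05 and coint_t < critical_values[1] else 0  # 5% critical value
print(coint_flag, round(hedge_value, 3))
```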
@@ -666,9 +938,22 @@ class stock_eda_panel(object):
         coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0

         return coint_flag, hedge_value
-
-    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):

+    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
+        """
+        display analysis of the pair feature and save results in case if needed
+
+        Parameters
+        ----------
+        window (int): window to apply to the rolling spread between pair and main asset
+        z_threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         spread_series = pd.Series(self.df.pair_spread)
         mean = spread_series.rolling(center = False, window = window).mean()
         std = spread_series.rolling(center = False, window = window).std()
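A sketch of the rolling z-score applied to the pair spread in this method (window mean/std shown in the hunk; the z-score division is elided but implied), with the same `z_threshold` entry signals. Toy numbers only:

```python
import numpy as np
import pandas as pd

spread = pd.Series(np.random.default_rng(2).normal(0, 1, 200)).cumsum()
window, z_threshold = 30, 1.95

mean = spread.rolling(window=window).mean()
std = spread.rolling(window=window).std()
pair_z_score = (spread - mean) / std                       # assumed from the surrounding lines
signal_low = np.where(pair_z_score < -z_threshold, 1, 0)   # as in the diff
signal_up = np.where(pair_z_score > z_threshold, 1, 0)
```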
@@ -677,11 +962,11 @@ class stock_eda_panel(object):
         self.df['pair_z_score'] = z_score
         self.df['signal_low_pair_z_score'] = np.where(self.df['pair_z_score'] < -z_threshold, 1, 0)
         self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
-
+
         if save_features:
             self.log_features_standard('pair_z_score')
-            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
-
+            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
+
         if plot:
             pvalue = round(adfuller(z_score.dropna().values)[1],4)
             print(f'p value of the rolling z-score is {pvalue}')
@@ -695,7 +980,7 @@ class stock_eda_panel(object):
             axs[0,0].axhline(y=0, color='blue', linestyle='-.')
             axs[0,0].plot(self.df.pair_z_score)
             axs[0,0].set_title('z score from the spread')
-
+
             axs[0,1].plot(self.df['Date'],self.df['pair_spread'])
             axs[0,1].plot(self.df['Date'],np.where(self.df['signal_low_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'red')
             axs[0,1].plot(self.df['Date'],np.where(self.df['signal_up_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'green')
@@ -704,44 +989,27 @@ class stock_eda_panel(object):

             plot_acf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,0])
             axs[1,0].set_title('acf pair_z_score')
-
+
             plot_pacf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,1])
             axs[1,1].set_title('pacf pair_z_score')
-
-            plt.show()
-
-    #######################
-    #### to be deprecated ####
-    def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
-        # negative countiing and rolling countingng
-        self.df['RetClose'] = self.df['Close'].pct_change()
-        self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
-        self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
-
-        mean = self.df['roll_pos_counting'].mean()
-        std = self.df['roll_pos_counting'].std()
-        self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std

-        self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
-        self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
-        if save_features:
-            self.features.append('roll_pos_counting')
-            self.signals.append('signal_up_roll_pos_counting')
-            self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
-        if plot:
-            fig = plt.figure(figsize = (10,4))
-            plt.plot(self.df['Date'],self.df.norm_counting)
-            plt.axhline(y=threshold, color='grey', linestyle='--')
-            plt.axhline(y=-threshold, color='grey', linestyle='--')
             plt.show()
-
-
+
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
+        """
+        perform negative and positive return counting in a given rolling time window
+
+        Parameters
+        ----------
+        rolling_window (int): window to apply to positive and negative returns
+        threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'bidirect_counting'
         # negative countiing and rolling countingng
         self.df['RetClose'] = self.df['Close'].pct_change()
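A sketch of the rolling up-day counting behind `bidirect_count_feature`, following the removed `get_count_feature` logic shown above (count positive closes in a window, z-score the count, threshold into signals); the bidirectional variant presumably adds the symmetric negative count. Toy data only:

```python
import numpy as np
import pandas as pd

close = pd.Series(np.random.default_rng(3).normal(0, 1, 120)).cumsum() + 100
rolling_window, threshold = 20, 1.5

ret = close.pct_change()
pos = pd.Series(np.where(ret.shift(1) > 0, 1, 0))           # 1 on prior up-days
roll_pos_counting = pos.rolling(window=rolling_window).sum()
norm_counting = (roll_pos_counting - roll_pos_counting.mean()) / roll_pos_counting.std()
signal_up = np.where(norm_counting > threshold, 1, 0)
signal_low = np.where(norm_counting < -threshold, 1, 0)
```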
@@ -757,7 +1025,7 @@ class stock_eda_panel(object):
|
|
|
757
1025
|
|
|
758
1026
|
if save_features:
|
|
759
1027
|
self.log_features_standard(feature_name)
|
|
760
|
-
self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
|
|
1028
|
+
self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
|
|
761
1029
|
|
|
762
1030
|
if plot:
|
|
763
1031
|
fig = plt.figure(figsize = (10,4))
|
|
@@ -766,47 +1034,21 @@ class stock_eda_panel(object):
|
|
|
766
1034
|
plt.plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
|
|
767
1035
|
plt.show()
|
|
768
1036
|
|
|
769
|
-
#######################
|
|
770
|
-
#### to be deprecated ####
|
|
771
|
-
def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
|
|
772
|
-
|
|
773
|
-
self.df["Range"] = self.df["High"] / self.df["Low"] - 1
|
|
774
|
-
self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
|
|
775
|
-
self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
|
|
776
|
-
self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
|
|
777
|
-
|
|
778
|
-
mean_ = self.df['norm_dist_range'].mean()
|
|
779
|
-
self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
|
|
780
|
-
|
|
781
|
-
self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
|
|
782
|
-
self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
|
|
783
|
-
|
|
784
|
-
self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
|
|
785
|
-
self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
|
|
786
|
-
|
|
787
|
-
if save_features:
|
|
788
|
-
self.features.append('dist_range')
|
|
789
|
-
self.signals.append('signal_up_dist_range')
|
|
790
|
-
self.signals.append('signal_low_dist_range')
|
|
791
|
-
self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
|
|
792
|
-
|
|
793
|
-
if plot:
|
|
794
|
-
fig, axs = plt.subplots(2, 2,figsize=(17,11))
|
|
795
|
-
|
|
796
|
-
axs[0,0].plot(self.df['Range'])
|
|
797
|
-
axs[0,0].set_title('range')
|
|
798
|
-
|
|
799
|
-
axs[0,1].plot(self.df['Avg_range'])
|
|
800
|
-
axs[0,1].set_title('Avg_range')
|
|
801
|
-
|
|
802
|
-
axs[1,0].plot(self.df['up_bound_norm_dist_range'],color = 'grey', linestyle='--')
|
|
803
|
-
axs[1,0].plot(self.df['low_bound_norm_dist_range'],color = 'grey', linestyle='--')
|
|
804
|
-
axs[1,0].plot(self.df['norm_dist_range'])
|
|
805
|
-
axs[1,0].set_title('norm_dist_range')
|
|
806
|
-
#######################
|
|
807
|
-
|
|
808
1037
|
def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
|
|
809
|
-
|
|
1038
|
+
"""
|
|
1039
|
+
perform relative spread of opening and closing price
|
|
1040
|
+
|
|
1041
|
+
Parameters
|
|
1042
|
+
----------
|
|
1043
|
+
window (int): window to apply to the feature
|
|
1044
|
+
threshold (float): alpha or z thrsholds for the normalized feature
|
|
1045
|
+
plot (boolean): True to display plot
|
|
1046
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1047
|
+
|
|
1048
|
+
Returns
|
|
1049
|
+
-------
|
|
1050
|
+
None
|
|
1051
|
+
"""
|
|
810
1052
|
feature_name = 'CO_Range'
|
|
811
1053
|
self.df[feature_name] = self.df["Close"] / self.df["Open"]-1
|
|
812
1054
|
self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/ self.df[feature_name].std()
|
|
@@ -822,7 +1064,7 @@ class stock_eda_panel(object):
|
|
|
822
1064
|
|
|
823
1065
|
if save_features:
|
|
824
1066
|
self.log_features_standard(feature_name)
|
|
825
|
-
self.settings_relative_price_range = {'window':window, 'threshold':threshold}
|
|
1067
|
+
self.settings_relative_price_range = {'window':window, 'threshold':threshold}
|
|
826
1068
|
|
|
827
1069
|
if plot:
|
|
828
1070
|
fig, axs = plt.subplots(1, 2,figsize=(14,5))
|
|
@@ -835,46 +1077,24 @@ class stock_eda_panel(object):
|
|
|
835
1077
|
axs[1].plot(self.df[f'norm_{feature_name}'])
|
|
836
1078
|
axs[1].set_title(f'norm_{feature_name}')
|
|
837
1079
|
|
|
838
|
-
#######################
|
|
839
|
-
#### to be deprecated ####
|
|
840
|
-
def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
|
|
841
|
-
|
|
842
|
-
rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
|
|
843
|
-
self.df['RSI'] = rsi
|
|
844
|
-
self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
|
|
845
|
-
|
|
846
|
-
mean = self.df['RSI_ret'].mean()
|
|
847
|
-
std = self.df['RSI_ret'].std()
|
|
848
|
-
self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
|
|
849
|
-
self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
|
|
850
|
-
self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
|
|
851
|
-
|
|
852
|
-
if save_features:
|
|
853
|
-
self.features.append('RSI_ret')
|
|
854
|
-
self.signals.append('signal_up_RSI_ret')
|
|
855
|
-
self.signals.append('signal_low_RSI_ret')
|
|
856
|
-
self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
|
|
857
|
-
|
|
858
|
-
if plot:
|
|
859
|
-
fig, axs = plt.subplots(1, 3,figsize=(17,5))
|
|
860
|
-
|
|
861
|
-
axs[0].plot(self.df.norm_RSI_ret)
|
|
862
|
-
axs[0].axhline(y=threshold, color='grey', linestyle='--')
|
|
863
|
-
axs[0].axhline(y=-threshold, color='grey', linestyle='--')
|
|
864
|
-
|
|
865
|
-
plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
|
|
866
|
-
axs[1].set_title('acf RSI_ret')
|
|
867
|
-
|
|
868
|
-
plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
|
|
869
|
-
axs[2].set_title('pacf RSI_ret')
|
|
870
|
-
|
|
871
|
-
fig.show()
|
|
872
|
-
#######################
|
|
873
|
-
|
|
874
1080
|
def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
|
|
1081
|
+
"""
|
|
1082
|
+
compute the relative strength index (RSI)
|
|
1083
|
+
|
|
1084
|
+
Parameters
|
|
1085
|
+
----------
|
|
1086
|
+
window (int): window to apply to the feature
|
|
1087
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1088
|
+
plot (boolean): True to display plot
|
|
1089
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1090
|
+
|
|
1091
|
+
Returns
|
|
1092
|
+
-------
|
|
1093
|
+
None
|
|
1094
|
+
"""
|
|
875
1095
|
feature_name = 'RSI'
|
|
876
1096
|
rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
|
|
877
|
-
self.df[feature_name] = rsi
|
|
1097
|
+
self.df[feature_name] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
878
1098
|
self.compute_clip_bands(feature_name,threshold)
|
|
879
1099
|
|
|
880
1100
|
if save_features:
|
|
@@ -883,54 +1103,22 @@ class stock_eda_panel(object):
|
|
|
883
1103
|
|
|
884
1104
|
if plot:
|
|
885
1105
|
self.signal_plotter(feature_name)
|
|
886
|
-
|
|
887
|
-
#######################
|
|
888
|
-
#### to be deprecated ####
|
|
889
|
-
def days_features(self, window_day, limit, plot = False, save_features = False):
|
|
890
|
-
|
|
891
|
-
self.df['dow'] = self.df.Date.dt.dayofweek
|
|
892
|
-
self.df['dow'] = self.df['dow'].astype('str')
|
|
893
|
-
|
|
894
|
-
self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
|
|
895
|
-
|
|
896
|
-
mean = self.df['target_mean_input'].mean()
|
|
897
|
-
std = self.df['target_mean_input'].std()
|
|
898
|
-
|
|
899
|
-
self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
|
|
900
|
-
mean_ = self.df['norm_dow_input'].mean()
|
|
901
|
-
self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()
|
|
902
|
-
|
|
903
|
-
self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
|
|
904
|
-
self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
|
|
905
|
-
|
|
906
|
-
self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
|
|
907
|
-
self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
|
|
908
1106
|
|
|
909
|
-
if save_features:
|
|
910
|
-
|
|
911
|
-
self.features.append('target_mean_input')
|
|
912
|
-
self.signals.append('signal_up_target_mean_input')
|
|
913
|
-
self.signals.append('signal_low_target_mean_input')
|
|
914
|
-
self.settings_days_features = {'window_day':window_day, 'limit':limit}
|
|
915
|
-
|
|
916
|
-
if plot:
|
|
917
|
-
fig, axs = plt.subplots(1, 3,figsize=(17,5))
|
|
918
|
-
|
|
919
|
-
axs[0].plot(self.df['norm_dow_input'])
|
|
920
|
-
axs[0].plot(self.df['up_dow_input'], linestyle='--')
|
|
921
|
-
axs[0].plot(self.df['low_dow_input'], linestyle='--')
|
|
922
|
-
|
|
923
|
-
plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
|
|
924
|
-
axs[1].set_title('acf day feature')
|
|
925
|
-
|
|
926
|
-
plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
|
|
927
|
-
axs[2].set_title('pacf day feature')
|
|
928
|
-
|
|
929
|
-
fig.show()
|
|
930
|
-
#######################
|
|
931
|
-
|
|
932
1107
|
def days_features_bands(self, window, threshold, plot = False, save_features = False):
|
|
933
|
-
|
|
1108
|
+
"""
|
|
1109
|
+
compute rolling mean returns per day of the week over a given window
|
|
1110
|
+
|
|
1111
|
+
Parameters
|
|
1112
|
+
----------
|
|
1113
|
+
window (int): window to apply to the feature
|
|
1114
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1115
|
+
plot (boolean): True to display plot
|
|
1116
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1117
|
+
|
|
1118
|
+
Returns
|
|
1119
|
+
-------
|
|
1120
|
+
None
|
|
1121
|
+
"""
|
|
934
1122
|
self.df['dow'] = self.df.Date.dt.dayofweek
|
|
935
1123
|
self.df['dow'] = self.df['dow'].astype('str')
|
|
936
1124
|
|
|
@@ -947,65 +1135,22 @@ class stock_eda_panel(object):
|
|
|
947
1135
|
|
|
948
1136
|
if plot:
|
|
949
1137
|
self.signal_plotter(feature_name)
|
|
950
|
-
|
|
951
|
-
#######################
|
|
952
|
-
#### to be deprecated ####
|
|
953
|
-
def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
|
|
954
|
-
|
|
955
|
-
self.df['log_Volume'] = np.log(self.df['Volume'])
|
|
956
|
-
self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
|
|
957
|
-
|
|
958
|
-
self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
|
|
959
|
-
mean_ = self.df['norm_ret_log_Volume'].mean()
|
|
960
|
-
self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
|
|
961
1138
|
|
|
962
|
-
self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
|
|
963
|
-
self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
|
|
964
|
-
|
|
965
|
-
self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
|
|
966
|
-
self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
|
|
967
|
-
|
|
968
|
-
if save_features:
|
|
969
|
-
self.features.append('ret_log_Volume')
|
|
970
|
-
self.signals.append('signal_up_ret_log_Volume')
|
|
971
|
-
self.signals.append('signal_low_ret_log_Volume')
|
|
972
|
-
self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
|
|
973
|
-
if plot:
|
|
974
|
-
fig, axs = plt.subplots(3, 2,figsize=(11,13))
|
|
975
|
-
axs[0,0].plot(self.df.Date, self.df.Volume)
|
|
976
|
-
axs[0,0].set_title('Volume')
|
|
977
|
-
axs[0,1].plot(self.df.Date, self.df.log_Volume)
|
|
978
|
-
axs[0,1].set_title('log Volume')
|
|
979
|
-
|
|
980
|
-
plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
|
|
981
|
-
axs[1,0].set_title('acf log_Volume')
|
|
982
|
-
plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
|
|
983
|
-
axs[1,1].set_title('pacf log_Volume')
|
|
984
|
-
|
|
985
|
-
plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
|
|
986
|
-
axs[2,0].set_title('acf ret_log_Volume')
|
|
987
|
-
plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
|
|
988
|
-
axs[2,1].set_title('pacf ret_log_Volume')
|
|
989
|
-
|
|
990
|
-
plt.show()
|
|
991
|
-
|
|
992
|
-
print('--------------------------------------------------------------')
|
|
993
|
-
|
|
994
|
-
fig, axs = plt.subplots(1, 2,figsize=(10,4))
|
|
995
|
-
|
|
996
|
-
axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
|
|
997
|
-
axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
|
|
998
|
-
axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
|
|
999
|
-
axs[0].set_title('norm_ret_log_Volume')
|
|
1000
|
-
|
|
1001
|
-
axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
|
|
1002
|
-
axs[1].set_title('std_norm_ret_log_Volume')
|
|
1003
|
-
|
|
1004
|
-
plt.show()
|
|
1005
|
-
#######################
|
|
1006
|
-
|
|
1007
1139
|
def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
|
|
1008
|
-
|
|
1140
|
+
"""
|
|
1141
|
+
compute a smoothed trading-volume feature
|
|
1142
|
+
|
|
1143
|
+
Parameters
|
|
1144
|
+
----------
|
|
1145
|
+
window (int): window to apply to the feature
|
|
1146
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1147
|
+
plot (boolean): True to display plot
|
|
1148
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1149
|
+
|
|
1150
|
+
Returns
|
|
1151
|
+
-------
|
|
1152
|
+
None
|
|
1153
|
+
"""
|
|
1009
1154
|
feature_name = 'smooth_Volume'
|
|
1010
1155
|
self.df[feature_name] = np.log(self.df['Volume'])
|
|
1011
1156
|
# self.df[feature_name] = self.df['log_Volume'].rolling(window).mean()
|
|
@@ -1039,7 +1184,7 @@ class stock_eda_panel(object):
|
|
|
1039
1184
|
|
|
1040
1185
|
fig, axs = plt.subplots(1,2,figsize=(10,4))
|
|
1041
1186
|
|
|
1042
|
-
axs[0].plot(self.df[f'{feature_name}'])
|
|
1187
|
+
axs[0].plot(self.df[f'{feature_name}'])
|
|
1043
1188
|
axs[0].set_title(f'{feature_name}')
|
|
1044
1189
|
|
|
1045
1190
|
axs[1].plot(self.df[f'z_{feature_name}'], linestyle='--')
|
|
@@ -1048,9 +1193,23 @@ class stock_eda_panel(object):
|
|
|
1048
1193
|
plt.show()
|
|
1049
1194
|
|
|
1050
1195
|
def roc_feature(self, window, threshold, plot = False, save_features = False):
|
|
1196
|
+
"""
|
|
1197
|
+
compute the price rate of change (ROC)
|
|
1198
|
+
|
|
1199
|
+
Parameters
|
|
1200
|
+
----------
|
|
1201
|
+
window (int): window to apply to the feature
|
|
1202
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1203
|
+
plot (boolean): True to display plot
|
|
1204
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1205
|
+
|
|
1206
|
+
Returns
|
|
1207
|
+
-------
|
|
1208
|
+
None
|
|
1209
|
+
"""
|
|
1051
1210
|
feature_name = 'ROC'
|
|
1052
1211
|
roc = ROCIndicator(close = self.df['Close'], window = window).roc()
|
|
1053
|
-
self.df[feature_name] = roc
|
|
1212
|
+
self.df[feature_name] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1054
1213
|
self.compute_clip_bands(feature_name,threshold)
|
|
1055
1214
|
|
|
1056
1215
|
if save_features:
|
|
@@ -1058,11 +1217,27 @@ class stock_eda_panel(object):
|
|
|
1058
1217
|
self.settings_roc_feature = {'window':window, 'threshold':threshold}
|
|
1059
1218
|
if plot:
|
|
1060
1219
|
self.signal_plotter(feature_name)
|
|
1061
|
-
|
|
1220
|
+
|
|
1062
1221
|
def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
|
|
1222
|
+
"""
|
|
1223
|
+
compute the stochastic RSI oscillator feature
|
|
1224
|
+
|
|
1225
|
+
Parameters
|
|
1226
|
+
----------
|
|
1227
|
+
window (int): window to apply to the feature
|
|
1228
|
+
smooth1 (int): smoothing parameter 1
|
|
1229
|
+
smooth2 (int): smoothing parameter 2
|
|
1230
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1231
|
+
plot (boolean): True to display plot
|
|
1232
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1233
|
+
|
|
1234
|
+
Returns
|
|
1235
|
+
-------
|
|
1236
|
+
None
|
|
1237
|
+
"""
|
|
1063
1238
|
feature_name = 'STOCH'
|
|
1064
1239
|
stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
|
|
1065
|
-
self.df[feature_name] = stoch
|
|
1240
|
+
self.df[feature_name] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1066
1241
|
self.compute_clip_bands(feature_name,threshold)
|
|
1067
1242
|
|
|
1068
1243
|
if save_features:
|
|
@@ -1072,9 +1247,24 @@ class stock_eda_panel(object):
|
|
|
1072
1247
|
self.signal_plotter(feature_name)
|
|
1073
1248
|
|
|
1074
1249
|
def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
|
|
1250
|
+
"""
|
|
1251
|
+
compute the stochastic oscillator feature
|
|
1252
|
+
|
|
1253
|
+
Parameters
|
|
1254
|
+
----------
|
|
1255
|
+
window (int): window to apply to the feature
|
|
1256
|
+
smooth (int): smoothing parameter
|
|
1257
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1258
|
+
plot (boolean): True to display plot
|
|
1259
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1260
|
+
|
|
1261
|
+
Returns
|
|
1262
|
+
-------
|
|
1263
|
+
None
|
|
1264
|
+
"""
|
|
1075
1265
|
feature_name = 'STOCHOSC'
|
|
1076
1266
|
stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
|
|
1077
|
-
self.df[feature_name] = stochast
|
|
1267
|
+
self.df[feature_name] = stochast.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1078
1268
|
self.compute_clip_bands(feature_name,threshold)
|
|
1079
1269
|
|
|
1080
1270
|
if save_features:
|
|
@@ -1084,9 +1274,23 @@ class stock_eda_panel(object):
|
|
|
1084
1274
|
self.signal_plotter(feature_name)
|
|
1085
1275
|
|
|
1086
1276
|
def william_feature(self, lbp, threshold, plot = False, save_features = False):
|
|
1277
|
+
"""
|
|
1278
|
+
compute the fast stochastic oscillator, i.e. the Williams %R indicator
|
|
1279
|
+
|
|
1280
|
+
Parameters
|
|
1281
|
+
----------
|
|
1282
|
+
lbp (int): look back parameter
|
|
1283
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1284
|
+
plot (boolean): True to display plot
|
|
1285
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1286
|
+
|
|
1287
|
+
Returns
|
|
1288
|
+
-------
|
|
1289
|
+
None
|
|
1290
|
+
"""
|
|
1087
1291
|
feature_name = 'WILL'
|
|
1088
|
-
will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
|
|
1089
|
-
self.df[feature_name] = will
|
|
1292
|
+
will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
|
|
1293
|
+
self.df[feature_name] = will.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1090
1294
|
self.compute_clip_bands(feature_name,threshold)
|
|
1091
1295
|
|
|
1092
1296
|
if save_features:
|
|
@@ -1096,9 +1300,23 @@ class stock_eda_panel(object):
|
|
|
1096
1300
|
self.signal_plotter(feature_name)
|
|
1097
1301
|
|
|
1098
1302
|
def vortex_feature(self, window, threshold, plot = False, save_features = False):
|
|
1303
|
+
"""
|
|
1304
|
+
compute the vortex oscillator
|
|
1305
|
+
|
|
1306
|
+
Parameters
|
|
1307
|
+
----------
|
|
1308
|
+
window (int): window to apply to the feature
|
|
1309
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1310
|
+
plot (boolean): True to display plot
|
|
1311
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1312
|
+
|
|
1313
|
+
Returns
|
|
1314
|
+
-------
|
|
1315
|
+
None
|
|
1316
|
+
"""
|
|
1099
1317
|
feature_name = 'VORTEX'
|
|
1100
1318
|
vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
|
|
1101
|
-
self.df[feature_name] = vortex
|
|
1319
|
+
self.df[feature_name] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1102
1320
|
self.compute_clip_bands(feature_name,threshold)
|
|
1103
1321
|
|
|
1104
1322
|
if save_features:
|
|
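All of the indicator methods in this stretch of the diff (RSI, ROC, STOCH, STOCHOSC, WILL, VORTEX) now share the same post-processing: infinities are zeroed and remaining gaps forward-filled before compute_clip_bands derives the signal bands. A minimal standalone sketch of that cleaning pattern, using an illustrative price series:

    import numpy as np
    import pandas as pd
    from ta.momentum import RSIIndicator

    close = pd.Series(100 + np.random.default_rng(0).normal(0, 1, 100).cumsum())
    rsi = RSIIndicator(close=close, window=14).rsi()
    # same cleaning as in the diff: zero out infinities, forward-fill remaining gaps
    rsi = rsi.replace([np.inf, -np.inf], 0).fillna(method='ffill')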
@@ -1107,11 +1325,160 @@ class stock_eda_panel(object):
|
|
|
1107
1325
|
if plot:
|
|
1108
1326
|
self.signal_plotter(feature_name)
|
|
1109
1327
|
|
|
1110
|
-
def
|
|
1328
|
+
def minmax_pricefeature(self, type_func, window, distance = False, plot = False, save_features = False):
|
|
1329
|
+
"""
|
|
1330
|
+
compute the relative price, or the time distance, with respect to the min/max price in a given time scope
|
|
1331
|
+
|
|
1332
|
+
Parameters
|
|
1333
|
+
----------
|
|
1334
|
+
type_func (str): either min or max
|
|
1335
|
+
window (int): window scope
|
|
1336
|
+
distance (boolean): if True, compute the time-distance feature, else the relative-price feature
|
|
1337
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1338
|
+
|
|
1339
|
+
Returns
|
|
1340
|
+
-------
|
|
1341
|
+
None
|
|
1342
|
+
"""
|
|
1343
|
+
if type_func == 'min':
|
|
1344
|
+
self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].min(axis = 1)
|
|
1345
|
+
elif type_func == 'max':
|
|
1346
|
+
self.df['Price_ref'] = self.df[['Open','High', 'Low','Close']].max(axis = 1)
|
|
1347
|
+
|
|
1348
|
+
init_shape = self.df.shape[0]
|
|
1349
|
+
df_date = self.df[['Date','Price_ref']].rename(columns = {'Date':'Date_ref'}).copy()
|
|
1350
|
+
|
|
1351
|
+
self.df = self.df.rename(columns = {'Price_ref':'Price_to_use'})
|
|
1352
|
+
|
|
1353
|
+
if type_func == 'min':
|
|
1354
|
+
self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).min()))
|
|
1355
|
+
elif type_func == 'max':
|
|
1356
|
+
self.df[f'window_price'] = (self.df.sort_values("Date")["Price_to_use"].transform(lambda x: x.rolling(window, min_periods=1).max()))
|
|
1357
|
+
|
|
1358
|
+
|
|
1359
|
+
self.df = self.df.merge(df_date, left_on = 'window_price', right_on = 'Price_ref', how = 'left')
|
|
1360
|
+
self.df['date_span'] = self.df['Date'] - self.df['Date_ref']
|
|
1361
|
+
|
|
1362
|
+
self.df['RN'] = self.df.sort_values(['date_span'], ascending=False).groupby(['Date']).cumcount() + 1
|
|
1363
|
+
self.df = self.df[self.df['RN'] == 1]
|
|
1364
|
+
|
|
1365
|
+
if distance:
|
|
1366
|
+
self.df[f'{type_func}_distance_to_price'] = pd.to_numeric(self.df['date_span'].dt.days, downcast='integer')
|
|
1367
|
+
|
|
1368
|
+
if not distance:
|
|
1369
|
+
if type_func == 'min':
|
|
1370
|
+
self.df[f'{type_func}_relprice'] = self.df['Price_to_use']/self.df['window_price']-1
|
|
1371
|
+
|
|
1372
|
+
if type_func == 'max':
|
|
1373
|
+
self.df[f'{type_func}_relprice'] = self.df['window_price']/self.df['Price_to_use']-1
|
|
1374
|
+
|
|
1375
|
+
self.df = self.df.drop(columns = ['RN', 'date_span', 'Price_to_use', 'window_price', 'Date_ref','Price_ref'])
|
|
1376
|
+
|
|
1377
|
+
end_shape = self.df.shape[0]
|
|
1378
|
+
|
|
1379
|
+
if init_shape != end_shape:
|
|
1380
|
+
raise Exception("shapes are not the same")
|
|
1381
|
+
|
|
1382
|
+
if save_features:
|
|
1383
|
+
if distance:
|
|
1384
|
+
self.features.append(f'{type_func}_distance_to_price')
|
|
1385
|
+
name_attr = f'{type_func}_distance'
|
|
1386
|
+
if not distance:
|
|
1387
|
+
self.features.append(f'{type_func}_relprice')
|
|
1388
|
+
name_attr = f'{type_func}_relprice'
|
|
1389
|
+
|
|
1390
|
+
setattr(self,f'settings_{name_attr}_pricefeature' , {'type_func': type_func, 'window': window, 'distance': distance})
|
|
1391
|
+
|
|
1392
|
+
def expected_return(self, trad_days, feature, feature_name=False):
|
|
1393
|
+
"""
|
|
1394
|
+
compute the expected forward return as the row-wise maximum of the returns over the next trad_days shifted prices
|
|
1395
|
+
|
|
1396
|
+
Parameters
|
|
1397
|
+
----------
|
|
1398
|
+
trad_days (int): forward window, i.e. number of future steps considered
|
|
1399
|
+
feature (str): column on which to compute the expected return
|
|
1400
|
+
feature_name (str): resulting feature name
|
|
1401
|
+
|
|
1402
|
+
Returns
|
|
1403
|
+
-------
|
|
1404
|
+
None
|
|
1405
|
+
"""
|
|
1406
|
+
feature_name = feature_name if feature_name else f"{feature}_log_return_{trad_days}"
|
|
1407
|
+
tmp_names = list()
|
|
1408
|
+
for ind in range(1,trad_days+1):
|
|
1409
|
+
tmp_name = f"expected_{ind}"
|
|
1410
|
+
self.df[tmp_name] = self.df[feature].shift(-ind)/self.df[feature]-1
|
|
1411
|
+
tmp_names.append(tmp_name)
|
|
1412
|
+
self.df[feature_name] = self.df[tmp_names].max(axis=1)
|
|
1413
|
+
self.df = self.df.drop(columns = tmp_names)
|
|
1414
|
+
|
|
1415
|
+
def rolling_feature(self, feature, window, function):
|
|
1416
|
+
"""
|
|
1417
|
+
compute a rolling (non-expanding) window operation for a given feature
|
|
1418
|
+
|
|
1419
|
+
Parameters
|
|
1420
|
+
----------
|
|
1421
|
+
feature (str): feature to apply the window operation to
|
|
1422
|
+
window (int): window size
|
|
1423
|
+
function (str): name of a pandas rolling method, e.g. 'min', 'max', 'mean'
|
|
1424
|
+
|
|
1425
|
+
Returns
|
|
1426
|
+
-------
|
|
1427
|
+
None
|
|
1428
|
+
"""
|
|
1429
|
+
feature_name = f"{feature}_{window}_{function}"
|
|
1430
|
+
self.df[feature_name] = getattr(self.df.sort_values("Date")[feature].rolling(window), function)()
|
|
1431
|
+
|
|
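Because rolling_feature resolves function with getattr on the pandas Rolling object, the string must be a valid rolling method name. An illustrative call, assuming panel is a stock_eda_panel instance:

    panel.rolling_feature('Close', window=20, function='mean')  # creates column 'Close_20_mean'
    panel.rolling_feature('Close', window=50, function='max')   # creates column 'Close_50_max'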
1432
|
+
def time_distance(self, feature_base,feature_window, result_feature_name, max_window=None):
|
|
1433
|
+
"""
|
|
1434
|
+
compute the time distance to a given window feature
|
|
1435
|
+
|
|
1436
|
+
Parameters
|
|
1437
|
+
----------
|
|
1438
|
+
feature_base (str): name of the underlying feature
|
|
1439
|
+
feature_window (str): name of the window feature
|
|
1440
|
+
result_feature_name (str): resulting feature name
|
|
1441
|
+
max_window (int): cap applied to the time-distance feature
|
|
1442
|
+
|
|
1443
|
+
Returns
|
|
1444
|
+
-------
|
|
1445
|
+
None
|
|
1446
|
+
"""
|
|
1447
|
+
self.df["Date_pivot"] = np.nan
|
|
1448
|
+
self.df["Date_pivot"] = self.df["Date_pivot"].case_when([
|
|
1449
|
+
(self.df[feature_base] == self.df[feature_window], self.df["Date"]),
|
|
1450
|
+
|
|
1451
|
+
])
|
|
1452
|
+
self.df["Date_pivot"] = self.df.sort_values("Date")["Date_pivot"].fillna(method="ffill")
|
|
1453
|
+
self.df[result_feature_name] = self.df["Date"] - self.df["Date_pivot"]
|
|
1454
|
+
self.df[result_feature_name] = self.df[result_feature_name].dt.days
|
|
1455
|
+
if max_window:
|
|
1456
|
+
self.df[result_feature_name] = self.df[result_feature_name].clip(0,max_window)
|
|
1457
|
+
self.df = self.df.drop(columns = ["Date_pivot"])
|
|
1458
|
+
|
|
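Note that Series.case_when used above requires pandas >= 2.2. A hedged sketch chaining rolling_feature and time_distance to measure days since the last 50-day high (column names follow the {feature}_{window}_{function} convention from rolling_feature):

    panel.rolling_feature('Close', window=50, function='max')
    panel.time_distance('Close', 'Close_50_max', 'days_since_50d_high', max_window=50)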
1459
|
+
def pair_index_feature(self, pair_symbol, feature_label,threshold, window = None,ta_method='ROC',param_set=False,plot = False, save_features = False):
|
|
1460
|
+
"""
|
|
1461
|
+
compute a technical feature on an additional asset and merge it as a new column into the main dataframe
|
|
1462
|
+
|
|
1463
|
+
Parameters
|
|
1464
|
+
----------
|
|
1465
|
+
pair_symbol (str): symbol of the asset to extract the data
|
|
1466
|
+
feature_label (str): name of the resulting feature
|
|
1467
|
+
window (int): window to apply to the feature as default (this parameter is going to be deprecated)
|
|
1468
|
+
threshold (float): alpha or z thresholds for the normalized feature
|
|
1469
|
+
param_set (dict): parameter set in case ta_method is other than ROC
|
|
1470
|
+
ta_method (str): method to use; available: RSI, ROC, VORTEX, STOCH
|
|
1471
|
+
plot (boolean): True to display plot
|
|
1472
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1473
|
+
|
|
1474
|
+
Returns
|
|
1475
|
+
-------
|
|
1476
|
+
None
|
|
1477
|
+
"""
|
|
1111
1478
|
self.pair_index = pair_symbol
|
|
1112
1479
|
begin_date = self.today - relativedelta(days = self.n_days)
|
|
1113
1480
|
begin_date_str = begin_date.strftime('%Y-%m-%d')
|
|
1114
|
-
|
|
1481
|
+
|
|
1115
1482
|
if feature_label in self.df.columns:
|
|
1116
1483
|
self.df = self.df.drop(columns = [feature_label])
|
|
1117
1484
|
|
|
@@ -1123,14 +1490,27 @@ class stock_eda_panel(object):
|
|
|
1123
1490
|
df['Date'] = pd.to_datetime(df['Date'])
|
|
1124
1491
|
df = df[df.Date >= begin_date_str ]
|
|
1125
1492
|
self.pair_index_df = df
|
|
1126
|
-
|
|
1493
|
+
|
|
1127
1494
|
#### converting the same index ####
|
|
1128
1495
|
dates_vector = self.df.Date.to_frame()
|
|
1129
1496
|
self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
|
|
1130
1497
|
self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
|
|
1131
1498
|
self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
|
|
1132
|
-
|
|
1133
|
-
|
|
1499
|
+
|
|
1500
|
+
if ta_method == 'ROC':
|
|
1501
|
+
window = window if window else param_set.get('window')
|
|
1502
|
+
roc = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
|
|
1503
|
+
self.pair_index_df[feature_label] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1504
|
+
elif ta_method == 'RSI':
|
|
1505
|
+
rsi = RSIIndicator(close = self.pair_index_df['Close'], **param_set).rsi()
|
|
1506
|
+
self.pair_index_df[feature_label] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1507
|
+
elif ta_method == 'VORTEX':
|
|
1508
|
+
vortex = VortexIndicator(close = self.pair_index_df['Close'], high = self.pair_index_df['High'], low = self.pair_index_df['Low'], **param_set).vortex_indicator_diff()
|
|
1509
|
+
self.pair_index_df[feature_label] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1510
|
+
elif ta_method == 'STOCH':
|
|
1511
|
+
stoch = StochRSIIndicator(close = self.pair_index_df['Close'], **param_set).stochrsi()
|
|
1512
|
+
self.pair_index_df[feature_label] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1513
|
+
|
|
1134
1514
|
df_to_merge = self.pair_index_df[['Date',feature_label]]
|
|
1135
1515
|
self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
|
|
1136
1516
|
|
|
@@ -1140,7 +1520,7 @@ class stock_eda_panel(object):
|
|
|
1140
1520
|
if save_features:
|
|
1141
1521
|
self.log_features_standard(feature_label)
|
|
1142
1522
|
parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
|
|
1143
|
-
try:
|
|
1523
|
+
try:
|
|
1144
1524
|
len(self.settings_pair_index_feature)
|
|
1145
1525
|
print('existing')
|
|
1146
1526
|
self.settings_pair_index_feature.append(parameters)
|
|
@@ -1153,10 +1533,21 @@ class stock_eda_panel(object):
|
|
|
1153
1533
|
self.signal_plotter(feature_label)
|
|
1154
1534
|
|
|
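A usage sketch for the extended pair_index_feature, adding an S&P 500 RSI as an exogenous column (the symbol, label and parameters are illustrative; param_set is forwarded to the chosen ta indicator):

    panel.pair_index_feature('^GSPC', 'spx_rsi', threshold=1.5,
                             ta_method='RSI', param_set={'window': 14},
                             save_features=True)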
1155
1535
|
def produce_order_features(self, feature_name, save_features = False):
|
|
1536
|
+
"""
|
|
1537
|
+
compute a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
|
|
1156
1538
|
|
|
1539
|
+
Parameters
|
|
1540
|
+
----------
|
|
1541
|
+
feature_name (str): name of the feature
|
|
1542
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1543
|
+
|
|
1544
|
+
Returns
|
|
1545
|
+
-------
|
|
1546
|
+
None
|
|
1547
|
+
"""
|
|
1157
1548
|
signal_feature_name = f'discrete_signal_{feature_name}'
|
|
1158
1549
|
order_feature_name = f'order_signal_{feature_name}'
|
|
1159
|
-
|
|
1550
|
+
|
|
1160
1551
|
self.df[signal_feature_name] = np.where(
|
|
1161
1552
|
self.df[f'signal_up_{feature_name}'] == 1,1,
|
|
1162
1553
|
np.where(
|
|
@@ -1173,14 +1564,107 @@ class stock_eda_panel(object):
|
|
|
1173
1564
|
self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1174
1565
|
self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
|
|
1175
1566
|
self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
|
|
1176
|
-
|
|
1567
|
+
|
|
1177
1568
|
## saving features
|
|
1178
1569
|
if save_features:
|
|
1179
1570
|
self.signals.append(signal_feature_name)
|
|
1180
1571
|
self.signals.append(order_feature_name)
|
|
1181
|
-
|
|
1572
|
+
|
|
1573
|
+
def get_order_feature_nosignal(self,feature_name, save_features=False):
|
|
1574
|
+
"""
|
|
1575
|
+
compute a feature that captures the number of steps after the end of a signal
|
|
1576
|
+
|
|
1577
|
+
Parameters
|
|
1578
|
+
----------
|
|
1579
|
+
feature_name (str): name of the feature
|
|
1580
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1581
|
+
|
|
1582
|
+
Returns
|
|
1583
|
+
-------
|
|
1584
|
+
None
|
|
1585
|
+
"""
|
|
1586
|
+
order_feature_name = f'order_signal_{feature_name}'
|
|
1587
|
+
ns_order_feature_name = f'ns_order_{feature_name}'
|
|
1588
|
+
self.df = self.df.sort_values('Date')
|
|
1589
|
+
self.df['lag_'] = self.df[order_feature_name].shift(1)
|
|
1590
|
+
self.df['flag'] = np.where((self.df[order_feature_name] == 0) & (self.df['lag_']!=0),1,np.nan)
|
|
1591
|
+
self.df = self.df.drop(columns=['lag_'])
|
|
1592
|
+
self.df['order_'] = self.df.sort_values('Date').groupby(['flag']).cumcount() + 1
|
|
1593
|
+
self.df['order_'] = self.df['order_'].fillna(method='ffill')
|
|
1594
|
+
self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
|
|
1595
|
+
self.df = self.df.drop(columns=['flag'])
|
|
1596
|
+
self.df['order_'] = self.df.sort_values('Date').groupby(['order_']).cumcount() + 1
|
|
1597
|
+
norm_list = [f'norm_{feature_name}', f'z_{feature_name}', feature_name]
|
|
1598
|
+
for norm_feature in norm_list:
|
|
1599
|
+
try:
|
|
1600
|
+
self.df['order_'] = np.sign(self.df[norm_feature])*self.df['order_']
|
|
1601
|
+
break
|
|
1602
|
+
except:
|
|
1603
|
+
pass
|
|
1604
|
+
self.df['order_'] = np.where(self.df[order_feature_name]==0,self.df['order_'],0)
|
|
1605
|
+
self.df = self.df.rename(columns={'order_':ns_order_feature_name})
|
|
1606
|
+
if save_features:
|
|
1607
|
+
self.signals.append(ns_order_feature_name)
|
|
1608
|
+
|
|
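produce_order_features turns the up/low signal pair into a signed run-length column (order_signal_*), and get_order_feature_nosignal then counts the signed number of steps elapsed since the last run ended. A hedged sketch, assuming the RSI signals were created first:

    panel.rsi_feature_improved(window=14, threshold=1.5, save_features=True)
    panel.produce_order_features('RSI', save_features=True)      # adds order_signal_RSI
    panel.get_order_feature_nosignal('RSI', save_features=True)  # adds ns_order_RSI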
1609
|
+
def compute_last_signal(self,feature, save_features = False):
|
|
1610
|
+
"""
|
|
1611
|
+
compute two new features where a signal is observed: one for the duration of the previous chain, and one for the duration of the last same-sign signal
|
|
1612
|
+
|
|
1613
|
+
Parameters
|
|
1614
|
+
----------
|
|
1615
|
+
feature (str): name of the feature
|
|
1616
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1617
|
+
|
|
1618
|
+
Returns
|
|
1619
|
+
-------
|
|
1620
|
+
None
|
|
1621
|
+
"""
|
|
1622
|
+
def create_last_signal(df, feature, prefix, type ='0'):
|
|
1623
|
+
if type == '0':
|
|
1624
|
+
condition = df[f'order_signal_{feature}'] != 0
|
|
1625
|
+
elif type == '+':
|
|
1626
|
+
condition = df[f'order_signal_{feature}'] > 0
|
|
1627
|
+
elif type == '-':
|
|
1628
|
+
condition = df[f'order_signal_{feature}'] < 0
|
|
1629
|
+
df[f'last_maxorder_{feature}'] = np.where(condition, df[f'order_signal_{feature}'],np.nan)
|
|
1630
|
+
df['tmp_chain_index'] = df[f'last_maxorder_{feature}'].shift(-1)
|
|
1631
|
+
df['last'] = np.where((df[f'last_maxorder_{feature}'] != 0) & (df['tmp_chain_index'].isna()),df[f'last_maxorder_{feature}'], np.nan )
|
|
1632
|
+
df['last'] = df['last'].shift(1)
|
|
1633
|
+
df[f'last_maxorder_{feature}'] = df['last'].fillna(method = 'ffill')
|
|
1634
|
+
df = df.drop(columns = ['tmp_chain_index','last'])
|
|
1635
|
+
df[f'last_maxorder_{feature}'] = np.where(df[f'order_signal_{feature}'] != 0,df[f'last_maxorder_{feature}'],np.nan)
|
|
1636
|
+
df[f'last_maxorder_{feature}'] = df[f'last_maxorder_{feature}'].fillna(0)
|
|
1637
|
+
df = df.rename(columns = {f'last_maxorder_{feature}':f'{prefix}_{feature}'})
|
|
1638
|
+
return df
|
|
1639
|
+
prefix0, prefix1, prefix2 = 'ldur', 'pos', 'neg'
|
|
1640
|
+
self.df = create_last_signal(self.df, feature, prefix0, type ='0')
|
|
1641
|
+
self.df = create_last_signal(self.df, feature, prefix1, type ='+')
|
|
1642
|
+
self.df = create_last_signal(self.df, feature, prefix2, type ='-')
|
|
1643
|
+
|
|
1644
|
+
self.df[f'sldur_{feature}'] = np.where(
|
|
1645
|
+
self.df[f'order_signal_{feature}'] > 0, self.df[f'{prefix1}_{feature}'],
|
|
1646
|
+
np.where(
|
|
1647
|
+
self.df[f'order_signal_{feature}'] < 0, self.df[f'{prefix2}_{feature}'],
|
|
1648
|
+
0
|
|
1649
|
+
)
|
|
1650
|
+
)
|
|
1651
|
+
self.df = self.df.drop(columns = [f'{prefix1}_{feature}',f'{prefix2}_{feature}'])
|
|
1652
|
+
if save_features:
|
|
1653
|
+
self.signals.append(f'sldur_{feature}')
|
|
1654
|
+
self.signals.append(f'ldur_{feature}')
|
|
1655
|
+
|
|
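compute_last_signal complements the order features with the duration of the previous run (ldur_*) and of the last same-sign run (sldur_*). Continuing the illustrative RSI example:

    panel.compute_last_signal('RSI', save_features=True)  # adds ldur_RSI and sldur_RSI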
1182
1656
|
def create_hmm_derived_features(self, lag_returns):
|
|
1657
|
+
"""
|
|
1658
|
+
create features derived from the hmm state feature: the index of the state, the duration of the state, and the chain return
|
|
1659
|
+
|
|
1660
|
+
Parameters
|
|
1661
|
+
----------
|
|
1662
|
+
lag_returns (int): lag parameter for the chain returns
|
|
1183
1663
|
|
|
1664
|
+
Returns
|
|
1665
|
+
-------
|
|
1666
|
+
None
|
|
1667
|
+
"""
|
|
1184
1668
|
self.df = self.df.sort_values('Date')
|
|
1185
1669
|
## indexing chains
|
|
1186
1670
|
self.df['lag_hmm_feature'] = self.df['hmm_feature'].shift(1)
|
|
@@ -1189,31 +1673,44 @@ class stock_eda_panel(object):
|
|
|
1189
1673
|
self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
|
|
1190
1674
|
self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
|
|
1191
1675
|
self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1192
|
-
|
|
1193
|
-
### returns using the first element in a chain
|
|
1194
|
-
self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
|
|
1195
|
-
self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
|
|
1196
|
-
self.df['chain_return'] = (self.df['Close']/self.df['first'] -1) * 100
|
|
1197
1676
|
|
|
1198
|
-
|
|
1677
|
+
### returns using the lagged close within each chain
|
|
1678
|
+
self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
|
|
1679
|
+
self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
|
|
1680
|
+
self.df = self.df.drop(columns = ['breack'])
|
|
1199
1681
|
|
|
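The chain bookkeeping above mints a new id at every state change, forward-fills it, and then computes returns against the close lag_returns rows back within the same chain. A standalone sketch of the grouping idea (using cumcount in place of the date rank, which yields the same ordering on date-sorted data):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'state': [0, 0, 1, 1, 1, 0]})
    brk = (df['state'] != df['state'].shift(1)).astype(int)
    df['chain_id'] = np.where(brk == 1, np.arange(len(df)), np.nan)
    df['chain_id'] = df['chain_id'].ffill()                    # 0, 0, 2, 2, 2, 5
    df['chain_order'] = df.groupby('chain_id').cumcount() + 1  # 1, 2, 1, 2, 3, 1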
1200
|
-
def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
|
|
1682
|
+
def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
|
|
1683
|
+
"""
|
|
1684
|
+
train a new hmm model, or use a provided one, to derive hmm state features
|
|
1685
|
+
|
|
1686
|
+
Parameters
|
|
1687
|
+
----------
|
|
1688
|
+
n_clusters (int): number of clusters or states to calculate
|
|
1689
|
+
features_hmm (list): features to be considered in hmm model when training
|
|
1690
|
+
test_data_size (int): size of the test data; the remainder is used as training data
|
|
1691
|
+
seed (int): seed for the model initialization
|
|
1692
|
+
lag_returns_state (int): lag for the state chain returns
|
|
1693
|
+
corr_threshold (float): correlation threshold for initial feature selection
|
|
1694
|
+
plot (boolean): True to display hmm states analysis
|
|
1695
|
+
save_features (boolean): True to save features and configurations
|
|
1696
|
+
model (obj): if provided, no model will be trained and the provided model will be used to get hmm features
|
|
1697
|
+
|
|
1698
|
+
Returns
|
|
1699
|
+
-------
|
|
1700
|
+
None
|
|
1701
|
+
"""
|
|
1201
1702
|
if not model:
|
|
1202
|
-
|
|
1703
|
+
|
|
1203
1704
|
df_new = self.df
|
|
1204
|
-
pipeline_hmm = Pipeline([
|
|
1205
|
-
('selector', FeatureSelector(columns=features_hmm)),
|
|
1206
|
-
('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
|
|
1207
|
-
('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
|
|
1208
|
-
])
|
|
1209
1705
|
data_train = df_new.iloc[:-test_data_size,:]
|
|
1210
1706
|
data_test = df_new.iloc[-test_data_size:,:]
|
|
1211
1707
|
|
|
1212
|
-
|
|
1213
|
-
|
|
1708
|
+
th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
|
|
1709
|
+
th.train()
|
|
1710
|
+
pipeline_hmm = th.hmm_model
|
|
1214
1711
|
self.model_hmm = pipeline_hmm
|
|
1215
1712
|
self.test_data_hmm = data_test
|
|
1216
|
-
|
|
1713
|
+
|
|
1217
1714
|
### first feature: the hidden state
|
|
1218
1715
|
self.df['hmm_feature'] = self.model_hmm.predict(self.df)
|
|
1219
1716
|
self.create_hmm_derived_features(lag_returns = lag_returns_state)
|
|
@@ -1230,15 +1727,15 @@ class stock_eda_panel(object):
|
|
|
1230
1727
|
hidden_states = pipeline_hmm.predict(data_test)
|
|
1231
1728
|
data_test['HMM'] = hidden_states
|
|
1232
1729
|
data_test['HMM_state'] = data_test['HMM'].map(map_)
|
|
1233
|
-
|
|
1730
|
+
|
|
1234
1731
|
if model:
|
|
1235
1732
|
self.df['hmm_feature'] = model.predict(self.df)
|
|
1236
1733
|
self.create_hmm_derived_features(lag_returns = lag_returns_state)
|
|
1237
|
-
|
|
1734
|
+
|
|
1238
1735
|
if save_features:
|
|
1239
1736
|
self.features.append('hmm_feature')
|
|
1240
1737
|
self.features.append('hmm_chain_order')
|
|
1241
|
-
self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
|
|
1738
|
+
self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }
|
|
1242
1739
|
|
|
1243
1740
|
if plot:
|
|
1244
1741
|
|
|
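Training is now delegated to trainer_hmm from virgo_modules.src.hmm_utils, with an initial correlation-based feature selection controlled by corr_threshold. A hedged call sketch (the feature names are illustrative and must already exist in panel.df):

    panel.cluster_hmm_analysis(n_clusters=4,
                               features_hmm=['RetClose', 'z_RSI', 'z_ROC'],
                               test_data_size=250, seed=33,
                               corr_threshold=0.75, save_features=True)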
@@ -1263,14 +1760,38 @@ class stock_eda_panel(object):
|
|
|
1263
1760
|
fig.show()
|
|
1264
1761
|
|
|
1265
1762
|
def sharpe_ratio(self, return_series, n_trad_days = 255, rf = 0.01):
|
|
1763
|
+
"""
|
|
1764
|
+
compute the sharpe ratio of a given return time series
|
|
1765
|
+
|
|
1766
|
+
Parameters
|
|
1767
|
+
----------
|
|
1768
|
+
return_series (pd.Series): time series of the returns
|
|
1769
|
+
n_trad_days (int): trading days to annualize returns
|
|
1770
|
+
rf (float): annual risk-free rate
|
|
1771
|
+
|
|
1772
|
+
Returns
|
|
1773
|
+
-------
|
|
1774
|
+
sharpe_ratio (float): sharpe ratio
|
|
1775
|
+
"""
|
|
1266
1776
|
nsqrt = np.sqrt(n_trad_days)
|
|
1267
1777
|
mean = return_series.mean() * n_trad_days
|
|
1268
1778
|
sigma = return_series.std() * nsqrt
|
|
1269
1779
|
sharpe_ratio = round((mean-rf)/sigma,2)
|
|
1270
1780
|
return sharpe_ratio
|
|
1271
|
-
|
|
1781
|
+
|
|
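sharpe_ratio annualizes the mean by n_trad_days and the volatility by its square root. A self-contained sketch reproducing the same formula on simulated daily returns:

    import numpy as np
    import pandas as pd

    daily = pd.Series(np.random.default_rng(7).normal(0.0005, 0.01, 255))
    mean = daily.mean() * 255                 # annualized return
    sigma = daily.std() * np.sqrt(255)        # annualized volatility
    sharpe = round((mean - 0.01) / sigma, 2)  # rf = 1% per year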
1272
1782
|
def treat_signal_strategy(self,test_data, strategy):
|
|
1273
|
-
|
|
1783
|
+
"""
|
|
1784
|
+
helper method that treats signals and converts them to 1 or 0
|
|
1785
|
+
|
|
1786
|
+
Parameters
|
|
1787
|
+
----------
|
|
1788
|
+
test_data (pd.DataFrame): test data
|
|
1789
|
+
strategy (list): features to get the strategy
|
|
1790
|
+
|
|
1791
|
+
Returns
|
|
1792
|
+
-------
|
|
1793
|
+
test_data (pd.DataFrame): test data with extra columns that are the strategy (main_signal)
|
|
1794
|
+
"""
|
|
1274
1795
|
hmm_states_list = [x for x in strategy if 'hmm_state_' in x]
|
|
1275
1796
|
other_features = [x for x in strategy if x not in hmm_states_list]
|
|
1276
1797
|
|
|
@@ -1299,10 +1820,21 @@ class stock_eda_panel(object):
|
|
|
1299
1820
|
elif len(hmm_states_list) == 0 and len(other_features) > 0:
|
|
1300
1821
|
test_data['main_signal'] = np.where((test_data['features_signal'] == 1) & (test_data['hmm_signal'] == 0),1,0)
|
|
1301
1822
|
|
|
1302
|
-
return test_data
|
|
1823
|
+
return test_data
|
|
1303
1824
|
|
|
1304
1825
|
def stategy_simulator(self, features, hmm_feature = True):
|
|
1826
|
+
"""
|
|
1827
|
+
execute strategies and compute performance metrics such as sharpe ratio and return. This method creates new attributes
|
|
1305
1828
|
|
|
1829
|
+
Parameters
|
|
1830
|
+
----------
|
|
1831
|
+
features (list): list of features to be tested as strategies
|
|
1832
|
+
hmm_feature (boolean): include hmm feature
|
|
1833
|
+
|
|
1834
|
+
Returns
|
|
1835
|
+
-------
|
|
1836
|
+
None
|
|
1837
|
+
"""
|
|
1306
1838
|
columns_ = ['Date', 'Close','Open'] + features + ['HMM']
|
|
1307
1839
|
states = list(self.df.hmm_feature.unique())
|
|
1308
1840
|
states.sort()
|
|
@@ -1372,8 +1904,19 @@ class stock_eda_panel(object):
|
|
|
1372
1904
|
self.strategy_log = df_returns_log
|
|
1373
1905
|
self.best_strategy = df_returns_log.iloc[0,:].strategy
|
|
1374
1906
|
self.top_10_strategy = list(df_returns_log.iloc[0:10,:].strategy.values)
|
|
1375
|
-
|
|
1907
|
+
|
|
1376
1908
|
def viz_strategy(self, strategy):
|
|
1909
|
+
"""
|
|
1910
|
+
display analysis plot of a given strategy
|
|
1911
|
+
|
|
1912
|
+
Parameters
|
|
1913
|
+
----------
|
|
1914
|
+
strategy (list): list of features of the strategy
|
|
1915
|
+
|
|
1916
|
+
Returns
|
|
1917
|
+
-------
|
|
1918
|
+
None
|
|
1919
|
+
"""
|
|
1377
1920
|
test_data = self.test_data_strategy
|
|
1378
1921
|
|
|
1379
1922
|
test_data = self.treat_signal_strategy(test_data, strategy)
|
|
@@ -1406,62 +1949,26 @@ class stock_eda_panel(object):
|
|
|
1406
1949
|
plt.legend()
|
|
1407
1950
|
plt.show()
|
|
1408
1951
|
|
|
1409
|
-
### deprecated ############################
|
|
1410
|
-
def create_strategy(self, favourable_states):
|
|
1411
|
-
|
|
1412
|
-
test_data = self.test_data_hmm
|
|
1413
|
-
# add MA signal
|
|
1414
|
-
test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
|
|
1415
|
-
test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
|
|
1416
|
-
|
|
1417
|
-
# add hnn signal
|
|
1418
|
-
|
|
1419
|
-
test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
|
|
1420
|
-
|
|
1421
|
-
## combined signals
|
|
1422
|
-
test_data['main_signal'] = 0
|
|
1423
|
-
test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
|
|
1424
|
-
test_data['main_signal'] = test_data['main_signal'].shift(1)
|
|
1425
|
-
|
|
1426
|
-
## benchmark return
|
|
1427
|
-
test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
|
|
1428
|
-
test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
|
|
1429
|
-
test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
|
|
1430
|
-
|
|
1431
|
-
## strategy return
|
|
1432
|
-
# test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
|
|
1433
|
-
test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
|
|
1434
|
-
test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
|
|
1435
|
-
test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
|
|
1436
|
-
test_data.dropna(inplace = True)
|
|
1437
|
-
|
|
1438
|
-
bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
|
|
1439
|
-
strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
|
|
1440
|
-
|
|
1441
|
-
bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
|
|
1442
|
-
strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
|
|
1443
|
-
|
|
1444
|
-
print(f'returns benchmark {bench_rets}%')
|
|
1445
|
-
print(f'returns strategy {strat_rets}%')
|
|
1446
|
-
print('-----------------------------')
|
|
1447
|
-
print(f'sharpe benchmark {bench_sharpe}')
|
|
1448
|
-
print(f'sharpe strategy {strat_sharpe}')
|
|
1449
|
-
|
|
1450
|
-
fig = plt.figure(figsize = (10,4))
|
|
1451
|
-
plt.plot(test_data['bench_prod_exp'])
|
|
1452
|
-
plt.plot(test_data['strat_prod_exp'])
|
|
1453
|
-
self.settings_hmm_states = {'favourable_states':favourable_states}
|
|
1454
|
-
################################################
|
|
1455
|
-
|
|
1456
1952
|
def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
|
|
1457
|
-
|
|
1953
|
+
"""
|
|
1954
|
+
display analysis plots of the hmm model
|
|
1955
|
+
|
|
1956
|
+
Parameters
|
|
1957
|
+
----------
|
|
1958
|
+
test_data_size (int): test data size; the remainder is the train data
|
|
1959
|
+
split (str): split to assess; options: 'train' or 'test'
|
|
1960
|
+
|
|
1961
|
+
Returns
|
|
1962
|
+
-------
|
|
1963
|
+
None
|
|
1964
|
+
"""
|
|
1458
1965
|
if split == 'train':
|
|
1459
1966
|
df = self.df.iloc[:-test_data_size,:]
|
|
1460
1967
|
elif split == 'test':
|
|
1461
1968
|
df = self.df.iloc[-test_data_size:,:]
|
|
1462
1969
|
|
|
1463
1970
|
## returns plot
|
|
1464
|
-
fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
|
|
1971
|
+
fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
|
|
1465
1972
|
height=400, width=1000, title = 'returns chain hmm feature')
|
|
1466
1973
|
fig.add_shape(type='line',x0=-0.5,y0=0,x1=max(df.hmm_feature)+0.5,y1=0,line=dict(color='grey',width=1),xref='x',yref='y')
|
|
1467
1974
|
fig.show()
|
|
@@ -1490,6 +1997,17 @@ class stock_eda_panel(object):
|
|
|
1490
1997
|
del df
|
|
1491
1998
|
|
|
1492
1999
|
def get_targets(self, steps):
|
|
2000
|
+
"""
|
|
2001
|
+
produce the regression return target using future prices
|
|
2002
|
+
|
|
2003
|
+
Parameters
|
|
2004
|
+
----------
|
|
2005
|
+
steps (int): number of lags and steps for future returns
|
|
2006
|
+
|
|
2007
|
+
Returns
|
|
2008
|
+
-------
|
|
2009
|
+
None
|
|
2010
|
+
"""
|
|
1493
2011
|
self.targets = list()
|
|
1494
2012
|
self.target = list()
|
|
1495
2013
|
columns = list()
|
|
@@ -1501,9 +2019,23 @@ class stock_eda_panel(object):
|
|
|
1501
2019
|
self.df[f'mean_target'] = self.df[columns].mean(axis=1)
|
|
1502
2020
|
self.target.append(f'mean_target')
|
|
1503
2021
|
self.settings_target_lasts = {'steps':steps, 'type':'regression'}
|
|
1504
|
-
|
|
1505
|
-
def get_categorical_targets(self, horizon, flor_loss, top_gain):
|
|
1506
|
-
|
|
2022
|
+
|
|
2023
|
+
def get_categorical_targets(self, horizon, flor_loss, top_gain, min_pos=1 , min_negs=1):
|
|
2024
|
+
"""
|
|
2025
|
+
produce binary return targets using future prices. It produces two targets: one for high returns and one for low returns
|
|
2026
|
+
|
|
2027
|
+
Parameters
|
|
2028
|
+
----------
|
|
2029
|
+
horizon (int): number of lags and steps for future returns
|
|
2030
|
+
flor_loss (float): loss floor; a day counts as negative when its return is <= flor_loss
|
|
2031
|
+
top_gain (float): gain threshold; a day counts as positive when its return is >= top_gain
|
|
2032
|
+
min_pos (int): minimum number of positives to count in a window for target_up
|
|
2033
|
+
min_negs (int): minimum number of negatives to count in a window for target_down
|
|
2034
|
+
|
|
2035
|
+
Returns
|
|
2036
|
+
-------
|
|
2037
|
+
None
|
|
2038
|
+
"""
|
|
1507
2039
|
self.target = list()
|
|
1508
2040
|
self.targets = list()
|
|
1509
2041
|
columns = list()
|
|
@@ -1516,7 +2048,7 @@ class stock_eda_panel(object):
|
|
|
1516
2048
|
self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] >= top_gain,1,0)
|
|
1517
2049
|
columns.append(f'target_{i}')
|
|
1518
2050
|
self.df[f'target_up'] = self.df[columns].sum(axis=1)
|
|
1519
|
-
self.df[f'target_up'] = np.where(self.df[f'target_up'] >=
|
|
2051
|
+
self.df[f'target_up'] = np.where(self.df[f'target_up'] >=min_pos,1,0 )
|
|
1520
2052
|
self.df = self.df.drop(columns = columns)
|
|
1521
2053
|
|
|
1522
2054
|
for i in range(1,horizon+1):
|
|
@@ -1526,7 +2058,7 @@ class stock_eda_panel(object):
|
|
|
1526
2058
|
self.df[f'target_{i}'] = np.where(self.df[f'target_{i}'] <= flor_loss,1,0)
|
|
1527
2059
|
columns.append(f'target_{i}')
|
|
1528
2060
|
self.df[f'target_down'] = self.df[columns].sum(axis=1)
|
|
1529
|
-
self.df[f'target_down'] = np.where(self.df[f'target_down'] >=
|
|
2061
|
+
self.df[f'target_down'] = np.where(self.df[f'target_down'] >= min_negs,1,0 )
|
|
1530
2062
|
self.df = self.df.drop(columns = columns)
|
|
1531
2063
|
|
|
1532
2064
|
self.targets.append('target_up')
|
|
@@ -1535,7 +2067,19 @@ class stock_eda_panel(object):
|
|
|
1535
2067
|
self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
|
|
1536
2068
|
|
|
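A usage sketch for the extended signature, where min_pos/min_negs set how many qualifying days inside the horizon are required before a target fires (the values are illustrative):

    # target_up = 1 if at least 2 of the next 10 days gain >= 3%
    # target_down = 1 if at least 1 of the next 10 days loses <= -2%
    panel.get_categorical_targets(horizon=10, flor_loss=-0.02, top_gain=0.03,
                                  min_pos=2, min_negs=1)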
1537
2069
|
def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
|
|
1538
|
-
|
|
2070
|
+
"""
|
|
2071
|
+
produce the configuration dictionary from the settings saved by the feature generation methods when save_features was activated
|
|
2072
|
+
|
|
2073
|
+
Parameters
|
|
2074
|
+
----------
|
|
2075
|
+
test_data_size (int): test data size
|
|
2076
|
+
val_data_size (int): validation data size
|
|
2077
|
+
model_type (str): model type, options: 'Forecaster','Classifier'
|
|
2078
|
+
|
|
2079
|
+
Returns
|
|
2080
|
+
-------
|
|
2081
|
+
None
|
|
2082
|
+
"""
|
|
1539
2083
|
self.settings = {
|
|
1540
2084
|
'features':list(set(self.features)),
|
|
1541
2085
|
'signals' :list(set(self.signals)),
|
|
@@ -1547,19 +2091,21 @@ class stock_eda_panel(object):
|
|
|
1547
2091
|
'outlier': self.settings_outlier,
|
|
1548
2092
|
}
|
|
1549
2093
|
}
|
|
1550
|
-
|
|
2094
|
+
|
|
1551
2095
|
if model_type in ['Forecaster','Classifier']:
|
|
1552
|
-
|
|
2096
|
+
|
|
1553
2097
|
target_list = list(set(self.targets))
|
|
1554
2098
|
target_list.sort()
|
|
1555
2099
|
self.settings['model_type'] = model_type
|
|
1556
2100
|
self.settings['target'] = list(set(self.target))
|
|
1557
2101
|
self.settings['targets'] = target_list
|
|
1558
|
-
|
|
2102
|
+
|
|
1559
2103
|
## for now this is hard coded
|
|
1560
2104
|
feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
|
|
1561
2105
|
'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
|
|
1562
|
-
'william_feature', 'vortex_feature', 'pair_index_feature','hmm'
|
|
2106
|
+
'william_feature', 'vortex_feature', 'pair_index_feature','hmm',
|
|
2107
|
+
'min_distance_pricefeature', 'min_relprice_pricefeature', 'max_distance_pricefeature','max_relprice_pricefeature'
|
|
2108
|
+
]
|
|
1563
2109
|
|
|
1564
2110
|
for feature in feature_list:
|
|
1565
2111
|
try:
|
|
@@ -1570,7 +2116,7 @@ class stock_eda_panel(object):
|
|
|
1570
2116
|
self.settings['settings']['target_lasts'] = self.settings_target_lasts
|
|
1571
2117
|
except:
|
|
1572
2118
|
pass
|
|
1573
|
-
|
|
2119
|
+
|
|
1574
2120
|
try:
|
|
1575
2121
|
self.settings['settings']['strategies'] = {
|
|
1576
2122
|
'best_strategy':self.best_strategy,
|
|
@@ -1580,512 +2126,280 @@ class stock_eda_panel(object):
|
|
|
1580
2126
|
pass
|
|
1581
2127
|
|
|
1582
2128
|
class produce_model:
|
|
2129
|
+
"""
|
|
2130
|
+
Class that produces a machine learning model in a scikit-learn pipeline wrapper.
|
|
2131
|
+
|
|
2132
|
+
Attributes
|
|
2133
|
+
----------
|
|
2134
|
+
data : pd.DataFrame
|
|
2135
|
+
input data
|
|
2136
|
+
X_train : pd.DataFrame
|
|
2137
|
+
y_train : pd.Series
|
|
2138
|
+
X_test : pd.DataFrame
|
|
2139
|
+
y_test : pd.Series
|
|
2140
|
+
X_val : pd.DataFrame
|
|
2141
|
+
y_val : pd.Series
|
|
2142
|
+
pipeline : obj
|
|
2143
|
+
trained pipeline that includes a ml model
|
|
2144
|
+
features_to_model: list
|
|
2145
|
+
features in end step of the pipeline
|
|
2146
|
+
|
|
2147
|
+
Methods
|
|
2148
|
+
-------
|
|
2149
|
+
preprocess(test_data_size=int, target=str, val_data_size=int):
|
|
2150
|
+
prepare the data: split into train, test and validation sets, and into X and y
|
|
2151
|
+
get_sample(x=pd.DataFrame, sample=int, max_=int):
|
|
2152
|
+
sample data
|
|
2153
|
+
"""
|
|
1583
2154
|
def __init__(self,data):
|
|
2155
|
+
"""
|
|
2156
|
+
Initialize object
|
|
2157
|
+
|
|
2158
|
+
Parameters
|
|
2159
|
+
----------
|
|
2160
|
+
data (pd.DataFrame): data
|
|
2161
|
+
|
|
2162
|
+
Returns
|
|
2163
|
+
-------
|
|
2164
|
+
None
|
|
2165
|
+
"""
|
|
1584
2166
|
self.data = data.copy()
|
|
1585
|
-
|
|
2167
|
+
|
|
1586
2168
|
def preprocess(self, test_data_size, target, val_data_size = False):
|
|
1587
|
-
|
|
2169
|
+
"""
|
|
2170
|
+
prepare the data: split into train, test and validation sets, and into X and y
|
|
2171
|
+
|
|
2172
|
+
Parameters
|
|
2173
|
+
----------
|
|
2174
|
+
test_data_size (int): test data size
|
|
2175
|
+
target (str): target column
|
|
2176
|
+
val_data_size (int): validation data size
|
|
2177
|
+
|
|
2178
|
+
Returns
|
|
2179
|
+
-------
|
|
2180
|
+
None
|
|
2181
|
+
"""
|
|
1588
2182
|
train_data, test_data = self.data.iloc[:-test_data_size,:].dropna() , self.data.iloc[-test_data_size:,:].dropna()
|
|
1589
|
-
|
|
2183
|
+
|
|
1590
2184
|
if val_data_size:
|
|
1591
2185
|
train_data, val_data = train_data.iloc[:-val_data_size,:], train_data.iloc[-val_data_size:,:]
|
|
1592
|
-
|
|
2186
|
+
|
|
1593
2187
|
self.test_data = test_data
|
|
1594
|
-
|
|
2188
|
+
|
|
1595
2189
|
X_train, y_train = train_data.iloc[0:,1:], train_data[target]
|
|
1596
2190
|
X_test, y_test = test_data.iloc[0:,1:], test_data[target]
|
|
1597
2191
|
self.X_train = X_train
|
|
1598
2192
|
self.y_train = y_train
|
|
1599
2193
|
self.X_test = X_test
|
|
1600
2194
|
self.y_test = y_test
|
|
1601
|
-
|
|
2195
|
+
|
|
1602
2196
|
if val_data_size:
|
|
1603
2197
|
X_val, y_val = val_data.iloc[0:,1:], val_data[target]
|
|
1604
2198
|
self.X_val = X_val
|
|
1605
2199
|
self.y_val = y_val
|
|
1606
|
-
|
|
2200
|
+
|
|
1607
2201
|
def get_sample(self, x, sample, max_=900):
|
|
2202
|
+
"""
|
|
2203
|
+
sample data
|
|
2204
|
+
|
|
2205
|
+
Parameters
|
|
2206
|
+
----------
|
|
2207
|
+
x (pd.DataFrame): input data
|
|
2208
|
+
sample (int): sample size
|
|
2209
|
+
max_ (int): max sample
|
|
2210
|
+
|
|
2211
|
+
Returns
|
|
2212
|
+
-------
|
|
2213
|
+
sample (float): sample fraction (1.0 when the data length exceeds max_)
|
|
2214
|
+
"""
|
|
1608
2215
|
length = len(x)
|
|
1609
2216
|
if length > max_:
|
|
1610
2217
|
return 1.0
|
|
1611
2218
|
else:
|
|
1612
2219
|
return sample
|
|
1613
|
-
|
|
2220
|
+
|
|
1614
2221
|
def train_model(self, pipe, model, cv_ = False):
|
|
2222
|
+
"""
|
|
2223
|
+
train pipeline
|
|
2224
|
+
|
|
2225
|
+
Parameters
|
|
2226
|
+
----------
|
|
2227
|
+
pipe (obj): pipeline object
|
|
2228
|
+
model (obj): model object
|
|
2229
|
+
cv_ (obj): cross validation procedure
|
|
2230
|
+
|
|
2231
|
+
Returns
|
|
2232
|
+
-------
|
|
2233
|
+
None
|
|
2234
|
+
"""
|
|
1615
2235
|
self.model = model
|
|
1616
2236
|
self.pipe_transform = pipe
|
|
1617
2237
|
self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
|
|
1618
|
-
self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
|
|
1619
2238
|
self.pipeline.fit(self.X_train, self.y_train)
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
class
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
## indexing chains
|
|
1665
|
-
self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
|
|
1666
|
-
self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
|
|
1667
|
-
self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
|
|
1668
|
-
self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
|
|
1669
|
-
self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
|
|
1670
|
-
self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1671
|
-
|
|
1672
|
-
### returns using the first element in a chain
|
|
1673
|
-
self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
|
|
1674
|
-
self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
|
|
1675
|
-
self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
|
|
1676
|
-
|
|
1677
|
-
self.data_train_ = self.data_train_.drop(columns = ['first'])
|
|
1678
|
-
|
|
1679
|
-
mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
|
|
1680
|
-
self.mean_relevance = mean_relevance
|
|
1681
|
-
|
|
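The block removed here labelled consecutive runs ("chains") of the same HMM state with a rank/ffill pattern and then scored per-chain returns; per the file list this logic now lives in virgo_modules/src/hmm_utils.py. The same run labelling can be written more directly with a cumulative sum over state breaks (a sketch with made-up data, not the package's exact implementation):

    import pandas as pd

    df = pd.DataFrame({'hmm_feature': [0, 0, 1, 1, 1, 0],
                       'Close': [10.0, 10.2, 10.1, 10.4, 10.6, 10.5]})
    # A chain breaks wherever the state differs from the previous row.
    df['chain_id'] = df['hmm_feature'].ne(df['hmm_feature'].shift(1)).cumsum()
    df['hmm_chain_order'] = df.groupby('chain_id').cumcount() + 1
    # Return relative to the first close of each chain, as in the removed code.
    first = df.groupby('chain_id')['Close'].transform('first')
    df['chain_return'] = (df['Close'] / first - 1) * 100
    print(df)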
1682      | - def execute_selector(self):
1683      | -
1684      | - self.split_data()
1685      | - self.feature_list_generator()
1686      | - maxi = -1
1687      | - print(f'it is expected {len(self.feature_combinations)} combinations')
1688      | - feature_results = dict()
1689      | -
1690      | - if self.limit_search:
1691      | - print(f' taking just {self.limit_search} combinations')
1692      | - maxi = self.limit_search
1693      | -
1694      | - for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
1695      | -
1696      | - feature_results[f'group_{i}'] = {
1697      | - 'features':list(features_hmm),
1698      | - 'relevances':list()
1699      | - }
1700      | -
1701      | - for _ in range(self.n_trials):
1702      | - try:
1703      | - self.train_model(features_hmm)
1704      | - self.get_error()
1705      | - feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
1706      | - except:
1707      | - print('error')
1708      | - feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
1709      | - self.feature_results = feature_results
1710      | - self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
1711      | -
1712      | - class signal_analyser_object:
1713      | -
1714      | - def __init__(self, data,symbol_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
     2239 | + self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
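The replacement line 2239 is the notable fix in this hunk: the 0.0.72 code re-ran pipe_transform.fit_transform on X_train just to read the transformed column names, fitting the transformers a second time, while 0.9.0 slices the already-fitted Pipeline instead, since pipe[:-1] returns the sub-pipeline of every step except the final estimator. A self-contained sketch of that idiom (assuming scikit-learn >= 1.2 for set_output; the step names are illustrative):

    import pandas as pd
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    X = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [4.0, 3.0, 2.0, 1.0]})
    y = [0, 1, 0, 1]

    pipe = Pipeline([('scale', StandardScaler().set_output(transform='pandas')),
                     ('model', LogisticRegression())])
    pipe.fit(X, y)
    # pipe[:-1] reuses the fitted transformer steps; nothing is refit.
    features_to_model = pipe[:-1].transform(X).columns
    print(list(features_to_model))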
     2240 | +
     2241 | + class analyse_index(stock_eda_panel):
     2242 | + """
     2243 | + class that is going to train hmm models to perform feature selection
     2244 | +
     2245 | + Attributes
     2246 | + ----------
     2247 | + data_index : pd.DataFrame
     2248 | + name of the index
     2249 | + indexes: list
     2250 | + list of indexes
     2251 | + asset : str
     2252 | + name of the asset
     2253 | + n_obs : int
     2254 | + number of rows to extract
     2255 | + lag : int
     2256 | + lag to apply
     2257 | + data_window : str
     2258 | + 5y 10y 15y
     2259 | + show_plot : bool
     2260 | + If True, show plots
     2261 | + save_path : str
     2262 | + local path for saving e.g r'C:/path/to/the/file/'
     2263 | + save_aws : str
     2264 | + remote key in s3 bucket path e.g. 'path/to/file/'
     2265 | + aws_credentials : dict
     2266 | + dict with the aws credentials
     2267 | + merger_df : pd.DataFrame
     2268 | + dataframe with the index and asset data
     2269 | + states_result = dict
     2270 | + betas and correlation score results
     2271 | +
     2272 | + Methods
     2273 | + -------
     2274 | + process_data():
     2275 | + using stock_eda_panel, get data and merge data
     2276 | + plot_betas(sample_size=int, offset=int, subsample_ts=int):
     2277 | + display beta analysis plot
     2278 | + get_betas(subsample_ts=int)
     2279 | + get general beta and last sample beta, correlation score is included too
     2280 | + """
     2281 | + def __init__(self, index_data, asset, n_obs, lag, data_window = '5y', show_plot = False, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
1715 2282 |   """
1716-1722 | -
     2283 | + Initialize object
     2284 | +
     2285 | + Parameters
     2286 | + ----------
     2287 | + index_data (pd.DataFrame or str): index data dataframe or index string
     2288 | + asset (str): name of the asset
     2289 | + n_obs (int): number of rows to extract
     2290 | + lag (int): lag to apply
     2291 | + data_window (str): 5y 10y 15y
     2292 | + show_plot (bool): If True, show plots
     2293 | + save_path (str): local path for saving e.g r'C:/path/to/the/file/'
     2294 | + save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
     2295 | + aws_credentials (dict): dict with the aws credentials
     2296 | +
     2297 | + Returns
     2298 | + -------
     2299 | + None
1723 2300 |   """
1724      | - self.data = data.copy()
1725      | - self.ticket_name = symbol_name
1726      | - self.show_plot = show_plot
1727      | - self.save_path = save_path
1728      | - self.save_aws = save_aws
1729      | - self.aws_credentials = aws_credentials
1730      | - self.return_fig = return_fig
1731      | -
1732      | - def signal_analyser(self, test_size, feature_name, days_list, threshold = 0.05,verbose = False, signal_position = False):
1733      | - data = self.data
1734      | - self.feature_name = feature_name
1735      | - up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1736      | - features_base = ['Date', up_signal, low_signal, 'Close']
1737      | -
1738      | - df = data[features_base].sort_values('Date').iloc[0:-test_size,:]
1739      | - returns_list = list()
1740      | -
1741      | - for days in days_list:
1742      | -
1743      | - feature_ = f'return_{days}d'
1744      | - df[feature_] = (df['Close'].shift(-days)/df['Close']-1)*100
1745      | - returns_list.append(feature_)
1746 2301 |
1747      | - df['signal_type'] = np.where(
1748      | - df[up_signal] == 1,
1749      | - 'up',
1750      | - np.where(
1751      | - df[low_signal] == 1,
1752      | - 'down',
1753      | - None
1754      | - )
1755      | - )
1756      | - df = df[~df.signal_type.isna()]
1757      | - # df['Date'] = df.index
1758      | - df['lag_Date'] = df['Date'].shift(1)
1759      | - df['span'] = (pd.to_datetime(df['Date']) - pd.to_datetime(df['lag_Date'])).dt.days - 1
1760      | - df['break'] = np.where(df['span'] > 3, 1, 0)
1761      | - df['break'] = np.where(df['span'].isna(), 1, df['break'])
1762      | -
1763      | - df['chain_id'] = df.sort_values(['Date']).groupby(['break']).cumcount() + 1
1764      | - df['chain_id'] = np.where(df['break'] == 1, df['chain_id'], np.nan )
1765      | - df['chain_id'] = df['chain_id'].fillna(method = 'ffill')
1766      | -
1767      | - df['internal_rn'] = df.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1768      | - df['inv_internal_rn'] = df.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1769      | -
1770      | - df['first_in_chain'] = np.where(df['internal_rn'] == 1, True, False)
1771      | - df['last_in_chain'] = np.where(df['inv_internal_rn'] == 1, True, False)
1772      | -
1773      | - df = df.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1774      | - self.df_signal = df
1775 2302 |
1776-1779 | -
1780      | - validations = list()
1781      | - if not signal_position: ### for now it is based on the last signal on a chain
1782      | - df_melt = df[df.last_in_chain == True].melt(id_vars=['signal_type'], value_vars=returns_list, var_name='time', value_name='value')
1783      | - df_melt = df_melt.dropna()
1784      | -
1785      | - for evalx in returns_list:
1786      | -
1787      | - sample1 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'up')].value.values
1788      | - sample2 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'down')].value.values
1789      | - pvalue = stats.ttest_ind(sample1, sample2).pvalue
1790      | - median_down = np.median(sample2)
1791      | - median_up = np.median(sample1)
1792      | - validations.append(median_up < 0)
1793      | - validations.append(median_down > 0)
1794      | - p_scores.append(pvalue)
1795      | - medians_down.append(median_down)
1796      | - self.df_melt = df_melt
1797      | - null_ho_eval = threshold > np.mean(p_scores)
1798      | - mean_median_return = np.median(medians_down) ## end metric
1799      | - median_signal_type_eval = validations.count(validations[0]) == len(validations)
1800      | -
1801      | - if verbose:
1802      | - print('number of signal up:',n_signals_up)
1803      | - print('number of signal down:',n_signals_down)
1804      | - print('reject ho: ', null_ho_eval)
1805      | - print('mean median:', mean_median_return)
1806      | - print('all validations: ', median_signal_type_eval)
1807      | -
1808      | - # if median_signal_type_eval == True and null_ho_eval == True:
1809      | - if null_ho_eval == True:
1810      | - if verbose:
1811      | - print('success evals')
1812      | - self.mean_median_return = mean_median_return
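The removed signal_analyser compared forward returns after 'up' and 'down' signals with an independent two-sample t-test over several horizons; note that p_scores, medians_down, n_signals_up and n_signals_down are not initialised in any of the lines preserved here (the blank removed lines 1776-1779 may have held them). The statistical core of the check, in isolation and with made-up samples:

    import numpy as np
    from scipy import stats

    # Hypothetical forward returns observed after each signal type.
    up_returns = np.array([1.2, 0.4, 2.1, -0.3, 0.8])
    down_returns = np.array([-0.9, -1.4, 0.2, -2.0, -0.5])

    pvalue = stats.ttest_ind(up_returns, down_returns).pvalue
    reject_h0 = pvalue < 0.05   # the removed code averaged p-values across horizons
    print(round(pvalue, 4), reject_h0,
          np.median(up_returns), np.median(down_returns))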
     2303 | + if type(index_data) != str:
     2304 | + index_data['Date'] = pd.to_datetime(index_data['Date'])
     2305 | + self.index_data = index_data
     2306 | + self.indexes = [ x for x in list(index_data.columns) if x != 'Date']
1813 2307 |   else:
1814      | - self.
1815      | -
1816      | - df2 = df.copy()
1817      | - df2 = df2[df2.last_in_chain == True]
1818-1819 | -
1820      | - df2['lagdate'] = df2.Date.shift(1)
1821      | - df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lagdate'])).dt.days
1822      | -
1823      | - fig, axs = plt.subplots(1, 3, figsize = (15,5))
1824      | -
1825      | - sns.boxplot(data=df2, y="span",ax = axs[0])
1826      | - axs[0].set_title('span between last signals')
1827      | - del df2
1828      | - sns.boxplot(data=df[df.last_in_chain == True], y="internal_rn",ax = axs[1])
1829      | - axs[1].set_title('signal duration distribution')
1830      | - sns.boxplot(data=df_melt, x="time", y="value", hue="signal_type",ax = axs[2])
1831      | - axs[2].axhline(y=0, color='grey', linestyle='--')
1832      | - axs[2].set_title('signal type expected returns distribution at different time lapses')
1833      | -
1834      | - if self.show_plot:
1835      | - plt.show()
     2308 | + self.indexes = [index_data]
1836 2309 |
1837      | -
1838      | - result_plot_name = f'signals_strategy_distribution_{feature_name}.png'
1839      | - fig.savefig(self.save_path+result_plot_name)
1840      | - # pickle.dump(axs, open(self.save_path+result_plot_name, 'wb'))
1841      | -
1842      | - if self.save_path and self.save_aws:
1843      | - # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name, input_path = self.save_path+result_plot_name)
1844      | - upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1845      | - if not self.show_plot:
1846      | - plt.close()
1847      | -
1848      | - del df
1849      | -
1850      | - if self.return_fig:
1851      | - return fig
1852      | -
1853      | - def create_backtest_signal(self,days_strategy, test_size, feature_name, high_exit = False, low_exit = False):
1854      | - asset_1 = 'Close'
1855      | - up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1856      | - df1 = self.data.iloc[-test_size:,:].copy()
1857      | - df2 = df1.copy()
1858      | - df2['signal_type'] = np.where(
1859      | - df2[up_signal] == 1,
1860      | - 'up',
1861      | - np.where(
1862      | - df2[low_signal] == 1,
1863      | - 'down',
1864      | - None
1865      | - )
1866      | - )
1867      | - df2 = df2[~df2.signal_type.isna()]
1868      | - # df2['Date_'] = df2.index
1869      | - df2['lag_Date'] = df2['Date'].shift(1)
1870      | - df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lag_Date'])).dt.days - 1
1871      | - df2['break'] = np.where(df2['span'] > 3, 1, 0)
1872      | - df2['break'] = np.where(df2['span'].isna(), 1, df2['break'])
1873      | -
1874      | - df2['chain_id'] = df2.sort_values(['Date']).groupby(['break']).cumcount() + 1
1875      | - df2['chain_id'] = np.where(df2['break'] == 1, df2['chain_id'], np.nan )
1876      | - df2['chain_id'] = df2['chain_id'].fillna(method = 'ffill')
1877      | -
1878      | - df2['internal_rn'] = df2.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1879      | - df2['inv_internal_rn'] = df2.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1880      | -
1881      | - df2['first_in_chain'] = np.where(df2['internal_rn'] == 1, True, False)
1882      | - df2['last_in_chain'] = np.where(df2['inv_internal_rn'] == 1, True, False)
1883      | -
1884      | - df2 = df2.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1885      | -
1886      | - df2 = df2[(df2.last_in_chain == True) & (df2.signal_type == 'down')][['last_in_chain']]
1887      | - dft = df1.merge(df2,how = 'left',left_index=True, right_index=True )
1888      | -
1889      | - dft['chain_id'] = dft.sort_values(['Date']).groupby(['last_in_chain']).cumcount() + 1
1890      | - dft['chain_id'] = np.where(dft['last_in_chain'] == True, dft['chain_id'], np.nan )
1891      | - dft['chain_id'] = dft['chain_id'].fillna(method = 'ffill')
1892      | -
1893      | - dft['internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1894      | - dft['flag'] = np.where(dft['internal_rn'] < days_strategy, 1,0)
1895      | -
1896      | - dft['lrets_bench'] = np.log(dft[asset_1]/dft[asset_1].shift(1))
1897      | - dft['bench_prod'] = dft['lrets_bench'].cumsum()
1898      | - dft['bench_prod_exp'] = np.exp(dft['bench_prod']) - 1
1899      | -
1900      | - if high_exit and low_exit:
1901      | - dft['open_strat'] = np.where(dft.last_in_chain == True, dft.Open, np.nan)
1902      | - dft['open_strat'] = dft['open_strat'].fillna(method = 'ffill')
1903      | - dft['open_strat'] = np.where(dft.flag == 1, dft.open_strat, np.nan)
1904      | - dft['high_strat_ret'] = (dft['High']/dft['open_strat']-1)*100
1905      | - dft['low_strat_ret'] = (dft['Low']/dft['open_strat']-1)*100
1906      | - dft['high_exit'] = np.where(((dft['high_strat_ret'] >= high_exit) | (dft['internal_rn'] == days_strategy)), 1, np.nan)
1907      | - dft['low_exit'] = np.where((dft['low_strat_ret'] <= low_exit), -1, np.nan)
1908      | -
1909      | - dft["exit_type"] = dft[["high_exit", "low_exit"]].max(axis=1)
1910      | - dft['exit_type'] = np.where(dft["exit_type"] == 1, 1, np.where(dft["exit_type"] == -1,-1,np.nan))
1911      | - dft['exit'] = np.where(dft['exit_type'].isnull(), np.nan, 1)
1912      | - dft['exit_order'] = dft.sort_values(['Date']).groupby(['chain_id','exit']).cumcount() + 1
1913      | - dft['exit'] = np.where(dft['exit_order'] == 1, True, np.nan)
1914      | - dft = dft.drop(columns = ['exit_order'])
1915      | - ## if last signal is near
1916      | - max_id = dft.chain_id.max()
1917      | - dft['max_internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).internal_rn.transform('max')
1918      | - dft['exit'] = np.where((dft.chain_id == max_id) & (dft.max_internal_rn < days_strategy) & (dft.max_internal_rn == dft.internal_rn), 1, dft['exit'])
1919      | -
1920      | - dft['exit_step'] = np.where(dft.exit == 1, dft.internal_rn, np.nan)
1921      | - dft['exit_step'] = dft.sort_values(['Date']).groupby(['chain_id']).exit_step.transform('max')
1922      | -
1923      | - dft['flag'] = np.where(dft.internal_rn <= dft.exit_step, 1, 0)
1924      | - dft = dft.drop(columns = ['open_strat', 'high_strat_ret', 'low_strat_ret','exit_step', 'exit','exit_type','high_exit','low_exit', 'max_internal_rn'])
1925      | -
1926      | - dft['lrets_strat'] = np.log(dft[asset_1].shift(-1)/dft[asset_1]) * dft['flag']
1927      | - dft['lrets_strat'] = np.where(dft['lrets_strat'].isna(),-0.0,dft['lrets_strat'])
1928      | - dft['lrets_prod'] = dft['lrets_strat'].cumsum()
1929      | - dft['strat_prod_exp'] = np.exp(dft['lrets_prod']) - 1
1930      | -
1931      | - bench_rets = round(dft['bench_prod_exp'].values[-1]*100,1)
1932      | - strat_rets = round(dft['strat_prod_exp'].values[-1]*100,1)
1933      | -
1934      | - bench_sr = round(sharpe_ratio(dft.bench_prod_exp.dropna()),1)
1935      | - strat_sr = round(sharpe_ratio(dft.strat_prod_exp.dropna()),1)
1936      | -
1937      | - message1 = f'{bench_rets}%'
1938      | - message2 = f'{strat_rets}%'
1939      | -
1940      | - messages = {
1941      | - 'benchmark return:':message1,
1942      | - 'benchmark sharpe ratio:': bench_sr,
1943      | - 'strategy return:':message2,
1944      | - 'strategy sharpe ratio:': strat_sr,
1945      | - }
1946      | - if self.show_plot:
1947      | - print('----------------------------')
1948      | - print(messages)
1949      | - print('----------------------------')
1950      | -
1951      | - fig = plt.figure(1)
1952      | - plt.plot(dft.bench_prod_exp.values, label = 'benchmark')
1953      | - plt.scatter(range(len(dft)),np.where(dft[low_signal] == 1,dft.bench_prod_exp.values,np.nan),color = 'red', label = 'signal')
1954      | - plt.plot(dft.strat_prod_exp.values, label = 'strategy')
1955      | - plt.legend()
1956      | - plt.title('strategy and cumulative returns based on signal strategy')
1957      | - if self.show_plot:
1958      | - plt.plot()
1959      | -
1960      | - if self.save_path:
1961      | - result_json_name = f'signals_strategy_return_{feature_name}.json'
1962      | - result_plot_name = f'signals_strategy_return_{feature_name}.png'
1963      | -
1964      | - plt.savefig(self.save_path+result_plot_name)
1965      | - # pickle.dump(fig, open(self.save_path+result_plot_name, 'wb'))
1966      | -
1967      | - with open(self.save_path+result_json_name, "w") as outfile:
1968      | - json.dump(messages, outfile)
1969      | -
1970      | - if self.save_path and self.save_aws:
1971      | - # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
1972      | - # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name,input_path = self.save_path+result_plot_name)
1973      | -
1974      | - upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
1975      | - upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1976      | -
1977      | - if not self.show_plot:
1978      | - plt.close()
1979      | -
1980      | - del df1,df2,dft
1981      | -
1982      | - if self.return_fig:
1983      | - return fig, messages
1984      | -
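The removed backtest accrues log returns, taking the next bar's return only while a position flag is on, then exponentiates the cumulative sum back into a simple-return curve; per the file list, the new virgo_modules/src/backtester.py takes over this role. The accounting pattern on its own (hypothetical prices and flags):

    import numpy as np
    import pandas as pd

    prices = pd.Series([100.0, 101.0, 99.0, 102.0, 103.0])
    flag = pd.Series([1, 1, 0, 1, 1])   # 1 = hold over the next bar, 0 = flat

    lrets_bench = np.log(prices / prices.shift(1)).fillna(0.0)
    bench_curve = np.exp(lrets_bench.cumsum()) - 1            # buy and hold
    lrets_strat = (np.log(prices.shift(-1) / prices) * flag).fillna(0.0)
    strat_curve = np.exp(lrets_strat.cumsum()) - 1            # flagged strategy
    print(round(bench_curve.iloc[-1], 4), round(strat_curve.iloc[-1], 4))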
1985      | - def execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object, plot = False, backtest= False, exit_params = {}):
1986      | -
1987      | - method(**configuration)
1988      | - signal_assess = signal_analyser_object(object_stock.df,object_stock.stock_code,show_plot = plot)
1989      | - signal_assess.signal_analyser(test_size = test_data_size, feature_name = feature_name, days_list = days_list, threshold = 1)
1990      | -
1991      | - if backtest:
1992      | - print('-----------------------back test ---------------------------')
1993      | - signal_assess.create_backtest_signal(backtest, test_data_size, feature_name, **exit_params )
1994      | -
1995      | - return signal_assess.mean_median_return
1996      | -
1997      | - def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to_test, method, object_stock, signal_analyser_object, plot = True):
1998      | -
1999      | - results = list()
2000      | - for key in arguments_to_test.keys():
2001      | - configuration = arguments_to_test.get(key)
2002      | - mean_median_return = execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object)
2003      | - results.append(mean_median_return)
2004      | -
2005      | - df_result = pd.DataFrame({'keys':arguments_to_test.keys(),'results':results})
2006      | - if plot:
2007      | - plt.plot(df_result['keys'], df_result['results'])
2008      | - plt.scatter(df_result['keys'], df_result['results'])
2009      | - plt.title('simulation between configurations')
2010      | - plt.ylabel('median expected return')
2011      | - plt.show()
2012      | -
2013      | - best_result = df_result.sort_values('results',ascending = False)['keys'].values[0]
2014      | - return best_result
2015      | -
2016      | - class analyse_index(stock_eda_panel):
2017      | - def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2018      | -
2019      | - """
2020      | - data: pandas df
2021      | - index: str name of the index
2022      | - asset: str name of the asset
2023      | - n_obs: int
2024      | - lag: int
2025      | - data_window: str eg 5y 10y 15y
2026      | - show_plot: bool
2027      | - save_path: str local path for saving e.g r'C:/path/to/the/file/'
2028      | - save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2029      | - aws_credentials: dict
2030      | - """
2031      | -
2032      | - self.index = index
     2310 | + self.index_data = index_data
2033 2311 |   self.asset = asset
2034 2312 |   self.n_obs = n_obs
2035 2313 |   self.data_window = data_window
2036 2314 |   self.lag = lag
2037      | -
     2315 | +
2038 2316 |   self.show_plot = show_plot
     2317 | + self.return_fig = return_fig
2039 2318 |   self.save_path = save_path
2040 2319 |   self.save_aws = save_aws
2041      | -
2042      | - def process_data(self):
2043      | -
2044      | - index = stock_eda_panel(self.index, self.n_obs, self.data_window)
2045      | - index.get_data()
2046      | - index.df['shift'] = index.df.Close.shift(self.lag)
2047      | - index.df['index_return'] = index.df.Close/index.df['shift'] - 1
2048 2320 |
2049      | -
     2321 | + def process_data(self):
     2322 | + """
     2323 | + using stock_eda_panel, get data and merge data
     2324 | +
     2325 | + Parameters
     2326 | + ----------
     2327 | + None
     2328 | +
     2329 | + Returns
     2330 | + -------
     2331 | + None
     2332 | + """
     2333 | + asset = stock_eda_panel(self.asset, self.n_obs, data_window=self.data_window)
2050 2334 |   asset.get_data()
2051      | -
2052      | - asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
     2335 | + df = asset.df[['Date','Close']]
2053 2336 |
2054-2062 | -
     2337 | + if type(self.index_data) != str:
     2338 | + df_merge = df.merge(self.index_data, on = ['Date'], how = 'left').sort_values('Date')
     2339 | +
     2340 | + else:
     2341 | + indx = stock_eda_panel(self.index_data, self.n_obs, data_window=self.data_window)
     2342 | + indx.get_data()
     2343 | + indx_df = indx.df[['Date','Close']].rename(columns = {'Close':self.index_data})
     2344 | + df_merge = df.merge(indx_df, on = ['Date'], how = 'left').sort_values('Date')
     2345 | +
     2346 | + for colx in ['Close'] + self.indexes:
     2347 | + df_merge[f'{colx}_pct'] = df_merge[colx]/df_merge[colx].shift(self.lag) - 1
     2348 | +
     2349 | + df_merge.dropna(inplace = True)
     2350 | + self.merger_df = df_merge.rename(columns = {'Close_pct': 'asset_return'})
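The rewritten process_data now accepts either a ready-made index DataFrame or a ticker string, merges everything on Date, and derives a lagged percentage change per column as {col}_pct, renaming Close_pct to asset_return. The transformation step in isolation (toy frame; 'SPY' is just an example column name):

    import pandas as pd

    df = pd.DataFrame({
        'Date': pd.date_range('2024-01-01', periods=5, freq='D'),
        'Close': [10.0, 10.5, 10.2, 10.8, 11.0],      # the asset
        'SPY': [470.0, 472.0, 468.0, 475.0, 479.0],   # one merged index column
    })
    lag = 1
    for colx in ['Close', 'SPY']:
        df[f'{colx}_pct'] = df[colx] / df[colx].shift(lag) - 1
    df = df.dropna().rename(columns={'Close_pct': 'asset_return'})
    print(df[['Date', 'asset_return', 'SPY_pct']])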
2063 2351 |
2064-2066 | -
     2352 | + def plot_betas(self,sample_size, offset, subsample_ts =False, index = False):
     2353 | + """
     2354 | + display beta analysis plot
     2355 | +
     2356 | + Parameters
     2357 | + ----------
     2358 | + sample_size (int): number of days or window size to calculate beta
     2359 | + offset (int): overlap between windows
     2360 | + subsample_ts (int): subsample size of data
     2361 | +
     2362 | + Returns
     2363 | + -------
     2364 | + None
     2365 | + """
     2366 | + if (type(self.index_data) == str) & (index != False):
     2367 | + raise Exception("No need of index argument")
     2368 | + else:
     2369 | + index = self.indexes[0]
     2370 | +
     2371 | + index_pct = f'{index}_pct'
     2372 | + ### ploting analysis
2067 2373 |   figure, ax = plt.subplot_mosaic(
2068 2374 |   [["scatter_total", "scatter_sample",'ts','ts']],
2069 2375 |   layout="constrained",
2070 2376 |   figsize=(18, 5)
2071 2377 |   )
2072      | -
2073      | - ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df
2074      | -
     2378 | +
     2379 | + ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df[index_pct])
     2380 | +
     2381 | + huber_regr = HuberRegressor(fit_intercept = True)
     2382 | + huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values.reshape(-1,1))
     2383 | + b, a = huber_regr.coef_[0], huber_regr.intercept_
     2384 | +
     2385 | + # b, a = np.polyfit(self.merger_df.asset_return, self.merger_df[index_pct], 1)
2075 2386 |   ax['scatter_total'].plot(self.merger_df.asset_return, b*self.merger_df.asset_return+a, color='red')
2076 2387 |
2077 2388 |   ax['ts'].plot(self.merger_df.Date, self.merger_df.Close, color = 'grey', alpha = 0.3)
2078      | -
     2389 | +
2079 2390 |   if subsample_ts:
2080 2391 |   self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2081      | -
     2392 | +
2082 2393 |   for i in range(0,len(self.merger_df)-sample_size,offset):
2083 2394 |
2084 2395 |   merger_ = self.merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
2085      | - x = merger_
     2396 | + x = merger_[index_pct]
2086 2397 |   y = merger_.asset_return
2087      | - b, a = np.polyfit(x,y, 1)
2088      | -
     2398 | + # b, a = np.polyfit(x,y, 1)
     2399 | + huber_regr = HuberRegressor(fit_intercept = True)
     2400 | + huber_regr.fit(x.values.reshape(-1,1), y.values.reshape(-1,1))
     2401 | + b, a = huber_regr.coef_[0], huber_regr.intercept_
     2402 | +
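Both the full-sample and the rolling-window fits now use HuberRegressor instead of np.polyfit, which keeps the slope (the beta) from being dragged around by a handful of outlier returns; the old least-squares call survives only as a comment. A sketch of the fit on synthetic returns (the reshape on y in the hunk is optional, since a 1-D y is what scikit-learn expects):

    import numpy as np
    from sklearn.linear_model import HuberRegressor

    rng = np.random.default_rng(0)
    x = rng.normal(0.0, 0.01, 250)              # index returns
    y = 1.3 * x + rng.normal(0.0, 0.002, 250)   # asset returns, true beta 1.3
    y[:5] = 0.15                                # contaminate with outliers

    huber = HuberRegressor(fit_intercept=True)
    huber.fit(x.reshape(-1, 1), y)
    b, a = huber.coef_[0], huber.intercept_
    print(round(b, 2), round(float(a), 5))      # slope stays near 1.3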
2089 2403 |   normalize = mcolors.Normalize(vmin=-1, vmax=1)
2090 2404 |   colormap = cm.jet
2091 2405 |
@@ -2098,12 +2412,13 @@ class analyse_index(stock_eda_panel):
2098 2412 |
2099 2413 |   scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
2100 2414 |   scalarmappaple.set_array(x)
2101      | -
2102      | - plt.title(f'{self.asset} using index: {
     2415 | +
     2416 | + plt.title(f'{self.asset} using index: {index}')
2103 2417 |   plt.colorbar(scalarmappaple)
2104      | -
     2418 | +
2105 2419 |   if self.show_plot:
2106 2420 |   plt.show()
     2421 | +
2107 2422 |   if self.save_path:
2108 2423 |   result_plot_name = f'market_best_fit.png'
2109 2424 |   figure.savefig(self.save_path+result_plot_name)
@@ -2111,80 +2426,50 @@ class analyse_index(stock_eda_panel):
2111 2426 |   if self.save_path and self.save_aws:
2112 2427 |   # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.asset}/'+result_plot_name,input_path = self.save_path+result_plot_name)
2113 2428 |   upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
     2429 | +
2114 2430 |   if not self.show_plot:
2115      | - plt.close()
2116      | -
     2431 | + plt.close()
     2432 | +
     2433 | + if self.return_fig:
     2434 | + return figure
     2435 | +
2117 2436 |   def get_betas(self,subsample_ts=False):
2118-2128 | -
2129      | - result = {
2130      | - 'general_beta':general_beta,
2131      | - 'general_r':general_r,
2132      | - 'sample_beta':sample_beta,
2133      | - 'sample_r':sample_r
2134      | - }
2135      | -
2136      | - self.states_result = result
2137      | -
2138      | - class evaluate_markets(analyse_index):
2139      | - def __init__(self, stock_code, indexes):
2140      | - self.stock_code = stock_code
2141      | - self.indexes = indexes
2142      | - def evaluate_best_market_fit(self,sample_size, offset,lag= 3, n_obs = 3500, verbose = False, plot_best = False):
2143      | -
2144      | - results_dicts = dict()
     2437 | + """
     2438 | + get general beta and last sample beta, correlation score is included too
     2439 | +
     2440 | + Parameters
     2441 | + ----------
     2442 | + subsample_ts (int): subsample size of data
     2443 | +
     2444 | + Returns
     2445 | + -------
     2446 | + None
     2447 | + """
     2448 | + result = list()
2145 2449 |   for index in self.indexes:
2146      | - betex = analyse_index(index = index,asset = self.stock_code,n_obs = n_obs, lag = lag)
2147      | - betex.get_betas(sample_size)
2148      | - results_dicts[index] = betex.states_result
2149      | - pd_result = pd.DataFrame(results_dicts).T
2150      | - pd_result['gen_r2'] = pd_result.general_r ** 2
2151      | - pd_result['sampl_r2'] = pd_result.sample_r ** 2
2152      | - self.stat_results = pd_result
2153      | -
2154      | - best_result = pd_result.sort_values('gen_r2',ascending = False).head(2).sort_values('sampl_r2',ascending = False).head(1)
2155      | - best_fit_index = best_result.index.values[0]
2156      | -
2157      | - self.stat_results = self.stat_results.drop(columns = ['gen_r2','sampl_r2'])
2158      | -
2159      | - if verbose:
2160      | - print(best_result)
2161      | - if plot_best:
2162      | - betex = analyse_index(index = best_fit_index,asset = self.stock_code, n_obs = n_obs, lag = lag)
2163      | - betex.plot_betas(sample_size = sample_size, offset = offset, subsample_ts = False)
2164 2450 |
2165-2175 | -
2176      | - all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
2177      | - all_betas['gen_r2'] = all_betas.general_r ** 2
2178      | - all_betas['sampl_r2'] = all_betas.sample_r ** 2
2179      | - selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
     2451 | + index_pct = f'{index}_pct'
     2452 | + huber_regr = HuberRegressor(fit_intercept = True)
     2453 | + huber_regr.fit(self.merger_df.asset_return.values.reshape(-1,1), self.merger_df[index_pct].values.reshape(-1,1))
     2454 | + general_beta, a = huber_regr.coef_[0], huber_regr.intercept_
     2455 | + general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df[index])[0]
     2456 | +
     2457 | + dict_res = {
     2458 | + 'index':index,
     2459 | + 'general_beta':general_beta,
     2460 | + 'general_r':general_r,
     2461 | + }
2180 2462 |
2181-2190 | -
     2463 | + if subsample_ts:
     2464 | + tmp_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
     2465 | + huber_regr = HuberRegressor(fit_intercept = True)
     2466 | + huber_regr.fit(tmp_df.asset_return.values.reshape(-1,1), tmp_df[index_pct].values.reshape(-1,1))
     2467 | + sample_beta, a = huber_regr.coef_[0], huber_regr.intercept_
     2468 | + sample_r = stats.mstats.pearsonr(tmp_df.asset_return, tmp_df[index])[0]
     2469 | + dict_res['sample_beta'] = sample_beta
     2470 | + dict_res['sample_r'] = sample_r
     2471 | +
     2472 | + result.append(dict_res)
     2473 | +
     2474 | + self.states_result = result
     2475 | +
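End to end, the reworked class is used roughly as follows (a hypothetical session: the ticker symbols are examples, and analyse_index is assumed importable from this file's module):

    # Hypothetical usage; 'SPY' and 'AAPL' are example tickers.
    betex = analyse_index(index_data='SPY', asset='AAPL', n_obs=1000, lag=3,
                          data_window='5y', show_plot=False)
    betex.process_data()               # download, merge on Date, build *_pct columns
    betex.get_betas(subsample_ts=250)  # robust beta plus Pearson r, per index
    for res in betex.states_result:
        print(res['index'], res['general_beta'], res['general_r'])

Note that get_betas now builds a list with one dict per index in self.indexes, adding 'sample_beta' and 'sample_r' only when subsample_ts is given, where 0.0.72 stored a single dict.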