virgo-modules 0.0.75__py3-none-any.whl → 0.0.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of virgo-modules has been flagged as potentially problematic.
- virgo_modules/src/aws_utils.py +34 -2
- virgo_modules/src/edge_utils.py +200 -5
- virgo_modules/src/re_utils.py +360 -54
- virgo_modules/src/ticketer_source.py +1418 -256
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/METADATA +1 -1
- virgo_modules-0.0.76.dist-info/RECORD +12 -0
- virgo_modules-0.0.75.dist-info/RECORD +0 -12
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/LICENSE +0 -0
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/WHEEL +0 -0
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/top_level.txt +0 -0
```diff
@@ -53,19 +53,57 @@ from .aws_utils import upload_file_to_aws
 import logging
 
 class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies inverse hyperbolic sine for feature transformation.
+    this class is compatible with scikitlearn pipeline
+
+    Attributes
+    ----------
+    features : list
+        list of features to apply the transformation
+    prefix : str
+        prefix for the new features. is '' the features are overwrite
+
+    Methods
+    -------
+    fit(additional="", X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
     def __init__(self, features, prefix = ''):
         self.features = features
         self.prefix = prefix
 
     def fit(self, X, y=None):
         return self
-
+
     def transform(self, X, y=None):
         for feature in self.features:
             X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
         return X
 
 class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies winsorirization of a feature for feature transformation.
+    this class is compatible with scikitlearn pipeline
+
+    Attributes
+    ----------
+    feature_configs : dict
+        dictionary of features and configurations. the configuration has high and low limits per feature
+
+    Methods
+    -------
+    fit(additional="", X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
     def __init__(self, feature_configs):
         self.feature_configs = feature_configs
     def fit(self, X, y=None):
```
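Both transformers above follow the scikit-learn `fit`/`transform` contract, so they compose directly with `Pipeline`. A minimal usage sketch, assuming the class is importable from this package's module (the import path and column name here are assumptions for illustration):

```python
import pandas as pd
from sklearn.pipeline import Pipeline
from virgo_modules.src.re_utils import InverseHyperbolicSine  # assumed module path

df = pd.DataFrame({'volume': [10.0, 250.0, 30.0, 5000.0]})  # illustrative data

pipe = Pipeline([
    # arcsinh compresses heavy-tailed features while keeping sign and zero intact
    ('asinh', InverseHyperbolicSine(features=['volume'], prefix='asinh_')),
])
out = pipe.fit_transform(df)  # adds an 'asinh_volume' column alongside 'volume'
```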
```diff
@@ -80,6 +118,24 @@ class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
         return X
 
 class FeatureSelector(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies selection of features.
+    this class is compatible with scikitlearn pipeline
+
+    Attributes
+    ----------
+    columns : list
+        list of features to select
+
+    Methods
+    -------
+    fit(additional="", X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
     def __init__(self, columns):
         self.columns = columns
 
@@ -88,8 +144,19 @@ class FeatureSelector(BaseEstimator, TransformerMixin):
 
     def transform(self, X, y=None):
         return X[self.columns]
-
+
 def sharpe_ratio(return_series):
+
+    '''
+    calculate sharpe ratio for given array.
+
+    Parameters:
+        return_series (pd.series): pandas series of the asset returns
+
+    Returns:
+        sharpe (float): sharpe ratio
+    '''
+
     N = 255 # Trading days in the year (change to 365 for crypto)
     rf = 0.005 # Half a percent risk free rare
     mean = return_series.mean() * N -rf
```
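The new docstring describes a standard annualized Sharpe ratio. Only the annualized mean line is visible in this hunk, so the volatility step in the sketch below is an assumption consistent with the constants shown:

```python
import numpy as np
import pandas as pd

def sharpe_ratio_sketch(return_series: pd.Series, N: int = 255, rf: float = 0.005) -> float:
    mean = return_series.mean() * N - rf        # annualized excess return, as in the hunk
    sigma = return_series.std() * np.sqrt(N)    # assumed: annualized volatility
    return mean / sigma

daily_returns = pd.Series([0.001, -0.002, 0.003, 0.0005])
print(round(sharpe_ratio_sketch(daily_returns), 3))
```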
```diff
@@ -98,12 +165,38 @@ def sharpe_ratio(return_series):
     return sharpe
 
 class signal_combiner(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies feature combination of binary signals.
+    this class is compatible with scikitlearn pipeline
+
+    ...
+
+    Attributes
+    ----------
+    columns : list
+        list of features to select
+    drop : boolean
+        drop combining features
+    prefix_up : str
+        up prefix of the base feature
+    prefix_low : str
+        low prefix of the base feature
+
+    Methods
+    -------
+    fit(additional="", X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
     def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
         self.columns = columns
         self.drop = drop
         self.prefix_up = prefix_up
         self.prefix_low = prefix_low
-
+
     def fit(self, X, y=None):
         return self
 
@@ -111,7 +204,7 @@ class signal_combiner(BaseEstimator, TransformerMixin):
         for column in self.columns:
             X['CombSignal_'+column] = np.where(
                 X[self.prefix_up + column] == 1,
-                1,
+                1,
                 np.where(
                     X[self.prefix_low + column] == 1,
                     1,
@@ -121,15 +214,29 @@ class signal_combiner(BaseEstimator, TransformerMixin):
             if self.drop:
                 X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
         return X
-
+
 def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
-
+
+    '''
+    create a scikit learn pipeline object using different configurations and feature engineering blocks with a given flow
+
+    Parameters:
+        features_to_drop (list): list of features to drop
+        lag_dict (dict): feature dictionary with configurations to apply lags
+        combine_signals (list): list of columns/signals to combine
+        discretize_columns (list): list of features to discretize, bins is fixed
+        correlation (float): correaltion score threshold for feature selection
+
+    Returns:
+        pipe (obj): pipeline object
+    '''
+
     lag_pipe_sec = [(f'lags_{key}', LagFeatures(variables = key, periods = lag_dict[key])) for key in lag_dict] if lag_dict else []
     drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
     merge = [('signal_combiner', signal_combiner(combine_signals))] if combine_signals else []
     discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = 20 ))] if discretize_columns else []
     drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation))] if correlation else []
-
+
     pipe = Pipeline(
         [('selector', FeatureSelector(features_base))] + \
         [('encoding',OneHotEncoder(top_categories=None, variables=['hmm_feature']))] + \
```
```diff
@@ -143,6 +250,18 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
     return pipe
 
 def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
+    '''
+    calculate relevance score and summary report for hmm model
+
+    Parameters:
+        default_benchmark_sd (float): default value to bias SD for t calculation
+        t_threshold (float): alpha or z threshold for the normalized score
+
+    Returns:
+        mean_relevance (float): mean relevance score of the states
+        cluster_returns (pd.DataFrame): summary report of the analysis
+        number_relevant_states (int): number of relevant states
+    '''
     ## legnths
     cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
     cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
@@ -151,7 +270,7 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
         return x.quantile(0.25)
     def quantile3(x):
         return x.quantile(0.75)
-
+
     cluster_returns = data.groupby('hmm_feature').agg(
         n_uniques = ('chain_id','nunique'),
         n_obs = ('Date','count'),
@@ -171,14 +290,14 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
     cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
     cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
     cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
+
     ## relevance or importance
     # naive aproach
     cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
     cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
     cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
     cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
+
     mean_relevance = cluster_returns['abs_t_accpted'].mean()
     number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
 
```
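The acceptance rule above treats each state's median chain return as a location statistic and forms a t-like score against an IQR-based spread biased by `default_benchmark_sd`. A stand-alone sketch of exactly that arithmetic, with invented sample values:

```python
import numpy as np
import pandas as pd

# Columns mirror the aggregation in states_relevance_score; values are illustrative.
cluster_returns = pd.DataFrame({
    'cluster_ret_median': [0.004, -0.0002],
    'iqr': [0.01, 0.02],
    'n_obs': [120, 90],
})
default_benchmark_sd, t_threshold = 0.00003, 2

t_calc = cluster_returns['cluster_ret_median'] / np.sqrt(
    cluster_returns['iqr'] / cluster_returns['n_obs']
    + default_benchmark_sd / cluster_returns['n_obs']
)
accepted = t_calc.abs() > t_threshold
print(int(accepted.sum()), 'relevant states of', len(cluster_returns))
```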
```diff
@@ -186,20 +305,161 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
 
 
 class stock_eda_panel(object):
-
+
+    """
+    Class that initialy gets stock data then apply feature enginering, enrichment, analysis, plotting, model training etc.
+
+    Attributes
+    ----------
+    stock_code : str
+        symbol of the asset
+    n_days : str
+        number of days to extract data
+    data_window : str
+        large window to extract data. Large window is required o extract more data. e.g. '5y', '10y', '15'
+    df : pd.DataFrame
+        Pandas dataframe of the asset data with features
+    strategy_log: pd.DataFrame
+        Pandas dataframe that has the results of different tested strategies (result from strategy simulator hmm)
+    best_strategy: list
+        features of the best performing strategy (result from strategy simulator hmm)
+    top_10_strategy: dict
+        top 10 best performing strategies (result from strategy simulator hmm)
+    settings: dict
+        configuration dictionary of the features and other parameters
+
+    Methods
+    -------
+    augmented_dickey_fuller_statistics(time_series=pd.Series, label=str):
+        Perform dickey fuller or stationary test for a given time series
+        It will print p value of the features
+    get_data():
+        Get asset data performing some data normalization or formating (in case of dates)
+    plot_series_returns(roll_mean_lags1=int, roll_mean_lags2=int)
+        Display plot that time series with mean rolling windows and rolling standard deviations of daily closing prices
+    seasonal_plot():
+        Display time series split by year
+    plot_price_signal(feature=str, feature_2=str, opacity=float):
+        Display botton and roof signals over the closing prices
+    volatility_analysis(lags=int, trad_days=int, window_log_return=int, plot=boolean, save_features=boolean):
+        this method performs log return and volatilyty analysis of the closing prices
+    find_lag(feature=str, lag_list=list, column_target=str,posterior_lag=int, test_size=int):
+        displays correlation curves, using spearman and pearson correlation, of a given feature at different time lags with respecto to a given target
+    outlier_plot(zlim=float, plot=boolean, save_features=boolean):
+        perform outlier analysis of the log returns. It also permors normality test of returns
+    analysis_roll_mean_log_returns(lags=int, plot=boolean):
+        perform analysis of lags of the mean rolling log return
+    compute_clip_bands(feature_name=str,threshold=float):
+        compute outlier detection for a given signal, Note that this follows mean reversion procedure and feature has to be stationary. Also botton and roof resulting signals is attached to the dataframe
+    signal_plotter(feature_name=str):
+        display analysis plot of a feature with high and low signals
+    log_features_standard(feature_name=str):
+        save resulting feature names in an standard structure
+    relative_spread_MA(ma1=int, ma2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative moving average features, one for short term and another for long/mid term
+    pair_feature(pair_symbol=str, plot=boolean):
+        initialize pair feature data extraction and analysis
+    calculate_cointegration(series_1=pd.series, series_2=pd.series):
+        calculate cointegration score for two time series
+    bidirect_count_feature(rolling_window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform negative and positive return counting in a given rolling time window
+    get_relative_range_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative spread of opening and closing price
+    rsi_feature_improved(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform relative strength index
+    days_features_bands(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute mean returns for a given day of the week in a window scope per day
+    analysis_smooth_volume(window=int, threshold=float, plot=boolean, save_features=boolean):
+        compute feature of thrading volumes
+    roc_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform price rate of change
+    stoch_feature(window=int, smooth1=int, smooth2=int, threshold=float, plot=boolean, save_features=boolean):
+        perform stochastic oscilator RSI feature
+    stochastic_feature(window=int, smooth=int, threshold=float, plot=boolean, save_features=boolean):
+        perform stochastic oscilator feature
+    william_feature(lbp=int, threshold=float, plot=boolean, save_features=boolean):
+        perfom fast stochastic oscilator or william indicator
+    vortex_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform vortex oscilator
+    pair_index_feature(pair_symbol=str, feature_label=str, window=int, threshold=float, plot=boolean, save_features=boolean):
+        perform additional asset ROC feature, then a new feature is created in the main dataframe
+    produce_order_features(feature_name=str, save_features=boolean):
+        perform a feature that captures high and low values in an index. this is usefull to know duration/persistence of a signal
+    create_hmm_derived_features():
+        create features derived from hmm states features. Features are the index of the state, the duration of the state, chain raturn
+    cluster_hmm_analysis(n_clusters=int,features_hmm=list, test_data_size=int, seed=int, lag_returns_state=int, plot=boolean, save_features=boolean, model=obj):
+        create or use a hmm model
+    sharpe_ratio(return_series=pd.Series, n_trad_days=int, rf=float):
+        perform sharpe ratio of a given time series return
+    treat_signal_strategy(test_data=pd.DataFrame, strategy=list):
+        helper method that treats signals and converts signals to 1 or 0
+    stategy_simulator(features=list, hmm_feature=boolean):
+        execute strategy and get some performance metrics like sharpe ratio, return
+    viz_strategy(strategy):
+        display analysis plot of a given strategy
+    deep_dive_analysis_hmm(test_data_size=int, split=str):
+        display analysis plot hmm model
+    get_targets(steps=int):
+        produce regression target return taking future prices
+    get_categorical_targets(horizon=int, flor_loss=float, top_gain=float):
+        produce binary target return taking future prices. it produce two targets, one for high returns and another for low returns
+    get_configurations(test_data_size=int, val_data_size=int, model_type=str):
+        produce configuration dictionary that were saved in the feature generation methods if save_features was activated
+    """
+
     def __init__(self, stock_code, n_days, data_window = '5y'):
+
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        stock_code (str): symbol of the asset
+        n_days (str): number of days to extract data
+        data_window (str): large window to extract data. Large window is required o extract more data. e.g. '5y', '10y', '15'
+
+        Returns
+        -------
+        None
+        """
+
         self.stock_code = stock_code
         self.n_days = n_days
         self.today = datetime.date.today()
         self.features = list()
         self.signals = list()
         self.data_window = data_window
-
+
     def augmented_dickey_fuller_statistics(self,time_series, label):
+        """
+        Perform dickey fuller or stationary test for a given time series
+        It will print p value of the features
+
+        Parameters
+        ----------
+        time_series (pd.Series): pandas series of the time series
+        label (pd.Series): feature name
+
+        Returns
+        -------
+        None
+        """
         result = adfuller(time_series.dropna().values)
         print('p-value: {} for the series {}'.format(round(result[1],6), label))
-
+
     def get_data(self):
+        """
+        Get asset data performing some data normalization or formating (in case of dates)
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
 
```
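The class docstring added above outlines the whole workflow: fetch data, derive features, then analyze or simulate. A hypothetical end-to-end sketch based on those method signatures (ticker and parameter values are invented, and `get_data` is assumed to need access to the underlying data source):

```python
panel = stock_eda_panel(stock_code='AAPL', n_days=900, data_window='5y')
panel.get_data()  # download and clean OHLCV data
panel.volatility_analysis(lags=1, trad_days=255,
                          window_log_return=30, save_features=True)
panel.rsi_feature_improved(window=14, threshold=1.95, save_features=True)
print(panel.features, panel.signals)  # names logged by save_features
```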
```diff
@@ -210,7 +470,7 @@ class stock_eda_panel(object):
         df.reset_index(inplace=True)
         df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
         df['Date'] = pd.to_datetime(df['Date'])
-
+
         df = df[df.Date >= begin_date_str ]
         self.settings_general = {
             'n_days':self.n_days,
@@ -219,44 +479,56 @@ class stock_eda_panel(object):
             'execution_date': self.today.strftime('%Y-%m-%d')
         }
         self.df = df
-
+
         ### cleaning volume
         ### volume clearning
         self.df['Volume'] = np.where(self.df['Volume'] <= 10, np.nan, self.df['Volume'])
         self.df['Volume'] = self.df['Volume'].fillna(method='bfill')
-
+
         ## filling
-
+
         base_columns_unit_test = ['Open','High','Low','Close','Volume']
         self.df[base_columns_unit_test] = self.df[base_columns_unit_test].fillna(method='ffill')
-
+
         ## cleaning nulls
-
+
         xs = self.df[base_columns_unit_test].isnull().sum()/self.df[base_columns_unit_test].count()
         reject_columns = list(xs[xs > 0.5].index.values)
-
+
         if len(reject_columns) > 0:
             logging.warning("the following columns have many nulls and are drop: {}".format(reject_columns))
             self.df = self.df.drop(columns = reject_columns)
-
-
+
     def plot_series_returns(self,roll_mean_lags1,roll_mean_lags2):
-
+
+        """
+        Display plot that time series with mean rolling windows and rolling standard deviations of daily closing prices
+
+        Parameters
+        ----------
+        roll_mean_lags1 (int): short term window
+        roll_mean_lags2 (int): mid/long term window
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
         ### getting rolling mean
         df["Close_roll_mean"] = (
             df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(roll_mean_lags1, min_periods=1).mean())
         )
-
+
         df["Close_roll_mean_2"] = (
             df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(roll_mean_lags2, min_periods=1).mean())
         )
-
+
         ### getting rolling stdv
         df["Close_roll_std"] = (
             df.sort_values("Date")["Close"]
@@ -273,7 +545,7 @@ class stock_eda_panel(object):
         ))
 
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close, marker_color = 'blue', name='Price'),row=1, col=1)
-
+
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean, marker_color = 'black', name='roll mean' ),row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean_2, marker_color = 'grey', name='roll mean 2' ),row=1, col=1)
         fig.add_trace(go.Scatter(x=df['Date'], y=df.lower, marker_color = 'pink',legendgroup='bound', name='bound' ),row=1, col=1)
@@ -281,8 +553,21 @@ class stock_eda_panel(object):
 
         fig.update_layout(height=500, width=1200, title_text=f"stock {self.stock_code} vizualization")
         fig.show()
-
+
     def seasonal_plot(self):
+
+        """
+        Display time series split by year
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         years = list(df['Date'].dt.year.unique())
         years.sort()
@@ -302,10 +587,24 @@ class stock_eda_panel(object):
 
         fig.update_layout(height=500, width=1400, title_text=f"stock {self.stock_code} seasonal vizualization")
         fig.show()
-
+
     def plot_price_signal(self, feature, feature_2 = '', opacity = 0.3):
-
-
+
+        """
+        Display botton and roof signals over the closing prices
+
+        Parameters
+        ----------
+        feature (str): name of the main feature to plot
+        feature_2 (str): name of the alternative feature to plot
+        opacity (float): opacity degree of the signals points
+
+        Returns
+        -------
+        None
+        """
+
+        signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
         signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
         norm_list = [f'norm_{feature}', f'z_{feature}', feature]
 
@@ -315,14 +614,14 @@ class stock_eda_panel(object):
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[norm_feat],legendgroup="up", mode='lines',name = norm_feat, marker_color = 'blue'),col = 1, row = 1)
                 break
-
-
+
+
         fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df['Close'], mode='lines',name = 'history', marker_color = 'grey'),col = 1, row = 2)
-
+
         if feature == 'MA_spread':
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma1_column],legendgroup="ma", mode='lines',name = self.ma1_column, marker_color = 'black'),col = 1, row = 2)
             fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma2_column],legendgroup="ma", mode='lines',name = self.ma2_column, marker_color = 'grey'),col = 1, row = 2)
-
+
         for norm_feat in norm_list:
             if norm_feat in self.df.columns:
                 fig.add_trace(go.Scatter(x=self.df['Date'], y=np.where(self.df[norm_feat] > 0, self.df['Close'], np.nan),legendgroup="up", mode='markers',name = 'up', marker_color = 'green',opacity = opacity),col = 1, row = 2)
@@ -338,8 +637,25 @@ class stock_eda_panel(object):
 
         fig.update_layout(height=900, width=1200)
         fig.show()
-
+
     def volatility_analysis(self, lags, trad_days, window_log_return, plot = False, save_features = False):
+
+        """
+        this method performs log return and volatilyty analysis of the closing prices
+
+        Parameters
+        ----------
+        lags (int): number of lags to apply to the closing prices
+        trad_days (int): number of trading days to anualize returns or volatility
+        window_log_return (int): window for rolling returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         df = self.df
         df['log_return'] = np.log(df.Close/df.Close.shift(lags))
         df['sqr_log_return'] = np.square(df.log_return)
@@ -349,13 +665,13 @@ class stock_eda_panel(object):
             df.sort_values("Date")["log_return"]
             .transform(lambda x: x.rolling(window_log_return, min_periods=1).mean())
         )
-
+
         if save_features:
             self.features.append('volatility_log_return')
             self.features.append('roll_mean_log_return')
             self.features.append('log_return')
             self.settings_volatility = {'lags':lags, 'trad_days':trad_days, 'window_log_return':window_log_return}
-
+
         if plot:
             fig = make_subplots(rows=3, cols=1,vertical_spacing = 0.02,shared_xaxes=True,
                 specs=[
@@ -395,10 +711,26 @@ class stock_eda_panel(object):
 
         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
-
-
+
+
     def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):
 
+        """
+        displays correlation curves, using spearman and pearson correlation, of a given feature at different time lags with respecto to a given target
+
+        Parameters
+        ----------
+        feature (str): feature name to apply lags
+        lag_list (list): list of lags, each lag as integer
+        column_target (str): target to get correlation, e.g return or mean reaturn
+        posterior_lag (int): for the target, posterior window shift to calculate a window return
+        test_size (int): data size of the test data. The remaining is going to be used as training data. This parameters is ment to avoid overfiting and leackage
+
+        Returns
+        -------
+        None
+        """
+
         results = dict()
         df = self.df.iloc[:-test_size,:][['Date','Close','roll_mean_log_return','log_return',feature]].sort_values('Date').copy()
         for i,lag in enumerate(lag_list):
```
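`find_lag`'s docstring describes a scan over candidate lags, scoring each with both Pearson and Spearman correlation against a (train-only) target. A minimal stand-alone sketch of that scan, with names invented for illustration:

```python
import pandas as pd
from scipy.stats import pearsonr, spearmanr

def lag_correlations(df: pd.DataFrame, feature: str, target: str, lags) -> pd.DataFrame:
    rows = {}
    for lag in lags:
        shifted = df[feature].shift(lag)             # feature observed `lag` steps earlier
        mask = shifted.notna() & df[target].notna()  # drop rows lost to shifting
        rows[lag] = {
            'pearson': pearsonr(shifted[mask], df[target][mask])[0],
            'spearman': spearmanr(shifted[mask], df[target][mask])[0],
        }
    return pd.DataFrame(rows).T  # one row per lag, as in the method's results table
```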
```diff
@@ -413,7 +745,7 @@ class stock_eda_panel(object):
                 'lag':lag,
                 'pearsonr_log_return':r_log[0],
                 'spearman_log_return': sp_log[0],
-            }
+            }
         del df
         results_df = pd.DataFrame(results).T
 
@@ -426,10 +758,24 @@ class stock_eda_panel(object):
             plt.legend()
             plt.axhline(y=0, color='grey', linestyle='--')
             plt.show()
-
-
+
+
     def outlier_plot(self, zlim, plot = False, save_features = False):
-
+
+        """
+        perform outlier analysis of the log returns. It also permors normality test of returns
+
+        Parameters
+        ----------
+        zlim (float): alpha or z thrsholds for normalized returns
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
+
         mean = self.df.log_return.mean()
         std = self.df.log_return.std()
         self.df['z_log_return'] = (self.df.log_return - mean)/std
@@ -451,7 +797,7 @@ class stock_eda_panel(object):
             sigma = self.df['z_log_return'].std()
             x = np.linspace(self.df['z_log_return'].min(),self.df['z_log_return'].max(), 15000)
             y = stats.norm.pdf(x, loc = mu, scale = sigma)
-
+
             fig, axs = plt.subplots(2, 1,figsize=(15,8))
 
             axs[0].hist(self.df['z_log_return'],density = True,bins = 100 , label = 'Returns distribution')
@@ -460,7 +806,7 @@ class stock_eda_panel(object):
             axs[0].axvline(l2, color='green', linestyle='--')
             axs[0].axvline(-l2, color='green', linestyle='--')
             axs[0].plot(x,y, linewidth = 3, color = 'r', label = 'Normal Dist Curve')
-
+
             axs[1].plot(self.df['Date'],self.df['z_log_return'])
             axs[1].plot(self.df['Date'],self.df['low_outlier'], linestyle='--')
             axs[1].plot(self.df['Date'],self.df['up_outlier'], linestyle='--')
@@ -469,18 +815,31 @@ class stock_eda_panel(object):
             plt.show()
 
         z_stat, p_stat = stats.normaltest(self.df['z_log_return'].dropna())
-        p_stat = round(p_stat, 7)
+        p_stat = round(p_stat, 7)
         print('---------------------- returns normality tests ----------------------------')
         if p_stat < 0.05:
             print(f'pvalue: {p_stat} then, returns do not follow a normal distribution')
         else:
             print(f'pvalue: {p_stat} then, returns follow a normal distribution')
-
+
     def analysis_roll_mean_log_returns(self, lags, plot = False):
 
+        """
+        perform analysis of lags of the mean rolling log return
+
+        Parameters
+        ----------
+        lags (int): lags to apply to the roll log return
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.df['lag'] = self.df.roll_mean_log_return.shift(lags)
         self.df['Diff'] = self.df['roll_mean_log_return'] - self.df['lag']
-
+
         if plot:
 
             fig, axs = plt.subplots(1, 3,figsize=(19,4))
@@ -493,7 +852,20 @@ class stock_eda_panel(object):
             plt.show()
 
     def compute_clip_bands(self,feature_name,threshold):
-
+
+        """
+        compute outlier detection for a given signal, Note that this follows mean reversion procedure and feature has to be stationary. Also botton and roof resulting signals is attached to the dataframe
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+        threshold (float): alpha or z thrsholds for normalized returns
+
+        Returns
+        -------
+        None
+        """
+
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/self.df[feature_name].std()
         mean_ = self.df[f'norm_{feature_name}'].mean()
 
```
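`compute_clip_bands` is the shared building block behind most of the signal methods below: z-normalize a stationary feature and flag excursions beyond a symmetric threshold. A minimal sketch of that mean-reversion band logic, mirroring the normalization and `np.where` pattern shown in the diff (function and column names are illustrative):

```python
import numpy as np
import pandas as pd

def clip_band_signals(s: pd.Series, threshold: float = 1.95) -> pd.DataFrame:
    norm = (s - s.mean()) / s.std()  # z-normalize the feature
    return pd.DataFrame({
        'norm': norm,
        'signal_up': np.where(norm > threshold, 1, 0),    # roof breaches
        'signal_low': np.where(norm < -threshold, 1, 0),  # bottom breaches
    })
```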
```diff
@@ -507,25 +879,49 @@ class stock_eda_panel(object):
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
 
     def signal_plotter(self, feature_name):
+
+        """
+        display analysis plot of a feature with high and low signals
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
+
         fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
+
         axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
         axs[0].plot(self.df[f'norm_{feature_name}'])
-
+
         plot_acf(self.df[feature_name].dropna(),lags=25,ax = axs[1])
         axs[1].set_title(f'acf {feature_name}')
-
+
         plot_pacf(self.df[feature_name].dropna(),lags=25,ax = axs[2])
         axs[2].set_title(f'pacf {feature_name}')
-
+
         fig.show()
 
     def log_features_standard(self, feature_name):
+        """
+        save resulting feature names in an standard structure
+
+        Parameters
+        ----------
+        feature_name (str): feature name
+
+        Returns
+        -------
+        None
+        """
         self.features.append(feature_name)
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
-
+
     #######################
     #### to be deprecated ####
     def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
@@ -546,7 +942,7 @@ class stock_eda_panel(object):
 
         self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
         self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
+
         ### ploting purposes
         self.df[f"Roll_mean_{ma1}"] = (
             self.df.sort_values("Date")["Close"]
@@ -556,15 +952,15 @@ class stock_eda_panel(object):
             self.df.sort_values("Date")["Close"]
             .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
         )
-
+
 
         print('--------------------------------------------------------------------')
         if save_features:
             self.features.append('MA_spread')
             self.signals.append('signal_low_MA_spread')
             self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
+            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
+
         if plot:
 
             fig, axs = plt.subplots(1, 3,figsize=(21,4))
@@ -581,9 +977,23 @@ class stock_eda_panel(object):
             axs[2].set_title('acf MA_spread series')
             plt.show()
     ##################################################
-
+
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
-
+        """
+        perform relative moving average features, one for short term and another for long/mid term
+
+        Parameters
+        ----------
+        ma1 (int): short term moving average window
+        ma2 (int): long/mid term moving average window
+        threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'rel_MA_spread'
 
         self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
@@ -608,13 +1018,26 @@ class stock_eda_panel(object):
         print('--------------------------------------------------------------------')
         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
+            self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
 
         if plot:
 
             self.signal_plotter(feature_name)
-
+
     def pair_feature(self, pair_symbol, plot = False):
+        """
+        initialize pair feature data extraction and analysis
+
+        Parameters
+        ----------
+        pair_symbol (str): symbol of the pair asset to extract
+        plot (boolean): True to display plot
+
+        Returns
+        -------
+        None
+        """
+
         self.pair_symbol = pair_symbol
         begin_date = self.today - relativedelta(days = self.n_days)
         begin_date_str = begin_date.strftime('%Y-%m-%d')
@@ -627,7 +1050,7 @@ class stock_eda_panel(object):
         df['Date'] = pd.to_datetime(df['Date'])
         df = df[df.Date >= begin_date_str ]
         self.pair_df = df
-
+
         #### converting the same index ####
         dates_vector = self.df.Date.to_frame()
         self.pair_df = dates_vector.merge(self.pair_df, on ='Date',how = 'left')
@@ -653,8 +1076,22 @@ class stock_eda_panel(object):
             plt.plot(self.df['Date'],asset_2_values,label = asset_2)
             plt.legend()
             plt.show()
-
+
     def calculate_cointegration(self,series_1, series_2):
+        """
+        calculate cointegration score for two time series
+
+        Parameters
+        ----------
+        series_1 (pd.series): time series
+        series_2 (pd.series): time series
+
+        Returns
+        -------
+        coint_flag (boolean): 1 if the p_value cointegration_t are lower than 0.05 and critical value
+        hedge_value (float): beta from the regression model
+        """
+
         coint_flag = 0
         coint_res = coint(series_1, series_2)
         coint_t = coint_res[0]
```
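`calculate_cointegration` wraps an Engle-Granger style check: statsmodels' `coint` test plus an OLS hedge ratio. A stand-alone sketch under stated assumptions (the input series here are synthetic random walks, and comparing against the 5% critical value is an assumption; the hunk does not show which one the package uses):

```python
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint

rng = np.random.default_rng(0)
s1 = np.cumsum(rng.normal(size=500))            # synthetic random walk
s2 = 0.8 * s1 + rng.normal(scale=0.5, size=500)  # cointegrated companion

t_stat, p_value, critical_values = coint(s1, s2)
hedge = sm.OLS(s1, s2).fit().params[0]  # regression beta used as hedge ratio
flag = 1 if p_value < 0.05 and t_stat < critical_values[1] else 0  # assumed 5% column
print(flag, round(hedge, 3))
```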
```diff
@@ -666,9 +1103,22 @@ class stock_eda_panel(object):
         coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0
 
         return coint_flag, hedge_value
-
-    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
 
+    def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
+        """
+        display analysis of the pair feature and save results in case if needed
+
+        Parameters
+        ----------
+        window (int): window to apply to the rolling spread between pair and main asset
+        z_threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         spread_series = pd.Series(self.df.pair_spread)
         mean = spread_series.rolling(center = False, window = window).mean()
         std = spread_series.rolling(center = False, window = window).std()
```
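The pair signal normalizes the spread by a rolling mean and standard deviation rather than the global ones. A minimal sketch of that rolling z-score, assuming the spread series already exists (e.g. `price_a - hedge * price_b`) and that the z-score is computed as `(spread - mean) / std`, which the hunk implies but does not show in full:

```python
import numpy as np
import pandas as pd

def rolling_pair_z(spread: pd.Series, window: int, z_threshold: float) -> pd.DataFrame:
    mean = spread.rolling(window).mean()
    std = spread.rolling(window).std()
    z = (spread - mean) / std  # assumed form of the rolling z-score
    return pd.DataFrame({
        'pair_z_score': z,
        'signal_low': np.where(z < -z_threshold, 1, 0),
        'signal_up': np.where(z > z_threshold, 1, 0),
    })
```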
```diff
@@ -677,11 +1127,11 @@ class stock_eda_panel(object):
         self.df['pair_z_score'] = z_score
         self.df['signal_low_pair_z_score'] = np.where(self.df['pair_z_score'] < -z_threshold, 1, 0)
         self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
-
+
         if save_features:
             self.log_features_standard('pair_z_score')
-            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
-
+            self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
+
         if plot:
             pvalue = round(adfuller(z_score.dropna().values)[1],4)
             print(f'p value of the rolling z-score is {pvalue}')
@@ -695,7 +1145,7 @@ class stock_eda_panel(object):
             axs[0,0].axhline(y=0, color='blue', linestyle='-.')
             axs[0,0].plot(self.df.pair_z_score)
             axs[0,0].set_title('z score from the spread')
-
+
             axs[0,1].plot(self.df['Date'],self.df['pair_spread'])
             axs[0,1].plot(self.df['Date'],np.where(self.df['signal_low_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'red')
             axs[0,1].plot(self.df['Date'],np.where(self.df['signal_up_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'green')
@@ -704,10 +1154,10 @@ class stock_eda_panel(object):
 
             plot_acf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,0])
             axs[1,0].set_title('acf pair_z_score')
-
+
             plot_pacf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,1])
             axs[1,1].set_title('pacf pair_z_score')
-
+
             plt.show()
 
     #######################
@@ -725,13 +1175,13 @@ class stock_eda_panel(object):
 
         self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
         self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
+
         if save_features:
             self.features.append('roll_pos_counting')
             self.signals.append('signal_up_roll_pos_counting')
             self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
+            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
+
         if plot:
             fig = plt.figure(figsize = (10,4))
             plt.plot(self.df['Date'],self.df.norm_counting)
@@ -739,9 +1189,22 @@ class stock_eda_panel(object):
             plt.axhline(y=-threshold, color='grey', linestyle='--')
             plt.show()
     #######################
-
+
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
+        """
+        perform negative and positive return counting in a given rolling time window
+
+        Parameters
+        ----------
+        rolling_window (int): window to apply to positive and negative returns
+        threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'bidirect_counting'
         # negative countiing and rolling countingng
         self.df['RetClose'] = self.df['Close'].pct_change()
@@ -757,7 +1220,7 @@ class stock_eda_panel(object):
 
         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
+            self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
 
         if plot:
             fig = plt.figure(figsize = (10,4))
@@ -783,12 +1246,12 @@ class stock_eda_panel(object):
 
         self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
         self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
+
         if save_features:
             self.features.append('dist_range')
             self.signals.append('signal_up_dist_range')
             self.signals.append('signal_low_dist_range')
-            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
+            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
 
         if plot:
             fig, axs = plt.subplots(2, 2,figsize=(17,11))
@@ -804,9 +1267,22 @@ class stock_eda_panel(object):
             axs[1,0].plot(self.df['norm_dist_range'])
             axs[1,0].set_title('norm_dist_range')
     #######################
-
+
     def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
-
+        """
+        perform relative spread of opening and closing price
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'CO_Range'
         self.df[feature_name] = self.df["Close"] / self.df["Open"]-1
         self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/ self.df[feature_name].std()
@@ -822,7 +1298,7 @@ class stock_eda_panel(object):
 
         if save_features:
             self.log_features_standard(feature_name)
-            self.settings_relative_price_range = {'window':window, 'threshold':threshold}
+            self.settings_relative_price_range = {'window':window, 'threshold':threshold}
 
         if plot:
             fig, axs = plt.subplots(1, 2,figsize=(14,5))
@@ -840,7 +1316,7 @@ class stock_eda_panel(object):
     def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
 
        rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df['RSI'] = rsi
+        self.df['RSI'] = rsi
         self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
 
         mean = self.df['RSI_ret'].mean()
@@ -870,8 +1346,22 @@ class stock_eda_panel(object):
 
             fig.show()
     #######################
-
+
     def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
+        """
+        perform relative strength index
+
+        Parameters
+        ----------
+        window (int): window to apply to the feature
+        threshold (float): alpha or z thrsholds for the normalized feature
+        plot (boolean): True to display plot
+        save_features (boolean): True to save feature configuration and feature names
+
+        Returns
+        -------
+        None
+        """
         feature_name = 'RSI'
         rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
         self.df[feature_name] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
```
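`rsi_feature_improved` builds on the `ta` package's `RSIIndicator`, as the diff shows, then sanitizes the series before clip-banding it. A minimal sketch of that construction (the `ffill()` call replaces the deprecated `fillna(method='ffill')` spelling used in the diff; behavior is the same):

```python
import numpy as np
import pandas as pd
from ta.momentum import RSIIndicator

def rsi_feature(close: pd.Series, window: int = 14) -> pd.Series:
    rsi = RSIIndicator(close=close, window=window).rsi()
    # replace infinities and forward-fill gaps, mirroring the diff's cleanup
    return rsi.replace([np.inf, -np.inf], 0).ffill()
```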
@@ -883,7 +1373,7 @@ class stock_eda_panel(object):
|
|
|
883
1373
|
|
|
884
1374
|
if plot:
|
|
885
1375
|
self.signal_plotter(feature_name)
|
|
886
|
-
|
|
1376
|
+
|
|
887
1377
|
#######################
|
|
888
1378
|
#### to be deprecated ####
|
|
889
1379
|
def days_features(self, window_day, limit, plot = False, save_features = False):
|
|
@@ -916,7 +1406,7 @@ class stock_eda_panel(object):
|
|
|
916
1406
|
if plot:
|
|
917
1407
|
fig, axs = plt.subplots(1, 3,figsize=(17,5))
|
|
918
1408
|
|
|
919
|
-
axs[0].plot(self.df['norm_dow_input'])
|
|
1409
|
+
axs[0].plot(self.df['norm_dow_input'])
|
|
920
1410
|
axs[0].plot(self.df['up_dow_input'], linestyle='--')
|
|
921
1411
|
axs[0].plot(self.df['low_dow_input'], linestyle='--')
|
|
922
1412
|
|
|
@@ -928,9 +1418,22 @@ class stock_eda_panel(object):
|
|
|
928
1418
|
|
|
929
1419
|
fig.show()
|
|
930
1420
|
#######################
|
|
931
|
-
|
|
932
|
-
def days_features_bands(self, window, threshold, plot = False, save_features = False):
|
|
933
1421
|
|
|
1422
|
+
def days_features_bands(self, window, threshold, plot = False, save_features = False):
|
|
1423
|
+
"""
|
|
1424
|
+
compute mean returns for a given day of the week in a window scope per day
|
|
1425
|
+
|
|
1426
|
+
Parameters
|
|
1427
|
+
----------
|
|
1428
|
+
window (int): window to apply to the feature
|
|
1429
|
+
threshold (float): alpha or z thrsholds for the normalized feature
|
|
1430
|
+
plot (boolean): True to display plot
|
|
1431
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1432
|
+
|
|
1433
|
+
Returns
|
|
1434
|
+
-------
|
|
1435
|
+
None
|
|
1436
|
+
"""
|
|
934
1437
|
self.df['dow'] = self.df.Date.dt.dayofweek
|
|
935
1438
|
self.df['dow'] = self.df['dow'].astype('str')
|
|
936
1439
|
|
|
@@ -947,11 +1450,11 @@ class stock_eda_panel(object):
|
|
|
947
1450
|
|
|
948
1451
|
if plot:
|
|
949
1452
|
self.signal_plotter(feature_name)
|
|
950
|
-
|
|
1453
|
+
|
|
951
1454
|
#######################
|
|
952
1455
|
#### to be deprecated ####
|
|
953
1456
|
def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
|
|
954
|
-
|
|
1457
|
+
|
|
955
1458
|
self.df['log_Volume'] = np.log(self.df['Volume'])
|
|
956
1459
|
self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
|
|
957
1460
|
|
|
@@ -1003,9 +1506,22 @@ class stock_eda_panel(object):
|
|
|
1003
1506
|
|
|
1004
1507
|
plt.show()
|
|
1005
1508
|
#######################
|
|
1006
|
-
|
|
1509
|
+
|
|
1007
1510
|
def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
|
|
1008
|
-
|
|
1511
|
+
"""
|
|
1512
|
+
compute feature of thrading volumes
|
|
1513
|
+
|
|
1514
|
+
Parameters
|
|
1515
|
+
----------
|
|
1516
|
+
window (int): window to apply to the feature
|
|
1517
|
+
threshold (float): alpha or z thrsholds for the normalized feature
|
|
1518
|
+
plot (boolean): True to display plot
|
|
1519
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1520
|
+
|
|
1521
|
+
Returns
|
|
1522
|
+
-------
|
|
1523
|
+
None
|
|
1524
|
+
"""
|
|
1009
1525
|
feature_name = 'smooth_Volume'
|
|
1010
1526
|
self.df[feature_name] = np.log(self.df['Volume'])
|
|
1011
1527
|
# self.df[feature_name] = self.df['log_Volume'].rolling(window).mean()
|
|
@@ -1039,7 +1555,7 @@ class stock_eda_panel(object):
|
|
|
1039
1555
|
|
|
1040
1556
|
fig, axs = plt.subplots(1,2,figsize=(10,4))
|
|
1041
1557
|
|
|
1042
|
-
axs[0].plot(self.df[f'{feature_name}'])
|
|
1558
|
+
axs[0].plot(self.df[f'{feature_name}'])
|
|
1043
1559
|
axs[0].set_title(f'{feature_name}')
|
|
1044
1560
|
|
|
1045
1561
|
axs[1].plot(self.df[f'z_{feature_name}'], linestyle='--')
|
|
@@ -1048,6 +1564,20 @@ class stock_eda_panel(object):
|
|
|
1048
1564
|
plt.show()
|
|
1049
1565
|
|
|
1050
1566
|
def roc_feature(self, window, threshold, plot = False, save_features = False):
|
|
1567
|
+
"""
|
|
1568
|
+
perform price rate of change
|
|
1569
|
+
|
|
1570
|
+
Parameters
|
|
1571
|
+
----------
|
|
1572
|
+
window (int): window to apply to the feature
|
|
1573
|
+
threshold (float): alpha or z thrsholds for the normalized feature
|
|
1574
|
+
plot (boolean): True to display plot
|
|
1575
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1576
|
+
|
|
1577
|
+
Returns
|
|
1578
|
+
-------
|
|
1579
|
+
None
|
|
1580
|
+
"""
|
|
1051
1581
|
feature_name = 'ROC'
|
|
1052
1582
|
roc = ROCIndicator(close = self.df['Close'], window = window).roc()
|
|
1053
1583
|
self.df[feature_name] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
@@ -1058,8 +1588,24 @@ class stock_eda_panel(object):
|
|
|
1058
1588
|
self.settings_roc_feature = {'window':window, 'threshold':threshold}
|
|
1059
1589
|
if plot:
|
|
1060
1590
|
self.signal_plotter(feature_name)
|
|
1061
|
-
|
|
1591
|
+
|
|
1062
1592
|
def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
|
|
1593
|
+
"""
|
|
1594
|
+
perform stochastic oscilator RSI feature
|
|
1595
|
+
|
|
1596
|
+
Parameters
|
|
1597
|
+
----------
|
|
1598
|
+
window (int): window to apply to the feature
|
|
1599
|
+
smooth1 (int): smoothing parameter 1
|
|
1600
|
+
smooth2 (int): smoothing parameter 2
|
|
1601
|
+
threshold (float): alpha or z thrsholds for the normalized feature
|
|
1602
|
+
plot (boolean): True to display plot
|
|
1603
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1604
|
+
|
|
1605
|
+
Returns
|
|
1606
|
+
-------
|
|
1607
|
+
None
|
|
1608
|
+
"""
|
|
1063
1609
|
feature_name = 'STOCH'
|
|
1064
1610
|
stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
|
|
1065
1611
|
self.df[feature_name] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
@@ -1072,6 +1618,21 @@ class stock_eda_panel(object):
|
|
|
1072
1618
|
self.signal_plotter(feature_name)
|
|
1073
1619
|
|
|
1074
1620
|
def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
|
|
1621
|
+
"""
|
|
1622
|
+
perform stochastic oscillator feature
|
|
1623
|
+
|
|
1624
|
+
Parameters
|
|
1625
|
+
----------
|
|
1626
|
+
window (int): window to apply to the feature
|
|
1627
|
+
smooth (int): smoothing parameter
|
|
1628
|
+
threshold (float): alpha or z threshold for the normalized feature
|
|
1629
|
+
plot (boolean): True to display plot
|
|
1630
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1631
|
+
|
|
1632
|
+
Returns
|
|
1633
|
+
-------
|
|
1634
|
+
None
|
|
1635
|
+
"""
|
|
1075
1636
|
feature_name = 'STOCHOSC'
|
|
1076
1637
|
stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
|
|
1077
1638
|
self.df[feature_name] = stochast.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
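Unlike the stochastic RSI, the classic oscillator also needs high/low series; a sketch with synthetic bands around the close:

    import pandas as pd
    from ta.momentum import StochasticOscillator

    close = pd.Series([100.0 + i + (i % 5) for i in range(40)])  # hypothetical closes
    stoch = StochasticOscillator(high=close + 1, low=close - 1, close=close,
                                 window=14, smooth_window=3).stoch()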
@@ -1084,8 +1645,22 @@ class stock_eda_panel(object):
|
|
|
1084
1645
|
self.signal_plotter(feature_name)
|
|
1085
1646
|
|
|
1086
1647
|
def william_feature(self, lbp, threshold, plot = False, save_features = False):
|
|
1648
|
+
"""
|
|
1649
|
+
perform fast stochastic oscillator (Williams %R) indicator
|
|
1650
|
+
|
|
1651
|
+
Parameters
|
|
1652
|
+
----------
|
|
1653
|
+
lbp (int): look-back period
|
|
1654
|
+
threshold (float): alpha or z threshold for the normalized feature
|
|
1655
|
+
plot (boolean): True to display plot
|
|
1656
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1657
|
+
|
|
1658
|
+
Returns
|
|
1659
|
+
-------
|
|
1660
|
+
None
|
|
1661
|
+
"""
|
|
1087
1662
|
feature_name = 'WILL'
|
|
1088
|
-
will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
|
|
1663
|
+
will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
|
|
1089
1664
|
self.df[feature_name] = will.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
1090
1665
|
self.compute_clip_bands(feature_name,threshold)
|
|
1091
1666
|
|
|
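A standalone sketch of the Williams %R call (synthetic high/low bands; lbp is the look-back period):

    import pandas as pd
    from ta.momentum import WilliamsRIndicator

    close = pd.Series([100.0 + i + (i % 5) for i in range(40)])  # hypothetical closes
    will = WilliamsRIndicator(high=close + 1, low=close - 1, close=close,
                              lbp=14).williams_r()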
@@ -1096,6 +1671,20 @@ class stock_eda_panel(object):
|
|
|
1096
1671
|
self.signal_plotter(feature_name)
|
|
1097
1672
|
|
|
1098
1673
|
def vortex_feature(self, window, threshold, plot = False, save_features = False):
|
|
1674
|
+
"""
|
|
1675
|
+
perform vortex oscillator
|
|
1676
|
+
|
|
1677
|
+
Parameters
|
|
1678
|
+
----------
|
|
1679
|
+
window (int): window to apply to the feature
|
|
1680
|
+
threshold (float): alpha or z threshold for the normalized feature
|
|
1681
|
+
plot (boolean): True to display plot
|
|
1682
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1683
|
+
|
|
1684
|
+
Returns
|
|
1685
|
+
-------
|
|
1686
|
+
None
|
|
1687
|
+
"""
|
|
1099
1688
|
feature_name = 'VORTEX'
|
|
1100
1689
|
vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
|
|
1101
1690
|
self.df[feature_name] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
|
|
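The vortex indicator lives in ta.trend rather than ta.momentum; the method above takes the positive/negative difference:

    import pandas as pd
    from ta.trend import VortexIndicator

    close = pd.Series([100.0 + i + (i % 5) for i in range(40)])  # hypothetical closes
    vortex = VortexIndicator(high=close + 1, low=close - 1, close=close,
                             window=14).vortex_indicator_diff()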
@@ -1108,10 +1697,26 @@ class stock_eda_panel(object):
|
|
|
1108
1697
|
self.signal_plotter(feature_name)
|
|
1109
1698
|
|
|
1110
1699
|
def pair_index_feature(self, pair_symbol, feature_label, window, threshold, plot = False, save_features = False):
|
|
1700
|
+
"""
|
|
1701
|
+
compute the ROC feature of an additional asset; the result is added as a new feature in the main dataframe
|
|
1702
|
+
|
|
1703
|
+
Parameters
|
|
1704
|
+
----------
|
|
1705
|
+
pair_symbol (str): symbol of the asset to extract the data
|
|
1706
|
+
feature_label (str): name of the resulting feature
|
|
1707
|
+
window (int): window to apply to the feature
|
|
1708
|
+
threshold (float): alpha or z threshold for the normalized feature
|
|
1709
|
+
plot (boolean): True to display plot
|
|
1710
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1711
|
+
|
|
1712
|
+
Returns
|
|
1713
|
+
-------
|
|
1714
|
+
None
|
|
1715
|
+
"""
|
|
1111
1716
|
self.pair_index = pair_symbol
|
|
1112
1717
|
begin_date = self.today - relativedelta(days = self.n_days)
|
|
1113
1718
|
begin_date_str = begin_date.strftime('%Y-%m-%d')
|
|
1114
|
-
|
|
1719
|
+
|
|
1115
1720
|
if feature_label in self.df.columns:
|
|
1116
1721
|
self.df = self.df.drop(columns = [feature_label])
|
|
1117
1722
|
|
|
@@ -1123,13 +1728,13 @@ class stock_eda_panel(object):
|
|
|
1123
1728
|
df['Date'] = pd.to_datetime(df['Date'])
|
|
1124
1729
|
df = df[df.Date >= begin_date_str ]
|
|
1125
1730
|
self.pair_index_df = df
|
|
1126
|
-
|
|
1731
|
+
|
|
1127
1732
|
#### converting the same index ####
|
|
1128
1733
|
dates_vector = self.df.Date.to_frame()
|
|
1129
1734
|
self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
|
|
1130
1735
|
self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
|
|
1131
1736
|
self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
|
|
1132
|
-
|
|
1737
|
+
|
|
1133
1738
|
self.pair_index_df[feature_label] = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
|
|
1134
1739
|
df_to_merge = self.pair_index_df[['Date',feature_label]]
|
|
1135
1740
|
self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
|
|
@@ -1140,7 +1745,7 @@ class stock_eda_panel(object):
|
|
|
1140
1745
|
if save_features:
|
|
1141
1746
|
self.log_features_standard(feature_label)
|
|
1142
1747
|
parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
|
|
1143
|
-
try:
|
|
1748
|
+
try:
|
|
1144
1749
|
len(self.settings_pair_index_feature)
|
|
1145
1750
|
print('existing')
|
|
1146
1751
|
self.settings_pair_index_feature.append(parameters)
|
|
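A condensed, runnable sketch of the date alignment performed above: the paired asset is reindexed onto the main asset's calendar, gap-filled, and its ROC merged back (frames and window hypothetical):

    import pandas as pd
    from ta.momentum import ROCIndicator

    dates = pd.date_range('2024-01-01', periods=30)
    main_df = pd.DataFrame({'Date': dates, 'Close': pd.Series(range(100, 130), dtype='float64')})
    pair_df = pd.DataFrame({'Date': dates[::2], 'Close': pd.Series(range(50, 65), dtype='float64')})

    aligned = main_df[['Date']].merge(pair_df, on='Date', how='left')
    aligned = aligned.fillna(method='bfill').fillna(method='ffill')
    aligned['pair_roc'] = ROCIndicator(close=aligned['Close'], window=5).roc()
    main_df = main_df.merge(aligned[['Date', 'pair_roc']], on='Date', how='left')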
@@ -1153,10 +1758,21 @@ class stock_eda_panel(object):
|
|
|
1153
1758
|
self.signal_plotter(feature_label)
|
|
1154
1759
|
|
|
1155
1760
|
def produce_order_features(self, feature_name, save_features = False):
|
|
1761
|
+
"""
|
|
1762
|
+
compute a feature that captures high and low values in an index. this is useful for knowing the duration/persistence of a signal
|
|
1156
1763
|
|
|
1764
|
+
Parameters
|
|
1765
|
+
----------
|
|
1766
|
+
feature_name (str): name of the feature
|
|
1767
|
+
save_features (boolean): True to save feature configuration and feature names
|
|
1768
|
+
|
|
1769
|
+
Returns
|
|
1770
|
+
-------
|
|
1771
|
+
None
|
|
1772
|
+
"""
|
|
1157
1773
|
signal_feature_name = f'discrete_signal_{feature_name}'
|
|
1158
1774
|
order_feature_name = f'order_signal_{feature_name}'
|
|
1159
|
-
|
|
1775
|
+
|
|
1160
1776
|
self.df[signal_feature_name] = np.where(
|
|
1161
1777
|
self.df[f'signal_up_{feature_name}'] == 1,1,
|
|
1162
1778
|
np.where(
|
|
@@ -1173,14 +1789,24 @@ class stock_eda_panel(object):
|
|
|
1173
1789
|
self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1174
1790
|
self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
|
|
1175
1791
|
self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
|
|
1176
|
-
|
|
1792
|
+
|
|
1177
1793
|
## saving features
|
|
1178
1794
|
if save_features:
|
|
1179
1795
|
self.signals.append(signal_feature_name)
|
|
1180
1796
|
self.signals.append(order_feature_name)
|
|
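A hedged standalone sketch of the chain-ordering idea (column names simplified; the source derives chain_id via break rows plus forward fill, and a cumsum over the breaks is an equivalent shortcut here):

    import pandas as pd

    df = pd.DataFrame({'Date': pd.date_range('2024-01-01', periods=8),
                       'signal': [0, 1, 1, 1, 0, 0, 1, 1]})
    df['break'] = (df['signal'] != df['signal'].shift(1)).astype(int)
    df['chain_id'] = df['break'].cumsum()                        # one id per run
    df['order'] = df.groupby('chain_id')['Date'].rank(method='first')
    df['order'] = df['order'] * df['signal']                     # zero outside signals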
1181
|
-
|
|
1797
|
+
|
|
1182
1798
|
def create_hmm_derived_features(self, lag_returns):
|
|
1799
|
+
"""
|
|
1800
|
+
create features derived from the hmm state feature: the index of the state, the duration of the state, and the chain return
|
|
1801
|
+
|
|
1802
|
+
Parameters
|
|
1803
|
+
----------
|
|
1804
|
+
lag_returns (int): lag parameter (not used)
|
|
1183
1805
|
|
|
1806
|
+
Returns
|
|
1807
|
+
-------
|
|
1808
|
+
None
|
|
1809
|
+
"""
|
|
1184
1810
|
self.df = self.df.sort_values('Date')
|
|
1185
1811
|
## indexing chains
|
|
1186
1812
|
self.df['lag_hmm_feature'] = self.df['hmm_feature'].shift(1)
|
|
@@ -1189,7 +1815,7 @@ class stock_eda_panel(object):
|
|
|
1189
1815
|
self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
|
|
1190
1816
|
self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
|
|
1191
1817
|
self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1192
|
-
|
|
1818
|
+
|
|
1193
1819
|
### returns using the first element in a chain
|
|
1194
1820
|
self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
|
|
1195
1821
|
self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
|
|
@@ -1198,8 +1824,26 @@ class stock_eda_panel(object):
|
|
|
1198
1824
|
self.df = self.df.drop(columns = ['breack','first'])
|
|
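The chain return anchors each state's run to its first close; in isolation:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'Close': [100.0, 102.0, 101.0, 105.0],
                       'hmm_chain_order': [1, 2, 1, 2]})
    df['first'] = np.where(df['hmm_chain_order'] == 1, df['Close'], np.nan)
    df['first'] = df['first'].fillna(method='ffill')
    df['chain_return'] = (df['Close'] / df['first'] - 1) * 100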
1199
1825
|
|
|
1200
1826
|
def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
|
|
1827
|
+
"""
|
|
1828
|
+
create or use an hmm model
|
|
1829
|
+
|
|
1830
|
+
Parameters
|
|
1831
|
+
----------
|
|
1832
|
+
n_clusters (int): number of clusters or states to calculate
|
|
1833
|
+
features_hmm (list): features to be considered in hmm model when training
|
|
1834
|
+
test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
|
|
1835
|
+
seed (int): seed for the model initialization
|
|
1836
|
+
lag_returns_state (int) : lags for returns of the state
|
|
1837
|
+
plot (boolean): True to display hmm states analysis
|
|
1838
|
+
save_features (boolean): True to save features and configurations
|
|
1839
|
+
model (obj): if provided, no model will be trained and the provided model will be used to get hmm features
|
|
1840
|
+
|
|
1841
|
+
Returns
|
|
1842
|
+
-------
|
|
1843
|
+
None
|
|
1844
|
+
"""
|
|
1201
1845
|
if not model:
|
|
1202
|
-
|
|
1846
|
+
|
|
1203
1847
|
df_new = self.df
|
|
1204
1848
|
pipeline_hmm = Pipeline([
|
|
1205
1849
|
('selector', FeatureSelector(columns=features_hmm)),
|
|
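For reference, a sketch of the full pipeline this method assembles; the imputer and hmm steps are taken from the train_model method of hmm_feature_selector later in this diff, and the imports (feature_engine, hmmlearn) are an assumption consistent with those calls:

    from sklearn.pipeline import Pipeline
    from feature_engine.imputation import MeanMedianImputer
    from hmmlearn.hmm import GaussianHMM

    features_hmm = ['ROC', 'VORTEX']  # hypothetical feature subset
    pipeline_hmm = Pipeline([
        ('selector', FeatureSelector(columns=features_hmm)),  # module's own selector
        ('fillna', MeanMedianImputer(imputation_method='median', variables=features_hmm)),
        ('hmm', GaussianHMM(n_components=4, covariance_type='full')),
    ])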
@@ -1213,7 +1857,7 @@ class stock_eda_panel(object):
|
|
|
1213
1857
|
|
|
1214
1858
|
self.model_hmm = pipeline_hmm
|
|
1215
1859
|
self.test_data_hmm = data_test
|
|
1216
|
-
|
|
1860
|
+
|
|
1217
1861
|
### first feature: the hidden state
|
|
1218
1862
|
self.df['hmm_feature'] = self.model_hmm.predict(self.df)
|
|
1219
1863
|
self.create_hmm_derived_features(lag_returns = lag_returns_state)
|
|
@@ -1230,11 +1874,11 @@ class stock_eda_panel(object):
|
|
|
1230
1874
|
hidden_states = pipeline_hmm.predict(data_test)
|
|
1231
1875
|
data_test['HMM'] = hidden_states
|
|
1232
1876
|
data_test['HMM_state'] = data_test['HMM'].map(map_)
|
|
1233
|
-
|
|
1877
|
+
|
|
1234
1878
|
if model:
|
|
1235
1879
|
self.df['hmm_feature'] = model.predict(self.df)
|
|
1236
1880
|
self.create_hmm_derived_features(lag_returns = lag_returns_state)
|
|
1237
|
-
|
|
1881
|
+
|
|
1238
1882
|
if save_features:
|
|
1239
1883
|
self.features.append('hmm_feature')
|
|
1240
1884
|
self.features.append('hmm_chain_order')
|
|
@@ -1263,14 +1907,38 @@ class stock_eda_panel(object):
|
|
|
1263
1907
|
fig.show()
|
|
1264
1908
|
|
|
1265
1909
|
def sharpe_ratio(self, return_series, n_trad_days = 255, rf = 0.01):
|
|
1910
|
+
"""
|
|
1911
|
+
compute the sharpe ratio of a given return time series
|
|
1912
|
+
|
|
1913
|
+
Parameters
|
|
1914
|
+
----------
|
|
1915
|
+
return_series (pd.Series): time series of the returns
|
|
1916
|
+
n_trad_days (int): trading days used to annualize returns
|
|
1917
|
+
rf (float): annual risk-free rate
|
|
1918
|
+
|
|
1919
|
+
Returns
|
|
1920
|
+
-------
|
|
1921
|
+
sharpe_ratio (float): sharpe ratio
|
|
1922
|
+
"""
|
|
1266
1923
|
nsqrt = np.sqrt(n_trad_days)
|
|
1267
1924
|
mean = return_series.mean() * n_trad_days
|
|
1268
1925
|
sigma = return_series.std() * nsqrt
|
|
1269
1926
|
sharpe_ratio = round((mean-rf)/sigma,2)
|
|
1270
1927
|
return sharpe_ratio
|
|
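A worked standalone version of the annualization above (255 trading days, 1% risk-free rate, hypothetical daily returns):

    import numpy as np
    import pandas as pd

    returns = pd.Series([0.001, -0.002, 0.0015, 0.003])
    n_trad_days, rf = 255, 0.01
    mean = returns.mean() * n_trad_days            # annualized mean return
    sigma = returns.std() * np.sqrt(n_trad_days)   # annualized volatility
    sharpe = round((mean - rf) / sigma, 2)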
1271
|
-
|
|
1928
|
+
|
|
1272
1929
|
def treat_signal_strategy(self,test_data, strategy):
|
|
1273
|
-
|
|
1930
|
+
"""
|
|
1931
|
+
helper method that treats signals and converts them to 1 or 0
|
|
1932
|
+
|
|
1933
|
+
Parameters
|
|
1934
|
+
----------
|
|
1935
|
+
test_data (pd.DataFrame): test data
|
|
1936
|
+
strategy (list): features to get the strategy
|
|
1937
|
+
|
|
1938
|
+
Returns
|
|
1939
|
+
-------
|
|
1940
|
+
test_data (pd.DataFrame): test data with extra columns that are the strategy (main_signal)
|
|
1941
|
+
"""
|
|
1274
1942
|
hmm_states_list = [x for x in strategy if 'hmm_state_' in x]
|
|
1275
1943
|
other_features = [x for x in strategy if x not in hmm_states_list]
|
|
1276
1944
|
|
|
@@ -1299,10 +1967,21 @@ class stock_eda_panel(object):
|
|
|
1299
1967
|
elif len(hmm_states_list) == 0 and len(other_features) > 0:
|
|
1300
1968
|
test_data['main_signal'] = np.where((test_data['features_signal'] == 1) & (test_data['hmm_signal'] == 0),1,0)
|
|
1301
1969
|
|
|
1302
|
-
return test_data
|
|
1970
|
+
return test_data
|
|
1303
1971
|
|
|
1304
1972
|
def stategy_simulator(self, features, hmm_feature = True):
|
|
1973
|
+
"""
|
|
1974
|
+
execute strategies and compute performance metrics such as sharpe ratio and return. This method creates several new attributes
|
|
1305
1975
|
|
|
1976
|
+
Parameters
|
|
1977
|
+
----------
|
|
1978
|
+
features (list): list of features to be tested as strategies
|
|
1979
|
+
hmm_feature (boolean): include hmm feature
|
|
1980
|
+
|
|
1981
|
+
Returns
|
|
1982
|
+
-------
|
|
1983
|
+
None
|
|
1984
|
+
"""
|
|
1306
1985
|
columns_ = ['Date', 'Close','Open'] + features + ['HMM']
|
|
1307
1986
|
states = list(self.df.hmm_feature.unique())
|
|
1308
1987
|
states.sort()
|
|
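A condensed sketch of the log-return bookkeeping this simulator family uses (the same pattern appears in create_backtest_signal below):

    import numpy as np
    import pandas as pd

    close = pd.Series([100.0, 101.0, 103.0, 102.0])
    flag = pd.Series([1, 1, 0, 1])                     # in/out of the market
    lrets = np.log(close.shift(-1) / close) * flag     # next-day log return when in
    strat_curve = np.exp(lrets.fillna(0.0).cumsum()) - 1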
@@ -1372,8 +2051,19 @@ class stock_eda_panel(object):
|
|
|
1372
2051
|
self.strategy_log = df_returns_log
|
|
1373
2052
|
self.best_strategy = df_returns_log.iloc[0,:].strategy
|
|
1374
2053
|
self.top_10_strategy = list(df_returns_log.iloc[0:10,:].strategy.values)
|
|
1375
|
-
|
|
2054
|
+
|
|
1376
2055
|
def viz_strategy(self, strategy):
|
|
2056
|
+
"""
|
|
2057
|
+
display analysis plot of a given strategy
|
|
2058
|
+
|
|
2059
|
+
Parameters
|
|
2060
|
+
----------
|
|
2061
|
+
strategy (list): list of features of the strategy
|
|
2062
|
+
|
|
2063
|
+
Returns
|
|
2064
|
+
-------
|
|
2065
|
+
None
|
|
2066
|
+
"""
|
|
1377
2067
|
test_data = self.test_data_strategy
|
|
1378
2068
|
|
|
1379
2069
|
test_data = self.treat_signal_strategy(test_data, strategy)
|
|
@@ -1408,7 +2098,7 @@ class stock_eda_panel(object):
|
|
|
1408
2098
|
|
|
1409
2099
|
### deprecated ############################
|
|
1410
2100
|
def create_strategy(self, favourable_states):
|
|
1411
|
-
|
|
2101
|
+
|
|
1412
2102
|
test_data = self.test_data_hmm
|
|
1413
2103
|
# add MA signal
|
|
1414
2104
|
test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
|
|
@@ -1452,16 +2142,27 @@ class stock_eda_panel(object):
|
|
|
1452
2142
|
plt.plot(test_data['strat_prod_exp'])
|
|
1453
2143
|
self.settings_hmm_states = {'favourable_states':favourable_states}
|
|
1454
2144
|
################################################
|
|
1455
|
-
|
|
2145
|
+
|
|
1456
2146
|
def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
|
|
1457
|
-
|
|
2147
|
+
"""
|
|
2148
|
+
display analysis plots for the hmm model
|
|
2149
|
+
|
|
2150
|
+
Parameters
|
|
2151
|
+
----------
|
|
2152
|
+
test_data_size (int): test data size, the remaining is the train data
|
|
2153
|
+
split (str): options (train or test). Split type to assess
|
|
2154
|
+
|
|
2155
|
+
Returns
|
|
2156
|
+
-------
|
|
2157
|
+
None
|
|
2158
|
+
"""
|
|
1458
2159
|
if split == 'train':
|
|
1459
2160
|
df = self.df.iloc[:-test_data_size,:]
|
|
1460
2161
|
elif split == 'test':
|
|
1461
2162
|
df = self.df.iloc[-test_data_size:,:]
|
|
1462
2163
|
|
|
1463
2164
|
## returns plot
|
|
1464
|
-
fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
|
|
2165
|
+
fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
|
|
1465
2166
|
height=400, width=1000, title = 'returns chain hmm feature')
|
|
1466
2167
|
fig.add_shape(type='line',x0=-0.5,y0=0,x1=max(df.hmm_feature)+0.5,y1=0,line=dict(color='grey',width=1),xref='x',yref='y')
|
|
1467
2168
|
fig.show()
|
|
@@ -1490,6 +2191,17 @@ class stock_eda_panel(object):
|
|
|
1490
2191
|
del df
|
|
1491
2192
|
|
|
1492
2193
|
def get_targets(self, steps):
|
|
2194
|
+
"""
|
|
2195
|
+
produce a regression return target from future prices
|
|
2196
|
+
|
|
2197
|
+
Parameters
|
|
2198
|
+
----------
|
|
2199
|
+
steps (int): number of lags and steps for future returns
|
|
2200
|
+
|
|
2201
|
+
Returns
|
|
2202
|
+
-------
|
|
2203
|
+
None
|
|
2204
|
+
"""
|
|
1493
2205
|
self.targets = list()
|
|
1494
2206
|
self.target = list()
|
|
1495
2207
|
columns = list()
|
|
@@ -1501,9 +2213,21 @@ class stock_eda_panel(object):
|
|
|
1501
2213
|
self.df[f'mean_target'] = self.df[columns].mean(axis=1)
|
|
1502
2214
|
self.target.append(f'mean_target')
|
|
1503
2215
|
self.settings_target_lasts = {'steps':steps, 'type':'regression'}
|
|
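A hedged sketch of the target construction (only the final averaging is visible in this hunk; the per-step future-return formula is an assumption):

    import pandas as pd

    df = pd.DataFrame({'Close': [100.0, 101.0, 99.0, 103.0, 104.0, 102.0]})
    steps, cols = 3, []
    for k in range(1, steps + 1):
        df[f'target_{k}'] = df['Close'].shift(-k) / df['Close'] - 1  # assumed formula
        cols.append(f'target_{k}')
    df['mean_target'] = df[cols].mean(axis=1)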
1504
|
-
|
|
2216
|
+
|
|
1505
2217
|
def get_categorical_targets(self, horizon, flor_loss, top_gain):
|
|
1506
|
-
|
|
2218
|
+
"""
|
|
2219
|
+
produce binary return targets from future prices. it produces two targets, one for high returns and another for low returns
|
|
2220
|
+
|
|
2221
|
+
Parameters
|
|
2222
|
+
----------
|
|
2223
|
+
horizon (int): number of lags and steps for future returns
|
|
2224
|
+
flor_loss (float): minimum (floor) loss return
|
|
2225
|
+
top_gain (float): max gain return
|
|
2226
|
+
|
|
2227
|
+
Returns
|
|
2228
|
+
-------
|
|
2229
|
+
None
|
|
2230
|
+
"""
|
|
1507
2231
|
self.target = list()
|
|
1508
2232
|
self.targets = list()
|
|
1509
2233
|
columns = list()
|
|
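Only the signature and bookkeeping are visible in this hunk, so the following floor/top labelling is an assumed illustration, not the package's exact rule:

    import pandas as pd

    df = pd.DataFrame({'Close': [100.0, 104.0, 96.0, 101.0, 99.0, 107.0]})
    horizon, flor_loss, top_gain = 2, -3.0, 3.0
    fwd = (df['Close'].shift(-horizon) / df['Close'] - 1) * 100
    df['target_up'] = (fwd >= top_gain).astype(int)     # high-return label
    df['target_down'] = (fwd <= flor_loss).astype(int)  # low-return label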
@@ -1535,7 +2259,19 @@ class stock_eda_panel(object):
|
|
|
1535
2259
|
self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
|
|
1536
2260
|
|
|
1537
2261
|
def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
|
|
1538
|
-
|
|
2262
|
+
"""
|
|
2263
|
+
produce the configuration dictionary from settings saved by the feature generation methods when save_features was activated
|
|
2264
|
+
|
|
2265
|
+
Parameters
|
|
2266
|
+
----------
|
|
2267
|
+
test_data_size (int): test data size
|
|
2268
|
+
val_data_size (int): validation data size
|
|
2269
|
+
model_type (str): model type, options: 'Forecaster','Classifier'
|
|
2270
|
+
|
|
2271
|
+
Returns
|
|
2272
|
+
-------
|
|
2273
|
+
None
|
|
2274
|
+
"""
|
|
1539
2275
|
self.settings = {
|
|
1540
2276
|
'features':list(set(self.features)),
|
|
1541
2277
|
'signals' :list(set(self.signals)),
|
|
@@ -1547,15 +2283,15 @@ class stock_eda_panel(object):
|
|
|
1547
2283
|
'outlier': self.settings_outlier,
|
|
1548
2284
|
}
|
|
1549
2285
|
}
|
|
1550
|
-
|
|
2286
|
+
|
|
1551
2287
|
if model_type in ['Forecaster','Classifier']:
|
|
1552
|
-
|
|
2288
|
+
|
|
1553
2289
|
target_list = list(set(self.targets))
|
|
1554
2290
|
target_list.sort()
|
|
1555
2291
|
self.settings['model_type'] = model_type
|
|
1556
2292
|
self.settings['target'] = list(set(self.target))
|
|
1557
2293
|
self.settings['targets'] = target_list
|
|
1558
|
-
|
|
2294
|
+
|
|
1559
2295
|
## for now this is hard coded
|
|
1560
2296
|
feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
|
|
1561
2297
|
'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
|
|
@@ -1570,7 +2306,7 @@ class stock_eda_panel(object):
|
|
|
1570
2306
|
self.settings['settings']['target_lasts'] = self.settings_target_lasts
|
|
1571
2307
|
except:
|
|
1572
2308
|
pass
|
|
1573
|
-
|
|
2309
|
+
|
|
1574
2310
|
try:
|
|
1575
2311
|
self.settings['settings']['strategies'] = {
|
|
1576
2312
|
'best_strategy':self.best_strategy,
|
|
@@ -1580,48 +2316,189 @@ class stock_eda_panel(object):
|
|
|
1580
2316
|
pass
|
|
1581
2317
|
|
|
1582
2318
|
class produce_model:
|
|
2319
|
+
"""
|
|
2320
|
+
Class that produces a machine learning model in a scikit-learn pipeline wrapper.
|
|
2321
|
+
|
|
2322
|
+
Attributes
|
|
2323
|
+
----------
|
|
2324
|
+
data : pd.DataFrame
|
|
2325
|
+
input data
|
|
2326
|
+
X_train : pd.DataFrame
|
|
2327
|
+
y_train : pd.Series
|
|
2328
|
+
X_test : pd.DataFrame
|
|
2329
|
+
y_test : pd.Series
|
|
2330
|
+
X_val : pd.DataFrame
|
|
2331
|
+
y_val : pd.Series
|
|
2332
|
+
pipeline : obj
|
|
2333
|
+
trained pipeline that includes a ml model
|
|
2334
|
+
features_to_model: list
|
|
2335
|
+
features at the final step of the pipeline
|
|
2336
|
+
|
|
2337
|
+
Methods
|
|
2338
|
+
-------
|
|
2339
|
+
preprocess(test_data_size=int, target=str, val_data_size=int):
|
|
2340
|
+
prepare data, split train, test, validation data and X and Y
|
|
2341
|
+
get_sample(x=pd.DataFrame, sample=int, max_=int):
|
|
2342
|
+
sample data
|
|
2343
|
+
"""
|
|
1583
2344
|
def __init__(self,data):
|
|
2345
|
+
"""
|
|
2346
|
+
Initialize object
|
|
2347
|
+
|
|
2348
|
+
Parameters
|
|
2349
|
+
----------
|
|
2350
|
+
data (pd.DataFrame): data
|
|
2351
|
+
|
|
2352
|
+
Returns
|
|
2353
|
+
-------
|
|
2354
|
+
None
|
|
2355
|
+
"""
|
|
1584
2356
|
self.data = data.copy()
|
|
1585
|
-
|
|
2357
|
+
|
|
1586
2358
|
def preprocess(self, test_data_size, target, val_data_size = False):
|
|
1587
|
-
|
|
2359
|
+
"""
|
|
2360
|
+
prepare data, split train, test, validation data and X and Y
|
|
2361
|
+
|
|
2362
|
+
Parameters
|
|
2363
|
+
----------
|
|
2364
|
+
test_data_size (int): test data size
|
|
2365
|
+
target (str): target column
|
|
2366
|
+
val_data_size (int): validation data size
|
|
2367
|
+
|
|
2368
|
+
Returns
|
|
2369
|
+
-------
|
|
2370
|
+
None
|
|
2371
|
+
"""
|
|
1588
2372
|
train_data, test_data = self.data.iloc[:-test_data_size,:].dropna() , self.data.iloc[-test_data_size:,:].dropna()
|
|
1589
|
-
|
|
2373
|
+
|
|
1590
2374
|
if val_data_size:
|
|
1591
2375
|
train_data, val_data = train_data.iloc[:-val_data_size,:], train_data.iloc[-val_data_size:,:]
|
|
1592
|
-
|
|
2376
|
+
|
|
1593
2377
|
self.test_data = test_data
|
|
1594
|
-
|
|
2378
|
+
|
|
1595
2379
|
X_train, y_train = train_data.iloc[0:,1:], train_data[target]
|
|
1596
2380
|
X_test, y_test = test_data.iloc[0:,1:], test_data[target]
|
|
1597
2381
|
self.X_train = X_train
|
|
1598
2382
|
self.y_train = y_train
|
|
1599
2383
|
self.X_test = X_test
|
|
1600
2384
|
self.y_test = y_test
|
|
1601
|
-
|
|
2385
|
+
|
|
1602
2386
|
if val_data_size:
|
|
1603
2387
|
X_val, y_val = val_data.iloc[0:,1:], val_data[target]
|
|
1604
2388
|
self.X_val = X_val
|
|
1605
2389
|
self.y_val = y_val
|
|
1606
|
-
|
|
2390
|
+
|
|
1607
2391
|
def get_sample(self, x, sample, max_=900):
|
|
2392
|
+
"""
|
|
2393
|
+
sample data
|
|
2394
|
+
|
|
2395
|
+
Parameters
|
|
2396
|
+
----------
|
|
2397
|
+
x (pd.DataFrame): input data
|
|
2398
|
+
sample (int): sample size
|
|
2399
|
+
max_ (int): max sample
|
|
2400
|
+
|
|
2401
|
+
Returns
|
|
2402
|
+
-------
|
|
2403
|
+
sample (float): sample size
|
|
2404
|
+
"""
|
|
1608
2405
|
length = len(x)
|
|
1609
2406
|
if length > max_:
|
|
1610
2407
|
return 1.0
|
|
1611
2408
|
else:
|
|
1612
2409
|
return sample
|
|
1613
|
-
|
|
2410
|
+
|
|
1614
2411
|
def train_model(self, pipe, model, cv_ = False):
|
|
2412
|
+
"""
|
|
2413
|
+
train pipeline
|
|
2414
|
+
|
|
2415
|
+
Parameters
|
|
2416
|
+
----------
|
|
2417
|
+
pipe (obj): pipeline object
|
|
2418
|
+
model (obj): model object
|
|
2419
|
+
cv_ (obj): cross validation procedure
|
|
2420
|
+
|
|
2421
|
+
Returns
|
|
2422
|
+
-------
|
|
2423
|
+
None
|
|
2424
|
+
"""
|
|
1615
2425
|
self.model = model
|
|
1616
2426
|
self.pipe_transform = pipe
|
|
1617
2427
|
self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
|
|
1618
2428
|
self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
|
|
1619
2429
|
self.pipeline.fit(self.X_train, self.y_train)
|
|
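A hypothetical end-to-end usage of produce_model, reusing the module's FeatureSelector as the transform step (the data frame and column names are placeholders):

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.pipeline import Pipeline

    data = pd.DataFrame({'Date': pd.date_range('2020-01-01', periods=800),
                         'ROC': np.random.randn(800),
                         'mean_target': np.random.randn(800)})

    pm = produce_model(data)
    pm.preprocess(test_data_size=250, target='mean_target', val_data_size=250)
    pm.train_model(pipe=Pipeline([('selector', FeatureSelector(columns=['ROC']))]),
                   model=RandomForestRegressor(n_estimators=100))
    preds = pm.pipeline.predict(pm.X_test)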
1620
|
-
|
|
1621
|
-
|
|
2430
|
+
|
|
2431
|
+
|
|
1622
2432
|
class hmm_feature_selector():
|
|
1623
|
-
|
|
2433
|
+
"""
|
|
2434
|
+
class that is going to train hmm models to perform feature selection
|
|
2435
|
+
|
|
2436
|
+
Attributes
|
|
2437
|
+
----------
|
|
2438
|
+
data : pd.DataFrame
|
|
2439
|
+
input data
|
|
2440
|
+
n_clusters : int
|
|
2441
|
+
number of clusters to search
|
|
2442
|
+
init_features_hmm : list
|
|
2443
|
+
list of features to consider in the search
|
|
2444
|
+
test_data_size : int
|
|
2445
|
+
test data size, meaning that the remaining is going to be used as training data
|
|
2446
|
+
select_n_features : int
|
|
2447
|
+
number of features to select
|
|
2448
|
+
n_trials : int
|
|
2449
|
+
total number of trials per combination
|
|
2450
|
+
limit_search : int
|
|
2451
|
+
limit number of combinations
|
|
2452
|
+
default_benchmark_sd : float
|
|
2453
|
+
default value to bias standard deviation
|
|
2454
|
+
t_threshold : float
|
|
2455
|
+
alpha or z threshold
|
|
2456
|
+
pipeline_hmm: obj
|
|
2457
|
+
pipeline object of the hmm model
|
|
2458
|
+
features_used_in_model:list
|
|
2459
|
+
features in model
|
|
2460
|
+
train_model(features_hmm=list):
|
|
2461
|
+
train hmm model
|
|
2462
|
+
feature_combinations: list
|
|
2463
|
+
list of combination of features
|
|
2464
|
+
mean_relevance: float
|
|
2465
|
+
relevance score of the model
|
|
2466
|
+
best_features: list
|
|
2467
|
+
list of best performing features
|
|
2468
|
+
|
|
2469
|
+
Methods
|
|
2470
|
+
-------
|
|
2471
|
+
split_data():
|
|
2472
|
+
split data in train and test
|
|
2473
|
+
train_model(features_hmm=list):
|
|
2474
|
+
train hmm model
|
|
2475
|
+
feature_list_generator():
|
|
2476
|
+
perform combination of features
|
|
2477
|
+
get_error():
|
|
2478
|
+
get error or score of a given model using relevance score
|
|
2479
|
+
execute_selector():
|
|
2480
|
+
select the best combination of features
|
|
2481
|
+
"""
|
|
1624
2482
|
def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
|
|
2483
|
+
"""
|
|
2484
|
+
Initialize object
|
|
2485
|
+
|
|
2486
|
+
Parameters
|
|
2487
|
+
----------
|
|
2488
|
+
data (pd.DataFrame): data
|
|
2489
|
+
n_clusters (int): number of clusters to search
|
|
2490
|
+
init_features_hmm (list): list of features to consider in the search
|
|
2491
|
+
test_data_size (int): test data size, meaning that the remaining is going to be used as training data
|
|
2492
|
+
select_n_features (int): number of features to select
|
|
2493
|
+
n_trials (int): total number of trials per combination
|
|
2494
|
+
limit_search (int): limit number of combinations
|
|
2495
|
+
default_benchmark_sd (float): default value to bias standard deviation
|
|
2496
|
+
t_threshold (float): alpha or z threshold
|
|
2497
|
+
|
|
2498
|
+
Returns
|
|
2499
|
+
-------
|
|
2500
|
+
None
|
|
2501
|
+
"""
|
|
1625
2502
|
self.data = data.copy()
|
|
1626
2503
|
self.n_clusters = n_clusters
|
|
1627
2504
|
self.init_features_hmm = init_features_hmm
|
|
@@ -1631,36 +2508,77 @@ class hmm_feature_selector():
|
|
|
1631
2508
|
self.limit_search= limit_search
|
|
1632
2509
|
self.default_benchmark_sd = default_benchmark_sd
|
|
1633
2510
|
self.t_threshold = t_threshold
|
|
1634
|
-
|
|
2511
|
+
|
|
1635
2512
|
def split_data(self):
|
|
1636
|
-
|
|
2513
|
+
"""
|
|
2514
|
+
split data in train and test
|
|
2515
|
+
|
|
2516
|
+
Parameters
|
|
2517
|
+
----------
|
|
2518
|
+
None
|
|
2519
|
+
|
|
2520
|
+
Returns
|
|
2521
|
+
-------
|
|
2522
|
+
None
|
|
2523
|
+
"""
|
|
1637
2524
|
self.data_train = self.data.iloc[:-self.test_data_size,:]
|
|
1638
2525
|
self.data_test = self.data.iloc[-self.test_data_size:,:]
|
|
1639
|
-
|
|
2526
|
+
|
|
1640
2527
|
def train_model(self,features_hmm):
|
|
2528
|
+
"""
|
|
2529
|
+
train hmm model
|
|
2530
|
+
|
|
2531
|
+
Parameters
|
|
2532
|
+
----------
|
|
2533
|
+
features_hmm (list): list of features to be selected in the model
|
|
2534
|
+
|
|
2535
|
+
Returns
|
|
2536
|
+
-------
|
|
2537
|
+
None
|
|
2538
|
+
"""
|
|
1641
2539
|
pipeline_hmm = Pipeline([
|
|
1642
2540
|
('selector', FeatureSelector(columns=features_hmm)),
|
|
1643
2541
|
('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
|
|
1644
2542
|
('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
|
|
1645
2543
|
])
|
|
1646
|
-
|
|
2544
|
+
|
|
1647
2545
|
self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
|
|
1648
2546
|
self.features_used_in_model = features_hmm
|
|
1649
|
-
|
|
2547
|
+
|
|
1650
2548
|
def feature_list_generator(self):
|
|
1651
|
-
|
|
2549
|
+
"""
|
|
2550
|
+
perform combination of features
|
|
2551
|
+
|
|
2552
|
+
Parameters
|
|
2553
|
+
----------
|
|
2554
|
+
None
|
|
2555
|
+
|
|
2556
|
+
Returns
|
|
2557
|
+
-------
|
|
2558
|
+
None
|
|
2559
|
+
"""
|
|
1652
2560
|
feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
|
|
1653
2561
|
feature_combinations = list(map(list, feature_combinations))
|
|
1654
|
-
|
|
2562
|
+
|
|
1655
2563
|
self.feature_combinations = feature_combinations
|
|
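The generator enumerates fixed-size feature subsets via itertools, equivalent to:

    from itertools import combinations

    pool = ['ROC', 'VORTEX', 'WILL', 'STOCH']             # hypothetical candidate pool
    combos = list(map(list, set(combinations(pool, 2))))  # all 2-feature subsets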
1656
|
-
|
|
2564
|
+
|
|
1657
2565
|
def get_error(self):
|
|
1658
|
-
|
|
2566
|
+
"""
|
|
2567
|
+
get error or score of a given model using relevance score
|
|
2568
|
+
|
|
2569
|
+
Parameters
|
|
2570
|
+
----------
|
|
2571
|
+
None
|
|
2572
|
+
|
|
2573
|
+
Returns
|
|
2574
|
+
-------
|
|
2575
|
+
None
|
|
2576
|
+
"""
|
|
1659
2577
|
self.data_train_ = self.data_train.copy()
|
|
1660
|
-
|
|
2578
|
+
|
|
1661
2579
|
self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
|
|
1662
2580
|
self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
|
|
1663
|
-
|
|
2581
|
+
|
|
1664
2582
|
## indexing chains
|
|
1665
2583
|
self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
|
|
1666
2584
|
self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
|
|
@@ -1668,36 +2586,46 @@ class hmm_feature_selector():
|
|
|
1668
2586
|
self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
|
|
1669
2587
|
self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
|
|
1670
2588
|
self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1671
|
-
|
|
2589
|
+
|
|
1672
2590
|
### returns using the first element in a chain
|
|
1673
2591
|
self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
|
|
1674
2592
|
self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
|
|
1675
2593
|
self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
|
|
1676
|
-
|
|
2594
|
+
|
|
1677
2595
|
self.data_train_ = self.data_train_.drop(columns = ['first'])
|
|
1678
|
-
|
|
2596
|
+
|
|
1679
2597
|
mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
|
|
1680
2598
|
self.mean_relevance = mean_relevance
|
|
1681
|
-
|
|
2599
|
+
|
|
1682
2600
|
def execute_selector(self):
|
|
1683
|
-
|
|
2601
|
+
"""
|
|
2602
|
+
select the best combination of features
|
|
2603
|
+
|
|
2604
|
+
Parameters
|
|
2605
|
+
----------
|
|
2606
|
+
None
|
|
2607
|
+
|
|
2608
|
+
Returns
|
|
2609
|
+
-------
|
|
2610
|
+
None
|
|
2611
|
+
"""
|
|
1684
2612
|
self.split_data()
|
|
1685
2613
|
self.feature_list_generator()
|
|
1686
2614
|
maxi = -1
|
|
1687
2615
|
print(f'it is expected {len(self.feature_combinations)} combinations')
|
|
1688
2616
|
feature_results = dict()
|
|
1689
|
-
|
|
2617
|
+
|
|
1690
2618
|
if self.limit_search:
|
|
1691
2619
|
print(f' taking just {self.limit_search} combinations')
|
|
1692
2620
|
maxi = self.limit_search
|
|
1693
|
-
|
|
2621
|
+
|
|
1694
2622
|
for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
|
|
1695
|
-
|
|
2623
|
+
|
|
1696
2624
|
feature_results[f'group_{i}'] = {
|
|
1697
2625
|
'features':list(features_hmm),
|
|
1698
2626
|
'relevances':list()
|
|
1699
2627
|
}
|
|
1700
|
-
|
|
2628
|
+
|
|
1701
2629
|
for _ in range(self.n_trials):
|
|
1702
2630
|
try:
|
|
1703
2631
|
self.train_model(features_hmm)
|
|
@@ -1708,18 +2636,54 @@ class hmm_feature_selector():
|
|
|
1708
2636
|
feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
|
|
1709
2637
|
self.feature_results = feature_results
|
|
1710
2638
|
self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
|
|
1711
|
-
|
|
2639
|
+
|
|
1712
2640
|
class signal_analyser_object:
|
|
1713
|
-
|
|
2641
|
+
"""
|
|
2642
|
+
class that is going to analyse signals
|
|
2643
|
+
|
|
2644
|
+
Attributes
|
|
2645
|
+
----------
|
|
2646
|
+
data : pd.DataFrame
|
|
2647
|
+
input data
|
|
2648
|
+
ticket_name :str
|
|
2649
|
+
asset symbol
|
|
2650
|
+
show_plot : boolean
|
|
2651
|
+
if true show plot for every method
|
|
2652
|
+
save_path : str
|
|
2653
|
+
if set, path where results are saved
|
|
2654
|
+
save_aws : str
|
|
2655
|
+
if set, remote repo path where results are exported
|
|
2656
|
+
aws_credentials : dict
|
|
2657
|
+
credentials for aws
|
|
2658
|
+
return_fig : boolean
|
|
2659
|
+
if true, methods will return objects
|
|
2660
|
+
create_backtest_signal(days_strategy=list, test_size=int, feature_name=str, high_exit=float, low_exit=float):
|
|
2661
|
+
perform backtest signal analysis
|
|
2662
|
+
|
|
2663
|
+
Methods
|
|
2664
|
+
-------
|
|
2665
|
+
signal_analyser(test_size=int, feature_name=str, days_list=list, threshold=float,verbose=boolean, signal_position=boolean):
|
|
2666
|
+
perform signal analysis and feature extraction
|
|
2667
|
+
|
|
2668
|
+
"""
|
|
2669
|
+
|
|
1714
2670
|
def __init__(self, data,symbol_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
|
|
1715
2671
|
"""
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
2672
|
+
Initialize object
|
|
2673
|
+
|
|
2674
|
+
Parameters
|
|
2675
|
+
----------
|
|
2676
|
+
data (pd.DataFrame): data
|
|
2677
|
+
symbol_name (str): symbol of the asset (stored as ticket_name)
|
|
2678
|
+
show_plot (boolean): if true show plot for every method
|
|
2679
|
+
save_path (str): if set, save results to this path, e.g. r'C:/path/to/the/file/'
|
|
2680
|
+
save_aws (str): if set, export results to the remote repo, e.g. 'path/to/file/'
|
|
2681
|
+
aws_credentials (dict): credentials for aws
|
|
2682
|
+
return_fig (boolean): if true, methods will return objects
|
|
2683
|
+
|
|
2684
|
+
Returns
|
|
2685
|
+
-------
|
|
2686
|
+
None
|
|
1723
2687
|
"""
|
|
1724
2688
|
self.data = data.copy()
|
|
1725
2689
|
self.ticket_name = symbol_name
|
|
@@ -1730,6 +2694,22 @@ class signal_analyser_object:
|
|
|
1730
2694
|
self.return_fig = return_fig
|
|
1731
2695
|
|
|
1732
2696
|
def signal_analyser(self, test_size, feature_name, days_list, threshold = 0.05,verbose = False, signal_position = False):
|
|
2697
|
+
"""
|
|
2698
|
+
perform signal analysis and feature extraction
|
|
2699
|
+
|
|
2700
|
+
Parameters
|
|
2701
|
+
----------
|
|
2702
|
+
test_size (int): test data size
|
|
2703
|
+
feature_name (str): name of the feature to assess
|
|
2704
|
+
days_list (list): list of integers [3,8,10] to assess
|
|
2705
|
+
threshold (float): alpha or z threshold
|
|
2706
|
+
verbose (boolean): print metrics
|
|
2707
|
+
signal_position (int): if set, the signal is taken at the given step of the signal chain
|
|
2708
|
+
|
|
2709
|
+
Returns
|
|
2710
|
+
-------
|
|
2711
|
+
None
|
|
2712
|
+
"""
|
|
1733
2713
|
data = self.data
|
|
1734
2714
|
self.feature_name = feature_name
|
|
1735
2715
|
up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
|
|
@@ -1745,10 +2725,10 @@ class signal_analyser_object:
|
|
|
1745
2725
|
returns_list.append(feature_)
|
|
1746
2726
|
|
|
1747
2727
|
df['signal_type'] = np.where(
|
|
1748
|
-
df[up_signal] == 1,
|
|
1749
|
-
'up',
|
|
2728
|
+
df[up_signal] == 1,
|
|
2729
|
+
'up',
|
|
1750
2730
|
np.where(
|
|
1751
|
-
df[low_signal] == 1,
|
|
2731
|
+
df[low_signal] == 1,
|
|
1752
2732
|
'down',
|
|
1753
2733
|
None
|
|
1754
2734
|
)
|
|
@@ -1772,7 +2752,7 @@ class signal_analyser_object:
|
|
|
1772
2752
|
|
|
1773
2753
|
df = df.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
|
|
1774
2754
|
self.df_signal = df
|
|
1775
|
-
|
|
2755
|
+
|
|
1776
2756
|
n_signals_up = len(list(df[df.signal_type == 'up'].chain_id.unique()))
|
|
1777
2757
|
n_signals_down = len(list(df[df.signal_type == 'down'].chain_id.unique()))
|
|
1778
2758
|
p_scores = list()
|
|
@@ -1788,7 +2768,7 @@ class signal_analyser_object:
|
|
|
1788
2768
|
sample2 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'down')].value.values
|
|
1789
2769
|
pvalue = stats.ttest_ind(sample1, sample2).pvalue
|
|
1790
2770
|
median_down = np.median(sample2)
|
|
1791
|
-
median_up = np.median(sample1)
|
|
2771
|
+
median_up = np.median(sample1)
|
|
1792
2772
|
validations.append(median_up < 0)
|
|
1793
2773
|
validations.append(median_down > 0)
|
|
1794
2774
|
p_scores.append(pvalue)
|
|
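Per time lapse, the analyser compares the return populations of the two signal types with an independent t-test; in minimal form (values hypothetical):

    import numpy as np
    from scipy import stats

    sample_up = np.array([-0.5, -1.2, -0.3, -0.8])   # returns after 'up' signals
    sample_down = np.array([0.4, 1.1, 0.2, 0.9])     # returns after 'down' signals
    pvalue = stats.ttest_ind(sample_up, sample_down).pvalue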
@@ -1830,10 +2810,10 @@ class signal_analyser_object:
|
|
|
1830
2810
|
sns.boxplot(data=df_melt, x="time", y="value", hue="signal_type",ax = axs[2])
|
|
1831
2811
|
axs[2].axhline(y=0, color='grey', linestyle='--')
|
|
1832
2812
|
axs[2].set_title('signal type expected returns distribution at different time lapses')
|
|
1833
|
-
|
|
2813
|
+
|
|
1834
2814
|
if self.show_plot:
|
|
1835
2815
|
plt.show()
|
|
1836
|
-
|
|
2816
|
+
|
|
1837
2817
|
if self.save_path:
|
|
1838
2818
|
result_plot_name = f'signals_strategy_distribution_{feature_name}.png'
|
|
1839
2819
|
fig.savefig(self.save_path+result_plot_name)
|
|
@@ -1849,17 +2829,33 @@ class signal_analyser_object:
|
|
|
1849
2829
|
|
|
1850
2830
|
if self.return_fig:
|
|
1851
2831
|
return fig
|
|
1852
|
-
|
|
2832
|
+
|
|
1853
2833
|
def create_backtest_signal(self,days_strategy, test_size, feature_name, high_exit = False, low_exit = False):
|
|
2834
|
+
"""
|
|
2835
|
+
perform backtest signal analysis
|
|
2836
|
+
|
|
2837
|
+
Parameters
|
|
2838
|
+
----------
|
|
2839
|
+
days_strategy (list): list of days to assess returns
|
|
2840
|
+
test_size (int): test data size
|
|
2841
|
+
feature_name (str): name of the feature to assess
|
|
2842
|
+
high_exit (float): high exit threshold return in the backtest
|
|
2843
|
+
low_exit (float): loss exit threshold return in the backtest
|
|
2844
|
+
|
|
2845
|
+
Returns
|
|
2846
|
+
-------
|
|
2847
|
+
fig (obj): plots
|
|
2848
|
+
messages (dict): dictionary with key metrics
|
|
2849
|
+
"""
|
|
1854
2850
|
asset_1 = 'Close'
|
|
1855
2851
|
up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
|
|
1856
2852
|
df1 = self.data.iloc[-test_size:,:].copy()
|
|
1857
2853
|
df2 = df1.copy()
|
|
1858
2854
|
df2['signal_type'] = np.where(
|
|
1859
|
-
df2[up_signal] == 1,
|
|
1860
|
-
'up',
|
|
2855
|
+
df2[up_signal] == 1,
|
|
2856
|
+
'up',
|
|
1861
2857
|
np.where(
|
|
1862
|
-
df2[low_signal] == 1,
|
|
2858
|
+
df2[low_signal] == 1,
|
|
1863
2859
|
'down',
|
|
1864
2860
|
None
|
|
1865
2861
|
)
|
|
@@ -1870,33 +2866,33 @@ class signal_analyser_object:
|
|
|
1870
2866
|
df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lag_Date'])).dt.days - 1
|
|
1871
2867
|
df2['break'] = np.where(df2['span'] > 3, 1, 0)
|
|
1872
2868
|
df2['break'] = np.where(df2['span'].isna(), 1, df2['break'])
|
|
1873
|
-
|
|
2869
|
+
|
|
1874
2870
|
df2['chain_id'] = df2.sort_values(['Date']).groupby(['break']).cumcount() + 1
|
|
1875
2871
|
df2['chain_id'] = np.where(df2['break'] == 1, df2['chain_id'], np.nan )
|
|
1876
2872
|
df2['chain_id'] = df2['chain_id'].fillna(method = 'ffill')
|
|
1877
|
-
|
|
2873
|
+
|
|
1878
2874
|
df2['internal_rn'] = df2.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
|
|
1879
2875
|
df2['inv_internal_rn'] = df2.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
|
|
1880
|
-
|
|
2876
|
+
|
|
1881
2877
|
df2['first_in_chain'] = np.where(df2['internal_rn'] == 1, True, False)
|
|
1882
2878
|
df2['last_in_chain'] = np.where(df2['inv_internal_rn'] == 1, True, False)
|
|
1883
|
-
|
|
2879
|
+
|
|
1884
2880
|
df2 = df2.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
|
|
1885
|
-
|
|
2881
|
+
|
|
1886
2882
|
df2 = df2[(df2.last_in_chain == True) & (df2.signal_type == 'down')][['last_in_chain']]
|
|
1887
2883
|
dft = df1.merge(df2,how = 'left',left_index=True, right_index=True )
|
|
1888
|
-
|
|
2884
|
+
|
|
1889
2885
|
dft['chain_id'] = dft.sort_values(['Date']).groupby(['last_in_chain']).cumcount() + 1
|
|
1890
2886
|
dft['chain_id'] = np.where(dft['last_in_chain'] == True, dft['chain_id'], np.nan )
|
|
1891
2887
|
dft['chain_id'] = dft['chain_id'].fillna(method = 'ffill')
|
|
1892
|
-
|
|
2888
|
+
|
|
1893
2889
|
dft['internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
|
|
1894
2890
|
dft['flag'] = np.where(dft['internal_rn'] < days_strategy, 1,0)
|
|
1895
|
-
|
|
2891
|
+
|
|
1896
2892
|
dft['lrets_bench'] = np.log(dft[asset_1]/dft[asset_1].shift(1))
|
|
1897
2893
|
dft['bench_prod'] = dft['lrets_bench'].cumsum()
|
|
1898
2894
|
dft['bench_prod_exp'] = np.exp(dft['bench_prod']) - 1
|
|
1899
|
-
|
|
2895
|
+
|
|
1900
2896
|
if high_exit and low_exit:
|
|
1901
2897
|
dft['open_strat'] = np.where(dft.last_in_chain == True, dft.Open, np.nan)
|
|
1902
2898
|
dft['open_strat'] = dft['open_strat'].fillna(method = 'ffill')
|
|
@@ -1905,7 +2901,7 @@ class signal_analyser_object:
|
|
|
1905
2901
|
dft['low_strat_ret'] = (dft['Low']/dft['open_strat']-1)*100
|
|
1906
2902
|
dft['high_exit'] = np.where(((dft['high_strat_ret'] >= high_exit) | (dft['internal_rn'] == days_strategy)), 1, np.nan)
|
|
1907
2903
|
dft['low_exit'] = np.where((dft['low_strat_ret'] <= low_exit), -1, np.nan)
|
|
1908
|
-
|
|
2904
|
+
|
|
1909
2905
|
dft["exit_type"] = dft[["high_exit", "low_exit"]].max(axis=1)
|
|
1910
2906
|
dft['exit_type'] = np.where(dft["exit_type"] == 1, 1, np.where(dft["exit_type"] == -1,-1,np.nan))
|
|
1911
2907
|
dft['exit'] = np.where(dft['exit_type'].isnull(), np.nan, 1)
|
|
@@ -1916,27 +2912,27 @@ class signal_analyser_object:
|
|
|
1916
2912
|
max_id = dft.chain_id.max()
|
|
1917
2913
|
dft['max_internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).internal_rn.transform('max')
|
|
1918
2914
|
dft['exit'] = np.where((dft.chain_id == max_id) & (dft.max_internal_rn < days_strategy) & (dft.max_internal_rn == dft.internal_rn), 1, dft['exit'])
|
|
1919
|
-
|
|
2915
|
+
|
|
1920
2916
|
dft['exit_step'] = np.where(dft.exit == 1, dft.internal_rn, np.nan)
|
|
1921
2917
|
dft['exit_step'] = dft.sort_values(['Date']).groupby(['chain_id']).exit_step.transform('max')
|
|
1922
|
-
|
|
2918
|
+
|
|
1923
2919
|
dft['flag'] = np.where(dft.internal_rn <= dft.exit_step, 1, 0)
|
|
1924
2920
|
dft = dft.drop(columns = ['open_strat', 'high_strat_ret', 'low_strat_ret','exit_step', 'exit','exit_type','high_exit','low_exit', 'max_internal_rn'])
|
|
1925
|
-
|
|
2921
|
+
|
|
1926
2922
|
dft['lrets_strat'] = np.log(dft[asset_1].shift(-1)/dft[asset_1]) * dft['flag']
|
|
1927
2923
|
dft['lrets_strat'] = np.where(dft['lrets_strat'].isna(),-0.0,dft['lrets_strat'])
|
|
1928
2924
|
dft['lrets_prod'] = dft['lrets_strat'].cumsum()
|
|
1929
2925
|
dft['strat_prod_exp'] = np.exp(dft['lrets_prod']) - 1
|
|
1930
|
-
|
|
2926
|
+
|
|
1931
2927
|
bench_rets = round(dft['bench_prod_exp'].values[-1]*100,1)
|
|
1932
2928
|
strat_rets = round(dft['strat_prod_exp'].values[-1]*100,1)
|
|
1933
|
-
|
|
2929
|
+
|
|
1934
2930
|
bench_sr = round(sharpe_ratio(dft.bench_prod_exp.dropna()),1)
|
|
1935
2931
|
strat_sr = round(sharpe_ratio(dft.strat_prod_exp.dropna()),1)
|
|
1936
|
-
|
|
2932
|
+
|
|
1937
2933
|
message1 = f'{bench_rets}%'
|
|
1938
2934
|
message2 = f'{strat_rets}%'
|
|
1939
|
-
|
|
2935
|
+
|
|
1940
2936
|
messages = {
|
|
1941
2937
|
'benchmark return:':message1,
|
|
1942
2938
|
'benchmark sharpe ratio:': bench_sr,
|
|
@@ -1947,7 +2943,7 @@ class signal_analyser_object:
|
|
|
1947
2943
|
print('----------------------------')
|
|
1948
2944
|
print(messages)
|
|
1949
2945
|
print('----------------------------')
|
|
1950
|
-
|
|
2946
|
+
|
|
1951
2947
|
fig = plt.figure(1)
|
|
1952
2948
|
plt.plot(dft.bench_prod_exp.values, label = 'benchmark')
|
|
1953
2949
|
plt.scatter(range(len(dft)),np.where(dft[low_signal] == 1,dft.bench_prod_exp.values,np.nan),color = 'red', label = 'signal')
|
|
@@ -1956,34 +2952,50 @@ class signal_analyser_object:
|
|
|
1956
2952
|
plt.title('strategy and cumulative returns based on signal strategy')
|
|
1957
2953
|
if self.show_plot:
|
|
1958
2954
|
plt.plot()
|
|
1959
|
-
|
|
2955
|
+
|
|
1960
2956
|
if self.save_path:
|
|
1961
2957
|
result_json_name = f'signals_strategy_return_{feature_name}.json'
|
|
1962
2958
|
result_plot_name = f'signals_strategy_return_{feature_name}.png'
|
|
1963
|
-
|
|
2959
|
+
|
|
1964
2960
|
plt.savefig(self.save_path+result_plot_name)
|
|
1965
2961
|
# pickle.dump(fig, open(self.save_path+result_plot_name, 'wb'))
|
|
1966
|
-
|
|
1967
|
-
with open(self.save_path+result_json_name, "w") as outfile:
|
|
2962
|
+
|
|
2963
|
+
with open(self.save_path+result_json_name, "w") as outfile:
|
|
1968
2964
|
json.dump(messages, outfile)
|
|
1969
|
-
|
|
2965
|
+
|
|
1970
2966
|
if self.save_path and self.save_aws:
|
|
1971
2967
|
# upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
|
|
1972
2968
|
# upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name,input_path = self.save_path+result_plot_name)
|
|
1973
|
-
|
|
2969
|
+
|
|
1974
2970
|
upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
|
|
1975
2971
|
upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
|
|
1976
|
-
|
|
2972
|
+
|
|
1977
2973
|
if not self.show_plot:
|
|
1978
2974
|
plt.close()
|
|
1979
|
-
|
|
2975
|
+
|
|
1980
2976
|
del df1,df2,dft
|
|
1981
|
-
|
|
2977
|
+
|
|
1982
2978
|
if self.return_fig:
|
|
1983
2979
|
return fig, messages
|
|
1984
|
-
|
|
2980
|
+
|
|
1985
2981
|
def execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object, plot = False, backtest= False, exit_params = {}):
|
|
1986
|
-
|
|
2982
|
+
'''
|
|
2983
|
+
code snippet that runs a configured feature method and then the signal analysis, which includes backtesting
|
|
2984
|
+
|
|
2985
|
+
Parameters:
|
|
2986
|
+
test_data_size (int): test data size
|
|
2987
|
+
feature_name (str): name of the feature to assess
|
|
2988
|
+
days_list (list): time scope to assess the returns
|
|
2989
|
+
configuration (dict): parameters of the method to run
|
|
2990
|
+
object_stock (obj): object with data to assess
|
|
2991
|
+
signal_analyser_object (obj): signal_analyser object
|
|
2992
|
+
plot (boolean): if true, plot results
|
|
2993
|
+
backtest (boolean): if true, run backtest
|
|
2994
|
+
exit_params (dict): parameters of exit returns
|
|
2995
|
+
|
|
2996
|
+
Returns:
|
|
2997
|
+
mean_median_return (float): median return of the backtests
|
|
2998
|
+
'''
|
|
1987
2999
|
method(**configuration)
|
|
1988
3000
|
signal_assess = signal_analyser_object(object_stock.df,object_stock.stock_code,show_plot = plot)
|
|
1989
3001
|
signal_assess.signal_analyser(test_size = test_data_size, feature_name = feature_name, days_list = days_list, threshold = 1)
|
|
@@ -1991,56 +3003,127 @@ def execute_signal_analyser(test_data_size, feature_name, days_list, configurati
|
|
|
1991
3003
|
if backtest:
|
|
1992
3004
|
print('-----------------------back test ---------------------------')
|
|
1993
3005
|
signal_assess.create_backtest_signal(backtest, test_data_size, feature_name, **exit_params )
|
|
1994
|
-
|
|
3006
|
+
|
|
1995
3007
|
return signal_assess.mean_median_return
|
|
1996
3008
|
|
|
1997
3009
|
def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to_test, method, object_stock, signal_analyser_object, plot = True):
|
|
1998
|
-
|
|
3010
|
+
'''
|
|
3011
|
+
code snippet that iterates the signal analyser over candidate configurations
|
|
3012
|
+
|
|
3013
|
+
Parameters:
|
|
3014
|
+
test_data_size (int): test data size
|
|
3015
|
+
feature_name (str): name of the feature to assess
|
|
3016
|
+
days_list (list): time scope to assess the returns
|
|
3017
|
+
arguments_to_test (dict): parameter configurations to test
|
|
3018
|
+
method (callable): method to run
|
|
3019
|
+
object_stock (obj): object with data to assess
|
|
3020
|
+
signal_analyser_object (obj): signal_analyser object
|
|
3021
|
+
plot (boolean): if true, plot results
|
|
3022
|
+
|
|
3023
|
+
Returns:
|
|
3024
|
+
best_result (int): index from the arguments_to_test with the best result
|
|
3025
|
+
'''
|
|
1999
3026
|
results = list()
|
|
2000
3027
|
for key in arguments_to_test.keys():
|
|
2001
3028
|
configuration = arguments_to_test.get(key)
|
|
2002
3029
|
mean_median_return = execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object)
|
|
2003
3030
|
results.append(mean_median_return)
|
|
2004
|
-
|
|
3031
|
+
|
|
2005
3032
|
df_result = pd.DataFrame({'keys':arguments_to_test.keys(),'results':results})
|
|
2006
3033
|
if plot:
|
|
2007
3034
|
plt.plot(df_result['keys'], df_result['results'])
|
|
2008
3035
|
plt.scatter(df_result['keys'], df_result['results'])
|
|
2009
3036
|
plt.title('simulation between configurations')
|
|
2010
|
-
plt.ylabel('median expected return')
|
|
3037
|
+
plt.ylabel('median expected return')
|
|
2011
3038
|
plt.show()
|
|
2012
|
-
|
|
3039
|
+
|
|
2013
3040
|
best_result = df_result.sort_values('results',ascending = False)['keys'].values[0]
|
|
2014
3041
|
return best_result
|
|
2015
|
-
|
|
3042
|
+
|
|
2016
3043
|
class analyse_index(stock_eda_panel):
|
|
2017
|
-
|
|
3044
|
+
"""
|
|
3045
|
+
class that compares an asset against a market index, computing betas and correlations
|
|
3046
|
+
|
|
3047
|
+
Attributes
|
|
3048
|
+
----------
|
|
3049
|
+
data : pd.DataFrame
|
|
3050
|
+
input data
|
|
3051
|
+
index : str
|
|
3052
|
+
name of the index
|
|
3053
|
+
asset : str
|
|
3054
|
+
name of the asset
|
|
3055
|
+
n_obs : int
|
|
3056
|
+
number of rows to extract
|
|
3057
|
+
lag : int
|
|
3058
|
+
lag to apply
|
|
3059
|
+
data_window : str
|
|
3060
|
+
data window, e.g. '5y', '10y', '15y'
|
|
3061
|
+
show_plot : bool
|
|
3062
|
+
If True, show plots
|
|
3063
|
+
save_path : str
|
|
3064
|
+
local path for saving e.g r'C:/path/to/the/file/'
|
|
3065
|
+
save_aws : str
|
|
3066
|
+
remote key in s3 bucket path e.g. 'path/to/file/'
|
|
3067
|
+
aws_credentials : dict
|
|
3068
|
+
dict with the aws credentials
|
|
3069
|
+
merger_df : pd.DataFrame
|
|
3070
|
+
dataframe with the index and asset data
|
|
3071
|
+
states_result : dict
|
|
3072
|
+
betas and correlation score results
|
|
3073
|
+
|
|
3074
|
+
Methods
|
|
3075
|
+
-------
|
|
3076
|
+
process_data():
|
|
3077
|
+
using stock_eda_panel, get data and merge data
|
|
3078
|
+
plot_betas(sample_size=int, offset=int, subsample_ts=int):
|
|
3079
|
+
display beta analysis plot
|
|
3080
|
+
get_betas(subsample_ts=int):
|
|
3081
|
+
get general beta and last sample beta, correlation score is included too
|
|
3082
|
+
"""
|
|
2018
3083
|
|
|
3084
|
+
def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
|
|
2019
3085
|
"""
|
|
2020
|
-
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
3086
|
+
Initialize object
|
|
3087
|
+
|
|
3088
|
+
Parameters
|
|
3089
|
+
----------
|
|
3090
|
+
index (str): name of the index
|
|
3091
|
+
asset (str): name of the asset
|
|
3092
|
+
n_obs (int): number of rows to extract
|
|
3093
|
+
lag (int): lag to apply
|
|
3094
|
+
data_window (str): data window, e.g. '5y', '10y', '15y'
|
|
3095
|
+
show_plot (bool): If True, show plots
|
|
3096
|
+
save_path (str): local path for saving e.g r'C:/path/to/the/file/'
|
|
3097
|
+
save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
|
|
3098
|
+
aws_credentials (dict): dict with the aws credentials
|
|
3099
|
+
|
|
3100
|
+
Returns
|
|
3101
|
+
-------
|
|
3102
|
+
None
|
|
2030
3103
|
"""
|
|
2031
|
-
|
|
3104
|
+
|
|
2032
3105
|
self.index = index
|
|
2033
3106
|
self.asset = asset
|
|
2034
3107
|
self.n_obs = n_obs
|
|
2035
3108
|
self.data_window = data_window
|
|
2036
3109
|
self.lag = lag
|
|
2037
|
-
|
|
3110
|
+
|
|
2038
3111
|
self.show_plot = show_plot
|
|
2039
3112
|
self.save_path = save_path
|
|
2040
3113
|
self.save_aws = save_aws
|
|
2041
|
-
|
|
3114
|
+
|
|
2042
3115
|
def process_data(self):
|
|
2043
|
-
|
|
3116
|
+
"""
|
|
3117
|
+
using stock_eda_panel, get data and merge data
|
|
3118
|
+
|
|
3119
|
+
Parameters
|
|
3120
|
+
----------
|
|
3121
|
+
None
|
|
3122
|
+
|
|
3123
|
+
Returns
|
|
3124
|
+
-------
|
|
3125
|
+
None
|
|
3126
|
+
"""
|
|
2044
3127
|
index = stock_eda_panel(self.index, self.n_obs, self.data_window)
|
|
2045
3128
|
index.get_data()
|
|
2046
3129
|
index.df['shift'] = index.df.Close.shift(self.lag)
|
|
@@ -2050,39 +3133,51 @@ class analyse_index(stock_eda_panel):
|
|
|
2050
3133
|
asset.get_data()
|
|
2051
3134
|
asset.df['shift'] = asset.df.Close.shift(self.lag)
|
|
2052
3135
|
asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
|
|
2053
|
-
|
|
3136
|
+
|
|
2054
3137
|
df1 = index.df[['Date','index_return']]
|
|
2055
3138
|
df2 = asset.df[['Date','asset_return','Close']]
|
|
2056
3139
|
merger = df1.merge(df2, on = 'Date', how = 'inner')
|
|
2057
3140
|
merger.dropna(inplace = True)
|
|
2058
3141
|
self.merger_df = merger
|
|
2059
|
-
|
|
3142
|
+
|
|
2060
3143
|
def plot_betas(self,sample_size, offset, subsample_ts =False):
|
|
2061
|
-
|
|
3144
|
+
"""
|
|
3145
|
+
display beta analysis plot
|
|
3146
|
+
|
|
3147
|
+
Parameters
|
|
3148
|
+
----------
|
|
3149
|
+
sample_size (int): number of days or window size to calculate beta
|
|
3150
|
+
offset (int): step between consecutive windows
|
|
3151
|
+
subsample_ts (int): subsample size of data
|
|
3152
|
+
|
|
3153
|
+
Returns
|
|
3154
|
+
-------
|
|
3155
|
+
None
|
|
3156
|
+
"""
|
|
2062
3157
|
### extracting data
|
|
2063
3158
|
|
|
2064
3159
|
self.process_data()
|
|
2065
|
-
|
|
3160
|
+
|
|
2066
3161
|
### ploting analysis
|
|
2067
3162
|
figure, ax = plt.subplot_mosaic(
|
|
2068
3163
|
[["scatter_total", "scatter_sample",'ts','ts']],
|
|
2069
3164
|
layout="constrained",
|
|
2070
3165
|
figsize=(18, 5)
|
|
2071
3166
|
)
|
|
2072
|
-
|
|
3167
|
+
|
|
2073
3168
|
ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df.index_return)
|
|
2074
3169
|
b, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
|
|
2075
3170
|
ax['scatter_total'].plot(self.merger_df.asset_return, b*self.merger_df.asset_return+a, color='red')
|
|
2076
3171
|
|
|
2077
3172
|
ax['ts'].plot(self.merger_df.Date, self.merger_df.Close, color = 'grey', alpha = 0.3)
|
|
2078
|
-
|
|
3173
|
+
|
|
2079
3174
|
if subsample_ts:
|
|
2080
3175
|
self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
|
|
2081
|
-
|
|
3176
|
+
|
|
2082
3177
|
for i in range(0,len(self.merger_df)-sample_size,offset):
|
|
2083
3178
|
|
|
2084
3179
|
merger_ = self.merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
|
|
2085
|
-
x = merger_.index_return
|
|
3180
|
+
x = merger_.index_return
|
|
2086
3181
|
y = merger_.asset_return
|
|
2087
3182
|
b, a = np.polyfit(x,y, 1)
|
|
2088
3183
|
|
|
@@ -2098,10 +3193,10 @@ class analyse_index(stock_eda_panel):
 
             scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
             scalarmappaple.set_array(x)
-
+
         plt.title(f'{self.asset} using index: {self.index}')
         plt.colorbar(scalarmappaple)
-
+
         if self.show_plot:
             plt.show()
         if self.save_path:
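`normalize` and `colormap` are defined on lines outside this hunk, but the pattern is matplotlib's standard one: when points are colored by hand, the colorbar needs an explicit `ScalarMappable` carrying the same norm and colormap. A minimal sketch with assumed values for both:

```python
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

values = np.linspace(-0.05, 0.05, 20)   # stand-in for the quantity being colored
normalize = mcolors.Normalize(vmin=values.min(), vmax=values.max())
colormap = cm.viridis

# color the points by hand, then hand the colorbar a ScalarMappable
plt.scatter(range(len(values)), values, color=colormap(normalize(values)))
scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
scalarmappaple.set_array(values)        # gives the colorbar its value range
plt.colorbar(scalarmappaple, ax=plt.gca())
plt.show()
```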
@@ -2113,34 +3208,95 @@ class analyse_index(stock_eda_panel):
             upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
         if not self.show_plot:
             plt.close()
-
+
     def get_betas(self,subsample_ts=False):
-
+        """
+        get general beta and last sample beta, correlation score is included too
+
+        Parameters
+        ----------
+        subsample_ts (int): subsample size of data
+
+        Returns
+        -------
+        None
+        """
         self.process_data()
         general_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
         general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
-
+
         self.process_data()
         if subsample_ts:
             self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
         sample_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
         sample_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
-
+
         result = {
             'general_beta':general_beta,
             'general_r':general_r,
             'sample_beta':sample_beta,
             'sample_r':sample_r
         }
-
+
         self.states_result = result
-
+
 class evaluate_markets(analyse_index):
+    """
+    object that is going to evaluate multiple indexes
+
+    Attributes
+    ----------
+    stock_code : str
+        asset to assess
+    indexes : list
+        list of indexes
+    best_result : dict
+        best result beta and correlation
+
+    Methods
+    -------
+    process_data():
+        using stock_eda_panel, get data and merge data
+    plot_betas(sample_size=int, offset=int, subsample_ts=int):
+        display beta analysis plot
+    get_betas(subsample_ts=int)
+        get general beta and last sample beta, correlation score is included too
+    evaluate_best_market_fit(sample_size=int, offset=int,lag=int, n_obs=int, verbose=boolean, plot_best=boolean):
+        iterate every index in the index list and get results
+    """
+
     def __init__(self, stock_code, indexes):
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        stock_code (str): asset to assess
+        indexes (list): list of indexes
+
+        Returns
+        -------
+        None
+        """
         self.stock_code = stock_code
         self.indexes = indexes
     def evaluate_best_market_fit(self,sample_size, offset,lag= 3, n_obs = 3500, verbose = False, plot_best = False):
-
+        """
+        iterate every index in the index list and get results
+
+        Parameters
+        ----------
+        sample_size (int): sample size to get betas
+        offset (int): overlap size
+        lag (int): number of lags of the returns
+        n_obs (int): number of observations of the data extraction
+        verbose (boolean): if true, print results
+        plot_best (boolean): if true, display plot of the best result
+
+        Returns
+        -------
+        None
+        """
         results_dicts = dict()
         for index in self.indexes:
             betex = analyse_index(index = index,asset = self.stock_code,n_obs = n_obs, lag = lag)
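One detail worth flagging in `get_betas` above: `np.polyfit(x, y, 1)` returns the least-squares slope of `y` on `x`, which equals `cov(x, y) / var(x)`. Because the call passes `asset_return` as `x` and `index_return` as `y`, the reported beta is the slope of the index on the asset, the reverse of the orientation `plot_betas` uses inside its window loop and of the usual CAPM regression. A quick check of the slope/covariance identity on synthetic data (all names here are stand-ins, not package code):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=500)                        # stand-in for asset_return
y = 0.8 * x + rng.normal(scale=0.3, size=500)   # stand-in for index_return

# polyfit slope of y on x should match cov(x, y) / var(x) (same ddof in both)
slope, intercept = np.polyfit(x, y, 1)
cov_slope = np.cov(x, y, ddof=1)[0, 1] / np.var(x, ddof=1)
assert np.isclose(slope, cov_slope)
print(round(slope, 4), round(cov_slope, 4))
```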
@@ -2150,34 +3306,40 @@ class evaluate_markets(analyse_index):
         pd_result['gen_r2'] = pd_result.general_r ** 2
         pd_result['sampl_r2'] = pd_result.sample_r ** 2
         self.stat_results = pd_result
-
+
         best_result = pd_result.sort_values('gen_r2',ascending = False).head(2).sort_values('sampl_r2',ascending = False).head(1)
         best_fit_index = best_result.index.values[0]
-
+
         self.stat_results = self.stat_results.drop(columns = ['gen_r2','sampl_r2'])
-
+
         if verbose:
             print(best_result)
         if plot_best:
             betex = analyse_index(index = best_fit_index,asset = self.stock_code, n_obs = n_obs, lag = lag)
             betex.plot_betas(sample_size = sample_size, offset = offset, subsample_ts = False)
-
+
         self.best_result = best_result
-
+
 def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
-
-
-
-
-
-
-
-
+    '''
+    select relevant beta result data of a given asset
+
+    Parameters:
+        data_market (pd.DataFrame): dataframe of the market results
+        ticket_name (str): name of the asset
+        show_plot (bool): If true, plot results
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credentials (dict): dict of the aws credentials
+
+    Returns:
+        selection (pd.DataFrame): dataframe of the most relevant beta
+    '''
     all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
     all_betas['gen_r2'] = all_betas.general_r ** 2
     all_betas['sampl_r2'] = all_betas.sample_r ** 2
     selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
-
+
     if show_plot:
         print(selection)
     if save_path: