virgo-modules 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of virgo-modules might be problematic.
- virgo_modules/src/edge_utils.py +1 -2
- virgo_modules/src/hmm_utils.py +54 -2
- virgo_modules/src/re_utils.py +4 -1
- virgo_modules/src/ticketer_source.py +12 -837
- virgo_modules/src/transformer_utils.py +250 -0
- {virgo_modules-0.2.1.dist-info → virgo_modules-0.2.3.dist-info}/METADATA +1 -1
- virgo_modules-0.2.3.dist-info/RECORD +15 -0
- virgo_modules-0.2.1.dist-info/RECORD +0 -14
- {virgo_modules-0.2.1.dist-info → virgo_modules-0.2.3.dist-info}/LICENSE +0 -0
- {virgo_modules-0.2.1.dist-info → virgo_modules-0.2.3.dist-info}/WHEEL +0 -0
- {virgo_modules-0.2.1.dist-info → virgo_modules-0.2.3.dist-info}/top_level.txt +0 -0
@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM
 
 from plotly.colors import DEFAULT_PLOTLY_COLORS
 
-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from feature_engine.imputation import MeanMedianImputer
 
@@ -54,252 +53,8 @@ from .aws_utils import upload_file_to_aws
 
 import logging
 
-
-
-    """
-    Class that applies inverse hyperbolic sine for feature transformation.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    features : list
-        list of features to apply the transformation
-    prefix : str
-        prefix for the new features. is '' the features are overwrite
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, features, prefix = ''):
-        self.features = features
-        self.prefix = prefix
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.features:
-            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
-        return X
-
-class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies winsorirization of a feature for feature transformation.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    feature_configs : dict
-        dictionary of features and configurations. the configuration has high and low limits per feature
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, feature_configs):
-        self.feature_configs = feature_configs
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.feature_configs:
-            lower = self.feature_configs[feature]['min']
-            upper = self.feature_configs[feature]['max']
-            X[feature] = np.where( lower > X[feature], lower, X[feature])
-            X[feature] = np.where( upper < X[feature], upper, X[feature])
-        return X
-
-class FeatureSelector(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies selection of features.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, columns):
-        self.columns = columns
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X[self.columns]
-
-class FeaturesEntropy(BaseEstimator, TransformerMixin):
-    """
-    Class that creates a feature that calculate entropy for a given feature classes, but it might get some leackeage in the training set.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-    entropy_map: pd.DataFrame
-        dataframe of the map with the entropies per class
-    perc: float
-        percentage of the dates using for calculate the entropy map
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, features, target, feature_name = None, feature_type = 'discrete', perc = 0.5, default_null = 0.99):
-
-        self.features = features
-        self.feature_type = feature_type
-        self.target = target
-        self.perc = perc
-        self.default_null = default_null
-
-        if not feature_name:
-            self.feature_name = '_'.join(features)
-            self.feature_name = self.feature_name + '_' + target + '_' + feature_type
-        else:
-            self.feature_name = feature_name
-
-    def fit(self, X, y=None):
-
-        unique_dates = list(X['Date'].unique())
-        unique_dates.sort()
-
-        total_length = len(unique_dates)
-        cut = int(round(total_length*self.perc,0))
-        train_dates = unique_dates[:cut]
-        max_train_date = max(train_dates)
-
-        X_ = X[X['Date'] <= max_train_date].copy()
-        df = X_.join(y, how = 'left')
-
-        column_list = [f'{self.feature_type}_signal_{colx}' for colx in self.features]
-
-        df_aggr = (
-            df
-            .groupby(column_list, as_index = False)
-            .apply(
-                lambda x: pd.Series(
-                    dict(
-                        counts = x[self.target].count(),
-                        trues=(x[self.target] == 1).sum(),
-                        falses=(x[self.target] == 0).sum(),
-                    )
-                )
-            )
-            .assign(
-                trues_rate=lambda x: x['trues'] / x['counts']
-            )
-            .assign(
-                falses_rate=lambda x: x['falses'] / x['counts']
-            )
-            .assign(
-                log2_trues = lambda x: np.log2(1/x['trues_rate'])
-            )
-            .assign(
-                log2_falses = lambda x: np.log2(1/x['falses_rate'])
-            )
-            .assign(
-                comp1 = lambda x: x['trues_rate']*x['log2_trues']
-            )
-            .assign(
-                comp2 = lambda x: x['falses_rate']*x['log2_falses']
-            )
-            .assign(
-                class_entropy = lambda x: np.round(x['comp1']+x['comp2'],3)
-            )
-        )
-
-        self.column_list = column_list
-        self.entropy_map = (
-            df_aggr
-            [column_list+['class_entropy']]
-            .rename(columns = {'class_entropy': self.feature_name})
-            .copy()
-        )
-
-        del df, df_aggr, X_
-        return self
-
-    def transform(self, X, y=None):
-
-        X = X.join(self.entropy_map.set_index(self.column_list), on=self.column_list, how = 'left')
-        X[self.feature_name] = X[self.feature_name].fillna(self.default_null)
-        return X
-
-class signal_combiner(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies feature combination of binary signals.
-    this class is compatible with scikitlearn pipeline
-
-    ...
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-    drop : boolean
-        drop combining features
-    prefix_up : str
-        up prefix of the base feature
-    prefix_low : str
-        low prefix of the base feature
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
-        self.columns = columns
-        self.drop = drop
-        self.prefix_up = prefix_up
-        self.prefix_low = prefix_low
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for column in self.columns:
-            X['CombSignal_'+column] = np.where(
-                X[self.prefix_up + column] == 1,
-                1,
-                np.where(
-                    X[self.prefix_low + column] == 1,
-                    1,
-                    0
-                )
-            )
-            if self.drop:
-                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
-        return X
+from virgo_modules.src.hmm_utils import trainer_hmm
+from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
 
 def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
 
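For orientation, a minimal sketch of how the relocated transformers are consumed after this refactor, assuming they keep their 0.2.1 behaviour in virgo_modules.src.transformer_utils; the feature names and pipeline shape below are placeholders, not from the package:

# Hypothetical usage; the selector + imputer wiring mirrors the Pipeline removed further down.
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from virgo_modules.src.transformer_utils import FeatureSelector

features = ['feat_a', 'feat_b']  # placeholder feature names
pipe = Pipeline([
    ('selector', FeatureSelector(columns=features)),
    ('fillna', MeanMedianImputer(imputation_method='median', variables=features)),
])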
@@ -335,61 +90,6 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
     )
     return pipe
 
-def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
-    '''
-    calculate relevance score and summary report for hmm model
-
-    Parameters:
-    default_benchmark_sd (float): default value to bias SD for t calculation
-    t_threshold (float): alpha or z threshold for the normalized score
-
-    Returns:
-    mean_relevance (float): mean relevance score of the states
-    cluster_returns (pd.DataFrame): summary report of the analysis
-    number_relevant_states (int): number of relevant states
-    '''
-    ## legnths
-    cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
-    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
-    ## means
-    def quantile2(x):
-        return x.quantile(0.25)
-    def quantile3(x):
-        return x.quantile(0.75)
-
-    cluster_returns = data.groupby('hmm_feature').agg(
-        n_uniques = ('chain_id','nunique'),
-        n_obs = ('Date','count'),
-        cluster_ret_q25 = ('chain_return',quantile2),
-        cluster_ret_median = ('chain_return','median'),
-        cluster_ret_q75 = ('chain_return',quantile3),
-    )
-    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
-    cluster_returns['perc_dispute'] = np.where(
-        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
-        1,0
-    )
-    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
-    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
-    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
-    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
-    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
-    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
-    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
-    ## relevance or importance
-    # naive aproach
-    cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
-    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
-    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
-    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
-    mean_relevance = cluster_returns['abs_t_accpted'].mean()
-    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
-
-    return mean_relevance, cluster_returns, number_relevant_states
-
-
 class stock_eda_panel(object):
 
     """
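For reference, the per-state test that the removed states_relevance_score applied reduces to the statistic below; this is a reconstruction from the removed t_calc line, with the author's biased-SD denominator kept as written:

import numpy as np

def state_t_score(ret_median, iqr, n_obs, default_benchmark_sd=0.00003):
    # t = median / sqrt(iqr/n + sd0/n); a state counted as relevant when |t| > t_threshold
    return (ret_median - 0) / np.sqrt(iqr / n_obs + default_benchmark_sd / n_obs)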
@@ -800,7 +500,6 @@ class stock_eda_panel(object):
         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
 
-
     def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):
 
         """
@@ -847,7 +546,6 @@ class stock_eda_panel(object):
         plt.axhline(y=0, color='grey', linestyle='--')
         plt.show()
 
-
     def outlier_plot(self, zlim, plot = False, save_features = False):
 
         """
@@ -1010,62 +708,6 @@ class stock_eda_panel(object):
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
 
-    #######################
-    #### to be deprecated ####
-    def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
-
-        self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
-        self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
-
-        self.ma1_column = f'MA_{ma1}'
-        self.ma2_column = f'MA_{ma2}'
-        self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
-
-        self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
-        mean_ = self.df['norm_MA_spread'].mean()
-        self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
-
-        self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
-        self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
-
-        self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
-        self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
-        ### ploting purposes
-        self.df[f"Roll_mean_{ma1}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
-        )
-        self.df[f"Roll_mean_{ma2}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
-        )
-
-
-        print('--------------------------------------------------------------------')
-        if save_features:
-            self.features.append('MA_spread')
-            self.signals.append('signal_low_MA_spread')
-            self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
-        if plot:
-
-            fig, axs = plt.subplots(1, 3,figsize=(21,4))
-
-            axs[0].plot(self.df['Date'],self.df['norm_MA_spread'])
-            axs[0].plot(self.df['Date'],self.df['upper_MA_spread'], linestyle='--')
-            axs[0].plot(self.df['Date'],self.df['lower_MA_spread'], linestyle='--')
-            axs[0].set_title('MA_spread series')
-
-            plot_acf(self.df['MA_spread'].dropna(),lags=25, ax=axs[1])
-            axs[1].set_title('acf MA_spread series')
-
-            plot_pacf(self.df['MA_spread'].dropna(),lags=25, ax=axs[2])
-            axs[2].set_title('acf MA_spread series')
-            plt.show()
-    ##################################################
-
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
         """
         perform relative moving average features, one for short term and another for long/mid term
@@ -1248,36 +890,6 @@ class stock_eda_panel(object):
 
             plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
-        # negative countiing and rolling countingng
-        self.df['RetClose'] = self.df['Close'].pct_change()
-        self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
-        self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
-
-        mean = self.df['roll_pos_counting'].mean()
-        std = self.df['roll_pos_counting'].std()
-        self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std
-
-        self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
-        self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
-        if save_features:
-            self.features.append('roll_pos_counting')
-            self.signals.append('signal_up_roll_pos_counting')
-            self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
-        if plot:
-            fig = plt.figure(figsize = (10,4))
-            plt.plot(self.df['Date'],self.df.norm_counting)
-            plt.axhline(y=threshold, color='grey', linestyle='--')
-            plt.axhline(y=-threshold, color='grey', linestyle='--')
-            plt.show()
-    #######################
-
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
         """
         perform negative and positive return counting in a given rolling time window
@@ -1317,45 +929,6 @@ class stock_eda_panel(object):
            plt.plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
            plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
-
-        self.df["Range"] = self.df["High"] / self.df["Low"] - 1
-        self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
-        self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
-        self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
-
-        mean_ = self.df['norm_dist_range'].mean()
-        self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
-        self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
-
-        self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
-        self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
-        if save_features:
-            self.features.append('dist_range')
-            self.signals.append('signal_up_dist_range')
-            self.signals.append('signal_low_dist_range')
-            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
-
-        if plot:
-            fig, axs = plt.subplots(2, 2,figsize=(17,11))
-
-            axs[0,0].plot(self.df['Range'])
-            axs[0,0].set_title('range')
-
-            axs[0,1].plot(self.df['Avg_range'])
-            axs[0,1].set_title('Avg_range')
-
-            axs[1,0].plot(self.df['up_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['low_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['norm_dist_range'])
-            axs[1,0].set_title('norm_dist_range')
-    #######################
-
     def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
         """
         perform relative spread of opening and closing price
@@ -1399,42 +972,6 @@ class stock_eda_panel(object):
            axs[1].plot(self.df[f'norm_{feature_name}'])
            axs[1].set_title(f'norm_{feature_name}')
 
-    #######################
-    #### to be deprecated ####
-    def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
-
-        rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df['RSI'] = rsi
-        self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
-
-        mean = self.df['RSI_ret'].mean()
-        std = self.df['RSI_ret'].std()
-        self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
-        self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
-        self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
-
-        if save_features:
-            self.features.append('RSI_ret')
-            self.signals.append('signal_up_RSI_ret')
-            self.signals.append('signal_low_RSI_ret')
-            self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df.norm_RSI_ret)
-            axs[0].axhline(y=threshold, color='grey', linestyle='--')
-            axs[0].axhline(y=-threshold, color='grey', linestyle='--')
-
-            plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf RSI_ret')
-
-            plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf RSI_ret')
-
-            fig.show()
-    #######################
-
     def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
         """
         perform relative strength index
@@ -1462,51 +999,6 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)
 
-    #######################
-    #### to be deprecated ####
-    def days_features(self, window_day, limit, plot = False, save_features = False):
-
-        self.df['dow'] = self.df.Date.dt.dayofweek
-        self.df['dow'] = self.df['dow'].astype('str')
-
-        self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
-
-        mean = self.df['target_mean_input'].mean()
-        std = self.df['target_mean_input'].std()
-
-        self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
-        mean_ = self.df['norm_dow_input'].mean()
-        self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()
-
-        self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
-        self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
-
-        self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
-        self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
-
-        if save_features:
-
-            self.features.append('target_mean_input')
-            self.signals.append('signal_up_target_mean_input')
-            self.signals.append('signal_low_target_mean_input')
-            self.settings_days_features = {'window_day':window_day, 'limit':limit}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df['norm_dow_input'])
-            axs[0].plot(self.df['up_dow_input'], linestyle='--')
-            axs[0].plot(self.df['low_dow_input'], linestyle='--')
-
-            plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf day feature')
-
-            plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf day feature')
-
-            fig.show()
-    #######################
-
     def days_features_bands(self, window, threshold, plot = False, save_features = False):
         """
         compute mean returns for a given day of the week in a window scope per day
@@ -1539,62 +1031,6 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)
 
-    #######################
-    #### to be deprecated ####
-    def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
-
-        self.df['log_Volume'] = np.log(self.df['Volume'])
-        self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
-
-        self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
-        mean_ = self.df['norm_ret_log_Volume'].mean()
-        self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
-        self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
-
-        self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
-        self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
-
-        if save_features:
-            self.features.append('ret_log_Volume')
-            self.signals.append('signal_up_ret_log_Volume')
-            self.signals.append('signal_low_ret_log_Volume')
-            self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
-        if plot:
-            fig, axs = plt.subplots(3, 2,figsize=(11,13))
-            axs[0,0].plot(self.df.Date, self.df.Volume)
-            axs[0,0].set_title('Volume')
-            axs[0,1].plot(self.df.Date, self.df.log_Volume)
-            axs[0,1].set_title('log Volume')
-
-            plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
-            axs[1,0].set_title('acf log_Volume')
-            plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
-            axs[1,1].set_title('pacf log_Volume')
-
-            plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
-            axs[2,0].set_title('acf ret_log_Volume')
-            plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
-            axs[2,1].set_title('pacf ret_log_Volume')
-
-            plt.show()
-
-            print('--------------------------------------------------------------')
-
-            fig, axs = plt.subplots(1, 2,figsize=(10,4))
-
-            axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
-            axs[0].set_title('norm_ret_log_Volume')
-
-            axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
-            axs[1].set_title('std_norm_ret_log_Volume')
-
-            plt.show()
-    #######################
-
     def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
         """
         compute feature of thrading volumes
@@ -1968,14 +1404,12 @@ class stock_eda_panel(object):
         self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
         self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
 
-        ### returns using the
-        self.df['
-        self.df['
-        self.df
+        ### returns using the windowsseeds
+        self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+        self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
+        self.df = self.df.drop(columns = ['breack'])
 
-
-
-    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
+    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
         """
         create or use a hmm model
 
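A minimal illustration of the per-chain lagged return added above, run on a toy frame; the column names follow the diff, while the data and lag value are made up:

import pandas as pd

toy = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=6),
    'Close': [10.0, 10.5, 11.0, 10.8, 11.2, 11.5],
    'chain_id': [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
})
lag_returns = 1  # placeholder lag
toy['lag_chain_close'] = toy.sort_values(by=['Date']).groupby(['chain_id'])['Close'].shift(lag_returns)
toy['chain_return'] = (toy['Close'] / toy['lag_chain_close'] - 1) * 100  # NaN at each chain start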
@@ -1986,6 +1420,7 @@ class stock_eda_panel(object):
             test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
             seed (int): seed for the model inizialization
             lag_returns_state (int) : lags for returns of the state
+            corr_threshold (float): correlation threshold for initial feature selection
             plot (boolean): True to display hmm states analysis
             save_features (boolean): True to save features and configurations
             model (obj): if provided, no model will be trainend and the provided model will be used to get hmm features
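A hypothetical call reflecting the new signature; the panel object, feature names, and sizes below are placeholders, not values from the package:

# panel is assumed to be an already-populated stock_eda_panel instance
panel.cluster_hmm_analysis(
    n_clusters=4,
    features_hmm=['feat_a', 'feat_b'],
    test_data_size=250,
    seed=42,
    lag_returns_state=7,
    corr_threshold=0.75,  # new in 0.2.3, forwarded to trainer_hmm
    save_features=True,
)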
@@ -1997,16 +1432,12 @@ class stock_eda_panel(object):
         if not model:
 
             df_new = self.df
-            pipeline_hmm = Pipeline([
-                ('selector', FeatureSelector(columns=features_hmm)),
-                ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-                ('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
-            ])
             data_train = df_new.iloc[:-test_data_size,:]
             data_test = df_new.iloc[-test_data_size:,:]
 
-
-
+            th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
+            th.train()
+            pipeline_hmm = th.hmm_model
         self.model_hmm = pipeline_hmm
         self.test_data_hmm = data_test
 
@@ -2034,7 +1465,7 @@ class stock_eda_panel(object):
         if save_features:
             self.features.append('hmm_feature')
             self.features.append('hmm_chain_order')
-            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
+            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }
 
         if plot:
 
@@ -2248,53 +1679,6 @@ class stock_eda_panel(object):
            plt.legend()
            plt.show()
 
-    ### deprecated ############################
-    def create_strategy(self, favourable_states):
-
-        test_data = self.test_data_hmm
-        # add MA signal
-        test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
-        test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
-
-        # add hnn signal
-
-        test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
-
-        ## combined signals
-        test_data['main_signal'] = 0
-        test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
-        test_data['main_signal'] = test_data['main_signal'].shift(1)
-
-        ## benchmark return
-        test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
-        test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
-        test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
-
-        ## strategy return
-        # test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
-        test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
-        test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
-        test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
-        test_data.dropna(inplace = True)
-
-        bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
-        strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
-
-        bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
-        strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
-
-        print(f'returns benchmark {bench_rets}%')
-        print(f'returns strategy {strat_rets}%')
-        print('-----------------------------')
-        print(f'sharpe benchmark {bench_sharpe}')
-        print(f'sharpe strategy {strat_sharpe}')
-
-        fig = plt.figure(figsize = (10,4))
-        plt.plot(test_data['bench_prod_exp'])
-        plt.plot(test_data['strat_prod_exp'])
-        self.settings_hmm_states = {'favourable_states':favourable_states}
-    ################################################
-
     def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
         """
         display analysis plot hmm model
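The removed backtest above boils down to compounding log returns; a condensed, self-contained restatement for reference, with made-up prices:

import numpy as np
import pandas as pd

prices = pd.Series([100.0, 101.0, 99.5, 102.0, 103.5])
lrets_bench = np.log(prices / prices.shift(1))     # daily log returns
bench_prod_exp = np.exp(lrets_bench.cumsum()) - 1  # cumulative simple return path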
@@ -2582,214 +1966,6 @@ class produce_model:
         self.pipeline.fit(self.X_train, self.y_train)
         self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
 
-class hmm_feature_selector():
-    """
-    class that is going to train hmm models to perform feature selection
-
-    Attributes
-    ----------
-    data : pd.DataFrame
-        symbol of the asset
-    n_clusters : int
-        number of clusters to search
-    init_features_hmm : list
-        list of features to consider in the search
-    test_data_size :int
-        test data size, meaning that the remaining is going to be used as training data
-    select_n_features : int
-        number of features to select
-    n_trials : int
-        total number of trials per combination
-    limit_search : int
-        limit number of combinations
-    default_benchmark_sd : float
-        default value to bias standard deviation
-    t_threshold : float
-        alpha or z threshold
-    pipeline_hmm: obj
-        pipeline object of the hmm model
-    features_used_in_model:list
-        features in model
-    train_model(features_hmm=list):
-        train hmm model
-    feature_combinations: list
-        list of combination of features
-    mean_relevance: float
-        relevance score of the model
-    best_features: list
-        list of best performing features
-
-    Methods
-    -------
-    split_data():
-        split data in train and test
-    train_model(features_hmm=list):
-        train hmm model
-    feature_list_generator():
-        perform combination of features
-    get_error():
-        get error or score of a given model using relevance score
-    execute_selector():
-        select the best combination of features
-    """
-    def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
-        """
-        Initialize object
-
-        Parameters
-        ----------
-        data (pd.DataFrame): data
-        n_clusters (int): number of clusters to search
-        init_features_hmm (list): list of features to consider in the search
-        test_data_siz:(int: test data size, meaning that the remaining is going to be used as training data
-        select_n_features (int): number of features to select
-        n_trials (int): total number of trials per combination
-        limit_search (int): limit number of combinations
-        default_benchmark_sd (float): default value to bias standard deviation
-        t_threshold (float): alpha or z threshold
-
-        Returns
-        -------
-        None
-        """
-        self.data = data.copy()
-        self.n_clusters = n_clusters
-        self.init_features_hmm = init_features_hmm
-        self.test_data_size = test_data_size
-        self.select_n_features = select_n_features
-        self.n_trials = n_trials
-        self.limit_search= limit_search
-        self.default_benchmark_sd = default_benchmark_sd
-        self.t_threshold = t_threshold
-
-    def split_data(self):
-        """
-        split data in train and test
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.data_train = self.data.iloc[:-self.test_data_size,:]
-        self.data_test = self.data.iloc[-self.test_data_size:,:]
-
-    def train_model(self,features_hmm):
-        """
-        train hmm model
-
-        Parameters
-        ----------
-        features_hmm (list): list of features to be selected in the model
-
-        Returns
-        -------
-        None
-        """
-        pipeline_hmm = Pipeline([
-            ('selector', FeatureSelector(columns=features_hmm)),
-            ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-            ('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
-        ])
-
-        self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
-        self.features_used_in_model = features_hmm
-
-    def feature_list_generator(self):
-        """
-        perform combination of features
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
-        feature_combinations = list(map(list, feature_combinations))
-
-        self.feature_combinations = feature_combinations
-
-    def get_error(self):
-        """
-        get error or score of a given model using relevance score
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.data_train_ = self.data_train.copy()
-
-        self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
-        self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
-
-        ## indexing chains
-        self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
-        self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
-        self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
-        self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
-        self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
-        self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
-
-        ### returns using the first element in a chain
-        self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
-        self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
-        self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
-
-        self.data_train_ = self.data_train_.drop(columns = ['first'])
-
-        mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
-        self.mean_relevance = mean_relevance
-
-    def execute_selector(self):
-        """
-        select the best combination of features
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.split_data()
-        self.feature_list_generator()
-        maxi = -1
-        print(f'it is expected {len(self.feature_combinations)} combinations')
-        feature_results = dict()
-
-        if self.limit_search:
-            print(f' taking just {self.limit_search} combinations')
-            maxi = self.limit_search
-
-        for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
-
-            feature_results[f'group_{i}'] = {
-                'features':list(features_hmm),
-                'relevances':list()
-            }
-
-            for _ in range(self.n_trials):
-                try:
-                    self.train_model(features_hmm)
-                    self.get_error()
-                    feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
-                except:
-                    print('error')
-            feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
-        self.feature_results = feature_results
-        self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
-
 class analyse_index(stock_eda_panel):
     """
     class that is going to train hmm models to perform feature selection
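The removed selector's search-space step, restated as a self-contained snippet for reference; the feature names are placeholders:

from itertools import combinations

init_features_hmm = ['f1', 'f2', 'f3', 'f4']
select_n_features = 2
feature_combinations = list(map(list, set(combinations(init_features_hmm, select_n_features))))
# each candidate set was then fitted n_trials times and ranked by mean relevance
print(len(feature_combinations))  # 6 candidate feature sets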
@@ -3025,7 +2201,6 @@ class analyse_index(stock_eda_panel):
 
         self.states_result = result
 
-
 def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
     '''
    select relevant beta result data of a given asset