virgo-modules 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of virgo-modules might be problematic.

@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM
 
 from plotly.colors import DEFAULT_PLOTLY_COLORS
 
-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from feature_engine.imputation import MeanMedianImputer
 
@@ -54,252 +53,8 @@ from .aws_utils import upload_file_to_aws
 
 import logging
 
-class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies inverse hyperbolic sine for feature transformation.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    features : list
-        list of features to apply the transformation
-    prefix : str
-        prefix for the new features. is '' the features are overwrite
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, features, prefix = ''):
-        self.features = features
-        self.prefix = prefix
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.features:
-            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
-        return X
-
-class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies winsorirization of a feature for feature transformation.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    feature_configs : dict
-        dictionary of features and configurations. the configuration has high and low limits per feature
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, feature_configs):
-        self.feature_configs = feature_configs
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.feature_configs:
-            lower = self.feature_configs[feature]['min']
-            upper = self.feature_configs[feature]['max']
-            X[feature] = np.where( lower > X[feature], lower, X[feature])
-            X[feature] = np.where( upper < X[feature], upper, X[feature])
-        return X
-
-class FeatureSelector(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies selection of features.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, columns):
-        self.columns = columns
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X[self.columns]
-
-class FeaturesEntropy(BaseEstimator, TransformerMixin):
-    """
-    Class that creates a feature that calculate entropy for a given feature classes, but it might get some leackeage in the training set.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-    entropy_map: pd.DataFrame
-        dataframe of the map with the entropies per class
-    perc: float
-        percentage of the dates using for calculate the entropy map
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, features, target, feature_name = None, feature_type = 'discrete', perc = 0.5, default_null = 0.99):
-
-        self.features = features
-        self.feature_type = feature_type
-        self.target = target
-        self.perc = perc
-        self.default_null = default_null
-
-        if not feature_name:
-            self.feature_name = '_'.join(features)
-            self.feature_name = self.feature_name + '_' + target + '_' + feature_type
-        else:
-            self.feature_name = feature_name
-
-    def fit(self, X, y=None):
-
-        unique_dates = list(X['Date'].unique())
-        unique_dates.sort()
-
-        total_length = len(unique_dates)
-        cut = int(round(total_length*self.perc,0))
-        train_dates = unique_dates[:cut]
-        max_train_date = max(train_dates)
-
-        X_ = X[X['Date'] <= max_train_date].copy()
-        df = X_.join(y, how = 'left')
-
-        column_list = [f'{self.feature_type}_signal_{colx}' for colx in self.features]
-
-        df_aggr = (
-            df
-            .groupby(column_list, as_index = False)
-            .apply(
-                lambda x: pd.Series(
-                    dict(
-                        counts = x[self.target].count(),
-                        trues=(x[self.target] == 1).sum(),
-                        falses=(x[self.target] == 0).sum(),
-                    )
-                )
-            )
-            .assign(
-                trues_rate=lambda x: x['trues'] / x['counts']
-            )
-            .assign(
-                falses_rate=lambda x: x['falses'] / x['counts']
-            )
-            .assign(
-                log2_trues = lambda x: np.log2(1/x['trues_rate'])
-            )
-            .assign(
-                log2_falses = lambda x: np.log2(1/x['falses_rate'])
-            )
-            .assign(
-                comp1 = lambda x: x['trues_rate']*x['log2_trues']
-            )
-            .assign(
-                comp2 = lambda x: x['falses_rate']*x['log2_falses']
-            )
-            .assign(
-                class_entropy = lambda x: np.round(x['comp1']+x['comp2'],3)
-            )
-        )
-
-        self.column_list = column_list
-        self.entropy_map = (
-            df_aggr
-            [column_list+['class_entropy']]
-            .rename(columns = {'class_entropy': self.feature_name})
-            .copy()
-        )
-
-        del df, df_aggr, X_
-        return self
-
-    def transform(self, X, y=None):
-
-        X = X.join(self.entropy_map.set_index(self.column_list), on=self.column_list, how = 'left')
-        X[self.feature_name] = X[self.feature_name].fillna(self.default_null)
-        return X
-
-class signal_combiner(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies feature combination of binary signals.
-    this class is compatible with scikitlearn pipeline
-
-    ...
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-    drop : boolean
-        drop combining features
-    prefix_up : str
-        up prefix of the base feature
-    prefix_low : str
-        low prefix of the base feature
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
-        self.columns = columns
-        self.drop = drop
-        self.prefix_up = prefix_up
-        self.prefix_low = prefix_low
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for column in self.columns:
-            X['CombSignal_'+column] = np.where(
-                X[self.prefix_up + column] == 1,
-                1,
-                np.where(
-                    X[self.prefix_low + column] == 1,
-                    1,
-                    0
-                )
-            )
-            if self.drop:
-                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
-        return X
+from virgo_modules.src.hmm_utils import trainer_hmm
+from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
 
 def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
 
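The transformer classes removed above now live outside this module: the new import lines confirm that signal_combiner and FeatureSelector are provided by virgo_modules.src.transformer_utils, and trainer_hmm by virgo_modules.src.hmm_utils. A minimal migration sketch for downstream code, assuming the relocated classes keep the constructor signatures shown in the removed block (the column names are illustrative):

    from sklearn.pipeline import Pipeline
    from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector

    # select the model columns, then collapse each signal_up_X / signal_low_X pair
    # into a single CombSignal_X column, dropping the originals
    pipe = Pipeline([
        ('selector', FeatureSelector(columns=['log_return', 'signal_up_RSI', 'signal_low_RSI'])),
        ('combiner', signal_combiner(columns=['RSI'], drop=True)),
    ])
    frame_out = pipe.fit_transform(frame_in)  # frame_in: a pandas DataFrame with those columns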
@@ -335,61 +90,6 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
     )
     return pipe
 
-def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
-    '''
-    calculate relevance score and summary report for hmm model
-
-    Parameters:
-    default_benchmark_sd (float): default value to bias SD for t calculation
-    t_threshold (float): alpha or z threshold for the normalized score
-
-    Returns:
-    mean_relevance (float): mean relevance score of the states
-    cluster_returns (pd.DataFrame): summary report of the analysis
-    number_relevant_states (int): number of relevant states
-    '''
-    ## legnths
-    cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
-    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
-    ## means
-    def quantile2(x):
-        return x.quantile(0.25)
-    def quantile3(x):
-        return x.quantile(0.75)
-
-    cluster_returns = data.groupby('hmm_feature').agg(
-        n_uniques = ('chain_id','nunique'),
-        n_obs = ('Date','count'),
-        cluster_ret_q25 = ('chain_return',quantile2),
-        cluster_ret_median = ('chain_return','median'),
-        cluster_ret_q75 = ('chain_return',quantile3),
-    )
-    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
-    cluster_returns['perc_dispute'] = np.where(
-        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
-        1,0
-    )
-    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
-    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
-    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
-    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
-    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
-    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
-    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
-    ## relevance or importance
-    # naive aproach
-    cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
-    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
-    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
-    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
-    mean_relevance = cluster_returns['abs_t_accpted'].mean()
-    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
-
-    return mean_relevance, cluster_returns, number_relevant_states
-
-
 class stock_eda_panel(object):
 
     """
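The removed states_relevance_score ranked each HMM state by a t-like statistic on its chain returns, t = median / sqrt(iqr/n + sd0/n), accepting a state when |t| exceeds t_threshold. A condensed, self-contained sketch of that rule on a toy per-state summary (column names follow the removed code; the numbers are made up):

    import pandas as pd

    # one row per HMM state, as produced by the removed groupby aggregation
    summary = pd.DataFrame({
        'cluster_ret_median': [0.8, -0.1],
        'iqr': [1.2, 2.5],
        'n_obs': [120, 90],
    })
    default_benchmark_sd, t_threshold = 0.00003, 2
    summary['t_calc'] = summary['cluster_ret_median'] / (
        summary['iqr'] / summary['n_obs'] + default_benchmark_sd / summary['n_obs']
    ) ** 0.5
    summary['relevant'] = summary['t_calc'].abs() > t_threshold  # state 0 passes, state 1 does not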
@@ -800,7 +500,6 @@ class stock_eda_panel(object):
         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
 
-
     def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):
 
         """
@@ -847,7 +546,6 @@ class stock_eda_panel(object):
         plt.axhline(y=0, color='grey', linestyle='--')
         plt.show()
 
-
     def outlier_plot(self, zlim, plot = False, save_features = False):
 
         """
@@ -1010,62 +708,6 @@ class stock_eda_panel(object):
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
 
-    #######################
-    #### to be deprecated ####
-    def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
-
-        self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
-        self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
-
-        self.ma1_column = f'MA_{ma1}'
-        self.ma2_column = f'MA_{ma2}'
-        self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
-
-        self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
-        mean_ = self.df['norm_MA_spread'].mean()
-        self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
-
-        self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
-        self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
-
-        self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
-        self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
-        ### ploting purposes
-        self.df[f"Roll_mean_{ma1}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
-        )
-        self.df[f"Roll_mean_{ma2}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
-        )
-
-
-        print('--------------------------------------------------------------------')
-        if save_features:
-            self.features.append('MA_spread')
-            self.signals.append('signal_low_MA_spread')
-            self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
-        if plot:
-
-            fig, axs = plt.subplots(1, 3,figsize=(21,4))
-
-            axs[0].plot(self.df['Date'],self.df['norm_MA_spread'])
-            axs[0].plot(self.df['Date'],self.df['upper_MA_spread'], linestyle='--')
-            axs[0].plot(self.df['Date'],self.df['lower_MA_spread'], linestyle='--')
-            axs[0].set_title('MA_spread series')
-
-            plot_acf(self.df['MA_spread'].dropna(),lags=25, ax=axs[1])
-            axs[1].set_title('acf MA_spread series')
-
-            plot_pacf(self.df['MA_spread'].dropna(),lags=25, ax=axs[2])
-            axs[2].set_title('acf MA_spread series')
-            plt.show()
-    ##################################################
-
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
         """
         perform relative moving average features, one for short term and another for long/mid term
@@ -1248,36 +890,6 @@ class stock_eda_panel(object):
 
         plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
-        # negative countiing and rolling countingng
-        self.df['RetClose'] = self.df['Close'].pct_change()
-        self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
-        self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
-
-        mean = self.df['roll_pos_counting'].mean()
-        std = self.df['roll_pos_counting'].std()
-        self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std
-
-        self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
-        self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
-        if save_features:
-            self.features.append('roll_pos_counting')
-            self.signals.append('signal_up_roll_pos_counting')
-            self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
-        if plot:
-            fig = plt.figure(figsize = (10,4))
-            plt.plot(self.df['Date'],self.df.norm_counting)
-            plt.axhline(y=threshold, color='grey', linestyle='--')
-            plt.axhline(y=-threshold, color='grey', linestyle='--')
-            plt.show()
-    #######################
-
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
         """
         perform negative and positive return counting in a given rolling time window
@@ -1317,45 +929,6 @@ class stock_eda_panel(object):
         plt.plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
         plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
-
-        self.df["Range"] = self.df["High"] / self.df["Low"] - 1
-        self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
-        self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
-        self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
-
-        mean_ = self.df['norm_dist_range'].mean()
-        self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
-        self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
-
-        self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
-        self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
-        if save_features:
-            self.features.append('dist_range')
-            self.signals.append('signal_up_dist_range')
-            self.signals.append('signal_low_dist_range')
-            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
-
-        if plot:
-            fig, axs = plt.subplots(2, 2,figsize=(17,11))
-
-            axs[0,0].plot(self.df['Range'])
-            axs[0,0].set_title('range')
-
-            axs[0,1].plot(self.df['Avg_range'])
-            axs[0,1].set_title('Avg_range')
-
-            axs[1,0].plot(self.df['up_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['low_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['norm_dist_range'])
-            axs[1,0].set_title('norm_dist_range')
-    #######################
-
     def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
         """
         perform relative spread of opening and closing price
@@ -1399,42 +972,6 @@ class stock_eda_panel(object):
         axs[1].plot(self.df[f'norm_{feature_name}'])
         axs[1].set_title(f'norm_{feature_name}')
 
-    #######################
-    #### to be deprecated ####
-    def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
-
-        rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df['RSI'] = rsi
-        self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
-
-        mean = self.df['RSI_ret'].mean()
-        std = self.df['RSI_ret'].std()
-        self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
-        self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
-        self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
-
-        if save_features:
-            self.features.append('RSI_ret')
-            self.signals.append('signal_up_RSI_ret')
-            self.signals.append('signal_low_RSI_ret')
-            self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df.norm_RSI_ret)
-            axs[0].axhline(y=threshold, color='grey', linestyle='--')
-            axs[0].axhline(y=-threshold, color='grey', linestyle='--')
-
-            plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf RSI_ret')
-
-            plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf RSI_ret')
-
-            fig.show()
-    #######################
-
     def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
         """
         perform relative strength index
@@ -1462,51 +999,6 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)
 
-    #######################
-    #### to be deprecated ####
-    def days_features(self, window_day, limit, plot = False, save_features = False):
-
-        self.df['dow'] = self.df.Date.dt.dayofweek
-        self.df['dow'] = self.df['dow'].astype('str')
-
-        self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
-
-        mean = self.df['target_mean_input'].mean()
-        std = self.df['target_mean_input'].std()
-
-        self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
-        mean_ = self.df['norm_dow_input'].mean()
-        self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()
-
-        self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
-        self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
-
-        self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
-        self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
-
-        if save_features:
-
-            self.features.append('target_mean_input')
-            self.signals.append('signal_up_target_mean_input')
-            self.signals.append('signal_low_target_mean_input')
-            self.settings_days_features = {'window_day':window_day, 'limit':limit}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df['norm_dow_input'])
-            axs[0].plot(self.df['up_dow_input'], linestyle='--')
-            axs[0].plot(self.df['low_dow_input'], linestyle='--')
-
-            plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf day feature')
-
-            plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf day feature')
-
-            fig.show()
-    #######################
-
     def days_features_bands(self, window, threshold, plot = False, save_features = False):
         """
         compute mean returns for a given day of the week in a window scope per day
@@ -1539,62 +1031,6 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)
 
-    #######################
-    #### to be deprecated ####
-    def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
-
-        self.df['log_Volume'] = np.log(self.df['Volume'])
-        self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
-
-        self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
-        mean_ = self.df['norm_ret_log_Volume'].mean()
-        self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
-        self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
-
-        self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
-        self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
-
-        if save_features:
-            self.features.append('ret_log_Volume')
-            self.signals.append('signal_up_ret_log_Volume')
-            self.signals.append('signal_low_ret_log_Volume')
-            self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
-        if plot:
-            fig, axs = plt.subplots(3, 2,figsize=(11,13))
-            axs[0,0].plot(self.df.Date, self.df.Volume)
-            axs[0,0].set_title('Volume')
-            axs[0,1].plot(self.df.Date, self.df.log_Volume)
-            axs[0,1].set_title('log Volume')
-
-            plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
-            axs[1,0].set_title('acf log_Volume')
-            plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
-            axs[1,1].set_title('pacf log_Volume')
-
-            plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
-            axs[2,0].set_title('acf ret_log_Volume')
-            plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
-            axs[2,1].set_title('pacf ret_log_Volume')
-
-            plt.show()
-
-            print('--------------------------------------------------------------')
-
-            fig, axs = plt.subplots(1, 2,figsize=(10,4))
-
-            axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
-            axs[0].set_title('norm_ret_log_Volume')
-
-            axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
-            axs[1].set_title('std_norm_ret_log_Volume')
-
-            plt.show()
-    #######################
-
     def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
         """
         compute feature of thrading volumes
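All six methods removed between here and the @@ -1010 hunk were already fenced with '#### to be deprecated ####' markers, and each has a kept counterpart in stock_eda_panel. A migration sketch; the constructor arguments and the data-loading call are hypothetical, and the parameter values are illustrative:

    panel = stock_eda_panel('AAPL', 1500)   # hypothetical constructor arguments
    panel.get_data()                        # hypothetical data-loading step

    panel.relative_spread_MA(ma1=20, ma2=50, threshold=1.95)        # replaces spread_MA
    panel.bidirect_count_feature(rolling_window=30, threshold=1.8)  # replaces get_count_feature
    panel.get_relative_range_feature(window=30, threshold=1.8)      # replaces get_range_feature
    panel.rsi_feature_improved(window=14, threshold=1.8)            # replaces rsi_feature
    panel.days_features_bands(window=30, threshold=1.8)             # replaces days_features
    panel.analysis_smooth_volume(window=30, threshold=1.8)          # replaces analysis_volume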
@@ -1968,14 +1404,12 @@ class stock_eda_panel(object):
         self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
         self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
 
-        ### returns using the first element in a chain
-        self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
-        self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
-        self.df['chain_return'] = (self.df['Close']/self.df['first'] -1) * 100
+        ### returns using the windowsseeds
+        self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+        self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
+        self.df = self.df.drop(columns = ['breack'])
 
-        self.df = self.df.drop(columns = ['breack','first'])
-
-    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
+    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
         """
         create or use a hmm model
 
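The hunk above changes how chain returns are computed: previously each return was measured against the first close of its state chain, so long regimes accumulated ever-larger values; now it is a return over a close lagged by lag_returns within the same chain. A standalone sketch of the new computation, where df stands in for the panel's internal frame:

    lag_returns = 7  # passed down from cluster_hmm_analysis's lag_returns_state argument
    df = df.sort_values('Date')
    df['lag_chain_close'] = df.groupby('chain_id')['Close'].shift(lag_returns)
    df['chain_return'] = (df['Close'] / df['lag_chain_close'] - 1) * 100  # percent return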
@@ -1986,6 +1420,7 @@ class stock_eda_panel(object):
         test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
         seed (int): seed for the model inizialization
         lag_returns_state (int) : lags for returns of the state
+        corr_threshold (float): correlation threshold for initial feature selection
         plot (boolean): True to display hmm states analysis
         save_features (boolean): True to save features and configurations
         model (obj): if provided, no model will be trainend and the provided model will be used to get hmm features
@@ -1997,16 +1432,12 @@ class stock_eda_panel(object):
         if not model:
 
             df_new = self.df
-            pipeline_hmm = Pipeline([
-                ('selector', FeatureSelector(columns=features_hmm)),
-                ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-                ('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
-            ])
             data_train = df_new.iloc[:-test_data_size,:]
             data_test = df_new.iloc[-test_data_size:,:]
 
-            pipeline_hmm.fit(data_train)
-
+            th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
+            th.train()
+            pipeline_hmm = th.hmm_model
             self.model_hmm = pipeline_hmm
             self.test_data_hmm = data_test
 
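Model construction is now delegated to trainer_hmm; the inline Pipeline of FeatureSelector, MeanMedianImputer and GaussianHMM is gone. Only the calls visible in this hunk are confirmed, so the sketch below stays within them (note the corr_thrshold spelling of the keyword, reproduced verbatim from the diff):

    from virgo_modules.src.hmm_utils import trainer_hmm

    th = trainer_hmm(data_train, features_hmm,
                     n_clusters=4, corr_thrshold=0.75, seed=42)
    th.train()                   # fits the model, with correlation-based feature pre-selection
    pipeline_hmm = th.hmm_model  # fitted pipeline object, used as before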
@@ -2034,7 +1465,7 @@ class stock_eda_panel(object):
         if save_features:
             self.features.append('hmm_feature')
             self.features.append('hmm_chain_order')
-            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
+            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }
 
         if plot:
 
@@ -2248,53 +1679,6 @@ class stock_eda_panel(object):
         plt.legend()
         plt.show()
 
-    ### deprecated ############################
-    def create_strategy(self, favourable_states):
-
-        test_data = self.test_data_hmm
-        # add MA signal
-        test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
-        test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
-
-        # add hnn signal
-
-        test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
-
-        ## combined signals
-        test_data['main_signal'] = 0
-        test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
-        test_data['main_signal'] = test_data['main_signal'].shift(1)
-
-        ## benchmark return
-        test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
-        test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
-        test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
-
-        ## strategy return
-        # test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
-        test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
-        test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
-        test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
-        test_data.dropna(inplace = True)
-
-        bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
-        strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
-
-        bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
-        strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
-
-        print(f'returns benchmark {bench_rets}%')
-        print(f'returns strategy {strat_rets}%')
-        print('-----------------------------')
-        print(f'sharpe benchmark {bench_sharpe}')
-        print(f'sharpe strategy {strat_sharpe}')
-
-        fig = plt.figure(figsize = (10,4))
-        plt.plot(test_data['bench_prod_exp'])
-        plt.plot(test_data['strat_prod_exp'])
-        self.settings_hmm_states = {'favourable_states':favourable_states}
-    ################################################
-
     def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
         """
         display analysis plot hmm model
@@ -2582,214 +1966,6 @@ class produce_model:
         self.pipeline.fit(self.X_train, self.y_train)
         self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
 
-class hmm_feature_selector():
-    """
-    class that is going to train hmm models to perform feature selection
-
-    Attributes
-    ----------
-    data : pd.DataFrame
-        symbol of the asset
-    n_clusters : int
-        number of clusters to search
-    init_features_hmm : list
-        list of features to consider in the search
-    test_data_size :int
-        test data size, meaning that the remaining is going to be used as training data
-    select_n_features : int
-        number of features to select
-    n_trials : int
-        total number of trials per combination
-    limit_search : int
-        limit number of combinations
-    default_benchmark_sd : float
-        default value to bias standard deviation
-    t_threshold : float
-        alpha or z threshold
-    pipeline_hmm: obj
-        pipeline object of the hmm model
-    features_used_in_model:list
-        features in model
-    train_model(features_hmm=list):
-        train hmm model
-    feature_combinations: list
-        list of combination of features
-    mean_relevance: float
-        relevance score of the model
-    best_features: list
-        list of best performing features
-
-    Methods
-    -------
-    split_data():
-        split data in train and test
-    train_model(features_hmm=list):
-        train hmm model
-    feature_list_generator():
-        perform combination of features
-    get_error():
-        get error or score of a given model using relevance score
-    execute_selector():
-        select the best combination of features
-    """
-    def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
-        """
-        Initialize object
-
-        Parameters
-        ----------
-        data (pd.DataFrame): data
-        n_clusters (int): number of clusters to search
-        init_features_hmm (list): list of features to consider in the search
-        test_data_siz:(int: test data size, meaning that the remaining is going to be used as training data
-        select_n_features (int): number of features to select
-        n_trials (int): total number of trials per combination
-        limit_search (int): limit number of combinations
-        default_benchmark_sd (float): default value to bias standard deviation
-        t_threshold (float): alpha or z threshold
-
-        Returns
-        -------
-        None
-        """
-        self.data = data.copy()
-        self.n_clusters = n_clusters
-        self.init_features_hmm = init_features_hmm
-        self.test_data_size = test_data_size
-        self.select_n_features = select_n_features
-        self.n_trials = n_trials
-        self.limit_search= limit_search
-        self.default_benchmark_sd = default_benchmark_sd
-        self.t_threshold = t_threshold
-
-    def split_data(self):
-        """
-        split data in train and test
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.data_train = self.data.iloc[:-self.test_data_size,:]
-        self.data_test = self.data.iloc[-self.test_data_size:,:]
-
-    def train_model(self,features_hmm):
-        """
-        train hmm model
-
-        Parameters
-        ----------
-        features_hmm (list): list of features to be selected in the model
-
-        Returns
-        -------
-        None
-        """
-        pipeline_hmm = Pipeline([
-            ('selector', FeatureSelector(columns=features_hmm)),
-            ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-            ('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
-        ])
-
-        self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
-        self.features_used_in_model = features_hmm
-
-    def feature_list_generator(self):
-        """
-        perform combination of features
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
-        feature_combinations = list(map(list, feature_combinations))
-
-        self.feature_combinations = feature_combinations
-
-    def get_error(self):
-        """
-        get error or score of a given model using relevance score
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.data_train_ = self.data_train.copy()
-
-        self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
-        self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
-
-        ## indexing chains
-        self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
-        self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
-        self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
-        self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
-        self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
-        self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
-
-        ### returns using the first element in a chain
-        self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
-        self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
-        self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
-
-        self.data_train_ = self.data_train_.drop(columns = ['first'])
-
-        mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
-        self.mean_relevance = mean_relevance
-
-    def execute_selector(self):
-        """
-        select the best combination of features
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.split_data()
-        self.feature_list_generator()
-        maxi = -1
-        print(f'it is expected {len(self.feature_combinations)} combinations')
-        feature_results = dict()
-
-        if self.limit_search:
-            print(f' taking just {self.limit_search} combinations')
-            maxi = self.limit_search
-
-        for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
-
-            feature_results[f'group_{i}'] = {
-                'features':list(features_hmm),
-                'relevances':list()
-            }
-
-            for _ in range(self.n_trials):
-                try:
-                    self.train_model(features_hmm)
-                    self.get_error()
-                    feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
-                except:
-                    print('error')
-            feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
-        self.feature_results = feature_results
-        self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
-
 class analyse_index(stock_eda_panel):
     """
    class that is going to train hmm models to perform feature selection
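The removed hmm_feature_selector enumerated feature subsets with itertools.combinations and kept the subset with the highest mean relevance. If that behaviour is still needed against the new trainer_hmm API, the enumeration itself is plain stdlib; a sketch (feature names illustrative, and the per-subset scoring would have to come from whatever replaced states_relevance_score):

    from itertools import combinations

    init_features_hmm = ['log_return', 'dist_range', 'ret_log_Volume']
    select_n_features = 2
    feature_combinations = [list(c) for c in combinations(init_features_hmm, select_n_features)]
    # [['log_return', 'dist_range'], ['log_return', 'ret_log_Volume'], ['dist_range', 'ret_log_Volume']]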
@@ -3025,7 +2201,6 @@ class analyse_index(stock_eda_panel):
 
         self.states_result = result
 
-
 def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
     '''
     select relevant beta result data of a given asset