virgo-modules 0.1.3__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of virgo-modules has been flagged as potentially problematic.

@@ -6,11 +6,10 @@ from sklearn.pipeline import Pipeline
 
 from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
 from feature_engine.imputation import MeanMedianImputer
-from virgo_modules.src.ticketer_source import FeatureSelector
 from feature_engine.discretisation import EqualWidthDiscretiser
 from feature_engine.datetime import DatetimeFeatures
 
-from .ticketer_source import VirgoWinsorizerFeature, InverseHyperbolicSine, FeaturesEntropy
+from .transformer_utils import VirgoWinsorizerFeature, InverseHyperbolicSine, FeaturesEntropy, FeatureSelector
 
 class produce_model_wrapper:
     """
@@ -0,0 +1,492 @@
+from hmmlearn.hmm import GaussianHMM
+
+from sklearn.pipeline import Pipeline
+from feature_engine.imputation import MeanMedianImputer
+from virgo_modules.src.transformer_utils import FeatureSelector
+from feature_engine.selection import DropCorrelatedFeatures
+from sklearn.preprocessing import RobustScaler
+
+import pandas as pd
+import numpy as np
+import random
+
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import seaborn as sns; sns.set()
+
+def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
+    '''
+    calculate a relevance score and summary report for an HMM model
+
+    Parameters:
+        data (pd.DataFrame): dataframe with hmm_feature, chain_id, hmm_chain_order, Date and chain_return columns
+        default_benchmark_sd (float): default value to bias the SD in the t calculation
+        t_threshold (float): alpha or z threshold for the normalized score
+
+    Returns:
+        mean_relevance (float): mean relevance score of the states
+        cluster_returns (pd.DataFrame): summary report of the analysis
+        number_relevant_states (int): number of relevant states
+    '''
+    ## lengths
+    cluster_lengths = data.groupby(['hmm_feature','chain_id'], as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
+    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
+    ## means
+    def quantile2(x):
+        return x.quantile(0.25)
+    def quantile3(x):
+        return x.quantile(0.75)
+
+    cluster_returns = data.groupby('hmm_feature').agg(
+        n_uniques = ('chain_id','nunique'),
+        n_obs = ('Date','count'),
+        cluster_ret_q25 = ('chain_return',quantile2),
+        cluster_ret_median = ('chain_return','median'),
+        cluster_ret_q75 = ('chain_return',quantile3),
+    )
+    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
+    cluster_returns['perc_dispute'] = np.where(
+        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
+        1, 0
+    )
+    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
+    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
+    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
+    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
+    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1, cluster_returns['min_perc'], 0)
+    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
+    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
+
+    ## relevance or importance
+    # naive approach
+    cluster_returns['relevance'] = cluster_returns['abs_median'] + (0.5 - cluster_returns['min_overlap'])
+    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
+    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
+    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
+
+    mean_relevance = cluster_returns['abs_t_accpted'].mean()
+    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
+
+    return mean_relevance, cluster_returns, number_relevant_states
+
+def create_hmm_derived_features(df, lag_returns):
+    """
+    create features derived from the hmm state feature: the index of the state chain, the order within the chain, and the chain return
+    note: this is a copy of the ticketer_object method with the same name
+
+    Parameters:
+        df (pd.DataFrame): dataframe that must have the hmm_feature column
+        lag_returns (int): lag used to compute the chain returns
+
+    Returns:
+        df (pd.DataFrame): dataframe with the extra hmm features as columns
+    """
+    df = df.sort_values('Date')
+    ## indexing chains
+    df['lag_hmm_feature'] = df['hmm_feature'].shift(1)
+    df['breack'] = np.where(df['lag_hmm_feature'] != df['hmm_feature'], 1, 0)
+    df["chain_id"] = df.groupby("breack")["Date"].rank(method="first", ascending=True)
+    df["chain_id"] = np.where(df['breack'] == 1, df["chain_id"], np.nan)
+    df["chain_id"] = df["chain_id"].fillna(method='ffill')
+    df["hmm_chain_order"] = df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
+    ## returns within each chain window
+    df['lag_chain_close'] = df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+    df['chain_return'] = (df['Close']/df['lag_chain_close'] - 1) * 100
+    df = df.drop(columns = ['breack'])
+    return df
+
+class trainer_hmm():
+    """
+    wrapper around a gaussian hmm model
+    this class follows scikit-learn practices
+
+    Attributes
+    ----------
+    hmm_model: obj
+        pipeline and model
+    features_hmm: list
+        list of features used to train the gaussian model
+
+    Methods
+    -------
+    train():
+        train the pipeline given the parameters set at class initialization
+    plot_training_results(lag_diff_returns=int):
+        plot features and closing prices displaying the states
+        plot the returns distribution by state given the lag used to calculate the chain returns
+    """
+    def __init__(self, data, features_hmm, n_clusters= 3, corr_thrshold = 0.65, seed = None):
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        data (pd.DataFrame): training data
+        features_hmm (list): features to pass to the model
+        n_clusters (int): number of states to train
+        corr_thrshold (float): correlation threshold for the initial feature selection
+        seed (int): random state for model reproducibility
+
+        Returns
+        -------
+        None
+        """
+        self.__data_train = data
+        self.__features_hmm = features_hmm
+        self.__n_clusters = n_clusters
+        self.__corr_thrshold = corr_thrshold
+        self.__seed = seed
+    def train(self):
+        """
+        train pipeline and model
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        transform_pipe = Pipeline([
+            ('selector', FeatureSelector(columns=self.__features_hmm)),
+            ('fillna', MeanMedianImputer(imputation_method='median', variables=self.__features_hmm)),
+            ('drop_correlated', DropCorrelatedFeatures(method='spearman', threshold=self.__corr_thrshold)),
+        ])
+
+        # features_hmm = list(transform_pipe.fit_transform(self.__data_train).columns)
+        # n_features = len(features_hmm)
+        # startprob_prior = np.array([1/self.__n_clusters]*self.__n_clusters)
+        transmat_prior = np.diag([0.70]*self.__n_clusters)
+        # means_prior = np.array([1/n_features]*n_features)
+        pipeline_hmm = Pipeline([
+            ('transfrom_pipe', transform_pipe),
+            ('scaler', RobustScaler()),
+            ('hmm', GaussianHMM(
+                n_components = self.__n_clusters, covariance_type = 'spherical',
+                # startprob_prior = startprob_prior,
+                transmat_prior = transmat_prior,
+                # means_prior = means_prior,
+                random_state = self.__seed,)
+            )
+        ])
+
+        self.hmm_model = pipeline_hmm.fit(self.__data_train)
+        self.features_hmm = [x for x in self.__features_hmm if x not in list(self.hmm_model[0][-1].features_to_drop_)]
+
+    def plot_training_results(self, lag_diff_returns):
+        """
+        plot results as matplotlib figures
+
+        Parameters
+        ----------
+        lag_diff_returns (int): lag or diff factor to calculate the chain returns
+
+        Returns
+        -------
+        None
+        """
+        n_clusters = self.__n_clusters
+        df_train = self.__data_train.copy()
+        df_train['hmm_feature'] = self.hmm_model.predict(df_train)
+        df_train = create_hmm_derived_features(df_train, lag_diff_returns,)
+        n = len(self.features_hmm)+1
+        fig, axs = plt.subplots(n, 1, figsize=(10, 3*n), sharex=True)
+        for i, feature in enumerate(self.features_hmm):
+            axs[i].plot(df_train.Date, df_train[feature])
+            axs[i].set_title(feature)
+            for s in range(n_clusters):
+                df = df_train[df_train['hmm_feature'] == s]
+                axs[i].scatter(df.Date, df[feature])
+
+        axs[i+1].plot(df_train.Date, df_train.Close)
+        axs[i+1].set_title('close price')
+        for s in range(n_clusters):
+            df = df_train[df_train['hmm_feature'] == s]
+            axs[i+1].scatter(df.Date, df.Close)
+
+        n = 1
+        fig, axs = plt.subplots(n, 1, figsize=(10, 3*n), sharex=True)
+        df_plot = df_train.dropna()
+        sns.boxplot(data=df_plot, x="hmm_feature", y="chain_return", hue="hmm_feature", ax=axs)
+        axs.axhline(0.5, linestyle='--')
+        del df_train
+
+def evaluate_model_chains(data, n_clusters, at_least_states, threshold_chain, at_least_length):
+    """
+    assess the chains, or series of states, against some sanity checks
+
+    Parameters:
+        data (pd.DataFrame): dataframe that must have hmm_feature and the extra chain features
+        n_clusters (int): number of states that were trained, not observed
+        at_least_states (int): maximum number of states allowed to fall below the chain-count threshold
+        threshold_chain (int): number of times that a state should, at least, be observed
+        at_least_length (int): minimal length that the states should have, using a statistical measure (median, q75, max, etc)
+
+    Returns:
+        result (boolean): true if the model complies with the parameters
+    """
+    def q3(x):
+        return x.quantile(0.75)
+    tmp_df = data.groupby(['hmm_feature','chain_id'], as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
+    tmp_df = tmp_df.groupby("hmm_feature", as_index = False).agg(count = ('chain_id','nunique'), median_length = ('chain_lenght','median'), q3_length = ('chain_lenght',q3))
+    train_observedstates = len(tmp_df)
+
+    states_under_threshold = list(tmp_df[tmp_df['count'] <= threshold_chain].hmm_feature)
+    n_states_under_threshold = len(states_under_threshold)
+    min_count = np.min(tmp_df[~tmp_df.hmm_feature.isin(states_under_threshold)]['count'].values)
+    med_length = np.min(tmp_df['q3_length'].values)
+
+    condition_1 = threshold_chain <= min_count
+    condition_2 = n_states_under_threshold <= at_least_states
+    condition_3 = at_least_length <= med_length
+    condition_4 = (train_observedstates == n_clusters)
+
+    result = bool(condition_1 and condition_2 and condition_3 and condition_4)
+    return result
+
+def iterate_training(trials, train_params, relevance_params):
+    """
+    repeatedly train, keeping models that pass the validation checks
+
+    Parameters:
+        trials (int): number of repetitions to iterate
+        train_params (dict): dictionary containing the training configuration
+        relevance_params (dict): dictionary containing the validation configuration
+
+    Returns:
+        results (list): list of valid relevance scores
+        kept_model (obj): model (pipeline) that is kept, if one exists
+    """
+    results = list()
+    kept_model = None
+    for _ in range(trials):
+        try:
+            th = trainer_hmm(**train_params)
+            th.train()
+            result_model = th.hmm_model
+            df_train_tmp = train_params.get('data')
+            df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+            df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+            relev, _, _ = states_relevance_score(df_train_tmp)
+            relevance_hmm = evaluate_model_chains(data = df_train_tmp,
+                                                  n_clusters=train_params.get('n_clusters'),
+                                                  at_least_states=relevance_params.get('at_least_states'),
+                                                  threshold_chain=relevance_params.get('threshold_chain'),
+                                                  at_least_length=relevance_params.get('at_least_length'))
+            if relevance_hmm:
+                results.append(relev)
+                kept_model = result_model
+        except:
+            pass
+        del th
+    if not kept_model:
+        raise TypeError("no model was kept")
+    return results, kept_model
+
+class custom_hmm_permutation_importance():
+    """
+    class that performs feature importance using feature permutation
+    note: this method is inspired by the method of the same name available in scikit-learn
+
+    Attributes
+    ----------
+    n_repeats: int
+        number of shufflings performed per feature
+    features: list
+        list of features to be tested; note that these features have to be inputs of the model
+    results: dict
+        dictionary with the results, mapping each feature to its relevance-score deltas per iteration
+
+    Methods
+    -------
+    fit():
+        fit class
+    """
+    def __init__(self, model, X, n_repeats=5, random_state=False, features = list(), lag = 4):
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        model (obj): pipeline or model
+        X (pd.DataFrame): input data to test feature permutation
+        n_repeats (int): number of trials per feature
+        random_state (bool): if true, set a random state
+        features (list): list of features to be tested; note that the features have to be inputs of the model
+        lag (int): lag or diff factor to calculate the chain returns
+
+        Returns
+        -------
+        None
+        """
+        self.__model = model
+        self.__X = X
+        self.n_repeats = n_repeats
+        self.__random_state = random_state
+        self.features = features
+        self.__lag = lag
+    def __generate_seeds(self):
+        """
+        generate a list of seeds
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        if self.__random_state:
+            self.__seeds = list()
+            for _ in range(self.n_repeats):
+                seed = np.random.randint(1, 500)
+                self.__seeds.append(seed)
+    def fit(self):
+        """
+        fit class
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        self.__X['hmm_feature'] = self.__model.predict(self.__X)
+        self.__X = create_hmm_derived_features(df=self.__X, lag_returns=self.__lag)
+        init_relevance, _, _ = states_relevance_score(self.__X)
+        self.results = {feature: list() for feature in self.features}
+        if self.__random_state:
+            self.__generate_seeds()
+        for feature in self.features:
+            X_ = self.__X.dropna().reset_index(drop = True).copy()
+            for j in range(self.n_repeats):
+                if self.__random_state:
+                    seed = self.__seeds[j]
+                    np.random.seed(seed)
+                else:
+                    seed = None
+                shuffled = X_[feature].sample(frac=1, random_state = seed, replace = True).reset_index(drop=True)
+                X_[feature] = shuffled
+                X_['hmm_feature'] = self.__model.predict(X_)
+                X_ = create_hmm_derived_features(df=X_, lag_returns=self.__lag)
+
+                tmp_df = X_.groupby(['hmm_feature','chain_id'], as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
+                tmp_df = tmp_df.groupby("hmm_feature", as_index = False).agg(count = ('chain_id','nunique'), median_length = ('chain_lenght','median')).copy()
+                mean_relevance, _, _ = states_relevance_score(X_)
+                self.results[feature].append(mean_relevance - init_relevance)
+            del X_
+
+def hmm_feature_selection(max_features, trials, train_params, relevance_params):
+    """
+    wrapper function that uses permutation importance to select features
+
+    Parameters:
+        max_features (int): target number of features
+        trials (int): training iterations
+        train_params (dict): dictionary containing the training configuration
+        relevance_params (dict): dictionary containing the validation configuration
+
+    Returns:
+        results (pd.DataFrame): summary relevance score per excluded feature
+    """
+    results = {'index': list(), 'feature_to_drop': list(), 'median relevance excluding feature': list()}
+    i = 0
+    init_numfeatures = len(train_params.get('features_hmm'))
+    while max_features <= init_numfeatures:
+        print(init_numfeatures)
+        if i == 0:
+            exclude = None
+            r, model = iterate_training(trials, train_params, relevance_params)
+            for ri in r:
+                results['index'].append(0)
+                results['feature_to_drop'].append('full')
+                results['median relevance excluding feature'].append(ri)
+        data_train = train_params.get('data')
+        chmm_pi = custom_hmm_permutation_importance(model, data_train, random_state=5, features = train_params.get('features_hmm'), lag = relevance_params.get('lag'))
+        chmm_pi.fit()
+        results_fp = pd.DataFrame(chmm_pi.results)
+        feature_deltas = results_fp.median(axis = 0)
+        feature_deltas = feature_deltas.sort_values(ascending = False)
+        feature_to_drop = feature_deltas.index[0]
+        print(f'excluding {feature_to_drop}')
+
+        train_params['features_hmm'].remove(feature_to_drop)
+        print(train_params['features_hmm'])
+        r, model = iterate_training(trials, train_params, relevance_params)
+        for ri in r:
+            results['index'].append(i+1)
+            results['feature_to_drop'].append(feature_to_drop)
+            results['median relevance excluding feature'].append(ri)
+        init_numfeatures = len(model[:-2].transform(data_train).columns)
+        i += 1
+    return pd.DataFrame(results)
+
+
+def seed_finder(train_params, relevance_params, n_seed = 100, max_results = 5):
+    """
+    iterate training runs, searching for the best starting seed
+
+    Parameters:
+        train_params (dict): dictionary containing the training configuration
+        relevance_params (dict): dictionary containing the validation configuration
+        n_seed (int): number of iterations
+        max_results (int): maximum number of results to keep before stopping the iteration
+
+    Returns:
+        df_results (pd.DataFrame): summary table of seeds and relevance scores
+    """
+    seeds = list()
+    i_ = 0
+    while len(seeds) < max_results and i_ < n_seed:
+        # print(i_)
+        if i_ >= (n_seed*0.5) and len(seeds) == 0:
+            i_ += 10
+
+        seed = random.randint(50, 10000)
+        train_params['seed'] = seed
+        try:
+            th = trainer_hmm(**train_params)
+            th.train()
+            result_model = th.hmm_model
+            df_train_tmp = train_params.get('data')
+            df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+            df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+            relev, _, _ = states_relevance_score(df_train_tmp)
+            relevance_hmm = evaluate_model_chains(data = df_train_tmp,
+                                                  n_clusters=train_params.get('n_clusters'),
+                                                  at_least_states=relevance_params.get('at_least_states'),
+                                                  threshold_chain=relevance_params.get('threshold_chain'),
+                                                  at_least_length=relevance_params.get('at_least_length'))
+            if relevance_hmm:
+                print('new model candidate was found, seed saved')
+                seeds.append(seed)
+            i_ += 1
+        except:
+            i_ += 1
+    print('best seeds', seeds)
+    ## searching for the best seed
+    results = {'seed': list(), 'train_relevance': list()}
+
+    for seed_x in seeds:
+        train_params['seed'] = seed_x
+        th = trainer_hmm(**train_params)
+        th.train()
+        result_model = th.hmm_model
+        df_train_tmp = train_params.get('data')
+        df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+        df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+        relev, _, _ = states_relevance_score(df_train_tmp)
+
+        results['seed'].append(seed_x)
+        results['train_relevance'].append(relev)
+
+    df_results = pd.DataFrame(results).sort_values(['train_relevance'], ascending = [False])
+    return df_results
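
Taken together, the new file adds a small train-validate-iterate toolkit around hmmlearn's GaussianHMM. Below is a minimal usage sketch of the intended flow. It is an illustration only: the diff extract does not show the new file's name or import path, so the sketch assumes it runs inside the new module itself, and the dataframe, feature names, and parameter values are hypothetical.

    import pandas as pd

    # hypothetical input: a price history with Date and Close columns plus
    # engineered indicator columns (required by create_hmm_derived_features)
    df = pd.read_csv('prices_with_indicators.csv', parse_dates=['Date'])

    train_params = {
        'data': df,
        'features_hmm': ['rsi', 'macd', 'volatility'],  # illustrative names
        'n_clusters': 3,
    }
    relevance_params = {
        'lag': 4,                # lag used for chain returns
        'at_least_states': 1,    # states allowed below the chain-count threshold
        'threshold_chain': 5,    # minimum observations per state
        'at_least_length': 3,    # minimum chain length (q75)
    }

    # search for seeds whose models pass the sanity checks, ranked by relevance
    best_seeds = seed_finder(train_params, relevance_params, n_seed=50, max_results=3)

    # retrain with the top seed and inspect the states visually
    train_params['seed'] = int(best_seeds.iloc[0]['seed'])
    th = trainer_hmm(**train_params)
    th.train()
    th.plot_training_results(lag_diff_returns=4)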