virgo-modules 0.0.72__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,94 @@
+ import numpy as np
+ import pandas as pd
+ 
+ from sklearn.base import BaseEstimator, ClassifierMixin
+ 
+ 
+ class MyStackingClassifierMultiClass(ClassifierMixin, BaseEstimator):
+     def __init__(self, estimators, meta_estimators, targets, perc=None, stack_size=None, **kwargs):
+         self.estimators = estimators
+         self.meta_estimators = meta_estimators
+         self.targets = targets
+         if stack_size and perc:
+             raise ValueError('set only one of stack_size or perc, not both')
+         if not stack_size and not perc:
+             raise ValueError('set either stack_size or perc')
+         self.stack_size = stack_size
+         self.perc = perc
+ 
+     def get_index_training(self, X):
+         # split the panel index into a base-learner part and a meta-learner (stacking) part
+         if self.stack_size:
+             unique_dates = list(X.index.get_level_values('Date_i').unique())
+             unique_dates.sort()
+             stack_chunk = unique_dates[-self.stack_size:]
+             base_indexes = X[~X.index.get_level_values('Date_i').isin(stack_chunk)].index.get_level_values('i')
+             meta_indexes = X[X.index.get_level_values('Date_i').isin(stack_chunk)].index.get_level_values('i')
+         elif self.perc:
+             meta_indexes = X.sample(frac=self.perc).index.get_level_values('i')
+             base_indexes = X[~X.index.get_level_values('i').isin(meta_indexes)].index.get_level_values('i')
+         else:
+             raise ValueError(f'invalid split configuration: stack_size={self.stack_size}, perc={self.perc}')
+         return base_indexes, meta_indexes
+ 
+     def train_base_learner(self, classifier, X, y, indexes):
+         base_X = X[X.index.get_level_values('i').isin(indexes)]
+         base_y = y[y.index.get_level_values('i').isin(indexes)]
+         classifier.fit(base_X, base_y)
+ 
+     def fit(self, X, y):
+         # base learners are trained on the base split only
+         base_indexes, meta_indexes = self.get_index_training(X)
+         for name, estimator in self.estimators:
+             self.train_base_learner(estimator, X, y, base_indexes)
+ 
+         # stack the meta learners on the base learners' held-out probabilities
+         metas_pred = dict()
+         for i, cont in enumerate(self.estimators, start=1):
+             _, estimator = cont
+             meta_pred = estimator.predict_proba(X[X.index.get_level_values('i').isin(meta_indexes)])
+             metas_pred[f"meta{i}0"] = meta_pred[0][:, 1]
+             metas_pred[f"meta{i}1"] = meta_pred[1][:, 1]
+         meta_preds_df = pd.DataFrame(metas_pred)
+ 
+         for i, metaest in enumerate(self.meta_estimators, start=0):
+             _, metaest = metaest
+             metacols = [f"meta{j}{i}" for j in range(1, len(self.estimators) + 1)]
+             metaest.fit(
+                 meta_preds_df[metacols],
+                 y[X.index.get_level_values('i').isin(meta_indexes)][self.targets[i]]
+             )
+         self.is_fitted_ = True
+         self.classes_ = np.array([[0, 1], [0, 1]])  # two binary targets
+         return self
+ 
+     def predict_proba(self, X):
+         metas_pred = dict()
+         for i, cont in enumerate(self.estimators, start=1):
+             _, estimator = cont
+             meta_pred = estimator.predict_proba(X)
+             metas_pred[f"meta{i}0"] = meta_pred[0][:, 1]
+             metas_pred[f"meta{i}1"] = meta_pred[1][:, 1]
+         self.meta_preds_df__ = pd.DataFrame(metas_pred)
+ 
+         prediction_vector = list()
+         for i, cont in enumerate(self.meta_estimators, start=0):
+             _, estimator = cont
+             metacols = [f"meta{j}{i}" for j in range(1, len(self.estimators) + 1)]
+             preds = estimator.predict_proba(self.meta_preds_df__[metacols].values)
+             prediction_vector.append(preds)
+         return prediction_vector
+ 
+     def predict(self, X):
+         prediction_vector = list()
+         _ = self.predict_proba(X)
+         for i, cont in enumerate(self.meta_estimators, start=0):
+             _, estimator = cont
+             metacols = [f"meta{j}{i}" for j in range(1, len(self.estimators) + 1)]
+             preds = estimator.predict(self.meta_preds_df__[metacols].values)
+             prediction_vector.append(preds)
+ 
+         # transpose (n_targets, n_samples) -> (n_samples, n_targets); a plain reshape would scramble the values
+         p = np.array(tuple(prediction_vector))
+         return p.T
+ 
+     def get_params(self, deep=True):
+         return {k: v for k, v in self.__dict__.items()}
+ 
+     def set_params(self, **parms):
+         for k, v in parms.items():
+             setattr(self, k, v)
+         return self
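For orientation, a minimal usage sketch (illustrative, not part of the package): it assumes a panel frame whose MultiIndex carries a `Date_i` level and a row-identifier level `i`, a two-column binary target frame on the same index, and multi-output base learners whose `predict_proba` returns one array per target; all data and column names here are hypothetical.

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
dates = pd.date_range("2023-01-01", periods=60, freq="D")
idx = pd.MultiIndex.from_arrays([dates, range(len(dates))], names=["Date_i", "i"])
X = pd.DataFrame(rng.normal(size=(len(idx), 4)), index=idx, columns=list("abcd"))
y = pd.DataFrame(rng.integers(0, 2, size=(len(idx), 2)), index=idx, columns=["t0", "t1"])

clf = MyStackingClassifierMultiClass(
    estimators=[("rf1", RandomForestClassifier(n_estimators=50)),
                ("rf2", RandomForestClassifier(n_estimators=50))],
    meta_estimators=[("lr0", LogisticRegression()), ("lr1", LogisticRegression())],
    targets=["t0", "t1"],
    stack_size=15,  # hold out the last 15 dates to train the meta learners
)
clf.fit(X, y)
proba = clf.predict_proba(X)   # list with one (n_samples, 2) probability array per target
labels = clf.predict(X)        # (n_samples, n_targets) array of class labels
```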
@@ -0,0 +1,494 @@
+ from hmmlearn.hmm import GaussianHMM
+ 
+ from sklearn.pipeline import Pipeline
+ from feature_engine.imputation import MeanMedianImputer
+ from virgo_modules.src.transformer_utils import FeatureSelector
+ from feature_engine.selection import DropCorrelatedFeatures
+ from sklearn.preprocessing import RobustScaler
+ 
+ import pandas as pd
+ import numpy as np
+ import random
+ 
+ import matplotlib.pyplot as plt
+ import matplotlib.gridspec as gridspec
+ import seaborn as sns; sns.set()
+ 
+ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
+     '''
+     calculate the relevance score and a summary report for an hmm model
+ 
+     Parameters:
+             data (pd.DataFrame): dataframe with hmm_feature, chain_id, hmm_chain_order, chain_return and Date columns
+             default_benchmark_sd (float): default value used to bias the SD in the t calculation
+             t_threshold (float): alpha or z threshold for the normalized score
+ 
+     Returns:
+             mean_relevance (float): mean relevance score of the states
+             cluster_returns (pd.DataFrame): summary report of the analysis
+             number_relevant_states (int): number of relevant states
+     '''
+     ## chain lengths per state
+     cluster_lengths = data.groupby(['hmm_feature','chain_id'], as_index=False).agg(chain_lenght=('hmm_chain_order','max'))
+     cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median=('chain_lenght','median'))
+     ## chain return quartiles per state
+     def quantile2(x):
+         return x.quantile(0.25)
+     def quantile3(x):
+         return x.quantile(0.75)
+ 
+     cluster_returns = data.groupby('hmm_feature').agg(
+         n_uniques=('chain_id','nunique'),
+         n_obs=('Date','count'),
+         cluster_ret_q25=('chain_return', quantile2),
+         cluster_ret_median=('chain_return','median'),
+         cluster_ret_q75=('chain_return', quantile3),
+     )
+     cluster_returns = cluster_returns.join(cluster_lengths, how='left')
+     # a state is "disputed" when its interquartile range straddles zero
+     cluster_returns['perc_dispute'] = np.where(
+         np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
+         1, 0
+     )
+     cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
+     cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25) / cluster_returns['iqr']
+     cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75) / cluster_returns['iqr']
+     cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis=1)
+     cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1, cluster_returns['min_perc'], 0)
+     cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
+     cluster_returns = cluster_returns.drop(columns=['perc_25','perc_75','min_perc'])
+ 
+     ## relevance or importance
+     # naive approach: reward a large median return and penalize overlap with zero
+     cluster_returns['relevance'] = cluster_returns['abs_median'] + (0.5 - cluster_returns['min_overlap'])
+     cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0) / (cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
+     cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
+     cluster_returns['t_accpted'] = cluster_returns['abs_t_accpted'] > t_threshold
+ 
+     mean_relevance = cluster_returns['abs_t_accpted'].mean()
+     number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
+ 
+     return mean_relevance, cluster_returns, number_relevant_states
+ 
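A hedged, self-contained sketch of the input this function expects (a frame of state chains with their returns); the values are synthetic and only illustrate the required columns.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame({
    "Date": pd.date_range("2024-01-01", periods=12, freq="D"),
    "hmm_feature": [0]*6 + [1]*6,                      # two states
    "chain_id": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],  # two chains per state
    "hmm_chain_order": [1, 2, 3]*4,
    "chain_return": np.r_[rng.normal(1.0, 0.2, 6), rng.normal(-0.5, 0.2, 6)],
})
mean_relevance, report, n_relevant = states_relevance_score(toy)
print(report[["n_obs", "cluster_ret_median", "t_calc", "t_accpted"]])
```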
+ def create_hmm_derived_features(df, lag_returns):
+     """
+     create features derived from the hmm state feature: the chain id, the position inside the chain and the chain return
+     note: this is a copy of the ticketer_object method with the same name
+ 
+     Parameters:
+             df (pd.DataFrame): dataframe that must have hmm_feature, Date and Close columns
+             lag_returns (int): lag, in rows, used to compute the chain return
+ 
+     Returns:
+             df (pd.DataFrame): dataframe with the extra hmm features as columns
+     """
+     df = df.sort_values('Date')
+     ## indexing chains: a new chain starts whenever the state changes
+     df['lag_hmm_feature'] = df['hmm_feature'].shift(1)
+     df['breack'] = np.where(df['lag_hmm_feature'] != df['hmm_feature'], 1, 0)
+     df["chain_id"] = df.groupby("breack")["Date"].rank(method="first", ascending=True)
+     df["chain_id"] = np.where(df['breack'] == 1, df["chain_id"], np.nan)
+     df["chain_id"] = df["chain_id"].ffill()
+     df["hmm_chain_order"] = df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
+     ### returns within each chain, using the lag as the window
+     df['lag_chain_close'] = df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+     df['chain_return'] = (df['Close']/df['lag_chain_close'] - 1) * 100
+     df = df.drop(columns=['breack'])
+     return df
+ 
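An illustrative call showing how chains are indexed and their returns computed; prices and states below are synthetic.

```python
import pandas as pd

toy = pd.DataFrame({
    "Date": pd.date_range("2024-01-01", periods=8, freq="D"),
    "Close": [100.0, 101.0, 103.0, 102.0, 104.0, 108.0, 107.0, 109.0],
    "hmm_feature": [0, 0, 0, 1, 1, 0, 0, 0],
})
out = create_hmm_derived_features(toy, lag_returns=1)
print(out[["Date", "hmm_feature", "chain_id", "hmm_chain_order", "chain_return"]])
```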
+ class trainer_hmm():
+     """
+     wrapper around a gaussian hmm model
+     this class follows scikit-learn practices
+ 
+     Attributes
+     ----------
+     hmm_model: obj
+         pipeline and model
+     features_hmm: list
+         list of features used to train the gaussian model
+ 
+     Methods
+     -------
+     train():
+         train the pipeline given the parameters set at class initialization
+     plot_training_results(lag_diff_returns=int):
+         plot the features and closing prices displaying the states,
+         and plot the returns distribution by state given the lag used to calculate chain returns
+     """
+     def __init__(self, data, features_hmm, n_clusters= 3, corr_thrshold = 0.65, seed = None):
+         """
+         Initialize object
+ 
+         Parameters
+         ----------
+         data (pd.DataFrame): training data
+         features_hmm (list): features to pass for modeling
+         n_clusters (int): number of states to train
+         corr_thrshold (float): correlation threshold for the initial feature selection
+         seed (int): random state for model reproducibility
+ 
+         Returns
+         -------
+         None
+         """
+         self.__data_train = data
+         self.__features_hmm = features_hmm
+         self.__n_clusters = n_clusters
+         self.__corr_thrshold = corr_thrshold
+         self.__seed = seed
+ 
+     def train(self):
+         """
+         train pipeline and model
+ 
+         Parameters
+         ----------
+         None
+ 
+         Returns
+         -------
+         None
+         """
+         transform_pipe = Pipeline([
+             ('selector', FeatureSelector(columns=self.__features_hmm)),
+             ('fillna', MeanMedianImputer(imputation_method='median', variables=self.__features_hmm)),
+             ('drop_correlated', DropCorrelatedFeatures(method='spearman', threshold=self.__corr_thrshold)),
+         ])
+ 
+         features_hmm_ = list(transform_pipe.fit_transform(self.__data_train).columns)
+         n_features = len(features_hmm_)
+         start_prob = 0.60
+         startprob_prior = np.array([1/self.__n_clusters]*self.__n_clusters)
+         # prior: stay in the same state with probability start_prob, spread the remainder uniformly over the other states
+         transmat_prior = np.diag([start_prob]*self.__n_clusters)
+         transmat_prior[transmat_prior == 0] = (1-start_prob)/(self.__n_clusters-1)
+         means_prior = np.array([1/n_features]*n_features)
+         pipeline_hmm = Pipeline([
+             ('transfrom_pipe', transform_pipe),
+             ('scaler', RobustScaler()),
+             ('hmm', GaussianHMM(
+                 n_components=self.__n_clusters, covariance_type='spherical',
+                 startprob_prior=startprob_prior,
+                 transmat_prior=transmat_prior,
+                 means_prior=means_prior,
+                 random_state=self.__seed,)
+             )
+         ])
+ 
+         self.hmm_model = pipeline_hmm.fit(self.__data_train)
+         self.features_hmm = [x for x in self.__features_hmm if x not in list(self.hmm_model[0][-1].features_to_drop_)]
+ 
+     def plot_training_results(self, lag_diff_returns):
+         """
+         plot results as matplotlib figures
+ 
+         Parameters
+         ----------
+         lag_diff_returns (int): lag or diff factor used to calculate the chain returns
+ 
+         Returns
+         -------
+         None
+         """
+         n_clusters = self.__n_clusters
+         df_train = self.__data_train.copy()
+         df_train['hmm_feature'] = self.hmm_model.predict(df_train)
+         df_train = create_hmm_derived_features(df_train, lag_diff_returns)
+         n = len(self.features_hmm) + 1
+         fig, axs = plt.subplots(n, 1, figsize=(10, 3*n), sharex=True)
+         for i, feature in enumerate(self.features_hmm):
+             axs[i].plot(df_train.Date, df_train[feature])
+             axs[i].set_title(feature)
+             for s in range(n_clusters):
+                 df = df_train[df_train['hmm_feature'] == s]
+                 axs[i].scatter(df.Date, df[feature])
+ 
+         # last panel: closing price coloured by state
+         axs[i+1].plot(df_train.Date, df_train.Close)
+         axs[i+1].set_title('close price')
+         for s in range(n_clusters):
+             df = df_train[df_train['hmm_feature'] == s]
+             axs[i+1].scatter(df.Date, df.Close)
+ 
+         # distribution of chain returns per state
+         n = 1
+         fig, axs = plt.subplots(n, 1, figsize=(10, 3*n), sharex=True)
+         df_plot = df_train.dropna()
+         sns.boxplot(data=df_plot, x="hmm_feature", y="chain_return", hue="hmm_feature", ax=axs)
+         axs.axhline(0.5, linestyle='--')
+         del df_train
+ 
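A hedged usage sketch for the trainer: `df` is assumed to be a feature-engineered price frame (with Date, Close and candidate indicator columns) produced elsewhere in the package, and the feature names are hypothetical.

```python
features = ["ret_5", "ret_20", "volat_20", "rsi_14"]   # hypothetical indicator columns
th = trainer_hmm(data=df, features_hmm=features, n_clusters=3, corr_thrshold=0.65, seed=42)
th.train()
df["hmm_feature"] = th.hmm_model.predict(df)           # decoded state per row
th.plot_training_results(lag_diff_returns=4)
```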
+ def evaluate_model_chains(data, n_clusters, at_least_states, threshold_chain, at_least_length):
+     """
+     assess the chains (series of consecutive states) produced by a model against some sanity checks
+ 
+     Parameters:
+             data (pd.DataFrame): dataframe that must have hmm_feature and the derived chain features
+             n_clusters (int): number of states the model was trained with, not the number observed
+             at_least_states (int): maximum number of states allowed to fall below the chain-count threshold
+             threshold_chain (int): minimum number of distinct chains that each remaining state should have
+             at_least_length (int): minimal chain length every state should reach, measured on the q75 of chain lengths
+ 
+     Returns:
+             result (boolean): true if the model complies with the parameters
+     """
+     def q3(x):
+         return x.quantile(0.75)
+     tmp_df = data.groupby(['hmm_feature','chain_id'], as_index=False).agg(chain_lenght=('hmm_chain_order','max'))
+     tmp_df = tmp_df.groupby("hmm_feature", as_index=False).agg(count=('chain_id','nunique'), median_length=('chain_lenght','median'), q3_length=('chain_lenght', q3))
+     train_observedstates = len(tmp_df)
+ 
+     states_under_threshold = list(tmp_df[tmp_df['count'] <= threshold_chain].hmm_feature)
+     n_states_under_threshold = len(states_under_threshold)
+     min_count = np.min(tmp_df[~tmp_df.hmm_feature.isin(states_under_threshold)]['count'].values)
+     med_length = np.min(tmp_df['q3_length'].values)
+ 
+     condition_1 = threshold_chain <= min_count
+     condition_2 = n_states_under_threshold <= at_least_states
+     condition_3 = at_least_length <= med_length
+     condition_4 = (train_observedstates == n_clusters)
+ 
+     result = condition_1 and condition_2 and condition_3 and condition_4
+     return result
+ 
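Continuing the sketch above, a possible way to sanity-check the decoded states before keeping a model; the thresholds are illustrative.

```python
df_chains = create_hmm_derived_features(df, lag_returns=4)
ok = evaluate_model_chains(
    data=df_chains,
    n_clusters=3,         # states the model was trained with
    at_least_states=1,    # at most one state may fall below the chain-count threshold
    threshold_chain=5,    # every other state needs more than 5 distinct chains
    at_least_length=3,    # the q75 chain length of every state must reach 3 rows
)
```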
+ def iterate_training(trials, train_params, relevance_params):
+     """
+     iterate training runs and keep the last model that passes the chain validation
+ 
+     Parameters:
+             trials (int): number of repetitions to iterate
+             train_params (dict): dictionary containing training configurations
+             relevance_params (dict): dictionary containing validation configurations
+ 
+     Returns:
+             results (list): list of valid relevance scores
+             kept_model (obj): model (pipeline) that is kept, if it exists
+     """
+     results = list()
+     kept_model = None
+     for _ in range(trials):
+         try:
+             th = trainer_hmm(**train_params)
+             th.train()
+             result_model = th.hmm_model
+             df_train_tmp = train_params.get('data')
+             df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+             df_train_tmp = create_hmm_derived_features(df=df_train_tmp, lag_returns=relevance_params.get('lag'))
+             relev, _, _ = states_relevance_score(df_train_tmp)
+             relevance_hmm = evaluate_model_chains(data=df_train_tmp,
+                                                   n_clusters=train_params.get('n_clusters'),
+                                                   at_least_states=relevance_params.get('at_least_states'),
+                                                   threshold_chain=relevance_params.get('threshold_chain'),
+                                                   at_least_length=relevance_params.get('at_least_length'))
+             if relevance_hmm:
+                 results.append(relev)
+                 kept_model = result_model
+         except Exception:
+             pass
+         del th
+     if not kept_model:
+         raise TypeError("no model was kept")
+     return results, kept_model
+ 
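A hedged sketch of the two configuration dictionaries this function (and the helpers below) expects: the keys mirror the trainer_hmm signature and the validation arguments above, while `df` and the feature names are hypothetical.

```python
train_params = {
    "data": df,
    "features_hmm": ["ret_5", "ret_20", "volat_20", "rsi_14"],
    "n_clusters": 3,
    "corr_thrshold": 0.65,
    "seed": None,
}
relevance_params = {"lag": 4, "at_least_states": 1, "threshold_chain": 5, "at_least_length": 3}

scores, kept_model = iterate_training(trials=20, train_params=train_params,
                                      relevance_params=relevance_params)
```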
+ class custom_hmm_permutation_importance():
+     """
+     class that performs feature importance using feature permutation
+     note: this method is inspired by the permutation importance available in scikit-learn
+ 
+     Attributes
+     ----------
+     n_repeats: int
+         number of shufflings performed per feature
+     features: list
+         list of features to be tested, note that these features have to be inputs of the model
+     results: dict
+         dictionary with the results containing, per feature, the relevance-score delta of each iteration
+ 
+     Methods
+     -------
+     fit():
+         fit class
+     """
+     def __init__(self, model, X, n_repeats=5, random_state=False, features=list(), lag=4):
+         """
+         Initialize object
+ 
+         Parameters
+         ----------
+         model (obj): pipeline or model
+         X (pd.DataFrame): input data to test feature permutation
+         n_repeats (int): number of trials per feature
+         random_state (bool): if true, set a random state per repetition
+         features (list): list of features to be tested. note that the features have to be inputs of the model
+         lag (int): lag or diff factor to calculate chain returns
+ 
+         Returns
+         -------
+         None
+         """
+         self.__model = model
+         self.__X = X
+         self.n_repeats = n_repeats
+         self.__random_state = random_state
+         self.features = features
+         self.__lag = lag
+ 
+     def __generate_seeds(self):
+         """
+         generate the list of seeds used for the shuffling repetitions
+ 
+         Parameters
+         ----------
+         None
+ 
+         Returns
+         -------
+         None
+         """
+         if self.__random_state:
+             self.__seeds = list()
+             for _ in range(self.n_repeats):
+                 seed = np.random.randint(1, 500)
+                 self.__seeds.append(seed)
+ 
+     def fit(self):
+         """
+         fit class
+ 
+         Parameters
+         ----------
+         None
+ 
+         Returns
+         -------
+         None
+         """
+         # baseline relevance on the unshuffled data
+         self.__X['hmm_feature'] = self.__model.predict(self.__X)
+         self.__X = create_hmm_derived_features(df=self.__X, lag_returns=self.__lag)
+         init_relevance, _, _ = states_relevance_score(self.__X)
+         self.results = {feature: list() for feature in self.features}
+         if self.__random_state:
+             self.__generate_seeds()
+         for feature in self.features:
+             X_ = self.__X.dropna().reset_index(drop=True).copy()
+             for j in range(self.n_repeats):
+                 if self.__random_state:
+                     seed = self.__seeds[j]
+                     np.random.seed(seed)
+                 else:
+                     seed = None
+                 # note: sampling with replace=True is a bootstrap resample rather than a pure permutation
+                 shuffled = X_[feature].sample(frac=1, random_state=seed, replace=True).reset_index(drop=True)
+                 X_[feature] = shuffled
+                 X_['hmm_feature'] = self.__model.predict(X_)
+                 X_ = create_hmm_derived_features(df=X_, lag_returns=self.__lag)
+ 
+                 tmp_df = X_.groupby(['hmm_feature','chain_id'], as_index=False).agg(chain_lenght=('hmm_chain_order','max'))
+                 tmp_df = tmp_df.groupby("hmm_feature", as_index=False).agg(count=('chain_id','nunique'), median_length=('chain_lenght','median')).copy()
+                 mean_relevance, _, _ = states_relevance_score(X_)
+                 self.results[feature].append(mean_relevance - init_relevance)
+             del X_
+ 
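A hedged sketch of ranking features with this class, reusing `kept_model` and the hypothetical feature names from the previous sketches; the more negative a feature's median delta, the more its shuffling degrades the state relevance.

```python
pi = custom_hmm_permutation_importance(
    model=kept_model, X=df.copy(), n_repeats=5, random_state=True,
    features=["ret_5", "ret_20", "volat_20", "rsi_14"], lag=4,
)
pi.fit()
deltas = pd.DataFrame(pi.results).median(axis=0).sort_values()
print(deltas)   # most negative first: features whose shuffling hurts the score the most
```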
+ def hmm_feature_selection(max_features, trials, train_params, relevance_params):
+     """
+     wrapper function that uses permutation importance to select features
+ 
+     Parameters:
+             max_features (int): target number of features
+             trials (int): training iterations
+             train_params (dict): dictionary containing training configurations
+             relevance_params (dict): dictionary containing validation configurations
+ 
+     Returns:
+             results (pd.DataFrame): summary relevance score per excluded feature
+     """
+     results = {'index': list(), 'feature_to_drop': list(), 'median relevance excluding feature': list()}
+     i = 0
+     init_numfeatures = len(train_params.get('features_hmm'))
+     while max_features <= init_numfeatures:
+         print(init_numfeatures)
+         if i == 0:
+             # first pass: train with the full feature set to get a baseline
+             exclude = None
+             r, model = iterate_training(trials, train_params, relevance_params)
+             for ri in r:
+                 results['index'].append(0)
+                 results['feature_to_drop'].append('full')
+                 results['median relevance excluding feature'].append(ri)
+         data_train = train_params.get('data')
+         chmm_pi = custom_hmm_permutation_importance(model, data_train, random_state=5, features=train_params.get('features_hmm'), lag=relevance_params.get('lag'))
+         chmm_pi.fit()
+         results_fp = pd.DataFrame(chmm_pi.results)
+         # drop the feature whose shuffling hurts the relevance score the least
+         feature_deltas = results_fp.median(axis=0)
+         feature_deltas = feature_deltas.sort_values(ascending=False)
+         feature_to_drop = feature_deltas.index[0]
+         print(f'excluding {feature_to_drop}')
+ 
+         train_params['features_hmm'].remove(feature_to_drop)
+         print(train_params['features_hmm'])
+         r, model = iterate_training(trials, train_params, relevance_params)
+         for ri in r:
+             results['index'].append(i+1)
+             results['feature_to_drop'].append(feature_to_drop)
+             results['median relevance excluding feature'].append(ri)
+         init_numfeatures = len(model[:-2].transform(data_train).columns)
+         i += 1
+     return pd.DataFrame(results)
+ 
+ 
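A hedged end-to-end call of the backward-elimination wrapper, using the same configuration dictionaries as above; note that it mutates `train_params['features_hmm']` in place.

```python
report = hmm_feature_selection(max_features=3, trials=20,
                               train_params=train_params,
                               relevance_params=relevance_params)
print(report.groupby("feature_to_drop")["median relevance excluding feature"].median())
```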
+ def seed_finder(train_params, relevance_params, n_seed = 100, max_results = 5):
+     """
+     iterate valid trainings to find the best starting seed
+ 
+     Parameters:
+             train_params (dict): dictionary containing training configurations
+             relevance_params (dict): dictionary containing validation configurations
+             n_seed (int): maximum number of iterations
+             max_results (int): number of results to keep before stopping the iteration
+ 
+     Returns:
+             df_results (pd.DataFrame): summary table of seed and relevance score
+     """
+     seeds = list()
+     i_ = 0
+     while len(seeds) < max_results and i_ < n_seed:
+         # give up faster when half of the budget is spent without any candidate
+         if i_ >= (n_seed*0.5) and len(seeds) == 0:
+             i_ += 10
+ 
+         seed = random.randint(50, 10000)
+         train_params['seed'] = seed
+         try:
+             th = trainer_hmm(**train_params)
+             th.train()
+             result_model = th.hmm_model
+             df_train_tmp = train_params.get('data')
+             df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+             df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+             relev, _, _ = states_relevance_score(df_train_tmp)
+             relevance_hmm = evaluate_model_chains(data = df_train_tmp,
+                                                   n_clusters=train_params.get('n_clusters'),
+                                                   at_least_states=relevance_params.get('at_least_states'),
+                                                   threshold_chain=relevance_params.get('threshold_chain'),
+                                                   at_least_length=relevance_params.get('at_least_length'))
+             if relevance_hmm:
+                 print('new model candidate was found, seed saved')
+                 seeds.append(seed)
+             i_ += 1
+         except Exception:
+             i_ += 1
+     print('best seeds', seeds)
+     ## re-train with each candidate seed and rank them by training relevance
+     results = {'seed': list(), 'train_relevance': list()}
+ 
+     for seed_x in seeds:
+         train_params['seed'] = seed_x
+         th = trainer_hmm(**train_params)
+         th.train()
+         result_model = th.hmm_model
+         df_train_tmp = train_params.get('data')
+         df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+         df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+         relev, _, _ = states_relevance_score(df_train_tmp)
+ 
+         results['seed'].append(seed_x)
+         results['train_relevance'].append(relev)
+ 
+     df_results = pd.DataFrame(results).sort_values(['train_relevance'], ascending = [False])
+     return df_results
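Finally, a hedged sketch of the seed search, again reusing the configuration dictionaries defined above.

```python
seed_report = seed_finder(train_params, relevance_params, n_seed=100, max_results=5)
best_seed = int(seed_report.iloc[0]["seed"])   # highest train_relevance first
train_params["seed"] = best_seed
```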