virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virgo_modules/__init__.py +1 -0
- virgo_modules/src/aws_utils.py +35 -3
- virgo_modules/src/backtester.py +474 -0
- virgo_modules/src/edge_utils/__init__.py +0 -0
- virgo_modules/src/edge_utils/conformal_utils.py +106 -0
- virgo_modules/src/edge_utils/edge_utils.py +502 -0
- virgo_modules/src/edge_utils/feature_selection.py +66 -0
- virgo_modules/src/edge_utils/shap_utils.py +54 -0
- virgo_modules/src/edge_utils/stack_model.py +94 -0
- virgo_modules/src/hmm_utils.py +494 -0
- virgo_modules/src/market/__init__.py +0 -0
- virgo_modules/src/market/market_tools.py +189 -0
- virgo_modules/src/markowitz/__init__.py +0 -0
- virgo_modules/src/markowitz/markowitz_utils.py +44 -0
- virgo_modules/src/re_utils.py +628 -85
- virgo_modules/src/ticketer_source.py +1351 -1066
- virgo_modules/src/transformer_utils.py +401 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/METADATA +16 -22
- virgo_modules-0.9.0.dist-info/RECORD +24 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/WHEEL +1 -1
- virgo_modules/src/edge_utils.py +0 -178
- virgo_modules-0.0.72.dist-info/RECORD +0 -12
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info/licenses}/LICENSE +0 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/top_level.txt +0 -0
virgo_modules/src/edge_utils/stack_model.py (new file)
@@ -0,0 +1,94 @@
+import numpy as np
+import pandas as pd
+
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+class MyStackingClassifierMultiClass(ClassifierMixin, BaseEstimator):
+    def __init__(self, estimators, meta_estimators,targets,perc=None,stack_size=None, **kwargs):
+        self.estimators = estimators
+        self.meta_estimators = meta_estimators
+        self.targets = targets
+        if stack_size and perc:
+            raise Exception('just one option')
+        if not stack_size and not perc:
+            raise Exception('set one option')
+        self.stack_size = stack_size
+        self.perc = perc
+
+    def get_index_training(self, X):
+        if self.stack_size:
+            unique_dates = list(X.index.get_level_values('Date_i').unique())
+            unique_dates.sort()
+            stack_chunk = unique_dates[-self.stack_size:]
+            base_indexes = X[~X.index.get_level_values('Date_i').isin(stack_chunk)].index.get_level_values('i')
+            meta_indexes = X[X.index.get_level_values('Date_i').isin(stack_chunk)].index.get_level_values('i')
+        elif self.perc:
+            meta_indexes = X.sample(frac = self.perc).index.get_level_values('i')
+            base_indexes = X[~X.index.get_level_values('i').isin(meta_indexes)].index.get_level_values('i')
+        else:
+            raise Exception("error", self.stack_size, self.perc)
+        return base_indexes, meta_indexes
+    def train_base_learner(self, classifier, X, y,indexes):
+        base_X = X[X.index.get_level_values('i').isin(indexes)]
+        base_y = y[y.index.get_level_values('i').isin(indexes)]
+        classifier.fit(base_X, base_y)
+    def fit(self, X, y):
+        # #base learners
+        base_indexes, meta_indexes = self.get_index_training(X)
+        for name,estimator in self.estimators:
+            self.train_base_learner(estimator,X, y, base_indexes)
+
+        #stack meta learner
+        metas_pred = dict()
+        for i,cont in enumerate(self.estimators, start=1):
+            _,estimator = cont
+            meta_pred = estimator.predict_proba(X[X.index.get_level_values('i').isin(meta_indexes)])
+            metas_pred[f"meta{i}0"] = meta_pred[0][:,1]
+            metas_pred[f"meta{i}1"] = meta_pred[1][:,1]
+        meta_preds_df = pd.DataFrame(metas_pred)
+
+        for i,metaest in enumerate(self.meta_estimators,start=0):
+            _,metaest = metaest
+            metacols = [f"meta{j}{i}" for j in range(1,len(self.estimators)+1)]
+            metaest.fit(
+                meta_preds_df[metacols],
+                y[X.index.get_level_values('i').isin(meta_indexes)][self.targets[i]]
+            )
+        self.is_fitted_ = True
+        self.classes_ = np.array([[0,1],[0,1]])
+
+    def predict_proba(self, X):
+        metas_pred = dict()
+        for i,cont in enumerate(self.estimators, start=1):
+            _,estimator = cont
+            meta_pred = estimator.predict_proba(X)
+            metas_pred[f"meta{i}0"] = meta_pred[0][:,1]
+            metas_pred[f"meta{i}1"] = meta_pred[1][:,1]
+        self.meta_preds_df__ = pd.DataFrame(metas_pred)
+
+        prediction_vector = list()
+        for i,cont in enumerate(self.meta_estimators, start=0):
+            _,estimator = cont
+            metacols = [f"meta{j}{i}" for j in range(1,len(self.estimators)+1)]
+            preds = estimator.predict_proba(self.meta_preds_df__[metacols].values)
+            prediction_vector.append(preds)
+        return prediction_vector
+
+    def predict(self, X):
+        prediction_vector = list()
+        _ = self.predict_proba(X)
+        for i,cont in enumerate(self.meta_estimators, start=0):
+            _,estimator = cont
+            metacols = [f"meta{j}{i}" for j in range(1,len(self.estimators)+1)]
+            preds = estimator.predict(self.meta_preds_df__[metacols].values)
+            prediction_vector.append(preds)
+
+        p = np.array(tuple(prediction_vector))
+        return p.reshape((p.shape[1],p.shape[0]))
+
+    def get_params(self, deep=True):
+        return {k:v for k, v in self.__dict__.items()}
+
+    def set_params(self, **parms):
+        for k,v in parms.items():
+            setattr(self,k,v)
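A minimal usage sketch for the stacking classifier added above (not part of the package): it assumes base learners whose predict_proba returns one (n, 2) array per target, e.g. scikit-learn's MultiOutputClassifier, and a feature frame whose MultiIndex carries the 'Date_i' and 'i' levels the class reads; all data and names below are placeholders.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.multioutput import MultiOutputClassifier
    from virgo_modules.src.edge_utils.stack_model import MyStackingClassifierMultiClass

    dates = pd.date_range("2020-01-01", periods=200, freq="D")
    idx = pd.MultiIndex.from_arrays([dates, np.arange(200)], names=["Date_i", "i"])
    X = pd.DataFrame(np.random.randn(200, 4), index=idx, columns=list("abcd"))
    y = pd.DataFrame({"target_up": np.random.randint(0, 2, 200),
                      "target_down": np.random.randint(0, 2, 200)}, index=idx)

    model = MyStackingClassifierMultiClass(
        estimators=[("rf", MultiOutputClassifier(RandomForestClassifier(n_estimators=50)))],
        meta_estimators=[("m_up", LogisticRegression()), ("m_down", LogisticRegression())],
        targets=["target_up", "target_down"],
        stack_size=30,  # hold out the 30 most recent dates to fit the meta learners
    )
    model.fit(X, y)
    proba_up, proba_down = model.predict_proba(X)  # one (n, 2) array per target

With stack_size set, the most recent dates are held out to fit the meta learners; passing perc instead holds out a random fraction of rows.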
virgo_modules/src/hmm_utils.py (new file)
@@ -0,0 +1,494 @@
+from hmmlearn.hmm import GaussianHMM
+
+from sklearn.pipeline import Pipeline
+from feature_engine.imputation import MeanMedianImputer
+from virgo_modules.src.transformer_utils import FeatureSelector
+from feature_engine.selection import DropCorrelatedFeatures
+from sklearn.preprocessing import RobustScaler
+
+import pandas as pd
+import numpy as np
+import random
+
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import seaborn as sns; sns.set()
+
+def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
+    '''
+    calculate relevance score and summary report for hmm model
+
+    Parameters:
+        default_benchmark_sd (float): default value to bias SD for t calculation
+        t_threshold (float): alpha or z threshold for the normalized score
+
+    Returns:
+        mean_relevance (float): mean relevance score of the states
+        cluster_returns (pd.DataFrame): summary report of the analysis
+        number_relevant_states (int): number of relevant states
+    '''
+    ## legnths
+    cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
+    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
+    ## means
+    def quantile2(x):
+        return x.quantile(0.25)
+    def quantile3(x):
+        return x.quantile(0.75)
+
+    cluster_returns = data.groupby('hmm_feature').agg(
+        n_uniques = ('chain_id','nunique'),
+        n_obs = ('Date','count'),
+        cluster_ret_q25 = ('chain_return',quantile2),
+        cluster_ret_median = ('chain_return','median'),
+        cluster_ret_q75 = ('chain_return',quantile3),
+    )
+    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
+    cluster_returns['perc_dispute'] = np.where(
+        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
+        1,0
+    )
+    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
+    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
+    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
+    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
+    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
+    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
+    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
+
+    ## relevance or importance
+    # naive aproach
+    cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
+    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
+    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
+    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
+
+    mean_relevance = cluster_returns['abs_t_accpted'].mean()
+    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
+
+    return mean_relevance, cluster_returns, number_relevant_states
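Written out, the acceptance statistic that states_relevance_score builds above is:

    t_calc = (cluster_ret_median - 0) / sqrt( iqr / n_obs + default_benchmark_sd / n_obs )

A state is flagged relevant when |t_calc| exceeds t_threshold (2 by default), and the returned mean_relevance is the mean of |t_calc| across the observed states.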
+
+def create_hmm_derived_features(df, lag_returns):
+    """
+    create features derived from hmm states features. Features are the index of the state, the duration of the state, chain raturn
+    note: this is a copy of the method of the ticketer_object with the same name
+
+    Parameters:
+        df (pd.DataFrame): dataframe that must have hmm_feature columns
+        lag_returns (int): lag paramter (not used)
+
+    Returns:
+        df (pd.DataFrame): dataframe with extra hmm features as columns
+    """
+    df = df.sort_values('Date')
+    ## indexing chains
+    df['lag_hmm_feature'] = df['hmm_feature'].shift(1)
+    df['breack'] = np.where(df['lag_hmm_feature'] != df['hmm_feature'],1,0)
+    df["chain_id"] = df.groupby("breack")["Date"].rank(method="first", ascending=True)
+    df["chain_id"] = np.where(df['breack'] == 1,df["chain_id"],np.nan)
+    df["chain_id"] = df["chain_id"].fillna(method='ffill')
+    df["hmm_chain_order"] = df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
+    ### returns using the windowsseeds
+    df['lag_chain_close'] = df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+    df['chain_return'] = (df['Close']/df['lag_chain_close'] -1) * 100
+    df = df.drop(columns = ['breack'])
+    return df
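A minimal sketch (not part of the package) of how the two helpers above compose: label each row with a state, derive the chain features, then score the states. The random state labels stand in for the output of a fitted HMM pipeline's predict().

    import numpy as np
    import pandas as pd
    from virgo_modules.src.hmm_utils import create_hmm_derived_features, states_relevance_score

    df = pd.DataFrame({
        "Date": pd.date_range("2021-01-01", periods=300, freq="D"),
        "Close": 100 + np.cumsum(np.random.randn(300)),
    })
    df["hmm_feature"] = np.random.randint(0, 3, 300)     # placeholder for hmm_pipeline.predict(df)
    df = create_hmm_derived_features(df, lag_returns=1)  # adds chain_id, hmm_chain_order, chain_return, ...
    mean_relevance, report, n_relevant = states_relevance_score(df.dropna())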
+
+class trainer_hmm():
+    """
+    wrapper that gaussian model
+    this class follows scikit learn practices
+
+    Attributes
+    ----------
+    hmm_model: obj
+        pipeline and model
+    features_hmm: list
+        list of features used to train the gaussian model
+
+    Methods
+    -------
+    train():
+        train pipeline given the parameters in the class initiliazation
+    plot_training_results(lag_diff_returns=int):
+        plot features and closing prices displaying the states
+        plot the returns distribution by state given lag to calculate the returns in the chains
+    """
+    def __init__(self, data, features_hmm, n_clusters= 3, corr_thrshold = 0.65, seed = None):
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        data (pd.DataFrame): training data
+        features_hmm (list): features to pass for modeling
+        n_clusters (int): number or states to train
+        corr_thrshold (float): correlation threhsold for initial feature selection
+        seed (int): random state for model reproducibility
+
+        Returns
+        -------
+        None
+        """
+        self.__data_train = data
+        self.__features_hmm = features_hmm
+        self.__n_clusters = n_clusters
+        self.__corr_thrshold = corr_thrshold
+        self.__seed = seed
+    def train(self):
+        """
+        train pipeline and model
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        transform_pipe = Pipeline([
+            ('selector', FeatureSelector(columns=self.__features_hmm)),
+            ('fillna', MeanMedianImputer(imputation_method='median',variables=self.__features_hmm)),
+            ('drop_correlated', DropCorrelatedFeatures(method='spearman',threshold=self.__corr_thrshold)),
+        ])
+
+        features_hmm_ = list(transform_pipe.fit_transform(self.__data_train).columns)
+        n_features = len(features_hmm_)
+        start_prob = 0.60
+        startprob_prior = np.array([1/self.__n_clusters]*self.__n_clusters)
+        transmat_prior = np.diag([start_prob]*self.__n_clusters)
+        transmat_prior[transmat_prior==0] = (1-start_prob)/(1-self.__n_clusters)
+        means_prior = np.array([1/n_features]*n_features)
+        pipeline_hmm = Pipeline([
+            ('transfrom_pipe', transform_pipe),
+            ('scaler', RobustScaler()),
+            ('hmm', GaussianHMM(
+                n_components = self.__n_clusters, covariance_type = 'spherical',
+                startprob_prior = startprob_prior,
+                transmat_prior = transmat_prior,
+                means_prior = means_prior,
+                random_state = self.__seed,)
+            )
+        ])
+
+        self.hmm_model = pipeline_hmm.fit(self.__data_train)
+        self.features_hmm = [x for x in self.__features_hmm if x not in list(self.hmm_model[0][-1].features_to_drop_)]
+
+    def plot_training_results(self, lag_diff_returns):
+        """
+        plot result as matplot figure
+
+        Parameters
+        ----------
+        lag_diff_returns (int): lag or diff factor to calculate returns of chains
+
+        Returns
+        -------
+        None
+        """
+        n_clusters = self.__n_clusters
+        df_train = self.__data_train.copy()
+        df_train['hmm_feature'] = self.hmm_model.predict(df_train)
+        df_train = create_hmm_derived_features(df_train, lag_diff_returns,)
+        n = len(self.features_hmm)+1
+        fig, axs = plt.subplots(n, 1, figsize=(10, 3*n), sharex=True)
+        for i,feature in enumerate(self.features_hmm):
+            axs[i].plot(df_train.Date, df_train[feature])
+            axs[i].set_title(feature)
+            for s in range(n_clusters):
+                df = df_train[df_train['hmm_feature'] == s]
+                axs[i].scatter(df.Date, df[feature])
+
+        axs[i+1].plot(df_train.Date, df_train.Close)
+        axs[i+1].set_title('close price')
+        for s in range(n_clusters):
+            df = df_train[df_train['hmm_feature'] == s]
+            axs[i+1].scatter(df.Date, df.Close)
+
+        n = 1
+        fig, axs = plt.subplots(n, 1, figsize=(10, 3*n), sharex=True)
+        df_plot = df_train.dropna()
+        sns.boxplot(data=df_plot, x="hmm_feature", y="chain_return", hue="hmm_feature", ax=axs)
+        axs.axhline(0.5, linestyle='--')
+        del df_train
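A minimal training sketch for trainer_hmm (not part of the package). The frame must carry the model features plus Date and Close, which plot_training_results uses; df, feature_a and feature_b are placeholders.

    import numpy as np
    import pandas as pd
    from virgo_modules.src.hmm_utils import trainer_hmm

    df = pd.DataFrame({
        "Date": pd.date_range("2021-01-01", periods=300, freq="D"),
        "Close": 100 + np.cumsum(np.random.randn(300)),
        "feature_a": np.random.randn(300),
        "feature_b": np.random.randn(300),
    })
    th = trainer_hmm(data=df, features_hmm=["feature_a", "feature_b"], n_clusters=3, corr_thrshold=0.65, seed=42)
    th.train()                                    # selector -> imputer -> correlation filter -> scaler -> GaussianHMM
    df["hmm_feature"] = th.hmm_model.predict(df)  # one state label per row
    th.plot_training_results(lag_diff_returns=1)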
+
+def evaluate_model_chains(data, n_clusters, at_least_states, threshold_chain, at_least_length):
+    """
+    function that is going to assess chains or series of states given some sanity chekcs
+
+    Parameters:
+        data (pd.DataFrame): dataframe that must have hmm_feature and extra features
+        n_clusters (int): n_clusters that are trainned, not observed
+        at_least_states (int): number of states that should be ,at least, observed
+        threshold_chain (int): number of times that a state should be , at least, observed
+        at_least_length (int): minimal lenght that the states should have using a statical measure (median, q75, max, etc)
+
+    Returns:
+        result (boolean): true if the model complies with parameters
+    """
+    def q3(x):
+        return x.quantile(0.75)
+    tmp_df = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
+    tmp_df = tmp_df.groupby("hmm_feature", as_index = False).agg(count = ('chain_id','nunique'), median_length = ('chain_lenght','median'), q3_length = ('chain_lenght',q3))
+    train_observedstates = len(tmp_df)
+
+    states_under_threshold = list(tmp_df[tmp_df['count'] <= threshold_chain].hmm_feature)
+    n_states_under_threshold = len(states_under_threshold)
+    min_count = np.min(tmp_df[~tmp_df.hmm_feature.isin(states_under_threshold)]['count'].values)
+    med_length = np.min(tmp_df['q3_length'].values)
+
+    condition_1 = threshold_chain <= min_count
+    condition_2 = n_states_under_threshold <= at_least_states
+    condition_3 = at_least_length <= med_length
+    condition_4 = (train_observedstates == n_clusters)
+
+    result = False
+
+    if condition_1 and condition_2 and condition_3 and condition_4:
+        result = True
+    else:
+        result = False
+    return result
+
+def iterate_training(trials, train_params, relevance_params):
+    """
+    iterate valid training
+
+    Parameters:
+        trials (int): number of repetitions to iterate
+        train_params (dict): dictionary containing training configurations
+        relevance_params (dict): dictionary containing validation configurations
+
+    Returns:
+        results (list): list of valid relevance scores
+        kept_model (obj): model (pipeling) that is kept, if it exists
+    """
+    results = list()
+    kept_model=None
+    for _ in range(trials):
+        try:
+            th = trainer_hmm(**train_params)
+            th.train()
+            result_model = th.hmm_model
+            df_train_tmp = train_params.get('data')
+            df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+            df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+            relev, _, _ = states_relevance_score(df_train_tmp)
+            relevance_hmm = evaluate_model_chains(data = df_train_tmp,
+                                                  n_clusters=train_params.get('n_clusters'),
+                                                  at_least_states=relevance_params.get('at_least_states'),
+                                                  threshold_chain=relevance_params.get('threshold_chain'),
+                                                  at_least_length=relevance_params.get('at_least_length'))
+            if relevance_hmm:
+                results.append(relev)
+                kept_model = result_model
+        except:
+            pass
+        del th
+    if not kept_model:
+        raise TypeError("no model was kept")
+    return results, kept_model
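The iteration helpers (iterate_training above, and hmm_feature_selection / seed_finder further down) all consume the same two dictionaries. The keys below mirror the .get() calls and the trainer_hmm signature; the values are placeholder choices, and df / feature_a / feature_b are the placeholders from the trainer_hmm sketch above.

    from virgo_modules.src.hmm_utils import iterate_training

    train_params = {
        "data": df,                                 # training frame with Date, Close and the features
        "features_hmm": ["feature_a", "feature_b"],
        "n_clusters": 3,
        "corr_thrshold": 0.65,
    }
    relevance_params = {
        "lag": 1,                 # lag used for chain returns
        "at_least_states": 1,     # states allowed below the chain-count threshold
        "threshold_chain": 5,     # minimum number of chains per state
        "at_least_length": 2,     # minimum q75 chain length across states
    }
    scores, model = iterate_training(trials=20, train_params=train_params, relevance_params=relevance_params)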
+
+class custom_hmm_permutation_importance():
+    """
+    class that is going to perform feature importance using feature permutation
+    note: this method is inpired in the same method that is available in scikit-learn
+
+    Attributes
+    ----------
+    n_repeats: int
+        number of shufflings performed per feature
+    features: list
+        list of features that is going to be tested, note that these features have to be the input of the model
+    results: dict
+        dictionary with the results containing feature and relevance scores per each iteration
+
+    Methods
+    -------
+    fit():
+        fit class
+    """
+    def __init__(self, model, X, n_repeats=5,random_state=False, features = list(), lag = 4):
+        """
+        Initialize object
+
+        Parameters
+        ----------
+        model (obj): pipeline or model
+        X (pd.DataFrame): input data to test feature permutation
+        n_repeats (int): number or trials per feature
+        random_state (bool): if true set a random state
+        features (list): list of features to be tested. note that the features have to be input of the model
+        lag (int): lag of diff factor to calculate chain returns
+
+        Returns
+        -------
+        None
+        """
+        self.__model = model
+        self.__X = X
+        self.n_repeats = n_repeats
+        self.__random_state = random_state
+        self.features = features
+        self.__lag = lag
+    def __generate_seeds(self):
+        """
+        generate list of seeds
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        if self.__random_state:
+            self.__seeds = list()
+            for _ in range(self.n_repeats):
+                seed = np.random.randint(1,500)
+                self.__seeds.append(seed)
+    def fit(self):
+        """
+        fit class
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        self.__X['hmm_feature'] = self.__model.predict(self.__X)
+        self.__X = create_hmm_derived_features(df=self.__X, lag_returns=self.__lag)
+        init_relevance, _, _ = states_relevance_score(self.__X)
+        self.results = {feature: list() for feature in self.features}
+        if self.__random_state:
+            self.__generate_seeds()
+        for feature in self.features:
+            X_ = self.__X.dropna().reset_index(drop = True).copy()
+            for j in range(self.n_repeats):
+                if self.__random_state:
+                    seed = self.__seeds[j]
+                    np.random.seed(seed)
+                else:
+                    seed = None
+                shuffled = X_[feature].sample(frac=1, random_state = seed, replace = True).reset_index(drop=True)
+                X_[feature] = shuffled
+                X_['hmm_feature'] = self.__model.predict(X_)
+                X_ = create_hmm_derived_features(df=X_, lag_returns=self.__lag)
+
+                tmp_df = X_.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
+                tmp_df = tmp_df.groupby("hmm_feature", as_index = False).agg(count = ('chain_id','nunique'), median_length = ('chain_lenght','median')).copy()
+                mean_relevance, _, _ = states_relevance_score(X_)
+                self.results[feature].append(mean_relevance - init_relevance)
+            del X_
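A sketch (not part of the package) of the permutation-importance helper, reusing the fitted pipeline and placeholder frame from the trainer_hmm sketch above. fit() mutates the frame it receives, hence the copy.

    import pandas as pd
    from virgo_modules.src.hmm_utils import custom_hmm_permutation_importance

    pi = custom_hmm_permutation_importance(
        th.hmm_model, df.copy(), n_repeats=5, random_state=True,
        features=["feature_a", "feature_b"], lag=1,
    )
    pi.fit()
    deltas = pd.DataFrame(pi.results)   # one column per feature, one row per repeat
    print(deltas.median(axis=0))        # change in mean relevance after shuffling each feature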
+
+def hmm_feature_selection(max_features, trials, train_params, relevance_params):
+    """
+    wrapper function that is going to use permutation importance to select features
+
+    Parameters:
+        ax_features (int): target to number of features
+        trials (int): training iterations
+        train_params (dict): dictionary containing training configurations
+        relevance_params (dict): dictionary containing validation configurations
+
+    Returns:
+        results (pd.DataFrame): summary relevace score per excluded feature
+    """
+    results = {'index':list(),'feature_to_drop':list(), 'median relevance excluding feature':list()}
+    i=0
+    init_numfeatures = len(train_params.get('features_hmm'))
+    while max_features <= init_numfeatures:
+        print(init_numfeatures)
+        if i==0:
+            exclude = None
+            r,model= iterate_training(trials, train_params, relevance_params)
+            for ri in r:
+                results['index'].append(0)
+                results['feature_to_drop'].append('full')
+                results['median relevance excluding feature'].append(ri)
+        data_train = train_params.get('data')
+        chmm_pi = custom_hmm_permutation_importance(model, data_train,random_state=5, features = train_params.get('features_hmm'), lag = relevance_params.get('lag'))
+        chmm_pi.fit()
+        results_fp = pd.DataFrame(chmm_pi.results)
+        feature_deltas = results_fp.median(axis = 0)
+        feature_deltas = feature_deltas.sort_values(ascending = False)
+        feature_to_drop = feature_deltas.index[0]
+        print(f'excluding {feature_to_drop}')
+
+        train_params['features_hmm'].remove(feature_to_drop)
+        print(train_params['features_hmm'])
+        r,model = iterate_training(trials, train_params, relevance_params)
+        for ri in r:
+            results['index'].append(i+1)
+            results['feature_to_drop'].append(feature_to_drop)
+            results['median relevance excluding feature'].append(ri)
+        init_numfeatures = len(model[:-2].transform(data_train).columns)
+        i+=1
+    return pd.DataFrame(results)
+
+
+def seed_finder(train_params, relevance_params, n_seed = 100,max_results =5):
+    """
+    iterate valid training finding best starter seed
+
+    Parameters:
+        train_params (dict): dictionary containing training configurations
+        relevance_params (dict): dictionary containing validation configurations
+        n_seed (int): number of iterations
+        max_results (int): number of max results to keep and stop the iteration
+
+    Returns:
+        df_results (pd.DataFrame): summary table of seed and relevance score
+    """
+    seeds = list()
+    i_ = 0
+    while len(seeds) < max_results and i_ < n_seed:
+        # print(i_)
+        if i_ >= (n_seed*0.5) and len(seeds) == 0:
+            i_ += 10
+
+        seed = random.randint(50, 10000)
+        train_params['seed'] = seed
+        try:
+            th = trainer_hmm(**train_params)
+            th.train()
+            result_model = th.hmm_model
+            df_train_tmp = train_params.get('data')
+            df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+            df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+            relev, _, _ = states_relevance_score(df_train_tmp)
+            relevance_hmm = evaluate_model_chains(data = df_train_tmp,
+                                                  n_clusters=train_params.get('n_clusters'),
+                                                  at_least_states=relevance_params.get('at_least_states'),
+                                                  threshold_chain=relevance_params.get('threshold_chain'),
+                                                  at_least_length=relevance_params.get('at_least_length'))
+            if relevance_hmm:
+                print('new model candidate was found, seed saved')
+                seeds.append(seed)
+            i_ += 1
+        except:
+            i_ += 1
+    print('best seeds', seeds)
+    ## searching the best seed
+    results = {'seed' : list(),'train_relevance': list()}
+
+    for seed_x in seeds:
+        train_params['seed'] = seed_x
+        th = trainer_hmm(**train_params)
+        th.train()
+        result_model = th.hmm_model
+        df_train_tmp = train_params.get('data')
+        df_train_tmp['hmm_feature'] = result_model.predict(df_train_tmp)
+        df_train_tmp = create_hmm_derived_features(df = df_train_tmp, lag_returns = relevance_params.get('lag'))
+        relev, _, _ = states_relevance_score(df_train_tmp)
+
+        results['seed'].append(seed_x)
+        results['train_relevance'].append(relev)
+
+    df_results = pd.DataFrame(results).sort_values(['train_relevance'], ascending = [False])
+    return df_results
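A short sketch (not part of the package) of the seed search, reusing the placeholder train_params / relevance_params dictionaries from the iterate_training sketch above. Note that seed_finder mutates train_params['seed'] while it searches.

    from virgo_modules.src.hmm_utils import seed_finder

    seed_report = seed_finder(train_params, relevance_params, n_seed=100, max_results=5)
    print(seed_report.head())   # seeds sorted by training relevance, best first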