virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,502 @@
+ import numpy as np
+ import itertools
+ import random
+ import math
+
+ from sklearn.metrics import roc_auc_score, precision_score, recall_score
+ from sklearn.pipeline import Pipeline
+
+ from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
+ from feature_engine.imputation import MeanMedianImputer
+ from feature_engine.discretisation import EqualWidthDiscretiser
+ from feature_engine.datetime import DatetimeFeatures
+
+ from ..transformer_utils import (
+     VirgoWinsorizerFeature,
+     InverseHyperbolicSine,
+     FeaturesEntropy,
+     FeatureSelector,
+     InteractionFeatures,
+     SplineMarketReturnJumpWaves
+ )
+
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+
+ class produce_model_wrapper:
+     """
+     Class that wraps a preprocessing pipeline and a machine learning model. It also splits the data into train and validation sets.
+
+     Attributes
+     ----------
+     data : pd.DataFrame
+         input dataset
+     X_train : pd.DataFrame
+     y_train : pd.DataFrame
+     X_val : pd.DataFrame
+     y_val : pd.DataFrame
+     pipeline : obj
+         sklearn Pipeline combining the preprocessing pipeline and the model
+
+     Methods
+     -------
+     preprocess(validation_size=int, target=list):
+         ingest data and split it into train/validation sets and X/y matrices
+     train_model(pipe=obj, model=obj, cv_=boolean):
+         merge and train the pipeline and the machine learning model
+     """
+     def __init__(self, data):
+         """
+         Initialize object
+
+         Parameters
+         ----------
+         data (pd.DataFrame): data
+
+         Returns
+         -------
+         None
+         """
+         self.data = data.copy()
+
+     def preprocess(self, validation_size, target):
+         """
+         ingest data and split it into train/validation sets and X/y matrices
+
+         Parameters
+         ----------
+         validation_size (int): validation data size; the remainder is used as training data
+         target (list): target column list
+
+         Returns
+         -------
+         None
+         """
+         val_date = self.data.groupby('Date', as_index=False).agg(target_down=(target[0], 'count')).sort_values('Date').iloc[-validation_size:].head(1)['Date'].values[0]
+
+         train_data = self.data[self.data['Date'] < val_date].dropna()
+         val_data = self.data[self.data['Date'] >= val_date].dropna()
+
+         columns = [x for x in train_data.columns if x not in target]
+         X_train, y_train = train_data[columns], train_data[target]
+         X_val, y_val = val_data[columns], val_data[target]
+         self.X_train = X_train
+         self.y_train = y_train
+         self.X_val = X_val
+         self.y_val = y_val
+
+     def train_model(self, pipe, model, cv_=False):
+         """
+         merge and train the pipeline and the machine learning model
+
+         Parameters
+         ----------
+         pipe (obj): sklearn pipeline object
+         model (obj): machine learning model
+         cv_ (boolean): unused; kept for interface compatibility
+
+         Returns
+         -------
+         None
+         """
+         self.model = model
+         self.pipe_transform = pipe
+         self.pipeline = Pipeline([('pipe_transform', self.pipe_transform), ('model', self.model)])
+         self.pipeline.fit(self.X_train, self.y_train)
+         self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
+
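
A minimal usage sketch (editor's illustration, not part of the package). It assumes a DataFrame with a 'Date' column and two binary target columns, plus a generic sklearn-compatible classifier; `my_pipeline` is a hypothetical preprocessing pipeline such as the one built by data_processing_pipeline_classifier below.

    # Illustrative sketch only; column names and the classifier are assumptions.
    from sklearn.ensemble import RandomForestClassifier

    wrapper = produce_model_wrapper(df)  # df: features + 'Date' + target columns
    wrapper.preprocess(validation_size=60, target=['target_down', 'target_up'])
    wrapper.train_model(pipe=my_pipeline, model=RandomForestClassifier())
    print(wrapper.features_to_model)     # columns that reach the model
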
+ class register_results():
+     """
+     class that collects model metrics
+
+     Attributes
+     ----------
+     model_name : str
+         model name
+     metric_logger : dict
+         dictionary that collects model metrics
+
+     Methods
+     -------
+     eval_metrics(pipeline=obj, X=pd.DataFrame, y=pd.DataFrame, type_data=str, phase=str):
+         register model metrics
+     print_metric_logger():
+         print logger results
+     """
+     def __init__(self, model_name):
+         """
+         Initialize object
+
+         Parameters
+         ----------
+         model_name (str): model name
+
+         Returns
+         -------
+         None
+         """
+         self.model_name = model_name
+         self.metric_logger = dict()
+
+     def eval_metrics(self, pipeline, X, y, type_data, phase):
+         """
+         register model metrics
+
+         Parameters
+         ----------
+         pipeline (obj): model pipeline
+         X (pd.DataFrame): input data
+         y (pd.DataFrame): target data
+         type_data (str): data type, either train, test or validation
+         phase (str): model phase, either baseline, feature selection or tuned model
+
+         Returns
+         -------
+         None
+         """
+         preds_proba = pipeline.predict_proba(X)
+         preds = pipeline.predict(X)
+
+         # multi-output classifiers return one probability array per target
+         if isinstance(preds_proba, list):
+             preds_proba = np.array([x[:, 1] for x in preds_proba]).T
+
+         roc = roc_auc_score(y, preds_proba, average=None)
+         precision = precision_score(y, preds, average=None)
+         recall = recall_score(y, preds, average=None)
+
+         self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc': roc, 'precision': precision, 'recall': recall}
+
+     def print_metric_logger(self):
+         """
+         print logger results
+
+         Parameters
+         ----------
+         None
+
+         Returns
+         -------
+         None
+         """
+         parts = list(self.metric_logger)
+         phase_parts = [x.split('//')[0] for x in parts]
+
+         init_phase = phase_parts[0]
+         print(f'---{init_phase}--')
+         for phase, val in zip(phase_parts, self.metric_logger):
+             stage = val.split('//')[2]
+             if init_phase != phase:
+                 print(f'---{phase}--')
+                 init_phase = phase
+             for metric in self.metric_logger[val]:
+                 print(stage, metric, self.metric_logger[val][metric])
+
+
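
A hedged usage sketch (editor's addition): registering metrics for the same fitted pipeline on train and validation data, then printing everything collected.

    # Illustrative only; 'wrapper' is the fitted produce_model_wrapper from above.
    logger = register_results(model_name='rf_baseline')
    logger.eval_metrics(wrapper.pipeline, wrapper.X_train, wrapper.y_train,
                        type_data='train', phase='baseline')
    logger.eval_metrics(wrapper.pipeline, wrapper.X_val, wrapper.y_val,
                        type_data='validation', phase='baseline')
    logger.print_metric_logger()
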
+ def eval_metrics(pipeline, X, y, type_data, model_name):
+     '''
+     print metrics from a model pipeline
+
+     Parameters:
+         pipeline (obj): model pipeline
+         X (pd.DataFrame): input data
+         y (pd.DataFrame): target data
+         type_data (str): data type, either train, test or validation
+         model_name (str): model name
+
+     Returns:
+         None
+     '''
+     preds_proba = pipeline.predict_proba(X)
+     preds = pipeline.predict(X)
+
+     # multi-output classifiers return one probability array per target
+     if isinstance(preds_proba, list):
+         preds_proba = np.array([x[:, 1] for x in preds_proba]).T
+
+     print(f'--{type_data} - {model_name}--')
+     print('--target: down, up--')
+     print('--roc-auc--')
+     print(roc_auc_score(y, preds_proba, average=None))
+     print('--precision--')
+     print(precision_score(y, preds, average=None))
+     print('--recall--')
+     print(recall_score(y, preds, average=None))
+
+
+ def data_processing_pipeline_classifier(
+     features_base, features_to_drop=False, winsorizer_conf=False, discretize_columns=False,
+     bins_discretize=10, correlation=0.85, fillna=True,
+     invhypervolsin_features=False,
+     date_features_list=False,
+     entropy_set_list=False,
+     interaction_features_cont=False,
+     spline_regression_config=False,
+     pipeline_order='selector//winzorizer//discretizer//median_inputer//drop//correlation'
+ ):
+     '''
+     pipeline builder
+
+     Parameters:
+         features_base (list): base list of features to select
+         features_to_drop (list): features to drop
+         winsorizer_conf (dict): winsorising configuration dictionary
+         discretize_columns (list): feature list to discretize
+         bins_discretize (int): number of bins to discretize
+         correlation (float): correlation threshold to discard correlated features
+         fillna (boolean): if True, impute missing values with the median
+         invhypervolsin_features (list): features to apply the inverse hyperbolic sine to
+         date_features_list (list): features to compute from the Date field (feature_engine DatetimeFeatures)
+         entropy_set_list (list): list of dictionaries containing features and targets to compute entropy
+         interaction_features_cont (tuple): tuple of lists of interaction features
+         spline_regression_config (dict): configuration for SplineMarketReturnJumpWaves
+         pipeline_order (str): custom pipeline order, e.g. selector//winzorizer//discretizer//median_inputer//drop//correlation
+
+     Returns:
+         pipe (obj): pipeline object
+     '''
+     select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
+     winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
+     drop_pipe = [('drop_features', DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
+     discretize = [('discretize', EqualWidthDiscretiser(discretize_columns, bins=bins_discretize))] if discretize_columns else []
+     drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation, method='spearman'))] if correlation else []
+     median_imputer_pipe = [('median_imputer', MeanMedianImputer())] if fillna else []
+     invhypersin_pipe = [('invhypervolsin scaler', InverseHyperbolicSine(features=invhypervolsin_features))] if invhypervolsin_features else []
+     datetimeFeatures_pipe = [('date features', DatetimeFeatures(features_to_extract=date_features_list, variables='Date', drop_original=False))] if date_features_list else []
+     interaction_features = [("interaction features", InteractionFeatures(interaction_features_cont[0], interaction_features_cont[1]))] if interaction_features_cont else []
+     spline_features = [("spline features", SplineMarketReturnJumpWaves(
+         return_feature_names=spline_regression_config.get("return_feature_names"),
+         target_variables=spline_regression_config.get("target_variables"),
+         feature_label=spline_regression_config.get("feature_label"),
+     ))] if spline_regression_config else []
+
+     entropy_pipe = list()
+     if entropy_set_list:
+         for setx_ in entropy_set_list:
+             setx = setx_['set'].split('//')
+             target_ = setx_['target']
+             subpipe_name = '_'.join(setx) + 'entropy'
+             entropy_pipe.append((subpipe_name, FeaturesEntropy(features=setx, target=target_)))
+
+     pipe_dictionary = {
+         'selector': select_pipe,
+         'winzorizer': winzorizer_pipe,
+         'drop': drop_pipe,
+         'discretizer': discretize,
+         'correlation': drop_corr,
+         'median_inputer': median_imputer_pipe,
+         'arcsinh_scaler': invhypersin_pipe,
+         'date_features': datetimeFeatures_pipe,
+         'interaction_features': interaction_features,
+         'entropy_features': entropy_pipe,
+         "spline_features": spline_features,
+     }
+
+     pipeline_steps = pipeline_order.split('//')
+     ## validation
+     for step in pipeline_steps:
+         if step not in pipe_dictionary:
+             raise Exception(f'{step} step not in list of steps, the list is: {list(pipe_dictionary.keys())}')
+
+     pipeline_args = [pipe_dictionary[step] for step in pipeline_steps]
+     pipeline_args = list(itertools.chain.from_iterable(pipeline_args))
+     pipe = Pipeline(pipeline_args)
+
+     return pipe
+
+
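
A hedged example of calling the builder (editor's sketch; the feature names are placeholders, not columns the package defines):

    # Illustrative only; feature names are hypothetical.
    pipe = data_processing_pipeline_classifier(
        features_base=['Date', 'Close', 'volume', 'rsi'],
        features_to_drop=['Date'],
        discretize_columns=['volume'],
        bins_discretize=10,
        correlation=0.85,
        pipeline_order='selector//discretizer//median_inputer//drop//correlation',
    )
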
+ class ExpandingMultipleTimeSeriesKFold:
+     """
+     class that creates a custom cv schema that is compatible with sklearn cv arguments.
+
+     Attributes
+     ----------
+     df : pd.DataFrame
+         dataset
+     number_window : int
+         number of train splits
+     window_size : int
+         window size data
+     overlap_size : int
+         overlap size
+     sample_parts : tuple(float, str)
+         (fraction, index level) of partition units to drop from the train set
+     embargo : int
+         number of dates dropped from the tail of the training window
+
+     Methods
+     -------
+     split(X=pd.DataFrame, y=pd.DataFrame, groups=None):
+         custom split procedure
+     get_n_splits(X=pd.DataFrame, y=pd.DataFrame, groups=None):
+         get number of splits
+     """
+
+     def __init__(self, df, window_size=100, number_window=3, overlap_size=0, sample_parts=None, embargo=0):
+         """
+         Initialize object
+
+         Parameters
+         ----------
+         df (pd.DataFrame): dataset
+         window_size (int): window size data
+         number_window (int): number of train splits
+         overlap_size (int): overlap size
+         sample_parts (tuple(float, str)): fraction of partition units to drop from the train set and the index level that identifies them
+         embargo (int): number of dates to drop from the tail of the training window
+
+         Returns
+         -------
+         None
+         """
+         self.df = df
+         self.number_window = number_window
+         self.window_size = window_size
+         self.overlap_size = overlap_size
+         self.sample_parts = sample_parts
+         self.embargo = embargo
+
+     def split(self, X, y, groups=None):
+         """
+         custom split procedure
+
+         Parameters
+         ----------
+         X (pd.DataFrame): input data (required for sklearn classes)
+         y (pd.DataFrame): target data (required for sklearn classes)
+         groups: group labels (unused; required for sklearn classes)
+
+         Yields
+         ------
+         train_index, test_index (pd.Index): row identifiers for each fold
+         """
+         if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
+             raise Exception('no date and/or index in the index dataframe')
+
+         if self.overlap_size > self.window_size:
+             raise Exception('overlap cannot be higher than the window size')
+
+         unique_dates = list(self.df.index.get_level_values('Date_i').unique())
+         unique_dates.sort()
+
+         total_test_size = self.window_size * self.number_window
+         total_test_size = total_test_size - (self.number_window - 1) * self.overlap_size
+
+         if total_test_size > len(unique_dates):
+             raise Exception('test size is higher than the data length')
+
+         cut = total_test_size
+         for fold in range(self.number_window):
+
+             topcut = cut - self.window_size
+             train_dates = unique_dates[:-(cut + self.embargo)]
+             test_dates = unique_dates[-cut:-topcut]
+
+             if topcut == 0:
+                 test_dates = unique_dates[-cut:]
+
+             max_train_date = max(train_dates)
+             min_test_date, max_test_date = min(test_dates), max(test_dates)
+
+             cut = cut - (self.window_size - self.overlap_size)
+
+             if self.sample_parts:
+                 sample_part = self.sample_parts[0]
+                 part_col = self.sample_parts[1]
+                 unique_parts = list(self.df.index.get_level_values(part_col).unique())
+                 random.shuffle(unique_parts)
+                 n_select = math.ceil(len(unique_parts) * sample_part)
+                 to_drop = unique_parts[0:n_select]
+                 train_index = self.df[
+                     (self.df.index.get_level_values('Date_i') <= max_train_date)
+                     &
+                     (~self.df.index.get_level_values(part_col).isin(to_drop))].index.get_level_values('i')
+             else:
+                 train_index = self.df[self.df.index.get_level_values('Date_i') <= max_train_date].index.get_level_values('i')
+             test_index = self.df[(self.df.index.get_level_values('Date_i') >= min_test_date) & (self.df.index.get_level_values('Date_i') <= max_test_date)].index.get_level_values('i')
+
+             yield train_index, test_index
+
+     def get_n_splits(self, X, y, groups=None):
+         """
+         get number of splits
+
+         Parameters
+         ----------
+         X (pd.DataFrame): input data (required for sklearn classes)
+         y (pd.DataFrame): target data (required for sklearn classes)
+         groups: group labels (unused; required for sklearn classes)
+
+         Returns
+         -------
+         number_window (int): number of splits
+         """
+         return self.number_window
+
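
A sketch of consuming the splitter (editor's illustration). As the class requires, the frame must carry a MultiIndex with 'Date_i' and 'i' levels; the loop mirrors the lookup pattern StackRFE uses below.

    # Illustrative only; df, X, y share the ('Date_i', 'i') MultiIndex.
    cv = ExpandingMultipleTimeSeriesKFold(df, window_size=50, number_window=3, embargo=5)
    for train_idx, test_idx in cv.split(X, y):
        X_tr = X[X.index.get_level_values('i').isin(train_idx)]
        X_te = X[X.index.get_level_values('i').isin(test_idx)]
        # fit and evaluate the model on each expanding fold here
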
+ def edge_probas_lines(data, threshold, plot=False, look_back=750):
+     """
+     produce a plotly plot of edges and closing prices
+
+     Parameters:
+         data (pd.DataFrame): asset data with edge probabilities
+         threshold (float): edge threshold
+         plot (boolean): if true, display plot
+         look_back (int): number of rows back to display
+
+     Returns:
+         fig (obj): plotly go object
+     """
+     df = data[['Date', 'Close', 'proba_target_down', 'proba_target_up']].iloc[-look_back:]
+
+     fig = make_subplots(specs=[[{"secondary_y": True}]])
+     fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines+markers', name='Close price'))
+     fig.add_trace(go.Scatter(x=df.Date, y=df.proba_target_down, mode='lines', marker=dict(color='coral'), name='go down'), secondary_y=True)
+     fig.add_trace(go.Scatter(x=df.Date, y=df.proba_target_up, mode='lines', marker=dict(opacity=0.1, size=80), name='go up'), secondary_y=True)
+     fig.add_shape(type="line", xref="paper", yref="y2", x0=0.02, y0=threshold, x1=0.9, y1=threshold, line=dict(color="red", dash="dash"))
+     fig.update_layout(title_text="sirius - edge probabilities", width=1200, height=500)
+     if plot:
+         fig.show()
+     return fig
+
+ def get_rolling_probs(data, window=3, plot=False, look_back=750, rets_eval=7):
+     """
+     produce a plotly plot of smoothed edges and closing prices
+
+     Parameters:
+         data (pd.DataFrame): asset data with edge probabilities
+         window (int): window size
+         plot (boolean): if true, display plot
+         look_back (int): number of rows back to display
+         rets_eval (int): horizon in days for the return shown in the evaluation panel
+
+     Returns:
+         fig (obj): plotly go object
+     """
+     prob_cols = ['proba_target_down', 'proba_target_up']
+     df = data[prob_cols + ['Date', 'log_return', 'Close']].iloc[-look_back:].copy()
+     df["eval_rets"] = (df["Close"] / df["Close"].shift(rets_eval) - 1) * 100
+     for colx in prob_cols:
+         df[f'roll_{colx}'] = df.sort_values('Date')[colx].rolling(window, min_periods=1).mean()
+     df['roll_edge'] = np.where(df['roll_proba_target_up'] > df['roll_proba_target_down'], 'up', 'down')
+     # order chaining: tag each uninterrupted run of the same edge direction
+     df['lag'] = df['roll_edge'].shift(1)
+     df['change'] = np.where(df['roll_edge'] != df['lag'], 1, np.nan)
+     df['rn'] = df.sort_values('Date').groupby('change').cumcount() + 1
+     df['rn'] = np.where(df['change'] == 1, df['rn'], np.nan)
+     df['chain'] = df.sort_values('Date')['rn'].ffill()
+     df['chain_id'] = df.sort_values(['Date']).groupby('chain').cumcount() + 1
+
+     colors = {'up': 'blue', 'down': 'red'}
+     fig = make_subplots(
+         rows=2, cols=2, shared_xaxes=False, vertical_spacing=0.08,
+         specs=[[{"colspan": 2, "secondary_y": True}, None], [{}, {}]],
+         subplot_titles=("Smooth edge probabilities", f"expected return {rets_eval} days", "Duration"))
+     fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines+markers', name='Close price'))
+     fig.add_trace(go.Scatter(x=df.Date, y=df.roll_proba_target_down, mode='lines', marker=dict(color='coral'), name='go down'), secondary_y=True, col=1, row=1)
+     fig.add_trace(go.Scatter(x=df.Date, y=df.roll_proba_target_up, mode='lines', marker=dict(opacity=0.1, size=80), name='go up'), secondary_y=True, col=1, row=1)
+
+     for re in df['roll_edge'].unique():
+         fig.add_trace(go.Box(x=df[df['roll_edge'] == re]["eval_rets"], name=re, marker_color=colors.get(re), showlegend=False), col=1, row=2)
+     fig.add_vline(x=0, line_width=2, line_dash="dash", line_color="grey", col=1, row=2)
+     df_ = df.groupby(['roll_edge', 'chain'], as_index=False).agg(max_duration=('chain_id', 'max'))
+     for re in df_['roll_edge'].unique():
+         fig.add_trace(go.Box(x=df_[df_['roll_edge'] == re]["max_duration"], name=re, marker_color=colors.get(re), showlegend=False), col=2, row=2)
+
+     fig.update_layout(title_text="sirius - smooth edge probabilities", width=1200, height=1000)
+     if plot:
+         fig.show()
+
+     return fig
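
A short usage sketch for the two plotting helpers (editor's addition; 'scored' is a hypothetical frame carrying 'Date', 'Close', 'log_return' and the two probability columns):

    # Illustrative only.
    fig1 = edge_probas_lines(scored, threshold=0.6, look_back=500)
    fig2 = get_rolling_probs(scored, window=5, look_back=500, rets_eval=7)
    fig1.show()  # or fig1.write_html('edge_probas.html')
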
@@ -0,0 +1,66 @@
+ import random
+
+ from numpy.random import choice
+ import numpy as np
+ from scipy import stats
+ from sklearn.feature_selection import RFE
+
+ class StackRFE:
+     def __init__(self, model, n_features, batch_elim, step_elim, cv, max_iterations, manual_pipe=list(), importance_callable="auto"):
+         """
+         n_features: number of features to select in RFE
+         batch_elim: number of candidate features sampled per iteration
+         step_elim: number of features removed at each RFE step
+         cv: cross-validation splitter exposing split(X, y)
+         max_iterations: number of ranking iterations to run
+         manual_pipe: queue of manual feature batches to pass to RFE before sampled ones
+         importance_callable: function to calculate feature importance
+         """
+         self.model = model
+         self.n_features = n_features
+         self.batch_elim = batch_elim
+         self.step_elim = step_elim
+         self.cv = cv
+         self.max_iterations = max_iterations
+         self.manual_pipe = manual_pipe
+         self.importance_callable = importance_callable
+
+     def _suggest_elimination(self, uniform=False):
+         """
+         suggest features based on mean ranking; the lower the mean rank, the higher the probability of being selected
+         """
+         ds = self.feature_rankings
+         ds_mean = {k: np.mean(ds.get(k)) for k in ds}
+         max_ = np.max([x for x in ds_mean.values()])
+         # invert the ranks so that better (lower) ranks get larger sampling weights
+         ds_weight = {k: (max_ - v + 1) for k, v in ds_mean.items()}
+         sum_ = np.sum([x for x in ds_weight.values()])
+         ds_prob = {k: v / sum_ for k, v in ds_weight.items()}
+         result = list(choice(list(ds_prob.keys()), self.batch_elim, p=list(ds_prob.values()), replace=False))
+         if uniform:
+             features = list(ds_prob.keys())
+             random.shuffle(features)
+             result = features[0:self.batch_elim]
+         return result
+
+     def fit(self, X, y):
+         features = list(X.columns)
+         self.feature_rankings = {f: [1] for f in features}
+         for iteration in range(self.max_iterations):
+             # alternate between rank-weighted and uniform sampling of the candidate batch
+             if random.random() > 0.5:
+                 batch_features = self._suggest_elimination()
+             else:
+                 batch_features = self._suggest_elimination(uniform=True)
+
+             # a manually supplied batch takes precedence over the sampled one
+             if len(self.manual_pipe) > 0:
+                 batch_features = self.manual_pipe.pop(0)
+             # selector and elimination
+             tmp_feature_ranking = {k: list() for k in batch_features}
+             selector = RFE(self.model, n_features_to_select=self.n_features, step=self.step_elim, importance_getter=self.importance_callable)
+             for train_index, test_index in self.cv.split(X, y):
+                 X_ = X[X.index.get_level_values('i').isin(train_index)][batch_features]
+                 y_ = y[y.index.get_level_values('i').isin(train_index)]
+                 selector = selector.fit(X_, y_)
+                 for k, r in zip(tmp_feature_ranking.keys(), selector.ranking_):
+                     tmp_feature_ranking[k].append(r)
+             rankings = [np.median(v) for v in tmp_feature_ranking.values()]
+             for f, r in zip(batch_features, rankings):
+                 self.feature_rankings[f].append(r)
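
A hedged sketch of running the stacked RFE and reading out the aggregated rankings (editor's addition; the estimator choice is an assumption):

    # Illustrative only; pairs StackRFE with the expanding splitter defined earlier.
    from sklearn.linear_model import LogisticRegression

    rfe = StackRFE(model=LogisticRegression(max_iter=1000), n_features=10,
                   batch_elim=20, step_elim=2,
                   cv=ExpandingMultipleTimeSeriesKFold(df, window_size=50, number_window=3),
                   max_iterations=30)
    rfe.fit(X, y)
    # a lower mean rank across iterations means a more consistently useful feature
    mean_ranks = {f: np.mean(r) for f, r in rfe.feature_rankings.items()}
    best_features = sorted(mean_ranks, key=mean_ranks.get)[:10]
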
@@ -0,0 +1,54 @@
+ import shap
+ import mlflow
+ import pandas as pd
+ import numpy as np
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+
+ class StackInterpretor(mlflow.pyfunc.PythonModel):
+     def __init__(self, model, targets):
+         self.base_estimators = model.estimators_
+         self.targets = targets
+
+     def fit_interpretor(self, data):
+         # build one SHAP explainer per target-specific base estimator
+         interpretors = {}
+         for label, predictor in zip(self.targets, self.base_estimators):
+             explainer = shap.Explainer(predictor, data)
+             interpretors[label] = explainer
+         self.interpretors = interpretors
+
+     def get_shap_values(self, data):
+         shap_values = dict()
+         for label, interpretor in self.interpretors.items():
+             shap_value = interpretor(data)
+             shap_values[label] = shap_value
+         return shap_values
+
+     def register_map(self, mapping):
+         self.mapping = mapping
+
+ def mean_shap(data, explainers, pipe_transform):
+     # average SHAP values across the per-target explainers
+     t_data = pipe_transform.transform(data)
+     input_features = t_data.columns
+     shap_results = explainers.get_shap_values(t_data)
+     dict_shap_values = explainers.mapping
+     arrays_ = list()
+     for k, _ in shap_results.items():
+         arrays_.append(shap_results.get(k).values)
+     shap_results_mean = np.mean(np.array(arrays_), axis=0)
+     df_shap = pd.DataFrame(shap_results_mean, columns=input_features, index=data.index)
+     df_shap['Close'] = data['Close']
+     df_shap['Date'] = data['Date']
+     df_shap = df_shap[['Date', 'Close'] + list(dict_shap_values.keys())]
+     df_shap = df_shap.rename(columns=dict_shap_values)
+     return df_shap
+
+ def edge_shap_lines(data, plot=False, look_back=750):
+     # correct labels
+     shap_cols = [col for col in data.columns if col not in ['Date', 'Close']]
+     df = data.sort_values('Date').iloc[-look_back:]
+     fig = make_subplots(specs=[[{"secondary_y": True}]])
+     fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines+markers', marker=dict(color='grey'), line=dict(color='grey'), name='Close price'))
+     for col in shap_cols:
+         fig.add_trace(go.Scatter(x=df.Date, y=df[col], mode='lines+markers', name=col), secondary_y=True)
+     fig.update_layout(title_text="sirius - feature power", width=1200, height=500)
+     if plot:
+         fig.show()
+     return fig
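
A hedged end-to-end sketch of the interpretation helpers (editor's addition). It assumes a fitted multi-output model exposing `estimators_` (e.g. sklearn's MultiOutputClassifier) and a fitted preprocessing pipeline; the mapping keys are hypothetical column names.

    # Illustrative only; names below are assumptions, not package API guarantees.
    interp = StackInterpretor(fitted_multioutput_model, targets=['down', 'up'])
    interp.fit_interpretor(pipe_transform.transform(X_train))  # background data for SHAP
    interp.register_map({'raw_feature': 'Readable label'})     # rename map for the plot
    df_shap = mean_shap(X_val, interp, pipe_transform)
    fig = edge_shap_lines(df_shap, look_back=500)
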