virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virgo_modules/__init__.py +1 -0
- virgo_modules/src/aws_utils.py +35 -3
- virgo_modules/src/backtester.py +474 -0
- virgo_modules/src/edge_utils/__init__.py +0 -0
- virgo_modules/src/edge_utils/conformal_utils.py +106 -0
- virgo_modules/src/edge_utils/edge_utils.py +502 -0
- virgo_modules/src/edge_utils/feature_selection.py +66 -0
- virgo_modules/src/edge_utils/shap_utils.py +54 -0
- virgo_modules/src/edge_utils/stack_model.py +94 -0
- virgo_modules/src/hmm_utils.py +494 -0
- virgo_modules/src/market/__init__.py +0 -0
- virgo_modules/src/market/market_tools.py +189 -0
- virgo_modules/src/markowitz/__init__.py +0 -0
- virgo_modules/src/markowitz/markowitz_utils.py +44 -0
- virgo_modules/src/re_utils.py +628 -85
- virgo_modules/src/ticketer_source.py +1351 -1066
- virgo_modules/src/transformer_utils.py +401 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/METADATA +16 -22
- virgo_modules-0.9.0.dist-info/RECORD +24 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/WHEEL +1 -1
- virgo_modules/src/edge_utils.py +0 -178
- virgo_modules-0.0.72.dist-info/RECORD +0 -12
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info/licenses}/LICENSE +0 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/top_level.txt +0 -0
virgo_modules/src/edge_utils/edge_utils.py
@@ -0,0 +1,502 @@

import numpy as np
import itertools
import random
import math

from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.datetime import DatetimeFeatures

from ..transformer_utils import (
    VirgoWinsorizerFeature,
    InverseHyperbolicSine,
    FeaturesEntropy,
    FeatureSelector,
    InteractionFeatures,
    SplineMarketReturnJumpWaves
)

from plotly.subplots import make_subplots
import plotly.graph_objects as go

class produce_model_wrapper:
    """
    Class that wraps a preprocessing pipeline and a machine learning model; it also handles the train/validation data split.

    Attributes
    ----------
    data : pd.DataFrame
        input data
    X_train : pd.DataFrame
    y_train : pd.DataFrame
    X_val : pd.DataFrame
    y_val : pd.DataFrame
    pipeline : obj
        sklearn Pipeline combining the preprocessing pipeline and the model

    Methods
    -------
    preprocess(validation_size=int, target=list):
        ingest the data and split it into train/validation sets and X/y frames
    train_model(pipe=obj, model=obj, cv_=boolean):
        merge and train the pipeline and the machine learning model
    """
    def __init__(self, data):
        """
        Initialize object

        Parameters
        ----------
        data (pd.DataFrame): data

        Returns
        -------
        None
        """
        self.data = data.copy()

    def preprocess(self, validation_size, target):
        """
        ingest the data and split it into train/validation sets and X/y frames

        Parameters
        ----------
        validation_size (int): validation data size; the remainder is taken as training data
        target (list): target column list

        Returns
        -------
        None
        """
        # first date of the validation window: the last `validation_size` distinct dates are held out
        val_date = self.data.groupby('Date', as_index=False).agg(target_down=(target[0], 'count')).sort_values('Date').iloc[-validation_size:,].head(1)['Date'].values[0]

        train_data = self.data[self.data['Date'] < val_date].dropna()
        val_data = self.data[self.data['Date'] >= val_date].dropna()

        columns = [x for x in train_data.columns if x not in target]
        X_train, y_train = train_data[columns], train_data[target]
        X_val, y_val = val_data[columns], val_data[target]
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val

    def train_model(self, pipe, model, cv_=False):
        """
        merge and train the pipeline and the machine learning model

        Parameters
        ----------
        pipe (obj): sklearn pipeline object
        model (obj): model

        Returns
        -------
        None
        """
        self.model = model
        self.pipe_transform = pipe
        self.pipeline = Pipeline([('pipe_transform', self.pipe_transform), ('model', self.model)])
        self.pipeline.fit(self.X_train, self.y_train)
        self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
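
A minimal usage sketch of the wrapper (editor's illustration; the column names, validation size and classifier are assumptions, not part of the package):

    from sklearn.ensemble import RandomForestClassifier

    wrapper = produce_model_wrapper(df)                      # df: pd.DataFrame with a 'Date' column
    wrapper.preprocess(validation_size=60, target=['target_down'])
    pipe = data_processing_pipeline_classifier(features_base=['rsi', 'macd'])  # hypothetical features
    wrapper.train_model(pipe=pipe, model=RandomForestClassifier())
    print(wrapper.features_to_model)                         # columns the fitted model sees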
class register_results():
    """
    Class that collects model metrics.

    Attributes
    ----------
    model_name : str
        model name
    metric_logger : dict
        dictionary that collects model metrics

    Methods
    -------
    eval_metrics(pipeline=obj, X=pd.DataFrame, y=pd.DataFrame, type_data=str, phase=str):
        register model metrics
    print_metric_logger():
        print logger results
    """
    def __init__(self, model_name):
        """
        Initialize object

        Parameters
        ----------
        model_name (str): model name

        Returns
        -------
        None
        """
        self.model_name = model_name
        self.metric_logger = dict()

    def eval_metrics(self, pipeline, X, y, type_data, phase):
        """
        register model metrics

        Parameters
        ----------
        pipeline (obj): model pipeline
        X (pd.DataFrame): input data
        y (pd.DataFrame): target data
        type_data (str): data type, either train, test or validation
        phase (str): model phase, either baseline, feature selection or tuned model

        Returns
        -------
        None
        """
        preds_proba = pipeline.predict_proba(X)
        preds = pipeline.predict(X)

        # multi-output classifiers return one probability array per target
        if type(preds_proba) == list:
            preds_proba = np.array([x[:, 1] for x in preds_proba]).T

        roc = roc_auc_score(y, preds_proba, average=None)
        precision = precision_score(y, preds, average=None)
        recall = recall_score(y, preds, average=None)

        self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc': roc, 'precision': precision, 'recall': recall}

    def print_metric_logger(self):
        """
        print logger results

        Parameters
        ----------
        None

        Returns
        -------
        None
        """
        parts = list(self.metric_logger.keys())
        phase_parts = [x.split('//')[0] for x in parts]

        init_phase = phase_parts[0]
        print(f'---{init_phase}--')
        for phase, val in zip(phase_parts, self.metric_logger):
            stage = val.split('//')[2]
            if init_phase != phase:
                print(f'---{phase}--')
                init_phase = phase
            for metric in self.metric_logger[val]:
                print(stage, metric, self.metric_logger[val][metric])
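
A short sketch pairing the logger with the wrapper above (phase and data-type labels are illustrative):

    logger = register_results('baseline_rf')                 # hypothetical model name
    logger.eval_metrics(wrapper.pipeline, wrapper.X_train, wrapper.y_train, 'train', 'baseline')
    logger.eval_metrics(wrapper.pipeline, wrapper.X_val, wrapper.y_val, 'validation', 'baseline')
    logger.print_metric_logger()                             # one printed block per phase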
def eval_metrics(pipeline, X, y, type_data, model_name):
    '''
    print metrics from a model pipeline

    Parameters:
        pipeline (obj): model pipeline
        X (pd.DataFrame): input data
        y (pd.DataFrame): target data
        type_data (str): data type, either train, test or validation
        model_name (str): model name

    Returns:
        None
    '''
    preds_proba = pipeline.predict_proba(X)
    preds = pipeline.predict(X)

    # multi-output classifiers return one probability array per target
    if type(preds_proba) == list:
        preds_proba = np.array([x[:, 1] for x in preds_proba]).T

    print(f'--{type_data} - {model_name}--')
    print('--target: down, up--')
    print('--roc-auc--')
    print(roc_auc_score(y, preds_proba, average=None))
    print('--precision--')
    print(precision_score(y, preds, average=None))
    print('--recall--')
    print(recall_score(y, preds, average=None))
def data_processing_pipeline_classifier(
        features_base, features_to_drop=False, winsorizer_conf=False, discretize_columns=False,
        bins_discretize=10, correlation=0.85, fillna=True,
        invhypervolsin_features=False,
        date_features_list=False,
        entropy_set_list=False,
        interaction_features_cont=False,
        spline_regression_config=False,
        pipeline_order='selector//winzorizer//discretizer//median_inputer//drop//correlation'
        ):
    '''
    pipeline builder

    Parameters:
        features_base (list): base feature list to select
        features_to_drop (list): features to drop
        winsorizer_conf (dict): winsorising configuration dictionary
        discretize_columns (list): feature list to discretize
        bins_discretize (int): number of bins to discretize
        correlation (float): correlation threshold to discard correlated features
        fillna (boolean): if true, fill missing feature values
        invhypervolsin_features (list): list of features to apply inverse hyperbolic sine
        date_features_list (list): list of features to compute from the Date field (feature names from feature_engine)
        entropy_set_list (list): list of dictionaries that contain features and targets to compute entropy
        interaction_features_cont (tuple): tuple of lists of interaction features
        spline_regression_config (dict): configuration for the spline market-return jump-wave features
        pipeline_order (str): custom pipeline order, e.g. selector//winzorizer//discretizer//median_inputer//drop//correlation

    Returns:
        pipe (obj): pipeline object
    '''
    # each step resolves to an empty list when its configuration is falsy
    select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
    winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
    drop_pipe = [('drop_features', DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
    discretize = [('discretize', EqualWidthDiscretiser(discretize_columns, bins=bins_discretize))] if discretize_columns else []
    drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation, method='spearman'))] if correlation else []
    median_imputer_pipe = [('median_imputer', MeanMedianImputer())] if fillna else []
    invhypersin_pipe = [('invhypervolsin scaler', InverseHyperbolicSine(features=invhypervolsin_features))] if invhypervolsin_features else []
    datetimeFeatures_pipe = [('date features', DatetimeFeatures(features_to_extract=date_features_list, variables='Date', drop_original=False))] if date_features_list else []
    interaction_features = [("interaction features", InteractionFeatures(interaction_features_cont[0], interaction_features_cont[1]))] if interaction_features_cont else []
    spline_features = [("spline features", SplineMarketReturnJumpWaves(
        return_feature_names=spline_regression_config.get("return_feature_names"),
        target_variables=spline_regression_config.get("target_variables"),
        feature_label=spline_regression_config.get("feature_label"),
    ))] if spline_regression_config else []

    entropy_pipe = list()
    if entropy_set_list:
        for setx_ in entropy_set_list:
            setx = setx_['set'].split('//')
            target_ = setx_['target']
            subpipe_name = '_'.join(setx) + 'entropy'
            entropy_pipe.append((subpipe_name, FeaturesEntropy(features=setx, target=target_)))

    pipe_dictionary = {
        'selector': select_pipe,
        'winzorizer': winzorizer_pipe,
        'drop': drop_pipe,
        'discretizer': discretize,
        'correlation': drop_corr,
        'median_inputer': median_imputer_pipe,
        'arcsinh_scaler': invhypersin_pipe,
        'date_features': datetimeFeatures_pipe,
        'interaction_features': interaction_features,
        'entropy_features': entropy_pipe,
        "spline_features": spline_features,
    }

    pipeline_steps = pipeline_order.split('//')
    ## validation
    for step in pipeline_steps:
        if step not in pipe_dictionary.keys():
            raise Exception(f'{step} step not in list of steps, the list is: {list(pipe_dictionary.keys())}')

    pipeline_args = [pipe_dictionary[step] for step in pipeline_steps]
    pipeline_args = list(itertools.chain.from_iterable(pipeline_args))
    pipe = Pipeline(pipeline_args)

    return pipe
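
A minimal sketch of the builder (feature names are made up; steps whose configuration is left as False are skipped):

    pipe = data_processing_pipeline_classifier(
        features_base=['rsi', 'macd', 'Date'],               # hypothetical feature names
        discretize_columns=['rsi'],
        bins_discretize=10,
        correlation=0.9,
        pipeline_order='selector//discretizer//median_inputer//correlation',
    )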
class ExpandingMultipleTimeSeriesKFold:
    """
    Class that creates a custom CV schema compatible with sklearn cv arguments.

    Attributes
    ----------
    df : pd.DataFrame
        dataset
    number_window : int
        number of train splits
    window_size : int
        window size data
    overlap_size : int
        overlap size

    Methods
    -------
    split(X=pd.DataFrame, y=pd.DataFrame, groups=boolean):
        custom split procedure
    get_n_splits(X=pd.DataFrame, y=pd.DataFrame, groups=boolean):
        get number of splits
    """

    def __init__(self, df, window_size=100, number_window=3, overlap_size=0, sample_parts=None, embargo=0):
        """
        Initialize object

        Parameters
        ----------
        df (pd.DataFrame): dataset
        number_window (int): number of train splits
        window_size (int): window size data
        overlap_size (int): overlap size
        sample_parts (tuple(float, str)): fraction of partition units to remove from the train set and the index level holding the partition
        embargo (int): number of trailing dates dropped from the training window

        Returns
        -------
        None
        """
        self.df = df
        self.number_window = number_window
        self.window_size = window_size
        self.overlap_size = overlap_size
        self.sample_parts = sample_parts
        self.embargo = embargo

    def split(self, X, y, groups=None):
        """
        custom split procedure

        Parameters
        ----------
        X (pd.DataFrame): input data (required for sklearn classes)
        y (pd.DataFrame): target data (required for sklearn classes)
        groups (boolean): to apply groups (required for sklearn classes)

        Yields
        ------
        train_index, test_index : pd.Index
            positions (index level 'i') of the train and test rows for each fold
        """
        if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
            raise Exception('no date and/or index in the index dataframe')

        if self.overlap_size > self.window_size:
            raise Exception('overlap can not be higher than the window size')

        unique_dates = list(self.df.index.get_level_values('Date_i').unique())
        unique_dates.sort()

        total_test_size = self.window_size * self.number_window
        total_test_size = total_test_size - (self.number_window - 1)*self.overlap_size

        if total_test_size > len(unique_dates):
            raise Exception('test size is higher than the data length')

        cut = total_test_size
        for fold in range(self.number_window):

            # expanding window: each fold trains on everything before the test window, minus the embargo tail
            topcut = cut - self.window_size
            train_dates = unique_dates[:-(cut + self.embargo)]
            test_dates = unique_dates[-cut:-topcut]

            if topcut == 0:
                test_dates = unique_dates[-cut:]

            max_train_date = max(train_dates)
            min_test_date, max_test_date = min(test_dates), max(test_dates)

            cut = cut - (self.window_size - self.overlap_size)

            if self.sample_parts:
                sample_part = self.sample_parts[0]
                part_col = self.sample_parts[1]
                unique_parts = list(self.df.index.get_level_values(part_col).unique())
                random.shuffle(unique_parts)
                n_select = math.ceil(len(unique_parts)*sample_part)
                to_drop = unique_parts[0:n_select]
                train_index = self.df[
                    (self.df.index.get_level_values('Date_i') <= max_train_date)
                    &
                    (~self.df.index.get_level_values(part_col).isin(to_drop))].index.get_level_values('i')
            else:
                train_index = self.df[self.df.index.get_level_values('Date_i') <= max_train_date].index.get_level_values('i')
            test_index = self.df[(self.df.index.get_level_values('Date_i') >= min_test_date) & (self.df.index.get_level_values('Date_i') <= max_test_date)].index.get_level_values('i')

            yield train_index, test_index

    def get_n_splits(self, X, y, groups=None):
        """
        get number of splits

        Parameters
        ----------
        X (pd.DataFrame): input data (required for sklearn classes)
        y (pd.DataFrame): target data (required for sklearn classes)
        groups (boolean): to apply groups (required for sklearn classes)

        Returns
        -------
        number_window (int): number of splits
        """
        return self.number_window
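
A sketch of the CV schema (split() requires a MultiIndex with 'Date_i' and 'i' levels; the 'ticker' partition level is a made-up example):

    cv = ExpandingMultipleTimeSeriesKFold(df, window_size=60, number_window=4,
                                          sample_parts=(0.2, 'ticker'), embargo=5)
    for train_idx, test_idx in cv.split(X, y):               # one expanding fold per window
        print(len(train_idx), len(test_idx))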
def edge_probas_lines(data, threshold, plot=False, look_back=750):
    """
    produce a plotly plot of edges and closing prices

    Parameters:
        data (pd.DataFrame): asset data with edge probabilities
        threshold (float): edge threshold
        plot (boolean): if true, display plot
        look_back (int): number of rows back to display

    Returns:
        fig (obj): plotly go object
    """
    df = data[['Date', 'Close', 'proba_target_down', 'proba_target_up']].iloc[-look_back:]

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines+markers', name='Close price'))
    fig.add_trace(go.Scatter(x=df.Date, y=df.proba_target_down, mode='lines', marker=dict(color='coral'), name='go down'), secondary_y=True)
    fig.add_trace(go.Scatter(x=df.Date, y=df.proba_target_up, mode='lines', marker=dict(opacity=0.1, size=80), name='go up'), secondary_y=True)
    fig.add_shape(type="line", xref="paper", yref="y2", x0=0.02, y0=threshold, x1=0.9, y1=threshold, line=dict(color="red", dash="dash"),)
    fig.update_layout(title_text="sirius - edge probabilities", width=1200, height=500)
    if plot:
        fig.show()
    return fig
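
A sketch of the call (the frame must already carry Date, Close and the proba_target_* columns):

    fig = edge_probas_lines(asset_df, threshold=0.6, plot=False, look_back=500)
    fig.write_html('edge_probas.html')                       # hypothetical output path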
def get_rolling_probs(data, window=3, plot=False, look_back=750, rets_eval=7):
    """
    produce a plotly plot of smoothed edges and closing prices

    Parameters:
        data (pd.DataFrame): asset data with edge probabilities
        window (int): window size
        plot (boolean): if true, display plot
        look_back (int): number of rows back to display
        rets_eval (int): horizon in days for the expected-return evaluation

    Returns:
        fig (obj): plotly go object
    """
    prob_cols = ['proba_target_down', 'proba_target_up']
    df = data[prob_cols + ['Date', 'log_return', 'Close']].iloc[-look_back:]
    df["eval_rets"] = (df["Close"]/df["Close"].shift(rets_eval) - 1)*100
    for colx in prob_cols:
        df[f'roll_{colx}'] = df.sort_values('Date')[colx].rolling(window, min_periods=1).mean()
    df['roll_edge'] = np.where(df['roll_proba_target_up'] > df['roll_proba_target_down'], 'up', 'down')
    # order chaining: tag each run of consecutive identical edges with a chain id
    df['lag'] = df['roll_edge'].shift(1)
    df['change'] = np.where(df['roll_edge'] != df['lag'], 1, np.nan)
    df['rn'] = df.sort_values('Date').groupby('change').cumcount() + 1
    df['rn'] = np.where(df['change'] == 1, df['rn'], np.nan)
    df['chain'] = df.sort_values('Date')['rn'].ffill()
    df['chain_id'] = df.sort_values(['Date']).groupby('chain').cumcount() + 1

    colors = {'up': 'blue', 'down': 'red'}
    fig = make_subplots(
        rows=2, cols=2, shared_xaxes=False, vertical_spacing=0.08,
        specs=[[{"colspan": 2, "secondary_y": True}, None], [{}, {}]],
        subplot_titles=("Smooth edge probabilities", f"expected return {rets_eval} days", "Duration"))
    fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines+markers', name='Close price'))
    fig.add_trace(go.Scatter(x=df.Date, y=df.roll_proba_target_down, mode='lines', marker=dict(color='coral'), name='go down'), secondary_y=True, col=1, row=1)
    fig.add_trace(go.Scatter(x=df.Date, y=df.roll_proba_target_up, mode='lines', marker=dict(opacity=0.1, size=80), name='go up'), secondary_y=True, col=1, row=1)

    for re in df['roll_edge'].unique():
        fig.add_trace(go.Box(x=df[df['roll_edge'] == re]["eval_rets"], name=re, marker_color=colors.get(re), showlegend=False), col=1, row=2)
    fig.add_vline(x=0, line_width=2, line_dash="dash", line_color="grey", col=1, row=2)
    df_ = df.groupby(['roll_edge', 'chain'], as_index=False).agg(max_duration=('chain_id', 'max'))
    for re in df_['roll_edge'].unique():
        fig.add_trace(go.Box(x=df_[df_['roll_edge'] == re]["max_duration"], name=re, marker_color=colors.get(re), showlegend=False), col=2, row=2)

    fig.update_layout(title_text="sirius - smooth edge probabilities", width=1200, height=1000)
    if plot:
        fig.show()

    return fig
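
The smoothed variant additionally expects a log_return column; a sketch:

    fig = get_rolling_probs(asset_df, window=5, look_back=500, rets_eval=7)
    fig.show()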
virgo_modules/src/edge_utils/feature_selection.py
@@ -0,0 +1,66 @@

import random

from numpy.random import choice
import numpy as np
from scipy import stats
from sklearn.feature_selection import RFE

class StackRFE:
    def __init__(self, model, n_features, batch_elim, step_elim, cv, max_iterations, manual_pipe=list(), importance_callable="auto"):
        """
        n_features: number of features to select in RFE
        batch_elim: number of candidate features to suggest per batch
        step_elim: number of features eliminated per RFE step
        manual_pipe: list of feature batches to pass to RFE before sampled suggestions
        importance_callable: function to calculate feature importance
        """
        self.model = model
        self.n_features = n_features
        self.batch_elim = batch_elim
        self.step_elim = step_elim
        self.cv = cv
        self.max_iterations = max_iterations
        self.manual_pipe = manual_pipe
        self.importance_callable = importance_callable

    def _suggest_elimination(self, uniform=False):
        """
        suggest features based on mean ranking; the lower the mean rank, the higher the probability of being selected
        """
        ds = self.feature_rankings
        ds_mean = {k: np.mean(ds.get(k)) for k in ds}
        max_ = np.max([x for x in ds_mean.values()])
        ds_weight = {k: (max_ - v + 1) for k, v in ds_mean.items()}
        sum_ = np.sum([x for x in ds_weight.values()])
        ds_prob = {k: v/sum_ for k, v in ds_weight.items()}
        result = list(choice(list(ds_prob.keys()), self.batch_elim, p=list(ds_prob.values()), replace=False))
        if uniform:
            features = list(ds_prob.keys())
            random.shuffle(features)
            result = features[0:self.batch_elim]
        return result

    def fit(self, X, y):
        features = list(X.columns).copy()
        self.feature_rankings = {f: [1] for f in features}
        for iteration in range(self.max_iterations):
            # shuffling: alternate between rank-weighted and uniform candidate sampling
            if random.random() > 0.5:
                batch_features = self._suggest_elimination()
            else:
                batch_features = self._suggest_elimination(uniform=True)

            if len(self.manual_pipe) > 0:
                batch_features = self.manual_pipe.pop(0)
            # selector and elimination
            tmp_feature_ranking = {k: list() for k in batch_features}
            selector = RFE(self.model, n_features_to_select=self.n_features, step=self.step_elim, importance_getter=self.importance_callable)
            for train_index, test_index in self.cv.split(X, y):
                X_ = X[X.index.get_level_values('i').isin(train_index)][batch_features]
                y_ = y[y.index.get_level_values('i').isin(train_index)]
                selector = selector.fit(X_, y_)
                for k, r in zip(tmp_feature_ranking.keys(), selector.ranking_):
                    tmp_feature_ranking[k].append(r)
            rankings = [np.median(v) for v in tmp_feature_ranking.values()]
            for f, r in zip(batch_features, rankings):
                self.feature_rankings[f].append(r)
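
A sketch of the stacked RFE search (editor's illustration; it assumes the expanding-window CV from edge_utils.py above, a frame indexed with an 'i' level, and a stand-in estimator):

    from sklearn.linear_model import LogisticRegression

    stack_rfe = StackRFE(LogisticRegression(), n_features=10, batch_elim=20,
                         step_elim=2, cv=cv, max_iterations=15)
    stack_rfe.fit(X, y)
    # lower mean rank means the feature survived RFE longer across folds and iterations
    mean_ranks = {f: np.mean(r) for f, r in stack_rfe.feature_rankings.items()}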
virgo_modules/src/edge_utils/shap_utils.py
@@ -0,0 +1,54 @@

import shap
import mlflow
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

class StackInterpretor(mlflow.pyfunc.PythonModel):
    def __init__(self, model, targets):
        self.base_estimators = model.estimators_
        self.targets = targets

    def fit_interpretor(self, data):
        interpretors = {}
        for label, predictor in zip(self.targets, self.base_estimators):
            explainer = shap.Explainer(predictor, data)
            interpretors[label] = explainer
        self.interpretors = interpretors

    def get_shap_values(self, data):
        shap_values = dict()
        for label, interpretor in self.interpretors.items():
            shap_value = interpretor(data)
            shap_values[label] = shap_value
        return shap_values

    def register_map(self, mapping):
        self.mapping = mapping

def mean_shap(data, explainers, pipe_transform):
    t_data = pipe_transform.transform(data)
    input_features = t_data.columns
    shap_results = explainers.get_shap_values(t_data)
    dict_shap_values = explainers.mapping
    arrays_ = list()
    for k, _ in shap_results.items():
        arrays_.append(shap_results.get(k).values)
    # average SHAP values across the per-target explainers
    shap_results_mean = np.mean(np.array(arrays_), axis=0)
    df_shap = pd.DataFrame(shap_results_mean, columns=input_features, index=data.index)
    df_shap['Close'] = data['Close']
    df_shap['Date'] = data['Date']
    df_shap = df_shap[['Date', 'Close'] + list(dict_shap_values.keys())]
    df_shap = df_shap.rename(columns=dict_shap_values)
    return df_shap

def edge_shap_lines(data, plot=False, look_back=750):
    ### correct labels ####
    shap_cols = [col for col in data.columns if col not in ['Date', 'Close']]
    df = data.sort_values('Date').iloc[-look_back:]
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines+markers', marker=dict(color='grey'), line=dict(color='grey'), name='Close price'))
    for col in shap_cols:
        fig.add_trace(go.Scatter(x=df.Date, y=df[col], mode='lines+markers', name=col), secondary_y=True)
    fig.update_layout(title_text="sirius - feature power", width=1200, height=500)
    if plot:
        fig.show()
    return fig