virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virgo_modules/__init__.py +1 -0
- virgo_modules/src/aws_utils.py +35 -3
- virgo_modules/src/backtester.py +474 -0
- virgo_modules/src/edge_utils/__init__.py +0 -0
- virgo_modules/src/edge_utils/conformal_utils.py +106 -0
- virgo_modules/src/edge_utils/edge_utils.py +502 -0
- virgo_modules/src/edge_utils/feature_selection.py +66 -0
- virgo_modules/src/edge_utils/shap_utils.py +54 -0
- virgo_modules/src/edge_utils/stack_model.py +94 -0
- virgo_modules/src/hmm_utils.py +494 -0
- virgo_modules/src/market/__init__.py +0 -0
- virgo_modules/src/market/market_tools.py +189 -0
- virgo_modules/src/markowitz/__init__.py +0 -0
- virgo_modules/src/markowitz/markowitz_utils.py +44 -0
- virgo_modules/src/re_utils.py +628 -85
- virgo_modules/src/ticketer_source.py +1351 -1066
- virgo_modules/src/transformer_utils.py +401 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/METADATA +16 -22
- virgo_modules-0.9.0.dist-info/RECORD +24 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/WHEEL +1 -1
- virgo_modules/src/edge_utils.py +0 -178
- virgo_modules-0.0.72.dist-info/RECORD +0 -12
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info/licenses}/LICENSE +0 -0
- {virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/top_level.txt +0 -0

virgo_modules/src/transformer_utils.py ADDED

@@ -0,0 +1,401 @@
+import gc
+
+from sklearn.base import BaseEstimator, TransformerMixin
+import pandas as pd
+import numpy as np
+import statsmodels.api as sm
+from patsy import dmatrix
+import matplotlib.pyplot as plt
+
+class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies the inverse hyperbolic sine as a feature transformation.
+    This class is compatible with the scikit-learn pipeline.
+
+    Attributes
+    ----------
+    features : list
+        list of features to apply the transformation to
+    prefix : str
+        prefix for the new features. If '', the features are overwritten
+
+    Methods
+    -------
+    fit(X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, features, prefix = ''):
+        self.features = features
+        self.prefix = prefix
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        for feature in self.features:
+            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
+        return X
+
+class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies winsorization of features as a feature transformation.
+    This class is compatible with the scikit-learn pipeline.
+
+    Attributes
+    ----------
+    feature_configs : dict
+        dictionary of features and configurations; each configuration has high and low limits per feature
+
+    Methods
+    -------
+    fit(X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, feature_configs):
+        self.feature_configs = feature_configs
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        for feature in self.feature_configs:
+            lower = self.feature_configs[feature]['min']
+            upper = self.feature_configs[feature]['max']
+            X[feature] = np.where(lower > X[feature], lower, X[feature])
+            X[feature] = np.where(upper < X[feature], upper, X[feature])
+        return X
+
+class FeatureSelector(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies selection of features.
+    This class is compatible with the scikit-learn pipeline.
+
+    Attributes
+    ----------
+    columns : list
+        list of features to select
+
+    Methods
+    -------
+    fit(X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, columns):
+        self.columns = columns
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        return X[self.columns]
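
The three transformers above are standard scikit-learn steps, so they chain directly in a Pipeline. A minimal sketch of how they compose; the frame, column names, and limits here are illustrative, not from the package:

    import pandas as pd
    from sklearn.pipeline import Pipeline
    from virgo_modules.src.transformer_utils import (
        FeatureSelector, InverseHyperbolicSine, VirgoWinsorizerFeature
    )

    # toy frame; 'volume', 'returns', and 'noise' are hypothetical columns
    df = pd.DataFrame({
        'volume': [1e3, 5e6, 2e4, 8e5],
        'returns': [0.01, -0.35, 0.07, 0.90],
        'noise': [1, 2, 3, 4],
    })

    pipe = Pipeline([
        # keep only the model-facing columns
        ('selector', FeatureSelector(['volume', 'returns'])),
        # arcsinh compresses heavy tails like a log but is defined at 0 and below
        ('arcsinh', InverseHyperbolicSine(features=['volume'], prefix='ihs_')),
        # clip returns to fixed limits
        ('winsorizer', VirgoWinsorizerFeature({'returns': {'min': -0.2, 'max': 0.2}})),
    ])

    out = pipe.fit_transform(df)
    # 'noise' is gone, 'ihs_volume' is added, 'returns' is clipped to [-0.2, 0.2]
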
+
+class FeaturesEntropy(BaseEstimator, TransformerMixin):
+    """
+    Class that creates a feature holding the entropy of the target within each class of the given features; note that it can leak some training-set information.
+    This class is compatible with the scikit-learn pipeline.
+
+    Attributes
+    ----------
+    features : list
+        list of features whose signal columns define the classes
+    entropy_map : pd.DataFrame
+        dataframe of the map with the entropies per class
+    perc : float
+        percentage of the dates used to calculate the entropy map
+
+    Methods
+    -------
+    fit(X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, features, target, feature_name = None, feature_type = 'discrete', perc = 0.5, default_null = 0.99):
+
+        self.features = features
+        self.feature_type = feature_type
+        self.target = target
+        self.perc = perc
+        self.default_null = default_null
+
+        if not feature_name:
+            self.feature_name = '_'.join(features)
+            self.feature_name = self.feature_name + '_' + target + '_' + feature_type
+        else:
+            self.feature_name = feature_name
+
+    def fit(self, X, y=None):
+
+        unique_dates = list(X['Date'].unique())
+        unique_dates.sort()
+
+        total_length = len(unique_dates)
+        cut = int(round(total_length*self.perc,0))
+        train_dates = unique_dates[:cut]
+        max_train_date = max(train_dates)
+
+        X_ = X[X['Date'] <= max_train_date].copy()
+        df = X_.join(y, how = 'left')
+
+        column_list = [f'{self.feature_type}_signal_{colx}' for colx in self.features]
+
+        df_aggr = (
+            df
+            .groupby(column_list, as_index = False)
+            .apply(
+                lambda x: pd.Series(
+                    dict(
+                        counts = x[self.target].count(),
+                        trues=(x[self.target] == 1).sum(),
+                        falses=(x[self.target] == 0).sum(),
+                    )
+                )
+            )
+            .assign(
+                trues_rate=lambda x: x['trues'] / x['counts']
+            )
+            .assign(
+                falses_rate=lambda x: x['falses'] / x['counts']
+            )
+            .assign(
+                log2_trues = lambda x: np.log2(1/x['trues_rate'])
+            )
+            .assign(
+                log2_falses = lambda x: np.log2(1/x['falses_rate'])
+            )
+            .assign(
+                comp1 = lambda x: x['trues_rate']*x['log2_trues']
+            )
+            .assign(
+                comp2 = lambda x: x['falses_rate']*x['log2_falses']
+            )
+            .assign(
+                class_entropy = lambda x: np.round(x['comp1']+x['comp2'],3)
+            )
+        )
+
+        self.column_list = column_list
+        self.entropy_map = (
+            df_aggr
+            [column_list+['class_entropy']]
+            .rename(columns = {'class_entropy': self.feature_name})
+            .copy()
+        )
+
+        del df, df_aggr, X_
+        return self
+
+    def transform(self, X, y=None):
+
+        X = X.join(self.entropy_map.set_index(self.column_list), on=self.column_list, how = 'left')
+        X[self.feature_name] = X[self.feature_name].fillna(self.default_null)
+        return X
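
The chained assigns in fit implement the binary Shannon entropy H = p*log2(1/p) + (1-p)*log2(1/(1-p)) of the target inside each signal-class group; transform then joins that per-group value back on as a feature. The same arithmetic on a single toy group (illustrative numbers):

    import numpy as np

    trues, falses = 60, 40                      # target outcomes observed in one group
    counts = trues + falses
    trues_rate, falses_rate = trues / counts, falses / counts

    class_entropy = (trues_rate * np.log2(1 / trues_rate)
                     + falses_rate * np.log2(1 / falses_rate))
    print(round(class_entropy, 3))              # 0.971 bits; a 50/50 group gives 1.0

One consequence worth noting: a pure group (all trues or all falses) hits log2(1/0) and produces NaN in the entropy map, so transform's fillna replaces it with default_null (0.99) rather than the mathematically correct 0.
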
+
+class signal_combiner(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies feature combination of binary signals.
+    This class is compatible with the scikit-learn pipeline.
+
+    ...
+
+    Attributes
+    ----------
+    columns : list
+        list of base feature names to combine
+    drop : boolean
+        drop the combined source features
+    prefix_up : str
+        up prefix of the base feature
+    prefix_low : str
+        low prefix of the base feature
+
+    Methods
+    -------
+    fit(X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
+        self.columns = columns
+        self.drop = drop
+        self.prefix_up = prefix_up
+        self.prefix_low = prefix_low
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        for column in self.columns:
+            X['CombSignal_'+column] = np.where(
+                X[self.prefix_up + column] == 1,
+                1,
+                np.where(
+                    X[self.prefix_low + column] == 1,
+                    1,
+                    0
+                )
+            )
+            if self.drop:
+                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
+        return X
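
CombSignal_<col> is simply the logical OR of the paired binary columns. A small sketch with hypothetical signal columns:

    import pandas as pd
    from virgo_modules.src.transformer_utils import signal_combiner

    df = pd.DataFrame({
        'signal_up_rsi':  [1, 0, 0],
        'signal_low_rsi': [0, 1, 0],
    })
    out = signal_combiner(['rsi']).fit_transform(df)
    # out has a single column CombSignal_rsi == [1, 1, 0]; the source pair was dropped
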
+
+class InteractionFeatures(BaseEstimator, TransformerMixin):
+
+    """
+    Class that applies feature interaction.
+    This class is compatible with the scikit-learn pipeline.
+
+    Attributes
+    ----------
+    feature_list1 : list
+        first list of features to combine
+    feature_list2 : list
+        second list of features to combine
+
+    Methods
+    -------
+    fit(X=DataFrame, y=None):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, feature_list1, feature_list2):
+        self.feature_list1 = feature_list1
+        self.feature_list2 = feature_list2
+
+    def fit(self, X, y=None):
+        return self
+
+    def simple_div_interaction(self, data, feature1, feature2, feature_name):
+        # despite the name, the interaction term is the product of the two features
+        data[feature_name] = data[feature1]*data[feature2]
+        data[feature_name] = data[feature_name].replace([np.inf, -np.inf], 0)
+        data[feature_name] = data[feature_name].fillna(0)
+        return data
+
+    def transform(self, X, y=None):
+        for f1 in self.feature_list1:
+            for f2 in self.feature_list2:
+                fn = 'iterm_'+f1.replace("norm_","")+"_"+f2.replace("norm_","")
+                X = self.simple_div_interaction(X, f1, f2, fn)
+        return X
+
+
+class SplineMarketReturnJumpWaves(BaseEstimator, TransformerMixin):
+    """
+    Class that aggregates a return feature into daily target counts so that a spline regression model can be fitted on them.
+
+    Attributes
+    ----------
+    return_feature_names : list
+        names of the features to apply spline regression to
+    target_variables : list
+        list of target features
+    feature_label : str
+        prefix for the new features.
+    sample_perc : float
+        sample size of the training data, taking time ordering into consideration
+
+    Methods
+    -------
+    fit(X=DataFrame, y=DataFrame):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, return_feature_names, target_variables, feature_label,
+                 sample_perc=0.5, parts = 6, e_floor=-0.001, e_top=0.0001, d=3):
+        self.sample_perc = sample_perc
+        self.return_feature_names = return_feature_names
+        self.target_variables = target_variables
+        self.glms = dict()
+        self.feature_label = feature_label
+        self.parts = parts
+        self.e_floor = e_floor
+        self.e_top = e_top
+        self.d = d
+    def fit(self, X, y, plot = False):
+        # complete dataset with y
+        X_set = X.copy()
+        X_set[self.target_variables] = y
+        # sampling
+        if plot:
+            fig, ax = plt.subplots(len(self.return_feature_names),1)
+        for i,return_feature_name in enumerate(self.return_feature_names):
+            X_aggregated = (
+                X_set
+                .groupby("Date",as_index=False)
+                .agg(
+                    count_target_up = ("target_up","sum"),
+                    count_target_down = ("target_down","sum"),
+                    return_feature = (return_feature_name,"max"),
+                )
+                .sort_values("Date",ascending=True)
+                .dropna()
+                .copy()
+            )
+            del X
+            gc.collect()
+            nlines = X_aggregated.shape[0]
+            threshold = int(round((1-nlines*self.sample_perc),0))
+            train_ = X_aggregated.iloc[:threshold,:]
+            self.glms[return_feature_name] = dict()
+            for target in self.target_variables:
+                X = train_[["return_feature"]].round(4).values.reshape(-1, 1)
+                y = np.log(train_.dropna()[f"count_{target}"].values + 1)
+                knot_str = self._get_knot(X)
+                transformed_x = dmatrix(f"bs(train, knots=({knot_str}), degree=3, include_intercept=False)", {"train": X}, return_type='dataframe')
+                model = sm.GLM(y, transformed_x).fit()
+                self.glms[return_feature_name][target] = {
+                    "model":model,
+                }
+                if plot:
+                    x_transfomed = dmatrix(f"bs(valid, knots=({knot_str}), degree={self.d}, include_intercept=False)", {"valid":X}, return_type='dataframe')
+                    pred = model.predict(x_transfomed)
+                    ax[i].scatter(X, np.exp(y),s=2,alpha=0.2)
+                    ax[i].scatter(X, np.exp(pred), alpha=0.2, s=1)
+        #self.X_aggregated = X_aggregated
+        return self
+
+    def transform(self, X, y=None, plot =False):
+        if plot:
+            fig, ax = plt.subplots(len(self.return_feature_names),1)
+        for i, return_feature_name in enumerate(self.return_feature_names):
+            for target in self.target_variables:
+                model = self.glms[return_feature_name][target].get("model")
+                vect = X[return_feature_name]
+                knot_str = self._get_knot(vect)
+                X_transformed = dmatrix(f"bs(valid, knots=({knot_str}), degree={self.d}, include_intercept=False)",
+                                        {"valid":vect.fillna(0)},
+                                        return_type='dataframe')
+                X[f"{self.feature_label}_{return_feature_name}_{target}"] = model.predict(
+                    X_transformed
+                )
+                if plot:
+                    pred = model.predict(X_transformed)
+                    ax[i].scatter(X, np.exp(pred), alpha=0.2, s=1)
+        return X
+
+    def _get_knot(self, input):
+        min_, max_ = np.min(input)-self.e_floor, np.max(input)+self.e_top
+        r = (max_ - min_)/self.parts
+        knot_tuple = [str(i*r+min_) for i,_ in enumerate(range(self.parts),start=0)]
+        knot_str = ",".join(knot_tuple)
+        knot_str = f"({knot_str})"
+        return knot_str
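
fit aggregates per-date target counts, then regresses log(1 + count) on a cubic B-spline basis of the return feature (patsy dmatrix plus a Gaussian GLM); transform re-evaluates that smooth at new feature values. The same fit/predict cycle in isolation, with synthetic data and evenly spaced interior knots in the spirit of _get_knot (everything here is illustrative):

    import numpy as np
    import statsmodels.api as sm
    from patsy import dmatrix

    rng = np.random.default_rng(0)
    x = rng.normal(0, 0.02, size=500)                  # synthetic daily returns
    counts = np.exp(-np.abs(x) * 50) * 20 + rng.poisson(1, 500)
    y = np.log(counts + 1)                             # same log(1 + count) target as fit()

    lo, hi, parts = x.min(), x.max(), 6
    knots = ",".join(str(k) for k in np.linspace(lo, hi, parts + 2)[1:-1])

    basis = dmatrix(f"bs(train, knots=({knots}), degree=3, include_intercept=False)",
                    {"train": x}, return_type='dataframe')
    model = sm.GLM(y, basis).fit()

    # evaluate the smooth on a grid, rebuilding the basis with the same knots
    grid = np.linspace(lo, hi, 50)
    grid_basis = dmatrix(f"bs(valid, knots=({knots}), degree=3, include_intercept=False)",
                         {"valid": grid}, return_type='dataframe')
    expected_counts = np.exp(model.predict(grid_basis)) - 1   # back on the count scale

One caveat visible in the class itself: transform recomputes the knots from the incoming data, so the basis used at predict time is not guaranteed to match the one the GLM was fitted on.
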

{virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
-Name:
-Version: 0.0.72
+Metadata-Version: 2.4
+Name: virgo_modules
+Version: 0.9.0
 Summary: data processing and statistical modeling using stock market data
 Home-page: https://github.com/miguelmayhem92/virgo_module
 Author: Miguel Mayhuire
@@ -9,28 +9,22 @@ License: MIT
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.9
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: feature-engine ==1.6.1
-Requires-Dist: matplotlib ==3.6.3
-Requires-Dist: mlflow ==2.1.1
-Requires-Dist: numpy ==1.23.5
-Requires-Dist: optuna ==3.1.0
-Requires-Dist: pandas ==1.5.3
-Requires-Dist: plotly ==5.15.0
-Requires-Dist: rsa ==4.9
-Requires-Dist: scikit-learn ==1.2.1
-Requires-Dist: scipy ==1.10.0
-Requires-Dist: seaborn ==0.12.2
-Requires-Dist: starlette ==0.22.0
-Requires-Dist: statsmodels ==0.13.5
-Requires-Dist: ta ==0.10.2
-Requires-Dist: yfinance ==0.2.9
-Requires-Dist: hmmlearn ==0.3.0
-Requires-Dist: boto3
 Provides-Extra: dev
-Requires-Dist: pytest
+Requires-Dist: pytest>=7.0; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-python
+Dynamic: summary
 
 # Virgo Package
 
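
The hard `==` pins of 0.0.72 are dropped entirely in 0.9.0; the only declared requirement left is the dev extra. After installing the wheel, the surviving metadata can be checked with the standard library; a small sketch (the commented values are what the diff above implies, not verified output):

    from importlib.metadata import metadata, requires, version

    print(version("virgo_modules"))                       # 0.9.0
    print(requires("virgo_modules"))                      # ['pytest>=7.0; extra == "dev"']
    print(metadata("virgo_modules")["Requires-Python"])   # >=3.9
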

virgo_modules-0.9.0.dist-info/RECORD ADDED

@@ -0,0 +1,24 @@
+virgo_modules/__init__.py,sha256=7NrzGOSBvO9S73thMlxEh5aNYKS5SYKLgTxC1YIIPRk,21
+virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/aws_utils.py,sha256=QCyxJwZ6bNCkMpuuxzxkNxejj-hJf4kj2arb1SQPNug,2582
+virgo_modules/src/backtester.py,sha256=OhiWyzDX0PthXGuhChyWUmDN3cLkzVYe95zS4nGtia8,22106
+virgo_modules/src/hmm_utils.py,sha256=D7axAnCdSe1_1EgRyli2PAnM2f6699hTY9GcxjPXG-o,21221
+virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
+virgo_modules/src/re_utils.py,sha256=AQlhyO0cvU-G42dolhedz5E-sniRzeFhf40RD5QVYpo,75506
+virgo_modules/src/ticketer_source.py,sha256=nGjQdZRDWto8cGKhMQCyAqYwvqYy1m6djkCpffiX3Dk,107747
+virgo_modules/src/transformer_utils.py,sha256=SnYdtsFPnSF6u4UFIat0-X3-qVuUWvv_T46kiB-H0Sk,13682
+virgo_modules/src/edge_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/edge_utils/conformal_utils.py,sha256=cKm4KSM261Eu1FJn4oowKYiKIesW81VbqITIvopGSVk,5410
+virgo_modules/src/edge_utils/edge_utils.py,sha256=FuqEyvVYPhMy39uhxqD6bIVSMVcT-P3Pzuim0FS8u7A,20324
+virgo_modules/src/edge_utils/feature_selection.py,sha256=VSjGsC9bivuRuGDZiykX1_LIM0C0q2el3HEmujWZ4qs,3097
+virgo_modules/src/edge_utils/shap_utils.py,sha256=FgcHkfddvdFSeUqEubYa2ExRGVAWSthqK4b-eKagEmo,2333
+virgo_modules/src/edge_utils/stack_model.py,sha256=QqE91uLo2KauGEj91AVNANB1xE7J4Fa49YOX7k5mFng,4257
+virgo_modules/src/market/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/market/market_tools.py,sha256=vBt66_7E3ANz7avzfeNw_RHMGvG9lh5PRhxmcf_Oyjc,6880
+virgo_modules/src/markowitz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/markowitz/markowitz_utils.py,sha256=2bsnk_QlfWB5QytSNZ5n8elto9hhsEiukcEJBqWEYX4,1970
+virgo_modules-0.9.0.dist-info/licenses/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
+virgo_modules-0.9.0.dist-info/METADATA,sha256=z7zb755kS0k15I2FddBWXbFzrTaSmEqUiPvxvtEEzIM,1122
+virgo_modules-0.9.0.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
+virgo_modules-0.9.0.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
+virgo_modules-0.9.0.dist-info/RECORD,,
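
Each RECORD row is path,sha256=<unpadded urlsafe-base64 digest>,size-in-bytes. A sketch that recomputes one entry from a locally downloaded wheel (the wheel path here is hypothetical):

    import base64, hashlib
    from zipfile import ZipFile

    with ZipFile("virgo_modules-0.9.0-py3-none-any.whl") as whl:
        data = whl.read("virgo_modules/src/transformer_utils.py")

    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    print(f"virgo_modules/src/transformer_utils.py,sha256={digest},{len(data)}")
    # should reproduce the 13682-byte transformer_utils.py entry above
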
virgo_modules/src/edge_utils.py DELETED

@@ -1,178 +0,0 @@
-import numpy as np
-import itertools
-
-from sklearn.metrics import roc_auc_score, precision_score, recall_score
-from sklearn.pipeline import Pipeline
-
-from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
-from feature_engine.imputation import MeanMedianImputer
-from virgo_modules.src.ticketer_source import FeatureSelector
-from feature_engine.discretisation import EqualWidthDiscretiser
-
-from .ticketer_source import VirgoWinsorizerFeature, InverseHyperbolicSine
-
-class produce_model_wrapper:
-    def __init__(self,data):
-        self.data = data.copy()
-
-    def preprocess(self, validation_size, target):
-
-        val_date = self.data.groupby('Date', as_index = False).agg(target_down = (target[0],'count')).sort_values('Date').iloc[-validation_size:,].head(1)['Date'].values[0]
-
-        train_data = self.data[self.data['Date'] < val_date].dropna()
-        val_data = self.data[self.data['Date'] >= val_date].dropna()
-
-        columns = [ x for x in train_data.columns if x not in target ]
-        X_train, y_train = train_data[columns], train_data[target]
-        X_val, y_val = val_data[columns], val_data[target]
-        self.X_train = X_train
-        self.y_train = y_train
-        self.X_val = X_val
-        self.y_val = y_val
-
-    def train_model(self, pipe, model, cv_ = False):
-        self.model = model
-        self.pipe_transform = pipe
-        self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
-        self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
-        self.pipeline.fit(self.X_train, self.y_train)
-
-class register_results():
-    def __init__(self, model_name):
-        self.model_name = model_name
-        self.metric_logger = dict()
-    def eval_metrics(self, pipeline, X, y, type_data, phase):
-
-        preds_proba = pipeline.predict_proba(X)
-        preds = pipeline.predict(X)
-
-        if type(preds_proba) == list:
-            preds_proba = np.array([ x[:,1] for x in preds_proba]).T
-
-        roc = roc_auc_score(y,preds_proba, average=None)
-        precision = precision_score(y,preds, average=None)
-        recall = recall_score(y,preds, average=None)
-
-        self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc':roc, 'precision':precision, 'recall':recall}
-
-    def print_metric_logger(self):
-        parts = list(self.metric_logger.keys())
-        phase_parts = [ x.split('//')[0] for x in parts]
-
-        parts = list(self.metric_logger)
-        phase_parts = [ x.split('//')[0] for x in parts]
-
-        init_phase = phase_parts[0]
-        print(f'---{init_phase}--')
-        for phase,val in zip(phase_parts,self.metric_logger):
-            stage = val.split('//')[2]
-            if init_phase != phase:
-                print(f'---{phase}--')
-                init_phase = phase
-            for metric in self.metric_logger[val]:
-                print(stage, metric,self.metric_logger[val][metric])
-
-
-def eval_metrics(pipeline, X, y, type_data, model_name):
-
-    preds_proba = pipeline.predict_proba(X)
-    preds = pipeline.predict(X)
-
-    if type(preds_proba) == list:
-        preds_proba = np.array([ x[:,1] for x in preds_proba]).T
-
-    print(f'--{type_data} - {model_name}--')
-    print('--target: down, up--')
-    print('--roc-auc--')
-    print(roc_auc_score(y,preds_proba, average=None))
-    print('--precision--')
-    print(precision_score(y,preds, average=None))
-    print('--recall--')
-    print(recall_score(y,preds, average=None))
-
-
-def data_processing_pipeline_classifier(
-        features_base,features_to_drop = False, winsorizer_conf = False, discretize_columns = False,
-        bins_discretize = 10, correlation = 0.85, fillna = True,
-        invhypervolsin_features = False,
-        pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'):
-
-
-    select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
-    winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
-    drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
-    discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = bins_discretize ))] if discretize_columns else []
-    drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation, method = 'spearman'))] if correlation else []
-    median_imputer_pipe = [('median_imputer', MeanMedianImputer())] if fillna else []
-    invhypersin_pipe = [('invhypervolsin scaler', InverseHyperbolicSine(features = invhypervolsin_features))] if invhypervolsin_features else []
-
-    pipe_dictionary = {
-        'selector': select_pipe,
-        'winzorizer':winzorizer_pipe,
-        'drop':drop_pipe,
-        'discretizer': discretize,
-        'correlation': drop_corr,
-        'median_inputer':median_imputer_pipe,
-        'arcsinh_scaler': invhypersin_pipe,
-    }
-
-    pipeline_steps = pipeline_order.split('//')
-    ## validation
-    for step in pipeline_steps:
-        if step not in pipe_dictionary.keys():
-            raise Exception(f'{step} step not in list of steps, the list is: {list(pipe_dictionary.keys())}')
-
-    pipeline_args = [ pipe_dictionary[step] for step in pipeline_steps]
-    pipeline_args = list(itertools.chain.from_iterable(pipeline_args))
-    pipe = Pipeline(pipeline_args)
-
-    return pipe
-
-
-class ExpandingMultipleTimeSeriesKFold:
-    """increasing training window where the test can be overlap"""
-    def __init__(self, df, window_size = 100, number_window=3, overlap_size = 0):
-        self.df = df
-        self.number_window = number_window
-        self.window_size = window_size
-        self.overlap_size = overlap_size
-
-    def split(self, X, y, groups=None):
-
-        if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
-            raise Exception('no date and/or index in the index dataframe')
-
-        if self.overlap_size > self.window_size:
-            raise Exception('overlap can not be higher than the window size')
-
-        unique_dates = list(self.df.index.get_level_values('Date_i').unique())
-        unique_dates.sort()
-
-        total_test_size = self.window_size * self.number_window
-        total_test_size = total_test_size - (self.number_window - 1)*self.overlap_size
-
-        if total_test_size > len(unique_dates):
-            raise Exception('test size is higher than the data length')
-
-        cut = total_test_size
-        for fold in range(self.number_window):
-
-            topcut = cut-self.window_size
-            train_dates = unique_dates[:-cut]
-            test_dates = unique_dates[-cut:-topcut]
-
-            if topcut == 0:
-                test_dates = unique_dates[-cut:]
-
-            max_train_date = max(train_dates)
-            min_test_date, max_test_date = min(test_dates), max(test_dates)
-
-            cut = cut - (self.window_size - self.overlap_size)
-
-            train_index = self.df[self.df.index.get_level_values('Date_i') <= max_train_date].index.get_level_values('i')
-            test_index = self.df[(self.df.index.get_level_values('Date_i') >= min_test_date) & (self.df.index.get_level_values('Date_i') <= max_test_date)].index.get_level_values('i')
-
-            yield train_index, test_index
-
-    def get_n_splits(self, X, y, groups=None):
-        return self.number_window
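
The removed ExpandingMultipleTimeSeriesKFold produced an expanding training window with fixed-size, optionally overlapping test windows over the unique dates. Its windowing arithmetic, replayed on toy data (sizes are illustrative):

    unique_dates = list(range(20))            # stand-in for sorted unique 'Date_i' values
    window_size, number_window, overlap_size = 4, 3, 1

    cut = window_size * number_window - (number_window - 1) * overlap_size
    for fold in range(number_window):
        topcut = cut - window_size
        train_dates = unique_dates[:-cut]
        test_dates = unique_dates[-cut:-topcut] if topcut else unique_dates[-cut:]
        print(fold, "train through", max(train_dates),
              "| test", min(test_dates), "to", max(test_dates))
        cut -= window_size - overlap_size

    # fold 0: train through 9,  test 10 to 13
    # fold 1: train through 12, test 13 to 16
    # fold 2: train through 15, test 16 to 19
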
@@ -1,12 +0,0 @@
|
|
|
1
|
-
virgo_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
virgo_modules/src/aws_utils.py,sha256=GWmVdXM0mIJJPn-X-bEtM4KtNPCHM1D457hnuKxaM7E,1383
|
|
4
|
-
virgo_modules/src/edge_utils.py,sha256=Ihdmq7dyb8gOvG6CrDal7wsa15tqsdsFk6KINwM6578,7691
|
|
5
|
-
virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
|
|
6
|
-
virgo_modules/src/re_utils.py,sha256=LDI3sYAaNm3LO5gRul7PyCVbJrkT3PBihObkdVilVec,52428
|
|
7
|
-
virgo_modules/src/ticketer_source.py,sha256=ciMPObqntAFtnlY1IPt8-Y4mz6yuD1jy6gRQN109D4M,104837
|
|
8
|
-
virgo_modules-0.0.72.dist-info/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
|
|
9
|
-
virgo_modules-0.0.72.dist-info/METADATA,sha256=Txin9qouILtGSvPTQYcJPPkWXNry0JjI3sSfAMB0Cjg,1429
|
|
10
|
-
virgo_modules-0.0.72.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
|
11
|
-
virgo_modules-0.0.72.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
|
|
12
|
-
virgo_modules-0.0.72.dist-info/RECORD,,
|

{virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info/licenses}/LICENSE
File without changes

{virgo_modules-0.0.72.dist-info → virgo_modules-0.9.0.dist-info}/top_level.txt
File without changes