virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -0,0 +1,401 @@
+ import gc
+
+ from sklearn.base import BaseEstimator, TransformerMixin
+ import pandas as pd
+ import numpy as np
+ import statsmodels.api as sm
+ from patsy import dmatrix
+ import matplotlib.pyplot as plt
+
+ class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
+
+     """
+     Class that applies the inverse hyperbolic sine transformation to features.
+     This class is compatible with scikit-learn pipelines.
+
+     Attributes
+     ----------
+     features : list
+         list of features to transform
+     prefix : str
+         prefix for the new features; if '' the original features are overwritten
+
+     Methods
+     -------
+     fit(X=DataFrame, y=None):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, features, prefix=''):
+         self.features = features
+         self.prefix = prefix
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         for feature in self.features:
+             X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
+         return X
+
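+ # Illustrative usage (a sketch added for this review, not part of the module;
+ # 'volume' is a hypothetical column): arcsinh behaves like a log transform for
+ # large magnitudes but is also defined at zero and for negative values.
+ # >>> df = pd.DataFrame({'volume': [0.0, 10.0, -10.0]})
+ # >>> InverseHyperbolicSine(features=['volume'], prefix='ihs_').transform(df)
+ #    volume  ihs_volume
+ # 0     0.0    0.000000
+ # 1    10.0    2.998223
+ # 2   -10.0   -2.998223
+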
+ class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
+
+     """
+     Class that applies winsorization to features.
+     This class is compatible with scikit-learn pipelines.
+
+     Attributes
+     ----------
+     feature_configs : dict
+         dictionary mapping each feature to its configuration; the configuration holds the low ('min') and high ('max') limits per feature
+
+     Methods
+     -------
+     fit(X=DataFrame, y=None):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, feature_configs):
+         self.feature_configs = feature_configs
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         for feature in self.feature_configs:
+             lower = self.feature_configs[feature]['min']
+             upper = self.feature_configs[feature]['max']
+             # clip values that fall outside the [lower, upper] limits
+             X[feature] = np.where(lower > X[feature], lower, X[feature])
+             X[feature] = np.where(upper < X[feature], upper, X[feature])
+         return X
+
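+ # Illustrative usage (a sketch added for this review, not part of the module;
+ # 'ret' is a hypothetical column): values outside the configured limits are clipped.
+ # >>> df = pd.DataFrame({'ret': [-5.0, 0.5, 7.0]})
+ # >>> VirgoWinsorizerFeature({'ret': {'min': 0.0, 'max': 1.0}}).transform(df)['ret'].tolist()
+ # [0.0, 0.5, 1.0]
+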
+ class FeatureSelector(BaseEstimator, TransformerMixin):
+
+     """
+     Class that selects a subset of features.
+     This class is compatible with scikit-learn pipelines.
+
+     Attributes
+     ----------
+     columns : list
+         list of features to select
+
+     Methods
+     -------
+     fit(X=DataFrame, y=None):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, columns):
+         self.columns = columns
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         return X[self.columns]
+
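+ # Illustrative usage (a sketch added for this review, not part of the module):
+ # typically the first step of a scikit-learn Pipeline, narrowing the frame to
+ # the modeling columns.
+ # >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
+ # >>> list(FeatureSelector(columns=['a', 'b']).fit_transform(df).columns)
+ # ['a', 'b']
+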
+ class FeaturesEntropy(BaseEstimator, TransformerMixin):
+     """
+     Class that creates a feature with the entropy of the target within each class of a given feature.
+     Because the map is fitted on part of the data, it might introduce some leakage into the training set.
+     This class is compatible with scikit-learn pipelines.
+
+     Attributes
+     ----------
+     features : list
+         list of features used to build the entropy map
+     target : str
+         name of the binary target column
+     entropy_map : pd.DataFrame
+         dataframe of the map with the entropies per class
+     perc : float
+         fraction of the dates used to compute the entropy map
+     default_null : float
+         entropy assigned to classes unseen during fit
+
+     Methods
+     -------
+     fit(X=DataFrame, y=None):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, features, target, feature_name=None, feature_type='discrete', perc=0.5, default_null=0.99):
+
+         self.features = features
+         self.feature_type = feature_type
+         self.target = target
+         self.perc = perc
+         self.default_null = default_null
+
+         if not feature_name:
+             self.feature_name = '_'.join(features)
+             self.feature_name = self.feature_name + '_' + target + '_' + feature_type
+         else:
+             self.feature_name = feature_name
+
+     def fit(self, X, y=None):
+
+         # use only the first perc fraction of the dates to build the entropy map
+         unique_dates = list(X['Date'].unique())
+         unique_dates.sort()
+
+         total_length = len(unique_dates)
+         cut = int(round(total_length * self.perc, 0))
+         train_dates = unique_dates[:cut]
+         max_train_date = max(train_dates)
+
+         X_ = X[X['Date'] <= max_train_date].copy()
+         df = X_.join(y, how='left')
+
+         column_list = [f'{self.feature_type}_signal_{colx}' for colx in self.features]
+
+         # per class: entropy = p*log2(1/p) + (1-p)*log2(1/(1-p)), with p the positive rate
+         df_aggr = (
+             df
+             .groupby(column_list, as_index=False)
+             .apply(
+                 lambda x: pd.Series(
+                     dict(
+                         counts=x[self.target].count(),
+                         trues=(x[self.target] == 1).sum(),
+                         falses=(x[self.target] == 0).sum(),
+                     )
+                 )
+             )
+             .assign(
+                 trues_rate=lambda x: x['trues'] / x['counts'],
+                 falses_rate=lambda x: x['falses'] / x['counts'],
+                 log2_trues=lambda x: np.log2(1 / x['trues_rate']),
+                 log2_falses=lambda x: np.log2(1 / x['falses_rate']),
+                 comp1=lambda x: x['trues_rate'] * x['log2_trues'],
+                 comp2=lambda x: x['falses_rate'] * x['log2_falses'],
+                 class_entropy=lambda x: np.round(x['comp1'] + x['comp2'], 3),
+             )
+         )
+
+         self.column_list = column_list
+         self.entropy_map = (
+             df_aggr
+             [column_list + ['class_entropy']]
+             .rename(columns={'class_entropy': self.feature_name})
+             .copy()
+         )
+
+         del df, df_aggr, X_
+         return self
+
+     def transform(self, X, y=None):
+
+         # map each class to its entropy; classes unseen during fit get the default value
+         X = X.join(self.entropy_map.set_index(self.column_list), on=self.column_list, how='left')
+         X[self.feature_name] = X[self.feature_name].fillna(self.default_null)
+         return X
+
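+ # Illustrative usage (a sketch added for this review, not part of the module;
+ # toy data, hypothetical 'rsi' signal): with perc=0.5 the map is built on the
+ # first half of the dates. Each class below has a 0.5 positive rate, so its
+ # entropy is 1.0.
+ # >>> X = pd.DataFrame({'Date': ['2024-01-01'] * 4 + ['2024-01-02'] * 4,
+ # ...                   'discrete_signal_rsi': [0, 0, 1, 1] * 2})
+ # >>> y = pd.Series([1, 0, 1, 0] * 2, name='target_up')
+ # >>> fe = FeaturesEntropy(features=['rsi'], target='target_up', perc=0.5)
+ # >>> fe.fit(X, y).transform(X)['rsi_target_up_discrete'].unique()
+ # array([1.])
+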
+ class signal_combiner(BaseEstimator, TransformerMixin):
+
+     """
+     Class that combines pairs of binary signal features into one.
+     This class is compatible with scikit-learn pipelines.
+
+     Attributes
+     ----------
+     columns : list
+         list of base features to combine
+     drop : boolean
+         whether to drop the source signal features after combining
+     prefix_up : str
+         prefix of the up signal of the base feature
+     prefix_low : str
+         prefix of the low signal of the base feature
+
+     Methods
+     -------
+     fit(X=DataFrame, y=None):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, columns, drop=True, prefix_up='signal_up_', prefix_low='signal_low_'):
+         self.columns = columns
+         self.drop = drop
+         self.prefix_up = prefix_up
+         self.prefix_low = prefix_low
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         for column in self.columns:
+             # 1 if either the up or the low signal fires, 0 otherwise
+             X['CombSignal_' + column] = np.where(
+                 X[self.prefix_up + column] == 1,
+                 1,
+                 np.where(
+                     X[self.prefix_low + column] == 1,
+                     1,
+                     0
+                 )
+             )
+             if self.drop:
+                 X = X.drop(columns=[self.prefix_up + column, self.prefix_low + column])
+         return X
+
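+ # Illustrative usage (a sketch added for this review, not part of the module;
+ # hypothetical 'rsi' signals): the up/low pair collapses to a single OR-style
+ # flag, and the source pair is dropped.
+ # >>> df = pd.DataFrame({'signal_up_rsi': [1, 0, 0], 'signal_low_rsi': [0, 1, 0]})
+ # >>> signal_combiner(columns=['rsi']).transform(df)
+ #    CombSignal_rsi
+ # 0               1
+ # 1               1
+ # 2               0
+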
+ class InteractionFeatures(BaseEstimator, TransformerMixin):
+
+     """
+     Class that applies feature interaction (ratio terms between pairs of features).
+     This class is compatible with scikit-learn pipelines.
+
+     Attributes
+     ----------
+     feature_list1 : list
+         first list of features to combine
+     feature_list2 : list
+         second list of features to combine
+
+     Methods
+     -------
+     fit(X=DataFrame, y=None):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, feature_list1, feature_list2):
+         self.feature_list1 = feature_list1
+         self.feature_list2 = feature_list2
+
+     def fit(self, X, y=None):
+         return self
+
+     def simple_div_interaction(self, data, feature1, feature2, feature_name):
+         # ratio interaction; division-by-zero artifacts and missing values are zeroed out
+         data[feature_name] = data[feature1] / data[feature2]
+         data[feature_name] = data[feature_name].replace([np.inf, -np.inf], 0)
+         data[feature_name] = data[feature_name].fillna(0)
+         return data
+
+     def transform(self, X, y=None):
+         for f1 in self.feature_list1:
+             for f2 in self.feature_list2:
+                 fn = 'iterm_' + f1.replace("norm_", "") + "_" + f2.replace("norm_", "")
+                 X = self.simple_div_interaction(X, f1, f2, fn)
+         return X
+
+
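+ # Illustrative usage (a sketch added for this review, not part of the module;
+ # hypothetical normalized columns): the 'norm_' prefix is stripped when naming
+ # the interaction term, and division-by-zero artifacts become 0.
+ # >>> df = pd.DataFrame({'norm_a': [1.0, 2.0], 'norm_b': [2.0, 0.0]})
+ # >>> InteractionFeatures(['norm_a'], ['norm_b']).transform(df)['iterm_a_b'].tolist()
+ # [0.5, 0.0]
+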
+ class SplineMarketReturnJumpWaves(BaseEstimator, TransformerMixin):
+     """
+     Class that takes return features and computes daily target counts so that a
+     spline regression model can be fitted on them.
+
+     Attributes
+     ----------
+     return_feature_names : list
+         names of the return features to fit the spline regression on
+     target_variables : list
+         list of target features
+     feature_label : str
+         prefix for the new features.
+     sample_perc : float
+         fraction of the most recent dates held out; the splines are fitted on the first (1 - sample_perc) of the dates
+     parts : int
+         number of spline knots
+     d : int
+         degree of the spline basis
+
+     Methods
+     -------
+     fit(X=DataFrame, y=DataFrame):
+         fit transformation.
+     transform(X=DataFrame, y=None):
+         apply feature transformation
+     """
+
+     def __init__(self, return_feature_names, target_variables, feature_label,
+                  sample_perc=0.5, parts=6, e_floor=-0.001, e_top=0.0001, d=3):
+         self.sample_perc = sample_perc
+         self.return_feature_names = return_feature_names
+         self.target_variables = target_variables
+         self.glms = dict()
+         self.feature_label = feature_label
+         self.parts = parts
+         self.e_floor = e_floor
+         self.e_top = e_top
+         self.d = d
+
+     def fit(self, X, y, plot=False):
+         # complete dataset with y
+         X_set = X.copy()
+         X_set[self.target_variables] = y
+         del X
+         gc.collect()
+         if plot:
+             fig, ax = plt.subplots(len(self.return_feature_names), 1)
+         for i, return_feature_name in enumerate(self.return_feature_names):
+             # daily counts per target and the daily value of the return feature
+             X_aggregated = (
+                 X_set
+                 .groupby("Date", as_index=False)
+                 .agg(
+                     **{f"count_{target}": (target, "sum") for target in self.target_variables},
+                     return_feature=(return_feature_name, "max"),
+                 )
+                 .sort_values("Date", ascending=True)
+                 .dropna()
+                 .copy()
+             )
+             # sampling: fit on the first (1 - sample_perc) fraction of the dates
+             nlines = X_aggregated.shape[0]
+             threshold = nlines - int(round(nlines * self.sample_perc, 0))
+             train_ = X_aggregated.iloc[:threshold, :]
+             self.glms[return_feature_name] = dict()
+             for target in self.target_variables:
+                 x_train = train_[["return_feature"]].round(4).values.reshape(-1, 1)
+                 y_train = np.log(train_[f"count_{target}"].values + 1)
+                 knot_str = self._get_knot(x_train)
+                 transformed_x = dmatrix(f"bs(train, knots={knot_str}, degree={self.d}, include_intercept=False)", {"train": x_train}, return_type='dataframe')
+                 model = sm.GLM(y_train, transformed_x).fit()
+                 self.glms[return_feature_name][target] = {
+                     "model": model,
+                     "knots": knot_str,
+                 }
+                 if plot:
+                     x_transformed = dmatrix(f"bs(valid, knots={knot_str}, degree={self.d}, include_intercept=False)", {"valid": x_train}, return_type='dataframe')
+                     pred = model.predict(x_transformed)
+                     ax[i].scatter(x_train, np.exp(y_train), s=2, alpha=0.2)
+                     ax[i].scatter(x_train, np.exp(pred), alpha=0.2, s=1)
+         return self
+
+     def transform(self, X, y=None, plot=False):
+         if plot:
+             fig, ax = plt.subplots(len(self.return_feature_names), 1)
+         for i, return_feature_name in enumerate(self.return_feature_names):
+             for target in self.target_variables:
+                 model = self.glms[return_feature_name][target].get("model")
+                 # reuse the training knots so the basis matches the fitted model
+                 knot_str = self.glms[return_feature_name][target].get("knots")
+                 vect = X[return_feature_name]
+                 X_transformed = dmatrix(f"bs(valid, knots={knot_str}, degree={self.d}, include_intercept=False)",
+                                         {"valid": vect.fillna(0)},
+                                         return_type='dataframe')
+                 X[f"{self.feature_label}_{return_feature_name}_{target}"] = model.predict(
+                     X_transformed
+                 )
+                 if plot:
+                     pred = model.predict(X_transformed)
+                     ax[i].scatter(vect, np.exp(pred), alpha=0.2, s=1)
+         return X
+
+     def _get_knot(self, values):
+         # evenly spaced knots across the observed range, offset by e_floor / e_top
+         min_, max_ = np.min(values) - self.e_floor, np.max(values) + self.e_top
+         r = (max_ - min_) / self.parts
+         knot_tuple = [str(i * r + min_) for i in range(self.parts)]
+         knot_str = ",".join(knot_tuple)
+         knot_str = f"({knot_str})"
+         return knot_str
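+
+ # Illustrative usage (a sketch added for this review, not part of the module;
+ # hypothetical column names): X needs a 'Date' column plus the return feature,
+ # and y must carry one column per entry of target_variables. The fitted spline
+ # GLMs then score any frame containing the same return feature.
+ # >>> spliner = SplineMarketReturnJumpWaves(
+ # ...     return_feature_names=['market_return'],
+ # ...     target_variables=['target_up', 'target_down'],
+ # ...     feature_label='spline')
+ # >>> spliner.fit(X_train, y_train)         # X_train: 'Date' + 'market_return'
+ # >>> X_scored = spliner.transform(X_test)  # adds e.g. 'spline_market_return_target_up'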
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
- Name: virgo-modules
- Version: 0.0.72
+ Metadata-Version: 2.4
+ Name: virgo_modules
+ Version: 0.9.0
  Summary: data processing and statistical modeling using stock market data
  Home-page: https://github.com/miguelmayhem92/virgo_module
  Author: Miguel Mayhuire
@@ -9,28 +9,22 @@ License: MIT
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Operating System :: OS Independent
- Requires-Python: >=3.9, <3.10
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: feature-engine ==1.6.1
- Requires-Dist: matplotlib ==3.6.3
- Requires-Dist: mlflow ==2.1.1
- Requires-Dist: numpy ==1.23.5
- Requires-Dist: optuna ==3.1.0
- Requires-Dist: pandas ==1.5.3
- Requires-Dist: plotly ==5.15.0
- Requires-Dist: rsa ==4.9
- Requires-Dist: scikit-learn ==1.2.1
- Requires-Dist: scipy ==1.10.0
- Requires-Dist: seaborn ==0.12.2
- Requires-Dist: starlette ==0.22.0
- Requires-Dist: statsmodels ==0.13.5
- Requires-Dist: ta ==0.10.2
- Requires-Dist: yfinance ==0.2.9
- Requires-Dist: hmmlearn ==0.3.0
- Requires-Dist: boto3
  Provides-Extra: dev
- Requires-Dist: pytest >=7.0 ; extra == 'dev'
+ Requires-Dist: pytest>=7.0; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-python
+ Dynamic: summary
 
  # Virgo Package
 
@@ -0,0 +1,24 @@
+ virgo_modules/__init__.py,sha256=7NrzGOSBvO9S73thMlxEh5aNYKS5SYKLgTxC1YIIPRk,21
+ virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ virgo_modules/src/aws_utils.py,sha256=QCyxJwZ6bNCkMpuuxzxkNxejj-hJf4kj2arb1SQPNug,2582
+ virgo_modules/src/backtester.py,sha256=OhiWyzDX0PthXGuhChyWUmDN3cLkzVYe95zS4nGtia8,22106
+ virgo_modules/src/hmm_utils.py,sha256=D7axAnCdSe1_1EgRyli2PAnM2f6699hTY9GcxjPXG-o,21221
+ virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
+ virgo_modules/src/re_utils.py,sha256=AQlhyO0cvU-G42dolhedz5E-sniRzeFhf40RD5QVYpo,75506
+ virgo_modules/src/ticketer_source.py,sha256=nGjQdZRDWto8cGKhMQCyAqYwvqYy1m6djkCpffiX3Dk,107747
+ virgo_modules/src/transformer_utils.py,sha256=SnYdtsFPnSF6u4UFIat0-X3-qVuUWvv_T46kiB-H0Sk,13682
+ virgo_modules/src/edge_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ virgo_modules/src/edge_utils/conformal_utils.py,sha256=cKm4KSM261Eu1FJn4oowKYiKIesW81VbqITIvopGSVk,5410
+ virgo_modules/src/edge_utils/edge_utils.py,sha256=FuqEyvVYPhMy39uhxqD6bIVSMVcT-P3Pzuim0FS8u7A,20324
+ virgo_modules/src/edge_utils/feature_selection.py,sha256=VSjGsC9bivuRuGDZiykX1_LIM0C0q2el3HEmujWZ4qs,3097
+ virgo_modules/src/edge_utils/shap_utils.py,sha256=FgcHkfddvdFSeUqEubYa2ExRGVAWSthqK4b-eKagEmo,2333
+ virgo_modules/src/edge_utils/stack_model.py,sha256=QqE91uLo2KauGEj91AVNANB1xE7J4Fa49YOX7k5mFng,4257
+ virgo_modules/src/market/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ virgo_modules/src/market/market_tools.py,sha256=vBt66_7E3ANz7avzfeNw_RHMGvG9lh5PRhxmcf_Oyjc,6880
+ virgo_modules/src/markowitz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ virgo_modules/src/markowitz/markowitz_utils.py,sha256=2bsnk_QlfWB5QytSNZ5n8elto9hhsEiukcEJBqWEYX4,1970
+ virgo_modules-0.9.0.dist-info/licenses/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
+ virgo_modules-0.9.0.dist-info/METADATA,sha256=z7zb755kS0k15I2FddBWXbFzrTaSmEqUiPvxvtEEzIM,1122
+ virgo_modules-0.9.0.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
+ virgo_modules-0.9.0.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
+ virgo_modules-0.9.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.41.2)
+ Generator: setuptools (78.1.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -1,178 +0,0 @@
- import numpy as np
- import itertools
-
- from sklearn.metrics import roc_auc_score, precision_score, recall_score
- from sklearn.pipeline import Pipeline
-
- from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
- from feature_engine.imputation import MeanMedianImputer
- from virgo_modules.src.ticketer_source import FeatureSelector
- from feature_engine.discretisation import EqualWidthDiscretiser
-
- from .ticketer_source import VirgoWinsorizerFeature, InverseHyperbolicSine
-
- class produce_model_wrapper:
-     def __init__(self, data):
-         self.data = data.copy()
-
-     def preprocess(self, validation_size, target):
-
-         val_date = self.data.groupby('Date', as_index = False).agg(target_down = (target[0],'count')).sort_values('Date').iloc[-validation_size:,].head(1)['Date'].values[0]
-
-         train_data = self.data[self.data['Date'] < val_date].dropna()
-         val_data = self.data[self.data['Date'] >= val_date].dropna()
-
-         columns = [ x for x in train_data.columns if x not in target ]
-         X_train, y_train = train_data[columns], train_data[target]
-         X_val, y_val = val_data[columns], val_data[target]
-         self.X_train = X_train
-         self.y_train = y_train
-         self.X_val = X_val
-         self.y_val = y_val
-
-     def train_model(self, pipe, model, cv_ = False):
-         self.model = model
-         self.pipe_transform = pipe
-         self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
-         self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
-         self.pipeline.fit(self.X_train, self.y_train)
-
- class register_results():
-     def __init__(self, model_name):
-         self.model_name = model_name
-         self.metric_logger = dict()
-
-     def eval_metrics(self, pipeline, X, y, type_data, phase):
-
-         preds_proba = pipeline.predict_proba(X)
-         preds = pipeline.predict(X)
-
-         if type(preds_proba) == list:
-             preds_proba = np.array([ x[:,1] for x in preds_proba]).T
-
-         roc = roc_auc_score(y,preds_proba, average=None)
-         precision = precision_score(y,preds, average=None)
-         recall = recall_score(y,preds, average=None)
-
-         self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc':roc, 'precision':precision, 'recall':recall}
-
-     def print_metric_logger(self):
-         parts = list(self.metric_logger.keys())
-         phase_parts = [ x.split('//')[0] for x in parts]
-
-         parts = list(self.metric_logger)
-         phase_parts = [ x.split('//')[0] for x in parts]
-
-         init_phase = phase_parts[0]
-         print(f'---{init_phase}--')
-         for phase,val in zip(phase_parts,self.metric_logger):
-             stage = val.split('//')[2]
-             if init_phase != phase:
-                 print(f'---{phase}--')
-                 init_phase = phase
-             for metric in self.metric_logger[val]:
-                 print(stage, metric,self.metric_logger[val][metric])
-
-
- def eval_metrics(pipeline, X, y, type_data, model_name):
-
-     preds_proba = pipeline.predict_proba(X)
-     preds = pipeline.predict(X)
-
-     if type(preds_proba) == list:
-         preds_proba = np.array([ x[:,1] for x in preds_proba]).T
-
-     print(f'--{type_data} - {model_name}--')
-     print('--target: down, up--')
-     print('--roc-auc--')
-     print(roc_auc_score(y,preds_proba, average=None))
-     print('--precision--')
-     print(precision_score(y,preds, average=None))
-     print('--recall--')
-     print(recall_score(y,preds, average=None))
-
-
- def data_processing_pipeline_classifier(
-         features_base, features_to_drop = False, winsorizer_conf = False, discretize_columns = False,
-         bins_discretize = 10, correlation = 0.85, fillna = True,
-         invhypervolsin_features = False,
-         pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'):
-
-     select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
-     winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
-     drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
-     discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = bins_discretize ))] if discretize_columns else []
-     drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation, method = 'spearman'))] if correlation else []
-     median_imputer_pipe = [('median_imputer', MeanMedianImputer())] if fillna else []
-     invhypersin_pipe = [('invhypervolsin scaler', InverseHyperbolicSine(features = invhypervolsin_features))] if invhypervolsin_features else []
-
-     pipe_dictionary = {
-         'selector': select_pipe,
-         'winzorizer':winzorizer_pipe,
-         'drop':drop_pipe,
-         'discretizer': discretize,
-         'correlation': drop_corr,
-         'median_inputer':median_imputer_pipe,
-         'arcsinh_scaler': invhypersin_pipe,
-     }
-
-     pipeline_steps = pipeline_order.split('//')
-     ## validation
-     for step in pipeline_steps:
-         if step not in pipe_dictionary.keys():
-             raise Exception(f'{step} step not in list of steps, the list is: {list(pipe_dictionary.keys())}')
-
-     pipeline_args = [ pipe_dictionary[step] for step in pipeline_steps]
-     pipeline_args = list(itertools.chain.from_iterable(pipeline_args))
-     pipe = Pipeline(pipeline_args)
-
-     return pipe
-
-
- class ExpandingMultipleTimeSeriesKFold:
-     """increasing training window where the test can be overlap"""
-     def __init__(self, df, window_size = 100, number_window=3, overlap_size = 0):
-         self.df = df
-         self.number_window = number_window
-         self.window_size = window_size
-         self.overlap_size = overlap_size
-
-     def split(self, X, y, groups=None):
-
-         if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
-             raise Exception('no date and/or index in the index dataframe')
-
-         if self.overlap_size > self.window_size:
-             raise Exception('overlap can not be higher than the window size')
-
-         unique_dates = list(self.df.index.get_level_values('Date_i').unique())
-         unique_dates.sort()
-
-         total_test_size = self.window_size * self.number_window
-         total_test_size = total_test_size - (self.number_window - 1)*self.overlap_size
-
-         if total_test_size > len(unique_dates):
-             raise Exception('test size is higher than the data length')
-
-         cut = total_test_size
-         for fold in range(self.number_window):
-
-             topcut = cut-self.window_size
-             train_dates = unique_dates[:-cut]
-             test_dates = unique_dates[-cut:-topcut]
-
-             if topcut == 0:
-                 test_dates = unique_dates[-cut:]
-
-             max_train_date = max(train_dates)
-             min_test_date, max_test_date = min(test_dates), max(test_dates)
-
-             cut = cut - (self.window_size - self.overlap_size)
-
-             train_index = self.df[self.df.index.get_level_values('Date_i') <= max_train_date].index.get_level_values('i')
-             test_index = self.df[(self.df.index.get_level_values('Date_i') >= min_test_date) & (self.df.index.get_level_values('Date_i') <= max_test_date)].index.get_level_values('i')
-
-             yield train_index, test_index
-
-     def get_n_splits(self, X, y, groups=None):
-         return self.number_window
@@ -1,12 +0,0 @@
- virgo_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- virgo_modules/src/aws_utils.py,sha256=GWmVdXM0mIJJPn-X-bEtM4KtNPCHM1D457hnuKxaM7E,1383
- virgo_modules/src/edge_utils.py,sha256=Ihdmq7dyb8gOvG6CrDal7wsa15tqsdsFk6KINwM6578,7691
- virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
- virgo_modules/src/re_utils.py,sha256=LDI3sYAaNm3LO5gRul7PyCVbJrkT3PBihObkdVilVec,52428
- virgo_modules/src/ticketer_source.py,sha256=ciMPObqntAFtnlY1IPt8-Y4mz6yuD1jy6gRQN109D4M,104837
- virgo_modules-0.0.72.dist-info/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
- virgo_modules-0.0.72.dist-info/METADATA,sha256=Txin9qouILtGSvPTQYcJPPkWXNry0JjI3sSfAMB0Cjg,1429
- virgo_modules-0.0.72.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
- virgo_modules-0.0.72.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
- virgo_modules-0.0.72.dist-info/RECORD,,