virgo-modules 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of virgo-modules might be problematic. Click here for more details.
- virgo_modules/src/aws_utils.py +21 -6
- virgo_modules/src/edge_utils.py +181 -0
- virgo_modules/src/re_utils.py +70 -87
- virgo_modules/src/ticketer_source.py +207 -135
- {virgo_modules-0.0.4.dist-info → virgo_modules-0.0.6.dist-info}/METADATA +20 -18
- virgo_modules-0.0.6.dist-info/RECORD +12 -0
- {virgo_modules-0.0.4.dist-info → virgo_modules-0.0.6.dist-info}/WHEEL +1 -1
- virgo_modules-0.0.4.dist-info/RECORD +0 -11
- {virgo_modules-0.0.4.dist-info → virgo_modules-0.0.6.dist-info}/LICENSE +0 -0
- {virgo_modules-0.0.4.dist-info → virgo_modules-0.0.6.dist-info}/top_level.txt +0 -0
virgo_modules/src/aws_utils.py
CHANGED
|
@@ -1,23 +1,38 @@
|
|
|
1
1
|
import yaml
|
|
2
2
|
import boto3
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from io import StringIO
|
|
4
|
+
from io import StringIO, BytesIO
|
|
5
|
+
import pandas as pd
|
|
5
6
|
|
|
6
|
-
|
|
7
|
+
|
|
8
|
+
def upload_file_to_aws(bucket,key,input_path, secret_path = 'secrets.yaml'):
    """Upload a local file to S3.

    Args:
        bucket: key in the secrets YAML whose value is the real bucket name.
        key: object key to write in the bucket.
        input_path: path of the local file to upload.
        secret_path: YAML file holding AWS_ACCESS_KEY_ID,
            AWS_SECRET_ACCESS_KEY and the bucket alias entries.
    """
    secrets = yaml.safe_load(Path(secret_path).read_text())
    aws_session = boto3.Session(
        aws_access_key_id=secrets['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=secrets['AWS_SECRET_ACCESS_KEY'],
    )
    # the caller passes an alias; the actual bucket name lives in the secrets file
    target_bucket = secrets[bucket]
    s3_resource = aws_session.resource('s3')
    s3_resource.meta.client.upload_file(
        Filename=input_path,
        Bucket=target_bucket,
        Key=key,
    )
|
|
13
15
|
|
|
14
|
-
def upload_pandas_to_s3(data_frame,bucket,key):
|
|
16
|
+
def upload_pandas_to_s3(data_frame,bucket,key, secret_path = 'secrets.yaml'):
    """Serialise a DataFrame to CSV in memory and store it as an S3 object.

    Args:
        data_frame: object exposing ``to_csv`` (written with its index).
        bucket: key in the secrets YAML whose value is the real bucket name.
        key: object key to write in the bucket.
        secret_path: YAML file holding AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID,
            AWS_SECRET_ACCESS_KEY and the bucket alias entries.
    """
    text_buffer = StringIO()
    data_frame.to_csv(text_buffer)
    # getvalue() returns the whole buffer regardless of the stream position
    csv_payload = text_buffer.getvalue()

    secrets = yaml.safe_load(Path(secret_path).read_text())
    s3_client = boto3.client(
        "s3",
        region_name=secrets['AWS_DEFAULT_REGION'],
        aws_access_key_id=secrets['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=secrets['AWS_SECRET_ACCESS_KEY'],
    )
    # the caller passes an alias; the actual bucket name lives in the secrets file
    target_bucket = secrets[bucket]
    s3_client.put_object(Bucket=target_bucket, Body=csv_payload, Key= key)
|
|
26
|
+
|
|
27
|
+
def download_file_to_aws(bucket,key, secret_path = 'secrets.yaml'):
    """Read a CSV object from S3 and return it as a pandas DataFrame.

    Args:
        bucket: bucket name passed straight to ``get_object``.
        key: object key to read.
        secret_path: YAML file holding AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID
            and AWS_SECRET_ACCESS_KEY.

    Returns:
        DataFrame parsed from the object's bytes with ``pd.read_csv``.
    """
    secrets = yaml.safe_load(Path(secret_path).read_text())
    client_settings = {
        'region_name': secrets['AWS_DEFAULT_REGION'],
        'aws_access_key_id': secrets['AWS_ACCESS_KEY_ID'],
        'aws_secret_access_key': secrets['AWS_SECRET_ACCESS_KEY'],
    }
    s3_client = boto3.client('s3', **client_settings)

    # NOTE(review): the upload helpers resolve `bucket` through the secrets file
    # (credentials[bucket]); here it is used literally -- confirm this is intended.
    response = s3_client.get_object(Bucket= bucket , Key = key)
    raw_bytes = response['Body'].read()
    return pd.read_csv(BytesIO(raw_bytes), encoding='utf8')
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import itertools
|
|
3
|
+
|
|
4
|
+
from sklearn.metrics import roc_auc_score, precision_score, recall_score
|
|
5
|
+
from sklearn.pipeline import Pipeline
|
|
6
|
+
|
|
7
|
+
from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
|
|
8
|
+
from feature_engine.imputation import MeanMedianImputer
|
|
9
|
+
from virgo_modules.src.ticketer_source import FeatureSelector
|
|
10
|
+
from feature_engine.discretisation import EqualWidthDiscretiser
|
|
11
|
+
|
|
12
|
+
from .ticketer_source import VirgoWinsorizerFeature
|
|
13
|
+
|
|
14
|
+
class produce_model_wrapper:
    """Keep a dataset, split it chronologically and fit a transform + model pipeline."""

    def __init__(self,data):
        # private copy so later filtering never touches the caller's frame
        self.data = data.copy()

    def preprocess(self, validation_size, target):
        """Split ``self.data`` into train/validation sets on the 'Date' column.

        The validation set starts at the earliest of the last
        ``validation_size`` distinct dates; rows with missing values are
        dropped from both parts. ``target`` is indexed, tested with ``in`` and
        used as a column selector, so a list of column names is assumed.

        Sets ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
        """
        # one row per distinct date, in chronological order
        rows_per_date = (
            self.data
            .groupby('Date', as_index = False)
            .agg(target_down = (target[0],'count'))
            .sort_values('Date')
        )
        val_date = rows_per_date.iloc[-validation_size:,].head(1)['Date'].values[0]

        before_validation = self.data['Date'] < val_date
        from_validation = self.data['Date'] >= val_date
        train_data = self.data[before_validation].dropna()
        val_data = self.data[from_validation].dropna()

        feature_columns = [name for name in train_data.columns if name not in target]
        self.X_train = train_data[feature_columns]
        self.y_train = train_data[target]
        self.X_val = val_data[feature_columns]
        self.y_val = val_data[target]

    def train_model(self, pipe, model, cv_ = False):
        """Fit ``pipe`` followed by ``model`` on the training split.

        Stores the fitted pipeline in ``self.pipeline`` and the column names
        produced by the transform step in ``self.features_to_model``.
        ``cv_`` is accepted but not used by this method.
        """
        self.model = model
        self.pipe_transform = pipe
        steps = [('pipe_transform',self.pipe_transform), ('model',self.model)]
        self.pipeline = Pipeline(steps)
        # the transform step is run on its own first to learn the output columns
        transformed_train = self.pipe_transform.fit_transform(self.X_train)
        self.features_to_model = transformed_train.columns
        self.pipeline.fit(self.X_train, self.y_train)
|
|
39
|
+
|
|
40
|
+
class register_results():
    """Collect classification metrics per phase / model / data split and print them."""

    def __init__(self, model_name):
        self.model_name = model_name
        # maps '<phase>//<model_name>//<type_data>' -> {'roc', 'precision', 'recall'}
        self.metric_logger = dict()

    def eval_metrics(self, pipeline, X, y, type_data, phase):
        """Score ``pipeline`` on ``(X, y)`` and store the result in the logger.

        Args:
            pipeline: fitted estimator exposing ``predict`` and ``predict_proba``.
            X: features to score.
            y: true targets.
            type_data: label of the data split; third part of the logger key.
            phase: label of the experiment phase; first part of the logger key.
        """
        preds_proba = pipeline.predict_proba(X)
        preds = pipeline.predict(X)

        # a list of per-target probability arrays is reduced to the column-1
        # probabilities, one column per target
        if isinstance(preds_proba, list):
            preds_proba = np.array([ x[:,1] for x in preds_proba]).T

        roc = roc_auc_score(y,preds_proba, average=None)
        precision = precision_score(y,preds, average=None)
        recall = recall_score(y,preds, average=None)

        self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc':roc, 'precision':precision, 'recall':recall}

    def print_metric_logger(self):
        """Print every logged metric, with a header line each time the phase changes.

        Prints nothing when no metrics have been logged yet.
        """
        current_phase = None
        for entry_key, metrics in self.metric_logger.items():
            key_parts = entry_key.split('//')
            phase = key_parts[0]
            if phase != current_phase:
                print(f'---{phase}--')
                current_phase = phase
            stage = key_parts[2]
            for metric, value in metrics.items():
                print(stage, metric, value)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def eval_metrics(pipeline, X, y, type_data, model_name):
    """Print ROC-AUC, precision and recall of ``pipeline`` on ``(X, y)``.

    Args:
        pipeline: fitted estimator exposing ``predict`` and ``predict_proba``.
        X: features to score.
        y: true targets.
        type_data: label of the data split, used only in the printed header.
        model_name: label of the model, used only in the printed header.
    """
    preds_proba = pipeline.predict_proba(X)
    preds = pipeline.predict(X)

    # isinstance rather than an exact type comparison, so list subclasses are
    # reduced to the column-1 probabilities as well (one column per target)
    if isinstance(preds_proba, list):
        preds_proba = np.array([ x[:,1] for x in preds_proba]).T

    print(f'--{type_data} - {model_name}--')
    print('--target: down, up--')
    print('--roc-auc--')
    print(roc_auc_score(y,preds_proba, average=None))
    print('--precision--')
    print(precision_score(y,preds, average=None))
    print('--recall--')
    print(recall_score(y,preds, average=None))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def data_processing_pipeline_classifier(features_base,features_to_drop = False, winsorizer_conf = False, discretize_columns = False,
                                        bins_discretize = 10, correlation = 0.85, fillna = True,
                                        pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'):
    """Assemble the preprocessing ``Pipeline`` in the order given by ``pipeline_order``.

    Every step is optional: a falsy configuration value leaves that step out
    even if its name appears in ``pipeline_order``.

    Args:
        features_base: columns handed to ``FeatureSelector`` ('selector').
        features_to_drop: columns handed to ``DropFeatures`` ('drop').
        winsorizer_conf: configuration for ``VirgoWinsorizerFeature`` ('winzorizer').
        discretize_columns: columns for ``EqualWidthDiscretiser`` ('discretizer').
        bins_discretize: number of bins used by the discretiser.
        correlation: threshold for ``DropCorrelatedFeatures`` (spearman) ('correlation').
        fillna: whether to add a ``MeanMedianImputer`` ('median_inputer').
        pipeline_order: step names separated by '//'.

    Returns:
        The assembled (unfitted) ``Pipeline``.

    Raises:
        ValueError: if ``pipeline_order`` names an unknown step.
    """
    pipe_dictionary = {
        'selector': [('selector', FeatureSelector(features_base))] if features_base else [],
        'winzorizer': [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else [],
        'drop': [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else [],
        'discretizer': [('discretize',EqualWidthDiscretiser(discretize_columns, bins = bins_discretize ))] if discretize_columns else [],
        'correlation': [('drop_corr', DropCorrelatedFeatures(threshold=correlation, method = 'spearman'))] if correlation else [],
        'median_inputer': [('median_imputer', MeanMedianImputer())] if fillna else [],
    }

    pipeline_steps = pipeline_order.split('//')
    # reject unknown step names before assembling anything
    for step in pipeline_steps:
        if step not in pipe_dictionary:
            raise ValueError(f'{step} step not in list of steps, the list is: {list(pipe_dictionary)}')

    pipeline_args = list(itertools.chain.from_iterable(pipe_dictionary[step] for step in pipeline_steps))
    return Pipeline(pipeline_args)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class ExpandingMultipleTimeSeriesKFold:
    """Expanding-window splitter: the training window grows, test windows may overlap.

    The splitter works on the distinct dates of ``df``'s 'Date_i' index level.
    The last ``window_size * number_window - (number_window - 1) * overlap_size``
    dates are reserved for testing and walked through in ``number_window``
    folds; each fold trains on every date before its test window.
    """

    def __init__(self, df, window_size = 100, number_window=3, overlap_size = 0):
        # df must carry 'Date_i' and 'i' among its index levels (checked in split)
        self.df = df
        self.number_window = number_window
        self.window_size = window_size
        self.overlap_size = overlap_size

    def split(self, X, y, groups=None):
        """Yield ``(train_index, test_index)`` for each fold.

        ``X``, ``y`` and ``groups`` are accepted for API compatibility but not
        used; folds are derived from ``self.df``. Both yielded objects hold the
        values of the 'i' index level of the selected rows.
        NOTE(review): presumably 'i' stores positional row numbers -- confirm.

        Raises:
            Exception: if the 'Date_i' / 'i' index levels are missing, if the
                overlap exceeds the window size, or if the test windows leave
                no date available for training.
        """
        index_names = self.df.index.names
        if 'Date_i' not in index_names or 'i' not in index_names:
            raise Exception('no date and/or index in the index dataframe')

        if self.overlap_size > self.window_size:
            raise Exception('overlap can not be higher than the window size')

        # read the index levels once instead of re-slicing the frame in every fold
        date_level = self.df.index.get_level_values('Date_i')
        row_level = self.df.index.get_level_values('i')
        unique_dates = sorted(date_level.unique())

        total_test_size = self.window_size * self.number_window
        total_test_size = total_test_size - (self.number_window - 1)*self.overlap_size

        # '>=': when the test windows cover every date the first fold would have
        # an empty training set
        if total_test_size >= len(unique_dates):
            raise Exception('test size must be lower than the data length to leave dates for training')

        cut = total_test_size
        for fold in range(self.number_window):
            topcut = cut-self.window_size
            train_dates = unique_dates[:-cut]
            # a stop of -0 would yield an empty slice, so the last window is open-ended
            test_dates = unique_dates[-cut:-topcut] if topcut != 0 else unique_dates[-cut:]

            # unique_dates is sorted, so the extremes sit at the ends of each slice
            max_train_date = train_dates[-1]
            min_test_date, max_test_date = test_dates[0], test_dates[-1]

            cut = cut - (self.window_size - self.overlap_size)

            train_index = row_level[date_level <= max_train_date]
            test_index = row_level[(date_level >= min_test_date) & (date_level <= max_test_date)]

            yield train_index, test_index

    def get_n_splits(self, X, y, groups=None):
        """Return the number of folds (``number_window``)."""
        return self.number_window
|
virgo_modules/src/re_utils.py
CHANGED
|
@@ -404,7 +404,7 @@ def rank_by_return(data, lag_days, top_n = 5):
|
|
|
404
404
|
|
|
405
405
|
return result
|
|
406
406
|
|
|
407
|
-
def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_available: object = False) -> object:
|
|
407
|
+
def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_available: object = False, data_window:str = '5y') -> object:
|
|
408
408
|
"""
|
|
409
409
|
this functions runs the stock_eda_panel
|
|
410
410
|
it is shared between train model and predictions
|
|
@@ -417,103 +417,84 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
|
|
|
417
417
|
returns: stock eda panel
|
|
418
418
|
"""
|
|
419
419
|
|
|
420
|
-
object_stock = stock_eda_panel(ticker_name , n_days )
|
|
420
|
+
object_stock = stock_eda_panel(ticker_name , n_days, data_window)
|
|
421
421
|
object_stock.get_data()
|
|
422
422
|
|
|
423
423
|
# computing features if they exist in the ticket settings
|
|
424
424
|
|
|
425
425
|
if 'volatility' in ticket_settings['settings']:
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
trad_days = ticket_settings['settings']['volatility']['trad_days'],
|
|
429
|
-
window_log_return = ticket_settings['settings']['volatility']['window_log_return']
|
|
430
|
-
)
|
|
426
|
+
parameters = ticket_settings['settings']['volatility']
|
|
427
|
+
object_stock.volatility_analysis(**parameters)
|
|
431
428
|
|
|
432
429
|
if 'outlier' in ticket_settings['settings']:
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
430
|
+
parameters = ticket_settings['settings']['outlier']
|
|
431
|
+
object_stock.outlier_plot(**parameters)
|
|
432
|
+
|
|
433
|
+
## for now this is hard coded
|
|
434
|
+
feature_map = {
|
|
435
|
+
'spread_ma':'spread_MA', # deprecated
|
|
436
|
+
'relative_spread_ma':'relative_spread_MA',
|
|
437
|
+
'pair_feature':'pair_feature',
|
|
438
|
+
'count_features':'get_count_feature', # deprecated
|
|
439
|
+
'bidirect_count_features':'bidirect_count_feature',
|
|
440
|
+
'price_range':'get_range_feature', # deprecated
|
|
441
|
+
'relative_price_range':'get_relative_range_feature',
|
|
442
|
+
'rsi_feature':'rsi_feature', # deprecated
|
|
443
|
+
'rsi_feature_v2':'rsi_feature_improved',
|
|
444
|
+
'days_features':'days_features', # deprecated
|
|
445
|
+
'days_features_v2':'days_features_bands',
|
|
446
|
+
'volume_feature':'analysis_volume', ## this may crash but deprecated
|
|
447
|
+
'smooth_volume':'analysis_smooth_volume',
|
|
448
|
+
'roc_feature':'roc_feature',
|
|
449
|
+
'stoch_feature':'stoch_feature',
|
|
450
|
+
'stochastic_feature':'stochastic_feature',
|
|
451
|
+
'william_feature':'william_feature',
|
|
452
|
+
'vortex_feature':'vortex_feature',
|
|
453
|
+
'pair_index_feature':'pair_index_feature' # this has a diff structure!
|
|
454
|
+
}
|
|
455
|
+
exceptions = ['pair_feature','pair_index_feature']
|
|
456
|
+
### standard features
|
|
457
|
+
for feature in feature_map.keys():
|
|
458
|
+
if (feature in ticket_settings['settings']) and (feature not in exceptions):
|
|
459
|
+
parameters = ticket_settings['settings'][feature]
|
|
460
|
+
method_to_use = feature_map.get(feature)
|
|
461
|
+
getattr(object_stock, method_to_use)(**parameters)
|
|
462
|
+
|
|
463
|
+
## special features
|
|
448
464
|
if 'pair_feature' in ticket_settings['settings']:
|
|
449
465
|
object_stock.pair_feature(pair_symbol = ticket_settings['settings']['pair_feature']['pair_symbol'])
|
|
450
466
|
object_stock.produce_pair_score_plot(
|
|
451
467
|
window = ticket_settings['settings']['pair_feature']['window'],
|
|
452
468
|
z_threshold = ticket_settings['settings']['pair_feature']['z_threshold']
|
|
453
|
-
)
|
|
454
|
-
|
|
455
|
-
if 'count_features' in ticket_settings['settings']:
|
|
456
|
-
object_stock.get_count_feature(
|
|
457
|
-
rolling_window = ticket_settings['settings']['count_features']['rolling_window'],
|
|
458
|
-
threshold = ticket_settings['settings']['count_features']['threshold']
|
|
459
|
-
)
|
|
460
|
-
|
|
461
|
-
if 'bidirect_count_features' in ticket_settings['settings']:
|
|
462
|
-
object_stock.bidirect_count_feature(
|
|
463
|
-
rolling_window = ticket_settings['settings']['bidirect_count_features']['rolling_window'],
|
|
464
|
-
threshold = ticket_settings['settings']['bidirect_count_features']['threshold']
|
|
465
|
-
)
|
|
469
|
+
)
|
|
466
470
|
|
|
467
|
-
if '
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
)
|
|
473
|
-
|
|
474
|
-
if 'relative_price_range' in ticket_settings['settings']:
|
|
475
|
-
object_stock.get_relative_range_feature(
|
|
476
|
-
window = ticket_settings['settings']['relative_price_range']['window'],
|
|
477
|
-
threshold = ticket_settings['settings']['relative_price_range']['threshold']
|
|
478
|
-
)
|
|
479
|
-
|
|
480
|
-
if 'rsi_feature' in ticket_settings['settings']:
|
|
481
|
-
object_stock.rsi_feature(
|
|
482
|
-
window = ticket_settings['settings']['rsi_feature']['window'],
|
|
483
|
-
lag_rsi_ret = ticket_settings['settings']['rsi_feature']['lag_rsi_ret'],
|
|
484
|
-
threshold = ticket_settings['settings']['rsi_feature']['threshold']
|
|
485
|
-
)
|
|
471
|
+
if 'pair_index_feature' in ticket_settings['settings']:
|
|
472
|
+
for group_feature in ticket_settings['settings']['pair_index_feature']:
|
|
473
|
+
key = list(group_feature.keys())[0]
|
|
474
|
+
parameters = group_feature[key]
|
|
475
|
+
method_to_use = feature_map.get('pair_index_feature')
|
|
476
|
+
getattr(object_stock, method_to_use)(**parameters)
|
|
486
477
|
|
|
487
|
-
if '
|
|
488
|
-
object_stock.rsi_feature_improved(
|
|
489
|
-
window = ticket_settings['settings']['rsi_feature_v2']['window'],
|
|
490
|
-
threshold = ticket_settings['settings']['rsi_feature_v2']['threshold']
|
|
491
|
-
)
|
|
478
|
+
if 'target_lasts' in ticket_settings['settings']:
|
|
492
479
|
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
if 'smooth_volume' in ticket_settings['settings']:
|
|
513
|
-
object_stock.analysis_smooth_volume(
|
|
514
|
-
window = ticket_settings['settings']['smooth_volume']['window'],
|
|
515
|
-
threshold = ticket_settings['settings']['smooth_volume']['threshold']
|
|
516
|
-
)
|
|
480
|
+
type_target = ticket_settings['settings']['target_lasts']['type']
|
|
481
|
+
params = {k:v for k,v in ticket_settings['settings']['target_lasts'].items() if k != 'type'}
|
|
482
|
+
|
|
483
|
+
if 'classification' == type_target:
|
|
484
|
+
object_stock.get_categorical_targets(**params)
|
|
485
|
+
|
|
486
|
+
elif 'regression' == type_target:
|
|
487
|
+
object_stock.get_targets(**params)
|
|
488
|
+
|
|
489
|
+
del params
|
|
490
|
+
del type_target
|
|
491
|
+
|
|
492
|
+
## searching discrete signals and orders
|
|
493
|
+
discrete_signals = [x for x in ticket_settings['signals'] if 'discrete' in x]
|
|
494
|
+
discrete_features = [x.replace('discrete_signal_', '') for x in discrete_signals]
|
|
495
|
+
if len(discrete_features) > 0:
|
|
496
|
+
for feature_name in discrete_features:
|
|
497
|
+
object_stock.produce_order_features(feature_name)
|
|
517
498
|
|
|
518
499
|
if hmm_available:
|
|
519
500
|
object_stock.cluster_hmm_analysis( n_clusters = None,
|
|
@@ -521,10 +502,11 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
|
|
|
521
502
|
test_data_size = None,
|
|
522
503
|
seed = None, model = hmm_available)
|
|
523
504
|
else:
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
505
|
+
if 'hmm' in ticket_settings['settings']:
|
|
506
|
+
object_stock.cluster_hmm_analysis( n_clusters = ticket_settings['settings']['hmm']['n_clusters'],
|
|
507
|
+
features_hmm = ticket_settings['settings']['hmm']['features_hmm'],
|
|
508
|
+
test_data_size = ticket_settings['settings']['hmm']['test_data_size'],
|
|
509
|
+
seed = ticket_settings['settings']['hmm']['seed'])
|
|
528
510
|
|
|
529
511
|
return object_stock
|
|
530
512
|
|
|
@@ -578,6 +560,7 @@ def call_ml_objects(stock_code, client, call_models = False):
|
|
|
578
560
|
ticker_name= stock_code,
|
|
579
561
|
ticket_settings = ticket_settings,
|
|
580
562
|
n_days = ticket_settings['settings']['general']['n_days'],
|
|
563
|
+
data_window = ticket_settings['settings']['general'].get('data_window','5y'),
|
|
581
564
|
hmm_available = hmm_model
|
|
582
565
|
)
|
|
583
566
|
### applying kalman
|
|
@@ -26,7 +26,8 @@ import statsmodels.api as sm
|
|
|
26
26
|
|
|
27
27
|
import scipy.stats as stats
|
|
28
28
|
|
|
29
|
-
from ta.momentum import RSIIndicator
|
|
29
|
+
from ta.momentum import RSIIndicator, ROCIndicator, StochRSIIndicator,StochasticOscillator, WilliamsRIndicator
|
|
30
|
+
from ta.trend import VortexIndicator
|
|
30
31
|
|
|
31
32
|
import warnings
|
|
32
33
|
warnings.filterwarnings('ignore')
|
|
@@ -44,14 +45,27 @@ from itertools import combinations, chain
|
|
|
44
45
|
from feature_engine.encoding import OneHotEncoder
|
|
45
46
|
from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
|
|
46
47
|
from feature_engine.timeseries.forecasting import LagFeatures
|
|
47
|
-
from feature_engine.imputation import
|
|
48
|
-
from feature_engine.discretisation import
|
|
48
|
+
from feature_engine.imputation import MeanMedianImputer
|
|
49
|
+
from feature_engine.discretisation import EqualWidthDiscretiser
|
|
49
50
|
|
|
50
51
|
from .aws_utils import upload_file_to_aws
|
|
51
|
-
import pickle
|
|
52
52
|
|
|
53
53
|
import logging
|
|
54
54
|
|
|
55
|
+
class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
    """Clip configured columns to fixed lower / upper bounds.

    ``feature_configs`` maps a column name to a mapping with 'min' and 'max'
    entries, e.g. ``{'RSI': {'min': 0, 'max': 100}}``.
    """

    def __init__(self, feature_configs):
        self.feature_configs = feature_configs

    def fit(self, X, y=None):
        """Nothing is learned from the data; the bounds come from the configuration."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with every configured column clipped to its bounds."""
        # work on a copy so the caller's frame is not modified as a side effect
        X = X.copy()
        for feature in self.feature_configs:
            bounds = self.feature_configs[feature]
            lower = bounds['min']
            upper = bounds['max']
            X[feature] = np.where( lower > X[feature], lower, X[feature])
            X[feature] = np.where( upper < X[feature], upper, X[feature])
        return X
|
|
68
|
+
|
|
55
69
|
class FeatureSelector(BaseEstimator, TransformerMixin):
|
|
56
70
|
def __init__(self, columns):
|
|
57
71
|
self.columns = columns
|
|
@@ -152,12 +166,13 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
|
|
|
152
166
|
|
|
153
167
|
class stock_eda_panel(object):
|
|
154
168
|
|
|
155
|
-
def __init__(self, stock_code, n_days):
|
|
169
|
+
def __init__(self, stock_code, n_days, data_window = '5y'):
|
|
156
170
|
self.stock_code = stock_code
|
|
157
171
|
self.n_days = n_days
|
|
158
172
|
self.today = datetime.date.today()
|
|
159
173
|
self.features = list()
|
|
160
174
|
self.signals = list()
|
|
175
|
+
self.data_window = data_window
|
|
161
176
|
|
|
162
177
|
def augmented_dickey_fuller_statistics(self,time_series, label):
|
|
163
178
|
result = adfuller(time_series.dropna().values)
|
|
@@ -168,8 +183,7 @@ class stock_eda_panel(object):
|
|
|
168
183
|
begin_date_str = begin_date.strftime('%Y-%m-%d')
|
|
169
184
|
|
|
170
185
|
stock = yf.Ticker(self.stock_code)
|
|
171
|
-
|
|
172
|
-
df = stock.history(period='5y')
|
|
186
|
+
df = stock.history(period=self.data_window)
|
|
173
187
|
|
|
174
188
|
df = df.sort_values('Date')
|
|
175
189
|
df.reset_index(inplace=True)
|
|
@@ -177,7 +191,12 @@ class stock_eda_panel(object):
|
|
|
177
191
|
df['Date'] = pd.to_datetime(df['Date'])
|
|
178
192
|
|
|
179
193
|
df = df[df.Date >= begin_date_str ]
|
|
180
|
-
self.settings_general = {
|
|
194
|
+
self.settings_general = {
|
|
195
|
+
'n_days':self.n_days,
|
|
196
|
+
'begin_date':begin_date_str,
|
|
197
|
+
'data_window': self.data_window,
|
|
198
|
+
'execution_date': self.today.strftime('%Y-%m-%d')
|
|
199
|
+
}
|
|
181
200
|
self.df = df
|
|
182
201
|
|
|
183
202
|
### cleaning volume
|
|
@@ -226,8 +245,6 @@ class stock_eda_panel(object):
|
|
|
226
245
|
df["lower"] = df['Close_roll_mean'] - df["Close_roll_std"]*2
|
|
227
246
|
|
|
228
247
|
df = df[df.Date >= begin_date_str ]
|
|
229
|
-
self.settings_general = {'n_days':self.n_days, 'begin_date':begin_date_str}
|
|
230
|
-
self.df = df
|
|
231
248
|
|
|
232
249
|
fig = make_subplots(rows=1, cols=1,vertical_spacing = 0.1,shared_xaxes=True,
|
|
233
250
|
subplot_titles=(
|
|
@@ -453,6 +470,26 @@ class stock_eda_panel(object):
|
|
|
453
470
|
|
|
454
471
|
self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
|
|
455
472
|
self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
|
|
473
|
+
|
|
474
|
+
def signal_plotter(self, feature_name):
    """Draw a three-panel figure for ``feature_name``.

    Panel 0 shows the ``norm_`` series with its ``upper_`` / ``lower_`` bands,
    panel 1 the autocorrelation and panel 2 the partial autocorrelation of the
    raw feature column (25 lags, missing values dropped).
    """
    fig, axs = plt.subplots(1, 3,figsize=(17,5))
    band_ax, acf_ax, pacf_ax = axs[0], axs[1], axs[2]

    # bands first, then the normalised series on top
    for band_column in (f'upper_{feature_name}', f'lower_{feature_name}'):
        band_ax.plot(self.df[band_column],color = 'grey', linestyle='--')
    band_ax.plot(self.df[f'norm_{feature_name}'])

    plot_acf(self.df[feature_name].dropna(),lags=25,ax = acf_ax)
    acf_ax.set_title(f'acf {feature_name}')

    plot_pacf(self.df[feature_name].dropna(),lags=25,ax = pacf_ax)
    pacf_ax.set_title(f'pacf {feature_name}')

    fig.show()
|
|
488
|
+
|
|
489
|
+
def log_features_standard(self, feature_name):
|
|
490
|
+
self.features.append(feature_name)
|
|
491
|
+
self.signals.append(f'signal_up_{feature_name}')
|
|
492
|
+
self.signals.append(f'signal_low_{feature_name}')
|
|
456
493
|
|
|
457
494
|
#######################
|
|
458
495
|
#### to be deprecated ####
|
|
@@ -535,26 +572,12 @@ class stock_eda_panel(object):
|
|
|
535
572
|
|
|
536
573
|
print('--------------------------------------------------------------------')
|
|
537
574
|
if save_features:
|
|
538
|
-
self.
|
|
539
|
-
self.signals.append(f'signal_low_{feature_name}')
|
|
540
|
-
self.signals.append(f'signal_up_{feature_name}')
|
|
575
|
+
self.log_features_standard(feature_name)
|
|
541
576
|
self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
|
|
542
577
|
|
|
543
578
|
if plot:
|
|
544
579
|
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
axs[0].plot(self.df['Date'],self.df[f'norm_{feature_name}'])
|
|
548
|
-
axs[0].plot(self.df['Date'],self.df[f'upper_{feature_name}'], linestyle='--')
|
|
549
|
-
axs[0].plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
|
|
550
|
-
axs[0].set_title('rel_MA_spread series')
|
|
551
|
-
|
|
552
|
-
plot_acf(self.df[feature_name].dropna(),lags=25, ax=axs[1])
|
|
553
|
-
axs[1].set_title('acf rel_MA_spread series')
|
|
554
|
-
|
|
555
|
-
plot_pacf(self.df[feature_name].dropna(),lags=25, ax=axs[2])
|
|
556
|
-
axs[2].set_title('acf rel_MA_spread series')
|
|
557
|
-
plt.show()
|
|
580
|
+
self.signal_plotter(feature_name)
|
|
558
581
|
|
|
559
582
|
def pair_feature(self, pair_symbol, plot = False):
|
|
560
583
|
self.pair_symbol = pair_symbol
|
|
@@ -562,8 +585,7 @@ class stock_eda_panel(object):
|
|
|
562
585
|
begin_date_str = begin_date.strftime('%Y-%m-%d')
|
|
563
586
|
|
|
564
587
|
stock = yf.Ticker(self.pair_symbol)
|
|
565
|
-
|
|
566
|
-
df = stock.history(period='5y')
|
|
588
|
+
df = stock.history(period=self.data_window)
|
|
567
589
|
df = df.sort_values('Date')
|
|
568
590
|
df.reset_index(inplace=True)
|
|
569
591
|
df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
|
|
@@ -622,9 +644,7 @@ class stock_eda_panel(object):
|
|
|
622
644
|
self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
|
|
623
645
|
|
|
624
646
|
if save_features:
|
|
625
|
-
self.
|
|
626
|
-
self.signals.append('signal_low_pair_z_score')
|
|
627
|
-
self.signals.append('signal_up_pair_z_score')
|
|
647
|
+
self.log_features_standard('pair_z_score')
|
|
628
648
|
self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
|
|
629
649
|
|
|
630
650
|
if plot:
|
|
@@ -701,9 +721,7 @@ class stock_eda_panel(object):
|
|
|
701
721
|
self.compute_clip_bands(feature_name,threshold)
|
|
702
722
|
|
|
703
723
|
if save_features:
|
|
704
|
-
self.
|
|
705
|
-
self.signals.append(f'signal_up_{feature_name}')
|
|
706
|
-
self.signals.append(f'signal_low_{feature_name}')
|
|
724
|
+
self.log_features_standard(feature_name)
|
|
707
725
|
self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
|
|
708
726
|
|
|
709
727
|
if plot:
|
|
@@ -768,9 +786,7 @@ class stock_eda_panel(object):
|
|
|
768
786
|
self.df[f'signal_low_{feature_name}'] = np.where(self.df[f'norm_{feature_name}'] < self.df[f'low_bound_norm_{feature_name}'],1,0 )
|
|
769
787
|
|
|
770
788
|
if save_features:
|
|
771
|
-
self.
|
|
772
|
-
self.signals.append(f'signal_up_{feature_name}')
|
|
773
|
-
self.signals.append(f'signal_low_{feature_name}')
|
|
789
|
+
self.log_features_standard(feature_name)
|
|
774
790
|
self.settings_relative_price_range = {'window':window, 'threshold':threshold}
|
|
775
791
|
|
|
776
792
|
if plot:
|
|
@@ -827,25 +843,11 @@ class stock_eda_panel(object):
|
|
|
827
843
|
self.compute_clip_bands(feature_name,threshold)
|
|
828
844
|
|
|
829
845
|
if save_features:
|
|
830
|
-
self.
|
|
831
|
-
self.signals.append(f'signal_up_{feature_name}')
|
|
832
|
-
self.signals.append(f'signal_low_{feature_name}')
|
|
846
|
+
self.log_features_standard(feature_name)
|
|
833
847
|
self.settings_rsi_feature_v2 = {'window':window, 'threshold':threshold}
|
|
834
848
|
|
|
835
849
|
if plot:
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
|
|
839
|
-
axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
|
|
840
|
-
axs[0].plot(self.df[f'norm_{feature_name}'])
|
|
841
|
-
|
|
842
|
-
plot_acf(self.df['RSI'].dropna(),lags=25,ax = axs[1])
|
|
843
|
-
axs[1].set_title('acf RSI')
|
|
844
|
-
|
|
845
|
-
plot_pacf(self.df['RSI'].dropna(),lags=25,ax = axs[2])
|
|
846
|
-
axs[2].set_title('pacf RSI')
|
|
847
|
-
|
|
848
|
-
fig.show()
|
|
850
|
+
self.signal_plotter(feature_name)
|
|
849
851
|
|
|
850
852
|
#######################
|
|
851
853
|
#### to be deprecated ####
|
|
@@ -905,25 +907,11 @@ class stock_eda_panel(object):
|
|
|
905
907
|
|
|
906
908
|
if save_features:
|
|
907
909
|
|
|
908
|
-
self.
|
|
909
|
-
self.signals.append(f'signal_up_{feature_name}')
|
|
910
|
-
self.signals.append(f'signal_low_{feature_name}')
|
|
910
|
+
self.log_features_standard(feature_name)
|
|
911
911
|
self.settings_days_features_v2 = {'window':window, 'threshold':threshold}
|
|
912
912
|
|
|
913
913
|
if plot:
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
axs[0].plot(self.df[f'norm_{feature_name}'])
|
|
917
|
-
axs[0].plot(self.df[f'upper_{feature_name}'], linestyle='--')
|
|
918
|
-
axs[0].plot(self.df[f'lower_{feature_name}'], linestyle='--')
|
|
919
|
-
|
|
920
|
-
plot_acf(self.df[f'norm_{feature_name}'].dropna(),lags=25,ax = axs[1])
|
|
921
|
-
axs[1].set_title('acf day feature')
|
|
922
|
-
|
|
923
|
-
plot_pacf(self.df[f'norm_{feature_name}'].dropna(),lags=25,ax = axs[2])
|
|
924
|
-
axs[2].set_title('pacf day feature')
|
|
925
|
-
|
|
926
|
-
fig.show()
|
|
914
|
+
self.signal_plotter(feature_name)
|
|
927
915
|
|
|
928
916
|
#######################
|
|
929
917
|
#### to be deprecated ####
|
|
@@ -996,9 +984,7 @@ class stock_eda_panel(object):
|
|
|
996
984
|
self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'z_{feature_name}'] > threshold ), 1, 0)
|
|
997
985
|
|
|
998
986
|
if save_features:
|
|
999
|
-
self.
|
|
1000
|
-
self.signals.append(f'signal_up_{feature_name}')
|
|
1001
|
-
self.signals.append(f'signal_low_{feature_name}')
|
|
987
|
+
self.log_features_standard(feature_name)
|
|
1002
988
|
self.settings_smooth_volume = {'window':window, 'threshold':threshold}
|
|
1003
989
|
if plot:
|
|
1004
990
|
fig, axs = plt.subplots(2, 2,figsize=(11,6))
|
|
@@ -1025,6 +1011,138 @@ class stock_eda_panel(object):
|
|
|
1025
1011
|
axs[1].set_title(f'z_{feature_name}')
|
|
1026
1012
|
|
|
1027
1013
|
plt.show()
|
|
1014
|
+
|
|
1015
|
+
def roc_feature(self, window, threshold, plot = False, save_features = False):
|
|
1016
|
+
feature_name = 'ROC'
|
|
1017
|
+
roc = ROCIndicator(close = self.df['Close'], window = window).roc()
|
|
1018
|
+
self.df[feature_name] = roc
|
|
1019
|
+
self.compute_clip_bands(feature_name,threshold)
|
|
1020
|
+
|
|
1021
|
+
if save_features:
|
|
1022
|
+
self.log_features_standard(feature_name)
|
|
1023
|
+
self.settings_roc_feature = {'window':window, 'threshold':threshold}
|
|
1024
|
+
if plot:
|
|
1025
|
+
self.signal_plotter(feature_name)
|
|
1026
|
+
|
|
1027
|
+
def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
|
|
1028
|
+
feature_name = 'STOCH'
|
|
1029
|
+
stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
|
|
1030
|
+
self.df[feature_name] = stoch
|
|
1031
|
+
self.compute_clip_bands(feature_name,threshold)
|
|
1032
|
+
|
|
1033
|
+
if save_features:
|
|
1034
|
+
self.log_features_standard(feature_name)
|
|
1035
|
+
self.settings_stoch_feature = {'window':window, 'smooth1':smooth1, 'smooth2':smooth2, 'threshold':threshold}
|
|
1036
|
+
if plot:
|
|
1037
|
+
self.signal_plotter(feature_name)
|
|
1038
|
+
|
|
1039
|
+
def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
|
|
1040
|
+
feature_name = 'STOCHOSC'
|
|
1041
|
+
stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
|
|
1042
|
+
self.df[feature_name] = stochast
|
|
1043
|
+
self.compute_clip_bands(feature_name,threshold)
|
|
1044
|
+
|
|
1045
|
+
if save_features:
|
|
1046
|
+
self.log_features_standard(feature_name)
|
|
1047
|
+
self.settings_stochastic_feature = {'window':window, 'smooth':smooth,'threshold':threshold}
|
|
1048
|
+
if plot:
|
|
1049
|
+
self.signal_plotter(feature_name)
|
|
1050
|
+
|
|
1051
|
+
def william_feature(self, lbp, threshold, plot = False, save_features = False):
|
|
1052
|
+
feature_name = 'WILL'
|
|
1053
|
+
will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
|
|
1054
|
+
self.df[feature_name] = will
|
|
1055
|
+
self.compute_clip_bands(feature_name,threshold)
|
|
1056
|
+
|
|
1057
|
+
if save_features:
|
|
1058
|
+
self.log_features_standard(feature_name)
|
|
1059
|
+
self.settings_william_feature = {'lbp':lbp,'threshold':threshold}
|
|
1060
|
+
if plot:
|
|
1061
|
+
self.signal_plotter(feature_name)
|
|
1062
|
+
|
|
1063
|
+
def vortex_feature(self, window, threshold, plot = False, save_features = False):
|
|
1064
|
+
feature_name = 'VORTEX'
|
|
1065
|
+
vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
|
|
1066
|
+
self.df[feature_name] = vortex
|
|
1067
|
+
self.compute_clip_bands(feature_name,threshold)
|
|
1068
|
+
|
|
1069
|
+
if save_features:
|
|
1070
|
+
self.log_features_standard(feature_name)
|
|
1071
|
+
self.settings_vortex_feature = {'window':window, 'threshold':threshold}
|
|
1072
|
+
if plot:
|
|
1073
|
+
self.signal_plotter(feature_name)
|
|
1074
|
+
|
|
1075
|
+
def pair_index_feature(self, pair_symbol, feature_label, window, threshold, plot = False, save_features = False):
|
|
1076
|
+
self.pair_index = pair_symbol
|
|
1077
|
+
begin_date = self.today - relativedelta(days = self.n_days)
|
|
1078
|
+
begin_date_str = begin_date.strftime('%Y-%m-%d')
|
|
1079
|
+
|
|
1080
|
+
if feature_label in self.df.columns:
|
|
1081
|
+
self.df = self.df.drop(columns = [feature_label])
|
|
1082
|
+
|
|
1083
|
+
stock = yf.Ticker(self.pair_index)
|
|
1084
|
+
df = stock.history(period=self.data_window)
|
|
1085
|
+
df = df.sort_values('Date')
|
|
1086
|
+
df.reset_index(inplace=True)
|
|
1087
|
+
df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
|
|
1088
|
+
df['Date'] = pd.to_datetime(df['Date'])
|
|
1089
|
+
df = df[df.Date >= begin_date_str ]
|
|
1090
|
+
self.pair_index_df = df
|
|
1091
|
+
|
|
1092
|
+
#### converting the same index ####
|
|
1093
|
+
dates_vector = self.df.Date.to_frame()
|
|
1094
|
+
self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
|
|
1095
|
+
self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
|
|
1096
|
+
self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
|
|
1097
|
+
|
|
1098
|
+
self.pair_index_df[feature_label] = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
|
|
1099
|
+
df_to_merge = self.pair_index_df[['Date',feature_label]]
|
|
1100
|
+
self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
|
|
1101
|
+
|
|
1102
|
+
########
|
|
1103
|
+
self.compute_clip_bands(feature_label,threshold)
|
|
1104
|
+
|
|
1105
|
+
if save_features:
|
|
1106
|
+
self.log_features_standard(feature_label)
|
|
1107
|
+
parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
|
|
1108
|
+
try:
|
|
1109
|
+
len(self.settings_pair_index_feature)
|
|
1110
|
+
print('existing')
|
|
1111
|
+
self.settings_pair_index_feature.append(parameters)
|
|
1112
|
+
except:
|
|
1113
|
+
print('creation')
|
|
1114
|
+
self.settings_pair_index_feature = list()
|
|
1115
|
+
self.settings_pair_index_feature.append(parameters)
|
|
1116
|
+
|
|
1117
|
+
if plot:
|
|
1118
|
+
self.signal_plotter(feature_label)
|
|
1119
|
+
|
|
1120
|
+
def produce_order_features(self, feature_name, save_features = False):
|
|
1121
|
+
|
|
1122
|
+
signal_feature_name = f'discrete_signal_{feature_name}'
|
|
1123
|
+
order_feature_name = f'order_signal_{feature_name}'
|
|
1124
|
+
|
|
1125
|
+
self.df[signal_feature_name] = np.where(
|
|
1126
|
+
self.df[f'signal_up_{feature_name}'] == 1,1,
|
|
1127
|
+
np.where(
|
|
1128
|
+
self.df[f'signal_low_{feature_name}'] == 1,-1,0
|
|
1129
|
+
)
|
|
1130
|
+
)
|
|
1131
|
+
|
|
1132
|
+
## indexing chains
|
|
1133
|
+
self.df[f'lag_{signal_feature_name}'] = self.df[signal_feature_name].shift(1)
|
|
1134
|
+
self.df['breack'] = np.where(self.df[f'lag_{signal_feature_name}'] != self.df[signal_feature_name],1,0)
|
|
1135
|
+
self.df["chain_id"] = self.df.groupby("breack")["Date"].rank(method="first", ascending=True)
|
|
1136
|
+
self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
|
|
1137
|
+
self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
|
|
1138
|
+
self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
|
|
1139
|
+
self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
|
|
1140
|
+
self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
|
|
1141
|
+
|
|
1142
|
+
## saving features
|
|
1143
|
+
if save_features:
|
|
1144
|
+
self.signals.append(signal_feature_name)
|
|
1145
|
+
self.signals.append(order_feature_name)
|
|
1028
1146
|
|
|
1029
1147
|
def create_hmm_derived_features(self, lag_returns):
|
|
1030
1148
|
|
|
@@ -1345,7 +1463,7 @@ class stock_eda_panel(object):
|
|
|
1345
1463
|
|
|
1346
1464
|
self.df[f'mean_target'] = self.df[columns].mean(axis=1)
|
|
1347
1465
|
self.target.append(f'mean_target')
|
|
1348
|
-
self.settings_target_lasts = {'steps':steps}
|
|
1466
|
+
self.settings_target_lasts = {'steps':steps, 'type':'regression'}
|
|
1349
1467
|
|
|
1350
1468
|
def get_categorical_targets(self, horizon, flor_loss, top_gain):
|
|
1351
1469
|
|
|
@@ -1377,7 +1495,7 @@ class stock_eda_panel(object):
|
|
|
1377
1495
|
self.targets.append('target_up')
|
|
1378
1496
|
self.targets.append('target_down')
|
|
1379
1497
|
|
|
1380
|
-
self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain}
|
|
1498
|
+
self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
|
|
1381
1499
|
|
|
1382
1500
|
def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
|
|
1383
1501
|
|
|
@@ -1400,69 +1518,22 @@ class stock_eda_panel(object):
|
|
|
1400
1518
|
self.settings['model_type'] = model_type
|
|
1401
1519
|
self.settings['target'] = list(set(self.target))
|
|
1402
1520
|
self.settings['targets'] = target_list
|
|
1403
|
-
|
|
1404
|
-
try:
|
|
1405
|
-
self.settings['settings']['spread_ma'] = self.settings_spread_ma ##to be deprecated
|
|
1406
|
-
except:
|
|
1407
|
-
pass
|
|
1408
|
-
try:
|
|
1409
|
-
self.settings['settings']['relative_spread_ma'] = self.settings_relative_spread_ma
|
|
1410
|
-
except:
|
|
1411
|
-
pass
|
|
1412
|
-
try:
|
|
1413
|
-
self.settings['settings']['pair_feature'] = self.settings_pair_feature
|
|
1414
|
-
except:
|
|
1415
|
-
pass
|
|
1416
|
-
try:
|
|
1417
|
-
self.settings['settings']['count_features'] = self.settings_count_features ##to be deprecated
|
|
1418
|
-
except:
|
|
1419
|
-
pass
|
|
1420
|
-
try:
|
|
1421
|
-
self.settings['settings']['bidirect_count_features'] = self.settings_bidirect_count_features
|
|
1422
|
-
except:
|
|
1423
|
-
pass
|
|
1424
|
-
try:
|
|
1425
|
-
self.settings['settings']['price_range'] = self.settings_price_range ##to be deprecated
|
|
1426
|
-
except:
|
|
1427
|
-
pass
|
|
1428
|
-
try:
|
|
1429
|
-
self.settings['settings']['relative_price_range'] = self.settings_relative_price_range
|
|
1430
|
-
except:
|
|
1431
|
-
pass
|
|
1432
|
-
try:
|
|
1433
|
-
self.settings['settings']['rsi_feature'] = self.settings_rsi_feature ##to be deprecated
|
|
1434
|
-
except:
|
|
1435
|
-
pass
|
|
1436
|
-
try:
|
|
1437
|
-
self.settings['settings']['rsi_feature_v2'] = self.settings_rsi_feature_v2
|
|
1438
|
-
except:
|
|
1439
|
-
pass
|
|
1440
|
-
try:
|
|
1441
|
-
self.settings['settings']['days_features'] = self.settings_days_features ##to be deprecated
|
|
1442
|
-
except:
|
|
1443
|
-
pass
|
|
1444
|
-
try:
|
|
1445
|
-
self.settings['settings']['days_features_v2'] = self.settings_days_features_v2
|
|
1446
|
-
except:
|
|
1447
|
-
pass
|
|
1448
1521
|
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
self.settings['settings']['hmm'] = self.settings_hmm
|
|
1460
|
-
except:
|
|
1461
|
-
pass
|
|
1522
|
+
## for now this is hard coded
|
|
1523
|
+
feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
|
|
1524
|
+
'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
|
|
1525
|
+
'william_feature', 'vortex_feature', 'pair_index_feature','hmm']
|
|
1526
|
+
|
|
1527
|
+
for feature in feature_list:
|
|
1528
|
+
try:
|
|
1529
|
+
self.settings['settings'][feature] = getattr(self, f'settings_{feature}')
|
|
1530
|
+
except:
|
|
1531
|
+
pass
|
|
1462
1532
|
try:
|
|
1463
1533
|
self.settings['settings']['target_lasts'] = self.settings_target_lasts
|
|
1464
1534
|
except:
|
|
1465
1535
|
pass
|
|
1536
|
+
|
|
1466
1537
|
try:
|
|
1467
1538
|
self.settings['settings']['strategies'] = {
|
|
1468
1539
|
'best_strategy':self.best_strategy,
|
|
@@ -1853,10 +1924,11 @@ def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to
|
|
|
1853
1924
|
return best_result
|
|
1854
1925
|
|
|
1855
1926
|
class analyse_index(stock_eda_panel):
|
|
1856
|
-
def __init__(self, index, asset, n_obs, lag, show_plot = True, save_path = False, save_aws = False):
|
|
1927
|
+
def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False):
|
|
1857
1928
|
self.index = index
|
|
1858
1929
|
self.asset = asset
|
|
1859
1930
|
self.n_obs = n_obs
|
|
1931
|
+
self.data_window = data_window
|
|
1860
1932
|
self.lag = lag
|
|
1861
1933
|
|
|
1862
1934
|
self.show_plot = show_plot
|
|
@@ -1865,12 +1937,12 @@ class analyse_index(stock_eda_panel):
|
|
|
1865
1937
|
|
|
1866
1938
|
def process_data(self):
|
|
1867
1939
|
|
|
1868
|
-
index = stock_eda_panel(self.index, self.n_obs)
|
|
1940
|
+
index = stock_eda_panel(self.index, self.n_obs, self.data_window)
|
|
1869
1941
|
index.get_data()
|
|
1870
1942
|
index.df['shift'] = index.df.Close.shift(self.lag)
|
|
1871
1943
|
index.df['index_return'] = index.df.Close/index.df['shift'] - 1
|
|
1872
1944
|
|
|
1873
|
-
asset = stock_eda_panel(self.asset, self.n_obs)
|
|
1945
|
+
asset = stock_eda_panel(self.asset, self.n_obs, self.data_window)
|
|
1874
1946
|
asset.get_data()
|
|
1875
1947
|
asset.df['shift'] = asset.df.Close.shift(self.lag)
|
|
1876
1948
|
asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
|
|
@@ -1,36 +1,37 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: virgo-modules
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.6
|
|
4
4
|
Summary: data processing and statistical modeling using stock market data
|
|
5
5
|
Home-page: https://github.com/miguelmayhem92/virgo_module
|
|
6
6
|
Author: Miguel Mayhuire
|
|
7
7
|
Author-email: miguelmayhem92@gmail.com
|
|
8
8
|
License: MIT
|
|
9
|
+
Platform: UNKNOWN
|
|
9
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
10
11
|
Classifier: Programming Language :: Python :: 3.9
|
|
11
12
|
Classifier: Operating System :: OS Independent
|
|
12
13
|
Requires-Python: >=3.9, <3.10
|
|
13
14
|
Description-Content-Type: text/markdown
|
|
14
15
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: feature-engine ==1.6.1
|
|
16
|
-
Requires-Dist: matplotlib ==3.6.3
|
|
17
|
-
Requires-Dist: mlflow ==2.1.1
|
|
18
|
-
Requires-Dist: numpy ==1.23.5
|
|
19
|
-
Requires-Dist: optuna ==3.1.0
|
|
20
|
-
Requires-Dist: pandas ==1.5.3
|
|
21
|
-
Requires-Dist: plotly ==5.15.0
|
|
22
|
-
Requires-Dist: rsa ==4.9
|
|
23
|
-
Requires-Dist: scikit-learn ==1.2.1
|
|
24
|
-
Requires-Dist: scipy ==1.10.0
|
|
25
|
-
Requires-Dist: seaborn ==0.12.2
|
|
26
|
-
Requires-Dist: starlette ==0.22.0
|
|
27
|
-
Requires-Dist: statsmodels ==0.13.5
|
|
28
|
-
Requires-Dist: ta ==0.10.2
|
|
29
|
-
Requires-Dist: yfinance ==0.2.9
|
|
30
|
-
Requires-Dist: hmmlearn ==0.3.0
|
|
16
|
+
Requires-Dist: feature-engine (==1.6.1)
|
|
17
|
+
Requires-Dist: matplotlib (==3.6.3)
|
|
18
|
+
Requires-Dist: mlflow (==2.1.1)
|
|
19
|
+
Requires-Dist: numpy (==1.23.5)
|
|
20
|
+
Requires-Dist: optuna (==3.1.0)
|
|
21
|
+
Requires-Dist: pandas (==1.5.3)
|
|
22
|
+
Requires-Dist: plotly (==5.15.0)
|
|
23
|
+
Requires-Dist: rsa (==4.9)
|
|
24
|
+
Requires-Dist: scikit-learn (==1.2.1)
|
|
25
|
+
Requires-Dist: scipy (==1.10.0)
|
|
26
|
+
Requires-Dist: seaborn (==0.12.2)
|
|
27
|
+
Requires-Dist: starlette (==0.22.0)
|
|
28
|
+
Requires-Dist: statsmodels (==0.13.5)
|
|
29
|
+
Requires-Dist: ta (==0.10.2)
|
|
30
|
+
Requires-Dist: yfinance (==0.2.9)
|
|
31
|
+
Requires-Dist: hmmlearn (==0.3.0)
|
|
31
32
|
Requires-Dist: boto3
|
|
32
33
|
Provides-Extra: dev
|
|
33
|
-
Requires-Dist: pytest >=7.0 ; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest (>=7.0) ; extra == 'dev'
|
|
34
35
|
|
|
35
36
|
# Virgo Package
|
|
36
37
|
|
|
@@ -51,3 +52,4 @@ obj = stock_eda_panel(stock_code = 'PEP', n_days = 20)
|
|
|
51
52
|
obj.get_data()
|
|
52
53
|
print(obj.df.shape)
|
|
53
54
|
```
|
|
55
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
virgo_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
virgo_modules/src/aws_utils.py,sha256=toqSEgqMRiGzcGJIEjVuWN6WLhGj0eQ_n4zBr5CsNKA,1574
|
|
4
|
+
virgo_modules/src/edge_utils.py,sha256=gQxXO3h22-E5px50E8MW7-2Z7ykW_POSLIpiZwde7KI,7686
|
|
5
|
+
virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
|
|
6
|
+
virgo_modules/src/re_utils.py,sha256=VuXpmofgoeT1J86oHjL2G__syoGBmQMMacnwgVb4jGI,43814
|
|
7
|
+
virgo_modules/src/ticketer_source.py,sha256=9kq82PuNUUxuPiAtU_Hde_p12slORUmvYkLJqr62UBY,98719
|
|
8
|
+
virgo_modules-0.0.6.dist-info/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
|
|
9
|
+
virgo_modules-0.0.6.dist-info/METADATA,sha256=SRMv2E6Ee-EAtWMc9qsk-oJLlllpPEmT67bCwj4nVZ4,1483
|
|
10
|
+
virgo_modules-0.0.6.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
11
|
+
virgo_modules-0.0.6.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
|
|
12
|
+
virgo_modules-0.0.6.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
virgo_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
virgo_modules/src/aws_utils.py,sha256=zvCV_bfN8o8H3iSD-V_aYHtoKqXRD4Kt_T8HIji23WA,965
|
|
4
|
-
virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
|
|
5
|
-
virgo_modules/src/re_utils.py,sha256=pzgAGFXGKQfnOHly4lCr2Iq9iNTE_Ne2eWdR-eURjM4,44837
|
|
6
|
-
virgo_modules/src/ticketer_source.py,sha256=hVT2P6LD_h9GFExQfyNimxPuETsWaC-kuVU_qlomp7I,94431
|
|
7
|
-
virgo_modules-0.0.4.dist-info/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
|
|
8
|
-
virgo_modules-0.0.4.dist-info/METADATA,sha256=NM9Qv6XIMndQagQRRk7IX1Mz4E6KBWihI-Py1jnzSCw,1428
|
|
9
|
-
virgo_modules-0.0.4.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
|
10
|
-
virgo_modules-0.0.4.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
|
|
11
|
-
virgo_modules-0.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|