virgo-modules 0.0.3__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of virgo-modules has been flagged as possibly problematic.
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/PKG-INFO +4 -20
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/setup.py +1 -1
- virgo_modules-0.0.6/virgo_app/virgo_modules/src/aws_utils.py +38 -0
- virgo_modules-0.0.6/virgo_app/virgo_modules/src/edge_utils.py +181 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/re_utils.py +113 -88
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/ticketer_source.py +207 -135
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/PKG-INFO +4 -20
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/SOURCES.txt +1 -0
- virgo_modules-0.0.3/virgo_app/virgo_modules/src/aws_utils.py +0 -23
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/LICENSE +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/README.md +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/setup.cfg +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/__init__.py +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/__init__.py +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/pull_artifacts.py +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/dependency_links.txt +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/requires.txt +0 -0
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/top_level.txt +0 -0
{virgo_modules-0.0.3 → virgo_modules-0.0.6}/PKG-INFO

@@ -1,36 +1,19 @@
 Metadata-Version: 2.1
 Name: virgo_modules
-Version: 0.0.3
+Version: 0.0.6
 Summary: data processing and statistical modeling using stock market data
 Home-page: https://github.com/miguelmayhem92/virgo_module
 Author: Miguel Mayhuire
 Author-email: miguelmayhem92@gmail.com
 License: MIT
+Platform: UNKNOWN
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9, <3.10
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: feature-engine==1.6.1
-Requires-Dist: matplotlib==3.6.3
-Requires-Dist: mlflow==2.1.1
-Requires-Dist: numpy==1.23.5
-Requires-Dist: optuna==3.1.0
-Requires-Dist: pandas==1.5.3
-Requires-Dist: plotly==5.15.0
-Requires-Dist: rsa==4.9
-Requires-Dist: scikit-learn==1.2.1
-Requires-Dist: scipy==1.10.0
-Requires-Dist: seaborn==0.12.2
-Requires-Dist: starlette==0.22.0
-Requires-Dist: statsmodels==0.13.5
-Requires-Dist: ta==0.10.2
-Requires-Dist: yfinance==0.2.9
-Requires-Dist: hmmlearn==0.3.0
-Requires-Dist: boto3
 Provides-Extra: dev
-
+License-File: LICENSE
 
 # Virgo Package
 
@@ -51,3 +34,4 @@ obj = stock_eda_panel(stock_code = 'PEP', n_days = 20)
 obj.get_data()
 print(obj.df.shape)
 ```
+
{virgo_modules-0.0.3 → virgo_modules-0.0.6}/setup.py

@@ -5,7 +5,7 @@ with open("virgo_app/README.md", "r") as f:
 
 setup(
     name="virgo_modules",
-    version="0.0.3",
+    version="0.0.6",
     description="data processing and statistical modeling using stock market data",
     package_dir={"": "virgo_app"},
     packages=find_packages(where="virgo_app"),
virgo_modules-0.0.6/virgo_app/virgo_modules/src/aws_utils.py (new file)

@@ -0,0 +1,38 @@
+import yaml
+import boto3
+from pathlib import Path
+from io import StringIO, BytesIO
+import pandas as pd
+
+
+def upload_file_to_aws(bucket, key, input_path, secret_path = 'secrets.yaml'):
+
+    credentials = yaml.safe_load(Path(secret_path).read_text())
+    session = boto3.Session(aws_access_key_id=credentials['AWS_ACCESS_KEY_ID'], aws_secret_access_key=credentials['AWS_SECRET_ACCESS_KEY'])
+    bucket = credentials[bucket]
+    s3 = session.resource('s3')
+    s3.meta.client.upload_file(Filename=input_path, Bucket=bucket, Key=key)
+
+def upload_pandas_to_s3(data_frame, bucket, key, secret_path = 'secrets.yaml'):
+
+    csv_buffer = StringIO()
+    data_frame.to_csv(csv_buffer)
+    csv_buffer.seek(0)
+
+    credentials = yaml.safe_load(Path(secret_path).read_text())
+    s3 = boto3.client("s3", region_name=credentials['AWS_DEFAULT_REGION'], aws_access_key_id=credentials['AWS_ACCESS_KEY_ID'], aws_secret_access_key=credentials['AWS_SECRET_ACCESS_KEY'])
+    bucket = credentials[bucket]
+    s3.put_object(Bucket=bucket, Body=csv_buffer.getvalue(), Key=key)
+
+def download_file_to_aws(bucket, key, secret_path = 'secrets.yaml'):
+
+    credentials = yaml.safe_load(Path(secret_path).read_text())
+    s3c = boto3.client(
+        's3',
+        region_name = credentials['AWS_DEFAULT_REGION'],
+        aws_access_key_id = credentials['AWS_ACCESS_KEY_ID'],
+        aws_secret_access_key = credentials['AWS_SECRET_ACCESS_KEY']
+    )
+    obj = s3c.get_object(Bucket= bucket , Key = key)
+    df = pd.read_csv(BytesIO(obj['Body'].read()), encoding='utf8')
+    return df
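For orientation, a minimal usage sketch of the new helpers. The `secrets.yaml` layout shown is inferred from the keys the functions read (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_DEFAULT_REGION`, plus bucket-name entries such as `VIRGO_BUCKET`); it is an assumption, not documented package API.

```
# Assumed secrets.yaml layout (inferred from the keys read above):
#   AWS_ACCESS_KEY_ID: <key id>
#   AWS_SECRET_ACCESS_KEY: <secret>
#   AWS_DEFAULT_REGION: us-east-1
#   VIRGO_BUCKET: my-bucket-name
import pandas as pd
from virgo_modules.src.aws_utils import upload_pandas_to_s3, download_file_to_aws

frame = pd.DataFrame({'Close': [1.00, 1.10, 1.05]})
# 'VIRGO_BUCKET' is a lookup key into secrets.yaml, not the literal bucket name
upload_pandas_to_s3(frame, bucket='VIRGO_BUCKET', key='data/close.csv')
# download_file_to_aws skips that lookup and expects the literal bucket name
frame_back = download_file_to_aws(bucket='my-bucket-name', key='data/close.csv')
```

Note the asymmetry visible in the diff: the two upload helpers resolve `bucket` through `secrets.yaml`, while `download_file_to_aws` passes its `bucket` argument straight to `get_object`.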
virgo_modules-0.0.6/virgo_app/virgo_modules/src/edge_utils.py (new file)

@@ -0,0 +1,181 @@
+import numpy as np
+import itertools
+
+from sklearn.metrics import roc_auc_score, precision_score, recall_score
+from sklearn.pipeline import Pipeline
+
+from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
+from feature_engine.imputation import MeanMedianImputer
+from virgo_modules.src.ticketer_source import FeatureSelector
+from feature_engine.discretisation import EqualWidthDiscretiser
+
+from .ticketer_source import VirgoWinsorizerFeature
+
+class produce_model_wrapper:
+    def __init__(self, data):
+        self.data = data.copy()
+
+    def preprocess(self, validation_size, target):
+
+        val_date = self.data.groupby('Date', as_index = False).agg(target_down = (target[0],'count')).sort_values('Date').iloc[-validation_size:,].head(1)['Date'].values[0]
+
+        train_data = self.data[self.data['Date'] < val_date].dropna()
+        val_data = self.data[self.data['Date'] >= val_date].dropna()
+
+        columns = [ x for x in train_data.columns if x not in target ]
+        X_train, y_train = train_data[columns], train_data[target]
+        X_val, y_val = val_data[columns], val_data[target]
+        self.X_train = X_train
+        self.y_train = y_train
+        self.X_val = X_val
+        self.y_val = y_val
+
+    def train_model(self, pipe, model, cv_ = False):
+        self.model = model
+        self.pipe_transform = pipe
+        self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
+        self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
+        self.pipeline.fit(self.X_train, self.y_train)
+
+class register_results():
+    def __init__(self, model_name):
+        self.model_name = model_name
+        self.metric_logger = dict()
+    def eval_metrics(self, pipeline, X, y, type_data, phase):
+
+        preds_proba = pipeline.predict_proba(X)
+        preds = pipeline.predict(X)
+
+        if type(preds_proba) == list:
+            preds_proba = np.array([ x[:,1] for x in preds_proba]).T
+
+        roc = roc_auc_score(y,preds_proba, average=None)
+        precision = precision_score(y,preds, average=None)
+        recall = recall_score(y,preds, average=None)
+
+        self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc':roc, 'precision':precision, 'recall':recall}
+
+    def print_metric_logger(self):
+        parts = list(self.metric_logger.keys())
+        phase_parts = [ x.split('//')[0] for x in parts]
+
+        parts = list(self.metric_logger)
+        phase_parts = [ x.split('//')[0] for x in parts]
+
+        init_phase = phase_parts[0]
+        print(f'---{init_phase}--')
+        for phase,val in zip(phase_parts,self.metric_logger):
+            stage = val.split('//')[2]
+            if init_phase != phase:
+                print(f'---{phase}--')
+                init_phase = phase
+            for metric in self.metric_logger[val]:
+                print(stage, metric,self.metric_logger[val][metric])
+
+
+def eval_metrics(pipeline, X, y, type_data, model_name):
+
+    preds_proba = pipeline.predict_proba(X)
+    preds = pipeline.predict(X)
+
+    if type(preds_proba) == list:
+        preds_proba = np.array([ x[:,1] for x in preds_proba]).T
+
+    print(f'--{type_data} - {model_name}--')
+    print('--target: down, up--')
+    print('--roc-auc--')
+    print(roc_auc_score(y,preds_proba, average=None))
+    print('--precision--')
+    print(precision_score(y,preds, average=None))
+    print('--recall--')
+    print(recall_score(y,preds, average=None))
+
+
+def data_processing_pipeline_classifier(features_base, features_to_drop = False, winsorizer_conf = False, discretize_columns = False,
+                                        bins_discretize = 10, correlation = 0.85, fillna = True,
+                                        pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'):
+
+    select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
+    winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
+    drop_pipe = [('drop_features', DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
+    discretize = [('discretize', EqualWidthDiscretiser(discretize_columns, bins = bins_discretize))] if discretize_columns else []
+    drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation, method = 'spearman'))] if correlation else []
+    median_imputer_pipe = [('median_imputer', MeanMedianImputer())] if fillna else []
+
+    pipe_dictionary = {
+        'selector': select_pipe,
+        'winzorizer': winzorizer_pipe,
+        'drop': drop_pipe,
+        'discretizer': discretize,
+        'correlation': drop_corr,
+        'median_inputer': median_imputer_pipe,
+    }
+
+    pipeline_steps = pipeline_order.split('//')
+    ## validation
+    for step in pipeline_steps:
+        if step not in pipe_dictionary.keys():
+            raise Exception(f'{step} step not in list of steps, the list is: {list(pipe_dictionary.keys())}')
+
+    pipeline_args = [ pipe_dictionary[step] for step in pipeline_steps]
+    pipeline_args = list(itertools.chain.from_iterable(pipeline_args))
+    pipe = Pipeline(pipeline_args)
+    # pipe = Pipeline(
+    #     select_pipe + \
+    #     winzorizer_pipe + \
+    #     discretize + \
+    #     median_imputer_pipe + \
+    #     drop_pipe + \
+    #     drop_corr
+    # )
+    return pipe
+
+
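An aside on usage: a minimal sketch of how the builder above composes a pipeline. The feature names and winsorizer bounds are invented for illustration.

```
# Hypothetical call; feature names and bounds are illustrative only.
from virgo_modules.src.edge_utils import data_processing_pipeline_classifier

pipe = data_processing_pipeline_classifier(
    features_base=['RSI', 'ROC', 'VORTEX'],            # kept by the 'selector' step
    winsorizer_conf={'ROC': {'min': -10, 'max': 10}},  # clipped by the 'winzorizer' step
    discretize_columns=['RSI'],                        # binned by the 'discretizer' step
    correlation=0.85,                                  # 'correlation' drops correlated features
)
```

Steps whose configuration is falsy contribute an empty list, so the default `pipeline_order` string can be reused even when some steps are switched off.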
+class ExpandingMultipleTimeSeriesKFold:
+    """increasing training window where the test can be overlap"""
+    def __init__(self, df, window_size = 100, number_window = 3, overlap_size = 0):
+        self.df = df
+        self.number_window = number_window
+        self.window_size = window_size
+        self.overlap_size = overlap_size
+
+    def split(self, X, y, groups=None):
+
+        if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
+            raise Exception('no date and/or index in the index dataframe')
+
+        if self.overlap_size > self.window_size:
+            raise Exception('overlap can not be higher than the window size')
+
+        unique_dates = list(self.df.index.get_level_values('Date_i').unique())
+        unique_dates.sort()
+
+        total_test_size = self.window_size * self.number_window
+        total_test_size = total_test_size - (self.number_window - 1)*self.overlap_size
+
+        if total_test_size > len(unique_dates):
+            raise Exception('test size is higher than the data length')
+
+        cut = total_test_size
+        for fold in range(self.number_window):
+
+            topcut = cut - self.window_size
+            train_dates = unique_dates[:-cut]
+            test_dates = unique_dates[-cut:-topcut]
+
+            if topcut == 0:
+                test_dates = unique_dates[-cut:]
+
+            max_train_date = max(train_dates)
+            min_test_date, max_test_date = min(test_dates), max(test_dates)
+
+            cut = cut - (self.window_size - self.overlap_size)
+
+            train_index = self.df[self.df.index.get_level_values('Date_i') <= max_train_date].index.get_level_values('i')
+            test_index = self.df[(self.df.index.get_level_values('Date_i') >= min_test_date) & (self.df.index.get_level_values('Date_i') <= max_test_date)].index.get_level_values('i')
+
+            yield train_index, test_index
+
+    def get_n_splits(self, X, y, groups=None):
+        return self.number_window
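The splitter exposes the `split`/`get_n_splits` protocol scikit-learn expects, so it can be passed as `cv=`. A sketch on synthetic data; the frame must carry a MultiIndex with a `Date_i` level and a positional `i` level, which is an implicit contract of the class.

```
# Sketch with synthetic data; the ('Date_i', 'i') MultiIndex is required by split().
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from virgo_modules.src.edge_utils import ExpandingMultipleTimeSeriesKFold

n = 500
df = pd.DataFrame({'x': np.random.randn(n), 'y': np.random.randint(0, 2, n)})
df.index = pd.MultiIndex.from_arrays(
    [pd.date_range('2020-01-01', periods=n), range(n)], names=['Date_i', 'i'])

cv = ExpandingMultipleTimeSeriesKFold(df, window_size=50, number_window=3, overlap_size=10)
scores = cross_val_score(LogisticRegression(), df[['x']], df['y'], cv=cv)
```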
{virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/re_utils.py

@@ -2,6 +2,7 @@ import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
 import seaborn as sns; sns.set()
 import matplotlib.patheffects as path_effects
+from matplotlib.dates import DateFormatter
 
 import plotly.express as px
 from plotly.subplots import make_subplots

@@ -403,7 +404,7 @@ def rank_by_return(data, lag_days, top_n = 5):
 
     return result
 
-def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_available: object = False) -> object:
+def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_available: object = False, data_window:str = '5y') -> object:
     """
     this functions runs the stock_eda_panel
     it is shared between train model and predictions
@@ -416,103 +417,84 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
     returns: stock eda panel
     """
 
-    object_stock = stock_eda_panel(ticker_name , n_days )
+    object_stock = stock_eda_panel(ticker_name , n_days, data_window)
     object_stock.get_data()
 
     # computing features if they exists in the ticketr settings
 
     if 'volatility' in ticket_settings['settings']:
-        object_stock.volatility_analysis(
-            …
-            trad_days = ticket_settings['settings']['volatility']['trad_days'],
-            window_log_return = ticket_settings['settings']['volatility']['window_log_return']
-        )
+        parameters = ticket_settings['settings']['volatility']
+        object_stock.volatility_analysis(**parameters)
 
     if 'outlier' in ticket_settings['settings']:
-        object_stock.outlier_plot(
-            …
-        )
+        parameters = ticket_settings['settings']['outlier']
+        object_stock.outlier_plot(**parameters)
+
+    ## for now this is hard coded
+    feature_map = {
+        'spread_ma':'spread_MA', # deprecated
+        'relative_spread_ma':'relative_spread_MA',
+        'pair_feature':'pair_feature',
+        'count_features':'get_count_feature', # deprecated
+        'bidirect_count_features':'bidirect_count_feature',
+        'price_range':'get_range_feature', # deprecated
+        'relative_price_range':'get_relative_range_feature',
+        'rsi_feature':'rsi_feature', # deprecated
+        'rsi_feature_v2':'rsi_feature_improved',
+        'days_features':'days_features', # deprecated
+        'days_features_v2':'days_features_bands',
+        'volume_feature':'analysis_volume', ## this may crash but deprecated
+        'smooth_volume':'analysis_smooth_volume',
+        'roc_feature':'roc_feature',
+        'stoch_feature':'stoch_feature',
+        'stochastic_feature':'stochastic_feature',
+        'william_feature':'william_feature',
+        'vortex_feature':'vortex_feature',
+        'pair_index_feature':'pair_index_feature' # this has a diff structure!
+    }
+    exceptions = ['pair_feature','pair_index_feature']
+    ### standar feature
+    for feature in feature_map.keys():
+        if (feature in ticket_settings['settings']) and (feature not in exceptions):
+            parameters = ticket_settings['settings'][feature]
+            method_to_use = feature_map.get(feature)
+            getattr(object_stock, method_to_use)(**parameters)
+
+    ## special features
     if 'pair_feature' in ticket_settings['settings']:
         object_stock.pair_feature(pair_symbol = ticket_settings['settings']['pair_feature']['pair_symbol'])
         object_stock.produce_pair_score_plot(
             window = ticket_settings['settings']['pair_feature']['window'],
             z_threshold = ticket_settings['settings']['pair_feature']['z_threshold']
-            )
-
-    if 'count_features' in ticket_settings['settings']:
-        object_stock.get_count_feature(
-            rolling_window = ticket_settings['settings']['count_features']['rolling_window'],
-            threshold = ticket_settings['settings']['count_features']['threshold']
-        )
-
-    if 'bidirect_count_features' in ticket_settings['settings']:
-        object_stock.bidirect_count_feature(
-            rolling_window = ticket_settings['settings']['bidirect_count_features']['rolling_window'],
-            threshold = ticket_settings['settings']['bidirect_count_features']['threshold']
-        )
+        )
 
-    if 'price_range' in ticket_settings['settings']:
-        object_stock.get_range_feature(
-            …
-        )
-
-    if 'relative_price_range' in ticket_settings['settings']:
-        object_stock.get_relative_range_feature(
-            window = ticket_settings['settings']['relative_price_range']['window'],
-            threshold = ticket_settings['settings']['relative_price_range']['threshold']
-        )
-
-    if 'rsi_feature' in ticket_settings['settings']:
-        object_stock.rsi_feature(
-            window = ticket_settings['settings']['rsi_feature']['window'],
-            lag_rsi_ret = ticket_settings['settings']['rsi_feature']['lag_rsi_ret'],
-            threshold = ticket_settings['settings']['rsi_feature']['threshold']
-        )
+    if 'pair_index_feature' in ticket_settings['settings']:
+        for group_feature in ticket_settings['settings']['pair_index_feature']:
+            key = list(group_feature.keys())[0]
+            parameters = group_feature[key]
+            method_to_use = feature_map.get('pair_index_feature')
+            getattr(object_stock, method_to_use)(**parameters)
 
-    if 'rsi_feature_v2' in ticket_settings['settings']:
-        object_stock.rsi_feature_improved(
-            window = ticket_settings['settings']['rsi_feature_v2']['window'],
-            threshold = ticket_settings['settings']['rsi_feature_v2']['threshold']
-        )
+    if 'target_lasts' in ticket_settings['settings']:
 
-        …
-
-    if 'smooth_volume' in ticket_settings['settings']:
-        object_stock.analysis_smooth_volume(
-            window = ticket_settings['settings']['smooth_volume']['window'],
-            threshold = ticket_settings['settings']['smooth_volume']['threshold']
-        )
+        type_target = ticket_settings['settings']['target_lasts']['type']
+        params = {k:v for k,v in ticket_settings['settings']['target_lasts'].items() if k != 'type'}
+
+        if 'classification' == type_target:
+            object_stock.get_categorical_targets(**params)
+
+        elif 'regression' == type_target:
+            object_stock.get_targets(**params)
+
+        del params
+        del type_target
+
+    ## searching discrete signals and orders
+    discrete_signals = [x for x in ticket_settings['signals'] if 'discrete' in x]
+    discrete_features = [x.replace('discrete_signal_', '') for x in discrete_signals]
+    if len(discrete_features) > 0:
+        for feature_name in discrete_features:
+            object_stock.produce_order_features(feature_name)
 
     if hmm_available:
         object_stock.cluster_hmm_analysis( n_clusters = None,
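The refactor collapses one hand-written `if` block per feature into the `feature_map` table plus `getattr` dispatch, so adding a feature now only requires a map entry and a settings key. A hypothetical `ticket_settings` fragment that exercises the loop (keys follow `feature_map`; parameter values are invented):

```
# Hypothetical settings fragment; keys follow feature_map, values are invented.
ticket_settings = {
    'settings': {
        'general': {'n_days': 360, 'data_window': '5y'},
        'rsi_feature_v2': {'window': 14, 'threshold': 1.5},  # -> rsi_feature_improved(**params)
        'roc_feature': {'window': 10, 'threshold': 2.0},     # -> roc_feature(**params)
        'target_lasts': {'type': 'classification',
                         'horizon': 5, 'flor_loss': -0.03, 'top_gain': 0.05},
    },
    'signals': ['discrete_signal_RSI'],  # triggers produce_order_features('RSI')
}
```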
@@ -520,10 +502,11 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
                 test_data_size = None,
                 seed = None, model = hmm_available)
     else:
-        …
+        if 'hmm' in ticket_settings['settings']:
+            object_stock.cluster_hmm_analysis( n_clusters = ticket_settings['settings']['hmm']['n_clusters'],
+                features_hmm = ticket_settings['settings']['hmm']['features_hmm'],
+                test_data_size = ticket_settings['settings']['hmm']['test_data_size'],
+                seed = ticket_settings['settings']['hmm']['seed'])
 
     return object_stock
 
@@ -577,6 +560,7 @@ def call_ml_objects(stock_code, client, call_models = False):
         ticker_name= stock_code,
         ticket_settings = ticket_settings,
         n_days = ticket_settings['settings']['general']['n_days'],
+        data_window = ticket_settings['settings']['general'].get('data_window','5y'),
         hmm_available = hmm_model
     )
     ### applying kalman
@@ -898,4 +882,45 @@ class produce_plotly_plots:
         if self.show_plot:
             fig.show()
         if self.save_path and self.save_aws:
-            upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
+            upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
+
+def plot_hmm_analysis_logger(data_frame, test_data_size, save_path = False, show_plot = True):
+
+    df = data_frame
+    df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
+    fig, axs = plt.subplots(1,2,figsize=(10,4))
+    df__ = df_.iloc[:-test_data_size,]
+    sns.boxplot(data=df__, x="hmm_feature", y="chain_return",ax = axs[0]).set_title('train dist')
+    df__ = df_.iloc[-test_data_size:,]
+    sns.boxplot(data=df__ , x="hmm_feature", y="chain_return",ax = axs[1]).set_title('test dist')
+    if save_path:
+        plt.savefig(save_path)
+    if not show_plot:
+        plt.close()
+
+def plot_hmm_tsanalysis_logger(data_frame, test_data_size, save_path = False, show_plot = True):
+
+    df = data_frame
+    df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
+    states = list(df_['hmm_feature'].unique())
+    states.sort()
+
+    if test_data_size:
+        df__ = df_.iloc[-test_data_size:,]
+        date_limit = pd.Timestamp(str(df__.Date.min().strftime('%Y-%m-%d')))
+
+    fig, ax1 = plt.subplots(figsize=(10,4))
+    ax1.plot(df_['Date'],df_["Close"])
+
+    for state in states:
+        df__ = df_[df_.hmm_feature == state]
+        ax1.scatter(df__['Date'],df__["Close"], label = state)
+    formatter = DateFormatter('%Y-%m-%d')
+    if test_data_size:
+        plt.axvline(x=date_limit, color = 'r')
+    fig.legend()
+    fig.autofmt_xdate()
+    if save_path:
+        plt.savefig(save_path)
+    if not show_plot:
+        plt.close()
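A sketch of calling the new loggers headlessly (useful in scheduled jobs) and shipping the figure to S3 with the aws_utils helper; `obj` is assumed to be a `stock_eda_panel` whose frame already carries the `hmm_feature` and `chain_return` columns produced by the HMM analysis.

```
# Assumes obj.df already has 'hmm_feature' and 'chain_return' (post HMM analysis).
from virgo_modules.src.re_utils import plot_hmm_analysis_logger
from virgo_modules.src.aws_utils import upload_file_to_aws

plot_hmm_analysis_logger(obj.df, test_data_size=250,
                         save_path='artifacts/hmm_box.png', show_plot=False)
upload_file_to_aws(bucket='VIRGO_BUCKET', key='plots/hmm_box.png',
                   input_path='artifacts/hmm_box.png')
```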
{virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/ticketer_source.py

@@ -26,7 +26,8 @@ import statsmodels.api as sm
 
 import scipy.stats as stats
 
-from ta.momentum import RSIIndicator
+from ta.momentum import RSIIndicator, ROCIndicator, StochRSIIndicator,StochasticOscillator, WilliamsRIndicator
+from ta.trend import VortexIndicator
 
 import warnings
 warnings.filterwarnings('ignore')
@@ -44,14 +45,27 @@ from itertools import combinations, chain
 from feature_engine.encoding import OneHotEncoder
 from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
 from feature_engine.timeseries.forecasting import LagFeatures
-from feature_engine.imputation import …
-from feature_engine.discretisation import …
+from feature_engine.imputation import MeanMedianImputer
+from feature_engine.discretisation import EqualWidthDiscretiser
 
 from .aws_utils import upload_file_to_aws
-import pickle
 
 import logging
 
+class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
+    def __init__(self, feature_configs):
+        self.feature_configs = feature_configs
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        for feature in self.feature_configs:
+            lower = self.feature_configs[feature]['min']
+            upper = self.feature_configs[feature]['max']
+            X[feature] = np.where( lower > X[feature], lower, X[feature])
+            X[feature] = np.where( upper < X[feature], upper, X[feature])
+        return X
+
 class FeatureSelector(BaseEstimator, TransformerMixin):
     def __init__(self, columns):
         self.columns = columns
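`VirgoWinsorizerFeature` clips each configured column to fixed, user-chosen bounds; unlike feature-engine's `Winsorizer`, which estimates its caps from the training data, it learns nothing in `fit`, so the transform is deterministic across train and inference frames. A quick sketch:

```
import pandas as pd
from virgo_modules.src.ticketer_source import VirgoWinsorizerFeature

X = pd.DataFrame({'ROC': [-25.0, -3.0, 0.5, 4.0, 30.0]})
wins = VirgoWinsorizerFeature({'ROC': {'min': -10, 'max': 10}})
print(wins.fit_transform(X)['ROC'].tolist())
# [-10.0, -3.0, 0.5, 4.0, 10.0]
```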
@@ -152,12 +166,13 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
 
 class stock_eda_panel(object):
 
-    def __init__(self, stock_code, n_days):
+    def __init__(self, stock_code, n_days, data_window = '5y'):
         self.stock_code = stock_code
         self.n_days = n_days
         self.today = datetime.date.today()
         self.features = list()
         self.signals = list()
+        self.data_window = data_window
 
     def augmented_dickey_fuller_statistics(self,time_series, label):
         result = adfuller(time_series.dropna().values)
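`data_window` is forwarded to yfinance's `history(period=...)` (see the next hunk), so it accepts the usual yfinance period strings such as '1y', '5y', '10y', or 'max':

```
from virgo_modules.src.ticketer_source import stock_eda_panel

obj = stock_eda_panel('PEP', n_days=360, data_window='10y')
obj.get_data()  # history is now pulled over 10 years instead of the old fixed '5y'
```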
@@ -168,8 +183,7 @@ class stock_eda_panel(object):
         begin_date_str = begin_date.strftime('%Y-%m-%d')
 
         stock = yf.Ticker(self.stock_code)
-
-        df = stock.history(period='5y')
+        df = stock.history(period=self.data_window)
 
         df = df.sort_values('Date')
         df.reset_index(inplace=True)
@@ -177,7 +191,12 @@ class stock_eda_panel(object):
         df['Date'] = pd.to_datetime(df['Date'])
 
         df = df[df.Date >= begin_date_str ]
-        self.settings_general = {'n_days':self.n_days, 'begin_date':begin_date_str}
+        self.settings_general = {
+            'n_days':self.n_days,
+            'begin_date':begin_date_str,
+            'data_window': self.data_window,
+            'execution_date': self.today.strftime('%Y-%m-%d')
+        }
         self.df = df
 
         ### cleaning volume
@@ -226,8 +245,6 @@ class stock_eda_panel(object):
         df["lower"] = df['Close_roll_mean'] - df["Close_roll_std"]*2
 
         df = df[df.Date >= begin_date_str ]
-        self.settings_general = {'n_days':self.n_days, 'begin_date':begin_date_str}
-        self.df = df
 
         fig = make_subplots(rows=1, cols=1,vertical_spacing = 0.1,shared_xaxes=True,
             subplot_titles=(
@@ -453,6 +470,26 @@ class stock_eda_panel(object):
 
         self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
+
+    def signal_plotter(self, feature_name):
+        fig, axs = plt.subplots(1, 3,figsize=(17,5))
+
+        axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
+        axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
+        axs[0].plot(self.df[f'norm_{feature_name}'])
+
+        plot_acf(self.df[feature_name].dropna(),lags=25,ax = axs[1])
+        axs[1].set_title(f'acf {feature_name}')
+
+        plot_pacf(self.df[feature_name].dropna(),lags=25,ax = axs[2])
+        axs[2].set_title(f'pacf {feature_name}')
+
+        fig.show()
+
+    def log_features_standard(self, feature_name):
+        self.features.append(feature_name)
+        self.signals.append(f'signal_up_{feature_name}')
+        self.signals.append(f'signal_low_{feature_name}')
 
     #######################
     #### to be deprecated ####
@@ -535,26 +572,12 @@ class stock_eda_panel(object):
 
         print('--------------------------------------------------------------------')
         if save_features:
-            self.features.append(feature_name)
-            self.signals.append(f'signal_low_{feature_name}')
-            self.signals.append(f'signal_up_{feature_name}')
+            self.log_features_standard(feature_name)
         self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
 
         if plot:
 
-            …
-
-            axs[0].plot(self.df['Date'],self.df[f'norm_{feature_name}'])
-            axs[0].plot(self.df['Date'],self.df[f'upper_{feature_name}'], linestyle='--')
-            axs[0].plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
-            axs[0].set_title('rel_MA_spread series')
-
-            plot_acf(self.df[feature_name].dropna(),lags=25, ax=axs[1])
-            axs[1].set_title('acf rel_MA_spread series')
-
-            plot_pacf(self.df[feature_name].dropna(),lags=25, ax=axs[2])
-            axs[2].set_title('acf rel_MA_spread series')
-            plt.show()
+            self.signal_plotter(feature_name)
 
     def pair_feature(self, pair_symbol, plot = False):
         self.pair_symbol = pair_symbol
@@ -562,8 +585,7 @@ class stock_eda_panel(object):
         begin_date_str = begin_date.strftime('%Y-%m-%d')
 
         stock = yf.Ticker(self.pair_symbol)
-
-        df = stock.history(period='5y')
+        df = stock.history(period=self.data_window)
         df = df.sort_values('Date')
         df.reset_index(inplace=True)
         df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
@@ -622,9 +644,7 @@ class stock_eda_panel(object):
         self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
 
         if save_features:
-            self.features.append('pair_z_score')
-            self.signals.append('signal_low_pair_z_score')
-            self.signals.append('signal_up_pair_z_score')
+            self.log_features_standard('pair_z_score')
         self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
 
         if plot:
@@ -701,9 +721,7 @@ class stock_eda_panel(object):
         self.compute_clip_bands(feature_name,threshold)
 
         if save_features:
-            self.features.append(feature_name)
-            self.signals.append(f'signal_up_{feature_name}')
-            self.signals.append(f'signal_low_{feature_name}')
+            self.log_features_standard(feature_name)
         self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
 
         if plot:
@@ -768,9 +786,7 @@ class stock_eda_panel(object):
         self.df[f'signal_low_{feature_name}'] = np.where(self.df[f'norm_{feature_name}'] < self.df[f'low_bound_norm_{feature_name}'],1,0 )
 
         if save_features:
-            self.features.append(feature_name)
-            self.signals.append(f'signal_up_{feature_name}')
-            self.signals.append(f'signal_low_{feature_name}')
+            self.log_features_standard(feature_name)
         self.settings_relative_price_range = {'window':window, 'threshold':threshold}
 
         if plot:
@@ -827,25 +843,11 @@ class stock_eda_panel(object):
         self.compute_clip_bands(feature_name,threshold)
 
         if save_features:
-            self.features.append(feature_name)
-            self.signals.append(f'signal_up_{feature_name}')
-            self.signals.append(f'signal_low_{feature_name}')
+            self.log_features_standard(feature_name)
         self.settings_rsi_feature_v2 = {'window':window, 'threshold':threshold}
 
         if plot:
-            …
-
-            axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
-            axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
-            axs[0].plot(self.df[f'norm_{feature_name}'])
-
-            plot_acf(self.df['RSI'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf RSI')
-
-            plot_pacf(self.df['RSI'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf RSI')
-
-            fig.show()
+            self.signal_plotter(feature_name)
 
     #######################
     #### to be deprecated ####
@@ -905,25 +907,11 @@ class stock_eda_panel(object):
 
         if save_features:
 
-            self.features.append(feature_name)
-            self.signals.append(f'signal_up_{feature_name}')
-            self.signals.append(f'signal_low_{feature_name}')
+            self.log_features_standard(feature_name)
         self.settings_days_features_v2 = {'window':window, 'threshold':threshold}
 
         if plot:
-            …
-
-            axs[0].plot(self.df[f'norm_{feature_name}'])
-            axs[0].plot(self.df[f'upper_{feature_name}'], linestyle='--')
-            axs[0].plot(self.df[f'lower_{feature_name}'], linestyle='--')
-
-            plot_acf(self.df[f'norm_{feature_name}'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf day feature')
-
-            plot_pacf(self.df[f'norm_{feature_name}'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf day feature')
-
-            fig.show()
+            self.signal_plotter(feature_name)
 
     #######################
     #### to be deprecated ####
@@ -996,9 +984,7 @@ class stock_eda_panel(object):
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'z_{feature_name}'] > threshold ), 1, 0)
 
         if save_features:
-            self.features.append(feature_name)
-            self.signals.append(f'signal_up_{feature_name}')
-            self.signals.append(f'signal_low_{feature_name}')
+            self.log_features_standard(feature_name)
         self.settings_smooth_volume = {'window':window, 'threshold':threshold}
         if plot:
             fig, axs = plt.subplots(2, 2,figsize=(11,6))
@@ -1025,6 +1011,138 @@ class stock_eda_panel(object):
             axs[1].set_title(f'z_{feature_name}')
 
             plt.show()
+
+    def roc_feature(self, window, threshold, plot = False, save_features = False):
+        feature_name = 'ROC'
+        roc = ROCIndicator(close = self.df['Close'], window = window).roc()
+        self.df[feature_name] = roc
+        self.compute_clip_bands(feature_name,threshold)
+
+        if save_features:
+            self.log_features_standard(feature_name)
+        self.settings_roc_feature = {'window':window, 'threshold':threshold}
+        if plot:
+            self.signal_plotter(feature_name)
+
+    def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
+        feature_name = 'STOCH'
+        stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
+        self.df[feature_name] = stoch
+        self.compute_clip_bands(feature_name,threshold)
+
+        if save_features:
+            self.log_features_standard(feature_name)
+        self.settings_stoch_feature = {'window':window, 'smooth1':smooth1, 'smooth2':smooth2, 'threshold':threshold}
+        if plot:
+            self.signal_plotter(feature_name)
+
+    def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
+        feature_name = 'STOCHOSC'
+        stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
+        self.df[feature_name] = stochast
+        self.compute_clip_bands(feature_name,threshold)
+
+        if save_features:
+            self.log_features_standard(feature_name)
+        self.settings_stochastic_feature = {'window':window, 'smooth':smooth,'threshold':threshold}
+        if plot:
+            self.signal_plotter(feature_name)
+
+    def william_feature(self, lbp, threshold, plot = False, save_features = False):
+        feature_name = 'WILL'
+        will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
+        self.df[feature_name] = will
+        self.compute_clip_bands(feature_name,threshold)
+
+        if save_features:
+            self.log_features_standard(feature_name)
+        self.settings_william_feature = {'lbp':lbp,'threshold':threshold}
+        if plot:
+            self.signal_plotter(feature_name)
+
+    def vortex_feature(self, window, threshold, plot = False, save_features = False):
+        feature_name = 'VORTEX'
+        vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
+        self.df[feature_name] = vortex
+        self.compute_clip_bands(feature_name,threshold)
+
+        if save_features:
+            self.log_features_standard(feature_name)
+        self.settings_vortex_feature = {'window':window, 'threshold':threshold}
+        if plot:
+            self.signal_plotter(feature_name)
+
+    def pair_index_feature(self, pair_symbol, feature_label, window, threshold, plot = False, save_features = False):
+        self.pair_index = pair_symbol
+        begin_date = self.today - relativedelta(days = self.n_days)
+        begin_date_str = begin_date.strftime('%Y-%m-%d')
+
+        if feature_label in self.df.columns:
+            self.df = self.df.drop(columns = [feature_label])
+
+        stock = yf.Ticker(self.pair_index)
+        df = stock.history(period=self.data_window)
+        df = df.sort_values('Date')
+        df.reset_index(inplace=True)
+        df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
+        df['Date'] = pd.to_datetime(df['Date'])
+        df = df[df.Date >= begin_date_str ]
+        self.pair_index_df = df
+
+        #### converting the same index ####
+        dates_vector = self.df.Date.to_frame()
+        self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
+        self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
+        self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
+
+        self.pair_index_df[feature_label] = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
+        df_to_merge = self.pair_index_df[['Date',feature_label]]
+        self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
+
+        ########
+        self.compute_clip_bands(feature_label,threshold)
+
+        if save_features:
+            self.log_features_standard(feature_label)
+            parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
+            try:
+                len(self.settings_pair_index_feature)
+                print('existing')
+                self.settings_pair_index_feature.append(parameters)
+            except:
+                print('creation')
+                self.settings_pair_index_feature = list()
+                self.settings_pair_index_feature.append(parameters)
+
+        if plot:
+            self.signal_plotter(feature_label)
+
+    def produce_order_features(self, feature_name, save_features = False):
+
+        signal_feature_name = f'discrete_signal_{feature_name}'
+        order_feature_name = f'order_signal_{feature_name}'
+
+        self.df[signal_feature_name] = np.where(
+            self.df[f'signal_up_{feature_name}'] == 1,1,
+            np.where(
+                self.df[f'signal_low_{feature_name}'] == 1,-1,0
+            )
+        )
+
+        ## indexing chains
+        self.df[f'lag_{signal_feature_name}'] = self.df[signal_feature_name].shift(1)
+        self.df['breack'] = np.where(self.df[f'lag_{signal_feature_name}'] != self.df[signal_feature_name],1,0)
+        self.df["chain_id"] = self.df.groupby("breack")["Date"].rank(method="first", ascending=True)
+        self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
+        self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
+        self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
+        self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
+        self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
+
+        ## saving features
+        if save_features:
+            self.signals.append(signal_feature_name)
+            self.signals.append(order_feature_name)
 
     def create_hmm_derived_features(self, lag_returns):
 
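`produce_order_features` first collapses the up/low signal pair into one signed signal, then numbers each bar within a run of equal signal values, so models see both the direction and the age of a signal. The source does this with a rank / forward-fill dance; a compact standalone sketch of the same idea:

```
import pandas as pd

s = pd.Series([0, 1, 1, 1, 0, -1, -1, 0])   # signed discrete signal
run_id = (s != s.shift(1)).cumsum()          # one id per consecutive run
order = s.groupby(run_id).cumcount() + 1     # 1-based position inside the run
print((order * s).tolist())
# [0, 1, 2, 3, 0, -1, -2, 0]
```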
@@ -1345,7 +1463,7 @@ class stock_eda_panel(object):
 
         self.df[f'mean_target'] = self.df[columns].mean(axis=1)
         self.target.append(f'mean_target')
-        self.settings_target_lasts = {'steps':steps}
+        self.settings_target_lasts = {'steps':steps, 'type':'regression'}
 
     def get_categorical_targets(self, horizon, flor_loss, top_gain):
 
@@ -1377,7 +1495,7 @@ class stock_eda_panel(object):
         self.targets.append('target_up')
         self.targets.append('target_down')
 
-        self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain}
+        self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
 
     def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
 
@@ -1400,69 +1518,22 @@ class stock_eda_panel(object):
         self.settings['model_type'] = model_type
         self.settings['target'] = list(set(self.target))
         self.settings['targets'] = target_list
-
-        try:
-            self.settings['settings']['spread_ma'] = self.settings_spread_ma ##to be deprecated
-        except:
-            pass
-        try:
-            self.settings['settings']['relative_spread_ma'] = self.settings_relative_spread_ma
-        except:
-            pass
-        try:
-            self.settings['settings']['pair_feature'] = self.settings_pair_feature
-        except:
-            pass
-        try:
-            self.settings['settings']['count_features'] = self.settings_count_features ##to be deprecated
-        except:
-            pass
-        try:
-            self.settings['settings']['bidirect_count_features'] = self.settings_bidirect_count_features
-        except:
-            pass
-        try:
-            self.settings['settings']['price_range'] = self.settings_price_range ##to be deprecated
-        except:
-            pass
-        try:
-            self.settings['settings']['relative_price_range'] = self.settings_relative_price_range
-        except:
-            pass
-        try:
-            self.settings['settings']['rsi_feature'] = self.settings_rsi_feature ##to be deprecated
-        except:
-            pass
-        try:
-            self.settings['settings']['rsi_feature_v2'] = self.settings_rsi_feature_v2
-        except:
-            pass
-        try:
-            self.settings['settings']['days_features'] = self.settings_days_features ##to be deprecated
-        except:
-            pass
-        try:
-            self.settings['settings']['days_features_v2'] = self.settings_days_features_v2
-        except:
-            pass
 
-        …
-
-            self.settings['settings']['hmm'] = self.settings_hmm
-        except:
-            pass
+        ## for now this is hard coded
+        feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
+                        'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
+                        'william_feature', 'vortex_feature', 'pair_index_feature','hmm']
+
+        for feature in feature_list:
+            try:
+                self.settings['settings'][feature] = getattr(self, f'settings_{feature}')
+            except:
+                pass
         try:
             self.settings['settings']['target_lasts'] = self.settings_target_lasts
         except:
             pass
+
         try:
             self.settings['settings']['strategies'] = {
                 'best_strategy':self.best_strategy,
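The rewritten collector relies on a naming convention: every feature method stashes its parameters in `self.settings_<feature>`, and `get_configurations` recovers whichever exist via `getattr`. A reduced, self-contained sketch of the pattern:

```
# Reduced sketch of the settings round-trip convention used above.
class Panel:
    def rsi_feature_improved(self, window, threshold):
        self.settings_rsi_feature_v2 = {'window': window, 'threshold': threshold}

    def get_configurations(self):
        collected = {}
        for feature in ['rsi_feature_v2', 'roc_feature']:  # subset of feature_list
            try:
                collected[feature] = getattr(self, f'settings_{feature}')
            except AttributeError:
                pass
        return collected

p = Panel()
p.rsi_feature_improved(window=14, threshold=1.5)
print(p.get_configurations())  # {'rsi_feature_v2': {'window': 14, 'threshold': 1.5}}
```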
@@ -1853,10 +1924,11 @@ def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to
     return best_result
 
 class analyse_index(stock_eda_panel):
-    def __init__(self, index, asset, n_obs, lag, show_plot = True, save_path = False, save_aws = False):
+    def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False):
         self.index = index
         self.asset = asset
         self.n_obs = n_obs
+        self.data_window = data_window
         self.lag = lag
 
         self.show_plot = show_plot
@@ -1865,12 +1937,12 @@ class analyse_index(stock_eda_panel):
 
     def process_data(self):
 
-        index = stock_eda_panel(self.index, self.n_obs)
+        index = stock_eda_panel(self.index, self.n_obs, self.data_window)
         index.get_data()
         index.df['shift'] = index.df.Close.shift(self.lag)
         index.df['index_return'] = index.df.Close/index.df['shift'] - 1
 
-        asset = stock_eda_panel(self.asset, self.n_obs)
+        asset = stock_eda_panel(self.asset, self.n_obs, self.data_window)
         asset.get_data()
         asset.df['shift'] = asset.df.Close.shift(self.lag)
         asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
{virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/PKG-INFO

@@ -1,36 +1,19 @@
 Metadata-Version: 2.1
 Name: virgo-modules
-Version: 0.0.3
+Version: 0.0.6
 Summary: data processing and statistical modeling using stock market data
 Home-page: https://github.com/miguelmayhem92/virgo_module
 Author: Miguel Mayhuire
 Author-email: miguelmayhem92@gmail.com
 License: MIT
+Platform: UNKNOWN
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9, <3.10
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: feature-engine==1.6.1
-Requires-Dist: matplotlib==3.6.3
-Requires-Dist: mlflow==2.1.1
-Requires-Dist: numpy==1.23.5
-Requires-Dist: optuna==3.1.0
-Requires-Dist: pandas==1.5.3
-Requires-Dist: plotly==5.15.0
-Requires-Dist: rsa==4.9
-Requires-Dist: scikit-learn==1.2.1
-Requires-Dist: scipy==1.10.0
-Requires-Dist: seaborn==0.12.2
-Requires-Dist: starlette==0.22.0
-Requires-Dist: statsmodels==0.13.5
-Requires-Dist: ta==0.10.2
-Requires-Dist: yfinance==0.2.9
-Requires-Dist: hmmlearn==0.3.0
-Requires-Dist: boto3
 Provides-Extra: dev
-
+License-File: LICENSE
 
 # Virgo Package
 
@@ -51,3 +34,4 @@ obj = stock_eda_panel(stock_code = 'PEP', n_days = 20)
 obj.get_data()
 print(obj.df.shape)
 ```
+
{virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/SOURCES.txt

@@ -9,6 +9,7 @@ virgo_app/virgo_modules.egg-info/requires.txt
 virgo_app/virgo_modules.egg-info/top_level.txt
 virgo_app/virgo_modules/src/__init__.py
 virgo_app/virgo_modules/src/aws_utils.py
+virgo_app/virgo_modules/src/edge_utils.py
 virgo_app/virgo_modules/src/pull_artifacts.py
 virgo_app/virgo_modules/src/re_utils.py
 virgo_app/virgo_modules/src/ticketer_source.py
virgo_modules-0.0.3/virgo_app/virgo_modules/src/aws_utils.py (removed; superseded by the new aws_utils.py above)

@@ -1,23 +0,0 @@
-import yaml
-import boto3
-from pathlib import Path
-from io import StringIO
-
-def upload_file_to_aws(bucket,key,input_path):
-
-    credentials = yaml.safe_load(Path('secrets.yaml').read_text())
-    session = boto3.Session(aws_access_key_id=credentials['AWS_ACCESS_KEY_ID'],aws_secret_access_key=credentials['AWS_SECRET_ACCESS_KEY'])
-    bucket = credentials[bucket]
-    s3 = session.resource('s3')
-    s3.meta.client.upload_file(Filename=input_path , Bucket=bucket, Key=key)
-
-def upload_pandas_to_s3(data_frame,bucket,key):
-
-    csv_buffer = StringIO()
-    data_frame.to_csv(csv_buffer)
-    csv_buffer.seek(0)
-
-    credentials = yaml.safe_load(Path('secrets.yaml').read_text())
-    s3 = boto3.client("s3",region_name=credentials['AWS_DEFAULT_REGION'],aws_access_key_id=credentials['AWS_ACCESS_KEY_ID'],aws_secret_access_key=credentials['AWS_SECRET_ACCESS_KEY'])
-    bucket = credentials[bucket]
-    s3.put_object(Bucket=bucket, Body=csv_buffer.getvalue(), Key= key)
The remaining files carry over without changes:

- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/LICENSE
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/README.md
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/setup.cfg
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/__init__.py
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/__init__.py
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules/src/pull_artifacts.py
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/dependency_links.txt (renamed path only)
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/requires.txt
- {virgo_modules-0.0.3 → virgo_modules-0.0.6}/virgo_app/virgo_modules.egg-info/top_level.txt