virgo-modules 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the content of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of virgo-modules might be problematic.
- virgo_modules/src/edge_utils/edge_utils.py +16 -2
- virgo_modules/src/edge_utils/feature_selection.py +54 -0
- virgo_modules/src/market/__init__.py +0 -0
- virgo_modules/src/market/market_tools.py +189 -0
- virgo_modules/src/re_utils.py +6 -0
- virgo_modules/src/ticketer_source.py +80 -33
- virgo_modules/src/transformer_utils.py +110 -0
- {virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info}/METADATA +15 -4
- virgo_modules-0.8.0.dist-info/RECORD +22 -0
- {virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info}/WHEEL +1 -1
- virgo_modules-0.6.1.dist-info/RECORD +0 -19
- {virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info/licenses}/LICENSE +0 -0
- {virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info}/top_level.txt +0 -0
virgo_modules/src/edge_utils/edge_utils.py
CHANGED
@@ -11,7 +11,14 @@ from feature_engine.imputation import MeanMedianImputer
 from feature_engine.discretisation import EqualWidthDiscretiser
 from feature_engine.datetime import DatetimeFeatures
 
-from ..transformer_utils import
+from ..transformer_utils import (
+    VirgoWinsorizerFeature,
+    InverseHyperbolicSine,
+    FeaturesEntropy,
+    FeatureSelector,
+    InteractionFeatures,
+    SplineMarketReturnJumpWaves
+)
 
 from plotly.subplots import make_subplots
 import plotly.graph_objects as go
@@ -223,6 +230,7 @@ def data_processing_pipeline_classifier(
     date_features_list = False,
     entropy_set_list = False,
     interaction_features_cont = False,
+    spline_regression_config = False,
     pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'
 ):
 
@@ -254,7 +262,12 @@ def data_processing_pipeline_classifier(
     invhypersin_pipe = [('invhypervolsin scaler', InverseHyperbolicSine(features = invhypervolsin_features))] if invhypervolsin_features else []
     datetimeFeatures_pipe = [('date features', DatetimeFeatures(features_to_extract = date_features_list, variables = 'Date', drop_original = False))] if date_features_list else []
     interaction_features = [("interaction features", InteractionFeatures(interaction_features_cont[0], interaction_features_cont[1]))] if interaction_features_cont else []
-
+    spline_features = [("spline features", SplineMarketReturnJumpWaves(
+        return_feature_names=spline_regression_config.get("return_feature_names"),
+        target_variables=spline_regression_config.get("target_variables"),
+        feature_label=spline_regression_config.get("feature_label"),
+    ))] if spline_regression_config else []
+
     entropy_pipe = list()
     if entropy_set_list:
         for setx_ in entropy_set_list:
@@ -274,6 +287,7 @@ def data_processing_pipeline_classifier(
         'date_features': datetimeFeatures_pipe,
         'interaction_features': interaction_features,
         'entropy_features' : entropy_pipe,
+        "spline_features": spline_features,
     }
 
     pipeline_steps = pipeline_order.split('//')
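Since the new spline_regression_config argument is only read with plain .get(...) calls, a dict with the three keys shown above is enough to switch the step on. A minimal sketch (the feature names are illustrative, not from the package; the target names match the columns that SplineMarketReturnJumpWaves aggregates during fit):

    spline_regression_config = {
        "return_feature_names": ["market_log_return"],    # illustrative return column(s)
        "target_variables": ["target_up", "target_down"],
        "feature_label": "spline_jump",                   # illustrative prefix for generated features
    }
    # passed through as:
    # data_processing_pipeline_classifier(..., spline_regression_config=spline_regression_config)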
virgo_modules/src/edge_utils/feature_selection.py
ADDED
@@ -0,0 +1,54 @@
+import random
+
+from numpy.random import choice
+import numpy as np
+from scipy import stats
+from sklearn.feature_selection import RFE
+
+class StackRFE:
+    def __init__(self, model, n_features, batch_elim, step_elim, cv, max_iterations):
+        self.model = model
+        self.n_features = n_features
+        self.batch_elim = batch_elim
+        self.step_elim = step_elim
+        self.cv = cv
+        self.max_iterations = max_iterations
+
+    def _suggest_elimination(self, uniform=False):
+        """
+        suggest based on mean ranking, lower the mean rank higher the prob to be selected
+        """
+        ds = self.feature_rankings
+        ds_mean = {k:np.mean(ds.get(k)) for k in ds}
+        max_ = np.max([x for x in ds_mean.values()])
+        ds_weight = {k: (max_-v+1) for k,v in ds_mean.items()}
+        sum_ = np.sum([x for x in ds_weight.values()])
+        ds_prob = {k: v/sum_ for k,v in ds_weight.items()}
+        result = list(choice(list(ds_prob.keys()), self.batch_elim,p=list(ds_prob.values()), replace=False))
+        if uniform:
+            features = list(ds_prob.keys())
+            random.shuffle(features)
+            result = features[0:self.batch_elim]
+        return result
+
+    def fit(self, X, y):
+        features = list(X.columns).copy()
+        self.feature_rankings = {f:[1] for f in features}
+        for iteration in range(self.max_iterations):
+            # shuffling
+            if random.random() > 0.5:
+                batch_features = self._suggest_elimination()
+            else:
+                batch_features = self._suggest_elimination()
+            # selector and elimination
+            tmp_feature_ranking = {k: list() for k in batch_features}
+            selector = RFE(self.model, n_features_to_select=self.n_features, step=self.step_elim)
+            for train_index, test_index in self.cv.split(X, y):
+                X_ = X[X.index.get_level_values('i').isin(train_index)][batch_features]
+                y_ = y[y.index.get_level_values('i').isin(train_index)]
+                selector = selector.fit(X_, y_)
+                for k,r in zip(tmp_feature_ranking.keys(), selector.ranking_):
+                    tmp_feature_ranking[k].append(r)
+            rankings = [stats.mode(v).mode for v in tmp_feature_ranking.values()]
+            for f,r in zip(batch_features, rankings):
+                self.feature_rankings[f].append(r)
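A minimal usage sketch of StackRFE (not part of the package; model, splitter, and data are illustrative). Note that fit expects X to carry a MultiIndex with a level named 'i' whose values the cv splitter's indices refer to, and that the two branches of the if random.random() > 0.5 block above are identical as released; presumably one of them was meant to call _suggest_elimination(uniform=True).

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import KFold

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(100, 8)), columns=[f"f{j}" for j in range(8)])
    X.index = pd.MultiIndex.from_arrays([range(100), ["A"] * 100], names=["i", "asset"])
    y = pd.Series(rng.integers(0, 2, size=100), index=X.index)

    rfe = StackRFE(model=LogisticRegression(), n_features=3, batch_elim=5,
                   step_elim=1, cv=KFold(n_splits=3), max_iterations=4)
    rfe.fit(X, y)
    # lower mean rank = more consistently retained feature
    print(sorted(rfe.feature_rankings.items(), key=lambda kv: np.mean(kv[1])))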
virgo_modules/src/market/__init__.py
File without changes
virgo_modules/src/market/market_tools.py
ADDED
@@ -0,0 +1,189 @@
+import gc
+
+import pandas as pd
+import numpy as np
+
+from sklearn.linear_model import HuberRegressor
+from scipy import stats
+
+import matplotlib.pyplot as plt
+import seaborn as sns; sns.set()
+
+from matplotlib import cm
+import matplotlib.colors as mcolors
+
+class MarketAnalysis:
+    """
+    Class that perform market analysis using robust linear regression
+
+    Attributes
+    ----------
+    data : pd.DataFrame
+        input data
+    market_features : list
+        list of market feature (log returns) to apply analysis
+    return_cols: str
+        main log return feature
+    col_map: dict
+        dictionary containing rename of market features
+
+    Methods
+    -------
+    compute_beta(data=pd.DataFrame, feature_x=str, feature_y=str):
+        compute betas given x and y using robust linear regression
+    get_correlation(data=pd.DataFrame, feature_x=str, feature_y=str):
+        compute correlation given x and y
+    produce_beta_report(data=pd.DataFrame):
+        produce beta report
+    compute_general_report(sample_size=int, offset=int, index=str, subsample_ts=int, show_plot=bool):
+        compute full report, global and latest window
+    """
+
+    def __init__(self, data, market_features, return_col, col_map=None):
+        self.data = data.dropna()
+        self.market_features = market_features
+        self.return_cols = return_col
+        self.col_map=col_map
+
+    def compute_beta(self, data, feature_x, feature_y):
+        """
+        compute betas given x and y using robust linear regression
+
+        Parameters
+        ----------
+        data (pd.DataFrame): input data containing analysis features
+        feature_x (str): name of the feature x
+        feature_y (str): name of the feature y
+
+        Returns
+        -------
+        (beta(str), alpha(str))
+        """
+        x = data[feature_x].values.reshape(-1,1)
+        y = data[feature_y].values.reshape(-1,1)
+        huber_regr = HuberRegressor(fit_intercept = True)
+        huber_regr.fit(x, y)
+        beta, alpha = huber_regr.coef_[0], huber_regr.intercept_
+        return beta, alpha
+
+    def get_correlation(self, data, feature_x, feature_y):
+        """
+        compute correlation given x and y
+
+        Parameters
+        ----------
+        data (pd.DataFrame): input data containing analysis features
+        feature_x (str): name of the feature x
+        feature_y (str): name of the feature y
+
+        Returns
+        -------
+        r (float)
+        """
+        x = data[feature_x]
+        y = data[feature_y]
+        r = stats.mstats.pearsonr(x, y)[0]
+        return r
+
+    def produce_beta_report(self, data):
+        """
+        produce beta report
+
+        Parameters
+        ----------
+        data (pd.DataFrame): input data containing analysis features
+
+        Returns
+        -------
+        report (pd.DataFrame)
+        """
+        result = {
+            "market_index": list(),
+            "beta": list(),
+            "alpha": list(),
+            "r": list()
+        }
+        for index in self.market_features:
+            beta, alpha = self.compute_beta( data, self.return_cols, index)
+            r = self.get_correlation( data, self.return_cols, index)
+            result["market_index"].append(index)
+            result["beta"].append(beta)
+            result["alpha"].append(alpha)
+            result["r"].append(r)
+        pd_result = pd.DataFrame(result)
+        pd_result = pd_result.sort_values("r", ascending=False)
+        if self.col_map:
+            pd_result["map_market_index"] = pd_result.market_index.map(self.col_map)
+        return pd_result
+
+    def compute_general_report(self, sample_size, offset, index=False, subsample_ts=False, show_plot=True):
+        """
+        compute full report, global and latest window
+
+        Parameters
+        ----------
+        sample_size (int): sample size for every beta computation
+        offset (int): offset or overlap between samples
+        index (str): if provided, bet fit index is taken
+        subsample_ts (int): subsample for iterative beta calculation
+        show_plot (bool): whether to show plot
+
+        Returns
+        -------
+        (report (pd.DataFrame), latest_report (pd.DataFrame), figure (mtpl.plt))
+        """
+        general_report = self.produce_beta_report(self.data)
+        current_report = self.produce_beta_report(self.data.iloc[sample_size:,:])
+        if not index:
+            index = general_report.head(1).market_index.values[0]
+        b = general_report[general_report.market_index == index].beta.values
+        a = general_report[general_report.market_index == index].alpha.values
+
+        figure, ax = plt.subplot_mosaic(
+            [["scatter_total", "scatter_sample",'ts','ts']],
+            layout="constrained",
+            figsize=(18, 5)
+        )
+        x = self.data[self.return_cols]
+        y = self.data[index]
+        ax['scatter_total'].scatter(x, y)
+        ax['scatter_total'].plot(x, b*x+a, color='red')
+
+        if subsample_ts:
+            merger_df = self.data.iloc[-subsample_ts:,:].copy()
+        else:
+            merger_df = self.data.copy()
+        ax['ts'].plot(merger_df.Date, merger_df.Close, color = 'grey', alpha = 0.3)
+        b_array = list()
+        for i in range(0,len(merger_df)-sample_size,offset):
+            merger_ = merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
+            b, a = self.compute_beta(merger_, self.return_cols, index)
+            x = merger_[self.return_cols]
+            y = merger_[index]
+            normalize_ = mcolors.Normalize(vmin=-2.0, vmax=2.0)
+            colormap_ = cm.jet
+            ax['scatter_sample'].plot(x, y,'o', color = 'blue', alpha = 0.1)
+            ax['scatter_sample'].plot(x, b*x+a, color=colormap_(normalize_(b)))
+            ax['scatter_sample'].set_xlim(-0.08, 0.08)
+            ax['scatter_sample'].set_ylim(-0.08, 0.08)
+            plot = ax['ts'].scatter(merger_.Date, merger_.Close, color=colormap_(normalize_(b)), s = 10)
+            b_array.append(b)
+        normalize_ = mcolors.Normalize(vmin=np.min(b_array), vmax=np.max(b_array))
+        colormap_ = cm.jet
+        x_global = self.data[self.return_cols]
+        scalarmappaple = cm.ScalarMappable(norm=normalize_, cmap=colormap_)
+        scalarmappaple.set_array(x_global)
+        if self.col_map:
+            map_index = self.col_map.get(index)
+            title = f'market analysis of {map_index}'
+        else:
+            title = f'market analysis'
+        plt.title(title)
+        plt.colorbar(scalarmappaple)
+        del merger_df
+        gc.collect()
+        if show_plot:
+            plt.show()
+        else:
+            plt.close()
+        return general_report, current_report, figure
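A usage sketch of MarketAnalysis (illustrative, not from the package). The input frame is expected to carry Date and Close columns plus log-return columns for the asset and for each candidate market index:

    analysis = MarketAnalysis(
        data=df,                                    # hypothetical prepared DataFrame
        market_features=["sp500_log_return", "nasdaq_log_return"],   # illustrative columns
        return_col="log_return",
        col_map={"sp500_log_return": "S&P 500", "nasdaq_log_return": "NASDAQ"},
    )
    general_report, current_report, fig = analysis.compute_general_report(
        sample_size=60, offset=5, subsample_ts=500, show_plot=False
    )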
virgo_modules/src/re_utils.py
CHANGED
@@ -1421,6 +1421,12 @@ def extract_data_traintest(object_stock,features_to_search,configs, target_confi
         last_signal_featlist = last_signal_featlist.split('//')
         if feature_name in last_signal_featlist:
             object_stock.compute_last_signal(feature_name, False)
+    market_interaction_features = configs.get('custom_transformations',{}).get('market_interaction_features', False)
+    if market_interaction_features:
+        for stage in market_interaction_features.keys():
+            method_to_use = market_interaction_features.get(stage).get("method")
+            arguments_to_use = market_interaction_features.get(stage).get("parameters")
+            getattr(object_stock, method_to_use)(**arguments_to_use)
     # geting targets
     object_stock.get_categorical_targets(**target_params_up)
     object_stock.df = object_stock.df.drop(columns = ['target_down']).rename(columns = {'target_up':'target_up_save'})
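Because the new block resolves methods on object_stock by name with getattr, the configuration is a nested dict keyed by stage. A hedged sketch of one possible entry (the method name comes from the stock_eda_panel additions further down in this diff; the stage key and parameter values are illustrative):

    configs = {
        "custom_transformations": {
            "market_interaction_features": {
                # stage key is arbitrary; each stage needs "method" and "parameters"
                "stage_1": {
                    "method": "smooth_logrets_interaction_term",
                    "parameters": {"feature_interact_with": "market_log_return"},  # illustrative column
                },
            }
        }
    }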
virgo_modules/src/ticketer_source.py
CHANGED
@@ -1,7 +1,7 @@
 import yfinance as yf
 import pandas as pd
 import numpy as np
-import
+import gc
 
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
@@ -138,6 +138,10 @@ class stock_eda_panel(object):
         perform analysis of lags of the mean rolling log return
     compute_clip_bands(feature_name=str,threshold=float):
         compute outlier detection for a given signal, Note that this follows mean reversion procedure and feature has to be stationary. Also botton and roof resulting signals is attached to the dataframe
+    extract_sec_data(symbol=str, base_columns=list(str), rename_columns=dict):
+        extract new asset data and merge it to the main asset data
+    lag_log_return(lags=int, feature=str, feature_name=str):
+        compute log return given some lags
     signal_plotter(feature_name=str):
         display analysis plot of a feature with high and low signals
     log_features_standard(feature_name=str):
@@ -667,6 +671,63 @@ class stock_eda_panel(object):
         self.df[f'signal_low_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] < self.df[f'lower_{feature_name}'] ), 1, 0)
         self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
 
+    def extract_sec_data(self, symbol, base_columns, rename_columns=None):
+        """
+        extract new asset data and merge it to the main asset data
+
+        Parameters
+        ----------
+        symbol (str): symbol to extract data
+        base_columns (list): list of columns to persist
+        rename_columns (dict): map of the new column names using pd.DataFrame.rename()
+
+        Returns
+        -------
+        None
+        """
+        begin_date = self.today - relativedelta(days = self.n_days)
+        begin_date_str = begin_date.strftime('%Y-%m-%d')
+
+        stock = yf.Ticker(symbol)
+        df = stock.history(period=self.data_window)
+        df = df.sort_values('Date')
+        df.reset_index(inplace=True)
+        df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
+        df['Date'] = pd.to_datetime(df['Date'])
+        df = df[df.Date >= begin_date_str ]
+        df = df[base_columns]
+        if rename_columns:
+            df = df.rename(columns=rename_columns)
+        right_df = df.copy()
+
+        dates_vector = self.df.Date.to_frame()
+        right_df = dates_vector.merge(right_df, on ='Date',how = 'left')
+        right_df = right_df.fillna(method = 'bfill')
+        right_df = right_df.fillna(method = 'ffill')
+
+        self.df = self.df.merge(right_df, on ='Date',how = 'left')
+        self.df = self.df.sort_values("Date")
+        del right_df
+        gc.collect()
+
+    def lag_log_return(self, lags, feature, feature_name=False):
+        """
+        compute log return given some lags
+
+        Parameters
+        ----------
+        lags (int): lag to apply log return
+        feature (str): feature to apply log return
+        feature_name (str): rename resuling name
+
+        Returns
+        -------
+        None
+        """
+
+        feature_name = feature_name if feature_name else f"{feature}_log_return"
+        self.df[feature_name] = np.log(self.df[feature]/self.df[feature].shift(lags))
+
     def signal_plotter(self, feature_name):
 
         """
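For orientation (illustrative calls, not from the package; panel stands for an already-initialized stock_eda_panel), the two new methods compose: extract_sec_data merges a second asset's columns onto the panel's frame by Date, and lag_log_return then derives a log return from any merged column:

    panel.extract_sec_data(
        symbol="^GSPC",                            # illustrative benchmark ticker
        base_columns=["Date", "Close"],
        rename_columns={"Close": "market_close"},  # avoid clashing with the panel's own Close
    )
    panel.lag_log_return(lags=1, feature="market_close", feature_name="market_log_return")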
@@ -748,13 +809,11 @@ class stock_eda_panel(object):
             .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
         )
 
-        print('--------------------------------------------------------------------')
         if save_features:
             self.log_features_standard(feature_name)
             self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
 
         if plot:
-
             self.signal_plotter(feature_name)
 
     def pair_feature(self, pair_symbol, plot = False):
@@ -810,6 +869,24 @@ class stock_eda_panel(object):
             plt.legend()
             plt.show()
 
+    def smooth_logrets_interaction_term(self, feature_interact_with, resulting_feature_name="persisted_clip_diff_smooths", rollmean_window = 5, ext_threhold=0.015, persist_days = 3, save_features=False):
+        """
+        create an interaction term that is going to compare the distance of asset wolling window mean and market rolling window mean.
+        then get the outliers or high values using abs and this value persist for some days
+        goal persist big differences of market and asset returns
+
+        feature_interact_with: name of the market return
+        rollmean_window: rolling window or smoothing number of days
+        ext_threhold: threshold
+        persist_days: number of days to persis the signal
+        """
+        self.df["smooth_log_return"] = self.df['log_return'].rolling(rollmean_window).mean().values
+        self.df["smooth_market_log_return"] = self.df[feature_interact_with].rolling(rollmean_window).mean().values
+        self.df["diff_smooths"] = self.df["smooth_market_log_return"]-self.df["smooth_log_return"]
+        self.df["clip_diff_smooths"] = np.where(np.abs(self.df["diff_smooths"]) > ext_threhold, self.df["diff_smooths"] , 0)
+        self.df[resulting_feature_name] = self.df['clip_diff_smooths'].rolling(persist_days).mean().values
+        self.df = self.df.drop(columns=["smooth_log_return","smooth_market_log_return","diff_smooths","clip_diff_smooths"])
+
     def calculate_cointegration(self,series_1, series_2):
         """
         calculate cointegration score for two time series
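The clip-and-persist logic above is compact enough to demonstrate standalone. A sketch on synthetic data, using the method's default parameter values (rolling window 5, threshold 0.015, persistence 3 days):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    asset = pd.Series(rng.normal(0, 0.01, 300))    # synthetic asset log returns
    market = asset + rng.normal(0, 0.01, 300)      # correlated market log returns

    diff = market.rolling(5).mean() - asset.rolling(5).mean()
    clipped = pd.Series(np.where(diff.abs() > 0.015, diff, 0))  # keep only large gaps
    persisted = clipped.rolling(3).mean()          # let the signal linger a few days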
@@ -2304,33 +2381,3 @@ class analyse_index(stock_eda_panel):
 
         self.states_result = result
 
-def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
-    '''
-    select relevant beta result data of a given asset
-
-    Parameters:
-    data_market (pd.DataFrame): dataframe of the market results
-    ticket_name (str): name of the asset
-    show_plot (bool): If tru, plot results
-    save_path (str): local path for saving e.g r'C:/path/to/the/file/'
-    save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
-    aws_credentials (dict): dict of the aws credentials
-
-    Returns:
-    selection (pd.DataFrame): dataframe of the most relevant beta
-    '''
-    all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
-    all_betas['gen_r2'] = all_betas.general_r ** 2
-    all_betas['sampl_r2'] = all_betas.sample_r ** 2
-    selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
-
-    if show_plot:
-        print(selection)
-    if save_path:
-        result_plot_name = f'market_best_fit.csv'
-        selection.to_csv(save_path+result_plot_name)
-
-    if save_path and save_aws:
-        # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{ticket_name}/'+result_plot_name,input_path = save_path+result_plot_name)
-        upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + result_plot_name, input_path = save_path + result_plot_name, aws_credentials = aws_credentials)
-    return selection
virgo_modules/src/transformer_utils.py
CHANGED
@@ -1,6 +1,11 @@
+import gc
+
 from sklearn.base import BaseEstimator, TransformerMixin
 import pandas as pd
 import numpy as np
+import statsmodels.api as sm
+from patsy import dmatrix
+import matplotlib.pyplot as plt
 
 class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
 
@@ -289,3 +294,108 @@ class InteractionFeatures(BaseEstimator, TransformerMixin):
                 fn = 'iterm_'+f1.replace("norm_","")+"_"+f2.replace("norm_","")
                 X = self.simple_div_interaction(X, f1, f2, fn)
         return X
+
+
+class SplineMarketReturnJumpWaves(BaseEstimator, TransformerMixin):
+    """
+    Class that gets a feature returns and performs countings so that a spline regression model can be fitted
+
+    Attributes
+    ----------
+    return_feature_names : list
+        list of the name of the features to apply spline regresion
+    target_variables : list
+        list of target features
+    feature_label : str
+        prefix for the new features.
+    sample_perc : float
+        sample size of the traninig data taking into consideration time
+
+    Methods
+    -------
+    fit(additional="", X=DataFrame, y=DataFrame):
+        fit transformation.
+    transform(X=DataFrame, y=None):
+        apply feature transformation
+    """
+
+    def __init__(self, return_feature_names, target_variables, feature_label,
+                 sample_perc=0.5,parts = 6, e_floor=-0.001,e_top=0.0001, d=3):
+        self.sample_perc = sample_perc
+        self.return_feature_names=return_feature_names
+        self.target_variables = target_variables
+        self.glms = dict()
+        self.feature_label = feature_label
+        self.parts = parts
+        self.e_floor = e_floor
+        self.e_top = e_top
+        self.d = d
+    def fit(self, X, y, plot = False):
+        #complete dataset with y
+        X_set=X.copy()
+        X_set[self.target_variables] = y
+        #sampling
+        if plot:
+            fig, ax = plt.subplots(len(self.return_feature_names),1)
+        for i,return_feature_name in enumerate(self.return_feature_names):
+            X_aggregated = (
+                X_set
+                .groupby("Date",as_index=False)
+                .agg(
+                    count_target_up = ("target_up","sum"),
+                    count_target_down = ("target_down","sum"),
+                    return_feature = (return_feature_name,"max"),
+                )
+                .sort_values("Date",ascending=True)
+                .dropna()
+                .copy()
+            )
+            del X
+            gc.collect()
+            nlines = X_aggregated.shape[0]
+            threshold = int(round((1-nlines*self.sample_perc),0))
+            train_ = X_aggregated.iloc[:threshold,:]
+            self.glms[return_feature_name] = dict()
+            for target in self.target_variables:
+                X = train_[["return_feature"]].round(4).values.reshape(-1, 1)
+                y = np.log(train_.dropna()[f"count_{target}"].values + 1)
+                knot_str = self._get_knot(X)
+                transformed_x = dmatrix(f"bs(train, knots=({knot_str}), degree=3, include_intercept=False)", {"train": X}, return_type='dataframe')
+                model = sm.GLM(y, transformed_x).fit()
+                self.glms[return_feature_name][target] = {
+                    "model":model,
+                }
+                if plot:
+                    x_transfomed = dmatrix(f"bs(valid, knots=({knot_str}), degree={self.d}, include_intercept=False)", {"valid":X}, return_type='dataframe')
+                    pred = model.predict(x_transfomed)
+                    ax[i].scatter(X, np.exp(y),s=2,alpha=0.2)
+                    ax[i].scatter(X, np.exp(pred), alpha=0.2, s=1)
+        #self.X_aggregated = X_aggregated
+        return self
+
+    def transform(self, X, y=None, plot =False):
+        if plot:
+            fig, ax = plt.subplots(len(self.return_feature_names),1)
+        for i, return_feature_name in enumerate(self.return_feature_names):
+            for target in self.target_variables:
+                model = self.glms[return_feature_name][target].get("model")
+                vect = X[return_feature_name]
+                knot_str = self._get_knot(vect)
+                X_transformed = dmatrix(f"bs(valid, knots=({knot_str}), degree={self.d}, include_intercept=False)",
+                                        {"valid":vect.fillna(0)},
+                                        return_type='dataframe')
+                X[f"{self.feature_label}_{return_feature_name}_{target}"] = model.predict(
+                    X_transformed
+                )
+                if plot:
+                    pred = model.predict(X_transformed)
+                    ax[i].scatter(X, np.exp(pred), alpha=0.2, s=1)
+        return X
+
+    def _get_knot(self, input):
+        min_, max_ = np.min(input)-self.e_floor, np.max(input)+self.e_top
+        r = (max_ - min_)/self.parts
+        knot_tuple = [str(i*r+min_) for i,_ in enumerate(range(self.parts),start=0)]
+        knot_str = ",".join(knot_tuple)
+        knot_str = f"({knot_str})"
+        return knot_str
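For intuition, the sketch below reproduces the core fit step outside the transformer (synthetic data, not from the package): per-day event counts are log-transformed and regressed on a cubic B-spline basis of a return feature, the same patsy dmatrix / statsmodels GLM pattern used above:

    import numpy as np
    import statsmodels.api as sm
    from patsy import dmatrix

    rng = np.random.default_rng(2)
    x = rng.normal(0, 0.02, 500)                       # synthetic daily returns
    counts = rng.poisson(np.exp(2 - 40 * np.abs(x)))   # more events on calm days
    y = np.log(counts + 1)

    knots = ",".join(str(k) for k in np.linspace(x.min(), x.max(), 6)[1:-1])
    basis = dmatrix(f"bs(train, knots=({knots}), degree=3, include_intercept=False)",
                    {"train": x}, return_type="dataframe")
    model = sm.GLM(y, basis).fit()                     # Gaussian GLM on log-counts
    pred = model.predict(basis)                        # fitted log-counts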
{virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
-Name:
-Version: 0.
+Metadata-Version: 2.4
+Name: virgo_modules
+Version: 0.8.0
 Summary: data processing and statistical modeling using stock market data
 Home-page: https://github.com/miguelmayhem92/virgo_module
 Author: Miguel Mayhuire
@@ -13,7 +13,18 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Provides-Extra: dev
-Requires-Dist: pytest
+Requires-Dist: pytest>=7.0; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-python
+Dynamic: summary
 
 # Virgo Package
 
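For context, the new conditional dev dependency line above is what setuptools emits for an extras declaration; one hedged way to produce it (the project may configure this differently):

    # setup.py sketch, illustrative only
    from setuptools import setup, find_packages

    setup(
        name="virgo_modules",
        version="0.8.0",
        packages=find_packages(),
        extras_require={"dev": ["pytest>=7.0"]},   # yields: Requires-Dist: pytest>=7.0; extra == "dev"
    )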
virgo_modules-0.8.0.dist-info/RECORD
ADDED
@@ -0,0 +1,22 @@
+virgo_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/aws_utils.py,sha256=q0l7D7ofo09Lu1QQjv-esheQ06uiSy1Pdq3xMul8zvk,2571
+virgo_modules/src/backtester.py,sha256=OhiWyzDX0PthXGuhChyWUmDN3cLkzVYe95zS4nGtia8,22106
+virgo_modules/src/hmm_utils.py,sha256=D7axAnCdSe1_1EgRyli2PAnM2f6699hTY9GcxjPXG-o,21221
+virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
+virgo_modules/src/re_utils.py,sha256=GZCkAfgw2tVJRJ_Gw5Yewc14ebiE9wSImPiYQN8FsW0,75095
+virgo_modules/src/ticketer_source.py,sha256=528WhGoANOm4IKnxGSWsbQxxUh3-qlZfvGRNAafMMcE,103883
+virgo_modules/src/transformer_utils.py,sha256=SnYdtsFPnSF6u4UFIat0-X3-qVuUWvv_T46kiB-H0Sk,13682
+virgo_modules/src/edge_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/edge_utils/conformal_utils.py,sha256=cKm4KSM261Eu1FJn4oowKYiKIesW81VbqITIvopGSVk,5410
+virgo_modules/src/edge_utils/edge_utils.py,sha256=4uXVWthzJDzkJ4Uq19ZYL9aPcA6CDUS3xYD4FY-a2AM,20018
+virgo_modules/src/edge_utils/feature_selection.py,sha256=HYbQ0JLPDiRYhn-5-C438YEKbuNduDmuvboFC_VkHww,2453
+virgo_modules/src/edge_utils/shap_utils.py,sha256=FgcHkfddvdFSeUqEubYa2ExRGVAWSthqK4b-eKagEmo,2333
+virgo_modules/src/edge_utils/stack_model.py,sha256=QqE91uLo2KauGEj91AVNANB1xE7J4Fa49YOX7k5mFng,4257
+virgo_modules/src/market/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+virgo_modules/src/market/market_tools.py,sha256=vBt66_7E3ANz7avzfeNw_RHMGvG9lh5PRhxmcf_Oyjc,6880
+virgo_modules-0.8.0.dist-info/licenses/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
+virgo_modules-0.8.0.dist-info/METADATA,sha256=sCkdOmbxrEEXvGUIwh6vIl_vIcue5C0BbvRtvP9yows,1122
+virgo_modules-0.8.0.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
+virgo_modules-0.8.0.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
+virgo_modules-0.8.0.dist-info/RECORD,,
virgo_modules-0.6.1.dist-info/RECORD
DELETED
@@ -1,19 +0,0 @@
-virgo_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-virgo_modules/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-virgo_modules/src/aws_utils.py,sha256=q0l7D7ofo09Lu1QQjv-esheQ06uiSy1Pdq3xMul8zvk,2571
-virgo_modules/src/backtester.py,sha256=OhiWyzDX0PthXGuhChyWUmDN3cLkzVYe95zS4nGtia8,22106
-virgo_modules/src/hmm_utils.py,sha256=D7axAnCdSe1_1EgRyli2PAnM2f6699hTY9GcxjPXG-o,21221
-virgo_modules/src/pull_artifacts.py,sha256=5OPrgR7pcMSdpbevDRhf0ebk7g7ZRjff4NpTIIWAKjE,1989
-virgo_modules/src/re_utils.py,sha256=DBY_VBB1wKm5D7znutpF_66CTLZhJfx54h8Ws0YzdN4,74641
-virgo_modules/src/ticketer_source.py,sha256=jxP-OOeoyN2JxRQg-mX6t6WNJXiIrhWKDywDxpYANxU,101977
-virgo_modules/src/transformer_utils.py,sha256=ysCUp3cB3_7Jr9OHDqhg2_6Vu0k1YVjfqbvQNbxpbhI,8990
-virgo_modules/src/edge_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-virgo_modules/src/edge_utils/conformal_utils.py,sha256=cKm4KSM261Eu1FJn4oowKYiKIesW81VbqITIvopGSVk,5410
-virgo_modules/src/edge_utils/edge_utils.py,sha256=7nYPLDNyKqeKIuOOwQi4wsBibzs9gP1HgYMISXJX1Y8,19522
-virgo_modules/src/edge_utils/shap_utils.py,sha256=FgcHkfddvdFSeUqEubYa2ExRGVAWSthqK4b-eKagEmo,2333
-virgo_modules/src/edge_utils/stack_model.py,sha256=QqE91uLo2KauGEj91AVNANB1xE7J4Fa49YOX7k5mFng,4257
-virgo_modules-0.6.1.dist-info/LICENSE,sha256=pNgFyCYgmimaw0o6V20JupZLROycAnOA_HDDh1tX2V4,1097
-virgo_modules-0.6.1.dist-info/METADATA,sha256=9EtSQrm2xy6-S4wGgWwWbL5V7yz-8BV6TlK3G18LyoM,876
-virgo_modules-0.6.1.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-virgo_modules-0.6.1.dist-info/top_level.txt,sha256=ZjI-qEkDtT-8mFwGAWnXfqPOKEGlIhWRW1es1VyXc60,14
-virgo_modules-0.6.1.dist-info/RECORD,,
{virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info/licenses}/LICENSE
File without changes
{virgo_modules-0.6.1.dist-info → virgo_modules-0.8.0.dist-info}/top_level.txt
File without changes