virgo-modules 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of virgo-modules might be problematic.

@@ -53,19 +53,57 @@ from .aws_utils import upload_file_to_aws
  import logging
 
  class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
+
+ """
+ Class that applies the inverse hyperbolic sine for feature transformation.
+ This class is compatible with scikit-learn pipelines.
+
+ Attributes
+ ----------
+ features : list
+ list of features to apply the transformation to
+ prefix : str
+ prefix for the new features. If '' the features are overwritten
+
+ Methods
+ -------
+ fit(X=DataFrame, y=None):
+ fit transformation.
+ transform(X=DataFrame, y=None):
+ apply feature transformation
+ """
+
  def __init__(self, features, prefix = ''):
  self.features = features
  self.prefix = prefix
 
  def fit(self, X, y=None):
  return self
-
+
  def transform(self, X, y=None):
  for feature in self.features:
  X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
  return X
 
  class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
+
+ """
+ Class that applies winsorization of a feature for feature transformation.
+ This class is compatible with scikit-learn pipelines.
+
+ Attributes
+ ----------
+ feature_configs : dict
+ dictionary of features and configurations. The configuration has high and low limits per feature
+
+ Methods
+ -------
+ fit(X=DataFrame, y=None):
+ fit transformation.
+ transform(X=DataFrame, y=None):
+ apply feature transformation
+ """
+
  def __init__(self, feature_configs):
  self.feature_configs = feature_configs
  def fit(self, X, y=None):
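These transformers follow the standard scikit-learn fit/transform contract, so they compose directly with Pipeline. A minimal usage sketch, assuming the classes above are in scope (the DataFrame and column name are invented for illustration):

    import pandas as pd
    from sklearn.pipeline import Pipeline

    # toy frame with a heavy-tailed column; 'volume' is an illustrative name
    df = pd.DataFrame({'volume': [10.0, 1000.0, -50.0, 0.0]})

    # prefix='' overwrites the source column; a non-empty prefix keeps it
    pipe = Pipeline([('asinh', InverseHyperbolicSine(features=['volume'], prefix='asinh_'))])
    out = pipe.fit_transform(df)

    # arcsinh behaves like log for large |x| but is defined at 0 and for negative values
    print(out[['volume', 'asinh_volume']])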
@@ -80,6 +118,24 @@ class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
  return X
 
  class FeatureSelector(BaseEstimator, TransformerMixin):
+
+ """
+ Class that applies selection of features.
+ This class is compatible with scikit-learn pipelines.
+
+ Attributes
+ ----------
+ columns : list
+ list of features to select
+
+ Methods
+ -------
+ fit(X=DataFrame, y=None):
+ fit transformation.
+ transform(X=DataFrame, y=None):
+ apply feature transformation
+ """
+
  def __init__(self, columns):
  self.columns = columns
 
@@ -88,8 +144,19 @@ class FeatureSelector(BaseEstimator, TransformerMixin):
 
  def transform(self, X, y=None):
  return X[self.columns]
-
+
  def sharpe_ratio(return_series):
+
+ '''
+ calculate the Sharpe ratio for a given return series.
+
+ Parameters:
+ return_series (pd.Series): pandas series of the asset returns
+
+ Returns:
+ sharpe (float): Sharpe ratio
+ '''
+
  N = 255 # Trading days in the year (change to 365 for crypto)
  rf = 0.005 # Half a percent risk free rate
  mean = return_series.mean() * N -rf
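The function annualizes the mean return with N = 255 trading days and subtracts a 0.5% risk-free rate; the denominator (cut off in this hunk) is presumably the return standard deviation annualized by sqrt(N). A worked sketch on synthetic returns, under that assumption:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    returns = pd.Series(rng.normal(0.0005, 0.01, 500))  # synthetic daily returns

    N, rf = 255, 0.005                    # same constants as the function above
    mean = returns.mean() * N - rf        # annualized excess return
    sigma = returns.std() * np.sqrt(N)    # standard volatility annualization (assumed)
    sharpe = mean / sigma
    print(round(sharpe, 3))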
@@ -98,12 +165,38 @@ def sharpe_ratio(return_series):
  return sharpe
 
  class signal_combiner(BaseEstimator, TransformerMixin):
+
+ """
+ Class that applies feature combination of binary signals.
+ This class is compatible with scikit-learn pipelines.
+
+ ...
+
+ Attributes
+ ----------
+ columns : list
+ list of features to select
+ drop : boolean
+ drop the combined features
+ prefix_up : str
+ up prefix of the base feature
+ prefix_low : str
+ low prefix of the base feature
+
+ Methods
+ -------
+ fit(X=DataFrame, y=None):
+ fit transformation.
+ transform(X=DataFrame, y=None):
+ apply feature transformation
+ """
+
  def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
  self.columns = columns
  self.drop = drop
  self.prefix_up = prefix_up
  self.prefix_low = prefix_low
-
+
  def fit(self, X, y=None):
  return self
 
@@ -111,7 +204,7 @@ class signal_combiner(BaseEstimator, TransformerMixin):
  for column in self.columns:
  X['CombSignal_'+column] = np.where(
  X[self.prefix_up + column] == 1,
- 1,
+ 1,
  np.where(
  X[self.prefix_low + column] == 1,
  1,
@@ -121,15 +214,29 @@ class signal_combiner(BaseEstimator, TransformerMixin):
  if self.drop:
  X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
  return X
-
+
  def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
-
+
+ '''
+ create a scikit-learn pipeline object using different configurations and feature engineering blocks with a given flow
+
+ Parameters:
+ features_to_drop (list): list of features to drop
+ lag_dict (dict): feature dictionary with configurations to apply lags
+ combine_signals (list): list of columns/signals to combine
+ discretize_columns (list): list of features to discretize, bins is fixed
+ correlation (float): correlation score threshold for feature selection
+
+ Returns:
+ pipe (obj): pipeline object
+ '''
+
  lag_pipe_sec = [(f'lags_{key}', LagFeatures(variables = key, periods = lag_dict[key])) for key in lag_dict] if lag_dict else []
  drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
  merge = [('signal_combiner', signal_combiner(combine_signals))] if combine_signals else []
  discretize = [('discretize',EqualWidthDiscretiser(discretize_columns, bins = 20 ))] if discretize_columns else []
  drop_corr = [('drop_corr', DropCorrelatedFeatures(threshold=correlation))] if correlation else []
-
+
  pipe = Pipeline(
  [('selector', FeatureSelector(features_base))] + \
  [('encoding',OneHotEncoder(top_categories=None, variables=['hmm_feature']))] + \
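Every block except the selector and encoder is optional and toggled by its argument. A hypothetical call showing how the configuration switches map to pipeline steps (feature names are invented; feature_engine and scikit-learn are the dependencies this module imports):

    # hypothetical configuration; falsy arguments skip the corresponding block
    pipe = data_processing_pipeline(
        features_base=['log_return', 'RSI', 'hmm_feature'],
        features_to_drop=['RSI'],          # DropFeatures block
        lag_dict={'log_return': [1, 5]},   # LagFeatures block, per-feature lag lists
        combine_signals=False,             # skip signal_combiner
        discretize_columns=False,          # skip EqualWidthDiscretiser
        correlation=0.77,                  # DropCorrelatedFeatures threshold
    )
    # pipe.fit_transform(df) then runs selector -> encoding -> lags -> drop -> drop_corr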
@@ -143,6 +250,18 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
  return pipe
 
  def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
+ '''
+ calculate relevance score and summary report for the hmm model
+
+ Parameters:
+ default_benchmark_sd (float): default value to bias SD for the t calculation
+ t_threshold (float): alpha or z threshold for the normalized score
+
+ Returns:
+ mean_relevance (float): mean relevance score of the states
+ cluster_returns (pd.DataFrame): summary report of the analysis
+ number_relevant_states (int): number of relevant states
+ '''
  ## lengths
  cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
  cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
@@ -151,7 +270,7 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
  return x.quantile(0.25)
  def quantile3(x):
  return x.quantile(0.75)
-
+
  cluster_returns = data.groupby('hmm_feature').agg(
  n_uniques = ('chain_id','nunique'),
  n_obs = ('Date','count'),
@@ -171,14 +290,14 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
  cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
  cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
  cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
+
  ## relevance or importance
  # naive approach
  cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
  cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
  cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
  cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
+
  mean_relevance = cluster_returns['abs_t_accpted'].mean()
  number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
 
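The relevance statistic is a t-like score: the median chain return over a standard error built from the IQR plus a small benchmark-SD bias, roughly t = median / sqrt(iqr/n + sd_benchmark/n). A tiny numeric sketch with made-up per-state summary values:

    import numpy as np

    median_ret, iqr, n = 0.004, 0.0009, 120      # made-up per-state summaries
    default_benchmark_sd, t_threshold = 0.00003, 2

    t_calc = (median_ret - 0) / np.sqrt(iqr / n + default_benchmark_sd / n)
    print(round(t_calc, 2), abs(t_calc) > t_threshold)  # state counts as relevant only if True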
@@ -186,20 +305,161 @@ def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2
 
 
  class stock_eda_panel(object):
-
+
+ """
+ Class that initially gets stock data and then applies feature engineering, enrichment, analysis, plotting, model training, etc.
+
+ Attributes
+ ----------
+ stock_code : str
+ symbol of the asset
+ n_days : str
+ number of days to extract data
+ data_window : str
+ large window to extract data. A large window is required to extract more data. e.g. '5y', '10y', '15y'
+ df : pd.DataFrame
+ Pandas dataframe of the asset data with features
+ strategy_log: pd.DataFrame
+ Pandas dataframe that has the results of different tested strategies (result from strategy simulator hmm)
+ best_strategy: list
+ features of the best performing strategy (result from strategy simulator hmm)
+ top_10_strategy: dict
+ top 10 best performing strategies (result from strategy simulator hmm)
+ settings: dict
+ configuration dictionary of the features and other parameters
+
+ Methods
+ -------
+ augmented_dickey_fuller_statistics(time_series=pd.Series, label=str):
+ Perform Dickey-Fuller (stationarity) test for a given time series
+ It will print the p-value of the series
+ get_data():
+ Get asset data performing some data normalization or formatting (in case of dates)
+ plot_series_returns(roll_mean_lags1=int, roll_mean_lags2=int)
+ Display a plot of the time series with rolling means and rolling standard deviations of daily closing prices
+ seasonal_plot():
+ Display time series split by year
+ plot_price_signal(feature=str, feature_2=str, opacity=float):
+ Display bottom and roof signals over the closing prices
+ volatility_analysis(lags=int, trad_days=int, window_log_return=int, plot=boolean, save_features=boolean):
+ this method performs log return and volatility analysis of the closing prices
+ find_lag(feature=str, lag_list=list, column_target=str,posterior_lag=int, test_size=int):
+ displays correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
+ outlier_plot(zlim=float, plot=boolean, save_features=boolean):
+ perform outlier analysis of the log returns. It also performs a normality test of returns
+ analysis_roll_mean_log_returns(lags=int, plot=boolean):
+ perform analysis of lags of the mean rolling log return
+ compute_clip_bands(feature_name=str,threshold=float):
+ compute outlier detection for a given signal. Note that this follows a mean reversion procedure and the feature has to be stationary. The resulting bottom and roof signals are attached to the dataframe
+ signal_plotter(feature_name=str):
+ display analysis plot of a feature with high and low signals
+ log_features_standard(feature_name=str):
+ save resulting feature names in a standard structure
+ relative_spread_MA(ma1=int, ma2=int, threshold=float, plot=boolean, save_features=boolean):
+ perform relative moving average features, one for short term and another for long/mid term
+ pair_feature(pair_symbol=str, plot=boolean):
+ initialize pair feature data extraction and analysis
+ calculate_cointegration(series_1=pd.series, series_2=pd.series):
+ calculate cointegration score for two time series
+ bidirect_count_feature(rolling_window=int, threshold=float, plot=boolean, save_features=boolean):
+ perform negative and positive return counting in a given rolling time window
+ get_relative_range_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+ perform relative spread of opening and closing price
+ rsi_feature_improved(window=int, threshold=float, plot=boolean, save_features=boolean):
+ perform relative strength index
+ days_features_bands(window=int, threshold=float, plot=boolean, save_features=boolean):
+ compute mean returns for a given day of the week in a window scope per day
+ analysis_smooth_volume(window=int, threshold=float, plot=boolean, save_features=boolean):
+ compute feature of trading volumes
+ roc_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+ perform price rate of change
+ stoch_feature(window=int, smooth1=int, smooth2=int, threshold=float, plot=boolean, save_features=boolean):
+ perform stochastic RSI oscillator feature
+ stochastic_feature(window=int, smooth=int, threshold=float, plot=boolean, save_features=boolean):
+ perform stochastic oscillator feature
+ william_feature(lbp=int, threshold=float, plot=boolean, save_features=boolean):
+ perform fast stochastic oscillator or Williams %R indicator
+ vortex_feature(window=int, threshold=float, plot=boolean, save_features=boolean):
+ perform vortex oscillator
+ pair_index_feature(pair_symbol=str, feature_label=str, window=int, threshold=float, plot=boolean, save_features=boolean):
+ perform additional asset ROC feature, then a new feature is created in the main dataframe
+ produce_order_features(feature_name=str, save_features=boolean):
+ perform a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
+ create_hmm_derived_features():
+ create features derived from hmm state features. Features are the index of the state, the duration of the state, and the chain return
+ cluster_hmm_analysis(n_clusters=int,features_hmm=list, test_data_size=int, seed=int, lag_returns_state=int, plot=boolean, save_features=boolean, model=obj):
+ create or use a hmm model
+ sharpe_ratio(return_series=pd.Series, n_trad_days=int, rf=float):
+ compute the Sharpe ratio of a given time series of returns
+ treat_signal_strategy(test_data=pd.DataFrame, strategy=list):
+ helper method that treats signals and converts signals to 1 or 0
+ stategy_simulator(features=list, hmm_feature=boolean):
+ execute strategy and get some performance metrics like Sharpe ratio and return
+ viz_strategy(strategy):
+ display analysis plot of a given strategy
+ deep_dive_analysis_hmm(test_data_size=int, split=str):
+ display analysis plot of the hmm model
+ get_targets(steps=int):
+ produce regression target return taking future prices
+ get_categorical_targets(horizon=int, flor_loss=float, top_gain=float):
+ produce binary target return taking future prices. It produces two targets, one for high returns and another for low returns
+ get_configurations(test_data_size=int, val_data_size=int, model_type=str):
+ produce the configuration dictionary saved by the feature generation methods when save_features was activated
+ """
+
  def __init__(self, stock_code, n_days, data_window = '5y'):
+
+ """
+ Initialize object
+
+ Parameters
+ ----------
+ stock_code (str): symbol of the asset
+ n_days (str): number of days to extract data
+ data_window (str): large window to extract data. A large window is required to extract more data. e.g. '5y', '10y', '15y'
+
+ Returns
+ -------
+ None
+ """
+
  self.stock_code = stock_code
  self.n_days = n_days
  self.today = datetime.date.today()
  self.features = list()
  self.signals = list()
  self.data_window = data_window
-
+
  def augmented_dickey_fuller_statistics(self,time_series, label):
+ """
+ Perform Dickey-Fuller (stationarity) test for a given time series
+ It will print the p-value of the series
+
+ Parameters
+ ----------
+ time_series (pd.Series): pandas series of the time series
+ label (str): feature name
+
+ Returns
+ -------
+ None
+ """
  result = adfuller(time_series.dropna().values)
  print('p-value: {} for the series {}'.format(round(result[1],6), label))
-
+
  def get_data(self):
+ """
+ Get asset data performing some data normalization or formatting (in case of dates)
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ """
+
  begin_date = self.today - relativedelta(days = self.n_days)
  begin_date_str = begin_date.strftime('%Y-%m-%d')
 
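A minimal usage sketch for the panel object (the import path and the 'AAPL' symbol are illustrative assumptions, not confirmed by this diff):

    from virgo_modules.src.ticketer_source import stock_eda_panel  # hypothetical path

    panel = stock_eda_panel(stock_code='AAPL', n_days=900, data_window='5y')
    panel.get_data()                     # populates panel.df with cleaned OHLCV data
    panel.volatility_analysis(lags=1, trad_days=255, window_log_return=30)

    # prints the Dickey-Fuller p-value for the derived series
    panel.augmented_dickey_fuller_statistics(panel.df['log_return'], 'log_return')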
@@ -210,7 +470,7 @@ class stock_eda_panel(object):
  df.reset_index(inplace=True)
  df['Date'] = pd.to_datetime(df['Date'], format='mixed',utc=True).dt.date
  df['Date'] = pd.to_datetime(df['Date'])
-
+
  df = df[df.Date >= begin_date_str ]
  self.settings_general = {
  'n_days':self.n_days,
@@ -219,44 +479,56 @@ class stock_eda_panel(object):
  'execution_date': self.today.strftime('%Y-%m-%d')
  }
  self.df = df
-
+
  ### cleaning volume
  ### volume cleaning
  self.df['Volume'] = np.where(self.df['Volume'] <= 10, np.nan, self.df['Volume'])
  self.df['Volume'] = self.df['Volume'].fillna(method='bfill')
-
+
  ## filling
-
+
  base_columns_unit_test = ['Open','High','Low','Close','Volume']
  self.df[base_columns_unit_test] = self.df[base_columns_unit_test].fillna(method='ffill')
-
+
  ## cleaning nulls
-
+
  xs = self.df[base_columns_unit_test].isnull().sum()/self.df[base_columns_unit_test].count()
  reject_columns = list(xs[xs > 0.5].index.values)
-
+
  if len(reject_columns) > 0:
  logging.warning("the following columns have many nulls and are dropped: {}".format(reject_columns))
  self.df = self.df.drop(columns = reject_columns)
-
-
+
  def plot_series_returns(self,roll_mean_lags1,roll_mean_lags2):
-
+
+ """
+ Display a plot of the time series with rolling means and rolling standard deviations of daily closing prices
+
+ Parameters
+ ----------
+ roll_mean_lags1 (int): short term window
+ roll_mean_lags2 (int): mid/long term window
+
+ Returns
+ -------
+ None
+ """
+
  df = self.df
  begin_date = self.today - relativedelta(days = self.n_days)
  begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
  ### getting rolling mean
  df["Close_roll_mean"] = (
  df.sort_values("Date")["Close"]
  .transform(lambda x: x.rolling(roll_mean_lags1, min_periods=1).mean())
  )
-
+
  df["Close_roll_mean_2"] = (
  df.sort_values("Date")["Close"]
  .transform(lambda x: x.rolling(roll_mean_lags2, min_periods=1).mean())
  )
-
+
  ### getting rolling stdv
  df["Close_roll_std"] = (
  df.sort_values("Date")["Close"]
@@ -273,7 +545,7 @@ class stock_eda_panel(object):
  ))
 
  fig.add_trace(go.Scatter(x=df['Date'], y=df.Close, marker_color = 'blue', name='Price'),row=1, col=1)
-
+
  fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean, marker_color = 'black', name='roll mean' ),row=1, col=1)
  fig.add_trace(go.Scatter(x=df['Date'], y=df.Close_roll_mean_2, marker_color = 'grey', name='roll mean 2' ),row=1, col=1)
  fig.add_trace(go.Scatter(x=df['Date'], y=df.lower, marker_color = 'pink',legendgroup='bound', name='bound' ),row=1, col=1)
@@ -281,8 +553,21 @@ class stock_eda_panel(object):
 
  fig.update_layout(height=500, width=1200, title_text=f"stock {self.stock_code} visualization")
  fig.show()
-
+
  def seasonal_plot(self):
+
+ """
+ Display time series split by year
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ """
+
  df = self.df
  years = list(df['Date'].dt.year.unique())
  years.sort()
@@ -302,10 +587,24 @@ class stock_eda_panel(object):
 
  fig.update_layout(height=500, width=1400, title_text=f"stock {self.stock_code} seasonal visualization")
  fig.show()
-
+
  def plot_price_signal(self, feature, feature_2 = '', opacity = 0.3):
-
- signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
+
+ """
+ Display bottom and roof signals over the closing prices
+
+ Parameters
+ ----------
+ feature (str): name of the main feature to plot
+ feature_2 (str): name of the alternative feature to plot
+ opacity (float): opacity degree of the signal points
+
+ Returns
+ -------
+ None
+ """
+
+ signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
  signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
  norm_list = [f'norm_{feature}', f'z_{feature}', feature]
 
@@ -315,14 +614,14 @@ class stock_eda_panel(object):
  if norm_feat in self.df.columns:
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[norm_feat],legendgroup="up", mode='lines',name = norm_feat, marker_color = 'blue'),col = 1, row = 1)
  break
-
-
+
+
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df['Close'], mode='lines',name = 'history', marker_color = 'grey'),col = 1, row = 2)
-
+
  if feature == 'MA_spread':
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma1_column],legendgroup="ma", mode='lines',name = self.ma1_column, marker_color = 'black'),col = 1, row = 2)
  fig.add_trace(go.Scatter(x=self.df['Date'], y=self.df[self.ma2_column],legendgroup="ma", mode='lines',name = self.ma2_column, marker_color = 'grey'),col = 1, row = 2)
-
+
  for norm_feat in norm_list:
  if norm_feat in self.df.columns:
  fig.add_trace(go.Scatter(x=self.df['Date'], y=np.where(self.df[norm_feat] > 0, self.df['Close'], np.nan),legendgroup="up", mode='markers',name = 'up', marker_color = 'green',opacity = opacity),col = 1, row = 2)
@@ -338,8 +637,25 @@ class stock_eda_panel(object):
 
  fig.update_layout(height=900, width=1200)
  fig.show()
-
+
  def volatility_analysis(self, lags, trad_days, window_log_return, plot = False, save_features = False):
+
+ """
+ this method performs log return and volatility analysis of the closing prices
+
+ Parameters
+ ----------
+ lags (int): number of lags to apply to the closing prices
+ trad_days (int): number of trading days to annualize returns or volatility
+ window_log_return (int): window for rolling returns
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
+
  df = self.df
  df['log_return'] = np.log(df.Close/df.Close.shift(lags))
  df['sqr_log_return'] = np.square(df.log_return)
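The core transform is a lagged log return; volatility is then an annualized rolling standard deviation of it. Stated standalone with toy prices:

    import numpy as np
    import pandas as pd

    close = pd.Series([100, 101, 99, 102, 104], dtype=float)  # toy closing prices
    lags, trad_days = 1, 255

    log_ret = np.log(close / close.shift(lags))   # r_t = ln(P_t / P_{t-lags})
    ann_vol = log_ret.std() * np.sqrt(trad_days)  # standard annualization
    print(log_ret.round(4).tolist(), round(ann_vol, 4))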
@@ -349,13 +665,13 @@ class stock_eda_panel(object):
  df.sort_values("Date")["log_return"]
  .transform(lambda x: x.rolling(window_log_return, min_periods=1).mean())
  )
-
+
  if save_features:
  self.features.append('volatility_log_return')
  self.features.append('roll_mean_log_return')
  self.features.append('log_return')
  self.settings_volatility = {'lags':lags, 'trad_days':trad_days, 'window_log_return':window_log_return}
-
+
  if plot:
  fig = make_subplots(rows=3, cols=1,vertical_spacing = 0.02,shared_xaxes=True,
  specs=[
@@ -395,10 +711,26 @@ class stock_eda_panel(object):
 
  self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
  self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
-
-
+
+
  def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):
 
+ """
+ displays correlation curves, using Spearman and Pearson correlation, of a given feature at different time lags with respect to a given target
+
+ Parameters
+ ----------
+ feature (str): feature name to apply lags to
+ lag_list (list): list of lags, each lag as an integer
+ column_target (str): target to correlate against, e.g. return or mean return
+ posterior_lag (int): for the target, posterior window shift to calculate a window return
+ test_size (int): size of the test data. The remainder is used as training data. This parameter is meant to avoid overfitting and leakage
+
+ Returns
+ -------
+ None
+ """
+
  results = dict()
  df = self.df.iloc[:-test_size,:][['Date','Close','roll_mean_log_return','log_return',feature]].sort_values('Date').copy()
  for i,lag in enumerate(lag_list):
@@ -413,7 +745,7 @@ class stock_eda_panel(object):
  'lag':lag,
  'pearsonr_log_return':r_log[0],
  'spearman_log_return': sp_log[0],
- }
+ }
  del df
  results_df = pd.DataFrame(results).T
 
@@ -426,10 +758,24 @@ class stock_eda_panel(object):
  plt.legend()
  plt.axhline(y=0, color='grey', linestyle='--')
  plt.show()
-
-
+
+
  def outlier_plot(self, zlim, plot = False, save_features = False):
-
+
+ """
+ perform outlier analysis of the log returns. It also performs a normality test of returns
+
+ Parameters
+ ----------
+ zlim (float): alpha or z threshold for normalized returns
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
+
  mean = self.df.log_return.mean()
  std = self.df.log_return.std()
  self.df['z_log_return'] = (self.df.log_return - mean)/std
@@ -440,7 +786,7 @@ class stock_eda_panel(object):
  self.df['up_outlier'] = zlim*self.df['z_std_log_return'] + mean_
  self.df['low_outlier'] = -zlim*self.df['z_std_log_return'] + mean_
 
- self.df['signal_low_osutlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
+ self.df['signal_low_outlier'] = np.where( (self.df['z_log_return'] < self.df['low_outlier'] ), 1, 0)
  self.df['signal_up_outlier'] = np.where( (self.df['z_log_return'] > self.df['up_outlier'] ), 1, 0)
  if save_features:
  self.signals.append('signal_low_outlier')
@@ -451,7 +797,7 @@ class stock_eda_panel(object):
  sigma = self.df['z_log_return'].std()
  x = np.linspace(self.df['z_log_return'].min(),self.df['z_log_return'].max(), 15000)
  y = stats.norm.pdf(x, loc = mu, scale = sigma)
-
+
  fig, axs = plt.subplots(2, 1,figsize=(15,8))
 
  axs[0].hist(self.df['z_log_return'],density = True,bins = 100 , label = 'Returns distribution')
@@ -460,7 +806,7 @@ class stock_eda_panel(object):
  axs[0].axvline(l2, color='green', linestyle='--')
  axs[0].axvline(-l2, color='green', linestyle='--')
  axs[0].plot(x,y, linewidth = 3, color = 'r', label = 'Normal Dist Curve')
-
+
  axs[1].plot(self.df['Date'],self.df['z_log_return'])
  axs[1].plot(self.df['Date'],self.df['low_outlier'], linestyle='--')
  axs[1].plot(self.df['Date'],self.df['up_outlier'], linestyle='--')
@@ -469,18 +815,31 @@ class stock_eda_panel(object):
  plt.show()
 
  z_stat, p_stat = stats.normaltest(self.df['z_log_return'].dropna())
- p_stat = round(p_stat, 7)
+ p_stat = round(p_stat, 7)
  print('---------------------- returns normality tests ----------------------------')
  if p_stat < 0.05:
  print(f'pvalue: {p_stat} then, returns do not follow a normal distribution')
  else:
  print(f'pvalue: {p_stat} then, returns follow a normal distribution')
-
+
  def analysis_roll_mean_log_returns(self, lags, plot = False):
 
+ """
+ perform analysis of lags of the mean rolling log return
+
+ Parameters
+ ----------
+ lags (int): lags to apply to the rolling log return
+ plot (boolean): True to display plot
+
+ Returns
+ -------
+ None
+ """
+
  self.df['lag'] = self.df.roll_mean_log_return.shift(lags)
  self.df['Diff'] = self.df['roll_mean_log_return'] - self.df['lag']
-
+
  if plot:
 
  fig, axs = plt.subplots(1, 3,figsize=(19,4))
@@ -493,7 +852,20 @@ class stock_eda_panel(object):
  plt.show()
 
  def compute_clip_bands(self,feature_name,threshold):
-
+
+ """
+ compute outlier detection for a given signal. Note that this follows a mean reversion procedure and the feature has to be stationary. The resulting bottom and roof signals are attached to the dataframe
+
+ Parameters
+ ----------
+ feature_name (str): feature name
+ threshold (float): alpha or z threshold for the normalized feature
+
+ Returns
+ -------
+ None
+ """
+
  self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/self.df[feature_name].std()
  mean_ = self.df[f'norm_{feature_name}'].mean()
 
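The method z-scores the feature and, further down, flags excursions beyond the upper/lower bands as roof and bottom signals. The idea in isolation, using static ±threshold bands as a simplification of the per-feature band columns the method builds:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    s = pd.Series(np.sin(np.linspace(0, 20, 200)) + rng.normal(0, 0.3, 200))
    threshold = 1.95

    norm = (s - s.mean()) / s.std()                # z-score the stationary feature
    signal_up = (norm > threshold).astype(int)     # roof: revert-down candidate
    signal_low = (norm < -threshold).astype(int)   # bottom: revert-up candidate
    print(int(signal_up.sum()), int(signal_low.sum()))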
@@ -507,25 +879,49 @@ class stock_eda_panel(object):
  self.df[f'signal_up_{feature_name}'] = np.where( (self.df[f'norm_{feature_name}'] > self.df[f'upper_{feature_name}'] ), 1, 0)
 
  def signal_plotter(self, feature_name):
+
+ """
+ display analysis plot of a feature with high and low signals
+
+ Parameters
+ ----------
+ feature_name (str): feature name
+
+ Returns
+ -------
+ None
+ """
+
  fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
+
  axs[0].plot(self.df[f'upper_{feature_name}'],color = 'grey', linestyle='--')
  axs[0].plot(self.df[f'lower_{feature_name}'],color = 'grey', linestyle='--')
  axs[0].plot(self.df[f'norm_{feature_name}'])
-
+
  plot_acf(self.df[feature_name].dropna(),lags=25,ax = axs[1])
  axs[1].set_title(f'acf {feature_name}')
-
+
  plot_pacf(self.df[feature_name].dropna(),lags=25,ax = axs[2])
  axs[2].set_title(f'pacf {feature_name}')
-
+
  fig.show()
 
  def log_features_standard(self, feature_name):
+ """
+ save resulting feature names in a standard structure
+
+ Parameters
+ ----------
+ feature_name (str): feature name
+
+ Returns
+ -------
+ None
+ """
  self.features.append(feature_name)
  self.signals.append(f'signal_up_{feature_name}')
  self.signals.append(f'signal_low_{feature_name}')
-
+
  #######################
  #### to be deprecated ####
  def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
@@ -546,7 +942,7 @@ class stock_eda_panel(object):
 
  self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
  self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
+
  ### plotting purposes
  self.df[f"Roll_mean_{ma1}"] = (
  self.df.sort_values("Date")["Close"]
@@ -556,15 +952,15 @@ class stock_eda_panel(object):
  self.df.sort_values("Date")["Close"]
  .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
  )
-
+
 
  print('--------------------------------------------------------------------')
  if save_features:
  self.features.append('MA_spread')
  self.signals.append('signal_low_MA_spread')
  self.signals.append('signal_up_MA_spread')
- self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
+ self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
+
  if plot:
 
  fig, axs = plt.subplots(1, 3,figsize=(21,4))
@@ -581,9 +977,23 @@ class stock_eda_panel(object):
  axs[2].set_title('acf MA_spread series')
  plt.show()
  ##################################################
-
+
  def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
-
+ """
+ perform relative moving average features, one for short term and another for long/mid term
+
+ Parameters
+ ----------
+ ma1 (int): short term moving average window
+ ma2 (int): long/mid term moving average window
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'rel_MA_spread'
 
  self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
@@ -608,13 +1018,26 @@ class stock_eda_panel(object):
  print('--------------------------------------------------------------------')
  if save_features:
  self.log_features_standard(feature_name)
- self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
+ self.settings_relative_spread_ma = {'ma1':ma1, 'ma2':ma2, 'threshold':threshold}
 
  if plot:
 
  self.signal_plotter(feature_name)
-
+
  def pair_feature(self, pair_symbol, plot = False):
+ """
+ initialize pair feature data extraction and analysis
+
+ Parameters
+ ----------
+ pair_symbol (str): symbol of the pair asset to extract
+ plot (boolean): True to display plot
+
+ Returns
+ -------
+ None
+ """
+
  self.pair_symbol = pair_symbol
  begin_date = self.today - relativedelta(days = self.n_days)
  begin_date_str = begin_date.strftime('%Y-%m-%d')
@@ -627,7 +1050,7 @@ class stock_eda_panel(object):
  df['Date'] = pd.to_datetime(df['Date'])
  df = df[df.Date >= begin_date_str ]
  self.pair_df = df
-
+
  #### converting the same index ####
  dates_vector = self.df.Date.to_frame()
  self.pair_df = dates_vector.merge(self.pair_df, on ='Date',how = 'left')
@@ -653,8 +1076,22 @@ class stock_eda_panel(object):
  plt.plot(self.df['Date'],asset_2_values,label = asset_2)
  plt.legend()
  plt.show()
-
+
  def calculate_cointegration(self,series_1, series_2):
+ """
+ calculate cointegration score for two time series
+
+ Parameters
+ ----------
+ series_1 (pd.series): time series
+ series_2 (pd.series): time series
+
+ Returns
+ -------
+ coint_flag (boolean): 1 if the p_value and cointegration t are lower than 0.05 and the critical value
+ hedge_value (float): beta from the regression model
+ """
+
  coint_flag = 0
  coint_res = coint(series_1, series_2)
  coint_t = coint_res[0]
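This wraps the Engle-Granger test from statsmodels; the flag requires both p < 0.05 and a test statistic below the critical value. A self-contained sketch with series that are cointegrated by construction:

    import numpy as np
    from statsmodels.tsa.stattools import coint

    rng = np.random.default_rng(42)
    x = np.cumsum(rng.normal(size=500))              # random walk
    y = 0.8 * x + rng.normal(scale=0.5, size=500)    # cointegrated with x

    coint_t, p_value, critical_values = coint(x, y)
    # index 1 = 5% critical value, matching the method's convention (assumed)
    coint_flag = 1 if p_value < 0.05 and coint_t < critical_values[1] else 0
    print(round(coint_t, 2), round(p_value, 4), coint_flag)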
@@ -666,9 +1103,22 @@ class stock_eda_panel(object):
  coint_flag = 1 if p_value < 0.05 and coint_t < critical_value else 0
 
  return coint_flag, hedge_value
-
- def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
 
+ def produce_pair_score_plot(self, window, z_threshold, plot = False, save_features = False):
+ """
+ display analysis of the pair feature and save results if needed
+
+ Parameters
+ ----------
+ window (int): window to apply to the rolling spread between pair and main asset
+ z_threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  spread_series = pd.Series(self.df.pair_spread)
  mean = spread_series.rolling(center = False, window = window).mean()
  std = spread_series.rolling(center = False, window = window).std()
@@ -677,11 +1127,11 @@ class stock_eda_panel(object):
  self.df['pair_z_score'] = z_score
  self.df['signal_low_pair_z_score'] = np.where(self.df['pair_z_score'] < -z_threshold, 1, 0)
  self.df['signal_up_pair_z_score'] = np.where(self.df['pair_z_score'] > z_threshold, 1, 0)
-
+
  if save_features:
  self.log_features_standard('pair_z_score')
- self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
-
+ self.settings_pair_feature = {'pair_symbol':self.pair_symbol,'window':window, 'z_threshold':z_threshold}
+
  if plot:
  pvalue = round(adfuller(z_score.dropna().values)[1],4)
  print(f'p value of the rolling z-score is {pvalue}')
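The pair signal is a rolling standardization of the spread; crossings of ±z_threshold become the low/up signals. Standalone, with a made-up spread:

    import numpy as np
    import pandas as pd

    spread = pd.Series(np.random.default_rng(7).normal(0, 1, 300)).cumsum()  # toy spread
    window, z_threshold = 30, 1.5

    mean = spread.rolling(window=window).mean()
    std = spread.rolling(window=window).std()
    z = (spread - mean) / std

    signal_low = (z < -z_threshold).astype(int)  # spread unusually low vs recent history
    signal_up = (z > z_threshold).astype(int)
    print(round(float(z.abs().max()), 2))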
@@ -695,7 +1145,7 @@ class stock_eda_panel(object):
  axs[0,0].axhline(y=0, color='blue', linestyle='-.')
  axs[0,0].plot(self.df.pair_z_score)
  axs[0,0].set_title('z score from the spread')
-
+
  axs[0,1].plot(self.df['Date'],self.df['pair_spread'])
  axs[0,1].plot(self.df['Date'],np.where(self.df['signal_low_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'red')
  axs[0,1].plot(self.df['Date'],np.where(self.df['signal_up_pair_z_score'] == 1, self.df['pair_spread'], np.nan),'o-r',color = 'green')
@@ -704,10 +1154,10 @@ class stock_eda_panel(object):
 
  plot_acf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,0])
  axs[1,0].set_title('acf pair_z_score')
-
+
  plot_pacf(self.df['pair_z_score'].dropna(),lags=25, ax=axs[1,1])
  axs[1,1].set_title('pacf pair_z_score')
-
+
  plt.show()
 
  #######################
@@ -725,13 +1175,13 @@ class stock_eda_panel(object):
 
  self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
  self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
+
  if save_features:
  self.features.append('roll_pos_counting')
  self.signals.append('signal_up_roll_pos_counting')
  self.signals.append('signal_low_roll_pos_counting')
- self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
+ self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
+
  if plot:
  fig = plt.figure(figsize = (10,4))
  plt.plot(self.df['Date'],self.df.norm_counting)
@@ -739,9 +1189,22 @@ class stock_eda_panel(object):
  plt.axhline(y=-threshold, color='grey', linestyle='--')
  plt.show()
  #######################
-
+
  def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
+ """
+ perform negative and positive return counting in a given rolling time window
+
+ Parameters
+ ----------
+ rolling_window (int): window to apply to positive and negative returns
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'bidirect_counting'
  # negative counting and rolling counting
  self.df['RetClose'] = self.df['Close'].pct_change()
@@ -757,7 +1220,7 @@ class stock_eda_panel(object):
 
  if save_features:
  self.log_features_standard(feature_name)
- self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
+ self.settings_bidirect_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
 
  if plot:
  fig = plt.figure(figsize = (10,4))
@@ -783,12 +1246,12 @@ class stock_eda_panel(object):
 
  self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
  self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
+
  if save_features:
  self.features.append('dist_range')
  self.signals.append('signal_up_dist_range')
  self.signals.append('signal_low_dist_range')
- self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
+ self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
 
  if plot:
  fig, axs = plt.subplots(2, 2,figsize=(17,11))
@@ -804,9 +1267,22 @@ class stock_eda_panel(object):
  axs[1,0].plot(self.df['norm_dist_range'])
  axs[1,0].set_title('norm_dist_range')
  #######################
-
+
  def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
-
+ """
+ perform relative spread of opening and closing price
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'CO_Range'
  self.df[feature_name] = self.df["Close"] / self.df["Open"]-1
  self.df[f'norm_{feature_name}'] = (self.df[feature_name] - self.df[feature_name].mean())/ self.df[feature_name].std()
@@ -822,7 +1298,7 @@ class stock_eda_panel(object):
 
  if save_features:
  self.log_features_standard(feature_name)
- self.settings_relative_price_range = {'window':window, 'threshold':threshold}
+ self.settings_relative_price_range = {'window':window, 'threshold':threshold}
 
  if plot:
  fig, axs = plt.subplots(1, 2,figsize=(14,5))
@@ -840,7 +1316,7 @@ class stock_eda_panel(object):
  def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
 
  rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
- self.df['RSI'] = rsi
+ self.df['RSI'] = rsi
  self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
 
  mean = self.df['RSI_ret'].mean()
@@ -870,8 +1346,22 @@ class stock_eda_panel(object):
 
  fig.show()
  #######################
-
+
  def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
+ """
+ perform relative strength index
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'RSI'
  rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
  self.df[feature_name] = rsi.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
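The indicator itself comes from the ta library (already a dependency here); the method then cleans infinities, forward-fills warm-up NaNs, and hands the series to compute_clip_bands. The indicator call on its own, with synthetic prices:

    import numpy as np
    import pandas as pd
    from ta.momentum import RSIIndicator

    rng = np.random.default_rng(3)
    close = pd.Series(100 + np.cumsum(rng.normal(0, 1, 60)))

    rsi = RSIIndicator(close=close, window=14).rsi()
    rsi = rsi.replace([np.inf, -np.inf], 0).ffill()  # same cleanup as the method
    print(round(float(rsi.iloc[-1]), 2))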
@@ -883,7 +1373,7 @@ class stock_eda_panel(object):
 
  if plot:
  self.signal_plotter(feature_name)
-
+
  #######################
  #### to be deprecated ####
  def days_features(self, window_day, limit, plot = False, save_features = False):
@@ -916,7 +1406,7 @@ class stock_eda_panel(object):
  if plot:
  fig, axs = plt.subplots(1, 3,figsize=(17,5))
 
- axs[0].plot(self.df['norm_dow_input'])
+ axs[0].plot(self.df['norm_dow_input'])
  axs[0].plot(self.df['up_dow_input'], linestyle='--')
  axs[0].plot(self.df['low_dow_input'], linestyle='--')
 
@@ -928,9 +1418,22 @@ class stock_eda_panel(object):
 
  fig.show()
  #######################
-
- def days_features_bands(self, window, threshold, plot = False, save_features = False):
 
+ def days_features_bands(self, window, threshold, plot = False, save_features = False):
+ """
+ compute mean returns for a given day of the week in a window scope per day
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  self.df['dow'] = self.df.Date.dt.dayofweek
  self.df['dow'] = self.df['dow'].astype('str')
 
@@ -947,11 +1450,11 @@ class stock_eda_panel(object):
 
  if plot:
  self.signal_plotter(feature_name)
-
+
  #######################
  #### to be deprecated ####
  def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
-
+
  self.df['log_Volume'] = np.log(self.df['Volume'])
  self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
 
@@ -1003,9 +1506,22 @@ class stock_eda_panel(object):
 
  plt.show()
  #######################
-
+
  def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
-
+ """
+ compute feature of trading volumes
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'smooth_Volume'
  self.df[feature_name] = np.log(self.df['Volume'])
  # self.df[feature_name] = self.df['log_Volume'].rolling(window).mean()
@@ -1039,7 +1555,7 @@ class stock_eda_panel(object):
 
  fig, axs = plt.subplots(1,2,figsize=(10,4))
 
- axs[0].plot(self.df[f'{feature_name}'])
+ axs[0].plot(self.df[f'{feature_name}'])
  axs[0].set_title(f'{feature_name}')
 
  axs[1].plot(self.df[f'z_{feature_name}'], linestyle='--')
@@ -1048,6 +1564,20 @@ class stock_eda_panel(object):
  plt.show()
 
  def roc_feature(self, window, threshold, plot = False, save_features = False):
+ """
+ perform price rate of change
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'ROC'
  roc = ROCIndicator(close = self.df['Close'], window = window).roc()
  self.df[feature_name] = roc.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
@@ -1058,8 +1588,24 @@ class stock_eda_panel(object):
  self.settings_roc_feature = {'window':window, 'threshold':threshold}
  if plot:
  self.signal_plotter(feature_name)
-
+
  def stoch_feature(self, window, smooth1, smooth2, threshold, plot = False, save_features = False):
+ """
+ perform stochastic RSI oscillator feature
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ smooth1 (int): smoothing parameter 1
+ smooth2 (int): smoothing parameter 2
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'STOCH'
  stoch = StochRSIIndicator(close = self.df['Close'], window = window, smooth1=smooth1, smooth2=smooth2).stochrsi()
  self.df[feature_name] = stoch.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
@@ -1072,6 +1618,21 @@ class stock_eda_panel(object):
  self.signal_plotter(feature_name)
 
  def stochastic_feature(self, window, smooth, threshold, plot = False, save_features = False):
+ """
+ perform stochastic oscillator feature
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ smooth (int): smoothing parameter
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'STOCHOSC'
  stochast = StochasticOscillator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window,smooth_window=smooth).stoch()
  self.df[feature_name] = stochast.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
@@ -1084,8 +1645,22 @@ class stock_eda_panel(object):
  self.signal_plotter(feature_name)
 
  def william_feature(self, lbp, threshold, plot = False, save_features = False):
+ """
+ perform fast stochastic oscillator or Williams %R indicator
+
+ Parameters
+ ----------
+ lbp (int): look-back period
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'WILL'
- will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
+ will = WilliamsRIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], lbp = lbp).williams_r()
  self.df[feature_name] = will.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
  self.compute_clip_bands(feature_name,threshold)
 
@@ -1096,6 +1671,20 @@ class stock_eda_panel(object):
  self.signal_plotter(feature_name)
 
  def vortex_feature(self, window, threshold, plot = False, save_features = False):
+ """
+ perform vortex oscillator
+
+ Parameters
+ ----------
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  feature_name = 'VORTEX'
  vortex = VortexIndicator(close = self.df['Close'], high = self.df['High'], low = self.df['Low'], window = window).vortex_indicator_diff()
  self.df[feature_name] = vortex.replace([np.inf, -np.inf], 0).fillna(method = 'ffill')
@@ -1108,10 +1697,26 @@ class stock_eda_panel(object):
  self.signal_plotter(feature_name)
 
  def pair_index_feature(self, pair_symbol, feature_label, window, threshold, plot = False, save_features = False):
+ """
+ perform additional asset ROC feature, then a new feature is created in the main dataframe
+
+ Parameters
+ ----------
+ pair_symbol (str): symbol of the asset to extract the data
+ feature_label (str): name of the resulting feature
+ window (int): window to apply to the feature
+ threshold (float): alpha or z threshold for the normalized feature
+ plot (boolean): True to display plot
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  self.pair_index = pair_symbol
  begin_date = self.today - relativedelta(days = self.n_days)
  begin_date_str = begin_date.strftime('%Y-%m-%d')
-
+
  if feature_label in self.df.columns:
  self.df = self.df.drop(columns = [feature_label])
 
@@ -1123,13 +1728,13 @@ class stock_eda_panel(object):
  df['Date'] = pd.to_datetime(df['Date'])
  df = df[df.Date >= begin_date_str ]
  self.pair_index_df = df
-
+
  #### converting the same index ####
  dates_vector = self.df.Date.to_frame()
  self.pair_index_df = dates_vector.merge(self.pair_index_df, on ='Date',how = 'left')
  self.pair_index_df = self.pair_index_df.fillna(method = 'bfill')
  self.pair_index_df = self.pair_index_df.fillna(method = 'ffill')
-
+
  self.pair_index_df[feature_label] = ROCIndicator(close = self.pair_index_df['Close'], window = window).roc()
  df_to_merge = self.pair_index_df[['Date',feature_label]]
  self.df = self.df.merge(df_to_merge, on ='Date',how = 'left')
@@ -1140,7 +1745,7 @@ class stock_eda_panel(object):
  if save_features:
  self.log_features_standard(feature_label)
  parameters = {feature_label:{'pair_symbol':pair_symbol, 'feature_label':feature_label, 'window':window,'threshold':threshold}}
- try:
+ try:
  len(self.settings_pair_index_feature)
  print('existing')
  self.settings_pair_index_feature.append(parameters)
@@ -1153,10 +1758,21 @@ class stock_eda_panel(object):
  self.signal_plotter(feature_label)
 
  def produce_order_features(self, feature_name, save_features = False):
+ """
+ perform a feature that captures high and low values in an index. This is useful to know the duration/persistence of a signal
 
+ Parameters
+ ----------
+ feature_name (str): name of the feature
+ save_features (boolean): True to save feature configuration and feature names
+
+ Returns
+ -------
+ None
+ """
  signal_feature_name = f'discrete_signal_{feature_name}'
  order_feature_name = f'order_signal_{feature_name}'
-
+
  self.df[signal_feature_name] = np.where(
  self.df[f'signal_up_{feature_name}'] == 1,1,
  np.where(
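The body below assigns a chain id every time the discrete signal changes and then ranks rows within each chain, so the order value measures how long a signal has persisted. The chain-indexing trick in isolation (cumcount stands in for the rank-by-date used in the method):

    import pandas as pd

    df = pd.DataFrame({'signal': [0, 1, 1, 1, 0, 0, -1, -1]})

    # a chain breaks whenever the signal changes value
    df['break'] = (df['signal'] != df['signal'].shift(1)).astype(int)
    df['chain_id'] = df['break'].cumsum()

    # position inside each chain: 1, 2, 3, ... measures persistence
    df['order'] = df.groupby('chain_id').cumcount() + 1
    df['order'] = df['order'] * df['signal']  # signed by direction, zeroed when no signal
    print(df[['signal', 'order']])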
@@ -1173,14 +1789,24 @@ class stock_eda_panel(object):
1173
1789
  self.df[order_feature_name] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1174
1790
  self.df[order_feature_name] = self.df[order_feature_name]*self.df[signal_feature_name]
1175
1791
  self.df = self.df.drop(columns = [f'lag_{signal_feature_name}', 'breack', "chain_id"])
1176
-
1792
+
1177
1793
  ## saving features
1178
1794
  if save_features:
1179
1795
  self.signals.append(signal_feature_name)
1180
1796
  self.signals.append(order_feature_name)
1181
-
1797
+
1182
1798
  def create_hmm_derived_features(self, lag_returns):
1799
+ """
1800
+ create features derived from the hmm state feature: the index of the state, the duration of the state, and the chain return
1801
+
1802
+ Parameters
1803
+ ----------
1804
+ lag_returns (int): lag parameter (not used)
1183
1805
 
1806
+ Returns
1807
+ -------
1808
+ None
1809
+ """
1184
1810
  self.df = self.df.sort_values('Date')
1185
1811
  ## indexing chains
1186
1812
  self.df['lag_hmm_feature'] = self.df['hmm_feature'].shift(1)
@@ -1189,7 +1815,7 @@ class stock_eda_panel(object):
1189
1815
  self.df["chain_id"] = np.where(self.df['breack'] == 1,self.df["chain_id"],np.nan)
1190
1816
  self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
1191
1817
  self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1192
-
1818
+
1193
1819
  ### returns using the first element in a chain
1194
1820
  self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
1195
1821
  self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
@@ -1198,8 +1824,26 @@ class stock_eda_panel(object):
1198
1824
  self.df = self.df.drop(columns = ['breack','first'])
1199
1825
 
1200
1826
  def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
1827
+ """
1828
+ create or use an hmm model
1829
+
1830
+ Parameters
1831
+ ----------
1832
+ n_clusters (int): number of clusters or states to calculate
1833
+ features_hmm (list): features to be considered in the hmm model when training
1834
+ test_data_size (int): size of the test data; the remainder is used as training data
1835
+ seed (int): seed for the model initialization
1836
+ lag_returns_state (int): lags for the state returns
1837
+ plot (boolean): True to display hmm states analysis
1838
+ save_features (boolean): True to save features and configurations
1839
+ model (obj): if provided, no model will be trained and the provided model will be used to get hmm features
1840
+
1841
+ Returns
1842
+ -------
1843
+ None
1844
+ """
1201
1845
  if not model:
1202
-
1846
+
1203
1847
  df_new = self.df
1204
1848
  pipeline_hmm = Pipeline([
1205
1849
  ('selector', FeatureSelector(columns=features_hmm)),
@@ -1213,7 +1857,7 @@ class stock_eda_panel(object):
1213
1857
 
1214
1858
  self.model_hmm = pipeline_hmm
1215
1859
  self.test_data_hmm = data_test
1216
-
1860
+
1217
1861
  ### first feature: the hidden state
1218
1862
  self.df['hmm_feature'] = self.model_hmm.predict(self.df)
1219
1863
  self.create_hmm_derived_features(lag_returns = lag_returns_state)
@@ -1230,11 +1874,11 @@ class stock_eda_panel(object):
1230
1874
  hidden_states = pipeline_hmm.predict(data_test)
1231
1875
  data_test['HMM'] = hidden_states
1232
1876
  data_test['HMM_state'] = data_test['HMM'].map(map_)
1233
-
1877
+
1234
1878
  if model:
1235
1879
  self.df['hmm_feature'] = model.predict(self.df)
1236
1880
  self.create_hmm_derived_features(lag_returns = lag_returns_state)
1237
-
1881
+
1238
1882
  if save_features:
1239
1883
  self.features.append('hmm_feature')
1240
1884
  self.features.append('hmm_chain_order')
@@ -1263,14 +1907,38 @@ class stock_eda_panel(object):
1263
1907
  fig.show()
1264
1908
 
1265
1909
  def sharpe_ratio(self, return_series, n_trad_days = 255, rf = 0.01):
1910
+ """
1911
+ compute the sharpe ratio of a given return time series
1912
+
1913
+ Parameters
1914
+ ----------
1915
+ return_series (pd.series): time series of the returns
1916
+ n_trad_days (int): trading days used to annualize returns
1917
+ rf (float): annual risk-free rate
1918
+
1919
+ Returns
1920
+ -------
1921
+ sharpe_ratio (float): sharpe ratio
1922
+ """
1266
1923
  nsqrt = np.sqrt(n_trad_days)
1267
1924
  mean = return_series.mean() * n_trad_days
1268
1925
  sigma = return_series.std() * nsqrt
1269
1926
  sharpe_ratio = round((mean-rf)/sigma,2)
1270
1927
  return sharpe_ratio
1271
-
1928
+
1272
1929
  def treat_signal_strategy(self,test_data, strategy):
1273
-
1930
+ """
1931
+ helper method that treats the signal features and converts them to 1 or 0
1932
+
1933
+ Parameters
1934
+ ----------
1935
+ test_data (pd.DataFrame): test data
1936
+ strategy (list): features that compose the strategy
1937
+
1938
+ Returns
1939
+ -------
1940
+ test_data (pd.DataFrame): test data with extra columns that are the strategy (main_signal)
1941
+ """
1274
1942
  hmm_states_list = [x for x in strategy if 'hmm_state_' in x]
1275
1943
  other_features = [x for x in strategy if x not in hmm_states_list]
1276
1944
 
@@ -1299,10 +1967,21 @@ class stock_eda_panel(object):
1299
1967
  elif len(hmm_states_list) == 0 and len(other_features) > 0:
1300
1968
  test_data['main_signal'] = np.where((test_data['features_signal'] == 1) & (test_data['hmm_signal'] == 0),1,0)
1301
1969
 
1302
- return test_data
1970
+ return test_data
1303
1971
 
1304
1972
  def stategy_simulator(self, features, hmm_feature = True):
1973
+ """
1974
+ execute strategies and compute performance metrics such as sharpe ratio and return. This method creates several new attributes
1305
1975
 
1976
+ Parameters
1977
+ ----------
1978
+ features (list): list of features to be tested as strategies
1979
+ hmm_feature (boolean): include hmm feature
1980
+
1981
+ Returns
1982
+ -------
1983
+ None
1984
+ """
1306
1985
  columns_ = ['Date', 'Close','Open'] + features + ['HMM']
1307
1986
  states = list(self.df.hmm_feature.unique())
1308
1987
  states.sort()
@@ -1372,8 +2051,19 @@ class stock_eda_panel(object):
1372
2051
  self.strategy_log = df_returns_log
1373
2052
  self.best_strategy = df_returns_log.iloc[0,:].strategy
1374
2053
  self.top_10_strategy = list(df_returns_log.iloc[0:10,:].strategy.values)
1375
-
2054
+
1376
2055
  def viz_strategy(self, strategy):
2056
+ """
2057
+ display analysis plot of a given strategy
2058
+
2059
+ Parameters
2060
+ ----------
2061
+ strategy (list): list of features of the strategy
2062
+
2063
+ Returns
2064
+ -------
2065
+ None
2066
+ """
1377
2067
  test_data = self.test_data_strategy
1378
2068
 
1379
2069
  test_data = self.treat_signal_strategy(test_data, strategy)
@@ -1408,7 +2098,7 @@ class stock_eda_panel(object):
1408
2098
 
1409
2099
  ### deprecated ############################
1410
2100
  def create_strategy(self, favourable_states):
1411
-
2101
+
1412
2102
  test_data = self.test_data_hmm
1413
2103
  # add MA signal
1414
2104
  test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
@@ -1452,16 +2142,27 @@ class stock_eda_panel(object):
1452
2142
  plt.plot(test_data['strat_prod_exp'])
1453
2143
  self.settings_hmm_states = {'favourable_states':favourable_states}
1454
2144
  ################################################
1455
-
2145
+
1456
2146
  def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
1457
-
2147
+ """
2148
+ display analysis plots of the hmm model
2149
+
2150
+ Parameters
2151
+ ----------
2152
+ test_data_size (int): test data size; the remainder is the training data
2153
+ split (str): split to assess; options: 'train' or 'test'
2154
+
2155
+ Returns
2156
+ -------
2157
+ None
2158
+ """
1458
2159
  if split == 'train':
1459
2160
  df = self.df.iloc[:-test_data_size,:]
1460
2161
  elif split == 'test':
1461
2162
  df = self.df.iloc[-test_data_size:,:]
1462
2163
 
1463
2164
  ## returns plot
1464
- fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
2165
+ fig = px.box(df.sort_values('hmm_feature'), y = 'chain_return',x = 'hmm_feature', color = 'hmm_feature',
1465
2166
  height=400, width=1000, title = 'returns chain hmm feature')
1466
2167
  fig.add_shape(type='line',x0=-0.5,y0=0,x1=max(df.hmm_feature)+0.5,y1=0,line=dict(color='grey',width=1),xref='x',yref='y')
1467
2168
  fig.show()
@@ -1490,6 +2191,17 @@ class stock_eda_panel(object):
1490
2191
  del df
1491
2192
 
1492
2193
  def get_targets(self, steps):
2194
+ """
2195
+ produce regression return targets using future prices
2196
+
2197
+ Parameters
2198
+ ----------
2199
+ steps (int): number of lags and steps for future returns
2200
+
2201
+ Returns
2202
+ -------
2203
+ None
2204
+ """
1493
2205
  self.targets = list()
1494
2206
  self.target = list()
1495
2207
  columns = list()
@@ -1501,9 +2213,21 @@ class stock_eda_panel(object):
1501
2213
  self.df[f'mean_target'] = self.df[columns].mean(axis=1)
1502
2214
  self.target.append(f'mean_target')
1503
2215
  self.settings_target_lasts = {'steps':steps, 'type':'regression'}
1504
-
2216
+
1505
2217
  def get_categorical_targets(self, horizon, flor_loss, top_gain):
1506
-
2218
+ """
2219
+ produce binary return targets using future prices. It produces two targets, one for high returns and another for low returns
2220
+
2221
+ Parameters
2222
+ ----------
2223
+ horizon (int): number of lags and steps for future returns
2224
+ flor_loss (float): minimum (floor) loss return
2225
+ top_gain (float): maximum gain return
2226
+
2227
+ Returns
2228
+ -------
2229
+ None
2230
+ """
1507
2231
  self.target = list()
1508
2232
  self.targets = list()
1509
2233
  columns = list()
@@ -1535,7 +2259,19 @@ class stock_eda_panel(object):
1535
2259
  self.settings_target_lasts = {'horizon':horizon, 'flor_loss':flor_loss, 'top_gain':top_gain, 'type': 'classification'}
1536
2260
 
1537
2261
  def get_configurations(self,test_data_size =250, val_data_size = 250, model_type = False):
1538
-
2262
+ """
2263
+ produce the configuration dictionary from the settings saved by the feature generation methods when save_features was activated
2264
+
2265
+ Parameters
2266
+ ----------
2267
+ test_data_size (int): test data size
2268
+ val_data_size (int): validation data size
2269
+ model_type (str): model type, options: 'Forecaster','Classifier'
2270
+
2271
+ Returns
2272
+ -------
2273
+ None
2274
+ """
1539
2275
  self.settings = {
1540
2276
  'features':list(set(self.features)),
1541
2277
  'signals' :list(set(self.signals)),
@@ -1547,15 +2283,15 @@ class stock_eda_panel(object):
1547
2283
  'outlier': self.settings_outlier,
1548
2284
  }
1549
2285
  }
1550
-
2286
+
1551
2287
  if model_type in ['Forecaster','Classifier']:
1552
-
2288
+
1553
2289
  target_list = list(set(self.targets))
1554
2290
  target_list.sort()
1555
2291
  self.settings['model_type'] = model_type
1556
2292
  self.settings['target'] = list(set(self.target))
1557
2293
  self.settings['targets'] = target_list
1558
-
2294
+
1559
2295
  ## for now this is hard coded
1560
2296
  feature_list = ['spread_ma','relative_spread_ma','pair_feature','count_features','bidirect_count_features','price_range','relative_price_range','rsi_feature',
1561
2297
  'rsi_feature_v2', 'days_features','days_features_v2', 'volume_feature','smooth_volume', 'roc_feature', 'stoch_feature', 'stochastic_feature',
@@ -1570,7 +2306,7 @@ class stock_eda_panel(object):
1570
2306
  self.settings['settings']['target_lasts'] = self.settings_target_lasts
1571
2307
  except:
1572
2308
  pass
1573
-
2309
+
1574
2310
  try:
1575
2311
  self.settings['settings']['strategies'] = {
1576
2312
  'best_strategy':self.best_strategy,
@@ -1580,48 +2316,189 @@ class stock_eda_panel(object):
1580
2316
  pass
1581
2317
 
1582
2318
  class produce_model:
2319
+ """
2320
+ Class that produces a machine learning model in a scikit-learn pipeline wrapper.
2321
+
2322
+ Attributes
2323
+ ----------
2324
+ data : pd.DataFrame
2325
+ input data
2326
+ X_train : pd.DataFrame
2327
+ y_train : pd.Series
2328
+ X_test : pd.DataFrame
2329
+ y_test : pd.Series
2330
+ X_val : pd.DataFrame
2331
+ y_val : pd.Series
2332
+ pipeline : obj
2333
+ trained pipeline that includes a ml model
2334
+ features_to_model: list
2335
+ features in end step of the pipeline
2336
+
2337
+ Methods
2338
+ -------
2339
+ preprocess(test_data_size=int, target=str, val_data_size=int):
2340
+ prepare data, split train, test, validation data and X and Y
2341
+ get_sample(x=pd.DataFrame, sample=int, max_=int):
2342
+ sample data
+ train_model(pipe=obj, model=obj, cv_=obj):
+ train pipeline
2343
+ """
1583
2344
  def __init__(self,data):
2345
+ """
2346
+ Initialize object
2347
+
2348
+ Parameters
2349
+ ----------
2350
+ data (pd.DataFrame): data
2351
+
2352
+ Returns
2353
+ -------
2354
+ None
2355
+ """
1584
2356
  self.data = data.copy()
1585
-
2357
+
1586
2358
  def preprocess(self, test_data_size, target, val_data_size = False):
1587
-
2359
+ """
2360
+ prepare data, split train, test, validation data and X and Y
2361
+
2362
+ Parameters
2363
+ ----------
2364
+ test_data_size (int): test data size
2365
+ target (str): target column
2366
+ val_data_size (int): validation data size
2367
+
2368
+ Returns
2369
+ -------
2370
+ None
2371
+ """
1588
2372
  train_data, test_data = self.data.iloc[:-test_data_size,:].dropna() , self.data.iloc[-test_data_size:,:].dropna()
1589
-
2373
+
1590
2374
  if val_data_size:
1591
2375
  train_data, val_data = train_data.iloc[:-val_data_size,:], train_data.iloc[-val_data_size:,:]
1592
-
2376
+
1593
2377
  self.test_data = test_data
1594
-
2378
+
1595
2379
  X_train, y_train = train_data.iloc[0:,1:], train_data[target]
1596
2380
  X_test, y_test = test_data.iloc[0:,1:], test_data[target]
1597
2381
  self.X_train = X_train
1598
2382
  self.y_train = y_train
1599
2383
  self.X_test = X_test
1600
2384
  self.y_test = y_test
1601
-
2385
+
1602
2386
  if val_data_size:
1603
2387
  X_val, y_val = val_data.iloc[0:,1:], val_data[target]
1604
2388
  self.X_val = X_val
1605
2389
  self.y_val = y_val
1606
-
2390
+
1607
2391
  def get_sample(self, x, sample, max_=900):
2392
+ """
2393
+ sample data
2394
+
2395
+ Parameters
2396
+ ----------
2397
+ x (pd.DataFrame): input data
2398
+ sample (int): sample size
2399
+ max_ (int): max sample
2400
+
2401
+ Returns
2402
+ -------
2403
+ sample (float): sample fraction to use (1.0 when the data length exceeds max_)
2404
+ """
1608
2405
  length = len(x)
1609
2406
  if length > max_:
1610
2407
  return 1.0
1611
2408
  else:
1612
2409
  return sample
1613
-
2410
+
1614
2411
  def train_model(self, pipe, model, cv_ = False):
2412
+ """
2413
+ train pipeline
2414
+
2415
+ Parameters
2416
+ ----------
2417
+ pipe (obj): pipeline object
2418
+ model (obj): model object
2419
+ cv_ (obj): cross validation procedure
2420
+
2421
+ Returns
2422
+ -------
2423
+ None
2424
+ """
1615
2425
  self.model = model
1616
2426
  self.pipe_transform = pipe
1617
2427
  self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
1618
2428
  self.features_to_model = self.pipe_transform.fit_transform(self.X_train).columns
1619
2429
  self.pipeline.fit(self.X_train, self.y_train)
1620
-
1621
-
2430
+
2431
+
1622
2432
  class hmm_feature_selector():
1623
-
2433
+ """
2434
+ class that is going to train hmm models to perform feature selection
2435
+
2436
+ Attributes
2437
+ ----------
2438
+ data : pd.DataFrame
2439
+ input data
2440
+ n_clusters : int
2441
+ number of clusters to search
2442
+ init_features_hmm : list
2443
+ list of features to consider in the search
2444
+ test_data_size :int
2445
+ test data size, meaning that the remaining is going to be used as training data
2446
+ select_n_features : int
2447
+ number of features to select
2448
+ n_trials : int
2449
+ total number of trials per combination
2450
+ limit_search : int
2451
+ limit number of combinations
2452
+ default_benchmark_sd : float
2453
+ default value to bias standard deviation
2454
+ t_threshold : float
2455
+ alpha or z threshold
2456
+ pipeline_hmm: obj
2457
+ pipeline object of the hmm model
2458
+ features_used_in_model:list
2459
+ features in model
2462
+ feature_combinations: list
2463
+ list of combination of features
2464
+ mean_relevance: float
2465
+ relevance score of the model
2466
+ best_features: list
2467
+ list of best performing features
2468
+
2469
+ Methods
2470
+ -------
2471
+ split_data():
2472
+ split data in train and test
2473
+ train_model(features_hmm=list):
2474
+ train hmm model
2475
+ feature_list_generator():
2476
+ perform combination of features
2477
+ get_error():
2478
+ get error or score of a given model using relevance score
2479
+ execute_selector():
2480
+ select the best combination of features
2481
+ """
1624
2482
  def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
2483
+ """
2484
+ Initialize object
2485
+
2486
+ Parameters
2487
+ ----------
2488
+ data (pd.DataFrame): data
2489
+ n_clusters (int): number of clusters to search
2490
+ init_features_hmm (list): list of features to consider in the search
2491
+ test_data_size (int): test data size, meaning that the remainder is going to be used as training data
2492
+ select_n_features (int): number of features to select
2493
+ n_trials (int): total number of trials per combination
2494
+ limit_search (int): limit number of combinations
2495
+ default_benchmark_sd (float): default value to bias standard deviation
2496
+ t_threshold (float): alpha or z threshold
2497
+
2498
+ Returns
2499
+ -------
2500
+ None
2501
+ """
1625
2502
  self.data = data.copy()
1626
2503
  self.n_clusters = n_clusters
1627
2504
  self.init_features_hmm = init_features_hmm
@@ -1631,36 +2508,77 @@ class hmm_feature_selector():
1631
2508
  self.limit_search= limit_search
1632
2509
  self.default_benchmark_sd = default_benchmark_sd
1633
2510
  self.t_threshold = t_threshold
1634
-
2511
+
1635
2512
  def split_data(self):
1636
-
2513
+ """
2514
+ split data in train and test
2515
+
2516
+ Parameters
2517
+ ----------
2518
+ None
2519
+
2520
+ Returns
2521
+ -------
2522
+ None
2523
+ """
1637
2524
  self.data_train = self.data.iloc[:-self.test_data_size,:]
1638
2525
  self.data_test = self.data.iloc[-self.test_data_size:,:]
1639
-
2526
+
1640
2527
  def train_model(self,features_hmm):
2528
+ """
2529
+ train hmm model
2530
+
2531
+ Parameters
2532
+ ----------
2533
+ features_hmm (list): list of features to be selected in the model
2534
+
2535
+ Returns
2536
+ -------
2537
+ None
2538
+ """
1641
2539
  pipeline_hmm = Pipeline([
1642
2540
  ('selector', FeatureSelector(columns=features_hmm)),
1643
2541
  ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
1644
2542
  ('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
1645
2543
  ])
1646
-
2544
+
1647
2545
  self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
1648
2546
  self.features_used_in_model = features_hmm
1649
-
2547
+
1650
2548
  def feature_list_generator(self):
1651
-
2549
+ """
2550
+ perform combination of features
2551
+
2552
+ Parameters
2553
+ ----------
2554
+ None
2555
+
2556
+ Returns
2557
+ -------
2558
+ None
2559
+ """
1652
2560
  feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
1653
2561
  feature_combinations = list(map(list, feature_combinations))
1654
-
2562
+
1655
2563
  self.feature_combinations = feature_combinations
1656
-
2564
+
1657
2565
  def get_error(self):
1658
-
2566
+ """
2567
+ get error or score of a given model using relevance score
2568
+
2569
+ Parameters
2570
+ ----------
2571
+ None
2572
+
2573
+ Returns
2574
+ -------
2575
+ None
2576
+ """
1659
2577
  self.data_train_ = self.data_train.copy()
1660
-
2578
+
1661
2579
  self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
1662
2580
  self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
1663
-
2581
+
1664
2582
  ## indexing chains
1665
2583
  self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
1666
2584
  self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
@@ -1668,36 +2586,46 @@ class hmm_feature_selector():
1668
2586
  self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
1669
2587
  self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
1670
2588
  self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
1671
-
2589
+
1672
2590
  ### returns using the first element in a chain
1673
2591
  self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
1674
2592
  self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
1675
2593
  self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
1676
-
2594
+
1677
2595
  self.data_train_ = self.data_train_.drop(columns = ['first'])
1678
-
2596
+
1679
2597
  mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
1680
2598
  self.mean_relevance = mean_relevance
1681
-
2599
+
1682
2600
  def execute_selector(self):
1683
-
2601
+ """
2602
+ select the best combination of features
2603
+
2604
+ Parameters
2605
+ ----------
2606
+ None
2607
+
2608
+ Returns
2609
+ -------
2610
+ None
2611
+ """
1684
2612
  self.split_data()
1685
2613
  self.feature_list_generator()
1686
2614
  maxi = -1
1687
2615
  print(f'it is expected {len(self.feature_combinations)} combinations')
1688
2616
  feature_results = dict()
1689
-
2617
+
1690
2618
  if self.limit_search:
1691
2619
  print(f' taking just {self.limit_search} combinations')
1692
2620
  maxi = self.limit_search
1693
-
2621
+
1694
2622
  for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
1695
-
2623
+
1696
2624
  feature_results[f'group_{i}'] = {
1697
2625
  'features':list(features_hmm),
1698
2626
  'relevances':list()
1699
2627
  }
1700
-
2628
+
1701
2629
  for _ in range(self.n_trials):
1702
2630
  try:
1703
2631
  self.train_model(features_hmm)
@@ -1708,18 +2636,54 @@ class hmm_feature_selector():
1708
2636
  feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
1709
2637
  self.feature_results = feature_results
1710
2638
  self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
1711
-
2639
+
1712
2640
  class signal_analyser_object:
1713
-
2641
+ """
2642
+ class that is going to analyse signals
2643
+
2644
+ Attributes
2645
+ ----------
2646
+ data : pd.DataFrame
2647
+ symbol of the asset
2648
+ ticket_name :str
2649
+ asset symbol
2650
+ show_plot : boolean
2651
+ if true show plot for every method
2652
+ save_path : str
2653
+ if true, save results in file
2654
+ save_aws : str
2655
+ if true, export results to remote repo
2656
+ aws_credentials : dict
2657
+ credentials for aws
2658
+ return_fig : boolean
2659
+ if true, methods will return objects
2660
+ create_backtest_signal(days_strategy=list, test_size=int, feature_name=str, high_exit=float, low_exit=float):
2661
+ perform backtest signal analysis
2662
+
2663
+ Methods
2664
+ -------
2665
+ signal_analyser(test_size=int, feature_name=str, days_list=list, threshold=float,verbose=boolean, signal_position=boolean):
2666
+ perform signal analysis and feature extraction
2667
+
2668
+ """
2669
+
1714
2670
  def __init__(self, data,symbol_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False, return_fig = False):
1715
2671
  """
1716
- data: pandas df
1717
- symbol_name: str name of the asset
1718
- show_plot: bool
1719
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
1720
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
1721
- aws_credentials: dict
1722
- return_fig: boolean return the image function as result
2672
+ Initialize object
2673
+
2674
+ Parameters
2675
+ ----------
2676
+ data (pd.DataFrame): data
2677
+ symbol_name (str): name of the asset
2678
+ show_plot (boolean): if true, show plot for every method
2679
+ save_path (str): if set, local path for saving results, e.g. r'C:/path/to/the/file/'
2680
+ save_aws (str): if set, remote key in the s3 bucket, e.g. 'path/to/file/'
2681
+ aws_credentials (dict): credentials for aws
2682
+ return_fig (boolean): if true, methods will return objects
2683
+
2684
+ Returns
2685
+ -------
2686
+ None
1723
2687
  """
1724
2688
  self.data = data.copy()
1725
2689
  self.ticket_name = symbol_name
@@ -1730,6 +2694,22 @@ class signal_analyser_object:
1730
2694
  self.return_fig = return_fig
1731
2695
 
1732
2696
  def signal_analyser(self, test_size, feature_name, days_list, threshold = 0.05,verbose = False, signal_position = False):
2697
+ """
2698
+ perform signal analysis and feature extraction
2699
+
2700
+ Parameters
2701
+ ----------
2702
+ test_size (int): test data size
2703
+ feature_name (str): name of the feature to assess
2704
+ days_list (list): list of day horizons to assess, e.g. [3,8,10]
2705
+ threshold (float): alpha or z threshold
2706
+ verbose (boolean): print metrics
2707
+ signal_position (int): if set, the signal is taken at the given step within the signal chain
2708
+
2709
+ Returns
2710
+ -------
2711
+ None
2712
+ """
1733
2713
  data = self.data
1734
2714
  self.feature_name = feature_name
1735
2715
  up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
@@ -1745,10 +2725,10 @@ class signal_analyser_object:
1745
2725
  returns_list.append(feature_)
1746
2726
 
1747
2727
  df['signal_type'] = np.where(
1748
- df[up_signal] == 1,
1749
- 'up',
2728
+ df[up_signal] == 1,
2729
+ 'up',
1750
2730
  np.where(
1751
- df[low_signal] == 1,
2731
+ df[low_signal] == 1,
1752
2732
  'down',
1753
2733
  None
1754
2734
  )
@@ -1772,7 +2752,7 @@ class signal_analyser_object:
1772
2752
 
1773
2753
  df = df.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1774
2754
  self.df_signal = df
1775
-
2755
+
1776
2756
  n_signals_up = len(list(df[df.signal_type == 'up'].chain_id.unique()))
1777
2757
  n_signals_down = len(list(df[df.signal_type == 'down'].chain_id.unique()))
1778
2758
  p_scores = list()
@@ -1788,7 +2768,7 @@ class signal_analyser_object:
1788
2768
  sample2 = df_melt[(df_melt.time == evalx) & (df_melt.signal_type == 'down')].value.values
1789
2769
  pvalue = stats.ttest_ind(sample1, sample2).pvalue
1790
2770
  median_down = np.median(sample2)
1791
- median_up = np.median(sample1)
2771
+ median_up = np.median(sample1)
1792
2772
  validations.append(median_up < 0)
1793
2773
  validations.append(median_down > 0)
1794
2774
  p_scores.append(pvalue)
@@ -1830,10 +2810,10 @@ class signal_analyser_object:
1830
2810
  sns.boxplot(data=df_melt, x="time", y="value", hue="signal_type",ax = axs[2])
1831
2811
  axs[2].axhline(y=0, color='grey', linestyle='--')
1832
2812
  axs[2].set_title('signal type expected returns distribution at different time lapses')
1833
-
2813
+
1834
2814
  if self.show_plot:
1835
2815
  plt.show()
1836
-
2816
+
1837
2817
  if self.save_path:
1838
2818
  result_plot_name = f'signals_strategy_distribution_{feature_name}.png'
1839
2819
  fig.savefig(self.save_path+result_plot_name)
@@ -1849,17 +2829,33 @@ class signal_analyser_object:
1849
2829
 
1850
2830
  if self.return_fig:
1851
2831
  return fig
1852
-
2832
+
1853
2833
  def create_backtest_signal(self,days_strategy, test_size, feature_name, high_exit = False, low_exit = False):
2834
+ """
2835
+ perform backtest signal analysis
2836
+
2837
+ Parameters
2838
+ ----------
2839
+ days_strategy (int): number of days the position is held in the backtest
2840
+ test_size (int): test data size
2841
+ feature_name (str): name of the feature to assess
2842
+ high_exit (float): high exit threshold return in the backtest
2843
+ low_exit (float): loss exit threshold return in the backtest
2844
+
2845
+ Returns
2846
+ -------
2847
+ fig (obj): plots
2848
+ messages (dict): dictionary with key metrics
2849
+ """
1854
2850
  asset_1 = 'Close'
1855
2851
  up_signal, low_signal= f'signal_up_{feature_name}', f'signal_low_{feature_name}'
1856
2852
  df1 = self.data.iloc[-test_size:,:].copy()
1857
2853
  df2 = df1.copy()
1858
2854
  df2['signal_type'] = np.where(
1859
- df2[up_signal] == 1,
1860
- 'up',
2855
+ df2[up_signal] == 1,
2856
+ 'up',
1861
2857
  np.where(
1862
- df2[low_signal] == 1,
2858
+ df2[low_signal] == 1,
1863
2859
  'down',
1864
2860
  None
1865
2861
  )
@@ -1870,33 +2866,33 @@ class signal_analyser_object:
1870
2866
  df2['span'] = (pd.to_datetime(df2['Date']) - pd.to_datetime(df2['lag_Date'])).dt.days - 1
1871
2867
  df2['break'] = np.where(df2['span'] > 3, 1, 0)
1872
2868
  df2['break'] = np.where(df2['span'].isna(), 1, df2['break'])
1873
-
2869
+
1874
2870
  df2['chain_id'] = df2.sort_values(['Date']).groupby(['break']).cumcount() + 1
1875
2871
  df2['chain_id'] = np.where(df2['break'] == 1, df2['chain_id'], np.nan )
1876
2872
  df2['chain_id'] = df2['chain_id'].fillna(method = 'ffill')
1877
-
2873
+
1878
2874
  df2['internal_rn'] = df2.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1879
2875
  df2['inv_internal_rn'] = df2.sort_values(['Date'],ascending = False).groupby(['chain_id']).cumcount() + 1
1880
-
2876
+
1881
2877
  df2['first_in_chain'] = np.where(df2['internal_rn'] == 1, True, False)
1882
2878
  df2['last_in_chain'] = np.where(df2['inv_internal_rn'] == 1, True, False)
1883
-
2879
+
1884
2880
  df2 = df2.drop(columns = ['break','span','lag_Date','inv_internal_rn']).sort_values('Date')
1885
-
2881
+
1886
2882
  df2 = df2[(df2.last_in_chain == True) & (df2.signal_type == 'down')][['last_in_chain']]
1887
2883
  dft = df1.merge(df2,how = 'left',left_index=True, right_index=True )
1888
-
2884
+
1889
2885
  dft['chain_id'] = dft.sort_values(['Date']).groupby(['last_in_chain']).cumcount() + 1
1890
2886
  dft['chain_id'] = np.where(dft['last_in_chain'] == True, dft['chain_id'], np.nan )
1891
2887
  dft['chain_id'] = dft['chain_id'].fillna(method = 'ffill')
1892
-
2888
+
1893
2889
  dft['internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).cumcount() + 1
1894
2890
  dft['flag'] = np.where(dft['internal_rn'] < days_strategy, 1,0)
1895
-
2891
+
1896
2892
  dft['lrets_bench'] = np.log(dft[asset_1]/dft[asset_1].shift(1))
1897
2893
  dft['bench_prod'] = dft['lrets_bench'].cumsum()
1898
2894
  dft['bench_prod_exp'] = np.exp(dft['bench_prod']) - 1
1899
-
2895
+
1900
2896
  if high_exit and low_exit:
1901
2897
  dft['open_strat'] = np.where(dft.last_in_chain == True, dft.Open, np.nan)
1902
2898
  dft['open_strat'] = dft['open_strat'].fillna(method = 'ffill')
@@ -1905,7 +2901,7 @@ class signal_analyser_object:
1905
2901
  dft['low_strat_ret'] = (dft['Low']/dft['open_strat']-1)*100
1906
2902
  dft['high_exit'] = np.where(((dft['high_strat_ret'] >= high_exit) | (dft['internal_rn'] == days_strategy)), 1, np.nan)
1907
2903
  dft['low_exit'] = np.where((dft['low_strat_ret'] <= low_exit), -1, np.nan)
1908
-
2904
+
1909
2905
  dft["exit_type"] = dft[["high_exit", "low_exit"]].max(axis=1)
1910
2906
  dft['exit_type'] = np.where(dft["exit_type"] == 1, 1, np.where(dft["exit_type"] == -1,-1,np.nan))
1911
2907
  dft['exit'] = np.where(dft['exit_type'].isnull(), np.nan, 1)
@@ -1916,27 +2912,27 @@ class signal_analyser_object:
1916
2912
  max_id = dft.chain_id.max()
1917
2913
  dft['max_internal_rn'] = dft.sort_values(['Date']).groupby(['chain_id']).internal_rn.transform('max')
1918
2914
  dft['exit'] = np.where((dft.chain_id == max_id) & (dft.max_internal_rn < days_strategy) & (dft.max_internal_rn == dft.internal_rn), 1, dft['exit'])
1919
-
2915
+
1920
2916
  dft['exit_step'] = np.where(dft.exit == 1, dft.internal_rn, np.nan)
1921
2917
  dft['exit_step'] = dft.sort_values(['Date']).groupby(['chain_id']).exit_step.transform('max')
1922
-
2918
+
1923
2919
  dft['flag'] = np.where(dft.internal_rn <= dft.exit_step, 1, 0)
1924
2920
  dft = dft.drop(columns = ['open_strat', 'high_strat_ret', 'low_strat_ret','exit_step', 'exit','exit_type','high_exit','low_exit', 'max_internal_rn'])
1925
-
2921
+
1926
2922
  dft['lrets_strat'] = np.log(dft[asset_1].shift(-1)/dft[asset_1]) * dft['flag']
1927
2923
  dft['lrets_strat'] = np.where(dft['lrets_strat'].isna(),-0.0,dft['lrets_strat'])
1928
2924
  dft['lrets_prod'] = dft['lrets_strat'].cumsum()
1929
2925
  dft['strat_prod_exp'] = np.exp(dft['lrets_prod']) - 1
1930
-
2926
+
1931
2927
  bench_rets = round(dft['bench_prod_exp'].values[-1]*100,1)
1932
2928
  strat_rets = round(dft['strat_prod_exp'].values[-1]*100,1)
1933
-
2929
+
1934
2930
  bench_sr = round(sharpe_ratio(dft.bench_prod_exp.dropna()),1)
1935
2931
  strat_sr = round(sharpe_ratio(dft.strat_prod_exp.dropna()),1)
1936
-
2932
+
1937
2933
  message1 = f'{bench_rets}%'
1938
2934
  message2 = f'{strat_rets}%'
1939
-
2935
+
1940
2936
  messages = {
1941
2937
  'benchmark return:':message1,
1942
2938
  'benchmark sharpe ratio:': bench_sr,
@@ -1947,7 +2943,7 @@ class signal_analyser_object:
1947
2943
  print('----------------------------')
1948
2944
  print(messages)
1949
2945
  print('----------------------------')
1950
-
2946
+
1951
2947
  fig = plt.figure(1)
1952
2948
  plt.plot(dft.bench_prod_exp.values, label = 'benchmark')
1953
2949
  plt.scatter(range(len(dft)),np.where(dft[low_signal] == 1,dft.bench_prod_exp.values,np.nan),color = 'red', label = 'signal')
@@ -1956,34 +2952,50 @@ class signal_analyser_object:
1956
2952
  plt.title('strategy and cumulative returns based on signal strategy')
1957
2953
  if self.show_plot:
1958
2954
  plt.plot()
1959
-
2955
+
1960
2956
  if self.save_path:
1961
2957
  result_json_name = f'signals_strategy_return_{feature_name}.json'
1962
2958
  result_plot_name = f'signals_strategy_return_{feature_name}.png'
1963
-
2959
+
1964
2960
  plt.savefig(self.save_path+result_plot_name)
1965
2961
  # pickle.dump(fig, open(self.save_path+result_plot_name, 'wb'))
1966
-
1967
- with open(self.save_path+result_json_name, "w") as outfile:
2962
+
2963
+ with open(self.save_path+result_json_name, "w") as outfile:
1968
2964
  json.dump(messages, outfile)
1969
-
2965
+
1970
2966
  if self.save_path and self.save_aws:
1971
2967
  # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
1972
2968
  # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_plot_name,input_path = self.save_path+result_plot_name)
1973
-
2969
+
1974
2970
  upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
1975
2971
  upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
1976
-
2972
+
1977
2973
  if not self.show_plot:
1978
2974
  plt.close()
1979
-
2975
+
1980
2976
  del df1,df2,dft
1981
-
2977
+
1982
2978
  if self.return_fig:
1983
2979
  return fig, messages
1984
-
2980
+
1985
2981
  def execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object, plot = False, backtest= False, exit_params = {}):
1986
-
2982
+ '''
2983
+ helper function that runs a full signal analysis, optionally followed by a backtest
2984
+
2985
+ Parameters:
2986
+ test_data_size (int): test data size
2987
+ feature_name (str): name of the feature to assess
2988
+ days_list (list): day horizons over which to assess the returns
2989
+ configuration (dict): parameters of the method to run
2990
+ object_stock (obj): object with data to assess
2991
+ signal_analyser_object (obj): signal_analyser object
2992
+ plot (boolean): if true, plot results
2993
+ backtest (int): if set, holding period in days used to run the backtest
2994
+ exit_params (dict): parameters of exit returns
2995
+
2996
+ Returns:
2997
+ mean_median_return (float): median return of the backtests
2998
+ '''
1987
2999
  method(**configuration)
1988
3000
  signal_assess = signal_analyser_object(object_stock.df,object_stock.stock_code,show_plot = plot)
1989
3001
  signal_assess.signal_analyser(test_size = test_data_size, feature_name = feature_name, days_list = days_list, threshold = 1)
@@ -1991,56 +3003,127 @@ def execute_signal_analyser(test_data_size, feature_name, days_list, configurati
1991
3003
  if backtest:
1992
3004
  print('-----------------------back test ---------------------------')
1993
3005
  signal_assess.create_backtest_signal(backtest, test_data_size, feature_name, **exit_params )
1994
-
3006
+
1995
3007
  return signal_assess.mean_median_return
1996
3008
 
1997
3009
  def iterate_signal_analyser(test_data_size,feature_name, days_list, arguments_to_test, method, object_stock, signal_analyser_object, plot = True):
1998
-
3010
+ '''
3011
+ helper function that iterates the signal analyser over a set of candidate configurations
3012
+
3013
+ Parameters:
3014
+ test_data_size (int): test data size
3015
+ feature_name (str): name of the feature to assess
3016
+ days_list (list): day horizons over which to assess the returns
3017
+ arguments_to_test (dict): configurations to test, keyed by candidate
3018
+ method (obj): feature generation method to run
3019
+ object_stock (obj): object with data to assess
3020
+ signal_analyser_object (obj): signal_analyser object
3021
+ plot (boolean): if true, plot results
3022
+
3023
+ Returns:
3024
+ best_result: key from arguments_to_test with the best result
3025
+ '''
1999
3026
  results = list()
2000
3027
  for key in arguments_to_test.keys():
2001
3028
  configuration = arguments_to_test.get(key)
2002
3029
  mean_median_return = execute_signal_analyser(test_data_size, feature_name, days_list, configuration, method, object_stock, signal_analyser_object)
2003
3030
  results.append(mean_median_return)
2004
-
3031
+
2005
3032
  df_result = pd.DataFrame({'keys':arguments_to_test.keys(),'results':results})
2006
3033
  if plot:
2007
3034
  plt.plot(df_result['keys'], df_result['results'])
2008
3035
  plt.scatter(df_result['keys'], df_result['results'])
2009
3036
  plt.title('simulation between configurations')
2010
- plt.ylabel('median expected return')
3037
+ plt.ylabel('median expected return')
2011
3038
  plt.show()
2012
-
3039
+
2013
3040
  best_result = df_result.sort_values('results',ascending = False)['keys'].values[0]
2014
3041
  return best_result
2015
-
3042
+
2016
3043
  class analyse_index(stock_eda_panel):
2017
- def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
3044
+ """
3045
+ class that analyses an asset against a market index, estimating betas and correlations
3046
+
3047
+ Attributes
3048
+ ----------
3051
+ index : str
3052
+ name of the index
3053
+ asset : str
3054
+ name of the asset
3055
+ n_obs : int
3056
+ number of rows to extract
3057
+ lag : int
3058
+ lag to apply
3059
+ data_window : str
3060
+ 5y 10y 15y
3061
+ show_plot : bool
3062
+ If True, show plots
3063
+ save_path : str
3064
+ local path for saving e.g r'C:/path/to/the/file/'
3065
+ save_aws : str
3066
+ remote key in s3 bucket path e.g. 'path/to/file/'
3067
+ aws_credentials : dict
3068
+ dict with the aws credentials
3069
+ merger_df : pd.DataFrame
3070
+ dataframe with the index and asset data
3071
+ states_result : dict
3072
+ betas and correlation score results
3073
+
3074
+ Methods
3075
+ -------
3076
+ process_data():
3077
+ using stock_eda_panel, get data and merge data
3078
+ plot_betas(sample_size=int, offset=int, subsample_ts=int):
3079
+ display beta analysis plot
3080
+ get_betas(subsample_ts=int):
3081
+ get general beta and last sample beta, correlation score is included too
3082
+ """
2018
3083
 
3084
+ def __init__(self, index, asset, n_obs, lag, data_window = '5y', show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2019
3085
  """
2020
- data: pandas df
2021
- index: str name of the index
2022
- asset: str name of the asset
2023
- n_obs: int
2024
- lag: int
2025
- data_window: str eg 5y 10y 15y
2026
- show_plot: bool
2027
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
2028
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2029
- aws_credentials: dict
3086
+ Initialize object
3087
+
3088
+ Parameters
3089
+ ----------
3090
+ index (str): name of the index
3091
+ asset (str): name of the asset
3092
+ n_obs (int): number of rows to extract
3093
+ lag (int): lag to apply
3094
+ data_window (str): 5y 10y 15y
3095
+ show_plot (bool): If True, show plots
3096
+ save_path (str): local path for saving e.g r'C:/path/to/the/file/'
3097
+ save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
3098
+ aws_credentials (dict): dict with the aws credentials
3099
+
3100
+ Returns
3101
+ -------
3102
+ None
2030
3103
  """
2031
-
3104
+
2032
3105
  self.index = index
2033
3106
  self.asset = asset
2034
3107
  self.n_obs = n_obs
2035
3108
  self.data_window = data_window
2036
3109
  self.lag = lag
2037
-
3110
+
2038
3111
  self.show_plot = show_plot
2039
3112
  self.save_path = save_path
2040
3113
  self.save_aws = save_aws
2041
-
3114
+
2042
3115
  def process_data(self):
2043
-
3116
+ """
3117
+ using stock_eda_panel, get data and merge data
3118
+
3119
+ Parameters
3120
+ ----------
3121
+ None
3122
+
3123
+ Returns
3124
+ -------
3125
+ None
3126
+ """
2044
3127
  index = stock_eda_panel(self.index, self.n_obs, self.data_window)
2045
3128
  index.get_data()
2046
3129
  index.df['shift'] = index.df.Close.shift(self.lag)
@@ -2050,39 +3133,51 @@ class analyse_index(stock_eda_panel):
2050
3133
  asset.get_data()
2051
3134
  asset.df['shift'] = asset.df.Close.shift(self.lag)
2052
3135
  asset.df['asset_return'] = asset.df.Close/asset.df['shift'] - 1
2053
-
3136
+
2054
3137
  df1 = index.df[['Date','index_return']]
2055
3138
  df2 = asset.df[['Date','asset_return','Close']]
2056
3139
  merger = df1.merge(df2, on = 'Date', how = 'inner')
2057
3140
  merger.dropna(inplace = True)
2058
3141
  self.merger_df = merger
2059
-
3142
+
2060
3143
  def plot_betas(self,sample_size, offset, subsample_ts =False):
2061
-
3144
+ """
3145
+ display beta analysis plot
3146
+
3147
+ Parameters
3148
+ ----------
3149
+ sample_size (int): number of days or window size to calculate beta
3150
+ offset (int): step between consecutive windows
3151
+ subsample_ts (int): subsample size of data
3152
+
3153
+ Returns
3154
+ -------
3155
+ None
3156
+ """
2062
3157
  ### extracting data
2063
3158
 
2064
3159
  self.process_data()
2065
-
3160
+
2066
3161
  ### ploting analysis
2067
3162
  figure, ax = plt.subplot_mosaic(
2068
3163
  [["scatter_total", "scatter_sample",'ts','ts']],
2069
3164
  layout="constrained",
2070
3165
  figsize=(18, 5)
2071
3166
  )
2072
-
3167
+
2073
3168
  ax['scatter_total'].scatter(self.merger_df.asset_return, self.merger_df.index_return)
2074
3169
  b, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2075
3170
  ax['scatter_total'].plot(self.merger_df.asset_return, b*self.merger_df.asset_return+a, color='red')
2076
3171
 
2077
3172
  ax['ts'].plot(self.merger_df.Date, self.merger_df.Close, color = 'grey', alpha = 0.3)
2078
-
3173
+
2079
3174
  if subsample_ts:
2080
3175
  self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2081
-
3176
+
2082
3177
  for i in range(0,len(self.merger_df)-sample_size,offset):
2083
3178
 
2084
3179
  merger_ = self.merger_df.sort_values('Date', ascending = False).iloc[i:i+sample_size,:]
2085
- x = merger_.index_return
3180
+ x = merger_.index_return
2086
3181
  y = merger_.asset_return
2087
3182
  b, a = np.polyfit(x,y, 1)
2088
3183
 
@@ -2098,10 +3193,10 @@ class analyse_index(stock_eda_panel):
2098
3193
 
2099
3194
  scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
2100
3195
  scalarmappaple.set_array(x)
2101
-
3196
+
2102
3197
  plt.title(f'{self.asset} using index: {self.index}')
2103
3198
  plt.colorbar(scalarmappaple)
2104
-
3199
+
2105
3200
  if self.show_plot:
2106
3201
  plt.show()
2107
3202
  if self.save_path:
@@ -2113,34 +3208,95 @@ class analyse_index(stock_eda_panel):
2113
3208
  upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_plot_name, input_path = self.save_path + result_plot_name, aws_credentials = self.aws_credentials)
2114
3209
  if not self.show_plot:
2115
3210
  plt.close()
2116
-
3211
+
2117
3212
  def get_betas(self,subsample_ts=False):
2118
-
3213
+ """
3214
+ get general beta and last sample beta, correlation score is included too
3215
+
3216
+ Parameters
3217
+ ----------
3218
+ subsample_ts (int): subsample size of data
3219
+
3220
+ Returns
3221
+ -------
3222
+ None
3223
+ """
2119
3224
  self.process_data()
2120
3225
  general_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2121
3226
  general_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
2122
-
3227
+
2123
3228
  self.process_data()
2124
3229
  if subsample_ts:
2125
3230
  self.merger_df = self.merger_df.iloc[-subsample_ts:,:].dropna()
2126
3231
  sample_beta, a = np.polyfit(self.merger_df.asset_return, self.merger_df.index_return, 1)
2127
3232
  sample_r = stats.mstats.pearsonr(self.merger_df.asset_return, self.merger_df.index_return)[0]
2128
-
3233
+
2129
3234
  result = {
2130
3235
  'general_beta':general_beta,
2131
3236
  'general_r':general_r,
2132
3237
  'sample_beta':sample_beta,
2133
3238
  'sample_r':sample_r
2134
3239
  }
2135
-
3240
+
2136
3241
  self.states_result = result
2137
-
3242
+
2138
3243
  class evaluate_markets(analyse_index):
3244
+ """
3245
+ class that evaluates multiple indexes for a given asset
3246
+
3247
+ Attributes
3248
+ ----------
3249
+ stock_code : str
3250
+ asset to assess
3251
+ indexes : list
3252
+ list of indexes
3253
+ best_result : dict
3254
+ best result beta and correlation
3255
+
3256
+ Methods
3257
+ -------
3258
+ process_data():
3259
+ using stock_eda_panel, get data and merge data
3260
+ plot_betas(sample_size=int, offset=int, subsample_ts=int):
3261
+ display beta analysis plot
3262
+ get_betas(subsample_ts=int)
3263
+ get general beta and last sample beta, correlation score is included too
3264
+ evaluate_best_market_fit(sample_size=int, offset=int,lag=int, n_obs=int, verbose=boolean, plot_best=boolean):
3265
+ iterate every index in the index list and get results
3266
+ """
3267
+
2139
3268
  def __init__(self, stock_code, indexes):
3269
+ """
3270
+ Initialize object
3271
+
3272
+ Parameters
3273
+ ----------
3274
+ stock_code (str): asset to assess
3275
+ indexes (list): list of indexes
3276
+
3277
+ Returns
3278
+ -------
3279
+ None
3280
+ """
2140
3281
  self.stock_code = stock_code
2141
3282
  self.indexes = indexes
2142
3283
  def evaluate_best_market_fit(self,sample_size, offset,lag= 3, n_obs = 3500, verbose = False, plot_best = False):
2143
-
3284
+ """
3285
+ iterate every index in the index list and get results
3286
+
3287
+ Parameters
3288
+ ----------
3289
+ sample_size (int): sample size to get betas
3290
+ offset (int): step between consecutive beta windows
3291
+ lag (int): number of lags of the returns
3292
+ n_obs (int): number of observations of the data extraction
3293
+ verbose (boolean): if true, print results
3294
+ plot_best (boolean): if true, display plot of the best result
3295
+
3296
+ Returns
3297
+ -------
3298
+ None
3299
+ """
2144
3300
  results_dicts = dict()
2145
3301
  for index in self.indexes:
2146
3302
  betex = analyse_index(index = index,asset = self.stock_code,n_obs = n_obs, lag = lag)
@@ -2150,34 +3306,40 @@ class evaluate_markets(analyse_index):
2150
3306
  pd_result['gen_r2'] = pd_result.general_r ** 2
2151
3307
  pd_result['sampl_r2'] = pd_result.sample_r ** 2
2152
3308
  self.stat_results = pd_result
2153
-
3309
+
2154
3310
  best_result = pd_result.sort_values('gen_r2',ascending = False).head(2).sort_values('sampl_r2',ascending = False).head(1)
2155
3311
  best_fit_index = best_result.index.values[0]
2156
-
3312
+
2157
3313
  self.stat_results = self.stat_results.drop(columns = ['gen_r2','sampl_r2'])
2158
-
3314
+
2159
3315
  if verbose:
2160
3316
  print(best_result)
2161
3317
  if plot_best:
2162
3318
  betex = analyse_index(index = best_fit_index,asset = self.stock_code, n_obs = n_obs, lag = lag)
2163
3319
  betex.plot_betas(sample_size = sample_size, offset = offset, subsample_ts = False)
2164
-
3320
+
2165
3321
  self.best_result = best_result
2166
-
3322
+
2167
3323
  def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
2168
- """
2169
- data_market: pandas df
2170
- ticket_name: str name of the asset
2171
- show_plot: bool
2172
- save_path: str local path for saving e.g r'C:/path/to/the/file/'
2173
- save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
2174
- aws_credentials: dict
2175
- """
3324
+ '''
3325
+ select relevant beta result data of a given asset
3326
+
3327
+ Parameters:
3328
+ data_market (pd.DataFrame): dataframe of the market results
3329
+ ticket_name (str): name of the asset
3330
+ show_plot (bool): If true, print results
3331
+ save_path (str): local path for saving e.g r'C:/path/to/the/file/'
3332
+ save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
3333
+ aws_credentials (dict): dict of the aws credentials
3334
+
3335
+ Returns:
3336
+ selection (pd.DataFrame): dataframe of the most relevant beta
3337
+ '''
2176
3338
  all_betas = data_market[data_market.asset == ticket_name].sort_values('general_r', ascending = False)
2177
3339
  all_betas['gen_r2'] = all_betas.general_r ** 2
2178
3340
  all_betas['sampl_r2'] = all_betas.sample_r ** 2
2179
3341
  selection = all_betas.sort_values('gen_r2',ascending =False).head(2).sort_values('sampl_r2',ascending =False).head(1).drop(columns = ['gen_r2','sampl_r2'])
2180
-
3342
+
2181
3343
  if show_plot:
2182
3344
  print(selection)
2183
3345
  if save_path: