virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,18 @@ from pykalman import KalmanFilter
 from .aws_utils import upload_file_to_aws
 
 def calculate_cointegration(series_1, series_2):
+    '''
+    calculate the cointegration score of two time series.
+
+    Parameters:
+        series_1 (pd.Series): pandas series of the asset returns
+        series_2 (pd.Series): pandas series of the asset returns
+
+    Returns:
+        coint_flag (int): cointegration flag, 1 or 0; 1 if the p-value is below 0.05 and coint_t is below the critical value
+        hedge_value (float): hedge value
+    '''
+
     coint_flag = 0
     coint_res = coint(series_1, series_2)
     coint_t = coint_res[0]
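The hunk ends before the flag and hedge-ratio logic, but the statsmodels pieces it names are enough to sketch the whole routine. A minimal, self-contained version of such a check (an illustration, not the package's verbatim body; the no-intercept OLS hedge ratio and the 5% critical value are assumptions):

    import pandas as pd
    import statsmodels.api as sm
    from statsmodels.tsa.stattools import coint

    def cointegration_check(series_1: pd.Series, series_2: pd.Series):
        # coint returns the t-statistic, the p-value and the 1%/5%/10% critical values
        coint_t, p_value, critical_values = coint(series_1, series_2)
        coint_flag = 1 if (p_value < 0.05) and (coint_t < critical_values[1]) else 0
        # hedge ratio via a no-intercept OLS of series_1 on series_2 (assumed)
        hedge_value = sm.OLS(series_1, series_2).fit().params.iloc[0]
        return coint_flag, hedge_value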
@@ -44,8 +56,43 @@ def calculate_cointegration(series_1, series_2):
     return coint_flag, hedge_value
 
 class pair_finder():
+    """
+    class that assesses two assets to evaluate whether they are cointegrated
+
+    Attributes
+    ----------
+    df : pd.DataFrame
+        dataframe of the merged assets with the spread score
+    asset_1 : str
+        asset to assess
+    asset_2 : str
+        secondary asset to assess
+
+    Methods
+    -------
+    produce_zscore(window=int, z_threshold=float, verbose=boolean):
+        produce the z-score from the spread and derive signals using window functions
+    plot_scores():
+        display plots of the time series, the signals, and other views of the pair-signal strategy
+    evaluate_signal(days_list=list(), test_size=int, signal_position=int, threshold=float, verbose=boolean, plot=boolean):
+        evaluate the signal strategy using future returns
+    create_backtest_signal(days_strategy=int, test_size=int):
+        create a backtest of the strategy and get some plot analysis
+    """
     def __init__(self, raw_data , asset_1 ,asset_2):
-
+        """
+        Initialize the object, selecting just the two assets and getting the spread between them
+
+        Parameters
+        ----------
+        raw_data (pd.DataFrame): dataframe of all assets
+        asset_1 (str): asset to assess
+        asset_2 (str): secondary asset to assess
+
+        Returns
+        -------
+        None
+        """
         df = raw_data[[asset_1, asset_2]]
         coint_flag, hedge_ratio = calculate_cointegration(df[asset_1], df[asset_2])
         spread = df[asset_1] - (hedge_ratio * df[asset_2])
@@ -55,6 +102,19 @@ class pair_finder():
         self.asset_2 = asset_2
 
     def produce_zscore(self, window, z_threshold, verbose = False):
+        """
+        produce the z-score from the spread and derive signals using window functions
+
+        Parameters
+        ----------
+        window (int): window size
+        z_threshold (float): alpha and z threshold for the normalized feature
+        verbose (boolean): if True, print the analysis
+
+        Returns
+        -------
+        None
+        """
         self.z_threshold = z_threshold
         spread_series = pd.Series(self.df.spread)
         mean = spread_series.rolling(center = False, window = window).mean()
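The hunk cuts off after the rolling mean; the usual continuation of a rolling z-score, for orientation (a sketch only, with the final column name assumed rather than taken from the package):

    std = spread_series.rolling(center=False, window=window).std()
    zscore = (spread_series - mean) / std      # normalized spread
    self.df['zscore'] = zscore.values          # 'zscore' is a hypothetical column name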
@@ -74,7 +134,17 @@ class pair_finder():
         self.df['low_pair_signal'] = low_signal
 
     def plot_scores(self):
-
+        """
+        display plots of the time series, the signals, and other views of the pair-signal strategy
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
         plt.axhline(y=0.0, color='grey', linestyle='--')
         plt.figure(1, figsize = (10, 4))
         plt.plot(self.df.spread.values)
@@ -104,7 +174,22 @@
         fig.show()
 
     def evaluate_signal(self, days_list,test_size, signal_position = False,threshold = 0.05,verbose = False, plot = False):
-
+        """
+        evaluate the signal strategy using future returns
+
+        Parameters
+        ----------
+        days_list (list): list of future-return horizons in days
+        test_size (int): test data size; the remaining data is taken as training data
+        signal_position (int): position of the signal at which to open a position
+        threshold (float): alpha or z threshold of the normalized feature
+        verbose (boolean): if True, print results
+        plot (boolean): if True, display plots
+
+        Returns
+        -------
+        None
+        """
         df = self.df.sort_values('Date').iloc[0:-test_size,:].copy()
         returns_list = list()
 
@@ -206,6 +291,18 @@ class pair_finder():
         del df
 
     def create_backtest_signal(self,days_strategy, test_size):
+        """
+        create a backtest of the strategy and get some plot analysis
+
+        Parameters
+        ----------
+        days_strategy (int): future-return horizon in days for the strategy
+        test_size (int): test data size; the remaining data is taken as training data
+
+        Returns
+        -------
+        None
+        """
         asset_1 = self.asset_1
         df1 = self.df.iloc[-test_size:,:].copy()
         df2 = df1.copy()
@@ -273,7 +370,18 @@ class pair_finder():
         del df1,df2,dft
 
 def produce_big_dataset(data_frames, stocks_codes_, feature_list, limit = 500):
-
+    '''
+    combine multiple assets, taking a common schema
+
+    Parameters:
+        data_frames (dict): dictionary of asset dataframes
+        stocks_codes_ (list): assets to select
+        feature_list (list): feature list
+        limit (int): number of observations per asset
+
+    Returns:
+        dataframe (pd.DataFrame): base dataframe with the extra data
+    '''
     feature_list_ = list()
     columns_vector = list(data_frames[stocks_codes_[-1]].columns )
     for feat in feature_list:
@@ -301,7 +409,19 @@ def produce_big_dataset(data_frames, stocks_codes_, feature_list, limit = 500):
     return dataframe
 
 def ranking(data, weighted_features, top = 5, window = 5):
-
+    '''
+    Create a ranking of assets given the current signals and a weighted average importance
+
+    Parameters:
+        data (pd.DataFrame): base data
+        weighted_features (dict): configuration dictionary
+        top (int): number of top assets to return
+        window (int): number of days to assess
+
+    Returns:
+        top_up (list): top upper-signal assets
+        top_low (list): top lower-signal assets
+    '''
     features = weighted_features.keys()
     up_columns = ['signal_up_' + x for x in features]
     low_columns = ['signal_low_' + x for x in features]
@@ -333,19 +453,80 @@ def ranking(data, weighted_features, top = 5, window = 5):
     top_up = list(df.sort_values('up_signas', ascending = False).index)[:top]
     top_low = list(df.sort_values('low_signas', ascending = False).index)[:top]
 
-    return top_up, top_low
+    return top_up, top_low, df
+
+def ranking_first(data, weighted_features, top = 5, window = 5):
+    '''
+    Create a ranking of assets given the current signals and a weighted average importance
+
+    Parameters:
+        data (pd.DataFrame): base data
+        weighted_features (dict): configuration dictionary
+        top (int): number of top assets to return
+        window (int): number of days to assess
+
+    Returns:
+        top_up (list): top upper-signal assets
+        top_low (list): top lower-signal assets
+    '''
+    features = weighted_features.keys()
+    up_columns = ['signal_up_' + x for x in features]
+    low_columns = ['signal_low_' + x for x in features]
+
+    def compute_score(df,col,window):
+        score = 0
+        for i in range(window):
+            row = df.iloc[i]
+            if (row[col] == 1) and (i == 0):
+                score += 1000
+            elif (row[col] == 1) and (i == 1):
+                score -= 200
+            elif (row[col] == 1) and (i >= 2):
+                score -= 50
+        return score
+
+    ticket_list= list(data.Ticket.unique())
+    result = dict()
+    for ticket in ticket_list:
+        result[ticket] = dict()
+        df = data[data.Ticket == ticket].sort_values('Date').iloc[-window:]
+
+        for col in low_columns:
+            df = df.sort_values('Date', ascending = False)
+            score = compute_score(df,col,window)
+            result[ticket][col] = score
+        for col in up_columns:
+            score = 0
+            df = df.sort_values('Date', ascending = False)
+            score = compute_score(df,col,window)
+            result[ticket][col] = score
+
+    df = pd.DataFrame(result).T
+    df['up_signas'] = df[up_columns].sum(axis=1)
+    df['low_signas'] = df[low_columns].sum(axis=1)
+
+    top_up = list(df.sort_values('up_signas', ascending = False).index)[:top]
+    top_low = list(df.sort_values('low_signas', ascending = False).index)[:top]
+    return top_up, top_low, df
 
 def produce_dashboard(data, columns , ticket_list, show_plot = True, nrows = 150,save_name = False, save_path = False, save_aws = False, aws_credential = False):
-    """
-    data: pandas df
-    columns: list
-    ticket_list: list asset list
-    nrows: int
-    show_plot: bool
-    save_path: str local path for saving e.g r'C:/path/to/the/file/'
-    save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-    aws_credentials: dict
-    """
+    '''
+    produce a dashboard using the signals and a list of assets
+
+    Parameters:
+        data (pd.DataFrame): base data
+        columns (list): list of features or signals
+        ticket_list (list): list of assets
+        show_plot (boolean): if True, display the plot
+        nrows (int): number of days back to display
+        save_name (str): name of the resulting dashboard file
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credential (dict): aws credentials
+
+    Returns:
+        None
+    '''
     top = len(ticket_list)
     columns = ['history'] + columns
     subtitles = list()
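For intuition on the ranking_first scoring added in the hunk above: compute_score walks the window from the most recent day backwards, rewarding a signal that fired today (+1000) and penalizing stale fires (-200 the day before, -50 earlier), so fresh signals dominate the ranking. A standalone replica of that nested helper (toy data; as in the function, the frame is assumed sorted newest row first):

    import pandas as pd

    def compute_score(df, col, window):            # replica of the nested helper
        score = 0
        for i in range(window):
            row = df.iloc[i]
            if (row[col] == 1) and (i == 0):
                score += 1000
            elif (row[col] == 1) and (i == 1):
                score -= 200
            elif (row[col] == 1) and (i >= 2):
                score -= 50
        return score

    toy = pd.DataFrame({'signal_up_x': [1, 0, 1]})  # row 0 = today
    print(compute_score(toy, 'signal_up_x', 3))     # 950: fresh fire minus one stale fire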
@@ -393,9 +574,66 @@ def produce_dashboard(data, columns , ticket_list, show_plot = True, nrows = 150
         # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'multi_dashboards/'+save_name+'.json',input_path = save_path+save_name+'.json')
         upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + save_name + '.json', input_path = save_path + save_name + '.json', aws_credentials = aws_credential)
 
+def produce_edges_dashboard(dataframe, ticket_list, save_name, show_plot = False, save_path = False, save_aws = False, aws_credentials = False):
+    '''
+    produce a dashboard of closing prices and edge probabilities for a list of assets
+
+    Parameters:
+        dataframe (pd.DataFrame): base data
+        ticket_list (list): list of assets
+        save_name (str): name of the resulting dashboard file
+        show_plot (boolean): if True, display the plot
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credentials (dict): aws credentials
+
+    Returns:
+        None
+    '''
+    n_assets = len(ticket_list)
+
+    result_json_name = save_name
+    cols_length = 4
+    rows_length = math.ceil(n_assets/2)
+
+    subtitles = list()
+    for x in ticket_list:
+        subtitles.append(x)
+        subtitles.append(x + ' signal')
+
+    fig = make_subplots(rows=rows_length, cols=cols_length,vertical_spacing = 0.01, horizontal_spacing = 0.03, shared_xaxes=True, subplot_titles = subtitles)
+
+    for i,ticket in enumerate(ticket_list):
+        j = i%2*2 +1
+        i = i+1
+        i_r = math.ceil(i/2)
+
+        show_legend = True if i == 1 else False
+
+        df = dataframe[dataframe.asset == ticket]
+        fig.add_trace(go.Scatter(x=df['Date'], y=df['Close'],legendgroup="Close",showlegend = show_legend , mode='lines',name = 'Close', marker_color = 'blue'),col = j, row = i_r)
+        fig.add_trace(go.Scatter(x=df['Date'], y=df['proba_target_up'],legendgroup="proba",showlegend = show_legend , mode='lines',name = 'proba_target_up', marker_color = 'orange'),col = j+1, row = i_r)
+    fig.update_layout(height=rows_length*300, width=1500, title_text = f'dashboard top {n_assets} tickets')
+
+    if save_path:
+        fig.write_json(save_path+result_json_name)
+    if show_plot:
+        fig.show()
+    if save_path and save_aws:
+        upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + result_json_name, input_path = save_path + result_json_name, aws_credentials = aws_credentials)
 
 def rank_by_return(data, lag_days, top_n = 5):
-
+    '''
+    produce a ranking by returns
+
+    Parameters:
+        data (pd.DataFrame): base data
+        lag_days (int): number of days to consider
+        top_n (int): number of top assets to return
+
+    Returns:
+        result (list): the top n assets
+    '''
     data = data.sort_values(['Ticket','Date'], ascending=[False,False]).reset_index(drop = True)
     data['first'] = data.sort_values(['Date'], ascending=[False]).groupby(['Ticket']).cumcount() + 1
     data = data[data['first'] <= lag_days]
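A note on the grid arithmetic in produce_edges_dashboard above: each asset occupies a pair of adjacent columns (price panel, then signal panel) in a 4-column grid, two assets per row. A quick check of the index math (illustrative):

    import math
    for i in range(4):                       # 0-based asset index
        col_price = i % 2 * 2 + 1            # 1, 3, 1, 3  -> the 'j' column
        row = math.ceil((i + 1) / 2)         # 1, 1, 2, 2  -> the 'i_r' row
        print(i, col_price, row)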
@@ -416,18 +654,19 @@ def rank_by_return(data, lag_days, top_n = 5):
     return result
 
 def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_available: object = False, data_window:str = '5y') -> object:
-    """
-    this functions runs the stock_eda_panel
-    it is shared between train model and predictions
-    arguments:
-    hmm_available: if the hmm is available, in prediction is required
-    ticker_name: name of the asset
-    ticket_settings: dictionary with all the parameters to compute features
-    n_days: to set an arbitrary data size
-
-    returns: stock eda panel
-    """
-
+    '''
+    this function runs the stock_eda_panel. It is shared between model training and predictions
+
+    Parameters:
+        ticker_name (str): name of the asset
+        ticket_settings (dict): dictionary with all the parameters to compute features
+        n_days (int): to set an arbitrary data size
+        hmm_available (obj): the available hmm model; required in prediction
+        data_window (str): window for the data extraction
+
+    Returns:
+        object_stock (obj): the resulting stock_eda_panel object
+    '''
     object_stock = stock_eda_panel(ticker_name , n_days, data_window)
     object_stock.get_data()
 
@@ -461,7 +700,11 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
         'stochastic_feature':'stochastic_feature',
         'william_feature':'william_feature',
         'vortex_feature':'vortex_feature',
-        'pair_index_feature':'pair_index_feature' # this has a diff structure!
+        'pair_index_feature':'pair_index_feature', # this has a diff structure!
+        'min_distance_pricefeature':'minmax_pricefeature',
+        'min_relprice_pricefeature':'minmax_pricefeature',
+        'max_distance_pricefeature':'minmax_pricefeature',
+        'max_relprice_pricefeature':'minmax_pricefeature',
     }
     exceptions = ['pair_feature','pair_index_feature']
     ### standard feature
@@ -506,6 +749,7 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
     if len(discrete_features) > 0:
         for feature_name in discrete_features:
             object_stock.produce_order_features(feature_name)
+            object_stock.get_order_feature_nosignal(feature_name)
 
     if hmm_available:
         object_stock.cluster_hmm_analysis( n_clusters = None,
@@ -517,13 +761,24 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
         object_stock.cluster_hmm_analysis( n_clusters = ticket_settings['settings']['hmm']['n_clusters'],
                                            features_hmm = ticket_settings['settings']['hmm']['features_hmm'],
                                            test_data_size = ticket_settings['settings']['hmm']['test_data_size'],
-                                           seed = ticket_settings['settings']['hmm']['seed'])
+                                           seed = ticket_settings['settings']['hmm']['seed'],
+                                           corr_threshold = ticket_settings['settings']['hmm'].get('corr_threshold',0.75),
+                                           lag_returns_state = ticket_settings['settings']['hmm'].get('lag_returns_state',7),
+                                           )
 
     return object_stock
 
 trends = {'adjusted' : 0.001, 'smooth' : 0.0001}
 
 def apply_KF(self, trends):
+    '''
+    create a Kalman filter feature and attach it to the stock_eda_panel object
+
+    Parameters:
+        trends (dict): configuration of the Kalman filter
+    Returns:
+        None
+    '''
     for ttrend in trends:
         tcov = trends.get(ttrend)
         kf = KalmanFilter(transition_matrices = [1],
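The hunk ends mid-constructor; for orientation, here is a minimal local-level smoother in the spirit of apply_KF using pykalman's public API (the price series and the remaining constructor arguments are assumptions; the transition covariance is the 'adjusted' value from the trends dict above):

    import numpy as np
    from pykalman import KalmanFilter

    prices = 100 + np.cumsum(np.random.randn(250))    # stand-in closing prices
    kf = KalmanFilter(transition_matrices=[1],        # random-walk state
                      observation_matrices=[1],
                      initial_state_mean=prices[0],
                      initial_state_covariance=1,
                      observation_covariance=1,
                      transition_covariance=0.001)    # trends['adjusted']
    state_means, _ = kf.filter(prices)                # one smoothed trend per configured key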
@@ -537,11 +792,24 @@ def apply_KF(self, trends):
 
 stock_eda_panel.apply_KF = apply_KF
 
-def call_ml_objects(stock_code, client, call_models = False):
-
+def call_ml_objects(stock_code, client, call_models = False, clean_name=False):
+    '''
+    call artifacts from mlflow
+
+    Parameters:
+        stock_code (str): asset name
+        client (obj): mlflow client
+        call_models (boolean): if True, call the ml model artifacts
+        clean_name (boolean): if True, sanitize the asset name before the registry lookup
+    Returns:
+        objects (dict): dictionary that contains the ml artifacts, data, configs and models
+    '''
     objects = dict()
 
-    registered_model_name = f'{stock_code}_models'
+    if clean_name:
+        renamed_stock_code = stock_code.replace("^","__",).replace(".","__").replace("=","__").replace("-","__")
+        registered_model_name = f'{renamed_stock_code}_models'
+    else:
+        registered_model_name = f'{stock_code}_models'
     latest_version_info = client.get_latest_versions(registered_model_name, stages=["Production"])
     latest_production_version = latest_version_info[0].version
     run_id_prod_model = latest_version_info[0].run_id
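The new clean_name branch swaps registry-hostile ticker characters for double underscores before the model lookup; a quick illustration of the resulting registered names:

    for code in ['^GSPC', 'BTC-USD', 'BRK.B']:
        renamed = code.replace("^","__").replace(".","__").replace("=","__").replace("-","__")
        print(f'{renamed}_models')   # __GSPC_models, BTC__USD_models, BRK__B_models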
@@ -552,18 +820,27 @@ def call_ml_objects(stock_code, client, call_models = False):
     )
 
     ## calling models
-
+    if clean_name:
+        path_hmm = f"runs:/{run_id_prod_model}/{renamed_stock_code}-hmm-model"
+    else:
+        path_hmm = f"runs:/{run_id_prod_model}/{stock_code}-hmm-model"
+
     hmm_model = mlflow.pyfunc.load_model(
-        f"runs:/{run_id_prod_model}/{stock_code}-hmm-model",
-        suppress_warnings = True
+        path_hmm,
+        suppress_warnings = True
     )
     objects['called_hmm_models'] = hmm_model
 
     if call_models:
 
+        if clean_name:
+            path_model = f"runs:/{run_id_prod_model}/{renamed_stock_code}-forecasting-model"
+        else:
+            path_model = f"runs:/{run_id_prod_model}/{stock_code}-forecasting-model"
+
         forecasting_model = mlflow.pyfunc.load_model(
-            f"runs:/{run_id_prod_model}/{stock_code}-forecasting-model",
-            suppress_warnings = True
+            path_model,
+            suppress_warnings = True
         )
         objects['called_forecasting_model'] = forecasting_model
 
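For orientation, mlflow.pyfunc.load_model accepts exactly this kind of runs:/ URI; a standalone call looks like the following (the run id and artifact name are placeholders):

    import mlflow

    model = mlflow.pyfunc.load_model("runs:/0a1b2c3d4e5f/AAPL-hmm-model",
                                     suppress_warnings=True)
    # the returned pyfunc wrapper exposes model.predict(...)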
@@ -584,17 +861,57 @@
     return objects
 
 class produce_plotly_plots:
+    """
+    class that helps to produce the different dashboards
+
+    Attributes
+    ----------
+    ticket_name : str
+        asset name
+    data_frame : pd.DataFrame
+        asset data
+    settings : dict
+        asset configurations
+    show_plot : boolean
+        if True, display plots
+    save_path : str
+        local path for saving e.g r'C:/path/to/the/file/'
+    save_aws : str
+        remote key in s3 bucket path e.g. 'path/to/file/'
+    aws_credentials : dict
+        aws credentials
+    return_figs : boolean
+        if True, methods return the figure objects
+
+    Methods
+    -------
+    plot_asset_signals(feature_list=list, spread_column=list, date_intervals=list):
+        display signals and hmm states over closing prices and feature time series
+    explore_states_ts():
+        display the scaled time series of every hmm state
+    plot_hmm_analysis(settings=dict, t_matrix=txt, model=obj):
+        display plots that analyse the hmm states
+    produce_forecasting_plot(predictions=pd.DataFrame):
+        display forecasting plots
+    """
     def __init__(self,ticket_name, data_frame,settings, save_path = False, save_aws = False, show_plot= True, aws_credentials = False, return_figs = False):
         """
-        ticket_name: str asset name
-        data_frame: pandas df
-        settings: dict
-        show_plot: bool
-        save_path: str local path for saving e.g r'C:/path/to/the/file/'
-        save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-        aws_credentials: dict
+        Initialize the object
+
+        Parameters
+        ----------
+        ticket_name (str): asset name
+        data_frame (pd.DataFrame): asset data
+        settings (dict): asset configurations
+        show_plot (boolean): if True, display plots
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credentials (dict): aws credentials
+        return_figs (boolean): if True, methods return the figure objects
+
+        Returns
+        -------
+        None
         """
-
         self.ticket_name = ticket_name
         self.data_frame = data_frame
         self.settings = settings
@@ -604,13 +921,44 @@ class produce_plotly_plots:
         self.aws_credentials = aws_credentials
         self.return_figs = return_figs
 
-    def plot_asset_signals(self, feature_list,spread_column, date_intervals = False):
-
+    def plot_asset_signals(self, feature_list,spread_column, date_intervals = False, look_back = 800):
+        """
+        display signals and hmm states over closing prices and feature time series
+
+        Parameters
+        ----------
+        feature_list (list): signal list
+        spread_column (list): moving average list
+        date_intervals (list): list of tuples of dates, e.g [('2022-01-01','2023-01-01'),('2022-01-01','2023-01-01')]
+        look_back (int): number of most recent rows to display
+
+        Returns
+        -------
+        fig (obj): plotly dashboard
+        """
         result_json_name = 'panel_signals.json'
         df = self.data_frame
+        if look_back:
+            df = df.iloc[-look_back:,:]
         ma1 = self.settings['settings'][spread_column]['ma1']
         ma2 = self.settings['settings'][spread_column]['ma2']
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
+
+        def return_FeatureSingal_lists(feature, feature_2):
+            signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
+            signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
+            norm_list = [f'norm_{feature}', f'z_{feature}', feature]
+            return norm_list, signal_up_list, signal_low_list
+
+        # feature_list corrector: keep only features with a matching column in the data
+        new_feature_list = list()
+        for feature in feature_list:
+            norm_list, _ , _ = return_FeatureSingal_lists(feature, '')
+            for norm_feat in norm_list:
+                if norm_feat in df.columns:
+                    new_feature_list.append(feature)
+                    break
+
+        feature_list = new_feature_list
         feature_rows = len(feature_list)
 
         rows_subplot = feature_rows + 1
@@ -627,9 +975,8 @@ class produce_plotly_plots:
         ### signal plots
         for row_i, feature in enumerate(feature_list,start=1):
             feature_2 = 'nan'
-            signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
-            signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
-            norm_list = [f'norm_{feature}', f'z_{feature}', feature]
+            norm_list, signal_up_list, signal_low_list = return_FeatureSingal_lists(feature, feature_2)
+
             # signal
             for norm_feat in norm_list:
                 if norm_feat in df.columns:
@@ -647,7 +994,7 @@ class produce_plotly_plots:
             for signal_low in signal_low_list:
                 if signal_low in df.columns:
                     fig.add_trace(go.Scatter(x=df['Date'], y=np.where(df[signal_low] == 1, df[norm_feat], np.nan),showlegend= False, mode='markers', marker_color = 'red'),col = 1, row = row_i)
-
+            fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",col = 1, row = row_i)
         fig.update_layout(height=height_plot, width=1600, title_text = f'asset plot and signals: {self.ticket_name}')
 
         ## state plot with close prices
@@ -679,6 +1026,17 @@ class produce_plotly_plots:
         return fig
 
     def explore_states_ts(self):
+        """
+        display the scaled time series of every hmm state
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        fig (obj): plotly dashboard
+        """
         result_json_name = 'ts_hmm.json'
         df = self.data_frame
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
@@ -693,7 +1051,6 @@ class produce_plotly_plots:
         if len(states_subtitles)%2 == 1:
             states_subtitles = states_subtitles + [None]
 
-
         fig = make_subplots(
             rows= rows_subplot, cols=2,
             specs = [[{"type": "scatter"},{"type": "scatter"}]]*state_rows,
@@ -727,6 +1084,20 @@ class produce_plotly_plots:
         return fig
 
     def plot_hmm_analysis(self,settings, t_matrix, model = False):
+        """
+        display plots that analyse the hmm states
+
+        Parameters
+        ----------
+        settings (dict): asset configurations
+        t_matrix (txt): asset state transition matrix
+        model (obj): hmm model
+
+        Returns
+        -------
+        fig (obj): plotly dashboard
+        messages (dict): hmm model metrics
+        """
         result_json_name = 'hmm_analysis.json'
         df = self.data_frame
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
@@ -737,7 +1108,7 @@ class produce_plotly_plots:
         states = list(df.hmm_feature.unique())
         states.sort()
         ### expand hmm analysis
-        hmm_titles = ['Transition matrix heatmap' , 'state return (base first observation)','length chains dist']
+        hmm_titles = ['state return (base first observation)','Transition matrix heatmap','length chains dist']
 
         fig = make_subplots(
             rows= rows_subplot, cols=2,
@@ -758,10 +1129,16 @@ class produce_plotly_plots:
         df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
         df_['Daily_Returns'] = df['Close'].pct_change(7)
 
+        df_agg_returns = df_.groupby('hmm_feature', as_index = False).agg(median =('Daily_Returns','median')).copy()
+        current_state = df_.iloc[-1,:].hmm_feature
+        medain_state_return = df_agg_returns[ df_agg_returns.hmm_feature == current_state]['median'].values[0]
+        type_state = 'low state' if medain_state_return < 0 else 'high state'
+
         for state in states:
             dfi = df_[df_.hmm_feature == state]
             fig.add_trace(go.Box(y = dfi.chain_return, name=str(state),showlegend=False, marker_color = color_map[state] ),row=1, col=1)
-
+        fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",row=1, col=1)
+
         ## lengths chains by state dist
         if 'hmm_chain_order' in df.columns:
             df_agg = df.groupby(['hmm_feature','chain_id'],as_index = False).agg(length_by_chain = ('hmm_chain_order','max'))
@@ -802,20 +1179,20 @@ class produce_plotly_plots:
             fig.add_trace(go.Box(x = dfi.importance, name=str(feature),showlegend=False ),row=2, col=2)
             fig.update_yaxes(visible=False, title="feature",row=2, col=2)
 
-
         fig.update_layout(height=height_plot, width=1600, title_text = f'State model analysis: {self.ticket_name}', coloraxis=dict(colorbar_len=0.50))
 
         date_execution = datetime.datetime.today().strftime('%Y-%m-%d')
         current_step = df.iloc[-1,:].hmm_chain_order
         current_state = df.iloc[-1,:].hmm_feature
-        message1 = 'current state: ' + str(current_state)
-        message2 = 'current step in state: ' + str(current_step)
+        message1 = str(current_state)
+        message2 = str(current_step)
         message3 = str(date_execution)
 
         messages = {
             'current state':message1,
             'current step in state': message2,
             'execution date':message3,
+            'type state':type_state,
         }
 
         if self.show_plot:
@@ -847,7 +1224,27 @@ class produce_plotly_plots:
 
         if self.return_figs:
             return fig, messages
-    def produce_forecasting_plot(self,predictions):
+
+    def produce_forecasting_plot(self,predictions, window=30):
+        """
+        display forecasting plots
+
+        Parameters
+        ----------
+        predictions (pd.DataFrame): asset predictions
+        window (int): number of historical rows to display
+
+        Returns
+        -------
+        None
+        """
+        def qs(x):
+            return x.quantile(0.05)
+        def qm(x):
+            return x.quantile(0.50)
+        def ql(x):
+            return x.quantile(0.95)
+
         result_json_name = 'forecast_plot.json'
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
         model_type = self.settings.get('model_type',False)
@@ -863,8 +1260,6 @@ class produce_plotly_plots:
                     [{"type": "scatter"}, {"type": "scatter"}]],
             subplot_titles = [f'asset returns {lags} lags', 'closing prices', 'hidden states']
         )
-
-
         predictions = predictions[predictions.StockCode == self.ticket_name]
         if len(predictions) > 1:
 
@@ -880,12 +1275,18 @@ class produce_plotly_plots:
             last_exe_prediction_date = predictions.ExecutionDate.unique()
             last_date = max(last_exe_prediction_date)
 
-            history = predictions[(predictions.Type == 'History') & (predictions.ExecutionDate == last_date)]
+            history = self.data_frame.sort_values('Date').iloc[-window:,:]
             cut_date = history.loc[history.iloc[-1:,:].index[0]:,'Date'].item()
-
             prediction = predictions[predictions.Type == 'Prediction']
 
             ## log returns
+            def add_intervals(data,feature,i,w=5):
+                df_qs = data.sort_values('Date')[['Date',feature]].rolling(3,min_periods = 1,on='Date').apply(qs).groupby('Date',as_index=False)[feature].max()
+                df_qm = data.sort_values('Date')[['Date',feature]].rolling(3,min_periods = 1,on='Date').apply(qm).groupby('Date',as_index=False)[feature].max()
+                df_ql = data.sort_values('Date')[['Date',feature]].rolling(3,min_periods = 1,on='Date').apply(ql).groupby('Date',as_index=False)[feature].max()
+                fig.add_trace(go.Scatter(x=df_qs.Date, y=df_qs[feature], mode='lines',marker_color ='#D0D0D0',showlegend=False,opacity=0.05),row=1, col=i)
+                fig.add_trace(go.Scatter(x=df_qm.Date, y=df_qm[feature], mode='lines',marker_color ='#D0D0D0',showlegend=False,opacity=0.05, fill='tonexty'),row=1, col=i)
+                fig.add_trace(go.Scatter(x=df_ql.Date, y=df_ql[feature], mode='lines',marker_color ='#D0D0D0',showlegend=False,opacity=0.05, fill='tonexty'),row=1, col=i)
 
             fig.add_trace(go.Scatter(x=history.Date, y=history.log_return, mode='lines',marker_color ='blue',showlegend=False),row=1, col=1)
 
@@ -896,9 +1297,10 @@ class produce_plotly_plots:
             df = prediction[prediction.ExecutionDate == last_date]
             fig.add_trace(go.Scatter(x=df.Date, y=df.log_return, mode='lines',marker_color ='#ff7f0e',showlegend=False),row=1, col=1)
             fig.add_trace(go.Scatter(x=df.Date, y=df.log_return, mode='markers',marker_color ='#ff7f0e',showlegend=False),row=1, col=1)
+            fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",col = 1, row = 1)
+            add_intervals(data=prediction,feature='log_return',i=1)
 
             ## closing prices
-
             fig.add_trace(go.Scatter(x=history.Date, y=history.Close, mode='lines',marker_color ='blue',showlegend=False),row=1, col=2)
             for i,datex in enumerate([x for x in last_exe_prediction_date if x != last_date]):
                 df = prediction[prediction.ExecutionDate == datex]
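The add_intervals helper introduced above shades a quantile band around the forecast traces: the feature is run through a short rolling window at the 5th, 50th and 95th percentiles, and each successive trace is filled down to the previous one with fill='tonexty'. A self-contained illustration of the rolling-quantile step (toy data; column names assumed):

    import pandas as pd

    toy = pd.DataFrame({'Date': pd.date_range('2024-01-01', periods=6),
                        'log_return': [0.01, -0.02, 0.03, 0.00, 0.02, -0.01]})
    rolled = toy.rolling(3, min_periods=1, on='Date').quantile(0.05)   # per-window 5th percentile
    band_low = rolled.groupby('Date', as_index=False)['log_return'].max()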
@@ -908,6 +1310,7 @@ class produce_plotly_plots:
             fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines',marker_color ='#ff7f0e',showlegend=False),row=1, col=2)
             fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='markers',marker_color ='#ff7f0e',showlegend=False),row=1, col=2)
             fig.update_layout(height=height_plot, width=1600, title_text = f'forecasts: {self.ticket_name}')
+            add_intervals(data=prediction,feature='Close',i=2)
         else:
             print('no forecasting history')
 
@@ -918,9 +1321,22 @@ class produce_plotly_plots:
         if self.save_path and self.save_aws:
             # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
             upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
-
+        if self.return_figs:
+            return fig
+
 def plot_hmm_analysis_logger(data_frame,test_data_size, save_path = False, show_plot = True):
-
+    '''
+    display train and test box plots of the hmm state returns
+
+    Parameters:
+        data_frame (pd.DataFrame): asset data
+        test_data_size (int): test data size, the remaining is training data
+        save_path (str): path/to/save/
+        show_plot (boolean): if True, display the plot
+
+    Returns:
+        None
+    '''
     df = data_frame
     df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
     fig, axs = plt.subplots(1,2,figsize=(10,4))
@@ -934,7 +1350,18 @@ def plot_hmm_analysis_logger(data_frame,test_data_size, save_path = False, show_
     plt.close()
 
 def plot_hmm_tsanalysis_logger(data_frame, test_data_size,save_path = False, show_plot = True):
-
+    '''
+    display the time series hmm state analysis
+
+    Parameters:
+        data_frame (pd.DataFrame): asset data
+        test_data_size (int): test data size, the remaining is training data
+        save_path (str): path/to/save/
+        show_plot (boolean): if True, display the plot
+
+    Returns:
+        None
+    '''
     df = data_frame
     df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
     states = list(df_['hmm_feature'].unique())
@@ -961,7 +1388,20 @@ def plot_hmm_tsanalysis_logger(data_frame, test_data_size,save_path = False, sho
     plt.close()
 
 def extract_data_traintest(object_stock,features_to_search,configs, target_configs, window_analysis = False, drop_nan= True):
-
+    '''
+    code snippet that executes object_stock (a stock_eda_panel) to get the features
+
+    Parameters:
+        object_stock (object): stock_eda_panel object
+        features_to_search (list): list of features
+        configs (dict): asset configurations
+        target_configs (dict): target configurations
+        window_analysis (int): take a data sample of this size
+        drop_nan (boolean): remove nans from the data
+
+    Returns:
+        object_stock (obj): object_stock with the features and signals
+    '''
     object_stock.get_data()
     object_stock.volatility_analysis(**configs['volatility']['config_params'], plot = False, save_features = False)
     target_params_up = target_configs['params_up']
@@ -972,7 +1412,26 @@ def extract_data_traintest(object_stock,features_to_search,configs, target_confi
         arguments_to_use = configs[feature_name]['config_params']
         method_to_use = configs[feature_name]['method']
         getattr(object_stock, method_to_use)(**arguments_to_use, plot = False, save_features = False)
-        object_stock.produce_order_features(feature_name)
+        if method_to_use not in ['minmax_pricefeature']:
+            object_stock.produce_order_features(feature_name)
+            object_stock.get_order_feature_nosignal(feature_name)
+        last_signal_featlist = configs.get('custom_transformations',{}).get('compute_last_signal', False)
+        if last_signal_featlist:
+            last_signal_featlist = last_signal_featlist
+            last_signal_featlist = last_signal_featlist.split('//')
+            if feature_name in last_signal_featlist:
+                object_stock.compute_last_signal(feature_name, False)
+    volatility_features = configs.get('custom_transformations',{}).get('volatility_features', False)
+    if volatility_features:
+        for al in volatility_features:
+            object_stock.lag_log_return(lags = al, feature="Close", feature_name=f"asset_{al}_logreturn")
+            object_stock.produce_log_volatility(trad_days=al,feature=f"asset_{al}_logreturn",feature_name=f"asset_{al}_volatility")
+    market_interaction_features = configs.get('custom_transformations',{}).get('market_interaction_features', False)
+    if market_interaction_features:
+        for stage in market_interaction_features.keys():
+            method_to_use = market_interaction_features.get(stage).get("method")
+            arguments_to_use = market_interaction_features.get(stage).get("parameters")
+            getattr(object_stock, method_to_use)(**arguments_to_use)
     # getting targets
     object_stock.get_categorical_targets(**target_params_up)
     object_stock.df = object_stock.df.drop(columns = ['target_down']).rename(columns = {'target_up':'target_up_save'})
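From the branches above, the custom_transformations block of the configs appears to take a shape like the following (an inferred sketch; only the keys the code reads are shown, and the method and parameter values are hypothetical):

    configs['custom_transformations'] = {
        'compute_last_signal': 'rsi_feature//macd_feature',   # '//'-delimited feature names
        'volatility_features': [7, 30],                       # horizons for log-return volatility
        'market_interaction_features': {
            'stage_1': {'method': 'some_panel_method',        # hypothetical method name
                        'parameters': {'arg': 1}},
        },
    }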
@@ -987,7 +1446,19 @@ def extract_data_traintest(object_stock,features_to_search,configs, target_confi
     return object_stock
 
 def produce_simple_ts_from_model(stock_code, configs, n_days = 2000 , window_scope = '5y'):
-
+    '''
+    display a dashboard analysis of a given asset
+
+    Parameters:
+        stock_code (str): asset name
+        configs (dict): asset configurations
+        n_days (int): data size
+        window_scope (str): window data size
+
+    Returns:
+        fig (obj): plotly dashboard
+        df (pd.DataFrame): the resulting asset dataset
+    '''
     ## getting data
     volat_args = {'lags': 3, 'trad_days': 15, 'window_log_return': 10}
 
@@ -1038,7 +1509,7 @@ def produce_simple_ts_from_model(stock_code, configs, n_days = 2000 , window_sco
         for signal_low in signal_low_list:
             if signal_low in df.columns:
                 fig.add_trace(go.Scatter(x=df['Date'], y=np.where(df[signal_low] == 1, df[norm_feat], np.nan),showlegend= False, mode='markers', marker_color = 'red'),col = 1, row = row_i)
-
+        fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",col = 1, row = row_i)
     fig.update_layout(height=height_plot, width=1600, title_text = f'asset plot and signals: {stock_code}')
 
     del object_stock
@@ -1046,17 +1517,21 @@ def produce_simple_ts_from_model(stock_code, configs, n_days = 2000 , window_sco
     return fig, df
 
 def save_edge_model(data, save_path = False, save_aws = False, show_result = False, aws_credentials = False):
-    """
-    data: pandas df
-    model_name: str
-    ticket_name: str name of the asset
-    save_path: str local path for saving e.g r'C:/path/to/the/file/'
-    save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-    show_results: bool
-    aws_credentials: dict
-
-    return a print of the dictionary
-    """
+    '''
+    get the latest edge execution and the edge probability
+
+    Parameters:
+        data (pd.DataFrame): asset data
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        show_result (bool): if True, display the results
+        aws_credentials (dict): aws credentials
+
+    Returns:
+        None
+    '''
     today = datetime.datetime.today().strftime('%Y-%m-%d')
 
     curent_edge = (
@@ -1079,10 +1554,25 @@ def save_edge_model(data, save_path = False, save_aws = False, show_result = Fal
     if show_result:
         print(curent_edge)
 
+## this function is going to be split and deprecated
 def create_feature_edge(model, data,feature_name, threshold, target_variables):
-
+    '''
+    compute the edge probabilities and attach the edge signals for a feature
+
+    Parameters:
+        model (obj): edge model artifact
+        data (pd.DataFrame): asset data
+        feature_name (str): edge feature name
+        threshold (float): edge threshold
+        target_variables (list): names of the target columns
+
+    Returns:
+        result_df (pd.DataFrame): result dataframe with the edges
+    '''
     label_prediction = ['proba_'+x for x in target_variables]
     predictions = model.predict_proba(data)
+    if isinstance(predictions, list):
+        predictions = np.array([ x[:,1].T for x in predictions]).T
     predictions = pd.DataFrame(predictions, columns = label_prediction, index = data.index)
 
     result_df = pd.concat([data, predictions], axis=1)
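The new isinstance guard covers scikit-learn multi-output classifiers, whose predict_proba returns one (n_samples, 2) array per target; stacking column 1 of each yields an (n_samples, n_targets) matrix of positive-class probabilities. A self-contained check (the estimator choice is an assumption):

    import numpy as np
    from sklearn.datasets import make_multilabel_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multioutput import MultiOutputClassifier

    X, y = make_multilabel_classification(n_samples=50, n_classes=2, random_state=0)
    model = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
    probas = model.predict_proba(X)               # list of two (50, 2) arrays
    stacked = np.array([p[:, 1].T for p in probas]).T
    print(stacked.shape)                          # (50, 2): one positive-class column per target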
@@ -1095,4 +1585,57 @@ def create_feature_edge(model, data,feature_name, threshold, target_variables):
         result_df[f'signal_{type_use}_{feature_name}'] = np.where(result_df[pred_col] >= threshold,1,0)
         result_df[f'acc_{type_use}_{feature_name}'] = np.where(result_df[f'signal_{type_use}_{feature_name}'] == result_df[pred_col.replace('proba_','')],1,0)
 
-    return result_df
+    return result_df
+
+def produce_probas(model,data, target_variables):
+    """
+    produce the probabilities given a model
+
+    Parameters:
+        model (obj): edge model artifact
+        data (pd.DataFrame): asset data
+        target_variables (list): names of the target columns
+
+    Returns:
+        result_df (pd.DataFrame): result dataframe with the edges
+        label_prediction (list): list of the resulting label columns
+    """
+    label_prediction = ['proba_'+x for x in target_variables]
+    predictions = model.predict_proba(data)
+    if isinstance(predictions, list):
+        predictions = np.array([ x[:,1].T for x in predictions]).T
+    predictions = pd.DataFrame(predictions, columns = label_prediction, index = data.index)
+    result_df = pd.concat([data, predictions], axis=1)
+    result_df = result_df[['Date'] + target_variables + label_prediction]
+
+    return result_df, label_prediction
+
+def produce_signals(result_df, feature_name, threshold, label_prediction):
+    """
+    produce the signals from the probabilities
+
+    Parameters:
+        result_df (pd.DataFrame): asset data with the probabilities
+        feature_name (str): edge feature name
+        threshold (float): edge threshold
+        label_prediction (list): list of the resulting label columns
+
+    Returns:
+        result_df (pd.DataFrame): result dataframe with the edges and signals
+    """
+    for pred_col in label_prediction:
+        type_use = 'low'
+        if 'down' in pred_col:
+            type_use = 'up'
+
+        result_df[f'signal_{type_use}_{feature_name}'] = np.where(result_df[pred_col] >= threshold,1,0)
+        result_df[f'acc_{type_use}_{feature_name}'] = np.where(result_df[f'signal_{type_use}_{feature_name}'] == result_df[pred_col.replace('proba_','')],1,0)
+
+    return result_df
+
+def clean_cols(data, patterns):
+    drop_cols = list()
+    for pattern in patterns:
+        drop_cols = drop_cols + [ x for x in data.columns if pattern in x]
+    data = data.drop(columns = drop_cols)
+    return data
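Taken together, produce_probas and produce_signals split create_feature_edge into a probability step and a thresholding step, with clean_cols available to drop helper columns afterwards. A hypothetical end-to-end call (model, data and names are placeholders):

    probas_df, label_cols = produce_probas(model, data, ['target_up', 'target_down'])
    edges_df = produce_signals(probas_df, 'edge_feature', threshold=0.6, label_prediction=label_cols)
    edges_df = clean_cols(edges_df, patterns=['acc_'])    # e.g. drop the accuracy helper columns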