virgo-modules 0.0.72__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,18 @@ from pykalman import KalmanFilter
 from .aws_utils import upload_file_to_aws
 
 def calculate_cointegration(series_1, series_2):
+    '''
+    calculate the cointegration score of two time series.
+
+    Parameters:
+        series_1 (pd.Series): pandas series of the asset returns
+        series_2 (pd.Series): pandas series of the asset returns
+
+    Returns:
+        coint_flag (int): cointegration flag, 1 or 0; 1 if the p-value is below 0.05 and coint_t is below the critical value
+        hedge_value (float): hedge value
+    '''
+
     coint_flag = 0
     coint_res = coint(series_1, series_2)
     coint_t = coint_res[0]
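The hunk ends before the flag and hedge-ratio logic, but the statsmodels pieces it names are enough to sketch the whole routine. A minimal, self-contained version of such a check (an illustration, not the package's verbatim body; the no-intercept OLS hedge ratio and the 5% critical value are assumptions):

    import pandas as pd
    import statsmodels.api as sm
    from statsmodels.tsa.stattools import coint

    def cointegration_check(series_1: pd.Series, series_2: pd.Series):
        # coint returns the t-statistic, the p-value and the 1%/5%/10% critical values
        coint_t, p_value, critical_values = coint(series_1, series_2)
        coint_flag = 1 if (p_value < 0.05) and (coint_t < critical_values[1]) else 0
        # hedge ratio via a no-intercept OLS of series_1 on series_2 (assumed)
        hedge_value = sm.OLS(series_1, series_2).fit().params.iloc[0]
        return coint_flag, hedge_value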
@@ -44,8 +56,43 @@ def calculate_cointegration(series_1, series_2):
     return coint_flag, hedge_value
 
 class pair_finder():
+    """
+    class that assesses two assets to evaluate whether they are cointegrated
+
+    Attributes
+    ----------
+    df : pd.DataFrame
+        dataframe of the merged assets with the spread score
+    asset_1 : str
+        asset to assess
+    asset_2 : str
+        secondary asset to assess
+
+    Methods
+    -------
+    produce_zscore(window=int, z_threshold=float, verbose=boolean):
+        produce the z-score from the spread and derive signals using window functions
+    plot_scores():
+        display plots of the time series, the signals, and other views of the pair-signal strategy
+    evaluate_signal(days_list=list(), test_size=int, signal_position=int, threshold=float, verbose=boolean, plot=boolean):
+        evaluate the signal strategy using future returns
+    create_backtest_signal(days_strategy=int, test_size=int):
+        create a backtest of the strategy and get some plot analysis
+    """
     def __init__(self, raw_data , asset_1 ,asset_2):
-
+        """
+        Initialize the object, selecting just the two assets and getting the spread between them
+
+        Parameters
+        ----------
+        raw_data (pd.DataFrame): dataframe of all assets
+        asset_1 (str): asset to assess
+        asset_2 (str): secondary asset to assess
+
+        Returns
+        -------
+        None
+        """
         df = raw_data[[asset_1, asset_2]]
         coint_flag, hedge_ratio = calculate_cointegration(df[asset_1], df[asset_2])
         spread = df[asset_1] - (hedge_ratio * df[asset_2])
@@ -55,6 +102,19 @@ class pair_finder():
         self.asset_2 = asset_2
 
     def produce_zscore(self, window, z_threshold, verbose = False):
+        """
+        produce the z-score from the spread and derive signals using window functions
+
+        Parameters
+        ----------
+        window (int): window size
+        z_threshold (float): alpha and z threshold for the normalized feature
+        verbose (boolean): if True, print the analysis
+
+        Returns
+        -------
+        None
+        """
         self.z_threshold = z_threshold
         spread_series = pd.Series(self.df.spread)
         mean = spread_series.rolling(center = False, window = window).mean()
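The hunk cuts off after the rolling mean; the usual continuation of a rolling z-score, for orientation (a sketch only, with the final column name assumed rather than taken from the package):

    std = spread_series.rolling(center=False, window=window).std()
    zscore = (spread_series - mean) / std      # normalized spread
    self.df['zscore'] = zscore.values          # 'zscore' is a hypothetical column name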
@@ -74,7 +134,17 @@ class pair_finder():
         self.df['low_pair_signal'] = low_signal
 
     def plot_scores(self):
-
+        """
+        display plots of the time series, the signals, and other views of the pair-signal strategy
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
         plt.axhline(y=0.0, color='grey', linestyle='--')
         plt.figure(1, figsize = (10, 4))
         plt.plot(self.df.spread.values)
@@ -104,7 +174,22 @@
         fig.show()
 
     def evaluate_signal(self, days_list,test_size, signal_position = False,threshold = 0.05,verbose = False, plot = False):
-
+        """
+        evaluate the signal strategy using future returns
+
+        Parameters
+        ----------
+        days_list (list): list of future-return horizons in days
+        test_size (int): test data size; the remaining data is taken as training data
+        signal_position (int): position of the signal at which to open a position
+        threshold (float): alpha or z threshold of the normalized feature
+        verbose (boolean): if True, print results
+        plot (boolean): if True, display plots
+
+        Returns
+        -------
+        None
+        """
         df = self.df.sort_values('Date').iloc[0:-test_size,:].copy()
         returns_list = list()
 
@@ -206,6 +291,18 @@ class pair_finder():
         del df
 
     def create_backtest_signal(self,days_strategy, test_size):
+        """
+        create a backtest of the strategy and get some plot analysis
+
+        Parameters
+        ----------
+        days_strategy (int): future-return horizon in days for the strategy
+        test_size (int): test data size; the remaining data is taken as training data
+
+        Returns
+        -------
+        None
+        """
         asset_1 = self.asset_1
         df1 = self.df.iloc[-test_size:,:].copy()
         df2 = df1.copy()
@@ -273,7 +370,18 @@ class pair_finder():
         del df1,df2,dft
 
 def produce_big_dataset(data_frames, stocks_codes_, feature_list, limit = 500):
-
+    '''
+    combine multiple assets, taking a common schema
+
+    Parameters:
+        data_frames (dict): dictionary of asset dataframes
+        stocks_codes_ (list): assets to select
+        feature_list (list): feature list
+        limit (int): number of observations per asset
+
+    Returns:
+        dataframe (pd.DataFrame): base dataframe with the extra data
+    '''
     feature_list_ = list()
     columns_vector = list(data_frames[stocks_codes_[-1]].columns )
     for feat in feature_list:
@@ -301,7 +409,19 @@ def produce_big_dataset(data_frames, stocks_codes_, feature_list, limit = 500):
     return dataframe
 
 def ranking(data, weighted_features, top = 5, window = 5):
-
+    '''
+    Create a ranking of assets given the current signals and a weighted average importance
+
+    Parameters:
+        data (pd.DataFrame): base data
+        weighted_features (dict): configuration dictionary
+        top (int): number of top assets to return
+        window (int): number of days to assess
+
+    Returns:
+        top_up (list): top upper-signal assets
+        top_low (list): top lower-signal assets
+    '''
     features = weighted_features.keys()
     up_columns = ['signal_up_' + x for x in features]
     low_columns = ['signal_low_' + x for x in features]
@@ -333,19 +453,80 @@ def ranking(data, weighted_features, top = 5, window = 5):
     top_up = list(df.sort_values('up_signas', ascending = False).index)[:top]
     top_low = list(df.sort_values('low_signas', ascending = False).index)[:top]
 
-    return top_up, top_low
+    return top_up, top_low, df
+
+def ranking_first(data, weighted_features, top = 5, window = 5):
+    '''
+    Create a ranking of assets given the current signals and a weighted average importance
+
+    Parameters:
+        data (pd.DataFrame): base data
+        weighted_features (dict): configuration dictionary
+        top (int): number of top assets to return
+        window (int): number of days to assess
+
+    Returns:
+        top_up (list): top upper-signal assets
+        top_low (list): top lower-signal assets
+    '''
+    features = weighted_features.keys()
+    up_columns = ['signal_up_' + x for x in features]
+    low_columns = ['signal_low_' + x for x in features]
+
+    def compute_score(df,col,window):
+        score = 0
+        for i in range(window):
+            row = df.iloc[i]
+            if (row[col] == 1) and (i == 0):
+                score += 1000
+            elif (row[col] == 1) and (i == 1):
+                score -= 200
+            elif (row[col] == 1) and (i >= 2):
+                score -= 50
+        return score
+
+    ticket_list= list(data.Ticket.unique())
+    result = dict()
+    for ticket in ticket_list:
+        result[ticket] = dict()
+        df = data[data.Ticket == ticket].sort_values('Date').iloc[-window:]
+
+        for col in low_columns:
+            df = df.sort_values('Date', ascending = False)
+            score = compute_score(df,col,window)
+            result[ticket][col] = score
+        for col in up_columns:
+            score = 0
+            df = df.sort_values('Date', ascending = False)
+            score = compute_score(df,col,window)
+            result[ticket][col] = score
+
+    df = pd.DataFrame(result).T
+    df['up_signas'] = df[up_columns].sum(axis=1)
+    df['low_signas'] = df[low_columns].sum(axis=1)
+
+    top_up = list(df.sort_values('up_signas', ascending = False).index)[:top]
+    top_low = list(df.sort_values('low_signas', ascending = False).index)[:top]
+    return top_up, top_low, df
 
 def produce_dashboard(data, columns , ticket_list, show_plot = True, nrows = 150,save_name = False, save_path = False, save_aws = False, aws_credential = False):
-    """
-    data: pandas df
-    columns: list
-    ticket_list: list asset list
-    nrows: int
-    show_plot: bool
-    save_path: str local path for saving e.g r'C:/path/to/the/file/'
-    save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-    aws_credentials: dict
-    """
+    '''
+    produce a dashboard using the signals and a list of assets
+
+    Parameters:
+        data (pd.DataFrame): base data
+        columns (list): list of features or signals
+        ticket_list (list): list of assets
+        show_plot (boolean): if True, display the plot
+        nrows (int): number of days back to display
+        save_name (str): name of the resulting dashboard file
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credential (dict): aws credentials
+
+    Returns:
+        None
+    '''
     top = len(ticket_list)
     columns = ['history'] + columns
     subtitles = list()
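For intuition on the ranking_first scoring added in the hunk above: compute_score walks the window from the most recent day backwards, rewarding a signal that fired today (+1000) and penalizing stale fires (-200 the day before, -50 earlier), so fresh signals dominate the ranking. A standalone replica of that nested helper (toy data; as in the function, the frame is assumed sorted newest row first):

    import pandas as pd

    def compute_score(df, col, window):            # replica of the nested helper
        score = 0
        for i in range(window):
            row = df.iloc[i]
            if (row[col] == 1) and (i == 0):
                score += 1000
            elif (row[col] == 1) and (i == 1):
                score -= 200
            elif (row[col] == 1) and (i >= 2):
                score -= 50
        return score

    toy = pd.DataFrame({'signal_up_x': [1, 0, 1]})  # row 0 = today
    print(compute_score(toy, 'signal_up_x', 3))     # 950: fresh fire minus one stale fire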
@@ -393,9 +574,66 @@ def produce_dashboard(data, columns , ticket_list, show_plot = True, nrows = 150
         # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'multi_dashboards/'+save_name+'.json',input_path = save_path+save_name+'.json')
         upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + save_name + '.json', input_path = save_path + save_name + '.json', aws_credentials = aws_credential)
 
+def produce_edges_dashboard(dataframe, ticket_list, save_name, show_plot = False, save_path = False, save_aws = False, aws_credentials = False):
+    '''
+    produce a dashboard of closing prices and edge probabilities for a list of assets
+
+    Parameters:
+        dataframe (pd.DataFrame): base data
+        ticket_list (list): list of assets
+        save_name (str): name of the resulting dashboard file
+        show_plot (boolean): if True, display the plot
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credentials (dict): aws credentials
+
+    Returns:
+        None
+    '''
+    n_assets = len(ticket_list)
+
+    result_json_name = save_name
+    cols_length = 4
+    rows_length = math.ceil(n_assets/2)
+
+    subtitles = list()
+    for x in ticket_list:
+        subtitles.append(x)
+        subtitles.append(x + ' signal')
+
+    fig = make_subplots(rows=rows_length, cols=cols_length,vertical_spacing = 0.01, horizontal_spacing = 0.03, shared_xaxes=True, subplot_titles = subtitles)
+
+    for i,ticket in enumerate(ticket_list):
+        j = i%2*2 +1
+        i = i+1
+        i_r = math.ceil(i/2)
+
+        show_legend = True if i == 1 else False
+
+        df = dataframe[dataframe.asset == ticket]
+        fig.add_trace(go.Scatter(x=df['Date'], y=df['Close'],legendgroup="Close",showlegend = show_legend , mode='lines',name = 'Close', marker_color = 'blue'),col = j, row = i_r)
+        fig.add_trace(go.Scatter(x=df['Date'], y=df['proba_target_up'],legendgroup="proba",showlegend = show_legend , mode='lines',name = 'proba_target_up', marker_color = 'orange'),col = j+1, row = i_r)
+    fig.update_layout(height=rows_length*300, width=1500, title_text = f'dashboard top {n_assets} tickets')
+
+    if save_path:
+        fig.write_json(save_path+result_json_name)
+    if show_plot:
+        fig.show()
+    if save_path and save_aws:
+        upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = save_aws + result_json_name, input_path = save_path + result_json_name, aws_credentials = aws_credentials)
 
 def rank_by_return(data, lag_days, top_n = 5):
-
+    '''
+    produce a ranking by returns
+
+    Parameters:
+        data (pd.DataFrame): base data
+        lag_days (int): number of days to consider
+        top_n (int): number of top assets to return
+
+    Returns:
+        result (list): the top n assets
+    '''
     data = data.sort_values(['Ticket','Date'], ascending=[False,False]).reset_index(drop = True)
     data['first'] = data.sort_values(['Date'], ascending=[False]).groupby(['Ticket']).cumcount() + 1
     data = data[data['first'] <= lag_days]
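A note on the grid arithmetic in produce_edges_dashboard above: each asset occupies a pair of adjacent columns (price panel, then signal panel) in a 4-column grid, two assets per row. A quick check of the index math (illustrative):

    import math
    for i in range(4):                       # 0-based asset index
        col_price = i % 2 * 2 + 1            # 1, 3, 1, 3  -> the 'j' column
        row = math.ceil((i + 1) / 2)         # 1, 1, 2, 2  -> the 'i_r' row
        print(i, col_price, row)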
@@ -416,18 +654,19 @@ def rank_by_return(data, lag_days, top_n = 5):
     return result
 
 def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_available: object = False, data_window:str = '5y') -> object:
-    """
-    this functions runs the stock_eda_panel
-    it is shared between train model and predictions
-    arguments:
-    hmm_available: if the hmm is available, in prediction is required
-    ticker_name: name of the asset
-    ticket_settings: dictionary with all the parameters to compute features
-    n_days: to set an arbitrary data size
-
-    returns: stock eda panel
-    """
-
+    '''
+    this function runs the stock_eda_panel. It is shared between model training and predictions
+
+    Parameters:
+        ticker_name (str): name of the asset
+        ticket_settings (dict): dictionary with all the parameters to compute features
+        n_days (int): to set an arbitrary data size
+        hmm_available (obj): the available hmm model; required in prediction
+        data_window (str): window for the data extraction
+
+    Returns:
+        object_stock (obj): the resulting stock_eda_panel object
+    '''
     object_stock = stock_eda_panel(ticker_name , n_days, data_window)
     object_stock.get_data()
 
@@ -461,7 +700,11 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
         'stochastic_feature':'stochastic_feature',
         'william_feature':'william_feature',
         'vortex_feature':'vortex_feature',
-        'pair_index_feature':'pair_index_feature' # this has a diff structure!
+        'pair_index_feature':'pair_index_feature', # this has a diff structure!
+        'min_distance_pricefeature':'minmax_pricefeature',
+        'min_relprice_pricefeature':'minmax_pricefeature',
+        'max_distance_pricefeature':'minmax_pricefeature',
+        'max_relprice_pricefeature':'minmax_pricefeature',
     }
     exceptions = ['pair_feature','pair_index_feature']
     ### standard feature
@@ -506,6 +749,7 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
     if len(discrete_features) > 0:
         for feature_name in discrete_features:
             object_stock.produce_order_features(feature_name)
+            object_stock.get_order_feature_nosignal(feature_name)
 
     if hmm_available:
         object_stock.cluster_hmm_analysis( n_clusters = None,
@@ -517,13 +761,24 @@ def get_data(ticker_name:str, ticket_settings:dict, n_days:int = False, hmm_avai
         object_stock.cluster_hmm_analysis( n_clusters = ticket_settings['settings']['hmm']['n_clusters'],
                                            features_hmm = ticket_settings['settings']['hmm']['features_hmm'],
                                            test_data_size = ticket_settings['settings']['hmm']['test_data_size'],
-                                           seed = ticket_settings['settings']['hmm']['seed'])
+                                           seed = ticket_settings['settings']['hmm']['seed'],
+                                           corr_threshold = ticket_settings['settings']['hmm'].get('corr_threshold',0.75),
+                                           lag_returns_state = ticket_settings['settings']['hmm'].get('lag_returns_state',7),
+                                           )
 
     return object_stock
 
 trends = {'adjusted' : 0.001, 'smooth' : 0.0001}
 
 def apply_KF(self, trends):
+    '''
+    create a Kalman filter feature and attach it to the stock_eda_panel object
+
+    Parameters:
+        trends (dict): configuration of the Kalman filter
+    Returns:
+        None
+    '''
     for ttrend in trends:
         tcov = trends.get(ttrend)
         kf = KalmanFilter(transition_matrices = [1],
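The hunk ends mid-constructor; for orientation, here is a minimal local-level smoother in the spirit of apply_KF using pykalman's public API (the price series and the remaining constructor arguments are assumptions; the transition covariance is the 'adjusted' value from the trends dict above):

    import numpy as np
    from pykalman import KalmanFilter

    prices = 100 + np.cumsum(np.random.randn(250))    # stand-in closing prices
    kf = KalmanFilter(transition_matrices=[1],        # random-walk state
                      observation_matrices=[1],
                      initial_state_mean=prices[0],
                      initial_state_covariance=1,
                      observation_covariance=1,
                      transition_covariance=0.001)    # trends['adjusted']
    state_means, _ = kf.filter(prices)                # one smoothed trend per configured key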
@@ -537,11 +792,24 @@ def apply_KF(self, trends):
 
 stock_eda_panel.apply_KF = apply_KF
 
-def call_ml_objects(stock_code, client, call_models = False):
-
+def call_ml_objects(stock_code, client, call_models = False, clean_name=False):
+    '''
+    call artifacts from mlflow
+
+    Parameters:
+        stock_code (str): asset name
+        client (obj): mlflow client
+        call_models (boolean): if True, call the ml model artifacts
+        clean_name (boolean): if True, sanitize the asset name before the registry lookup
+    Returns:
+        objects (dict): dictionary that contains the ml artifacts, data, configs and models
+    '''
     objects = dict()
 
-    registered_model_name = f'{stock_code}_models'
+    if clean_name:
+        renamed_stock_code = stock_code.replace("^","__",).replace(".","__").replace("=","__").replace("-","__")
+        registered_model_name = f'{renamed_stock_code}_models'
+    else:
+        registered_model_name = f'{stock_code}_models'
     latest_version_info = client.get_latest_versions(registered_model_name, stages=["Production"])
     latest_production_version = latest_version_info[0].version
     run_id_prod_model = latest_version_info[0].run_id
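The new clean_name branch swaps registry-hostile ticker characters for double underscores before the model lookup; a quick illustration of the resulting registered names:

    for code in ['^GSPC', 'BTC-USD', 'BRK.B']:
        renamed = code.replace("^","__").replace(".","__").replace("=","__").replace("-","__")
        print(f'{renamed}_models')   # __GSPC_models, BTC__USD_models, BRK__B_models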
@@ -552,18 +820,27 @@ def call_ml_objects(stock_code, client, call_models = False):
     )
 
     ## calling models
-
+    if clean_name:
+        path_hmm = f"runs:/{run_id_prod_model}/{renamed_stock_code}-hmm-model"
+    else:
+        path_hmm = f"runs:/{run_id_prod_model}/{stock_code}-hmm-model"
+
     hmm_model = mlflow.pyfunc.load_model(
-        f"runs:/{run_id_prod_model}/{stock_code}-hmm-model",
-        suppress_warnings = True
+        path_hmm,
+        suppress_warnings = True
     )
     objects['called_hmm_models'] = hmm_model
 
     if call_models:
 
+        if clean_name:
+            path_model = f"runs:/{run_id_prod_model}/{renamed_stock_code}-forecasting-model"
+        else:
+            path_model = f"runs:/{run_id_prod_model}/{stock_code}-forecasting-model"
+
         forecasting_model = mlflow.pyfunc.load_model(
-            f"runs:/{run_id_prod_model}/{stock_code}-forecasting-model",
-            suppress_warnings = True
+            path_model,
+            suppress_warnings = True
         )
         objects['called_forecasting_model'] = forecasting_model
 
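For orientation, mlflow.pyfunc.load_model accepts exactly this kind of runs:/ URI; a standalone call looks like the following (the run id and artifact name are placeholders):

    import mlflow

    model = mlflow.pyfunc.load_model("runs:/0a1b2c3d4e5f/AAPL-hmm-model",
                                     suppress_warnings=True)
    # the returned pyfunc wrapper exposes model.predict(...)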
@@ -584,17 +861,57 @@
     return objects
 
 class produce_plotly_plots:
+    """
+    class that helps to produce the different dashboards
+
+    Attributes
+    ----------
+    ticket_name : str
+        asset name
+    data_frame : pd.DataFrame
+        asset data
+    settings : dict
+        asset configurations
+    show_plot : boolean
+        if True, display plots
+    save_path : str
+        local path for saving e.g r'C:/path/to/the/file/'
+    save_aws : str
+        remote key in s3 bucket path e.g. 'path/to/file/'
+    aws_credentials : dict
+        aws credentials
+    return_figs : boolean
+        if True, methods return the figure objects
+
+    Methods
+    -------
+    plot_asset_signals(feature_list=list, spread_column=list, date_intervals=list):
+        display signals and hmm states over closing prices and feature time series
+    explore_states_ts():
+        display the scaled time series of every hmm state
+    plot_hmm_analysis(settings=dict, t_matrix=txt, model=obj):
+        display plots that analyse the hmm states
+    produce_forecasting_plot(predictions=pd.DataFrame):
+        display forecasting plots
+    """
     def __init__(self,ticket_name, data_frame,settings, save_path = False, save_aws = False, show_plot= True, aws_credentials = False, return_figs = False):
         """
-        ticket_name: str asset name
-        data_frame: pandas df
-        settings: dict
-        show_plot: bool
-        save_path: str local path for saving e.g r'C:/path/to/the/file/'
-        save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-        aws_credentials: dict
+        Initialize the object
+
+        Parameters
+        ----------
+        ticket_name (str): asset name
+        data_frame (pd.DataFrame): asset data
+        settings (dict): asset configurations
+        show_plot (boolean): if True, display plots
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        aws_credentials (dict): aws credentials
+        return_figs (boolean): if True, methods return the figure objects
+
+        Returns
+        -------
+        None
         """
-
         self.ticket_name = ticket_name
         self.data_frame = data_frame
         self.settings = settings
@@ -604,13 +921,44 @@ class produce_plotly_plots:
         self.aws_credentials = aws_credentials
         self.return_figs = return_figs
 
-    def plot_asset_signals(self, feature_list,spread_column, date_intervals = False):
-
+    def plot_asset_signals(self, feature_list,spread_column, date_intervals = False, look_back = 800):
+        """
+        display signals and hmm states over closing prices and feature time series
+
+        Parameters
+        ----------
+        feature_list (list): signal list
+        spread_column (list): moving average list
+        date_intervals (list): list of tuples of dates, e.g [('2022-01-01','2023-01-01'),('2022-01-01','2023-01-01')]
+        look_back (int): number of most recent rows to display
+
+        Returns
+        -------
+        fig (obj): plotly dashboard
+        """
         result_json_name = 'panel_signals.json'
         df = self.data_frame
+        if look_back:
+            df = df.iloc[-look_back:,:]
         ma1 = self.settings['settings'][spread_column]['ma1']
         ma2 = self.settings['settings'][spread_column]['ma2']
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
+
+        def return_FeatureSingal_lists(feature, feature_2):
+            signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
+            signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
+            norm_list = [f'norm_{feature}', f'z_{feature}', feature]
+            return norm_list, signal_up_list, signal_low_list
+
+        # feature_list corrector: keep only features with a matching column in the data
+        new_feature_list = list()
+        for feature in feature_list:
+            norm_list, _ , _ = return_FeatureSingal_lists(feature, '')
+            for norm_feat in norm_list:
+                if norm_feat in df.columns:
+                    new_feature_list.append(feature)
+                    break
+
+        feature_list = new_feature_list
         feature_rows = len(feature_list)
 
         rows_subplot = feature_rows + 1
@@ -627,9 +975,8 @@ class produce_plotly_plots:
         ### signal plots
         for row_i, feature in enumerate(feature_list,start=1):
             feature_2 = 'nan'
-            signal_up_list = [f'signal_up_{feature}', f'signal_up_{feature_2}']
-            signal_low_list = [f'signal_low_{feature}', f'signal_low_{feature_2}']
-            norm_list = [f'norm_{feature}', f'z_{feature}', feature]
+            norm_list, signal_up_list, signal_low_list = return_FeatureSingal_lists(feature, feature_2)
+
             # signal
             for norm_feat in norm_list:
                 if norm_feat in df.columns:
@@ -647,7 +994,7 @@ class produce_plotly_plots:
             for signal_low in signal_low_list:
                 if signal_low in df.columns:
                     fig.add_trace(go.Scatter(x=df['Date'], y=np.where(df[signal_low] == 1, df[norm_feat], np.nan),showlegend= False, mode='markers', marker_color = 'red'),col = 1, row = row_i)
-
+            fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",col = 1, row = row_i)
         fig.update_layout(height=height_plot, width=1600, title_text = f'asset plot and signals: {self.ticket_name}')
 
         ## state plot with close prices
@@ -679,6 +1026,17 @@ class produce_plotly_plots:
         return fig
 
     def explore_states_ts(self):
+        """
+        display the scaled time series of every hmm state
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        fig (obj): plotly dashboard
+        """
         result_json_name = 'ts_hmm.json'
         df = self.data_frame
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
@@ -693,7 +1051,6 @@ class produce_plotly_plots:
         if len(states_subtitles)%2 == 1:
             states_subtitles = states_subtitles + [None]
 
-
         fig = make_subplots(
             rows= rows_subplot, cols=2,
             specs = [[{"type": "scatter"},{"type": "scatter"}]]*state_rows,
@@ -727,6 +1084,20 @@ class produce_plotly_plots:
         return fig
 
     def plot_hmm_analysis(self,settings, t_matrix, model = False):
+        """
+        display plots that analyse the hmm states
+
+        Parameters
+        ----------
+        settings (dict): asset configurations
+        t_matrix (txt): asset state transition matrix
+        model (obj): hmm model
+
+        Returns
+        -------
+        fig (obj): plotly dashboard
+        messages (dict): hmm model metrics
+        """
         result_json_name = 'hmm_analysis.json'
         df = self.data_frame
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
@@ -737,7 +1108,7 @@ class produce_plotly_plots:
         states = list(df.hmm_feature.unique())
         states.sort()
         ### expand hmm analysis
-        hmm_titles = ['Transition matrix heatmap' , 'state return (base first observation)','length chains dist']
+        hmm_titles = ['state return (base first observation)','Transition matrix heatmap','length chains dist']
 
         fig = make_subplots(
             rows= rows_subplot, cols=2,
@@ -758,10 +1129,16 @@ class produce_plotly_plots:
         df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
         df_['Daily_Returns'] = df['Close'].pct_change(7)
 
+        df_agg_returns = df_.groupby('hmm_feature', as_index = False).agg(median =('Daily_Returns','median')).copy()
+        current_state = df_.iloc[-1,:].hmm_feature
+        medain_state_return = df_agg_returns[ df_agg_returns.hmm_feature == current_state]['median'].values[0]
+        type_state = 'low state' if medain_state_return < 0 else 'high state'
+
         for state in states:
             dfi = df_[df_.hmm_feature == state]
             fig.add_trace(go.Box(y = dfi.chain_return, name=str(state),showlegend=False, marker_color = color_map[state] ),row=1, col=1)
-
+        fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",row=1, col=1)
+
         ## lengths chains by state dist
         if 'hmm_chain_order' in df.columns:
             df_agg = df.groupby(['hmm_feature','chain_id'],as_index = False).agg(length_by_chain = ('hmm_chain_order','max'))
@@ -802,20 +1179,20 @@ class produce_plotly_plots:
             fig.add_trace(go.Box(x = dfi.importance, name=str(feature),showlegend=False ),row=2, col=2)
             fig.update_yaxes(visible=False, title="feature",row=2, col=2)
 
-
         fig.update_layout(height=height_plot, width=1600, title_text = f'State model analysis: {self.ticket_name}', coloraxis=dict(colorbar_len=0.50))
 
         date_execution = datetime.datetime.today().strftime('%Y-%m-%d')
         current_step = df.iloc[-1,:].hmm_chain_order
         current_state = df.iloc[-1,:].hmm_feature
-        message1 = 'current state: ' + str(current_state)
-        message2 = 'current step in state: ' + str(current_step)
+        message1 = str(current_state)
+        message2 = str(current_step)
         message3 = str(date_execution)
 
         messages = {
             'current state':message1,
             'current step in state': message2,
             'execution date':message3,
+            'type state':type_state,
         }
 
         if self.show_plot:
@@ -847,7 +1224,27 @@ class produce_plotly_plots:
 
         if self.return_figs:
             return fig, messages
-    def produce_forecasting_plot(self,predictions):
+
+    def produce_forecasting_plot(self,predictions, window=30):
+        """
+        display forecasting plots
+
+        Parameters
+        ----------
+        predictions (pd.DataFrame): asset predictions
+        window (int): number of historical rows to display
+
+        Returns
+        -------
+        None
+        """
+        def qs(x):
+            return x.quantile(0.05)
+        def qm(x):
+            return x.quantile(0.50)
+        def ql(x):
+            return x.quantile(0.95)
+
         result_json_name = 'forecast_plot.json'
         hmm_n_clust = self.settings['settings']['hmm']['n_clusters']
         model_type = self.settings.get('model_type',False)
@@ -863,8 +1260,6 @@ class produce_plotly_plots:
                     [{"type": "scatter"}, {"type": "scatter"}]],
             subplot_titles = [f'asset returns {lags} lags', 'closing prices', 'hidden states']
         )
-
-
         predictions = predictions[predictions.StockCode == self.ticket_name]
         if len(predictions) > 1:
 
@@ -880,12 +1275,18 @@ class produce_plotly_plots:
             last_exe_prediction_date = predictions.ExecutionDate.unique()
             last_date = max(last_exe_prediction_date)
 
-            history = predictions[(predictions.Type == 'History') & (predictions.ExecutionDate == last_date)]
+            history = self.data_frame.sort_values('Date').iloc[-window:,:]
             cut_date = history.loc[history.iloc[-1:,:].index[0]:,'Date'].item()
-
             prediction = predictions[predictions.Type == 'Prediction']
 
             ## log returns
+            def add_intervals(data,feature,i,w=5):
+                df_qs = data.sort_values('Date')[['Date',feature]].rolling(3,min_periods = 1,on='Date').apply(qs).groupby('Date',as_index=False)[feature].max()
+                df_qm = data.sort_values('Date')[['Date',feature]].rolling(3,min_periods = 1,on='Date').apply(qm).groupby('Date',as_index=False)[feature].max()
+                df_ql = data.sort_values('Date')[['Date',feature]].rolling(3,min_periods = 1,on='Date').apply(ql).groupby('Date',as_index=False)[feature].max()
+                fig.add_trace(go.Scatter(x=df_qs.Date, y=df_qs[feature], mode='lines',marker_color ='#D0D0D0',showlegend=False,opacity=0.05),row=1, col=i)
+                fig.add_trace(go.Scatter(x=df_qm.Date, y=df_qm[feature], mode='lines',marker_color ='#D0D0D0',showlegend=False,opacity=0.05, fill='tonexty'),row=1, col=i)
+                fig.add_trace(go.Scatter(x=df_ql.Date, y=df_ql[feature], mode='lines',marker_color ='#D0D0D0',showlegend=False,opacity=0.05, fill='tonexty'),row=1, col=i)
 
             fig.add_trace(go.Scatter(x=history.Date, y=history.log_return, mode='lines',marker_color ='blue',showlegend=False),row=1, col=1)
 
@@ -896,9 +1297,10 @@ class produce_plotly_plots:
             df = prediction[prediction.ExecutionDate == last_date]
             fig.add_trace(go.Scatter(x=df.Date, y=df.log_return, mode='lines',marker_color ='#ff7f0e',showlegend=False),row=1, col=1)
             fig.add_trace(go.Scatter(x=df.Date, y=df.log_return, mode='markers',marker_color ='#ff7f0e',showlegend=False),row=1, col=1)
+            fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",col = 1, row = 1)
+            add_intervals(data=prediction,feature='log_return',i=1)
 
             ## closing prices
-
             fig.add_trace(go.Scatter(x=history.Date, y=history.Close, mode='lines',marker_color ='blue',showlegend=False),row=1, col=2)
             for i,datex in enumerate([x for x in last_exe_prediction_date if x != last_date]):
                 df = prediction[prediction.ExecutionDate == datex]
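The add_intervals helper introduced above shades a quantile band around the forecast traces: the feature is run through a short rolling window at the 5th, 50th and 95th percentiles, and each successive trace is filled down to the previous one with fill='tonexty'. A self-contained illustration of the rolling-quantile step (toy data; column names assumed):

    import pandas as pd

    toy = pd.DataFrame({'Date': pd.date_range('2024-01-01', periods=6),
                        'log_return': [0.01, -0.02, 0.03, 0.00, 0.02, -0.01]})
    rolled = toy.rolling(3, min_periods=1, on='Date').quantile(0.05)   # per-window 5th percentile
    band_low = rolled.groupby('Date', as_index=False)['log_return'].max()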
@@ -908,6 +1310,7 @@ class produce_plotly_plots:
             fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='lines',marker_color ='#ff7f0e',showlegend=False),row=1, col=2)
             fig.add_trace(go.Scatter(x=df.Date, y=df.Close, mode='markers',marker_color ='#ff7f0e',showlegend=False),row=1, col=2)
             fig.update_layout(height=height_plot, width=1600, title_text = f'forecasts: {self.ticket_name}')
+            add_intervals(data=prediction,feature='Close',i=2)
         else:
             print('no forecasting history')
 
@@ -918,9 +1321,22 @@ class produce_plotly_plots:
         if self.save_path and self.save_aws:
             # upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = f'market_plots/{self.ticket_name}/'+result_json_name ,input_path = self.save_path+result_json_name)
             upload_file_to_aws(bucket = 'VIRGO_BUCKET', key = self.save_aws + result_json_name, input_path = self.save_path + result_json_name, aws_credentials = self.aws_credentials)
-
+        if self.return_figs:
+            return fig
+
 def plot_hmm_analysis_logger(data_frame,test_data_size, save_path = False, show_plot = True):
-
+    '''
+    display train and test box plots of the hmm state returns
+
+    Parameters:
+        data_frame (pd.DataFrame): asset data
+        test_data_size (int): test data size, the remaining is training data
+        save_path (str): path/to/save/
+        show_plot (boolean): if True, display the plot
+
+    Returns:
+        None
+    '''
     df = data_frame
     df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
     fig, axs = plt.subplots(1,2,figsize=(10,4))
@@ -934,7 +1350,18 @@ def plot_hmm_analysis_logger(data_frame,test_data_size, save_path = False, show_
     plt.close()
 
 def plot_hmm_tsanalysis_logger(data_frame, test_data_size,save_path = False, show_plot = True):
-
+    '''
+    display the time series hmm state analysis
+
+    Parameters:
+        data_frame (pd.DataFrame): asset data
+        test_data_size (int): test data size, the remaining is training data
+        save_path (str): path/to/save/
+        show_plot (boolean): if True, display the plot
+
+    Returns:
+        None
+    '''
     df = data_frame
     df_ = df[['Date','hmm_feature','Close',"chain_return"]].sort_values('Date')
     states = list(df_['hmm_feature'].unique())
@@ -961,7 +1388,20 @@ def plot_hmm_tsanalysis_logger(data_frame, test_data_size,save_path = False, sho
     plt.close()
 
 def extract_data_traintest(object_stock,features_to_search,configs, target_configs, window_analysis = False, drop_nan= True):
-
+    '''
+    code snippet that executes object_stock (a stock_eda_panel) to get the features
+
+    Parameters:
+        object_stock (object): stock_eda_panel object
+        features_to_search (list): list of features
+        configs (dict): asset configurations
+        target_configs (dict): target configurations
+        window_analysis (int): take a data sample of this size
+        drop_nan (boolean): remove nans from the data
+
+    Returns:
+        object_stock (obj): object_stock with the features and signals
+    '''
     object_stock.get_data()
     object_stock.volatility_analysis(**configs['volatility']['config_params'], plot = False, save_features = False)
     target_params_up = target_configs['params_up']
@@ -972,7 +1412,26 @@ def extract_data_traintest(object_stock,features_to_search,configs, target_confi
         arguments_to_use = configs[feature_name]['config_params']
         method_to_use = configs[feature_name]['method']
         getattr(object_stock, method_to_use)(**arguments_to_use, plot = False, save_features = False)
-        object_stock.produce_order_features(feature_name)
+        if method_to_use not in ['minmax_pricefeature']:
+            object_stock.produce_order_features(feature_name)
+            object_stock.get_order_feature_nosignal(feature_name)
+        last_signal_featlist = configs.get('custom_transformations',{}).get('compute_last_signal', False)
+        if last_signal_featlist:
+            last_signal_featlist = last_signal_featlist
+            last_signal_featlist = last_signal_featlist.split('//')
+            if feature_name in last_signal_featlist:
+                object_stock.compute_last_signal(feature_name, False)
+    volatility_features = configs.get('custom_transformations',{}).get('volatility_features', False)
+    if volatility_features:
+        for al in volatility_features:
+            object_stock.lag_log_return(lags = al, feature="Close", feature_name=f"asset_{al}_logreturn")
+            object_stock.produce_log_volatility(trad_days=al,feature=f"asset_{al}_logreturn",feature_name=f"asset_{al}_volatility")
+    market_interaction_features = configs.get('custom_transformations',{}).get('market_interaction_features', False)
+    if market_interaction_features:
+        for stage in market_interaction_features.keys():
+            method_to_use = market_interaction_features.get(stage).get("method")
+            arguments_to_use = market_interaction_features.get(stage).get("parameters")
+            getattr(object_stock, method_to_use)(**arguments_to_use)
     # getting targets
     object_stock.get_categorical_targets(**target_params_up)
     object_stock.df = object_stock.df.drop(columns = ['target_down']).rename(columns = {'target_up':'target_up_save'})
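From the branches above, the custom_transformations block of the configs appears to take a shape like the following (an inferred sketch; only the keys the code reads are shown, and the method and parameter values are hypothetical):

    configs['custom_transformations'] = {
        'compute_last_signal': 'rsi_feature//macd_feature',   # '//'-delimited feature names
        'volatility_features': [7, 30],                       # horizons for log-return volatility
        'market_interaction_features': {
            'stage_1': {'method': 'some_panel_method',        # hypothetical method name
                        'parameters': {'arg': 1}},
        },
    }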
@@ -987,7 +1446,19 @@ def extract_data_traintest(object_stock,features_to_search,configs, target_confi
     return object_stock
 
 def produce_simple_ts_from_model(stock_code, configs, n_days = 2000 , window_scope = '5y'):
-
+    '''
+    display a dashboard analysis of a given asset
+
+    Parameters:
+        stock_code (str): asset name
+        configs (dict): asset configurations
+        n_days (int): data size
+        window_scope (str): window data size
+
+    Returns:
+        fig (obj): plotly dashboard
+        df (pd.DataFrame): the resulting asset dataset
+    '''
     ## getting data
     volat_args = {'lags': 3, 'trad_days': 15, 'window_log_return': 10}
 
@@ -1038,7 +1509,7 @@ def produce_simple_ts_from_model(stock_code, configs, n_days = 2000 , window_sco
         for signal_low in signal_low_list:
             if signal_low in df.columns:
                 fig.add_trace(go.Scatter(x=df['Date'], y=np.where(df[signal_low] == 1, df[norm_feat], np.nan),showlegend= False, mode='markers', marker_color = 'red'),col = 1, row = row_i)
-
+        fig.add_hline(y=0, line_width=2, line_dash="dash", line_color="grey",col = 1, row = row_i)
     fig.update_layout(height=height_plot, width=1600, title_text = f'asset plot and signals: {stock_code}')
 
     del object_stock
@@ -1046,17 +1517,21 @@ def produce_simple_ts_from_model(stock_code, configs, n_days = 2000 , window_sco
     return fig, df
 
 def save_edge_model(data, save_path = False, save_aws = False, show_result = False, aws_credentials = False):
-    """
-    data: pandas df
-    model_name: str
-    ticket_name: str name of the asset
-    save_path: str local path for saving e.g r'C:/path/to/the/file/'
-    save_aws: str remote key in s3 bucket path e.g. 'path/to/file/'
-    show_results: bool
-    aws_credentials: dict
-
-    return a print of the dictionary
-    """
+    '''
+    get the latest edge execution and the edge probability
+
+    Parameters:
+        data (pd.DataFrame): asset data
+        save_path (str): local path for saving e.g r'C:/path/to/the/file/'
+        save_aws (str): remote key in s3 bucket path e.g. 'path/to/file/'
+        show_result (bool): if True, display the results
+        aws_credentials (dict): aws credentials
+
+    Returns:
+        None
+    '''
     today = datetime.datetime.today().strftime('%Y-%m-%d')
 
     curent_edge = (
@@ -1079,10 +1554,25 @@ def save_edge_model(data, save_path = False, save_aws = False, show_result = Fal
     if show_result:
         print(curent_edge)
 
+## this function is going to be split and deprecated
 def create_feature_edge(model, data,feature_name, threshold, target_variables):
-
+    '''
+    compute the edge probabilities and attach the edge signals for a feature
+
+    Parameters:
+        model (obj): edge model artifact
+        data (pd.DataFrame): asset data
+        feature_name (str): edge feature name
+        threshold (float): edge threshold
+        target_variables (list): names of the target columns
+
+    Returns:
+        result_df (pd.DataFrame): result dataframe with the edges
+    '''
     label_prediction = ['proba_'+x for x in target_variables]
     predictions = model.predict_proba(data)
+    if isinstance(predictions, list):
+        predictions = np.array([ x[:,1].T for x in predictions]).T
     predictions = pd.DataFrame(predictions, columns = label_prediction, index = data.index)
 
     result_df = pd.concat([data, predictions], axis=1)
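The new isinstance guard covers scikit-learn multi-output classifiers, whose predict_proba returns one (n_samples, 2) array per target; stacking column 1 of each yields an (n_samples, n_targets) matrix of positive-class probabilities. A self-contained check (the estimator choice is an assumption):

    import numpy as np
    from sklearn.datasets import make_multilabel_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multioutput import MultiOutputClassifier

    X, y = make_multilabel_classification(n_samples=50, n_classes=2, random_state=0)
    model = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
    probas = model.predict_proba(X)               # list of two (50, 2) arrays
    stacked = np.array([p[:, 1].T for p in probas]).T
    print(stacked.shape)                          # (50, 2): one positive-class column per target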
@@ -1095,4 +1585,57 @@ def create_feature_edge(model, data,feature_name, threshold, target_variables):
         result_df[f'signal_{type_use}_{feature_name}'] = np.where(result_df[pred_col] >= threshold,1,0)
         result_df[f'acc_{type_use}_{feature_name}'] = np.where(result_df[f'signal_{type_use}_{feature_name}'] == result_df[pred_col.replace('proba_','')],1,0)
 
-    return result_df
+    return result_df
+
+def produce_probas(model,data, target_variables):
+    """
+    produce the probabilities given a model
+
+    Parameters:
+        model (obj): edge model artifact
+        data (pd.DataFrame): asset data
+        target_variables (list): names of the target columns
+
+    Returns:
+        result_df (pd.DataFrame): result dataframe with the edges
+        label_prediction (list): list of the resulting label columns
+    """
+    label_prediction = ['proba_'+x for x in target_variables]
+    predictions = model.predict_proba(data)
+    if isinstance(predictions, list):
+        predictions = np.array([ x[:,1].T for x in predictions]).T
+    predictions = pd.DataFrame(predictions, columns = label_prediction, index = data.index)
+    result_df = pd.concat([data, predictions], axis=1)
+    result_df = result_df[['Date'] + target_variables + label_prediction]
+
+    return result_df, label_prediction
+
+def produce_signals(result_df, feature_name, threshold, label_prediction):
+    """
+    produce the signals from the probabilities
+
+    Parameters:
+        result_df (pd.DataFrame): asset data with the probabilities
+        feature_name (str): edge feature name
+        threshold (float): edge threshold
+        label_prediction (list): list of the resulting label columns
+
+    Returns:
+        result_df (pd.DataFrame): result dataframe with the edges and signals
+    """
+    for pred_col in label_prediction:
+        type_use = 'low'
+        if 'down' in pred_col:
+            type_use = 'up'
+
+        result_df[f'signal_{type_use}_{feature_name}'] = np.where(result_df[pred_col] >= threshold,1,0)
+        result_df[f'acc_{type_use}_{feature_name}'] = np.where(result_df[f'signal_{type_use}_{feature_name}'] == result_df[pred_col.replace('proba_','')],1,0)
+
+    return result_df
+
+def clean_cols(data, patterns):
+    drop_cols = list()
+    for pattern in patterns:
+        drop_cols = drop_cols + [ x for x in data.columns if pattern in x]
+    data = data.drop(columns = drop_cols)
+    return data
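Taken together, produce_probas and produce_signals split create_feature_edge into a probability step and a thresholding step, with clean_cols available to drop helper columns afterwards. A hypothetical end-to-end call (model, data and names are placeholders):

    probas_df, label_cols = produce_probas(model, data, ['target_up', 'target_down'])
    edges_df = produce_signals(probas_df, 'edge_feature', threshold=0.6, label_prediction=label_cols)
    edges_df = clean_cols(edges_df, patterns=['acc_'])    # e.g. drop the accuracy helper columns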