virgo-modules 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of virgo-modules might be problematic.

@@ -36,7 +36,6 @@ from hmmlearn.hmm import GaussianHMM
 
 from plotly.colors import DEFAULT_PLOTLY_COLORS
 
-from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from feature_engine.imputation import MeanMedianImputer
 
@@ -54,252 +53,8 @@ from .aws_utils import upload_file_to_aws
 
 import logging
 
-class InverseHyperbolicSine(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies inverse hyperbolic sine for feature transformation.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    features : list
-        list of features to apply the transformation
-    prefix : str
-        prefix for the new features. is '' the features are overwrite
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, features, prefix = ''):
-        self.features = features
-        self.prefix = prefix
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.features:
-            X[f'{self.prefix}{feature}'] = np.arcsinh(X[feature])
-        return X
-
-class VirgoWinsorizerFeature(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies winsorirization of a feature for feature transformation.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    feature_configs : dict
-        dictionary of features and configurations. the configuration has high and low limits per feature
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, feature_configs):
-        self.feature_configs = feature_configs
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for feature in self.feature_configs:
-            lower = self.feature_configs[feature]['min']
-            upper = self.feature_configs[feature]['max']
-            X[feature] = np.where( lower > X[feature], lower, X[feature])
-            X[feature] = np.where( upper < X[feature], upper, X[feature])
-        return X
-
-class FeatureSelector(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies selection of features.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, columns):
-        self.columns = columns
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        return X[self.columns]
-
-class FeaturesEntropy(BaseEstimator, TransformerMixin):
-    """
-    Class that creates a feature that calculate entropy for a given feature classes, but it might get some leackeage in the training set.
-    this class is compatible with scikitlearn pipeline
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-    entropy_map: pd.DataFrame
-        dataframe of the map with the entropies per class
-    perc: float
-        percentage of the dates using for calculate the entropy map
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, features, target, feature_name = None, feature_type = 'discrete', perc = 0.5, default_null = 0.99):
-
-        self.features = features
-        self.feature_type = feature_type
-        self.target = target
-        self.perc = perc
-        self.default_null = default_null
-
-        if not feature_name:
-            self.feature_name = '_'.join(features)
-            self.feature_name = self.feature_name + '_' + target + '_' + feature_type
-        else:
-            self.feature_name = feature_name
-
-    def fit(self, X, y=None):
-
-        unique_dates = list(X['Date'].unique())
-        unique_dates.sort()
-
-        total_length = len(unique_dates)
-        cut = int(round(total_length*self.perc,0))
-        train_dates = unique_dates[:cut]
-        max_train_date = max(train_dates)
-
-        X_ = X[X['Date'] <= max_train_date].copy()
-        df = X_.join(y, how = 'left')
-
-        column_list = [f'{self.feature_type}_signal_{colx}' for colx in self.features]
-
-        df_aggr = (
-            df
-            .groupby(column_list, as_index = False)
-            .apply(
-                lambda x: pd.Series(
-                    dict(
-                        counts = x[self.target].count(),
-                        trues=(x[self.target] == 1).sum(),
-                        falses=(x[self.target] == 0).sum(),
-                    )
-                )
-            )
-            .assign(
-                trues_rate=lambda x: x['trues'] / x['counts']
-            )
-            .assign(
-                falses_rate=lambda x: x['falses'] / x['counts']
-            )
-            .assign(
-                log2_trues = lambda x: np.log2(1/x['trues_rate'])
-            )
-            .assign(
-                log2_falses = lambda x: np.log2(1/x['falses_rate'])
-            )
-            .assign(
-                comp1 = lambda x: x['trues_rate']*x['log2_trues']
-            )
-            .assign(
-                comp2 = lambda x: x['falses_rate']*x['log2_falses']
-            )
-            .assign(
-                class_entropy = lambda x: np.round(x['comp1']+x['comp2'],3)
-            )
-        )
-
-        self.column_list = column_list
-        self.entropy_map = (
-            df_aggr
-            [column_list+['class_entropy']]
-            .rename(columns = {'class_entropy': self.feature_name})
-            .copy()
-        )
-
-        del df, df_aggr, X_
-        return self
-
-    def transform(self, X, y=None):
-
-        X = X.join(self.entropy_map.set_index(self.column_list), on=self.column_list, how = 'left')
-        X[self.feature_name] = X[self.feature_name].fillna(self.default_null)
-        return X
-
-class signal_combiner(BaseEstimator, TransformerMixin):
-
-    """
-    Class that applies feature combination of binary signals.
-    this class is compatible with scikitlearn pipeline
-
-    ...
-
-    Attributes
-    ----------
-    columns : list
-        list of features to select
-    drop : boolean
-        drop combining features
-    prefix_up : str
-        up prefix of the base feature
-    prefix_low : str
-        low prefix of the base feature
-
-    Methods
-    -------
-    fit(additional="", X=DataFrame, y=None):
-        fit transformation.
-    transform(X=DataFrame, y=None):
-        apply feature transformation
-    """
-
-    def __init__(self, columns, drop = True, prefix_up = 'signal_up_', prefix_low = 'signal_low_'):
-        self.columns = columns
-        self.drop = drop
-        self.prefix_up = prefix_up
-        self.prefix_low = prefix_low
-
-    def fit(self, X, y=None):
-        return self
-
-    def transform(self, X, y=None):
-        for column in self.columns:
-            X['CombSignal_'+column] = np.where(
-                X[self.prefix_up + column] == 1,
-                1,
-                np.where(
-                    X[self.prefix_low + column] == 1,
-                    1,
-                    0
-                )
-            )
-            if self.drop:
-                X = X.drop(columns = [self.prefix_up + column, self.prefix_low + column])
-        return X
+from virgo_modules.src.hmm_utils import trainer_hmm
+from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector
 
 def data_processing_pipeline(features_base,features_to_drop = False, lag_dict = False, combine_signals = False, discretize_columns = False, correlation = 0.77):
 
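The transformer classes removed above now live outside this module: the new import lines confirm that signal_combiner and FeatureSelector are provided by virgo_modules.src.transformer_utils, and trainer_hmm by virgo_modules.src.hmm_utils. A minimal migration sketch for downstream code, assuming the relocated classes keep the constructor signatures shown in the removed block (the column names are illustrative):

    from sklearn.pipeline import Pipeline
    from virgo_modules.src.transformer_utils import signal_combiner, FeatureSelector

    # select the model columns, then collapse each signal_up_X / signal_low_X pair
    # into a single CombSignal_X column, dropping the originals
    pipe = Pipeline([
        ('selector', FeatureSelector(columns=['log_return', 'signal_up_RSI', 'signal_low_RSI'])),
        ('combiner', signal_combiner(columns=['RSI'], drop=True)),
    ])
    frame_out = pipe.fit_transform(frame_in)  # frame_in: a pandas DataFrame with those columns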
@@ -335,61 +90,6 @@ def data_processing_pipeline(features_base,features_to_drop = False, lag_dict =
     )
     return pipe
 
-def states_relevance_score(data, default_benchmark_sd = 0.00003, t_threshold = 2):
-    '''
-    calculate relevance score and summary report for hmm model
-
-    Parameters:
-    default_benchmark_sd (float): default value to bias SD for t calculation
-    t_threshold (float): alpha or z threshold for the normalized score
-
-    Returns:
-    mean_relevance (float): mean relevance score of the states
-    cluster_returns (pd.DataFrame): summary report of the analysis
-    number_relevant_states (int): number of relevant states
-    '''
-    ## legnths
-    cluster_lengths = data.groupby(['hmm_feature','chain_id'],as_index = False).agg(chain_lenght = ('hmm_chain_order','max'))
-    cluster_lengths = cluster_lengths.groupby('hmm_feature').agg(cluster_length_median = ('chain_lenght','median'))
-    ## means
-    def quantile2(x):
-        return x.quantile(0.25)
-    def quantile3(x):
-        return x.quantile(0.75)
-
-    cluster_returns = data.groupby('hmm_feature').agg(
-        n_uniques = ('chain_id','nunique'),
-        n_obs = ('Date','count'),
-        cluster_ret_q25 = ('chain_return',quantile2),
-        cluster_ret_median = ('chain_return','median'),
-        cluster_ret_q75 = ('chain_return',quantile3),
-    )
-    cluster_returns = cluster_returns.join(cluster_lengths, how = 'left')
-    cluster_returns['perc_dispute'] = np.where(
-        np.sign(cluster_returns['cluster_ret_q25']) != np.sign(cluster_returns['cluster_ret_q75']),
-        1,0
-    )
-    cluster_returns['iqr'] = cluster_returns.cluster_ret_q75 - cluster_returns.cluster_ret_q25
-    cluster_returns['perc_25'] = abs(cluster_returns.cluster_ret_q25)/cluster_returns['iqr']
-    cluster_returns['perc_75'] = abs(cluster_returns.cluster_ret_q75)/cluster_returns['iqr']
-    cluster_returns['min_perc'] = cluster_returns[['perc_25','perc_75']].min(axis = 1)
-    cluster_returns['min_overlap'] = np.where(cluster_returns['perc_dispute'] == 1,cluster_returns['min_perc'],0)
-    cluster_returns['abs_median'] = abs(cluster_returns['cluster_ret_median'])
-    cluster_returns = cluster_returns.drop(columns = ['perc_25','perc_75','min_perc'])
-
-    ## relevance or importance
-    # naive aproach
-    cluster_returns['relevance'] = cluster_returns['abs_median'] + ( 0.5 - cluster_returns['min_overlap'])
-    cluster_returns['t_calc'] = (cluster_returns['cluster_ret_median'] - 0)/(cluster_returns['iqr']/cluster_returns['n_obs'] + default_benchmark_sd/cluster_returns['n_obs'])**(1/2)
-    cluster_returns['abs_t_accpted'] = abs(cluster_returns['t_calc'])
-    cluster_returns['t_accpted'] = abs(cluster_returns['abs_t_accpted']) > t_threshold
-
-    mean_relevance = cluster_returns['abs_t_accpted'].mean()
-    number_relevant_states = len(cluster_returns[cluster_returns.t_accpted == True])
-
-    return mean_relevance, cluster_returns, number_relevant_states
-
-
 class stock_eda_panel(object):
 
     """
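The removed states_relevance_score ranked each HMM state by a t-like statistic on its chain returns, t = median / sqrt(iqr/n + sd0/n), accepting a state when |t| exceeds t_threshold. A condensed, self-contained sketch of that rule on a toy per-state summary (column names follow the removed code; the numbers are made up):

    import pandas as pd

    # one row per HMM state, as produced by the removed groupby aggregation
    summary = pd.DataFrame({
        'cluster_ret_median': [0.8, -0.1],
        'iqr': [1.2, 2.5],
        'n_obs': [120, 90],
    })
    default_benchmark_sd, t_threshold = 0.00003, 2
    summary['t_calc'] = summary['cluster_ret_median'] / (
        summary['iqr'] / summary['n_obs'] + default_benchmark_sd / summary['n_obs']
    ) ** 0.5
    summary['relevant'] = summary['t_calc'].abs() > t_threshold  # state 0 passes, state 1 does not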
@@ -800,7 +500,6 @@ class stock_eda_panel(object):
         self.augmented_dickey_fuller_statistics(df['log_return'], 'log_return')
         self.augmented_dickey_fuller_statistics(df['roll_mean_log_return'], 'roll_mean_log_return')
 
-
     def find_lag(self, feature, lag_list, column_target = 'log_return',posterior_lag = 4, test_size = 350):
 
         """
@@ -847,7 +546,6 @@ class stock_eda_panel(object):
         plt.axhline(y=0, color='grey', linestyle='--')
         plt.show()
 
-
     def outlier_plot(self, zlim, plot = False, save_features = False):
 
         """
@@ -1010,62 +708,6 @@ class stock_eda_panel(object):
         self.signals.append(f'signal_up_{feature_name}')
         self.signals.append(f'signal_low_{feature_name}')
 
-    #######################
-    #### to be deprecated ####
-    def spread_MA(self, ma1, ma2, limit = 1.95, plot = False, save_features = False):
-
-        self.df[f'MA_{ma1}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma1, min_periods=1).mean()))
-        self.df[f'MA_{ma2}'] = (self.df.sort_values("Date")["Close"].transform(lambda x: x.rolling(ma2, min_periods=1).mean()))
-
-        self.ma1_column = f'MA_{ma1}'
-        self.ma2_column = f'MA_{ma2}'
-        self.df['MA_spread'] = self.df[f'MA_{ma1}'] - self.df[f'MA_{ma2}']
-
-        self.df['norm_MA_spread'] = (self.df['MA_spread'] - self.df['MA_spread'].mean())/self.df['MA_spread'].std()
-        mean_ = self.df['norm_MA_spread'].mean()
-        self.df['rollstd_MA_spread'] = self.df.sort_values("Date")["norm_MA_spread"].rolling(50).std()
-
-        self.df['upper_MA_spread'] = limit*self.df['rollstd_MA_spread'] + mean_
-        self.df['lower_MA_spread'] = -limit*self.df['rollstd_MA_spread'] + mean_
-
-        self.df['signal_low_MA_spread'] = np.where( (self.df['norm_MA_spread'] < self.df['lower_MA_spread'] ), 1, 0)
-        self.df['signal_up_MA_spread'] = np.where( (self.df['norm_MA_spread'] > self.df['upper_MA_spread'] ), 1, 0)
-
-        ### ploting purposes
-        self.df[f"Roll_mean_{ma1}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma1, min_periods=1).mean())
-        )
-        self.df[f"Roll_mean_{ma2}"] = (
-            self.df.sort_values("Date")["Close"]
-            .transform(lambda x: x.rolling(ma2, min_periods=1).mean())
-        )
-
-
-        print('--------------------------------------------------------------------')
-        if save_features:
-            self.features.append('MA_spread')
-            self.signals.append('signal_low_MA_spread')
-            self.signals.append('signal_up_MA_spread')
-            self.settings_spread_ma = {'ma1':ma1, 'ma2':ma2, 'limit':limit}
-
-        if plot:
-
-            fig, axs = plt.subplots(1, 3,figsize=(21,4))
-
-            axs[0].plot(self.df['Date'],self.df['norm_MA_spread'])
-            axs[0].plot(self.df['Date'],self.df['upper_MA_spread'], linestyle='--')
-            axs[0].plot(self.df['Date'],self.df['lower_MA_spread'], linestyle='--')
-            axs[0].set_title('MA_spread series')
-
-            plot_acf(self.df['MA_spread'].dropna(),lags=25, ax=axs[1])
-            axs[1].set_title('acf MA_spread series')
-
-            plot_pacf(self.df['MA_spread'].dropna(),lags=25, ax=axs[2])
-            axs[2].set_title('acf MA_spread series')
-            plt.show()
-    ##################################################
-
     def relative_spread_MA(self, ma1, ma2, threshold = 1.95, plot = False, save_features = False):
         """
         perform relative moving average features, one for short term and another for long/mid term
@@ -1248,36 +890,6 @@ class stock_eda_panel(object):
 
         plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
-
-        # negative countiing and rolling countingng
-        self.df['RetClose'] = self.df['Close'].pct_change()
-        self.df['roll_pos_counting'] = np.where(self.df['RetClose'].shift(1) > 0,1,0 )
-        self.df['roll_pos_counting'] = self.df['roll_pos_counting'].rolling(window = rolling_window).sum()
-
-        mean = self.df['roll_pos_counting'].mean()
-        std = self.df['roll_pos_counting'].std()
-        self.df['norm_counting'] = (self.df['roll_pos_counting'] - mean )/std
-
-        self.df['signal_up_roll_pos_counting'] = np.where((self.df['norm_counting'] > threshold),1,0)
-        self.df['signal_low_roll_pos_counting'] = np.where((self.df['norm_counting'] < -threshold),1,0)
-
-        if save_features:
-            self.features.append('roll_pos_counting')
-            self.signals.append('signal_up_roll_pos_counting')
-            self.signals.append('signal_low_roll_pos_counting')
-            self.settings_count_features = {'rolling_window':rolling_window, 'threshold':threshold}
-
-        if plot:
-            fig = plt.figure(figsize = (10,4))
-            plt.plot(self.df['Date'],self.df.norm_counting)
-            plt.axhline(y=threshold, color='grey', linestyle='--')
-            plt.axhline(y=-threshold, color='grey', linestyle='--')
-            plt.show()
-    #######################
-
     def bidirect_count_feature(self, rolling_window, threshold, plot = False, save_features = False):
         """
         perform negative and positive return counting in a given rolling time window
@@ -1317,45 +929,6 @@ class stock_eda_panel(object):
         plt.plot(self.df['Date'],self.df[f'lower_{feature_name}'], linestyle='--')
         plt.show()
 
-    #######################
-    #### to be deprecated ####
-    def get_range_feature(self, window, up_threshold, low_threshold, plot = False, save_features = False):
-
-        self.df["Range"] = self.df["High"] / self.df["Low"] - 1
-        self.df['Avg_range'] = self.df['Range'].rolling(window = 5).mean()
-        self.df['dist_range'] = self.df['Range'] - self.df['Avg_range']
-        self.df['norm_dist_range'] = (self.df['dist_range'] - self.df['dist_range'].mean())/ self.df['dist_range'].std()
-
-        mean_ = self.df['norm_dist_range'].mean()
-        self.df[f'std_norm_dist_range'] = (self.df.sort_values("Date")["norm_dist_range"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_norm_dist_range'] = up_threshold*self.df['std_norm_dist_range'] + mean_
-        self.df['low_bound_norm_dist_range'] = -low_threshold*self.df['std_norm_dist_range'] + mean_
-
-        self.df['signal_up_dist_range'] = np.where(self.df['norm_dist_range'] > self.df['up_bound_norm_dist_range'],1,0 )
-        self.df['signal_low_dist_range'] = np.where(self.df['norm_dist_range'] < self.df['low_bound_norm_dist_range'],1,0 )
-
-        if save_features:
-            self.features.append('dist_range')
-            self.signals.append('signal_up_dist_range')
-            self.signals.append('signal_low_dist_range')
-            self.settings_price_range = {'window':window, 'up_threshold':up_threshold, 'low_threshold':low_threshold}
-
-        if plot:
-            fig, axs = plt.subplots(2, 2,figsize=(17,11))
-
-            axs[0,0].plot(self.df['Range'])
-            axs[0,0].set_title('range')
-
-            axs[0,1].plot(self.df['Avg_range'])
-            axs[0,1].set_title('Avg_range')
-
-            axs[1,0].plot(self.df['up_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['low_bound_norm_dist_range'],color = 'grey', linestyle='--')
-            axs[1,0].plot(self.df['norm_dist_range'])
-            axs[1,0].set_title('norm_dist_range')
-    #######################
-
     def get_relative_range_feature(self, window, threshold, plot = False, save_features = False):
         """
         perform relative spread of opening and closing price
@@ -1399,42 +972,6 @@ class stock_eda_panel(object):
         axs[1].plot(self.df[f'norm_{feature_name}'])
         axs[1].set_title(f'norm_{feature_name}')
 
-    #######################
-    #### to be deprecated ####
-    def rsi_feature(self, window, lag_rsi_ret, threshold, plot = False, save_features = False):
-
-        rsi = RSIIndicator(close = self.df['Close'], window = window).rsi()
-        self.df['RSI'] = rsi
-        self.df['RSI_ret'] = self.df['RSI']/self.df['RSI'].shift(lag_rsi_ret)
-
-        mean = self.df['RSI_ret'].mean()
-        std = self.df['RSI_ret'].std()
-        self.df['norm_RSI_ret'] = (self.df['RSI_ret']-mean)/std
-        self.df['signal_up_RSI_ret'] = np.where(self.df['norm_RSI_ret'] > threshold,1,0)
-        self.df['signal_low_RSI_ret'] = np.where(self.df['norm_RSI_ret'] < -threshold,1,0)
-
-        if save_features:
-            self.features.append('RSI_ret')
-            self.signals.append('signal_up_RSI_ret')
-            self.signals.append('signal_low_RSI_ret')
-            self.settings_rsi_feature= {'window':window, 'lag_rsi_ret':lag_rsi_ret, 'threshold':threshold}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df.norm_RSI_ret)
-            axs[0].axhline(y=threshold, color='grey', linestyle='--')
-            axs[0].axhline(y=-threshold, color='grey', linestyle='--')
-
-            plot_acf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf RSI_ret')
-
-            plot_pacf(self.df['RSI_ret'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf RSI_ret')
-
-            fig.show()
-    #######################
-
     def rsi_feature_improved(self, window, threshold, plot = False, save_features = False):
         """
         perform relative strength index
@@ -1462,51 +999,6 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)
 
-    #######################
-    #### to be deprecated ####
-    def days_features(self, window_day, limit, plot = False, save_features = False):
-
-        self.df['dow'] = self.df.Date.dt.dayofweek
-        self.df['dow'] = self.df['dow'].astype('str')
-
-        self.df['target_mean_input'] = (self.df.sort_values("Date").groupby('dow')['roll_mean_log_return'].transform(lambda x: x.rolling(window_day, min_periods=1).mean()))
-
-        mean = self.df['target_mean_input'].mean()
-        std = self.df['target_mean_input'].std()
-
-        self.df['norm_dow_input'] = (self.df['target_mean_input']-mean)/std
-        mean_ = self.df['norm_dow_input'].mean()
-        self.df['std_dow_input'] = self.df.sort_values("Date")["norm_dow_input"].rolling(50).std()
-
-        self.df['up_dow_input'] = limit*self.df['std_dow_input'] + mean_
-        self.df['low_dow_input'] = -limit*self.df['std_dow_input'] - mean_
-
-        self.df['signal_up_target_mean_input'] = np.where(self.df['norm_dow_input'] > self.df['up_dow_input'],1,0)
-        self.df['signal_low_target_mean_input'] = np.where(self.df['norm_dow_input'] < self.df['low_dow_input'],1,0)
-
-        if save_features:
-
-            self.features.append('target_mean_input')
-            self.signals.append('signal_up_target_mean_input')
-            self.signals.append('signal_low_target_mean_input')
-            self.settings_days_features = {'window_day':window_day, 'limit':limit}
-
-        if plot:
-            fig, axs = plt.subplots(1, 3,figsize=(17,5))
-
-            axs[0].plot(self.df['norm_dow_input'])
-            axs[0].plot(self.df['up_dow_input'], linestyle='--')
-            axs[0].plot(self.df['low_dow_input'], linestyle='--')
-
-            plot_acf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[1])
-            axs[1].set_title('acf day feature')
-
-            plot_pacf(self.df['norm_dow_input'].dropna(),lags=25,ax = axs[2])
-            axs[2].set_title('pacf day feature')
-
-            fig.show()
-    #######################
-
     def days_features_bands(self, window, threshold, plot = False, save_features = False):
         """
         compute mean returns for a given day of the week in a window scope per day
@@ -1539,62 +1031,6 @@ class stock_eda_panel(object):
         if plot:
             self.signal_plotter(feature_name)
 
-    #######################
-    #### to be deprecated ####
-    def analysis_volume(self,lag_volume, threshold, window, plot = False, save_features = False):
-
-        self.df['log_Volume'] = np.log(self.df['Volume'])
-        self.df['ret_log_Volume'] = self.df['log_Volume'].pct_change(lag_volume)
-
-        self.df['norm_ret_log_Volume'] = (self.df['ret_log_Volume'] - self.df['ret_log_Volume'].mean())/ self.df['ret_log_Volume'].std()
-        mean_ = self.df['norm_ret_log_Volume'].mean()
-        self.df[f'std_norm_ret_log_Volume'] = (self.df.sort_values("Date")["norm_ret_log_Volume"].transform(lambda x: x.rolling(window, min_periods=1).std()))
-
-        self.df['up_bound_ret_log_Volume'] = threshold*self.df['std_norm_ret_log_Volume'] + mean_
-        self.df['low_bound_ret_log_Volume'] = -threshold*self.df['std_norm_ret_log_Volume'] + mean_
-
-        self.df['signal_up_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] > self.df['up_bound_ret_log_Volume'],1,0 )
-        self.df['signal_low_ret_log_Volume'] = np.where(self.df['norm_ret_log_Volume'] < self.df['low_bound_ret_log_Volume'],1,0 )
-
-        if save_features:
-            self.features.append('ret_log_Volume')
-            self.signals.append('signal_up_ret_log_Volume')
-            self.signals.append('signal_low_ret_log_Volume')
-            self.settings_volume_feature= {'lag_volume':lag_volume, 'threshold':threshold, 'window':window}
-        if plot:
-            fig, axs = plt.subplots(3, 2,figsize=(11,13))
-            axs[0,0].plot(self.df.Date, self.df.Volume)
-            axs[0,0].set_title('Volume')
-            axs[0,1].plot(self.df.Date, self.df.log_Volume)
-            axs[0,1].set_title('log Volume')
-
-            plot_acf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,0])
-            axs[1,0].set_title('acf log_Volume')
-            plot_pacf(self.df['log_Volume'].dropna(),lags=25, ax = axs[1,1])
-            axs[1,1].set_title('pacf log_Volume')
-
-            plot_acf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,0])
-            axs[2,0].set_title('acf ret_log_Volume')
-            plot_pacf(self.df['ret_log_Volume'].dropna(),lags=25, ax = axs[2,1])
-            axs[2,1].set_title('pacf ret_log_Volume')
-
-            plt.show()
-
-            print('--------------------------------------------------------------')
-
-            fig, axs = plt.subplots(1, 2,figsize=(10,4))
-
-            axs[0].plot(self.df.Date, self.df.norm_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.up_bound_ret_log_Volume)
-            axs[0].plot(self.df.Date, self.df.low_bound_ret_log_Volume)
-            axs[0].set_title('norm_ret_log_Volume')
-
-            axs[1].plot(self.df.Date, self.df.std_norm_ret_log_Volume)
-            axs[1].set_title('std_norm_ret_log_Volume')
-
-            plt.show()
-    #######################
-
     def analysis_smooth_volume(self, window, threshold, plot = False, save_features = False):
         """
         compute feature of thrading volumes
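All six methods removed between here and the @@ -1010 hunk were already fenced with '#### to be deprecated ####' markers, and each has a kept counterpart in stock_eda_panel. A migration sketch; the constructor arguments and the data-loading call are hypothetical, and the parameter values are illustrative:

    panel = stock_eda_panel('AAPL', 1500)   # hypothetical constructor arguments
    panel.get_data()                        # hypothetical data-loading step

    panel.relative_spread_MA(ma1=20, ma2=50, threshold=1.95)        # replaces spread_MA
    panel.bidirect_count_feature(rolling_window=30, threshold=1.8)  # replaces get_count_feature
    panel.get_relative_range_feature(window=30, threshold=1.8)      # replaces get_range_feature
    panel.rsi_feature_improved(window=14, threshold=1.8)            # replaces rsi_feature
    panel.days_features_bands(window=30, threshold=1.8)             # replaces days_features
    panel.analysis_smooth_volume(window=30, threshold=1.8)          # replaces analysis_volume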
@@ -1968,14 +1404,12 @@ class stock_eda_panel(object):
         self.df["chain_id"] = self.df["chain_id"].fillna(method='ffill')
         self.df["hmm_chain_order"] = self.df.groupby('chain_id')["Date"].rank(method="first", ascending=True)
 
-        ### returns using the first element in a chain
-        self.df['first'] = np.where(self.df['hmm_chain_order'] == 1, self.df['Close'], np.nan)
-        self.df['first'] = self.df.sort_values('Date')['first'].fillna(method='ffill')
-        self.df['chain_return'] = (self.df['Close']/self.df['first'] -1) * 100
+        ### returns using the windowsseeds
+        self.df['lag_chain_close'] = self.df.sort_values(by=["Date"]).groupby(['chain_id'])['Close'].shift(lag_returns)
+        self.df['chain_return'] = (self.df['Close']/self.df['lag_chain_close'] -1) * 100
+        self.df = self.df.drop(columns = ['breack'])
 
-        self.df = self.df.drop(columns = ['breack','first'])
-
-    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, plot = False, save_features = False, model = False):
+    def cluster_hmm_analysis(self, n_clusters,features_hmm, test_data_size, seed, lag_returns_state=7, corr_threshold = 0.75, plot = False, save_features = False, model = False):
         """
         create or use a hmm model
 
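The hunk above changes how chain returns are computed: previously each return was measured against the first close of its state chain, so long regimes accumulated ever-larger values; now it is a return over a close lagged by lag_returns within the same chain. A standalone sketch of the new computation, where df stands in for the panel's internal frame:

    lag_returns = 7  # passed down from cluster_hmm_analysis's lag_returns_state argument
    df = df.sort_values('Date')
    df['lag_chain_close'] = df.groupby('chain_id')['Close'].shift(lag_returns)
    df['chain_return'] = (df['Close'] / df['lag_chain_close'] - 1) * 100  # percent return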
@@ -1986,6 +1420,7 @@ class stock_eda_panel(object):
         test_data_size (int): size of the test data. Note that the remaining is going to be used as training data
         seed (int): seed for the model inizialization
         lag_returns_state (int) : lags for returns of the state
+        corr_threshold (float): correlation threshold for initial feature selection
         plot (boolean): True to display hmm states analysis
         save_features (boolean): True to save features and configurations
         model (obj): if provided, no model will be trainend and the provided model will be used to get hmm features
@@ -1997,16 +1432,12 @@ class stock_eda_panel(object):
         if not model:
 
             df_new = self.df
-            pipeline_hmm = Pipeline([
-                ('selector', FeatureSelector(columns=features_hmm)),
-                ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-                ('hmm',GaussianHMM(n_components = n_clusters, covariance_type = 'full', random_state = seed))
-            ])
             data_train = df_new.iloc[:-test_data_size,:]
             data_test = df_new.iloc[-test_data_size:,:]
 
-            pipeline_hmm.fit(data_train)
-
+            th = trainer_hmm(data_train, features_hmm, n_clusters=n_clusters,corr_thrshold=corr_threshold, seed = seed)
+            th.train()
+            pipeline_hmm = th.hmm_model
             self.model_hmm = pipeline_hmm
             self.test_data_hmm = data_test
 
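Model construction is now delegated to trainer_hmm; the inline Pipeline of FeatureSelector, MeanMedianImputer and GaussianHMM is gone. Only the calls visible in this hunk are confirmed, so the sketch below stays within them (note the corr_thrshold spelling of the keyword, reproduced verbatim from the diff):

    from virgo_modules.src.hmm_utils import trainer_hmm

    th = trainer_hmm(data_train, features_hmm,
                     n_clusters=4, corr_thrshold=0.75, seed=42)
    th.train()                   # fits the model, with correlation-based feature pre-selection
    pipeline_hmm = th.hmm_model  # fitted pipeline object, used as before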
@@ -2034,7 +1465,7 @@ class stock_eda_panel(object):
         if save_features:
             self.features.append('hmm_feature')
             self.features.append('hmm_chain_order')
-            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state }
+            self.settings_hmm = {'n_clusters':n_clusters,'features_hmm':features_hmm, 'test_data_size':test_data_size, 'seed':seed,'lag_returns_state':lag_returns_state, 'corr_threshold':corr_threshold }
 
         if plot:
 
@@ -2248,53 +1679,6 @@ class stock_eda_panel(object):
         plt.legend()
         plt.show()
 
-    ### deprecated ############################
-    def create_strategy(self, favourable_states):
-
-        test_data = self.test_data_hmm
-        # add MA signal
-        test_data.loc[test_data[self.ma1_column] > test_data[self.ma2_column], 'MA_signal'] = 1
-        test_data.loc[test_data[self.ma1_column] <= test_data[self.ma2_column], 'MA_signal'] = 0
-
-        # add hnn signal
-
-        test_data['HMM_signal'] = np.where(test_data['HMM'].isin(favourable_states),1,0)
-
-        ## combined signals
-        test_data['main_signal'] = 0
-        test_data.loc[(test_data['MA_signal'] == 1) & (test_data['HMM_signal'] == 1), 'main_signal'] = 1
-        test_data['main_signal'] = test_data['main_signal'].shift(1)
-
-        ## benchmark return
-        test_data['lrets_bench'] = np.log(test_data['Close']/test_data['Close'].shift(1))
-        test_data['bench_prod'] = test_data['lrets_bench'].cumsum()
-        test_data['bench_prod_exp'] = np.exp(test_data['bench_prod']) - 1
-
-        ## strategy return
-        # test_data['lrets_strat'] = np.log(test_data['Open'].shift(-1)/test_data['Open']) * test_data['main_signal']
-        test_data['lrets_strat'] = np.log(test_data['Close'].shift(-1)/test_data['Close']) * test_data['main_signal']
-        test_data['lrets_prod'] = test_data['lrets_strat'].cumsum()
-        test_data['strat_prod_exp'] = np.exp(test_data['lrets_prod']) - 1
-        test_data.dropna(inplace = True)
-
-        bench_rets = round(test_data['bench_prod_exp'].values[-1]*100,1)
-        strat_rets = round(test_data['strat_prod_exp'].values[-1]*100,1)
-
-        bench_sharpe = self.sharpe_ratio(test_data['bench_prod_exp'].values)
-        strat_sharpe = self.sharpe_ratio(test_data['strat_prod_exp'].values)
-
-        print(f'returns benchmark {bench_rets}%')
-        print(f'returns strategy {strat_rets}%')
-        print('-----------------------------')
-        print(f'sharpe benchmark {bench_sharpe}')
-        print(f'sharpe strategy {strat_sharpe}')
-
-        fig = plt.figure(figsize = (10,4))
-        plt.plot(test_data['bench_prod_exp'])
-        plt.plot(test_data['strat_prod_exp'])
-        self.settings_hmm_states = {'favourable_states':favourable_states}
-    ################################################
-
     def deep_dive_analysis_hmm(self, test_data_size, split = 'train'):
         """
         display analysis plot hmm model
@@ -2582,214 +1966,6 @@ class produce_model:
         self.pipeline.fit(self.X_train, self.y_train)
         self.features_to_model = self.pipeline[:-1].transform(self.X_train).columns
 
-class hmm_feature_selector():
-    """
-    class that is going to train hmm models to perform feature selection
-
-    Attributes
-    ----------
-    data : pd.DataFrame
-        symbol of the asset
-    n_clusters : int
-        number of clusters to search
-    init_features_hmm : list
-        list of features to consider in the search
-    test_data_size :int
-        test data size, meaning that the remaining is going to be used as training data
-    select_n_features : int
-        number of features to select
-    n_trials : int
-        total number of trials per combination
-    limit_search : int
-        limit number of combinations
-    default_benchmark_sd : float
-        default value to bias standard deviation
-    t_threshold : float
-        alpha or z threshold
-    pipeline_hmm: obj
-        pipeline object of the hmm model
-    features_used_in_model:list
-        features in model
-    train_model(features_hmm=list):
-        train hmm model
-    feature_combinations: list
-        list of combination of features
-    mean_relevance: float
-        relevance score of the model
-    best_features: list
-        list of best performing features
-
-    Methods
-    -------
-    split_data():
-        split data in train and test
-    train_model(features_hmm=list):
-        train hmm model
-    feature_list_generator():
-        perform combination of features
-    get_error():
-        get error or score of a given model using relevance score
-    execute_selector():
-        select the best combination of features
-    """
-    def __init__(self, data, n_clusters, init_features_hmm, test_data_size, select_n_features, n_trials = 1,limit_search = False, default_benchmark_sd = 0.00003, t_threshold = 2):
-        """
-        Initialize object
-
-        Parameters
-        ----------
-        data (pd.DataFrame): data
-        n_clusters (int): number of clusters to search
-        init_features_hmm (list): list of features to consider in the search
-        test_data_siz:(int: test data size, meaning that the remaining is going to be used as training data
-        select_n_features (int): number of features to select
-        n_trials (int): total number of trials per combination
-        limit_search (int): limit number of combinations
-        default_benchmark_sd (float): default value to bias standard deviation
-        t_threshold (float): alpha or z threshold
-
-        Returns
-        -------
-        None
-        """
-        self.data = data.copy()
-        self.n_clusters = n_clusters
-        self.init_features_hmm = init_features_hmm
-        self.test_data_size = test_data_size
-        self.select_n_features = select_n_features
-        self.n_trials = n_trials
-        self.limit_search= limit_search
-        self.default_benchmark_sd = default_benchmark_sd
-        self.t_threshold = t_threshold
-
-    def split_data(self):
-        """
-        split data in train and test
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.data_train = self.data.iloc[:-self.test_data_size,:]
-        self.data_test = self.data.iloc[-self.test_data_size:,:]
-
-    def train_model(self,features_hmm):
-        """
-        train hmm model
-
-        Parameters
-        ----------
-        features_hmm (list): list of features to be selected in the model
-
-        Returns
-        -------
-        None
-        """
-        pipeline_hmm = Pipeline([
-            ('selector', FeatureSelector(columns=features_hmm)),
-            ('fillna', MeanMedianImputer(imputation_method='median',variables=features_hmm)),
-            ('hmm',GaussianHMM(n_components = self.n_clusters, covariance_type = 'full'))
-        ])
-
-        self.pipeline_hmm = pipeline_hmm.fit(self.data_train)
-        self.features_used_in_model = features_hmm
-
-    def feature_list_generator(self):
-        """
-        perform combination of features
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        feature_combinations = set(list(combinations(self.init_features_hmm, self.select_n_features)))
-        feature_combinations = list(map(list, feature_combinations))
-
-        self.feature_combinations = feature_combinations
-
-    def get_error(self):
-        """
-        get error or score of a given model using relevance score
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.data_train_ = self.data_train.copy()
-
-        self.data_train_['hmm_feature'] = self.pipeline_hmm.predict(self.data_train_)
-        self.data_train_ = self.data_train_[['Date','hmm_feature','Close']].sort_values('Date')
-
-        ## indexing chains
-        self.data_train_['lag_hmm_feature'] = self.data_train_['hmm_feature'].shift(1)
-        self.data_train_['breack'] = np.where(self.data_train_['lag_hmm_feature'] != self.data_train_['hmm_feature'],1,0)
-        self.data_train_["chain_id"] = self.data_train_.groupby("breack")["Date"].rank(method="first", ascending=True)
-        self.data_train_["chain_id"] = np.where(self.data_train_['breack'] == 1,self.data_train_["chain_id"],np.nan)
-        self.data_train_["chain_id"] = self.data_train_["chain_id"].fillna(method='ffill')
-        self.data_train_["hmm_chain_order"] = self.data_train_.groupby('chain_id')["Date"].rank(method="first", ascending=True)
-
-        ### returns using the first element in a chain
-        self.data_train_['first'] = np.where(self.data_train_['hmm_chain_order'] == 1, self.data_train_['Close'], np.nan)
-        self.data_train_['first'] = self.data_train_.sort_values('Date')['first'].fillna(method='ffill')
-        self.data_train_['chain_return'] = (self.data_train_['Close']/self.data_train_['first'] -1) * 100
-
-        self.data_train_ = self.data_train_.drop(columns = ['first'])
-
-        mean_relevance, cluster_returns, number_relevant_states = states_relevance_score(self.data_train_)
-        self.mean_relevance = mean_relevance
-
-    def execute_selector(self):
-        """
-        select the best combination of features
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        self.split_data()
-        self.feature_list_generator()
-        maxi = -1
-        print(f'it is expected {len(self.feature_combinations)} combinations')
-        feature_results = dict()
-
-        if self.limit_search:
-            print(f' taking just {self.limit_search} combinations')
-            maxi = self.limit_search
-
-        for i,features_hmm in enumerate(self.feature_combinations[0:maxi]):
-
-            feature_results[f'group_{i}'] = {
-                'features':list(features_hmm),
-                'relevances':list()
-            }
-
-            for _ in range(self.n_trials):
-                try:
-                    self.train_model(features_hmm)
-                    self.get_error()
-                    feature_results[f'group_{i}']['relevances'].append(self.mean_relevance)
-                except:
-                    print('error')
-            feature_results[f'group_{i}']['mean relevance'] = np.mean(feature_results[f'group_{i}']['relevances'])
-        self.feature_results = feature_results
-        self.best_features = pd.DataFrame(self.feature_results).T.sort_values('mean relevance').iloc[-1,:].features
-
 class analyse_index(stock_eda_panel):
     """
    class that is going to train hmm models to perform feature selection
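The removed hmm_feature_selector enumerated feature subsets with itertools.combinations and kept the subset with the highest mean relevance. If that behaviour is still needed against the new trainer_hmm API, the enumeration itself is plain stdlib; a sketch (feature names illustrative, and the per-subset scoring would have to come from whatever replaced states_relevance_score):

    from itertools import combinations

    init_features_hmm = ['log_return', 'dist_range', 'ret_log_Volume']
    select_n_features = 2
    feature_combinations = [list(c) for c in combinations(init_features_hmm, select_n_features)]
    # [['log_return', 'dist_range'], ['log_return', 'ret_log_Volume'], ['dist_range', 'ret_log_Volume']]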
@@ -3025,7 +2201,6 @@ class analyse_index(stock_eda_panel):
 
         self.states_result = result
 
-
 def get_relevant_beta(data_market, ticket_name, show_plot = True, save_path = False, save_aws = False, aws_credentials = False):
     '''
     select relevant beta result data of a given asset