PyPI - cryptodatapy - Versions diffs - 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

cryptodatapy/conf/fx_tickers.csv +31 -0
cryptodatapy/transform/clean.py +171 -172
cryptodatapy/transform/clean_perp_futures_ohlcv.ipynb +1025 -0
cryptodatapy/transform/filter.py +83 -142
cryptodatapy/transform/impute.py +36 -83
cryptodatapy/transform/od.py +221 -450
{cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/METADATA +4 -1
{cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/RECORD +10 -8
{cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/LICENSE +0 -0
{cryptodatapy-0.2.2.dist-info → cryptodatapy-0.2.4.dist-info}/WHEEL +0 -0

cryptodatapy/transform/filter.py CHANGED Viewed

@@ -7,12 +7,13 @@ import pandas as pd
 class Filter:
     """
     Filters dataframe in tidy format.
     """
-    def __init__(
-        self, raw_df: pd.DataFrame, excl_cols: Optional[Union[str, list]] = None
-    ):
+    def __init__(self,
+                 raw_df: pd.DataFrame,
+                 excl_cols: Optional[Union[str, list]] = None,
+                 plot: bool = False,
+                 plot_series: tuple = ("BTC", "close")
+                 ):
         """
         Constructor
@@ -22,64 +23,18 @@ class Filter:
             Dataframe with raw data. DatetimeIndex (level 0), ticker (level 1) and raw data (cols), in tidy format.
         excl_cols: str or list, default None
             Name of columns to exclude from filtering
         """
         self.raw_df = raw_df
         self.excl_cols = excl_cols
-    def outliers(
-        self,
-        outliers_dict: dict,
-        plot: bool = False,
-        plot_series: tuple = ("BTC", "close"),
-    ) -> pd.DataFrame:
-        """
-        Filters outliers, replacing them with NaNs.
-        Parameters
-        ----------
-        outliers_dict: Dictionary of pd.DataFrame - MultiIndex
-            Dictionary of forecasts (yhat), outliers (outliers) and filtered values (filt_vals) multiindex dataframes
-            with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with forecasted, outlier or filtered
-            values.
-        plot: bool, default False
-            Plots series with outliers highlighted with red dots.
-        plot_series: tuple, default ('BTC', 'close')
-            Plots the time series of a specific (ticker, field/column) tuple.
-        Returns
-        -------
-        filt_df: DataFrame - MultiIndex
-            Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with outliers removed.
-        """
-        # filter outliers
-        filt_df = outliers_dict["filt_vals"]
-        # add excl cols
-        if self.excl_cols is not None:
-            filt_df = pd.concat(
-                [filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
-            )
-        # plot
-        if plot:
-            if not isinstance(plot_series, tuple):
-                raise TypeError(
-                    "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
-                )
-            else:
-                self.plot_filtered(filt_df, plot_series=plot_series)
-        return filt_df
+        self.plot = plot
+        self.plot_series = plot_series
+        self.df = raw_df.copy() if excl_cols is None else raw_df.drop(columns=excl_cols).copy()
+        self.filtered_df = None
     def avg_trading_val(
         self,
         thresh_val: int = 10000000,
         window_size: int = 30,
-        plot: bool = False,
-        plot_series: tuple = ("BTC", "close"),
     ) -> pd.DataFrame:
         """
         Filters values below a threshold of average trading value (price * volume/size in quote currency) over some
@@ -91,35 +46,24 @@ class Filter:
             Threshold/cut-off for avg trading value.
         window_size: int, default 30
             Size of rolling window.
-        plot: bool, default False
-            Plots series with outliers highlighted with red dots.
-        plot_series: tuple, default ('BTC', 'close')
-            Plots the time series of a specific (ticker, field/column) tuple.
         Returns
         -------
-        filt_df: DataFrame - MultiIndex
+        filtered_df: DataFrame - MultiIndex
             Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values below the
             threshold removed.
         """
-        # convert string to list
-        if self.excl_cols is not None:
-            df = self.raw_df.drop(columns=self.excl_cols).copy()
-        else:
-            df = self.raw_df.copy()
         # compute traded val
-        if "close" in df.columns and "volume" in df.columns:
-            df["trading_val"] = df.close * df.volume
-        elif ("bid" in df.columns and "ask" in df.columns) and (
-            "bid_size" in df.columns and "ask_size" in df.columns
+        if "close" in self.df.columns and "volume" in self.df.columns:
+            self.df["trading_val"] = self.df.close * self.df.volume
+        elif ("bid" in self.df.columns and "ask" in self.df.columns) and (
+            "bid_size" in self.df.columns and "ask_size" in self.df.columns
         ):
-            df["trading_val"] = ((df.bid + df.ask) / 2) * (
-                (df.bid_size + df.ask_size) / 2
+            self.df["trading_val"] = ((self.df.bid + self.df.ask) / 2) * (
+                (self.df.bid_size + self.df.ask_size) / 2
             )
-        elif "trade_size" in df.columns and "trade_price" in df.columns:
-            df["trading_val"] = df.trade_price * df.trade_size
+        elif "trade_size" in self.df.columns and "trade_price" in self.df.columns:
+            self.df["trading_val"] = self.df.trade_price * self.df.trade_size
         else:
             raise Exception(
                 "Dataframe must include at least one price series (e.g. close price, trade price, "
@@ -127,36 +71,29 @@ class Filter:
             )
         # compute rolling mean/avg
-        df1 = df.groupby(level=1).rolling(window_size).mean().droplevel(0)
+        df1 = self.df.groupby(level=1).rolling(window_size).mean().droplevel(0)
         # divide by thresh
         df1 = df1 / thresh_val
         # filter df1
-        filt_df = (
-            df.loc[df1.trading_val > 1].reindex(df.index).drop(columns="trading_val")
-        )
-        # add excl cols
-        if self.excl_cols is not None:
-            filt_df = pd.concat(
-                [filt_df, self.raw_df[self.excl_cols]], join="outer", axis=1
-            )
+        self.filtered_df = self.df.loc[df1.trading_val > 1].reindex(self.df.index).drop(columns="trading_val")
         # plot
-        if plot:
-            if not isinstance(plot_series, tuple):
+        if self.plot:
+            if not isinstance(self.plot_series, tuple):
                 raise TypeError(
                     "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
                 )
             else:
-                self.plot_filtered(filt_df, plot_series=plot_series)
+                self.plot_filtered(plot_series=self.plot_series)
-        return filt_df
+        # add excl cols
+        if self.excl_cols is not None:
+            self.filtered_df = pd.concat([self.filtered_df,
+                                          self.raw_df[self.excl_cols].reindex(self.filtered_df.index)], axis=1)
-    def missing_vals_gaps(
-        self,
-        gap_window: int = 30,
-        plot: bool = False,
-        plot_series: tuple = ("BTC", "close"),
-    ) -> pd.DataFrame:
+        return self.filtered_df
+    def missing_vals_gaps(self, gap_window: int = 30) -> pd.DataFrame:
         """
         Filters values before a large gap of missing values, replacing them with NaNs.
@@ -164,27 +101,16 @@ class Filter:
         ----------
         gap_window: int, default 30
             Size of window where all values are missing (NaNs).
-        plot: bool, default False
-            Plots series with outliers highlighted with red dots.
-        plot_series: tuple, default ('BTC', 'close')
-            Plots the time series of a specific (ticker, field/column) tuple.
         Returns
         -------
-        filt_df: DataFrame - MultiIndex
+        filtered_df: DataFrame - MultiIndex
             Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols) with values before
             missing values gaps removed.
         """
-        # convert string to list
-        if self.excl_cols is not None:
-            df = self.raw_df.drop(columns=self.excl_cols).copy()
-        else:
-            df = self.raw_df.copy()
         # window obs count
         window_count = (
-            df.groupby(level=1)
+            self.df.groupby(level=1)
             .rolling(window=gap_window, min_periods=gap_window)
             .count()
             .droplevel(0)
@@ -194,24 +120,25 @@ class Filter:
         for col in gap.unstack().columns:
             start_idx = gap.unstack()[col].last_valid_index()
             if start_idx is not None:
-                df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
-        # add excl cols
-        if self.excl_cols is not None:
-            filt_df = pd.concat([df, self.raw_df[self.excl_cols]], join="outer", axis=1)
-        else:
-            filt_df = df
+                self.df.loc[pd.IndexSlice[:start_idx, col[1]], col[0]] = np.nan
         # plot
-        if plot:
-            if not isinstance(plot_series, tuple):
+        if self.plot:
+            if not isinstance(self.plot_series, tuple):
                 raise TypeError(
                     "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
                 )
             else:
-                self.plot_filtered(filt_df, plot_series=plot_series)
+                self.plot_filtered(plot_series=self.plot_series)
+        # add excl cols
+        if self.excl_cols is not None:
+            self.filtered_df = pd.concat([self.df,
+                                          self.raw_df[self.excl_cols].reindex(self.df)], axis=1)
+        else:
+            self.filtered_df = self.df
-        return filt_df
+        return self.filtered_df
     def min_nobs(self, ts_obs=100, cs_obs=1) -> pd.DataFrame:
         """
@@ -227,25 +154,47 @@ class Filter:
         Returns
         -------
-        filt_df: DataFrame - MultiIndex
+        filtered_df: DataFrame - MultiIndex
             Filtered dataFrame with DatetimeIndex (level 0), tickers with minimum number of observations (level 1)
             and fields (cols).
         """
-        # create copy
-        df = self.raw_df.copy()
         # drop tickers with nobs < ts_obs
-        obs = df.groupby(level=1).count().min(axis=1)
+        obs = self.df.groupby(level=1).count().min(axis=1)
         drop_tickers_list = obs[obs < ts_obs].index.to_list()
-        filt_df = df.drop(drop_tickers_list, level=1, axis=0)
+        self.filtered_df = self.df.drop(drop_tickers_list, level=1, axis=0)
         # drop tickers with nobs < cs_obs
-        obs = filt_df.groupby(level=0).count().min(axis=1)
+        obs = self.filtered_df.groupby(level=0).count().min(axis=1)
         idx_start = obs[obs > cs_obs].index[0]
-        filt_df = filt_df.unstack()[filt_df.unstack().index > idx_start].stack()
+        self.filtered_df = self.filtered_df.loc[idx_start:]
-        return filt_df
+        return self.filtered_df
+    def remove_delisted(self, field: str = 'close', n_unch_vals: int = 30) -> pd.DataFrame:
+        """
+        Removes delisted tickers from dataframe.
+        Parameters
+        ----------
+        field: str, default 'close'
+            Field/column to use for detecting delisted tickers.
+        n_unch_vals: int, default 30
+            Number of consecutive unchanged values to consider a ticker as delisted.
+        Returns
+        -------
+        filtered_df: pd.DataFrame - MultiIndex
+            Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
+        """
+        # delisted tickers
+        delisted_tickers = self.df[field].unstack()[self.df[field].unstack().pct_change().iloc[-n_unch_vals:] == 0].\
+            dropna(how='all', axis=0).dropna(thresh=n_unch_vals, axis=1).columns
+        print(delisted_tickers)
+        # drop delisted tickers
+        self.filtered_df = self.df.drop(delisted_tickers, level=1)
+        return self.filtered_df
     def tickers(self, tickers_list) -> pd.DataFrame:
         """
@@ -259,37 +208,29 @@ class Filter:
         Returns
         -------
-        filt_df: pd.DataFrame - MultiIndex
+        filtered_df: pd.DataFrame - MultiIndex
             Filtered dataFrame with DatetimeIndex (level 0), tickers (level 1) and fields (cols).
         """
-        # create copy
-        df = self.raw_df.copy()
         # tickers list
         if isinstance(tickers_list, str):
             tickers_list = [tickers_list]
         # drop tickers
-        filt_df = df.drop(tickers_list, level=1, axis=0)
+        self.filtered_df = self.df.drop(tickers_list, level=1)
-        return filt_df
+        return self.filtered_df
-    @staticmethod
-    def plot_filtered(
-        filt_df: pd.DataFrame, plot_series: Optional[tuple] = None
-    ) -> None:
+    def plot_filtered(self, plot_series: Optional[tuple] = None) -> None:
         """
         Plots filtered time series.
         Parameters
         ----------
-        filt_df: pd.DataFrame - MultiIndex
-            Dataframe MultiIndex with DatetimeIndex (level 0), tickers (level 1) and filtered values (cols).
         plot_series: tuple, optional, default None
             Plots the time series of a specific (ticker, field) tuple.
         """
         ax = (
-            filt_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
+            self.filtered_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
             .droplevel(1)
             .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
         )

cryptodatapy/transform/impute.py CHANGED Viewed

@@ -7,67 +7,51 @@ import pandas as pd
 class Impute:
     """
     Handles missing values.
     """
-    def __init__(self, filt_df: pd.DataFrame):
+    def __init__(self, filtered_df: pd.DataFrame, plot: bool = False, plot_series: tuple = ("BTC", "close")):
         """
         Constructor
         Parameters
         ----------
-        filt_df: pd.DataFrame - MultiIndex
+        filtered_df: pd.DataFrame - MultiIndex
             DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with filtered values.
         """
-        self.filt_df = filt_df
+        self.filtered_df = filtered_df.astype(float)
+        self.plot = plot
+        self.plot_series = plot_series
+        self.imputed_df = None
-    def fwd_fill(
-        self, plot: bool = False, plot_series: tuple = ("BTC", "close")
-    ) -> pd.DataFrame:
+    def fwd_fill(self) -> pd.DataFrame:
         """
         Imputes missing values by imputing missing values with latest non-missing values.
-        Parameters
-        ----------
-        plot: bool, default False
-            Plots series with outliers highlighted with red dots.
-        plot_series: tuple, default ('BTC', 'close')
-            Plots the time series of a specific (ticker, field/column) tuple.
         Returns
         -------
-        imp_df: pd.DataFrame - MultiIndex
+        imputed_df: pd.DataFrame - MultiIndex
             DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
             using forward fill method.
         """
-        # copy df
-        filt_df = self.filt_df.copy()
         # ffill
-        imp_df = filt_df.groupby(level=1).ffill()
+        self.imputed_df = self.filtered_df.groupby(level=1).ffill()
         # plot
-        if plot:
-            if not isinstance(plot_series, tuple):
+        if self.plot:
+            if not isinstance(self.plot_series, tuple):
                 raise TypeError(
                     "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
                 )
             else:
-                self.plot_imputed(imp_df, plot_series=plot_series)
+                self.plot_imputed()
-        return imp_df
+        return self.imputed_df
     def interpolate(
         self,
         method: str = "linear",
         order: Optional[int] = None,
-        axis=0,
+        axis: int = 0,
         limit: Optional[int] = None,
-        plot: bool = False,
-        plot_series: tuple = ("BTC", "close"),
     ) -> pd.DataFrame:
         """
         Imputes missing values by interpolating using various methods.
@@ -83,116 +67,85 @@ class Impute:
             Axis to interpolate along.
         limit: int, optional, default None
             Maximum number of consecutive NaNs to fill. Must be greater than 0.
-        plot: bool, default False
-            Plots series with outliers highlighted with red dots.
-        plot_series: tuple, default ('BTC', 'close')
-            Plots the time series of a specific (ticker, field/column) tuple.
         Returns
         -------
-        imp_df: pd.DataFrame - MultiIndex
+        imputed_df: pd.DataFrame - MultiIndex
             DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
             using interpolation method.
         """
-        # copy df and convert to float for interpolation (code will break if type int64)
-        filt_df = self.filt_df.astype(float).copy()
         # add order if spline or polynomial
         if (method == "spline" or method == "polynomial") and order is None:
             order = 3
         # interpolate
-        imp_df = (
-            filt_df.unstack()
-            .interpolate(method=method, order=order, axis=axis, limit=limit)
-            .stack()
-            .reindex(filt_df.index)
-        )
+        self.imputed_df = self.filtered_df.unstack().interpolate(method=method, order=order, axis=axis,
+                                                                 limit=limit).stack().reindex(self.filtered_df.index)
         # type conversion
-        imp_df = imp_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
+        self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
         # plot
-        if plot:
-            if not isinstance(plot_series, tuple):
+        if self.plot:
+            if not isinstance(self.plot_series, tuple):
                 raise TypeError(
                     "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
                 )
             else:
-                self.plot_imputed(imp_df, plot_series=plot_series)
+                self.plot_imputed()
-        return imp_df
+        return self.imputed_df
     def fcst(
         self,
-        fcst_df: pd.DataFrame,
-        plot: bool = False,
-        plot_series: tuple = ("BTC", "close"),
+        yhat_df: pd.DataFrame,
     ) -> pd.DataFrame:
         """
         Imputes missing values with forecasts from outlier detection algorithm.
         Parameters
         ----------
-        fcst_df: pd.DataFrame - MultiIndex
+        yhat_df: pd.DataFrame - MultiIndex
             Multiindex dataframe with DatetimeIndex (level 0), tickers (level 1) and fields (cols)
             with forecasted values.
-        plot: bool, default False
-            Plots series with outliers highlighted with red dots.
-        plot_series: tuple, default ('BTC', 'close')
-            Plots the time series of a specific (ticker, field/column) tuple.
         Returns
         -------
-        imp_df: pd.DataFrame - MultiIndex
+        imputed_df: pd.DataFrame - MultiIndex
             DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values
             using forecasts from outlier detection method.
         """
-        # copy filtered and forecast dfs
-        filt_df, yhat_df = self.filt_df.copy(), fcst_df.copy()
         # impute missing vals in filtered df with fcst vals
-        imp_yhat = np.where(filt_df.isna(), yhat_df, filt_df)
+        imp_yhat = np.where(self.filtered_df.isna(), yhat_df, self.filtered_df)
         # create df
-        imp_df = pd.DataFrame(imp_yhat, index=filt_df.index, columns=filt_df.columns)
+        self.imputed_df = pd.DataFrame(imp_yhat, index=self.filtered_df.index, columns=self.filtered_df.columns)
         # type conversion
-        imp_df = imp_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
+        self.imputed_df = self.imputed_df.apply(pd.to_numeric, errors="ignore").convert_dtypes()
         # plot
-        if plot:
-            if not isinstance(plot_series, tuple):
+        if self.plot:
+            if not isinstance(self.plot_series, tuple):
                 raise TypeError(
                     "Plot_series must be a tuple specifying the ticker and column/field to plot (ticker, column)."
                 )
             else:
-                self.plot_imputed(imp_df, plot_series=plot_series)
+                self.plot_imputed()
-        return imp_df
+        return self.imputed_df
-    @staticmethod
-    def plot_imputed(imp_df: pd.DataFrame, plot_series: Optional[tuple] = None) -> None:
+    def plot_imputed(self) -> None:
         """
         Plots filtered time series.
-        Parameters
-        ----------
-        imp_df: pd.DataFrame - MultiIndex
-            DataFrame MultiIndex with DatetimeIndex (level 0), ticker (level 1) and fields (cols) with imputed values.
-        plot_series: tuple, optional, default None
-            Plots the time series of a specific (ticker, field) tuple.
         """
         ax = (
-            imp_df.loc[pd.IndexSlice[:, plot_series[0]], plot_series[1]]
+            self.imputed_df.loc[pd.IndexSlice[:, self.plot_series[0]], self.plot_series[1]]
             .droplevel(1)
             .plot(linewidth=1, figsize=(15, 7), color="#1f77b4", zorder=0)
         )
         ax.grid(color="black", linewidth=0.05)
         ax.xaxis.grid(False)
-        ax.set_ylabel(plot_series[0])
+        ax.set_ylabel(self.plot_series[0])
         ax.ticklabel_format(style="plain", axis="y")
         ax.set_facecolor("whitesmoke")
-        ax.legend([plot_series[1] + "_repaired"], loc="upper left")
+        ax.legend([self.plot_series[1] + "_repaired"], loc="upper left")

cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

cryptodatapy 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl