PyPI - qfeaturelib - Versions diffs - 0.1.0__py3-none-any.whl - Mend

qfeaturelib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

qfeaturelib/__init__.py +133 -0
qfeaturelib/core/__init__.py +0 -0
qfeaturelib/core/panel_data.py +360 -0
qfeaturelib/core/validators.py +330 -0
qfeaturelib/imputation/__init__.py +18 -0
qfeaturelib/imputation/cross_sectional.py +173 -0
qfeaturelib/imputation/time_series.py +212 -0
qfeaturelib/neutralization/__init__.py +14 -0
qfeaturelib/neutralization/regression.py +311 -0
qfeaturelib/splitting/__init__.py +17 -0
qfeaturelib/splitting/base.py +249 -0
qfeaturelib/splitting/expanding.py +137 -0
qfeaturelib/splitting/rolling.py +127 -0
qfeaturelib/standardization/__init__.py +43 -0
qfeaturelib/standardization/algorithms.py +305 -0
qfeaturelib/standardization/cross_sectional.py +306 -0
qfeaturelib/standardization/time_series.py +428 -0
qfeaturelib/utils/__init__.py +21 -0
qfeaturelib/utils/macro.py +424 -0
qfeaturelib/utils/numba_ops.py +209 -0
qfeaturelib-0.1.0.dist-info/METADATA +284 -0
qfeaturelib-0.1.0.dist-info/RECORD +24 -0
qfeaturelib-0.1.0.dist-info/WHEEL +4 -0
qfeaturelib-0.1.0.dist-info/licenses/LICENSE +21 -0

qfeaturelib/__init__.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""
+QFeatureLib - High-performance feature engineering for quantitative investment.
+This library provides efficient, production-grade tools for financial feature engineering
+with strict prevention of data leakage (future function) and optimized NumPy-based
+computations.
+Key Modules
+-----------
+- core: PanelData structure and validation utilities
+- standardization: Time-series and cross-sectional standardization
+- splitting: Rolling and expanding window sample splitting
+- imputation: Missing value handling
+- neutralization: Feature neutralization via regression
+Quick Start
+-----------
+>>> import numpy as np
+>>> from qfeaturelib import PanelData
+>>> from qfeaturelib.standardization import rolling_zscore, cs_zscore
+>>>
+>>> # Create panel data
+>>> values = np.random.randn(100, 50, 5)  # 100 days, 50 stocks, 5 features
+>>> dates = np.arange(100)
+>>> tickers = [f'STOCK_{i}' for i in range(50)]
+>>> panel = PanelData(values, dates, tickers)
+>>>
+>>> # Rolling Z-score (prevent future leakage with shift=1)
+>>> zscore_values = rolling_zscore(panel.values[..., 0], window=20, shift=1)
+>>>
+>>> # Cross-sectional Z-score
+>>> cs_values = cs_zscore(panel.values[..., 0])
+"""
+__version__ = "0.1.0"
+# Core
+from .core.panel_data import PanelData
+from .core.validators import FutureFunctionError, ValidationError
+# Standardization
+from .standardization import (
+    cs_minmax,
+    cs_rank,
+    cs_robust_zscore,
+    cs_zscore,
+    minmax_scale,
+    rank_scale,
+    robust_zscore,
+    rolling_minmax,
+    rolling_robust_zscore,
+    rolling_zscore,
+    winsorize,
+    zscore,
+)
+# Splitting
+from .splitting import (
+    ExpandingWindowSplitter,
+    RollingWindowSplitter,
+    SplitIndices,
+)
+# Imputation
+from .imputation import (
+    bfill,
+    cs_mean_fill,
+    cs_median_fill,
+    ffill,
+    ffill_limit,
+)
+# Neutralization
+from .neutralization import (
+    industry_neutralize,
+    neutralize,
+    size_neutralize,
+)
+# Utils (Macro indicators)
+from .utils import (
+    adapt_macro_to_panel,
+    macro_expanding_zscore,
+    macro_momentum,
+    macro_rolling_minmax,
+    macro_rolling_rank,
+    macro_rolling_zscore,
+    macro_yoy_change,
+)
+# Public API of the top-level package: the names exported by
+# ``from qfeaturelib import *``. Every entry is either ``__version__`` or a
+# name imported above, grouped by the subpackage it comes from.
+__all__ = [
+    # Version
+    "__version__",
+    # Core
+    "PanelData",
+    "FutureFunctionError",
+    "ValidationError",
+    # Standardization
+    "zscore",
+    "robust_zscore",
+    "minmax_scale",
+    "rank_scale",
+    "winsorize",
+    "rolling_zscore",
+    "rolling_robust_zscore",
+    "rolling_minmax",
+    "cs_zscore",
+    "cs_robust_zscore",
+    "cs_minmax",
+    "cs_rank",
+    # Splitting
+    "SplitIndices",
+    "RollingWindowSplitter",
+    "ExpandingWindowSplitter",
+    # Imputation
+    "ffill",
+    "bfill",
+    "ffill_limit",
+    "cs_median_fill",
+    "cs_mean_fill",
+    # Neutralization
+    "neutralize",
+    "industry_neutralize",
+    "size_neutralize",
+    # Utils (Macro indicators)
+    "macro_rolling_zscore",
+    "macro_expanding_zscore",
+    "macro_rolling_minmax",
+    "macro_rolling_rank",
+    "macro_yoy_change",
+    "macro_momentum",
+    "adapt_macro_to_panel",
+]

qfeaturelib/core/__init__.py ADDED Viewed

File without changes

qfeaturelib/core/panel_data.py ADDED Viewed

@@ -0,0 +1,360 @@
+"""
+Panel data structure for quantitative finance.
+This module provides a standardized data structure for handling 3D panel data
+(time x assets x features) commonly used in quantitative investment.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+import numpy as np
+import pandas as pd
+if TYPE_CHECKING:
+    pass
+@dataclass
+class PanelData:
+    """
+    Standardized panel data structure for quantitative finance.
+    Wraps an array of shape (T, N) or (T, N, F) together with its axis labels:
+    - T: time periods, labelled by ``dates``
+    - N: assets, labelled by ``tickers``
+    - F: features, labelled by ``features`` (3D data only)
+    Attributes
+    ----------
+    values : np.ndarray
+        The data array with shape (T, N) or (T, N, F)
+    dates : np.ndarray
+        Array of date indices with shape (T,)
+    tickers : np.ndarray
+        Array of ticker symbols with shape (N,)
+    features : Optional[List[str]]
+        List of feature names. If None and values is 3D, features are
+        auto-named ``feature_0``, ``feature_1``, ...
+    Raises
+    ------
+    ValueError
+        If ``values`` is not 2D/3D or an axis label has the wrong length.
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> from qfeaturelib import PanelData
+    >>>
+    >>> # Create 2D panel data (single feature)
+    >>> values = np.random.randn(100, 50)  # 100 days, 50 stocks
+    >>> dates = pd.date_range('2020-01-01', periods=100)
+    >>> tickers = [f'STOCK_{i}' for i in range(50)]
+    >>> panel = PanelData(values, dates, tickers)
+    >>>
+    >>> # Create 3D panel data (multiple features)
+    >>> values_3d = np.random.randn(100, 50, 5)  # 100 days, 50 stocks, 5 features
+    >>> features = ['open', 'high', 'low', 'close', 'volume']
+    >>> panel_3d = PanelData(values_3d, dates, tickers, features)
+    """
+    values: np.ndarray
+    dates: np.ndarray
+    tickers: np.ndarray
+    features: Optional[List[str]] = None
+    def __post_init__(self) -> None:
+        """Coerce inputs to arrays, validate lengths, default feature names."""
+        self.values = np.asarray(self.values)
+        self.dates = np.asarray(self.dates)
+        self.tickers = np.asarray(self.tickers)
+        if self.values.ndim not in (2, 3):
+            raise ValueError(
+                f"values must be 2D or 3D array, got shape {self.values.shape}"
+            )
+        t, n = self.values.shape[0], self.values.shape[1]
+        if len(self.dates) != t:
+            raise ValueError(
+                f"dates length ({len(self.dates)}) must match values time dimension ({t})"
+            )
+        if len(self.tickers) != n:
+            raise ValueError(
+                f"tickers length ({len(self.tickers)}) must match values asset dimension ({n})"
+            )
+        # Own a private list: accepts any sequence (tuple, array) and stops
+        # panels derived from one another from sharing a mutable name list.
+        if self.features is not None:
+            self.features = list(self.features)
+        if self.values.ndim == 3:
+            n_features = self.values.shape[2]
+            if self.features is None:
+                self.features = [f"feature_{i}" for i in range(n_features)]
+            elif len(self.features) != n_features:
+                raise ValueError(
+                    f"features length ({len(self.features)}) must match "
+                    f"values feature dimension ({n_features})"
+                )
+    @property
+    def n_periods(self) -> int:
+        """Number of time periods (T)."""
+        return self.values.shape[0]
+    @property
+    def n_assets(self) -> int:
+        """Number of assets (N)."""
+        return self.values.shape[1]
+    @property
+    def n_features(self) -> int:
+        """Number of features (F). Returns 1 for 2D data."""
+        return self.values.shape[2] if self.values.ndim == 3 else 1
+    @property
+    def shape(self) -> Union[Tuple[int, int], Tuple[int, int, int]]:
+        """Shape of values array."""
+        return self.values.shape
+    def __len__(self) -> int:
+        """Return number of time periods."""
+        return self.n_periods
+    @staticmethod
+    def _keep_axis(axis_key):
+        """Turn an integer key into a one-element list so the axis survives."""
+        if isinstance(axis_key, (int, np.integer)):
+            return [int(axis_key)]
+        return axis_key
+    def __getitem__(self, key: Union[int, slice, Tuple]) -> PanelData:
+        """
+        Index into panel data.
+        Each axis is indexed independently (time, then asset, then feature),
+        and the matching labels are subset alongside the values. The time and
+        asset axes are always kept, so an integer key there yields an axis of
+        length 1. An integer key on the feature axis drops it and returns a
+        2D panel carrying that single feature name.
+        Parameters
+        ----------
+        key : int, slice, or tuple
+            Index for time dimension or tuple for multi-dimensional indexing.
+            A single ``...`` is expanded to full slices.
+        Returns
+        -------
+        PanelData
+            New PanelData with indexed values
+        Raises
+        ------
+        IndexError
+            If more keys than dimensions are given, more than one ``...`` is
+            used, or an integer key is out of range.
+        """
+        ndim = self.values.ndim
+        keys = key if isinstance(key, tuple) else (key,)
+        ellipsis_at = [i for i, k in enumerate(keys) if k is Ellipsis]
+        if len(ellipsis_at) > 1:
+            raise IndexError("an index can only have a single ellipsis ('...')")
+        if ellipsis_at:
+            pos = ellipsis_at[0]
+            fill = max(ndim - (len(keys) - 1), 0)
+            keys = keys[:pos] + (slice(None),) * fill + keys[pos + 1:]
+        if len(keys) > ndim:
+            raise IndexError(
+                f"too many indices for PanelData: got {len(keys)}, "
+                f"but values is {ndim}-dimensional"
+            )
+        keys = keys + (slice(None),) * (ndim - len(keys))
+        # A bare integer would make NumPy drop the axis, leaving values out of
+        # step with the (length-1) labels and failing constructor validation.
+        time_sel = self._keep_axis(keys[0])
+        asset_sel = self._keep_axis(keys[1])
+        new_values = self.values[time_sel][:, asset_sel]
+        new_dates = self.dates[time_sel]
+        new_tickers = self.tickers[asset_sel]
+        new_features = self.features
+        if ndim == 3:
+            feature_sel = keys[2]
+            new_values = new_values[:, :, feature_sel]
+            if self.features is not None:
+                # Keep feature names aligned with the selected feature columns.
+                names = np.asarray(self.features, dtype=object)
+                new_features = np.atleast_1d(names[feature_sel]).tolist()
+        return PanelData(new_values, new_dates, new_tickers, new_features)
+    @staticmethod
+    def _melt_wide(matrix, date_index, tickers, value_name: str) -> pd.DataFrame:
+        """Melt one (T, N) slice into long [date, ticker, <value_name>] rows."""
+        wide = pd.DataFrame(matrix, index=date_index, columns=tickers)
+        wide.index.name = "date"
+        return wide.reset_index().melt(
+            id_vars=["date"], var_name="ticker", value_name=value_name
+        )
+    def to_frame(self, feature_idx: Optional[int] = None) -> pd.DataFrame:
+        """
+        Convert panel data to long-format DataFrame.
+        ``dates`` are passed through ``pd.DatetimeIndex``. Every (date, ticker)
+        cell produces a row, including cells whose values are NaN.
+        Parameters
+        ----------
+        feature_idx : Optional[int]
+            If values is 3D, select this feature index. If None and 3D,
+            converts each feature to a separate column. Ignored for 2D data.
+        Returns
+        -------
+        pd.DataFrame
+            Long-format DataFrame with columns [date, ticker, value] for 2D
+            data, [date, ticker, <feature name>] for a single 3D feature, or
+            [date, ticker, <feature names>...] for all features of 3D data
+        """
+        date_index = pd.DatetimeIndex(self.dates)
+        if self.values.ndim == 2:
+            return self._melt_wide(self.values, date_index, self.tickers, "value")
+        if feature_idx is not None:
+            feature_name = (
+                self.features[feature_idx] if self.features else f"feature_{feature_idx}"
+            )
+            return self._melt_wide(
+                self.values[:, :, feature_idx], date_index, self.tickers, feature_name
+            )
+        # All features: flatten (T, N, F) to (T*N, F) in one step. Stacking
+        # per feature and concatenating drops NaN cells on pandas versions
+        # where stack() defaults to dropna=True, unlike the branches above.
+        names = (
+            self.features
+            if self.features
+            else [f"feature_{i}" for i in range(self.n_features)]
+        )
+        row_index = pd.MultiIndex.from_product(
+            [date_index, self.tickers], names=["date", "ticker"]
+        )
+        flat = self.values.reshape(self.n_periods * self.n_assets, self.n_features)
+        return pd.DataFrame(flat, index=row_index, columns=names).reset_index()
+    @classmethod
+    def from_frame(
+        cls,
+        df: pd.DataFrame,
+        date_col: str = "date",
+        ticker_col: str = "ticker",
+        value_col: Optional[str] = None,
+        pivot: bool = True,
+    ) -> PanelData:
+        """
+        Create PanelData from a long-format DataFrame.
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Input DataFrame in long format (one row per date/ticker pair)
+        date_col : str
+            Name of the date column (converted with ``pd.to_datetime``)
+        ticker_col : str
+            Name of the ticker column
+        value_col : Optional[str]
+            Name of the value column. If None and multiple non-id columns exist,
+            treats them as multiple features (3D data). If None and exactly one
+            non-id column exists, that column is used.
+        pivot : bool
+            Only used for single-feature data. If True, reshape with
+            ``DataFrame.pivot``; if False, with ``set_index(...).unstack()``.
+            Both expect long-format input.
+        Returns
+        -------
+        PanelData
+            2D panel for a single value column (dates and tickers sorted by
+            the pivot), otherwise a 3D panel whose dates and tickers keep
+            their order of first appearance in ``df``
+        Raises
+        ------
+        ValueError
+            If ``df`` has no columns besides the date and ticker columns.
+        """
+        df = df.copy()
+        df[date_col] = pd.to_datetime(df[date_col])
+        id_cols = {date_col, ticker_col}
+        value_cols = [c for c in df.columns if c not in id_cols]
+        if len(value_cols) == 0:
+            raise ValueError("No value columns found in DataFrame")
+        if value_col is None and len(value_cols) == 1:
+            # A lone value column is a single feature, i.e. 2D data.
+            value_col = value_cols[0]
+        if value_col is not None:
+            if pivot:
+                wide = df.pivot(index=date_col, columns=ticker_col, values=value_col)
+            else:
+                wide = df.set_index([date_col, ticker_col])[value_col].unstack()
+            return cls(wide.values, wide.index.values, wide.columns.values, None)
+        # Multiple features (3D): factorize ids into integer positions and
+        # fill the cube with one vectorised assignment, not a per-row loop.
+        date_codes, date_labels = pd.factorize(df[date_col])
+        ticker_codes, ticker_labels = pd.factorize(df[ticker_col])
+        values = np.full(
+            (len(date_labels), len(ticker_labels), len(value_cols)), np.nan
+        )
+        # Keep only the last row of each duplicated (date, ticker) pair so the
+        # result is deterministic. Rows with a missing date or ticker have code
+        # -1 and are skipped; -1 would otherwise index the last slot.
+        usable = ~df.duplicated(subset=[date_col, ticker_col], keep="last").to_numpy()
+        usable &= (date_codes >= 0) & (ticker_codes >= 0)
+        block = df[value_cols].to_numpy(dtype=float, na_value=np.nan)
+        values[date_codes[usable], ticker_codes[usable]] = block[usable]
+        return cls(
+            values, np.asarray(date_labels), np.asarray(ticker_labels), value_cols
+        )
+    def isna(self) -> np.ndarray:
+        """Return boolean mask of NaN values with same shape as values."""
+        return np.isnan(self.values)
+    def dropna(self, axis: int = 0, how: str = "any") -> PanelData:
+        """
+        Remove missing values.
+        Parameters
+        ----------
+        axis : int
+            Axis along which to drop. 0=time, 1=assets, 2=features (for 3D)
+        how : str
+            'any' or 'all'. If 'any', drop if any NA. If 'all', drop only if all NA.
+        Returns
+        -------
+        PanelData
+            New PanelData with NA removed
+        Raises
+        ------
+        ValueError
+            If ``how`` is not 'any'/'all', ``axis`` is not 0, 1 or 2, or
+            ``axis`` is 2 on 2D data.
+        """
+        # Reject typos instead of silently treating them as 'all' / axis 2.
+        if how not in ("any", "all"):
+            raise ValueError(f"how must be 'any' or 'all', got {how!r}")
+        if axis not in (0, 1, 2):
+            raise ValueError(f"axis must be 0, 1 or 2, got {axis!r}")
+        if axis == 2 and self.values.ndim != 3:
+            raise ValueError("Cannot drop features from 2D data")
+        mask = self.isna()
+        # Reduce over every axis except the one being filtered.
+        other_axes = tuple(i for i in range(mask.ndim) if i != axis)
+        if how == "any":
+            keep_mask = ~mask.any(axis=other_axes)
+        else:
+            keep_mask = ~mask.all(axis=other_axes)
+        if axis == 0:
+            return PanelData(
+                self.values[keep_mask], self.dates[keep_mask], self.tickers, self.features
+            )
+        if axis == 1:
+            return PanelData(
+                self.values[:, keep_mask], self.dates, self.tickers[keep_mask], self.features
+            )
+        new_features = (
+            [name for name, kept in zip(self.features, keep_mask) if kept]
+            if self.features
+            else None
+        )
+        return PanelData(
+            self.values[:, :, keep_mask], self.dates, self.tickers, new_features
+        )
+    def copy(self) -> PanelData:
+        """Return a deep copy of the PanelData."""
+        return PanelData(
+            self.values.copy(),
+            self.dates.copy(),
+            self.tickers.copy(),
+            list(self.features) if self.features is not None else None,
+        )
+    def __repr__(self) -> str:
+        """Summarise shape, date span and asset count; safe on empty panels."""
+        if self.n_periods:
+            date_span = f"{self.dates[0]} to {self.dates[-1]}"
+        else:
+            # No first/last date to show; indexing dates[0] would raise.
+            date_span = "empty"
+        return (
+            f"PanelData(shape={self.shape}, "
+            f"dates={date_span}, "
+            f"n_assets={self.n_assets})"
+        )