qfeaturelib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,133 @@
"""
QFeatureLib - High-performance feature engineering for quantitative investment.

This library provides efficient, production-grade tools for financial feature engineering
with strict prevention of data leakage (look-ahead bias, also called the 'future function'
problem) and optimized NumPy-based computations.

Key Modules
-----------
- core: PanelData structure and validation utilities
- standardization: Time-series and cross-sectional standardization
- splitting: Rolling and expanding window sample splitting
- imputation: Missing value handling
- neutralization: Feature neutralization via regression
- utils: Macro-indicator helpers (the ``macro_*`` transforms and ``adapt_macro_to_panel``)

Every name listed in ``__all__`` below is re-exported at package level, so
``from qfeaturelib import rolling_zscore`` works as well as importing it from
its subpackage.

Quick Start
-----------
>>> import numpy as np
>>> from qfeaturelib import PanelData
>>> from qfeaturelib.standardization import rolling_zscore, cs_zscore
>>>
>>> # Create panel data
>>> values = np.random.randn(100, 50, 5)  # 100 days, 50 stocks, 5 features
>>> dates = np.arange(100)
>>> tickers = [f'STOCK_{i}' for i in range(50)]
>>> panel = PanelData(values, dates, tickers)
>>>
>>> # Rolling Z-score (prevent future leakage with shift=1)
>>> zscore_values = rolling_zscore(panel.values[..., 0], window=20, shift=1)
>>>
>>> # Cross-sectional Z-score
>>> cs_values = cs_zscore(panel.values[..., 0])
"""

# Package version string; also exported through ``__all__``.
__version__ = "0.1.0"

# Core: the panel container and the validation exception types.
from .core.panel_data import PanelData
from .core.validators import FutureFunctionError, ValidationError

# Standardization: plain, rolling (``rolling_*``) and cross-sectional (``cs_*``) scalers.
from .standardization import (
    cs_minmax,
    cs_rank,
    cs_robust_zscore,
    cs_zscore,
    minmax_scale,
    rank_scale,
    robust_zscore,
    rolling_minmax,
    rolling_robust_zscore,
    rolling_zscore,
    winsorize,
    zscore,
)

# Splitting: window-based sample splitters and their index container.
from .splitting import (
    ExpandingWindowSplitter,
    RollingWindowSplitter,
    SplitIndices,
)

# Imputation: fill helpers for missing values.
from .imputation import (
    bfill,
    cs_mean_fill,
    cs_median_fill,
    ffill,
    ffill_limit,
)

# Neutralization
from .neutralization import (
    industry_neutralize,
    neutralize,
    size_neutralize,
)

# Utils (Macro indicators)
from .utils import (
    adapt_macro_to_panel,
    macro_expanding_zscore,
    macro_momentum,
    macro_rolling_minmax,
    macro_rolling_rank,
    macro_rolling_zscore,
    macro_yoy_change,
)

# Public API of the package: controls ``from qfeaturelib import *`` and
# mirrors, group by group, the imports above.
__all__ = [
    # Version
    "__version__",
    # Core
    "PanelData",
    "FutureFunctionError",
    "ValidationError",
    # Standardization
    "zscore",
    "robust_zscore",
    "minmax_scale",
    "rank_scale",
    "winsorize",
    "rolling_zscore",
    "rolling_robust_zscore",
    "rolling_minmax",
    "cs_zscore",
    "cs_robust_zscore",
    "cs_minmax",
    "cs_rank",
    # Splitting
    "SplitIndices",
    "RollingWindowSplitter",
    "ExpandingWindowSplitter",
    # Imputation
    "ffill",
    "bfill",
    "ffill_limit",
    "cs_median_fill",
    "cs_mean_fill",
    # Neutralization
    "neutralize",
    "industry_neutralize",
    "size_neutralize",
    # Utils (Macro indicators)
    "macro_rolling_zscore",
    "macro_expanding_zscore",
    "macro_rolling_minmax",
    "macro_rolling_rank",
    "macro_yoy_change",
    "macro_momentum",
    "adapt_macro_to_panel",
]
File without changes
@@ -0,0 +1,360 @@
1
+ """
2
+ Panel data structure for quantitative finance.
3
+
4
+ This module provides a standardized data structure for handling 3D panel data
5
+ (time x assets x features) commonly used in quantitative investment.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ if TYPE_CHECKING:
17
+ pass
18
+
19
+
@dataclass
class PanelData:
    """
    Standardized panel data structure for quantitative finance.

    This class represents a 3D panel data structure with dimensions (T, N, F):
    - T: Time periods
    - N: Number of assets/tickers
    - F: Number of features

    For 2D data (single feature), values will have shape (T, N).

    All array-like inputs are converted with ``np.asarray`` on construction and
    the lengths of ``dates``, ``tickers`` and (for 3D data) ``features`` are
    validated against the shape of ``values``.

    Attributes
    ----------
    values : np.ndarray
        The data array with shape (T, N) or (T, N, F)
    dates : np.ndarray
        Array of date indices with shape (T,)
    tickers : np.ndarray
        Array of ticker symbols with shape (N,)
    features : Optional[List[str]]
        List of feature names. If None and values is 3D, features are auto-named
        ``feature_0``, ``feature_1``, ...

    Raises
    ------
    ValueError
        If ``values`` is not 2D/3D, or if ``dates``, ``tickers`` or ``features``
        do not match the corresponding dimension of ``values``.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from qfeaturelib.core import PanelData
    >>>
    >>> # Create 2D panel data (single feature)
    >>> values = np.random.randn(100, 50)  # 100 days, 50 stocks
    >>> dates = pd.date_range('2020-01-01', periods=100)
    >>> tickers = [f'STOCK_{i}' for i in range(50)]
    >>> panel = PanelData(values, dates, tickers)
    >>>
    >>> # Create 3D panel data (multiple features)
    >>> values_3d = np.random.randn(100, 50, 5)  # 100 days, 50 stocks, 5 features
    >>> features = ['open', 'high', 'low', 'close', 'volume']
    >>> panel_3d = PanelData(values_3d, dates, tickers, features)
    >>>
    >>> # Integer indices on the time/asset axes keep the axis (length 1)
    >>> panel_3d[0].shape
    (1, 50, 5)
    """

    values: np.ndarray
    dates: np.ndarray
    tickers: np.ndarray
    features: Optional[List[str]] = None

    def __post_init__(self) -> None:
        """Validate inputs and set default feature names."""
        self.values = np.asarray(self.values)
        self.dates = np.asarray(self.dates)
        self.tickers = np.asarray(self.tickers)

        if self.values.ndim not in (2, 3):
            raise ValueError(
                f"values must be 2D or 3D array, got shape {self.values.shape}"
            )

        t, n = self.values.shape[0], self.values.shape[1]

        if len(self.dates) != t:
            raise ValueError(
                f"dates length ({len(self.dates)}) must match values time dimension ({t})"
            )

        if len(self.tickers) != n:
            raise ValueError(
                f"tickers length ({len(self.tickers)}) must match values asset dimension ({n})"
            )

        if self.values.ndim == 3:
            n_features = self.values.shape[2]
            if self.features is None:
                # Auto-generate feature names for 3D data
                self.features = [f"feature_{i}" for i in range(n_features)]
            elif len(self.features) != n_features:
                raise ValueError(
                    f"features length ({len(self.features)}) must match "
                    f"values feature dimension ({n_features})"
                )

    @property
    def n_periods(self) -> int:
        """Number of time periods (T)."""
        return self.values.shape[0]

    @property
    def n_assets(self) -> int:
        """Number of assets (N)."""
        return self.values.shape[1]

    @property
    def n_features(self) -> int:
        """Number of features (F). Returns 1 for 2D data."""
        return self.values.shape[2] if self.values.ndim == 3 else 1

    @property
    def shape(self) -> Union[Tuple[int, int], Tuple[int, int, int]]:
        """Shape of values array."""
        return self.values.shape

    def __len__(self) -> int:
        """Return number of time periods."""
        return self.n_periods

    def _expand_ellipsis(self, index: Tuple) -> Tuple:
        """Replace a single ``...`` in *index* by the full slices it stands for."""
        spots = [i for i, k in enumerate(index) if k is Ellipsis]
        if not spots:
            return index
        if len(spots) > 1:
            raise IndexError("an index can only have a single ellipsis ('...')")
        pos = spots[0]
        n_fill = max(self.values.ndim - (len(index) - 1), 0)
        return index[:pos] + (slice(None),) * n_fill + index[pos + 1:]

    @staticmethod
    def _keep_axis(k, size: int):
        """Turn an integer index into a bounds-checked length-1 slice."""
        if isinstance(k, (int, np.integer)) and not isinstance(k, (bool, np.bool_)):
            i = int(k)
            if i < -size or i >= size:
                raise IndexError(
                    f"index {i} is out of bounds for axis with size {size}"
                )
            if i < 0:
                i += size
            return slice(i, i + 1)
        return k

    def _select_features(self, feature_key) -> List[str]:
        """Return the feature names selected by *feature_key* as a list."""
        picked = np.asarray(self.features, dtype=object)[feature_key]
        # An integer key yields the bare name; everything else yields an array.
        return [picked] if np.ndim(picked) == 0 else list(picked)

    def __getitem__(self, key: Union[int, slice, Tuple]) -> "PanelData":
        """
        Index into panel data.

        Integer indices on the time and asset axes select a single period or
        asset but keep that axis with length 1, so the result is always a valid
        panel. An integer index on the feature axis of 3D data drops the feature
        axis and yields a 2D panel whose ``features`` holds the selected name.

        Parameters
        ----------
        key : int, slice, or tuple
            Index for time dimension or tuple for multi-dimensional indexing
            ``(time, asset[, feature])``. ``...`` is expanded to full slices.

        Returns
        -------
        PanelData
            New PanelData with indexed values, dates, tickers and features

        Raises
        ------
        IndexError
            If an integer index is out of bounds or more indices are given than
            ``values`` has dimensions.
        """
        index = key if isinstance(key, tuple) else (key,)
        index = self._expand_ellipsis(index)
        if not index:
            index = (slice(None),)
        if len(index) > self.values.ndim:
            raise IndexError(
                f"too many indices for PanelData: got {len(index)}, "
                f"values has {self.values.ndim} dimensions"
            )

        # Without this promotion NumPy would drop the indexed axis and the
        # constructor below would reject the result.
        index = tuple(
            self._keep_axis(k, self.values.shape[axis]) if axis < 2 else k
            for axis, k in enumerate(index)
        )

        new_values = self.values[index]
        new_dates = self.dates[index[0]]
        new_tickers = self.tickers[index[1]] if len(index) > 1 else self.tickers

        new_features = self.features
        if len(index) > 2 and self.features is not None:
            new_features = self._select_features(index[2])

        return PanelData(new_values, new_dates, new_tickers, new_features)

    def _wide_to_long(self, matrix: np.ndarray, value_name: str) -> pd.DataFrame:
        """Melt one (T, N) slice into long ``[date, ticker, <value_name>]`` rows."""
        wide = pd.DataFrame(
            matrix,
            index=pd.DatetimeIndex(self.dates),
            columns=self.tickers,
        )
        wide.index.name = "date"
        return wide.reset_index().melt(
            id_vars=["date"], var_name="ticker", value_name=value_name
        )

    def to_frame(self, feature_idx: Optional[int] = None) -> pd.DataFrame:
        """
        Convert panel data to long-format DataFrame.

        ``dates`` is passed through ``pd.DatetimeIndex``. Missing values are
        kept as NaN rows in every output layout.

        Parameters
        ----------
        feature_idx : Optional[int]
            If values is 3D, select this feature index. If None and 3D,
            converts each feature to a separate column. Ignored for 2D data.

        Returns
        -------
        pd.DataFrame
            Long-format DataFrame with columns [date, ticker, value] or
            [date, ticker, feature_0, feature_1, ...] for 3D data
        """
        if self.values.ndim == 2:
            return self._wide_to_long(self.values, "value")

        if feature_idx is not None:
            # Single feature of a 3D panel
            feature_name = (
                self.features[feature_idx] if self.features else f"feature_{feature_idx}"
            )
            return self._wide_to_long(self.values[:, :, feature_idx], feature_name)

        # All features as separate columns. Building the (date, ticker) index
        # explicitly keeps every cell, whereas DataFrame.stack() may drop NaN
        # cells depending on the pandas version.
        n_features = self.n_features
        names = (
            list(self.features)
            if self.features
            else [f"feature_{i}" for i in range(n_features)]
        )
        long_index = pd.MultiIndex.from_product(
            [pd.DatetimeIndex(self.dates), self.tickers], names=["date", "ticker"]
        )
        flat = self.values.reshape(self.n_periods * self.n_assets, n_features)
        return pd.DataFrame(flat, index=long_index, columns=names).reset_index()

    @classmethod
    def from_frame(
        cls,
        df: pd.DataFrame,
        date_col: str = "date",
        ticker_col: str = "ticker",
        value_col: Optional[str] = None,
        pivot: bool = True,
    ) -> "PanelData":
        """
        Create PanelData from a DataFrame.

        Parameters
        ----------
        df : pd.DataFrame
            Input DataFrame in long format (one row per date/ticker pair).
            It is not modified.
        date_col : str
            Name of the date column; converted with ``pd.to_datetime``
        ticker_col : str
            Name of the ticker column
        value_col : Optional[str]
            Name of the value column. If None and multiple non-id columns exist,
            treats them as multiple features (3D data). If None and exactly one
            non-id column exists, that column is used.
        pivot : bool
            Only used for single-feature (2D) data. True uses ``DataFrame.pivot``;
            False uses ``set_index(...).unstack()``. Both expect long-format
            input and produce the same dates x tickers table.

        Returns
        -------
        PanelData
            PanelData instance. For 2D data dates and tickers come out sorted
            (pandas pivot/unstack behaviour); for 3D data they keep their order
            of first appearance in ``df``.

        Raises
        ------
        ValueError
            If ``df`` has no columns besides the date and ticker columns.

        Notes
        -----
        In the 3D path, rows whose date or ticker is missing are skipped, and
        when a (date, ticker) pair occurs more than once the last row wins.
        """
        df = df.copy()

        # Ensure date column is datetime
        df[date_col] = pd.to_datetime(df[date_col])

        id_cols = {date_col, ticker_col}
        value_cols = [c for c in df.columns if c not in id_cols]

        if not value_cols:
            raise ValueError("No value columns found in DataFrame")

        if value_col is None and len(value_cols) == 1:
            # Only one candidate column: this is really single-feature data.
            value_col = value_cols[0]

        if value_col is not None:
            # Single feature (2D)
            if pivot:
                pivoted = df.pivot(index=date_col, columns=ticker_col, values=value_col)
            else:
                pivoted = df.set_index([date_col, ticker_col])[value_col].unstack()

            return cls(
                pivoted.values,
                pivoted.index.values,
                pivoted.columns.values,
                None,
            )

        # Multiple features (3D): integer-code dates and tickers once, then
        # scatter all rows into the cube in a single vectorized assignment.
        date_codes, date_index = pd.factorize(df[date_col])
        ticker_codes, ticker_index = pd.factorize(df[ticker_col])

        values = np.full(
            (len(date_index), len(ticker_index), len(value_cols)), np.nan
        )
        block = df[value_cols].to_numpy(dtype=float, na_value=np.nan)

        # factorize marks missing keys with -1; such rows have no cell.
        usable = (date_codes >= 0) & (ticker_codes >= 0)
        # Keep only the last row of each (date, ticker) pair so the result
        # does not depend on how NumPy resolves repeated indices.
        usable &= ~df.duplicated(subset=[date_col, ticker_col], keep="last").to_numpy()

        values[date_codes[usable], ticker_codes[usable]] = block[usable]

        return cls(values, date_index.to_numpy(), ticker_index.to_numpy(), value_cols)

    def isna(self) -> np.ndarray:
        """Return boolean mask of NaN values with same shape as values."""
        try:
            return np.isnan(self.values)
        except TypeError:
            # np.isnan is undefined for object/string data; pd.isna handles it.
            return np.asarray(pd.isna(self.values))

    def dropna(self, axis: int = 0, how: str = "any") -> "PanelData":
        """
        Remove missing values.

        Parameters
        ----------
        axis : int
            Axis along which to drop. 0=time, 1=assets, 2=features (for 3D)
        how : str
            'any' or 'all'. If 'any', drop if any NA. If 'all', drop only if all NA.

        Returns
        -------
        PanelData
            New PanelData with NA removed

        Raises
        ------
        ValueError
            If ``how`` or ``axis`` is not one of the supported values, or if
            ``axis=2`` is requested for 2D data.
        """
        if how not in ("any", "all"):
            raise ValueError(f"how must be 'any' or 'all', got {how!r}")
        if axis not in (0, 1, 2):
            raise ValueError(f"axis must be 0, 1 or 2, got {axis!r}")
        if axis == 2 and self.values.ndim != 3:
            raise ValueError("Cannot drop features from 2D data")

        missing = self.isna()
        # Reduce over every axis except the one being filtered.
        other_axes = tuple(i for i in range(missing.ndim) if i != axis)
        if how == "any":
            keep_mask = ~missing.any(axis=other_axes)
        else:
            keep_mask = ~missing.all(axis=other_axes)

        if axis == 0:
            return PanelData(
                self.values[keep_mask], self.dates[keep_mask], self.tickers, self.features
            )
        if axis == 1:
            return PanelData(
                self.values[:, keep_mask], self.dates, self.tickers[keep_mask], self.features
            )

        new_features = (
            [name for name, keep in zip(self.features, keep_mask) if keep]
            if self.features
            else None
        )
        return PanelData(self.values[:, :, keep_mask], self.dates, self.tickers, new_features)

    def copy(self) -> "PanelData":
        """Return a deep copy of the PanelData."""
        return PanelData(
            self.values.copy(),
            self.dates.copy(),
            self.tickers.copy(),
            list(self.features) if self.features else None,
        )

    def __repr__(self) -> str:
        """String representation of PanelData."""
        shape_str = f"({self.n_periods}, {self.n_assets})"
        if self.n_features > 1:
            shape_str = f"({self.n_periods}, {self.n_assets}, {self.n_features})"

        # An empty panel has no first/last date to show.
        if self.n_periods:
            span = f"{self.dates[0]} to {self.dates[-1]}"
        else:
            span = "<empty>"

        return (
            f"PanelData(shape={shape_str}, "
            f"dates={span}, "
            f"n_assets={self.n_assets})"
        )