aptree 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptree-0.1.0.dist-info/METADATA +663 -0
- aptree-0.1.0.dist-info/RECORD +13 -0
- aptree-0.1.0.dist-info/WHEEL +4 -0
- aptree-0.1.0.dist-info/licenses/LICENSE +21 -0
- ptree/__init__.py +95 -0
- ptree/criteria.py +451 -0
- ptree/data_handler.py +219 -0
- ptree/engine.py +1303 -0
- ptree/ensemble.py +657 -0
- ptree/node.py +130 -0
- ptree/predictors.py +619 -0
- ptree/py.typed +0 -0
- ptree/visualization.py +305 -0
|
@@ -0,0 +1,663 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aptree
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A supervised clustering algorithm for panel data, commonly used in quantitative finance to identify time-varying, cross-sectional predictability regimes.
|
|
5
|
+
Project-URL: Homepage, https://github.com/ElenYoung/AssetPanelTree
|
|
6
|
+
Project-URL: Repository, https://github.com/ElenYoung/AssetPanelTree.git
|
|
7
|
+
Project-URL: Documentation, https://github.com/ElenYoung/AssetPanelTree#readme
|
|
8
|
+
Project-URL: Issues, https://github.com/ElenYoung/AssetPanelTree/issues
|
|
9
|
+
Author-email: ElenYoung <elenyoung@example.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: asset-pricing,decision-tree,machine-learning,panel-data,predictability,quantitative-finance,supervised-clustering
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Office/Business :: Financial :: Investment
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: numpy>=1.23.0
|
|
28
|
+
Requires-Dist: pandas>=1.5.0
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: joblib>=1.2.0; extra == 'all'
|
|
31
|
+
Requires-Dist: lightgbm>=3.3.0; extra == 'all'
|
|
32
|
+
Requires-Dist: matplotlib>=3.5.0; extra == 'all'
|
|
33
|
+
Requires-Dist: mypy>=1.0.0; extra == 'all'
|
|
34
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'all'
|
|
35
|
+
Requires-Dist: pytest>=7.0.0; extra == 'all'
|
|
36
|
+
Requires-Dist: ruff>=0.1.0; extra == 'all'
|
|
37
|
+
Requires-Dist: scipy>=1.9.0; extra == 'all'
|
|
38
|
+
Requires-Dist: seaborn>=0.12.0; extra == 'all'
|
|
39
|
+
Provides-Extra: boost
|
|
40
|
+
Requires-Dist: lightgbm>=3.3.0; extra == 'boost'
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
43
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
45
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
46
|
+
Provides-Extra: fast
|
|
47
|
+
Requires-Dist: scipy>=1.9.0; extra == 'fast'
|
|
48
|
+
Provides-Extra: parallel
|
|
49
|
+
Requires-Dist: joblib>=1.2.0; extra == 'parallel'
|
|
50
|
+
Provides-Extra: viz
|
|
51
|
+
Requires-Dist: matplotlib>=3.5.0; extra == 'viz'
|
|
52
|
+
Requires-Dist: seaborn>=0.12.0; extra == 'viz'
|
|
53
|
+
Description-Content-Type: text/markdown
|
|
54
|
+
|
|
55
|
+
# Panel Tree (P-Tree)
|
|
56
|
+
|
|
57
|
+
[PyPI](https://pypi.org/project/aptree/)
|
|
58
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
59
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
60
|
+
|
|
61
|
+
A **supervised clustering algorithm** designed for **panel data**, commonly used in quantitative finance to identify time-varying, cross-sectional predictability regimes.
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install aptree
|
|
67
|
+
|
|
68
|
+
# With visualization support (matplotlib, seaborn)
|
|
69
|
+
pip install "aptree[viz]"
|
|
70
|
+
|
|
71
|
+
# For development
|
|
72
|
+
pip install "aptree[dev]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Core Idea
|
|
76
|
+
|
|
77
|
+
P-Tree recursively splits the full sample into disjoint leaf nodes using asset characteristics or macro states as thresholds. Unlike standard decision trees that minimise residual MSE, P-Tree **maximises the difference in predictive performance across child nodes**, producing a *prediction mosaic* — a map showing where and when alpha is concentrated.
|
|
78
|
+
|
|
79
|
+
### Key Differentiators
|
|
80
|
+
|
|
81
|
+
| Feature | Standard Decision Tree | P-Tree |
|
|
82
|
+
|---------|----------------------|--------|
|
|
83
|
+
| **Objective** | Minimise residual MSE/Gini | Maximise predictability difference |
|
|
84
|
+
| **Leaf Model** | Constant (mean) | Ridge regression / Logit |
|
|
85
|
+
| **Use Case** | Point prediction | Regime identification |
|
|
86
|
+
| **Output** | Single prediction | Prediction mosaic |
|
|
87
|
+
|
|
88
|
+
### Algorithm Overview
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
92
|
+
│ Full Sample │
|
|
93
|
+
│ (all time × assets) │
|
|
94
|
+
└──────────────────────────┬──────────────────────────────────────┘
|
|
95
|
+
│
|
|
96
|
+
┌─────────────────┴─────────────────┐
|
|
97
|
+
│ For each (feature, threshold): │
|
|
98
|
+
│ 1. Split into Left & Right │
|
|
99
|
+
│ 2. Fit Ridge on each subset │
|
|
100
|
+
│ 3. Compute R² for each │
|
|
101
|
+
│ 4. Score = |R²_L - R²_R| │
|
|
102
|
+
└─────────────────┬─────────────────┘
|
|
103
|
+
│
|
|
104
|
+
Select split with max score
|
|
105
|
+
│
|
|
106
|
+
┌─────────────────┴─────────────────┐
|
|
107
|
+
▼ ▼
|
|
108
|
+
┌──────────┐ ┌──────────┐
|
|
109
|
+
│ Left Node│ │Right Node│
|
|
110
|
+
│ (low val)│ │(high val)│
|
|
111
|
+
└────┬─────┘ └────┬─────┘
|
|
112
|
+
│ │
|
|
113
|
+
▼ ▼
|
|
114
|
+
Recurse or Recurse or
|
|
115
|
+
become Leaf become Leaf
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Project Structure
|
|
119
|
+
|
|
120
|
+
```
|
|
121
|
+
src/ptree/
|
|
122
|
+
├── __init__.py # Package exports
|
|
123
|
+
├── data_handler.py # DataHandler – alignment, missing-value fill, rank standardisation, volatility
|
|
124
|
+
├── predictors.py # PredictorBase, RidgeRegressor, VolWeightedRidgeRegressor, RidgeLogitClassifier, ElasticNetRegressor, PLSRegressor, SelfDefinedPredictor
|
|
125
|
+
├── criteria.py # CriterionBase, R2DiffCriterion, WeightedR2DiffCriterion, MeanVarianceCriterion, ClassificationCriterion, evaluation helpers
|
|
126
|
+
├── node.py # PanelTreeNode – per-node metadata container
|
|
127
|
+
├── engine.py # PanelTreeEngine – recursive splitting, cost-complexity pruning, honest splits, incremental matrix updates, feature-priority caching, joblib parallelism
|
|
128
|
+
├── ensemble.py # PanelForest (P-Forest bagging), BoostedPanelTree (P-Boost residual boosting)
|
|
129
|
+
└── visualization.py # NodeReporter (text/DataFrame reports), MosaicVisualizer (heatmap)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
## Quick Start
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import numpy as np
|
|
137
|
+
import pandas as pd
|
|
138
|
+
from ptree import DataHandler, RidgeRegressor, R2DiffCriterion, PanelTreeEngine
|
|
139
|
+
from ptree import NodeReporter, MosaicVisualizer
|
|
140
|
+
|
|
141
|
+
# 1. Prepare panel data (DataFrame with date, asset_id, and feature columns)
|
|
142
|
+
dh = DataHandler(cs_rank_standardize=True)
|
|
143
|
+
X, y, vol_weights = dh.fit_transform(
|
|
144
|
+
df, y_series,
|
|
145
|
+
time_col="date", entity_col="asset_id",
|
|
146
|
+
ret_series_for_vol=ret_series, # optional, for VolWeightedRidge
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# 2. Build the tree
|
|
150
|
+
engine = PanelTreeEngine(
|
|
151
|
+
predictor=RidgeRegressor(alpha=1.0),
|
|
152
|
+
criterion=R2DiffCriterion(),
|
|
153
|
+
split_thresholds=[0.3, 0.5, 0.7],
|
|
154
|
+
max_depth=3,
|
|
155
|
+
min_samples=100,
|
|
156
|
+
fast_mode=False,
|
|
157
|
+
verbose=1,
|
|
158
|
+
)
|
|
159
|
+
engine.fit(X, y, feature_names=dh.feature_names, weights=vol_weights)
|
|
160
|
+
|
|
161
|
+
# 3. Inspect results
|
|
162
|
+
reporter = NodeReporter(engine)
|
|
163
|
+
print(reporter.print_tree()) # text tree
|
|
164
|
+
print(reporter.leaf_summary()) # DataFrame
|
|
165
|
+
|
|
166
|
+
# 4. Prediction mosaic
|
|
167
|
+
viz = MosaicVisualizer(engine)
|
|
168
|
+
mosaic = viz.build_mosaic(X, y, time_col="date", metric="r2")
|
|
169
|
+
fig, ax = viz.plot_mosaic(mosaic) # requires matplotlib & seaborn
|
|
170
|
+
|
|
171
|
+
# 5. Retrieve leaf-node samples
|
|
172
|
+
for leaf_id, indices in engine.get_leaf_samples().items():
|
|
173
|
+
print(f"Leaf {leaf_id}: {len(indices)} observations")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Module Overview
|
|
177
|
+
|
|
178
|
+
### DataHandler
|
|
179
|
+
|
|
180
|
+
Handles panel data preprocessing including alignment, missing value imputation, cross-sectional rank standardisation, and volatility computation.
|
|
181
|
+
|
|
182
|
+
| Parameter | Default | Description |
|
|
183
|
+
|---|---|---|
|
|
184
|
+
| `cs_rank_standardize` | `True` | Cross-sectional rank normalisation to [0, 1] |
|
|
185
|
+
| `vol_window` | `60` | Rolling window for volatility computation |
|
|
186
|
+
| `min_obs` | `20` | Minimum observations for volatility calculation |
|
|
187
|
+
| `fillna_method` | `"ffill"` | Missing-value strategy (`ffill`, `bfill`, `zero`, `mean`, `None`) |
|
|
188
|
+
|
|
189
|
+
### Predictors
|
|
190
|
+
|
|
191
|
+
All predictors inherit from `PredictorBase` and implement `fit()` / `predict()`.
|
|
192
|
+
|
|
193
|
+
| Class | Use Case |
|
|
194
|
+
|---|---|
|
|
195
|
+
| `RidgeRegressor` | Standard Ridge regression (closed-form) |
|
|
196
|
+
| `VolWeightedRidgeRegressor` | Inverse-volatility weighted Ridge (handles heteroscedasticity) |
|
|
197
|
+
| `RidgeLogitClassifier` | Ridge logistic regression via IRLS |
|
|
198
|
+
| `ElasticNetRegressor` | L1+L2 coordinate-descent regression (sparse factor selection) |
|
|
199
|
+
| `PLSRegressor` | Partial least squares (handles highly-correlated factors) |
|
|
200
|
+
| `SelfDefinedPredictor` | User-defined model base class |
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
**Custom Predictor Example:**
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from ptree import SelfDefinedPredictor
|
|
207
|
+
|
|
208
|
+
class MyLGBPredictor(SelfDefinedPredictor):
|
|
209
|
+
def fit(self, X, y, weights=None):
|
|
210
|
+
import lightgbm as lgb
|
|
211
|
+
self.model = lgb.LGBMRegressor().fit(X, y, sample_weight=weights)
|
|
212
|
+
return self
|
|
213
|
+
|
|
214
|
+
def predict(self, X):
|
|
215
|
+
return self.model.predict(X)
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Criteria
|
|
219
|
+
|
|
220
|
+
Split-quality criteria evaluate whether a candidate split produces child nodes with meaningfully different predictability.
|
|
221
|
+
|
|
222
|
+
| Class | Description |
|
|
223
|
+
|---|---|
|
|
224
|
+
| `R2DiffCriterion` | Maximise \|R²_L − R²_R\| (regression, **default**) |
|
|
225
|
+
| `WeightedR2DiffCriterion` | \|R²_L − R²_R\| with balance / sample-size shrinkage / adjusted-R² penalties (stricter, anti-overfit variant) |
|
|
226
|
+
| `MeanVarianceCriterion` | Tangency (max) Sharpe of the two child long-short portfolios — aligns splits with the SDF / efficient-frontier objective (requires `fit(..., time_index=...)`) |
|
|
227
|
+
| `ClassificationCriterion` | Maximise difference in Precision / F1 / AUC / LogLoss (classification) |
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
### PanelTreeEngine
|
|
231
|
+
|
|
232
|
+
The main engine for building and querying Panel Trees.
|
|
233
|
+
|
|
234
|
+
| Parameter | Default | Description |
|
|
235
|
+
|---|---|---|
|
|
236
|
+
| `predictor` | `RidgeRegressor` | Leaf-node predictor (instance or class) |
|
|
237
|
+
| `criterion` | `R2DiffCriterion()` | Split-quality criterion |
|
|
238
|
+
| `split_thresholds` | `[0.3, 0.5, 0.7]` | Candidate split points on (rank-standardised) feature values. Pass `"adaptive"` to use per-node, per-feature quantile thresholds instead |
|
|
239
|
+
| `adaptive_quantiles` | `[0.25, 0.5, 0.75]` | Quantiles used when `split_thresholds="adaptive"` |
|
|
240
|
+
| `max_depth` | `3` | Maximum tree depth |
|
|
241
|
+
| `min_samples` | `100` | Minimum observations per node |
|
|
242
|
+
| `min_impurity_decrease` | `0.0` | Minimum criterion score the best split must reach; below it the node becomes a leaf |
|
|
243
|
+
| `honest` | `False` | Honest splits — fit leaf models on one in-node subset and evaluate split quality on a disjoint subset, removing the selection bias of fitting and scoring on the same data |
|
|
244
|
+
| `honest_frac` | `0.5` | Fraction of in-node samples held out as the honest evaluation set |
|
|
245
|
+
| `honest_refit_full` | `True` | Refit each final leaf model on the full in-node sample after honest split selection |
|
|
246
|
+
| `random_state` | `None` | Seed for honest splitting, random feature subsetting and the random splitter (reproducibility) |
|
|
247
|
+
| `fast_mode` | `False` | Enable feature-priority caching from parent nodes |
|
|
248
|
+
| `early_stopping_threshold` | `None` | Stop searching if criterion exceeds this value (requires `fast_mode`) |
|
|
249
|
+
| `n_jobs` | `1` | Parallel workers (`-1` = all cores), used for feature-dimension parallelism (requires `joblib`) |
|
|
250
|
+
| `parallel_backend` | `"threads"` | joblib backend for parallel feature evaluation (`"threads"` or `"processes"`) |
|
|
251
|
+
| `max_features` | `None` | Node-level random feature-subset size for splits (`"sqrt"`, `"log2"`, int, float or `None`); used by ensembles to decorrelate trees |
|
|
252
|
+
| `splitter` | `"best"` | `"best"` exhaustively scans `split_thresholds`; `"random"` draws random thresholds (Extra-Trees style) |
|
|
253
|
+
| `n_random_splits` | `1` | Number of random thresholds drawn per feature when `splitter="random"` |
|
|
254
|
+
| `keep_node_stats` | `False` | Retain per-node cached matrices after splitting (uses more memory; useful for debugging) |
|
|
255
|
+
| `verbose` | `1` | Logging verbosity (0=silent, 1=per-level, 2=per-candidate) |
|
|
256
|
+
|
|
257
|
+
**Pruning / honest helpers**
|
|
258
|
+
|
|
259
|
+
| Method | Description |
|
|
260
|
+
|---|---|
|
|
261
|
+
| `engine.prune(ccp_alpha)` | Cost-complexity post-pruning: bottom-up, collapse subtrees whose score gain does not justify their leaf-count penalty `ccp_alpha` |
|
|
262
|
+
| `engine.cost_complexity_pruning_path()` | Return `(ccp_alphas, n_leaves, scores)` to help select `ccp_alpha` |
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
## Output & Query API Reference
|
|
266
|
+
|
|
267
|
+
P-Tree provides rich output and query interfaces across four main classes: `PanelTreeEngine`, `PanelTreeNode`, `NodeReporter`, and `MosaicVisualizer`.
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
### PanelTreeEngine Methods
|
|
272
|
+
|
|
273
|
+
#### `engine.predict(X) → np.ndarray`
|
|
274
|
+
|
|
275
|
+
Generate per-sample predictions on new data. Each observation traverses down the tree to its corresponding leaf node, which provides the prediction using its local model.
|
|
276
|
+
|
|
277
|
+
```python
|
|
278
|
+
preds = engine.predict(X_proc) # shape: (n_samples,)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
#### `engine.get_leaves() → List[PanelTreeNode]`
|
|
282
|
+
|
|
283
|
+
Return a list of all leaf node objects.
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
for leaf in engine.get_leaves():
|
|
287
|
+
print(f"Leaf {leaf.node_id}: R²={leaf.metrics.get('r2', float('nan')):.4f}, n={leaf.n_samples}")
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
#### `engine.get_all_nodes() → List[PanelTreeNode]`
|
|
291
|
+
|
|
292
|
+
Return all nodes in the tree (BFS order), including both internal nodes and leaves.
|
|
293
|
+
|
|
294
|
+
```python
|
|
295
|
+
all_nodes = engine.get_all_nodes()
|
|
296
|
+
print(f"Total nodes: {len(all_nodes)}")
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
#### `engine.get_node_report() → pd.DataFrame`
|
|
300
|
+
|
|
301
|
+
Return a structured DataFrame with one row per node containing the following columns:
|
|
302
|
+
|
|
303
|
+
| Column | Description |
|
|
304
|
+
|---|---|
|
|
305
|
+
| `Node_ID` | Unique node identifier |
|
|
306
|
+
| `Depth` | Node depth (root = 0) |
|
|
307
|
+
| `Rule` | Full path rule from root, e.g., `root & char_1 >= 0.5 & char_3 < 0.7` |
|
|
308
|
+
| `Is_Leaf` | Whether the node is a leaf |
|
|
309
|
+
| `N_Samples` | Number of samples in the node |
|
|
310
|
+
| `Sample_Ratio` | Ratio of samples relative to total |
|
|
311
|
+
| `Split_Feature` | Feature used for splitting (NaN for leaves) |
|
|
312
|
+
| `Split_Threshold` | Split threshold value (NaN for leaves) |
|
|
313
|
+
| `Split_Score` | Criterion score at split |
|
|
314
|
+
| `Predictability_Score` | Predictability strength (R² for regression, Precision for classification) |
|
|
315
|
+
| `Metrics` | Full metrics dictionary, e.g., `{"r2": 0.63, "mse": 0.22, "n_samples": 2429}` |
|
|
316
|
+
| `Model_Weights` | Feature coefficients of the leaf model |
|
|
317
|
+
| `Elapsed_Time_s` | Time spent building the node (seconds) |
|
|
318
|
+
| `Parent_ID` | Parent node ID |
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
report = engine.get_node_report()
|
|
322
|
+
print(report[["Node_ID", "Depth", "Rule", "Predictability_Score", "N_Samples"]])
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
#### `engine.get_leaf_samples() → Dict[int, np.ndarray]`
|
|
326
|
+
|
|
327
|
+
Return a dictionary mapping leaf `node_id` to an array of original sample row indices. Useful for extracting the raw data corresponding to each cluster.
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
leaf_samples = engine.get_leaf_samples()
|
|
331
|
+
for leaf_id, indices in leaf_samples.items():
|
|
332
|
+
subset = original_df.iloc[indices]
|
|
333
|
+
print(f"Leaf {leaf_id}: {len(indices)} samples, "
|
|
334
|
+
f"mean_return={subset['ret'].mean():.4f}, "
|
|
335
|
+
f"unique_assets={subset['asset_id'].nunique()}")
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
---
|
|
339
|
+
|
|
340
|
+
### PanelTreeNode Methods
|
|
341
|
+
|
|
342
|
+
Node objects can be obtained via `engine.get_leaves()` or `engine.get_all_nodes()`.
|
|
343
|
+
|
|
344
|
+
#### `node.n_samples → int`
|
|
345
|
+
|
|
346
|
+
Number of samples contained in this node (read-only property).
|
|
347
|
+
|
|
348
|
+
#### `node.metrics → Dict[str, float]`
|
|
349
|
+
|
|
350
|
+
Evaluation metrics dictionary. For regression: `r2`, `mse`, `n_samples`. For classification: `precision`, `f1`, `auc`, `n_samples`.
|
|
351
|
+
|
|
352
|
+
```python
|
|
353
|
+
leaf = engine.get_leaves()[0]
|
|
354
|
+
print(leaf.metrics) # {"r2": 0.63, "mse": 0.22, "n_samples": 2429}
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
#### `node.get_model_weights() → np.ndarray | None`
|
|
358
|
+
|
|
359
|
+
Return the feature coefficient vector of the leaf node's local model. Useful for inspecting which factors are active in a specific regime.
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
for leaf in engine.get_leaves():
|
|
363
|
+
coef = leaf.get_model_weights()
|
|
364
|
+
if coef is not None:
|
|
365
|
+
for name, w in zip(dh.feature_names, coef):
|
|
366
|
+
print(f" {name}: {w:+.4f}")
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
#### `node.get_samples() → np.ndarray | None`
|
|
370
|
+
|
|
371
|
+
Return sample row indices belonging to this node. Similar to `engine.get_leaf_samples()`, but can be used for any node (including internal nodes).
|
|
372
|
+
|
|
373
|
+
```python
|
|
374
|
+
node = engine.get_all_nodes()[1] # Second node
|
|
375
|
+
indices = node.get_samples()
|
|
376
|
+
print(f"Node {node.node_id} contains {len(indices)} samples")
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
#### `node.to_dict() → Dict[str, Any]`
|
|
380
|
+
|
|
381
|
+
Serialise all node metadata to a flat dictionary, convenient for building DataFrames or exporting to JSON.
|
|
382
|
+
|
|
383
|
+
```python
|
|
384
|
+
import json
|
|
385
|
+
leaf = engine.get_leaves()[0]
|
|
386
|
+
print(json.dumps(leaf.to_dict(), indent=2, default=str))
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
#### Common Read-Only Attributes
|
|
390
|
+
|
|
391
|
+
| Attribute | Type | Description |
|
|
392
|
+
|---|---|---|
|
|
393
|
+
| `node.node_id` | `int` | Unique identifier |
|
|
394
|
+
| `node.depth` | `int` | Depth level |
|
|
395
|
+
| `node.rule` | `str` | Path description, e.g., `root & char_1 < 0.5 & char_3 >= 0.7` |
|
|
396
|
+
| `node.split_feature` | `str \| None` | Split feature name |
|
|
397
|
+
| `node.split_threshold` | `float \| None` | Split threshold |
|
|
398
|
+
| `node.split_score` | `float \| None` | Criterion score at split |
|
|
399
|
+
| `node.is_leaf` | `bool` | Whether this is a leaf |
|
|
400
|
+
| `node.sample_ratio` | `float` | Sample coverage ratio |
|
|
401
|
+
| `node.elapsed_time` | `float` | Build time (seconds) |
|
|
402
|
+
| `node.predictor` | `PredictorBase` | Trained local model instance |
|
|
403
|
+
|
|
404
|
+
---
|
|
405
|
+
|
|
406
|
+
### NodeReporter Methods
|
|
407
|
+
|
|
408
|
+
`NodeReporter` encapsulates user-facing reporting functionality. It requires a fitted `PanelTreeEngine`.
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
from ptree import NodeReporter
|
|
412
|
+
reporter = NodeReporter(engine)
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
#### `reporter.summary() → pd.DataFrame`
|
|
416
|
+
|
|
417
|
+
Return a complete node report DataFrame (all nodes, including internal nodes and leaves). Column definitions are the same as `engine.get_node_report()`.
|
|
418
|
+
|
|
419
|
+
```python
|
|
420
|
+
full = reporter.summary()
|
|
421
|
+
print(full[["Node_ID", "Depth", "Is_Leaf", "Split_Feature", "Predictability_Score"]])
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
#### `reporter.leaf_summary() → pd.DataFrame`
|
|
425
|
+
|
|
426
|
+
Return the report for leaf nodes only. The structure is the same as `summary()`, which makes it suitable for quickly viewing the final clustering results.
|
|
427
|
+
|
|
428
|
+
```python
|
|
429
|
+
leaves = reporter.leaf_summary()
|
|
430
|
+
print(leaves[["Node_ID", "Rule", "Predictability_Score", "N_Samples", "Model_Weights"]])
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
**Example Output:**
|
|
434
|
+
|
|
435
|
+
```
|
|
436
|
+
Node_ID Rule Predictability_Score N_Samples
|
|
437
|
+
3 root & char_1 < 0.5 & char_1 < 0.3 & char_3 < 0.7 0.0147 2438
|
|
438
|
+
4 root & char_1 < 0.5 & char_1 < 0.3 & char_3 >= 0.7 0.0018 1102
|
|
439
|
+
13 root & char_1 >= 0.5 & char_3 >= 0.3 & char_3 < 0.7 0.6323 2429
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
#### `reporter.print_tree() → str`
|
|
443
|
+
|
|
444
|
+
Return the tree structure as a formatted text string, using indentation and `├──` / `└──` connectors to represent hierarchical relationships.
|
|
445
|
+
|
|
446
|
+
```python
|
|
447
|
+
print(reporter.print_tree())
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
**Example Output:**
|
|
451
|
+
|
|
452
|
+
```
|
|
453
|
+
[Node 0] char_1 < 0.5 | r2=0.1234, n=12000 (Δ=0.4569)
|
|
454
|
+
├── [Node 1] char_1 < 0.3 | r2=0.0523, n=5940 (Δ=0.0140)
|
|
455
|
+
│ ├── [Leaf 3] r2=0.0147, mse=0.4769, n=2438
|
|
456
|
+
│ └── [Leaf 4] r2=0.0018, mse=0.8028, n=1102
|
|
457
|
+
└── [Leaf 5] r2=0.4640, mse=0.5483, n=6060
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
---
|
|
461
|
+
|
|
462
|
+
### MosaicVisualizer Methods
|
|
463
|
+
|
|
464
|
+
`MosaicVisualizer` generates "prediction mosaics" — 2D heatmaps that visually display the model's predictive power across different time periods and asset clusters.
|
|
465
|
+
|
|
466
|
+
```python
|
|
467
|
+
from ptree import MosaicVisualizer
|
|
468
|
+
viz = MosaicVisualizer(engine)
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
#### `viz.build_mosaic(X, y, time_col, metric) → pd.DataFrame`
|
|
472
|
+
|
|
473
|
+
Compute per-leaf, per-period metric values and return a DataFrame.
|
|
474
|
+
|
|
475
|
+
| Parameter | Description |
|
|
476
|
+
|---|---|
|
|
477
|
+
| `X` | Processed panel DataFrame (must include `time_col` and feature columns) |
|
|
478
|
+
| `y` | Target variable |
|
|
479
|
+
| `time_col` | Time column name, default `"date"` |
|
|
480
|
+
| `metric` | Evaluation metric: `"r2"` for regression, `"precision"` / `"f1"` / `"auc"` for classification |
|
|
481
|
+
|
|
482
|
+
**Return Structure:**
|
|
483
|
+
- **Row index**: Leaf node IDs (`Leaf_ID`)
|
|
484
|
+
- **Columns**: Time periods (determined by `time_col`)
|
|
485
|
+
- **Values**: Metric value for that leaf in that period
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
mosaic = viz.build_mosaic(X_proc, y_proc, time_col="date", metric="r2")
|
|
489
|
+
print(mosaic.shape) # (n_leaves, n_periods)
|
|
490
|
+
print(mosaic.iloc[:, :5]) # Preview first 5 periods
|
|
491
|
+
|
|
492
|
+
# Analyse which leaves perform best in which periods
|
|
493
|
+
best_leaf_per_period = mosaic.idxmax(axis=0)
|
|
494
|
+
print(best_leaf_per_period)
|
|
495
|
+
```
|
|
496
|
+
|
|
497
|
+
**Example Output:**
|
|
498
|
+
|
|
499
|
+
```
|
|
500
|
+
0 1 2 3 4
|
|
501
|
+
Leaf_ID
|
|
502
|
+
3 0.016 -0.042 0.006 -0.089 0.036
|
|
503
|
+
13 0.621 0.782 0.599 0.687 0.605
|
|
504
|
+
14 0.502 0.465 0.350 0.462 0.289
|
|
505
|
+
```
|
|
506
|
+
|
|
507
|
+
#### `viz.plot_mosaic(mosaic, title, cmap, figsize, save_path) → (fig, ax)`
|
|
508
|
+
|
|
509
|
+
Render the mosaic matrix as a seaborn heatmap. Requires `matplotlib` and `seaborn`.
|
|
510
|
+
|
|
511
|
+
| Parameter | Default | Description |
|
|
512
|
+
|---|---|---|
|
|
513
|
+
| `mosaic` | — | DataFrame returned by `build_mosaic()` |
|
|
514
|
+
| `title` | `"Prediction Mosaic"` | Chart title |
|
|
515
|
+
| `cmap` | `"RdYlGn"` | Colour map (red=poor, green=good) |
|
|
516
|
+
| `figsize` | `(14, 6)` | Figure size |
|
|
517
|
+
| `save_path` | `None` | If specified, automatically save as PNG |
|
|
518
|
+
|
|
519
|
+
```python
|
|
520
|
+
# Interactive viewing
|
|
521
|
+
fig, ax = viz.plot_mosaic(mosaic, title="P-Tree R² Mosaic")
|
|
522
|
+
|
|
523
|
+
# Save to file
|
|
524
|
+
fig, ax = viz.plot_mosaic(mosaic, save_path="output/mosaic.png", cmap="coolwarm")
|
|
525
|
+
```
|
|
526
|
+
|
|
527
|
+
**Heatmap Interpretation:**
|
|
528
|
+
- **X-axis**: Time period $t$
|
|
529
|
+
- **Y-axis**: Leaf nodes
|
|
530
|
+
- **Colour**: Predictive accuracy for that leaf in that period (R² or Precision)
|
|
531
|
+
- Instantly reveals when and where the model "fails" or "excels"
|
|
532
|
+
|
|
533
|
+
---
|
|
534
|
+
|
|
535
|
+
### Verbose Logging
|
|
536
|
+
|
|
537
|
+
`PanelTreeEngine` outputs detailed splitting process logs via Python's `logging` module when `verbose >= 1`:
|
|
538
|
+
|
|
539
|
+
```python
|
|
540
|
+
import logging
|
|
541
|
+
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
|
542
|
+
|
|
543
|
+
engine = PanelTreeEngine(..., verbose=1)
|
|
544
|
+
engine.fit(X, y, feature_names=...)
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
**Example Log Output:**
|
|
548
|
+
|
|
549
|
+
```
|
|
550
|
+
[INFO] [Level 0] Splitting Node 0...
|
|
551
|
+
- Best Split: 'char_1' at threshold 0.5000
|
|
552
|
+
- Metric Delta: score = 0.456896
|
|
553
|
+
- Left: 5940 samples | Right: 6060 samples
|
|
554
|
+
[INFO] [Level 1] Splitting Node 1...
|
|
555
|
+
- Best Split: 'char_3' at threshold 0.3000
|
|
556
|
+
- Metric Delta: score = 0.179045
|
|
557
|
+
- Left: 1808 samples | Right: 4252 samples
|
|
558
|
+
[INFO] Tree built: 15 nodes, 8 leaves, max_depth=3
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
Set `verbose=2` to view per-candidate (feature, threshold) evaluation results.
|
|
562
|
+
|
|
563
|
+
---
|
|
564
|
+
|
|
565
|
+
## Performance Optimisations
|
|
566
|
+
|
|
567
|
+
1. **Incremental matrix updates** – For Ridge models, $X^TWX$ and $X^TWy$ are cached at each node. Only the *smaller* child's statistics are computed directly (a single matmul); the larger child is obtained by subtracting it from the cached parent, halving the matrix-multiplication work per candidate split.
|
|
568
|
+
2. **Feature-priority caching** – When `fast_mode=True`, child nodes first evaluate the top-50% features from the parent, with optional early stopping.
|
|
569
|
+
3. **Feature-dimension parallelism** – When `n_jobs != 1` (and `joblib` is installed), candidate features are evaluated in parallel. The default `"threads"` backend keeps NumPy/BLAS matmuls GIL-free with zero data copying; switch to `parallel_backend="processes"` for heavy pure-Python custom predictors.
|
|
570
|
+
4. **Cost-complexity pruning & honest splits** – `engine.prune(ccp_alpha)` removes over-fit subtrees post-hoc, while `honest=True` evaluates split quality on a held-out in-node subset to remove the selection bias of fitting and scoring on the same data.
|
|
571
|
+
5. **Vectorised AUC** – Classification AUC uses an `O(n log n)` rank-based Mann–Whitney statistic instead of the former `O(n⁺·n⁻)` double loop.
|
|
572
|
+
|
|
573
|
+
## Ensembles (P-Forest & P-Boost)
|
|
574
|
+
|
|
575
|
+
A single Panel Tree is a *high-variance* estimator — small data perturbations can flip the greedily-chosen split and produce a completely different partition. The `ensemble` module provides two derived algorithms (mirroring the decision-tree → random-forest / gradient-boosting relationship) that act on the **prediction / target layer**, while each tree's split criterion remains the unchanged `R2Diff` rule.
|
|
576
|
+
|
|
577
|
+
### PanelForest (P-Forest — bagging)
|
|
578
|
+
|
|
579
|
+
Grows many decorrelated P-Trees via **time-block bootstrap** (contiguous blocks of `block_size` periods resampled with replacement, preserving serial autocorrelation) plus **node-level random feature subsets** (`max_features`), then aggregates at the output layer.
|
|
580
|
+
|
|
581
|
+
```python
|
|
582
|
+
from ptree import PanelForest
|
|
583
|
+
|
|
584
|
+
forest = PanelForest(
|
|
585
|
+
n_estimators=100, max_features="sqrt", block_size=5,
|
|
586
|
+
base_params={"max_depth": 3, "min_samples": 100},
|
|
587
|
+
n_jobs=-1, random_state=0,
|
|
588
|
+
)
|
|
589
|
+
forest.fit(X, y, feature_names=dh.feature_names, time_index="date")
|
|
590
|
+
|
|
591
|
+
forest.predict(X) # bagged mean ŷ (variance reduction)
|
|
592
|
+
forest.regime_membership(X) # soft P(obs ∈ high-predictability regime) ∈ [0,1]
|
|
593
|
+
forest.coassociation_matrix(X) # consensus similarity C[i,j] ∈ [0,1] (same-leaf frequency)
|
|
594
|
+
forest.oob_score_ # out-of-bag R² (unselected time blocks)
|
|
595
|
+
```
|
|
596
|
+
|
|
597
|
+
| Output | Description |
|
|
598
|
+
|---|---|
|
|
599
|
+
| `.predict(X)` | Bagged mean prediction across trees (lower variance than a single tree) |
|
|
600
|
+
| `.regime_membership(X)` | Fraction of trees routing each observation into a *high-R²* leaf — a smooth, robust upgrade of the 0/1 mosaic |
|
|
601
|
+
| `.coassociation_matrix(X)` | Consensus / co-association matrix; fraction of trees in which two observations share a leaf (a precomputed affinity for spectral clustering) |
|
|
602
|
+
| `.oob_score_` | Out-of-bag R² estimated on each tree's unselected time blocks |
|
|
603
|
+
|
|
604
|
+
> **Note:** P-Forest's gains are largest when predictability is driven by *several weakly-identified features*. If a single strong feature dominates, all trees split on it, become highly correlated, and the ensemble adds little.
|
|
605
|
+
|
|
606
|
+
### BoostedPanelTree (P-Boost — residual boosting)
|
|
607
|
+
|
|
608
|
+
Boosts the **target/residual** (not the criterion): each round strips the predictability already explained by the running ensemble and re-grows a fresh P-Tree on the residual, uncovering the *next, weaker* regime the greedy single tree would have masked.
|
|
609
|
+
|
|
610
|
+
```python
|
|
611
|
+
from ptree import BoostedPanelTree
|
|
612
|
+
|
|
613
|
+
booster = BoostedPanelTree(
|
|
614
|
+
n_estimators=50, learning_rate=0.1, max_depth=2,
|
|
615
|
+
subsample=1.0, random_state=0,
|
|
616
|
+
)
|
|
617
|
+
booster.fit(X, y, feature_names=dh.feature_names)
|
|
618
|
+
|
|
619
|
+
booster.predict(X) # ν · Σ_m tree_m.predict(X)
|
|
620
|
+
booster.residual_norms_ # residual L2-norm per round (monotone ↓; flat ⇒ self-limited)
|
|
621
|
+
```
|
|
622
|
+
|
|
623
|
+
On single-feature-dominated data P-Boost is **self-limiting**: once the first tree explains the dominant regime, the residual is near-noise and later trees add almost nothing (visible in `residual_norms_`). Set `splitter="random"` via `base_params` for an Extra-Trees-style variant that injects extra diversity.
|
|
624
|
+
|
|
625
|
+
## Requirements
|
|
626
|
+
|
|
627
|
+
- Python ≥ 3.10
|
|
628
|
+
- `numpy`, `pandas`
|
|
629
|
+
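- `matplotlib`, `seaborn` (optional, for visualisation — the `viz` extra)
- `joblib` (optional, for `n_jobs` parallelism — the `parallel` extra)
- `lightgbm` (optional — the `boost` extra)
- `scipy` (optional — the `fast` extra)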
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
## Contributing
|
|
633
|
+
|
|
634
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
|
635
|
+
|
|
636
|
+
```bash
|
|
637
|
+
# Clone the repository
|
|
638
|
+
git clone https://github.com/ElenYoung/AssetPanelTree.git
|
|
639
|
+
cd AssetPanelTree
|
|
640
|
+
|
|
641
|
+
# Install in development mode
|
|
642
|
+
pip install -e ".[dev]"
|
|
643
|
+
|
|
644
|
+
# Run tests
|
|
645
|
+
pytest test/ -v
|
|
646
|
+
```
|
|
647
|
+
|
|
648
|
+
## License
|
|
649
|
+
|
|
650
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
651
|
+
|
|
652
|
+
## Citation
|
|
653
|
+
|
|
654
|
+
If you use P-Tree in your research, please consider citing:
|
|
655
|
+
|
|
656
|
+
```bibtex
|
|
657
|
+
@software{ptree2026,
|
|
658
|
+
author = {ElenYoung},
|
|
659
|
+
title = {P-Tree: Panel Tree for Supervised Clustering},
|
|
660
|
+
year = {2026},
|
|
661
|
+
url = {https://github.com/ElenYoung/AssetPanelTree}
|
|
662
|
+
}
|
|
663
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
ptree/__init__.py,sha256=BuGnoxz948bnmwbLiRMiXXdfgD6xgfW5IqhlQdw-X_8,2469
|
|
2
|
+
ptree/criteria.py,sha256=UaT6Bp-RJgJnAqOU1gMuLtM5KZ7g6BWPfj4tJijzPlA,15122
|
|
3
|
+
ptree/data_handler.py,sha256=84rLdIC2HpRXXTNgMVZF59rG0ZeLbXcIUzND1PaKz_s,7811
|
|
4
|
+
ptree/engine.py,sha256=swMFJstS4mo2LEX3oORQCy7NVYuoyaj9EQO36WAARYU,50824
|
|
5
|
+
ptree/ensemble.py,sha256=0QvIbOG374-P8wpThKi4jnHZeC6i-lFvDsKngu2bMHg,25684
|
|
6
|
+
ptree/node.py,sha256=1vWXjBvTEC1BGZXc45rhFcs2ojwC24kiln-A_7_xCKM,4903
|
|
7
|
+
ptree/predictors.py,sha256=zs4twdjqu_gbVmbiT_ESD4-10V-7sSaEqbXK7owoF5g,18434
|
|
8
|
+
ptree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
ptree/visualization.py,sha256=ndmQeA5fxiwJoAE808SOWLo3pYltVyZMgynGY4IESZU,9927
|
|
10
|
+
aptree-0.1.0.dist-info/METADATA,sha256=IbXf2TaeXSSIcW-Mur7xswLXd7Rud_uxLOuq0AW5jAs,27999
|
|
11
|
+
aptree-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
aptree-0.1.0.dist-info/licenses/LICENSE,sha256=V7TyYr8HXZaxnwl-p9nQRHAS22Z9vPAD6r80_U-OXXk,1066
|
|
13
|
+
aptree-0.1.0.dist-info/RECORD,,
|