panelxai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- panelxai-0.1.0/LICENSE +21 -0
- panelxai-0.1.0/PKG-INFO +147 -0
- panelxai-0.1.0/README.md +115 -0
- panelxai-0.1.0/panelxai/__init__.py +50 -0
- panelxai-0.1.0/panelxai/data.py +262 -0
- panelxai-0.1.0/panelxai/explain/__init__.py +21 -0
- panelxai-0.1.0/panelxai/explain/counterfactual.py +89 -0
- panelxai-0.1.0/panelxai/explain/regime.py +133 -0
- panelxai-0.1.0/panelxai/explain/structured.py +155 -0
- panelxai-0.1.0/panelxai/explain/uncertainty.py +71 -0
- panelxai-0.1.0/panelxai/models/__init__.py +15 -0
- panelxai-0.1.0/panelxai/models/base.py +77 -0
- panelxai-0.1.0/panelxai/models/gb.py +56 -0
- panelxai-0.1.0/panelxai/models/hybrid.py +86 -0
- panelxai-0.1.0/panelxai/plots.py +122 -0
- panelxai-0.1.0/panelxai/reports.py +41 -0
- panelxai-0.1.0/panelxai/tables.py +31 -0
- panelxai-0.1.0/panelxai.egg-info/PKG-INFO +147 -0
- panelxai-0.1.0/panelxai.egg-info/SOURCES.txt +23 -0
- panelxai-0.1.0/panelxai.egg-info/dependency_links.txt +1 -0
- panelxai-0.1.0/panelxai.egg-info/requires.txt +13 -0
- panelxai-0.1.0/panelxai.egg-info/top_level.txt +1 -0
- panelxai-0.1.0/pyproject.toml +51 -0
- panelxai-0.1.0/setup.cfg +4 -0
- panelxai-0.1.0/tests/test_smoke.py +80 -0
panelxai-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Dr Merwan Roudane
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
panelxai-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: panelxai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Explainable AI for panel time-series econometrics: structured (variable x lag x unit x time) SHAP, factor / cross-sectional-dependence-aware attribution, regime-aware explanation drift, constrained counterfactuals, bootstrap uncertainty, and hybrid econometric-core + ML-residual models.
|
|
5
|
+
Author-email: Dr Merwan Roudane <merwanroudane920@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/merwanroudane/panelxai
|
|
8
|
+
Project-URL: Repository, https://github.com/merwanroudane/panelxai
|
|
9
|
+
Project-URL: Issues, https://github.com/merwanroudane/panelxai/issues
|
|
10
|
+
Keywords: explainable ai,xai,shap,panel data,time series,dynamic panel,econometrics,cross-sectional dependence,common factors,fixed effects,counterfactual,regime,interpretable machine learning,hybrid models
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: numpy>=1.22
|
|
20
|
+
Requires-Dist: pandas>=1.5
|
|
21
|
+
Requires-Dist: scipy>=1.9
|
|
22
|
+
Requires-Dist: scikit-learn>=1.1
|
|
23
|
+
Requires-Dist: statsmodels>=0.13
|
|
24
|
+
Requires-Dist: matplotlib>=3.5
|
|
25
|
+
Requires-Dist: shap>=0.41
|
|
26
|
+
Requires-Dist: xgboost>=1.6
|
|
27
|
+
Requires-Dist: linearmodels>=4.27
|
|
28
|
+
Requires-Dist: tabulate>=0.9
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# panelxai
|
|
34
|
+
|
|
35
|
+
**Explainable AI for panel time-series econometrics.**
|
|
36
|
+
|
|
37
|
+
Most XAI work either explains *time series* (TimeSHAP, WindowSHAP, TFT) **or**
|
|
38
|
+
applies SHAP to *panel-shaped* data — but rarely respects the econometric
|
|
39
|
+
structure of a **dynamic panel**: distributed lags, cross-sectional dependence,
|
|
40
|
+
common factors, unit heterogeneity, regime change, fixed effects.
|
|
41
|
+
|
|
42
|
+
`panelxai` closes that gap. A raw SHAP value is indexed only by
|
|
43
|
+
*(observation, feature)*. Because the design builder encodes
|
|
44
|
+
**variable**, **lag**, and **kind** (own regressor vs cross-sectional-average
|
|
45
|
+
factor proxy) into every feature name, `panelxai` re-indexes attributions onto
|
|
46
|
+
the structure that matters:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
variable × lag × unit × time + own-vs-factor (CSD) decomposition
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Author: **Dr Merwan Roudane** · MIT License · `pip install -e .`
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## What it provides
|
|
57
|
+
|
|
58
|
+
| Capability | Function / class | Idea |
|
|
59
|
+
|---|---|---|
|
|
60
|
+
| Panel TS simulator | `simulate_panel_ts` | DGP with true lag drivers, common factors (CSD), regime switch, non-linearity |
|
|
61
|
+
| Lag + CCE design | `build_design` | within-FE design with own lags and cross-sectional averages |
|
|
62
|
+
| Models | `GBPanel`, `HybridPanel` | XGBoost on the design; or **linear econometric core + ML residual** |
|
|
63
|
+
| Structured SHAP | `StructuredExplainer` | importance by variable, by lag, variable×lag matrix, per-unit |
|
|
64
|
+
| Factor-aware XAI | `.own_vs_factor()` | share of explanation from own dynamics vs common factors / CSD |
|
|
65
|
+
| Regime-aware XAI | `regime_importance`, `regime_effect_sign`, `explanation_drift` | importance & **effect-sign flips** across regimes; rolling drift |
|
|
66
|
+
| Counterfactual XAI | `counterfactual` | minimal, box-constrained change to move a prediction |
|
|
67
|
+
| Uncertainty-aware XAI | `bootstrap_importance` | unit cluster-bootstrap CIs + stability flag for importances |
|
|
68
|
+
| Plots / tables / report | `plot_*`, `tables`, `Report` | publication-ready outputs |
|
|
69
|
+
|
|
70
|
+
## Quick start
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import panelxai as px
|
|
74
|
+
|
|
75
|
+
df = px.simulate_panel_ts(n_units=30, n_periods=40, n_features=4,
|
|
76
|
+
n_lags=2, n_factors=1, seed=7)
|
|
77
|
+
|
|
78
|
+
model = px.GBPanel(lags=2, csa=True).fit(df) # explains the full model
|
|
79
|
+
ex = px.StructuredExplainer(model)
|
|
80
|
+
|
|
81
|
+
ex.variable_importance() # which variable matters
|
|
82
|
+
ex.lag_importance() # at which lag (temporal profile)
|
|
83
|
+
ex.variable_lag_matrix("own") # variable × lag heatmap data
|
|
84
|
+
ex.own_vs_factor() # own dynamics vs common-factor / CSD share
|
|
85
|
+
ex.unit_importance() # cross-sectional heterogeneity of explanations
|
|
86
|
+
|
|
87
|
+
px.plot_variable_lag_heatmap(ex, save="varlag.png")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
On the built-in DGP (x1 drives `y` at lag 0, x2 at lag 1, x3 at lag 2) the
|
|
91
|
+
structured SHAP **recovers exactly that** — each variable's mass concentrates at
|
|
92
|
+
its true lag.
|
|
93
|
+
|
|
94
|
+
## Hybrid econometric core + ML
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
m = px.HybridPanel(lags=2).fit(df) # y = linear dynamic-panel core + ML residual
|
|
98
|
+
print(px.Report(m).text()) # SHAP here explains only the NON-LINEAR part
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Regime-aware (sign-flip detection)
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
df = px.simulate_panel_ts(regime_break=0.5, seed=11) # x1 effect flips at midpoint
|
|
105
|
+
ex = px.StructuredExplainer(px.GBPanel(lags=2).fit(df))
|
|
106
|
+
px.regime_effect_sign(ex, n_regimes=2) # x1: +corr -> -corr => sign_flip = True
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Uncertainty
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
boot = px.bootstrap_importance(lambda: px.GBPanel(lags=2),
|
|
113
|
+
df, n_boot=30, level=0.90)
|
|
114
|
+
boot # mean, lo, hi, stable (CI excludes zero)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Examples
|
|
118
|
+
|
|
119
|
+
Runnable scripts in [`examples/`](examples):
|
|
120
|
+
|
|
121
|
+
1. `01_structured_shap.py` — variable × lag × own/factor decomposition
|
|
122
|
+
2. `02_regime_drift.py` — regime sign-flip & explanation drift
|
|
123
|
+
3. `03_hybrid_counterfactual.py` — hybrid model + constrained counterfactual
|
|
124
|
+
4. `04_uncertainty.py` — bootstrap importance CIs
|
|
125
|
+
|
|
126
|
+
## Design notes
|
|
127
|
+
|
|
128
|
+
- **Fixed effects** are absorbed by the `fe="within"` transform, so SHAP
|
|
129
|
+
explains *within-unit* deviations (the dynamic signal), not level differences.
|
|
130
|
+
- **Cross-sectional dependence** is handled CCE-style: period-`t`
|
|
131
|
+
cross-sectional averages enter the design as proxies for unobserved common
|
|
132
|
+
factors; their SHAP share is reported separately from own dynamics.
|
|
133
|
+
- **The hybrid model** keeps a transparent linear econometric core and lets the
|
|
134
|
+
ML learner — and SHAP — speak only to the residual non-linearity, the honest
|
|
135
|
+
target of post-hoc XAI in an econometric setting.
|
|
136
|
+
|
|
137
|
+
## Status & scope
|
|
138
|
+
|
|
139
|
+
v0.1.0 implements the structured / temporal, factor-aware, regime-aware,
|
|
140
|
+
counterfactual, uncertainty, and hybrid families on a tree-ensemble backbone.
|
|
141
|
+
Planned extensions: causal/interventional SHAP, deep-sequence models
|
|
142
|
+
(LSTM/TFT) with Integrated Gradients, and graph XAI for network panels.
|
|
143
|
+
|
|
144
|
+
## Requirements
|
|
145
|
+
|
|
146
|
+
Python ≥ 3.9; numpy, pandas, scipy, scikit-learn, statsmodels, matplotlib,
|
|
147
|
+
shap, xgboost, linearmodels, tabulate.
|
panelxai-0.1.0/README.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# panelxai
|
|
2
|
+
|
|
3
|
+
**Explainable AI for panel time-series econometrics.**
|
|
4
|
+
|
|
5
|
+
Most XAI work either explains *time series* (TimeSHAP, WindowSHAP, TFT) **or**
|
|
6
|
+
applies SHAP to *panel-shaped* data — but rarely respects the econometric
|
|
7
|
+
structure of a **dynamic panel**: distributed lags, cross-sectional dependence,
|
|
8
|
+
common factors, unit heterogeneity, regime change, fixed effects.
|
|
9
|
+
|
|
10
|
+
`panelxai` closes that gap. A raw SHAP value is indexed only by
|
|
11
|
+
*(observation, feature)*. Because the design builder encodes
|
|
12
|
+
**variable**, **lag**, and **kind** (own regressor vs cross-sectional-average
|
|
13
|
+
factor proxy) into every feature name, `panelxai` re-indexes attributions onto
|
|
14
|
+
the structure that matters:
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
variable × lag × unit × time + own-vs-factor (CSD) decomposition
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Author: **Dr Merwan Roudane** · MIT License · `pip install -e .`
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## What it provides
|
|
25
|
+
|
|
26
|
+
| Capability | Function / class | Idea |
|
|
27
|
+
|---|---|---|
|
|
28
|
+
| Panel TS simulator | `simulate_panel_ts` | DGP with true lag drivers, common factors (CSD), regime switch, non-linearity |
|
|
29
|
+
| Lag + CCE design | `build_design` | within-FE design with own lags and cross-sectional averages |
|
|
30
|
+
| Models | `GBPanel`, `HybridPanel` | XGBoost on the design; or **linear econometric core + ML residual** |
|
|
31
|
+
| Structured SHAP | `StructuredExplainer` | importance by variable, by lag, variable×lag matrix, per-unit |
|
|
32
|
+
| Factor-aware XAI | `.own_vs_factor()` | share of explanation from own dynamics vs common factors / CSD |
|
|
33
|
+
| Regime-aware XAI | `regime_importance`, `regime_effect_sign`, `explanation_drift` | importance & **effect-sign flips** across regimes; rolling drift |
|
|
34
|
+
| Counterfactual XAI | `counterfactual` | minimal, box-constrained change to move a prediction |
|
|
35
|
+
| Uncertainty-aware XAI | `bootstrap_importance` | unit cluster-bootstrap CIs + stability flag for importances |
|
|
36
|
+
| Plots / tables / report | `plot_*`, `tables`, `Report` | publication-ready outputs |
|
|
37
|
+
|
|
38
|
+
## Quick start
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import panelxai as px
|
|
42
|
+
|
|
43
|
+
df = px.simulate_panel_ts(n_units=30, n_periods=40, n_features=4,
|
|
44
|
+
n_lags=2, n_factors=1, seed=7)
|
|
45
|
+
|
|
46
|
+
model = px.GBPanel(lags=2, csa=True).fit(df) # explains the full model
|
|
47
|
+
ex = px.StructuredExplainer(model)
|
|
48
|
+
|
|
49
|
+
ex.variable_importance() # which variable matters
|
|
50
|
+
ex.lag_importance() # at which lag (temporal profile)
|
|
51
|
+
ex.variable_lag_matrix("own") # variable × lag heatmap data
|
|
52
|
+
ex.own_vs_factor() # own dynamics vs common-factor / CSD share
|
|
53
|
+
ex.unit_importance() # cross-sectional heterogeneity of explanations
|
|
54
|
+
|
|
55
|
+
px.plot_variable_lag_heatmap(ex, save="varlag.png")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
On the built-in DGP (x1 drives `y` at lag 0, x2 at lag 1, x3 at lag 2) the
|
|
59
|
+
structured SHAP **recovers exactly that** — each variable's mass concentrates at
|
|
60
|
+
its true lag.
|
|
61
|
+
|
|
62
|
+
## Hybrid econometric core + ML
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
m = px.HybridPanel(lags=2).fit(df) # y = linear dynamic-panel core + ML residual
|
|
66
|
+
print(px.Report(m).text()) # SHAP here explains only the NON-LINEAR part
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Regime-aware (sign-flip detection)
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
df = px.simulate_panel_ts(regime_break=0.5, seed=11) # x1 effect flips at midpoint
|
|
73
|
+
ex = px.StructuredExplainer(px.GBPanel(lags=2).fit(df))
|
|
74
|
+
px.regime_effect_sign(ex, n_regimes=2) # x1: +corr -> -corr => sign_flip = True
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Uncertainty
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
boot = px.bootstrap_importance(lambda: px.GBPanel(lags=2),
|
|
81
|
+
df, n_boot=30, level=0.90)
|
|
82
|
+
boot # mean, lo, hi, stable (CI excludes zero)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Examples
|
|
86
|
+
|
|
87
|
+
Runnable scripts in [`examples/`](examples):
|
|
88
|
+
|
|
89
|
+
1. `01_structured_shap.py` — variable × lag × own/factor decomposition
|
|
90
|
+
2. `02_regime_drift.py` — regime sign-flip & explanation drift
|
|
91
|
+
3. `03_hybrid_counterfactual.py` — hybrid model + constrained counterfactual
|
|
92
|
+
4. `04_uncertainty.py` — bootstrap importance CIs
|
|
93
|
+
|
|
94
|
+
## Design notes
|
|
95
|
+
|
|
96
|
+
- **Fixed effects** are absorbed by the `fe="within"` transform, so SHAP
|
|
97
|
+
explains *within-unit* deviations (the dynamic signal), not level differences.
|
|
98
|
+
- **Cross-sectional dependence** is handled CCE-style: period-`t`
|
|
99
|
+
cross-sectional averages enter the design as proxies for unobserved common
|
|
100
|
+
factors; their SHAP share is reported separately from own dynamics.
|
|
101
|
+
- **The hybrid model** keeps a transparent linear econometric core and lets the
|
|
102
|
+
ML learner — and SHAP — speak only to the residual non-linearity, the honest
|
|
103
|
+
target of post-hoc XAI in an econometric setting.
|
|
104
|
+
|
|
105
|
+
## Status & scope
|
|
106
|
+
|
|
107
|
+
v0.1.0 implements the structured / temporal, factor-aware, regime-aware,
|
|
108
|
+
counterfactual, uncertainty, and hybrid families on a tree-ensemble backbone.
|
|
109
|
+
Planned extensions: causal/interventional SHAP, deep-sequence models
|
|
110
|
+
(LSTM/TFT) with Integrated Gradients, and graph XAI for network panels.
|
|
111
|
+
|
|
112
|
+
## Requirements
|
|
113
|
+
|
|
114
|
+
Python ≥ 3.9; numpy, pandas, scipy, scikit-learn, statsmodels, matplotlib,
|
|
115
|
+
shap, xgboost, linearmodels, tabulate.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""panelxai - Explainable AI for panel time-series econometrics.
|
|
2
|
+
|
|
3
|
+
Structured (variable x lag x unit x time) SHAP, factor / cross-sectional-
|
|
4
|
+
dependence-aware attribution, regime-aware explanation drift, constrained
|
|
5
|
+
counterfactuals, bootstrap uncertainty, and hybrid econometric-core + ML
|
|
6
|
+
models.
|
|
7
|
+
|
|
8
|
+
Author : Dr Merwan Roudane <merwanroudane920@gmail.com>
|
|
9
|
+
GitHub : https://github.com/merwanroudane/panelxai
|
|
10
|
+
"""
|
|
11
|
+
from .data import (
|
|
12
|
+
simulate_panel_ts, build_design, build_lags,
|
|
13
|
+
add_cross_sectional_averages, panel_describe,
|
|
14
|
+
make_feature_name, parse_feature_name, DesignSpec,
|
|
15
|
+
)
|
|
16
|
+
from .models import PanelModel, GBPanel, HybridPanel
|
|
17
|
+
from .explain import (
|
|
18
|
+
StructuredExplainer, regime_importance, regime_effect_sign,
|
|
19
|
+
explanation_drift, counterfactual, bootstrap_importance,
|
|
20
|
+
)
|
|
21
|
+
from .plots import (
|
|
22
|
+
plot_variable_lag_heatmap, plot_lag_profile, plot_own_vs_factor,
|
|
23
|
+
plot_unit_heterogeneity, plot_regime_drift, plot_importance_ci,
|
|
24
|
+
plot_counterfactual,
|
|
25
|
+
)
|
|
26
|
+
from . import tables
|
|
27
|
+
from .reports import Report
|
|
28
|
+
|
|
29
|
+
# Package metadata.
__version__ = "0.1.0"
__author__ = "Dr Merwan Roudane"
__email__ = "merwanroudane920@gmail.com"
__url__ = "https://github.com/merwanroudane/panelxai"

# Public API, grouped by the submodule each name is imported from.
__all__ = [
    # data
    "simulate_panel_ts",
    "build_design",
    "build_lags",
    "add_cross_sectional_averages",
    "panel_describe",
    "make_feature_name",
    "parse_feature_name",
    "DesignSpec",
    # models
    "PanelModel",
    "GBPanel",
    "HybridPanel",
    # explain
    "StructuredExplainer",
    "regime_importance",
    "regime_effect_sign",
    "explanation_drift",
    "counterfactual",
    "bootstrap_importance",
    # plots
    "plot_variable_lag_heatmap",
    "plot_lag_profile",
    "plot_own_vs_factor",
    "plot_unit_heterogeneity",
    "plot_regime_drift",
    "plot_importance_ci",
    "plot_counterfactual",
    # misc
    "tables",
    "Report",
]
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""Panel time-series data: simulation and the lag / cross-sectional-average
|
|
2
|
+
design builder.
|
|
3
|
+
|
|
4
|
+
The design builder is the backbone of the whole library: it engineers a
|
|
5
|
+
numeric feature matrix whose *column names* encode the structure
|
|
6
|
+
|
|
7
|
+
{variable}__L{lag}__{kind} kind in {own, csa}
|
|
8
|
+
|
|
9
|
+
so that downstream explainers can recover, for every attribution, which
|
|
10
|
+
*variable*, which *lag*, and whether it is an own regressor or a
|
|
11
|
+
cross-sectional / common-factor proxy (a CSA in the sense of Pesaran's CCE).
|
|
12
|
+
|
|
13
|
+
Author : Dr Merwan Roudane <merwanroudane920@gmail.com>
|
|
14
|
+
GitHub : https://github.com/merwanroudane/panelxai
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
# Delimiter placed between the var / lag / kind fields of a structured
# column name; used by make_feature_name and parse_feature_name.
SEP = "__"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# --------------------------------------------------------------------------- #
|
|
26
|
+
# Column-name protocol
|
|
27
|
+
# --------------------------------------------------------------------------- #
|
|
28
|
+
def make_feature_name(var: str, lag: int, kind: str) -> str:
    """Encode ``var``, ``lag`` and ``kind`` into one structured column name.

    The lag is prefixed with ``L`` and the three fields are joined with the
    module separator ``SEP``, giving ``{var}{SEP}L{lag}{SEP}{kind}``.
    """
    fields = (f"{var}", f"L{lag}", f"{kind}")
    return SEP.join(fields)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_feature_name(name: str) -> dict:
    """Inverse of :func:`make_feature_name`. Returns var/lag/kind.

    Parameters
    ----------
    name : a design column name, normally ``{var}{SEP}L{lag}{SEP}{kind}``.

    Returns
    -------
    dict with keys ``"var"`` (str), ``"lag"`` (int) and ``"kind"`` (str).
    A name that does not follow the protocol (e.g. a raw passthrough column)
    is returned whole as ``var`` with ``lag=0`` and ``kind="own"``.
    """
    fallback = {"var": name, "lag": 0, "kind": "own"}

    # Split from the right: the lag and kind fields never contain SEP, but the
    # variable name may (e.g. "gdp__pc"), and must survive the round trip.
    parts = name.rsplit(SEP, 2)
    if len(parts) != 3:
        return fallback

    var, lag_token, kind = parts
    if not lag_token.startswith("L"):
        return fallback
    try:
        lag = int(lag_token[1:])
    except ValueError:
        # "L" followed by something that is not an integer: this is not a
        # structured column, so treat it as a passthrough instead of raising.
        return fallback
    return {"var": var, "lag": lag, "kind": kind}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class DesignSpec:
    """Record of how a design matrix was assembled.

    Carries the ordered feature names, the column names that identify the
    target / unit / time, the lag settings, the fixed-effects mode, and the
    unit and time label attached to each design row.
    """
    features: list            # design column names, in matrix column order
    y: str                    # name of the target column
    unit: str                 # name of the unit identifier column
    time: str                 # name of the time identifier column
    lags: int                 # highest own lag per regressor
    csa_lags: int             # highest CSA lag
    fe: str                   # 'within' | 'none'
    unit_index: np.ndarray    # unit label per design row
    time_index: np.ndarray    # time label per design row

    def meta_frame(self) -> pd.DataFrame:
        """Return one row per feature with columns feature/var/lag/kind.

        Each feature name is decoded with :func:`parse_feature_name`; the
        original name is placed in the leading ``feature`` column.
        """
        decoded = pd.DataFrame(list(map(parse_feature_name, self.features)))
        decoded.insert(loc=0, column="feature", value=self.features)
        return decoded
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# --------------------------------------------------------------------------- #
|
|
62
|
+
# Simulation
|
|
63
|
+
# --------------------------------------------------------------------------- #
|
|
64
|
+
def simulate_panel_ts(
    n_units: int = 30,
    n_periods: int = 40,
    n_features: int = 4,
    n_lags: int = 2,
    n_factors: int = 1,
    csd_strength: float = 0.8,
    regime_break: float | None = 0.5,
    nonlinear: bool = True,
    seed: int | None = None,
) -> pd.DataFrame:
    """Simulate a dynamic panel with the features the library is built to explain.

    The data-generating process embeds, on purpose:

    * **Distributed lags** - only a *subset* of (variable, lag) pairs truly
      drive ``y``: x1 at lag 0, x2 at lag 1 (when ``n_features >= 2`` and
      ``n_lags >= 1``) and x3 at lag 2 (when ``n_features >= 3`` and
      ``n_lags >= 2``), so importance recovery is testable.
    * **Cross-sectional dependence** via ``n_factors`` common factors
      ``F_t`` that enter both the regressors and ``y`` with unit-specific
      loadings.
    * **A regime switch** at ``regime_break`` (fraction of the sample) where
      the contemporaneous x1 coefficient changes sign - this is what
      regime-aware explainers should detect as *explanation drift*.
      ``None`` disables the switch.
    * Optional **non-linearity** (``0.5 * x1 * tanh(x2)``, needs at least two
      regressors) so that linear models are mis-specified and ML adds value.

    Parameters
    ----------
    n_units, n_periods : panel dimensions N and T.
    n_features : number of regressors ``x1..xp``.
    n_lags : highest lag available to the true lag structure.
    n_factors : number of common factors.
    csd_strength : mean of a loading draw that is currently not used when
        building ``x`` or ``y`` (see the NOTE in the body).
    nonlinear : add the interaction / saturation term.
    seed : seed for ``numpy.random.default_rng``.

    Returns
    -------
    A long (balanced) panel with columns ``unit, time, y, x1..xp``; units and
    periods are labelled from 1.
    """
    rng = np.random.default_rng(seed)
    N, T, p = n_units, n_periods, n_features

    # Common factors (shared across units) -> cross-sectional dependence.
    F = rng.normal(size=(T, n_factors))
    # NOTE(review): this loading draw is never used below, so `csd_strength`
    # has no effect on the simulated data. The draw is kept (result discarded)
    # only so the random stream, and hence seeded output, stays unchanged.
    # Confirm whether it was meant to replace `gamma` or `x_load`.
    rng.normal(loc=csd_strength, scale=0.3, size=(N, n_factors))

    # Regressors: persistent (AR) + a factor component.
    x = np.zeros((N, T, p))
    x_load = rng.normal(loc=0.5, scale=0.4, size=(N, p, n_factors))
    for j in range(p):
        ar = rng.uniform(0.3, 0.7)
        for t in range(T):
            prev = x[:, t - 1, j] if t > 0 else 0.0
            common = x_load[:, j, :] @ F[t]
            x[:, t, j] = ar * prev + common + rng.normal(scale=1.0, size=N)

    # True distributed-lag structure: pick a few (var, lag) drivers.
    beta = np.zeros((p, n_lags + 1))
    beta[0, 0] = 1.2                      # x1 contemporaneous
    # Guard on p as well: with a single regressor there is no row 1 to write.
    if p >= 2 and n_lags >= 1:
        beta[1, 1] = -0.9                 # x2 at lag 1
    if p >= 3 and n_lags >= 2:
        beta[2, 2] = 0.7                  # x3 at lag 2

    alpha = rng.normal(scale=1.0, size=N)                         # unit fixed effects
    gamma = rng.normal(loc=0.6, scale=0.2, size=(N, n_factors))   # y factor loadings

    # First period index (0-based) of the post-break regime.
    brk = int(regime_break * T) if regime_break is not None else None

    y = np.zeros((N, T))
    for t in range(T):
        mu = alpha + gamma @ F[t]
        for j in range(p):
            for lag in range(n_lags + 1):
                # Skip inactive drivers and lags reaching before the sample.
                if beta[j, lag] == 0.0 or t - lag < 0:
                    continue
                coef = beta[j, lag]
                # Regime switch flips the sign of the x1 effect after the break.
                if brk is not None and j == 0 and lag == 0 and t >= brk:
                    coef = -coef
                mu = mu + coef * x[:, t - lag, j]
        if nonlinear and p >= 2:
            mu = mu + 0.5 * x[:, t, 0] * np.tanh(x[:, t, 1])
        y[:, t] = mu + rng.normal(scale=0.7, size=N)

    # Assemble long frame (unit-major: all periods of unit 1, then unit 2, ...).
    units = np.repeat(np.arange(1, N + 1), T)
    times = np.tile(np.arange(1, T + 1), N)
    data = {"unit": units, "time": times, "y": y.reshape(-1)}
    for j in range(p):
        data[f"x{j + 1}"] = x[:, :, j].reshape(-1)
    return pd.DataFrame(data)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# --------------------------------------------------------------------------- #
|
|
147
|
+
# Design construction
|
|
148
|
+
# --------------------------------------------------------------------------- #
|
|
149
|
+
def build_lags(
    df: pd.DataFrame,
    cols: list,
    lags: int,
    unit: str = "unit",
    time: str = "time",
) -> pd.DataFrame:
    """Return a copy of ``df`` sorted by (unit, time) with own-lag columns added.

    For every column in ``cols`` and every ``k`` in ``0..lags`` a column named
    ``make_feature_name(col, k, "own")`` is added, holding the column shifted
    ``k`` rows *within* each unit. The first ``k`` rows of a unit are NaN.
    """
    ordered = df.sort_values([unit, time]).copy()
    by_unit = ordered.groupby(unit, sort=False)
    for col in cols:
        for k in range(lags + 1):
            lagged = by_unit[col].shift(k)
            ordered[make_feature_name(col, k, "own")] = lagged
    return ordered
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def add_cross_sectional_averages(
    df: pd.DataFrame,
    cols: list,
    csa_lags: int = 0,
    unit: str = "unit",
    time: str = "time",
) -> pd.DataFrame:
    """Add ``{col}__L{l}__csa``: the period-t mean across units (a CCE-style
    proxy for the unobserved common factors), then lagged within unit.

    Parameters
    ----------
    df : long panel containing ``unit``, ``time`` and every column in ``cols``.
    cols : columns to average across units within each period.
    csa_lags : highest lag of the averages to add (lags ``0..csa_lags``).

    Returns
    -------
    A copy of ``df`` (row order and index unchanged) with the CSA columns
    appended. Lag ``l`` is obtained by shifting ``l`` rows within each unit
    after ordering by time, so it equals the period ``t - l`` average only
    when a unit's observed periods are consecutive.
    """
    out = df.copy()
    means = out.groupby(time, sort=True)[cols].transform("mean")

    # Sort once, on a narrow scratch frame holding just the keys and the
    # period means, instead of re-sorting the whole (growing) frame for every
    # column. Prefixing keeps the scratch columns clear of the key names.
    prefix = "__csa_base_"
    scratch = means.add_prefix(prefix)
    scratch[unit] = out[unit]
    scratch[time] = out[time]
    by_unit = scratch.sort_values([unit, time]).groupby(unit, sort=False)

    for c in cols:
        base = by_unit[f"{prefix}{c}"]
        for l in range(csa_lags + 1):
            # The shifted series keeps the original row labels, so assignment
            # aligns it back onto ``out`` regardless of the sort order.
            out[make_feature_name(c, l, "csa")] = base.shift(l)
    return out
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def build_design(
    df: pd.DataFrame,
    y: str = "y",
    X: list | None = None,
    unit: str = "unit",
    time: str = "time",
    lags: int = 2,
    csa: bool = True,
    csa_lags: int = 0,
    csa_vars: list | None = None,
    fe: str = "within",
):
    """Turn a long panel into ``(X_design, y_vec, DesignSpec)``.

    Parameters
    ----------
    df       : long panel with the ``unit``, ``time`` and ``y`` columns.
    y        : target column name.
    X        : regressor columns; defaults to every column except
               ``y``, ``unit`` and ``time``.
    lags     : number of own lags (0..lags) for each regressor in ``X``.
    csa      : if True, append cross-sectional averages (CCE proxies for the
               common factors driving cross-sectional dependence).
    csa_lags : lags (0..csa_lags) of the CSAs.
    csa_vars : columns to average; defaults to ``X``.
    fe       : ``'within'`` demeans ``y`` and every feature within unit
               (absorbing fixed effects, the standard FE transform);
               ``'none'`` leaves levels untouched.

    Returns
    -------
    ``X_design`` (DataFrame of structured feature columns), ``y_vec``
    (Series) and a :class:`DesignSpec`. Rows with a missing feature or target
    (e.g. the first ``lags`` periods of each unit) are dropped; all three
    outputs share the resulting 0..n-1 row order. ``spec.csa_lags`` is ``-1``
    when ``csa`` is False.

    Raises
    ------
    ValueError : if ``fe`` is neither ``'within'`` nor ``'none'``.
    """
    # Fail fast: reject a bad ``fe`` before doing any lag / CSA work.
    if fe not in ("within", "none"):
        raise ValueError("fe must be 'within' or 'none'")

    if X is None:
        X = [c for c in df.columns if c not in (y, unit, time)]
    csa_vars = csa_vars if csa_vars is not None else list(X)

    work = build_lags(df, X, lags, unit, time)
    if csa:
        work = add_cross_sectional_averages(work, csa_vars, csa_lags, unit, time)

    feats = [make_feature_name(v, l, "own") for v in X for l in range(lags + 1)]
    if csa:
        feats += [make_feature_name(v, l, "csa")
                  for v in csa_vars for l in range(csa_lags + 1)]

    # Lagging leaves NaNs at the start of every unit; keep complete rows only.
    work = work.dropna(subset=feats + [y]).reset_index(drop=True)

    y_vec = work[y].copy()
    X_des = work[feats].copy()
    u_idx = work[unit].to_numpy()
    t_idx = work[time].to_numpy()

    if fe == "within":
        # Demean target and features within unit (absorb fixed effects),
        # using the same per-row unit key for both.
        y_vec = y_vec - y_vec.groupby(u_idx).transform("mean")
        X_des = X_des - X_des.groupby(u_idx).transform("mean")

    spec = DesignSpec(
        features=feats, y=y, unit=unit, time=time, lags=lags,
        csa_lags=csa_lags if csa else -1, fe=fe,
        unit_index=u_idx, time_index=t_idx,
    )
    return X_des.reset_index(drop=True), y_vec.reset_index(drop=True), spec
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def panel_describe(
    df: pd.DataFrame, unit: str = "unit", time: str = "time"
) -> pd.DataFrame:
    """Quick structural summary of a long panel.

    Returns a one-column (``value``) frame indexed by:

    * ``n_units``       - number of distinct units,
    * ``n_periods_min`` / ``n_periods_max`` - fewest / most rows per unit,
    * ``balanced``      - True when every unit has the same number of rows,
    * ``n_obs``         - number of rows,
    * ``n_vars``        - number of columns other than two (the unit and
      time identifiers).

    An empty panel reports 0 for both period counts instead of raising.
    """
    counts = df.groupby(unit)[time].size()
    # min()/max() of an empty Series are NaN, which int() cannot convert.
    has_units = not counts.empty
    rows = {
        "n_units": df[unit].nunique(),
        "n_periods_min": int(counts.min()) if has_units else 0,
        "n_periods_max": int(counts.max()) if has_units else 0,
        "balanced": bool(counts.nunique() == 1),
        "n_obs": len(df),
        "n_vars": df.shape[1] - 2,
    }
    return pd.DataFrame.from_dict(rows, orient="index", columns=["value"])
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Explainability core for panel time-series models.
|
|
2
|
+
|
|
3
|
+
The unifying idea: a raw SHAP value is indexed only by (observation, feature).
|
|
4
|
+
Because :mod:`panelxai.data` encodes ``variable``, ``lag`` and ``kind``
|
|
5
|
+
(own vs cross-sectional-average) in every feature name, we can *re-index*
|
|
6
|
+
attributions onto the structure that matters in panel econometrics:
|
|
7
|
+
|
|
8
|
+
variable x lag x unit x time + own-vs-factor decomposition.
|
|
9
|
+
"""
|
|
10
|
+
from .structured import StructuredExplainer
|
|
11
|
+
from .regime import regime_importance, regime_effect_sign, explanation_drift
|
|
12
|
+
from .counterfactual import counterfactual
|
|
13
|
+
from .uncertainty import bootstrap_importance
|
|
14
|
+
|
|
15
|
+
# Public API of this subpackage: every name imported above is re-exported,
# including ``regime_effect_sign``.
__all__ = [
    "StructuredExplainer",
    "regime_importance",
    "regime_effect_sign",
    "explanation_drift",
    "counterfactual",
    "bootstrap_importance",
]
|