hapc 2.3.0__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hapc-2.3.0 → hapc-2.5.0}/CMakeLists.txt +8 -2
- hapc-2.5.0/PKG-INFO +359 -0
- hapc-2.5.0/README.md +321 -0
- hapc-2.5.0/pyproject.toml +71 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc/__init__.py +10 -3
- hapc-2.5.0/python/hapc/ate.py +735 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc/cv.py +37 -4
- hapc-2.5.0/python/hapc/hazard.py +351 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc/single.py +14 -5
- hapc-2.5.0/python/hapc.egg-info/PKG-INFO +359 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/SOURCES.txt +1 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/requires.txt +5 -0
- {hapc-2.3.0 → hapc-2.5.0}/setup.py +10 -0
- {hapc-2.3.0 → hapc-2.5.0}/tests/test_ate.py +56 -0
- {hapc-2.3.0 → hapc-2.5.0}/tests/test_ate_hapc_diagnostics_example.py +8 -4
- hapc-2.3.0/PKG-INFO +0 -212
- hapc-2.3.0/README.md +0 -178
- hapc-2.3.0/pyproject.toml +0 -36
- hapc-2.3.0/python/hapc/ate.py +0 -425
- hapc-2.3.0/python/hapc.egg-info/PKG-INFO +0 -212
- {hapc-2.3.0 → hapc-2.5.0}/LICENSE +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/MANIFEST.in +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc/core.py +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/dependency_links.txt +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/not-zip-safe +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/top_level.txt +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/setup.cfg +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/bindings.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/cross_kernel.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/cv_classi.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/cv_fast_pchal.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/cv_fast_pchal_python.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/fast_pchal.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/hapc_core.hpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/logistic_call.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/mkernel.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_call.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_classi_call.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_cv.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_cv_classi_cpp.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_cv_cpp.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/pchal_design.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/r_bindings.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/ridge_wrappers.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/single_pcghal_cpp.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/src/single_pchar.cpp +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/tests/test_api.py +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/tests/test_core.py +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/tests/test_logistic_regression.py +0 -0
- {hapc-2.3.0 → hapc-2.5.0}/tests/test_r_vs_python_alpha.py +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
cmake_minimum_required(VERSION 3.
|
|
1
|
+
cmake_minimum_required(VERSION 3.18)
|
|
2
2
|
project(hapc)
|
|
3
3
|
|
|
4
4
|
set(CMAKE_CXX_STANDARD 17)
|
|
@@ -15,7 +15,13 @@ endif()
|
|
|
15
15
|
# Python3_EXECUTABLE from setup.py so the build always targets the *same*
|
|
16
16
|
# interpreter that pip is using. Without this CMake may discover a newer/
|
|
17
17
|
# older system Python and produce a .so tagged for the wrong ABI.
|
|
18
|
-
|
|
18
|
+
#
|
|
19
|
+
# Use Development.Module (headers only), NOT the full Development component:
|
|
20
|
+
# the latter also requires Development.Embed -> libpython, which manylinux
|
|
21
|
+
# images deliberately do not ship (extension modules must not link libpython).
|
|
22
|
+
# Requiring full Development makes the manylinux build fail with
|
|
23
|
+
# "Could NOT find Python3 (missing: Python3_LIBRARIES Development.Embed)".
|
|
24
|
+
find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
|
|
19
25
|
message(STATUS "Python3_EXECUTABLE: ${Python3_EXECUTABLE}")
|
|
20
26
|
message(STATUS "Python3_VERSION: ${Python3_VERSION}")
|
|
21
27
|
|
hapc-2.5.0/PKG-INFO
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hapc
|
|
3
|
+
Version: 2.5.0
|
|
4
|
+
Summary: Highly Adaptive Principal Components
|
|
5
|
+
Home-page: https://github.com/meixide/hapc
|
|
6
|
+
Author: Carlos García Meixide
|
|
7
|
+
Author-email: Carlos García Meixide <cgmeixide@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/meixide/hapc
|
|
10
|
+
Project-URL: Documentation, https://hapc.readthedocs.io
|
|
11
|
+
Project-URL: Repository, https://github.com/meixide/hapc.git
|
|
12
|
+
Project-URL: Issues, https://github.com/meixide/hapc/issues
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy<2.3,>=1.24
|
|
24
|
+
Requires-Dist: scikit-learn>=1.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
28
|
+
Requires-Dist: black; extra == "dev"
|
|
29
|
+
Requires-Dist: flake8; extra == "dev"
|
|
30
|
+
Provides-Extra: docs
|
|
31
|
+
Requires-Dist: sphinx>=7; extra == "docs"
|
|
32
|
+
Requires-Dist: furo; extra == "docs"
|
|
33
|
+
Requires-Dist: myst-parser; extra == "docs"
|
|
34
|
+
Dynamic: author
|
|
35
|
+
Dynamic: home-page
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
Dynamic: requires-python
|
|
38
|
+
|
|
39
|
+
# HAPC: Highly Adaptive Principal Components
|
|
40
|
+
|
|
41
|
+
A fast and flexible machine learning library for nonparametric high-dimensional regression and classification with guarantees.
|
|
42
|
+
|
|
43
|
+
## Documentation
|
|
44
|
+
|
|
45
|
+
- **Python API** (rendered from docstrings): https://hapc.readthedocs.io —
|
|
46
|
+
configured via [`.readthedocs.yaml`](.readthedocs.yaml) and
|
|
47
|
+
[`docs/`](docs/) (Sphinx + autodoc). Build locally with
|
|
48
|
+
`pip install -e ".[docs]" && sphinx-build -b html docs docs/_build/html`.
|
|
49
|
+
- **R API** (rendered from roxygen): a [pkgdown](https://pkgdown.r-lib.org)
|
|
50
|
+
site built by [`.github/workflows/pkgdown.yaml`](.github/workflows/pkgdown.yaml)
|
|
51
|
+
(config in [`_pkgdown.yml`](_pkgdown.yml)). Build locally with
|
|
52
|
+
`Rscript -e 'pkgdown::build_site()'`.
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
### Prerequisites
|
|
57
|
+
|
|
58
|
+
- Python 3.8+
|
|
59
|
+
- C++ compiler (g++, clang, or MSVC)
|
|
60
|
+
- CMake 3.18+
|
|
61
|
+
- Eigen3
|
|
62
|
+
|
|
63
|
+
### Quick Install
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install hapc
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
|
|
70
|
+
(Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
|
|
71
|
+
CMake or Eigen is needed when a wheel is available.
|
|
72
|
+
|
|
73
|
+
### Linux / HPC clusters
|
|
74
|
+
|
|
75
|
+
The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
|
|
76
|
+
`pip install hapc` works out of the box on HPC login/compute nodes —
|
|
77
|
+
no `conda` toolchain, `devtoolset`, or sysroot setup required:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install hapc
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
If you must build from the source distribution (niche architecture, very
|
|
84
|
+
old Python, or an air-gapped node), provide a C++17 compiler and either
|
|
85
|
+
let CMake fetch Eigen automatically (needs network) or install Eigen and
|
|
86
|
+
let `find_package(Eigen3)` find it:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# with conda compilers (recommended on HPC)
|
|
90
|
+
conda install -c conda-forge cxx-compiler cmake eigen
|
|
91
|
+
pip install hapc --no-binary hapc
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Install from GitHub (latest development version)
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install git+https://github.com/meixide/hapc.git
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Or with editable install for development:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
git clone https://github.com/meixide/hapc.git
|
|
104
|
+
cd hapc
|
|
105
|
+
pip install -e .
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Install build dependencies
|
|
109
|
+
|
|
110
|
+
If installation fails, you may need to install build dependencies:
|
|
111
|
+
|
|
112
|
+
**macOS:**
|
|
113
|
+
```bash
|
|
114
|
+
brew install cmake eigen
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Ubuntu/Debian:**
|
|
118
|
+
```bash
|
|
119
|
+
sudo apt-get install cmake libeigen3-dev build-essential
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
**Windows:**
|
|
123
|
+
```bash
|
|
124
|
+
pip install cmake
|
|
125
|
+
# Install Visual Studio Build Tools or use conda
|
|
126
|
+
conda install -c conda-forge eigen
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Quick Start
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
import numpy as np
|
|
133
|
+
from hapc.single import single_pcghal
|
|
134
|
+
from hapc.cv import pcghal_cv
|
|
135
|
+
|
|
136
|
+
# Generate sample data
|
|
137
|
+
X = np.random.randn(100, 5)
|
|
138
|
+
Y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(100) * 0.1
|
|
139
|
+
|
|
140
|
+
# Single fit with fixed lambda
|
|
141
|
+
result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01)
|
|
142
|
+
print(f"Risk: {result.optimizer_output.risk:.6f}")
|
|
143
|
+
|
|
144
|
+
# Cross-validation to select lambda
|
|
145
|
+
lambdas = np.logspace(-4, 0, 10)
|
|
146
|
+
cv_result = pcghal_cv(X, Y, maxdeg=2, npc=5, lambdas=lambdas, nfolds=5)
|
|
147
|
+
print(f"Best lambda: {cv_result.best_lambda:.6f}")
|
|
148
|
+
|
|
149
|
+
# Make predictions
|
|
150
|
+
X_test = np.random.randn(20, 5)
|
|
151
|
+
result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01, predict=X_test)
|
|
152
|
+
print(f"Predictions: {result.predictions}")
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Usage
|
|
156
|
+
|
|
157
|
+
### Regression
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from hapc.single import single_pcghal
|
|
161
|
+
|
|
162
|
+
result = single_pcghal(
|
|
163
|
+
X, Y,
|
|
164
|
+
maxdeg=2, # Maximum degree of interactions
|
|
165
|
+
npc=10, # Number of principal components
|
|
166
|
+
single_lambda=0.01,
|
|
167
|
+
predict=X_test # Optional: test data for predictions
|
|
168
|
+
)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Classification
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from hapc.single import single_pcghal
|
|
175
|
+
|
|
176
|
+
result = single_pcghal(
|
|
177
|
+
X, Y_binary,
|
|
178
|
+
maxdeg=2,
|
|
179
|
+
npc=10,
|
|
180
|
+
single_lambda=0.01,
|
|
181
|
+
predict=X_test
|
|
182
|
+
)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Cross-Validation
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
from hapc.cv import pcghal_cv
|
|
189
|
+
|
|
190
|
+
cv_result = pcghal_cv(
|
|
191
|
+
X, Y,
|
|
192
|
+
maxdeg=2,
|
|
193
|
+
npc=10,
|
|
194
|
+
lambdas=np.logspace(-4, 0, 20),
|
|
195
|
+
nfolds=5
|
|
196
|
+
)
|
|
197
|
+
print(cv_result.best_lambda)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Average Treatment Effect (ATE)
|
|
201
|
+
|
|
202
|
+
Estimate the ATE `E[Y(1)] − E[Y(0)]` with HAPC nuisance models and a
|
|
203
|
+
doubly-robust (AIPW) efficient influence function. `ate_hapc` returns a point
|
|
204
|
+
estimate and a `(1 − alpha)` Wald confidence interval.
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from hapc import ate_hapc
|
|
208
|
+
|
|
209
|
+
# W: covariates (n, p); A: binary treatment in {0,1} or {-1,+1}; Y: outcome
|
|
210
|
+
res = ate_hapc(W, Y, A, alpha=0.05, method="undersmooth")
|
|
211
|
+
print(res.estimate, res.lower, res.upper)
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
Two bias-control strategies are available through `method`:
|
|
215
|
+
|
|
216
|
+
- **`method="undersmooth"`** (default) — single-sample estimator. The outcome
|
|
217
|
+
model is undersmoothed (λ pushed below the CV-optimal value) until the
|
|
218
|
+
empirical mean of the influence function is within `σ / (√n · log n)` of zero. This requires the
|
|
219
|
+
**full PC basis** (`npcs = n`, the default) and a λ grid that reaches small λ
|
|
220
|
+
(default: `log_lambda_out_min = -10`); otherwise the gate never reaches the
|
|
221
|
+
low-bias regime and `ate_hapc` emits a warning. Pass
|
|
222
|
+
`report_undersmoothing=True` to print the `|mean(EIF)|`-vs-λ path.
|
|
223
|
+
- **`method="crossfit"`** — DML-style K-fold cross-fitting (`cf_folds`, default
|
|
224
|
+
5, stratified by treatment). Both nuisances are fit on the training folds and
|
|
225
|
+
the influence function is evaluated out-of-fold, giving honest point estimates
|
|
226
|
+
and coverage without undersmoothing. Recommended under good overlap.
|
|
227
|
+
|
|
228
|
+
### Discrete-time survival (`family = "logit-hazard"`)
|
|
229
|
+
|
|
230
|
+
Fit a discrete-time **logistic hazard** model with HAPC. You supply only the
|
|
231
|
+
observed right-censored data — baseline covariates `X`, the observed time
|
|
232
|
+
`T = min(T_event, C)`, and the event indicator `Delta = 1(T_event <= C)` — and
|
|
233
|
+
the wrapper performs the person-period expansion (one row per
|
|
234
|
+
subject-per-interval-at-risk, hazard label = 1 at the event interval), prepends
|
|
235
|
+
the visit time as the first HAL covariate, and cross-validates the binomial fit.
|
|
236
|
+
|
|
237
|
+
**Model.** The discrete hazard is the conditional event probability in interval
|
|
238
|
+
`t` given survival up to `t`, modelled on the logit scale by a HAPC fit `f` of
|
|
239
|
+
the augmented covariate `(t, x)`:
|
|
240
|
+
|
|
241
|
+
```text
|
|
242
|
+
lambda(t | x) = P(T_event = t | T_event >= t, X = x)
|
|
243
|
+
logit lambda(t | x) = f(t, x)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
**Person-period likelihood.** Under independent right-censoring the observed-data
|
|
247
|
+
likelihood factorises over the at-risk intervals,
|
|
248
|
+
|
|
249
|
+
```text
|
|
250
|
+
prod_i prod_{t <= T_i} lambda(t|x_i)^Y_it * (1 - lambda(t|x_i))^(1 - Y_it),
|
|
251
|
+
with Y_it = 1(T_event_i = t),
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
which is exactly the Bernoulli (logistic) likelihood of the expanded
|
|
255
|
+
person-period table — so a binomial HAPC fit of `Y_it` on `(t, x_i)` estimates
|
|
256
|
+
the discrete hazard (Cox 1972; Brown 1975; Allison 1982).
|
|
257
|
+
|
|
258
|
+
**Survival.** The conditional survival function follows by the product-limit
|
|
259
|
+
relation `S(t | x) = prod_{s <= t} (1 - lambda(s | x))`, returned for new
|
|
260
|
+
subjects when `predict=` is supplied.
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
from hapc import hazard_hapc
|
|
264
|
+
import numpy as np
|
|
265
|
+
|
|
266
|
+
# X: baseline covariates (n, p); T: observed times; Delta: 0/1 event indicator
|
|
267
|
+
fit = hazard_hapc(X, T, Delta, norm="1", max_degree=2, time_grid=np.arange(1, 7))
|
|
268
|
+
fit.hazard # estimated hazard per person-period row (CV predictions)
|
|
269
|
+
fit.best_lambda, fit.interior # CV-selected lambda; is it interior to the grid?
|
|
270
|
+
|
|
271
|
+
# survival curves S(t|x) for new subjects
|
|
272
|
+
fit = hazard_hapc(X, T, Delta, norm="1", predict=X_new)
|
|
273
|
+
fit.predict_survival # (m, K) survival probabilities over the grid
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
```r
|
|
277
|
+
library(hapc)
|
|
278
|
+
# equivalent to cv.hapc(X, T, family = "logit-hazard", Delta = Delta, norm = "1")
|
|
279
|
+
fit <- hazard.hapc(X, T, Delta, norm = "1", max_degree = 2, time_grid = 1:6)
|
|
280
|
+
fit$hazard; fit$best_lambda; fit$interior
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
`norm` must be `"1"` (logistic LASSO) or `"2"` (logistic ridge); `norm = "sv"`
|
|
284
|
+
is **not implemented** for this family and is flagged.
|
|
285
|
+
|
|
286
|
+
**Returns** (Python `HazardResult` / R `hapc_hazard`):
|
|
287
|
+
|
|
288
|
+
- `hazard` — cross-validated discrete hazard for each person-period row
|
|
289
|
+
- `lambdas`, `risk`, `best_lambda` — CV grid, mean logistic deviance, selected λ
|
|
290
|
+
- `interior` — whether `best_lambda` is strictly inside the grid (sanity check)
|
|
291
|
+
- `time_grid`, `ids`/`id`, `Y` — the discrete grid and person-period bookkeeping
|
|
292
|
+
- `predict_hazard`, `predict_survival` — hazard surface and survival curves for
|
|
293
|
+
new subjects (only when `predict=` is given)
|
|
294
|
+
- `cv` — the underlying cross-validation result
|
|
295
|
+
|
|
296
|
+
Worked end-to-end examples (five hazard data-generating processes, with
|
|
297
|
+
true-vs-estimated hazard scatters and CV risk-vs-λ curves verifying an interior
|
|
298
|
+
optimum) are in
|
|
299
|
+
[`examples/hazard_logit_hazard_examples.R`](examples/hazard_logit_hazard_examples.R)
|
|
300
|
+
and
|
|
301
|
+
[`examples/hazard_logit_hazard_examples.py`](examples/hazard_logit_hazard_examples.py).
|
|
302
|
+
|
|
303
|
+
**References.** Cox (1972, *JRSS B*); Brown (1975, *Biometrics*); Allison (1982,
|
|
304
|
+
*Sociological Methodology*); Singer & Willett (2003, *Applied Longitudinal Data
|
|
305
|
+
Analysis*); Benkeser & van der Laan (2016, *IEEE DSAA*).
|
|
306
|
+
|
|
307
|
+
## API Reference
|
|
308
|
+
|
|
309
|
+
### `hapc.single.single_pcghal()`
|
|
310
|
+
|
|
311
|
+
Fit PC-GHAL with a single lambda value.
|
|
312
|
+
|
|
313
|
+
**Parameters:**
|
|
314
|
+
- `X` (ndarray, shape (n, p)): Input features
|
|
315
|
+
- `Y` (ndarray, shape (n,)): Response variable
|
|
316
|
+
- `maxdeg` (int): Maximum degree of interactions
|
|
317
|
+
- `npc` (int): Number of principal components
|
|
318
|
+
- `single_lambda` (float): Regularization parameter
|
|
319
|
+
- `max_iter` (int, default=100): Maximum iterations
|
|
320
|
+
- `tol` (float, default=1e-6): Convergence tolerance
|
|
321
|
+
- `verbose` (bool, default=False): Print progress
|
|
322
|
+
- `predict` (ndarray, optional): Test data for predictions
|
|
323
|
+
- `center` (bool, default=True): Center the design matrix
|
|
324
|
+
|
|
325
|
+
**Returns:**
|
|
326
|
+
- `result.optimizer_output.alpha`: Coefficients
|
|
327
|
+
- `result.optimizer_output.risk`: Final risk
|
|
328
|
+
- `result.optimizer_output.iter`: Iterations until convergence
|
|
329
|
+
- `result.predictions`: Predictions on test data (if provided)
|
|
330
|
+
|
|
331
|
+
### `hapc.cv.pcghal_cv()`
|
|
332
|
+
|
|
333
|
+
Cross-validation to select lambda.
|
|
334
|
+
|
|
335
|
+
**Parameters:**
|
|
336
|
+
- `lambdas` (ndarray): Grid of lambda values to test
|
|
337
|
+
- `nfolds` (int, default=5): Number of CV folds
|
|
338
|
+
- ...other parameters same as `single_pcghal`
|
|
339
|
+
|
|
340
|
+
**Returns:**
|
|
341
|
+
- `cv_result.best_lambda`: Optimal lambda
|
|
342
|
+
- `cv_result.mses`: CV errors for each lambda
|
|
343
|
+
- `cv_result.best_model`: Fitted model with best lambda
|
|
344
|
+
- `cv_result.predictions`: Predictions on test data (if provided)
|
|
345
|
+
|
|
346
|
+
## Contributing
|
|
347
|
+
|
|
348
|
+
Contributions welcome! The C++ core is shared between R and Python packages.
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
git clone https://github.com/meixide/hapc.git
|
|
352
|
+
cd hapc
|
|
353
|
+
pip install -e .
|
|
354
|
+
pytest
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
## License
|
|
358
|
+
|
|
359
|
+
MIT License - see LICENSE file
|
hapc-2.5.0/README.md
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
# HAPC: Highly Adaptive Principal Components
|
|
2
|
+
|
|
3
|
+
A fast and flexible machine learning library for nonparametric high-dimensional regression and classification with guarantees.
|
|
4
|
+
|
|
5
|
+
## Documentation
|
|
6
|
+
|
|
7
|
+
- **Python API** (rendered from docstrings): https://hapc.readthedocs.io —
|
|
8
|
+
configured via [`.readthedocs.yaml`](.readthedocs.yaml) and
|
|
9
|
+
[`docs/`](docs/) (Sphinx + autodoc). Build locally with
|
|
10
|
+
`pip install -e ".[docs]" && sphinx-build -b html docs docs/_build/html`.
|
|
11
|
+
- **R API** (rendered from roxygen): a [pkgdown](https://pkgdown.r-lib.org)
|
|
12
|
+
site built by [`.github/workflows/pkgdown.yaml`](.github/workflows/pkgdown.yaml)
|
|
13
|
+
(config in [`_pkgdown.yml`](_pkgdown.yml)). Build locally with
|
|
14
|
+
`Rscript -e 'pkgdown::build_site()'`.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
### Prerequisites
|
|
19
|
+
|
|
20
|
+
- Python 3.8+
|
|
21
|
+
- C++ compiler (g++, clang, or MSVC)
|
|
22
|
+
- CMake 3.18+
|
|
23
|
+
- Eigen3
|
|
24
|
+
|
|
25
|
+
### Quick Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install hapc
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
|
|
32
|
+
(Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
|
|
33
|
+
CMake or Eigen is needed when a wheel is available.
|
|
34
|
+
|
|
35
|
+
### Linux / HPC clusters
|
|
36
|
+
|
|
37
|
+
The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
|
|
38
|
+
`pip install hapc` works out of the box on HPC login/compute nodes —
|
|
39
|
+
no `conda` toolchain, `devtoolset`, or sysroot setup required:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install hapc
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
If you must build from the source distribution (niche architecture, very
|
|
46
|
+
old Python, or an air-gapped node), provide a C++17 compiler and either
|
|
47
|
+
let CMake fetch Eigen automatically (needs network) or install Eigen and
|
|
48
|
+
let `find_package(Eigen3)` find it:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# with conda compilers (recommended on HPC)
|
|
52
|
+
conda install -c conda-forge cxx-compiler cmake eigen
|
|
53
|
+
pip install hapc --no-binary hapc
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Install from GitHub (latest development version)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install git+https://github.com/meixide/hapc.git
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Or with editable install for development:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
git clone https://github.com/meixide/hapc.git
|
|
66
|
+
cd hapc
|
|
67
|
+
pip install -e .
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Install build dependencies
|
|
71
|
+
|
|
72
|
+
If installation fails, you may need to install build dependencies:
|
|
73
|
+
|
|
74
|
+
**macOS:**
|
|
75
|
+
```bash
|
|
76
|
+
brew install cmake eigen
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Ubuntu/Debian:**
|
|
80
|
+
```bash
|
|
81
|
+
sudo apt-get install cmake libeigen3-dev build-essential
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Windows:**
|
|
85
|
+
```bash
|
|
86
|
+
pip install cmake
|
|
87
|
+
# Install Visual Studio Build Tools or use conda
|
|
88
|
+
conda install -c conda-forge eigen
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Quick Start
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
import numpy as np
|
|
95
|
+
from hapc.single import single_pcghal
|
|
96
|
+
from hapc.cv import pcghal_cv
|
|
97
|
+
|
|
98
|
+
# Generate sample data
|
|
99
|
+
X = np.random.randn(100, 5)
|
|
100
|
+
Y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(100) * 0.1
|
|
101
|
+
|
|
102
|
+
# Single fit with fixed lambda
|
|
103
|
+
result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01)
|
|
104
|
+
print(f"Risk: {result.optimizer_output.risk:.6f}")
|
|
105
|
+
|
|
106
|
+
# Cross-validation to select lambda
|
|
107
|
+
lambdas = np.logspace(-4, 0, 10)
|
|
108
|
+
cv_result = pcghal_cv(X, Y, maxdeg=2, npc=5, lambdas=lambdas, nfolds=5)
|
|
109
|
+
print(f"Best lambda: {cv_result.best_lambda:.6f}")
|
|
110
|
+
|
|
111
|
+
# Make predictions
|
|
112
|
+
X_test = np.random.randn(20, 5)
|
|
113
|
+
result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01, predict=X_test)
|
|
114
|
+
print(f"Predictions: {result.predictions}")
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Usage
|
|
118
|
+
|
|
119
|
+
### Regression
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from hapc.single import single_pcghal
|
|
123
|
+
|
|
124
|
+
result = single_pcghal(
|
|
125
|
+
X, Y,
|
|
126
|
+
maxdeg=2, # Maximum degree of interactions
|
|
127
|
+
npc=10, # Number of principal components
|
|
128
|
+
single_lambda=0.01,
|
|
129
|
+
predict=X_test # Optional: test data for predictions
|
|
130
|
+
)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Classification
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from hapc.single import single_pcghal
|
|
137
|
+
|
|
138
|
+
result = single_pcghal(
|
|
139
|
+
X, Y_binary,
|
|
140
|
+
maxdeg=2,
|
|
141
|
+
npc=10,
|
|
142
|
+
single_lambda=0.01,
|
|
143
|
+
predict=X_test
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Cross-Validation
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from hapc.cv import pcghal_cv
|
|
151
|
+
|
|
152
|
+
cv_result = pcghal_cv(
|
|
153
|
+
X, Y,
|
|
154
|
+
maxdeg=2,
|
|
155
|
+
npc=10,
|
|
156
|
+
lambdas=np.logspace(-4, 0, 20),
|
|
157
|
+
nfolds=5
|
|
158
|
+
)
|
|
159
|
+
print(cv_result.best_lambda)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Average Treatment Effect (ATE)
|
|
163
|
+
|
|
164
|
+
Estimate the ATE `E[Y(1)] − E[Y(0)]` with HAPC nuisance models and a
|
|
165
|
+
doubly-robust (AIPW) efficient influence function. `ate_hapc` returns a point
|
|
166
|
+
estimate and a `(1 − alpha)` Wald confidence interval.
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from hapc import ate_hapc
|
|
170
|
+
|
|
171
|
+
# W: covariates (n, p); A: binary treatment in {0,1} or {-1,+1}; Y: outcome
|
|
172
|
+
res = ate_hapc(W, Y, A, alpha=0.05, method="undersmooth")
|
|
173
|
+
print(res.estimate, res.lower, res.upper)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Two bias-control strategies are available through `method`:
|
|
177
|
+
|
|
178
|
+
- **`method="undersmooth"`** (default) — single-sample estimator. The outcome
|
|
179
|
+
model is undersmoothed (λ pushed below the CV-optimal value) until the
|
|
180
|
+
empirical mean of the influence function is within `σ / (√n · log n)` of zero. This requires the
|
|
181
|
+
**full PC basis** (`npcs = n`, the default) and a λ grid that reaches small λ
|
|
182
|
+
(default: `log_lambda_out_min = -10`); otherwise the gate never reaches the
|
|
183
|
+
low-bias regime and `ate_hapc` emits a warning. Pass
|
|
184
|
+
`report_undersmoothing=True` to print the `|mean(EIF)|`-vs-λ path.
|
|
185
|
+
- **`method="crossfit"`** — DML-style K-fold cross-fitting (`cf_folds`, default
|
|
186
|
+
5, stratified by treatment). Both nuisances are fit on the training folds and
|
|
187
|
+
the influence function is evaluated out-of-fold, giving honest point estimates
|
|
188
|
+
and coverage without undersmoothing. Recommended under good overlap.
|
|
189
|
+
|
|
190
|
+
### Discrete-time survival (`family = "logit-hazard"`)
|
|
191
|
+
|
|
192
|
+
Fit a discrete-time **logistic hazard** model with HAPC. You supply only the
|
|
193
|
+
observed right-censored data — baseline covariates `X`, the observed time
|
|
194
|
+
`T = min(T_event, C)`, and the event indicator `Delta = 1(T_event <= C)` — and
|
|
195
|
+
the wrapper performs the person-period expansion (one row per
|
|
196
|
+
subject-per-interval-at-risk, hazard label = 1 at the event interval), prepends
|
|
197
|
+
the visit time as the first HAL covariate, and cross-validates the binomial fit.
|
|
198
|
+
|
|
199
|
+
**Model.** The discrete hazard is the conditional event probability in interval
|
|
200
|
+
`t` given survival up to `t`, modelled on the logit scale by a HAPC fit `f` of
|
|
201
|
+
the augmented covariate `(t, x)`:
|
|
202
|
+
|
|
203
|
+
```text
|
|
204
|
+
lambda(t | x) = P(T_event = t | T_event >= t, X = x)
|
|
205
|
+
logit lambda(t | x) = f(t, x)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
**Person-period likelihood.** Under independent right-censoring the observed-data
|
|
209
|
+
likelihood factorises over the at-risk intervals,
|
|
210
|
+
|
|
211
|
+
```text
|
|
212
|
+
prod_i prod_{t <= T_i} lambda(t|x_i)^Y_it * (1 - lambda(t|x_i))^(1 - Y_it),
|
|
213
|
+
with Y_it = 1(T_event_i = t),
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
which is exactly the Bernoulli (logistic) likelihood of the expanded
|
|
217
|
+
person-period table — so a binomial HAPC fit of `Y_it` on `(t, x_i)` estimates
|
|
218
|
+
the discrete hazard (Cox 1972; Brown 1975; Allison 1982).
|
|
219
|
+
|
|
220
|
+
**Survival.** The conditional survival function follows by the product-limit
|
|
221
|
+
relation `S(t | x) = prod_{s <= t} (1 - lambda(s | x))`, returned for new
|
|
222
|
+
subjects when `predict=` is supplied.
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
from hapc import hazard_hapc
|
|
226
|
+
import numpy as np
|
|
227
|
+
|
|
228
|
+
# X: baseline covariates (n, p); T: observed times; Delta: 0/1 event indicator
|
|
229
|
+
fit = hazard_hapc(X, T, Delta, norm="1", max_degree=2, time_grid=np.arange(1, 7))
|
|
230
|
+
fit.hazard # estimated hazard per person-period row (CV predictions)
|
|
231
|
+
fit.best_lambda, fit.interior # CV-selected lambda; is it interior to the grid?
|
|
232
|
+
|
|
233
|
+
# survival curves S(t|x) for new subjects
|
|
234
|
+
fit = hazard_hapc(X, T, Delta, norm="1", predict=X_new)
|
|
235
|
+
fit.predict_survival # (m, K) survival probabilities over the grid
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
```r
|
|
239
|
+
library(hapc)
|
|
240
|
+
# equivalent to cv.hapc(X, T, family = "logit-hazard", Delta = Delta, norm = "1")
|
|
241
|
+
fit <- hazard.hapc(X, T, Delta, norm = "1", max_degree = 2, time_grid = 1:6)
|
|
242
|
+
fit$hazard; fit$best_lambda; fit$interior
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
`norm` must be `"1"` (logistic LASSO) or `"2"` (logistic ridge); `norm = "sv"`
|
|
246
|
+
is **not implemented** for this family and is flagged.
|
|
247
|
+
|
|
248
|
+
**Returns** (Python `HazardResult` / R `hapc_hazard`):
|
|
249
|
+
|
|
250
|
+
- `hazard` — cross-validated discrete hazard for each person-period row
|
|
251
|
+
- `lambdas`, `risk`, `best_lambda` — CV grid, mean logistic deviance, selected λ
|
|
252
|
+
- `interior` — whether `best_lambda` is strictly inside the grid (sanity check)
|
|
253
|
+
- `time_grid`, `ids`/`id`, `Y` — the discrete grid and person-period bookkeeping
|
|
254
|
+
- `predict_hazard`, `predict_survival` — hazard surface and survival curves for
|
|
255
|
+
new subjects (only when `predict=` is given)
|
|
256
|
+
- `cv` — the underlying cross-validation result
|
|
257
|
+
|
|
258
|
+
Worked end-to-end examples (five hazard data-generating processes, with
|
|
259
|
+
true-vs-estimated hazard scatters and CV risk-vs-λ curves verifying an interior
|
|
260
|
+
optimum) are in
|
|
261
|
+
[`examples/hazard_logit_hazard_examples.R`](examples/hazard_logit_hazard_examples.R)
|
|
262
|
+
and
|
|
263
|
+
[`examples/hazard_logit_hazard_examples.py`](examples/hazard_logit_hazard_examples.py).
|
|
264
|
+
|
|
265
|
+
**References.** Cox (1972, *JRSS B*); Brown (1975, *Biometrics*); Allison (1982,
|
|
266
|
+
*Sociological Methodology*); Singer & Willett (2003, *Applied Longitudinal Data
|
|
267
|
+
Analysis*); Benkeser & van der Laan (2016, *IEEE DSAA*).
|
|
268
|
+
|
|
269
|
+
## API Reference
|
|
270
|
+
|
|
271
|
+
### `hapc.single.single_pcghal()`
|
|
272
|
+
|
|
273
|
+
Fit PC-GHAL with a single lambda value.
|
|
274
|
+
|
|
275
|
+
**Parameters:**
|
|
276
|
+
- `X` (ndarray, shape (n, p)): Input features
|
|
277
|
+
- `Y` (ndarray, shape (n,)): Response variable
|
|
278
|
+
- `maxdeg` (int): Maximum degree of interactions
|
|
279
|
+
- `npc` (int): Number of principal components
|
|
280
|
+
- `single_lambda` (float): Regularization parameter
|
|
281
|
+
- `max_iter` (int, default=100): Maximum iterations
|
|
282
|
+
- `tol` (float, default=1e-6): Convergence tolerance
|
|
283
|
+
- `verbose` (bool, default=False): Print progress
|
|
284
|
+
- `predict` (ndarray, optional): Test data for predictions
|
|
285
|
+
- `center` (bool, default=True): Center the design matrix
|
|
286
|
+
|
|
287
|
+
**Returns:**
|
|
288
|
+
- `result.optimizer_output.alpha`: Coefficients
|
|
289
|
+
- `result.optimizer_output.risk`: Final risk
|
|
290
|
+
- `result.optimizer_output.iter`: Iterations until convergence
|
|
291
|
+
- `result.predictions`: Predictions on test data (if provided)
|
|
292
|
+
|
|
293
|
+
### `hapc.cv.pcghal_cv()`
|
|
294
|
+
|
|
295
|
+
Cross-validation to select lambda.
|
|
296
|
+
|
|
297
|
+
**Parameters:**
|
|
298
|
+
- `lambdas` (ndarray): Grid of lambda values to test
|
|
299
|
+
- `nfolds` (int, default=5): Number of CV folds
|
|
300
|
+
- ...other parameters same as `single_pcghal`
|
|
301
|
+
|
|
302
|
+
**Returns:**
|
|
303
|
+
- `cv_result.best_lambda`: Optimal lambda
|
|
304
|
+
- `cv_result.mses`: CV errors for each lambda
|
|
305
|
+
- `cv_result.best_model`: Fitted model with best lambda
|
|
306
|
+
- `cv_result.predictions`: Predictions on test data (if provided)
|
|
307
|
+
|
|
308
|
+
## Contributing
|
|
309
|
+
|
|
310
|
+
Contributions welcome! The C++ core is shared between R and Python packages.
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
git clone https://github.com/meixide/hapc.git
|
|
314
|
+
cd hapc
|
|
315
|
+
pip install -e .
|
|
316
|
+
pytest
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
## License
|
|
320
|
+
|
|
321
|
+
MIT License - see LICENSE file
|