hapc 2.3.0__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {hapc-2.3.0 → hapc-2.5.0}/CMakeLists.txt +8 -2
  2. hapc-2.5.0/PKG-INFO +359 -0
  3. hapc-2.5.0/README.md +321 -0
  4. hapc-2.5.0/pyproject.toml +71 -0
  5. {hapc-2.3.0 → hapc-2.5.0}/python/hapc/__init__.py +10 -3
  6. hapc-2.5.0/python/hapc/ate.py +735 -0
  7. {hapc-2.3.0 → hapc-2.5.0}/python/hapc/cv.py +37 -4
  8. hapc-2.5.0/python/hapc/hazard.py +351 -0
  9. {hapc-2.3.0 → hapc-2.5.0}/python/hapc/single.py +14 -5
  10. hapc-2.5.0/python/hapc.egg-info/PKG-INFO +359 -0
  11. {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/SOURCES.txt +1 -0
  12. {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/requires.txt +5 -0
  13. {hapc-2.3.0 → hapc-2.5.0}/setup.py +10 -0
  14. {hapc-2.3.0 → hapc-2.5.0}/tests/test_ate.py +56 -0
  15. {hapc-2.3.0 → hapc-2.5.0}/tests/test_ate_hapc_diagnostics_example.py +8 -4
  16. hapc-2.3.0/PKG-INFO +0 -212
  17. hapc-2.3.0/README.md +0 -178
  18. hapc-2.3.0/pyproject.toml +0 -36
  19. hapc-2.3.0/python/hapc/ate.py +0 -425
  20. hapc-2.3.0/python/hapc.egg-info/PKG-INFO +0 -212
  21. {hapc-2.3.0 → hapc-2.5.0}/LICENSE +0 -0
  22. {hapc-2.3.0 → hapc-2.5.0}/MANIFEST.in +0 -0
  23. {hapc-2.3.0 → hapc-2.5.0}/python/hapc/core.py +0 -0
  24. {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/dependency_links.txt +0 -0
  25. {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/not-zip-safe +0 -0
  26. {hapc-2.3.0 → hapc-2.5.0}/python/hapc.egg-info/top_level.txt +0 -0
  27. {hapc-2.3.0 → hapc-2.5.0}/setup.cfg +0 -0
  28. {hapc-2.3.0 → hapc-2.5.0}/src/bindings.cpp +0 -0
  29. {hapc-2.3.0 → hapc-2.5.0}/src/cross_kernel.cpp +0 -0
  30. {hapc-2.3.0 → hapc-2.5.0}/src/cv_classi.cpp +0 -0
  31. {hapc-2.3.0 → hapc-2.5.0}/src/cv_fast_pchal.cpp +0 -0
  32. {hapc-2.3.0 → hapc-2.5.0}/src/cv_fast_pchal_python.cpp +0 -0
  33. {hapc-2.3.0 → hapc-2.5.0}/src/fast_pchal.cpp +0 -0
  34. {hapc-2.3.0 → hapc-2.5.0}/src/hapc_core.hpp +0 -0
  35. {hapc-2.3.0 → hapc-2.5.0}/src/logistic_call.cpp +0 -0
  36. {hapc-2.3.0 → hapc-2.5.0}/src/mkernel.cpp +0 -0
  37. {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_call.cpp +0 -0
  38. {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_classi_call.cpp +0 -0
  39. {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_cv.cpp +0 -0
  40. {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_cv_classi_cpp.cpp +0 -0
  41. {hapc-2.3.0 → hapc-2.5.0}/src/pcghal_cv_cpp.cpp +0 -0
  42. {hapc-2.3.0 → hapc-2.5.0}/src/pchal_design.cpp +0 -0
  43. {hapc-2.3.0 → hapc-2.5.0}/src/r_bindings.cpp +0 -0
  44. {hapc-2.3.0 → hapc-2.5.0}/src/ridge_wrappers.cpp +0 -0
  45. {hapc-2.3.0 → hapc-2.5.0}/src/single_pcghal_cpp.cpp +0 -0
  46. {hapc-2.3.0 → hapc-2.5.0}/src/single_pchar.cpp +0 -0
  47. {hapc-2.3.0 → hapc-2.5.0}/tests/test_api.py +0 -0
  48. {hapc-2.3.0 → hapc-2.5.0}/tests/test_core.py +0 -0
  49. {hapc-2.3.0 → hapc-2.5.0}/tests/test_logistic_regression.py +0 -0
  50. {hapc-2.3.0 → hapc-2.5.0}/tests/test_r_vs_python_alpha.py +0 -0
@@ -1,4 +1,4 @@
1
- cmake_minimum_required(VERSION 3.15)
1
+ cmake_minimum_required(VERSION 3.18)
2
2
  project(hapc)
3
3
 
4
4
  set(CMAKE_CXX_STANDARD 17)
@@ -15,7 +15,13 @@ endif()
15
15
  # Python3_EXECUTABLE from setup.py so the build always targets the *same*
16
16
  # interpreter that pip is using. Without this CMake may discover a newer/
17
17
  # older system Python and produce a .so tagged for the wrong ABI.
18
- find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
18
+ #
19
+ # Use Development.Module (headers only), NOT the full Development component:
20
+ # the latter also requires Development.Embed -> libpython, which manylinux
21
+ # images deliberately do not ship (extension modules must not link libpython).
22
+ # Requiring full Development makes the manylinux build fail with
23
+ # "Could NOT find Python3 (missing: Python3_LIBRARIES Development.Embed)".
24
+ find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
19
25
  message(STATUS "Python3_EXECUTABLE: ${Python3_EXECUTABLE}")
20
26
  message(STATUS "Python3_VERSION: ${Python3_VERSION}")
21
27
 
hapc-2.5.0/PKG-INFO ADDED
@@ -0,0 +1,359 @@
1
+ Metadata-Version: 2.4
2
+ Name: hapc
3
+ Version: 2.5.0
4
+ Summary: Highly Adaptive Principal Components
5
+ Home-page: https://github.com/meixide/hapc
6
+ Author: Carlos García Meixide
7
+ Author-email: Carlos García Meixide <cgmeixide@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/meixide/hapc
10
+ Project-URL: Documentation, https://hapc.readthedocs.io
11
+ Project-URL: Repository, https://github.com/meixide/hapc.git
12
+ Project-URL: Issues, https://github.com/meixide/hapc/issues
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy<2.3,>=1.24
24
+ Requires-Dist: scikit-learn>=1.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest; extra == "dev"
27
+ Requires-Dist: pytest-cov; extra == "dev"
28
+ Requires-Dist: black; extra == "dev"
29
+ Requires-Dist: flake8; extra == "dev"
30
+ Provides-Extra: docs
31
+ Requires-Dist: sphinx>=7; extra == "docs"
32
+ Requires-Dist: furo; extra == "docs"
33
+ Requires-Dist: myst-parser; extra == "docs"
34
+ Dynamic: author
35
+ Dynamic: home-page
36
+ Dynamic: license-file
37
+ Dynamic: requires-python
38
+
39
+ # HAPC: Highly Adaptive Principal Components
40
+
41
+ A fast and flexible machine learning library for nonparametric high-dimensional regression and classification with guarantees.
42
+
43
+ ## Documentation
44
+
45
+ - **Python API** (rendered from docstrings): https://hapc.readthedocs.io —
46
+ configured via [`.readthedocs.yaml`](.readthedocs.yaml) and
47
+ [`docs/`](docs/) (Sphinx + autodoc). Build locally with
48
+ `pip install -e ".[docs]" && sphinx-build -b html docs docs/_build/html`.
49
+ - **R API** (rendered from roxygen): a [pkgdown](https://pkgdown.r-lib.org)
50
+ site built by [`.github/workflows/pkgdown.yaml`](.github/workflows/pkgdown.yaml)
51
+ (config in [`_pkgdown.yml`](_pkgdown.yml)). Build locally with
52
+ `Rscript -e 'pkgdown::build_site()'`.
53
+
54
+ ## Installation
55
+
56
+ ### Prerequisites
57
+
58
+ - Python 3.8+
59
+ - C++ compiler (g++, clang, or MSVC)
60
+ - CMake 3.18+
61
+ - Eigen3
62
+
63
+ ### Quick Install
64
+
65
+ ```bash
66
+ pip install hapc
67
+ ```
68
+
69
+ Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
70
+ (Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
71
+ CMake or Eigen is needed when a wheel is available.
72
+
73
+ ### Linux / HPC clusters
74
+
75
+ The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
76
+ `pip install hapc` works out of the box on HPC login/compute nodes —
77
+ no `conda` toolchain, `devtoolset`, or sysroot setup required:
78
+
79
+ ```bash
80
+ pip install hapc
81
+ ```
82
+
83
+ If you must build from the source distribution (niche architecture, very
84
+ old Python, or an air-gapped node), provide a C++17 compiler and either
85
+ let CMake fetch Eigen automatically (needs network) or install Eigen and
86
+ let `find_package(Eigen3)` find it:
87
+
88
+ ```bash
89
+ # with conda compilers (recommended on HPC)
90
+ conda install -c conda-forge cxx-compiler cmake eigen
91
+ pip install hapc --no-binary hapc
92
+ ```
93
+
94
+ ### Install from GitHub (latest development version)
95
+
96
+ ```bash
97
+ pip install git+https://github.com/meixide/hapc.git
98
+ ```
99
+
100
+ Or with editable install for development:
101
+
102
+ ```bash
103
+ git clone https://github.com/meixide/hapc.git
104
+ cd hapc
105
+ pip install -e .
106
+ ```
107
+
108
+ ### Install build dependencies
109
+
110
+ If installation fails, you may need to install build dependencies:
111
+
112
+ **macOS:**
113
+ ```bash
114
+ brew install cmake eigen
115
+ ```
116
+
117
+ **Ubuntu/Debian:**
118
+ ```bash
119
+ sudo apt-get install cmake libeigen3-dev build-essential
120
+ ```
121
+
122
+ **Windows:**
123
+ ```bash
124
+ pip install cmake
125
+ # Install Visual Studio Build Tools or use conda
126
+ conda install -c conda-forge eigen
127
+ ```
128
+
129
+ ## Quick Start
130
+
131
+ ```python
132
+ import numpy as np
133
+ from hapc.single import single_pcghal
134
+ from hapc.cv import pcghal_cv
135
+
136
+ # Generate sample data
137
+ X = np.random.randn(100, 5)
138
+ Y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(100) * 0.1
139
+
140
+ # Single fit with fixed lambda
141
+ result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01)
142
+ print(f"Risk: {result.optimizer_output.risk:.6f}")
143
+
144
+ # Cross-validation to select lambda
145
+ lambdas = np.logspace(-4, 0, 10)
146
+ cv_result = pcghal_cv(X, Y, maxdeg=2, npc=5, lambdas=lambdas, nfolds=5)
147
+ print(f"Best lambda: {cv_result.best_lambda:.6f}")
148
+
149
+ # Make predictions
150
+ X_test = np.random.randn(20, 5)
151
+ result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01, predict=X_test)
152
+ print(f"Predictions: {result.predictions}")
153
+ ```
154
+
155
+ ## Usage
156
+
157
+ ### Regression
158
+
159
+ ```python
160
+ from hapc.single import single_pcghal
161
+
162
+ result = single_pcghal(
163
+ X, Y,
164
+ maxdeg=2, # Maximum degree of interactions
165
+ npc=10, # Number of principal components
166
+ single_lambda=0.01,
167
+ predict=X_test # Optional: test data for predictions
168
+ )
169
+ ```
170
+
171
+ ### Classification
172
+
173
+ ```python
174
+ from hapc.single import single_pcghal
175
+
176
+ result = single_pcghal(
177
+ X, Y_binary,
178
+ maxdeg=2,
179
+ npc=10,
180
+ single_lambda=0.01,
181
+ predict=X_test
182
+ )
183
+ ```
184
+
185
+ ### Cross-Validation
186
+
187
+ ```python
188
+ from hapc.cv import pcghal_cv
189
+
190
+ cv_result = pcghal_cv(
191
+ X, Y,
192
+ maxdeg=2,
193
+ npc=10,
194
+ lambdas=np.logspace(-4, 0, 20),
195
+ nfolds=5
196
+ )
197
+ print(cv_result.best_lambda)
198
+ ```
199
+
200
+ ### Average Treatment Effect (ATE)
201
+
202
+ Estimate the ATE `E[Y(1)] − E[Y(0)]` with HAPC nuisance models and a
203
+ doubly-robust (AIPW) efficient influence function. `ate_hapc` returns a point
204
+ estimate and a `(1 − alpha)` Wald confidence interval.
205
+
206
+ ```python
207
+ from hapc import ate_hapc
208
+
209
+ # W: covariates (n, p); A: binary treatment in {0,1} or {-1,+1}; Y: outcome
210
+ res = ate_hapc(W, Y, A, alpha=0.05, method="undersmooth")
211
+ print(res.estimate, res.lower, res.upper)
212
+ ```
213
+
214
+ Two bias-control strategies are available through `method`:
215
+
216
+ - **`method="undersmooth"`** (default) — single-sample estimator. The outcome
217
+ model is undersmoothed (λ pushed below the CV-optimal value) until the
218
+ absolute empirical mean of the influence function is within `σ / (√n · log n)`. This requires the
219
+ **full PC basis** (`npcs = n`, the default) and a λ grid that reaches small λ
220
+ (default `log_lambda_out_min = -10`); otherwise the gate never reaches the
221
+ low-bias regime and `ate_hapc` emits a warning. Pass
222
+ `report_undersmoothing=True` to print the `|mean(EIF)|`-vs-λ path.
223
+ - **`method="crossfit"`** — DML-style K-fold cross-fitting (`cf_folds`, default
224
+ 5, stratified by treatment). Both nuisances are fit on the training folds and
225
+ the influence function is evaluated out-of-fold, giving honest point estimates
226
+ and coverage without undersmoothing. Recommended under good overlap.
227
+
228
+ ### Discrete-time survival (`family = "logit-hazard"`)
229
+
230
+ Fit a discrete-time **logistic hazard** model with HAPC. You supply only the
231
+ observed right-censored data — baseline covariates `X`, the observed time
232
+ `T = min(T_event, C)`, and the event indicator `Delta = 1(T_event <= C)` — and
233
+ the wrapper performs the person-period expansion (one row per
234
+ subject-per-interval-at-risk, hazard label = 1 at the event interval), prepends
235
+ the visit time as the first HAL covariate, and cross-validates the binomial fit.
236
+
237
+ **Model.** The discrete hazard is the conditional event probability in interval
238
+ `t` given survival up to `t`, modelled on the logit scale by a HAPC fit `f` of
239
+ the augmented covariate `(t, x)`:
240
+
241
+ ```text
242
+ lambda(t | x) = P(T_event = t | T_event >= t, X = x)
243
+ logit lambda(t | x) = f(t, x)
244
+ ```
245
+
246
+ **Person-period likelihood.** Under independent right-censoring the observed-data
247
+ likelihood factorises over the at-risk intervals,
248
+
249
+ ```text
250
+ prod_i prod_{t <= T_i} lambda(t|x_i)^Y_it * (1 - lambda(t|x_i))^(1 - Y_it),
251
+ with Y_it = 1(T_event_i = t),
252
+ ```
253
+
254
+ which is exactly the Bernoulli (logistic) likelihood of the expanded
255
+ person-period table — so a binomial HAPC fit of `Y_it` on `(t, x_i)` estimates
256
+ the discrete hazard (Cox 1972; Brown 1975; Allison 1982).
257
+
258
+ **Survival.** The conditional survival function follows by the product-limit
259
+ relation `S(t | x) = prod_{s <= t} (1 - lambda(s | x))`, returned for new
260
+ subjects when `predict=` is supplied.
261
+
262
+ ```python
263
+ from hapc import hazard_hapc
264
+ import numpy as np
265
+
266
+ # X: baseline covariates (n, p); T: observed times; Delta: 0/1 event indicator
267
+ fit = hazard_hapc(X, T, Delta, norm="1", max_degree=2, time_grid=np.arange(1, 7))
268
+ fit.hazard # estimated hazard per person-period row (CV predictions)
269
+ fit.best_lambda, fit.interior # CV-selected lambda; is it interior to the grid?
270
+
271
+ # survival curves S(t|x) for new subjects
272
+ fit = hazard_hapc(X, T, Delta, norm="1", predict=X_new)
273
+ fit.predict_survival # (m, K) survival probabilities over the grid
274
+ ```
275
+
276
+ ```r
277
+ library(hapc)
278
+ # equivalent to cv.hapc(X, T, family = "logit-hazard", Delta = Delta, norm = "1")
279
+ fit <- hazard.hapc(X, T, Delta, norm = "1", max_degree = 2, time_grid = 1:6)
280
+ fit$hazard; fit$best_lambda; fit$interior
281
+ ```
282
+
283
+ `norm` must be `"1"` (logistic LASSO) or `"2"` (logistic ridge); `norm = "sv"`
284
+ is **not implemented** for this family and is flagged.
285
+
286
+ **Returns** (Python `HazardResult` / R `hapc_hazard`):
287
+
288
+ - `hazard` — cross-validated discrete hazard for each person-period row
289
+ - `lambdas`, `risk`, `best_lambda` — CV grid, mean logistic deviance, selected λ
290
+ - `interior` — whether `best_lambda` is strictly inside the grid (sanity check)
291
+ - `time_grid`, `ids`/`id`, `Y` — the discrete grid and person-period bookkeeping
292
+ - `predict_hazard`, `predict_survival` — hazard surface and survival curves for
293
+ new subjects (only when `predict=` is given)
294
+ - `cv` — the underlying cross-validation result
295
+
296
+ Worked end-to-end examples (five hazard data-generating processes, with
297
+ true-vs-estimated hazard scatters and CV risk-vs-λ curves verifying an interior
298
+ optimum) are in
299
+ [`examples/hazard_logit_hazard_examples.R`](examples/hazard_logit_hazard_examples.R)
300
+ and
301
+ [`examples/hazard_logit_hazard_examples.py`](examples/hazard_logit_hazard_examples.py).
302
+
303
+ **References.** Cox (1972, *JRSS B*); Brown (1975, *Biometrics*); Allison (1982,
304
+ *Sociological Methodology*); Singer & Willett (2003, *Applied Longitudinal Data
305
+ Analysis*); Benkeser & van der Laan (2016, *IEEE DSAA*).
306
+
307
+ ## API Reference
308
+
309
+ ### `hapc.single.single_pcghal()`
310
+
311
+ Fit PC-GHAL with a single lambda value.
312
+
313
+ **Parameters:**
314
+ - `X` (ndarray, shape (n, p)): Input features
315
+ - `Y` (ndarray, shape (n,)): Response variable
316
+ - `maxdeg` (int): Maximum degree of interactions
317
+ - `npc` (int): Number of principal components
318
+ - `single_lambda` (float): Regularization parameter
319
+ - `max_iter` (int, default=100): Maximum iterations
320
+ - `tol` (float, default=1e-6): Convergence tolerance
321
+ - `verbose` (bool, default=False): Print progress
322
+ - `predict` (ndarray, optional): Test data for predictions
323
+ - `center` (bool, default=True): Center the design matrix
324
+
325
+ **Returns:**
326
+ - `result.optimizer_output.alpha`: Coefficients
327
+ - `result.optimizer_output.risk`: Final risk
328
+ - `result.optimizer_output.iter`: Iterations until convergence
329
+ - `result.predictions`: Predictions on test data (if provided)
330
+
331
+ ### `hapc.cv.pcghal_cv()`
332
+
333
+ Cross-validation to select lambda.
334
+
335
+ **Parameters:**
336
+ - `lambdas` (ndarray): Grid of lambda values to test
337
+ - `nfolds` (int, default=5): Number of CV folds
338
+ - ...other parameters same as `single_pcghal`
339
+
340
+ **Returns:**
341
+ - `cv_result.best_lambda`: Optimal lambda
342
+ - `cv_result.mses`: CV errors for each lambda
343
+ - `cv_result.best_model`: Fitted model with best lambda
344
+ - `cv_result.predictions`: Predictions on test data (if provided)
345
+
346
+ ## Contributing
347
+
348
+ Contributions welcome! The C++ core is shared between R and Python packages.
349
+
350
+ ```bash
351
+ git clone https://github.com/meixide/hapc.git
352
+ cd hapc
353
+ pip install -e .
354
+ pytest
355
+ ```
356
+
357
+ ## License
358
+
359
+ MIT License - see LICENSE file
hapc-2.5.0/README.md ADDED
@@ -0,0 +1,321 @@
1
+ # HAPC: Highly Adaptive Principal Components
2
+
3
+ A fast and flexible machine learning library for nonparametric high-dimensional regression and classification with guarantees.
4
+
5
+ ## Documentation
6
+
7
+ - **Python API** (rendered from docstrings): https://hapc.readthedocs.io —
8
+ configured via [`.readthedocs.yaml`](.readthedocs.yaml) and
9
+ [`docs/`](docs/) (Sphinx + autodoc). Build locally with
10
+ `pip install -e ".[docs]" && sphinx-build -b html docs docs/_build/html`.
11
+ - **R API** (rendered from roxygen): a [pkgdown](https://pkgdown.r-lib.org)
12
+ site built by [`.github/workflows/pkgdown.yaml`](.github/workflows/pkgdown.yaml)
13
+ (config in [`_pkgdown.yml`](_pkgdown.yml)). Build locally with
14
+ `Rscript -e 'pkgdown::build_site()'`.
15
+
16
+ ## Installation
17
+
18
+ ### Prerequisites
19
+
20
+ - Python 3.8+
21
+ - C++ compiler (g++, clang, or MSVC)
22
+ - CMake 3.18+
23
+ - Eigen3
24
+
25
+ ### Quick Install
26
+
27
+ ```bash
28
+ pip install hapc
29
+ ```
30
+
31
+ Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
32
+ (Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
33
+ CMake or Eigen is needed when a wheel is available.
34
+
35
+ ### Linux / HPC clusters
36
+
37
+ The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
38
+ `pip install hapc` works out of the box on HPC login/compute nodes —
39
+ no `conda` toolchain, `devtoolset`, or sysroot setup required:
40
+
41
+ ```bash
42
+ pip install hapc
43
+ ```
44
+
45
+ If you must build from the source distribution (niche architecture, very
46
+ old Python, or an air-gapped node), provide a C++17 compiler and either
47
+ let CMake fetch Eigen automatically (needs network) or install Eigen and
48
+ let `find_package(Eigen3)` find it:
49
+
50
+ ```bash
51
+ # with conda compilers (recommended on HPC)
52
+ conda install -c conda-forge cxx-compiler cmake eigen
53
+ pip install hapc --no-binary hapc
54
+ ```
55
+
56
+ ### Install from GitHub (latest development version)
57
+
58
+ ```bash
59
+ pip install git+https://github.com/meixide/hapc.git
60
+ ```
61
+
62
+ Or with editable install for development:
63
+
64
+ ```bash
65
+ git clone https://github.com/meixide/hapc.git
66
+ cd hapc
67
+ pip install -e .
68
+ ```
69
+
70
+ ### Install build dependencies
71
+
72
+ If installation fails, you may need to install build dependencies:
73
+
74
+ **macOS:**
75
+ ```bash
76
+ brew install cmake eigen
77
+ ```
78
+
79
+ **Ubuntu/Debian:**
80
+ ```bash
81
+ sudo apt-get install cmake libeigen3-dev build-essential
82
+ ```
83
+
84
+ **Windows:**
85
+ ```bash
86
+ pip install cmake
87
+ # Install Visual Studio Build Tools or use conda
88
+ conda install -c conda-forge eigen
89
+ ```
90
+
91
+ ## Quick Start
92
+
93
+ ```python
94
+ import numpy as np
95
+ from hapc.single import single_pcghal
96
+ from hapc.cv import pcghal_cv
97
+
98
+ # Generate sample data
99
+ X = np.random.randn(100, 5)
100
+ Y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(100) * 0.1
101
+
102
+ # Single fit with fixed lambda
103
+ result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01)
104
+ print(f"Risk: {result.optimizer_output.risk:.6f}")
105
+
106
+ # Cross-validation to select lambda
107
+ lambdas = np.logspace(-4, 0, 10)
108
+ cv_result = pcghal_cv(X, Y, maxdeg=2, npc=5, lambdas=lambdas, nfolds=5)
109
+ print(f"Best lambda: {cv_result.best_lambda:.6f}")
110
+
111
+ # Make predictions
112
+ X_test = np.random.randn(20, 5)
113
+ result = single_pcghal(X, Y, maxdeg=2, npc=5, single_lambda=0.01, predict=X_test)
114
+ print(f"Predictions: {result.predictions}")
115
+ ```
116
+
117
+ ## Usage
118
+
119
+ ### Regression
120
+
121
+ ```python
122
+ from hapc.single import single_pcghal
123
+
124
+ result = single_pcghal(
125
+ X, Y,
126
+ maxdeg=2, # Maximum degree of interactions
127
+ npc=10, # Number of principal components
128
+ single_lambda=0.01,
129
+ predict=X_test # Optional: test data for predictions
130
+ )
131
+ ```
132
+
133
+ ### Classification
134
+
135
+ ```python
136
+ from hapc.single import single_pcghal
137
+
138
+ result = single_pcghal(
139
+ X, Y_binary,
140
+ maxdeg=2,
141
+ npc=10,
142
+ single_lambda=0.01,
143
+ predict=X_test
144
+ )
145
+ ```
146
+
147
+ ### Cross-Validation
148
+
149
+ ```python
150
+ from hapc.cv import pcghal_cv
151
+
152
+ cv_result = pcghal_cv(
153
+ X, Y,
154
+ maxdeg=2,
155
+ npc=10,
156
+ lambdas=np.logspace(-4, 0, 20),
157
+ nfolds=5
158
+ )
159
+ print(cv_result.best_lambda)
160
+ ```
161
+
162
+ ### Average Treatment Effect (ATE)
163
+
164
+ Estimate the ATE `E[Y(1)] − E[Y(0)]` with HAPC nuisance models and a
165
+ doubly-robust (AIPW) efficient influence function. `ate_hapc` returns a point
166
+ estimate and a `(1 − alpha)` Wald confidence interval.
167
+
168
+ ```python
169
+ from hapc import ate_hapc
170
+
171
+ # W: covariates (n, p); A: binary treatment in {0,1} or {-1,+1}; Y: outcome
172
+ res = ate_hapc(W, Y, A, alpha=0.05, method="undersmooth")
173
+ print(res.estimate, res.lower, res.upper)
174
+ ```
175
+
176
+ Two bias-control strategies are available through `method`:
177
+
178
+ - **`method="undersmooth"`** (default) — single-sample estimator. The outcome
179
+ model is undersmoothed (λ pushed below the CV-optimal value) until the
180
+ absolute empirical mean of the influence function is within `σ / (√n · log n)`. This requires the
181
+ **full PC basis** (`npcs = n`, the default) and a λ grid that reaches small λ
182
+ (default `log_lambda_out_min = -10`); otherwise the gate never reaches the
183
+ low-bias regime and `ate_hapc` emits a warning. Pass
184
+ `report_undersmoothing=True` to print the `|mean(EIF)|`-vs-λ path.
185
+ - **`method="crossfit"`** — DML-style K-fold cross-fitting (`cf_folds`, default
186
+ 5, stratified by treatment). Both nuisances are fit on the training folds and
187
+ the influence function is evaluated out-of-fold, giving honest point estimates
188
+ and coverage without undersmoothing. Recommended under good overlap.
189
+
190
+ ### Discrete-time survival (`family = "logit-hazard"`)
191
+
192
+ Fit a discrete-time **logistic hazard** model with HAPC. You supply only the
193
+ observed right-censored data — baseline covariates `X`, the observed time
194
+ `T = min(T_event, C)`, and the event indicator `Delta = 1(T_event <= C)` — and
195
+ the wrapper performs the person-period expansion (one row per
196
+ subject-per-interval-at-risk, hazard label = 1 at the event interval), prepends
197
+ the visit time as the first HAL covariate, and cross-validates the binomial fit.
198
+
199
+ **Model.** The discrete hazard is the conditional event probability in interval
200
+ `t` given survival up to `t`, modelled on the logit scale by a HAPC fit `f` of
201
+ the augmented covariate `(t, x)`:
202
+
203
+ ```text
204
+ lambda(t | x) = P(T_event = t | T_event >= t, X = x)
205
+ logit lambda(t | x) = f(t, x)
206
+ ```
207
+
208
+ **Person-period likelihood.** Under independent right-censoring the observed-data
209
+ likelihood factorises over the at-risk intervals,
210
+
211
+ ```text
212
+ prod_i prod_{t <= T_i} lambda(t|x_i)^Y_it * (1 - lambda(t|x_i))^(1 - Y_it),
213
+ with Y_it = 1(T_event_i = t),
214
+ ```
215
+
216
+ which is exactly the Bernoulli (logistic) likelihood of the expanded
217
+ person-period table — so a binomial HAPC fit of `Y_it` on `(t, x_i)` estimates
218
+ the discrete hazard (Cox 1972; Brown 1975; Allison 1982).
219
+
220
+ **Survival.** The conditional survival function follows by the product-limit
221
+ relation `S(t | x) = prod_{s <= t} (1 - lambda(s | x))`, returned for new
222
+ subjects when `predict=` is supplied.
223
+
224
+ ```python
225
+ from hapc import hazard_hapc
226
+ import numpy as np
227
+
228
+ # X: baseline covariates (n, p); T: observed times; Delta: 0/1 event indicator
229
+ fit = hazard_hapc(X, T, Delta, norm="1", max_degree=2, time_grid=np.arange(1, 7))
230
+ fit.hazard # estimated hazard per person-period row (CV predictions)
231
+ fit.best_lambda, fit.interior # CV-selected lambda; is it interior to the grid?
232
+
233
+ # survival curves S(t|x) for new subjects
234
+ fit = hazard_hapc(X, T, Delta, norm="1", predict=X_new)
235
+ fit.predict_survival # (m, K) survival probabilities over the grid
236
+ ```
237
+
238
+ ```r
239
+ library(hapc)
240
+ # equivalent to cv.hapc(X, T, family = "logit-hazard", Delta = Delta, norm = "1")
241
+ fit <- hazard.hapc(X, T, Delta, norm = "1", max_degree = 2, time_grid = 1:6)
242
+ fit$hazard; fit$best_lambda; fit$interior
243
+ ```
244
+
245
+ `norm` must be `"1"` (logistic LASSO) or `"2"` (logistic ridge); `norm = "sv"`
246
+ is **not implemented** for this family and is flagged.
247
+
248
+ **Returns** (Python `HazardResult` / R `hapc_hazard`):
249
+
250
+ - `hazard` — cross-validated discrete hazard for each person-period row
251
+ - `lambdas`, `risk`, `best_lambda` — CV grid, mean logistic deviance, selected λ
252
+ - `interior` — whether `best_lambda` is strictly inside the grid (sanity check)
253
+ - `time_grid`, `ids`/`id`, `Y` — the discrete grid and person-period bookkeeping
254
+ - `predict_hazard`, `predict_survival` — hazard surface and survival curves for
255
+ new subjects (only when `predict=` is given)
256
+ - `cv` — the underlying cross-validation result
257
+
258
+ Worked end-to-end examples (five hazard data-generating processes, with
259
+ true-vs-estimated hazard scatters and CV risk-vs-λ curves verifying an interior
260
+ optimum) are in
261
+ [`examples/hazard_logit_hazard_examples.R`](examples/hazard_logit_hazard_examples.R)
262
+ and
263
+ [`examples/hazard_logit_hazard_examples.py`](examples/hazard_logit_hazard_examples.py).
264
+
265
+ **References.** Cox (1972, *JRSS B*); Brown (1975, *Biometrics*); Allison (1982,
266
+ *Sociological Methodology*); Singer & Willett (2003, *Applied Longitudinal Data
267
+ Analysis*); Benkeser & van der Laan (2016, *IEEE DSAA*).
268
+
269
+ ## API Reference
270
+
271
+ ### `hapc.single.single_pcghal()`
272
+
273
+ Fit PC-GHAL with a single lambda value.
274
+
275
+ **Parameters:**
276
+ - `X` (ndarray, shape (n, p)): Input features
277
+ - `Y` (ndarray, shape (n,)): Response variable
278
+ - `maxdeg` (int): Maximum degree of interactions
279
+ - `npc` (int): Number of principal components
280
+ - `single_lambda` (float): Regularization parameter
281
+ - `max_iter` (int, default=100): Maximum iterations
282
+ - `tol` (float, default=1e-6): Convergence tolerance
283
+ - `verbose` (bool, default=False): Print progress
284
+ - `predict` (ndarray, optional): Test data for predictions
285
+ - `center` (bool, default=True): Center the design matrix
286
+
287
+ **Returns:**
288
+ - `result.optimizer_output.alpha`: Coefficients
289
+ - `result.optimizer_output.risk`: Final risk
290
+ - `result.optimizer_output.iter`: Iterations until convergence
291
+ - `result.predictions`: Predictions on test data (if provided)
292
+
293
+ ### `hapc.cv.pcghal_cv()`
294
+
295
+ Cross-validation to select lambda.
296
+
297
+ **Parameters:**
298
+ - `lambdas` (ndarray): Grid of lambda values to test
299
+ - `nfolds` (int, default=5): Number of CV folds
300
+ - ...other parameters same as `single_pcghal`
301
+
302
+ **Returns:**
303
+ - `cv_result.best_lambda`: Optimal lambda
304
+ - `cv_result.mses`: CV errors for each lambda
305
+ - `cv_result.best_model`: Fitted model with best lambda
306
+ - `cv_result.predictions`: Predictions on test data (if provided)
307
+
308
+ ## Contributing
309
+
310
+ Contributions welcome! The C++ core is shared between R and Python packages.
311
+
312
+ ```bash
313
+ git clone https://github.com/meixide/hapc.git
314
+ cd hapc
315
+ pip install -e .
316
+ pytest
317
+ ```
318
+
319
+ ## License
320
+
321
+ MIT License - see LICENSE file