pyrollmatch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+ name: Build & Publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Install build tools
20
+ run: pip install build twine
21
+
22
+ - name: Build wheel and sdist
23
+ run: python -m build
24
+
25
+ - name: Check distributions
26
+ run: twine check dist/*
27
+
28
+ - name: Upload artifacts
29
+ uses: actions/upload-artifact@v4
30
+ with:
31
+ name: dist
32
+ path: dist/
33
+
34
+ # pyrollmatch is pure Python — one wheel works on all platforms.
35
+ verify:
36
+ needs: build
37
+ strategy:
38
+ matrix:
39
+ os: [ubuntu-latest, macos-latest, windows-latest]
40
+ python-version: ["3.10", "3.12"]
41
+ runs-on: ${{ matrix.os }}
42
+ steps:
43
+ - uses: actions/setup-python@v5
44
+ with:
45
+ python-version: ${{ matrix.python-version }}
46
+
47
+ - name: Download artifacts
48
+ uses: actions/download-artifact@v4
49
+ with:
50
+ name: dist
51
+ path: dist/
52
+
53
+ - name: Install wheel
54
+ shell: bash
55
+ run: pip install dist/*.whl
56
+
57
+ - name: Smoke test
58
+ shell: bash
59
+ run: |
60
+ python -c "
61
+ import pyrollmatch
62
+ print(f'pyrollmatch {pyrollmatch.__version__} loaded successfully')
63
+ from pyrollmatch.score import SUPPORTED_MODELS
64
+ print(f'Models: {SUPPORTED_MODELS}')
65
+ "
66
+
67
+ upload-release:
68
+ needs: [build, verify]
69
+ runs-on: ubuntu-latest
70
+ if: github.event_name == 'release'
71
+ permissions:
72
+ contents: write
73
+ steps:
74
+ - name: Download artifacts
75
+ uses: actions/download-artifact@v4
76
+ with:
77
+ name: dist
78
+ path: dist/
79
+
80
+ - name: Upload to GitHub Release
81
+ env:
82
+ GH_TOKEN: ${{ github.token }}
83
+ run: gh release upload "${{ github.event.release.tag_name }}" dist/* --clobber --repo "${{ github.repository }}"
84
+
85
+ publish-pypi:
86
+ needs: [build, verify]
87
+ runs-on: ubuntu-latest
88
+ if: github.event_name == 'release'
89
+ permissions:
90
+ id-token: write
91
+ environment:
92
+ name: pypi
93
+ url: https://pypi.org/p/pyrollmatch
94
+ steps:
95
+ - name: Download artifacts
96
+ uses: actions/download-artifact@v4
97
+ with:
98
+ name: dist
99
+ path: dist/
100
+
101
+ - name: Publish to PyPI
102
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,50 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Run tests
30
+ run: |
31
+ python -m pytest tests/test_smoke.py tests/test_robust.py -v
32
+
33
+ test-stress:
34
+ runs-on: ubuntu-latest
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+
38
+ - name: Set up Python
39
+ uses: actions/setup-python@v5
40
+ with:
41
+ python-version: "3.12"
42
+
43
+ - name: Install dependencies
44
+ run: |
45
+ python -m pip install --upgrade pip
46
+ pip install -e ".[dev]"
47
+
48
+ - name: Run stress tests
49
+ run: |
50
+ python -m pytest tests/test_stress.py -v
@@ -0,0 +1,7 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .pytest_cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Alan Huang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,278 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyrollmatch
3
+ Version: 0.1.0
4
+ Summary: Fast rolling entry matching for staggered adoption studies using polars + numpy
5
+ Author: Alan Huang
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: causal-inference,difference-in-differences,matching,propensity-score,rolling-entry,staggered-adoption,treatment-effects
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: numpy>=1.24
19
+ Requires-Dist: polars>=1.0
20
+ Requires-Dist: scikit-learn>=1.3
21
+ Requires-Dist: scipy>=1.10
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest-xdist; extra == 'dev'
24
+ Requires-Dist: pytest>=7.0; extra == 'dev'
25
+ Provides-Extra: r-comparison
26
+ Requires-Dist: rpy2>=3.5; extra == 'r-comparison'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # pyrollmatch
30
+
31
+ > **Alpha** (v0.1.x) — This package is in early development. Results have been validated against R rollmatch on synthetic and real data, but edge cases may remain. APIs are not stable and may change without notice. **Please verify critical results independently.**
32
+
33
+ Fast rolling entry matching for staggered adoption studies in Python.
34
+
35
+ High-performance reimplementation of the R [rollmatch](https://github.com/RTIInternational/rollmatch) package using **polars** + **numpy**, with 8 propensity score models and comprehensive post-matching diagnostics.
36
+
37
+ | | R rollmatch | pyrollmatch |
38
+ |---|---|---|
39
+ | Max scale | ~5K treated (crashes) | **90K+ treated** |
40
+ | 10K × 30K | OOM (5.8B row join) | **1.7 seconds** |
41
+ | Scoring models | Logistic only | **8 models** |
42
+ | Diagnostics | SMD only | SMD + t-test + VR + KS + TOST |
43
+ | Data library | dplyr (deprecated APIs) | polars |
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install pyrollmatch
49
+ ```
50
+
51
+ **From source:**
52
+ ```bash
53
+ git clone https://github.com/AlanHuang99/pyrollmatch.git
54
+ cd pyrollmatch
55
+ pip install -e ".[dev]"
56
+ ```
57
+
58
+ ## Quick Start
59
+
60
+ ```python
61
+ import polars as pl
62
+ from pyrollmatch import rollmatch, alpha_sweep
63
+
64
+ # Panel data: unit × time with treatment indicator
65
+ data = pl.read_parquet("panel_data.parquet")
66
+
67
+ # Rolling entry matching
68
+ result = rollmatch(
69
+ data,
70
+ treat="treat", # binary treatment group indicator (1=treated, 0=control)
71
+ tm="time_period", # time period (integer)
72
+ entry="entry_time", # treatment onset period for treated units;
73
+ # for controls, set to any value > max(time_period)
74
+ id="unit_id", # unit identifier
75
+ covariates=["x1", "x2"], # matching covariates
76
+ lookback=1, # periods to look back for baseline
77
+ alpha=0.1, # caliper = alpha × pooled_SD
78
+ num_matches=3, # controls per treated
79
+ model_type="logistic", # propensity score model
80
+ )
81
+
82
+ # Results
83
+ result.balance # SMD table (polars DataFrame)
84
+ result.weights # unit_id → weight (polars DataFrame)
85
+ result.matched_data # treat_id, control_id, difference
86
+ ```
87
+
88
+ ### Alpha Sweep
89
+
90
+ Find the optimal caliper automatically:
91
+
92
+ ```python
93
+ summary, best = alpha_sweep(
94
+ data, treat="treat", tm="time", entry="entry_time", id="unit_id",
95
+ covariates=["x1", "x2", "x3"],
96
+ alphas=[0.01, 0.02, 0.05, 0.1, 0.15, 0.2],
97
+ )
98
+ # summary: alpha, match_rate, max|SMD|, all_pass
99
+ # best: RollmatchResult from the best alpha
100
+ ```
101
+
102
+ ### Post-Matching Diagnostics
103
+
104
+ ```python
105
+ from pyrollmatch import balance_test, equivalence_test
106
+ from pyrollmatch import reduce_data, score_data
107
+
108
+ reduced = reduce_data(data, "treat", "time", "entry_time", "unit_id")
109
+ scored = score_data(reduced, ["x1", "x2", "x3"], "treat")
110
+
111
+ # SMD + t-test + variance ratio + KS test
112
+ diag = balance_test(scored, result.matched_data,
113
+ "treat", "unit_id", "time", ["x1", "x2", "x3"])
114
+
115
+ # TOST equivalence test (Hartman & Hidalgo 2018)
116
+ equiv = equivalence_test(scored, result.matched_data,
117
+ "treat", "unit_id", "time", ["x1", "x2", "x3"])
118
+ ```
119
+
120
+ ## Propensity Score Models
121
+
122
+ 8 scoring methods, all accessible via the `model_type` parameter:
123
+
124
+ | Model | `model_type` | Description | Best for |
125
+ |---|---|---|---|
126
+ | **Logistic** | `"logistic"` | Standard logistic regression (default) | Most cases |
127
+ | **Probit** | `"probit"` | Probit model (Φ⁻¹ transform) | Robustness check |
128
+ | **GBM** | `"gbm"` | Gradient boosting (HistGradientBoosting) | Non-linear relationships |
129
+ | **Random Forest** | `"rf"` | Random forest classifier | High-dimensional, interactions |
130
+ | **Lasso** | `"lasso"` | L1-regularized logistic | Variable selection |
131
+ | **Ridge** | `"ridge"` | L2-regularized logistic | Multicollinearity |
132
+ | **ElasticNet** | `"elasticnet"` | L1+L2 regularized logistic | Combined regularization |
133
+ | **Mahalanobis** | `"mahalanobis"` | No propensity model — covariate distance | Direct covariate matching |
134
+
135
+ ```python
136
+ # Use gradient boosting for propensity scores
137
+ result = rollmatch(data, ..., model_type="gbm")
138
+
139
+ # Use Mahalanobis distance (no propensity model)
140
+ result = rollmatch(data, ..., model_type="mahalanobis")
141
+ ```
142
+
143
+ ## How It Works
144
+
145
+ ### Rolling Entry Matching
146
+
147
+ For staggered adoption, each treated unit enters at a different time. Rolling entry matching:
148
+
149
+ 1. **`reduce_data()`** — For each treated unit at entry time *t*, select covariates at *t − lookback*. Controls get one observation per treatment entry period.
150
+
151
+ 2. **`score_data()`** — Fit propensity model, compute logit-transformed scores.
152
+
153
+ 3. **`match_all_periods()`** — For each time period, match treated to closest controls within caliper. Uses **block-vectorized numpy broadcasting** (not full cross-join).
154
+
155
+ 4. **`compute_balance()`** — Compare covariate means/SDs before and after matching.
156
+
157
+ ### Key Optimization
158
+
159
+ R's bottleneck: `inner_join(treated, controls, by=time)` creates N_treated × N_controls rows.
160
+
161
+ pyrollmatch instead processes treated in **blocks** via numpy broadcasting:
162
+ ```
163
+ Block of 2000 treated × 64K controls = 128M distances = ~1 GB
164
+ ```
165
+ Never materializes the full cross-product as a DataFrame.
166
+
167
+ ## Data Format
168
+
169
+ Input must be a `polars.DataFrame` in **long panel format** (one row per unit per time period):
170
+
171
+ | Column | Type | Description |
172
+ |---|---|---|
173
+ | `id` | int or str | Unit identifier (e.g., individual, firm, repository) |
174
+ | `tm` | int | Time period, must be integer and monotonically increasing (e.g., 1, 2, ..., 20) |
175
+ | `treat` | int (0 or 1) | **Time-invariant treatment group indicator.** `1` = unit that eventually receives treatment, `0` = unit that never receives treatment. This is NOT a time-varying treatment status — it labels the *group*, not whether treatment is currently active. |
176
+ | `entry` | int | **Treatment onset period** for treated units (the time period when treatment begins). For control units, set this to any value **strictly greater** than the maximum time period in your data. For example, if your panel spans periods 1–20, use `entry=99` or `entry=999` for controls. The specific value does not matter as long as it exceeds `max(tm)`. |
177
+ | covariates | float | Matching variables (e.g., pre-computed rolling means of activity measures) |
178
+
179
+ **Example:**
180
+ ```
181
+ unit_id | time | treat | entry_time | x1 | x2
182
+ --------|------|-------|------------|------|-----
183
+ 1 | 1 | 1 | 5 | 2.3 | 1.1 <- treated, enters at period 5
184
+ 1 | 2 | 1 | 5 | 2.5 | 1.0
185
+ ...
186
+ 1 | 5 | 1 | 5 | 4.1 | 1.8 <- treatment starts here
187
+ ...
188
+ 101 | 1 | 0 | 99 | 1.8 | 0.9 <- control, entry=99 (sentinel)
189
+ 101 | 2 | 0 | 99 | 1.9 | 1.0
190
+ ```
191
+
192
+ ## API Reference
193
+
194
+ ### Core Functions
195
+
196
+ | Function | Description |
197
+ |---|---|
198
+ | `rollmatch()` | Full rolling entry matching pipeline |
199
+ | `alpha_sweep()` | Try multiple calipers, select best |
200
+ | `reduce_data()` | Construct quasi-panel for matching |
201
+ | `score_data()` | Compute propensity scores (8 models) |
202
+
203
+ ### Diagnostics
204
+
205
+ | Function | Description |
206
+ |---|---|
207
+ | `balance_test()` | SMD + t-test + variance ratio + KS test |
208
+ | `equivalence_test()` | TOST equivalence test (Hartman & Hidalgo 2018) |
209
+ | `compute_balance()` | Covariate balance table |
210
+ | `smd_table()` | Print formatted SMD table |
211
+
212
+ ### RollmatchResult
213
+
214
+ | Attribute | Type | Description |
215
+ |---|---|---|
216
+ | `matched_data` | `pl.DataFrame` | `treat_id`, `control_id`, `difference` |
217
+ | `balance` | `pl.DataFrame` | SMD for each covariate |
218
+ | `weights` | `pl.DataFrame` | `id`, `weight` |
219
+ | `n_treated_matched` | `int` | Number matched |
220
+ | `n_treated_total` | `int` | Total treated |
221
+ | `alpha` | `float` | Caliper used |
222
+
223
+ ## Validation Against R rollmatch
224
+
225
+ Tested on identical synthetic data across 15 configurations:
226
+
227
+ | Metric | Result |
228
+ |---|---|
229
+ | Propensity score correlation | **≥ 0.9999** across all configs |
230
+ | Match count agreement | Within ±3% |
231
+ | Balance quality (max\|SMD\|) | Both achieve < 0.05 |
232
+
233
+ Pair-level overlap varies by caliper tightness (3%–71%) due to different GLM solvers (sklearn vs R glm). This is expected — **balance quality is what matters, not pair identity**.
234
+
235
+ Validated on real data: [GithubRepo project](https://github.com/AlanHuang99/) (8K treated × 7.5K controls, 10 covariates, all diagnostics pass).
236
+
237
+ ## Performance
238
+
239
+ | Scale | Runtime | Match rate |
240
+ |---|---:|---:|
241
+ | 500 × 2,000 | 0.03s | 100% |
242
+ | 1,000 × 3,000 | 0.05s | 100% |
243
+ | 5,000 × 15,000 | 0.10s | 100% |
244
+ | 10,000 × 30,000 | 1.7s | 99.99% |
245
+
246
+ R rollmatch crashes at 10K treated: its cross-join materializes more rows than R's ~2.1 billion (2³¹ − 1) vector-length limit allows.
247
+
248
+ ## Design Principles
249
+
250
+ - **Polars-first**: Native polars DataFrames throughout. No pandas dependency.
251
+ - **Reproducible**: Deterministic with fixed `random_state`. Same input → same output.
252
+ - **Scalable**: Block-vectorized matching, O(block × N_controls) memory per iteration.
253
+ - **Validated**: 44 tests, R-validated across 15 configurations, real-data tested.
254
+ - **Modular**: Each step (reduce → score → match → balance) is independently usable.
255
+
256
+ ## References
257
+
258
+ - Rickles, J. H., & Seltzer, M. (2014). "A Two-Stage Matching Strategy for Estimating the Effects of Staggered Adoption of Interventions."
259
+ - RTI International. `rollmatch` R package. https://github.com/RTIInternational/rollmatch
260
+ - Hartman, E., & Hidalgo, F. D. (2018). "An Equivalence Approach to Balance and Placebo Tests." *American Journal of Political Science*.
261
+
262
+ ## Status & Contributing
263
+
264
+ This package is under **active development**. We welcome:
265
+
266
+ - **Bug reports** — if you encounter unexpected behavior, please open an issue with a minimal reproducible example
267
+ - **Feature requests** — suggestions for new matching methods, diagnostics, or API improvements
268
+ - **Contributions** — pull requests are welcome; please open an issue first to discuss
269
+
270
+ Please be cautious when using this package for published research. While we have validated against the R rollmatch package across multiple configurations (see [Validation](#validation-against-r-rollmatch)), this is alpha software. We strongly recommend:
271
+
272
+ 1. **Cross-checking** results against an established implementation (R rollmatch, R MatchIt) for critical analyses
273
+ 2. **Inspecting balance diagnostics** carefully before proceeding to estimation
274
+ 3. **Reporting any discrepancies** you find via [GitHub Issues](https://github.com/AlanHuang99/pyrollmatch/issues)
275
+
276
+ ## License
277
+
278
+ MIT