polca 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polca-0.4.0/.clang-format +11 -0
- polca-0.4.0/.github/workflows/ci.yml +99 -0
- polca-0.4.0/.github/workflows/wheels.yml +59 -0
- polca-0.4.0/.gitignore +40 -0
- polca-0.4.0/.pre-commit-config.yaml +21 -0
- polca-0.4.0/CMakeLists.txt +31 -0
- polca-0.4.0/PKG-INFO +166 -0
- polca-0.4.0/README.md +136 -0
- polca-0.4.0/examples/basic_fit.py +62 -0
- polca-0.4.0/examples/debug_ylik.py +16 -0
- polca-0.4.0/examples/quick_analyses.py +41 -0
- polca-0.4.0/examples/tryout.cpp +111 -0
- polca-0.4.0/pypolca/__init__.py +19 -0
- polca-0.4.0/pypolca/api.py +465 -0
- polca-0.4.0/pypolca/data/__init__.py +3 -0
- polca-0.4.0/pypolca/data/_dataset.py +195 -0
- polca-0.4.0/pypolca/data/carcinoma.csv +119 -0
- polca-0.4.0/pypolca/data/cheating.csv +320 -0
- polca-0.4.0/pypolca/data/election.csv +1786 -0
- polca-0.4.0/pypolca/data/gss82.csv +1203 -0
- polca-0.4.0/pypolca/data/values.csv +217 -0
- polca-0.4.0/pypolca/utils.py +60 -0
- polca-0.4.0/pyproject.toml +90 -0
- polca-0.4.0/scripts/benchmark.py +167 -0
- polca-0.4.0/scripts/benchmark_scaling.py +141 -0
- polca-0.4.0/scripts/parity_covariates_py.py +84 -0
- polca-0.4.0/scripts/parity_covariates_r.R +57 -0
- polca-0.4.0/scripts/parity_deep_py.py +159 -0
- polca-0.4.0/scripts/parity_final.py +162 -0
- polca-0.4.0/scripts/parity_se_deep_r.R +65 -0
- polca-0.4.0/scripts/parity_se_py.py +85 -0
- polca-0.4.0/scripts/parity_se_r.R +54 -0
- polca-0.4.0/scripts/parity_vce_r.R +86 -0
- polca-0.4.0/scripts/quick_analysis.py +26 -0
- polca-0.4.0/src/cpp/CMakeLists.txt +29 -0
- polca-0.4.0/src/cpp/bindings.cpp +96 -0
- polca-0.4.0/src/cpp/core/em_engine.cpp +145 -0
- polca-0.4.0/src/cpp/core/math_ops.cpp +376 -0
- polca-0.4.0/src/cpp/include/pypolca/em_engine.h +23 -0
- polca-0.4.0/src/cpp/include/pypolca/math_ops.h +77 -0
- polca-0.4.0/src/cpp/include/pypolca/types.h +63 -0
- polca-0.4.0/tests/python/test_basic.py +58 -0
- polca-0.4.0/tests/python/test_math_ops.py +243 -0
- polca-0.4.0/tests/python/test_r_comparison.py +283 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
---
|
|
2
|
+
BasedOnStyle: Google
|
|
3
|
+
IndentWidth: 4
|
|
4
|
+
TabWidth: 4
|
|
5
|
+
ColumnLimit: 100
|
|
6
|
+
AccessModifierOffset: -4
|
|
7
|
+
AllowShortFunctionsOnASingleLine: None
|
|
8
|
+
AllowShortIfStatementsOnASingleLine: false
|
|
9
|
+
AllowShortLoopsOnASingleLine: false
|
|
10
|
+
BreakBeforeBraces: Attach
|
|
11
|
+
PointerAlignment: Left
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
# ── Python lint & format (fast, no build) ────────────────────────
|
|
15
|
+
lint-python:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: astral-sh/setup-uv@v5
|
|
20
|
+
- run: uvx ruff check .
|
|
21
|
+
- run: uvx ruff format --check .
|
|
22
|
+
|
|
23
|
+
# ── Python type check (needs C++ build) ──────────────────────────
|
|
24
|
+
typecheck:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
- uses: astral-sh/setup-uv@v5
|
|
29
|
+
with:
|
|
30
|
+
python-version: "3.12"
|
|
31
|
+
- run: sudo apt-get install -y ninja-build
|
|
32
|
+
- run: uv pip install -e .
|
|
33
|
+
- run: uvx mypy pypolca/
|
|
34
|
+
|
|
35
|
+
# ── C++ format check ─────────────────────────────────────────────
|
|
36
|
+
lint-cpp:
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v4
|
|
40
|
+
- run: |
|
|
41
|
+
find src -type f \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) \
|
|
42
|
+
| xargs clang-format-18 --dry-run --Werror
|
|
43
|
+
|
|
44
|
+
# ── Build & test matrix ──────────────────────────────────────────
|
|
45
|
+
test:
|
|
46
|
+
needs: [lint-python, lint-cpp, typecheck]
|
|
47
|
+
strategy:
|
|
48
|
+
fail-fast: false
|
|
49
|
+
matrix:
|
|
50
|
+
os: [ubuntu-latest, macos-latest]
|
|
51
|
+
runs-on: ${{ matrix.os }}
|
|
52
|
+
steps:
|
|
53
|
+
- uses: actions/checkout@v4
|
|
54
|
+
- uses: astral-sh/setup-uv@v5
|
|
55
|
+
with:
|
|
56
|
+
python-version: "3.12"
|
|
57
|
+
|
|
58
|
+
- name: Install system deps (Ubuntu)
|
|
59
|
+
if: runner.os == 'Linux'
|
|
60
|
+
run: sudo apt-get install -y ninja-build
|
|
61
|
+
|
|
62
|
+
- name: Install system deps (macOS)
|
|
63
|
+
if: runner.os == 'macOS'
|
|
64
|
+
run: brew install ninja
|
|
65
|
+
|
|
66
|
+
- name: Build package
|
|
67
|
+
run: uv pip install -e .
|
|
68
|
+
|
|
69
|
+
- name: Install test deps
|
|
70
|
+
run: uv pip install pytest pytest-cov scipy
|
|
71
|
+
|
|
72
|
+
- name: Run tests with coverage
|
|
73
|
+
run: uv run pytest --cov=pypolca --cov-report=term-missing tests/
|
|
74
|
+
|
|
75
|
+
# ── Build wheels ─────────────────────────────────────────────────
|
|
76
|
+
build-wheels:
|
|
77
|
+
needs: test
|
|
78
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
79
|
+
strategy:
|
|
80
|
+
fail-fast: false
|
|
81
|
+
matrix:
|
|
82
|
+
os: [ubuntu-latest, macos-latest]
|
|
83
|
+
runs-on: ${{ matrix.os }}
|
|
84
|
+
steps:
|
|
85
|
+
- uses: actions/checkout@v4
|
|
86
|
+
|
|
87
|
+
- name: Build wheels
|
|
88
|
+
uses: pypa/cibuildwheel@v2.22
|
|
89
|
+
env:
|
|
90
|
+
CIBW_BUILD: "cp312-*"
|
|
91
|
+
CIBW_SKIP: "*_i686 *-musllinux*"
|
|
92
|
+
CIBW_ARCHS_MACOS: "arm64 x86_64"
|
|
93
|
+
CIBW_TEST_REQUIRES: "pytest"
|
|
94
|
+
CIBW_TEST_COMMAND: "pytest {package}/tests"
|
|
95
|
+
|
|
96
|
+
- uses: actions/upload-artifact@v4
|
|
97
|
+
with:
|
|
98
|
+
name: wheels-${{ matrix.os }}
|
|
99
|
+
path: ./wheelhouse/*.whl
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: wheels
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build-wheels:
|
|
9
|
+
name: ${{ matrix.os }} ${{ matrix.arch || 'x64' }}
|
|
10
|
+
runs-on: ${{ matrix.os }}
|
|
11
|
+
strategy:
|
|
12
|
+
fail-fast: false
|
|
13
|
+
matrix:
|
|
14
|
+
include:
|
|
15
|
+
- os: ubuntu-24.04
|
|
16
|
+
arch: x86_64
|
|
17
|
+
- os: macos-latest
|
|
18
|
+
arch: arm64
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- uses: astral-sh/setup-uv@v5
|
|
23
|
+
|
|
24
|
+
- name: Build wheels
|
|
25
|
+
uses: pypa/cibuildwheel@v2.22
|
|
26
|
+
env:
|
|
27
|
+
CIBW_BUILD: "cp312-* cp313-*"
|
|
28
|
+
CIBW_ARCHS: ${{ matrix.arch }}
|
|
29
|
+
|
|
30
|
+
- uses: actions/upload-artifact@v4
|
|
31
|
+
with:
|
|
32
|
+
name: wheels-${{ matrix.os }}-${{ matrix.arch }}
|
|
33
|
+
path: wheelhouse/*.whl
|
|
34
|
+
|
|
35
|
+
build-sdist:
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/checkout@v4
|
|
39
|
+
- uses: astral-sh/setup-uv@v5
|
|
40
|
+
- run: uv build --sdist
|
|
41
|
+
- uses: actions/upload-artifact@v4
|
|
42
|
+
with:
|
|
43
|
+
name: sdist
|
|
44
|
+
path: dist/*.tar.gz
|
|
45
|
+
|
|
46
|
+
publish:
|
|
47
|
+
needs: [build-wheels, build-sdist]
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
permissions:
|
|
50
|
+
id-token: write
|
|
51
|
+
environment: pypi
|
|
52
|
+
steps:
|
|
53
|
+
- uses: actions/download-artifact@v4
|
|
54
|
+
with:
|
|
55
|
+
pattern: "*"
|
|
56
|
+
path: dist
|
|
57
|
+
merge-multiple: true
|
|
58
|
+
|
|
59
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
polca-0.4.0/.gitignore
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Build artifacts
|
|
2
|
+
build/
|
|
3
|
+
dist/
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
|
|
7
|
+
# Python
|
|
8
|
+
__pycache__/
|
|
9
|
+
*.py[cod]
|
|
10
|
+
*.so
|
|
11
|
+
*.pyd
|
|
12
|
+
*.dylib
|
|
13
|
+
*.dll
|
|
14
|
+
|
|
15
|
+
# uv
|
|
16
|
+
.venv/
|
|
17
|
+
uv.lock
|
|
18
|
+
|
|
19
|
+
# IDEs
|
|
20
|
+
.vscode/
|
|
21
|
+
.idea/
|
|
22
|
+
*.swp
|
|
23
|
+
*.swo
|
|
24
|
+
*~
|
|
25
|
+
|
|
26
|
+
# OS
|
|
27
|
+
.DS_Store
|
|
28
|
+
Thumbs.db
|
|
29
|
+
|
|
30
|
+
# Local-only scripts
|
|
31
|
+
rebuild.sh
|
|
32
|
+
|
|
33
|
+
# Agent instructions (local-only)
|
|
34
|
+
AGENTS.md
|
|
35
|
+
|
|
36
|
+
.cache/
|
|
37
|
+
package-lock.json
|
|
38
|
+
.cache
|
|
39
|
+
docs/
|
|
40
|
+
compile_commands.json
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.9.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
|
|
9
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
10
|
+
rev: v1.14.1
|
|
11
|
+
hooks:
|
|
12
|
+
- id: mypy
|
|
13
|
+
args: [--strict, --ignore-missing-imports, pypolca/]
|
|
14
|
+
additional_dependencies: [numpy, polars]
|
|
15
|
+
pass_filenames: false
|
|
16
|
+
|
|
17
|
+
- repo: https://github.com/pre-commit/mirrors-clang-format
|
|
18
|
+
rev: v18.1.8
|
|
19
|
+
hooks:
|
|
20
|
+
- id: clang-format
|
|
21
|
+
types_or: [c++, c]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.18)
|
|
2
|
+
project(pypolca LANGUAGES CXX)
|
|
3
|
+
|
|
4
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
5
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
6
|
+
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
7
|
+
|
|
8
|
+
# --- Fetch Eigen3 (header-only) ---
|
|
9
|
+
include(FetchContent)
|
|
10
|
+
FetchContent_Declare(
|
|
11
|
+
Eigen
|
|
12
|
+
GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
|
|
13
|
+
GIT_TAG 3.4.0
|
|
14
|
+
GIT_SHALLOW TRUE
|
|
15
|
+
)
|
|
16
|
+
FetchContent_MakeAvailable(Eigen)
|
|
17
|
+
|
|
18
|
+
# --- Fetch pybind11 ---
|
|
19
|
+
FetchContent_Declare(
|
|
20
|
+
pybind11
|
|
21
|
+
GIT_REPOSITORY https://github.com/pybind/pybind11.git
|
|
22
|
+
GIT_TAG v2.12.0
|
|
23
|
+
GIT_SHALLOW TRUE
|
|
24
|
+
)
|
|
25
|
+
FetchContent_MakeAvailable(pybind11)
|
|
26
|
+
|
|
27
|
+
# --- Build C++ code ---
|
|
28
|
+
add_subdirectory(src/cpp)
|
|
29
|
+
|
|
30
|
+
add_executable(tryout ${CMAKE_SOURCE_DIR}/examples/tryout.cpp)
|
|
31
|
+
target_link_libraries(tryout PRIVATE pypolca_core)
|
polca-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: polca
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Polytomous Variable Latent Class Analysis in C++ with Python bindings
|
|
5
|
+
Keywords: latent class analysis,EM algorithm,mixture model
|
|
6
|
+
Author-Email: =?utf-8?q?Marc-Andr=C3=A9_Ch=C3=A9nier?= <marcandrechenier@gmail.com>
|
|
7
|
+
License: GPL-2.0-or-later
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)
|
|
11
|
+
Classifier: Programming Language :: C++
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: numpy>=1.20
|
|
18
|
+
Requires-Dist: polars
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
22
|
+
Requires-Dist: mypy; extra == "dev"
|
|
23
|
+
Requires-Dist: ruff; extra == "dev"
|
|
24
|
+
Provides-Extra: test
|
|
25
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
26
|
+
Requires-Dist: numpy; extra == "test"
|
|
27
|
+
Requires-Dist: polars; extra == "test"
|
|
28
|
+
Requires-Dist: scipy; extra == "test"
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# pypoLCA
|
|
32
|
+
|
|
33
|
+
Polytomous variable latent class analysis (LCA) for Python, powered by a C++17 backend. `pypoLCA` is a translation of R's [poLCA](https://github.com/dlinzer/poLCA) package by Drew Linzer and Jeffrey Lewis.
|
|
34
|
+
|
|
35
|
+
## What is latent class analysis?
|
|
36
|
+
|
|
37
|
+
LCA is a statistical method that discovers latent (unobserved) categorical variables from a set of nominal responses. The core assumption is that observed responses are conditionally independent given the latent variable — all dependencies between responses flow through the latent structure.
|
|
38
|
+
|
|
39
|
+
The model identifies two things:
|
|
40
|
+
|
|
41
|
+
1. The underlying latent classes (e.g., "high-risk" vs. "low-risk" respondents), and
|
|
42
|
+
2. The conditional probabilities of each observed response given each class.
|
|
43
|
+
|
|
44
|
+
LCA's latent variables are categorical (e.g., *class 1* = "non-cheaters", *class 2* = "chronic cheaters"). This makes LCA the categorical-data analogue of Gaussian Mixture Models (GMM): GMM assumes normally-distributed responses, LCA assumes multinomial responses. Both fit parameters via Expectation-Maximisation (EM). A good tutorial extending EM beyond standard GMM applications is [Gao (2022)](https://teng-gao.github.io/blog/2022/ems/).
|
|
45
|
+
|
|
46
|
+
### Applications
|
|
47
|
+
|
|
48
|
+
- **Diagnostic agreement** — LCA estimates rater accuracy without a gold-standard reference. Applied to carcinoma diagnoses by seven pathologists (Uebersax & Grove, 1990; dataset: `carcinoma`).
|
|
49
|
+
- **Political typology** — LCA identifies voter segments from candidate trait ratings. Applied to 2000 ANES survey data (dataset: `election`).
|
|
50
|
+
- **Academic dishonesty** — Latent classes of cheating behavior among students, regressed on GPA covariates (dataset: `cheating`).
|
|
51
|
+
- **Survey attitude clustering** — Uncovering latent opinion groups from social survey responses (McCutcheon, 1987; dataset: `gss82`).
|
|
52
|
+
|
|
53
|
+
### Latent class regression
|
|
54
|
+
|
|
55
|
+
LCA can be extended with **covariates** that predict class membership. `pypoLCA` fits latent class regression using the same hybrid EM / Newton-Raphson algorithm as R's `poLCA`. The EM loop alternates expected-posterior and maximisation steps. Response probabilities have a closed-form M-step, which guarantees the standard EM ascent property. Covariate coefficients lack a closed form and are updated within each M-step via Newton-Raphson (NR). Unlike pure EM, the NR step can overshoot and cause a likelihood drop. The implementation detects this and restarts with perturbed starting values (`max_restarts`). In any case, the algorithm finds only a local maximum, so multiple random starts (`nrep`) are recommended.
|
|
56
|
+
|
|
57
|
+
Standard errors are provided for all parameter estimates: conditional response probabilities, prior class probabilities, and, when covariates are present, regression coefficients. SEs are computed from the observed information matrix via the outer product of the individual score contributions, then transformed to probability space via the delta method. This matches the approach used by R's `poLCA`.
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install polca  # distribution name is `polca`; the import package is `pypolca`
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
> Until published on PyPI, install from source:
|
|
66
|
+
>
|
|
67
|
+
> ```bash
|
|
68
|
+
> git clone https://github.com/.../pypoLCA.git
|
|
69
|
+
> cd pypoLCA
|
|
70
|
+
> uv pip install -e ".[dev]"
|
|
71
|
+
> ```
|
|
72
|
+
|
|
73
|
+
## Quick start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import pypolca as lca
|
|
77
|
+
|
|
78
|
+
# Load a built-in dataset (a Polars DataFrame)
|
|
79
|
+
df = lca.load_dataset("carcinoma")
|
|
80
|
+
|
|
81
|
+
# Fit a 2-class model — seven pathologists rating 118 slides
|
|
82
|
+
result = lca.fit("cbind(A, B, C, D, E, F, G) ~ 1", data=df, nclass=2, nrep=5)
|
|
83
|
+
|
|
84
|
+
# Inspect results
|
|
85
|
+
print(f"Log-likelihood: {result.loglik:.2f}")
|
|
86
|
+
print(f"AIC: {result.aic:.2f}")
|
|
87
|
+
print(f"Iterations: {result.iterations}")
|
|
88
|
+
|
|
89
|
+
# Class-conditional probabilities for the first item
|
|
90
|
+
print(result.probs[0]) # shape (nclass, n_categories)
|
|
91
|
+
|
|
92
|
+
# Posterior class membership probabilities (N × nclass)
|
|
93
|
+
print(result.posterior[:5])
|
|
94
|
+
|
|
95
|
+
# Predicted class for each observation (1-based)
|
|
96
|
+
print(result.predclass[:5]) # 1-based (matching R poLCA convention)
|
|
97
|
+
|
|
98
|
+
# Standard errors
|
|
99
|
+
print(result.probs_se[0]) # SEs for first item
|
|
100
|
+
print(result.P_se) # SEs for class priors
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**With covariates (latent class regression):**
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
df = lca.load_dataset("cheating")
|
|
107
|
+
|
|
108
|
+
# Cheating behaviours ~ GPA
|
|
109
|
+
result = lca.fit(
|
|
110
|
+
"cbind(LIEEXAM, LIEPAPER, FRAUD, COPYEXAM) ~ GPA",
|
|
111
|
+
data=df,
|
|
112
|
+
nclass=2,
|
|
113
|
+
nrep=10,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
print(result.coeff) # Regression coefficients (covariates × (classes − 1))
|
|
117
|
+
print(result.coeff_se) # Standard errors
|
|
118
|
+
print(result.P) # Population class shares
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The formula syntax uses `cbind(var1, var2, ...)` on the left-hand side, or equivalently Python-style `var1 + var2 + ...`. The right-hand side is `~ 1` for intercept-only or `~ cov1 + cov2` for latent class regression. Parsing is handled by a lightweight custom parser (no external formula library).
|
|
122
|
+
|
|
123
|
+
## Backend
|
|
124
|
+
|
|
125
|
+
The EM engine and standard error computation are written in C++17 (Eigen for linear algebra), exposed to Python via pybind11. The build system is CMake + scikit-build-core, managed by `uv`. Incremental C++ rebuilds take ~1–2 s with `./rebuild.sh`.
|
|
126
|
+
|
|
127
|
+
## Benchmarks
|
|
128
|
+
|
|
129
|
+
Comparison of `pypolca` (C++, with/without SE) vs R's `poLCA` on the `cheating` dataset (N=319, 4 binary items, 2 classes) and on larger 5-item, 2-class datasets (N=500 to 10,000). Timings are means over 20 runs.
|
|
130
|
+
|
|
131
|
+
| N | Items | Classes | R poLCA | pypolca (with SE) | Speed-up | pypolca (no SE) | Speed-up |
|
|
132
|
+
|--------|-------|---------|----------|-------------------|----------|-----------------|----------|
|
|
133
|
+
| 319 | 4 | 2 | — | — | — | — | — |
|
|
134
|
+
| 500 | 5 | 2 | — | — | — | — | — |
|
|
135
|
+
| 2,000 | 5 | 2 | — | — | — | — | — |
|
|
136
|
+
| 10,000 | 5 | 2 | — | — | — | — | — |
|
|
137
|
+
|
|
138
|
+
> *Results TBD — run `python scripts/benchmark.py` and `python scripts/benchmark_scaling.py` to populate with fresh numbers (requires R with `poLCA` and `jsonlite` installed). Speed-up is relative to R `poLCA`.*
|
|
139
|
+
|
|
140
|
+
## Datasets
|
|
141
|
+
|
|
142
|
+
| Dataset | N | Manifest items | Covariates | Source |
|
|
143
|
+
|------------|-------|------------------------------------------------------|------------------------|---------------------------|
|
|
144
|
+
| carcinoma | 118 | A–G (7 binary: no carcinoma / carcinoma) | — | Agresti (2002) |
|
|
145
|
+
| cheating | 319 | LIEEXAM, LIEPAPER, FRAUD, COPYEXAM (4 binary) | GPA | R poLCA |
|
|
146
|
+
| election | 1,785 | MORALG–INTELB (12 ordinal: 4-point trait ratings) | VOTE3, AGE, EDUC, etc. | 2000 ANES |
|
|
147
|
+
| gss82 | 1,202 | PURPOSE, ACCURACY, UNDERSTA, COOPERAT (2–3 categories)| — | McCutcheon (1987) |
|
|
148
|
+
| values | 216 | A–D (4 binary: universalistic / particularistic) | — | R poLCA |
|
|
149
|
+
|
|
150
|
+
## Credits
|
|
151
|
+
|
|
152
|
+
pypoLCA is a translation of Drew A. Linzer and Jeffrey B. Lewis's R package:
|
|
153
|
+
|
|
154
|
+
> Linzer, D. A., & Lewis, J. B. (2011). poLCA: An R Package for Polytomous Variable Latent Class Analysis. *Journal of Statistical Software*, 42(10), 1–29. [doi:10.18637/jss.v042.i10](https://doi.org/10.18637/jss.v042.i10)
|
|
155
|
+
|
|
156
|
+
Built-in datasets and the EM / Newton-Raphson algorithm are taken from the `poLCA` R package, licensed GPL-2.0-or-later.
|
|
157
|
+
|
|
158
|
+
C++ bindings powered by [pybind11](https://github.com/pybind/pybind11). Linear algebra via [Eigen](https://eigen.tuxfamily.org/).
|
|
159
|
+
|
|
160
|
+
## Contributing
|
|
161
|
+
|
|
162
|
+
Contributions are welcome and appreciated. Please keep submissions tight and purposeful. The goal is to keep `pypoLCA` a focused, maintainable package. AI-assisted contributions are fine, but AI use doesn't excuse sloppy or verbose work; review your output before submitting a PR.
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
GPL-2.0-or-later
|
polca-0.4.0/README.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# pypoLCA
|
|
2
|
+
|
|
3
|
+
Polytomous variable latent class analysis (LCA) for Python, powered by a C++17 backend. `pypoLCA` is a translation of R's [poLCA](https://github.com/dlinzer/poLCA) package by Drew Linzer and Jeffrey Lewis.
|
|
4
|
+
|
|
5
|
+
## What is latent class analysis?
|
|
6
|
+
|
|
7
|
+
LCA is a statistical method that discovers latent (unobserved) categorical variables from a set of nominal responses. The core assumption is that observed responses are conditionally independent given the latent variable — all dependencies between responses flow through the latent structure.
|
|
8
|
+
|
|
9
|
+
The model identifies two things:
|
|
10
|
+
|
|
11
|
+
1. The underlying latent classes (e.g., "high-risk" vs. "low-risk" respondents), and
|
|
12
|
+
2. The conditional probabilities of each observed response given each class.
|
|
13
|
+
|
|
14
|
+
LCA's latent variables are categorical (e.g., *class 1* = "non-cheaters", *class 2* = "chronic cheaters"). This makes LCA the categorical-data analogue of Gaussian Mixture Models (GMM): GMM assumes normally-distributed responses, LCA assumes multinomial responses. Both fit parameters via Expectation-Maximisation (EM). A good tutorial extending EM beyond standard GMM applications is [Gao (2022)](https://teng-gao.github.io/blog/2022/ems/).
|
|
15
|
+
|
|
16
|
+
### Applications
|
|
17
|
+
|
|
18
|
+
- **Diagnostic agreement** — LCA estimates rater accuracy without a gold-standard reference. Applied to carcinoma diagnoses by seven pathologists (Uebersax & Grove, 1990; dataset: `carcinoma`).
|
|
19
|
+
- **Political typology** — LCA identifies voter segments from candidate trait ratings. Applied to 2000 ANES survey data (dataset: `election`).
|
|
20
|
+
- **Academic dishonesty** — Latent classes of cheating behavior among students, regressed on GPA covariates (dataset: `cheating`).
|
|
21
|
+
- **Survey attitude clustering** — Uncovering latent opinion groups from social survey responses (McCutcheon, 1987; dataset: `gss82`).
|
|
22
|
+
|
|
23
|
+
### Latent class regression
|
|
24
|
+
|
|
25
|
+
LCA can be extended with **covariates** that predict class membership. `pypoLCA` fits latent class regression using the same hybrid EM / Newton-Raphson algorithm as R's `poLCA`. The EM loop alternates expected-posterior and maximisation steps. Response probabilities have a closed-form M-step, which guarantees the standard EM ascent property. Covariate coefficients lack a closed form and are updated within each M-step via Newton-Raphson (NR). Unlike pure EM, the NR step can overshoot and cause a likelihood drop. The implementation detects this and restarts with perturbed starting values (`max_restarts`). In any case, the algorithm finds only a local maximum, so multiple random starts (`nrep`) are recommended.
|
|
26
|
+
|
|
27
|
+
Standard errors are provided for all parameter estimates: conditional response probabilities, prior class probabilities, and, when covariates are present, regression coefficients. SEs are computed from the observed information matrix via the outer product of the individual score contributions, then transformed to probability space via the delta method. This matches the approach used by R's `poLCA`.
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install polca  # distribution name is `polca`; the import package is `pypolca`
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
> Until published on PyPI, install from source:
|
|
36
|
+
>
|
|
37
|
+
> ```bash
|
|
38
|
+
> git clone https://github.com/.../pypoLCA.git
|
|
39
|
+
> cd pypoLCA
|
|
40
|
+
> uv pip install -e ".[dev]"
|
|
41
|
+
> ```
|
|
42
|
+
|
|
43
|
+
## Quick start
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import pypolca as lca
|
|
47
|
+
|
|
48
|
+
# Load a built-in dataset (a Polars DataFrame)
|
|
49
|
+
df = lca.load_dataset("carcinoma")
|
|
50
|
+
|
|
51
|
+
# Fit a 2-class model — seven pathologists rating 118 slides
|
|
52
|
+
result = lca.fit("cbind(A, B, C, D, E, F, G) ~ 1", data=df, nclass=2, nrep=5)
|
|
53
|
+
|
|
54
|
+
# Inspect results
|
|
55
|
+
print(f"Log-likelihood: {result.loglik:.2f}")
|
|
56
|
+
print(f"AIC: {result.aic:.2f}")
|
|
57
|
+
print(f"Iterations: {result.iterations}")
|
|
58
|
+
|
|
59
|
+
# Class-conditional probabilities for the first item
|
|
60
|
+
print(result.probs[0]) # shape (nclass, n_categories)
|
|
61
|
+
|
|
62
|
+
# Posterior class membership probabilities (N × nclass)
|
|
63
|
+
print(result.posterior[:5])
|
|
64
|
+
|
|
65
|
+
# Predicted class for each observation (1-based)
|
|
66
|
+
print(result.predclass[:5]) # 1-based (matching R poLCA convention)
|
|
67
|
+
|
|
68
|
+
# Standard errors
|
|
69
|
+
print(result.probs_se[0]) # SEs for first item
|
|
70
|
+
print(result.P_se) # SEs for class priors
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**With covariates (latent class regression):**
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
df = lca.load_dataset("cheating")
|
|
77
|
+
|
|
78
|
+
# Cheating behaviours ~ GPA
|
|
79
|
+
result = lca.fit(
|
|
80
|
+
"cbind(LIEEXAM, LIEPAPER, FRAUD, COPYEXAM) ~ GPA",
|
|
81
|
+
data=df,
|
|
82
|
+
nclass=2,
|
|
83
|
+
nrep=10,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
print(result.coeff) # Regression coefficients (covariates × (classes − 1))
|
|
87
|
+
print(result.coeff_se) # Standard errors
|
|
88
|
+
print(result.P) # Population class shares
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The formula syntax uses `cbind(var1, var2, ...)` on the left-hand side, or equivalently Python-style `var1 + var2 + ...`. The right-hand side is `~ 1` for intercept-only or `~ cov1 + cov2` for latent class regression. Parsing is handled by a lightweight custom parser (no external formula library).
|
|
92
|
+
|
|
93
|
+
## Backend
|
|
94
|
+
|
|
95
|
+
The EM engine and standard error computation are written in C++17 (Eigen for linear algebra), exposed to Python via pybind11. The build system is CMake + scikit-build-core, managed by `uv`. Incremental C++ rebuilds take ~1–2 s with `./rebuild.sh`.
|
|
96
|
+
|
|
97
|
+
## Benchmarks
|
|
98
|
+
|
|
99
|
+
Comparison of `pypolca` (C++, with/without SE) vs R's `poLCA` on the `cheating` dataset (N=319, 4 binary items, 2 classes) and on larger 5-item, 2-class datasets (N=500 to 10,000). Timings are means over 20 runs.
|
|
100
|
+
|
|
101
|
+
| N | Items | Classes | R poLCA | pypolca (with SE) | Speed-up | pypolca (no SE) | Speed-up |
|
|
102
|
+
|--------|-------|---------|----------|-------------------|----------|-----------------|----------|
|
|
103
|
+
| 319 | 4 | 2 | — | — | — | — | — |
|
|
104
|
+
| 500 | 5 | 2 | — | — | — | — | — |
|
|
105
|
+
| 2,000 | 5 | 2 | — | — | — | — | — |
|
|
106
|
+
| 10,000 | 5 | 2 | — | — | — | — | — |
|
|
107
|
+
|
|
108
|
+
> *Results TBD — run `python scripts/benchmark.py` and `python scripts/benchmark_scaling.py` to populate with fresh numbers (requires R with `poLCA` and `jsonlite` installed). Speed-up is relative to R `poLCA`.*
|
|
109
|
+
|
|
110
|
+
## Datasets
|
|
111
|
+
|
|
112
|
+
| Dataset | N | Manifest items | Covariates | Source |
|
|
113
|
+
|------------|-------|------------------------------------------------------|------------------------|---------------------------|
|
|
114
|
+
| carcinoma | 118 | A–G (7 binary: no carcinoma / carcinoma) | — | Agresti (2002) |
|
|
115
|
+
| cheating | 319 | LIEEXAM, LIEPAPER, FRAUD, COPYEXAM (4 binary) | GPA | R poLCA |
|
|
116
|
+
| election | 1,785 | MORALG–INTELB (12 ordinal: 4-point trait ratings) | VOTE3, AGE, EDUC, etc. | 2000 ANES |
|
|
117
|
+
| gss82 | 1,202 | PURPOSE, ACCURACY, UNDERSTA, COOPERAT (2–3 categories)| — | McCutcheon (1987) |
|
|
118
|
+
| values | 216 | A–D (4 binary: universalistic / particularistic) | — | R poLCA |
|
|
119
|
+
|
|
120
|
+
## Credits
|
|
121
|
+
|
|
122
|
+
pypoLCA is a translation of Drew A. Linzer and Jeffrey B. Lewis's R package:
|
|
123
|
+
|
|
124
|
+
> Linzer, D. A., & Lewis, J. B. (2011). poLCA: An R Package for Polytomous Variable Latent Class Analysis. *Journal of Statistical Software*, 42(10), 1–29. [doi:10.18637/jss.v042.i10](https://doi.org/10.18637/jss.v042.i10)
|
|
125
|
+
|
|
126
|
+
Built-in datasets and the EM / Newton-Raphson algorithm are taken from the `poLCA` R package, licensed GPL-2.0-or-later.
|
|
127
|
+
|
|
128
|
+
C++ bindings powered by [pybind11](https://github.com/pybind/pybind11). Linear algebra via [Eigen](https://eigen.tuxfamily.org/).
|
|
129
|
+
|
|
130
|
+
## Contributing
|
|
131
|
+
|
|
132
|
+
Contributions are welcome and appreciated. Please keep submissions tight and purposeful. The goal is to keep `pypoLCA` a focused, maintainable package. AI-assisted contributions are fine, but AI use doesn't excuse sloppy or verbose work; review your output before submitting a PR.
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
GPL-2.0-or-later
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Example: fit a basic latent class model with pypoLCA."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import polars as pl
|
|
5
|
+
|
|
6
|
+
# Import the low-level C++ bindings directly
|
|
7
|
+
from pypolca._core import Data, fit_em
|
|
8
|
+
|
|
9
|
+
# --- Create synthetic data ---
|
|
10
|
+
np.random.seed(42)
|
|
11
|
+
N = 500
|
|
12
|
+
|
|
13
|
+
# 4 manifest variables, each with 3 categories
|
|
14
|
+
J = 4
|
|
15
|
+
K = 3
|
|
16
|
+
|
|
17
|
+
# True latent class labels
|
|
18
|
+
true_class = np.random.choice([0, 1], size=N, p=[0.4, 0.6])
|
|
19
|
+
|
|
20
|
+
# Class-conditional response probabilities
|
|
21
|
+
# Class 0: tends to answer 1
|
|
22
|
+
# Class 1: tends to answer 3
|
|
23
|
+
probs_c0 = np.array([0.6, 0.3, 0.1])
|
|
24
|
+
probs_c1 = np.array([0.1, 0.3, 0.6])
|
|
25
|
+
|
|
26
|
+
y = np.zeros((N, J), dtype=np.int32)
|
|
27
|
+
# Draw every response from the categorical distribution of the respondent's
# true class. Responses are coded 1..K (1-based), so the labels 1..K are
# sampled directly: adding 1 on top of them would produce codes 2..K+1, which
# exceeds the K categories declared in `data.num_choices` and contradicts the
# "Class 0: tends to answer 1" intent stated above.
categories = np.arange(1, K + 1)
for i in range(N):
    # The class is fixed per respondent, so pick its probabilities once per row.
    class_probs = probs_c0 if true_class[i] == 0 else probs_c1
    for j in range(J):
        y[i, j] = np.random.choice(categories, p=class_probs)
|
|
33
|
+
|
|
34
|
+
# Build Data object
|
|
35
|
+
data = Data()
|
|
36
|
+
data.y = y
|
|
37
|
+
data.x = np.ones((N, 1), dtype=np.float64) # intercept only (no covariates)
|
|
38
|
+
data.num_choices = [K] * J
|
|
39
|
+
|
|
40
|
+
# --- Fit model ---
|
|
41
|
+
print("Fitting 2-class LCA model...")
|
|
42
|
+
result = fit_em(data, nclass=2, maxiter=200, tol=1e-8)
|
|
43
|
+
|
|
44
|
+
print(f"\nConverged: {result.converged}")
|
|
45
|
+
print(f"Iterations: {result.iterations}")
|
|
46
|
+
print(f"Log-likelihood: {result.loglik:.4f}")
|
|
47
|
+
print(f"Class shares: {result.posterior.mean(axis=0)}")
|
|
48
|
+
print(f"Predclass counts: {np.bincount(result.posterior.argmax(axis=1))}")
|
|
49
|
+
|
|
50
|
+
# --- Try the high-level API ---
|
|
51
|
+
try:
|
|
52
|
+
from pypolca.api import fit
|
|
53
|
+
|
|
54
|
+
df = pl.DataFrame(
|
|
55
|
+
y,
|
|
56
|
+
schema=[f"Y{j + 1}" for j in range(J)],
|
|
57
|
+
)
|
|
58
|
+
result2 = fit("Y1 + Y2 + Y3 + Y4 ~ 1", df, nclass=2)
|
|
59
|
+
print(f"\nHigh-level API result: {result2}")
|
|
60
|
+
print(f"Class shares: {result2.P}")
|
|
61
|
+
except Exception as e:
|
|
62
|
+
print(f"High-level API not yet functional: {e}")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from pypolca._core import Data, Params, compute_ylik
|
|
4
|
+
|
|
5
|
+
data = Data()
|
|
6
|
+
data.y = np.array([[1], [2]], dtype=np.int32) # 1-based
|
|
7
|
+
data.x = np.ones((2, 1), dtype=np.float64)
|
|
8
|
+
data.num_choices = [2]
|
|
9
|
+
|
|
10
|
+
p = Params()
|
|
11
|
+
# 1 class, 2 categories: vecprobs layout = [class0_cat0, class0_cat1]
|
|
12
|
+
p.vecprobs = np.array([0.3, 0.7], dtype=np.float64)
|
|
13
|
+
p.beta = np.array([], dtype=np.float64)
|
|
14
|
+
|
|
15
|
+
lik = compute_ylik(data, p, 1)
|
|
16
|
+
print(lik)
|