polca 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. polca-0.4.0/.clang-format +11 -0
  2. polca-0.4.0/.github/workflows/ci.yml +99 -0
  3. polca-0.4.0/.github/workflows/wheels.yml +59 -0
  4. polca-0.4.0/.gitignore +40 -0
  5. polca-0.4.0/.pre-commit-config.yaml +21 -0
  6. polca-0.4.0/CMakeLists.txt +31 -0
  7. polca-0.4.0/PKG-INFO +166 -0
  8. polca-0.4.0/README.md +136 -0
  9. polca-0.4.0/examples/basic_fit.py +62 -0
  10. polca-0.4.0/examples/debug_ylik.py +16 -0
  11. polca-0.4.0/examples/quick_analyses.py +41 -0
  12. polca-0.4.0/examples/tryout.cpp +111 -0
  13. polca-0.4.0/pypolca/__init__.py +19 -0
  14. polca-0.4.0/pypolca/api.py +465 -0
  15. polca-0.4.0/pypolca/data/__init__.py +3 -0
  16. polca-0.4.0/pypolca/data/_dataset.py +195 -0
  17. polca-0.4.0/pypolca/data/carcinoma.csv +119 -0
  18. polca-0.4.0/pypolca/data/cheating.csv +320 -0
  19. polca-0.4.0/pypolca/data/election.csv +1786 -0
  20. polca-0.4.0/pypolca/data/gss82.csv +1203 -0
  21. polca-0.4.0/pypolca/data/values.csv +217 -0
  22. polca-0.4.0/pypolca/utils.py +60 -0
  23. polca-0.4.0/pyproject.toml +90 -0
  24. polca-0.4.0/scripts/benchmark.py +167 -0
  25. polca-0.4.0/scripts/benchmark_scaling.py +141 -0
  26. polca-0.4.0/scripts/parity_covariates_py.py +84 -0
  27. polca-0.4.0/scripts/parity_covariates_r.R +57 -0
  28. polca-0.4.0/scripts/parity_deep_py.py +159 -0
  29. polca-0.4.0/scripts/parity_final.py +162 -0
  30. polca-0.4.0/scripts/parity_se_deep_r.R +65 -0
  31. polca-0.4.0/scripts/parity_se_py.py +85 -0
  32. polca-0.4.0/scripts/parity_se_r.R +54 -0
  33. polca-0.4.0/scripts/parity_vce_r.R +86 -0
  34. polca-0.4.0/scripts/quick_analysis.py +26 -0
  35. polca-0.4.0/src/cpp/CMakeLists.txt +29 -0
  36. polca-0.4.0/src/cpp/bindings.cpp +96 -0
  37. polca-0.4.0/src/cpp/core/em_engine.cpp +145 -0
  38. polca-0.4.0/src/cpp/core/math_ops.cpp +376 -0
  39. polca-0.4.0/src/cpp/include/pypolca/em_engine.h +23 -0
  40. polca-0.4.0/src/cpp/include/pypolca/math_ops.h +77 -0
  41. polca-0.4.0/src/cpp/include/pypolca/types.h +63 -0
  42. polca-0.4.0/tests/python/test_basic.py +58 -0
  43. polca-0.4.0/tests/python/test_math_ops.py +243 -0
  44. polca-0.4.0/tests/python/test_r_comparison.py +283 -0
@@ -0,0 +1,11 @@
1
+ ---
2
+ BasedOnStyle: Google
3
+ IndentWidth: 4
4
+ TabWidth: 4
5
+ ColumnLimit: 100
6
+ AccessModifierOffset: -4
7
+ AllowShortFunctionsOnASingleLine: None
8
+ AllowShortIfStatementsOnASingleLine: false
9
+ AllowShortLoopsOnASingleLine: false
10
+ BreakBeforeBraces: Attach
11
+ PointerAlignment: Left
@@ -0,0 +1,99 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ concurrency:
10
+ group: ${{ github.workflow }}-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ # ── Python lint & format (fast, no build) ────────────────────────
15
+ lint-python:
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - uses: astral-sh/setup-uv@v5
20
+ - run: uvx ruff check .
21
+ - run: uvx ruff format --check .
22
+
23
+ # ── Python type check (needs C++ build) ──────────────────────────
24
+ typecheck:
25
+ runs-on: ubuntu-latest
26
+ steps:
27
+ - uses: actions/checkout@v4
28
+ - uses: astral-sh/setup-uv@v5
29
+ with:
30
+ python-version: "3.12"
31
+ - run: sudo apt-get install -y ninja-build
32
+ - run: uv pip install -e .
33
+ - run: uvx mypy pypolca/
34
+
35
+ # ── C++ format check ─────────────────────────────────────────────
36
+ lint-cpp:
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+ - run: |
41
+ find src -type f \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) \
42
+ | xargs clang-format-18 --dry-run --Werror
43
+
44
+ # ── Build & test matrix ──────────────────────────────────────────
45
+ test:
46
+ needs: [lint-python, lint-cpp, typecheck]
47
+ strategy:
48
+ fail-fast: false
49
+ matrix:
50
+ os: [ubuntu-latest, macos-latest]
51
+ runs-on: ${{ matrix.os }}
52
+ steps:
53
+ - uses: actions/checkout@v4
54
+ - uses: astral-sh/setup-uv@v5
55
+ with:
56
+ python-version: "3.12"
57
+
58
+ - name: Install system deps (Ubuntu)
59
+ if: runner.os == 'Linux'
60
+ run: sudo apt-get install -y ninja-build
61
+
62
+ - name: Install system deps (macOS)
63
+ if: runner.os == 'macOS'
64
+ run: brew install ninja
65
+
66
+ - name: Build package
67
+ run: uv pip install -e .
68
+
69
+ - name: Install test deps
70
+ run: uv pip install pytest pytest-cov scipy
71
+
72
+ - name: Run tests with coverage
73
+ run: uv run pytest --cov=pypolca --cov-report=term-missing tests/
74
+
75
+ # ── Build wheels ─────────────────────────────────────────────────
76
+ build-wheels:
77
+ needs: test
78
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
79
+ strategy:
80
+ fail-fast: false
81
+ matrix:
82
+ os: [ubuntu-latest, macos-latest]
83
+ runs-on: ${{ matrix.os }}
84
+ steps:
85
+ - uses: actions/checkout@v4
86
+
87
+ - name: Build wheels
88
+ uses: pypa/cibuildwheel@v2.22
89
+ env:
90
+ CIBW_BUILD: "cp312-*"
91
+ CIBW_SKIP: "*_i686 *-musllinux*"
92
+ CIBW_ARCHS_MACOS: "arm64 x86_64"
93
+ CIBW_TEST_REQUIRES: "pytest scipy"  # same test deps as the `test` job (minus coverage)
94
+ CIBW_TEST_COMMAND: "pytest {package}/tests"
95
+
96
+ - uses: actions/upload-artifact@v4
97
+ with:
98
+ name: wheels-${{ matrix.os }}
99
+ path: ./wheelhouse/*.whl
@@ -0,0 +1,59 @@
1
+ name: wheels
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ build-wheels:
9
+ name: ${{ matrix.os }} ${{ matrix.arch || 'x64' }}
10
+ runs-on: ${{ matrix.os }}
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ include:
15
+ - os: ubuntu-24.04
16
+ arch: x86_64
17
+ - os: macos-latest
18
+ arch: arm64
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+
22
+ - uses: astral-sh/setup-uv@v5
23
+
24
+ - name: Build wheels
25
+ uses: pypa/cibuildwheel@v2.22
26
+ env:
27
+ CIBW_BUILD: "cp312-* cp313-*"
28
+ CIBW_ARCHS: ${{ matrix.arch }}
29
+
30
+ - uses: actions/upload-artifact@v4
31
+ with:
32
+ name: wheels-${{ matrix.os }}-${{ matrix.arch }}
33
+ path: wheelhouse/*.whl
34
+
35
+ build-sdist:
36
+ runs-on: ubuntu-latest
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+ - uses: astral-sh/setup-uv@v5
40
+ - run: uv build --sdist
41
+ - uses: actions/upload-artifact@v4
42
+ with:
43
+ name: sdist
44
+ path: dist/*.tar.gz
45
+
46
+ publish:
47
+ needs: [build-wheels, build-sdist]
48
+ runs-on: ubuntu-latest
49
+ permissions:
50
+ id-token: write
51
+ environment: pypi
52
+ steps:
53
+ - uses: actions/download-artifact@v4
54
+ with:
55
+ pattern: "*"
56
+ path: dist
57
+ merge-multiple: true
58
+
59
+ - uses: pypa/gh-action-pypi-publish@release/v1
polca-0.4.0/.gitignore ADDED
@@ -0,0 +1,40 @@
1
+ # Build artifacts
2
+ build/
3
+ dist/
4
+ *.egg-info/
5
+ .eggs/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ *.so
11
+ *.pyd
12
+ *.dylib
13
+ *.dll
14
+
15
+ # uv
16
+ .venv/
17
+ uv.lock
18
+
19
+ # IDEs
20
+ .vscode/
21
+ .idea/
22
+ *.swp
23
+ *.swo
24
+ *~
25
+
26
+ # OS
27
+ .DS_Store
28
+ Thumbs.db
29
+
30
+ # Local-only scripts
31
+ rebuild.sh
32
+
33
+ # Agent instructions (local-only)
34
+ AGENTS.md
35
+
36
+ .cache/
37
+ package-lock.json
38
+ .cache
39
+ docs/
40
+ compile_commands.json
@@ -0,0 +1,21 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.9.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+
9
+ - repo: https://github.com/pre-commit/mirrors-mypy
10
+ rev: v1.14.1
11
+ hooks:
12
+ - id: mypy
13
+ args: [--strict, --ignore-missing-imports, pypolca/]
14
+ additional_dependencies: [numpy, polars]
15
+ pass_filenames: false
16
+
17
+ - repo: https://github.com/pre-commit/mirrors-clang-format
18
+ rev: v18.1.8
19
+ hooks:
20
+ - id: clang-format
21
+ types_or: [c++, c]
@@ -0,0 +1,31 @@
1
+ cmake_minimum_required(VERSION 3.18)
2
+ project(pypolca LANGUAGES CXX)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+ set(CMAKE_CXX_EXTENSIONS OFF)
7
+
8
+ # --- Fetch Eigen3 (header-only) ---
9
+ include(FetchContent)
10
+ FetchContent_Declare(
11
+ Eigen
12
+ GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
13
+ GIT_TAG 3.4.0
14
+ GIT_SHALLOW TRUE
15
+ )
16
+ FetchContent_MakeAvailable(Eigen)
17
+
18
+ # --- Fetch pybind11 ---
19
+ FetchContent_Declare(
20
+ pybind11
21
+ GIT_REPOSITORY https://github.com/pybind/pybind11.git
22
+ GIT_TAG v2.12.0
23
+ GIT_SHALLOW TRUE
24
+ )
25
+ FetchContent_MakeAvailable(pybind11)
26
+
27
+ # --- Build C++ code ---
28
+ add_subdirectory(src/cpp)
29
+
30
+ add_executable(tryout ${CMAKE_SOURCE_DIR}/examples/tryout.cpp)
31
+ target_link_libraries(tryout PRIVATE pypolca_core)
polca-0.4.0/PKG-INFO ADDED
@@ -0,0 +1,166 @@
1
+ Metadata-Version: 2.1
2
+ Name: polca
3
+ Version: 0.4.0
4
+ Summary: Polytomous Variable Latent Class Analysis in C++ with Python bindings
5
+ Keywords: latent class analysis,EM algorithm,mixture model
6
+ Author-Email: =?utf-8?q?Marc-Andr=C3=A9_Ch=C3=A9nier?= <marcandrechenier@gmail.com>
7
+ License: GPL-2.0-or-later
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)
11
+ Classifier: Programming Language :: C++
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: numpy>=1.20
18
+ Requires-Dist: polars
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=7.0; extra == "dev"
21
+ Requires-Dist: pytest-cov; extra == "dev"
22
+ Requires-Dist: mypy; extra == "dev"
23
+ Requires-Dist: ruff; extra == "dev"
24
+ Provides-Extra: test
25
+ Requires-Dist: pytest>=7.0; extra == "test"
26
+ Requires-Dist: numpy; extra == "test"
27
+ Requires-Dist: polars; extra == "test"
28
+ Requires-Dist: scipy; extra == "test"
29
+ Description-Content-Type: text/markdown
30
+
31
+ # pypoLCA
32
+
33
+ Polytomous variable latent class analysis (LCA) for Python, powered by a C++17 backend. `pypoLCA` is a translation of R's [poLCA](https://github.com/dlinzer/poLCA) package by Drew Linzer and Jeffrey Lewis.
34
+
35
+ ## What is latent class analysis?
36
+
37
+ LCA is a statistical method that discovers latent (unobserved) categorical variables from a set of nominal responses. The core assumption is that observed responses are mutually independent conditionally on the latent variable — all dependencies between responses flow through the latent structure.
38
+
39
+ The model identifies two things:
40
+
41
+ 1. The underlying latent classes (e.g., "high-risk" vs. "low-risk" respondents), and
42
+ 2. The conditional probabilities of each observed response given each class.
43
+
44
+ LCA's latent variables are categorical (e.g., *class 1* = "non-cheaters", *class 2* = "chronic cheaters"). This makes LCA the categorical-data analogue of Gaussian Mixture Models (GMM): GMM assumes normally-distributed responses, LCA assumes multinomial responses. Both fit parameters via Expectation-Maximisation (EM). A good tutorial extending EM beyond standard GMM applications is [Gao (2022)](https://teng-gao.github.io/blog/2022/ems/).
45
+
46
+ ### Applications
47
+
48
+ - **Diagnostic agreement** — LCA estimates rater accuracy without a gold-standard reference. Applied to carcinoma diagnoses by seven pathologists (Uebersax & Grove, 1990; dataset: `carcinoma`).
49
+ - **Political typology** — LCA identifies voter segments from candidate trait ratings. Applied to 2000 ANES survey data (dataset: `election`).
50
+ - **Academic dishonesty** — Latent classes of cheating behavior among students, regressed on GPA covariates (dataset: `cheating`).
51
+ - **Survey attitude clustering** — Uncovering latent opinion groups from social survey responses (McCutcheon, 1987; dataset: `gss82`).
52
+
53
+ ### Latent class regression
54
+
55
+ LCA can be extended with **covariates** that predict class membership. `pypoLCA` fits latent class regression using the same hybrid EM / Newton-Raphson algorithm as R's `poLCA`. The EM loop alternates expected-posterior and maximisation steps. Response probabilities have a closed-form M-step, which guarantees the standard EM ascent property. Covariate coefficients lack a closed form and are updated within each M-step via Newton-Raphson (NR). Unlike pure EM, the NR step can overshoot and cause a likelihood drop. The implementation detects this and restarts with perturbed starting values (`max_restarts`). In any case, the algorithm finds only a local maximum, so multiple random starts (`nrep`) are recommended.
56
+
57
+ Standard errors are provided for all parameter estimates: conditional response probabilities, prior class probabilities, and, when covariates are present, regression coefficients. SEs are computed from the observed information matrix via the outer product of the individual score contributions, then transformed to probability space via the delta method. This matches the approach used by R's `poLCA`.
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install polca
63
+ ```
64
+
65
+ > Until published on PyPI, install from source:
66
+ >
67
+ > ```bash
68
+ > git clone https://github.com/.../pypoLCA.git
69
+ > cd pypoLCA
70
+ > uv pip install -e ".[dev]"
71
+ > ```
72
+
73
+ ## Quick start
74
+
75
+ ```python
76
+ import pypolca as lca
77
+
78
+ # Load a built-in dataset (a Polars DataFrame)
79
+ df = lca.load_dataset("carcinoma")
80
+
81
+ # Fit a 2-class model — seven pathologists rating 118 slides
82
+ result = lca.fit("cbind(A, B, C, D, E, F, G) ~ 1", data=df, nclass=2, nrep=5)
83
+
84
+ # Inspect results
85
+ print(f"Log-likelihood: {result.loglik:.2f}")
86
+ print(f"AIC: {result.aic:.2f}")
87
+ print(f"Iterations: {result.iterations}")
88
+
89
+ # Class-conditional probabilities for the first item
90
+ print(result.probs[0]) # shape (nclass, n_categories)
91
+
92
+ # Posterior class membership (N × R)
93
+ print(result.posterior[:5])
94
+
95
+ # Predicted class for each observation (1-based)
96
+ print(result.predclass[:5]) # 1-based (matching R poLCA convention)
97
+
98
+ # Standard errors
99
+ print(result.probs_se[0]) # SEs for first item
100
+ print(result.P_se) # SEs for class priors
101
+ ```
102
+
103
+ **With covariates (latent class regression):**
104
+
105
+ ```python
106
+ df = lca.load_dataset("cheating")
107
+
108
+ # Cheating behaviours ~ GPA
109
+ result = lca.fit(
110
+ "cbind(LIEEXAM, LIEPAPER, FRAUD, COPYEXAM) ~ GPA",
111
+ data=df,
112
+ nclass=2,
113
+ nrep=10,
114
+ )
115
+
116
+ print(result.coeff) # Regression coefficients (covariates × (classes − 1))
117
+ print(result.coeff_se) # Standard errors
118
+ print(result.P) # Population class shares
119
+ ```
120
+
121
+ The formula syntax uses `cbind(var1, var2, ...)` on the left-hand side, or equivalently Python-style `var1 + var2 + ...`. The right-hand side is `~ 1` for intercept-only or `~ cov1 + cov2` for latent class regression. Parsing is handled by a lightweight custom parser (no external formula library).
122
+
123
+ ## Backend
124
+
125
+ The EM engine and standard error computation are written in C++17 (Eigen for linear algebra), exposed to Python via pybind11. The build system is CMake + scikit-build-core, managed by `uv`. Incremental C++ rebuilds take ~1–2 s with `./rebuild.sh`.
126
+
127
+ ## Benchmarks
128
+
129
+ Comparison of `pypolca` (C++, with/without SE) vs R's `poLCA` on the `cheating` dataset (N=319, 4 binary items, 2 classes). Timings are means over 20 runs.
130
+
131
+ | N | Items | Classes | R poLCA | pypolca (with SE) | Speed-up | pypolca (no SE) | Speed-up |
132
+ |--------|-------|---------|----------|-------------------|----------|-----------------|----------|
133
+ | 319 | 4 | 2 | — | — | — | — | — |
134
+ | 500 | 5 | 2 | — | — | — | — | — |
135
+ | 2,000 | 5 | 2 | — | — | — | — | — |
136
+ | 10,000 | 5 | 2 | — | — | — | — | — |
137
+
138
+ > *Results TBD — run `python scripts/benchmark.py` and `python scripts/benchmark_scaling.py` to populate with fresh numbers (requires R with `poLCA` and `jsonlite` installed). Speed-up is relative to R `poLCA`.*
139
+
140
+ ## Datasets
141
+
142
+ | Dataset | N | Manifest items | Covariates | Source |
143
+ |------------|-------|------------------------------------------------------|------------------------|---------------------------|
144
+ | carcinoma | 118 | A–G (7 binary: no carcinoma / carcinoma) | — | Agresti (2002) |
145
+ | cheating | 319 | LIEEXAM, LIEPAPER, FRAUD, COPYEXAM (4 binary) | GPA | R poLCA |
146
+ | election | 1,785 | MORALG–INTELB (12 ordinal: 4-point trait ratings) | VOTE3, AGE, EDUC, etc. | 2000 ANES |
147
+ | gss82 | 1,202 | PURPOSE, ACCURACY, UNDERSTA, COOPERAT (2–3 categories)| — | McCutcheon (1987) |
148
+ | values | 216 | A–D (4 binary: universalistic / particularistic) | — | R poLCA |
149
+
150
+ ## Credits
151
+
152
+ pypoLCA is a translation of Drew A. Linzer and Jeffrey B. Lewis's R package:
153
+
154
+ > Linzer, D. A., & Lewis, J. B. (2011). poLCA: An R Package for Polytomous Variable Latent Class Analysis. *Journal of Statistical Software*, 42(10), 1–29. [doi:10.18637/jss.v042.i10](https://doi.org/10.18637/jss.v042.i10)
155
+
156
+ Built-in datasets and the EM / Newton-Raphson algorithm are taken from the `poLCA` R package, licensed GPL-2.0-or-later.
157
+
158
+ C++ bindings powered by [pybind11](https://github.com/pybind/pybind11). Linear algebra via [Eigen](https://eigen.tuxfamily.org/).
159
+
160
+ ## Contributing
161
+
162
+ Contributions are welcome and appreciated. Please keep submissions tight and purposeful. The goal is to keep `pypoLCA` a focused, maintainable package. AI-assisted contributions are fine, but AI use doesn't excuse sloppy or verbose work; review your output before submitting a PR.
163
+
164
+ ## License
165
+
166
+ GPL-2.0-or-later
polca-0.4.0/README.md ADDED
@@ -0,0 +1,136 @@
1
+ # pypoLCA
2
+
3
+ Polytomous variable latent class analysis (LCA) for Python, powered by a C++17 backend. `pypoLCA` is a translation of R's [poLCA](https://github.com/dlinzer/poLCA) package by Drew Linzer and Jeffrey Lewis.
4
+
5
+ ## What is latent class analysis?
6
+
7
+ LCA is a statistical method that discovers latent (unobserved) categorical variables from a set of nominal responses. The core assumption is that observed responses are mutually independent conditionally on the latent variable — all dependencies between responses flow through the latent structure.
8
+
9
+ The model identifies two things:
10
+
11
+ 1. The underlying latent classes (e.g., "high-risk" vs. "low-risk" respondents), and
12
+ 2. The conditional probabilities of each observed response given each class.
13
+
14
+ LCA's latent variables are categorical (e.g., *class 1* = "non-cheaters", *class 2* = "chronic cheaters"). This makes LCA the categorical-data analogue of Gaussian Mixture Models (GMM): GMM assumes normally-distributed responses, LCA assumes multinomial responses. Both fit parameters via Expectation-Maximisation (EM). A good tutorial extending EM beyond standard GMM applications is [Gao (2022)](https://teng-gao.github.io/blog/2022/ems/).
15
+
16
+ ### Applications
17
+
18
+ - **Diagnostic agreement** — LCA estimates rater accuracy without a gold-standard reference. Applied to carcinoma diagnoses by seven pathologists (Uebersax & Grove, 1990; dataset: `carcinoma`).
19
+ - **Political typology** — LCA identifies voter segments from candidate trait ratings. Applied to 2000 ANES survey data (dataset: `election`).
20
+ - **Academic dishonesty** — Latent classes of cheating behavior among students, regressed on GPA covariates (dataset: `cheating`).
21
+ - **Survey attitude clustering** — Uncovering latent opinion groups from social survey responses (McCutcheon, 1987; dataset: `gss82`).
22
+
23
+ ### Latent class regression
24
+
25
+ LCA can be extended with **covariates** that predict class membership. `pypoLCA` fits latent class regression using the same hybrid EM / Newton-Raphson algorithm as R's `poLCA`. The EM loop alternates expected-posterior and maximisation steps. Response probabilities have a closed-form M-step, which guarantees the standard EM ascent property. Covariate coefficients lack a closed form and are updated within each M-step via Newton-Raphson (NR). Unlike pure EM, the NR step can overshoot and cause a likelihood drop. The implementation detects this and restarts with perturbed starting values (`max_restarts`). In any case, the algorithm finds only a local maximum, so multiple random starts (`nrep`) are recommended.
26
+
27
+ Standard errors are provided for all parameter estimates: conditional response probabilities, prior class probabilities, and, when covariates are present, regression coefficients. SEs are computed from the observed information matrix via the outer product of the individual score contributions, then transformed to probability space via the delta method. This matches the approach used by R's `poLCA`.
28
+
29
+ ## Install
30
+
31
+ ```bash
32
+ pip install polca
33
+ ```
34
+
35
+ > Until published on PyPI, install from source:
36
+ >
37
+ > ```bash
38
+ > git clone https://github.com/.../pypoLCA.git
39
+ > cd pypoLCA
40
+ > uv pip install -e ".[dev]"
41
+ > ```
42
+
43
+ ## Quick start
44
+
45
+ ```python
46
+ import pypolca as lca
47
+
48
+ # Load a built-in dataset (a Polars DataFrame)
49
+ df = lca.load_dataset("carcinoma")
50
+
51
+ # Fit a 2-class model — seven pathologists rating 118 slides
52
+ result = lca.fit("cbind(A, B, C, D, E, F, G) ~ 1", data=df, nclass=2, nrep=5)
53
+
54
+ # Inspect results
55
+ print(f"Log-likelihood: {result.loglik:.2f}")
56
+ print(f"AIC: {result.aic:.2f}")
57
+ print(f"Iterations: {result.iterations}")
58
+
59
+ # Class-conditional probabilities for the first item
60
+ print(result.probs[0]) # shape (nclass, n_categories)
61
+
62
+ # Posterior class membership (N × R)
63
+ print(result.posterior[:5])
64
+
65
+ # Predicted class for each observation (1-based)
66
+ print(result.predclass[:5]) # 1-based (matching R poLCA convention)
67
+
68
+ # Standard errors
69
+ print(result.probs_se[0]) # SEs for first item
70
+ print(result.P_se) # SEs for class priors
71
+ ```
72
+
73
+ **With covariates (latent class regression):**
74
+
75
+ ```python
76
+ df = lca.load_dataset("cheating")
77
+
78
+ # Cheating behaviours ~ GPA
79
+ result = lca.fit(
80
+ "cbind(LIEEXAM, LIEPAPER, FRAUD, COPYEXAM) ~ GPA",
81
+ data=df,
82
+ nclass=2,
83
+ nrep=10,
84
+ )
85
+
86
+ print(result.coeff) # Regression coefficients (covariates × (classes − 1))
87
+ print(result.coeff_se) # Standard errors
88
+ print(result.P) # Population class shares
89
+ ```
90
+
91
+ The formula syntax uses `cbind(var1, var2, ...)` on the left-hand side, or equivalently Python-style `var1 + var2 + ...`. The right-hand side is `~ 1` for intercept-only or `~ cov1 + cov2` for latent class regression. Parsing is handled by a lightweight custom parser (no external formula library).
92
+
93
+ ## Backend
94
+
95
+ The EM engine and standard error computation are written in C++17 (Eigen for linear algebra), exposed to Python via pybind11. The build system is CMake + scikit-build-core, managed by `uv`. Incremental C++ rebuilds take ~1–2 s with `./rebuild.sh`.
96
+
97
+ ## Benchmarks
98
+
99
+ Comparison of `pypolca` (C++, with/without SE) vs R's `poLCA` on the `cheating` dataset (N=319, 4 binary items, 2 classes). Timings are means over 20 runs.
100
+
101
+ | N | Items | Classes | R poLCA | pypolca (with SE) | Speed-up | pypolca (no SE) | Speed-up |
102
+ |--------|-------|---------|----------|-------------------|----------|-----------------|----------|
103
+ | 319 | 4 | 2 | — | — | — | — | — |
104
+ | 500 | 5 | 2 | — | — | — | — | — |
105
+ | 2,000 | 5 | 2 | — | — | — | — | — |
106
+ | 10,000 | 5 | 2 | — | — | — | — | — |
107
+
108
+ > *Results TBD — run `python scripts/benchmark.py` and `python scripts/benchmark_scaling.py` to populate with fresh numbers (requires R with `poLCA` and `jsonlite` installed). Speed-up is relative to R `poLCA`.*
109
+
110
+ ## Datasets
111
+
112
+ | Dataset | N | Manifest items | Covariates | Source |
113
+ |------------|-------|------------------------------------------------------|------------------------|---------------------------|
114
+ | carcinoma | 118 | A–G (7 binary: no carcinoma / carcinoma) | — | Agresti (2002) |
115
+ | cheating | 319 | LIEEXAM, LIEPAPER, FRAUD, COPYEXAM (4 binary) | GPA | R poLCA |
116
+ | election | 1,785 | MORALG–INTELB (12 ordinal: 4-point trait ratings) | VOTE3, AGE, EDUC, etc. | 2000 ANES |
117
+ | gss82 | 1,202 | PURPOSE, ACCURACY, UNDERSTA, COOPERAT (2–3 categories)| — | McCutcheon (1987) |
118
+ | values | 216 | A–D (4 binary: universalistic / particularistic) | — | R poLCA |
119
+
120
+ ## Credits
121
+
122
+ pypoLCA is a translation of Drew A. Linzer and Jeffrey B. Lewis's R package:
123
+
124
+ > Linzer, D. A., & Lewis, J. B. (2011). poLCA: An R Package for Polytomous Variable Latent Class Analysis. *Journal of Statistical Software*, 42(10), 1–29. [doi:10.18637/jss.v042.i10](https://doi.org/10.18637/jss.v042.i10)
125
+
126
+ Built-in datasets and the EM / Newton-Raphson algorithm are taken from the `poLCA` R package, licensed GPL-2.0-or-later.
127
+
128
+ C++ bindings powered by [pybind11](https://github.com/pybind/pybind11). Linear algebra via [Eigen](https://eigen.tuxfamily.org/).
129
+
130
+ ## Contributing
131
+
132
+ Contributions are welcome and appreciated. Please keep submissions tight and purposeful. The goal is to keep `pypoLCA` a focused, maintainable package. AI-assisted contributions are fine, but AI use doesn't excuse sloppy or verbose work; review your output before submitting a PR.
133
+
134
+ ## License
135
+
136
+ GPL-2.0-or-later
@@ -0,0 +1,62 @@
1
+ """Example: fit a basic latent class model with pypoLCA."""
2
+
3
+ import numpy as np
4
+ import polars as pl
5
+
6
+ # Import the low-level C++ bindings directly
7
+ from pypolca._core import Data, fit_em
8
+
9
+ # --- Create synthetic data ---
10
+ np.random.seed(42)
11
+ N = 500
12
+
13
+ # 4 manifest variables, each with 3 categories
14
+ J = 4
15
+ K = 3
16
+
17
+ # True latent class labels
18
+ true_class = np.random.choice([0, 1], size=N, p=[0.4, 0.6])
19
+
20
+ # Class-conditional response probabilities
21
+ # Class 0: tends to answer 1
22
+ # Class 1: tends to answer 3
23
+ probs_c0 = np.array([0.6, 0.3, 0.1])
24
+ probs_c1 = np.array([0.1, 0.3, 0.6])
25
+
26
+ y = np.zeros((N, J), dtype=np.int32)
27
+ for i in range(N):
28
+ for j in range(J):
29
+ if true_class[i] == 0:
30
+ y[i, j] = np.random.choice([1, 2, 3], p=probs_c0) + 1 # 1-based
31
+ else:
32
+ y[i, j] = np.random.choice([1, 2, 3], p=probs_c1) + 1
33
+
34
+ # Build Data object
35
+ data = Data()
36
+ data.y = y
37
+ data.x = np.ones((N, 1), dtype=np.float64) # intercept only (no covariates)
38
+ data.num_choices = [K] * J
39
+
40
+ # --- Fit model ---
41
+ print("Fitting 2-class LCA model...")
42
+ result = fit_em(data, nclass=2, maxiter=200, tol=1e-8)
43
+
44
+ print(f"\nConverged: {result.converged}")
45
+ print(f"Iterations: {result.iterations}")
46
+ print(f"Log-likelihood: {result.loglik:.4f}")
47
+ print(f"Class shares: {result.posterior.mean(axis=0)}")
48
+ print(f"Predclass counts: {np.bincount(result.posterior.argmax(axis=1))}")
49
+
50
+ # --- Try the high-level API ---
51
+ try:
52
+ from pypolca.api import fit
53
+
54
+ df = pl.DataFrame(
55
+ y,
56
+ schema=[f"Y{j + 1}" for j in range(J)],
57
+ )
58
+ result2 = fit("Y1 + Y2 + Y3 + Y4 ~ 1", df, nclass=2)
59
+ print(f"\nHigh-level API result: {result2}")
60
+ print(f"Class shares: {result2.P}")
61
+ except Exception as e:
62
+ print(f"High-level API not yet functional: {e}")
@@ -0,0 +1,16 @@
1
+ import numpy as np
2
+
3
+ from pypolca._core import Data, Params, compute_ylik
4
+
5
+ data = Data()
6
+ data.y = np.array([[1], [2]], dtype=np.int32) # 1-based
7
+ data.x = np.ones((2, 1), dtype=np.float64)
8
+ data.num_choices = [2]
9
+
10
+ p = Params()
11
+ # 1 class, 2 categories: vecprobs layout = [class0_cat0, class0_cat1]
12
+ p.vecprobs = np.array([0.3, 0.7], dtype=np.float64)
13
+ p.beta = np.array([], dtype=np.float64)
14
+
15
+ lik = compute_ylik(data, p, 1)
16
+ print(lik)