factominer 0.1.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factominer-0.1.0.dev0/.gitignore +37 -0
- factominer-0.1.0.dev0/CHANGELOG.md +71 -0
- factominer-0.1.0.dev0/CITATION.cff +62 -0
- factominer-0.1.0.dev0/CONTRIBUTING.md +117 -0
- factominer-0.1.0.dev0/LICENSE +21 -0
- factominer-0.1.0.dev0/NOTICE.md +45 -0
- factominer-0.1.0.dev0/PKG-INFO +194 -0
- factominer-0.1.0.dev0/README.md +151 -0
- factominer-0.1.0.dev0/SECURITY.md +31 -0
- factominer-0.1.0.dev0/docs/api/ca.md +10 -0
- factominer-0.1.0.dev0/docs/api/datasets.md +9 -0
- factominer-0.1.0.dev0/docs/api/desc.md +26 -0
- factominer-0.1.0.dev0/docs/api/hcpc.md +19 -0
- factominer-0.1.0.dev0/docs/api/mca.md +12 -0
- factominer-0.1.0.dev0/docs/api/pca.md +35 -0
- factominer-0.1.0.dev0/docs/api/plot.md +15 -0
- factominer-0.1.0.dev0/docs/conf.py +43 -0
- factominer-0.1.0.dev0/docs/examples/ca_children.ipynb +138 -0
- factominer-0.1.0.dev0/docs/examples/hcpc_decathlon.ipynb +109 -0
- factominer-0.1.0.dev0/docs/examples/mca_tea.ipynb +104 -0
- factominer-0.1.0.dev0/docs/examples/pca_decathlon.ipynb +435 -0
- factominer-0.1.0.dev0/docs/index.md +67 -0
- factominer-0.1.0.dev0/docs/migrating-from-r.md +48 -0
- factominer-0.1.0.dev0/factominer/__init__.py +36 -0
- factominer-0.1.0.dev0/factominer/_deferred.py +46 -0
- factominer-0.1.0.dev0/factominer/_result.py +111 -0
- factominer-0.1.0.dev0/factominer/_scaling.py +99 -0
- factominer-0.1.0.dev0/factominer/_sign.py +50 -0
- factominer-0.1.0.dev0/factominer/_svd.py +60 -0
- factominer-0.1.0.dev0/factominer/ca.py +164 -0
- factominer-0.1.0.dev0/factominer/datasets/__init__.py +54 -0
- factominer-0.1.0.dev0/factominer/datasets/data/PROVENANCE.md +68 -0
- factominer-0.1.0.dev0/factominer/datasets/data/children.csv +19 -0
- factominer-0.1.0.dev0/factominer/datasets/data/decathlon.csv +42 -0
- factominer-0.1.0.dev0/factominer/datasets/data/poison.csv +56 -0
- factominer-0.1.0.dev0/factominer/datasets/data/tea.csv +301 -0
- factominer-0.1.0.dev0/factominer/desc/__init__.py +7 -0
- factominer-0.1.0.dev0/factominer/desc/catdes.py +228 -0
- factominer-0.1.0.dev0/factominer/desc/condes.py +143 -0
- factominer-0.1.0.dev0/factominer/desc/dimdesc.py +64 -0
- factominer-0.1.0.dev0/factominer/hcpc.py +216 -0
- factominer-0.1.0.dev0/factominer/mca.py +153 -0
- factominer-0.1.0.dev0/factominer/pca.py +331 -0
- factominer-0.1.0.dev0/factominer/plot/__init__.py +32 -0
- factominer-0.1.0.dev0/factominer/plot/matplotlib_backend.py +350 -0
- factominer-0.1.0.dev0/factominer/py.typed +0 -0
- factominer-0.1.0.dev0/pyproject.toml +95 -0
- factominer-0.1.0.dev0/tests/__init__.py +0 -0
- factominer-0.1.0.dev0/tests/conftest.py +90 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/ca/children.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/ca/children_plain.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/catdes/tea_Tea.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/condes/decathlon_Points.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/condes/tea_age.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/dimdesc/pca_decathlon.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/dimdesc/pca_decathlon_proba50.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/hcpc/decathlon_plain_k4.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/mca/tea.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/pca/decathlon.json +1 -0
- factominer-0.1.0.dev0/tests/fixtures/r_outputs/pca/decathlon_plain.json +1 -0
- factominer-0.1.0.dev0/tests/test_ca.py +167 -0
- factominer-0.1.0.dev0/tests/test_desc.py +351 -0
- factominer-0.1.0.dev0/tests/test_hcpc.py +199 -0
- factominer-0.1.0.dev0/tests/test_mca.py +212 -0
- factominer-0.1.0.dev0/tests/test_pca.py +224 -0
- factominer-0.1.0.dev0/tests/test_plots.py +93 -0
- factominer-0.1.0.dev0/tests/test_smoke.py +74 -0
- factominer-0.1.0.dev0/tools/build_example_notebooks.py +141 -0
- factominer-0.1.0.dev0/tools/refresh_r_fixtures.R +190 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
*.egg
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
.eggs/
|
|
9
|
+
|
|
10
|
+
# Venvs
|
|
11
|
+
.venv/
|
|
12
|
+
.venv-factominer/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# Testing / type-checking caches
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
htmlcov/
|
|
22
|
+
|
|
23
|
+
# Docs build
|
|
24
|
+
docs/_build/
|
|
25
|
+
|
|
26
|
+
# OS / editor
|
|
27
|
+
.DS_Store
|
|
28
|
+
*.swp
|
|
29
|
+
|
|
30
|
+
# Local R fixtures generated at build time (committed when stable)
|
|
31
|
+
.r-fixture-stage/
|
|
32
|
+
|
|
33
|
+
# Jupyter outputs
|
|
34
|
+
.ipynb_checkpoints/
|
|
35
|
+
|
|
36
|
+
# Local elves state
|
|
37
|
+
.elves-session.json
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to FactoMinePy are tracked here. The format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) once
|
|
6
|
+
out of pre-release.
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Full FactoMineR 2.14 schema parity for `dimdesc` / `catdes` / `condes`
|
|
13
|
+
(`n` column on quanti tables; `Cla/Mod` / `Mod/Cla` / `Global` /
|
|
14
|
+
hypergeometric `v.test` on catdes category; `Eta2` / `P-value` on
|
|
15
|
+
catdes quanti.var; `sd in category` / `Overall sd` / `n` on catdes
|
|
16
|
+
per-level quanti; `Estimate` / `p.value` on condes category).
|
|
17
|
+
- PCA now exposes `quali.sup$eta2` (per-variable, not per-category).
|
|
18
|
+
- PCA / CA / MCA `res$eig` now carries all eigenvalues (only the
|
|
19
|
+
coord / cos² / contrib blocks are truncated to `ncp`); `res$svd$vs`
|
|
20
|
+
keeps the full singular spectrum.
|
|
21
|
+
- MCA `res$eig` is capped at `total_cat - q_vars` eigenvalues to match R's
|
|
22
|
+
"useful" axis count.
|
|
23
|
+
- HCPC `data_clust` holds the original input X + `clust` column (was:
|
|
24
|
+
PC coordinates); `desc_var` populated via the parity-verified
|
|
25
|
+
`catdes`; `desc_axes` via `condes` per axis.
|
|
26
|
+
- CI: `rpy2-parity` workflow installs FactoMineR 2.14 from CRAN, runs
|
|
27
|
+
the parity suite against freshly generated fixtures, and uploads the
|
|
28
|
+
fresh fixtures + drift diff as artifacts. Triggerable on-demand via
|
|
29
|
+
`workflow_dispatch`; runs weekly on Monday cron.
|
|
30
|
+
- README: experimental-use-with-caution callout, known limitations
|
|
31
|
+
section, tightened parity-tolerance documentation.
|
|
32
|
+
- Open-source meta files (this CHANGELOG, CONTRIBUTING.md, CITATION.cff,
|
|
33
|
+
SECURITY.md, issue + PR templates).
|
|
34
|
+
|
|
35
|
+
### Fixed
|
|
36
|
+
|
|
37
|
+
- MCA `var$eta2` and `var$v.test`: dropped erroneous `/lambda_k` and
|
|
38
|
+
`/sqrt(lambda_k)` factors. R FactoMineR's MCA `var$coord` is the
|
|
39
|
+
standard category coordinate ψ_c, so:
|
|
40
|
+
- `eta²(v,k) = sum_c n_c * ψ_c² / N`
|
|
41
|
+
- `v.test(c,k) = ψ_c * sqrt(n_c (N-1) / (N - n_c))`
|
|
42
|
+
Output now matches R to 1e-9 on the tea fixture (previously off by
|
|
43
|
+
~6.7× on eta² and ~2.6× on v.test).
|
|
44
|
+
- Sphinx build: enabled `myst-nb` so example notebooks under
|
|
45
|
+
`docs/examples/` actually render. (Listing both `myst_parser` and
|
|
46
|
+
`myst_nb` in `extensions` double-invokes `setup_sphinx` and crashes
|
|
47
|
+
myst-parser 5.1.0; only `myst_nb` is loaded now.)
|
|
48
|
+
- `docs/api/datasets.md`: relative PROVENANCE.md link rewritten to an
|
|
49
|
+
absolute GitHub URL so it resolves outside the repo tree.
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
|
|
53
|
+
- `tools/refresh_r_fixtures.R` adds two richer fixtures
|
|
54
|
+
(`condes/tea_age.json`, `dimdesc/pca_decathlon_proba50.json`) that
|
|
55
|
+
exercise the populated-quali + populated-category branches of the
|
|
56
|
+
desc functions.
|
|
57
|
+
- Test tolerances tightened across the suite:
|
|
58
|
+
- eigenvalues: `1e-8 → 1e-10`
|
|
59
|
+
- coord / cos² / cor / eta²: `1e-6 → 1e-9`
|
|
60
|
+
- contrib: `1e-6 → 1e-8`
|
|
61
|
+
- v.test: still 1e-6 (limited by chained qnorm / hypergeometric)
|
|
62
|
+
- p-values: `1e-5` relative (new — previously untested at column level)
|
|
63
|
+
|
|
64
|
+
## [0.1.0.dev0] — 2026-05-16
|
|
65
|
+
|
|
66
|
+
Initial port: PCA, CA, MCA, HCPC, dimdesc / catdes / condes with R-parity
|
|
67
|
+
tests. FAMD / MFA / HMFA / DMFA / GPA importable as `NotImplementedError`
|
|
68
|
+
stubs.
|
|
69
|
+
|
|
70
|
+
[Unreleased]: https://github.com/aigorahub/FactoMinePy/compare/v0.1.0.dev0...HEAD
|
|
71
|
+
[0.1.0.dev0]: https://github.com/aigorahub/FactoMinePy/releases/tag/v0.1.0.dev0
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
title: FactoMinePy
|
|
3
|
+
message: >-
|
|
4
|
+
If you use FactoMinePy in published work, please cite both this software
|
|
5
|
+
package and the original R FactoMineR, which it ports.
|
|
6
|
+
type: software
|
|
7
|
+
authors:
|
|
8
|
+
- name: Aigora
|
|
9
|
+
website: https://aigora.com
|
|
10
|
+
repository-code: https://github.com/aigorahub/FactoMinePy
|
|
11
|
+
url: https://github.com/aigorahub/FactoMinePy
|
|
12
|
+
abstract: >-
|
|
13
|
+
A Python port of the R package FactoMineR for multivariate exploratory data
|
|
14
|
+
analysis (PCA, CA, MCA, HCPC, dimdesc / catdes / condes). Reimplemented from
|
|
15
|
+
primitives on NumPy / SciPy / Pandas; validated for byte-identical fixture
|
|
16
|
+
output and column-by-column schema parity against R FactoMineR 2.14.
|
|
17
|
+
keywords:
|
|
18
|
+
- factor-analysis
|
|
19
|
+
- PCA
|
|
20
|
+
- CA
|
|
21
|
+
- MCA
|
|
22
|
+
- HCPC
|
|
23
|
+
- multivariate-analysis
|
|
24
|
+
- exploratory-data-analysis
|
|
25
|
+
- python
|
|
26
|
+
- factominer
|
|
27
|
+
license: MIT
|
|
28
|
+
preferred-citation:
|
|
29
|
+
type: software
|
|
30
|
+
title: FactoMinePy
|
|
31
|
+
authors:
|
|
32
|
+
- name: Aigora
|
|
33
|
+
url: https://github.com/aigorahub/FactoMinePy
|
|
34
|
+
references:
|
|
35
|
+
- type: software
|
|
36
|
+
title: FactoMineR
|
|
37
|
+
scope: The reference implementation this package ports.
|
|
38
|
+
authors:
|
|
39
|
+
- family-names: Lê
|
|
40
|
+
given-names: Sébastien
|
|
41
|
+
- family-names: Josse
|
|
42
|
+
given-names: Julie
|
|
43
|
+
- family-names: Husson
|
|
44
|
+
given-names: François
|
|
45
|
+
url: https://cran.r-project.org/package=FactoMineR
|
|
46
|
+
repository-code: https://github.com/husson/FactoMineR
|
|
47
|
+
- type: article
|
|
48
|
+
title: "FactoMineR: An R Package for Multivariate Analysis"
|
|
49
|
+
authors:
|
|
50
|
+
- family-names: Lê
|
|
51
|
+
given-names: Sébastien
|
|
52
|
+
- family-names: Josse
|
|
53
|
+
given-names: Julie
|
|
54
|
+
- family-names: Husson
|
|
55
|
+
given-names: François
|
|
56
|
+
journal: Journal of Statistical Software
|
|
57
|
+
year: 2008
|
|
58
|
+
volume: 25
|
|
59
|
+
issue: 1
|
|
60
|
+
start: 1
|
|
61
|
+
end: 18
|
|
62
|
+
doi: 10.18637/jss.v025.i01
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Contributing to FactoMinePy
|
|
2
|
+
|
|
3
|
+
Thanks for your interest. This is an early-alpha port of the R package
|
|
4
|
+
[FactoMineR](https://cran.r-project.org/package=FactoMineR) to Python. Below is
|
|
5
|
+
how to get a local dev environment going, what the parity bar is, and how to
|
|
6
|
+
get a change merged.
|
|
7
|
+
|
|
8
|
+
## Quick start
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
git clone https://github.com/aigorahub/FactoMinePy.git
|
|
12
|
+
cd FactoMinePy
|
|
13
|
+
python3.12 -m venv .venv
|
|
14
|
+
.venv/bin/pip install -e '.[dev]'
|
|
15
|
+
.venv/bin/pytest -q
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Python **3.10 or newer** is required; CI runs on 3.11 and local development is
|
|
19
|
+
exercised on 3.12.
|
|
20
|
+
|
|
21
|
+
## What this project's parity bar is
|
|
22
|
+
|
|
23
|
+
Every method in the "live" column of the README's status table is validated
|
|
24
|
+
against R FactoMineR (currently 2.14 on CRAN) using committed JSON fixtures.
|
|
25
|
+
The committed fixtures must be byte-identical to what live R FactoMineR
|
|
26
|
+
produces on a clean Linux runner with the current CRAN release.
|
|
27
|
+
|
|
28
|
+
When you change anything that could affect numerical output:
|
|
29
|
+
|
|
30
|
+
1. Run the full test suite locally: `.venv/bin/pytest -q`.
|
|
31
|
+
2. If you have R + FactoMineR installed locally, regenerate fixtures with
|
|
32
|
+
`Rscript tools/refresh_r_fixtures.R` and confirm the tests still pass
|
|
33
|
+
against them. If you don't have R locally, the `rpy2-parity` GitHub
|
|
34
|
+
Actions workflow does the same on a runner with R 4.6 + FactoMineR 2.14
|
|
35
|
+
from CRAN. Trigger it manually from the Actions tab via
|
|
36
|
+
`workflow_dispatch`, or wait for the weekly cron.
|
|
37
|
+
3. Don't loosen tolerances to make tests pass. Investigate the divergence
|
|
38
|
+
instead — the current tolerances are deliberate (1e-10 on eigenvalues,
|
|
39
|
+
1e-9 on coord/cos²/cor/eta², 1e-8 on contrib, 1e-6 on v.test, 1e-5
|
|
40
|
+
relative on p-values).
|
|
41
|
+
|
|
42
|
+
## Style and lint
|
|
43
|
+
|
|
44
|
+
- Source is formatted to ruff defaults; `ruff check factominer tests` must be
|
|
45
|
+
clean before pushing.
|
|
46
|
+
- We don't enforce ruff *format* yet — match the surrounding style.
|
|
47
|
+
- Type annotations are encouraged but not strictly required (mypy is
|
|
48
|
+
advisory in CI). New public APIs should be typed.
|
|
49
|
+
- Docstrings: short, in the style of the existing modules
|
|
50
|
+
(`factominer/desc/catdes.py` is a good model). Reference the R FactoMineR
|
|
51
|
+
source path that the implementation tracks when the behaviour is
|
|
52
|
+
non-obvious.
|
|
53
|
+
|
|
54
|
+
## Where to look in the source
|
|
55
|
+
|
|
56
|
+
- `factominer/pca.py`, `ca.py`, `mca.py` — the three core dimensionality-
|
|
57
|
+
reduction methods. Each one builds row + column blocks plus supplementary
|
|
58
|
+
blocks and stashes the input frames in `res.call` so downstream methods
|
|
59
|
+
(dimdesc / catdes / condes / HCPC) can recompute against the original
|
|
60
|
+
variables.
|
|
61
|
+
- `factominer/desc/` — `dimdesc.py` delegates to `condes.py`; `catdes.py`
|
|
62
|
+
is the heavy one (test_chi2, category with `Cla/Mod` / `Mod/Cla` / `Global` +
|
|
63
|
+
hypergeometric, quanti.var with Eta²/P-value, per-level quanti).
|
|
64
|
+
- `factominer/hcpc.py` — Ward + k-means consolidation. `data_clust` holds
|
|
65
|
+
the original X + `clust`, and `desc_var` delegates to `catdes`.
|
|
66
|
+
- `factominer/_svd.py`, `_sign.py`, `_scaling.py` — shared primitives.
|
|
67
|
+
- `tools/refresh_r_fixtures.R` — the single source of truth for what R
|
|
68
|
+
output we compare against. Edit this script (not the JSON files
|
|
69
|
+
directly) if you need a new fixture.
|
|
70
|
+
|
|
71
|
+
## Opening a pull request
|
|
72
|
+
|
|
73
|
+
1. Branch from `main`. The history is rebase-merged and reasonably linear.
|
|
74
|
+
2. Keep the change focused. If you're rewriting a method to fix one
|
|
75
|
+
parity bug, don't also reformat the file.
|
|
76
|
+
3. Reference the R FactoMineR source line numbers (in
|
|
77
|
+
`husson/FactoMineR/R/<file>.r`) when claiming a formula matches R.
|
|
78
|
+
4. Make sure the PR description has a "Test plan" checklist. The default
|
|
79
|
+
PR template will populate one.
|
|
80
|
+
5. CI gates merge on `lint-and-test` and CodeQL. `rpy2-parity` is
|
|
81
|
+
non-blocking on PRs (it's expensive and depends on R availability);
|
|
82
|
+
trigger it manually if your change is numerical.
|
|
83
|
+
|
|
84
|
+
## Scope
|
|
85
|
+
|
|
86
|
+
Out of scope without discussion:
|
|
87
|
+
|
|
88
|
+
- Wholesale rewrites of the parity-test layout. The current fixture
|
|
89
|
+
harness is what lets us regenerate against any CRAN FactoMineR release.
|
|
90
|
+
- Replacing pandas/NumPy/SciPy with another stack. The point of the port
|
|
91
|
+
is *no* exotic runtime dependencies.
|
|
92
|
+
- A drop-in `from FactoMineR import *` Python API. We deliberately follow
|
|
93
|
+
Python conventions (snake_case args, 0-based indices, pandas DataFrames
|
|
94
|
+
with documented column names).
|
|
95
|
+
|
|
96
|
+
In scope and welcome:
|
|
97
|
+
|
|
98
|
+
- Implementing the deferred methods (`FAMD`, `MFA`, `HMFA`, `DMFA`,
|
|
99
|
+
`GPA`). Each has a stub in `factominer/_deferred.py`.
|
|
100
|
+
- New parity fixtures exercising untested R FactoMineR options (row
|
|
101
|
+
weights, missing values, `method="burt"` MCA, etc.).
|
|
102
|
+
- Plotly backend (currently stubs in `factominer/plot/`).
|
|
103
|
+
- Documentation fixes, example notebooks, migrating-from-R additions.
|
|
104
|
+
|
|
105
|
+
## Reporting bugs
|
|
106
|
+
|
|
107
|
+
File a GitHub issue with:
|
|
108
|
+
|
|
109
|
+
- A minimal reproducer (a script + the dataset, or one of the bundled
|
|
110
|
+
loaders).
|
|
111
|
+
- The R FactoMineR call that produces the expected output, if you have
|
|
112
|
+
one.
|
|
113
|
+
- The Python output you got and the R output you expected.
|
|
114
|
+
|
|
115
|
+
If your reproducer needs R to demonstrate the discrepancy, please include
|
|
116
|
+
the R version (`R --version | head -1`) and FactoMineR version
|
|
117
|
+
(`packageVersion("FactoMineR")`).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aigora
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Notices and attributions
|
|
2
|
+
|
|
3
|
+
FactoMinePy is a from-primitives Python reimplementation of R FactoMineR. It
|
|
4
|
+
does **not** redistribute any R or C source code from FactoMineR. The runtime
|
|
5
|
+
package is MIT-licensed (see [LICENSE](LICENSE)).
|
|
6
|
+
|
|
7
|
+
## R FactoMineR
|
|
8
|
+
|
|
9
|
+
R FactoMineR is GPL-licensed and authored by:
|
|
10
|
+
|
|
11
|
+
> Sébastien Lê, Julie Josse, François Husson — *FactoMineR: An R Package for
|
|
12
|
+
> Multivariate Analysis* — Journal of Statistical Software 25(1), 2008 —
|
|
13
|
+
> doi:[10.18637/jss.v025.i01](https://doi.org/10.18637/jss.v025.i01) —
|
|
14
|
+
> CRAN: https://cran.r-project.org/package=FactoMineR
|
|
15
|
+
|
|
16
|
+
The Python source in this repository implements the same statistical methods
|
|
17
|
+
following the published documentation and the R source code at
|
|
18
|
+
https://github.com/husson/FactoMineR. Each implementation file references the
|
|
19
|
+
specific R function it tracks. The Python re-implementation is original work
|
|
20
|
+
and is offered under the MIT license; it does not relicense R FactoMineR.
|
|
21
|
+
|
|
22
|
+
## Bundled datasets
|
|
23
|
+
|
|
24
|
+
The CSV files under [factominer/datasets/data/](factominer/datasets/data/)
|
|
25
|
+
are re-extracted from the data exports shipped with R FactoMineR for the
|
|
26
|
+
purpose of validating numerical parity. The values themselves are facts
|
|
27
|
+
(athletics results, survey responses) and are not subject to copyright. The
|
|
28
|
+
specific tabulations distributed with R FactoMineR carry the GPL alongside
|
|
29
|
+
the rest of the R package; we keep these tabulations bundled solely so the
|
|
30
|
+
parity tests are reproducible without a working R installation.
|
|
31
|
+
|
|
32
|
+
If you need a strictly GPL-free data bundle (for example, if you are
|
|
33
|
+
redistributing a derivative of this package in a non-GPL-compatible
|
|
34
|
+
context), re-derive each dataset from its primary source as documented in
|
|
35
|
+
[factominer/datasets/data/PROVENANCE.md](factominer/datasets/data/PROVENANCE.md).
|
|
36
|
+
|
|
37
|
+
## Inspiration
|
|
38
|
+
|
|
39
|
+
API shape and visualization patterns were informed by:
|
|
40
|
+
|
|
41
|
+
- [`factoextra`](https://rpkgs.datanovia.com/factoextra/) — the canonical
|
|
42
|
+
ggplot2 visualization companion for FactoMineR.
|
|
43
|
+
- [`prince`](https://github.com/MaxHalford/prince) and
|
|
44
|
+
[`scientisttools`](https://pypi.org/project/scientisttools/) — earlier
|
|
45
|
+
Python ports that informed the API shape (no code copied).
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: factominer
|
|
3
|
+
Version: 0.1.0.dev0
|
|
4
|
+
Summary: FactoMineR-compatible multivariate exploratory data analysis for Python
|
|
5
|
+
Project-URL: Homepage, https://github.com/aigorahub/FactoMinePy
|
|
6
|
+
Project-URL: Issues, https://github.com/aigorahub/FactoMinePy/issues
|
|
7
|
+
Author-email: Aigora <hello@aigora.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
License-File: NOTICE.md
|
|
11
|
+
Keywords: ca,factominer,factor analysis,famd,hcpc,mca,mfa,multivariate,pca
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: matplotlib>=3.9
|
|
22
|
+
Requires-Dist: numpy>=2.0
|
|
23
|
+
Requires-Dist: pandas>=2.2
|
|
24
|
+
Requires-Dist: scipy>=1.13
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: build; extra == 'dev'
|
|
27
|
+
Requires-Dist: jupyter; extra == 'dev'
|
|
28
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
29
|
+
Requires-Dist: myst-nb; extra == 'dev'
|
|
30
|
+
Requires-Dist: myst-parser; extra == 'dev'
|
|
31
|
+
Requires-Dist: nbclient; extra == 'dev'
|
|
32
|
+
Requires-Dist: nbformat; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
35
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
36
|
+
Requires-Dist: sphinx>=7; extra == 'dev'
|
|
37
|
+
Requires-Dist: twine; extra == 'dev'
|
|
38
|
+
Provides-Extra: plotly
|
|
39
|
+
Requires-Dist: plotly>=5.20; extra == 'plotly'
|
|
40
|
+
Provides-Extra: rpy2
|
|
41
|
+
Requires-Dist: rpy2>=3.5; extra == 'rpy2'
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
|
|
44
|
+
# FactoMinePy
|
|
45
|
+
|
|
46
|
+
[CI](https://github.com/aigorahub/FactoMinePy/actions/workflows/ci.yml)
|
|
47
|
+
[License: MIT](LICENSE)
|
|
48
|
+
[Python >= 3.10](pyproject.toml)
|
|
49
|
+
[Status: early alpha](#status)
|
|
50
|
+
|
|
51
|
+
> ⚠️ **Experimental — use with caution.** This is an independent Python port of the R package [FactoMineR](https://cran.r-project.org/package=FactoMineR). It is **not** affiliated with or endorsed by the authors of FactoMineR. The port is in early development; APIs may change, edge cases may differ from R, and several FactoMineR methods are not yet implemented (see status table below). For production work or published research, treat results as preliminary and cross-check against the original R package.
|
|
52
|
+
|
|
53
|
+
A from-primitives reimplementation in pure NumPy/SciPy/Pandas of the R package [FactoMineR](https://cran.r-project.org/package=FactoMineR) for multivariate exploratory data analysis (PCA, CA, MCA, HCPC, dimdesc/catdes/condes).
|
|
54
|
+
|
|
55
|
+
This package is **not** a wrapper around R; every method is reimplemented from the published FactoMineR documentation and R source, then validated numerically against R FactoMineR (currently 2.14 on CRAN) via a checked-in fixture harness. R FactoMineR remains the canonical reference implementation; this port aims for byte-identical fixture output and column-by-column schema parity, but is not a drop-in replacement.
|
|
56
|
+
|
|
57
|
+
## Status
|
|
58
|
+
|
|
59
|
+
**Early-alpha.** The supported-methods table is the source of truth for what works.
|
|
60
|
+
|
|
61
|
+
| FactoMineR method | Python equivalent | Live | R-parity verified | Notes |
|
|
62
|
+
| --- | --- | --- | --- | --- |
|
|
63
|
+
| `PCA` | `factominer.PCA` | ✅ | ✅ | active + supplementary individuals, quanti.sup, quali.sup |
|
|
64
|
+
| `CA` | `factominer.CA` | ✅ | ✅ | symmetric biplot, supplementary rows/columns |
|
|
65
|
+
| `MCA` | `factominer.MCA` | ✅ | ✅ | indicator matrix; Burt option (`method="burt"` is not yet independently parity-verified) |
|
|
66
|
+
| `HCPC` | `factominer.HCPC` | ✅ | ✅ | hierarchical clustering on PCA/CA/MCA, k-means consolidation |
|
|
67
|
+
| `dimdesc` | `factominer.dimdesc` | ✅ | ✅ | quantitative + categorical description per axis |
|
|
68
|
+
| `catdes` | `factominer.catdes` | ✅ | ✅ | `Cla/Mod`, `Mod/Cla`, `Global`, hypergeometric v-test; `quanti_var` Eta²; per-level `quanti` with `sd in category` / `Overall sd` / `n` |
|
|
69
|
+
| `condes` | `factominer.condes` | ✅ | ✅ | correlation tests for a continuous target |
|
|
70
|
+
| `plot.PCA / .CA / .MCA / .HCPC` | `factominer.plot.plot()` | ✅ | structural | matplotlib backend; factor maps, biplot, scree, contributions, dendrogram, ellipses, habillage |
|
|
71
|
+
| `FAMD` | `factominer.FAMD` | 🚧 stub | — | Round 2 |
|
|
72
|
+
| `MFA` | `factominer.MFA` | 🚧 stub | — | Round 2 |
|
|
73
|
+
| `HMFA` | `factominer.HMFA` | 🚧 stub | — | Round 2 |
|
|
74
|
+
| `DMFA` | `factominer.DMFA` | 🚧 stub | — | Round 2 |
|
|
75
|
+
| `GPA` | `factominer.GPA` | 🚧 stub | — | Round 2 |
|
|
76
|
+
| Plotly backend | `factominer.plot.plotly_*` | 🚧 stub | — | Round 2 |
|
|
77
|
+
|
|
78
|
+
Methods marked 🚧 are importable but raise `NotImplementedError("deferred — see docs/plans/factominer-python-port.md §2")` when called. This is by design so downstream code can `from factominer import HMFA` without an `ImportError`.
|
|
79
|
+
|
|
80
|
+
## Install
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install factominer
|
|
84
|
+
# matplotlib backend ships by default; for the optional plotly backend:
|
|
85
|
+
pip install 'factominer[plotly]'
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Quickstart
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from factominer import PCA, HCPC, dimdesc
|
|
92
|
+
from factominer.datasets import load_decathlon
|
|
93
|
+
|
|
94
|
+
decathlon = load_decathlon()
|
|
95
|
+
res = PCA(decathlon, scale_unit=True, ncp=5,
|
|
96
|
+
quanti_sup=["Rank", "Points"],
|
|
97
|
+
quali_sup=["Competition"])
|
|
98
|
+
|
|
99
|
+
print(res.summary())
|
|
100
|
+
print(res.eig) # eigenvalue table (DataFrame)
|
|
101
|
+
print(res.ind.coord) # individual coordinates
|
|
102
|
+
print(res.var.contrib) # variable contributions
|
|
103
|
+
|
|
104
|
+
# Describe each axis
|
|
105
|
+
desc = dimdesc(res, axes=[0, 1])
|
|
106
|
+
print(desc[0]["quanti"])
|
|
107
|
+
|
|
108
|
+
# Cluster on the principal components
|
|
109
|
+
clust = HCPC(res, nb_clust=3)
|
|
110
|
+
print(clust.data_clust.head())
|
|
111
|
+
|
|
112
|
+
# Plot
|
|
113
|
+
import matplotlib.pyplot as plt
|
|
114
|
+
from factominer.plot import plot
|
|
115
|
+
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
|
|
116
|
+
plot(res, choix="ind", habillage="Competition", ax=ax[0])
|
|
117
|
+
plot(res, choix="var", ax=ax[1])
|
|
118
|
+
plt.show()
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Migrating from R
|
|
122
|
+
|
|
123
|
+
See [docs/migrating-from-r.md](docs/migrating-from-r.md) for a side-by-side cheat sheet (R call → Python call → result attribute mapping → semantic differences).
|
|
124
|
+
|
|
125
|
+
The most important semantic differences:
|
|
126
|
+
|
|
127
|
+
1. **Argument names use snake_case.** `scale.unit=TRUE` → `scale_unit=True`, `quanti.sup=11:12` → `quanti_sup=[10, 11]` (and column names like `"Rank"` work too).
|
|
128
|
+
2. **Indices are 0-based.** `ind.sup=1:3` (R) → `ind_sup=[0, 1, 2]` (Python).
|
|
129
|
+
3. **Sign convention.** SVD is sign-ambiguous; we apply a deterministic rule (first absolute-max coordinate of each axis is positive). Coordinates may differ from R by a sign; the *interpretation* (clusters, distances, contributions) is identical. See `factominer._sign`.
|
|
130
|
+
4. **Result objects.** `res$eig` (R) → `res.eig` (Python). `res$var$coord` → `res.var.coord`. All result tables are `pandas.DataFrame`.
|
|
131
|
+
5. **Plotting is explicit.** `graph=TRUE` does not exist; you call `factominer.plot.plot(res, ...)` yourself. No magic on `print(res)`.
|
|
132
|
+
|
|
133
|
+
## Numerical fidelity
|
|
134
|
+
|
|
135
|
+
For every live method, the package ships parity tests that assert column-by-column equivalence against R FactoMineR 2.14 (current CRAN) within tight tolerances:
|
|
136
|
+
|
|
137
|
+
- Eigenvalues to **1e-10** absolute
|
|
138
|
+
- Coordinates / cos² / correlations / eta² to **1e-9** after sign alignment
|
|
139
|
+
- Contributions to **1e-8**
|
|
140
|
+
- v-tests to **1e-6**
|
|
141
|
+
- p-values to **1e-5** relative
|
|
142
|
+
- HCPC partitions to ARI ≥ 0.999 (k-means consolidation can swap a couple of individuals)
|
|
143
|
+
|
|
144
|
+
Fixtures are JSON dumps of R FactoMineR results, generated by `tools/refresh_r_fixtures.R` and committed under `tests/fixtures/r_outputs/`. The Python tests load them without needing R at test time. Every fixture in the repo is byte-identical to what live R FactoMineR 2.14 emits on a Linux GitHub runner with R 4.6.0 (verified by the `rpy2-parity` CI job, which is triggerable on-demand via `workflow_dispatch` and runs on a weekly cron).
|
|
145
|
+
|
|
146
|
+
To regenerate fixtures locally (requires R + FactoMineR + jsonlite):
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
Rscript tools/refresh_r_fixtures.R
|
|
150
|
+
pytest -q
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Known limitations / use with caution
|
|
154
|
+
|
|
155
|
+
This port targets the most common FactoMineR API surface and is rigorously validated on the bundled datasets, but the following caveats apply:
|
|
156
|
+
|
|
157
|
+
- **Several methods are stubs.** `FAMD`, `MFA`, `HMFA`, `DMFA`, `GPA` are importable but raise `NotImplementedError` when called.
|
|
158
|
+
- **Parity is empirical, not exhaustive.** The 83 parity tests cover the active + supplementary blocks for PCA / CA / MCA / HCPC and the full output schemas of dimdesc / catdes / condes on standard fixtures (`decathlon`, `children`, `tea`). Behavior with row weights, missing values, very small samples, or `method="burt"` MCA has not been independently verified.
|
|
159
|
+
- **Sign of axes is arbitrary.** SVD is sign-ambiguous; we apply a deterministic rule that may give the opposite sign from R on a given axis. Distances, clusters, contributions, and cos² are sign-invariant; coordinates may need a flip to align visually with R output.
|
|
160
|
+
- **HCPC partitions can differ by one or two individuals.** K-means consolidation is sensitive to initialization; the adjusted Rand index against R is ≥ 0.999 on the decathlon test fixture but not exactly 1.0.
|
|
161
|
+
- **No plotly backend yet.** Only matplotlib is implemented; the plotly module's functions raise `NotImplementedError`.
|
|
162
|
+
|
|
163
|
+
For production analyses, journal submissions, or any use where reproducibility against R FactoMineR is load-bearing, cross-check results against the original R package.
|
|
164
|
+
|
|
165
|
+
## Datasets
|
|
166
|
+
|
|
167
|
+
Bundled datasets under `factominer.datasets`:
|
|
168
|
+
|
|
169
|
+
| Loader | Source | Use case |
|
|
170
|
+
| --- | --- | --- |
|
|
171
|
+
| `load_decathlon()` | IAAF 2004 Athens Olympics + Décastar 2004, re-derived from public results | PCA, dimdesc, HCPC |
|
|
172
|
+
| `load_children()` | FactoMineR's `children` (children's worries by socio-educational category) | CA |
|
|
173
|
+
| `load_tea()` | FactoMineR's `tea` (300-person tea-consumption survey) | MCA, catdes |
|
|
174
|
+
| `load_poison()` | FactoMineR's `poison` (food-poisoning outbreak survey) | mixed quantitative + categorical |
|
|
175
|
+
|
|
176
|
+
See [factominer/datasets/data/PROVENANCE.md](factominer/datasets/data/PROVENANCE.md) for each dataset's origin and licensing notes.
|
|
177
|
+
|
|
178
|
+
## Contributing
|
|
179
|
+
|
|
180
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for dev setup, parity-bar expectations, and the PR / issue workflow. Bug reports and feature requests are welcome — please use the issue templates so we have the reproducer / R-side context up front. For security issues, see [SECURITY.md](SECURITY.md) and email `hello@aigora.com` rather than filing a public issue.
|
|
181
|
+
|
|
182
|
+
## Citing
|
|
183
|
+
|
|
184
|
+
If you use FactoMinePy in published work, please cite both this package and the original R FactoMineR (Lê, Josse, Husson, *J. Stat. Softw.* 2008, [doi:10.18637/jss.v025.i01](https://doi.org/10.18637/jss.v025.i01)). A [CITATION.cff](CITATION.cff) is included for tools that consume it automatically.
|
|
185
|
+
|
|
186
|
+
## License
|
|
187
|
+
|
|
188
|
+
MIT for code. Bundled datasets carry their original licensing — see [factominer/datasets/data/PROVENANCE.md](factominer/datasets/data/PROVENANCE.md). The package does **not** redistribute R FactoMineR source (GPL); everything is reimplemented from the published documentation and validated against R outputs.
|
|
189
|
+
|
|
190
|
+
## Acknowledgments
|
|
191
|
+
|
|
192
|
+
- The R FactoMineR package by Sébastien Lê, Julie Josse, François Husson (and many contributors) defines the API surface this package targets.
|
|
193
|
+
- `factoextra` for the visualization patterns that the matplotlib backend reproduces.
|
|
194
|
+
- `scientisttools` and `prince` for prior Python ports that informed the API shape.
|