factominer 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- factominer/__init__.py +36 -0
- factominer/_deferred.py +46 -0
- factominer/_result.py +111 -0
- factominer/_scaling.py +99 -0
- factominer/_sign.py +50 -0
- factominer/_svd.py +60 -0
- factominer/ca.py +164 -0
- factominer/datasets/__init__.py +54 -0
- factominer/datasets/data/PROVENANCE.md +68 -0
- factominer/datasets/data/children.csv +19 -0
- factominer/datasets/data/decathlon.csv +42 -0
- factominer/datasets/data/poison.csv +56 -0
- factominer/datasets/data/tea.csv +301 -0
- factominer/desc/__init__.py +7 -0
- factominer/desc/catdes.py +228 -0
- factominer/desc/condes.py +143 -0
- factominer/desc/dimdesc.py +64 -0
- factominer/hcpc.py +216 -0
- factominer/mca.py +153 -0
- factominer/pca.py +331 -0
- factominer/plot/__init__.py +32 -0
- factominer/plot/matplotlib_backend.py +350 -0
- factominer/py.typed +0 -0
- factominer-0.1.0.dev0.dist-info/METADATA +194 -0
- factominer-0.1.0.dev0.dist-info/RECORD +28 -0
- factominer-0.1.0.dev0.dist-info/WHEEL +4 -0
- factominer-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- factominer-0.1.0.dev0.dist-info/licenses/NOTICE.md +45 -0
factominer/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""factominer — a Python port of R's FactoMineR.
|
|
2
|
+
|
|
3
|
+
This module re-exports the public API. The supported-methods table in
|
|
4
|
+
``README.md`` is the source of truth for which symbols are live and which are
|
|
5
|
+
stubs that raise ``NotImplementedError``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
# Deferred methods (Round 2). Imported so ``from factominer import HMFA`` works,
|
|
11
|
+
# but the implementations raise NotImplementedError when called.
|
|
12
|
+
from ._deferred import DMFA, FAMD, GPA, HMFA, MFA
|
|
13
|
+
from ._result import Result
|
|
14
|
+
from .ca import CA
|
|
15
|
+
from .desc import catdes, condes, dimdesc
|
|
16
|
+
from .hcpc import HCPC
|
|
17
|
+
from .mca import MCA
|
|
18
|
+
from .pca import PCA
|
|
19
|
+
|
|
20
|
+
# Public API exported by ``from factominer import *``.
__all__ = [
    # Implemented factor methods.
    "PCA", "CA", "MCA",
    # Stubs imported from ``_deferred``; calling them raises NotImplementedError.
    "FAMD", "MFA", "HMFA", "DMFA", "GPA",
    # Clustering and description helpers.
    "HCPC", "dimdesc", "catdes", "condes",
    # Result container shared by the methods above.
    "Result",
]

__version__ = "0.1.0.dev0"
|
factominer/_deferred.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Deferred-method stubs.
|
|
2
|
+
|
|
3
|
+
Importable so ``from factominer import HMFA`` works, but raising
|
|
4
|
+
``NotImplementedError`` when called. Each stub points at the plan that records
|
|
5
|
+
the round-2 work and the reason.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _deferred(name: str, hint: str) -> Any:
    """Build a placeholder callable for a method that is not implemented yet.

    The returned function accepts any positional or keyword arguments and
    always raises ``NotImplementedError``. Its ``__name__``, ``__qualname__``
    and ``__doc__`` are set from ``name`` so it introspects like the real
    method would.

    Args:
        name: Public name of the deferred method (e.g. ``"MFA"``).
        hint: Short sentence inserted into the error message.

    Returns:
        The always-raising stub function.
    """
    message = (
        f"{name} is a Round 2 deferral. {hint} "
        "See docs/plans/factominer-python-port.md §2 and the README "
        "supported-methods table for the current status."
    )

    def stub(*_args: Any, **_kwargs: Any) -> Any:
        raise NotImplementedError(message)

    for attribute in ("__name__", "__qualname__"):
        setattr(stub, attribute, name)
    stub.__doc__ = f"Stub for {name}; deferred to Round 2."
    return stub
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# One importable stub per deferred method; each raises NotImplementedError
# when called.
FAMD = _deferred(
    name="FAMD",
    hint="Factor Analysis for Mixed Data is planned for the next iteration.",
)
MFA = _deferred(
    name="MFA",
    hint="Multiple Factor Analysis is planned for the next iteration.",
)
HMFA = _deferred(
    name="HMFA",
    hint="Hierarchical Multiple Factor Analysis is planned for the next iteration.",
)
DMFA = _deferred(
    name="DMFA",
    hint="Dual Multiple Factor Analysis is planned for the next iteration.",
)
GPA = _deferred(
    name="GPA",
    hint="Generalized Procrustes Analysis is planned for the next iteration.",
)
|
factominer/_result.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Result containers mirroring FactoMineR's ``res`` lists.
|
|
2
|
+
|
|
3
|
+
R returns lists with ``$``-accessed fields. We use a small set of
|
|
4
|
+
``SimpleNamespace``-based holders so ``res.var.coord`` reads naturally in Python
|
|
5
|
+
and the same shape can carry any subset of fields a method actually produces.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class Block:
    """A coordinates / cos² / contributions block (variables or individuals).

    Only ``coord`` is required; every other field defaults to ``None`` so a
    method can attach just the statistics it actually computes.

    ``cor`` is FactoMineR's ``var$cor`` (variables only). ``dist`` is the
    distance to the origin (FactoMineR's ``ind$dist``) — *not* the squared
    distance: ``CA`` stores the square root of the chi-square squared
    distance here. ``v_test`` and ``eta2`` show up on qualitative blocks
    (MCA / quali.sup).
    """

    # Coordinates on the retained axes (one row per element, one column per axis).
    coord: pd.DataFrame
    # Squared cosines (quality of representation) per axis.
    cos2: pd.DataFrame | None = None
    # Contributions to each axis (CA stores them as percentages).
    contrib: pd.DataFrame | None = None
    # Correlations between variables and axes.
    cor: pd.DataFrame | None = None
    # Distance of each element to the origin (unsquared).
    dist: pd.Series | None = None
    # Inertia carried by each element.
    inertia: pd.Series | None = None
    # Test values for qualitative blocks.
    v_test: pd.DataFrame | None = None
    # Correlation ratios for qualitative blocks.
    eta2: pd.DataFrame | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class SVD:
    """Immutable holder for the pieces of a singular value decomposition."""

    # Singular values.
    vs: np.ndarray
    # Left singular vectors (rows × ncp).
    U: np.ndarray
    # Right singular vectors (cols × ncp).
    V: np.ndarray
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True)
class Result:
    """FactoMineR-shaped result object.

    Only ``eig``, ``svd``, ``call`` are always present. Method-specific blocks
    are attached as additional attributes — ``ind`` and ``var`` for PCA; ``row``
    and ``col`` for CA; etc. Blocks a method does not produce stay ``None``.
    """

    # Eigenvalue table; ``summary()`` expects exactly three columns
    # (eigenvalue, percentage, cumulative percentage).
    eig: pd.DataFrame
    svd: SVD
    # Bookkeeping about the call that produced this result.
    call: dict[str, Any] = field(default_factory=dict)
    ind: Block | None = None
    var: Block | None = None
    row: Block | None = None
    col: Block | None = None
    ind_sup: Block | None = None
    quanti_sup: Block | None = None
    quali_sup: Block | None = None
    row_sup: Block | None = None
    col_sup: Block | None = None
    quanti_var_sup: Block | None = None
    # Method tag for ``summary()``: "PCA", "CA", "MCA", ...
    method: str = ""

    @staticmethod
    def _coord_table(block: Block, ncp: int, max_rows: int) -> str:
        """Render the first ``max_rows`` rows and ``ncp`` axes of ``block.coord``."""
        head = block.coord.iloc[:max_rows, :ncp].copy()
        head.columns = [f"Dim.{i + 1}" for i in range(head.shape[1])]
        return head.round(4).to_string()

    def summary(self, ncp: int | None = None) -> str:
        """Return a plain-text report of the eigenvalues and coordinates.

        Args:
            ncp: Number of axes (and eigenvalue rows) to show. Defaults to
                ``min(5, number of eigenvalue rows)``.

        Returns:
            A multi-line string: the eigenvalue table, then one coordinate
            table per active block that is set, then one per non-empty
            supplementary block. Each coordinate table lists at most 10 rows.
        """
        if ncp is None:
            ncp = min(5, self.eig.shape[0])
        max_rows = 10
        rule = "-" * 50

        lines: list[str] = [
            f"\nResults for the {self.method or 'analysis'}",
            "=" * 50,
            "\nEigenvalues",
            rule,
        ]
        eig = self.eig.head(ncp).copy()
        eig.columns = ["eigenvalue", "percentage of variance", "cumulative percentage of variance"]
        lines.append(eig.round(4).to_string())

        active_blocks = [
            ("Individuals", self.ind),
            ("Variables", self.var),
            ("Rows", self.row),
            ("Columns", self.col),
        ]
        for label, block in active_blocks:
            if block is None:
                continue
            # Report the number of rows actually listed (capped at max_rows),
            # not the number of axes.
            shown = min(max_rows, block.coord.shape[0])
            lines.append(f"\n{label} (the first {shown} are reported)")
            lines.append(rule)
            lines.append(self._coord_table(block, ncp, max_rows))

        supplementary_blocks = [
            ("Supplementary individuals", self.ind_sup),
            ("Supplementary continuous variables", self.quanti_sup),
            ("Supplementary categories", self.quali_sup),
            ("Supplementary rows", self.row_sup),
            ("Supplementary columns", self.col_sup),
        ]
        for label, block in supplementary_blocks:
            if block is None or block.coord.empty:
                continue
            lines.append(f"\n{label}")
            lines.append(rule)
            lines.append(self._coord_table(block, ncp, max_rows))

        return "\n".join(lines)

    def __repr__(self) -> str:
        # NOTE(review): the value printed as ``ncp`` is the number of rows in
        # ``eig``; CA stores the full eigenvalue list there, so it can exceed
        # the number of axes kept.
        return f"<factominer.{self.method or 'Result'} ncp={self.eig.shape[0]}>"
|
factominer/_scaling.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Scaling utilities for the factor-method engines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def coerce_numeric(X: pd.DataFrame) -> np.ndarray:
    """Return X.values as float64, raising on any non-numeric column.

    Column dtypes are inspected through ``X.dtypes`` so duplicate column
    labels are handled, and through ``pandas.api.types`` so pandas extension
    dtypes (e.g. categorical) are reported as non-numeric instead of making
    NumPy raise ``TypeError``. Boolean columns are treated as non-numeric.

    Args:
        X: Data frame whose columns must all be numeric.

    Returns:
        A 2-D float64 array with the same shape as ``X``.

    Raises:
        ValueError: If any column is not numeric; the message lists them.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    non_numeric = [
        label
        for label, dtype in zip(X.columns, X.dtypes)
        if is_bool(dtype) or not is_numeric(dtype)
    ]
    if non_numeric:
        raise ValueError(f"non-numeric columns in X: {non_numeric}")
    return np.asarray(X.to_numpy(), dtype=np.float64)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def center_scale(
    X: np.ndarray,
    scale_unit: bool,
    row_w: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Weighted center (and optionally scale) X.

    Returns ``(X_scaled, mean, scale)``. ``scale`` is the per-column divisor
    used (1.0 when ``scale_unit=False``, and 1.0 for columns whose weighted
    standard deviation is below 1e-12). Weights default to uniform.

    Args:
        X: 2-D array, one row per observation.
        scale_unit: When true, divide each centered column by its weighted
            standard deviation.
        row_w: Optional row weights of length ``n``; they are rescaled to sum
            to one before the moments are computed.

    Raises:
        ValueError: If ``X`` has no rows, or ``row_w`` has the wrong length,
            contains negative or non-finite values, or does not have a
            positive sum.
    """
    n = X.shape[0]
    if n == 0:
        raise ValueError("X must have at least one row")
    if row_w is None:
        row_w = np.full(n, 1.0 / n)
    else:
        row_w = np.asarray(row_w, dtype=np.float64)
        if row_w.shape != (n,):
            raise ValueError("row_w must have length n")
        # Reject weights that would silently turn every moment into NaN.
        if not np.isfinite(row_w).all() or (row_w < 0).any():
            raise ValueError("row_w must be finite and non-negative")
        total = row_w.sum()
        if total <= 0:
            raise ValueError("row_w must have a positive sum")
        # Normalize to a probability vector for the moments.
        row_w = row_w / total
    mean = (X * row_w[:, None]).sum(axis=0)
    Xc = X - mean
    if scale_unit:
        # FactoMineR uses 1/n weighted variance (not 1/(n-1)).
        var = (Xc**2 * row_w[:, None]).sum(axis=0)
        scale = np.sqrt(var)
        # Leave (near-)constant columns unscaled instead of dividing by ~0.
        scale_safe = np.where(scale < 1e-12, 1.0, scale)
        Xs = Xc / scale_safe
    else:
        scale_safe = np.ones_like(mean)
        Xs = Xc
    return Xs, mean, scale_safe
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def column_indices(
    cols: list[str] | pd.Index,
    spec: list[int] | list[str] | None,
) -> list[int]:
    """Normalize a column spec (None / names / positional indices) to indices.

    Args:
        cols: The column labels, in order.
        spec: ``None`` (no columns), or a list mixing column names and
            zero-based positions. A name that occurs several times in
            ``cols`` resolves to its first position.

    Returns:
        Zero-based positions in the order given by ``spec``.

    Raises:
        KeyError: A name is not present in ``cols``.
        IndexError: A position is outside ``[0, len(cols))``.
        TypeError: An item is neither ``str`` nor an integer. Booleans are
            rejected too: ``bool`` is an ``int`` subclass, so a boolean mask
            would otherwise be misread as positions 0 and 1.
        ValueError: The same column is selected more than once.
    """
    if spec is None:
        return []
    cols_list = list(cols)
    n_cols = len(cols_list)
    # Build the name lookup once instead of scanning the list for every item.
    first_position: dict[str, int] = {}
    for position, label in enumerate(cols_list):
        if isinstance(label, str) and label not in first_position:
            first_position[label] = position

    out: list[int] = []
    for item in spec:
        if isinstance(item, str):
            if item not in first_position:
                raise KeyError(f"column not found: {item}")
            out.append(first_position[item])
        elif isinstance(item, (int, np.integer)) and not isinstance(item, bool):
            idx = int(item)
            if not (0 <= idx < n_cols):
                raise IndexError(f"column index out of range: {idx}")
            out.append(idx)
        else:
            raise TypeError(f"column spec items must be str or int, got {type(item).__name__}")
    if len(out) != len(set(out)):
        raise ValueError(f"duplicate column spec: {spec}")
    return out
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def row_indices(
    index: pd.Index,
    spec: list[int] | list[str] | None,
) -> list[int]:
    """Normalize a row spec (None / names / positional indices) to indices.

    Args:
        index: The row labels, in order.
        spec: ``None`` (no rows), or a list mixing row names and zero-based
            positions. A name that occurs several times in ``index`` resolves
            to its first position.

    Returns:
        Zero-based positions in the order given by ``spec``.

    Raises:
        KeyError: A name is not present in ``index``.
        IndexError: A position is outside ``[0, len(index))``.
        TypeError: An item is neither ``str`` nor an integer. Booleans are
            rejected too: ``bool`` is an ``int`` subclass, so a boolean mask
            would otherwise be misread as positions 0 and 1.
        ValueError: The same row is selected more than once.
    """
    if spec is None:
        return []
    idx_list = list(index)
    n_rows = len(idx_list)
    # Build the name lookup once instead of scanning the list for every item.
    first_position: dict[str, int] = {}
    for position, label in enumerate(idx_list):
        if isinstance(label, str) and label not in first_position:
            first_position[label] = position

    out: list[int] = []
    for item in spec:
        if isinstance(item, str):
            if item not in first_position:
                raise KeyError(f"row not found: {item}")
            out.append(first_position[item])
        elif isinstance(item, (int, np.integer)) and not isinstance(item, bool):
            i = int(item)
            if not (0 <= i < n_rows):
                raise IndexError(f"row index out of range: {i}")
            out.append(i)
        else:
            raise TypeError(f"row spec items must be str or int, got {type(item).__name__}")
    if len(out) != len(set(out)):
        raise ValueError(f"duplicate row spec: {spec}")
    return out
|
factominer/_sign.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Deterministic sign convention for SVD-based factor methods.
|
|
2
|
+
|
|
3
|
+
SVD signs are not unique — flipping the sign of column k of U and of column k of
|
|
4
|
+
V leaves the decomposition unchanged. Different libraries pick different
|
|
5
|
+
conventions; we pick one and apply it uniformly so our output is reproducible.
|
|
6
|
+
|
|
7
|
+
Convention: for each axis k, find the row index r with the largest absolute
|
|
8
|
+
value in U[:, k]. If U[r, k] is negative, flip the signs of column k of U and V.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def align_signs(U: np.ndarray, V: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return sign-aligned copies of ``(U, V)`` under the project's convention.

    For every axis, the entry of ``U`` with the largest absolute value is made
    non-negative by flipping that column in both ``U`` and ``V``. The inputs
    are left untouched.

    Raises:
        ValueError: If either array is not 2-D or their column counts differ.
    """
    left = np.asarray(U, dtype=np.float64).copy()
    right = np.asarray(V, dtype=np.float64).copy()
    same_width = left.ndim == 2 and right.ndim == 2 and left.shape[1] == right.shape[1]
    if not same_width:
        raise ValueError("U and V must be 2D and share the second dimension")

    n_axes = left.shape[1]
    if n_axes:
        # Row of the dominant entry in each column, then the sign found there.
        pivot_rows = np.argmax(np.abs(left), axis=0)
        needs_flip = left[pivot_rows, np.arange(n_axes)] < 0
        left[:, needs_flip] *= -1
        right[:, needs_flip] *= -1
    return left, right
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def align_to_reference(values: np.ndarray, reference: np.ndarray) -> np.ndarray:
    """Return a copy of ``values`` with each column's sign matched to ``reference``.

    A column is negated when its dot product with the corresponding reference
    column is negative. Used to compare our output to R FactoMineR fixtures
    whose own sign convention differs.

    Raises:
        ValueError: If the two arrays do not have the same shape.
    """
    aligned = np.asarray(values, dtype=np.float64).copy()
    target = np.asarray(reference, dtype=np.float64)
    if aligned.shape != target.shape:
        raise ValueError(
            f"shape mismatch: values={aligned.shape}, reference={target.shape}"
        )
    # Columns are independent, so find every opposed column first, then flip.
    opposed = [
        k
        for k in range(aligned.shape[1])
        if float(aligned[:, k] @ target[:, k]) < 0
    ]
    if opposed:
        aligned[:, opposed] *= -1
    return aligned
|
factominer/_svd.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Shared SVD primitives for the factor-method engines.
|
|
2
|
+
|
|
3
|
+
The generalized SVD (with row and column weights) underlies CA, MCA, FAMD, MFA.
|
|
4
|
+
PCA is a special case with uniform weights.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from ._sign import align_signs
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def standard_svd(X: np.ndarray, ncp: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """SVD of X, truncated to ``ncp`` components, sign-aligned.

    Returns ``(U, vs, V)`` such that ``X ≈ U @ diag(vs) @ V.T``. Column counts
    of U and V are ``min(ncp, min(X.shape))``; singular values that are
    numerically zero are not dropped.

    Args:
        X: 2-D array to decompose.
        ncp: Maximum number of components to keep; must be non-negative.

    Raises:
        ValueError: If ``X`` is not 2-D or ``ncp`` is negative. A negative
            value would otherwise slice from the end and silently return the
            wrong number of components.
    """
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError("X must be 2D")
    if ncp < 0:
        raise ValueError("ncp must be non-negative")
    U_full, vs_full, Vt_full = np.linalg.svd(X, full_matrices=False)
    keep = min(ncp, vs_full.size)
    # Rows of Vt are the right singular vectors; transpose so both factors
    # carry one column per component before fixing the signs.
    U, V = align_signs(U_full[:, :keep], Vt_full[:keep].T)
    return U, vs_full[:keep], V
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def generalized_svd(
    X: np.ndarray,
    row_w: np.ndarray,
    col_w: np.ndarray,
    ncp: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generalized SVD with positive row and column weights.

    Maximizes the bilinear form ``u' diag(row_w) X diag(col_w) v`` subject to
    ``u' diag(row_w) u = 1`` and ``v' diag(col_w) v = 1``.

    The weighted matrix ``Y = diag(sqrt(row_w)) X diag(sqrt(col_w))`` is
    decomposed with ``standard_svd`` and the singular vectors are then divided
    by the square-root weights. Returns ``(U, vs, V)`` with ``U`` and ``V`` on
    the original scales; ``vs`` are the singular values of ``Y``.

    Raises:
        ValueError: If a weight vector does not match ``X``'s shape or holds a
            value that is not strictly positive.
    """
    data = np.asarray(X, dtype=np.float64)
    w_rows = np.asarray(row_w, dtype=np.float64).reshape(-1)
    w_cols = np.asarray(col_w, dtype=np.float64).reshape(-1)

    if w_rows.size != data.shape[0] or w_cols.size != data.shape[1]:
        raise ValueError("weight vectors must match X's shape")
    if np.any(w_rows <= 0) or np.any(w_cols <= 0):
        raise ValueError("weights must be strictly positive")

    root_rows = np.sqrt(w_rows)[:, None]
    root_cols = np.sqrt(w_cols)
    weighted = (data * root_rows) * root_cols[None, :]
    left, singular_values, right = standard_svd(weighted, ncp)
    return left / root_rows, singular_values, right / root_cols[:, None]
|
factominer/ca.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Correspondence Analysis — FactoMineR-compatible API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from ._result import SVD, Block, Result
|
|
9
|
+
from ._scaling import row_indices
|
|
10
|
+
from ._svd import standard_svd
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def CA(  # noqa: N802 — mirrors R
    X: pd.DataFrame,
    ncp: int = 5,
    row_sup: list[int] | list[str] | None = None,
    col_sup: list[int] | list[str] | None = None,
    graph: bool = False,  # noqa: ARG001
) -> Result:
    """Run Correspondence Analysis on a contingency table.

    Mirrors ``FactoMineR::CA``.

    Args:
        X: Contingency table of non-negative counts.
        ncp: Maximum number of axes kept for coordinates, cos² and
            contributions. ``eig`` always lists every axis.
        row_sup: Supplementary rows, by label or zero-based position.
        col_sup: Supplementary columns, by label or zero-based position.
        graph: Accepted for signature parity with R; not used here.

    Returns:
        A ``Result`` tagged ``"CA"`` with ``row`` / ``col`` blocks for the
        active table and ``row_sup`` / ``col_sup`` blocks when requested.
        ``dist`` holds the (unsquared) chi-square distance to the centroid and
        ``contrib`` is expressed in percent.

    Raises:
        TypeError: ``X`` is not a DataFrame.
        ValueError: ``ncp`` is negative; ``X`` has negative or non-finite
            entries; the active sub-table has a zero total or a zero row or
            column margin.
    """
    # Local import: the module header only pulls in ``row_indices``, and
    # resolving columns with it reports "row not found" for a missing column.
    from ._scaling import column_indices

    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas DataFrame")
    if ncp < 0:
        raise ValueError("ncp must be non-negative")
    Xv = X.to_numpy(dtype=np.float64)
    if (Xv < 0).any():
        raise ValueError("CA requires non-negative counts")
    if not np.isfinite(Xv).all():
        # NaN/inf would otherwise propagate into every margin and the SVD.
        raise ValueError("CA requires finite counts (no NaN or inf)")

    row_sup_idx = row_indices(X.index, row_sup)
    col_sup_idx = column_indices(X.columns, col_sup)

    # Build the membership sets once rather than once per row/column.
    row_sup_set = set(row_sup_idx)
    col_sup_set = set(col_sup_idx)
    active_rows = np.array(
        [i for i in range(X.shape[0]) if i not in row_sup_set], dtype=np.intp
    )
    active_cols = np.array(
        [j for j in range(X.shape[1]) if j not in col_sup_set], dtype=np.intp
    )

    A = Xv[np.ix_(active_rows, active_cols)]
    N = float(A.sum())
    if N <= 0:
        raise ValueError("active sub-table has zero total")
    P = A / N
    r = P.sum(axis=1)
    c = P.sum(axis=0)
    if (r <= 0).any() or (c <= 0).any():
        raise ValueError("CA requires strictly positive row and column margins on the active table")
    # Standardized residuals matrix S; SVD of S gives axes.
    expected = np.outer(r, c)
    S = (P - expected) / np.sqrt(expected)

    # CA's rank is min(I,J)-1 (centering removes one axis); the trailing
    # near-zero residual is dropped so the count matches FactoMineR's res$eig.
    rank_ca = min(A.shape) - 1
    n_pc = min(ncp, rank_ca)
    # One decomposition serves both the full eigenvalue list (R returns all
    # eigenvalues in res$eig) and the ncp-truncated axes.
    U_all, vs_full, V_all = standard_svd(S, rank_ca)
    U_tilde = U_all[:, :n_pc]
    V_tilde = V_all[:, :n_pc]
    vs = vs_full[:n_pc]
    eigenvalues = vs**2
    eigenvalues_full = vs_full**2
    total_inertia = float((S**2).sum())

    # Row / column coordinates (chi-square distance, "symmetric" rendering).
    row_coord = (U_tilde * vs[None, :]) / np.sqrt(r)[:, None]
    col_coord = (V_tilde * vs[None, :]) / np.sqrt(c)[:, None]

    # Squared distance to centroid in chi-square space
    row_dist2 = ((P / r[:, None] - c[None, :]) ** 2 / c[None, :]).sum(axis=1)
    col_dist2 = ((P / c[None, :] - r[:, None]) ** 2 / r[:, None]).sum(axis=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        row_cos2 = np.where(row_dist2[:, None] > 0, row_coord**2 / row_dist2[:, None], 0.0)
        col_cos2 = np.where(col_dist2[:, None] > 0, col_coord**2 / col_dist2[:, None], 0.0)

    # Guard the division for axes whose eigenvalue is exactly zero.
    eig_divisor = np.where(eigenvalues[None, :] > 0, eigenvalues[None, :], 1.0)
    row_contrib = (r[:, None] * row_coord**2) / eig_divisor * 100.0
    col_contrib = (c[:, None] * col_coord**2) / eig_divisor * 100.0

    row_inertia = r * row_dist2
    col_inertia = c * col_dist2

    dim_names = [f"Dim.{i + 1}" for i in range(n_pc)]
    active_row_labels = list(X.index[active_rows])
    active_col_labels = list(X.columns[active_cols])

    eig_df = pd.DataFrame(
        {
            "eigenvalue": eigenvalues_full,
            "percentage of variance": eigenvalues_full / total_inertia * 100.0,
            "cumulative percentage of variance": np.cumsum(eigenvalues_full) / total_inertia * 100.0,
        },
        index=[f"dim {i + 1}" for i in range(eigenvalues_full.size)],
    )

    row_block = Block(
        coord=pd.DataFrame(row_coord, index=active_row_labels, columns=dim_names),
        cos2=pd.DataFrame(row_cos2, index=active_row_labels, columns=dim_names),
        contrib=pd.DataFrame(row_contrib, index=active_row_labels, columns=dim_names),
        inertia=pd.Series(row_inertia, index=active_row_labels, name="inertia"),
        dist=pd.Series(np.sqrt(row_dist2), index=active_row_labels, name="dist"),
    )
    col_block = Block(
        coord=pd.DataFrame(col_coord, index=active_col_labels, columns=dim_names),
        cos2=pd.DataFrame(col_cos2, index=active_col_labels, columns=dim_names),
        contrib=pd.DataFrame(col_contrib, index=active_col_labels, columns=dim_names),
        inertia=pd.Series(col_inertia, index=active_col_labels, name="inertia"),
        dist=pd.Series(np.sqrt(col_dist2), index=active_col_labels, name="dist"),
    )

    # Supplementary rows (project onto column-axis basis)
    row_sup_block = None
    if row_sup_idx:
        sup_row_labels = [X.index[i] for i in row_sup_idx]
        A_sup = Xv[np.ix_(np.asarray(row_sup_idx), active_cols)]
        r_sup = A_sup.sum(axis=1)
        # An all-zero supplementary row keeps a zero profile instead of 0/0.
        r_sup_safe = np.where(r_sup <= 0, 1.0, r_sup)
        prof_sup = A_sup / r_sup_safe[:, None]
        # Row sup coords = (profile - c) projected on V_tilde / sqrt(c) (transition formula)
        coord_sup = ((prof_sup - c[None, :]) / np.sqrt(c)[None, :]) @ V_tilde
        dist2_sup = ((prof_sup - c[None, :]) ** 2 / c[None, :]).sum(axis=1)
        with np.errstate(divide="ignore", invalid="ignore"):
            cos2_sup = np.where(dist2_sup[:, None] > 0, coord_sup**2 / dist2_sup[:, None], 0.0)
        row_sup_block = Block(
            coord=pd.DataFrame(coord_sup, index=sup_row_labels, columns=dim_names),
            cos2=pd.DataFrame(cos2_sup, index=sup_row_labels, columns=dim_names),
            dist=pd.Series(np.sqrt(dist2_sup), index=sup_row_labels, name="dist"),
        )

    # Supplementary columns (project onto row-axis basis)
    col_sup_block = None
    if col_sup_idx:
        sup_col_labels = [X.columns[j] for j in col_sup_idx]
        A_sup = Xv[np.ix_(active_rows, np.asarray(col_sup_idx))]
        c_sup = A_sup.sum(axis=0)
        c_sup_safe = np.where(c_sup <= 0, 1.0, c_sup)
        prof_sup = A_sup / c_sup_safe[None, :]
        coord_sup = ((prof_sup.T - r[None, :]) / np.sqrt(r)[None, :]) @ U_tilde
        dist2_sup = ((prof_sup.T - r[None, :]) ** 2 / r[None, :]).sum(axis=1)
        with np.errstate(divide="ignore", invalid="ignore"):
            cos2_sup = np.where(dist2_sup[:, None] > 0, coord_sup**2 / dist2_sup[:, None], 0.0)
        col_sup_block = Block(
            coord=pd.DataFrame(coord_sup, index=sup_col_labels, columns=dim_names),
            cos2=pd.DataFrame(cos2_sup, index=sup_col_labels, columns=dim_names),
            dist=pd.Series(np.sqrt(dist2_sup), index=sup_col_labels, name="dist"),
        )

    return Result(
        eig=eig_df,
        # ``vs`` keeps every singular value while U / V hold only the n_pc
        # retained axes.
        svd=SVD(vs=vs_full.copy(), U=U_tilde.copy(), V=V_tilde.copy()),
        call={
            "ncp": ncp,
            "row_sup": row_sup_idx,
            "col_sup": col_sup_idx,
            "N": N,
            "marge_row": r.copy(),
            "marge_col": c.copy(),
        },
        row=row_block,
        col=col_block,
        row_sup=row_sup_block,
        col_sup=col_sup_block,
        method="CA",
    )
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Bundled datasets re-used from FactoMineR's distribution for parity testing.
|
|
2
|
+
|
|
3
|
+
See ``factominer/datasets/data/PROVENANCE.md`` for the origin and licensing of
|
|
4
|
+
each file.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
# Directory holding the bundled CSV files, next to this module.
_DATA_DIR = Path(__file__).parent.joinpath("data")


def _load_csv(name: str) -> pd.DataFrame:
    """Read the bundled CSV called ``name``, using its first column as the index.

    Raises:
        FileNotFoundError: If no such file exists in the data directory.
    """
    csv_path = _DATA_DIR / name
    if csv_path.exists():
        return pd.read_csv(csv_path, index_col=0)
    raise FileNotFoundError(f"dataset not bundled: {csv_path.name}")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load_decathlon() -> pd.DataFrame:
    """Load the decathlon data: 41 athletes × 13 columns.

    Results come from the 2004 Athens Olympic and Décastar decathlons. The
    columns are the ten athletic events (seconds or meters) followed by
    ``Rank``, ``Points`` and ``Competition`` (a two-level factor). This is
    FactoMineR's canonical PCA example.
    """
    decathlon = _load_csv("decathlon.csv")
    return decathlon
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load_children() -> pd.DataFrame:
    """Load the 18 × 8 contingency table on perceptions of children's worries.

    Rows are kinds of worries and columns are socio-educational categories.
    The table is used in FactoMineR's CA examples.
    """
    children = _load_csv("children.csv")
    return children
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_tea() -> pd.DataFrame:
    """Load the 300 × 36 survey on tea consumption habits.

    The columns are mostly categorical (factors), with one integer column.
    This is the canonical MCA example.
    """
    tea = _load_csv("tea.csv")
    return tea
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_poison() -> pd.DataFrame:
    """Load the 55 × 15 food-poisoning outbreak survey.

    The table mixes categorical and quantitative columns and is used in the
    MCA / FAMD examples.
    """
    poison = _load_csv("poison.csv")
    return poison
|