factominer 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
factominer/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """factominer — a Python port of R's FactoMineR.
2
+
3
+ This module re-exports the public API. The supported-methods table in
4
+ ``README.md`` is the source of truth for which symbols are live and which are
5
+ stubs that raise ``NotImplementedError``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ # Deferred methods (Round 2). Imported so ``from factominer import HMFA`` works,
11
+ # but the implementations raise NotImplementedError when called.
12
+ from ._deferred import DMFA, FAMD, GPA, HMFA, MFA
13
+ from ._result import Result
14
+ from .ca import CA
15
+ from .desc import catdes, condes, dimdesc
16
+ from .hcpc import HCPC
17
+ from .mca import MCA
18
+ from .pca import PCA
19
+
20
# Names exported by ``from factominer import *``. The order is unchanged; the
# entries are only grouped by where they are imported from.
__all__ = [
    # Engines imported from .pca, .ca and .mca.
    "PCA", "CA", "MCA",
    # Stubs from ._deferred: importable, but raise NotImplementedError on call.
    "FAMD", "MFA", "HMFA", "DMFA", "GPA",
    # Imported from .hcpc and .desc.
    "HCPC", "dimdesc", "catdes", "condes",
    # Result container from ._result.
    "Result",
]

__version__ = "0.1.0.dev0"
@@ -0,0 +1,46 @@
1
+ """Deferred-method stubs.
2
+
3
+ Importable so ``from factominer import HMFA`` works, but raising
4
+ ``NotImplementedError`` when called. Each stub points at the plan that records
5
+ the round-2 work and the reason.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+
13
def _deferred(name: str, hint: str) -> Any:
    """Build a placeholder callable for a method that is not implemented yet.

    The returned function accepts any positional and keyword arguments and
    always raises ``NotImplementedError``. Its ``__name__``, ``__qualname__``
    and ``__doc__`` are set from ``name`` so that it introspects like the
    method it stands in for.
    """
    # The message depends only on the factory arguments, so build it once
    # instead of on every call.
    message = (
        f"{name} is a Round 2 deferral. {hint} "
        "See docs/plans/factominer-python-port.md §2 and the README "
        "supported-methods table for the current status."
    )

    def placeholder(*_args: Any, **_kwargs: Any) -> Any:
        raise NotImplementedError(message)

    placeholder.__name__ = placeholder.__qualname__ = name
    placeholder.__doc__ = f"Stub for {name}; deferred to Round 2."
    return placeholder


# One stub per deferred method; each hint spells out the method's full name.
FAMD = _deferred(
    name="FAMD",
    hint="Factor Analysis for Mixed Data is planned for the next iteration.",
)
MFA = _deferred(
    name="MFA",
    hint="Multiple Factor Analysis is planned for the next iteration.",
)
HMFA = _deferred(
    name="HMFA",
    hint="Hierarchical Multiple Factor Analysis is planned for the next iteration.",
)
DMFA = _deferred(
    name="DMFA",
    hint="Dual Multiple Factor Analysis is planned for the next iteration.",
)
GPA = _deferred(
    name="GPA",
    hint="Generalized Procrustes Analysis is planned for the next iteration.",
)
factominer/_result.py ADDED
@@ -0,0 +1,111 @@
1
+ """Result containers mirroring FactoMineR's ``res`` lists.
2
+
3
+ R returns lists with ``$``-accessed fields. We use a small set of
4
+ frozen-dataclass holders so ``res.var.coord`` reads naturally in Python
5
+ and the same shape can carry any subset of fields a method actually produces.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import Any
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+
17
@dataclass(frozen=True)
class Block:
    """A coordinates / cos² / contributions block (variables or individuals).

    Only ``coord`` is mandatory; every other field defaults to ``None`` so a
    method can fill in just the pieces it produces.

    ``cor`` is FactoMineR's ``var$cor`` (variables only). ``dist`` is the
    distance to the origin (FactoMineR's ``ind$dist``): the CA engine stores
    the square root of the squared chi-square distance here, not the squared
    value. NOTE(review): confirm the other engines follow the same convention.
    ``inertia`` is, in the CA engine, the point's mass times its squared
    distance. ``v_test`` and ``eta2`` show up on qualitative blocks
    (MCA / quali.sup).
    """

    # Coordinates on the retained axes (one row per point, one column per axis).
    coord: pd.DataFrame
    # Quality of representation per axis.
    cos2: pd.DataFrame | None = None
    # Contribution to each axis.
    contrib: pd.DataFrame | None = None
    cor: pd.DataFrame | None = None
    dist: pd.Series | None = None
    inertia: pd.Series | None = None
    v_test: pd.DataFrame | None = None
    eta2: pd.DataFrame | None = None
35
+
36
@dataclass(frozen=True)
class SVD:
    """Immutable holder for the three pieces of a singular value decomposition."""

    # Singular values.
    vs: np.ndarray
    # Left singular vectors, laid out rows × ncp.
    U: np.ndarray
    # Right singular vectors, laid out cols × ncp.
    V: np.ndarray
41
+
42
+
43
@dataclass(frozen=True)
class Result:
    """FactoMineR-shaped result object.

    Only ``eig`` and ``svd`` are required; ``call`` defaults to an empty dict.
    Method-specific blocks are optional attributes — ``ind`` and ``var`` for
    PCA; ``row`` and ``col`` for CA; etc. — and stay ``None`` when a method
    does not produce them.
    """

    eig: pd.DataFrame
    svd: SVD
    call: dict[str, Any] = field(default_factory=dict)
    ind: Block | None = None
    var: Block | None = None
    row: Block | None = None
    col: Block | None = None
    ind_sup: Block | None = None
    quanti_sup: Block | None = None
    quali_sup: Block | None = None
    row_sup: Block | None = None
    col_sup: Block | None = None
    quanti_var_sup: Block | None = None
    # Method tag for ``summary()``: "PCA", "CA", "MCA", ...
    method: str = ""

    @staticmethod
    def _coord_table(block: Block, ncp: int, max_rows: int = 10) -> pd.DataFrame:
        """Return the first ``max_rows`` rows and ``ncp`` axes of ``block.coord``."""
        table = block.coord.iloc[:max_rows, :ncp].copy()
        table.columns = [f"Dim.{i + 1}" for i in range(table.shape[1])]
        return table

    def summary(self, ncp: int | None = None) -> str:
        """Render a plain-text report of eigenvalues and point coordinates.

        Parameters
        ----------
        ncp:
            Number of axes to report. Defaults to 5, capped at the number of
            rows in ``eig``.

        Returns
        -------
        str
            The report: the first ``ncp`` eigenvalue rows, then the
            coordinates of at most ten points for every block that is present.
            Supplementary blocks that are missing or empty are skipped.
        """
        ncp = ncp if ncp is not None else min(5, self.eig.shape[0])
        lines: list[str] = [
            f"\nResults for the {self.method or 'analysis'}",
            "=" * 50,
            "\nEigenvalues",
            "-" * 50,
        ]
        eig = self.eig.head(ncp).copy()
        eig.columns = ["eigenvalue", "percentage of variance", "cumulative percentage of variance"]
        lines.append(eig.round(4).to_string())

        active_blocks = (
            ("Individuals", self.ind),
            ("Variables", self.var),
            ("Rows", self.row),
            ("Columns", self.col),
        )
        for label, block in active_blocks:
            if block is None:
                continue
            table = self._coord_table(block, ncp)
            # Report the number of rows actually printed; the heading used to
            # quote the number of axes instead.
            lines.append(f"\n{label} (the first {table.shape[0]} are reported)")
            lines.append("-" * 50)
            lines.append(table.round(4).to_string())

        supplementary_blocks = (
            ("Supplementary individuals", self.ind_sup),
            ("Supplementary continuous variables", self.quanti_sup),
            ("Supplementary categories", self.quali_sup),
            ("Supplementary rows", self.row_sup),
            ("Supplementary columns", self.col_sup),
        )
        for label, block in supplementary_blocks:
            if block is None or block.coord.empty:
                continue
            lines.append(f"\n{label}")
            lines.append("-" * 50)
            lines.append(self._coord_table(block, ncp).round(4).to_string())

        return "\n".join(lines)

    def __repr__(self) -> str:
        # NOTE(review): the value labelled ``ncp`` is the number of rows in
        # ``eig``, which the CA engine fills with all eigenvalues rather than
        # only the retained ones.
        return f"<factominer.{self.method or 'Result'} ncp={self.eig.shape[0]}>"
factominer/_scaling.py ADDED
@@ -0,0 +1,99 @@
1
+ """Scaling utilities for the factor-method engines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
def coerce_numeric(X: pd.DataFrame) -> np.ndarray:
    """Return the values of ``X`` as a float64 array.

    Raises
    ------
    ValueError
        If any column is not numeric. Boolean columns count as non-numeric.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    # Use pandas' dtype predicates: ``np.issubdtype`` raises TypeError on
    # extension dtypes such as categorical. Walking ``X.dtypes`` also copes
    # with duplicate column labels, where ``X[label]`` is a DataFrame.
    non_numeric = [
        label
        for label, dtype in zip(X.columns, X.dtypes)
        if is_bool(dtype) or not is_numeric(dtype)
    ]
    if non_numeric:
        raise ValueError(f"non-numeric columns in X: {non_numeric}")
    return np.asarray(X.to_numpy(), dtype=np.float64)
15
+
16
+
17
def center_scale(
    X: np.ndarray,
    scale_unit: bool,
    row_w: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Weighted center (and optionally scale) X.

    Parameters
    ----------
    X:
        Array with one observation per row.
    scale_unit:
        When true, divide each centered column by its weighted standard
        deviation.
    row_w:
        Optional row weights of length ``n``. They are normalized to sum to 1;
        uniform weights are used when omitted.

    Returns
    -------
    tuple
        ``(X_scaled, mean, scale)``. ``scale`` is the per-column divisor used
        (1.0 when ``scale_unit=False`` and for near-constant columns).

    Raises
    ------
    ValueError
        If ``X`` has no rows, ``row_w`` has the wrong length, or the weights
        are negative, non-finite, or do not sum to a positive value.
    """
    n = X.shape[0]
    if n == 0:
        raise ValueError("X must have at least one row")
    if row_w is None:
        row_w = np.full(n, 1.0 / n)
    else:
        row_w = np.asarray(row_w, dtype=np.float64)
        if row_w.shape != (n,):
            raise ValueError("row_w must have length n")
        total = row_w.sum()
        # Without this check, a zero or NaN total would silently turn every
        # moment below into NaN.
        if (row_w < 0).any() or not np.isfinite(total) or total <= 0:
            raise ValueError("row_w must be non-negative, finite, and have a positive sum")
        # Normalize to a probability vector for the moments.
        row_w = row_w / total
    mean = (X * row_w[:, None]).sum(axis=0)
    Xc = X - mean
    if not scale_unit:
        return Xc, mean, np.ones_like(mean)
    # FactoMineR uses 1/n weighted variance (not 1/(n-1)).
    scale = np.sqrt((Xc**2 * row_w[:, None]).sum(axis=0))
    # Columns with (near-)zero spread are left unscaled to avoid dividing by 0.
    scale_safe = np.where(scale < 1e-12, 1.0, scale)
    return Xc / scale_safe, mean, scale_safe
48
+
49
+
50
def column_indices(
    cols: list[str] | pd.Index,
    spec: list[int] | list[str] | None,
) -> list[int]:
    """Normalize a column spec (None / names / positional indices) to indices.

    Parameters
    ----------
    cols:
        The available column labels, in order.
    spec:
        ``None`` (no columns), or a list mixing column names and zero-based
        positions.

    Returns
    -------
    list[int]
        Zero-based positions in the order given by ``spec``.

    Raises
    ------
    KeyError
        A name is not among ``cols``.
    IndexError
        A position is outside ``[0, len(cols))``.
    TypeError
        An item is neither ``str`` nor an integer. Booleans are rejected too.
    ValueError
        The same column is selected more than once.
    """
    if spec is None:
        return []
    labels = list(cols)
    out: list[int] = []
    for item in spec:
        if isinstance(item, str):
            # A single scan: ``list.index`` both tests membership and locates.
            try:
                out.append(labels.index(item))
            except ValueError:
                raise KeyError(f"column not found: {item}") from None
        # ``bool`` is a subclass of ``int``; without the exclusion a boolean
        # mask such as ``[True, False]`` would be read as positions 1 and 0.
        elif isinstance(item, (int, np.integer)) and not isinstance(item, bool):
            idx = int(item)
            if not (0 <= idx < len(labels)):
                raise IndexError(f"column index out of range: {idx}")
            out.append(idx)
        else:
            raise TypeError(f"column spec items must be str or int, got {type(item).__name__}")
    if len(out) != len(set(out)):
        raise ValueError(f"duplicate column spec: {spec}")
    return out
74
+
75
+
76
def row_indices(
    index: pd.Index,
    spec: list[int] | list[str] | None,
) -> list[int]:
    """Normalize a row spec (None / names / positional indices) to indices.

    Parameters
    ----------
    index:
        The available row labels, in order.
    spec:
        ``None`` (no rows), or a list mixing row names and zero-based
        positions.

    Returns
    -------
    list[int]
        Zero-based positions in the order given by ``spec``.

    Raises
    ------
    KeyError
        A name is not among ``index``.
    IndexError
        A position is outside ``[0, len(index))``.
    TypeError
        An item is neither ``str`` nor an integer. Booleans are rejected too.
    ValueError
        The same row is selected more than once.
    """
    if spec is None:
        return []
    labels = list(index)
    out: list[int] = []
    for item in spec:
        if isinstance(item, str):
            # A single scan: ``list.index`` both tests membership and locates.
            try:
                out.append(labels.index(item))
            except ValueError:
                raise KeyError(f"row not found: {item}") from None
        # ``bool`` is a subclass of ``int``; without the exclusion a boolean
        # mask such as ``[True, False]`` would be read as positions 1 and 0.
        elif isinstance(item, (int, np.integer)) and not isinstance(item, bool):
            i = int(item)
            if not (0 <= i < len(labels)):
                raise IndexError(f"row index out of range: {i}")
            out.append(i)
        else:
            raise TypeError(f"row spec items must be str or int, got {type(item).__name__}")
    if len(out) != len(set(out)):
        raise ValueError(f"duplicate row spec: {spec}")
    return out
factominer/_sign.py ADDED
@@ -0,0 +1,50 @@
1
+ """Deterministic sign convention for SVD-based factor methods.
2
+
3
+ SVD signs are not unique — flipping the sign of column k of U and of column k of
4
+ V leaves the decomposition unchanged. Different libraries pick different
5
+ conventions; we pick one and apply it uniformly so our output is reproducible.
6
+
7
+ Convention: for each axis k, find the row index r with the largest absolute
8
+ value in U[:, k]. If U[r, k] is negative, flip the signs of column k of U and V.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import numpy as np
14
+
15
+
16
def align_signs(U: np.ndarray, V: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return copies of ``(U, V)`` with the project's sign convention applied.

    For each column, the entry of ``U`` with the largest absolute value is
    made non-negative; when that requires a flip, the matching column of ``V``
    is flipped as well. The inputs are left untouched.

    Raises ``ValueError`` unless both arrays are 2D with the same number of
    columns.
    """
    left = np.asarray(U, dtype=np.float64).copy()
    right = np.asarray(V, dtype=np.float64).copy()
    same_width = left.ndim == 2 and right.ndim == 2 and left.shape[1] == right.shape[1]
    if not same_width:
        raise ValueError("U and V must be 2D and share the second dimension")
    for axis in range(left.shape[1]):
        column = left[:, axis]
        # The dominant entry of the left vector decides the orientation.
        pivot = column[int(np.abs(column).argmax())]
        if pivot < 0:
            # ``column`` is a view, so negating in place updates ``left``.
            np.negative(column, out=column)
            np.negative(right[:, axis], out=right[:, axis])
    return left, right
32
+
33
+
34
def align_to_reference(values: np.ndarray, reference: np.ndarray) -> np.ndarray:
    """Return a copy of ``values`` whose columns point the same way as ``reference``.

    A column is negated when its dot product with the corresponding reference
    column is negative. This is used to compare our output to R FactoMineR
    fixtures whose own sign convention differs.

    Raises ``ValueError`` when the two arrays do not have the same shape.
    """
    aligned = np.asarray(values, dtype=np.float64).copy()
    target = np.asarray(reference, dtype=np.float64)
    if aligned.shape != target.shape:
        raise ValueError(
            f"shape mismatch: values={aligned.shape}, reference={target.shape}"
        )
    # Columns are independent, so decide every flip first, then apply them.
    to_flip = [
        k
        for k in range(aligned.shape[1])
        if float(aligned[:, k] @ target[:, k]) < 0
    ]
    aligned[:, np.asarray(to_flip, dtype=np.intp)] *= -1
    return aligned
factominer/_svd.py ADDED
@@ -0,0 +1,60 @@
1
+ """Shared SVD primitives for the factor-method engines.
2
+
3
+ The generalized SVD (with row and column weights) underlies CA, MCA, FAMD, MFA.
4
+ PCA is a special case with uniform weights.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+
11
+ from ._sign import align_signs
12
+
13
+
14
def standard_svd(X: np.ndarray, ncp: int) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """SVD of X, truncated to ``ncp`` components, sign-aligned.

    Returns ``(U, vs, V)`` such that ``X ≈ U @ diag(vs) @ V.T``. U and V have
    ``min(ncp, min(X.shape))`` columns; components with a zero singular value
    are not removed.

    Raises
    ------
    ValueError
        If ``X`` is not 2D or ``ncp`` is negative.
    """
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError("X must be 2D")
    # A negative count would otherwise act as a negative slice bound and
    # silently drop trailing components.
    if ncp < 0:
        raise ValueError("ncp must be non-negative")
    U_full, vs_full, Vt_full = np.linalg.svd(X, full_matrices=False)
    keep = min(ncp, vs_full.size)
    U, V = align_signs(U_full[:, :keep], Vt_full[:keep].T)
    return U, vs_full[:keep], V
30
+
31
+
32
def generalized_svd(
    X: np.ndarray,
    row_w: np.ndarray,
    col_w: np.ndarray,
    ncp: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generalized SVD with positive row and column weights.

    Solves ``argmax`` of the bilinear form ``u' diag(row_w) X diag(col_w) v``
    subject to ``u' diag(row_w) u = 1`` and ``v' diag(col_w) v = 1``.

    The weighted problem is reduced to an ordinary one: ``X`` is multiplied by
    the square roots of the weights on both sides, ``standard_svd`` is run on
    the result, and the singular vectors are divided by the same square roots
    to bring them back to the original scales. The returned ``vs`` are the
    singular values of the weighted matrix.

    Raises ``ValueError`` when a weight vector does not match the shape of
    ``X`` or contains a value that is not strictly positive.
    """
    table = np.asarray(X, dtype=np.float64)
    w_rows = np.asarray(row_w, dtype=np.float64).reshape(-1)
    w_cols = np.asarray(col_w, dtype=np.float64).reshape(-1)
    if not (w_rows.size == table.shape[0] and w_cols.size == table.shape[1]):
        raise ValueError("weight vectors must match X's shape")
    if np.any(w_rows <= 0) or np.any(w_cols <= 0):
        raise ValueError("weights must be strictly positive")
    root_rows = np.sqrt(w_rows)[:, None]
    root_cols = np.sqrt(w_cols)
    whitened = (table * root_rows) * root_cols[None, :]
    U_white, vs, V_white = standard_svd(whitened, ncp)
    return U_white / root_rows, vs, V_white / root_cols[:, None]
factominer/ca.py ADDED
@@ -0,0 +1,164 @@
1
+ """Correspondence Analysis — FactoMineR-compatible API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from ._result import SVD, Block, Result
9
+ from ._scaling import row_indices
10
+ from ._svd import standard_svd
11
+
12
+
13
def _ca_sup_block(
    counts: np.ndarray,
    margin: np.ndarray,
    axes: np.ndarray,
    labels: list,
    dim_names: list[str],
) -> Block:
    """Project supplementary points onto existing axes and wrap them in a Block.

    ``counts`` holds one supplementary point per row, measured over the active
    categories whose masses are ``margin``; ``axes`` are the singular vectors
    for those same categories.
    """
    totals = counts.sum(axis=1)
    # An all-zero point has no profile; dividing by 1 keeps it at zero
    # instead of producing NaN.
    profiles = counts / np.where(totals <= 0, 1.0, totals)[:, None]
    centered = profiles - margin[None, :]
    # Transition formula: centered profile, chi-square scaled, times the axes.
    coord = (centered / np.sqrt(margin)[None, :]) @ axes
    dist2 = (centered**2 / margin[None, :]).sum(axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        cos2 = np.where(dist2[:, None] > 0, coord**2 / dist2[:, None], 0.0)
    return Block(
        coord=pd.DataFrame(coord, index=labels, columns=dim_names),
        cos2=pd.DataFrame(cos2, index=labels, columns=dim_names),
        dist=pd.Series(np.sqrt(dist2), index=labels, name="dist"),
    )


def CA(  # noqa: N802 — mirrors R
    X: pd.DataFrame,
    ncp: int = 5,
    row_sup: list[int] | list[str] | None = None,
    col_sup: list[int] | list[str] | None = None,
    graph: bool = False,  # noqa: ARG001
) -> Result:
    """Run Correspondence Analysis on a contingency table.

    Mirrors ``FactoMineR::CA``.

    Parameters
    ----------
    X:
        Contingency table of non-negative, finite counts.
    ncp:
        Maximum number of axes to keep; capped at ``min(I, J) - 1`` of the
        active table.
    row_sup, col_sup:
        Supplementary rows / columns, given as labels or zero-based positions.
        They are excluded from the fit and projected afterwards.
    graph:
        Accepted for signature parity with R; not used.

    Returns
    -------
    Result
        ``eig`` lists every eigenvalue of the active table (not only the first
        ``ncp``). ``row`` / ``col`` hold coordinates, cos², contributions,
        inertia and distance. ``row_sup`` / ``col_sup`` are set when
        supplementary elements were requested.

    Raises
    ------
    TypeError
        ``X`` is not a DataFrame, or a spec item has an unsupported type.
    ValueError
        ``ncp`` is negative, ``X`` contains negative or non-finite values, the
        active table sums to zero or has an empty margin, or a spec repeats an
        element.
    KeyError, IndexError
        A supplementary label or position does not exist.
    """
    # Imported here because the module-level import only brings in
    # ``row_indices``; column specs need the column-flavoured error messages.
    from ._scaling import column_indices

    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas DataFrame")
    if ncp < 0:
        raise ValueError("ncp must be non-negative")
    Xv = X.to_numpy(dtype=np.float64)
    # NaN compares false against 0, so it would slip past the sign check.
    if not np.isfinite(Xv).all():
        raise ValueError("CA requires finite counts (no NaN or inf)")
    if (Xv < 0).any():
        raise ValueError("CA requires non-negative counts")

    row_sup_idx = row_indices(X.index, row_sup)
    col_sup_idx = column_indices(X.columns, col_sup)

    # Active positions are everything not declared supplementary.
    row_is_active = np.ones(X.shape[0], dtype=bool)
    row_is_active[np.asarray(row_sup_idx, dtype=np.intp)] = False
    col_is_active = np.ones(X.shape[1], dtype=bool)
    col_is_active[np.asarray(col_sup_idx, dtype=np.intp)] = False
    active_rows = np.flatnonzero(row_is_active)
    active_cols = np.flatnonzero(col_is_active)

    A = Xv[np.ix_(active_rows, active_cols)]
    N = float(A.sum())
    if N <= 0:
        raise ValueError("active sub-table has zero total")
    P = A / N
    r = P.sum(axis=1)
    c = P.sum(axis=0)
    if (r <= 0).any() or (c <= 0).any():
        raise ValueError("CA requires strictly positive row and column margins on the active table")
    # Standardized residuals matrix S; SVD of S gives axes.
    expected = np.outer(r, c)
    S = (P - expected) / np.sqrt(expected)

    # CA's rank is min(I,J)-1 (centering removes one axis). One decomposition
    # at that rank yields both the full eigenvalue list that R reports in
    # res$eig and, by slicing, the ``n_pc`` axes that are kept.
    rank_ca = min(A.shape) - 1
    n_pc = min(ncp, rank_ca)
    U_all, vs_full, V_all = standard_svd(S, rank_ca)
    U_tilde = U_all[:, :n_pc]
    V_tilde = V_all[:, :n_pc]
    vs = vs_full[:n_pc]
    eigenvalues = vs**2
    eigenvalues_full = vs_full**2
    total_inertia = float((S**2).sum())

    # Row / column coordinates (chi-square distance, "symmetric" rendering).
    row_coord = (U_tilde * vs[None, :]) / np.sqrt(r)[:, None]
    col_coord = (V_tilde * vs[None, :]) / np.sqrt(c)[:, None]

    # Squared distance to centroid in chi-square space
    row_dist2 = ((P / r[:, None] - c[None, :]) ** 2 / c[None, :]).sum(axis=1)
    col_dist2 = ((P / c[None, :] - r[:, None]) ** 2 / r[:, None]).sum(axis=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        row_cos2 = np.where(row_dist2[:, None] > 0, row_coord**2 / row_dist2[:, None], 0.0)
        col_cos2 = np.where(col_dist2[:, None] > 0, col_coord**2 / col_dist2[:, None], 0.0)

    # Guard the division for axes whose eigenvalue is exactly zero.
    eig_safe = np.where(eigenvalues[None, :] > 0, eigenvalues[None, :], 1.0)
    row_contrib = (r[:, None] * row_coord**2) / eig_safe * 100.0
    col_contrib = (c[:, None] * col_coord**2) / eig_safe * 100.0

    row_inertia = r * row_dist2
    col_inertia = c * col_dist2

    dim_names = [f"Dim.{i + 1}" for i in range(n_pc)]
    active_row_labels = list(X.index[active_rows])
    active_col_labels = list(X.columns[active_cols])

    eig_df = pd.DataFrame(
        {
            "eigenvalue": eigenvalues_full,
            "percentage of variance": eigenvalues_full / total_inertia * 100.0,
            "cumulative percentage of variance": np.cumsum(eigenvalues_full) / total_inertia * 100.0,
        },
        index=[f"dim {i + 1}" for i in range(eigenvalues_full.size)],
    )

    row_block = Block(
        coord=pd.DataFrame(row_coord, index=active_row_labels, columns=dim_names),
        cos2=pd.DataFrame(row_cos2, index=active_row_labels, columns=dim_names),
        contrib=pd.DataFrame(row_contrib, index=active_row_labels, columns=dim_names),
        inertia=pd.Series(row_inertia, index=active_row_labels, name="inertia"),
        dist=pd.Series(np.sqrt(row_dist2), index=active_row_labels, name="dist"),
    )
    col_block = Block(
        coord=pd.DataFrame(col_coord, index=active_col_labels, columns=dim_names),
        cos2=pd.DataFrame(col_cos2, index=active_col_labels, columns=dim_names),
        contrib=pd.DataFrame(col_contrib, index=active_col_labels, columns=dim_names),
        inertia=pd.Series(col_inertia, index=active_col_labels, name="inertia"),
        dist=pd.Series(np.sqrt(col_dist2), index=active_col_labels, name="dist"),
    )

    # Supplementary rows (project onto column-axis basis)
    row_sup_block = None
    if row_sup_idx:
        row_sup_block = _ca_sup_block(
            Xv[np.ix_(np.asarray(row_sup_idx), active_cols)],
            c,
            V_tilde,
            [X.index[i] for i in row_sup_idx],
            dim_names,
        )

    # Supplementary columns: same projection with the roles of rows and
    # columns swapped, hence the transpose.
    col_sup_block = None
    if col_sup_idx:
        col_sup_block = _ca_sup_block(
            Xv[np.ix_(active_rows, np.asarray(col_sup_idx))].T,
            r,
            U_tilde,
            [X.columns[j] for j in col_sup_idx],
            dim_names,
        )

    return Result(
        eig=eig_df,
        svd=SVD(vs=vs_full.copy(), U=U_tilde.copy(), V=V_tilde.copy()),
        call={
            "ncp": ncp,
            "row_sup": row_sup_idx,
            "col_sup": col_sup_idx,
            "N": N,
            "marge_row": r.copy(),
            "marge_col": c.copy(),
        },
        row=row_block,
        col=col_block,
        row_sup=row_sup_block,
        col_sup=col_sup_block,
        method="CA",
    )
@@ -0,0 +1,54 @@
1
+ """Bundled datasets re-used from FactoMineR's distribution for parity testing.
2
+
3
+ See ``factominer/datasets/data/PROVENANCE.md`` for the origin and licensing of
4
+ each file.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ import pandas as pd
12
+
13
# Directory holding the bundled CSV files, next to this module.
_DATA_DIR = Path(__file__).parent.joinpath("data")


def _load_csv(name: str) -> pd.DataFrame:
    """Read the bundled CSV called ``name``, using its first column as the index.

    Raises ``FileNotFoundError`` when no such file sits in the data directory.
    """
    csv_path = _DATA_DIR.joinpath(name)
    if csv_path.exists():
        return pd.read_csv(csv_path, index_col=0)
    raise FileNotFoundError(f"dataset not bundled: {csv_path.name}")


def load_decathlon() -> pd.DataFrame:
    """Load the decathlon data: 41 athletes × 13 columns.

    The athletes come from the 2004 Athens Olympic and Décastar decathlons.
    Columns are the ten athletic events (seconds or meters), plus ``Rank``,
    ``Points``, and ``Competition`` (a two-level factor). This is FactoMineR's
    canonical PCA example.
    """
    return _load_csv("decathlon.csv")


def load_children() -> pd.DataFrame:
    """Load the 18 × 8 contingency table on perceptions of children's worries.

    Rows are kinds of worries; columns are socio-educational categories. Used
    in FactoMineR's CA examples.
    """
    return _load_csv("children.csv")


def load_tea() -> pd.DataFrame:
    """Load the 300 × 36 survey on tea consumption habits.

    Mostly categorical (factors), with one integer column. Canonical MCA
    example.
    """
    return _load_csv("tea.csv")


def load_poison() -> pd.DataFrame:
    """Load the 55 × 15 food-poisoning outbreak survey.

    Mixed categorical and quantitative columns. Used in MCA / FAMD examples.
    """
    return _load_csv("poison.csv")