cccpm-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cccpm/__init__.py +1 -0
- cccpm/cpm_analysis.py +272 -0
- cccpm/edge_selection.py +271 -0
- cccpm/fold.py +46 -0
- cccpm/logging.py +37 -0
- cccpm/models.py +148 -0
- cccpm/more_models.py +205 -0
- cccpm/reporting/__init__.py +1 -0
- cccpm/reporting/assets/CCCPM.png +0 -0
- cccpm/reporting/html_report.py +363 -0
- cccpm/reporting/plots/__init__.py +0 -0
- cccpm/reporting/plots/chord_v2.py +821 -0
- cccpm/reporting/plots/cpm_chord_plot.py +149 -0
- cccpm/reporting/plots/plots.py +337 -0
- cccpm/reporting/plots/utils.py +19 -0
- cccpm/reporting/reporting_utils.py +124 -0
- cccpm/results_manager.py +463 -0
- cccpm/scoring.py +40 -0
- cccpm/simulation/__init__.py +0 -0
- cccpm/simulation/simulate_multivariate.py +252 -0
- cccpm/simulation/simulate_sem.py +319 -0
- cccpm/simulation/simulate_simple.py +37 -0
- cccpm/utils.py +386 -0
- cccpm-0.2.1.dist-info/METADATA +105 -0
- cccpm-0.2.1.dist-info/RECORD +26 -0
- cccpm-0.2.1.dist-info/WHEEL +4 -0
cccpm/simulation/simulate_multivariate.py
@@ -0,0 +1,252 @@
"""
Simulate multivariate predictors X, confounders Z, and a continuous outcome y.

Data-generating process (pure confounding, no mediation):
    Z ~ N(0, I_q)
    X = Z A + E
    y = X beta + Z gamma + eps

- A (q x p) encodes how Z loads into columns of X (Z -> X).
- beta (p,) encodes the direct effect of X on y (X -> y). Only a subset is nonzero.
- gamma (q,) encodes the confounding path Z -> y.
- E is column-correlated noise for X with optional AR(1) structure across columns.
- eps is outcome noise for y.

Author: (your name)
"""

from dataclasses import dataclass
from typing import Dict, Any, Optional, Tuple

import numpy as np
from numpy.random import Generator
from numpy.linalg import lstsq
from scipy.linalg import toeplitz


# ---------------------------------------------------------------------
# Data container
# ---------------------------------------------------------------------

@dataclass
class SimulationResult:
    """
    Container for simulated data and the generating parameters.

    Attributes
    ----------
    X : np.ndarray, shape (n, p)
        Predictor matrix.
    Z : np.ndarray, shape (n, q)
        Confounder matrix.
    y : np.ndarray, shape (n,)
        Continuous outcome.
    beta : np.ndarray, shape (p,)
        True direct effects of X on y (nonzero on a subset).
    gamma : np.ndarray, shape (q,)
        Effects of Z on y (confounding strength).
    A : np.ndarray, shape (q, p)
        Loadings from Z into X (confounding footprint inside X).
    info : dict
        Metadata: indices for signal features, which X columns are confounded,
        and all simulation hyperparameters used.
    """
    X: np.ndarray
    Z: np.ndarray
    y: np.ndarray
    beta: np.ndarray
    gamma: np.ndarray
    A: np.ndarray
    info: Dict[str, Any]


# ---------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------

def ar1_cov(p: int, rho: float, sigma2: float) -> np.ndarray:
    """
    Construct an AR(1) covariance matrix across p variables.

    Var(X_j) = sigma2
    Corr(X_j, X_{j+k}) = rho^k

    Parameters
    ----------
    p : int
        Number of variables (columns).
    rho : float
        AR(1) correlation parameter in [-1, 1].
    sigma2 : float
        Marginal variance for each variable.

    Returns
    -------
    np.ndarray, shape (p, p)
        AR(1) covariance matrix.
    """
    if abs(rho) < 1e-12:
        return np.eye(p) * sigma2
    first_col = (rho ** np.arange(p)) * sigma2
    return toeplitz(first_col)


def _rng(seed: Optional[int]) -> Generator:
    """Create a NumPy Generator with a fixed seed (if provided)."""
    return np.random.default_rng(seed)


# ---------------------------------------------------------------------
# Core simulator
# ---------------------------------------------------------------------

def simulate_confounders(
    n_samples: int = 1000,             # number of samples
    n_features: int = 50,              # number of predictors (columns of X)
    n_confounds: int = 3,              # number of confounders (columns of Z)
    n_features_with_signal: int = 10,  # number of X columns with nonzero beta
    n_confounded_features: int = 10,   # number of X columns influenced by Z (nonzero column in A)
    beta_scale: float = 1.0,           # SD for nonzero beta entries
    gamma_scale: float = 1.0,          # SD for entries of gamma (Z -> y strength)
    z_to_x_strength: float = 0.7,      # typical magnitude of Z -> X loadings (A)
    sigma_x: float = 1.0,              # SD for X-noise E
    sigma_y: float = 1.0,              # SD for outcome noise eps
    rho_x: float = 0.0,                # AR(1) correlation among columns of X-noise E
    random_state: Optional[int] = 42   # RNG seed for reproducibility
) -> SimulationResult:
    """
    Simulate (X, Z, y) with pure confounding (Z affects both X and y).

    Model
    -----
    Z ~ N(0, I_q)
    X = Z A + E,                 E ~ N(0, Sigma_E) with AR(1) columns
    y = X beta + Z gamma + eps,  eps ~ N(0, sigma_y^2 I_n)

    Notes
    -----
    - Confounding arises when BOTH A != 0 (Z -> X) and gamma != 0 (Z -> y).
    - If gamma_scale=0, there is no confounding even if Z strongly influences X.
    - Only a subset of beta entries is nonzero (true signal features).

    Returns
    -------
    SimulationResult
        Data and parameters used to generate it, including indices of signal
        features and which X columns are confounded.
    """
    rng = _rng(random_state)

    # 1) Confounders: Z ~ N(0, I_q)
    Z = rng.normal(0.0, 1.0, size=(n_samples, n_confounds))

    # 2) Choose signal features and betas
    if not (1 <= n_features_with_signal <= n_features):
        raise ValueError(f"`n_features_with_signal` must be in [1, p]; got {n_features_with_signal} (p={n_features}).")
    k = n_features_with_signal

    signal_idx = np.arange(k)  # 0..k-1
    beta = np.zeros(n_features)
    beta[signal_idx] = rng.normal(0.0, beta_scale, size=k)

    # 3) Choose confounded features
    confounded_mask = np.zeros(n_features, dtype=bool)
    confounded_mask[:n_confounded_features] = True  # first m columns are confounded

    A = np.zeros((n_confounds, n_features))
    if confounded_mask.any():
        # Use scale / sqrt(q) so variance contribution is stable w.r.t. number of confounders
        A[:, confounded_mask] = rng.normal(
            loc=0.0,
            scale=z_to_x_strength / np.sqrt(n_confounds),
            size=(n_confounds, int(confounded_mask.sum()))
        )

    # 4) Column-correlated noise for X (optional AR(1) across columns)
    Sigma_E = ar1_cov(p=n_features, rho=rho_x, sigma2=sigma_x ** 2)
    E = rng.multivariate_normal(mean=np.zeros(n_features), cov=Sigma_E, size=n_samples)

    # 5) Build X
    # Each column X_j is a linear combination of Z (via A) plus noise E_j.
    X = Z @ A + E

    # 6) Outcome y: direct X->y via beta, plus confounding Z->y via gamma
    gamma = rng.normal(loc=0.0, scale=gamma_scale, size=n_confounds)
    eps_y = rng.normal(loc=0.0, scale=sigma_y, size=n_samples)
    y = X @ beta + Z @ gamma + eps_y

    # 7) Metadata for inspection and downstream benchmarking
    info: Dict[str, Any] = {
        "n_samples": n_samples,
        "n_features": n_features,
        "n_confounds": n_confounds,
        "n_features_with_signal": n_features_with_signal,
        "signal_idx": signal_idx,            # indices where beta != 0
        "confounded_mask": confounded_mask,  # columns of X with nonzero A
        "n_confounded_features": n_confounded_features,
        "beta_scale": beta_scale,
        "gamma_scale": gamma_scale,
        "z_to_x_strength": z_to_x_strength,
        "sigma_x": sigma_x,
        "sigma_y": sigma_y,
        "rho_x": rho_x,
        "random_state": random_state,
    }

    return SimulationResult(X=X, Z=Z, y=y, beta=beta, gamma=gamma, A=A, info=info)


# ---------------------------------------------------------------------
# (Optional) tiny extras that are often handy while testing
# ---------------------------------------------------------------------

def fit_ols(y: np.ndarray, X: np.ndarray, add_intercept: bool = True) -> np.ndarray:
    """
    Minimal OLS via least squares (useful sanity check; no regularization).

    Returns
    -------
    np.ndarray
        If add_intercept=True: [intercept, betas...]
        Else: betas
    """
    if add_intercept:
        X1 = np.column_stack([np.ones(X.shape[0]), X])
        coef, *_ = lstsq(X1, y, rcond=None)
        return coef
    coef, *_ = lstsq(X, y, rcond=None)
    return coef


def residualize_against_Z(M: np.ndarray, Z: np.ndarray) -> np.ndarray:
    """
    Remove linear effects of Z from M (works for vector y or matrix X).

    Computes residuals from OLS projection:
        M_res = M - Proj_{[1, Z]}(M)

    Parameters
    ----------
    M : np.ndarray
        y (1D) or X (2D) to be residualized.
    Z : np.ndarray, shape (n, q)
        Confounders to regress out (intercept is added internally).

    Returns
    -------
    np.ndarray
        Residuals with the same shape as M.
    """
    n = Z.shape[0]
    Z1 = np.column_stack([np.ones(n), Z])

    if M.ndim == 1:
        coef, *_ = lstsq(Z1, M, rcond=None)
        return M - Z1 @ coef

    if M.ndim == 2:
        coef, *_ = lstsq(Z1, M, rcond=None)  # solves for all columns at once
        return M - Z1 @ coef

    raise ValueError("M must be a 1D vector (y) or a 2D matrix (X).")
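A minimal usage sketch for the module above (not part of the packaged file); it assumes the wheel is installed so the module is importable as cccpm.simulation.simulate_multivariate:

# Usage sketch only: compare naive OLS with Z-residualized OLS on simulated data.
from cccpm.simulation.simulate_multivariate import (
    simulate_confounders, residualize_against_Z, fit_ols
)

sim = simulate_confounders(n_samples=500, n_features=20, n_confounds=3,
                           n_features_with_signal=5, n_confounded_features=5,
                           random_state=0)

# Naive OLS on the raw data picks up both the direct X->y effect and the Z path.
coef_naive = fit_ols(sim.y, sim.X)            # [intercept, betas...]

# Residualizing X and y against Z removes the linear confounding path.
X_res = residualize_against_Z(sim.X, sim.Z)
y_res = residualize_against_Z(sim.y, sim.Z)
coef_adj = fit_ols(y_res, X_res)

print(sim.info["signal_idx"])                 # columns with true nonzero beta
print(coef_naive[1:6], coef_adj[1:6])         # compare estimates on the signal features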
cccpm/simulation/simulate_sem.py
@@ -0,0 +1,319 @@
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


def _solve_rho_for_R2(r2_X_y: float, r2_X_y_given_Z: float, r2_Z_y: float,
                      tol: float = 1e-4) -> float:
    """
    Find corr(X, Z) = rho that best matches the desired R2_X_y_given_Z.
    If an exact match is impossible, return the closest achievable rho
    and issue a warning instead of raising an error.
    """
    r_x = np.sqrt(r2_X_y)
    r_z = np.sqrt(r2_Z_y)

    # Special case: Z has no effect → cannot adjust X→y
    if r2_Z_y < tol:
        if abs(r2_X_y_given_Z - r2_X_y) > tol:
            print(
                f"Warning: R2_Z_y≈0, so R2_X_y_given_Z must equal R2_X_y. "
                f"Using R2_X_y_given_Z = {r2_X_y:.4f} instead of {r2_X_y_given_Z:.4f}."
            )
        return 0.0

    # Search over allowable rho values
    rhos = np.linspace(-0.99, 0.99, 20001)
    num = (r_x - rhos * r_z) ** 2
    den = 1 - rhos**2
    vals = num / den

    # Find closest match
    idx = np.argmin(np.abs(vals - r2_X_y_given_Z))
    rho_best = float(rhos[idx])
    val_best = float(vals[idx])

    # If we cannot achieve exact target, warn and continue
    if abs(val_best - r2_X_y_given_Z) > tol:
        print(
            f"Warning: Could not achieve R2_X_y_given_Z={r2_X_y_given_Z:.4f}. "
            f"Closest possible value is {val_best:.4f}. Using that instead."
        )

    return rho_best


def simulate_data_given_R2(
    R2_X_y: float,
    R2_X_y_given_Z: float,
    R2_Z_y: float,
    n_features: int = 100,
    n_features_informative: int = 10,
    n_confounds: int = 2,
    n_samples: int = 10_000,
    rho_informative: float = 0.5,
    random_state: int | None = None,
) -> dict:
    """
    Simulate data that matches specified R² relationships between a latent
    predictor X, confounds Z, and outcome y.

    Inputs (targets)
    ----------------
    R2_X_y : float
        Naive R² from regressing y ~ X.
    R2_X_y_given_Z : float
        Unique / incremental R² from X after adjusting for all confounds Z:
        R2_X_y_given_Z = R2(y ~ X + Z) - R2(y ~ Z).
    R2_Z_y : float
        Naive R² from regressing y ~ Z (all confounds jointly).

    Data structure
    --------------
    - latent variable: true_X (standardized)
    - confounds: conf1..confK (K = n_confounds)
      * only conf1 is used to build the desired R² (others are noise)
    - features: X1..Xn_features
      * first n_features_informative are informative: correlated with true_X
      * remaining are pure noise
    - outcome: y (standardized)

    R² logic
    --------
    - Naive R²(y ~ X) ≈ R2_X_y
    - Naive R²(y ~ Z) ≈ R2_Z_y
    - Unique R² of X given Z ≈ R2_X_y_given_Z
      so:
        R2_full ≈ R2_Z_y + R2_X_y_given_Z

    Returns
    -------
    dict
        'X'      : np.ndarray, shape (n_samples, n_features), the features X1..Xn
        'Z'      : np.ndarray, shape (n_samples, n_confounds), the confounds conf1..confK
        'y'      : np.ndarray, shape (n_samples, 1), the outcome
        'true_X' : np.ndarray, shape (n_samples, 1), the latent predictor
    """
    rng = np.random.default_rng(random_state)

    # --- basic validity checks ---
    for name, val in [
        ("R2_X_y", R2_X_y),
        ("R2_X_y_given_Z", R2_X_y_given_Z),
        ("R2_Z_y", R2_Z_y),
    ]:
        if not (0.0 <= val < 1.0):
            raise ValueError(f"{name} must be in [0,1), got {val}.")

    if R2_X_y_given_Z > R2_X_y + 1e-8:
        raise ValueError(
            "R2_X_y_given_Z cannot exceed R2_X_y (unique effect cannot be "
            "larger than naive effect)."
        )
    if R2_Z_y + R2_X_y_given_Z >= 1.0:
        raise ValueError(
            "R2_Z_y + R2_X_y_given_Z must be < 1 (total R² cannot exceed 1)."
        )
    if n_features_informative > n_features:
        raise ValueError("n_features_informative cannot be greater than n_features.")
    if not (0.0 <= rho_informative < 1.0):
        raise ValueError("rho_informative must be in [0, 1).")
    if n_confounds < 1:
        raise ValueError("n_confounds must be at least 1.")

    # --- step 1: build correlation matrix for (X, Z, y) ---
    r_x = np.sqrt(R2_X_y)
    r_z = np.sqrt(R2_Z_y)

    # Solve Corr(X, Z) = rho to match desired R2_X_y_given_Z
    rho = _solve_rho_for_R2(R2_X_y, R2_X_y_given_Z, R2_Z_y)

    # correlation matrix for [X, Z, y]
    # order: [X, Z, y]
    corr = np.array(
        [
            [1.0, rho, r_x],
            [rho, 1.0, r_z],
            [r_x, r_z, 1.0],
        ]
    )

    # sanity check: positive semi-definite
    eigvals = np.linalg.eigvalsh(corr)
    if np.min(eigvals) < -1e-6:
        raise ValueError(
            "Requested R² combination leads to an invalid correlation matrix."
        )

    # --- step 2: sample (X, Z, y) as multivariate normal with this corr matrix ---
    L = np.linalg.cholesky(corr + 1e-8 * np.eye(3))  # jitter for numeric safety
    Z_samples = rng.normal(size=(n_samples, 3))
    XYZ = Z_samples @ L.T
    true_X = XYZ[:, 0]
    Z_scalar = XYZ[:, 1]
    y = XYZ[:, 2]

    # Now:
    #   Corr(true_X, y)^2 ≈ R2_X_y
    #   Corr(Z_scalar, y)^2 ≈ R2_Z_y
    # and the full regression y ~ X + Z_scalar has unique R² for X ≈ R2_X_y_given_Z.

    # --- step 3: expand scalar Z into n_confounds (only conf1 matters) ---
    conf_data = {}
    conf_data["conf1"] = Z_scalar
    for i in range(1, n_confounds):
        conf_data[f"conf{i+1}"] = rng.normal(0.0, 1.0, size=n_samples)
    conf_df = pd.DataFrame(conf_data)

    # --- step 4: generate X features ---
    # First n_features_informative are equicorrelated measures of true_X
    # For informative features with Var=1 and pairwise corr=rho_informative:
    #   Xj = sqrt(rho_inf)*true_X + sqrt(1-rho_inf)*N(0,1)
    X_data = {}
    loading = np.sqrt(rho_informative)
    resid_sd = np.sqrt(1.0 - rho_informative)

    for j in range(n_features_informative):
        eps = rng.normal(0.0, resid_sd, size=n_samples)
        X_data[f"X{j+1}"] = loading * true_X + eps

    # Remaining features are pure noise
    for j in range(n_features_informative, n_features):
        X_data[f"X{j+1}"] = rng.normal(0.0, 1.0, size=n_samples)

    X_df = pd.DataFrame(X_data)

    # --- step 5: assemble final DataFrame ---
    df = pd.concat([conf_df, X_df], axis=1)
    df["true_X"] = true_X
    df["y"] = y

    return {'X': X_df.to_numpy(), 'Z': conf_df.to_numpy(), 'y': y.reshape(-1, 1), 'true_X': true_X.reshape(-1, 1)}


def compute_r2s(sim: dict) -> dict:
    """
    Convenience function to empirically estimate the R² components from
    the simulated data:

    - r2_naive: R²(y ~ true_X)
    - r2_conf_only: R²(y ~ all confounds)
    - r2_full: R²(y ~ true_X + all confounds)
    - r2_unique_X: r2_full - r2_conf_only
    """
    y = sim["y"]
    X_naive = sim["true_X"]
    X_conf = sim["Z"]

    # Naive: y ~ true_X
    mdl_naive = LinearRegression().fit(X_naive, y)
    r2_naive = r2_score(y, mdl_naive.predict(X_naive))

    # Confounds-only: y ~ conf1..confK
    mdl_conf = LinearRegression().fit(X_conf, y)
    r2_conf_only = r2_score(y, mdl_conf.predict(X_conf))

    # Full: y ~ true_X + conf1..confK
    X_full = np.column_stack([X_naive, X_conf])
    mdl_full = LinearRegression().fit(X_full, y)
    r2_full = r2_score(y, mdl_full.predict(X_full))

    r2_unique_X = r2_full - r2_conf_only

    return {
        "r2_naive": r2_naive,
        "r2_conf_only": r2_conf_only,
        "r2_full": r2_full,
        "r2_unique_X": r2_unique_X,
    }


def generate_four_scenarios(
    n_features: int = 100,
    n_features_informative: int = 10,
    n_confounds: int = 2,
    n_samples: int = 10_000,
    rho_informative: float = 0.5,
    random_state: int | None = 123,
) -> dict[str, dict]:

    rng = np.random.default_rng(random_state)
    seeds = rng.integers(0, 2**32 - 1, size=4)

    scenarios = {}

    # 1. No confounding
    scenarios["No Confounding Effect"] = simulate_data_given_R2(
        R2_X_y=0.25,
        R2_X_y_given_Z=0.25,
        R2_Z_y=0.0,
        n_features=n_features,
        n_features_informative=n_features_informative,
        n_confounds=n_confounds,
        n_samples=n_samples,
        rho_informative=rho_informative,
        random_state=int(seeds[0]),
    )

    # 2. Weak/partial confounding
    scenarios["Moderate Confounding Effect"] = simulate_data_given_R2(
        R2_X_y=0.25,
        R2_X_y_given_Z=0.15,
        R2_Z_y=0.10,
        n_features=n_features,
        n_features_informative=n_features_informative,
        n_confounds=n_confounds,
        n_samples=n_samples,
        rho_informative=rho_informative,
        random_state=int(seeds[1]),
    )

    # 3. Full confounding
    scenarios["Strong Confounding Effect"] = simulate_data_given_R2(
        R2_X_y=0.25,
        R2_X_y_given_Z=0.05,
        R2_Z_y=0.20,
        n_features=n_features,
        n_features_informative=n_features_informative,
        n_confounds=n_confounds,
        n_samples=n_samples,
        rho_informative=rho_informative,
        random_state=int(seeds[2]),
    )

    # 4. No confounding but Z explains part of y
    scenarios["No Confounding Effect But Useful Confounds"] = simulate_data_given_R2(
        R2_X_y=0.25,
        R2_X_y_given_Z=0.25,
        R2_Z_y=0.25,
        n_features=n_features,
        n_features_informative=n_features_informative,
        n_confounds=n_confounds,
        n_samples=n_samples,
        rho_informative=rho_informative,
        random_state=int(seeds[3]),
    )
    return scenarios


# -------------------------------------------------------------------
# Example usage
# -------------------------------------------------------------------
if __name__ == "__main__":
    scenarios = generate_four_scenarios(
        n_features=100,
        n_features_informative=10,
        n_confounds=3,
        n_samples=1000,
        rho_informative=0.5,
        random_state=43,
    )

    for name, sim in scenarios.items():
        print(f"\nScenario: {name}")
        r2s = compute_r2s(sim)
        for k, v in r2s.items():
            print(f"  {k}: {v:.3f}")
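A minimal sanity-check sketch for simulate_data_given_R2 and compute_r2s (not part of the packaged file; the import path cccpm.simulation.simulate_sem is assumed from the file list above):

# Usage sketch only: the empirical R² components should land near the requested targets.
from cccpm.simulation.simulate_sem import simulate_data_given_R2, compute_r2s

sim = simulate_data_given_R2(
    R2_X_y=0.25, R2_X_y_given_Z=0.15, R2_Z_y=0.10,
    n_features=50, n_features_informative=10, n_confounds=2,
    n_samples=20_000, random_state=0,
)

print(sim["X"].shape, sim["Z"].shape, sim["y"].shape)  # (20000, 50) (20000, 2) (20000, 1)

# Expect roughly r2_naive ≈ 0.25, r2_conf_only ≈ 0.10, r2_unique_X ≈ 0.15.
print(compute_r2s(sim))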
cccpm/simulation/simulate_simple.py
@@ -0,0 +1,37 @@
import numpy as np


def simulate_confounded_data_chyzhyk(link_type='direct_link',
                                     n_samples=100, n_features=100):
    """
    Adapted from code by Darya Chyzhyk et al. (2022):
    https://github.com/darya-chyzhyk/confound_prediction/blob/master/confound_prediction/data_simulation.py

    :param link_type: str,
        Type of the link between target and confound. Options: "no_link",
        "direct_link", "weak_link"
    :param n_samples: int,
        number of samples
    :param n_features: int,
        number of features
    :return: X (n_samples, n_features), y (n_samples,), z (n_samples, 1)
    """
    np.random.seed(42)

    mu, sigma = 0, 1.0  # mean and standard deviation
    x_rand = np.random.normal(mu, sigma, [n_samples, n_features])
    y_rand = np.random.normal(mu, sigma, n_samples)
    z_rand = np.random.normal(mu, sigma, n_samples)

    if link_type == 'no_link':
        y = np.copy(y_rand)
        z = 1 * y_rand + z_rand
        X = x_rand + z.reshape(-1, 1)
    elif link_type == 'direct_link':
        y = np.copy(y_rand)
        z = y_rand + z_rand
        X = x_rand + y_rand.reshape(-1, 1) + z.reshape(-1, 1)
    elif link_type == 'weak_link':
        y = np.copy(y_rand)
        z = 0.5 * y_rand + z_rand
        X = x_rand + y_rand.reshape(-1, 1) + z.reshape(-1, 1)
    return X, y, z.reshape(-1, 1)
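A quick shape check for the Chyzhyk-style simulator (not part of the packaged file; the import path cccpm.simulation.simulate_simple is assumed):

# Usage sketch only: verify the returned array shapes.
from cccpm.simulation.simulate_simple import simulate_confounded_data_chyzhyk

X, y, z = simulate_confounded_data_chyzhyk(link_type='direct_link',
                                            n_samples=200, n_features=10)
print(X.shape, y.shape, z.shape)  # (200, 10) (200,) (200, 1)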