cccpm 0.2.1__py3-none-any.whl

@@ -0,0 +1,252 @@
+ """
+ Simulate multivariate predictors X, confounders Z, and a continuous outcome y.
+
+ Data-generating process (pure confounding, no mediation):
+     Z ~ N(0, I_q)
+     X = Z A + E
+     y = X beta + Z gamma + eps
+
+ - A (q x p) encodes how Z loads into columns of X (Z -> X).
+ - beta (p,) encodes the direct effect of X on y (X -> y). Only a subset is nonzero.
+ - gamma (q,) encodes the confounding path Z -> y.
+ - E is column-correlated noise for X with optional AR(1) structure across columns.
+ - eps is outcome noise for y.
+
+ Author: (your name)
+ """
+
+ from dataclasses import dataclass
+ from typing import Dict, Any, Optional
+
+ import numpy as np
+ from numpy.random import Generator
+ from numpy.linalg import lstsq
+ from scipy.linalg import toeplitz
+
+
+ # ---------------------------------------------------------------------
+ # Data container
+ # ---------------------------------------------------------------------
+
+ @dataclass
+ class SimulationResult:
+     """
+     Container for simulated data and the generating parameters.
+
+     Attributes
+     ----------
+     X : np.ndarray, shape (n, p)
+         Predictor matrix.
+     Z : np.ndarray, shape (n, q)
+         Confounder matrix.
+     y : np.ndarray, shape (n,)
+         Continuous outcome.
+     beta : np.ndarray, shape (p,)
+         True direct effects of X on y (nonzero on a subset).
+     gamma : np.ndarray, shape (q,)
+         Effects of Z on y (confounding strength).
+     A : np.ndarray, shape (q, p)
+         Loadings from Z into X (confounding footprint inside X).
+     info : dict
+         Metadata: indices of signal features, which X columns are confounded,
+         and all simulation hyperparameters used.
+     """
+     X: np.ndarray
+     Z: np.ndarray
+     y: np.ndarray
+     beta: np.ndarray
+     gamma: np.ndarray
+     A: np.ndarray
+     info: Dict[str, Any]
+
+
+ # ---------------------------------------------------------------------
+ # Utilities
+ # ---------------------------------------------------------------------
+
+ def ar1_cov(p: int, rho: float, sigma2: float) -> np.ndarray:
+     """
+     Construct an AR(1) covariance matrix across p variables.
+
+         Var(X_j) = sigma2
+         Corr(X_j, X_{j+k}) = rho^k
+
+     Parameters
+     ----------
+     p : int
+         Number of variables (columns).
+     rho : float
+         AR(1) correlation parameter in [-1, 1].
+     sigma2 : float
+         Marginal variance for each variable.
+
+     Returns
+     -------
+     np.ndarray, shape (p, p)
+         AR(1) covariance matrix.
+     """
+     if abs(rho) < 1e-12:
+         return np.eye(p) * sigma2
+     first_col = (rho ** np.arange(p)) * sigma2
+     return toeplitz(first_col)
+
+
+ def _rng(seed: Optional[int]) -> Generator:
+     """Create a NumPy Generator with a fixed seed (if provided)."""
+     return np.random.default_rng(seed)
+
+
98
+
99
+ # ---------------------------------------------------------------------
100
+ # Core simulator
101
+ # ---------------------------------------------------------------------
102
+
103
+ def simulate_confounders(
104
+ n_samples: int = 1000, # number of samples
105
+ n_features: int = 50, # number of predictors (columns of X)
106
+ n_confounds: int = 3, # number of confounders (columns of Z)
107
+ n_features_with_signal: int = 10, # number of X columns with nonzero beta
108
+ n_confounded_features: int = 10, # number of X columns influenced by Z (nonzero column in A)
109
+ beta_scale: float = 1.0, # SD for nonzero beta entries
110
+ gamma_scale: float = 1.0, # SD for entries of gamma (Z -> y strength)
111
+ z_to_x_strength: float = 0.7, # typical magnitude of Z -> X loadings (A)
112
+ sigma_x: float = 1.0, # SD for X-noise E
113
+ sigma_y: float = 1.0, # SD for outcome noise eps
114
+ rho_x: float = 0.0, # AR(1) correlation among columns of X-noise E
115
+ random_state: Optional[int] = 42 # RNG seed for reproducibility
116
+ ) -> SimulationResult:
117
+ """
118
+ Simulate (X, Z, y) with pure confounding (Z affects both X and y).
119
+
120
+ Model
121
+ -----
122
+ Z ~ N(0, I_q)
123
+ X = Z A + E, E ~ N(0, Sigma_E) with AR(1) columns
124
+ y = X beta + Z gamma + eps, eps ~ N(0, sigma_y^2 I_n)
125
+
126
+ Notes
127
+ -----
128
+ - Confounding arises when BOTH A != 0 (Z -> X) and gamma != 0 (Z -> y).
129
+ - If gamma_scale=0, there is no confounding even if Z strongly influences X.
130
+ - Only a subset of beta entries is nonzero (true signal features).
131
+
132
+ Returns
133
+ -------
134
+ SimulationResult
135
+ Data and parameters used to generate it, including indices of signal
136
+ features and which X columns are confounded.
137
+ """
138
+ rng = _rng(random_state)
139
+
140
+ # 1) Confounders: Z ~ N(0, I_q)
141
+ Z = rng.normal(0.0, 1.0, size=(n_samples, n_confounds))
142
+
143
+ # 2) Choose signal features and betas
144
+ if not (1 <= n_features_with_signal <= n_features):
145
+ raise ValueError(f"`num_signal` must be in [1, p]; got {n_features_with_signal} (p={n_features}).")
146
+ k = n_features_with_signal
147
+
148
+ signal_idx = np.arange(k) # 0..k-1
149
+ beta = np.zeros(n_features)
150
+ beta[signal_idx] = rng.normal(0.0, beta_scale, size=k)
151
+
152
+ # 3) Choose confounded features
153
+ confounded_mask = np.zeros(n_features, dtype=bool)
154
+ confounded_mask[:n_confounded_features] = True # first m columns are confounded
155
+
156
+ A = np.zeros((n_confounds, n_features))
157
+ if confounded_mask.any():
158
+ # Use scale / sqrt(q) so variance contribution is stable w.r.t. number of confounders
159
+ A[:, confounded_mask] = rng.normal(
160
+ loc=0.0,
161
+ scale=z_to_x_strength / np.sqrt(n_confounds),
162
+ size=(n_confounds, int(confounded_mask.sum()))
163
+ )
164
+
165
+ # 4) Column-correlated noise for X (optional AR(1) across columns)
166
+ Sigma_E = ar1_cov(p=n_features, rho=rho_x, sigma2=sigma_x ** 2)
167
+ E = rng.multivariate_normal(mean=np.zeros(n_features), cov=Sigma_E, size=n_samples)
168
+
169
+ # 5) Build X
170
+ # Each column X_j is a linear combination of Z (via A) plus noise E_j.
171
+ X = Z @ A + E
172
+
173
+ # 6) Outcome y: direct X->y via beta, plus confounding Z->y via gamma
174
+ gamma = rng.normal(loc=0.0, scale=gamma_scale, size=n_confounds)
175
+ eps_y = rng.normal(loc=0.0, scale=sigma_y, size=n_samples)
176
+ y = X @ beta + Z @ gamma + eps_y
177
+
178
+ # 7) Metadata for inspection and downstream benchmarking
179
+ info: Dict[str, Any] = {
180
+ "n_samples": n_samples,
181
+ "n_features": n_features,
182
+ "n_confounds": n_confounds,
183
+ "n_features_with_signal": n_features_with_signal,
184
+ "signal_idx": signal_idx, # indices where beta != 0
185
+ "confounded_mask": confounded_mask, # columns of X with nonzero A
186
+ "n_confounded_features": n_confounded_features,
187
+ "beta_scale": beta_scale,
188
+ "gamma_scale": gamma_scale,
189
+ "z_to_x_strength": z_to_x_strength,
190
+ "sigma_x": sigma_x,
191
+ "sigma_y": sigma_y,
192
+ "rho_x": rho_x,
193
+ "random_state": random_state,
194
+ }
195
+
196
+ return SimulationResult(X=X, Z=Z, y=y, beta=beta, gamma=gamma, A=A, info=info)
197
+
198
+
+ # ---------------------------------------------------------------------
+ # (Optional) tiny extras that are often handy while testing
+ # ---------------------------------------------------------------------
+
+ def fit_ols(y: np.ndarray, X: np.ndarray, add_intercept: bool = True) -> np.ndarray:
+     """
+     Minimal OLS via least squares (useful sanity check; no regularization).
+
+     Returns
+     -------
+     np.ndarray
+         If add_intercept=True: [intercept, betas...]
+         Else: betas
+     """
+     if add_intercept:
+         X1 = np.column_stack([np.ones(X.shape[0]), X])
+         coef, *_ = lstsq(X1, y, rcond=None)
+         return coef
+     coef, *_ = lstsq(X, y, rcond=None)
+     return coef
+
+
+ def residualize_against_Z(M: np.ndarray, Z: np.ndarray) -> np.ndarray:
+     """
+     Remove linear effects of Z from M (works for vector y or matrix X).
+
+     Computes residuals from OLS projection:
+         M_res = M - Proj_{[1, Z]}(M)
+
+     Parameters
+     ----------
+     M : np.ndarray
+         y (1D) or X (2D) to be residualized.
+     Z : np.ndarray, shape (n, q)
+         Confounders to regress out (intercept is added internally).
+
+     Returns
+     -------
+     np.ndarray
+         Residuals with the same shape as M.
+     """
+     if M.ndim not in (1, 2):
+         raise ValueError("M must be a 1D vector (y) or a 2D matrix (X).")
+
+     n = Z.shape[0]
+     Z1 = np.column_stack([np.ones(n), Z])
+
+     # lstsq solves for all columns of M at once when M is 2D
+     coef, *_ = lstsq(Z1, M, rcond=None)
+     return M - Z1 @ coef
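+
+
+ # ---------------------------------------------------------------------
+ # Example usage (a minimal sanity-check sketch, not part of the original
+ # module): under confounding, naive OLS on (X, y) is biased, while OLS
+ # after residualizing X and y against Z recovers beta (Frisch-Waugh-Lovell).
+ # ---------------------------------------------------------------------
+
+ if __name__ == "__main__":
+     sim = simulate_confounders(n_samples=2000, n_features=20, n_confounds=3,
+                                n_features_with_signal=5,
+                                n_confounded_features=5, random_state=0)
+     naive = fit_ols(sim.y, sim.X, add_intercept=True)[1:]  # drop intercept
+     adjusted = fit_ols(residualize_against_Z(sim.y, sim.Z),
+                        residualize_against_Z(sim.X, sim.Z),
+                        add_intercept=False)
+     print(f"max |beta_hat - beta|, naive:    {np.max(np.abs(naive - sim.beta)):.3f}")
+     print(f"max |beta_hat - beta|, adjusted: {np.max(np.abs(adjusted - sim.beta)):.3f}")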
@@ -0,0 +1,319 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import r2_score
+
+
+ def _solve_rho_for_R2(r2_X_y: float, r2_X_y_given_Z: float, r2_Z_y: float,
+                       tol: float = 1e-4) -> float:
+     """
+     Find corr(X, Z) = rho that best matches the desired R2_X_y_given_Z.
+     If an exact match is impossible, return the closest achievable rho
+     and issue a warning instead of raising an error.
+     """
+     r_x = np.sqrt(r2_X_y)
+     r_z = np.sqrt(r2_Z_y)
+
+     # Special case: Z has no effect on y, so adjusting for Z cannot change R2(X, y)
+     if r2_Z_y < tol:
+         if abs(r2_X_y_given_Z - r2_X_y) > tol:
+             print(
+                 f"Warning: R2_Z_y≈0, so R2_X_y_given_Z must equal R2_X_y. "
+                 f"Using R2_X_y_given_Z = {r2_X_y:.4f} instead of {r2_X_y_given_Z:.4f}."
+             )
+         return 0.0
+
+     # For standardized (X, Z, y), the incremental R2 of X given Z is the
+     # squared semipartial correlation:
+     #     R2_X_y_given_Z = (r_x - rho * r_z)^2 / (1 - rho^2)
+     # Grid-search rho for the closest match to the requested value.
+     rhos = np.linspace(-0.99, 0.99, 20001)
+     num = (r_x - rhos * r_z) ** 2
+     den = 1 - rhos ** 2
+     vals = num / den
+
+     # Find closest match
+     idx = np.argmin(np.abs(vals - r2_X_y_given_Z))
+     rho_best = float(rhos[idx])
+     val_best = float(vals[idx])
+
+     # If we cannot achieve the exact target, warn and continue
+     if abs(val_best - r2_X_y_given_Z) > tol:
+         print(
+             f"Warning: Could not achieve R2_X_y_given_Z={r2_X_y_given_Z:.4f}. "
+             f"Closest possible value is {val_best:.4f}. Using that instead."
+         )
+
+     return rho_best
+
+
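+ # Worked example (illustrative): with R2_X_y = 0.25 and R2_Z_y = 0.25,
+ # requesting R2_X_y_given_Z = 0.25 is solved exactly by rho = 0:
+ #     (0.5 - 0 * 0.5)^2 / (1 - 0^2) = 0.25,
+ # i.e. X and Z are uncorrelated, so adjusting for Z changes nothing
+ # (the "No Confounding Effect But Useful Confounds" scenario below).
+
+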
+ def simulate_data_given_R2(
+     R2_X_y: float,
+     R2_X_y_given_Z: float,
+     R2_Z_y: float,
+     n_features: int = 100,
+     n_features_informative: int = 10,
+     n_confounds: int = 2,
+     n_samples: int = 10_000,
+     rho_informative: float = 0.5,
+     random_state: int | None = None,
+ ) -> dict[str, np.ndarray]:
+     """
+     Simulate data that matches specified R² relationships between a latent
+     predictor X, confounds Z, and outcome y.
+
+     Inputs (targets)
+     ----------------
+     R2_X_y : float
+         Naive R² from regressing y ~ X.
+     R2_X_y_given_Z : float
+         Unique / incremental R² from X after adjusting for all confounds Z:
+         R2_X_y_given_Z = R2(y ~ X + Z) - R2(y ~ Z).
+     R2_Z_y : float
+         Naive R² from regressing y ~ Z (all confounds jointly).
+
+     Data structure
+     --------------
+     - latent variable: true_X (standardized)
+     - confounds: conf1..confK (K = n_confounds)
+       * only conf1 is used to build the desired R² (the others are noise)
+     - features: X1..Xn_features
+       * the first n_features_informative are informative: correlated with true_X
+       * the remainder are pure noise
+     - outcome: y (standardized)
+
+     R² logic
+     --------
+     - Naive R²(y ~ X) ≈ R2_X_y
+     - Naive R²(y ~ Z) ≈ R2_Z_y
+     - Unique R² of X given Z ≈ R2_X_y_given_Z
+     so:
+         R2_full ≈ R2_Z_y + R2_X_y_given_Z
+
+     Returns
+     -------
+     dict with keys:
+         'X'      : np.ndarray, shape (n_samples, n_features)
+         'Z'      : np.ndarray, shape (n_samples, n_confounds)
+         'y'      : np.ndarray, shape (n_samples, 1)
+         'true_X' : np.ndarray, shape (n_samples, 1)
+     """
+     rng = np.random.default_rng(random_state)
+
+     # --- basic validity checks ---
+     for name, val in [
+         ("R2_X_y", R2_X_y),
+         ("R2_X_y_given_Z", R2_X_y_given_Z),
+         ("R2_Z_y", R2_Z_y),
+     ]:
+         if not (0.0 <= val < 1.0):
+             raise ValueError(f"{name} must be in [0, 1), got {val}.")
+
+     if R2_X_y_given_Z > R2_X_y + 1e-8:
+         raise ValueError(
+             "R2_X_y_given_Z cannot exceed R2_X_y (unique effect cannot be "
+             "larger than naive effect)."
+         )
+     if R2_Z_y + R2_X_y_given_Z >= 1.0:
+         raise ValueError(
+             "R2_Z_y + R2_X_y_given_Z must be < 1 (total R² cannot exceed 1)."
+         )
+     if n_features_informative > n_features:
+         raise ValueError("n_features_informative cannot be greater than n_features.")
+     if not (0.0 <= rho_informative < 1.0):
+         raise ValueError("rho_informative must be in [0, 1).")
+     if n_confounds < 1:
+         raise ValueError("n_confounds must be at least 1.")
+
+     # --- step 1: build correlation matrix for (X, Z, y) ---
+     r_x = np.sqrt(R2_X_y)
+     r_z = np.sqrt(R2_Z_y)
+
+     # Solve Corr(X, Z) = rho to match the desired R2_X_y_given_Z
+     rho = _solve_rho_for_R2(R2_X_y, R2_X_y_given_Z, R2_Z_y)
+
+     # correlation matrix, order: [X, Z, y]
+     corr = np.array(
+         [
+             [1.0, rho, r_x],
+             [rho, 1.0, r_z],
+             [r_x, r_z, 1.0],
+         ]
+     )
+
+     # sanity check: positive semi-definite
+     eigvals = np.linalg.eigvalsh(corr)
+     if np.min(eigvals) < -1e-6:
+         raise ValueError(
+             "Requested R² combination leads to an invalid correlation matrix."
+         )
+
+     # --- step 2: sample (X, Z, y) as multivariate normal with this corr matrix ---
+     L = np.linalg.cholesky(corr + 1e-8 * np.eye(3))  # jitter for numeric safety
+     base = rng.normal(size=(n_samples, 3))
+     XZY = base @ L.T
+     true_X = XZY[:, 0]
+     Z_scalar = XZY[:, 1]
+     y = XZY[:, 2]
+
+     # Now:
+     #   Corr(true_X, y)^2 ≈ R2_X_y
+     #   Corr(Z_scalar, y)^2 ≈ R2_Z_y
+     # and the full regression y ~ X + Z_scalar has unique R² for X ≈ R2_X_y_given_Z.
+
+     # --- step 3: expand scalar Z into n_confounds (only conf1 matters) ---
+     conf_data = {"conf1": Z_scalar}
+     for i in range(1, n_confounds):
+         conf_data[f"conf{i + 1}"] = rng.normal(0.0, 1.0, size=n_samples)
+     conf_df = pd.DataFrame(conf_data)
+
+     # --- step 4: generate X features ---
+     # The first n_features_informative are equicorrelated measures of true_X.
+     # For informative features with Var = 1 and pairwise corr = rho_informative:
+     #     Xj = sqrt(rho_inf) * true_X + sqrt(1 - rho_inf) * N(0, 1)
+     X_data = {}
+     loading = np.sqrt(rho_informative)
+     resid_sd = np.sqrt(1.0 - rho_informative)
+
+     for j in range(n_features_informative):
+         eps = rng.normal(0.0, resid_sd, size=n_samples)
+         X_data[f"X{j + 1}"] = loading * true_X + eps
+
+     # Remaining features are pure noise
+     for j in range(n_features_informative, n_features):
+         X_data[f"X{j + 1}"] = rng.normal(0.0, 1.0, size=n_samples)
+
+     X_df = pd.DataFrame(X_data)
+
+     # --- step 5: assemble output arrays ---
+     return {
+         "X": X_df.to_numpy(),
+         "Z": conf_df.to_numpy(),
+         "y": y.reshape(-1, 1),
+         "true_X": true_X.reshape(-1, 1),
+     }
+
+
+ def compute_r2s(sim: dict) -> dict:
+     """
+     Convenience function to empirically estimate the R² components from
+     the simulated data:
+
+     - r2_naive:     R²(y ~ true_X)
+     - r2_conf_only: R²(y ~ all confounds)
+     - r2_full:      R²(y ~ true_X + all confounds)
+     - r2_unique_X:  r2_full - r2_conf_only
+     """
+     y = sim["y"]
+     X_naive = sim["true_X"]
+     X_conf = sim["Z"]
+
+     # Naive: y ~ true_X
+     mdl_naive = LinearRegression().fit(X_naive, y)
+     r2_naive = r2_score(y, mdl_naive.predict(X_naive))
+
+     # Confounds-only: y ~ conf1..confK
+     mdl_conf = LinearRegression().fit(X_conf, y)
+     r2_conf_only = r2_score(y, mdl_conf.predict(X_conf))
+
+     # Full: y ~ true_X + conf1..confK
+     X_full = np.column_stack([X_naive, X_conf])
+     mdl_full = LinearRegression().fit(X_full, y)
+     r2_full = r2_score(y, mdl_full.predict(X_full))
+
+     r2_unique_X = r2_full - r2_conf_only
+
+     return {
+         "r2_naive": r2_naive,
+         "r2_conf_only": r2_conf_only,
+         "r2_full": r2_full,
+         "r2_unique_X": r2_unique_X,
+     }
+
+
+ def generate_four_scenarios(
235
+ n_features: int = 100,
236
+ n_features_informative: int = 10,
237
+ n_confounds: int = 2,
238
+ n_samples: int = 10_000,
239
+ rho_informative: float = 0.5,
240
+ random_state: int | None = 123,
241
+ ) -> dict[str, pd.DataFrame]:
242
+
243
+ rng = np.random.default_rng(random_state)
244
+ seeds = rng.integers(0, 2**32 - 1, size=4)
245
+
246
+ scenarios = {}
247
+
248
+ # 1. No confounding
249
+ scenarios["No Confounding Effect"] = simulate_data_given_R2(
250
+ R2_X_y=0.25,
251
+ R2_X_y_given_Z=0.25,
252
+ R2_Z_y=0.0,
253
+ n_features=n_features,
254
+ n_features_informative=n_features_informative,
255
+ n_confounds=n_confounds,
256
+ n_samples=n_samples,
257
+ rho_informative=rho_informative,
258
+ random_state=int(seeds[0]),
259
+ )
260
+
261
+ # 2. Weak/partial confounding
262
+ scenarios["Moderate Confounding Effect"] = simulate_data_given_R2(
263
+ R2_X_y=0.25,
264
+ R2_X_y_given_Z=0.15,
265
+ R2_Z_y=0.10,
266
+ n_features=n_features,
267
+ n_features_informative=n_features_informative,
268
+ n_confounds=n_confounds,
269
+ n_samples=n_samples,
270
+ rho_informative=rho_informative,
271
+ random_state=int(seeds[1]),
272
+ )
273
+
274
+ # 3. Full confounding
275
+ scenarios["Strong Confounding Effect"] = simulate_data_given_R2(
276
+ R2_X_y=0.25,
277
+ R2_X_y_given_Z=0.05,
278
+ R2_Z_y=0.20,
279
+ n_features=n_features,
280
+ n_features_informative=n_features_informative,
281
+ n_confounds=n_confounds,
282
+ n_samples=n_samples,
283
+ rho_informative=rho_informative,
284
+ random_state=int(seeds[2]),
285
+ )
286
+
287
+ # 4. No confounding but Z explains part of y
288
+ scenarios["No Confounding Effect But Useful Confounds"] = simulate_data_given_R2(
289
+ R2_X_y=0.25,
290
+ R2_X_y_given_Z=0.25,
291
+ R2_Z_y=0.25,
292
+ n_features=n_features,
293
+ n_features_informative=n_features_informative,
294
+ n_confounds=n_confounds,
295
+ n_samples=n_samples,
296
+ rho_informative=rho_informative,
297
+ random_state=int(seeds[3]),
298
+ )
299
+ return scenarios
300
+
301
+
302
+ # -------------------------------------------------------------------
303
+ # Example usage
304
+ # -------------------------------------------------------------------
305
+ if __name__ == "__main__":
306
+ scenarios = generate_four_scenarios(
307
+ n_features=100,
308
+ n_features_informative=10,
309
+ n_confounds=3,
310
+ n_samples=1000,
311
+ rho_informative=0.5,
312
+ random_state=43,
313
+ )
314
+
315
+ for name, df in scenarios.items():
316
+ print(f"\nScenario: {name}")
317
+ r2s = compute_r2s(df)
318
+ for k, v in r2s.items():
319
+ print(f" {k}: {v:.3f}")
@@ -0,0 +1,37 @@
+ import numpy as np
+
+
+ def simulate_confounded_data_chyzhyk(link_type='direct_link',
+                                      n_samples=100, n_features=100):
+     """
+     Simulation code by Darya Chyzhyk et al. (2022):
+     https://github.com/darya-chyzhyk/confound_prediction/blob/master/confound_prediction/data_simulation.py
+
+     :param link_type: str,
+         Type of the link between target and confound. Options: "no_link",
+         "direct_link", "weak_link"
+     :param n_samples: int,
+         number of samples
+     :param n_features: int,
+         number of features
+     :return: (X, y, z) arrays of shape (n_samples, n_features),
+         (n_samples,) and (n_samples, 1), respectively
+     """
+     np.random.seed(42)
+
+     mu, sigma = 0, 1.0  # mean and standard deviation
+     x_rand = np.random.normal(mu, sigma, [n_samples, n_features])
+     y_rand = np.random.normal(mu, sigma, n_samples)
+     z_rand = np.random.normal(mu, sigma, n_samples)
+
+     if link_type == 'no_link':
+         y = np.copy(y_rand)
+         z = 1 * y_rand + z_rand
+         X = x_rand + z.reshape(-1, 1)
+     elif link_type == 'direct_link':
+         y = np.copy(y_rand)
+         z = y_rand + z_rand
+         X = x_rand + y_rand.reshape(-1, 1) + z.reshape(-1, 1)
+     elif link_type == 'weak_link':
+         y = np.copy(y_rand)
+         z = 0.5 * y_rand + z_rand
+         X = x_rand + y_rand.reshape(-1, 1) + z.reshape(-1, 1)
+     else:
+         raise ValueError(f"Unknown link_type: {link_type!r}")
+     return X, y, z.reshape(-1, 1)
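+
+
+ # Minimal usage sketch (an illustrative addition, not part of the original
+ # Chyzhyk et al. code): print the empirical y-z correlation per link type.
+ if __name__ == "__main__":
+     for link in ("no_link", "direct_link", "weak_link"):
+         _, y, z = simulate_confounded_data_chyzhyk(link_type=link)
+         r_yz = np.corrcoef(y, z.ravel())[0, 1]
+         print(f"{link}: corr(y, z) = {r_yz:.2f}")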