scdesigner 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. scdesigner/base/__init__.py +8 -0
  2. scdesigner/base/copula.py +416 -0
  3. scdesigner/base/marginal.py +391 -0
  4. scdesigner/base/simulator.py +59 -0
  5. scdesigner/copulas/__init__.py +8 -0
  6. scdesigner/copulas/standard_copula.py +645 -0
  7. scdesigner/datasets/__init__.py +5 -0
  8. scdesigner/datasets/pancreas.py +39 -0
  9. scdesigner/distributions/__init__.py +19 -0
  10. scdesigner/{minimal → distributions}/bernoulli.py +42 -14
  11. scdesigner/distributions/gaussian.py +114 -0
  12. scdesigner/distributions/negbin.py +121 -0
  13. scdesigner/distributions/negbin_irls.py +72 -0
  14. scdesigner/distributions/negbin_irls_funs.py +456 -0
  15. scdesigner/distributions/poisson.py +88 -0
  16. scdesigner/{minimal → distributions}/zero_inflated_negbin.py +39 -10
  17. scdesigner/distributions/zero_inflated_poisson.py +103 -0
  18. scdesigner/simulators/__init__.py +24 -28
  19. scdesigner/simulators/composite.py +239 -0
  20. scdesigner/simulators/positive_nonnegative_matrix_factorization.py +477 -0
  21. scdesigner/simulators/scd3.py +486 -0
  22. scdesigner/transform/__init__.py +8 -6
  23. scdesigner/{minimal → transform}/transform.py +1 -1
  24. scdesigner/{minimal → utils}/kwargs.py +4 -1
  25. {scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/METADATA +1 -1
  26. scdesigner-0.0.10.dist-info/RECORD +28 -0
  27. {scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/WHEEL +1 -1
  28. scdesigner/data/__init__.py +0 -16
  29. scdesigner/data/formula.py +0 -137
  30. scdesigner/data/group.py +0 -123
  31. scdesigner/data/sparse.py +0 -39
  32. scdesigner/diagnose/__init__.py +0 -65
  33. scdesigner/diagnose/aic_bic.py +0 -119
  34. scdesigner/diagnose/plot.py +0 -242
  35. scdesigner/estimators/__init__.py +0 -32
  36. scdesigner/estimators/bernoulli.py +0 -85
  37. scdesigner/estimators/gaussian.py +0 -121
  38. scdesigner/estimators/gaussian_copula_factory.py +0 -367
  39. scdesigner/estimators/glm_factory.py +0 -75
  40. scdesigner/estimators/negbin.py +0 -153
  41. scdesigner/estimators/pnmf.py +0 -160
  42. scdesigner/estimators/poisson.py +0 -124
  43. scdesigner/estimators/zero_inflated_negbin.py +0 -195
  44. scdesigner/estimators/zero_inflated_poisson.py +0 -85
  45. scdesigner/format/__init__.py +0 -4
  46. scdesigner/format/format.py +0 -20
  47. scdesigner/format/print.py +0 -30
  48. scdesigner/minimal/__init__.py +0 -17
  49. scdesigner/minimal/composite.py +0 -119
  50. scdesigner/minimal/copula.py +0 -205
  51. scdesigner/minimal/formula.py +0 -23
  52. scdesigner/minimal/gaussian.py +0 -65
  53. scdesigner/minimal/loader.py +0 -211
  54. scdesigner/minimal/marginal.py +0 -154
  55. scdesigner/minimal/negbin.py +0 -73
  56. scdesigner/minimal/positive_nonnegative_matrix_factorization.py +0 -231
  57. scdesigner/minimal/scd3.py +0 -96
  58. scdesigner/minimal/scd3_instances.py +0 -50
  59. scdesigner/minimal/simulator.py +0 -25
  60. scdesigner/minimal/standard_copula.py +0 -383
  61. scdesigner/predictors/__init__.py +0 -15
  62. scdesigner/predictors/bernoulli.py +0 -9
  63. scdesigner/predictors/gaussian.py +0 -16
  64. scdesigner/predictors/negbin.py +0 -17
  65. scdesigner/predictors/poisson.py +0 -12
  66. scdesigner/predictors/zero_inflated_negbin.py +0 -18
  67. scdesigner/predictors/zero_inflated_poisson.py +0 -18
  68. scdesigner/samplers/__init__.py +0 -23
  69. scdesigner/samplers/bernoulli.py +0 -27
  70. scdesigner/samplers/gaussian.py +0 -25
  71. scdesigner/samplers/glm_factory.py +0 -103
  72. scdesigner/samplers/negbin.py +0 -25
  73. scdesigner/samplers/poisson.py +0 -25
  74. scdesigner/samplers/zero_inflated_negbin.py +0 -40
  75. scdesigner/samplers/zero_inflated_poisson.py +0 -16
  76. scdesigner/simulators/composite_regressor.py +0 -72
  77. scdesigner/simulators/glm_simulator.py +0 -167
  78. scdesigner/simulators/pnmf_regression.py +0 -61
  79. scdesigner/transform/amplify.py +0 -14
  80. scdesigner/transform/mask.py +0 -33
  81. scdesigner/transform/nullify.py +0 -25
  82. scdesigner/transform/split.py +0 -23
  83. scdesigner/transform/substitute.py +0 -14
  84. scdesigner-0.0.5.dist-info/RECORD +0 -66
@@ -1,383 +0,0 @@
1
- from .copula import Copula
2
- from .formula import standardize_formula
3
- from .kwargs import DEFAULT_ALLOWED_KWARGS, _filter_kwargs
4
- from anndata import AnnData
5
- from scipy.stats import norm, multivariate_normal
6
- from tqdm import tqdm
7
- from typing import Dict, Union, Callable, Tuple
8
- import numpy as np
9
- import torch
10
- from .copula import CovarianceStructure
11
- import warnings
12
-
13
- class StandardCopula(Copula):
14
- """Standard Gaussian Copula Model"""
15
def __init__(self, formula: str = "~ 1"):
    """Create a Gaussian copula whose covariance may be estimated per group.

    Args:
        formula (str, optional): Copula formula. It is normalized by
            ``standardize_formula`` with ``group`` as the only allowed key
            before being handed to the parent constructor. Defaults to "~ 1".
    """
    super().__init__(standardize_formula(formula, allowed_keys=['group']))
    # Group labels are filled in later by setup_data().
    self.groups = None
24
-
25
-
26
def setup_data(self, adata: AnnData, marginal_formula: Dict[str, str], **kwargs):
    """Prepare the data loader and the group bookkeeping used by the copula.

    Args:
        adata (AnnData): The AnnData object containing the data.
        marginal_formula (Dict[str, str]): The formula for the marginal model.
        **kwargs: Only the keys listed in ``DEFAULT_ALLOWED_KWARGS['data']``
            are forwarded to the parent ``setup_data``.

    Raises:
        ValueError: If the "group" matrix of the first loader batch contains
            values other than 0 and 1.
    """
    super().setup_data(
        adata,
        marginal_formula,
        **_filter_kwargs(kwargs, DEFAULT_ALLOWED_KWARGS['data']),
    )
    _unused_y, covariates = next(iter(self.loader))
    group_matrix = covariates.get("group")

    # Group names, their count, and the column each one occupies.
    self.groups = self.loader.dataset.predictor_names["group"]
    self.n_groups = len(self.groups)
    self.group_col = dict(zip(self.groups, range(self.n_groups)))

    # Only the first batch is inspected, and only when a group matrix exists.
    if group_matrix is None:
        return
    observed = torch.unique(group_matrix)
    if torch.any((observed != 0) & (observed != 1)).item():
        raise ValueError("Only categorical groups are currently supported in copula covariance estimation.")
50
-
51
def fit(self, uniformizer: Callable, **kwargs):
    """Fit the copula covariance model.

    Args:
        uniformizer (Callable): Function to convert data to uniform distribution.
        **kwargs: Additional arguments.
            top_k (int, optional): Use only the top-k most expressed genes
                for full covariance estimation; the remaining genes only get
                a variance. If None, the full covariance is estimated.

    Returns:
        None: Stores the fitted parameters in ``self.parameters`` as a dict
        of CovarianceStructure objects.

    Raises:
        ValueError: If top_k is not a positive integer or exceeds n_outcomes
            (raised by ``_validate_parameters``).
    """
    # Single source of truth for the top_k checks.
    top_k = self._validate_parameters(**kwargs)
    if top_k is None:
        self.parameters = self._compute_full_covariance(uniformizer)
        return

    # Rank genes by total expression; the k largest get a full covariance.
    gene_total_expression = np.asarray(self.adata.X.sum(axis=0)).ravel()
    sorted_indices = np.argsort(gene_total_expression)
    self.parameters = self._compute_block_covariance(
        uniformizer, sorted_indices[-top_k:], sorted_indices[:-top_k], top_k
    )
85
-
86
def pseudo_obs(self, x_dict: Dict):
    """Sample pseudo-observations for every row of the group matrix.

    Args:
        x_dict (Dict): Covariate dict; its "group" entry is a one-hot
            membership tensor with one column per entry of ``self.groups``.

    Returns:
        np.ndarray: Array of shape (n_rows, n_outcomes). Rows belonging to a
        group without fitted parameters stay at zero.
    """
    memberships = x_dict.get("group").cpu().numpy()
    # Compare the group's column against 1. The comparison must sit outside
    # the index, otherwise a Python bool is used as the column index.
    group_ix = {
        g: np.where(memberships[:, self.group_col[g]] == 1)[0]
        for g in self.groups
    }

    u = np.zeros((len(memberships), self.n_outcomes))

    # Sample each group's block of rows from its own covariance structure.
    for group, cov_struct in self.parameters.items():
        rows = group_ix[group]
        if cov_struct.remaining_var is not None:
            u[rows] = self._fast_normal_pseudo_obs(len(rows), cov_struct)
        else:
            u[rows] = self._normal_pseudo_obs(len(rows), cov_struct)
    return u
105
-
106
def likelihood(self, uniformizer: Callable, batch: Tuple[torch.Tensor, Dict[str, torch.Tensor]]):
    """Compute the copula log-likelihood of each row in a batch.

    Args:
        uniformizer (Callable): Function to convert expression data to
            uniform distribution; called as ``uniformizer(y, x_dict)``.
        batch (Tuple[torch.Tensor, Dict[str, torch.Tensor]]): ``(Y, X_dict)``
            where ``X_dict["group"]`` is the one-hot group membership tensor.

    Returns:
        np.ndarray: Log-likelihood for each row, shape (n_rows,). Rows whose
        group has no fitted parameters keep the value 0.
    """
    y, x_dict = batch
    z = norm.ppf(uniformizer(y, x_dict))

    # A bare covariance structure is treated as belonging to the first group.
    parameters = self.parameters
    if not isinstance(parameters, dict):
        parameters = {self.groups[0]: parameters}

    # .cpu() keeps this consistent with pseudo_obs and the accumulators.
    memberships = x_dict.get("group").cpu().numpy()
    group_ix = {
        g: np.where(memberships[:, self.group_col[g]] == 1)[0]
        for g in self.groups
    }

    ll = np.zeros(len(z))
    for group, cov_struct in parameters.items():
        ix = group_ix[group]
        if len(ix) == 0:
            continue
        z_group = z[ix]
        ll_group = multivariate_normal.logpdf(
            z_group[:, cov_struct.modeled_indices],
            np.zeros(cov_struct.num_modeled_genes),
            cov_struct.cov.values,
        )
        if cov_struct.num_remaining_genes > 0:
            # Independent genes: add their log-densities up per row.
            ll_group = ll_group + norm.logpdf(
                z_group[:, cov_struct.remaining_indices],
                loc=0,
                scale=np.sqrt(cov_struct.remaining_var.values),
            ).sum(axis=1)
        ll[ix] = ll_group
    return ll
153
-
154
def num_params(self, **kwargs):
    """Count the off-diagonal covariance parameters over all fitted groups.

    Each fitted group contributes k * (k - 1) / 2, where k is its
    ``num_modeled_genes``. Groups listed in ``self.groups`` but absent from
    ``self.parameters`` (the covariance builders skip groups without
    observations) contribute nothing instead of raising KeyError.

    Returns:
        float: Total parameter count (0 when no group is fitted).
    """
    S = self.parameters
    per_group = [
        (S[g].num_modeled_genes * (S[g].num_modeled_genes - 1)) / 2
        for g in self.groups
        if g in S
    ]
    return sum(per_group)
158
-
159
def _validate_parameters(self, **kwargs):
    """Check the optional ``top_k`` keyword and return it.

    Returns:
        The ``top_k`` value, or None when it was not supplied.

    Raises:
        ValueError: If ``top_k`` is not an int, is not positive, or is
            larger than ``self.n_outcomes``.
    """
    requested = kwargs.get("top_k", None)
    if requested is None:
        return requested
    if not isinstance(requested, int):
        raise ValueError("top_k must be an integer")
    if requested <= 0:
        raise ValueError("top_k must be positive")
    if requested > self.n_outcomes:
        raise ValueError(f"top_k ({requested}) cannot exceed number of outcomes ({self.n_outcomes})")
    return requested
169
-
170
-
171
-
172
def _accumulate_top_k_stats(self, uniformizer:Callable, top_k_idx, rem_idx, top_k) \
        -> Tuple[Dict[Union[str, int], np.ndarray],
                 Dict[Union[str, int], np.ndarray],
                 Dict[Union[str, int], np.ndarray],
                 Dict[Union[str, int], np.ndarray],
                 Dict[Union[str, int], int]]:
    """Stream over the loader and collect per-group moments for top-k estimation.

    Args:
        uniformizer (Callable): Function to convert to uniform distribution
        top_k_idx (np.ndarray): Indices of the top-k genes
        rem_idx (np.ndarray): Indices of the remaining genes
        top_k (int): Number of top-k genes

    Returns:
        top_k_sums (dict): Per-group column sums of the top-k genes
        top_k_second_moments (dict): Per-group (top_k, top_k) cross-product sums
        rem_sums (dict): Per-group column sums of the remaining genes
        rem_second_moments (dict): Per-group sums of squares of the remaining genes
        Ng (dict): Number of observations for each group
    """
    n_remaining = self.n_outcomes - top_k
    top_k_sums, top_k_second_moments = {}, {}
    rem_sums, rem_second_moments, Ng = {}, {}, {}
    for g in self.groups:
        top_k_sums[g] = np.zeros(top_k)
        top_k_second_moments[g] = np.zeros((top_k, top_k))
        rem_sums[g] = np.zeros(n_remaining)
        rem_second_moments[g] = np.zeros(n_remaining)
        Ng[g] = 0

    for y, x_dict in tqdm(self.loader, desc="Estimating top-k copula covariance"):
        memberships = x_dict.get("group").cpu().numpy()
        # Map the batch to normal scores via the uniformizer.
        z = norm.ppf(uniformizer(y, x_dict))

        for g in self.groups:
            in_group = memberships[:, self.group_col[g]] == 1
            if np.any(in_group):
                z_group = z[in_group]
                z_top, z_rest = z_group[:, top_k_idx], z_group[:, rem_idx]

                top_k_sums[g] += z_top.sum(axis=0)
                top_k_second_moments[g] += z_top.T @ z_top

                rem_sums[g] += z_rest.sum(axis=0)
                rem_second_moments[g] += (z_rest ** 2).sum(axis=0)

                Ng[g] += in_group.sum()

    return top_k_sums, top_k_second_moments, rem_sums, rem_second_moments, Ng
224
-
225
def _accumulate_full_stats(self, uniformizer:Callable) \
        -> Tuple[Dict[Union[str, int], np.ndarray],
                 Dict[Union[str, int], np.ndarray],
                 Dict[Union[str, int], int]]:
    """Stream over the loader and collect per-group moments for all genes.

    Args:
        uniformizer (Callable): Function to convert to uniform distribution

    Returns:
        sums (dict): Per-group column sums of the normal scores
        second_moments (dict): Per-group (n_outcomes, n_outcomes) cross-product sums
        Ng (dict): Number of observations for each group
    """
    p = self.n_outcomes
    sums, second_moments, Ng = {}, {}, {}
    for g in self.groups:
        sums[g] = np.zeros(p)
        second_moments[g] = np.zeros((p, p))
        Ng[g] = 0

    for y, x_dict in tqdm(self.loader, desc="Estimating copula covariance"):
        memberships = x_dict.get("group").cpu().numpy()
        # Map the batch to normal scores via the uniformizer.
        z = norm.ppf(uniformizer(y, x_dict))

        for g in self.groups:
            in_group = memberships[:, self.group_col[g]] == 1
            if np.any(in_group):
                z_group = z[in_group]
                second_moments[g] += z_group.T @ z_group
                sums[g] += z_group.sum(axis=0)
                Ng[g] += in_group.sum()

    return sums, second_moments, Ng
265
-
266
def _compute_block_covariance(self, uniformizer:Callable,
                              top_k_idx: np.ndarray, rem_idx: np.ndarray, top_k: int) \
        -> Dict[Union[str, int], CovarianceStructure]:
    """Build a top-k covariance plus per-gene variances for every group.

    Args:
        uniformizer (Callable): Function to convert to uniform distribution
        top_k_idx (np.ndarray): Indices of the genes given a full covariance
        rem_idx (np.ndarray): Indices of the genes given only a variance
        top_k (int): Number of top-k genes

    Returns:
        covariance (dict): One CovarianceStructure per group that has
        observations; empty groups are skipped with a warning.
    """
    sums_top, moments_top, sums_rest, moments_rest, counts = \
        self._accumulate_top_k_stats(uniformizer, top_k_idx, rem_idx, top_k)

    covariance = {}
    for g in self.groups:
        n_obs = counts[g]
        if n_obs == 0:
            warnings.warn(f"Group {g} has no observations, skipping")
            continue
        # Moment form: E[zz'] - E[z]E[z]' for the block, E[z^2] - E[z]^2 per gene.
        mean_top = sums_top[g] / n_obs
        mean_rest = sums_rest[g] / n_obs
        covariance[g] = CovarianceStructure(
            cov=moments_top[g] / n_obs - np.outer(mean_top, mean_top),
            modeled_names=self.adata.var_names[top_k_idx],
            modeled_indices=top_k_idx,
            remaining_var=moments_rest[g] / n_obs - mean_rest ** 2,
            remaining_indices=rem_idx,
            remaining_names=self.adata.var_names[rem_idx]
        )
    return covariance
303
-
304
def _compute_full_covariance(self, uniformizer:Callable) -> Dict[Union[str, int], CovarianceStructure]:
    """Build a full covariance over all genes for every group.

    Args:
        uniformizer (Callable): Function to convert to uniform distribution

    Returns:
        covariance (dict): One CovarianceStructure per group that has
        observations; empty groups are skipped with a warning.
    """
    sums, second_moments, counts = self._accumulate_full_stats(uniformizer)

    covariance = {}
    for g in self.groups:
        n_obs = counts[g]
        if n_obs == 0:
            warnings.warn(f"Group {g} has no observations, skipping")
            continue
        # Moment form of the covariance: E[zz'] - E[z]E[z]'.
        center = sums[g] / n_obs
        covariance[g] = CovarianceStructure(
            cov=second_moments[g] / n_obs - np.outer(center, center),
            modeled_names=self.adata.var_names,
            modeled_indices=np.arange(self.n_outcomes),
            remaining_var=None,
            remaining_indices=None,
            remaining_names=None
        )
    return covariance
330
-
331
- def _fast_normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
332
- """Sample pseudo-observations from the covariance structure.
333
-
334
- Args:
335
- n_samples (int): Number of samples to generate
336
- cov_struct (CovarianceStructure): The covariance structure
337
-
338
- Returns:
339
- np.ndarray: Pseudo-observations with shape (n_samples, total_genes)
340
- """
341
- u = np.zeros((n_samples, cov_struct.total_genes))
342
-
343
- z_modeled = np.random.multivariate_normal(
344
- mean=np.zeros(cov_struct.num_modeled_genes),
345
- cov=cov_struct.cov.values,
346
- size=n_samples
347
- )
348
-
349
- z_remaining = np.random.normal(
350
- loc=0,
351
- scale=cov_struct.remaining_var.values ** 0.5,
352
- size=(n_samples, cov_struct.num_remaining_genes)
353
- )
354
-
355
- normal_distn_modeled = norm(0, np.diag(cov_struct.cov.values) ** 0.5)
356
- u[:, cov_struct.modeled_indices] = normal_distn_modeled.cdf(z_modeled)
357
-
358
- normal_distn_remaining = norm(0, cov_struct.remaining_var.values ** 0.5)
359
- u[:, cov_struct.remaining_indices] = normal_distn_remaining.cdf(z_remaining)
360
-
361
- return u
362
-
363
- def _normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
364
- """Sample pseudo-observations from the covariance structure.
365
-
366
- Args:
367
- n_samples (int): Number of samples to generate
368
- cov_struct (CovarianceStructure): The covariance structure
369
-
370
- Returns:
371
- np.ndarray: Pseudo-observations with shape (n_samples, total_genes)
372
- """
373
- u = np.zeros((n_samples, cov_struct.total_genes))
374
- z = np.random.multivariate_normal(
375
- mean=np.zeros(cov_struct.total_genes),
376
- cov=cov_struct.cov.values,
377
- size=n_samples
378
- )
379
-
380
- normal_distn = norm(0, np.diag(cov_struct.cov.values) ** 0.5)
381
- u = normal_distn.cdf(z)
382
-
383
- return u
@@ -1,15 +0,0 @@
1
- from .bernoulli import bernoulli_predict
2
- from .negbin import negbin_predict
3
- from .poisson import poisson_predict
4
- from .gaussian import gaussian_predict
5
- from .zero_inflated_negbin import zero_inflated_negbin_predict
6
- from .zero_inflated_poisson import zero_inflated_poisson_predict
7
-
8
- __all__ = [
9
- "bernoulli_predict",
10
- "gaussian_predict",
11
- "negbin_predict",
12
- "poisson_predict",
13
- "zero_inflated_negbin_predict",
14
- "zero_inflated_poisson_predict",
15
- ]
@@ -1,9 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from ..format import format_matrix
4
-
5
-
6
def bernoulli_predict(parameters: dict, obs: pd.DataFrame, formula: str):
    """Return the Bernoulli mean implied by the fitted coefficients.

    The design matrix built from ``obs`` and ``formula`` is multiplied by
    ``parameters["coef_mean"]`` and exponentiated.
    """
    design = format_matrix(obs, formula)
    # NOTE(review): an exponential rather than a logistic inverse link is
    # applied here; confirm this matches how the estimator fits coef_mean.
    linear_predictor = design @ parameters["coef_mean"]
    return {"mean": np.exp(linear_predictor)}
@@ -1,16 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from ..format import format_matrix
4
- from typing import Union
5
-
6
def gaussian_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
    """Return the Gaussian mean and standard deviation for each observation.

    A string ``formula`` is used for the mean, with an intercept-only
    formula for the standard deviation; a dict must provide the keys
    "mean" and "sdev".
    """
    spec = {'mean': formula, 'sdev': '~ 1'} if isinstance(formula, str) else formula

    design_mean = format_matrix(obs, spec["mean"])
    design_sdev = format_matrix(obs, spec["sdev"])

    # Log link on the standard deviation, identity link on the mean.
    sdev = np.exp(design_sdev @ parameters["coef_sdev"])
    mean = design_mean @ parameters["coef_mean"]
    return {"mean": mean, "sdev": sdev}
@@ -1,17 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from ..format import format_matrix
4
- from typing import Union
5
-
6
def negbin_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
    """Return the negative-binomial mean and dispersion for each observation.

    A string ``formula`` is used for the mean, with an intercept-only
    formula for the dispersion; a dict must provide the keys "mean" and
    "dispersion".
    """
    spec = {'mean': formula, 'dispersion': '~ 1'} if isinstance(formula, str) else formula

    design_mean = format_matrix(obs, spec["mean"])
    design_dispersion = format_matrix(obs, spec["dispersion"])

    # Both parameters use a log link.
    dispersion = np.exp(design_dispersion @ parameters["coef_dispersion"])
    mean = np.exp(design_mean @ parameters["coef_mean"])
    return {"mean": mean, "dispersion": dispersion}
@@ -1,12 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from ..format import format_matrix
4
- from typing import Union
5
-
6
def poisson_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
    """Return the Poisson mean for each observation.

    When ``formula`` is a dict, only its "mean" entry is used.
    """
    mean_formula = formula['mean'] if isinstance(formula, dict) else formula
    design = format_matrix(obs, mean_formula)
    # Log link on the mean.
    return {"mean": np.exp(design @ parameters["coef_mean"])}
12
-
@@ -1,18 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from ..format import format_matrix
4
- from scipy.special import expit
5
- from typing import Union
6
-
7
def zero_inflated_negbin_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
    """Return mean, dispersion and zero-inflation probability per observation.

    A string ``formula`` is used for the mean, with intercept-only formulas
    for the other two parameters; a dict must provide the keys "mean",
    "dispersion" and "zero_inflation".
    """
    if isinstance(formula, str):
        formula = {"mean": formula, "dispersion": "~ 1", "zero_inflation": "~ 1"}

    design_mean = format_matrix(obs, formula["mean"])
    design_dispersion = format_matrix(obs, formula["dispersion"])
    design_zero = format_matrix(obs, formula["zero_inflation"])

    # Log links for dispersion and mean, logistic link for zero inflation.
    dispersion = np.exp(design_dispersion @ parameters["coef_dispersion"])
    mean = np.exp(design_mean @ parameters["coef_mean"])
    zero_inflation = expit(design_zero @ parameters["coef_zero_inflation"])
    return {"mean": mean, "dispersion": dispersion, "zero_inflation": zero_inflation}
@@ -1,18 +0,0 @@
1
- from ..format import format_matrix
2
- from typing import Union
3
- import numpy as np
4
- import pandas as pd
5
-
6
-
7
def zero_inflated_poisson_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
    """Return the Poisson mean and zero-inflation probability per observation.

    A string ``formula`` is used for the mean, with an intercept-only
    formula for the zero-inflation component; a dict must provide the keys
    "mean" and "zero_inflation".
    """
    if isinstance(formula, str):
        # Use the same keys that are read below; the former 'beta'/'pi'
        # keys made every string formula fail with KeyError.
        formula = {'mean': formula, 'zero_inflation': '~ 1'}

    design_mean = format_matrix(obs, formula['mean'])
    design_zero = format_matrix(obs, formula['zero_inflation'])

    # Log link on the mean, logistic link on the zero-inflation probability.
    mu = np.exp(design_mean @ parameters["coef_mean"])
    pi = sigmoid(design_zero @ parameters["coef_zero_inflation"])
    return {"mean": mu, "zero_inflation": pi}
15
-
16
-
17
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied elementwise.

    Delegates to ``scipy.special.expit`` so that large negative inputs do
    not overflow in ``exp`` and emit a RuntimeWarning.
    """
    # Local import: this module does not import scipy at file level.
    from scipy.special import expit

    return expit(x)
@@ -1,23 +0,0 @@
1
- from .negbin import negbin_sample, negbin_copula_sample
2
- from .poisson import poisson_sample, poisson_copula_sample
3
- from .bernoulli import bernoulli_sample, bernoulli_copula_sample
4
- from .gaussian import gaussian_regression_sample, gaussian_copula_sample
5
- from .zero_inflated_negbin import (
6
- zero_inflated_negbin_sample,
7
- zero_inflated_negbin_copula_sample,
8
- )
9
- from .zero_inflated_poisson import zero_inflated_poisson_sample
10
-
11
- __all__ = [
12
- "negbin_sample",
13
- "negbin_copula_sample",
14
- "poisson_sample",
15
- "poisson_copula_sample",
16
- "bernoulli_sample",
17
- "bernoulli_copula_sample",
18
- "gaussian_regression_sample",
19
- "gaussian_copula_sample",
20
- "zero_inflated_negbin_sample",
21
- "zero_inflated_negbin_copula_sample",
22
- "zero_inflated_poisson_sample",
23
- ]
@@ -1,27 +0,0 @@
1
- from . import glm_factory as glm
2
- from scipy.stats import bernoulli
3
- from typing import Union
4
- import numpy as np
5
-
6
-
7
def bernoulli_regression_sample_array(local_parameters: dict) -> np.array:
    """Draw one Bernoulli sample per entry of ``local_parameters["mean"]``."""
    distribution = bernoulli(local_parameters["mean"])
    return distribution.rvs()
10
-
11
-
12
def bernoulli_copula_sample_array(
    local_parameters: dict, covariance: Union[dict, np.array], groups: dict
) -> np.array:
    """Draw correlated Bernoulli samples through a Gaussian copula.

    Pseudo-observations with the shape of ``local_parameters["mean"]`` are
    generated from ``covariance`` and ``groups``, then pushed through the
    Bernoulli quantile function.
    """
    probabilities = local_parameters["mean"]
    n_rows, n_cols = probabilities.shape
    # Correlated uniforms, one per (row, column).
    pseudo = glm.gaussian_copula_pseudo_obs(n_rows, n_cols, covariance, groups)
    return bernoulli(probabilities).ppf(pseudo)
21
-
22
-
23
# Public samplers: the factories wrap the array-level functions so that the
# result is returned as an AnnData object.
bernoulli_sample = glm.glm_sample_factory(bernoulli_regression_sample_array)
bernoulli_copula_sample = glm.gaussian_copula_sample_factory(bernoulli_copula_sample_array)
@@ -1,25 +0,0 @@
1
- from scipy.stats import norm
2
- from . import glm_factory as glm
3
- from typing import Union
4
- import numpy as np
5
-
6
-
7
def gaussian_regression_sample_array(local_parameters: dict) -> np.array:
    """Draw one normal sample per entry of the "mean"/"sdev" parameters."""
    scale = local_parameters["sdev"]
    location = local_parameters["mean"]
    distribution = norm(loc=location, scale=scale)
    return distribution.rvs()
10
-
11
-
12
def gaussian_copula_sample_array(
    local_parameters: dict, covariance: Union[dict, np.array], groups: dict
) -> np.array:
    """Draw correlated Gaussian samples through a Gaussian copula.

    Pseudo-observations with the shape of ``local_parameters["mean"]`` are
    generated from ``covariance`` and ``groups``, then pushed through the
    normal quantile function with the given mean and standard deviation.
    """
    location = local_parameters["mean"]
    n_rows, n_cols = location.shape
    pseudo = glm.gaussian_copula_pseudo_obs(n_rows, n_cols, covariance, groups)

    # Map the correlated uniforms onto each entry's Gaussian marginal.
    scale = local_parameters["sdev"]
    return norm(loc=location, scale=scale).ppf(pseudo)
22
-
23
-
24
# Public samplers: the factories wrap the array-level functions so that the
# result is returned as an AnnData object.
gaussian_regression_sample = glm.glm_sample_factory(
    gaussian_regression_sample_array
)
gaussian_copula_sample = glm.gaussian_copula_sample_factory(
    gaussian_copula_sample_array
)
@@ -1,103 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- import anndata as ad
4
- from typing import Union
5
- from scipy.stats import norm
6
-
7
-
8
def glm_sample_factory(sample_array):
    """Wrap an array-level sampler so it returns an AnnData object.

    The returned ``sampler(local_parameters, obs)`` calls ``sample_array``
    on the parameters, stores the result as ``X`` with ``obs`` attached, and
    names the variables after the columns of ``local_parameters["mean"]``.
    """
    def sampler(local_parameters: dict, obs: pd.DataFrame) -> ad.AnnData:
        simulated = ad.AnnData(X=sample_array(local_parameters), obs=obs)
        simulated.var_names = local_parameters["mean"].columns
        return simulated

    return sampler
15
-
16
def gaussian_copula_pseudo_obs(N, G, sigma, groups):
    """Generate Gaussian-copula pseudo-observations group by group.

    Args:
        N: Number of rows in the output.
        G: Number of columns in the output.
        sigma: Either a dict mapping each group to its covariance, or a
            single covariance shared by every group. A covariance is a
            (G, G) matrix or a FastCovarianceStructure.
        groups (dict): Maps each group to the row indices belonging to it.

    Returns:
        np.ndarray: Array of shape (N, G); rows not listed in ``groups``
        stay at zero.
    """
    u = np.zeros((N, G))

    # Decide once whether sigma is shared. The old code rebound ``sigma`` to
    # ``{first_group: sigma}`` inside the loop, so every later group raised
    # KeyError.
    shared = not isinstance(sigma, dict)

    for group, ix in groups.items():
        group_sigma = sigma if shared else sigma[group]

        if not isinstance(group_sigma, np.ndarray):
            # Import here to avoid circular imports; only needed when the
            # covariance is not a plain array.
            from ..estimators.gaussian_copula_factory import FastCovarianceStructure

            if isinstance(group_sigma, FastCovarianceStructure):
                u[ix] = _fast_copula_pseudo_obs(len(ix), group_sigma)
                continue

        # Traditional full covariance matrix approach.
        z = np.random.multivariate_normal(
            mean=np.zeros(G), cov=group_sigma, size=len(ix)
        )
        # Take the diagonal before the square root so that negative
        # off-diagonal covariances never go through sqrt.
        marginal_sd = np.sqrt(np.diag(group_sigma))
        u[ix] = norm(0, marginal_sd).cdf(z)
    return u
42
-
43
-
44
def _fast_copula_pseudo_obs(n_samples, fast_cov_struct):
    """
    Generate pseudo-observations from a FastCovarianceStructure.

    Two parts are sampled separately:
    1. Top-k genes, jointly, from a multivariate normal with their covariance
    2. Remaining genes, independently, from normals with their own variances

    Parameters:
    -----------
    n_samples : int
        Number of samples to generate for this group
    fast_cov_struct : FastCovarianceStructure
        Structure containing top-k covariance and remaining variances

    Returns:
    --------
    np.ndarray : Pseudo-observations with shape (n_samples, total_genes)
    """
    out = np.zeros((n_samples, fast_cov_struct.total_genes))
    k = fast_cov_struct.top_k
    rest_idx = fast_cov_struct.remaining_indices

    if k > 0:
        # Joint draw for the top-k block, then each gene's marginal CDF.
        block_cov = fast_cov_struct.top_k_cov
        z_block = np.random.multivariate_normal(
            mean=np.zeros(k), cov=block_cov, size=n_samples
        )
        block_sd = np.sqrt(np.diag(block_cov))
        out[:, fast_cov_struct.top_k_indices] = norm.cdf(z_block, loc=0, scale=block_sd)

    if len(rest_idx) > 0:
        # Independent draws for the rest, then each gene's marginal CDF.
        rest_sd = np.sqrt(fast_cov_struct.remaining_var)
        z_rest = np.random.normal(
            loc=0, scale=rest_sd, size=(n_samples, len(rest_idx))
        )
        out[:, rest_idx] = norm.cdf(z_rest, loc=0, scale=rest_sd)

    return out
92
-
93
-
94
def gaussian_copula_sample_factory(copula_sample_array):
    """Wrap an array-level copula sampler so it returns an AnnData object.

    The returned ``sampler(local_parameters, covariance, groups, obs)`` calls
    ``copula_sample_array`` on the first three arguments, stores the result
    as ``X`` with ``obs`` attached, and names the variables after the
    columns of ``local_parameters["mean"]``.
    """
    def sampler(
        local_parameters: dict, covariance: Union[dict, np.array], groups: dict, obs: pd.DataFrame
    ) -> ad.AnnData:
        simulated = ad.AnnData(
            X=copula_sample_array(local_parameters, covariance, groups),
            obs=obs,
        )
        simulated.var_names = local_parameters["mean"].columns
        return simulated

    return sampler
103
-