scdesigner 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdesigner/base/__init__.py +8 -0
- scdesigner/base/copula.py +416 -0
- scdesigner/base/marginal.py +391 -0
- scdesigner/base/simulator.py +59 -0
- scdesigner/copulas/__init__.py +8 -0
- scdesigner/copulas/standard_copula.py +645 -0
- scdesigner/datasets/__init__.py +5 -0
- scdesigner/datasets/pancreas.py +39 -0
- scdesigner/distributions/__init__.py +19 -0
- scdesigner/{minimal → distributions}/bernoulli.py +42 -14
- scdesigner/distributions/gaussian.py +114 -0
- scdesigner/distributions/negbin.py +121 -0
- scdesigner/distributions/negbin_irls.py +72 -0
- scdesigner/distributions/negbin_irls_funs.py +456 -0
- scdesigner/distributions/poisson.py +88 -0
- scdesigner/{minimal → distributions}/zero_inflated_negbin.py +39 -10
- scdesigner/distributions/zero_inflated_poisson.py +103 -0
- scdesigner/simulators/__init__.py +24 -28
- scdesigner/simulators/composite.py +239 -0
- scdesigner/simulators/positive_nonnegative_matrix_factorization.py +477 -0
- scdesigner/simulators/scd3.py +486 -0
- scdesigner/transform/__init__.py +8 -6
- scdesigner/{minimal → transform}/transform.py +1 -1
- scdesigner/{minimal → utils}/kwargs.py +4 -1
- {scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/METADATA +1 -1
- scdesigner-0.0.10.dist-info/RECORD +28 -0
- {scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/WHEEL +1 -1
- scdesigner/data/__init__.py +0 -16
- scdesigner/data/formula.py +0 -137
- scdesigner/data/group.py +0 -123
- scdesigner/data/sparse.py +0 -39
- scdesigner/diagnose/__init__.py +0 -65
- scdesigner/diagnose/aic_bic.py +0 -119
- scdesigner/diagnose/plot.py +0 -242
- scdesigner/estimators/__init__.py +0 -32
- scdesigner/estimators/bernoulli.py +0 -85
- scdesigner/estimators/gaussian.py +0 -121
- scdesigner/estimators/gaussian_copula_factory.py +0 -367
- scdesigner/estimators/glm_factory.py +0 -75
- scdesigner/estimators/negbin.py +0 -153
- scdesigner/estimators/pnmf.py +0 -160
- scdesigner/estimators/poisson.py +0 -124
- scdesigner/estimators/zero_inflated_negbin.py +0 -195
- scdesigner/estimators/zero_inflated_poisson.py +0 -85
- scdesigner/format/__init__.py +0 -4
- scdesigner/format/format.py +0 -20
- scdesigner/format/print.py +0 -30
- scdesigner/minimal/__init__.py +0 -17
- scdesigner/minimal/composite.py +0 -119
- scdesigner/minimal/copula.py +0 -205
- scdesigner/minimal/formula.py +0 -23
- scdesigner/minimal/gaussian.py +0 -65
- scdesigner/minimal/loader.py +0 -211
- scdesigner/minimal/marginal.py +0 -154
- scdesigner/minimal/negbin.py +0 -73
- scdesigner/minimal/positive_nonnegative_matrix_factorization.py +0 -231
- scdesigner/minimal/scd3.py +0 -96
- scdesigner/minimal/scd3_instances.py +0 -50
- scdesigner/minimal/simulator.py +0 -25
- scdesigner/minimal/standard_copula.py +0 -383
- scdesigner/predictors/__init__.py +0 -15
- scdesigner/predictors/bernoulli.py +0 -9
- scdesigner/predictors/gaussian.py +0 -16
- scdesigner/predictors/negbin.py +0 -17
- scdesigner/predictors/poisson.py +0 -12
- scdesigner/predictors/zero_inflated_negbin.py +0 -18
- scdesigner/predictors/zero_inflated_poisson.py +0 -18
- scdesigner/samplers/__init__.py +0 -23
- scdesigner/samplers/bernoulli.py +0 -27
- scdesigner/samplers/gaussian.py +0 -25
- scdesigner/samplers/glm_factory.py +0 -103
- scdesigner/samplers/negbin.py +0 -25
- scdesigner/samplers/poisson.py +0 -25
- scdesigner/samplers/zero_inflated_negbin.py +0 -40
- scdesigner/samplers/zero_inflated_poisson.py +0 -16
- scdesigner/simulators/composite_regressor.py +0 -72
- scdesigner/simulators/glm_simulator.py +0 -167
- scdesigner/simulators/pnmf_regression.py +0 -61
- scdesigner/transform/amplify.py +0 -14
- scdesigner/transform/mask.py +0 -33
- scdesigner/transform/nullify.py +0 -25
- scdesigner/transform/split.py +0 -23
- scdesigner/transform/substitute.py +0 -14
- scdesigner-0.0.5.dist-info/RECORD +0 -66
scdesigner/minimal/standard_copula.py
DELETED
@@ -1,383 +0,0 @@
-from .copula import Copula
-from .formula import standardize_formula
-from .kwargs import DEFAULT_ALLOWED_KWARGS, _filter_kwargs
-from anndata import AnnData
-from scipy.stats import norm, multivariate_normal
-from tqdm import tqdm
-from typing import Dict, Union, Callable, Tuple
-import numpy as np
-import torch
-from .copula import CovarianceStructure
-import warnings
-
-class StandardCopula(Copula):
-    """Standard Gaussian Copula Model"""
-    def __init__(self, formula: str = "~ 1"):
-        """Initialize the StandardCopula model.
-
-        Args:
-            formula (str, optional): _description_. Defaults to "~ 1".
-        """
-        formula = standardize_formula(formula, allowed_keys=['group'])
-        super().__init__(formula)
-        self.groups = None
-
-
-    def setup_data(self, adata: AnnData, marginal_formula: Dict[str, str], **kwargs):
-        """Set up the data for the standard covariance model. After setting up the data, x_dict will always have a "group" key.
-
-        Args:
-            adata (AnnData): The AnnData object containing the data.
-            marginal_formula (Dict[str, str]): The formula for the marginal model.
-        Raises:
-            ValueError: If the groupings are not binary.
-        """
-        data_kwargs = _filter_kwargs(kwargs, DEFAULT_ALLOWED_KWARGS['data'])
-        super().setup_data(adata, marginal_formula, **data_kwargs)
-        _, obs_batch = next(iter(self.loader))
-        obs_batch_group = obs_batch.get("group")
-
-        # fill in group indexing variables
-        self.groups = self.loader.dataset.predictor_names["group"]
-        self.n_groups = len(self.groups)
-        self.group_col = {g: i for i, g in enumerate(self.groups)}
-
-        # check that obs_batch is a binary grouping matrix (only if group exists)
-        if obs_batch_group is not None:
-            unique_vals = torch.unique(obs_batch_group)
-            if (not torch.all((unique_vals == 0) | (unique_vals == 1)).item()):
-                raise ValueError("Only categorical groups are currently supported in copula covariance estimation.")
-
-    def fit(self, uniformizer: Callable, **kwargs):
-        """
-        Fit the copula covariance model.
-
-        Args:
-            uniformizer (Callable): Function to convert data to uniform distribution
-            **kwargs: Additional arguments
-                top_k (int, optional): Use only top-k most expressed genes for covariance estimation.
-                    If None, estimates full covariance for all genes.
-
-        Returns:
-            None: Stores fitted parameters in self.parameters as dict of CovarianceStructure objects.
-
-        Raises:
-            ValueError: If top_k is not a positive integer or exceeds n_outcomes
-        """
-        top_k = kwargs.get("top_k", None)
-        if top_k is not None:
-            if not isinstance(top_k, int):
-                raise ValueError("top_k must be an integer")
-            if top_k <= 0:
-                raise ValueError("top_k must be positive")
-            if top_k > self.n_outcomes:
-                raise ValueError(f"top_k ({top_k}) cannot exceed number of outcomes ({self.n_outcomes})")
-            gene_total_expression = np.array(self.adata.X.sum(axis=0)).flatten()
-            sorted_indices = np.argsort(gene_total_expression)
-            top_k_indices = sorted_indices[-top_k:]
-            remaining_indices = sorted_indices[:-top_k]
-            covariances = self._compute_block_covariance(uniformizer, top_k_indices,
-                                                         remaining_indices, top_k)
-        else:
-            covariances = self._compute_full_covariance(uniformizer)
-
-        self.parameters = covariances
-
-    def pseudo_obs(self, x_dict: Dict):
-        # convert one-hot encoding memberships to a map
-        # {"group1": [indices of group 1], "group2": [indices of group 2]}
-        # The initialization method ensures that x_dict will always have a "group" key.
-        group_data = x_dict.get("group")
-        memberships = group_data.cpu().numpy()
-        group_ix = {g: np.where(memberships[:, self.group_col[g] == 1])[0] for g in self.groups}
-
-        # initialize the result
-        u = np.zeros((len(memberships), self.n_outcomes))
-        parameters = self.parameters
-
-        # loop over groups and sample each part in turn
-        for group, cov_struct in parameters.items():
-            if cov_struct.remaining_var is not None:
-                u[group_ix[group]] = self._fast_normal_pseudo_obs(len(group_ix[group]), cov_struct)
-            else:
-                u[group_ix[group]] = self._normal_pseudo_obs(len(group_ix[group]), cov_struct)
-        return u
-
-    def likelihood(self, uniformizer: Callable, batch: Tuple[torch.Tensor, Dict[str, torch.Tensor]]):
-        """
-        Compute likelihood of data given the copula model.
-
-        Args:
-            uniformizer (Callable): Function to convert expression data to uniform distribution
-            batch (Tuple[torch.Tensor, Dict[str, torch.Tensor]]): Data batch containing:
-                - Y (torch.Tensor): Expression data of shape (n_cells, n_genes)
-                - X_dict (Dict[str, torch.Tensor]): Covariates dict with keys as parameter names
-                  and values as tensors of shape (n_cells, n_covariates)
-
-        Returns:
-            np.ndarray: Log-likelihood for each cell, shape (n_cells,)
-        """
-        # uniformize the observations
-        y, x_dict = batch
-        u = uniformizer(y, x_dict)
-        z = norm().ppf(u)
-
-        # same group manipulation as for pseudobs
-        parameters = self.parameters
-        if type(parameters) is not dict:
-            parameters = {self.groups[0]: parameters}
-
-        group_data = x_dict.get("group")
-        memberships = group_data.numpy()
-        group_ix = {g: np.where(memberships[:, self.group_col[g] == 1])[0] for g in self.groups}
-
-        ll = np.zeros(len(z))
-
-        for group, cov_struct in parameters.items():
-            ix = group_ix[group]
-            if len(ix) > 0:
-                z_modeled = z[ix][:, cov_struct.modeled_indices]
-
-                ll_modeled = multivariate_normal.logpdf(z_modeled,
-                                                        np.zeros(cov_struct.num_modeled_genes),
-                                                        cov_struct.cov.values)
-                if cov_struct.num_remaining_genes > 0:
-                    z_remaining = z[ix][:, cov_struct.remaining_indices]
-                    ll_remaining = norm.logpdf(z_remaining,
-                                               loc=0,
-                                               scale = np.sqrt(cov_struct.remaining_var.values))
-                else:
-                    ll_remaining = 0
-                ll[ix] = ll_modeled + ll_remaining
-        return ll
-
-    def num_params(self, **kwargs):
-        S = self.parameters
-        per_group = [((S[g].num_modeled_genes * (S[g].num_modeled_genes - 1)) / 2) for g in self.groups]
-        return sum(per_group)
-
-    def _validate_parameters(self, **kwargs):
-        top_k = kwargs.get("top_k", None)
-        if top_k is not None:
-            if not isinstance(top_k, int):
-                raise ValueError("top_k must be an integer")
-            if top_k <= 0:
-                raise ValueError("top_k must be positive")
-            if top_k > self.n_outcomes:
-                raise ValueError(f"top_k ({top_k}) cannot exceed number of outcomes ({self.n_outcomes})")
-        return top_k
-
-
-
-    def _accumulate_top_k_stats(self, uniformizer: Callable, top_k_idx, rem_idx, top_k) \
-        -> Tuple[Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], int]]:
-        """Accumulate sufficient statistics for top-k covariance estimation.
-
-        Args:
-            uniformizer (Callable): Function to convert to uniform distribution
-            top_k_idx (np.ndarray): Indices of the top-k genes
-            rem_idx (np.ndarray): Indices of the remaining genes
-            top_k (int): Number of top-k genes
-
-        Returns:
-            top_k_sums (dict): Sums of the top-k genes for each group
-            top_k_second_moments (dict): Second moments of the top-k genes for each group
-            rem_sums (dict): Sums of the remaining genes for each group
-            rem_second_moments (dict): Second moments of the remaining genes for each group
-            Ng (dict): Number of observations for each group
-        """
-        top_k_sums = {g: np.zeros(top_k) for g in self.groups}
-        top_k_second_moments = {g: np.zeros((top_k, top_k)) for g in self.groups}
-        rem_sums = {g: np.zeros(self.n_outcomes - top_k) for g in self.groups}
-        rem_second_moments = {g: np.zeros(self.n_outcomes - top_k) for g in self.groups}
-        Ng = {g: 0 for g in self.groups}
-
-        for y, x_dict in tqdm(self.loader, desc="Estimating top-k copula covariance"):
-            group_data = x_dict.get("group")
-            memberships = group_data.cpu().numpy()
-            u = uniformizer(y, x_dict)
-            z = norm.ppf(u)
-
-            for g in self.groups:
-                mask = memberships[:, self.group_col[g]] == 1
-                if not np.any(mask):
-                    continue
-
-                z_g = z[mask]
-                n_g = mask.sum()
-
-                top_k_z, rem_z = z_g[:, top_k_idx], z_g[:, rem_idx]
-
-                top_k_sums[g] += top_k_z.sum(axis=0)
-                top_k_second_moments[g] += top_k_z.T @ top_k_z
-
-                rem_sums[g] += rem_z.sum(axis=0)
-                rem_second_moments[g] += (rem_z ** 2).sum(axis=0)
-
-                Ng[g] += n_g
-
-        return top_k_sums, top_k_second_moments, rem_sums, rem_second_moments, Ng
-
-    def _accumulate_full_stats(self, uniformizer: Callable) \
-        -> Tuple[Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], np.ndarray],
-                 Dict[Union[str, int], int]]:
-        """Accumulate sufficient statistics for full covariance estimation.
-
-        Args:
-            uniformizer (Callable): Function to convert to uniform distribution
-
-        Returns:
-            sums (dict): Sums of the genes for each group
-            second_moments (dict): Second moments of the genes for each group
-            Ng (dict): Number of observations for each group
-        """
-        sums = {g: np.zeros(self.n_outcomes) for g in self.groups}
-        second_moments = {g: np.zeros((self.n_outcomes, self.n_outcomes)) for g in self.groups}
-        Ng = {g: 0 for g in self.groups}
-
-        for y, x_dict in tqdm(self.loader, desc="Estimating copula covariance"):
-            group_data = x_dict.get("group")
-            memberships = group_data.cpu().numpy()
-
-            u = uniformizer(y, x_dict)
-            z = norm.ppf(u)
-
-            for g in self.groups:
-                mask = memberships[:, self.group_col[g]] == 1
-
-                if not np.any(mask):
-                    continue
-
-                z_g = z[mask]
-                n_g = mask.sum()
-
-                second_moments[g] += z_g.T @ z_g
-                sums[g] += z_g.sum(axis=0)
-
-                Ng[g] += n_g
-
-        return sums, second_moments, Ng
-
-    def _compute_block_covariance(self, uniformizer: Callable,
-                                  top_k_idx: np.ndarray, rem_idx: np.ndarray, top_k: int) \
-        -> Dict[Union[str, int], CovarianceStructure]:
-        """Compute the covariance matrix for the top-k and remaining genes.
-
-        Args:
-            top_k_sums (dict): Sums of the top-k genes for each group
-            top_k_second_moments (dict): Second moments of the top-k genes for each group
-            remaining_sums (dict): Sums of the remaining genes for each group
-            remaining_second_moments (dict): Second moments of the remaining genes for each group
-            Ng (dict): Number of observations for each group
-
-        Returns:
-            covariance (dict): Covariance matrix for each group
-        """
-        top_k_sums, top_k_second_moments, remaining_sums, remaining_second_moments, Ng \
-            = self._accumulate_top_k_stats(uniformizer, top_k_idx, rem_idx, top_k)
-        covariance = {}
-        for g in self.groups:
-            if Ng[g] == 0:
-                warnings.warn(f"Group {g} has no observations, skipping")
-                continue
-            mean_top_k = top_k_sums[g] / Ng[g]
-            cov_top_k = top_k_second_moments[g] / Ng[g] - np.outer(mean_top_k, mean_top_k)
-            mean_remaining = remaining_sums[g] / Ng[g]
-            var_remaining = remaining_second_moments[g] / Ng[g] - mean_remaining ** 2
-            top_k_names = self.adata.var_names[top_k_idx]
-            remaining_names = self.adata.var_names[rem_idx]
-            covariance[g] = CovarianceStructure(
-                cov=cov_top_k,
-                modeled_names=top_k_names,
-                modeled_indices=top_k_idx,
-                remaining_var=var_remaining,
-                remaining_indices=rem_idx,
-                remaining_names=remaining_names
-            )
-        return covariance
-
-    def _compute_full_covariance(self, uniformizer: Callable) -> Dict[Union[str, int], CovarianceStructure]:
-        """Compute the covariance matrix for the full genes.
-
-        Args:
-            uniformizer (Callable): Function to convert to uniform distribution
-
-        Returns:
-            covariance (dict): Covariance matrix for each group
-        """
-        sums, second_moments, Ng = self._accumulate_full_stats(uniformizer)
-        covariance = {}
-        for g in self.groups:
-            if Ng[g] == 0:
-                warnings.warn(f"Group {g} has no observations, skipping")
-                continue
-            mean = sums[g] / Ng[g]
-            cov = second_moments[g] / Ng[g] - np.outer(mean, mean)
-            covariance[g] = CovarianceStructure(
-                cov=cov,
-                modeled_names=self.adata.var_names,
-                modeled_indices=np.arange(self.n_outcomes),
-                remaining_var=None,
-                remaining_indices=None,
-                remaining_names=None
-            )
-        return covariance
-
-    def _fast_normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
-        """Sample pseudo-observations from the covariance structure.
-
-        Args:
-            n_samples (int): Number of samples to generate
-            cov_struct (CovarianceStructure): The covariance structure
-
-        Returns:
-            np.ndarray: Pseudo-observations with shape (n_samples, total_genes)
-        """
-        u = np.zeros((n_samples, cov_struct.total_genes))
-
-        z_modeled = np.random.multivariate_normal(
-            mean=np.zeros(cov_struct.num_modeled_genes),
-            cov=cov_struct.cov.values,
-            size=n_samples
-        )
-
-        z_remaining = np.random.normal(
-            loc=0,
-            scale=cov_struct.remaining_var.values ** 0.5,
-            size=(n_samples, cov_struct.num_remaining_genes)
-        )
-
-        normal_distn_modeled = norm(0, np.diag(cov_struct.cov.values) ** 0.5)
-        u[:, cov_struct.modeled_indices] = normal_distn_modeled.cdf(z_modeled)
-
-        normal_distn_remaining = norm(0, cov_struct.remaining_var.values ** 0.5)
-        u[:, cov_struct.remaining_indices] = normal_distn_remaining.cdf(z_remaining)
-
-        return u
-
-    def _normal_pseudo_obs(self, n_samples: int, cov_struct: CovarianceStructure) -> np.ndarray:
-        """Sample pseudo-observations from the covariance structure.
-
-        Args:
-            n_samples (int): Number of samples to generate
-            cov_struct (CovarianceStructure): The covariance structure
-
-        Returns:
-            np.ndarray: Pseudo-observations with shape (n_samples, total_genes)
-        """
-        u = np.zeros((n_samples, cov_struct.total_genes))
-        z = np.random.multivariate_normal(
-            mean=np.zeros(cov_struct.total_genes),
-            cov=cov_struct.cov.values,
-            size=n_samples
-        )
-
-        normal_distn = norm(0, np.diag(cov_struct.cov.values) ** 0.5)
-        u = normal_distn.cdf(z)
-
-        return u
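For orientation, the deleted `StandardCopula.fit` pipeline above estimates one copula covariance per group by Gaussianizing the uniformized observations (`z = norm.ppf(u)`) and accumulating per-group sums and second moments, so that cov = E[zz^T] - outer(mean, mean). The snippet below is a minimal, in-memory sketch of that estimator, assuming the pseudo-observations fit in a single array; the function name and the non-streaming setup are illustrative, not part of scdesigner's API.

```python
import numpy as np
from scipy.stats import norm

def group_copula_covariance(u, group_labels):
    """Illustrative sketch: per-group copula covariance from uniformized data u (cells x genes)."""
    z = norm.ppf(np.clip(u, 1e-6, 1 - 1e-6))  # Gaussianize the uniform pseudo-observations
    covariances = {}
    for g in np.unique(group_labels):
        z_g = z[group_labels == g]
        n_g = z_g.shape[0]
        mean = z_g.sum(axis=0) / n_g                         # first moment
        second_moment = (z_g.T @ z_g) / n_g                  # second moment
        covariances[g] = second_moment - np.outer(mean, mean)  # cov = E[zz^T] - outer(mean, mean)
    return covariances

# toy usage with synthetic uniforms and two groups
rng = np.random.default_rng(0)
u = rng.uniform(size=(200, 4))
labels = np.repeat(["a", "b"], 100)
covs = group_copula_covariance(u, labels)
```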
scdesigner/predictors/__init__.py
DELETED
@@ -1,15 +0,0 @@
-from .bernoulli import bernoulli_predict
-from .negbin import negbin_predict
-from .poisson import poisson_predict
-from .gaussian import gaussian_predict
-from .zero_inflated_negbin import zero_inflated_negbin_predict
-from .zero_inflated_poisson import zero_inflated_poisson_predict
-
-__all__ = [
-    "bernoulli_predict",
-    "gaussian_predict",
-    "negbin_predict",
-    "poisson_predict",
-    "zero_inflated_negbin_predict",
-    "zero_inflated_poisson_predict",
-]
scdesigner/predictors/gaussian.py
DELETED
@@ -1,16 +0,0 @@
-import numpy as np
-import pandas as pd
-from ..format import format_matrix
-from typing import Union
-
-def gaussian_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
-    # Standardize formula to dictionary format
-    if isinstance(formula, str):
-        formula = {'mean': formula, 'sdev': '~ 1'}
-
-    x_mean = format_matrix(obs, formula["mean"])
-    x_dispersion = format_matrix(obs, formula["sdev"])
-
-    sigma = np.exp(x_dispersion @ parameters["coef_sdev"])
-    mu = x_mean @ parameters["coef_mean"]
-    return {"mean": mu, "sdev": sigma}
scdesigner/predictors/negbin.py
DELETED
@@ -1,17 +0,0 @@
-import numpy as np
-import pandas as pd
-from ..format import format_matrix
-from typing import Union
-
-def negbin_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
-    # Standardize formula to dictionary format
-    if isinstance(formula, str):
-        formula = {'mean': formula, 'dispersion': '~ 1'}
-
-    x_mean = format_matrix(obs, formula["mean"])
-    x_dispersion = format_matrix(obs, formula["dispersion"])
-
-    r = np.exp(x_dispersion @ parameters["coef_dispersion"])
-    mu = np.exp(x_mean @ parameters["coef_mean"])
-    # r and mu are still dataframes with column names being the gene names
-    return {"mean": mu, "dispersion": r}
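The deleted predictors all follow one GLM pattern: build design matrices from `obs` with the package's `format_matrix` helper, then map the linear predictors through a link function. Below is a minimal sketch of the negative-binomial case that assumes the design matrices are already built; the function name is illustrative, not part of the package.

```python
import numpy as np

def negbin_predict_sketch(x_mean, x_dispersion, coef_mean, coef_dispersion):
    """Illustrative sketch of the log-link prediction used by the deleted negbin_predict."""
    mu = np.exp(x_mean @ coef_mean)              # per-gene mean, shape (n_cells, n_genes)
    r = np.exp(x_dispersion @ coef_dispersion)   # per-gene dispersion
    return {"mean": mu, "dispersion": r}
```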
scdesigner/predictors/poisson.py
DELETED
@@ -1,12 +0,0 @@
-import numpy as np
-import pandas as pd
-from ..format import format_matrix
-from typing import Union
-
-def poisson_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
-    if isinstance(formula, dict):
-        formula = formula['mean']
-    x = format_matrix(obs, formula)
-    mu = np.exp(x @ parameters["coef_mean"])
-    return {"mean": mu}
-
scdesigner/predictors/zero_inflated_negbin.py
DELETED
@@ -1,18 +0,0 @@
-import numpy as np
-import pandas as pd
-from ..format import format_matrix
-from scipy.special import expit
-from typing import Union
-
-def zero_inflated_negbin_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
-    if isinstance(formula, str):
-        formula = {"mean": formula, "dispersion": "~ 1", "zero_inflation": "~ 1"}
-    x_mean = format_matrix(obs, formula["mean"])
-    x_dispersion = format_matrix(obs, formula["dispersion"])
-    x_zero_inflation = format_matrix(obs, formula["zero_inflation"])
-    r, mu, pi = (
-        np.exp(x_dispersion @ parameters["coef_dispersion"]),
-        np.exp(x_mean @ parameters["coef_mean"]),
-        expit(x_zero_inflation @ parameters["coef_zero_inflation"]),
-    )
-    return {"mean": mu, "dispersion": r, "zero_inflation": pi}
scdesigner/predictors/zero_inflated_poisson.py
DELETED
@@ -1,18 +0,0 @@
-from ..format import format_matrix
-from typing import Union
-import numpy as np
-import pandas as pd
-
-
-def zero_inflated_poisson_predict(parameters: dict, obs: pd.DataFrame, formula: Union[str, dict]):
-    if isinstance(formula, str):
-        formula = {'beta': formula, 'pi': '~ 1'}
-    mu, pi = (
-        np.exp(format_matrix(obs, formula['mean']) @ parameters["coef_mean"]),
-        sigmoid(format_matrix(obs, formula['zero_inflation']) @ parameters["coef_zero_inflation"]),
-    )
-    return {"mean": mu, "zero_inflation": pi}
-
-
-def sigmoid(x):
-    return 1 / (1 + np.exp(-x))
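The zero-inflated predictors combine a log link for the count parameters with a logistic link for the zero-inflation probability. The sketch below illustrates that mapping for the zero-inflated Poisson case, again with pre-built design matrices and an illustrative (hypothetical) function name.

```python
import numpy as np
from scipy.special import expit

def zip_predict_sketch(x_mean, x_zero_inflation, coef_mean, coef_zero_inflation):
    """Illustrative sketch: zero-inflated Poisson prediction with log and logistic links."""
    mu = np.exp(x_mean @ coef_mean)                        # Poisson mean of the count component
    pi = expit(x_zero_inflation @ coef_zero_inflation)     # probability of a structural zero
    return {"mean": mu, "zero_inflation": pi}
```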
scdesigner/samplers/__init__.py
DELETED
@@ -1,23 +0,0 @@
-from .negbin import negbin_sample, negbin_copula_sample
-from .poisson import poisson_sample, poisson_copula_sample
-from .bernoulli import bernoulli_sample, bernoulli_copula_sample
-from .gaussian import gaussian_regression_sample, gaussian_copula_sample
-from .zero_inflated_negbin import (
-    zero_inflated_negbin_sample,
-    zero_inflated_negbin_copula_sample,
-)
-from .zero_inflated_poisson import zero_inflated_poisson_sample
-
-__all__ = [
-    "negbin_sample",
-    "negbin_copula_sample",
-    "poisson_sample",
-    "poisson_copula_sample",
-    "bernoulli_sample",
-    "bernoulli_copula_sample",
-    "gaussian_regression_sample",
-    "gaussian_copula_sample",
-    "zero_inflated_negbin_sample",
-    "zero_inflated_negbin_copula_sample",
-    "zero_inflated_poisson_sample",
-]
scdesigner/samplers/bernoulli.py
DELETED
@@ -1,27 +0,0 @@
-from . import glm_factory as glm
-from scipy.stats import bernoulli
-from typing import Union
-import numpy as np
-
-
-def bernoulli_regression_sample_array(local_parameters: dict) -> np.array:
-    theta = local_parameters["mean"]
-    return bernoulli(theta).rvs()
-
-
-def bernoulli_copula_sample_array(
-    local_parameters: dict, covariance: Union[dict, np.array], groups: dict
-) -> np.array:
-    # initialize uniformized gaussian samples
-    N, G = local_parameters["mean"].shape
-    u = glm.gaussian_copula_pseudo_obs(N, G, covariance, groups)
-
-    theta = local_parameters["mean"]
-    return bernoulli(theta).ppf(u)
-
-
-bernoulli_sample = glm.glm_sample_factory(bernoulli_regression_sample_array)
-
-bernoulli_copula_sample = glm.gaussian_copula_sample_factory(
-    bernoulli_copula_sample_array
-)
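All of the deleted copula samplers share one pattern: draw correlated uniform pseudo-observations from a Gaussian copula, then invert them through the marginal quantile function (`.ppf`). The sketch below illustrates this for the Bernoulli case with a single group and a full covariance matrix; the function name and the single-group simplification are assumptions made for the example, not the package API.

```python
import numpy as np
from scipy.stats import norm, bernoulli

def bernoulli_copula_sample_sketch(theta, sigma, rng=None):
    """Illustrative sketch: correlated Bernoulli draws via a Gaussian copula (single group)."""
    rng = np.random.default_rng() if rng is None else rng
    n, g = theta.shape
    z = rng.multivariate_normal(np.zeros(g), sigma, size=n)   # correlated normals
    u = norm.cdf(z / np.sqrt(np.diag(sigma)))                 # uniformize each margin
    return bernoulli(theta).ppf(u)                            # invert through the Bernoulli CDF
```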
scdesigner/samplers/gaussian.py
DELETED
@@ -1,25 +0,0 @@
-from scipy.stats import norm
-from . import glm_factory as glm
-from typing import Union
-import numpy as np
-
-
-def gaussian_regression_sample_array(local_parameters: dict) -> np.array:
-    sigma, mu = local_parameters["sdev"], local_parameters["mean"]  # dataframes of shape (n, g)
-    return norm(loc=mu, scale=sigma).rvs()
-
-
-def gaussian_copula_sample_array(
-    local_parameters: dict, covariance: Union[dict, np.array], groups: dict
-) -> np.array:
-    # initialize uniformized gaussian samples
-    N, G = local_parameters["mean"].shape
-    u = glm.gaussian_copula_pseudo_obs(N, G, covariance, groups)
-
-    # transform the correlated uniforms to NB space
-    sigma, mu = local_parameters["sdev"], local_parameters["mean"]
-    return norm(loc=mu, scale=sigma).ppf(u)
-
-
-gaussian_regression_sample = glm.glm_sample_factory(gaussian_regression_sample_array)
-gaussian_copula_sample = glm.gaussian_copula_sample_factory(gaussian_copula_sample_array)
scdesigner/samplers/glm_factory.py
DELETED
@@ -1,103 +0,0 @@
-import numpy as np
-import pandas as pd
-import anndata as ad
-from typing import Union
-from scipy.stats import norm
-
-
-def glm_sample_factory(sample_array):
-    def sampler(local_parameters: dict, obs: pd.DataFrame) -> ad.AnnData:
-        samples = sample_array(local_parameters)
-        result = ad.AnnData(X=samples, obs=obs)
-        result.var_names = local_parameters["mean"].columns
-        return result
-    return sampler
-
-def gaussian_copula_pseudo_obs(N, G, sigma, groups):
-
-    # Import here to avoid circular imports
-    from ..estimators.gaussian_copula_factory import FastCovarianceStructure
-
-    u = np.zeros((N, G))
-
-    # cycle across groups
-    for group, ix in groups.items():
-        # If sigma is not a dict, then every group shares the same sigma
-        if type(sigma) is not dict:
-            sigma = {group: sigma}
-
-        group_sigma = sigma[group]
-
-        # Handle FastCovarianceStructure
-        if isinstance(group_sigma, FastCovarianceStructure):
-            u[ix] = _fast_copula_pseudo_obs(len(ix), group_sigma)
-        else:
-            # Traditional full covariance matrix approach
-            z = np.random.multivariate_normal(
-                mean=np.zeros(G), cov=group_sigma, size=len(ix)
-            )
-            normal_distn = norm(0, np.diag(group_sigma ** 0.5))
-            u[ix] = normal_distn.cdf(z)
-    return u
-
-
-def _fast_copula_pseudo_obs(n_samples, fast_cov_struct):
-    """
-    Efficient pseudo-observation generation using FastCovarianceStructure.
-
-    This function separately samples:
-    1. Top-k genes using full multivariate normal with their covariance matrix
-    2. Remaining genes using independent normal with their individual variances
-
-    Parameters:
-    -----------
-    n_samples : int
-        Number of samples to generate for this group
-    fast_cov_struct : FastCovarianceStructure
-        Structure containing top-k covariance and remaining variances
-
-    Returns:
-    --------
-    np.ndarray : Pseudo-observations with shape (n_samples, total_genes)
-    """
-    u = np.zeros((n_samples, fast_cov_struct.total_genes))
-
-    # Sample top-k genes with full covariance
-    if fast_cov_struct.top_k > 0:
-        z_top_k = np.random.multivariate_normal(
-            mean=np.zeros(fast_cov_struct.top_k),
-            cov=fast_cov_struct.top_k_cov,
-            size=n_samples
-        )
-
-        # Convert to uniform via marginal CDFs
-        top_k_std = np.sqrt(np.diag(fast_cov_struct.top_k_cov))
-        normal_distn_top_k = norm(0, top_k_std)
-        u[:, fast_cov_struct.top_k_indices] = normal_distn_top_k.cdf(z_top_k)
-
-    # Sample remaining genes independently
-    if len(fast_cov_struct.remaining_indices) > 0:
-        remaining_std = np.sqrt(fast_cov_struct.remaining_var)
-        z_remaining = np.random.normal(
-            loc=0,
-            scale=remaining_std,
-            size=(n_samples, len(fast_cov_struct.remaining_indices))
-        )
-
-        # Convert to uniform via marginal CDFs
-        normal_distn_remaining = norm(0, remaining_std)
-        u[:, fast_cov_struct.remaining_indices] = normal_distn_remaining.cdf(z_remaining)
-
-    return u
-
-
-def gaussian_copula_sample_factory(copula_sample_array):
-    def sampler(
-        local_parameters: dict, covariance: Union[dict, np.array], groups: dict, obs: pd.DataFrame
-    ) -> ad.AnnData:
-        samples = copula_sample_array(local_parameters, covariance, groups)
-        result = ad.AnnData(X=samples, obs=obs)
-        result.var_names = local_parameters["mean"].columns
-        return result
-    return sampler
-