scdesigner 0.0.5__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. scdesigner/base/__init__.py +8 -0
  2. scdesigner/base/copula.py +416 -0
  3. scdesigner/base/marginal.py +391 -0
  4. scdesigner/base/simulator.py +59 -0
  5. scdesigner/copulas/__init__.py +8 -0
  6. scdesigner/copulas/standard_copula.py +645 -0
  7. scdesigner/datasets/__init__.py +5 -0
  8. scdesigner/datasets/pancreas.py +39 -0
  9. scdesigner/distributions/__init__.py +19 -0
  10. scdesigner/{minimal → distributions}/bernoulli.py +42 -14
  11. scdesigner/distributions/gaussian.py +114 -0
  12. scdesigner/distributions/negbin.py +121 -0
  13. scdesigner/distributions/negbin_irls.py +72 -0
  14. scdesigner/distributions/negbin_irls_funs.py +456 -0
  15. scdesigner/distributions/poisson.py +88 -0
  16. scdesigner/{minimal → distributions}/zero_inflated_negbin.py +39 -10
  17. scdesigner/distributions/zero_inflated_poisson.py +103 -0
  18. scdesigner/simulators/__init__.py +24 -28
  19. scdesigner/simulators/composite.py +239 -0
  20. scdesigner/simulators/positive_nonnegative_matrix_factorization.py +477 -0
  21. scdesigner/simulators/scd3.py +486 -0
  22. scdesigner/transform/__init__.py +8 -6
  23. scdesigner/{minimal → transform}/transform.py +1 -1
  24. scdesigner/{minimal → utils}/kwargs.py +4 -1
  25. {scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/METADATA +1 -1
  26. scdesigner-0.0.10.dist-info/RECORD +28 -0
  27. {scdesigner-0.0.5.dist-info → scdesigner-0.0.10.dist-info}/WHEEL +1 -1
  28. scdesigner/data/__init__.py +0 -16
  29. scdesigner/data/formula.py +0 -137
  30. scdesigner/data/group.py +0 -123
  31. scdesigner/data/sparse.py +0 -39
  32. scdesigner/diagnose/__init__.py +0 -65
  33. scdesigner/diagnose/aic_bic.py +0 -119
  34. scdesigner/diagnose/plot.py +0 -242
  35. scdesigner/estimators/__init__.py +0 -32
  36. scdesigner/estimators/bernoulli.py +0 -85
  37. scdesigner/estimators/gaussian.py +0 -121
  38. scdesigner/estimators/gaussian_copula_factory.py +0 -367
  39. scdesigner/estimators/glm_factory.py +0 -75
  40. scdesigner/estimators/negbin.py +0 -153
  41. scdesigner/estimators/pnmf.py +0 -160
  42. scdesigner/estimators/poisson.py +0 -124
  43. scdesigner/estimators/zero_inflated_negbin.py +0 -195
  44. scdesigner/estimators/zero_inflated_poisson.py +0 -85
  45. scdesigner/format/__init__.py +0 -4
  46. scdesigner/format/format.py +0 -20
  47. scdesigner/format/print.py +0 -30
  48. scdesigner/minimal/__init__.py +0 -17
  49. scdesigner/minimal/composite.py +0 -119
  50. scdesigner/minimal/copula.py +0 -205
  51. scdesigner/minimal/formula.py +0 -23
  52. scdesigner/minimal/gaussian.py +0 -65
  53. scdesigner/minimal/loader.py +0 -211
  54. scdesigner/minimal/marginal.py +0 -154
  55. scdesigner/minimal/negbin.py +0 -73
  56. scdesigner/minimal/positive_nonnegative_matrix_factorization.py +0 -231
  57. scdesigner/minimal/scd3.py +0 -96
  58. scdesigner/minimal/scd3_instances.py +0 -50
  59. scdesigner/minimal/simulator.py +0 -25
  60. scdesigner/minimal/standard_copula.py +0 -383
  61. scdesigner/predictors/__init__.py +0 -15
  62. scdesigner/predictors/bernoulli.py +0 -9
  63. scdesigner/predictors/gaussian.py +0 -16
  64. scdesigner/predictors/negbin.py +0 -17
  65. scdesigner/predictors/poisson.py +0 -12
  66. scdesigner/predictors/zero_inflated_negbin.py +0 -18
  67. scdesigner/predictors/zero_inflated_poisson.py +0 -18
  68. scdesigner/samplers/__init__.py +0 -23
  69. scdesigner/samplers/bernoulli.py +0 -27
  70. scdesigner/samplers/gaussian.py +0 -25
  71. scdesigner/samplers/glm_factory.py +0 -103
  72. scdesigner/samplers/negbin.py +0 -25
  73. scdesigner/samplers/poisson.py +0 -25
  74. scdesigner/samplers/zero_inflated_negbin.py +0 -40
  75. scdesigner/samplers/zero_inflated_poisson.py +0 -16
  76. scdesigner/simulators/composite_regressor.py +0 -72
  77. scdesigner/simulators/glm_simulator.py +0 -167
  78. scdesigner/simulators/pnmf_regression.py +0 -61
  79. scdesigner/transform/amplify.py +0 -14
  80. scdesigner/transform/mask.py +0 -33
  81. scdesigner/transform/nullify.py +0 -25
  82. scdesigner/transform/split.py +0 -23
  83. scdesigner/transform/substitute.py +0 -14
  84. scdesigner-0.0.5.dist-info/RECORD +0 -66
@@ -0,0 +1,8 @@
1
+ """Base classes for scDesigner simulation framework."""
2
+
3
+ from .copula import CovarianceStructure
4
+
5
+ __all__ = [
6
+ "CovarianceStructure",
7
+ ]
8
+
@@ -0,0 +1,416 @@
1
+ from typing import Dict, Callable, Tuple
2
+ import torch
3
+ from anndata import AnnData
4
+ from ..data.loader import adata_loader
5
+ from abc import ABC, abstractmethod
6
+ import numpy as np
7
+ import pandas as pd
8
+ from typing import Optional, Union
9
+
10
+
11
class Copula(ABC):
    """Abstract Copula Class

    The scDesign3 model is built from two components: a collection of marginal
    models, and a copula to tie them together. This class implements an abstract
    version of the copula. Subclasses may implement various types of
    regularization or dependencies on experimental and biological conditions.
    Despite these differences, every subclass must provide utilities for
    fitting and for sampling dependent uniform variables.

    Parameters
    ----------
    formula : str or dict
        Describes the dependence of the copula on experimental or biological
        conditions. A dictionary maps a key (e.g. ``"group"``) to a formula
        string. A bare string is wrapped as ``{"group": formula}`` when
        ``setup_data`` merges it with the marginal formulas.
    **kwargs
        Accepted for subclass compatibility; unused by the base class.

    Attributes
    ----------
    loader : torch.utils.data.DataLoader
        A data loader object used to estimate the covariance one batch at a
        time. Populated by ``setup_data``.
    n_outcomes : int
        The number of features modeled by this copula (the second dimension
        of the first batch yielded by ``loader``). For example, this
        corresponds to the number of genes being simulated.
    parameters : Dict[str, CovarianceStructure]
        A dictionary of CovarianceStructure objects, one per category
        specified in the original formula. ``None`` until a subclass
        populates it.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc3k()[:, :300]
    >>>
    >>> class DummyCopula(Copula):
    ...     def fit(self):
    ...         pass
    ...     def likelihood(self):
    ...         pass
    ...     def num_params(self):
    ...         return 0
    ...     def pseudo_obs(self, x_dict):
    ...         return np.random.uniform(size=(x_dict["group"].shape[0], self.n_outcomes))
    ...
    >>> model = DummyCopula({"group": "~ 1"})
    >>> model.setup_data(adata, {"group": "~ 1"})
    >>> model.fit()
    """

    def __init__(self, formula: Union[str, dict], **kwargs):
        self.formula = formula
        self.loader = None
        self.n_outcomes = None
        self.parameters = None  # Should be a dictionary of CovarianceStructure objects

    def setup_data(self, adata: "AnnData", marginal_formula: Dict[str, str], batch_size: int = 1024, **kwargs):
        """
        Populate the .loader attribute

        Parameters
        ----------
        adata : AnnData
            This is the object on which we want to estimate the simulator. This
            serves as the template for all downstream fitting.
        marginal_formula : Dict[str, str]
            A dictionary mapping marginal parameter names to formula strings.
            It is merged with the copula formula; on a key clash the marginal
            entry wins (dict union semantics).
        batch_size : int, optional
            Forwarded to ``adata_loader``.
        **kwargs
            Additional keyword arguments forwarded to ``adata_loader``.

        Returns
        -------
        None
            This method does not return anything but populates the self.adata,
            formula, loader, and n_outcomes attributes based on the provided
            adata input object.
        """
        self.adata = adata

        # `str | dict` raises TypeError, so a bare-string copula formula is
        # first wrapped under the "group" key used by the class example.
        copula_formula = self.formula
        if isinstance(copula_formula, str):
            copula_formula = {"group": copula_formula}

        self.formula = copula_formula | marginal_formula
        self.loader = adata_loader(adata, self.formula, batch_size=batch_size, **kwargs)

        # Peek at one batch only to learn the number of modeled features.
        X_batch, _ = next(iter(self.loader))
        self.n_outcomes = X_batch.shape[1]

    def _target_groups(self, group: Union[str, list, None]) -> list:
        """Resolve the ``group`` argument into a list of keys of self.parameters."""
        if group is None:
            declared = getattr(self, "groups", None)
            if declared is not None:
                return list(declared)
            # The base class never sets `groups`; fall back to every fitted group.
            if self.parameters is None:
                raise RuntimeError(
                    "Copula parameters are not available; fit the copula before modifying correlations."
                )
            return list(self.parameters)
        if isinstance(group, str):
            return [group]
        return list(group)

    def decorrelate(self, row_pattern: str, col_pattern: str, group: Union[str, list, None] = None):
        """
        Decorrelate the covariance matrix for the given row and column patterns.

        Delegates to the ``decorrelate`` method of the covariance structure
        stored in ``self.parameters`` for each selected group. This can be used
        to generate synthetic null data where particular features are forced
        to be uncorrelated with one another.

        Parameters
        ----------
        row_pattern : str
            The regex pattern for the row names to match.
        col_pattern : str
            The regex pattern for the column names to match.
        group : Union[str, list, None], optional
            The group or groups to apply the transformation to. If None, the
            transformation is applied to all groups.

        Returns
        -------
        None
            This method does not return anything but modifies self parameters as
            a side effect.
        """
        for g in self._target_groups(group):
            self.parameters[g].decorrelate(row_pattern, col_pattern)

    def correlate(self, factor: float, row_pattern: str, col_pattern: str, group: Union[str, list, None] = None):
        """
        Multiply selected off-diagonal entries by factor.

        To adjust the signal strength in a power analysis, we may want to
        rescale the correlation for specific entries in the covariance matrix.
        Delegates to the ``correlate`` method of the covariance structure
        stored in ``self.parameters`` for each selected group.

        Parameters
        ----------
        factor : float
            The factor to multiply the off-diagonal entries by.
        row_pattern : str
            The regex pattern for the row names to match.
        col_pattern : str
            The regex pattern for the column names to match.
        group : Union[str, list, None], optional
            The group or groups to apply the transformation to. If None, the
            transformation is applied to all groups.

        Returns
        -------
        None
            This method does not return anything but modifies self parameters as
            a side effect.
        """
        # Note the argument order: the per-group method takes `factor` last.
        for g in self._target_groups(group):
            self.parameters[g].correlate(row_pattern, col_pattern, factor)

    @abstractmethod
    def fit(self, uniformizer: Callable, **kwargs):
        """
        Fit a Copula

        Copula models are estimated by transforming the observed data onto the
        [0, 1] space of percentiles. See the .invert() method within class
        Marginal.

        Parameters
        ----------
        uniformizer : Callable
            Function to transform data to uniform marginals. See .invert()
            within class Marginal for an example.
        **kwargs
            Additional keyword arguments.
        """
        raise NotImplementedError

    @abstractmethod
    def pseudo_obs(self, x_dict: Dict):
        """
        Sample from a Copula

        Dependent uniform variables can be sampled from the copula conditional
        on a specific design matrix X (encoding biological and experimental
        covariates). For example, this will sample uniform variables with
        dependence reflecting the cell type specified by X.

        Parameters
        ----------
        x_dict : Dict
            A dictionary of tensors, with one key/value pair per parameter.
            These tensors are the numerical design matrices implied by the
            initializing formulas.
        """
        raise NotImplementedError

    @abstractmethod
    def likelihood(self, uniformizer: Callable, batch: Tuple[torch.Tensor, Dict[str, torch.Tensor]]):
        """
        Evaluate the copula likelihood on one batch (defined by subclasses).

        Parameters
        ----------
        uniformizer : Callable
            Function to transform data to uniform marginals.
        batch : Tuple[torch.Tensor, Dict[str, torch.Tensor]]
            Batch of data.
        """
        raise NotImplementedError

    @abstractmethod
    def num_params(self, **kwargs):
        """
        Covariance Parameters

        This returns the number of free parameters in the overall copula model.
        This is useful for assessing model complexity.

        Parameters
        ----------
        **kwargs
            Additional keyword arguments.
        """
        raise NotImplementedError
239
+
240
+
241
class CovarianceStructure:
    """
    Efficient storage for covariance matrices in copula-based gene expression modeling.

    This class stores either a full covariance matrix, or a covariance block
    for a subset of "modeled" genes plus diagonal variances for the remaining
    genes.

    Attributes
    ----------
    cov : pd.DataFrame
        Covariance matrix for modeled genes with gene names as index/columns
    modeled_indices : np.ndarray
        Indices of modeled genes in original ordering
    remaining_var : pd.Series or None
        Diagonal variances for remaining genes, None if full matrix stored
    remaining_indices : np.ndarray or None
        Indices of remaining genes in original ordering
    num_modeled_genes : int
        Number of modeled genes
    num_remaining_genes : int
        Number of remaining genes (0 if full matrix stored)
    total_genes : int
        Total number of genes

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>>
    >>> sigma = np.random.uniform(size=(5, 5))
    >>> modeled_names = ["A", "B", "C", "D", "E"]
    >>> sigma = pd.DataFrame(sigma, columns=modeled_names, index=modeled_names)
    >>> covariance = CovarianceStructure(sigma, modeled_names)
    """

    def __init__(self, cov: Union[np.ndarray, pd.DataFrame],
                 modeled_names: pd.Index,
                 modeled_indices: Optional[np.ndarray] = None,
                 remaining_var: Optional[np.ndarray] = None,
                 remaining_indices: Optional[np.ndarray] = None,
                 remaining_names: Optional[pd.Index] = None):
        """
        Initialize a CovarianceStructure object.

        Parameters
        ----------
        cov : np.ndarray or pd.DataFrame
            Covariance matrix for modeled genes, shape (n_modeled_genes, n_modeled_genes)
        modeled_names : pd.Index
            Gene names for the modeled genes
        modeled_indices : Optional[np.ndarray], optional
            Indices of modeled genes in original ordering. Defaults to sequential indices.
        remaining_var : Optional[np.ndarray], optional
            Diagonal variances for remaining genes, shape (n_remaining_genes,)
        remaining_indices : Optional[np.ndarray], optional
            Indices of remaining genes in original ordering
        remaining_names : Optional[pd.Index], optional
            Gene names for remaining genes
        """
        self.cov = pd.DataFrame(cov, index=modeled_names, columns=modeled_names)

        if modeled_indices is not None:
            self.modeled_indices = modeled_indices
        else:
            self.modeled_indices = np.arange(len(modeled_names))

        if remaining_var is not None:
            self.remaining_var = pd.Series(remaining_var, index=remaining_names)
        else:
            self.remaining_var = None

        self.remaining_indices = remaining_indices
        self.num_modeled_genes = len(modeled_names)
        self.num_remaining_genes = len(remaining_indices) if remaining_indices is not None else 0
        self.total_genes = self.num_modeled_genes + self.num_remaining_genes

    def __repr__(self):
        if self.remaining_var is None:
            return repr(self.cov)
        # Adjacent f-strings instead of a backslash continuation, which would
        # embed the next source line's indentation in the output.
        return (
            f"CovarianceStructure(modeled_genes={self.num_modeled_genes}, "
            f"total_genes={self.total_genes})"
        )

    def _repr_html_(self):
        """
        Jupyter Notebook display

        Returns
        -------
        str
            HTML representation of the object.
        """
        if self.remaining_var is None:
            return self.cov._repr_html_()

        html = f"<b>CovarianceStructure:</b> {self.num_modeled_genes} modeled genes, {self.total_genes} total<br>"
        html += "<h4>Modeled Covariance Matrix</h4>" + self.cov._repr_html_()
        html += "<h4>Remaining Gene Variances</h4>" + self.remaining_var.to_frame("variance").T._repr_html_()
        return html

    def _off_diagonal_mask(self, row_pattern: str, col_pattern: str):
        """Boolean mask of the off-diagonal entries of self.cov selected by the patterns."""
        from ..transform.transform import data_frame_mask

        # Presumably data_frame_mask(df, rows, cols) marks the cells whose row
        # and column names match the respective patterns -- TODO confirm.
        col_hits = data_frame_mask(self.cov, ".", col_pattern)
        row_hits = data_frame_mask(self.cov, row_pattern, ".")
        mask = (col_hits | row_hits)
        np.fill_diagonal(mask, False)  # the diagonal (variances) is never touched
        return mask

    def _replace_cov_values(self, values: np.ndarray):
        """Rebind self.cov to ``values`` while keeping the gene labels."""
        self.cov = pd.DataFrame(values, index=self.cov.index, columns=self.cov.columns)

    def decorrelate(self, row_pattern: str, col_pattern: str):
        """
        Zero out selected off-diagonal entries of the modeled covariance.

        An off-diagonal entry is set to zero when it is selected by the
        row-pattern mask OR by the column-pattern mask (the two masks are
        combined with a union). This can be used to generate synthetic null
        data where particular features are forced to be uncorrelated.

        NOTE(review): the masks are combined with OR, not AND; confirm that a
        union (rather than an intersection) of the two patterns is intended.

        Parameters
        ----------
        row_pattern : str
            The regex pattern for the row names to match.
        col_pattern : str
            The regex pattern for the column names to match.
        """
        mask = self._off_diagonal_mask(row_pattern, col_pattern)
        # Edit a copy: with pandas Copy-on-Write, DataFrame.values is a
        # read-only array, so writing through it raises.
        values = self.cov.to_numpy(copy=True)
        values[mask] = 0
        self._replace_cov_values(values)

    def correlate(self, row_pattern: str, col_pattern: str, factor: float):
        """
        Multiply selected off-diagonal entries by factor.

        To adjust the signal strength in a power analysis, we may want to
        rescale the correlation for specific entries in the covariance matrix.
        The entries are selected exactly as in ``decorrelate`` (union of the
        row-pattern and column-pattern masks, diagonal excluded).

        Parameters
        ----------
        row_pattern : str
            The regex pattern for the row names to match.
        col_pattern : str
            The regex pattern for the column names to match.
        factor : float
            The factor to multiply the off-diagonal entries by.
        """
        mask = self._off_diagonal_mask(row_pattern, col_pattern)
        # Edit a copy for the same Copy-on-Write reason as in decorrelate.
        values = self.cov.to_numpy(copy=True)
        values[mask] = values[mask] * factor
        self._replace_cov_values(values)

    @property
    def shape(self):
        """Shape of the full covariance matrix, (total_genes, total_genes)."""
        return (self.total_genes, self.total_genes)

    def to_full_matrix(self):
        """
        Convert to full covariance matrix for compatibility and debugging.

        Returns
        -------
        np.ndarray
            Full covariance matrix with shape (total_genes, total_genes).
            When no remaining genes are stored this is the array backing
            ``self.cov`` (not a fresh copy).
        """
        if self.remaining_var is None:
            return self.cov.values

        full_cov = np.zeros((self.total_genes, self.total_genes))

        # Fill in the modeled block at its original positions.
        ix_modeled = np.ix_(self.modeled_indices, self.modeled_indices)
        full_cov[ix_modeled] = self.cov.values

        # Fill in diagonal for remaining genes
        full_cov[self.remaining_indices, self.remaining_indices] = self.remaining_var.values

        return full_cov