pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +1 -1
- pertpy/_doc.py +19 -0
- pertpy/data/_datasets.py +1 -1
- pertpy/metadata/_cell_line.py +18 -8
- pertpy/metadata/_compound.py +3 -4
- pertpy/metadata/_metadata.py +1 -1
- pertpy/preprocessing/_guide_rna.py +114 -13
- pertpy/preprocessing/_guide_rna_mixture.py +179 -0
- pertpy/tools/__init__.py +1 -1
- pertpy/tools/_augur.py +64 -86
- pertpy/tools/_cinemaot.py +21 -17
- pertpy/tools/_coda/_base_coda.py +90 -117
- pertpy/tools/_dialogue.py +32 -40
- pertpy/tools/_differential_gene_expression/__init__.py +1 -2
- pertpy/tools/_differential_gene_expression/_base.py +486 -112
- pertpy/tools/_differential_gene_expression/_edger.py +30 -21
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +15 -29
- pertpy/tools/_differential_gene_expression/_statsmodels.py +0 -11
- pertpy/tools/_distances/_distances.py +71 -56
- pertpy/tools/_enrichment.py +16 -8
- pertpy/tools/_milo.py +54 -50
- pertpy/tools/_mixscape.py +307 -208
- pertpy/tools/_perturbation_space/_perturbation_space.py +40 -31
- pertpy/tools/_perturbation_space/_simple.py +48 -0
- pertpy/tools/_scgen/_scgen.py +35 -27
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/METADATA +6 -6
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/RECORD +29 -28
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/WHEEL +1 -1
- pertpy/tools/_differential_gene_expression/_formulaic.py +0 -189
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,189 +0,0 @@
|
|
1
|
-
"""Helpers to interact with Formulaic Formulas
|
2
|
-
|
3
|
-
Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
|
4
|
-
* A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
|
5
|
-
* A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
|
6
|
-
* A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
|
7
|
-
"""
|
8
|
-
|
9
|
-
from collections import defaultdict
|
10
|
-
from collections.abc import Mapping, Sequence
|
11
|
-
from dataclasses import dataclass
|
12
|
-
from typing import Any
|
13
|
-
|
14
|
-
from formulaic import FactorValues, ModelSpec
|
15
|
-
from formulaic.materializers import PandasMaterializer
|
16
|
-
from formulaic.materializers.types import EvaluatedFactor
|
17
|
-
from formulaic.parser.types import Factor
|
18
|
-
from interface_meta import override
|
19
|
-
|
20
|
-
|
21
|
-
@dataclass
class FactorMetadata:
    """Store (relevant) metadata for a factor of a formula."""

    name: str
    """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`"""

    reduced_rank: bool
    """Whether a column will be dropped because it is redundant"""

    custom_encoder: bool
    """Whether or not a custom encoder (e.g. `C(...)`) was used."""

    categories: Sequence[str]
    """The unique categories in this factor (after applying `drop_rows`)"""

    kind: Factor.Kind
    """Type of the factor"""

    drop_field: str | None = None
    """The category that is dropped.

    Note that
      * this may also be populated if `reduced_rank = False`
      * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`).
    """

    column_names: Sequence[str] | None = None
    """The column names for this factor included in the design matrix.

    This may be the same as `categories` if the default encoder is used, or
    categories without the base level if a custom encoder (e.g. `C(...)`) is used.
    """

    colname_format: str | None = None
    """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`"""

    @property
    def base(self) -> str | None:
        """The base category for this categorical factor.

        This is derived from `drop_field` (for default encoding) or by comparing the column names in
        the design matrix with all categories (for custom encoding, e.g. `C(...)`).

        Returns:
            The base category, or `None` if the factor is not encoded with reduced rank.

        Raises:
            AssertionError: If the base category cannot be determined unambiguously, i.e. `drop_field` is
                unset for a default-encoded factor, or `categories` and `column_names` do not differ by
                exactly one entry for a custom-encoded factor.
        """
        if not self.reduced_rank:
            return None

        # The invariants below are raised explicitly rather than written as `assert` statements so that
        # they are still enforced when Python runs with `-O` (which strips `assert`).
        if not self.custom_encoder:
            if self.drop_field is None:
                raise AssertionError(f"No drop_field recorded for reduced-rank factor '{self.name}'")
            return self.drop_field

        # With a custom encoder the base level is the one category that has no column in the design matrix.
        candidates = set(self.categories) - set(self.column_names)
        if len(candidates) != 1:
            raise AssertionError(
                f"Expected exactly one base category for factor '{self.name}', found {len(candidates)}"
            )
        return candidates.pop()
|
76
|
-
|
77
|
-
|
78
|
-
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Create a custom materializer class together with the containers it reports factor metadata into.

    The returned materializer records metadata about the factors it encodes while the model matrix
    is being materialized.

    Returns:
        - `factor_storage`: a dictionary holding the recorded metadata entries for each factor name.
        - `variable_to_factors`: a dictionary mapping variables to factor names. It works similarly to
          model_spec.variable_terms, but maps to factors rather than terms.
        - A materializer class bound to these specific `factor_storage` / `variable_to_factors` instances.
    """
    # One factor name may collect several FactorMetadata entries: e.g. for an interaction term,
    # formulaic generates the factor with both full rank and reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """A PandasMaterializer subclass that records metadata for every factor it encodes."""

        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Initialize the Materializer.

            Args:
                data: Passed to PandasMaterializer.
                context: Passed to PandasMaterializer
                record_factor_metadata: Whether this instance records factor metadata. Only the instance
                    that builds the design matrix should record; other instances (e.g. those used to
                    generate contrast vectors) should not, so that they do not overwrite the
                    specifications from the design matrix.
                **params: Passed to PandasMaterializer
            """
            if record_factor_metadata:
                self.factor_metadata_storage = factor_storage
                self.variable_to_factors = variable_to_factors
            else:
                self.factor_metadata_storage = None
                self.variable_to_factors = None
            # Metadata of the factor that is currently being encoded; reset to None once it has been stored.
            self._current_factor: FactorMetadata = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Record what is known about the factor up front, then delegate to the parent implementation."""
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"

            storage = self.factor_metadata_storage
            if storage is not None:
                cache = self.encoded_cache
                already_encoded = factor.expr in cache or (factor.expr, reduced_rank) in cache
                if not already_encoded:
                    for variable in factor.variables:
                        self.variable_to_factors[variable].add(factor.expr)
                    retained_values = factor.values.drop(index=factor.values.index[drop_rows])
                    self._current_factor = FactorMetadata(
                        name=factor.expr,
                        reduced_rank=reduced_rank,
                        categories=tuple(sorted(retained_values.unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )
                else:
                    # A cached factor is not recorded again; it must have been recorded earlier.
                    assert factor.expr in storage, "Factor should be there since it's cached"

            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """Complete the pending metadata entry (e.g. with `drop_field`) and store it, then delegate."""
            pending = self._current_factor
            if pending is not None:
                assert pending.name == name
                encoded_metadata = values.__formulaic_metadata__
                pending.drop_field = encoded_metadata.drop_field
                pending.column_names = encoded_metadata.column_names
                pending.colname_format = encoded_metadata.format
                self.factor_metadata_storage[name].append(pending)
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
|
170
|
-
|
171
|
-
|
172
|
-
class AmbiguousAttributeError(ValueError):
    """Raised when an attribute does not have the same value across a collection of objects."""
|
174
|
-
|
175
|
-
|
176
|
-
def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Return the value of `attr` shared by all objects in `objs`.

    Args:
        objs: The objects to inspect.
        attr: Name of the attribute to read from every object.

    Returns:
        The attribute value of the first object, provided every other object compares equal on it.

    Raises:
        ValueError: If `objs` is empty.
        AmbiguousAttributeError: If any object's attribute differs from that of the first object.
    """
    if not objs:
        raise ValueError("Collection is empty")

    expected = getattr(objs[0], attr)

    # Stops at the first object whose attribute deviates from the reference value.
    if any(getattr(other, attr) != expected for other in objs[1:]):
        raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")

    return expected
|
File without changes
|