pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pertpy/__init__.py +1 -1
- pertpy/_doc.py +19 -0
- pertpy/data/_datasets.py +1 -1
- pertpy/metadata/_cell_line.py +18 -8
- pertpy/metadata/_compound.py +3 -4
- pertpy/metadata/_metadata.py +1 -1
- pertpy/preprocessing/_guide_rna.py +114 -13
- pertpy/preprocessing/_guide_rna_mixture.py +179 -0
- pertpy/tools/__init__.py +1 -1
- pertpy/tools/_augur.py +64 -86
- pertpy/tools/_cinemaot.py +21 -17
- pertpy/tools/_coda/_base_coda.py +90 -117
- pertpy/tools/_dialogue.py +32 -40
- pertpy/tools/_differential_gene_expression/__init__.py +1 -2
- pertpy/tools/_differential_gene_expression/_base.py +486 -112
- pertpy/tools/_differential_gene_expression/_edger.py +30 -21
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +15 -29
- pertpy/tools/_differential_gene_expression/_statsmodels.py +0 -11
- pertpy/tools/_distances/_distances.py +71 -56
- pertpy/tools/_enrichment.py +16 -8
- pertpy/tools/_milo.py +54 -50
- pertpy/tools/_mixscape.py +307 -208
- pertpy/tools/_perturbation_space/_perturbation_space.py +40 -31
- pertpy/tools/_perturbation_space/_simple.py +48 -0
- pertpy/tools/_scgen/_scgen.py +35 -27
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/METADATA +6 -6
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/RECORD +29 -28
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/WHEEL +1 -1
- pertpy/tools/_differential_gene_expression/_formulaic.py +0 -189
- {pertpy-0.9.4.dist-info → pertpy-0.10.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,189 +0,0 @@
|
|
1
|
-
"""Helpers to interact with Formulaic Formulas
|
2
|
-
|
3
|
-
Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
|
4
|
-
* A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
|
5
|
-
* A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
|
6
|
-
* A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
|
7
|
-
"""
|
8
|
-
|
9
|
-
from collections import defaultdict
|
10
|
-
from collections.abc import Mapping, Sequence
|
11
|
-
from dataclasses import dataclass
|
12
|
-
from typing import Any
|
13
|
-
|
14
|
-
from formulaic import FactorValues, ModelSpec
|
15
|
-
from formulaic.materializers import PandasMaterializer
|
16
|
-
from formulaic.materializers.types import EvaluatedFactor
|
17
|
-
from formulaic.parser.types import Factor
|
18
|
-
from interface_meta import override
|
19
|
-
|
20
|
-
|
21
|
-
@dataclass
class FactorMetadata:
    """Store (relevant) metadata for a factor of a formula.

    The last three fields (`drop_field`, `column_names`, `colname_format`) default to `None`
    and can be filled in after construction.
    """

    name: str
    """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`"""

    reduced_rank: bool
    """Whether a column will be dropped because it is redundant"""

    custom_encoder: bool
    """Whether or not a custom encoder (e.g. `C(...)`) was used."""

    categories: Sequence[str]
    """The unique categories in this factor (after applying `drop_rows`)"""

    kind: Factor.Kind
    """Type of the factor"""

    drop_field: str | None = None
    """The category that is dropped.

    Note that
      * this may also be populated if `reduced_rank = False`
      * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`).
    """

    column_names: Sequence[str] | None = None
    """The column names for this factor included in the design matrix.

    This may be the same as `categories` if the default encoder is used, or
    categories without the base level if a custom encoder (e.g. `C(...)`) is used.
    """

    colname_format: str | None = None
    """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`"""

    @property
    def base(self) -> str | None:
        """The base category for this categorical factor.

        This is derived from `drop_field` (for default encoding) or by comparing the column names in
        the design matrix with all categories (for custom encoding, e.g. `C(...)`).

        Returns:
            `None` if the factor is not reduced rank, otherwise the base category.

        Raises:
            AssertionError: If the base category cannot be determined unambiguously, i.e. a custom
                encoder left zero or several categories without a column, or `drop_field` is unset
                for the default encoding.
        """
        if not self.reduced_rank:
            return None

        if self.custom_encoder:
            # The base level is the one category that has no column of its own.
            candidates = set(self.categories) - set(self.column_names)
            # Raised explicitly (rather than via `assert`) so the check is not stripped under `-O`,
            # where an arbitrary element would otherwise be returned silently.
            if len(candidates) != 1:
                raise AssertionError(
                    f"Expected exactly one base category for factor '{self.name}', found {len(candidates)}"
                )
            return candidates.pop()

        if self.drop_field is None:
            raise AssertionError(f"`drop_field` is not set for reduced-rank factor '{self.name}'")
        return self.drop_field
|
76
|
-
|
77
|
-
|
78
|
-
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Keeps track of categorical factors used in a model specification by generating a custom materializer.

    This materializer reports back metadata upon materialization of the model matrix.
    Each call creates fresh storage dictionaries and a new materializer class that closes over them.

    Returns:
        - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`.
        - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms
          but maps to factors rather than terms, named `variable_to_factors`.
        - A materializer class tied to the specific instance of `factor_storage`.
    """
    # There can be multiple FactorMetadata entries per factor name, for instance when formulaic generates an
    # interaction term, it generates the factor with both full rank and reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """An extension of the PandasMaterializer that records all categorical variables and their (base) categories."""

        # NOTE(review): presumably consumed by formulaic's materializer registry — confirm against formulaic docs.
        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Initialize the Materializer.

            Args:
                data: Passed to PandasMaterializer.
                context: Passed to PandasMaterializer
                record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class
                    is supposed to record factor metadata. Only the instance that is used for building the design
                    matrix should record the metadata. All other instances (e.g. used to generate contrast vectors)
                    should not record metadata to not overwrite the specifications from the design matrix.
                **params: Passed to PandasMaterializer
            """
            # Both attributes point at the dictionaries of the enclosing function call when recording is
            # enabled; `None` disables recording for this instance.
            self.factor_metadata_storage = factor_storage if record_factor_metadata else None
            self.variable_to_factors = variable_to_factors if record_factor_metadata else None
            # temporary pointer to metadata of factor that is currently evaluated
            self._current_factor: FactorMetadata | None = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Function is called just before the factor is evaluated.

            We can record some metadata, before we call the original function.
            The partially filled metadata is parked in `self._current_factor`; it is completed and stored
            by `_flatten_encoded_evaled_factor`.
            """
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"
            if self.factor_metadata_storage is not None:
                # Don't store if the factor is cached (then we should already have recorded it)
                if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache:
                    assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
                else:
                    # Remember which factor expressions each variable takes part in.
                    for var in factor.variables:
                        self.variable_to_factors[var].add(factor.expr)
                    self._current_factor = FactorMetadata(
                        name=factor.expr,
                        reduced_rank=reduced_rank,
                        # Sorted unique values that remain after removing the rows at positions `drop_rows`.
                        # Assumes `factor.values` is pandas-Series-like (`.index`, `.drop`, `.unique`) — TODO confirm.
                        categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )
            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """
            Function is called at the end, before the design matrix gets materialized.

            Here we have access to additional metadata, such as `drop_field`.
            If a factor is currently being recorded, its metadata is completed, appended to the storage
            under `name`, and the temporary pointer is reset.
            """
            if self._current_factor is not None:
                assert self._current_factor.name == name
                self._current_factor.drop_field = values.__formulaic_metadata__.drop_field
                self._current_factor.column_names = values.__formulaic_metadata__.column_names
                self._current_factor.colname_format = values.__formulaic_metadata__.format
                self.factor_metadata_storage[name].append(self._current_factor)
                # Reset so the next `_encode_evaled_factor` call starts from a clean state.
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
|
170
|
-
|
171
|
-
|
172
|
-
class AmbiguousAttributeError(ValueError):
    """Signal that an attribute does not have one common value across a collection of objects."""
|
174
|
-
|
175
|
-
|
176
|
-
def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Return the value of `attr` that is shared by every object in `objs`.

    Args:
        objs: Non-empty sequence of objects to inspect.
        attr: Name of the attribute looked up on each object.

    Returns:
        The attribute value of the first object, once no other object is found to differ from it.

    Raises:
        ValueError: If `objs` is empty.
        AmbiguousAttributeError: If any object's attribute value differs from the first object's.
    """
    if not objs:
        raise ValueError("Collection is empty")

    reference = getattr(objs[0], attr)
    # The first element serves as the reference, so only the remaining ones need comparing.
    if any(getattr(other, attr) != reference for other in objs[1:]):
        raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")
    return reference
|
File without changes
|