pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,189 +0,0 @@
1
- """Helpers to interact with Formulaic Formulas
2
-
3
- Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
4
- * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
5
- * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
6
- * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
7
- """
8
-
9
- from collections import defaultdict
10
- from collections.abc import Mapping, Sequence
11
- from dataclasses import dataclass
12
- from typing import Any
13
-
14
- from formulaic import FactorValues, ModelSpec
15
- from formulaic.materializers import PandasMaterializer
16
- from formulaic.materializers.types import EvaluatedFactor
17
- from formulaic.parser.types import Factor
18
- from interface_meta import override
19
-
20
-
@dataclass
class FactorMetadata:
    """Store (relevant) metadata for a factor of a formula."""

    name: str
    """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`"""

    reduced_rank: bool
    """Whether a column will be dropped because it is redundant"""

    custom_encoder: bool
    """Whether or not a custom encoder (e.g. `C(...)`) was used."""

    categories: Sequence[str]
    """The unique categories in this factor (after applying `drop_rows`)"""

    kind: Factor.Kind
    """Type of the factor"""

    drop_field: str | None = None
    """The category that is dropped.

    Note that
      * this may also be populated if `reduced_rank = False`
      * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`).
    """

    column_names: Sequence[str] | None = None
    """The column names for this factor included in the design matrix.

    This may be the same as `categories` if the default encoder is used, or
    categories without the base level if a custom encoder (e.g. `C(...)`) is used.
    """

    colname_format: str | None = None
    """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`"""

    @property
    def base(self) -> str | None:
        """The base category for this categorical factor.

        This is derived from `drop_field` (for default encoding) or by comparing the column names in
        the design matrix with all categories (for custom encoding, e.g. `C(...)`).

        Returns:
            `None` if the factor is not reduced rank, otherwise the base category.
        """
        if not self.reduced_rank:
            return None

        if self.custom_encoder:
            # With a custom encoder `drop_field` is not populated, so the base level is
            # the single category that has no corresponding design matrix column.
            assert self.column_names is not None, "column_names must be set to derive the base category"
            tmp_base = set(self.categories) - set(self.column_names)
            assert len(tmp_base) == 1, "expected exactly one category without a design matrix column"
            return tmp_base.pop()

        assert self.drop_field is not None, "drop_field must be set for a reduced-rank factor without custom encoder"
        return self.drop_field
76
-
77
-
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Keep track of categorical factors used in a model specification by generating a custom materializer.

    This materializer reports back metadata upon materialization of the model matrix.

    Returns:
        - A dictionary storing metadata for each factor processed by the custom materializer, named `factor_storage`.
        - A dictionary mapping variables to factor names, which works similarly to model_spec.variable_terms
          but maps to factors rather than terms, named `variable_to_factors`.
        - A materializer class tied to the specific instance of `factor_storage`.
    """
    # There can be multiple FactorMetadata entries per factor, for instance when formulaic generates an interaction
    # term, it generates the factor with both full rank and reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """An extension of the PandasMaterializer that records all categorical variables and their (base) categories."""

        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Initialize the Materializer.

            Args:
                data: Passed to PandasMaterializer.
                context: Passed to PandasMaterializer
                record_factor_metadata: Flag that tells whether this particular instance of the custom materializer class
                    is supposed to record factor metadata. Only the instance that is used for building the design
                    matrix should record the metadata. All other instances (e.g. used to generate contrast vectors)
                    should not record metadata to not overwrite the specifications from the design matrix.
                **params: Passed to PandasMaterializer
            """
            # Both stores are the closure-level dictionaries returned to the caller; non-recording
            # instances get `None` so that they never write into them.
            self.factor_metadata_storage = factor_storage if record_factor_metadata else None
            self.variable_to_factors = variable_to_factors if record_factor_metadata else None
            # temporary pointer to metadata of factor that is currently evaluated
            self._current_factor: FactorMetadata | None = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Function is called just before the factor is evaluated.

            We can record some metadata, before we call the original function.
            """
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"
            if self.factor_metadata_storage is not None:
                # Don't store if the factor is cached (then we should already have recorded it)
                if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache:
                    assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached"
                else:
                    for var in factor.variables:
                        self.variable_to_factors[var].add(factor.expr)
                    # Only the fields known at this point are filled in here; the remaining ones
                    # are completed in `_flatten_encoded_evaled_factor`.
                    self._current_factor = FactorMetadata(
                        name=factor.expr,
                        reduced_rank=reduced_rank,
                        # Unique values after removing the rows at the positions listed in `drop_rows`
                        categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )
            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """Function is called at the end, before the design matrix gets materialized.

            Here we have access to additional metadata, such as `drop_field`.
            """
            if self._current_factor is not None:
                assert self._current_factor.name == name
                self._current_factor.drop_field = values.__formulaic_metadata__.drop_field
                self._current_factor.column_names = values.__formulaic_metadata__.column_names
                self._current_factor.colname_format = values.__formulaic_metadata__.format
                self.factor_metadata_storage[name].append(self._current_factor)
                # Reset the pointer so that the next call to `_encode_evaled_factor` starts clean
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
170
-
171
-
class AmbiguousAttributeError(ValueError):
    """Raised when an attribute does not have the same value across a collection of objects."""
174
-
175
-
def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Given a list of objects, return an attribute if it is the same between all objects. Otherwise, raise an error.

    Args:
        objs: Non-empty sequence of objects that all provide the attribute `attr`.
        attr: Name of the attribute to look up on every object.

    Returns:
        The value of `attr`, which is equal for all objects in `objs`.

    Raises:
        ValueError: If `objs` is empty.
        AmbiguousAttributeError: If the value of `attr` differs between the objects.
    """
    if not objs:
        raise ValueError("Collection is empty")

    first_obj_attr = getattr(objs[0], attr)

    # Compare every remaining object against the first one; stops at the first mismatch
    if any(getattr(obj, attr) != first_obj_attr for obj in objs[1:]):
        raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")

    # If attribute is the same for all objects, return it
    return first_obj_attr