pertpy 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,189 +0,0 @@
1
- """Helpers to interact with Formulaic Formulas
2
-
3
- Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`):
4
- * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`.
5
- * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`.
6
- * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank.
7
- """
8
-
9
- from collections import defaultdict
10
- from collections.abc import Mapping, Sequence
11
- from dataclasses import dataclass
12
- from typing import Any
13
-
14
- from formulaic import FactorValues, ModelSpec
15
- from formulaic.materializers import PandasMaterializer
16
- from formulaic.materializers.types import EvaluatedFactor
17
- from formulaic.parser.types import Factor
18
- from interface_meta import override
19
-
20
-
21
@dataclass
class FactorMetadata:
    """Store (relevant) metadata for a factor of a formula.

    `drop_field`, `column_names` and `colname_format` default to `None` and stay that way
    until they are assigned.
    """

    name: str
    """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`"""

    reduced_rank: bool
    """Whether a column will be dropped because it is redundant"""

    custom_encoder: bool
    """Whether or not a custom encoder (e.g. `C(...)`) was used."""

    categories: Sequence[str]
    """The unique categories in this factor (after applying `drop_rows`)"""

    kind: Factor.Kind
    """Type of the factor"""

    drop_field: str | None = None
    """The category that is dropped.

    Note that
      * this may also be populated if `reduced_rank = False`
      * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`).
    """

    column_names: Sequence[str] | None = None
    """The column names for this factor included in the design matrix.

    This may be the same as `categories` if the default encoder is used, or
    categories without the base level if a custom encoder (e.g. `C(...)`) is used.
    """

    colname_format: str | None = None
    """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`"""

    @property
    def base(self) -> str | None:
        """
        The base category for this categorical factor.

        This is derived from `drop_field` (for default encoding) or by comparing the column names in
        the design matrix with all categories (for custom encoding, e.g. `C(...)`).

        Returns:
            `None` if the factor is not reduced rank, otherwise the base category.

        Raises:
            AssertionError: If the metadata required to determine the base category is missing, or if
                the categories and column names do not differ by exactly one element.
        """
        if not self.reduced_rank:
            return None

        if not self.custom_encoder:
            assert self.drop_field is not None, "drop_field must be set to determine the base of a default-encoded factor"
            return self.drop_field

        # With a custom encoder there is no `drop_field`; the base is the one category
        # that does not appear among the design matrix columns.
        assert self.column_names is not None, "column_names must be set to determine the base of a custom-encoded factor"
        candidates = set(self.categories) - set(self.column_names)
        assert len(candidates) == 1, "Expected exactly one category that is missing from the column names"
        return candidates.pop()
76
-
77
-
78
def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]:
    """Build a custom materializer class together with the containers it reports factor metadata into.

    The materializer class is defined inside this call and is therefore bound to the two dictionaries
    created here. Instances constructed with `record_factor_metadata=True` fill those dictionaries
    while the model matrix is materialized.

    Returns:
        - `factor_storage`: A dictionary storing the metadata recorded for each factor, keyed by factor name.
        - `variable_to_factors`: A dictionary mapping variables to factor names. It works similarly to
          `model_spec.variable_terms`, but maps to factors rather than terms.
        - The materializer class tied to this specific instance of `factor_storage`.
    """
    # A list is kept per factor because the same factor can be recorded more than once: for an interaction
    # term, for instance, formulaic generates the factor with both full rank and reduced rank.
    factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list)
    variable_to_factors: dict[str, set[str]] = defaultdict(set)

    class CustomPandasMaterializer(PandasMaterializer):
        """A PandasMaterializer subclass that records categorical variables and their (base) categories."""

        REGISTER_NAME = "custom_pandas"
        REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
        REGISTER_OUTPUTS = ("pandas", "numpy", "sparse")

        def __init__(
            self,
            data: Any,
            context: Mapping[str, Any] | None = None,
            record_factor_metadata: bool = False,
            **params: Any,
        ):
            """Set up the materializer and decide whether this instance records metadata.

            Args:
                data: Forwarded to PandasMaterializer.
                context: Forwarded to PandasMaterializer.
                record_factor_metadata: Whether this instance should record factor metadata. Only the instance
                    that builds the design matrix should do so; all other instances (e.g. those used to
                    generate contrast vectors) should not, so that they do not overwrite the specifications
                    from the design matrix.
                **params: Forwarded to PandasMaterializer.
            """
            if record_factor_metadata:
                self.factor_metadata_storage = factor_storage
                self.variable_to_factors = variable_to_factors
            else:
                self.factor_metadata_storage = None
                self.variable_to_factors = None
            # Metadata of the factor that is currently being evaluated; None while nothing is pending.
            self._current_factor: FactorMetadata = None
            super().__init__(data, context, **params)

        @override
        def _encode_evaled_factor(
            self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False
        ) -> dict[str, Any]:
            """Start a metadata record for `factor`, then delegate to the parent implementation.

            This is called just before the factor is evaluated, which makes it the place to capture
            the metadata that is already available at that point.
            """
            assert (
                self._current_factor is None
            ), "_current_factor should always be None when we start recording metadata"
            if self.factor_metadata_storage is not None:
                expr = factor.expr
                already_cached = expr in self.encoded_cache or (expr, reduced_rank) in self.encoded_cache
                if not already_cached:
                    for variable in factor.variables:
                        self.variable_to_factors[variable].add(expr)
                    # Categories are taken from the values that remain once `drop_rows` are removed.
                    remaining_values = factor.values.drop(index=factor.values.index[drop_rows])
                    self._current_factor = FactorMetadata(
                        name=expr,
                        reduced_rank=reduced_rank,
                        categories=tuple(sorted(remaining_values.unique())),
                        custom_encoder=factor.metadata.encoder is not None,
                        kind=factor.metadata.kind,
                    )
                else:
                    # A cached factor is not stored again; it should have been recorded already.
                    assert expr in self.factor_metadata_storage, "Factor should be there since it's cached"
            return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank)

        @override
        def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]:
            """Complete and store the pending metadata record, then delegate to the parent implementation.

            This is called at the end, before the design matrix gets materialized. Additional metadata,
            such as `drop_field`, is available here.
            """
            pending = self._current_factor
            if pending is not None:
                assert pending.name == name
                encoded_metadata = values.__formulaic_metadata__
                pending.drop_field = encoded_metadata.drop_field
                pending.column_names = encoded_metadata.column_names
                pending.colname_format = encoded_metadata.format
                self.factor_metadata_storage[name].append(pending)
                self._current_factor = None

            return super()._flatten_encoded_evaled_factor(name, values)

    return factor_storage, variable_to_factors, CustomPandasMaterializer
170
-
171
-
172
class AmbiguousAttributeError(ValueError):
    """Raised by `resolve_ambiguous` when an attribute's value differs between the given objects."""
174
-
175
-
176
def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any:
    """Return the value of `attr` shared by all objects in `objs`.

    Args:
        objs: The objects to inspect. Must not be empty.
        attr: Name of the attribute to read from every object.

    Returns:
        The attribute value of the first object, which all other objects agree with.

    Raises:
        ValueError: If `objs` is empty.
        AmbiguousAttributeError: If any object's attribute differs from that of the first object.
    """
    if not objs:
        raise ValueError("Collection is empty")

    reference_value = getattr(objs[0], attr)

    # Lazy scan: stops at the first object whose attribute disagrees with the reference.
    has_mismatch = any(getattr(other, attr) != reference_value for other in objs[1:])
    if has_mismatch:
        raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects")

    return reference_value