vivarium-public-health 4.3.1__py3-none-any.whl → 4.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,254 +0,0 @@
1
- """
2
- =========================
3
- Risk Data Transformations
4
- =========================
5
-
6
- This module contains tools for handling raw risk exposure and relative
7
- risk data and performing any necessary data transformations.
8
-
9
- """
10
-
11
- import numpy as np
12
- import pandas as pd
13
- from vivarium.framework.engine import Builder
14
-
15
- from vivarium_public_health.utilities import EntityString, TargetString
16
-
17
- #############
18
- # Utilities #
19
- #############
20
-
21
-
22
def pivot_categorical(
    builder: Builder,
    risk: EntityString,
    data: pd.DataFrame,
    pivot_column: str = "parameter",
    reset_index: bool = True,
) -> pd.DataFrame:
    """Reshape exposure data that is long on categories into wide format.

    Each distinct value of ``pivot_column`` becomes its own column; all
    remaining non-value columns form the index of the pivot.
    """
    # todo remove dependency on artifact manager having exactly one value column
    value_column = builder.data.value_columns()(f"{risk}.exposure")[0]
    index_columns = [
        col for col in data.columns if col not in (value_column, pivot_column)
    ]
    wide = data.pivot_table(index=index_columns, columns=pivot_column, values=value_column)
    if reset_index:
        wide = wide.reset_index()
        # Drop the pivot column's name so the result looks like a plain frame.
        wide.columns.name = None

    return wide
41
-
42
-
43
- ##########################
44
- # Exposure data handlers #
45
- ##########################
46
-
47
-
48
def get_exposure_post_processor(builder, risk: str):
    """Return a pipeline post-processor that bins continuous exposure values
    into ``catN`` labels using the configured category thresholds, or ``None``
    when no thresholds are configured.
    """
    thresholds = builder.configuration[risk]["category_thresholds"]

    if not thresholds:
        return None

    # Pad with infinities so every exposure value falls into some bin.
    bin_edges = [-np.inf] + thresholds + [np.inf]
    labels = [f"cat{i}" for i in range(1, len(bin_edges))]

    def post_processor(exposure, _):
        binned = pd.cut(exposure, bin_edges, labels=labels)
        return pd.Series(binned, index=exposure.index).astype(str)

    return post_processor
64
-
65
-
66
def load_exposure_data(builder: Builder, risk: EntityString) -> pd.DataFrame:
    """Load the configured exposure data for *risk* via its risk component."""
    component = builder.components.get_component(risk)
    exposure_source = builder.configuration[component.name]["data_sources"]["exposure"]
    return component.get_data(builder, exposure_source)
71
-
72
-
73
- ###############################
74
- # Relative risk data handlers #
75
- ###############################
76
-
77
-
78
def rebin_relative_risk_data(
    builder, risk: EntityString, relative_risk_data: pd.DataFrame
) -> pd.DataFrame:
    """Rebin relative risk data if necessary.

    When the polytomous risk is rebinned, matching relative risk needs to be rebinned.
    After rebinning, rr for both exposed and unexposed categories should be the weighted sum of relative risk
    of the component categories where weights are relative proportions of exposure of those categories.
    For example, if cat1, cat2, cat3 are exposed categories and cat4 is unexposed with exposure [0.1,0.2,0.3,0.4],
    for the matching rr = [rr1, rr2, rr3, 1], rebinned rr for the rebinned cat1 should be:
    (0.1 *rr1 + 0.2 * rr2 + 0.3* rr3) / (0.1+0.2+0.3)
    """
    # A risk absent from the model configuration cannot carry a rebin request,
    # so its relative risk data passes through untouched.
    # (`risk not in` replaces the unidiomatic `not risk in`, PEP 8 / E713.)
    if risk not in builder.configuration.to_dict():
        return relative_risk_data

    rebin_exposed_categories = set(builder.configuration[risk]["rebinned_exposed"])

    if rebin_exposed_categories:
        # todo make sure this works
        # Exposure proportions supply the weights for the weighted average.
        exposure_data = load_exposure_data(builder, risk)
        relative_risk_data = _rebin_relative_risk_data(
            relative_risk_data, exposure_data, rebin_exposed_categories
        )

    return relative_risk_data
103
-
104
-
105
def _rebin_relative_risk_data(
    relative_risk_data: pd.DataFrame,
    exposure_data: pd.DataFrame,
    rebin_exposed_categories: set,
) -> pd.DataFrame:
    """Collapse relative risk categories to one exposed ('cat1') and one
    unexposed ('cat2') category, exposure-weighting the relative risks.
    """
    # The shared non-value columns are both the merge keys and the group keys.
    key_columns = list(exposure_data.columns.difference(["value"]))

    merged = relative_risk_data.merge(exposure_data, on=key_columns)
    # value_x = relative risk, value_y = exposure proportion (merge suffixes).
    merged["value_x"] = merged["value_x"] * merged["value_y"]
    merged["parameter"] = merged["parameter"].map(
        lambda category: "cat1" if category in rebin_exposed_categories else "cat2"
    )
    # Sum weighted relative risks and their weights within each rebinned group...
    merged = merged.groupby(key_columns).sum().reset_index()
    # ...then normalize; a zero-exposure group (0/0 -> NaN) becomes 0.
    merged["value"] = (merged["value_x"] / merged["value_y"]).fillna(0)
    return merged.drop(columns=["value_x", "value_y"])
124
-
125
-
126
- ##############
127
- # Validators #
128
- ##############
129
-
130
-
131
def validate_distribution_data_source(builder: Builder, risk: EntityString) -> None:
    """Checks that the exposure distribution specification is valid.

    Raises a ``ValueError`` for unsupported risk types or for alternative
    risks configured with parameterized exposure, rebinning, or without
    category thresholds.
    """
    risk_config = builder.configuration[risk]
    exposure_type = risk_config["data_sources"]["exposure"]
    rebin = risk_config["rebinned_exposed"]
    category_thresholds = risk_config["category_thresholds"]

    if risk.type == "alternative_risk_factor":
        # Alternative risks must use raw data exposure and cannot be rebinned.
        if exposure_type != "data" or rebin:
            raise ValueError(
                "Parameterized risk components are not available for alternative risks."
            )

        if not category_thresholds:
            raise ValueError("Must specify category thresholds to use alternative risks.")

    elif risk.type not in ("risk_factor", "coverage_gap"):
        raise ValueError(f"Unknown risk type {risk.type} for risk {risk.name}")
148
-
149
-
150
def validate_relative_risk_data_source(builder, risk: EntityString, target: TargetString):
    """Validate the relative risk specification for a risk-outcome pair.

    Determines which supported parameterization the user supplied (raw data,
    a single relative risk value, normal-distribution parameters, or
    log-distribution parameters), checks the provided values are sensible,
    and returns the detected source type.

    Raises
    ------
    ValueError
        If the provided parameter set matches no supported option, or the
        parameter values are out of range.
    """
    from vivarium_public_health.risks import RiskEffect

    source_key = RiskEffect.get_name(risk, target)
    source_config = builder.configuration[source_key]
    distribution_args = source_config["distribution_args"]

    # The set of numeric parameters supplied determines the source type.
    provided_keys = set(
        k
        for k, v in distribution_args.to_dict().items()
        if isinstance(v, (int, float))
    )

    source_map = {
        "data": set(),
        "relative risk value": {"relative_risk"},
        "normal distribution": {"mean", "se"},
        "log distribution": {"log_mean", "log_se", "tau_squared"},
    }

    if provided_keys not in source_map.values():
        raise ValueError(
            f"The acceptable parameter options for specifying relative risk are: "
            f"{source_map.values()}. You provided {provided_keys} for {source_key}."
        )

    source_type = [k for k, v in source_map.items() if provided_keys == v][0]

    if source_type == "relative risk value":
        # BUG FIX: the original compared the source-type *string* against the
        # numeric range (a TypeError); validate the configured value instead.
        relative_risk = distribution_args["relative_risk"]
        if not 1 <= relative_risk <= 100:
            raise ValueError(
                "If specifying a single value for relative risk, it should be in the range [1, 100]. "
                f"You provided {relative_risk} for {source_key}."
            )
    elif source_type == "normal distribution":
        # Parameters live under distribution_args (see provided_keys above);
        # the original read them off source_config directly — TODO confirm
        # against the configuration schema.
        if distribution_args["mean"] <= 0 or distribution_args["se"] <= 0:
            raise ValueError(
                f"To specify parameters for a normal distribution for a risk effect, you must provide"
                f"both mean and se above 0. This is not the case for {source_key}."
            )
    elif source_type == "log distribution":
        if distribution_args["log_mean"] <= 0 or distribution_args["log_se"] <= 0:
            raise ValueError(
                f"To specify parameters for a log distribution for a risk effect, you must provide"
                f"both log_mean and log_se above 0. This is not the case for {source_key}."
            )
        if distribution_args["tau_squared"] < 0:
            raise ValueError(
                f"To specify parameters for a log distribution for a risk effect, you must provide"
                f"tau_squared >= 0. This is not the case for {source_key}."
            )

    return source_type
204
-
205
-
206
def validate_relative_risk_rebin_source(
    builder, risk: EntityString, target: TargetString, data: pd.DataFrame
):
    """Check that the relative risk data subset for *target* is non-empty and,
    when the risk is configured in this model, that any rebinning is valid.
    """
    if len(data.index) == 0:
        raise ValueError(
            f"Subsetting {risk} relative risk data to {target.name} {target.measure} "
            "returned an empty DataFrame. Check your artifact."
        )
    # Rebinning checks only apply to risks present in the model configuration.
    if risk in builder.configuration.to_dict():
        validate_rebin_source(builder, risk, data)
216
-
217
-
218
def validate_rebin_source(builder, risk: EntityString, data: pd.DataFrame) -> None:
    """Validate a rebinning request for *risk* against its exposure data.

    Raises a ``ValueError`` when rebinning is combined with category
    thresholds, requested for a non-polytomous risk, references categories
    absent from the data, or covers every category (leaving nothing for the
    unexposed category).
    """
    # Non-DataFrame data (e.g. a parameterized source) has no categories to check.
    if not isinstance(data, pd.DataFrame):
        return

    rebin_exposed_categories = set(builder.configuration[risk]["rebinned_exposed"])

    if rebin_exposed_categories and builder.configuration[risk]["category_thresholds"]:
        raise ValueError(
            f"Rebinning and category thresholds are mutually exclusive. "
            f"You provided both for {risk.name}."
        )

    if rebin_exposed_categories:
        # Load the distribution once (the original loaded it twice) and reuse
        # it in the error message.
        distribution = builder.data.load(f"{risk}.distribution")
        if "polytomous" not in distribution:
            # BUG FIX: message previously read "categoriesfor" (missing space).
            raise ValueError(
                f"Rebinning is only supported for polytomous risks. You provided "
                f"rebinning exposed categories for {risk.name}, which is of "
                f"type {distribution}."
            )

    invalid_cats = rebin_exposed_categories.difference(set(data.parameter))
    if invalid_cats:
        raise ValueError(
            f"The following provided categories for the rebinned exposed "
            f"category of {risk.name} are not found in the exposure data: "
            f"{invalid_cats}."
        )

    if rebin_exposed_categories == set(data.parameter):
        raise ValueError(
            f"The provided categories for the rebinned exposed category of "
            f"{risk.name} comprise all categories for the exposure data. "
            f"At least one category must be left out of the provided categories "
            f"to be rebinned into the unexposed category."
        )