vivarium-public-health 4.3.2__py3-none-any.whl → 4.3.4__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -1,24 +1,491 @@
  """
- Backward compatibility module for risk distributions.
+ =================================
+ Risk Exposure Distribution Models
+ =================================
 
- This module provides backward compatibility for imports that expect risk
- distribution classes to be in vivarium_public_health.risks.distributions.
+ This module contains tools for modeling several different risk
+ exposure distributions.
 
- The actual distribution classes have been moved to:
- vivarium_public_health.exposure.distributions
-
- This module will be deprecated in a future version.
  """
 
- import warnings
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable
+
+ import numpy as np
+ import pandas as pd
+ import risk_distributions as rd
+ from layered_config_tree import LayeredConfigTree
+ from vivarium import Component
+ from vivarium.framework.engine import Builder
+ from vivarium.framework.population import SimulantData
+ from vivarium.framework.resource import Resource
+ from vivarium.framework.values import Pipeline, list_combiner, union_post_processor
+
+ from vivarium_public_health.risks.data_transformations import pivot_categorical
+ from vivarium_public_health.utilities import EntityString, get_lookup_columns
+
+
+ class MissingDataError(Exception):
+     pass
+
+
+ class RiskExposureDistribution(Component, ABC):
+
+     #####################
+     # Lifecycle methods #
+     #####################
+
+     def __init__(
+         self,
+         risk: EntityString,
+         distribution_type: str,
+         exposure_data: int | float | pd.DataFrame | None = None,
+     ) -> None:
+         super().__init__()
+         self.risk = risk
+         self.distribution_type = distribution_type
+         self._exposure_data = exposure_data
+
+         self.parameters_pipeline_name = f"{self.risk}.exposure_parameters"
+
+     #################
+     # Setup methods #
+     #################
+
+     def get_configuration(self, builder: "Builder") -> LayeredConfigTree | None:
+         return builder.configuration[self.risk]
+
+     @abstractmethod
+     def build_all_lookup_tables(self, builder: "Builder") -> None:
+         raise NotImplementedError
+
+     def get_exposure_data(self, builder: Builder) -> int | float | pd.DataFrame:
+         if self._exposure_data is not None:
+             return self._exposure_data
+         return self.get_data(builder, self.configuration["data_sources"]["exposure"])
+
+     # noinspection PyAttributeOutsideInit
+     def setup(self, builder: Builder) -> None:
+         self.exposure_parameters = self.get_exposure_parameter_pipeline(builder)
+         if self.exposure_parameters.name != self.parameters_pipeline_name:
+             raise ValueError(
+                 "Expected exposure parameters pipeline to be named "
+                 f"{self.parameters_pipeline_name}, "
+                 f"but found {self.exposure_parameters.name}."
+             )
+
+     @abstractmethod
+     def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
+         raise NotImplementedError
+
+     ##################
+     # Public methods #
+     ##################
+
+     @abstractmethod
+     def ppf(self, quantiles: pd.Series) -> pd.Series:
+         raise NotImplementedError
+
+
+ class EnsembleDistribution(RiskExposureDistribution):
+     ##############
+     # Properties #
+     ##############
+
+     @property
+     def columns_created(self) -> list[str]:
+         return [self._propensity]
+
+     @property
+     def initialization_requirements(self) -> list[str | Resource]:
+         return [self.randomness]
+
+     #####################
+     # Lifecycle methods #
+     #####################
+
+     def __init__(self, risk: EntityString, distribution_type: str = "ensemble") -> None:
+         super().__init__(risk, distribution_type)
+         self._propensity = f"ensemble_propensity_{self.risk}"
+
+     #################
+     # Setup methods #
+     #################
+
+     def build_all_lookup_tables(self, builder: Builder) -> None:
+         exposure_data = self.get_exposure_data(builder)
+         standard_deviation = self.get_data(
+             builder,
+             self.configuration["data_sources"]["exposure_standard_deviation"],
+         )
+         weights_source = self.configuration["data_sources"]["ensemble_distribution_weights"]
+         raw_weights = self.get_data(builder, weights_source)
+
+         glnorm_mask = raw_weights["parameter"] == "glnorm"
+         if np.any(raw_weights.loc[glnorm_mask, self.get_value_columns(weights_source)]):
+             raise NotImplementedError("glnorm distribution is not supported")
+         raw_weights = raw_weights[~glnorm_mask]
+
+         distributions = list(raw_weights["parameter"].unique())
+
+         raw_weights = pivot_categorical(
+             builder, self.risk, raw_weights, pivot_column="parameter", reset_index=False
+         )
+
+         weights, parameters = rd.EnsembleDistribution.get_parameters(
+             raw_weights,
+             mean=get_risk_distribution_parameter(self.get_value_columns, exposure_data),
+             sd=get_risk_distribution_parameter(self.get_value_columns, standard_deviation),
+         )
+
+         distribution_weights_table = self.build_lookup_table(
+             builder, weights.reset_index(), distributions
+         )
+         self.lookup_tables["ensemble_distribution_weights"] = distribution_weights_table
+         key_columns = distribution_weights_table.key_columns
+         parameter_columns = distribution_weights_table.parameter_columns
+
+         self.parameters = {
+             parameter: builder.lookup.build_table(
+                 data.reset_index(),
+                 key_columns=key_columns,
+                 parameter_columns=parameter_columns,
+             )
+             for parameter, data in parameters.items()
+         }
+
+     def setup(self, builder: Builder) -> None:
+         super().setup(builder)
+         self.randomness = builder.randomness.get_stream(self._propensity, component=self)
+
+     def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
+         # This pipeline is not needed for ensemble distributions, so just
+         # register a dummy pipeline
+         def raise_not_implemented():
+             raise NotImplementedError(
+                 "EnsembleDistribution does not use exposure parameters."
+             )
+
+         return builder.value.register_value_producer(
+             self.parameters_pipeline_name, lambda *_: raise_not_implemented(), component=self
+         )
+
+     ########################
+     # Event-driven methods #
+     ########################
+
+     def on_initialize_simulants(self, pop_data: SimulantData) -> None:
+         ensemble_propensity = self.randomness.get_draw(pop_data.index).rename(
+             self._propensity
+         )
+         self.population_view.update(ensemble_propensity)
+
+     ##################
+     # Public methods #
+     ##################
+
+     def ppf(self, quantiles: pd.Series) -> pd.Series:
+         if not quantiles.empty:
+             quantiles = clip(quantiles)
+             weights = self.lookup_tables["ensemble_distribution_weights"](quantiles.index)
+             parameters = {
+                 name: param(quantiles.index) for name, param in self.parameters.items()
+             }
+             ensemble_propensity = self.population_view.get(quantiles.index).iloc[:, 0]
+             x = rd.EnsembleDistribution(weights, parameters).ppf(
+                 quantiles, ensemble_propensity
+             )
+             x[x.isnull()] = 0
+         else:
+             x = pd.Series([])
+         return x
+
+
+ class ContinuousDistribution(RiskExposureDistribution):
+     #####################
+     # Lifecycle methods #
+     #####################
+
+     def __init__(self, risk: EntityString, distribution_type: str) -> None:
+         super().__init__(risk, distribution_type)
+         self.standard_deviation = None
+         try:
+             self._distribution = {
+                 "normal": rd.Normal,
+                 "lognormal": rd.LogNormal,
+             }[distribution_type]
+         except KeyError:
+             raise NotImplementedError(
+                 f"Distribution type {distribution_type} is not supported for "
+                 f"risk {risk.name}."
+             )
+
+     #################
+     # Setup methods #
+     #################
+
+     def build_all_lookup_tables(self, builder: "Builder") -> None:
+         exposure_data = self.get_exposure_data(builder)
+         standard_deviation = self.get_data(
+             builder, self.configuration["data_sources"]["exposure_standard_deviation"]
+         )
+         parameters = self._distribution.get_parameters(
+             mean=get_risk_distribution_parameter(self.get_value_columns, exposure_data),
+             sd=get_risk_distribution_parameter(self.get_value_columns, standard_deviation),
+         )
+
+         self.lookup_tables["parameters"] = self.build_lookup_table(
+             builder, parameters.reset_index(), list(parameters.columns)
+         )
+
+     def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
+         return builder.value.register_value_producer(
+             self.parameters_pipeline_name,
+             source=self.lookup_tables["parameters"],
+             component=self,
+             required_resources=get_lookup_columns([self.lookup_tables["parameters"]]),
+         )
+
+     ##################
+     # Public methods #
+     ##################
+
+     def ppf(self, quantiles: pd.Series) -> pd.Series:
+         if not quantiles.empty:
+             quantiles = clip(quantiles)
+             parameters = self.exposure_parameters(quantiles.index)
+             x = self._distribution(parameters=parameters).ppf(quantiles)
+             x[x.isnull()] = 0
+         else:
+             x = pd.Series([])
+         return x
+
+
+ class PolytomousDistribution(RiskExposureDistribution):
+     @property
+     def categories(self) -> list[str]:
+         # These need to be sorted so the cumulative sum is in the correct order of categories
+         # and results are therefore reproducible and correct
+         return sorted(self.lookup_tables["exposure"].value_columns)
+
+     #################
+     # Setup methods #
+     #################
+
+     def build_all_lookup_tables(self, builder: "Builder") -> None:
+         exposure_data = self.get_exposure_data(builder)
+         exposure_value_columns = self.get_exposure_value_columns(exposure_data)
+
+         if isinstance(exposure_data, pd.DataFrame):
+             exposure_data = pivot_categorical(builder, self.risk, exposure_data, "parameter")
+
+         self.lookup_tables["exposure"] = self.build_lookup_table(
+             builder, exposure_data, exposure_value_columns
+         )
+
+     def get_exposure_value_columns(
+         self, exposure_data: int | float | pd.DataFrame
+     ) -> list[str] | None:
+         if isinstance(exposure_data, pd.DataFrame):
+             return list(exposure_data["parameter"].unique())
+         return None
+
+     def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
+         return builder.value.register_value_producer(
+             self.parameters_pipeline_name,
+             source=self.lookup_tables["exposure"],
+             component=self,
+             required_resources=get_lookup_columns([self.lookup_tables["exposure"]]),
+         )
+
+     ##################
+     # Public methods #
+     ##################
+
+     def ppf(self, quantiles: pd.Series) -> pd.Series:
+         exposure = self.exposure_parameters(quantiles.index)
+         sorted_exposures = exposure[self.categories]
+         if not np.allclose(1, np.sum(sorted_exposures, axis=1)):
+             raise MissingDataError("All exposure data returned as 0.")
+         exposure_sum = sorted_exposures.cumsum(axis="columns")
+         category_index = pd.concat(
+             [exposure_sum[c] < quantiles for c in exposure_sum.columns], axis=1
+         ).sum(axis=1)
+         return pd.Series(
+             np.array(self.categories)[category_index],
+             name=self.risk + ".exposure",
+             index=quantiles.index,
+         )
+
+
+ class DichotomousDistribution(RiskExposureDistribution):
+
+     #################
+     # Setup methods #
+     #################
+
+     def build_all_lookup_tables(self, builder: "Builder") -> None:
+         exposure_data = self.get_exposure_data(builder)
+         exposure_value_columns = self.get_exposure_value_columns(exposure_data)
+
+         if isinstance(exposure_data, pd.DataFrame):
+             any_negatives = (exposure_data[exposure_value_columns] < 0).any().any()
+             any_over_one = (exposure_data[exposure_value_columns] > 1).any().any()
+             if any_negatives or any_over_one:
+                 raise ValueError(f"All exposures must be in the range [0, 1] for {self.risk}")
+         elif exposure_data < 0 or exposure_data > 1:
+             raise ValueError(f"Exposure must be in the range [0, 1] for {self.risk}")
+
+         self.lookup_tables["exposure"] = self.build_lookup_table(
+             builder, exposure_data, exposure_value_columns
+         )
+         self.lookup_tables["paf"] = self.build_lookup_table(builder, 0.0)
+
+     def get_exposure_data(self, builder: Builder) -> int | float | pd.DataFrame:
+         exposure_data = super().get_exposure_data(builder)
+
+         if isinstance(exposure_data, (int, float)):
+             return exposure_data
+
+         # rebin exposure categories
+         self.validate_rebin_source(builder, exposure_data)
+         rebin_exposed_categories = set(self.configuration["rebinned_exposed"])
+         if rebin_exposed_categories:
+             exposure_data = self._rebin_exposure_data(exposure_data, rebin_exposed_categories)
+
+         exposure_data = exposure_data[exposure_data["parameter"] == "cat1"]
+         return exposure_data.drop(columns="parameter")
+
+     @staticmethod
+     def _rebin_exposure_data(
+         exposure_data: pd.DataFrame, rebin_exposed_categories: set
+     ) -> pd.DataFrame:
+         exposure_data = exposure_data[
+             exposure_data["parameter"].isin(rebin_exposed_categories)
+         ]
+         exposure_data["parameter"] = "cat1"
+         exposure_data = (
+             exposure_data.groupby(list(exposure_data.columns.difference(["value"])))
+             .sum()
+             .reset_index()
+         )
+         return exposure_data
+
+     def get_exposure_value_columns(
+         self, exposure_data: int | float | pd.DataFrame
+     ) -> list[str] | None:
+         if isinstance(exposure_data, pd.DataFrame):
+             return self.get_value_columns(exposure_data)
+         return None
+
+     # noinspection PyAttributeOutsideInit
+     def setup(self, builder: Builder) -> None:
+         super().setup(builder)
+         self.joint_paf = builder.value.register_value_producer(
+             f"{self.risk}.exposure_parameters.paf",
+             source=lambda index: [self.lookup_tables["paf"](index)],
+             component=self,
+             preferred_combiner=list_combiner,
+             preferred_post_processor=union_post_processor,
+         )
+
+     def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
+         return builder.value.register_value_producer(
+             f"{self.risk}.exposure_parameters",
+             source=self.exposure_parameter_source,
+             component=self,
+             required_resources=get_lookup_columns([self.lookup_tables["exposure"]]),
+         )
+
+     ##############
+     # Validators #
+     ##############
+
+     def validate_rebin_source(self, builder, data: pd.DataFrame) -> None:
+         if not isinstance(data, pd.DataFrame):
+             return
+
+         rebin_exposed_categories = set(builder.configuration[self.risk]["rebinned_exposed"])
+
+         if (
+             rebin_exposed_categories
+             and builder.configuration[self.risk]["category_thresholds"]
+         ):
+             raise ValueError(
+                 f"Rebinning and category thresholds are mutually exclusive. "
+                 f"You provided both for {self.risk.name}."
+             )
+
+         invalid_cats = rebin_exposed_categories.difference(set(data.parameter))
+         if invalid_cats:
+             raise ValueError(
+                 f"The following provided categories for the rebinned exposed "
+                 f"category of {self.risk.name} are not found in the exposure data: "
+                 f"{invalid_cats}."
+             )
+
+         if rebin_exposed_categories == set(data.parameter):
+             raise ValueError(
+                 f"The provided categories for the rebinned exposed category of "
+                 f"{self.risk.name} comprise all categories for the exposure data. "
+                 f"At least one category must be left out of the provided categories "
+                 f"to be rebinned into the unexposed category."
+             )
+
+     ##################################
+     # Pipeline sources and modifiers #
+     ##################################
+
+     def exposure_parameter_source(self, index: pd.Index) -> pd.Series:
+         base_exposure = self.lookup_tables["exposure"](index).values
+         joint_paf = self.joint_paf(index).values
+         return pd.Series(base_exposure * (1 - joint_paf), index=index, name="values")
+
+     ##################
+     # Public methods #
+     ##################
+
+     def ppf(self, quantiles: pd.Series) -> pd.Series:
+         exposed = quantiles < self.exposure_parameters(quantiles.index)
+         return pd.Series(
+             exposed.replace({True: "cat1", False: "cat2"}),
+             name=self.risk + ".exposure",
+             index=quantiles.index,
+         )
+
+
+ def clip(q):
+     """Adjust the percentile boundary cases.
+
+     The risk distributions package uses the 99.9th and 0.001st percentiles
+     of a log-normal distribution as the bounds of the distribution support.
+     This is bound up in the GBD risk factor PAF calculation process.
+     We'll clip the distribution tails so we don't get NaNs back from the
+     distribution calls
+     """
+     Q_LOWER_BOUND = 0.0011
+     Q_UPPER_BOUND = 0.998
+     q[q > Q_UPPER_BOUND] = Q_UPPER_BOUND
+     q[q < Q_LOWER_BOUND] = Q_LOWER_BOUND
+     return q
+
 
- # Issue a deprecation warning when this module is imported
- warnings.warn(
-     "Importing from 'vivarium_public_health.risks.distributions' is deprecated. "
-     "Please import from 'vivarium_public_health.exposure.distributions' instead.",
-     DeprecationWarning,
-     stacklevel=2,
- )
+ def get_risk_distribution_parameter(
+     value_columns_getter: Callable[[pd.DataFrame], list[str]],
+     data: float | pd.DataFrame,
+ ) -> float | pd.Series:
+     if isinstance(data, pd.DataFrame):
+         value_columns = value_columns_getter(data)
+         if len(value_columns) > 1:
+             raise ValueError(
+                 "Expected a single value column for risk data, but found "
+                 f"{len(value_columns)}: {value_columns}."
+             )
+         # don't return parameter col in continuous and ensemble distribution
+         # means to match standard deviation index
+         if "parameter" in data.columns and set(data["parameter"]) == {"continuous"}:
+             data = data.drop("parameter", axis=1)
+         index = [col for col in data.columns if col not in value_columns]
+         data = data.set_index(index)[value_columns].squeeze(axis=1)
 
- # Import all the classes from the new location
- from vivarium_public_health.exposure.distributions import *
+     return data