vivarium-public-health 4.3.2__py3-none-any.whl → 4.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,555 +0,0 @@
1
- """
2
- =================================
3
- Exposure Distribution Models
4
- =================================
5
-
6
- This module contains tools for modeling several different risk
7
- exposure distributions.
8
-
9
- """
10
-
11
- from __future__ import annotations
12
-
13
- import warnings
14
- from abc import ABC, abstractmethod
15
- from collections.abc import Callable
16
- from typing import TYPE_CHECKING
17
-
18
- import numpy as np
19
- import pandas as pd
20
- import risk_distributions as rd
21
- from layered_config_tree import LayeredConfigTree
22
- from vivarium import Component
23
- from vivarium.framework.engine import Builder
24
- from vivarium.framework.population import SimulantData
25
- from vivarium.framework.resource import Resource
26
- from vivarium.framework.values import Pipeline, list_combiner, union_post_processor
27
-
28
- from vivarium_public_health.risks.data_transformations import pivot_categorical
29
- from vivarium_public_health.utilities import EntityString, get_lookup_columns
30
-
31
- if TYPE_CHECKING:
32
- from vivarium_public_health.exposure import Exposure
33
-
34
-
35
- class MissingDataError(Exception):
36
- pass
37
-
38
-
39
- class ExposureDistribution(Component, ABC):
40
-
41
- #####################
42
- # Lifecycle methods #
43
- #####################
44
-
45
- def __init__(
46
- self,
47
- exposure_component: Exposure,
48
- distribution_type: str,
49
- exposure_data: int | float | pd.DataFrame | None = None,
50
- ) -> None:
51
- super().__init__()
52
- self.exposure_component = exposure_component
53
- self.distribution_type = distribution_type
54
- if (
55
- self.distribution_type != "dichotomous"
56
- and self.exposure_component.entity.type == "intervention"
57
- ):
58
- raise NotImplementedError(
59
- f"Distribution type {self.distribution_type} is not supported for interventions."
60
- )
61
- self._exposure_data = exposure_data
62
-
63
- self.parameters_pipeline_name = (
64
- f"{self.exposure_component.entity}.exposure_parameters"
65
- )
66
-
67
- #################
68
- # Setup methods #
69
- #################
70
-
71
- def get_configuration(self, builder: "Builder") -> LayeredConfigTree | None:
72
- return builder.configuration[self.exposure_component.entity]
73
-
74
- @abstractmethod
75
- def build_all_lookup_tables(self, builder: "Builder") -> None:
76
- raise NotImplementedError
77
-
78
- def get_exposure_data(self, builder: Builder) -> int | float | pd.DataFrame:
79
- if self._exposure_data is not None:
80
- return self._exposure_data
81
- return self.get_data(
82
- builder, self.configuration["data_sources"][self.exposure_component.exposure_type]
83
- )
84
-
85
- # noinspection PyAttributeOutsideInit
86
- def setup(self, builder: Builder) -> None:
87
- self.exposure_parameters = self.get_exposure_parameter_pipeline(builder)
88
- if self.exposure_parameters.name != self.parameters_pipeline_name:
89
- raise ValueError(
90
- "Expected exposure parameters pipeline to be named "
91
- f"{self.parameters_pipeline_name}, "
92
- f"but found {self.exposure_parameters.name}."
93
- )
94
-
95
- @abstractmethod
96
- def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
97
- raise NotImplementedError
98
-
99
- ##################
100
- # Public methods #
101
- ##################
102
-
103
- @abstractmethod
104
- def ppf(self, quantiles: pd.Series) -> pd.Series:
105
- raise NotImplementedError
106
-
107
-
108
- class EnsembleDistribution(ExposureDistribution):
109
- ##############
110
- # Properties #
111
- ##############
112
-
113
- @property
114
- def columns_created(self) -> list[str]:
115
- return [self._propensity]
116
-
117
- @property
118
- def initialization_requirements(self) -> list[str | Resource]:
119
- return [self.randomness]
120
-
121
- #####################
122
- # Lifecycle methods #
123
- #####################
124
-
125
- def __init__(self, risk: EntityString, distribution_type: str = "ensemble") -> None:
126
- super().__init__(risk, distribution_type)
127
- self._propensity = f"ensemble_propensity_{self.exposure_component.entity}"
128
-
129
- #################
130
- # Setup methods #
131
- #################
132
-
133
- def build_all_lookup_tables(self, builder: Builder) -> None:
134
- exposure_data = self.get_exposure_data(builder)
135
- standard_deviation = self.get_data(
136
- builder,
137
- self.configuration["data_sources"]["exposure_standard_deviation"],
138
- )
139
- weights_source = self.configuration["data_sources"]["ensemble_distribution_weights"]
140
- raw_weights = self.get_data(builder, weights_source)
141
-
142
- glnorm_mask = raw_weights["parameter"] == "glnorm"
143
- if np.any(raw_weights.loc[glnorm_mask, self.get_value_columns(weights_source)]):
144
- raise NotImplementedError("glnorm distribution is not supported")
145
- raw_weights = raw_weights[~glnorm_mask]
146
-
147
- distributions = list(raw_weights["parameter"].unique())
148
-
149
- raw_weights = pivot_categorical(
150
- builder,
151
- self.exposure_component.entity,
152
- raw_weights,
153
- pivot_column="parameter",
154
- reset_index=False,
155
- )
156
-
157
- weights, parameters = rd.EnsembleDistribution.get_parameters(
158
- raw_weights,
159
- mean=get_risk_distribution_parameter(self.get_value_columns, exposure_data),
160
- sd=get_risk_distribution_parameter(self.get_value_columns, standard_deviation),
161
- )
162
-
163
- distribution_weights_table = self.build_lookup_table(
164
- builder, weights.reset_index(), distributions
165
- )
166
- self.lookup_tables["ensemble_distribution_weights"] = distribution_weights_table
167
- key_columns = distribution_weights_table.key_columns
168
- parameter_columns = distribution_weights_table.parameter_columns
169
-
170
- self.parameters = {
171
- parameter: builder.lookup.build_table(
172
- data.reset_index(),
173
- key_columns=key_columns,
174
- parameter_columns=parameter_columns,
175
- )
176
- for parameter, data in parameters.items()
177
- }
178
-
179
- def setup(self, builder: Builder) -> None:
180
- super().setup(builder)
181
- self.randomness = builder.randomness.get_stream(self._propensity, component=self)
182
-
183
- def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
184
- # This pipeline is not needed for ensemble distributions, so just
185
- # register a dummy pipeline
186
- def raise_not_implemented():
187
- raise NotImplementedError(
188
- "EnsembleDistribution does not use exposure parameters."
189
- )
190
-
191
- return builder.value.register_value_producer(
192
- self.parameters_pipeline_name, lambda *_: raise_not_implemented(), component=self
193
- )
194
-
195
- ########################
196
- # Event-driven methods #
197
- ########################
198
-
199
- def on_initialize_simulants(self, pop_data: SimulantData) -> None:
200
- ensemble_propensity = self.randomness.get_draw(pop_data.index).rename(
201
- self._propensity
202
- )
203
- self.population_view.update(ensemble_propensity)
204
-
205
- ##################
206
- # Public methods #
207
- ##################
208
-
209
- def ppf(self, quantiles: pd.Series) -> pd.Series:
210
- if not quantiles.empty:
211
- quantiles = clip(quantiles)
212
- weights = self.lookup_tables["ensemble_distribution_weights"](quantiles.index)
213
- parameters = {
214
- name: param(quantiles.index) for name, param in self.parameters.items()
215
- }
216
- ensemble_propensity = self.population_view.get(quantiles.index).iloc[:, 0]
217
- x = rd.EnsembleDistribution(weights, parameters).ppf(
218
- quantiles, ensemble_propensity
219
- )
220
- x[x.isnull()] = 0
221
- else:
222
- x = pd.Series([])
223
- return x
224
-
225
-
226
- class ContinuousDistribution(ExposureDistribution):
227
- #####################
228
- # Lifecycle methods #
229
- #####################
230
-
231
- def __init__(self, risk: EntityString, distribution_type: str) -> None:
232
- super().__init__(risk, distribution_type)
233
- self.standard_deviation = None
234
- try:
235
- self._distribution = {
236
- "normal": rd.Normal,
237
- "lognormal": rd.LogNormal,
238
- }[distribution_type]
239
- except KeyError:
240
- raise NotImplementedError(
241
- f"Distribution type {distribution_type} is not supported for "
242
- f"risk {risk.name}."
243
- )
244
-
245
- #################
246
- # Setup methods #
247
- #################
248
-
249
- def build_all_lookup_tables(self, builder: "Builder") -> None:
250
- exposure_data = self.get_exposure_data(builder)
251
- standard_deviation = self.get_data(
252
- builder, self.configuration["data_sources"]["exposure_standard_deviation"]
253
- )
254
- parameters = self._distribution.get_parameters(
255
- mean=get_risk_distribution_parameter(self.get_value_columns, exposure_data),
256
- sd=get_risk_distribution_parameter(self.get_value_columns, standard_deviation),
257
- )
258
-
259
- self.lookup_tables["parameters"] = self.build_lookup_table(
260
- builder, parameters.reset_index(), list(parameters.columns)
261
- )
262
-
263
- def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
264
- return builder.value.register_value_producer(
265
- self.parameters_pipeline_name,
266
- source=self.lookup_tables["parameters"],
267
- component=self,
268
- required_resources=get_lookup_columns([self.lookup_tables["parameters"]]),
269
- )
270
-
271
- ##################
272
- # Public methods #
273
- ##################
274
-
275
- def ppf(self, quantiles: pd.Series) -> pd.Series:
276
- if not quantiles.empty:
277
- quantiles = clip(quantiles)
278
- parameters = self.exposure_parameters(quantiles.index)
279
- x = self._distribution(parameters=parameters).ppf(quantiles)
280
- x[x.isnull()] = 0
281
- else:
282
- x = pd.Series([])
283
- return x
284
-
285
-
286
- class PolytomousDistribution(ExposureDistribution):
287
- @property
288
- def categories(self) -> list[str]:
289
- # These need to be sorted so the cumulative sum is in the correct order of categories
290
- # and results are therefore reproducible and correct
291
- return sorted(self.lookup_tables[self.exposure_component.exposure_type].value_columns)
292
-
293
- #################
294
- # Setup methods #
295
- #################
296
-
297
- def build_all_lookup_tables(self, builder: "Builder") -> None:
298
- exposure_data = self.get_exposure_data(builder)
299
- exposure_value_columns = self.get_exposure_value_columns(exposure_data)
300
-
301
- if isinstance(exposure_data, pd.DataFrame):
302
- exposure_data = pivot_categorical(
303
- builder, self.exposure_component.entity, exposure_data, "parameter"
304
- )
305
-
306
- self.lookup_tables[self.exposure_component.exposure_type] = self.build_lookup_table(
307
- builder, exposure_data, exposure_value_columns
308
- )
309
-
310
- def get_exposure_value_columns(
311
- self, exposure_data: int | float | pd.DataFrame
312
- ) -> list[str] | None:
313
- if isinstance(exposure_data, pd.DataFrame):
314
- return list(exposure_data["parameter"].unique())
315
- return None
316
-
317
- def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
318
- return builder.value.register_value_producer(
319
- self.parameters_pipeline_name,
320
- source=self.lookup_tables[self.exposure_component.exposure_type],
321
- component=self,
322
- required_resources=get_lookup_columns(
323
- [self.lookup_tables[self.exposure_component.exposure_type]]
324
- ),
325
- )
326
-
327
- ##################
328
- # Public methods #
329
- ##################
330
-
331
- def ppf(self, quantiles: pd.Series) -> pd.Series:
332
- exposure = self.exposure_parameters(quantiles.index)
333
- sorted_exposures = exposure[self.categories]
334
- if not np.allclose(1, np.sum(sorted_exposures, axis=1)):
335
- raise MissingDataError("All exposure data returned as 0.")
336
- exposure_sum = sorted_exposures.cumsum(axis="columns")
337
- category_index = pd.concat(
338
- [exposure_sum[c] < quantiles for c in exposure_sum.columns], axis=1
339
- ).sum(axis=1)
340
- return pd.Series(
341
- np.array(self.categories)[category_index],
342
- name=f"{self.exposure_component.entity}.exposure",
343
- index=quantiles.index,
344
- )
345
-
346
-
347
- class DichotomousDistribution(ExposureDistribution):
348
-
349
- #################
350
- # Setup methods #
351
- #################
352
-
353
- def build_all_lookup_tables(self, builder: "Builder") -> None:
354
- exposure_data = self.get_exposure_data(builder)
355
- exposure_value_columns = self.get_exposure_value_columns(exposure_data)
356
-
357
- if isinstance(exposure_data, pd.DataFrame):
358
- any_negatives = (exposure_data[exposure_value_columns] < 0).any().any()
359
- any_over_one = (exposure_data[exposure_value_columns] > 1).any().any()
360
- if any_negatives or any_over_one:
361
- raise ValueError(
362
- f"All exposures must be in the range [0, 1] for {self.exposure_component.entity}"
363
- )
364
- elif exposure_data < 0 or exposure_data > 1:
365
- raise ValueError(
366
- f"Exposure must be in the range [0, 1] for {self.exposure_component.entity}"
367
- )
368
-
369
- self.lookup_tables[self.exposure_component.exposure_type] = self.build_lookup_table(
370
- builder, exposure_data, exposure_value_columns
371
- )
372
- self.lookup_tables["paf"] = self.build_lookup_table(builder, 0.0)
373
-
374
- def get_exposure_data(self, builder: Builder) -> int | float | pd.DataFrame:
375
- exposure_data = super().get_exposure_data(builder)
376
-
377
- if isinstance(exposure_data, (int, float)):
378
- return exposure_data
379
-
380
- # rebin exposure categories
381
- self.validate_rebin_source(builder, exposure_data)
382
- rebin_exposed_categories = set(self.configuration["rebinned_exposed"])
383
- # Check if risk exposure is exposed vs cat1
384
- if (
385
- "cat1" in exposure_data["parameter"].unique()
386
- and self.exposure_component.entity.type == "risk_factor"
387
- ):
388
- warnings.warn(
389
- "Using 'cat1' and 'cat2' for dichotomous exposure is deprecated and will be removed in a future release. Use 'exposed' and 'unexposed' instead.",
390
- FutureWarning,
391
- stacklevel=2,
392
- )
393
- exposure_data["parameter"] = exposure_data["parameter"].replace(
394
- {
395
- "cat1": self.exposure_component.dichotomous_exposure_category_names.exposed,
396
- "cat2": self.exposure_component.dichotomous_exposure_category_names.unexposed,
397
- }
398
- )
399
- if rebin_exposed_categories:
400
- exposure_data = self._rebin_exposure_data(
401
- exposure_data,
402
- rebin_exposed_categories,
403
- self.exposure_component.dichotomous_exposure_category_names.exposed,
404
- )
405
-
406
- exposure_data = exposure_data[
407
- exposure_data["parameter"]
408
- == self.exposure_component.dichotomous_exposure_category_names.exposed
409
- ]
410
- return exposure_data.drop(columns="parameter")
411
-
412
- @staticmethod
413
- def _rebin_exposure_data(
414
- exposure_data: pd.DataFrame, rebin_exposed_categories: set, exposed_category_name: str
415
- ) -> pd.DataFrame:
416
- exposure_data = exposure_data[
417
- exposure_data["parameter"].isin(rebin_exposed_categories)
418
- ]
419
- exposure_data["parameter"] = exposed_category_name
420
- exposure_data = (
421
- exposure_data.groupby(list(exposure_data.columns.difference(["value"])))
422
- .sum()
423
- .reset_index()
424
- )
425
- return exposure_data
426
-
427
- def get_exposure_value_columns(
428
- self, exposure_data: int | float | pd.DataFrame
429
- ) -> list[str] | None:
430
- if isinstance(exposure_data, pd.DataFrame):
431
- return self.get_value_columns(exposure_data)
432
- return None
433
-
434
- # noinspection PyAttributeOutsideInit
435
- def setup(self, builder: Builder) -> None:
436
- super().setup(builder)
437
- self.joint_paf = builder.value.register_value_producer(
438
- f"{self.exposure_component.entity}.exposure_parameters.paf",
439
- source=lambda index: [self.lookup_tables["paf"](index)],
440
- component=self,
441
- preferred_combiner=list_combiner,
442
- preferred_post_processor=union_post_processor,
443
- )
444
-
445
- def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
446
- return builder.value.register_value_producer(
447
- f"{self.exposure_component.entity}.exposure_parameters",
448
- source=self.exposure_parameter_source,
449
- component=self,
450
- required_resources=get_lookup_columns(
451
- [self.lookup_tables[self.exposure_component.exposure_type]]
452
- ),
453
- )
454
-
455
- ##############
456
- # Validators #
457
- ##############
458
-
459
- def validate_rebin_source(self, builder, data: pd.DataFrame) -> None:
460
- if not isinstance(data, pd.DataFrame):
461
- return
462
-
463
- rebin_exposed_categories = set(
464
- builder.configuration[self.exposure_component.entity]["rebinned_exposed"]
465
- )
466
-
467
- if (
468
- rebin_exposed_categories
469
- and builder.configuration[self.exposure_component.entity]["category_thresholds"]
470
- ):
471
- raise ValueError(
472
- f"Rebinning and category thresholds are mutually exclusive. "
473
- f"You provided both for {self.exposure_component.entity.name}."
474
- )
475
-
476
- invalid_cats = rebin_exposed_categories.difference(set(data.parameter))
477
- if invalid_cats:
478
- raise ValueError(
479
- f"The following provided categories for the rebinned exposed "
480
- f"category of {self.exposure_component.entity.name} are not found in the exposure data: "
481
- f"{invalid_cats}."
482
- )
483
-
484
- if rebin_exposed_categories == set(data.parameter):
485
- raise ValueError(
486
- f"The provided categories for the rebinned exposed category of "
487
- f"{self.exposure_component.entity.name} comprise all categories for the exposure data. "
488
- f"At least one category must be left out of the provided categories "
489
- f"to be rebinned into the unexposed category."
490
- )
491
-
492
- ##################################
493
- # Pipeline sources and modifiers #
494
- ##################################
495
-
496
- def exposure_parameter_source(self, index: pd.Index) -> pd.Series:
497
- base_exposure = self.lookup_tables[self.exposure_component.exposure_type](
498
- index
499
- ).values
500
- joint_paf = self.joint_paf(index).values
501
- return pd.Series(base_exposure * (1 - joint_paf), index=index, name="values")
502
-
503
- ##################
504
- # Public methods #
505
- ##################
506
-
507
- def ppf(self, quantiles: pd.Series) -> pd.Series:
508
- exposed = quantiles < self.exposure_parameters(quantiles.index)
509
- return pd.Series(
510
- exposed.replace(
511
- {
512
- True: self.exposure_component.dichotomous_exposure_category_names.exposed,
513
- False: self.exposure_component.dichotomous_exposure_category_names.unexposed,
514
- }
515
- ),
516
- name=f"{self.exposure_component.entity}.{self.exposure_component.exposure_type}",
517
- index=quantiles.index,
518
- )
519
-
520
-
521
- def clip(q):
522
- """Adjust the percentile boundary cases.
523
-
524
- The risk distributions package uses the 99.9th and 0.001st percentiles
525
- of a log-normal distribution as the bounds of the distribution support.
526
- This is bound up in the GBD risk factor PAF calculation process.
527
- We'll clip the distribution tails so we don't get NaNs back from the
528
- distribution calls
529
- """
530
- Q_LOWER_BOUND = 0.0011
531
- Q_UPPER_BOUND = 0.998
532
- q[q > Q_UPPER_BOUND] = Q_UPPER_BOUND
533
- q[q < Q_LOWER_BOUND] = Q_LOWER_BOUND
534
- return q
535
-
536
-
537
- def get_risk_distribution_parameter(
538
- value_columns_getter: Callable[[pd.DataFrame], list[str]],
539
- data: float | pd.DataFrame,
540
- ) -> float | pd.Series:
541
- if isinstance(data, pd.DataFrame):
542
- value_columns = value_columns_getter(data)
543
- if len(value_columns) > 1:
544
- raise ValueError(
545
- "Expected a single value column for risk data, but found "
546
- f"{len(value_columns)}: {value_columns}."
547
- )
548
- # don't return parameter col in continuous and ensemble distribution
549
- # means to match standard deviation index
550
- if "parameter" in data.columns and set(data["parameter"]) == {"continuous"}:
551
- data = data.drop("parameter", axis=1)
552
- index = [col for col in data.columns if col not in value_columns]
553
- data = data.set_index(index)[value_columns].squeeze(axis=1)
554
-
555
- return data