guts-base 2.0.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
guts_base/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from . import sim
+ from . import mod
+ from . import data
+ from . import prob
+ from . import plot
+
+ __version__ = "2.0.0b0"
+
+ from .sim import (
+     GutsBase,
+     PymobSimulator,
+     ECxEstimator,
+     LPxEstimator,
+     GutsBaseError,
+ )
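For orientation, a minimal usage sketch of the public API this __init__ re-exports (assuming the wheel is installed in the current environment):

    import guts_base

    print(guts_base.__version__)  # -> "2.0.0b0"

    # the simulator and estimator classes are re-exported from guts_base.sim
    simulator_cls = guts_base.GutsBase
    ecx_estimator_cls = guts_base.ECxEstimator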
guts_base/data/__init__.py ADDED
@@ -0,0 +1,35 @@
+ from typing import Callable
+
+ from . import utils
+ from . import openguts
+ from . import expydb
+ from . import survival
+ from . import generator
+ from . import time_of_death
+ from . import preprocessing
+
+ from .openguts import (
+     OpenGutsIO,
+     create_new_columns_and_test_integrity_of_replicates,
+     create_database_and_import_data,
+     create_database_and_import_data_main,
+     import_data_to_database,
+ )
+ from .survival import (
+     prepare_survival_data_for_conditional_binomial,
+     survivors_at_start_of_interval,
+     generate_survival_repeated_observations,
+     is_survival_only_nan_except_start,
+ )
+
+ from .generator import create_artificial_data, design_exposure_scenario, ExposureDataDict
+
+ from .expydb import (
+     to_dataset,
+     combine_coords_to_multiindex,
+     reduce_multiindex_to_flat_index,
+ )
+
+ from .time_of_death import (
+     time_of_death_to_openguts,
+ )
guts_base/data/expydb.py ADDED
@@ -0,0 +1,248 @@
+ from typing import List, Optional, Literal
+ import datetime
+
+ import numpy as np
+ import xarray as xr
+ import pandas as pd
+ import arviz as az
+
+ from expyDB.intervention_model import (
+     Experiment,
+     Treatment,
+     Timeseries,
+     TsData,
+     from_expydb,
+ )
+
+ from guts_base.sim.config import AllowedTimeUnits
+
+ def prepare_dataset(
+     idata,
+     variable="survival",
+     unit_time: AllowedTimeUnits = "day"
+ ):
+     """Get interventions from idata storage with respect to the treatment
+     ids of the observations and move non indexing-related metadata (unique
+     metadata) to the attrs container.
+     """
+     # this test is guaranteed to pass when prepare_dataset is used together
+     # with from_expydb, because from_expydb organizes the data into datasets
+     # with one variable, which is the timeseries variable with the coordinates
+     # timeseries_id and time for treatments and replicates. Other variables
+     # receive their own dataset.
+     assert len(idata[variable].data_vars) == 1
+     array: xr.DataArray = idata[variable][variable]
+     array = array.swap_dims(timeseries_id="treatment_id")
+     array = array.drop_vars("id")
+     # assuming that each timeseries of one variable in each treatment has
+     # a unique name, the resulting index should be unique
+     array = array.set_index(id=("treatment_id", "timeseries_name"))
+     array = array.drop_vars("timeseries_id")
+     assert array.indexes["id"].is_unique
+
+     # convert time to the requested unit and cast to float
+     time_float = array.time.values / pd.Timedelta(1, unit_time)
+     array = array.assign_coords(time=time_float)
+
+     array = move_unique_coordinates_to_attrs(array)
+
+     array.attrs["unit_time"] = unit_time
+
+     # add a unique id for the selected dataset which is only relevant for
+     # the scope of modelling
+     return array
+
+ def move_unique_coordinates_to_attrs(array: xr.DataArray) -> xr.DataArray:
+     key: str
+     for key, coord in array.coords.items():  # type: ignore
+         if key in ["id", "treatment_id", "timeseries_id", "experiment_id", "subject_count", "timeseries_name"]:
+             continue
+
+         if coord.isnull().all():
+             unique_values = [None]
+         else:
+             unique_values = np.unique(coord.data)
+
+         if len(unique_values) == 1:
+             array.attrs.update({key: unique_values[0]})
+             array = array.drop_vars(key)
+     return array
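A hedged usage sketch for move_unique_coordinates_to_attrs: a coordinate that holds a single unique value across the array is moved to attrs and dropped, while non-unique coordinates are kept. The guts_base.data.expydb module path is inferred from this diff:

    import numpy as np
    import xarray as xr
    from guts_base.data.expydb import move_unique_coordinates_to_attrs

    arr = xr.DataArray(
        np.zeros((2, 3)),
        dims=("id", "time"),
        coords={
            "id": [0, 1],
            "time": [0.0, 1.0, 2.0],
            "species": ("id", ["daphnia", "daphnia"]),  # one unique value -> moved to attrs
            "replicate": ("id", [1, 2]),                # two values -> stays a coordinate
        },
    )
    arr = move_unique_coordinates_to_attrs(arr)
    print(arr.attrs["species"])       # "daphnia"
    print("replicate" in arr.coords)  # True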
+
+ # def prepare_interventions_dataset(interventions_idata, observations, ivs: Optional[List[str]] = None):
+ #     """Get interventions from idata storage with respect to the treatment
+ #     ids of the observations"""
+ #     if ivs is None:
+ #         ivs = list(interventions_idata.keys())
+ #     ds_ivs = get_interventions(
+ #         interventions_idata,
+ #         observations=observations,
+ #         ivs=ivs
+ #     )
+
+ #     time_h = ds_ivs.time.values / np.timedelta64(1, "h")
+ #     ds_ivs = ds_ivs.assign_coords(time=time_h)
+ #     ds_ivs.attrs["unit_time"] = "hours (h)"
+
+ #     return ds_ivs
+
+ def to_dataset(
+     observations_idata,
+     interventions_idata,
+     unit_time: Literal["day", "hour", "minute", "second"] = "hour"
+ ) -> xr.Dataset:
+     """Combines intervention and observation datasets, assuming that there is
+     a unique multiindex that can be constructed from
+     - treatment_id
+     - timeseries_name
+
+     This way interventions and observations can be combined into a single
+     dataset.
+     """
+     data_arrays = {}
+
+     # prepare observations
+     for variable in observations_idata.groups():
+         da = prepare_dataset(
+             idata=observations_idata,
+             variable=variable,
+             unit_time=unit_time,
+         )
+         data_arrays.update({variable: da})
+
+     # prepare interventions
+     for variable in interventions_idata.groups():
+         da = prepare_dataset(
+             idata=interventions_idata,
+             variable=variable,
+             unit_time=unit_time,
+         )
+         data_arrays.update({variable: da})
+
+     return xr.combine_by_coords(data_arrays.values())  # type: ignore
+
+
+ def reduce_multiindex_to_flat_index(dataset):
+     multi_index = dataset.id.indexes["id"]
+
+     # create a flat index from the multi index
+     flat_index = multi_index.map(lambda x: "__".join([str(x_) for x_ in x]))
+
+     # remove the multi index from dimension 'id'
+     dataset = dataset.reset_index("id")
+
+     # assign the flat index to dimension 'id'
+     dataset = dataset.assign_coords(id=flat_index)
+
+     return dataset
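A hedged usage sketch for reduce_multiindex_to_flat_index, flattening a (treatment_id, timeseries_name) MultiIndex on 'id' into '__'-joined strings (toy data; xr.Coordinates.from_pandas_multiindex requires a recent xarray):

    import pandas as pd
    import xarray as xr
    from guts_base.data import reduce_multiindex_to_flat_index

    midx = pd.MultiIndex.from_tuples(
        [(1, "a"), (2, "b")], names=["treatment_id", "timeseries_name"]
    )
    coords = xr.Coordinates.from_pandas_multiindex(midx, "id")
    ds = xr.Dataset({"survival": ("id", [10, 8])}, coords=coords)

    flat = reduce_multiindex_to_flat_index(ds)
    print(list(flat.id.values))  # ['1__a', '2__b']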
+
+ def combine_coords_to_multiindex(
+     dataset: xr.Dataset,
+     coordinates: List[str],
+     index_name: str,
+     sep: str = "__"
+ ) -> xr.Dataset:
+     """Combines a list of coordinates into a single joined string coordinate.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         The observations dataset
+     coordinates : List[str]
+         The coordinates that should be joined
+     index_name : str
+         The name of the new, joined coordinate
+     sep : str, optional
+         The string used to separate the coordinate components, by default "__"
+
+     Returns
+     -------
+     xr.Dataset
+         Dataset with a new coordinate composed of the listed coordinates
+     """
+     try:
+         multi_index = pd.MultiIndex.from_arrays([dataset[c].values for c in coordinates])
+     except KeyError as err:
+         raise KeyError(
+             f"Did not find key {err} in the dataset. "
+             f"This is probably because the key {err} is equal for all treatments."
+         ) from err
+     # use the configured separator instead of a hard-coded "__"
+     multi_index = multi_index.map(lambda x: sep.join([str(x_) for x_ in x]))
+     return dataset.assign_coords({index_name: ("id", multi_index)})
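Despite its name, combine_coords_to_multiindex produces a flat, joined string coordinate, as the docstring says. A hedged usage sketch on toy data:

    import xarray as xr
    from guts_base.data import combine_coords_to_multiindex

    ds = xr.Dataset(
        {"survival": ("id", [10, 8])},
        coords={
            "id": [0, 1],
            "treatment_id": ("id", [1, 2]),
            "timeseries_name": ("id", ["a", "b"]),
        },
    )
    ds = combine_coords_to_multiindex(
        ds, coordinates=["treatment_id", "timeseries_name"], index_name="treatment__name"
    )
    print(list(ds["treatment__name"].values))  # ['1__a', '2__b']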
+
+ # def get_interventions(interventions_idata, observations, ivs: List[str]) -> xr.Dataset:
+ #     """Get the interventions according to the treatment ids of the observation
+ #     dataset.
+
+ #     Works only for single interventions
+ #     """
+ #     X_in = {}
+ #     for data_var in ivs:
+ #         x_in = interventions_idata[data_var]\
+ #             .swap_dims(timeseries_id="treatment_id")\
+ #             .sel(treatment_id=observations.treatment_id.values)
+
+ #         x_in = x_in.assign_coords(
+ #             _id=("treatment_id", range(x_in.sizes["treatment_id"]))
+ #         )
+ #         x_in = x_in.swap_dims(treatment_id="_id")
+ #         X_in.update({data_var: x_in[data_var]})
+
+ #     X_in_dataset = xr.concat(X_in.values(), dim="variable")\
+ #         .assign_coords(variable=ivs)\
+ #         .to_dataset(dim="variable")
+
+ #     if "variable" in X_in_dataset.dims:
+ #         X_in_dataset = X_in_dataset.drop_dims("variable")
+
+ #     return X_in_dataset
+
+
+ # def combine_interventions(
+ #     interventions: az.InferenceData,
+ #     force: bool = False
+ # ) -> xr.DataArray:
+ #     """Combining interventions into a single dataset is only possible
+ #     if there is only a single timeseries for each intervention.
+
+ #     Parameters
+ #     ----------
+ #     interventions : az.InferenceData
+ #         Interventions InferenceData. Contains multiple datasets with at
+ #         least one timeseries
+ #     force : bool, optional
+ #         Override the restriction to combine interventions only when the number
+ #         of timeseries is 1, by default False
+
+ #     Returns
+ #     -------
+ #     xr.DataArray
+ #         Interventions, combined into a single dataset
+
+ #     Raises
+ #     ------
+ #     ValueError
+ #         If the number of timeseries is larger than 1 and force is not True
+ #     """
+ #     assert isinstance(interventions, az.InferenceData)
+ #     arrays = []
+ #     for variable, dataset in interventions.items():
+ #         if dataset.sizes["timeseries_id"] > 1:
+ #             if force:
+ #                 arr = dataset.to_array()
+ #             else:
+ #                 raise ValueError(
+ #                     "Combining interventions is only allowed when the number of "
+ #                     "timeseries for each variable is 1. This is to avoid blowing "
+ #                     "up the size of the dataset with nans, because timeseries ids "
+ #                     "are different for each variable. You can override this error "
+ #                     "by using `force=True`."
+ #                 )
+ #         else:
+ #             arr = dataset.squeeze("timeseries_id").to_array()
+
+ #         arrays.append(arr)
+
+ #     return xr.concat(arrays, dim="variable")
+
guts_base/data/generator.py ADDED
@@ -0,0 +1,191 @@
+ import copy
+ from typing import TypedDict, Dict, Optional, Sequence, Literal
+
+ import numpy as np
+ import pandas as pd
+ import xarray as xr
+ from numpy.typing import NDArray
+
+ class ExposureDataDict(TypedDict):
+     start: float
+     end: Optional[float]
+     exposure: Optional[float | Sequence[float]]
+
+ def create_artificial_data(
+     t_max,
+     dt,
+     exposure_paths=["oral", "topical", "contact"],
+     intensity=[0.1, 0.5, 0.05],
+     seed=1,
+ ):
+     # use the seed argument instead of a hard-coded seed
+     rng = np.random.default_rng(seed)
+     time = np.arange(0, t_max, step=dt)  # daily time resolution
+
+     # calculate potential exposure based on a lognormal distribution
+     oral = rng.lognormal(mean=np.log(intensity[0]), sigma=0.5, size=len(time))
+     # and include random exposure days
+     oral *= rng.binomial(n=1, p=1, size=len(time))
+
+     # calculate potential exposure based on a lognormal distribution
+     topical = rng.lognormal(mean=np.log(intensity[1]), sigma=1, size=len(time))
+     # and include random exposure days
+     topical *= rng.binomial(n=1, p=0.25, size=len(time))
+
+     # calculate potential exposure based on a lognormal distribution
+     contact = rng.lognormal(mean=np.log(intensity[2]), sigma=0.1, size=len(time))
+     # and include random exposure days
+     contact *= rng.binomial(n=1, p=0.8, size=len(time))
+
+     exposures = xr.Dataset(
+         data_vars={
+             "exposure": (("time", "exposure_path"), np.column_stack([oral, topical, contact])),
+         },
+         coords={"time": time, "exposure_path": ["oral", "topical", "contact"]}
+     )
+
+     return exposures.sel(exposure_path=exposure_paths)
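A minimal usage sketch for create_artificial_data (assuming the wheel is installed; the re-export through guts_base.data is shown in the second hunk above):

    from guts_base.data import create_artificial_data

    # ten days of synthetic exposure at daily resolution on two exposure paths
    exposures = create_artificial_data(
        t_max=10, dt=1.0, exposure_paths=["oral", "topical"], seed=42
    )
    print(exposures["exposure"].shape)  # (10, 2): time x exposure_path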
+
+
+ def design_exposure_timeseries(time: NDArray, exposure: ExposureDataDict, eps: float):
+     if exposure is None:
+         return
+
+     if exposure["exposure"] is None:
+         exposure["exposure"] = 0.0
+
+     exposure["end"] = time[-1] if exposure["end"] is None else exposure["end"]
+
+     return np.where(
+         np.logical_and(time >= exposure["start"], time < exposure["end"]),
+         # compatibility with the old version where exposure was named concentration
+         exposure["concentration"] if "concentration" in exposure else exposure["exposure"],
+         0
+     )
+
+ def design_exposure_scenario(
+     t_max: float,
+     dt: float,
+     exposures: Dict[str, ExposureDataDict],
+     eps: float = 1e-8,
+     exposure_dimension: str = "exposure_type",
+ ):
+     """
+     TODO: t_max, dt and eps are probably not necessary
+     """
+     # add dt so that t_max is definitely included
+     time = np.arange(0, t_max + dt, step=dt)  # daily time resolution
+     # make sure each exposure window's end point is part of the time vector
+     time = np.unique(np.concatenate([time] + [
+         np.array([time[-1] if vals["end"] is None else vals["end"]])
+         for key, vals in exposures.items()
+     ]))
+
+     treatments = {}
+     for key, expo in exposures.items():
+         treat = design_exposure_timeseries(time, expo, eps)
+         treatments.update({key: treat})
+
+     data = np.column_stack(list(treatments.values()))
+     data = np.expand_dims(data, axis=0)
+
+     coords = {"id": [0], "time": time}
+     coords.update({exposure_dimension: list(treatments.keys())})
+
+     exposures_dataset = xr.Dataset(
+         data_vars={"exposure": (tuple(coords.keys()), data)},
+         coords=coords
+     )
+
+     return exposures_dataset
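A hedged usage sketch for design_exposure_scenario: a two-day pulse next to a constant background exposure, on a daily grid:

    from guts_base.data import design_exposure_scenario, ExposureDataDict

    scenario = design_exposure_scenario(
        t_max=10.0,
        dt=1.0,
        exposures={
            "pulse": ExposureDataDict(start=0.0, end=2.0, exposure=5.0),
            "background": ExposureDataDict(start=0.0, end=None, exposure=0.1),
        },
    )
    # exposure has dims (id, time, exposure_type); the pulse is non-zero for t in [0, 2)
    print(scenario["exposure"].sel(id=0, exposure_type="pulse").values)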
+
+
+ def draft_laboratory_experiment(
+     treatments: Dict[str, float | Dict[str, float]],
+     experiment_end: pd.Timedelta = pd.Timedelta(10, unit="days"),
+     exposure_pattern: ExposureDataDict | Dict[str, ExposureDataDict] = ExposureDataDict(start=0.0, end=None, exposure=None),
+     dt: pd.Timedelta = pd.Timedelta(1, unit="days"),
+     exposure_dimension: str = "exposure_type",
+ ):
+     time_unit = pd.Timedelta(1, experiment_end.resolution_string)  # type: ignore
+
+     dt_float = dt / time_unit
+     experiment_end_float = experiment_end / time_unit + dt / time_unit
+     exposures = {}
+     for treatment_name, treatment in treatments.items():
+         if isinstance(treatment, dict):
+             dummy_dim = False
+             # deep copy, so the nested dicts of the exposure pattern are not
+             # mutated across treatments
+             exposure_dict = copy.deepcopy(exposure_pattern)
+             for treatment_key, treatment_val in treatment.items():
+                 if treatment_key not in exposure_dict:
+                     raise KeyError(
+                         "If `treatments` values contain multiple keys " +
+                         f"({treatment.keys()}), these must be present in the " +
+                         "`exposure_pattern` as well; i.e. exposure_pattern must be a dict."
+                     )
+                 exposure_dict[treatment_key]["exposure"] = treatment_val
+
+         else:
+             dummy_dim = True
+             exposure = exposure_pattern.copy()
+
+             if "exposure" not in exposure:
+                 raise KeyError(
+                     "exposure_pattern did not contain the key `exposure` " +
+                     f"but {exposure.keys()}. Make sure the treatments and exposures match."
+                 )
+             exposure["exposure"] = treatment
+
+             exposure_dict = {"dummy_key": exposure}
+
+         for _, vals in exposure_dict.items():
+             if vals["end"] is None:
+                 pass
+             elif isinstance(vals["end"], float | int):
+                 pass
+             elif isinstance(vals["end"], pd.Timedelta):
+                 vals["end"] = vals["end"] / time_unit
+             else:
+                 raise NotImplementedError(
+                     f"exposure_data['end']={vals['end']} but must be None, float or pd.Timedelta."
+                 )
+
+             if vals["start"] is None:
+                 pass
+             elif isinstance(vals["start"], float | int):
+                 pass
+             elif isinstance(vals["start"], pd.Timedelta):
+                 vals["start"] = vals["start"] / time_unit
+             else:
+                 raise NotImplementedError(
+                     f"exposure_data['start']={vals['start']} but must be None, float or pd.Timedelta."
+                 )
+
+         x_in = design_exposure_scenario(
+             t_max=experiment_end_float, dt=dt_float,
+             exposures=exposure_dict,
+             exposure_dimension=exposure_dimension,
+         )
+
+         if dummy_dim:
+             x_in = x_in.isel({exposure_dimension: 0})
+             x_in["exposure"] = x_in["exposure"].drop_vars(exposure_dimension)
+
+         x_in = x_in.assign_coords({"id": [treatment_name]})
+         exposures.update({treatment_name: x_in})
+
+     experiment = xr.combine_by_coords(exposures.values())
+     # select by id so the order of the treatments remains consistent
+     experiment = experiment.sel(
+         id=list(exposures.keys()),
+         time=[float(t) for t in experiment.time if t <= experiment_end / time_unit]
+     )
+
+     return experiment
+
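A hedged usage sketch for draft_laboratory_experiment with scalar treatments (the guts_base.data.generator module path is inferred from this diff; draft_laboratory_experiment is not among the names re-exported in the second hunk):

    import pandas as pd
    from guts_base.data.generator import draft_laboratory_experiment

    experiment = draft_laboratory_experiment(
        treatments={"control": 0.0, "T1": 1.0, "T2": 10.0},
        experiment_end=pd.Timedelta(10, unit="days"),
        dt=pd.Timedelta(1, unit="days"),
    )
    # one constant-exposure timeseries per treatment, ids in the given order
    print(list(experiment.id.values))  # ['control', 'T1', 'T2']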