guts-base 2.0.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
guts_base/__init__.py ADDED
@@ -0,0 +1,15 @@
+ from . import sim
+ from . import mod
+ from . import data
+ from . import prob
+ from . import plot
+
+ __version__ = "2.0.0b0"
+
+ from .sim import (
+     GutsBase,
+     PymobSimulator,
+     ECxEstimator,
+     LPxEstimator,
+     GutsBaseError,
+ )
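For orientation, a minimal usage sketch of the public API this __init__ re-exports (assuming the wheel is installed in the current environment):

    import guts_base

    print(guts_base.__version__)  # -> "2.0.0b0"

    # the simulator and estimator classes are re-exported from guts_base.sim
    simulator_cls = guts_base.GutsBase
    ecx_estimator_cls = guts_base.ECxEstimator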
guts_base/data/__init__.py ADDED
@@ -0,0 +1,35 @@
+ from typing import Callable
+
+ from . import utils
+ from . import openguts
+ from . import expydb
+ from . import survival
+ from . import generator
+ from . import time_of_death
+ from . import preprocessing
+
+ from .openguts import (
+     OpenGutsIO,
+     create_new_columns_and_test_integrity_of_replicates,
+     create_database_and_import_data,
+     create_database_and_import_data_main,
+     import_data_to_database,
+ )
+ from .survival import (
+     prepare_survival_data_for_conditional_binomial,
+     survivors_at_start_of_interval,
+     generate_survival_repeated_observations,
+     is_survival_only_nan_except_start,
+ )
+
+ from .generator import create_artificial_data, design_exposure_scenario, ExposureDataDict
+
+ from .expydb import (
+     to_dataset,
+     combine_coords_to_multiindex,
+     reduce_multiindex_to_flat_index,
+ )
+
+ from .time_of_death import (
+     time_of_death_to_openguts,
+ )
guts_base/data/expydb.py ADDED
@@ -0,0 +1,248 @@
+ from typing import List, Optional, Literal
+ import datetime
+
+ import numpy as np
+ import xarray as xr
+ import pandas as pd
+ import arviz as az
+
+ from expyDB.intervention_model import (
+     Experiment,
+     Treatment,
+     Timeseries,
+     TsData,
+     from_expydb,
+ )
+
+ from guts_base.sim.config import AllowedTimeUnits
+
+ def prepare_dataset(
+     idata,
+     variable="survival",
+     unit_time: AllowedTimeUnits = "day"
+ ):
+     """Get interventions from idata storage with respect to the treatment
+     ids of the observations and move non indexing-related metadata (unique
+     metadata) to the attrs container.
+     """
+     # this test is guaranteed to pass when prepare_dataset is used together
+     # with from_expydb, because from_expydb organizes the data into datasets
+     # with one variable, which is the timeseries variable with the coordinates
+     # timeseries_id and time for treatments and replicates. Other variables
+     # receive their own dataset.
+     assert len(idata[variable].data_vars) == 1
+     array: xr.DataArray = idata[variable][variable]
+     array = array.swap_dims(timeseries_id="treatment_id")
+     array = array.drop_vars("id")
+     # assuming that each timeseries of one variable in each treatment has
+     # a unique name, the resulting index should be unique
+     array = array.set_index(id=("treatment_id", "timeseries_name"))
+     array = array.drop_vars("timeseries_id")
+     assert array.indexes["id"].is_unique
+
+     # convert time to the requested unit and cast to float
+     time_float = array.time.values / pd.Timedelta(1, unit_time)
+     array = array.assign_coords(time=time_float)
+
+     array = move_unique_coordinates_to_attrs(array)
+
+     array.attrs["unit_time"] = unit_time
+
+     # add a unique id for the selected dataset which is only relevant for
+     # the scope of modelling
+     return array
+
+ def move_unique_coordinates_to_attrs(array: xr.DataArray) -> xr.DataArray:
+     key: str
+     for key, coord in array.coords.items():  # type: ignore
+         if key in ["id", "treatment_id", "timeseries_id", "experiment_id", "subject_count", "timeseries_name"]:
+             continue
+
+         if coord.isnull().all():
+             unique_values = [None]
+         else:
+             unique_values = np.unique(coord.data)
+
+         if len(unique_values) == 1:
+             array.attrs.update({key: unique_values[0]})
+             array = array.drop_vars(key)
+     return array
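A hedged usage sketch for move_unique_coordinates_to_attrs: a coordinate that holds a single unique value across the array is moved to attrs and dropped, while non-unique coordinates are kept. The guts_base.data.expydb module path is inferred from this diff:

    import numpy as np
    import xarray as xr
    from guts_base.data.expydb import move_unique_coordinates_to_attrs

    arr = xr.DataArray(
        np.zeros((2, 3)),
        dims=("id", "time"),
        coords={
            "id": [0, 1],
            "time": [0.0, 1.0, 2.0],
            "species": ("id", ["daphnia", "daphnia"]),  # one unique value -> moved to attrs
            "replicate": ("id", [1, 2]),                # two values -> stays a coordinate
        },
    )
    arr = move_unique_coordinates_to_attrs(arr)
    print(arr.attrs["species"])       # "daphnia"
    print("replicate" in arr.coords)  # True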
+
+ # def prepare_interventions_dataset(interventions_idata, observations, ivs: Optional[List[str]] = None):
+ #     """Get interventions from idata storage with respect to the treatment
+ #     ids of the observations"""
+ #     if ivs is None:
+ #         ivs = list(interventions_idata.keys())
+ #     ds_ivs = get_interventions(
+ #         interventions_idata,
+ #         observations=observations,
+ #         ivs=ivs
+ #     )
+
+ #     time_h = ds_ivs.time.values / np.timedelta64(1, "h")
+ #     ds_ivs = ds_ivs.assign_coords(time=time_h)
+ #     ds_ivs.attrs["unit_time"] = "hours (h)"
+
+ #     return ds_ivs
+
+ def to_dataset(
+     observations_idata,
+     interventions_idata,
+     unit_time: Literal["day", "hour", "minute", "second"] = "hour"
+ ) -> xr.Dataset:
+     """Combines intervention and observation datasets, assuming that there is
+     a unique multiindex that can be constructed from
+     - treatment_id
+     - timeseries_name
+
+     This way interventions and observations can be combined into a single
+     dataset.
+     """
+     data_arrays = {}
+
+     # prepare observations
+     for variable in observations_idata.groups():
+         da = prepare_dataset(
+             idata=observations_idata,
+             variable=variable,
+             unit_time=unit_time,
+         )
+         data_arrays.update({variable: da})
+
+     # prepare interventions
+     for variable in interventions_idata.groups():
+         da = prepare_dataset(
+             idata=interventions_idata,
+             variable=variable,
+             unit_time=unit_time,
+         )
+         data_arrays.update({variable: da})
+
+     return xr.combine_by_coords(data_arrays.values())  # type: ignore
+
+
+ def reduce_multiindex_to_flat_index(dataset):
+     multi_index = dataset.id.indexes["id"]
+
+     # create a flat index from the multi index
+     flat_index = multi_index.map(lambda x: "__".join([str(x_) for x_ in x]))
+
+     # remove the multi index from dimension 'id'
+     dataset = dataset.reset_index("id")
+
+     # assign the flat index to dimension 'id'
+     dataset = dataset.assign_coords(id=flat_index)
+
+     return dataset
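A hedged usage sketch for reduce_multiindex_to_flat_index, flattening a (treatment_id, timeseries_name) MultiIndex on 'id' into '__'-joined strings (toy data; xr.Coordinates.from_pandas_multiindex requires a recent xarray):

    import pandas as pd
    import xarray as xr
    from guts_base.data import reduce_multiindex_to_flat_index

    midx = pd.MultiIndex.from_tuples(
        [(1, "a"), (2, "b")], names=["treatment_id", "timeseries_name"]
    )
    coords = xr.Coordinates.from_pandas_multiindex(midx, "id")
    ds = xr.Dataset({"survival": ("id", [10, 8])}, coords=coords)

    flat = reduce_multiindex_to_flat_index(ds)
    print(list(flat.id.values))  # ['1__a', '2__b']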
+
+ def combine_coords_to_multiindex(
+     dataset: xr.Dataset,
+     coordinates: List[str],
+     index_name: str,
+     sep: str = "__"
+ ) -> xr.Dataset:
+     """Combines a list of coordinates into a single joined string coordinate.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         The observations dataset
+     coordinates : List[str]
+         The coordinates that should be joined
+     index_name : str
+         The name of the new, joined coordinate
+     sep : str, optional
+         The string used to separate the coordinate components, by default "__"
+
+     Returns
+     -------
+     xr.Dataset
+         Dataset with a new coordinate composed of the listed coordinates
+     """
+     try:
+         multi_index = pd.MultiIndex.from_arrays([dataset[c].values for c in coordinates])
+     except KeyError as err:
+         raise KeyError(
+             f"Did not find key {err} in the dataset. "
+             f"This is probably because the key {err} is equal for all treatments."
+         ) from err
+     # use the configured separator instead of a hard-coded "__"
+     multi_index = multi_index.map(lambda x: sep.join([str(x_) for x_ in x]))
+     return dataset.assign_coords({index_name: ("id", multi_index)})
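Despite its name, combine_coords_to_multiindex produces a flat, joined string coordinate, as the docstring says. A hedged usage sketch on toy data:

    import xarray as xr
    from guts_base.data import combine_coords_to_multiindex

    ds = xr.Dataset(
        {"survival": ("id", [10, 8])},
        coords={
            "id": [0, 1],
            "treatment_id": ("id", [1, 2]),
            "timeseries_name": ("id", ["a", "b"]),
        },
    )
    ds = combine_coords_to_multiindex(
        ds, coordinates=["treatment_id", "timeseries_name"], index_name="treatment__name"
    )
    print(list(ds["treatment__name"].values))  # ['1__a', '2__b']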
+
+ # def get_interventions(interventions_idata, observations, ivs: List[str]) -> xr.Dataset:
+ #     """Get the interventions according to the treatment ids of the observation
+ #     dataset.
+
+ #     Works only for single interventions
+ #     """
+ #     X_in = {}
+ #     for data_var in ivs:
+ #         x_in = interventions_idata[data_var]\
+ #             .swap_dims(timeseries_id="treatment_id")\
+ #             .sel(treatment_id=observations.treatment_id.values)
+
+ #         x_in = x_in.assign_coords(
+ #             _id=("treatment_id", range(x_in.sizes["treatment_id"]))
+ #         )
+ #         x_in = x_in.swap_dims(treatment_id="_id")
+ #         X_in.update({data_var: x_in[data_var]})
+
+ #     X_in_dataset = xr.concat(X_in.values(), dim="variable")\
+ #         .assign_coords(variable=ivs)\
+ #         .to_dataset(dim="variable")
+
+ #     if "variable" in X_in_dataset.dims:
+ #         X_in_dataset = X_in_dataset.drop_dims("variable")
+
+ #     return X_in_dataset
+
+
+ # def combine_interventions(
+ #     interventions: az.InferenceData,
+ #     force: bool = False
+ # ) -> xr.DataArray:
+ #     """Combining interventions into a single dataset is only possible
+ #     if there is only a single timeseries for each intervention.
+
+ #     Parameters
+ #     ----------
+ #     interventions : az.InferenceData
+ #         Interventions InferenceData. Contains multiple datasets with at
+ #         least one timeseries
+ #     force : bool, optional
+ #         Override the restriction to combine interventions only when the number
+ #         of timeseries is 1, by default False
+
+ #     Returns
+ #     -------
+ #     xr.DataArray
+ #         Interventions, combined into a single dataset
+
+ #     Raises
+ #     ------
+ #     ValueError
+ #         If the number of timeseries is larger than 1 and force is not True
+ #     """
+ #     assert isinstance(interventions, az.InferenceData)
+ #     arrays = []
+ #     for variable, dataset in interventions.items():
+ #         if dataset.sizes["timeseries_id"] > 1:
+ #             if force:
+ #                 arr = dataset.to_array()
+ #             else:
+ #                 raise ValueError(
+ #                     "Combining interventions is only allowed when the number of "
+ #                     "timeseries for each variable is 1. This is to avoid blowing "
+ #                     "up the size of the dataset with nans, because timeseries ids "
+ #                     "are different for each variable. You can override this error "
+ #                     "by using `force=True`."
+ #                 )
+ #         else:
+ #             arr = dataset.squeeze("timeseries_id").to_array()
+
+ #         arrays.append(arr)
+
+ #     return xr.concat(arrays, dim="variable")
+
guts_base/data/generator.py ADDED
@@ -0,0 +1,191 @@
+ import copy
+ from typing import TypedDict, Dict, Optional, Sequence, Literal
+
+ import numpy as np
+ import pandas as pd
+ import xarray as xr
+ from numpy.typing import NDArray
+
+ class ExposureDataDict(TypedDict):
+     start: float
+     end: Optional[float]
+     exposure: Optional[float | Sequence[float]]
+
+ def create_artificial_data(
+     t_max,
+     dt,
+     exposure_paths=["oral", "topical", "contact"],
+     intensity=[0.1, 0.5, 0.05],
+     seed=1,
+ ):
+     # use the seed argument instead of a hard-coded seed
+     rng = np.random.default_rng(seed)
+     time = np.arange(0, t_max, step=dt)  # daily time resolution
+
+     # calculate potential exposure based on a lognormal distribution
+     oral = rng.lognormal(mean=np.log(intensity[0]), sigma=0.5, size=len(time))
+     # and include random exposure days
+     oral *= rng.binomial(n=1, p=1, size=len(time))
+
+     # calculate potential exposure based on a lognormal distribution
+     topical = rng.lognormal(mean=np.log(intensity[1]), sigma=1, size=len(time))
+     # and include random exposure days
+     topical *= rng.binomial(n=1, p=0.25, size=len(time))
+
+     # calculate potential exposure based on a lognormal distribution
+     contact = rng.lognormal(mean=np.log(intensity[2]), sigma=0.1, size=len(time))
+     # and include random exposure days
+     contact *= rng.binomial(n=1, p=0.8, size=len(time))
+
+     exposures = xr.Dataset(
+         data_vars={
+             "exposure": (("time", "exposure_path"), np.column_stack([oral, topical, contact])),
+         },
+         coords={"time": time, "exposure_path": ["oral", "topical", "contact"]}
+     )
+
+     return exposures.sel(exposure_path=exposure_paths)
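A minimal usage sketch for create_artificial_data (assuming the wheel is installed; the re-export through guts_base.data is shown in the second hunk above):

    from guts_base.data import create_artificial_data

    # ten days of synthetic exposure at daily resolution on two exposure paths
    exposures = create_artificial_data(
        t_max=10, dt=1.0, exposure_paths=["oral", "topical"], seed=42
    )
    print(exposures["exposure"].shape)  # (10, 2): time x exposure_path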
+
+
+ def design_exposure_timeseries(time: NDArray, exposure: ExposureDataDict, eps: float):
+     if exposure is None:
+         return
+
+     if exposure["exposure"] is None:
+         exposure["exposure"] = 0.0
+
+     exposure["end"] = time[-1] if exposure["end"] is None else exposure["end"]
+
+     return np.where(
+         np.logical_and(time >= exposure["start"], time < exposure["end"]),
+         # compatibility with the old version where exposure was named concentration
+         exposure["concentration"] if "concentration" in exposure else exposure["exposure"],
+         0
+     )
+
+ def design_exposure_scenario(
+     t_max: float,
+     dt: float,
+     exposures: Dict[str, ExposureDataDict],
+     eps: float = 1e-8,
+     exposure_dimension: str = "exposure_type",
+ ):
+     """
+     TODO: t_max, dt and eps are probably not necessary
+     """
+     # add dt so that t_max is definitely included
+     time = np.arange(0, t_max + dt, step=dt)  # daily time resolution
+     # make sure each exposure window's end point is part of the time vector
+     time = np.unique(np.concatenate([time] + [
+         np.array([time[-1] if vals["end"] is None else vals["end"]])
+         for key, vals in exposures.items()
+     ]))
+
+     treatments = {}
+     for key, expo in exposures.items():
+         treat = design_exposure_timeseries(time, expo, eps)
+         treatments.update({key: treat})
+
+     data = np.column_stack(list(treatments.values()))
+     data = np.expand_dims(data, axis=0)
+
+     coords = {"id": [0], "time": time}
+     coords.update({exposure_dimension: list(treatments.keys())})
+
+     exposures_dataset = xr.Dataset(
+         data_vars={"exposure": (tuple(coords.keys()), data)},
+         coords=coords
+     )
+
+     return exposures_dataset
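A hedged usage sketch for design_exposure_scenario: a two-day pulse next to a constant background exposure, on a daily grid:

    from guts_base.data import design_exposure_scenario, ExposureDataDict

    scenario = design_exposure_scenario(
        t_max=10.0,
        dt=1.0,
        exposures={
            "pulse": ExposureDataDict(start=0.0, end=2.0, exposure=5.0),
            "background": ExposureDataDict(start=0.0, end=None, exposure=0.1),
        },
    )
    # exposure has dims (id, time, exposure_type); the pulse is non-zero for t in [0, 2)
    print(scenario["exposure"].sel(id=0, exposure_type="pulse").values)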
+
+
+ def draft_laboratory_experiment(
+     treatments: Dict[str, float | Dict[str, float]],
+     experiment_end: pd.Timedelta = pd.Timedelta(10, unit="days"),
+     exposure_pattern: ExposureDataDict | Dict[str, ExposureDataDict] = ExposureDataDict(start=0.0, end=None, exposure=None),
+     dt: pd.Timedelta = pd.Timedelta(1, unit="days"),
+     exposure_dimension: str = "exposure_type",
+ ):
+     time_unit = pd.Timedelta(1, experiment_end.resolution_string)  # type: ignore
+
+     dt_float = dt / time_unit
+     experiment_end_float = experiment_end / time_unit + dt / time_unit
+     exposures = {}
+     for treatment_name, treatment in treatments.items():
+         if isinstance(treatment, dict):
+             dummy_dim = False
+             # deep copy, so the nested dicts of the exposure pattern are not
+             # mutated across treatments
+             exposure_dict = copy.deepcopy(exposure_pattern)
+             for treatment_key, treatment_val in treatment.items():
+                 if treatment_key not in exposure_dict:
+                     raise KeyError(
+                         "If `treatments` values contain multiple keys " +
+                         f"({treatment.keys()}), these must be present in the " +
+                         "`exposure_pattern` as well; i.e. exposure_pattern must be a dict."
+                     )
+                 exposure_dict[treatment_key]["exposure"] = treatment_val
+
+         else:
+             dummy_dim = True
+             exposure = exposure_pattern.copy()
+
+             if "exposure" not in exposure:
+                 raise KeyError(
+                     "exposure_pattern did not contain the key `exposure` " +
+                     f"but {exposure.keys()}. Make sure the treatments and exposures match."
+                 )
+             exposure["exposure"] = treatment
+
+             exposure_dict = {"dummy_key": exposure}
+
+         for _, vals in exposure_dict.items():
+             if vals["end"] is None:
+                 pass
+             elif isinstance(vals["end"], float | int):
+                 pass
+             elif isinstance(vals["end"], pd.Timedelta):
+                 vals["end"] = vals["end"] / time_unit
+             else:
+                 raise NotImplementedError(
+                     f"exposure_data['end']={vals['end']} but must be None, float or pd.Timedelta."
+                 )
+
+             if vals["start"] is None:
+                 pass
+             elif isinstance(vals["start"], float | int):
+                 pass
+             elif isinstance(vals["start"], pd.Timedelta):
+                 vals["start"] = vals["start"] / time_unit
+             else:
+                 raise NotImplementedError(
+                     f"exposure_data['start']={vals['start']} but must be None, float or pd.Timedelta."
+                 )
+
+         x_in = design_exposure_scenario(
+             t_max=experiment_end_float, dt=dt_float,
+             exposures=exposure_dict,
+             exposure_dimension=exposure_dimension,
+         )
+
+         if dummy_dim:
+             x_in = x_in.isel({exposure_dimension: 0})
+             x_in["exposure"] = x_in["exposure"].drop_vars(exposure_dimension)
+
+         x_in = x_in.assign_coords({"id": [treatment_name]})
+         exposures.update({treatment_name: x_in})
+
+     experiment = xr.combine_by_coords(exposures.values())
+     # select by id so the order of the treatments remains consistent
+     experiment = experiment.sel(
+         id=list(exposures.keys()),
+         time=[float(t) for t in experiment.time if t <= experiment_end / time_unit]
+     )
+
+     return experiment
+
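A hedged usage sketch for draft_laboratory_experiment with scalar treatments (the guts_base.data.generator module path is inferred from this diff; draft_laboratory_experiment is not among the names re-exported in the second hunk):

    import pandas as pd
    from guts_base.data.generator import draft_laboratory_experiment

    experiment = draft_laboratory_experiment(
        treatments={"control": 0.0, "T1": 1.0, "T2": 10.0},
        experiment_end=pd.Timedelta(10, unit="days"),
        dt=pd.Timedelta(1, unit="days"),
    )
    # one constant-exposure timeseries per treatment, ids in the given order
    print(list(experiment.id.values))  # ['control', 'T1', 'T2']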