guts-base 2.0.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
+ from typing import List, Optional, Callable
+ from importlib import import_module
+ import warnings
+ import glob
+ import os
+
+ import click
+ import numpy as np
+ import pandas as pd
+
+ from expyDB.database_operations import create_database, experiment_to_db
+ from expyDB.intervention_model import to_expydb, Experiment, PandasConverter
+ from pymob.sim.config import dict_to_string
+ from guts_base.data.utils import datalad_locked_file_warning
+
+ def test_equality_of_exposure_patterns_in_treatment(df):
+     for _, group in df.groupby("treatment_id"):
+         exposures = group.pivot_table(
+             index=["time", "treatment_id"],
+             columns="replicate_id"
+         )
+
+         # compare every replicate column against the first replicate
+         equal_expo = exposures.values == exposures.values[:, [0]]
+         if not np.all(equal_expo):
+             raise RuntimeError(
+                 "Replicates in the same treatment ID have different exposure patterns."
+             )
+
+ def create_new_columns_and_test_integrity_of_replicates(
+     exposure, survival, n_reps, path
+ ):
+     assert np.all(exposure.columns == survival.columns)
+
+     columns_new, treatment_reps = identify_replicates(frame=exposure)
+
+     if not np.all(np.array(list(treatment_reps.values())) == n_reps):
+         warnings.warn(
+             f"The actual number of replicates per treatment differs from the "
+             f"number of replicates ({n_reps}) given in the Info sheet of file: "
+             f"{path}"
+         )
+
+     return columns_new
+
+ def identify_replicates(frame):
+     df = frame.drop(columns="time")
+
+     # Find identical columns and assign group labels
+     group_labels = {}
+     used_cols = set()
+     treatment_map = {}
+
+     for col in df.columns:
+         if col not in used_cols:
+             # compare the column to every column of the dataframe with
+             # df.apply(func, axis=0) and collect the matching column names
+             identical_cols = df.columns[df.apply(lambda x: x.equals(df[col]), axis=0)].tolist()
+             group_label = f"{len(group_labels) + 1}"
+             group_labels[group_label] = identical_cols
+             used_cols.update(identical_cols)
+
+             for icol in identical_cols:
+                 treatment_map[icol] = group_label
+
+     columns_new = [f"{treatment_map[col]}__{col}" for col in df.columns]
+     treatment_reps = {key: len(cols) for key, cols in group_labels.items()}
+
+     return columns_new, treatment_reps
+
+
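For intuition, a minimal sketch (not part of the package; column names and values are made up) of what identify_replicates returns for a small wide-format frame:

    import pandas as pd

    frame = pd.DataFrame({
        "time": [0, 24, 48],
        "A1": [1.0, 0.5, 0.25],  # identical to A2, so both become replicates of treatment "1"
        "A2": [1.0, 0.5, 0.25],
        "B1": [2.0, 1.0, 0.5],
    })
    columns_new, treatment_reps = identify_replicates(frame=frame)
    # columns_new    -> ["1__A1", "1__A2", "2__B1"]
    # treatment_reps -> {"1": 2, "2": 1}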
+ def read_timeseries_sheet(path, sheet, sep=None):
+     ts = pd.read_excel(path, sheet_name=sheet, index_col=0)  # type: ignore
+     multi_index = pd.MultiIndex.from_tuples(
+         [tuple(c.split(sep)) for c in ts.columns], names=["treatment_id", "timeseries_id"]
+     )
+     ts.columns = multi_index
+     return ts
+
+
+ class OpenGutsIO:
+     # TODO: Use preprocessing here and use map as a class attribute
+     def __init__(self, file):
+         self._file = file
+         # from_file sets the data attributes in place and returns None
+         self.from_file(file)
+
+     def _openguts_wide_to_long(self, frame, columns_new):
+         frame_wide = frame.copy()
+
+         frame_wide.columns = ["time"] + columns_new
+         frame_long = pd.melt(
+             frame=frame_wide,
+             id_vars=["time"],
+             value_vars=columns_new,
+             var_name="exposure_id"
+         )
+         # create the new index columns from the "treatment__replicate" column names
+         frame_long[["treatment_id", "replicate_id"]] = (
+             frame_long.exposure_id.str.split("__", n=1, expand=True)
+         )
+         frame_long = frame_long.drop(columns="exposure_id")
+
+         return frame_long
+
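A small standalone sketch of the reshaping this method performs, assuming openGUTS-style column names of the form treatment__replicate (values made up):

    import pandas as pd

    wide = pd.DataFrame({
        "time": [0, 24],
        "1__A1": [10, 9],
        "1__A2": [10, 10],
    })
    long = pd.melt(wide, id_vars=["time"], var_name="exposure_id")
    long[["treatment_id", "replicate_id"]] = (
        long["exposure_id"].str.split("__", n=1, expand=True)
    )
    # long now has one row per (time, treatment_id, replicate_id) with a "value" column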
+     def _merge_tables(self, tables: List):
+         data = tables[0].set_index(["time", "treatment_id", "replicate_id"])
+
+         # left-join the remaining tables on the shared index
+         for table in tables[1:]:
+             rdata = table.set_index(["time", "treatment_id", "replicate_id"])
+             data = pd.merge(
+                 left=data,
+                 right=rdata,
+                 how="left",
+                 left_index=True,
+                 right_index=True
+             )
+
+         return data
+
+     def _read_timeseries(self, path, sheets):
+         # design new columns based on the information about replicates and treatments
+         timeseries_long_list = []
+         time_units = {}
+         for iv in sheets:
+             timeseries_df = pd.read_excel(path, sheet_name=iv)
+
+             # the unit is parsed from the time column header, e.g. "time [d]" -> "d"
+             time_column = timeseries_df.columns[0]
+             time_unit = time_column.lower().replace("time", "").strip(" []")
+
+             # define replicates based on equality of columns
+             timeseries_columns = list(timeseries_df.columns[1:])
+             timeseries_long = self._openguts_wide_to_long(
+                 frame=timeseries_df, columns_new=timeseries_columns
+             )
+             intervention_long = timeseries_long.rename(columns={"value": iv})
+             timeseries_long_list.append(intervention_long)
+             time_units[iv] = time_unit
+
+         return self._merge_tables(timeseries_long_list).reset_index(), time_units
+
+     def _read_openguts(self, path, metadata_sheetname="meta"):
+         meta = pd.read_excel(path, sheet_name=metadata_sheetname, index_col=0).dropna(how="all")
+
+         interventions = meta.loc["experiment__interventions", "Value"]
+         if pd.isna(interventions):
+             raise ValueError("'experiment__interventions' must be defined in metadata")
+         intervention_sheets = [i.strip("[]' ") for i in interventions.split(",")]  # type: ignore
+
+         observations = meta.loc["experiment__observations", "Value"]
+         if pd.isna(observations):
+             raise ValueError("'experiment__observations' must be defined in metadata")
+         observation_sheets = [i.strip("[]' ") for i in observations.split(",")]  # type: ignore
+
+         # design new columns based on the information about replicates and treatments
+         interventions_long, interventions_time_units = self._read_timeseries(path, intervention_sheets)
+         observations_long, observations_time_units = self._read_timeseries(path, observation_sheets)
+         time_unit = {
+             "interventions": interventions_time_units,
+             "observations": observations_time_units
+         }
+
+         # TODO test if all exposures within a treatment (replicates) were nominally the same
+         # test_equality_of_exposure_patterns_in_treatment(df=exposures_long)
+
+         return interventions_long, observations_long, meta, time_unit
+
+     def from_file(self, file) -> None:
+         (
+             interventions_long,
+             observations_long,
+             meta,
+             time_unit
+         ) = self._read_openguts(path=file)
+
+         self.interventions = interventions_long
+         self.observations = observations_long
+         self.time_unit = time_unit
+         self.meta = meta
+
+     def to_file(self, file):
+         raise NotImplementedError(
+             "This method should implement writing an excel file that corresponds "
+             "to the original input file."
+         )
+
+     def to_experiment(self) -> Experiment:
+         return Experiment.from_dict(data=dict(
+             interventions=self.interventions,
+             observations=self.observations,
+             meta=self.meta,
+             time_units=self.time_unit
+         ))
+
+     def from_experiment(self, experiment: Experiment) -> None:
+         data = experiment.to_dict()
+         self.interventions = data["interventions"]
+         self.observations = data["observations"]
+         self.meta = data["meta"]
+         self.time_unit = data["time_units"]
+
+     def to_xarray(self):
+         return self.to_experiment().to_xarray()
+
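A minimal usage sketch of the round trip (the file path is hypothetical):

    openguts = OpenGutsIO("data/ringtest_A.xlsx")  # hypothetical openGUTS excel file
    experiment = openguts.to_experiment()          # expyDB Experiment model
    dataset = openguts.to_xarray()                 # xarray Dataset for modelling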
+
+ def import_data_to_database(path, database, preprocessing: Optional[Callable] = None, preprocessing_out: Optional[str] = None):
+     """Take a raw data file, preprocess it so that it contains all metadata
+     necessary for expyDB, create an Experiment model from it, and add that
+     model to the database.
+     """
+     # preprocess path
+     if preprocessing is not None:
+         if preprocessing_out is None:
+             # prepend 'processed_' to the name of the directory containing the file
+             parent = os.path.dirname(path)
+             directory = os.path.basename(parent)
+             new_path = path.replace(directory, f"processed_{directory}")
+         else:
+             filename = os.path.basename(path)
+             new_path = preprocessing_out.format(filename=filename)
+
+         os.makedirs(os.path.dirname(new_path), exist_ok=True)
+
+         processed_path = preprocessing(path, new_path)
+     else:
+         processed_path = path
+
+     # parse the excel file into interventions and observations in long form,
+     # a metadata Series, and the time units
+     openguts = OpenGutsIO(processed_path)
+
+     # from excel to an Experiment model instance
+     experiment = openguts.to_experiment()
+
+     # from the model to the database
+     if not os.access(database, os.W_OK):
+         warnings.warn(
+             f"Did not write to database. The file '{database}' is not writable."
+         )
+         return
+
+     experiment.to_database(database=database)
+
+     print("Import to database successful.")
+
+
+ def create_database_and_import_data_main(datasets_path, database_path, preprocessing=None, preprocessing_out=None):
+     print("\n")
+     print("Creating a database and importing data")
+     print("======================================")
+
+     # resolve the preprocessing function from a dotted import path
+     if preprocessing is not None:
+         module, func = preprocessing.rsplit(".", 1)
+         mod = import_module(module)
+         preprocessing_func = getattr(mod, func)
+     else:
+         preprocessing_func = None
+
+     # collect files: accept both individual file paths and directories of .xlsx files
+     paths = []
+     for p in datasets_path:
+         if os.path.isfile(p):
+             paths.append(p)
+         else:
+             paths.extend(glob.glob(os.path.join(p, "*.xlsx")))
+
+     create_database(database=database_path, force=True)
+     for p in paths:
+         print(f"\nPreprocessing and importing file: {p}")
+         import_data_to_database(
+             path=p, database=database_path,
+             preprocessing=preprocessing_func,
+             preprocessing_out=preprocessing_out
+         )
+
+
+ @click.command()
+ @click.option("--datasets_path", type=str, multiple=True, help="Path to a directory containing the excel files. Alternatively, pass the option multiple times with paths to individual files.")
+ @click.option("--database_path", type=str, help="The path to the database (should end with .db)")
+ @click.option("--preprocessing", type=str, help="Dotted import path of the function used to preprocess the data", default=None)
+ @click.option("--preprocessing-out", type=str, help="A pattern that uses {filename} as a placeholder, e.g. 'data/processed_data/{filename}'. If unset, prepends 'processed_' to the dirname.", default=None)
+ def create_database_and_import_data(datasets_path, database_path, preprocessing, preprocessing_out):
+     create_database_and_import_data_main(
+         datasets_path=datasets_path,
+         database_path=database_path,
+         preprocessing=preprocessing,
+         preprocessing_out=preprocessing_out
+     )
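A hedged usage sketch of this import pipeline, with made-up paths; the dotted preprocessing path is hypothetical and would point at a function such as the ringtest preprocessor defined in the next file:

    create_database_and_import_data_main(
        datasets_path=["data/ringtest/"],         # directory of .xlsx files, or individual files
        database_path="data/tox.db",
        preprocessing="guts_base.data.ringtest",  # hypothetical dotted import path
        preprocessing_out="data/processed/{filename}",
    )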
@@ -0,0 +1,55 @@
+ import os
+
+ import pandas as pd
+
+ from expyDB.intervention_model import (
+     Experiment, Treatment, Timeseries,
+     PandasConverter,
+ )
+
+ def read_timeseries_sheet(path, sheet, sep=None):
+     ts = pd.read_excel(path, sheet_name=sheet, index_col=0)  # type: ignore
+     multi_index = pd.MultiIndex.from_tuples(
+         [tuple(c.split(sep)) for c in ts.columns], names=["treatment_id", "timeseries_id"]
+     )
+     ts.columns = multi_index
+     return ts
+
+ def ringtest(path, new_path):
+     exposure = read_timeseries_sheet(path, sheet="Exposure", sep=" ")
+     exposure.index.name = "time"
+     survival = read_timeseries_sheet(path, sheet="Survival", sep=" ")
+     survival.index.name = "time"
+
+     # TODO: possibly using a normal index would also be acceptable
+     template = PandasConverter(Experiment())
+     # template.meta.index = template.meta_multiindex
+
+     # collect the timeseries data
+     data = {"exposure": exposure, "survival": survival}
+
+     # map entries: (source key or None, (table, field) target, transform)
+     meta_map = [
+         # new keys
+         (None, ("experiment", "name"), lambda x: "Ring test"),
+         (None, ("experiment", "interventions"), lambda x: ["exposure"]),
+         (None, ("experiment", "observations"), lambda x: ["survival"]),
+         (None, ("experiment", "public"), lambda x: True),
+
+         (None, ("treatment", "medium"), lambda x: "water"),
+
+         (None, ("observation", "unit"), lambda x: "-"),
+         (None, ("observation", "time_unit"), lambda x: "day"),
+
+         (None, ("intervention", "unit"), lambda x: "-"),
+         (None, ("intervention", "time_unit"), lambda x: "day"),
+     ]
+
+     template.map_to_meta(map=meta_map)
+     template.data = data
+     template.to_excel(new_path)
+
+     return new_path
+
@@ -0,0 +1,148 @@
+ import numpy as np
+ import xarray as xr
+ from scipy.stats import binom
+ from matplotlib import pyplot as plt
+ from pymob.utils.testing import assert_no_nans_in_dataset
+
+ def prepare_survival_data_for_conditional_binomial(observations: xr.Dataset) -> xr.Dataset:
+     """Convenience method for preparing survival data for a conditional
+     binomial model. It prepares an array of the same size as the survival
+     data, shifted by one time step, to determine the number of survivors at
+     the beginning of the next time step; this is needed for conditional
+     survival of repeated observations.
+
+     NaN values, which may occur in the observations but are not allowed in
+     the parameters of the distribution, are filled forward in time; any
+     remaining NaNs (which can only occur at the initial times) are filled
+     with the nominal number of organisms used.
+     """
+     survival = observations["survival"]
+     # fill nan values forward in time with the last observation
+     # until the next observation. Afterwards leading nans are replaced with
+     # the subject count (no lethality observed before the first observation)
+     nsurv = survival.ffill(dim="time").fillna(observations.subject_count)
+
+     # Test that the values filled in at the first time step are equal to
+     # the subject count, where a subject count is available.
+     np.testing.assert_array_equal(
+         nsurv.isel(time=0, id=~observations.subject_count.isnull()),
+         observations.subject_count.sel(id=~observations.subject_count.isnull())
+     )
+
+     assert_no_nans_in_dataset(nsurv.to_dataset())
+
+     # create a convenience coordinate "survivors_before_t", which gives the
+     # number of living organisms at the end of time interval t-1;
+     # this is used for calculating conditional survival
+     observations = observations.assign_coords({
+         "survivors_before_t": (("id", "time"), np.column_stack([
+             nsurv.isel(time=0).values,
+             nsurv.isel(time=slice(None, -1)).values
+         ]).astype(int))})
+
+     observations = observations.assign_coords({
+         "survivors_at_start": (("id", "time"), np.broadcast_to(
+             nsurv.isel(time=0).values.reshape(-1, 1),
+             shape=nsurv.shape
+         ).astype(int))})
+
+     return observations
+
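For intuition, a small numpy-only sketch (made-up numbers) of the one-step shift that survivors_before_t encodes:

    import numpy as np

    nsurv = np.array([[10, 9, 9, 7]])  # one id, four time steps, NaNs already filled
    before_t = np.column_stack([nsurv[:, 0], nsurv[:, :-1]])
    # before_t -> [[10, 10, 9, 9]]: survivors alive at the start of each interval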
+
+ def is_survival_only_nan_except_start(survival: xr.DataArray):
+     is_not_nan_at_start = survival.isel(time=0).notnull().all().values
+     is_nan_at_rest = survival.sel(time=survival.time[1:]).isnull().all().values
+     return bool(is_not_nan_at_start and is_nan_at_rest)
+
+
+ def survivors_at_start_of_interval(survival: xr.DataArray):
+     # create a convenience observation "survivors before t", which gives the
+     # number of living organisms at the end of time interval t-1;
+     # this is used for calculating conditional survival
+     return np.column_stack([
+         survival.isel(time=0).values,
+         survival.isel(time=slice(None, -1)).values
+     ]).astype(int)
+
+
+ def generate_survival_repeated_observations(
+     S,
+     N=10,
+     time=None,
+     reps=1,
+     incidence=True,
+     seed=1,
+     ax=None,
+     tol=None
+ ):
+     """Generate observations from a survival function S with N individuals.
+
+     The conditional survival probability is used: for each time interval,
+     deaths are drawn based on the probability of dying in that interval,
+     conditional on having survived until the beginning of that interval.
+
+     S_cond[i] = (S[i-1] - S[i]) / S[i-1], where i indexes the intervals in T
+
+     L[i] = Binom(p=S_cond[i], N=N_alive[i-1])
+
+     L[i] is the death incidence in interval i, i.e. the number of deceased
+     individuals in that interval.
+
+     The number of binomial trials also changes over time, with
+
+     N_alive[i] = N - sum(L[:i])
+
+     i.e. the number of alive individuals is reduced by the cumulative
+     number of deceased individuals.
+
+     Parameters
+     ----------
+     S : ArrayLike
+         values of the survival function; must be monotonically decreasing
+     N : int
+         the number of individuals in one experiment that is repeatedly observed
+     time : ArrayLike, optional
+         observation times; defaults to np.arange(len(S))
+     reps : int
+         the number of repetitions of the same experiment
+     incidence : bool
+         if True, returns the number of deaths in each interval; if False,
+         returns the cumulative number of deaths up to and including the
+         interval
+     seed : int
+         seed for the random number generator
+     ax : matplotlib Axes, optional
+         axes to plot into; if None, a new figure is created
+     tol : float, optional
+         if given, S is clipped to [tol, 1 - tol] to avoid degenerate
+         probabilities at the boundaries
+     """
+     rng = np.random.default_rng(seed)
+
+     if time is None:
+         time = np.arange(len(S))
+
+     T = len(time)
+
+     if tol is not None:
+         S = np.clip(S, tol, 1 - tol)
+
+     L = np.zeros(shape=(reps, T))
+     for i in range(T):
+         if i == 0:
+             S_0 = 1
+         else:
+             S_0 = S[i - 1]
+
+         # calculate the binomial response of the conditional survival,
+         # i.e. the probability to die within an interval conditional on
+         # having survived until the beginning of that interval
+         L[:, i] = binom(p=(S_0 - S[i]) / S_0, n=N - L.sum(axis=1).astype(int)).rvs(random_state=rng)
+
+     # plot the deterministic survival curve and the sampled observations
+     if ax is None:
+         fig, ax = plt.subplots(1, 1)
+     ax.plot(time, S * N, color="black")
+     ax.plot(time, N - L.cumsum(axis=1).T,
+             marker="o", color="tab:red", ls="", alpha=.75)
+     ax.set_xlabel("Time [h]")
+     ax.set_ylabel("Survival")
+     ax.set_ylim(N - N * 1.02, N * 1.02)
+
+     if incidence:
+         return L
+     else:
+         return L.cumsum(axis=1)
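A minimal usage sketch of the generator above (parameter values made up for illustration):

    import numpy as np

    t = np.linspace(0, 10, 21)
    S = np.exp(-0.2 * t)  # a simple, monotonically decreasing survival function
    deaths = generate_survival_repeated_observations(
        S, N=20, time=t, reps=3, incidence=True, seed=42
    )
    survivors = 20 - deaths.cumsum(axis=1)  # shape (reps, len(t))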