guts-base 2.0.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
+ from typing import List, Optional, Callable
+ from importlib import import_module
+ import warnings
+ import glob
+ import os
+
+ import click
+ import numpy as np
+ import pandas as pd
+
+ from expyDB.database_operations import create_database, experiment_to_db
+ from expyDB.intervention_model import to_expydb, Experiment, PandasConverter
+ from pymob.sim.config import dict_to_string
+ from guts_base.data.utils import datalad_locked_file_warning
+
+ def test_equality_of_exposure_patterns_in_treatment(df):
+     for _, group in df.groupby("treatment_id"):
+         exposures = group.pivot_table(
+             index=["time", "treatment_id"],
+             columns="replicate_id"
+         )
+
+         # compare every replicate column against the first replicate
+         equal_expo = exposures.values == exposures.values[:, [0]]
+         if not np.all(equal_expo):
+             raise RuntimeError(
+                 "Replicates in the same treatment ID have different exposure patterns."
+             )
+
+ def create_new_columns_and_test_integrity_of_replicates(
+     exposure, survival, n_reps, path
+ ):
+     assert np.all(exposure.columns == survival.columns)
+
+     columns_new, treatment_reps = identify_replicates(frame=exposure)
+
+     if not np.all(np.array(list(treatment_reps.values())) == n_reps):
+         warnings.warn(
+             f"The actual number of replicates per treatment differs from the "
+             f"number of replicates ({n_reps}) given in the Info sheet of file: "
+             f"{path}"
+         )
+
+     return columns_new
+
+ def identify_replicates(frame):
+     df = frame.drop(columns="time")
+
+     # Find identical columns and assign group labels
+     group_labels = {}
+     used_cols = set()
+     treatment_map = {}
+
+     for col in df.columns:
+         if col not in used_cols:
+             # compare the column to every column of the dataframe with
+             # df.apply(func, axis=0) and collect the matching column names
+             identical_cols = df.columns[df.apply(lambda x: x.equals(df[col]), axis=0)].tolist()
+             group_label = f"{len(group_labels) + 1}"
+             group_labels[group_label] = identical_cols
+             used_cols.update(identical_cols)
+
+             for icol in identical_cols:
+                 treatment_map[icol] = group_label
+
+     columns_new = [f"{treatment_map[col]}__{col}" for col in df.columns]
+     treatment_reps = {key: len(cols) for key, cols in group_labels.items()}
+
+     return columns_new, treatment_reps
+
+
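For intuition, a minimal sketch (not part of the package; column names and values are made up) of what identify_replicates returns for a small wide-format frame:

    import pandas as pd

    frame = pd.DataFrame({
        "time": [0, 24, 48],
        "A1": [1.0, 0.5, 0.25],  # identical to A2, so both become replicates of treatment "1"
        "A2": [1.0, 0.5, 0.25],
        "B1": [2.0, 1.0, 0.5],
    })
    columns_new, treatment_reps = identify_replicates(frame=frame)
    # columns_new    -> ["1__A1", "1__A2", "2__B1"]
    # treatment_reps -> {"1": 2, "2": 1}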
+ def read_timeseries_sheet(path, sheet, sep=None):
+     ts = pd.read_excel(path, sheet_name=sheet, index_col=0)  # type: ignore
+     multi_index = pd.MultiIndex.from_tuples(
+         [tuple(c.split(sep)) for c in ts.columns], names=["treatment_id", "timeseries_id"]
+     )
+     ts.columns = multi_index
+     return ts
+
+
+ class OpenGutsIO:
+     # TODO: Use preprocessing here and use map as a class attribute
+     def __init__(self, file):
+         self._file = file
+         # from_file sets the data attributes in place and returns None
+         self.from_file(file)
+
+     def _openguts_wide_to_long(self, frame, columns_new):
+         frame_wide = frame.copy()
+
+         frame_wide.columns = ["time"] + columns_new
+         frame_long = pd.melt(
+             frame=frame_wide,
+             id_vars=["time"],
+             value_vars=columns_new,
+             var_name="exposure_id"
+         )
+         # create the new index columns from the "treatment__replicate" column names
+         frame_long[["treatment_id", "replicate_id"]] = (
+             frame_long.exposure_id.str.split("__", n=1, expand=True)
+         )
+         frame_long = frame_long.drop(columns="exposure_id")
+
+         return frame_long
+
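A small standalone sketch of the reshaping this method performs, assuming openGUTS-style column names of the form treatment__replicate (values made up):

    import pandas as pd

    wide = pd.DataFrame({
        "time": [0, 24],
        "1__A1": [10, 9],
        "1__A2": [10, 10],
    })
    long = pd.melt(wide, id_vars=["time"], var_name="exposure_id")
    long[["treatment_id", "replicate_id"]] = (
        long["exposure_id"].str.split("__", n=1, expand=True)
    )
    # long now has one row per (time, treatment_id, replicate_id) with a "value" column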
+     def _merge_tables(self, tables: List):
+         data = tables[0].set_index(["time", "treatment_id", "replicate_id"])
+
+         # left-join the remaining tables on the shared index
+         for table in tables[1:]:
+             rdata = table.set_index(["time", "treatment_id", "replicate_id"])
+             data = pd.merge(
+                 left=data,
+                 right=rdata,
+                 how="left",
+                 left_index=True,
+                 right_index=True
+             )
+
+         return data
+
+     def _read_timeseries(self, path, sheets):
+         # design new columns based on the information about replicates and treatments
+         timeseries_long_list = []
+         time_units = {}
+         for iv in sheets:
+             timeseries_df = pd.read_excel(path, sheet_name=iv)
+
+             # the unit is parsed from the time column header, e.g. "time [d]" -> "d"
+             time_column = timeseries_df.columns[0]
+             time_unit = time_column.lower().replace("time", "").strip(" []")
+
+             # define replicates based on equality of columns
+             timeseries_columns = list(timeseries_df.columns[1:])
+             timeseries_long = self._openguts_wide_to_long(
+                 frame=timeseries_df, columns_new=timeseries_columns
+             )
+             intervention_long = timeseries_long.rename(columns={"value": iv})
+             timeseries_long_list.append(intervention_long)
+             time_units[iv] = time_unit
+
+         return self._merge_tables(timeseries_long_list).reset_index(), time_units
+
+     def _read_openguts(self, path, metadata_sheetname="meta"):
+         meta = pd.read_excel(path, sheet_name=metadata_sheetname, index_col=0).dropna(how="all")
+
+         interventions = meta.loc["experiment__interventions", "Value"]
+         if pd.isna(interventions):
+             raise ValueError("'experiment__interventions' must be defined in metadata")
+         intervention_sheets = [i.strip("[]' ") for i in interventions.split(",")]  # type: ignore
+
+         observations = meta.loc["experiment__observations", "Value"]
+         if pd.isna(observations):
+             raise ValueError("'experiment__observations' must be defined in metadata")
+         observation_sheets = [i.strip("[]' ") for i in observations.split(",")]  # type: ignore
+
+         # design new columns based on the information about replicates and treatments
+         interventions_long, interventions_time_units = self._read_timeseries(path, intervention_sheets)
+         observations_long, observations_time_units = self._read_timeseries(path, observation_sheets)
+         time_unit = {
+             "interventions": interventions_time_units,
+             "observations": observations_time_units
+         }
+
+         # TODO test if all exposures within a treatment (replicates) were nominally the same
+         # test_equality_of_exposure_patterns_in_treatment(df=exposures_long)
+
+         return interventions_long, observations_long, meta, time_unit
+
+     def from_file(self, file) -> None:
+         (
+             interventions_long,
+             observations_long,
+             meta,
+             time_unit
+         ) = self._read_openguts(path=file)
+
+         self.interventions = interventions_long
+         self.observations = observations_long
+         self.time_unit = time_unit
+         self.meta = meta
+
+     def to_file(self, file):
+         raise NotImplementedError(
+             "This method should implement writing an excel file that corresponds "
+             "to the original input file."
+         )
+
+     def to_experiment(self) -> Experiment:
+         return Experiment.from_dict(data=dict(
+             interventions=self.interventions,
+             observations=self.observations,
+             meta=self.meta,
+             time_units=self.time_unit
+         ))
+
+     def from_experiment(self, experiment: Experiment) -> None:
+         data = experiment.to_dict()
+         self.interventions = data["interventions"]
+         self.observations = data["observations"]
+         self.meta = data["meta"]
+         self.time_unit = data["time_units"]
+
+     def to_xarray(self):
+         return self.to_experiment().to_xarray()
+
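A minimal usage sketch of the round trip (the file path is hypothetical):

    openguts = OpenGutsIO("data/ringtest_A.xlsx")  # hypothetical openGUTS excel file
    experiment = openguts.to_experiment()          # expyDB Experiment model
    dataset = openguts.to_xarray()                 # xarray Dataset for modelling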
+
+ def import_data_to_database(path, database, preprocessing: Optional[Callable] = None, preprocessing_out: Optional[str] = None):
+     """Take a raw data file, preprocess it so that it contains all metadata
+     necessary for expyDB, create an Experiment model from it, and add that
+     model to the database.
+     """
+     # preprocess path
+     if preprocessing is not None:
+         if preprocessing_out is None:
+             # prepend 'processed_' to the name of the directory containing the file
+             parent = os.path.dirname(path)
+             directory = os.path.basename(parent)
+             new_path = path.replace(directory, f"processed_{directory}")
+         else:
+             filename = os.path.basename(path)
+             new_path = preprocessing_out.format(filename=filename)
+
+         os.makedirs(os.path.dirname(new_path), exist_ok=True)
+
+         processed_path = preprocessing(path, new_path)
+     else:
+         processed_path = path
+
+     # parse the excel file into interventions and observations in long form,
+     # a metadata Series, and the time units
+     openguts = OpenGutsIO(processed_path)
+
+     # from excel to an Experiment model instance
+     experiment = openguts.to_experiment()
+
+     # from the model to the database
+     if not os.access(database, os.W_OK):
+         warnings.warn(
+             f"Did not write to database. The file '{database}' is not writable."
+         )
+         return
+
+     experiment.to_database(database=database)
+
+     print("Import to database successful.")
+
+
+ def create_database_and_import_data_main(datasets_path, database_path, preprocessing=None, preprocessing_out=None):
+     print("\n")
+     print("Creating a database and importing data")
+     print("======================================")
+
+     # resolve the preprocessing function from a dotted import path
+     if preprocessing is not None:
+         module, func = preprocessing.rsplit(".", 1)
+         mod = import_module(module)
+         preprocessing_func = getattr(mod, func)
+     else:
+         preprocessing_func = None
+
+     # collect files: accept both individual file paths and directories of .xlsx files
+     paths = []
+     for p in datasets_path:
+         if os.path.isfile(p):
+             paths.append(p)
+         else:
+             paths.extend(glob.glob(os.path.join(p, "*.xlsx")))
+
+     create_database(database=database_path, force=True)
+     for p in paths:
+         print(f"\nPreprocessing and importing file: {p}")
+         import_data_to_database(
+             path=p, database=database_path,
+             preprocessing=preprocessing_func,
+             preprocessing_out=preprocessing_out
+         )
+
+
+ @click.command()
+ @click.option("--datasets_path", type=str, multiple=True, help="Path to a directory containing the excel files. Alternatively, pass the option multiple times with paths to individual files.")
+ @click.option("--database_path", type=str, help="The path to the database (should end with .db)")
+ @click.option("--preprocessing", type=str, help="Dotted import path of the function used to preprocess the data", default=None)
+ @click.option("--preprocessing-out", type=str, help="A pattern that uses {filename} as a placeholder, e.g. 'data/processed_data/{filename}'. If unset, prepends 'processed_' to the dirname.", default=None)
+ def create_database_and_import_data(datasets_path, database_path, preprocessing, preprocessing_out):
+     create_database_and_import_data_main(
+         datasets_path=datasets_path,
+         database_path=database_path,
+         preprocessing=preprocessing,
+         preprocessing_out=preprocessing_out
+     )
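A hedged usage sketch of this import pipeline, with made-up paths; the dotted preprocessing path is hypothetical and would point at a function such as the ringtest preprocessor defined in the next file:

    create_database_and_import_data_main(
        datasets_path=["data/ringtest/"],         # directory of .xlsx files, or individual files
        database_path="data/tox.db",
        preprocessing="guts_base.data.ringtest",  # hypothetical dotted import path
        preprocessing_out="data/processed/{filename}",
    )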
@@ -0,0 +1,55 @@
+ import os
+
+ import pandas as pd
+
+ from expyDB.intervention_model import (
+     Experiment, Treatment, Timeseries,
+     PandasConverter,
+ )
+
+ def read_timeseries_sheet(path, sheet, sep=None):
+     ts = pd.read_excel(path, sheet_name=sheet, index_col=0)  # type: ignore
+     multi_index = pd.MultiIndex.from_tuples(
+         [tuple(c.split(sep)) for c in ts.columns], names=["treatment_id", "timeseries_id"]
+     )
+     ts.columns = multi_index
+     return ts
+
+ def ringtest(path, new_path):
+     exposure = read_timeseries_sheet(path, sheet="Exposure", sep=" ")
+     exposure.index.name = "time"
+     survival = read_timeseries_sheet(path, sheet="Survival", sep=" ")
+     survival.index.name = "time"
+
+     # TODO: possibly using a normal index would also be acceptable
+     template = PandasConverter(Experiment())
+     # template.meta.index = template.meta_multiindex
+
+     # collect the timeseries data
+     data = {"exposure": exposure, "survival": survival}
+
+     # map entries: (source key or None, (table, field) target, transform)
+     meta_map = [
+         # new keys
+         (None, ("experiment", "name"), lambda x: "Ring test"),
+         (None, ("experiment", "interventions"), lambda x: ["exposure"]),
+         (None, ("experiment", "observations"), lambda x: ["survival"]),
+         (None, ("experiment", "public"), lambda x: True),
+
+         (None, ("treatment", "medium"), lambda x: "water"),
+
+         (None, ("observation", "unit"), lambda x: "-"),
+         (None, ("observation", "time_unit"), lambda x: "day"),
+
+         (None, ("intervention", "unit"), lambda x: "-"),
+         (None, ("intervention", "time_unit"), lambda x: "day"),
+     ]
+
+     template.map_to_meta(map=meta_map)
+     template.data = data
+     template.to_excel(new_path)
+
+     return new_path
+
@@ -0,0 +1,148 @@
+ import numpy as np
+ import xarray as xr
+ from scipy.stats import binom
+ from matplotlib import pyplot as plt
+ from pymob.utils.testing import assert_no_nans_in_dataset
+
+ def prepare_survival_data_for_conditional_binomial(observations: xr.Dataset) -> xr.Dataset:
+     """Convenience method for preparing survival data for a conditional
+     binomial model. It prepares an array of the same size as the survival
+     data, shifted by one time step, to determine the number of survivors at
+     the beginning of the next time step; this is needed for conditional
+     survival of repeated observations.
+
+     NaN values, which may occur in the observations but are not allowed in
+     the parameters of the distribution, are filled forward in time; any
+     remaining NaNs (which can only occur at the initial times) are filled
+     with the nominal number of organisms used.
+     """
+     survival = observations["survival"]
+     # fill nan values forward in time with the last observation
+     # until the next observation. Afterwards leading nans are replaced with
+     # the subject count (no lethality observed before the first observation)
+     nsurv = survival.ffill(dim="time").fillna(observations.subject_count)
+
+     # Test that the values filled in at the first time step are equal to
+     # the subject count, where a subject count is available.
+     np.testing.assert_array_equal(
+         nsurv.isel(time=0, id=~observations.subject_count.isnull()),
+         observations.subject_count.sel(id=~observations.subject_count.isnull())
+     )
+
+     assert_no_nans_in_dataset(nsurv.to_dataset())
+
+     # create a convenience coordinate "survivors_before_t", which gives the
+     # number of living organisms at the end of time interval t-1;
+     # this is used for calculating conditional survival
+     observations = observations.assign_coords({
+         "survivors_before_t": (("id", "time"), np.column_stack([
+             nsurv.isel(time=0).values,
+             nsurv.isel(time=slice(None, -1)).values
+         ]).astype(int))})
+
+     observations = observations.assign_coords({
+         "survivors_at_start": (("id", "time"), np.broadcast_to(
+             nsurv.isel(time=0).values.reshape(-1, 1),
+             shape=nsurv.shape
+         ).astype(int))})
+
+     return observations
+
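For intuition, a small numpy-only sketch (made-up numbers) of the one-step shift that survivors_before_t encodes:

    import numpy as np

    nsurv = np.array([[10, 9, 9, 7]])  # one id, four time steps, NaNs already filled
    before_t = np.column_stack([nsurv[:, 0], nsurv[:, :-1]])
    # before_t -> [[10, 10, 9, 9]]: survivors alive at the start of each interval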
+
+ def is_survival_only_nan_except_start(survival: xr.DataArray):
+     is_not_nan_at_start = survival.isel(time=0).notnull().all().values
+     is_nan_at_rest = survival.sel(time=survival.time[1:]).isnull().all().values
+     return bool(is_not_nan_at_start and is_nan_at_rest)
+
+
+ def survivors_at_start_of_interval(survival: xr.DataArray):
+     # create a convenience observation "survivors before t", which gives the
+     # number of living organisms at the end of time interval t-1;
+     # this is used for calculating conditional survival
+     return np.column_stack([
+         survival.isel(time=0).values,
+         survival.isel(time=slice(None, -1)).values
+     ]).astype(int)
+
+
+ def generate_survival_repeated_observations(
+     S,
+     N=10,
+     time=None,
+     reps=1,
+     incidence=True,
+     seed=1,
+     ax=None,
+     tol=None
+ ):
+     """Generate observations from a survival function S with N individuals.
+
+     The conditional survival probability is used: for each time interval,
+     deaths are drawn based on the probability of dying in that interval,
+     conditional on having survived until the beginning of that interval.
+
+     S_cond[i] = (S[i-1] - S[i]) / S[i-1], where i indexes the intervals in T
+
+     L[i] = Binom(p=S_cond[i], N=N_alive[i-1])
+
+     L[i] is the death incidence in interval i, i.e. the number of deceased
+     individuals in that interval.
+
+     The number of binomial trials also changes over time, with
+
+     N_alive[i] = N - sum(L[:i])
+
+     i.e. the number of alive individuals is reduced by the cumulative
+     number of deceased individuals.
+
+     Parameters
+     ----------
+     S : ArrayLike
+         values of the survival function; must be monotonically decreasing
+     N : int
+         the number of individuals in one experiment that is repeatedly observed
+     time : ArrayLike, optional
+         observation times; defaults to np.arange(len(S))
+     reps : int
+         the number of repetitions of the same experiment
+     incidence : bool
+         if True, returns the number of deaths in each interval; if False,
+         returns the cumulative number of deaths up to and including the
+         interval
+     seed : int
+         seed for the random number generator
+     ax : matplotlib Axes, optional
+         axes to plot into; if None, a new figure is created
+     tol : float, optional
+         if given, S is clipped to [tol, 1 - tol] to avoid degenerate
+         probabilities at the boundaries
+     """
+     rng = np.random.default_rng(seed)
+
+     if time is None:
+         time = np.arange(len(S))
+
+     T = len(time)
+
+     if tol is not None:
+         S = np.clip(S, tol, 1 - tol)
+
+     L = np.zeros(shape=(reps, T))
+     for i in range(T):
+         if i == 0:
+             S_0 = 1
+         else:
+             S_0 = S[i - 1]
+
+         # calculate the binomial response of the conditional survival,
+         # i.e. the probability to die within an interval conditional on
+         # having survived until the beginning of that interval
+         L[:, i] = binom(p=(S_0 - S[i]) / S_0, n=N - L.sum(axis=1).astype(int)).rvs(random_state=rng)
+
+     # plot the deterministic survival curve and the sampled observations
+     if ax is None:
+         fig, ax = plt.subplots(1, 1)
+     ax.plot(time, S * N, color="black")
+     ax.plot(time, N - L.cumsum(axis=1).T,
+             marker="o", color="tab:red", ls="", alpha=.75)
+     ax.set_xlabel("Time [h]")
+     ax.set_ylabel("Survival")
+     ax.set_ylim(N - N * 1.02, N * 1.02)
+
+     if incidence:
+         return L
+     else:
+         return L.cumsum(axis=1)
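A minimal usage sketch of the generator above (parameter values made up for illustration):

    import numpy as np

    t = np.linspace(0, 10, 21)
    S = np.exp(-0.2 * t)  # a simple, monotonically decreasing survival function
    deaths = generate_survival_repeated_observations(
        S, N=20, time=t, reps=3, incidence=True, seed=42
    )
    survivors = 20 - deaths.cumsum(axis=1)  # shape (reps, len(t))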