guts_base-0.8.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of guts-base might be problematic.
- guts_base/__init__.py +14 -0
- guts_base/data/__init__.py +34 -0
- guts_base/data/expydb.py +247 -0
- guts_base/data/generator.py +96 -0
- guts_base/data/openguts.py +294 -0
- guts_base/data/preprocessing.py +55 -0
- guts_base/data/survival.py +137 -0
- guts_base/data/time_of_death.py +571 -0
- guts_base/data/utils.py +8 -0
- guts_base/mod.py +251 -0
- guts_base/plot.py +162 -0
- guts_base/prob.py +412 -0
- guts_base/sim/__init__.py +14 -0
- guts_base/sim/base.py +464 -0
- guts_base/sim/ecx.py +357 -0
- guts_base/sim/mempy.py +252 -0
- guts_base/sim/report.py +72 -0
- guts_base/sim.py +0 -0
- guts_base-0.8.2.dist-info/METADATA +836 -0
- guts_base-0.8.2.dist-info/RECORD +24 -0
- guts_base-0.8.2.dist-info/WHEEL +5 -0
- guts_base-0.8.2.dist-info/entry_points.txt +3 -0
- guts_base-0.8.2.dist-info/licenses/LICENSE +674 -0
- guts_base-0.8.2.dist-info/top_level.txt +1 -0
guts_base/data/__init__.py
ADDED
@@ -0,0 +1,34 @@
+from typing import Callable
+
+from . import utils
+from . import openguts
+from . import expydb
+from . import survival
+from . import generator
+from . import time_of_death
+from . import preprocessing
+
+from .openguts import (
+    OpenGutsIO,
+    create_new_columns_and_test_integrity_of_replicates,
+    create_database_and_import_data,
+    create_database_and_import_data_main,
+    import_data_to_database,
+)
+from .survival import (
+    prepare_survival_data_for_conditional_binomial,
+    survivors_at_start_of_interval,
+    generate_survival_repeated_observations
+)
+
+from .generator import create_artificial_data, design_exposure_scenario
+
+from .expydb import (
+    to_dataset,
+    combine_coords_to_multiindex,
+    reduce_multiindex_to_flat_index
+)
+
+from .time_of_death import (
+    time_of_death_to_openguts
+)
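The re-exports above make the main entry points importable directly from guts_base.data. A minimal usage sketch (not part of the diff, assuming guts_base and its dependencies are installed):

    from guts_base.data import (
        OpenGutsIO,                # Excel reader defined in openguts.py
        to_dataset,                # dataset assembly helper from expydb.py
        design_exposure_scenario,  # synthetic exposure design from generator.py
    )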
guts_base/data/expydb.py
ADDED
@@ -0,0 +1,247 @@
+from typing import List, Optional, Literal
+import numpy as np
+import xarray as xr
+import pandas as pd
+import arviz as az
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from expyDB.intervention_model import (
+    Experiment,
+    Treatment,
+    Timeseries,
+    TsData,
+    from_expydb
+)
+
+
+def prepare_dataset(
+    idata,
+    variable="survival",
+    unit_time: Literal["day", "hour", "minute", "second"] = "day"
+):
+    """Get interventions from idata storage with respect ot the treatment
+    ids of the observations and move non indexing-related metadata (unique metadata)
+    to the attrs container.
+    """
+    # this test is guaranteed when prepare dataset is used together with from_expydb
+    # because from_expydb organizes the data into datasets with 1 variable,
+    # which is the timeseries variable with the coordinates timeseries_id and time
+    # for treatments and replicates. Other variables receive their 'own' dataset
+    assert len(idata[variable].data_vars) == 1
+    array: xr.DataArray = idata[variable][variable]
+    array = array.swap_dims(timeseries_id="treatment_id")
+    array = array.drop_vars("id")
+    # assuming that each timeseries of one variable in each treatment has
+    # a unique name the resulting index should be unique
+    array = array.set_index(id=("treatment_id", "timeseries_name"))
+    array = array.drop_vars("timeseries_id")
+    assert array.indexes["id"].is_unique
+
+    # format time to h and set as float
+    time_h = array.time.values / pd.Timedelta(1, unit_time)
+    array = array.assign_coords(time=time_h)
+
+    array = move_unique_coordinates_to_attrs(array)
+
+    array.attrs["unit_time"] = unit_time
+
+    # add a unique id for the selected dataset which is only relevant for
+    # the scope of modelling
+    return array
+
+def move_unique_coordinates_to_attrs(array:xr.DataArray) -> xr.DataArray:
+    key: str
+    for key, coord in array.coords.items(): # type:ignore
+        if key in ["id", "treatment_id", "timeseries_id", "experiment_id", "subject_count", "timeseries_name"]:
+            continue
+
+        if coord.isnull().all():
+            unique_values = [None]
+        else:
+            unique_values = np.unique(coord.data)
+
+        if len(unique_values) == 1:
+            array.attrs.update({key: unique_values[0]})
+            array = array.drop_vars(key)
+    return array
+
+# def prepare_interventions_dataset(interventions_idata, observations, ivs:Optional[List[str]]=None):
+#     """Get interventions from idata storage with respect ot the treatment
+#     ids of the observations"""
+#     if ivs is None:
+#         ivs = list(interventions_idata.keys())
+#     ds_ivs = get_interventions(
+#         interventions_idata,
+#         observations=observations,
+#         ivs=ivs
+#     )
+
+#     time_h = ds_ivs.time.values / np.timedelta64(1, "h")
+#     ds_ivs = ds_ivs.assign_coords(time=time_h)
+#     ds_ivs.attrs["unit_time"] = "hours (h)"
+
+#     return ds_ivs
+
+def to_dataset(
+    observations_idata,
+    interventions_idata,
+    unit_time: Literal["day", "hour", "minute", "second"] = "hour"
+) -> xr.Dataset:
+    """Combines intervention and observation datasets, assuming that there is
+    a unique multiindex that can be constructed from
+    - treatment_id
+    - timeseries_name
+
+    This way interventions and observations can be combined into a single dataset,
+    """
+    data_arrays = {}
+    for variable in observations_idata.groups():
+        # prepare observations
+        da = prepare_dataset(
+            idata=observations_idata,
+            variable=variable,
+            unit_time=unit_time,
+        )
+        data_arrays.update({variable: da})
+
+    # prepare interventions
+    for variable in interventions_idata.groups():
+        da = prepare_dataset(
+            idata=interventions_idata,
+            variable=variable,
+            unit_time=unit_time,
+        )
+        data_arrays.update({variable: da})
+
+    return xr.combine_by_coords(data_arrays.values()) # type: ignore
+
+
+def reduce_multiindex_to_flat_index(dataset):
+    multi_index = dataset.id.indexes["id"]
+
+    # create a flat index from the multi index
+    flat_index = multi_index.map(lambda x: "__".join([str(x_) for x_ in x]))
+
+    # remove multi index from dimension 'id'
+    dataset = dataset.reset_index("id")
+
+    # assign flat index to dimension 'id'
+    dataset = dataset.assign_coords(id=flat_index)
+
+    return dataset
+
+def combine_coords_to_multiindex(
+    dataset: xr.Dataset,
+    coordinates: List[str],
+    index_name: str,
+    sep: str = "__"
+) -> xr.Dataset:
+    """Simply combines a list of coordinates into a joint string
+
+    Parameters
+    ----------
+    dataset : xr.Dataset
+        The observations dataset
+    coordinates : List[str]
+        The coordinates that should be joined
+    index_name : str
+        The name of the new, joined, coordinate
+    sep : str, optional
+        The string to separate the coordinate components, by default "__"
+
+    Returns
+    -------
+    xr.Dataset
+        Dataset with a new coordinate composed of the listed coordinates
+    """
+    try:
+        multi_index = pd.MultiIndex.from_arrays([dataset[c].values for c in coordinates])
+    except KeyError as err:
+        raise KeyError(
+            f"Did not find key {err} in the dataset. "
+            f"This is probably because the key {err} is equal for all treatments."
+        )
+    multi_index = multi_index.map(lambda x: "__".join([str(x_) for x_ in x]))
+    return dataset.assign_coords({index_name: ("id", multi_index)})
+
+# def get_interventions(interventions_idata, observations, ivs: List[str]) -> xr.Dataset:
+#     """Get the interventions according to the treatment ids of the observation
+#     dataset.
+
+#     Works only for single interventions
+#     """
+#     X_in = {}
+#     for data_var in ivs:
+#         x_in = interventions_idata[data_var]\
+#             .swap_dims(timeseries_id="treatment_id")\
+#             .sel(treatment_id=observations.treatment_id.values)
+
+#         x_in = x_in.assign_coords(
+#             _id=("treatment_id", range(x_in.sizes["treatment_id"]))
+#         )
+#         x_in = x_in.swap_dims(treatment_id="_id")
+#         X_in.update({data_var: x_in[data_var]})
+
+
+#     X_in_dataset = xr.concat(X_in.values(), dim="variable")\
+#         .assign_coords(variable=ivs)\
+#         .to_dataset(dim="variable")
+
+#     if "variable" in X_in_dataset.dims:
+#         X_in_dataset = X_in_dataset.drop_dims("variable")
+
+#     return X_in_dataset
+
+
+
+
+# def combine_interventions(
+#     interventions: az.InferenceData,
+#     force: bool=False
+# ) -> xr.DataArray:
+#     """Combining interventions into a single dataset is only possible,
+#     if there is only a single timeseries for each intervention.
+
+#     Parameters
+#     ----------
+#     interventions : az.InferenceData
+#         Interventions InferenceData. Contains multiple datasets with at
+#         least one timeseries
+#     force : bool, optional
+#         Override restrictions to combine interventions only when the number
+#         of timeseries is 1, by default False
+
+#     Returns
+#     -------
+#     xr.DataArray
+#         Interventions, combined into a single dataset
+
+#     Raises
+#     ------
+#     ValueError
+#         If the number of timeseries is larger than 1 and force is not True
+#     """
+#     assert isinstance(interventions, az.InferenceData)
+#     arrays = []
+#     for variable, dataset in interventions.items():
+#         if dataset.sizes["timeseries_id"] > 1:
+#             if force:
+#                 arr = dataset.to_array()
+#             else:
+#                 raise ValueError(
+#                     "Combining interventions is only allowed when the number of "
+#                     "Timeseries for each variable is 1. This is to avoid blowing "
+#                     "Up the size of the dataset with nans, because timeseries ids "
+#                     "are different for each variable. You can override this error "
+#                     "By using `force=True`"
+#                 )
+#         else:
+#             arr = dataset.squeeze("timeseries_id").to_array()
+
+#         arrays.append(arr)
+
+#     return xr.concat(arrays, dim="variable")
+
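The two index helpers above are generic xarray utilities. A minimal sketch on a toy dataset (not part of the package; coordinate names follow the docstrings in this file) of how they could be used together:

    import numpy as np
    import xarray as xr

    from guts_base.data import combine_coords_to_multiindex, reduce_multiindex_to_flat_index

    # toy observations with an 'id' dimension and per-id metadata coordinates
    ds = xr.Dataset(
        {"survival": (("id", "time"), np.ones((4, 3)))},
        coords={
            "id": np.arange(4),
            "time": [0.0, 1.0, 2.0],
            "treatment_id": ("id", [1, 1, 2, 2]),
            "timeseries_name": ("id", ["a", "b", "a", "b"]),
        },
    )

    # join two coordinates into a single string label along 'id'
    ds = combine_coords_to_multiindex(
        ds, coordinates=["treatment_id", "timeseries_name"], index_name="label"
    )
    print(ds.label.values)  # ['1__a', '1__b', '2__a', '2__b']

    # a true MultiIndex on 'id' can be flattened back to a string index
    ds_multi = ds.set_index(id=["treatment_id", "timeseries_name"])
    ds_flat = reduce_multiindex_to_flat_index(ds_multi)
    print(ds_flat.id.values)  # ['1__a', '1__b', '2__a', '2__b']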
guts_base/data/generator.py
ADDED
@@ -0,0 +1,96 @@
+import numpy as np
+import xarray as xr
+from typing import TypedDict, Dict, Optional, Sequence
+
+class ExposureDataDict(TypedDict):
+    start: int
+    end: int
+    exposure: float|Sequence[float]
+
+def create_artificial_data(
+    t_max,
+    dt,
+    exposure_paths=["oral", "topical", "contact"],
+    intensity=[0.1, 0.5, 0.05]
+):
+    rng = np.random.default_rng(1)
+    time = np.arange(0, t_max, step=dt) # daily time resolution
+
+    # calculate potential exposure based on a lognormal distribution
+    oral = rng.lognormal(mean=np.log(intensity[0]), sigma=0.5, size=len(time))
+    # and include a random exposure days
+    oral *= rng.binomial(n=1, p=1, size=len(time))
+
+
+    # calculate potential exposure based on a lognormal distribution
+    topical = rng.lognormal(mean=np.log(intensity[1]), sigma=1, size=len(time))
+    # and include a random exposure days
+    topical *= rng.binomial(n=1, p=0.25, size=len(time))
+
+
+    # calculate potential exposure based on a lognormal distribution
+    contact = rng.lognormal(mean=np.log(intensity[2]), sigma=0.1, size=len(time))
+    # and include a random exposure days
+    contact *= rng.binomial(n=1, p=0.8, size=len(time))
+
+
+
+    exposures = xr.Dataset(
+        data_vars={
+            "exposure": (("time", "exposure_path"), np.column_stack([oral, topical, contact])),
+        },
+        coords={"time": time, "exposure_path": ["oral", "topical", "contact"]}
+    )
+
+    return exposures.sel(exposure_path=exposure_paths)
+
+
+def design_exposure_timeseries(time: Sequence[float], exposure: ExposureDataDict, eps: float):
+    if exposure is None:
+        return
+
+    exposure["end"] = time[-1] if exposure["end"] is None else exposure["end"]
+
+    return np.where(
+        np.logical_and(time >= exposure["start"], time < exposure["end"]),
+        exposure["concentration"],
+        0
+    )
+
+def design_exposure_scenario(
+    t_max: float,
+    dt: float,
+    exposures: Dict[str,ExposureDataDict],
+    eps: float = 1e-8,
+    exposure_dimension: str = "exposure_type",
+):
+    """
+    TODO: tmax, dt and eps are probably not necessary
+    """
+    time = np.arange(0, t_max, step=dt) # daily time resolution
+    time = np.unique(np.concatenate([time] + [
+        np.array([time[-1] if vals["end"] is None else vals["end"]])
+        for key, vals in exposures.items()
+
+    ]))
+
+    treatments = {}
+    for key, expo in exposures.items():
+        treat = design_exposure_timeseries(time, expo, eps)
+        treatments.update({key: treat})
+
+    data = np.column_stack(list(treatments.values())).squeeze()
+    data = np.expand_dims(data, axis=0)
+
+    coords = {"id": [0], "time": time}
+
+    if len(exposures) > 1:
+        coords.update({exposure_dimension: list(treatments.keys())})
+
+    exposures_dataset = xr.Dataset(
+        data_vars={"exposure": (tuple(coords.keys()), data)},
+        coords=coords
+    )
+
+    return exposures_dataset
+
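A minimal calling sketch for design_exposure_scenario (hypothetical values, not part of the diff). Note that design_exposure_timeseries reads exposure["concentration"] even though ExposureDataDict declares an 'exposure' key, so the dictionaries below use the key the code actually accesses:

    from guts_base.data import design_exposure_scenario

    scenario = design_exposure_scenario(
        t_max=10.0,
        dt=1.0,
        exposures={
            "pulse_a": {"start": 0.0, "end": 2.0, "concentration": 5.0},
            "pulse_b": {"start": 4.0, "end": None, "concentration": 1.0},
        },
    )
    print(scenario.exposure.dims)  # ('id', 'time', 'exposure_type') when more than one exposure is given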
guts_base/data/openguts.py
ADDED
@@ -0,0 +1,294 @@
+from typing import List, Optional, Callable, Dict, Any, Tuple
+from importlib import import_module
+import warnings
+
+import numpy as np
+import pandas as pd
+from expyDB.database_operations import create_database, experiment_to_db
+from pymob.sim.config import dict_to_string
+
+import glob
+import os
+
+import click
+import pandas as pd
+
+from expyDB.intervention_model import to_expydb, Experiment, PandasConverter
+from guts_base.data.utils import datalad_locked_file_warning
+
+def test_equality_of_exposure_patterns_in_treatment(df):
+    for _, group in df.groupby("treatment_id"):
+        exposures = group.pivot_table(
+            # values="value",
+            index=["time", "treatment_id"],
+            columns="replicate_id"
+        )
+
+        equal_expo = exposures.values == exposures.values[:, ].reshape((-1, 1))
+        if not np.all(equal_expo):
+            raise RuntimeError(
+                "Replicates in the same treatment ID have different exposure patterns."
+            )
+
+def create_new_columns_and_test_integrity_of_replicates(
+    exposure, survival, n_reps, path
+):
+    assert np.all(exposure.columns == survival.columns)
+
+    columns_new, treatment_reps = identify_replicates(frame=exposure)
+
+    if not np.all(np.array(list(treatment_reps.values())) == n_reps):
+        warnings.warn(
+            f"Actual treatment replicates are different from "
+            f"replicates ({n_reps}), given in Info sheet in file: "
+            f"{path}"
+        )
+
+    return columns_new
+
+def identify_replicates(frame):
+    df = frame.drop(columns="time")
+
+    # Find identical columns and assign group labels
+    group_labels = {}
+    used_cols = set()
+    treatment_map = {}
+
+    for col in df.columns:
+        if col not in used_cols:
+            # compare the column to each column of the dataframe with
+            # df.apply(func, axis=0) and get the column names
+            identical_cols = df.columns[df.apply(lambda x: x.equals(df[col]), axis=0)].tolist()
+            group_label = f'{len(group_labels) + 1}'
+            group_labels[group_label] = identical_cols
+            used_cols.update(identical_cols)
+
+            for icol in identical_cols:
+                treatment_map.update({icol: group_label})
+
+    columns_new = [f"{treatment_map[col]}__{col}" for col in df.columns]
+    treatment_reps = {key: len(cols) for key, cols in group_labels.items()}
+
+    return columns_new, treatment_reps
+
+
+
+def read_timeseries_sheet(path, sheet, sep=None):
+    ts = pd.read_excel(path, sheet_name=sheet, index_col=0) # type: ignore
+    multi_index = pd.MultiIndex.from_tuples(
+        [tuple(c.split(sep)) for c in ts.columns], names=["treatment_id", "timeseries_id"]
+    )
+    ts.columns = multi_index
+    return ts
+
+
+class OpenGutsIO:
+    # TODO: Use preprocessing here and use map as a class attribute
+    def __init__(self, file):
+        self._file = file
+        self.data = self.from_file(file)
+
+    def _openguts_wide_to_long(self, frame, columns_new):
+        frame_wide = frame.copy()
+
+        frame_wide.columns = ["time"] + columns_new
+        frame_long = pd.melt(
+            frame=frame_wide,
+            id_vars=["time"],
+            value_vars=columns_new,
+            var_name="exposure_id"
+        )
+        # create new index columns from new column names
+        frame_long[["treatment_id", "replicate_id"]] = frame_long\
+            .exposure_id.str.split("__", n=1, expand=True)
+        frame_long = frame_long.drop(columns="exposure_id")
+
+        return frame_long
+
+
+    def _merge_tables(self, tables: List):
+        data = tables.pop(0).set_index(["time", "treatment_id", "replicate_id"])
+
+        for expo in tables:
+            rdata = expo.set_index(["time", "treatment_id", "replicate_id"])
+            data = pd.merge(
+                left=data,
+                right=rdata,
+                how="left",
+                left_index=True,
+                right_index=True
+            )
+
+        return data
+
+    def _read_timeseries(self, path, sheets):
+        # design new columns based on the information about replicates and treatments
+        timeseries_long_list = []
+        timeseries_column_list = []
+        time_units = {}
+        for iv in sheets:
+            timeseries_df = pd.read_excel(path, sheet_name=f"{iv}")
+
+            time_column = timeseries_df.columns[0]
+            time_unit = time_column.lower().replace("time", "").strip(" []")
+
+            # define replicates based on equality of columns
+            timeseries_columns = [c for c in timeseries_df.columns[1:]]
+            timeseries_long = self._openguts_wide_to_long(
+                frame=timeseries_df, columns_new=timeseries_columns
+            )
+            intervention_long = timeseries_long.rename(columns={"value": iv})
+            timeseries_long_list.append(intervention_long)
+            timeseries_column_list.append(timeseries_columns)
+            time_units.update({iv: time_unit})
+
+        return self._merge_tables(timeseries_long_list).reset_index(), time_units
+
+
+    def _read_openguts(self, path, metadata_sheetname="meta"):
+        meta = pd.read_excel(path, sheet_name=metadata_sheetname, index_col=0).dropna(how="all")
+        interventions = meta.loc["experiment__interventions","Value"]
+        if interventions is None:
+            raise ValueError("'experiment__interventions' must be defined in metadata")
+        else:
+            intervention_sheets = [i.strip("[]' ") for i in interventions.split(",")] # type: ignore
+
+        observations = meta.loc["experiment__observations","Value"]
+        if observations is None:
+            raise ValueError("'experiment__observations' must be defined in metadata")
+        else:
+            observation_sheets = [i.strip("[]' ") for i in observations.split(",")] # type: ignore
+
+        # survival_df = pd.read_excel(path, sheet_name="survival")
+        # survival_df = survival_df.rename(columns={"time [d]": "time"})
+
+        # design new columns based on the information about replicates and treatments
+        interventions_long, interventions_time_units = self._read_timeseries(path, intervention_sheets)
+        observations_long, observations_time_units = self._read_timeseries(path, observation_sheets)
+        time_unit = {
+            "interventions": interventions_time_units,
+            "observations": observations_time_units
+        }
+
+        # TODO test if all exposures within a treatment (replicates) were nominally the same
+        # test_equality_of_exposure_patterns_in_treatment(df=exposures_long)
+
+        return interventions_long, observations_long, meta, time_unit
+
+    def from_file(self, file) -> None:
+        (
+            interventions_long,
+            observations_long,
+            meta,
+            time_unit
+        ) = self._read_openguts(path=file)
+
+        self.interventions = interventions_long
+        self.observations = observations_long
+        self.time_unit = time_unit
+        self.meta = meta
+
+    def to_file(self, file):
+        raise NotImplementedError(
+            "This method should implement writing an excel file that corresponds"
+            "to the original input file."
+        )
+
+    def to_experiment(self) -> Experiment:
+        return Experiment.from_dict(data=dict(
+            interventions=self.interventions,
+            observations=self.observations,
+            meta=self.meta,
+            time_units=self.time_unit
+        ))
+
+    def from_experiment(self, experiment: Experiment) -> None:
+        data = experiment.to_dict()
+        self.interventions=data["interventions"],
+        self.observations=data["observations"],
+        self.meta=data["meta"],
+        self.time_units=data["time_units"],
+
+    def to_xarray(self):
+        return self.to_experiment().to_xarray()
+
+
+def import_data_to_database(path, database, preprocessing: Optional[Callable] = None, preprocessing_out: Optional[str] = None):
+    """This script takes raw data, preprocesses it to contain all
+    necessary metadata for expyDB. Then it creates an experiment Model and
+    processes adds it to the database
+    """
+    # preprocess path
+    if preprocessing is not None:
+        if preprocessing_out is None:
+            filename = os.path.dirname(path)
+            directory = os.path.basename(filename)
+            new_path = path.replace(directory, f"processed_{directory}")
+        else:
+            filename = os.path.basename(path)
+            new_path = preprocessing_out.format(filename=filename)
+
+        os.makedirs(os.path.dirname(new_path), exist_ok=True)
+
+        processed_path = preprocessing(path, new_path)
+    else:
+        processed_path = path
+
+    # Preprocess excel to interventions and observations in Long form and a
+    # metadata Series as well as a default time unit
+    openguts = OpenGutsIO(processed_path)
+
+    # From excel to an Experiment Model instance
+    experiment = openguts.to_experiment()
+
+    # from the Model to the Database
+    if not os.access(database, os.W_OK):
+        warnings.warn(
+            f"Did not write to database. The file '{database}' does "
+            "not have write access."
+        )
+        return
+
+    experiment.to_database(database=database)
+
+
+def create_database_and_import_data_main(datasets_path, database_path, preprocessing=None, preprocessing_out=None):
+    print("\n")
+    print(f"Creating a database and importing data")
+    print(f"======================================")
+
+    if preprocessing is not None:
+        module, func = preprocessing.rsplit(".", 1)
+        mod = import_module(module)
+        preprocessing_func = getattr(mod, func)
+    else:
+        preprocessing_func = None
+
+    paths = []
+    for p in datasets_path:
+        if os.path.isfile(p):
+            paths.append(p)
+        else:
+            paths.extend(glob.glob(os.path.join(p, "*.xlsx")))
+
+    create_database(database=database_path, force=True)
+    for p in paths:
+        print(f"\nPreprocessing and importing file: {p}")
+        import_data_to_database(
+            path=p, database=database_path,
+            preprocessing=preprocessing_func,
+            preprocessing_out=preprocessing_out
+        )
+
+@click.command()
+@click.option("--datasets_path", type=str, multiple=True, help="The path to the directory where the excel files are located. Alternatively, use multiple times with paths to files")
+@click.option("--database_path", type=str, help="The path to the database (should end with .db)")
+@click.option("--preprocessing", type=str, help="Function used to preprocess the data", default=None)
+@click.option("--preprocessing-out", type=str, help="A pattern that uses {filename} as a placeholder e.g. 'data/processed_data/{filename}. If unset, preprends 'processes_' to the dirname", default=None)
+def create_database_and_import_data(datasets_path, database_path, preprocessing, preprocessing_out):
+    create_database_and_import_data_main(
+        datasets_path=datasets_path,
+        database_path=database_path,
+        preprocessing=preprocessing,
+        preprocessing_out=preprocessing_out
+    )
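A minimal end-to-end sketch of the import pipeline defined above (hypothetical paths; the console-script name registered in entry_points.txt is not shown in this diff, so the plain Python entry point is used):

    from guts_base.data import create_database_and_import_data_main

    # create a fresh expyDB database and import every .xlsx file found in the directory
    create_database_and_import_data_main(
        datasets_path=["data/openguts_experiments/"],  # a directory of .xlsx files, or individual file paths
        database_path="data/guts.db",
        preprocessing=None,        # or a dotted path such as "mypackage.preprocess.clean_sheet"
        preprocessing_out=None,
    )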