guts_base-0.8.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of guts-base might be problematic.

guts_base/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from . import sim
+ from . import mod
+ from . import data
+ from . import prob
+ from . import plot
+
+ __version__ = "0.8.2"
+
+ from .sim import (
+     GutsBase,
+     PymobSimulator,
+     ECxEstimator,
+     LPxEstimator,
+ )
@@ -0,0 +1,34 @@
+ from typing import Callable
+
+ from . import utils
+ from . import openguts
+ from . import expydb
+ from . import survival
+ from . import generator
+ from . import time_of_death
+ from . import preprocessing
+
+ from .openguts import (
+     OpenGutsIO,
+     create_new_columns_and_test_integrity_of_replicates,
+     create_database_and_import_data,
+     create_database_and_import_data_main,
+     import_data_to_database,
+ )
+ from .survival import (
+     prepare_survival_data_for_conditional_binomial,
+     survivors_at_start_of_interval,
+     generate_survival_repeated_observations
+ )
+
+ from .generator import create_artificial_data, design_exposure_scenario
+
+ from .expydb import (
+     to_dataset,
+     combine_coords_to_multiindex,
+     reduce_multiindex_to_flat_index
+ )
+
+ from .time_of_death import (
+     time_of_death_to_openguts
+ )
@@ -0,0 +1,247 @@
+ from typing import List, Optional, Literal
+ import numpy as np
+ import xarray as xr
+ import pandas as pd
+ import arviz as az
+ import datetime
+
+ import pandas as pd
+ import numpy as np
+
+ from expyDB.intervention_model import (
+     Experiment,
+     Treatment,
+     Timeseries,
+     TsData,
+     from_expydb
+ )
+
+
+ def prepare_dataset(
+     idata,
+     variable="survival",
+     unit_time: Literal["day", "hour", "minute", "second"] = "day"
+ ):
+     """Get interventions from idata storage with respect to the treatment
+     ids of the observations and move non-indexing-related metadata (unique metadata)
+     to the attrs container.
+     """
+     # this test is guaranteed when prepare_dataset is used together with from_expydb,
+     # because from_expydb organizes the data into datasets with 1 variable,
+     # which is the timeseries variable with the coordinates timeseries_id and time
+     # for treatments and replicates. Other variables receive their 'own' dataset
+     assert len(idata[variable].data_vars) == 1
+     array: xr.DataArray = idata[variable][variable]
+     array = array.swap_dims(timeseries_id="treatment_id")
+     array = array.drop_vars("id")
+     # assuming that each timeseries of one variable in each treatment has
+     # a unique name, the resulting index should be unique
+     array = array.set_index(id=("treatment_id", "timeseries_name"))
+     array = array.drop_vars("timeseries_id")
+     assert array.indexes["id"].is_unique
+
+     # convert time to a float in units of `unit_time`
+     time_float = array.time.values / pd.Timedelta(1, unit_time)
+     array = array.assign_coords(time=time_float)
+
+     array = move_unique_coordinates_to_attrs(array)
+
+     array.attrs["unit_time"] = unit_time
+
+     # add a unique id for the selected dataset which is only relevant for
+     # the scope of modelling
+     return array
+
+ def move_unique_coordinates_to_attrs(array: xr.DataArray) -> xr.DataArray:
+     key: str
+     for key, coord in array.coords.items():  # type: ignore
+         if key in ["id", "treatment_id", "timeseries_id", "experiment_id", "subject_count", "timeseries_name"]:
+             continue
+
+         if coord.isnull().all():
+             unique_values = [None]
+         else:
+             unique_values = np.unique(coord.data)
+
+         if len(unique_values) == 1:
+             array.attrs.update({key: unique_values[0]})
+             array = array.drop_vars(key)
+     return array
+
+ # def prepare_interventions_dataset(interventions_idata, observations, ivs: Optional[List[str]] = None):
+ #     """Get interventions from idata storage with respect to the treatment
+ #     ids of the observations"""
+ #     if ivs is None:
+ #         ivs = list(interventions_idata.keys())
+ #     ds_ivs = get_interventions(
+ #         interventions_idata,
+ #         observations=observations,
+ #         ivs=ivs
+ #     )
+
+ #     time_h = ds_ivs.time.values / np.timedelta64(1, "h")
+ #     ds_ivs = ds_ivs.assign_coords(time=time_h)
+ #     ds_ivs.attrs["unit_time"] = "hours (h)"
+
+ #     return ds_ivs
+
+ def to_dataset(
+     observations_idata,
+     interventions_idata,
+     unit_time: Literal["day", "hour", "minute", "second"] = "hour"
+ ) -> xr.Dataset:
+     """Combines intervention and observation datasets, assuming that a unique
+     multiindex can be constructed from
+     - treatment_id
+     - timeseries_name
+
+     This way interventions and observations can be combined into a single dataset.
+     """
+     data_arrays = {}
+     for variable in observations_idata.groups():
+         # prepare observations
+         da = prepare_dataset(
+             idata=observations_idata,
+             variable=variable,
+             unit_time=unit_time,
+         )
+         data_arrays.update({variable: da})
+
+     # prepare interventions
+     for variable in interventions_idata.groups():
+         da = prepare_dataset(
+             idata=interventions_idata,
+             variable=variable,
+             unit_time=unit_time,
+         )
+         data_arrays.update({variable: da})
+
+     return xr.combine_by_coords(data_arrays.values())  # type: ignore
+
+
+ def reduce_multiindex_to_flat_index(dataset):
+     multi_index = dataset.id.indexes["id"]
+
+     # create a flat index from the multi index
+     flat_index = multi_index.map(lambda x: "__".join([str(x_) for x_ in x]))
+
+     # remove the multi index from dimension 'id'
+     dataset = dataset.reset_index("id")
+
+     # assign the flat index to dimension 'id'
+     dataset = dataset.assign_coords(id=flat_index)
+
+     return dataset
+
+ def combine_coords_to_multiindex(
+     dataset: xr.Dataset,
+     coordinates: List[str],
+     index_name: str,
+     sep: str = "__"
+ ) -> xr.Dataset:
+     """Combines a list of coordinates into a joint string coordinate.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         The observations dataset
+     coordinates : List[str]
+         The coordinates that should be joined
+     index_name : str
+         The name of the new, joined coordinate
+     sep : str, optional
+         The string that separates the coordinate components, by default "__"
+
+     Returns
+     -------
+     xr.Dataset
+         Dataset with a new coordinate composed of the listed coordinates
+     """
+     try:
+         multi_index = pd.MultiIndex.from_arrays([dataset[c].values for c in coordinates])
+     except KeyError as err:
+         raise KeyError(
+             f"Did not find key {err} in the dataset. "
+             f"This is probably because the key {err} is equal for all treatments."
+         ) from err
+     multi_index = multi_index.map(lambda x: sep.join([str(x_) for x_ in x]))
+     return dataset.assign_coords({index_name: ("id", multi_index)})
+
+ # def get_interventions(interventions_idata, observations, ivs: List[str]) -> xr.Dataset:
+ #     """Get the interventions according to the treatment ids of the observation
+ #     dataset.
+
+ #     Works only for single interventions
+ #     """
+ #     X_in = {}
+ #     for data_var in ivs:
+ #         x_in = interventions_idata[data_var]\
+ #             .swap_dims(timeseries_id="treatment_id")\
+ #             .sel(treatment_id=observations.treatment_id.values)
+
+ #         x_in = x_in.assign_coords(
+ #             _id=("treatment_id", range(x_in.sizes["treatment_id"]))
+ #         )
+ #         x_in = x_in.swap_dims(treatment_id="_id")
+ #         X_in.update({data_var: x_in[data_var]})
+
+
+ #     X_in_dataset = xr.concat(X_in.values(), dim="variable")\
+ #         .assign_coords(variable=ivs)\
+ #         .to_dataset(dim="variable")
+
+ #     if "variable" in X_in_dataset.dims:
+ #         X_in_dataset = X_in_dataset.drop_dims("variable")
+
+ #     return X_in_dataset
+
+
+ # def combine_interventions(
+ #     interventions: az.InferenceData,
+ #     force: bool = False
+ # ) -> xr.DataArray:
+ #     """Combining interventions into a single dataset is only possible
+ #     if there is only a single timeseries for each intervention.
+
+ #     Parameters
+ #     ----------
+ #     interventions : az.InferenceData
+ #         Interventions InferenceData. Contains multiple datasets with at
+ #         least one timeseries
+ #     force : bool, optional
+ #         Override restrictions to combine interventions only when the number
+ #         of timeseries is 1, by default False
+
+ #     Returns
+ #     -------
+ #     xr.DataArray
+ #         Interventions, combined into a single dataset
+
+ #     Raises
+ #     ------
+ #     ValueError
+ #         If the number of timeseries is larger than 1 and force is not True
+ #     """
+ #     assert isinstance(interventions, az.InferenceData)
+ #     arrays = []
+ #     for variable, dataset in interventions.items():
+ #         if dataset.sizes["timeseries_id"] > 1:
+ #             if force:
+ #                 arr = dataset.to_array()
+ #             else:
+ #                 raise ValueError(
+ #                     "Combining interventions is only allowed when the number of "
+ #                     "timeseries for each variable is 1. This is to avoid blowing "
+ #                     "up the size of the dataset with NaNs, because timeseries ids "
+ #                     "are different for each variable. You can override this error "
+ #                     "by using `force=True`"
+ #                 )
+ #         else:
+ #             arr = dataset.squeeze("timeseries_id").to_array()
+
+ #         arrays.append(arr)
+
+ #     return xr.concat(arrays, dim="variable")
+
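Taken together, `to_dataset` and `reduce_multiindex_to_flat_index` turn the expyDB InferenceData groups into a single, flat-indexed xarray Dataset. A minimal usage sketch follows; the database path and the exact return value of `from_expydb` are assumptions made for illustration only.

from expyDB.intervention_model import from_expydb
from guts_base.data import to_dataset, reduce_multiindex_to_flat_index

# load observation and intervention InferenceData from an expyDB database
# (the path and the call signature of from_expydb are assumed here)
observations_idata, interventions_idata = from_expydb("data/guts.db")

# combine both groups on the (treatment_id, timeseries_name) multiindex
ds = to_dataset(observations_idata, interventions_idata, unit_time="day")

# flatten the multiindex into "treatment__timeseries" string ids for modelling
ds = reduce_multiindex_to_flat_index(ds)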
@@ -0,0 +1,96 @@
+ import numpy as np
+ import xarray as xr
+ from typing import TypedDict, Dict, Optional, Sequence
+
+ class ExposureDataDict(TypedDict):
+     start: int
+     end: int
+     exposure: float | Sequence[float]
+
+ def create_artificial_data(
+     t_max,
+     dt,
+     exposure_paths=["oral", "topical", "contact"],
+     intensity=[0.1, 0.5, 0.05]
+ ):
+     rng = np.random.default_rng(1)
+     time = np.arange(0, t_max, step=dt)  # daily time resolution
+
+     # calculate potential exposure based on a lognormal distribution
+     oral = rng.lognormal(mean=np.log(intensity[0]), sigma=0.5, size=len(time))
+     # and include random exposure days
+     oral *= rng.binomial(n=1, p=1, size=len(time))
+
+     # calculate potential exposure based on a lognormal distribution
+     topical = rng.lognormal(mean=np.log(intensity[1]), sigma=1, size=len(time))
+     # and include random exposure days
+     topical *= rng.binomial(n=1, p=0.25, size=len(time))
+
+     # calculate potential exposure based on a lognormal distribution
+     contact = rng.lognormal(mean=np.log(intensity[2]), sigma=0.1, size=len(time))
+     # and include random exposure days
+     contact *= rng.binomial(n=1, p=0.8, size=len(time))
+
+     exposures = xr.Dataset(
+         data_vars={
+             "exposure": (("time", "exposure_path"), np.column_stack([oral, topical, contact])),
+         },
+         coords={"time": time, "exposure_path": ["oral", "topical", "contact"]}
+     )
+
+     return exposures.sel(exposure_path=exposure_paths)
+
+
+ def design_exposure_timeseries(time: Sequence[float], exposure: ExposureDataDict, eps: float):
+     if exposure is None:
+         return
+
+     exposure["end"] = time[-1] if exposure["end"] is None else exposure["end"]
+
+     return np.where(
+         np.logical_and(time >= exposure["start"], time < exposure["end"]),
+         exposure["exposure"],  # pulse height; key matches ExposureDataDict
+         0
+     )
+
+ def design_exposure_scenario(
+     t_max: float,
+     dt: float,
+     exposures: Dict[str, ExposureDataDict],
+     eps: float = 1e-8,
+     exposure_dimension: str = "exposure_type",
+ ):
+     """
+     TODO: t_max, dt and eps are probably not necessary
+     """
+     time = np.arange(0, t_max, step=dt)  # daily time resolution
+     time = np.unique(np.concatenate([time] + [
+         np.array([time[-1] if vals["end"] is None else vals["end"]])
+         for key, vals in exposures.items()
+     ]))
+
+     treatments = {}
+     for key, expo in exposures.items():
+         treat = design_exposure_timeseries(time, expo, eps)
+         treatments.update({key: treat})
+
+     data = np.column_stack(list(treatments.values())).squeeze()
+     data = np.expand_dims(data, axis=0)
+
+     coords = {"id": [0], "time": time}
+
+     if len(exposures) > 1:
+         coords.update({exposure_dimension: list(treatments.keys())})
+
+     exposures_dataset = xr.Dataset(
+         data_vars={"exposure": (tuple(coords.keys()), data)},
+         coords=coords
+     )
+
+     return exposures_dataset
+
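For orientation, here is a minimal sketch of how `design_exposure_scenario` might be called. The pulse heights and time points are invented; the dictionary keys follow the `ExposureDataDict` definition above.

from guts_base.data import design_exposure_scenario

# two hypothetical exposure pulses for a single treatment (id=0)
scenario = design_exposure_scenario(
    t_max=10.0,
    dt=1.0,
    exposures={
        "pulse_early": {"start": 0.0, "end": 2.0, "exposure": 5.0},
        "pulse_late": {"start": 4.0, "end": 8.0, "exposure": 1.0},
    },
)
# -> xr.Dataset with an "exposure" variable over (id, time, exposure_type)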
@@ -0,0 +1,294 @@
+ from typing import List, Optional, Callable, Dict, Any, Tuple
+ from importlib import import_module
+ import warnings
+
+ import numpy as np
+ import pandas as pd
+ from expyDB.database_operations import create_database, experiment_to_db
+ from pymob.sim.config import dict_to_string
+
+ import glob
+ import os
+
+ import click
+ import pandas as pd
+
+ from expyDB.intervention_model import to_expydb, Experiment, PandasConverter
+ from guts_base.data.utils import datalad_locked_file_warning
+
+ def test_equality_of_exposure_patterns_in_treatment(df):
+     for _, group in df.groupby("treatment_id"):
+         exposures = group.pivot_table(
+             # values="value",
+             index=["time", "treatment_id"],
+             columns="replicate_id"
+         )
+
+         # compare every replicate column against the first replicate
+         equal_expo = exposures.values == exposures.values[:, 0].reshape((-1, 1))
+         if not np.all(equal_expo):
+             raise RuntimeError(
+                 "Replicates in the same treatment ID have different exposure patterns."
+             )
+
+ def create_new_columns_and_test_integrity_of_replicates(
+     exposure, survival, n_reps, path
+ ):
+     assert np.all(exposure.columns == survival.columns)
+
+     columns_new, treatment_reps = identify_replicates(frame=exposure)
+
+     if not np.all(np.array(list(treatment_reps.values())) == n_reps):
+         warnings.warn(
+             f"Actual treatment replicates are different from the "
+             f"replicates ({n_reps}) given in the Info sheet of file: "
+             f"{path}"
+         )
+
+     return columns_new
+
+ def identify_replicates(frame):
+     df = frame.drop(columns="time")
+
+     # Find identical columns and assign group labels
+     group_labels = {}
+     used_cols = set()
+     treatment_map = {}
+
+     for col in df.columns:
+         if col not in used_cols:
+             # compare the column to each column of the dataframe with
+             # df.apply(func, axis=0) and get the column names
+             identical_cols = df.columns[df.apply(lambda x: x.equals(df[col]), axis=0)].tolist()
+             group_label = f'{len(group_labels) + 1}'
+             group_labels[group_label] = identical_cols
+             used_cols.update(identical_cols)
+
+             for icol in identical_cols:
+                 treatment_map.update({icol: group_label})
+
+     columns_new = [f"{treatment_map[col]}__{col}" for col in df.columns]
+     treatment_reps = {key: len(cols) for key, cols in group_labels.items()}
+
+     return columns_new, treatment_reps
+
+
+ def read_timeseries_sheet(path, sheet, sep=None):
+     ts = pd.read_excel(path, sheet_name=sheet, index_col=0)  # type: ignore
+     multi_index = pd.MultiIndex.from_tuples(
+         [tuple(c.split(sep)) for c in ts.columns], names=["treatment_id", "timeseries_id"]
+     )
+     ts.columns = multi_index
+     return ts
+
+
+ class OpenGutsIO:
+     # TODO: Use preprocessing here and use map as a class attribute
+     def __init__(self, file):
+         self._file = file
+         self.data = self.from_file(file)
+
+     def _openguts_wide_to_long(self, frame, columns_new):
+         frame_wide = frame.copy()
+
+         frame_wide.columns = ["time"] + columns_new
+         frame_long = pd.melt(
+             frame=frame_wide,
+             id_vars=["time"],
+             value_vars=columns_new,
+             var_name="exposure_id"
+         )
+         # create new index columns from the new column names
+         frame_long[["treatment_id", "replicate_id"]] = frame_long\
+             .exposure_id.str.split("__", n=1, expand=True)
+         frame_long = frame_long.drop(columns="exposure_id")
+
+         return frame_long
+
+     def _merge_tables(self, tables: List):
+         data = tables.pop(0).set_index(["time", "treatment_id", "replicate_id"])
+
+         for expo in tables:
+             rdata = expo.set_index(["time", "treatment_id", "replicate_id"])
+             data = pd.merge(
+                 left=data,
+                 right=rdata,
+                 how="left",
+                 left_index=True,
+                 right_index=True
+             )
+
+         return data
+
+     def _read_timeseries(self, path, sheets):
+         # design new columns based on the information about replicates and treatments
+         timeseries_long_list = []
+         timeseries_column_list = []
+         time_units = {}
+         for iv in sheets:
+             timeseries_df = pd.read_excel(path, sheet_name=f"{iv}")
+
+             time_column = timeseries_df.columns[0]
+             time_unit = time_column.lower().replace("time", "").strip(" []")
+
+             # define replicates based on equality of columns
+             timeseries_columns = [c for c in timeseries_df.columns[1:]]
+             timeseries_long = self._openguts_wide_to_long(
+                 frame=timeseries_df, columns_new=timeseries_columns
+             )
+             intervention_long = timeseries_long.rename(columns={"value": iv})
+             timeseries_long_list.append(intervention_long)
+             timeseries_column_list.append(timeseries_columns)
+             time_units.update({iv: time_unit})
+
+         return self._merge_tables(timeseries_long_list).reset_index(), time_units
+
+     def _read_openguts(self, path, metadata_sheetname="meta"):
+         meta = pd.read_excel(path, sheet_name=metadata_sheetname, index_col=0).dropna(how="all")
+         interventions = meta.loc["experiment__interventions", "Value"]
+         if interventions is None:
+             raise ValueError("'experiment__interventions' must be defined in metadata")
+         else:
+             intervention_sheets = [i.strip("[]' ") for i in interventions.split(",")]  # type: ignore
+
+         observations = meta.loc["experiment__observations", "Value"]
+         if observations is None:
+             raise ValueError("'experiment__observations' must be defined in metadata")
+         else:
+             observation_sheets = [i.strip("[]' ") for i in observations.split(",")]  # type: ignore
+
+         # survival_df = pd.read_excel(path, sheet_name="survival")
+         # survival_df = survival_df.rename(columns={"time [d]": "time"})
+
+         # design new columns based on the information about replicates and treatments
+         interventions_long, interventions_time_units = self._read_timeseries(path, intervention_sheets)
+         observations_long, observations_time_units = self._read_timeseries(path, observation_sheets)
+         time_unit = {
+             "interventions": interventions_time_units,
+             "observations": observations_time_units
+         }
+
+         # TODO: test if all exposures within a treatment (replicates) were nominally the same
+         # test_equality_of_exposure_patterns_in_treatment(df=exposures_long)
+
+         return interventions_long, observations_long, meta, time_unit
+
+     def from_file(self, file) -> None:
+         (
+             interventions_long,
+             observations_long,
+             meta,
+             time_unit
+         ) = self._read_openguts(path=file)
+
+         self.interventions = interventions_long
+         self.observations = observations_long
+         self.time_unit = time_unit
+         self.meta = meta
+
+     def to_file(self, file):
+         raise NotImplementedError(
+             "This method should implement writing an excel file that corresponds "
+             "to the original input file."
+         )
+
+     def to_experiment(self) -> Experiment:
+         return Experiment.from_dict(data=dict(
+             interventions=self.interventions,
+             observations=self.observations,
+             meta=self.meta,
+             time_units=self.time_unit
+         ))
+
+     def from_experiment(self, experiment: Experiment) -> None:
+         data = experiment.to_dict()
+         self.interventions = data["interventions"]
+         self.observations = data["observations"]
+         self.meta = data["meta"]
+         self.time_units = data["time_units"]
+
+     def to_xarray(self):
+         return self.to_experiment().to_xarray()
+
+
+ def import_data_to_database(path, database, preprocessing: Optional[Callable] = None, preprocessing_out: Optional[str] = None):
+     """Takes raw data and preprocesses it to contain all metadata necessary
+     for expyDB, then creates an Experiment model and adds it to the database.
+     """
+     # preprocess path
+     if preprocessing is not None:
+         if preprocessing_out is None:
+             dirpath = os.path.dirname(path)
+             directory = os.path.basename(dirpath)
+             new_path = path.replace(directory, f"processed_{directory}")
+         else:
+             filename = os.path.basename(path)
+             new_path = preprocessing_out.format(filename=filename)
+
+         os.makedirs(os.path.dirname(new_path), exist_ok=True)
+
+         processed_path = preprocessing(path, new_path)
+     else:
+         processed_path = path
+
+     # Preprocess the excel file to interventions and observations in long form,
+     # a metadata Series, and default time units
+     openguts = OpenGutsIO(processed_path)
+
+     # From excel to an Experiment model instance
+     experiment = openguts.to_experiment()
+
+     # from the model to the database
+     if not os.access(database, os.W_OK):
+         warnings.warn(
+             f"Did not write to database. The file '{database}' does "
+             "not have write access."
+         )
+         return
+
+     experiment.to_database(database=database)
+
+
+ def create_database_and_import_data_main(datasets_path, database_path, preprocessing=None, preprocessing_out=None):
+     print("\n")
+     print("Creating a database and importing data")
+     print("======================================")
+
+     if preprocessing is not None:
+         module, func = preprocessing.rsplit(".", 1)
+         mod = import_module(module)
+         preprocessing_func = getattr(mod, func)
+     else:
+         preprocessing_func = None
+
+     paths = []
+     for p in datasets_path:
+         if os.path.isfile(p):
+             paths.append(p)
+         else:
+             paths.extend(glob.glob(os.path.join(p, "*.xlsx")))
+
+     create_database(database=database_path, force=True)
+     for p in paths:
+         print(f"\nPreprocessing and importing file: {p}")
+         import_data_to_database(
+             path=p, database=database_path,
+             preprocessing=preprocessing_func,
+             preprocessing_out=preprocessing_out
+         )
+
+ @click.command()
+ @click.option("--datasets_path", type=str, multiple=True, help="The path to the directory where the excel files are located. Alternatively, pass the option multiple times with paths to individual files")
+ @click.option("--database_path", type=str, help="The path to the database (should end with .db)")
+ @click.option("--preprocessing", type=str, help="Function used to preprocess the data", default=None)
+ @click.option("--preprocessing-out", type=str, help="A pattern that uses {filename} as a placeholder, e.g. 'data/processed_data/{filename}'. If unset, prepends 'processed_' to the directory name", default=None)
+ def create_database_and_import_data(datasets_path, database_path, preprocessing, preprocessing_out):
+     create_database_and_import_data_main(
+         datasets_path=datasets_path,
+         database_path=database_path,
+         preprocessing=preprocessing,
+         preprocessing_out=preprocessing_out
+     )
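
A minimal sketch of the import workflow defined above, used programmatically rather than via the click command; the file and database paths are placeholders.

from guts_base.data import OpenGutsIO, create_database_and_import_data_main

# read a single openGUTS-style excel workbook (placeholder path)
io = OpenGutsIO("data/raw/ringtest_a.xlsx")
experiment = io.to_experiment()   # expyDB Experiment model
dataset = io.to_xarray()          # xarray representation via the Experiment

# or build a fresh database from a folder of workbooks in one call;
# directories passed via datasets_path are globbed for *.xlsx files
create_database_and_import_data_main(
    datasets_path=["data/raw"],
    database_path="data/guts.db",
)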