guts-base 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,571 @@
+ """This module has been developed in the PollinERA project to deal with
+ time-of-death notation and to add another import format.
+
+ TODO: Suppress warnings for too-long sheet names
+ TODO: Skip files that have locked postprocessing files and give a warning
+ TODO: Give a status message indicating which file is being processed
+ TODO: Apply these changes also to openguts.py
+ TODO: Write tests for the imports and produce templates. See test_data_import.py
+ """
+
+
+ from typing import List
+ import os
+ import warnings
+ from datetime import timedelta
+
+ import click
+ import pandas as pd
+ import numpy as np
+
+ from guts_base.data.utils import datalad_locked_file_warning
+
+ # default column names and their accepted synonyms
+ DEFAULT_COLUMNS_whitespace = dict(
+     id_subject = ["id subject", "subject id", "id", "id bee"],
+     id_treatment = ["id treatment", "treatment id", "treatment"],
+     id_replicate = ["id replicate", "replicate id", "replicate name", "replicate"],
+     n = ["individuals", "n", "number_replicates", "n_individuals", "replicates"],
+     censored = ["cenzus", "censoring", "escaped"],
+     time_start_experiment = ["date of start", "experiment start", "start experiment"],
+     time_end_experiment = ["date of end", "experiment end", "end experiment"],
+     time_start_exposure = ["time of exposure start", "start exposure", "exposure start"],
+     time_end_exposure = ["time of exposure end", "end exposure", "exposure end"],
+     time_death = ["time of death", "survival time", "date of death"],
+ )
+
+ # the same synonyms with underscores instead of whitespace
+ DEFAULT_COLUMNS_underscore = {
+     k: [v_.replace(" ", "_") for v_ in v]
+     for k, v in DEFAULT_COLUMNS_whitespace.items()
+ }
+
+ DEFAULT_COLUMNS = {
+     k: list(set(DEFAULT_COLUMNS_whitespace[k] + DEFAULT_COLUMNS_underscore[k]))
+     for k in DEFAULT_COLUMNS_whitespace.keys()
+ }
+
+ REQUIRED_COLUMNS = dict(
+     id_subject = True,
+     id_treatment = True,
+     id_replicate = False,
+     censored = False,
+     n = False,
+     time_start_experiment = True,
+     time_end_experiment = False,
+     time_start_exposure = True,
+     time_end_exposure = True,
+     time_death = True,
+ )
+
+
+ def clean_column_names(columns: List[str]):
+     """Lowercase, strip, and sanitize raw column names."""
+     cleaned_columns = []
+     for c in columns:
+         c = c.lower()  # convert to lowercase
+         c = c.strip()  # strip leading and trailing whitespace
+         c = c.replace(" ", "_")
+         c = c.replace("[", "")
+         c = c.replace("]", "")
+         c = c.replace("/", "_")
+
+         cleaned_columns.append(c)
+
+     return cleaned_columns
+
+
+ def standardize_column_names(
+     columns: List[str],
+     raise_error=True,
+     ignore_columns=()
+ ):
+     """Map synonymous column names onto the standard names in DEFAULT_COLUMNS."""
+     column_mapper = invert_dict_of_lists(DEFAULT_COLUMNS)
+     standardized_columns = []
+     for c in columns:
+         c = column_mapper.get(c, c)  # try to get a standard value for the column
+
+         standardized_columns.append(c)
+
+     missing_columns = [
+         k for k in DEFAULT_COLUMNS.keys()
+         if k not in standardized_columns and k not in ignore_columns
+     ]
+     if len(missing_columns) > 0 and raise_error:
+         raise KeyError(
+             f"Not all necessary columns could be found. {missing_columns} "
+             "could not be identified. Rename the columns or add the "
+             "corresponding synonyms to the mapper."
+         )
+
+     return standardized_columns
+
+
+ def invert_dict_of_lists(original_dict):
+     inverted_dict = {}
+     for key, value_list in original_dict.items():
+         for value in value_list:
+             inverted_dict[value] = key
+     return inverted_dict
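+
+ # Illustrative example (toy input, not from the original data): the mapper
+ # used by standardize_column_names is just the inverted synonym table, e.g.
+ # invert_dict_of_lists({"n": ["individuals", "n"]}) == {"individuals": "n", "n": "n"}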
+
+ def long_to_wide(df_long, id_columns, time_column, observation_column):
+     """Pivot a long table to wide format with one column per id combination."""
+     df_long["id"] = df_long[id_columns].apply(
+         lambda x: '__'.join(x.astype(str)), axis=1
+     )
+
+     df_wide = df_long.pivot(
+         index=time_column,
+         values=observation_column,
+         columns="id",
+     )
+
+     return df_wide
+
+ def wide_to_long(df_wide, id_columns, time_column, observation_column):
+     """Melt a wide table back to long format, splitting the combined id again."""
+     df_long = pd.melt(
+         frame=df_wide.reset_index(),
+         value_vars=df_wide.columns,
+         id_vars=time_column,
+         var_name="id",
+         value_name=observation_column
+     )
+
+     # split the combined id into its components (one split per id column)
+     df_long[id_columns] = df_long.id.str.split("__", n=len(id_columns) - 1, expand=True)
+     df_long = df_long.drop(columns="id")
+     return df_long[id_columns + [time_column, observation_column]]
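+
+ # Illustrative round trip (assumed toy data): a long frame with columns
+ # ["id_treatment", "id_replicate", "time [d]", "survival"] pivots to a wide
+ # frame with one "treatment__replicate" column per pair via long_to_wide, and
+ # wide_to_long(wide, ["id_treatment", "id_replicate"], "time [d]", "survival")
+ # recovers the original long layout.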
+
+
+ def get_unique_value(series, action_if_not_unique="mean"):
+     """Return the single unique value of a series, or aggregate if it is not unique."""
+     if series.nunique() == 1:
+         return series.drop_duplicates().iloc[0]
+     else:
+         if action_if_not_unique == "mean":
+             return series.mean()
+         elif action_if_not_unique == "max":
+             return series.max()
+         elif action_if_not_unique == "min":
+             return series.min()
+         elif action_if_not_unique == "error":
+             raise ValueError("Series contains non-unique values")
+         else:
+             raise NotImplementedError("Aggregation action is not implemented.")
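+
+ # Illustrative (assumed toy input): get_unique_value(pd.Series([1, 3])) falls
+ # back to the mean and returns 2.0; with action_if_not_unique="error" the same
+ # input raises a ValueError instead.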
+
+
+ def make_openguts_intervention_table(
+     df: pd.DataFrame,
+     intervention: str,
+     intervention_time_unit: str = "d",
+     rect_interpolate=True,
+ ) -> pd.DataFrame:
+     # create exposure tables
+     id_columns = ["id_treatment", "id_replicate"]
+     time_column = f"time [{intervention_time_unit}]"
+
+     df_long = []
+     for (tid, rid), group in df.groupby(id_columns):
+         # .item() raises if the intervention value is not unique within a group
+         intervention_value = float(group[intervention].unique().item())
+         if f"time_start_exposure_{intervention}" in group:
+             intervention_start = get_unique_value(group[f"time_start_exposure_{intervention}"])
+         else:
+             intervention_start = get_unique_value(group["time_start_exposure"])
+         if f"time_end_exposure_{intervention}" in group:
+             intervention_end = get_unique_value(group[f"time_end_exposure_{intervention}"])
+         else:
+             intervention_end = get_unique_value(group["time_end_exposure"])
+
+         experiment_start = get_unique_value(group["time_start_experiment"])
+         experiment_end = get_unique_value(group["time_end_experiment"])
+
+         time = np.array([
+             experiment_start,
+             intervention_start,
+             intervention_end,
+             experiment_end
+         ])
+
+         value = np.array([0, intervention_value, 0, 0])
+
+         m = pd.DataFrame(
+             data=np.column_stack([
+                 np.repeat(tid, len(time)),
+                 np.repeat(rid, len(time)),
+                 time, value
+             ]),
+             columns=list(group[id_columns].columns) + [time_column, intervention]
+         )
+         # this throws the first value out if the time of exposure start is
+         # identical to the exposure end
+         m = m.drop_duplicates(subset=id_columns + [time_column], keep="last")
+
+         df_long.append(m)
+
+     df_long = pd.concat(df_long)
+     df_wide = long_to_wide(df_long, id_columns, time_column, intervention).reset_index()
+     # note: experiment_start from the last group is used here; this assumes a
+     # common experiment start across all treatments and replicates
+     df_wide[time_column] = (df_wide[time_column] - experiment_start)
+     df_wide = df_wide.set_index(time_column)
+
+     if rect_interpolate:
+         # insert a helper time point one second before each original time point
+         # so that forward filling produces rectangular (step) pulses
+         df_wide = df_wide.reindex(np.unique(np.concatenate([
+             np.array(list(df_wide.index)),
+             np.array(list(df_wide.index - pd.Timedelta(1, "s")))[1:]
+         ])))
+         df_wide = df_wide.ffill()
+
+     df_wide.index = df_wide.index / pd.Timedelta(1, "d")
+     return df_wide
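+
+ # Illustrative sketch (assumed toy data): for an exposure of concentration 10
+ # between day 0 and day 1, rect_interpolate adds a point at 1 day minus one
+ # second, so the wide table reads t = [0, 1-1s, 1] with values [10, 10, 0],
+ # i.e. a rectangular pulse instead of a linear decline between day 0 and 1.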
+
+
+ def make_openguts_observation_table(
+     df: pd.DataFrame,
+     observation="censored",
+     observation_schedule: str = "d",
+ ) -> pd.DataFrame:
+     """Return counts of censored or dead individuals per scheduled time step."""
+     df = df.copy()
+
+     experiment_start = get_unique_value(df["time_start_experiment"])
+     experiment_end = get_unique_value(df["time_end_experiment"])
+
+     time = pd.date_range(experiment_start, experiment_end, freq=observation_schedule)
+     timecol_name = f"time [{observation_schedule.lower()}]"
+
+     id_columns = ["id_treatment", "id_replicate"]
+
+     # calculate survival time
+     df[timecol_name] = df["time_death"] - df["time_start_experiment"]
+
+     time_remainder = df[timecol_name] % pd.Timedelta(1, observation_schedule)
+     if (time_remainder > pd.Timedelta(0)).any():
+         raise ValueError(
+             "Observations should fall on the observation schedule: "
+             "df['time_death'] - df['time_start_experiment'] should be a multiple "
+             f"of the schedule's time resolution. Here: 1{observation_schedule}"
+         )
+
+     if observation == "censored":
+         # sum IDs that were marked as censored at time t
+         df_long = df.groupby(id_columns + [timecol_name])["censored"].sum()
+
+     elif observation == "lethality":
+         # count IDs that died at time t
+         df_long = df.groupby(id_columns + [timecol_name])["time_death"].count()
+
+     else:
+         raise NotImplementedError(f"observation {observation} is not implemented.")
+
+     df_long = df_long.rename(observation)
+
+     # df to wide frame
+     df_wide = long_to_wide(df_long.reset_index(), id_columns, timecol_name, observation)
+
+     # reindex wide dataframe on time
+     df_wide = df_wide.reindex(index=time - experiment_start, method=None)
+     df_wide.index = df_wide.index.set_names(timecol_name)
+     df_wide = df_wide.fillna(0)
+
+     return df_wide
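+
+ # Illustrative usage (assumed standardized input): with observation="lethality"
+ # and a daily schedule, the result is a wide table indexed by time since the
+ # experiment start, one column per treatment__replicate, counting how many
+ # individuals died at each scheduled observation time.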
+
+
+ # write to excel file
+ def excel_writer(df: pd.DataFrame, file, sheet):
+     # suppress warnings (e.g. about overly long sheet names) during export
+     with warnings.catch_warnings():
+         warnings.simplefilter(action="ignore")
+         if not os.path.exists(file):
+             with pd.ExcelWriter(file, mode="w") as writer:
+                 df.to_excel(writer, sheet_name=sheet)
+
+         else:
+             with pd.ExcelWriter(file, if_sheet_exists="replace", mode="a") as writer:
+                 df.to_excel(writer, sheet_name=sheet)
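+
+ # Illustrative usage (assumed path): excel_writer(df, "out.xlsx", "survival")
+ # creates out.xlsx on the first call and replaces the "survival" sheet in
+ # place on subsequent calls, instead of failing on an existing file.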
+
+
+ def write_data_template(
+     notation="time_of_death",
+     time_start_experiment=()
+ ):
+     # TODO: not yet implemented
+     pass
+
+
+ def time_to_fraction(data, column, experiment_start):
+     # TODO: not yet implemented
+     pass
+
+
+ class TimeOfDeathIO:
+     def __init__(
+         self,
+         file,
+         intervention_columns: List[str],
+         sheet: str = "time-of-death",
+     ):
+         self._file = file
+         self._sheet = sheet
+         self._intervention_columns = intervention_columns
+         # from_file is not defined in this module; it is expected to be
+         # provided by a subclass or a later revision
+         self.data = self.from_file()
+
+
+ def main(file: str, sheet: str, out: str, intervention_columns: List[str],
+          observation_schedule="d", rect_interpolate=False):
+     intervention_columns = clean_column_names(list(intervention_columns))
+     processed_file = f"{out}/openguts_{os.path.basename(file)}"
+
+     print("\n")
+     print(f"Processing file: {file}")
+     print("Converting from time-of-death to openguts")
+     print("-----------------------------------------")
+
+     # os.F_OK tests for existence
+     if os.access(processed_file, os.F_OK):
+         if not os.access(processed_file, os.W_OK):
+             datalad_locked_file_warning(processed_file)
+             return
+     else:
+         directory = os.path.dirname(processed_file)
+         os.makedirs(directory, exist_ok=True)
+
+     # read datafile
+     data = pd.read_excel(io=file, sheet_name=sheet)
+     data.columns = clean_column_names(data.columns)
+
+     # Assumptions
+     # -----------
+     # The exposure duration should not be too small; Bürger and Focks (2025)
+     # assume a topical exposure duration of 1 hour. If the exposure duration
+     # is too small, it will cause problems with k_d.
+     exposure_duration = timedelta(seconds=3600)
+     exposure_start_delay = timedelta(hours=0)
+     id_zfill = 2  # pad ID column values with zeros to this width
+
+     # standardize columns
+     data.columns = standardize_column_names(data.columns, raise_error=False)
+     data["id_treatment"] = data["id_treatment"].astype(str).str.zfill(id_zfill)
+
+     # add optional columns to the dataframe if they are not present
+     experiment_start = get_unique_value(data["time_start_experiment"])
+     if "time_start_exposure" not in data:
+         warnings.warn(
+             "No column: 'time_start_exposure'. "
+             "Assuming time_start_exposure = time_start_experiment "
+             f"({experiment_start}) + {exposure_start_delay}",
+             category=UserWarning
+         )
+         exposure_start = experiment_start + exposure_start_delay
+         data["time_start_exposure"] = exposure_start
+
+     if "time_end_exposure" in data:
+         if (data["time_start_exposure"] == data["time_end_exposure"]).all():
+             warnings.warn(
+                 "'time_end_exposure' equals 'time_start_exposure'. "
+                 "Removing column 'time_end_exposure'"
+             )
+             data = data.drop("time_end_exposure", axis=1)
+
+     if "time_end_exposure" not in data:
+         exposure_start = data["time_start_exposure"]
+         warnings.warn(
+             "No column: 'time_end_exposure'. "
+             f"Assuming time_end_exposure = time_start_exposure + {exposure_duration}",
+             category=UserWarning
+         )
+         exposure_end = exposure_start + exposure_duration
+         data["time_end_exposure"] = exposure_end
+
+     if "time_end_experiment" not in data:
+         experiment_end = data.time_death.max()
+         warnings.warn(
+             "No column: 'time_end_experiment'. "
+             f"Using the time of the last observation: {experiment_end}",
+             category=UserWarning
+         )
+         data["time_end_experiment"] = experiment_end
+
+     if "id_replicate" not in data:
+         warnings.warn(
+             "No column: 'id_replicate'. "
+             "Assuming all treatments were only carried out with 1 replicate "
+             "(containing n individuals).",
+             category=UserWarning
+         )
+         data["id_replicate"] = 0
+
+         # check for replicates
+         id_columns = ["id_treatment"]
+         for rid, (_, group) in enumerate(data[id_columns + intervention_columns]
+                                          .groupby(id_columns)):
+             data.loc[group.index, "id_replicate"] = rid + 1
+
+         data["id_replicate"] = data["id_replicate"].astype(str).str.zfill(id_zfill)
+         id_columns = ["id_treatment", "id_replicate"]
+
+     elif data["id_replicate"].isna().all():
+         warnings.warn(
+             "column: 'id_replicate' contained only NAN values. "
+             "Assuming all treatments were only carried out with 1 replicate "
+             "(containing n individuals).",
+             category=UserWarning
+         )
+         data["id_replicate"] = 0
+
+         # check for replicates
+         id_columns = ["id_treatment"]
+         for rid, (_, group) in enumerate(data[id_columns + intervention_columns]
+                                          .groupby(id_columns)):
+             data.loc[group.index, "id_replicate"] = rid + 1
+
+         data["id_replicate"] = data["id_replicate"].astype(str).str.zfill(id_zfill)
+         id_columns = ["id_treatment", "id_replicate"]
+     else:
+         data["id_replicate"] = data["id_replicate"].astype(str)
+         data["id_treatment"] = data["id_treatment"].astype(str)
+         id_columns = ["id_treatment", "id_replicate"]
+
+     if "censored" not in data:
+         warnings.warn(
+             "No column: 'censored'. "
+             "Assuming all observations are not censored, meaning each "
+             "'time of death' indication comes from an individual that was "
+             "observed dead at that time (as opposed to escaped or removed from "
+             "the experiment).",
+             category=UserWarning
+         )
+         data["censored"] = 0
+
+     data.columns = standardize_column_names(data.columns, raise_error=True, ignore_columns=["n"])
+
+     interventions = []
+     for iv in intervention_columns:
+         iv_wide = make_openguts_intervention_table(
+             data,
+             intervention=iv,
+             intervention_time_unit="d",
+             rect_interpolate=rect_interpolate,
+         )
+         interventions.append(iv_wide)
+         excel_writer(iv_wide, file=processed_file, sheet=iv)
+
+     censored = make_openguts_observation_table(
+         data,
+         observation="censored",
+         observation_schedule=observation_schedule,
+     )
+
+     lethality = make_openguts_observation_table(
+         data,
+         observation="lethality",
+         observation_schedule=observation_schedule,
+     )
+
+     # deaths excluding censored individuals
+     deaths = lethality - censored
+
+     # excel export
+     excel_writer(censored, file=processed_file, sheet="censored")
+     excel_writer(lethality, file=processed_file, sheet="lethality (uncensored)")
+     excel_writer(deaths, file=processed_file, sheet="lethality (censored)")
+
+     cens_long = wide_to_long(censored, id_columns, f"time [{observation_schedule}]", "censored")
+     leth_long = wide_to_long(lethality, id_columns, f"time [{observation_schedule}]", "lethality")
+
+     if "n" in data and data["n"].isna().all():
+         warnings.warn(
+             "column: 'n' contained only NAN values. "
+             "Removed (so it can be created from scratch)",
+             category=UserWarning
+         )
+         data = data.drop("n", axis=1)
+
+     if "n" not in data:
+         warnings.warn(
+             "No column: 'n'. "
+             "Inferring the number of individuals at the beginning of the "
+             "experiment from the uncensored number of dead organisms "
+             "(including those escaped and alive at the end of the experiment).",
+             category=UserWarning
+         )
+         n = leth_long.groupby(id_columns)["lethality"].sum().rename("n")
+         data = pd.merge(data, n.reset_index(), on=id_columns, how="left")
+
+     # calculate survival
+     # one unique n per replicate; raise if the values are not unique
+     n = data.groupby(id_columns)["n"].agg(
+         lambda s: get_unique_value(s, action_if_not_unique="error")
+     ).astype(int)
+     survival = pd.merge(leth_long, n, on=id_columns, how="left")
+     mortality = survival.groupby(id_columns)["lethality"].cumsum()
+     survival["survival"] = survival["n"] - mortality
+     survival_wide = long_to_wide(survival, id_columns, f"time [{observation_schedule}]", "survival")
+     excel_writer(survival_wide, file=processed_file, sheet="survival")
+
+     # Calculate the number of present organisms just after censoring
+     # n_observed_after_censoring = survival_wide.copy()
+     # n_observed_after_censoring[survival_wide.columns] = np.vstack([
+     #     survival_wide.iloc[0].values,
+     #     survival_wide.iloc[:-1].values - censored.iloc[1:].values
+     # ])
+     # excel_writer(n_observed_after_censoring, file=processed_file,
+     #              sheet="n_observed_after_censoring")
+
+     # Calculate the number of organisms alive after the last observation
+     n_observed_after_last_observation = survival_wide.copy()
+     n_observed_after_last_observation[survival_wide.columns] = np.vstack([
+         np.full_like(survival_wide.iloc[0].values, np.nan, dtype=float),
+         survival_wide.iloc[:-1].values
+     ])
+     excel_writer(n_observed_after_last_observation, file=processed_file,
+                  sheet="n_observed_after_last_observation")
+
+
+     data.columns = standardize_column_names(data.columns)
+     data_minimal = data[list(DEFAULT_COLUMNS.keys()) + intervention_columns]
+     excel_writer(data_minimal.set_index("id_subject"), file=processed_file,
+                  sheet="time_of_death")
+
+     # carry over metadata from the input file, if present
+     if "meta" in pd.ExcelFile(file).sheet_names:
+         excel_writer(
+             df=pd.read_excel(file, sheet_name="meta").set_index("Metadata"),
+             file=processed_file,
+             sheet="meta"
+         )
+     elif "Info" in pd.ExcelFile(file).sheet_names:
+         metadata = pd.read_excel(io=file, sheet_name="Info")
+         metadata = metadata.set_index("Experiment information")
+         metadata.columns = ["value", "description"]
+         metadata.loc["interventions", "value"] = ", ".join(intervention_columns)
+         metadata.loc["observations", "value"] = ", ".join(["survival", "censored"])
+         excel_writer(metadata, file=processed_file, sheet="meta")
+     else:
+         warnings.warn("No metadata found in sheets 'meta' or 'Info'.")
+
+
+ @click.command()
+ @click.option("--file", "-f", help="Path to the xlsx file")
+ @click.option("--sheet", "-s", help="Name of the excel sheet")
+ @click.option("--out", "-o", help="Output directory", default="processed_data")
+ @click.option("--observation_schedule", help="Schedule of the observations: d - daily, h - hourly", default="d")
+ @click.option("--intervention_columns", "-c", multiple=True, type=str, help="Names of the columns that carry the exposure information")
+ def time_of_death_to_openguts(file, sheet, out, observation_schedule, intervention_columns):
+     main(
+         file=file,
+         sheet=sheet,
+         out=out,
+         intervention_columns=intervention_columns,
+         observation_schedule=observation_schedule
+     )
+
+
+ if __name__ == "__main__":
+
+     if os.path.basename(os.getcwd()) != "data":
+         os.chdir("case_studies/tktd-osmia/data")
+         # call the underlying function with test arguments
+         ctx = click.Context(time_of_death_to_openguts)
+         ctx.forward(
+             time_of_death_to_openguts,
+             file="test/template_time_of_death.xlsx",
+             sheet="time-of-death",
+             intervention_columns=["Substance_A", "Substance B"],
+         )
+     else:
+         time_of_death_to_openguts()
@@ -0,0 +1,8 @@
+ import warnings
+
+ def datalad_locked_file_warning(file):
+     warnings.warn(
+         f"The file '{file}' does not have write access. "
+         "To unlock the file, use DataLad with the command: "
+         f"datalad unlock '{file}'."
+     )