glucose360 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glucose360/__init__.py +1 -0
- glucose360/events.py +680 -0
- glucose360/features.py +1042 -0
- glucose360/plots.py +494 -0
- glucose360/preprocessing.py +558 -0
- glucose360-0.0.1.dist-info/LICENSE +674 -0
- glucose360-0.0.1.dist-info/METADATA +34 -0
- glucose360-0.0.1.dist-info/RECORD +10 -0
- glucose360-0.0.1.dist-info/WHEEL +5 -0
- glucose360-0.0.1.dist-info/top_level.txt +1 -0
glucose360/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Submodules exported by ``from glucose360 import *``.
__all__ = ["preprocessing", "features", "events", "plots"]
|
glucose360/events.py
ADDED
@@ -0,0 +1,680 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import numpy as np
|
3
|
+
from scipy.integrate import trapezoid
|
4
|
+
import configparser
|
5
|
+
import glob, os, zipfile, tempfile
|
6
|
+
import math
|
7
|
+
|
8
|
+
# Resolve config.ini relative to this module's own location rather than the
# caller's working directory.
dir_path = os.path.dirname(os.path.realpath(__file__))
config_path = os.path.join(dir_path, "config.ini")
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() ignores files it cannot open; if config.ini is
# missing, the 'variables' lookups below raise KeyError at import time.
config.read(config_path)
# Column names shared by every function in this module, taken from the
# [variables] section of the config file.
ID = config['variables']['id']
GLUCOSE = config['variables']['glucose']
TIME = config['variables']['time']
BEFORE = config['variables']['before']
AFTER = config['variables']['after']
TYPE = config['variables']['type']
DESCRIPTION = config['variables']['description']
|
19
|
+
|
20
|
+
def import_events(
    path: str,
    id: str,
    name: str = None,
    day_col: str = "Day",
    time_col: str = "Time",
    before: int = 60,
    after: int = 60,
    type: str = "imported event"
) -> pd.DataFrame:
    """Bulk imports events from standalone .csv files or from those within a given directory or .zip file

    :param path: the path of the directory/zip/csv to import from
    :type path: str
    :param id: the identification of the patient that the imported events belong to
    :type id: str
    :param name: the name of the folder inside the .zip archive that holds the .csv files,
        defaults to the archive's file name up to its first '.' (only used for .zip imports)
    :type name: str, optional
    :param day_col: the name of the column specifying the day the event occurred (year, month, and specific day), defaults to 'Day'
    :type day_col: str, optional
    :param time_col: the name of the column specifying what time during the day the event occurred, defaults to 'Time'
    :type time_col: str, optional
    :param before: the amount of minutes to also look at before the event timestamp, defaults to 60
    :type before: int, optional
    :param after: the amount of minutes to also look at after the event timestamp, defaults to 60
    :type after: int, optional
    :param type: the type of event to classify all the imported events as, defaults to 'imported event'
    :type type: str, optional
    :raises ValueError: if the path has no extension and is not a directory, if the extension is
        neither .csv nor .zip, or if the .csv/.zip file does not exist
    :return: a Pandas DataFrame containing all the imported events
    :rtype: 'pandas.DataFrame'
    """
    ext = os.path.splitext(path)[1].lower()

    # a path without an extension is treated as a directory
    if ext == "":
        if not os.path.isdir(path):
            raise ValueError("Directory does not exist")
        return import_events_directory(path, id, day_col, time_col, before, after, type)

    # anything else has to be a .csv or .zip file that exists
    if ext not in (".csv", ".zip"):
        raise ValueError("Invalid file type")
    if not os.path.isfile(path):
        raise ValueError("File does not exist")

    if ext == ".csv":
        return import_events_csv(path, id, day_col, time_col, before, after, type)

    # otherwise has to be a .zip file
    with zipfile.ZipFile(path, 'r') as zip_ref:
        # extract into a temporary directory that is removed once the import is done
        with tempfile.TemporaryDirectory() as temp_dir:
            zip_ref.extractall(temp_dir)
            # os.path.basename / os.path.join keep this working with any path separator,
            # unlike splitting and concatenating on a literal "/"
            folder = name or os.path.basename(path).split(".")[0]
            return import_events_directory(os.path.join(temp_dir, folder), id, day_col, time_col, before, after, type)
|
76
|
+
|
77
|
+
def import_events_directory(
    path: str,
    id: str,
    day_col: str = "Day",
    time_col: str = "Time",
    before: int = 60,
    after: int = 60,
    type: str = "imported event"
) -> pd.DataFrame:
    """Bulk imports events from .csv files within a given directory

    :param path: the path of the directory to import from
    :type path: str
    :param id: the identification of the patient that the imported events belong to
    :type id: str
    :param day_col: the name of the column specifying the day the event occurred (year, month, and specific day), defaults to 'Day'
    :type day_col: str, optional
    :param time_col: the name of the column specifying what time during the day the event occurred, defaults to 'Time'
    :type time_col: str, optional
    :param before: the amount of minutes to also look at before the event timestamp, defaults to 60
    :type before: int, optional
    :param after: the amount of minutes to also look at after the event timestamp, defaults to 60
    :type after: int, optional
    :param type: the type of event to classify all the imported events as, defaults to 'imported event'
    :type type: str, optional
    :raises FileNotFoundError: if the directory contains no .csv files
    :return: a Pandas DataFrame containing all the imported events
    :rtype: 'pandas.DataFrame'
    """
    # escape the directory part so characters such as '[' in its name are not
    # interpreted as glob wildcards; sort so the row order is reproducible
    pattern = os.path.join(glob.escape(path), "*.csv")
    csv_files = sorted(glob.glob(pattern))

    if not csv_files:
        # FileNotFoundError is still an Exception, so existing handlers keep working
        raise FileNotFoundError("No CSV files found.")

    return pd.concat(
        [import_events_csv(file, id, day_col, time_col, before, after, type) for file in csv_files]
    )
|
111
|
+
|
112
|
+
def import_events_csv(
    path: str,
    id: str,
    day_col: str = "Day",
    time_col: str = "Time",
    before: int = 60,
    after: int = 60,
    type: str = "imported event"
) -> pd.DataFrame:
    """Bulk imports events from a single .csv file

    :param path: the path of the .csv file to import from
    :type path: str
    :param id: the identification of the patient that the imported events belong to
    :type id: str
    :param day_col: the name of the column specifying the day the event occurred (year, month, and specific day), defaults to 'Day'
    :type day_col: str, optional
    :param time_col: the name of the column specifying what time during the day the event occurred, defaults to 'Time'
    :type time_col: str, optional
    :param before: the amount of minutes to also look at before the event timestamp, defaults to 60
    :type before: int, optional
    :param after: the amount of minutes to also look at after the event timestamp, defaults to 60
    :type after: int, optional
    :param type: the type of event to classify all the imported events as, defaults to 'imported event'
    :type type: str, optional
    :return: a Pandas DataFrame containing all the imported events (rows without a timestamp are dropped)
    :rtype: 'pandas.DataFrame'
    """
    df = pd.read_csv(path)
    # use only the file's base name: the full path may point into a temporary
    # extraction directory, which would leak a random path into the descriptions
    csv_name = os.path.splitext(os.path.basename(path))[0]

    events = pd.DataFrame()
    events[TIME] = pd.to_datetime(df[day_col] + " " + df[time_col])
    events[BEFORE] = before
    events[AFTER] = after
    events[TYPE] = type
    if "Food Name" in df.columns:
        events[DESCRIPTION] = df["Food Name"]
    else:
        # number the events starting at 1; note the space before "from"
        events[DESCRIPTION] = "imported event #" + (events.index + 1).astype(str) + f" from {csv_name}"
    events.insert(0, ID, id)

    return events.dropna(subset=[TIME])
|
152
|
+
|
153
|
+
def _episodes_helper(
    df: pd.DataFrame,
    id: str,
    type: str,
    threshold: int,
    level: int,
    min_length: int,
    end_length: int
) -> pd.DataFrame:
    """Retrieves all episodes of a specific type/level for a specific patient within the given CGM data

    :param df: Pandas DataFrame containing preprocessed CGM data
    :type df: pandas.DataFrame
    :param id: identification of the patient to retrieve episodes for
    :type id: str
    :param type: type of episode; 'hypo' selects readings at or below the threshold, any other value selects readings at or above it
    :type type: str
    :param threshold: threshold (in mg/dL) above/below which glucose values are considered as part of an episode
    :type threshold: int
    :param level: the level the retrieved episodes are (0, 1, or 2); only used to label the output
    :type level: int
    :param min_length: minimum duration (in minutes) required for an episode to be reported
    :type min_length: int
    :param end_length: amount of time (in minutes) after an episode during which every reading must be
                       back on the other side of the threshold; otherwise the episode is merged with the next one
    :type end_length: int
    :return: a Pandas DataFrame with one row per episode (ID, TIME, BEFORE, AFTER, TYPE, DESCRIPTION columns);
             an empty, column-less DataFrame if no episode is found
    :rtype: pandas.DataFrame
    """

    # NOTE(review): this reads 'config.ini' relative to the current working directory,
    # unlike the module-level read of config_path - confirm which file is intended.
    config.read('config.ini')
    interval = int(config["variables"]["interval"])
    # converts a Timedelta into minutes
    timegap = lambda timedelta: timedelta.total_seconds() / 60
    episodes = pd.DataFrame()

    # positional copy of the whole trace, used to look at the readings that follow an episode
    data = df.copy(); data.reset_index(drop=True, inplace=True)
    # only the readings on the episode side of the threshold
    episode_df = df[(df[GLUCOSE] <= threshold)].copy() if type == "hypo" else df[df[GLUCOSE] >= threshold].copy()
    episode_df.reset_index(drop=True, inplace=True)
    # minutes since the previous qualifying reading (NaN for the first one)
    episode_df["gap"] = episode_df[TIME].diff().apply(timegap)

    # a run of qualifying readings starts wherever the gap is not exactly one interval
    edges = episode_df.index[episode_df["gap"] != interval].to_list()
    # sentinel so the final run ends at the last row (position -1)
    edges.append(-1)

    get = lambda loc, col: episode_df.iloc[loc][col]
    index = 0
    while index < len(edges) - 1:
        # the final run ends on the sentinel itself; every other run ends one row before the next edge
        offset = 0 if (index == len(edges) - 2) else 1
        end_i = edges[index + 1] - offset # position of the last reading of the episode (inclusive)
        start_i = edges[index] # position of the first reading of the episode
        start_time = get(start_i, TIME)
        end_time = get(end_i, TIME)
        episode_length = timegap(end_time - start_time)

        if episode_length >= min_length: # long enough to count as an episode
            if offset != 0: # not the very last episode
                # number of readings that make up the end_length window
                end_counts = math.ceil(end_length / interval)

                end_index = data.index[data[TIME] == end_time].to_list()[0]
                end_data = data.iloc[end_index + 1 : end_index + 1 + end_counts][GLUCOSE]
                outside_threshold = np.where(end_data >= threshold, True, False) if type == "hypo" else np.where(end_data <= threshold, True, False)
                if False in outside_threshold: # glucose re-crosses the threshold within the end window
                    edges.pop(index + 1) # merge this run with the next one and re-evaluate the same index
                    continue

            description = f"{start_time} to {end_time} level {level} {type}glycemic episode"
            event = pd.DataFrame.from_records([{ID: id, TIME: start_time, BEFORE: 0, AFTER: episode_length,
                                                TYPE: f"{type} level {level} episode", DESCRIPTION: description}])
            episodes = pd.concat([episodes, event])

        index += 1

    return episodes
|
225
|
+
|
226
|
+
def get_episodes(
    df: pd.DataFrame,
    hypo_lvl2: int = 54,
    hypo_lvl1: int = 70,
    hyper_lvl0: int = 140,
    hyper_lvl1: int = 180,
    hyper_lvl2: int = 250,
    min_length: int = 15,
    end_length: int = 15
) -> pd.DataFrame:
    """Retrieves all episodes within the given CGM data

    :param df: Pandas DataFrame containing preprocessed CGM data
    :type df: pandas.DataFrame
    :param hypo_lvl2: threshold (in mg/dL) below which glucose values are considered level 2 hypoglycemic, defaults to 54
    :type hypo_lvl2: int, optional
    :param hypo_lvl1: threshold (in mg/dL) below which glucose values are considered level 1 hypoglycemic, defaults to 70
    :type hypo_lvl1: int, optional
    :param hyper_lvl0: threshold (in mg/dL) above which glucose values are considered level 0 hyperglycemic, defaults to 140
    :type hyper_lvl0: int, optional
    :param hyper_lvl1: threshold (in mg/dL) above which glucose values are considered level 1 hyperglycemic, defaults to 180
    :type hyper_lvl1: int, optional
    :param hyper_lvl2: threshold (in mg/dL) above which glucose values are considered level 2 hyperglycemic, defaults to 250
    :type hyper_lvl2: int, optional
    :param min_length: minimum duration (in minutes) required for excursions, defaults to 15
    :type min_length: int, optional
    :param end_length: minimum amount of time (in minutes) that the glucose values must be within typical ranges
                       at the end of an excursion, defaults to 15
    :type end_length: int, optional
    :return: a Pandas DataFrame containing all episodes within the given CGM data, sorted by time within each patient;
             an empty DataFrame if there are none
    :rtype: pandas.DataFrame
    """
    # (episode type, threshold, level) combinations, in the order they are reported
    searches = [
        ("hyper", hyper_lvl0, 0),
        ("hyper", hyper_lvl1, 1),
        ("hyper", hyper_lvl2, 2),
        ("hypo", hypo_lvl1, 1),
        ("hypo", hypo_lvl2, 2),
    ]

    per_patient = []
    for id, data in df.groupby(ID):
        episodes = pd.concat([
            _episodes_helper(data, id, kind, threshold, level, min_length, end_length)
            for kind, threshold, level in searches
        ])

        # a patient without any episode yields a frame with no columns at all;
        # sorting that by TIME would raise a KeyError, so skip it
        if episodes.empty:
            continue

        episodes.sort_values(by=[TIME], inplace=True)
        per_patient.append(episodes)

    return pd.concat(per_patient) if per_patient else pd.DataFrame()
|
270
|
+
|
271
|
+
def get_excursions(
    df: pd.DataFrame,
    z: int = 2,
    min_length: int = 15,
    end_length: int = 15
) -> pd.DataFrame:
    """Retrieves all excursions within the given CGM data

    An excursion is a run of consecutive readings lying at least 'z' standard deviations
    above or below the patient's mean glucose. Each reported event is anchored at the
    most extreme reading of the run.

    :param df: Pandas DataFrame containing preprocessed CGM data
    :type df: pandas.DataFrame
    :param z: the number of standard deviations away from the mean that should define an 'excursion', defaults to 2
    :type z: int, optional
    :param min_length: minimum duration (in minutes) required for excursions, defaults to 15
    :type min_length: int, optional
    :param end_length: minimum amount of time (in minutes) that the glucose values must be within typical ranges
                       at the end of an excursion, defaults to 15
    :type end_length: int, optional
    :return: a Pandas DataFrame containing all excursions within the given CGM data
             (an empty, column-less DataFrame if there are none)
    :rtype: pandas.DataFrame
    """

    excursions = pd.DataFrame()

    # NOTE(review): this reads 'config.ini' relative to the current working directory,
    # unlike the module-level read of config_path - confirm which file is intended.
    config.read('config.ini')
    interval = int(config["variables"]["interval"])
    for id, data in df.groupby(ID):
        # positional index so labels and positions coincide below
        data.reset_index(drop=True, inplace=True)
        sd = data[GLUCOSE].std()
        mean = data[GLUCOSE].mean()
        upper = mean + (z * sd)
        lower = mean - (z * sd)

        # timestamps of strict local maxima / minima of the glucose trace
        peaks = data[(data[GLUCOSE].shift(1) < data[GLUCOSE]) & (data[GLUCOSE].shift(-1) < data[GLUCOSE])][TIME].copy()
        peaks.reset_index(drop=True, inplace=True)
        nadirs = data[(data[GLUCOSE].shift(1) > data[GLUCOSE]) & (data[GLUCOSE].shift(-1) > data[GLUCOSE])][TIME].copy()
        nadirs.reset_index(drop=True, inplace=True)

        # readings outside the [lower, upper] band (both high and low ones together)
        outliers = data[(data[GLUCOSE] >= upper) | (data[GLUCOSE] <= lower)].copy()
        outliers.reset_index(drop=True, inplace=True)

        # minutes between consecutive outlier readings (NaN for the first one)
        timegap = lambda timedelta: timedelta.total_seconds() / 60
        outliers["gaps"] = outliers[TIME].diff().apply(timegap)

        # a run starts wherever the gap is not exactly one interval; -1 is a sentinel for the last row
        edges = outliers.index[outliers["gaps"] != interval].to_list()
        edges.append(-1)
        i = 0
        while i < len(edges) - 1:
            # the direction of the run is decided by its first reading
            type = "hyper" if outliers.iloc[edges[i]][GLUCOSE] > mean else "hypo"
            # the final run ends on the sentinel itself; every other run ends one row before the next edge
            offset = 0 if i == len(edges) - 2 else 1
            start_time = outliers.iloc[edges[i]][TIME]
            start_index = data.index[data[TIME] == start_time].to_list()[0]
            end_time = outliers.iloc[edges[i+1] - offset][TIME]
            end_index = data.index[data[TIME] == end_time].to_list()[0]

            excursion_length = timegap(end_time - start_time)
            if excursion_length >= min_length:
                if offset != 0: # not the very last excursion
                    # number of readings that make up the end_length window
                    end_counts = math.ceil(end_length / interval)

                    last_index = data.reset_index().index[data[TIME] == end_time].to_list()[0]
                    last_data = data.iloc[last_index + 1 : last_index + 1 + end_counts][GLUCOSE]
                    outside_threshold = np.where(last_data <= upper if type == "hyper" else last_data >= lower, True, False)
                    if False in outside_threshold: # glucose leaves the band again within the end window
                        edges.pop(i + 1) # merge this run with the next one and re-evaluate the same index
                        continue

                # temporarily index by time so idxmax/idxmin return the timestamp of the extreme reading
                outliers.set_index(TIME, inplace=True)
                # slice end is exclusive; None means "through the last row" for the final run
                last_point = edges[i+1] if offset != 0 else None
                timestamp = outliers.iloc[edges[i]:last_point][GLUCOSE].idxmax() if type == "hyper" else outliers.iloc[edges[i]:last_point][GLUCOSE].idxmin()
                outliers.reset_index(inplace=True)

                # widen the window to the nearest opposite turning point on each side, when one exists:
                # peaks for a hypo excursion, nadirs for a hyper excursion
                extrema = peaks if type == "hypo" else nadirs
                if start_index != 0:
                    if not extrema[extrema <= start_time].empty: start_time = extrema[extrema <= start_time].iloc[-1]
                if end_index != data.shape[0] - 1:
                    if not extrema[extrema >= end_time].empty: end_time = extrema[extrema >= end_time].iloc[0]

                description = f"{start_time} to {end_time} {type}glycemic excursion"
                # BEFORE/AFTER are the minutes from the extreme reading back to the start / forward to the end
                event = pd.DataFrame.from_records([{ID: id, TIME: timestamp, BEFORE: timegap(timestamp - start_time),
                                                    AFTER: timegap(end_time - timestamp),
                                                    TYPE: f"{type} excursion", DESCRIPTION: description}])
                excursions = pd.concat([excursions, event])

            i += 1

    return excursions
|
358
|
+
|
359
|
+
def get_curated_events(df: pd.DataFrame) -> pd.DataFrame:
    """Collects the episodes and excursions (with their default settings) of every patient in 'df'

    :param df: a Pandas DataFrame containing preprocessed CGM data
    :type df: 'pandas.DataFrame'
    :return: a Pandas DataFrame in the package's event structure, holding all episodes followed by all excursions
    :rtype: 'pandas.DataFrame'
    """
    episodes = get_episodes(df)
    excursions = get_excursions(df)
    return pd.concat([episodes, excursions])
|
368
|
+
|
369
|
+
def retrieve_event_data(
    df: pd.DataFrame,
    events: pd.DataFrame,
) -> pd.DataFrame:
    """Returns the patient data that falls within the time window of each given event

    :param df: a Pandas DataFrame (indexed by patient id) containing the preprocessed CGM traces to retrieve event subsets from
    :type df: 'pandas.DataFrame'
    :param events: the events to retrieve data for: either a Pandas DataFrame with one event per row, or a
                   single event as a Pandas Series. Each event provides an id, a datetime, the number of
                   minutes before the datetime to include, the number of minutes after to include, and a description
    :type events: 'pandas.DataFrame' or 'pandas.Series'
    :return: a Pandas DataFrame with the rows of 'df' that fall inside each event's window, with the patient id
             and the event's description added as columns; empty if no event matches a patient in 'df'
    :rtype: 'pandas.DataFrame'
    """
    # a single event (Series) is turned into a one-row frame so both input forms share one code path
    if isinstance(events, pd.Series):
        events = events.to_frame().T

    segments = []
    for _, row in events.iterrows():
        id = row[ID]
        if id not in df.index:
            continue

        datetime = pd.Timestamp(row[TIME])
        initial = datetime - pd.Timedelta(row[BEFORE], "m")
        final = datetime + pd.Timedelta(row[AFTER], "m")

        patient_data = df.loc[id]
        data = patient_data[(patient_data[TIME] >= initial) & (patient_data[TIME] <= final)].copy()

        data[ID] = id
        data[DESCRIPTION] = row[DESCRIPTION]
        segments.append(data)

    return pd.concat(segments) if segments else pd.DataFrame()
|
402
|
+
|
403
|
+
def event_summary(events: pd.DataFrame) -> pd.Series:
    """Counts how many events of each type are present in 'events'

    :param events: a Pandas DataFrame containing events (as per package guidelines)
    :type events: 'pandas.DataFrame'
    :return: a Pandas Series mapping each event type found within 'events' to its number of occurrences
    :rtype: 'pandas.Series'
    """
    event_types = events[TYPE]
    return event_types.value_counts()
|
412
|
+
|
413
|
+
def AUC(df: pd.DataFrame) -> float:
    """Computes the total Area-Under-Curve (AUC) of the given CGM trace with the trapezoidal rule

    Readings are assumed to be evenly spaced by the 'interval' value from the configuration.

    :param df: a Pandas DataFrame containing the CGM trace to calculate the AUC of
    :type df: 'pandas.DataFrame'
    :return: the AUC of the given CGM trace
    :rtype: float
    """
    # refresh the configuration before reading the sampling interval
    config.read('config.ini')
    spacing = int(config["variables"]["interval"])
    glucose_values = df[GLUCOSE]
    return trapezoid(glucose_values, dx=spacing)
|
424
|
+
|
425
|
+
def iAUC(df: pd.DataFrame, level: float) -> float:
    """Calculates the incremental Area-Under-Curve (iAUC) for the given CGM trace

    The area is computed from the absolute deviation of each glucose value from 'level',
    so readings below 'level' contribute positively as well.

    NOTE(review): the original code clipped negative values to 0 *after* taking the absolute
    value, which could never trigger. If only the area above 'level' is wanted, the abs()
    should be replaced by a clip at 0 - confirm the intended definition.

    :param df: a Pandas DataFrame containing the CGM trace to calculate the iAUC of
    :type df: 'pandas.DataFrame'
    :param level: the reference glucose level that deviations are measured from
    :type level: float
    :return: the iAUC of the given CGM trace
    :rtype: float
    """
    data = df.copy()  # leave the caller's frame untouched
    data[GLUCOSE] = abs(data[GLUCOSE] - level)
    return AUC(data)
|
439
|
+
|
440
|
+
def baseline(df: pd.DataFrame) -> float:
    """Returns the baseline glucose level (the first glucose value) for the given CGM trace

    :param df: a Pandas DataFrame containing the CGM trace to retrieve the baseline glucose level for
    :type df: 'pandas.DataFrame'
    :return: the baseline glucose level of the given CGM trace, or NaN if the trace is empty
    :rtype: float
    """
    glucose = df[GLUCOSE]
    # an empty trace has no first value; return NaN instead of raising IndexError
    if glucose.empty:
        return np.nan
    return glucose.iloc[0]
|
449
|
+
|
450
|
+
def peak(df: pd.DataFrame) -> float:
    """Finds the highest glucose value in the given CGM trace

    :param df: a Pandas DataFrame containing the CGM trace to retrieve the maximum glucose level for
    :type df: 'pandas.DataFrame'
    :return: the maximum glucose level of the given CGM trace
    :rtype: float
    """
    glucose_values = df[GLUCOSE]
    return np.max(glucose_values)
|
459
|
+
|
460
|
+
def nadir(df: pd.DataFrame) -> float:
    """Finds the lowest glucose value in the given CGM trace

    :param df: a Pandas DataFrame containing the CGM trace to retrieve the minimum glucose level for
    :type df: 'pandas.DataFrame'
    :return: the minimum glucose level of the given CGM trace
    :rtype: float
    """
    glucose_values = df[GLUCOSE]
    return np.min(glucose_values)
|
469
|
+
|
470
|
+
def delta(df: pd.DataFrame) -> float:
    """Computes the absolute difference between the maximum and the baseline glucose level of the given CGM trace

    :param df: a Pandas DataFrame containing the CGM trace to retrieve the delta for
    :type df: 'pandas.DataFrame'
    :return: the delta of the given CGM trace
    :rtype: float
    """
    highest = peak(df)
    starting = baseline(df)
    return abs(highest - starting)
|
479
|
+
|
480
|
+
def post_event_glucoses(data: pd.DataFrame, event_time: pd.Timestamp, times: list[int], glucose_col: str = GLUCOSE) -> dict:
    """
    Returns the glucose values closest to the specified times (in minutes) after the given event_time.

    :param data: Pandas DataFrame containing the CGM data
    :type data: pd.DataFrame
    :param event_time: The time of the event
    :type event_time: pd.Timestamp
    :param times: The numbers of minutes after event_time for which to find the glucose values
                  (any iterable of integers; 0 is always added if missing)
    :type times: list[int]
    :param glucose_col: The name of the glucose column in the data, defaults to GLUCOSE
    :type glucose_col: str, optional
    :return: A dictionary where keys are strings like "X-min Post Event" and values are the corresponding glucose
             readings, or np.nan if the requested time lies outside the time range of the data
    :rtype: dict
    """
    # Always include 0-min to have a reference point; copy so the caller's sequence is never modified
    offsets = list(times)
    if 0 not in offsets:
        offsets.insert(0, 0)

    result = {f"{t}-min Post Event": np.nan for t in offsets}
    if data.empty:
        return result

    timestamps = data[TIME]
    # the covered time range does not change between offsets, so compute it once
    first_time, last_time = timestamps.min(), timestamps.max()

    for t in offsets:
        post_time = event_time + pd.Timedelta(minutes=t)

        # Only look up a value if the desired time is within the range of the data
        if first_time <= post_time <= last_time:
            # positional lookup: a label-based lookup returns several rows when the
            # index is not unique (e.g. every row labelled with the same patient id)
            closest_pos = (timestamps - post_time).abs().argmin()
            result[f"{t}-min Post Event"] = data[glucose_col].iloc[closest_pos]

    return result
|
512
|
+
|
513
|
+
def post_event_aucs(data: pd.DataFrame, event_time: pd.Timestamp, durations: list[int], glucose_col: str = GLUCOSE) -> dict:
    """
    Computes the AUC of the data between event_time and event_time + d minutes, for each duration d.

    :param data: Pandas DataFrame containing the CGM data
    :type data: pd.DataFrame
    :param event_time: The time of the event (start of every window)
    :type event_time: pd.Timestamp
    :param durations: A list of integers representing the number of minutes after event_time for which to calculate the AUC
    :type durations: list[int]
    :param glucose_col: The name of the glucose column in the data, defaults to GLUCOSE (not used by this function)
    :type glucose_col: str, optional
    :return: A dictionary where keys are strings like "X-min AUC" and values are the corresponding AUC readings or np.nan if no data is available
    :rtype: dict
    """
    result = {}
    for d in durations:
        window_end = event_time + pd.Timedelta(minutes=d)
        in_window = (data[TIME] >= event_time) & (data[TIME] <= window_end)
        subset = data[in_window].copy()
        # no readings inside the window -> no AUC can be computed
        result[f"{d}-min AUC"] = np.nan if subset.empty else AUC(subset)
    return result
|
539
|
+
|
540
|
+
def event_metrics(
    df: pd.DataFrame,
    event: pd.Series,
    post_times: list[int] = [60, 120],
    post_auc_times: list[int] = [120]
) -> pd.DataFrame:
    """Computes the basic metrics of a single event: baseline, peak, delta, AUC, iAUC,
    post event glucose values (always including 0 minutes), their change relative to
    the 0-minute value, and post event AUCs

    :param df: Pandas DataFrame containing preprocessed CGM data
    :type df: pandas.DataFrame
    :param event: Pandas Series with fields that represent an 'event'
    :type event: pandas.Series
    :param post_times: the numbers of minutes after the event time for which to report glucose values, defaults to [60, 120]
    :type post_times: list[int], optional
    :param post_auc_times: the numbers of minutes after the event time over which to report the AUC, defaults to [120]
    :type post_auc_times: list[int], optional
    :return: single-row Pandas DataFrame containing the basic metrics for the given event
    :rtype: pandas.DataFrame
    """
    patient_id = event[ID]

    # time window covered by the event
    event_time = pd.Timestamp(event[TIME])
    window_start = event_time - pd.Timedelta(event[BEFORE], "m")
    window_end = event_time + pd.Timedelta(event[AFTER], "m")

    trace = df.loc[patient_id]
    in_window = (trace[TIME] >= window_start) & (trace[TIME] <= window_end)
    data = trace[in_window].copy()

    metrics = pd.Series()
    metrics["Baseline"] = baseline(data)
    metrics["Peak"] = peak(data)
    metrics["Delta"] = delta(data)
    metrics["AUC"] = AUC(data)
    metrics["iAUC"] = iAUC(data, baseline(data))

    # glucose values at the requested offsets (the 0-min value is always present)
    for label, value in post_event_glucoses(data, event_time, post_times, GLUCOSE).items():
        metrics[label] = value

    # change of each post event value relative to the 0-min value
    reference = metrics["0-min Post Event"]
    for minutes in post_times:
        if minutes == 0:
            continue
        post_key = f"{minutes}-min Post Event"
        delta_key = f"{minutes}-min Delta"
        usable = post_key in metrics and not np.isnan(metrics[post_key]) and not np.isnan(reference)
        metrics[delta_key] = metrics[post_key] - reference if usable else np.nan

    for label, value in post_event_aucs(data, event_time, post_auc_times, GLUCOSE).items():
        metrics[label] = value

    return metrics.to_frame().T
|
596
|
+
|
597
|
+
def create_event_features(
    df: pd.DataFrame,
    events: pd.DataFrame,
) -> pd.DataFrame:
    """Computes aggregate event metrics for every patient in 'df', grouped by event type

    :param df: a Pandas DataFrame (indexed by patient id) containing all the relevant patient CGM data to generate event metrics for
    :type df: 'pandas.DataFrame'
    :param events: a single indexed Pandas DataFrame, with each row specifying a single event in the form of
                   an id, a datetime, # of minutes before the datetime to include, # of minutes after to include,
                   a type, and a description
    :type events: 'pandas.DataFrame'
    :return: a Pandas DataFrame with one row per patient and one column per event feature
    :rtype: 'pandas.DataFrame'
    """
    features_by_patient = {}
    for patient_id in df.index.unique():
        patient_events = events[events[ID] == patient_id]
        patient_features = {}
        # one set of features per event type present for this patient
        for event_type, typed_events in patient_events.groupby(TYPE):
            type_features = create_event_features_helper(df.loc[patient_id], typed_events, event_type)
            patient_features.update(type_features)
        features_by_patient[patient_id] = patient_features

    return pd.DataFrame(features_by_patient).T
|
618
|
+
|
619
|
+
def create_event_features_helper(
    df: pd.DataFrame,
    sub_events: pd.DataFrame,
    type: str,
) -> dict[str, float]:
    """Calculates aggregate event-based metrics for a single patient and type of event. Helper method for 'create_event_features()'.

    :param df: Pandas DataFrame containing the CGM trace for a single patient
    :type df: 'pandas.DataFrame'
    :param sub_events: Pandas DataFrame containing events of only one type solely for the patient whose CGM trace is also given
    :type sub_events: 'pandas.DataFrame'
    :param type: the type of event that 'sub_events' contains
    :type type: str
    :return: a dictionary with str-type keys that refer to the name of the calculated features and float-type values
             (NaN for a feature when no event had any CGM data)
    :rtype: dict[str, float]
    """
    duration_key = f"Mean {type} Duration"
    glucose_key = f"Mean Glucose During {type}s"
    up_slope_key = f"Mean Upwards Slope of {type}s (mg/dL per min)"
    down_slope_key = f"Mean Downwards Slope of {type}s (mg/dL per min)"
    min_key = f"Mean Minimum Glucose of {type}s"
    max_key = f"Mean Maximum Glucose of {type}s"
    amplitude_key = f"Mean Amplitude of {type}s"
    iauc_key = f"Mean iAUC of {type}s"

    # per-event values, averaged at the end (key order defines the output column order)
    features = {
        key: []
        for key in (duration_key, glucose_key, up_slope_key, down_slope_key,
                    min_key, max_key, amplitude_key, iauc_key)
    }

    for _, event in sub_events.iterrows():
        event_data = retrieve_event_data(df, event)
        # an event without any CGM reading in its window cannot contribute to the averages
        if event_data.empty:
            continue

        times = event_data[TIME]
        glucose = event_data[GLUCOSE]

        # the window spans BEFORE minutes ahead of the timestamp plus AFTER minutes past it
        features[duration_key].append(event[BEFORE] + event[AFTER])

        features[glucose_key].append(glucose.mean())
        # append (not assign) so that every event contributes to the mean
        features[min_key].append(nadir(event_data))
        features[max_key].append(peak(event_data))

        # positional lookups: label-based ones return several rows when the index
        # is not unique (e.g. every row labelled with the same patient id)
        event_time = event[TIME]
        event_glucose = glucose.iloc[(times - event_time).abs().argmin()]

        peak_glucose = peak(event_data)
        peak_time = times.iloc[glucose.argmax()]
        amplitude = peak_glucose - event_glucose
        features[amplitude_key].append(abs(amplitude))

        # slope from the event timestamp up to the peak (undefined if they coincide)
        time_diff_to_peak = (peak_time - event_time).total_seconds() / 60.0
        slope_to_peak = (peak_glucose - event_glucose) / time_diff_to_peak if time_diff_to_peak != 0 else np.nan
        features[up_slope_key].append(slope_to_peak)

        # slope from the peak down to the last reading of the window (undefined if they coincide)
        end_time = times.iloc[-1]
        end_glucose = glucose.iloc[-1]
        time_diff_peak_to_end = (end_time - peak_time).total_seconds() / 60.0
        slope_peak_to_end = (end_glucose - peak_glucose) / time_diff_peak_to_end if time_diff_peak_to_end != 0 else np.nan
        features[down_slope_key].append(slope_peak_to_end)

        features[iauc_key].append(iAUC(event_data, event_glucose))

    # NOTE(review): an undefined (NaN) slope makes the corresponding mean NaN - confirm whether np.nanmean is wanted
    features = {k: (np.mean(v) if v else np.nan) for k, v in features.items()}
    features[f"Mean # of {type}s per day"] = sub_events.shape[0] / len(df[TIME].dt.date.unique())
    return features
|