glucose360 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glucose360/__init__.py +1 -0
- glucose360/events.py +680 -0
- glucose360/features.py +1042 -0
- glucose360/plots.py +494 -0
- glucose360/preprocessing.py +558 -0
- glucose360-0.0.1.dist-info/LICENSE +674 -0
- glucose360-0.0.1.dist-info/METADATA +34 -0
- glucose360-0.0.1.dist-info/RECORD +10 -0
- glucose360-0.0.1.dist-info/WHEEL +5 -0
- glucose360-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,558 @@
|
|
1
|
+
import os, glob
|
2
|
+
import pandas as pd
|
3
|
+
import configparser
|
4
|
+
import zipfile, tempfile
|
5
|
+
|
6
|
+
# Locate the config.ini file that sits in the same directory as this module.
dir_path = os.path.dirname(os.path.realpath(__file__))
config_path = os.path.join(dir_path, "config.ini")

# Parse the configuration once, at import time. ConfigParser.read() silently
# ignores files it cannot open, in which case the lookups below raise KeyError.
config = configparser.ConfigParser()
config.read(config_path)

# Column names (from the [variables] section of the config) that this module
# uses for the identification, glucose value, and timestamp columns.
ID = config['variables']['id']
GLUCOSE = config['variables']['glucose']
TIME = config['variables']['time']

# globals for glucose values to replace "Low" and "High" with in the CGM data
LOW = 40
HIGH = 400
|
17
|
+
|
18
|
+
def import_data(
    path: str,
    name: str = None,
    sensor: str = "dexcom",
    id_template: str = None,
    glucose: str = None,
    time: str = None,
    interval: int = 5,
    max_gap: int = 45,
    output = print
) -> pd.DataFrame:
    """Returns a Multiindexed Pandas DataFrame containing all of the csv data found at the given path.
    The path can lead to a directory, .zip file, or a .csv file. The returned DataFrame holds columns
    for timestamps and glucose values, and is indexed by patient identifications

    :param path: the path of the directory/zip/csv to be parsed through
    :type path: str
    :param name: the name of the folder inside the .zip file that holds the .csv files, defaults to None
        (in which case the folder named after the .zip file is used, falling back to the root of the archive)
    :type name: str, optional
    :param sensor: the CGM device model used (either dexcom, freestyle libre pro, or freestyle libre 2 / freestyle libre 3), defaults to 'dexcom'
    :type sensor: str, optional
    :param id_template: regex dictating how to parse each CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type glucose: str, optional
    :param time: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type time: str, optional
    :param interval: the resampling interval (in minutes) that the data should follow, defaults to 5
    :type interval: int, optional
    :param max_gap: the maximum amount of minutes a gap in the data can be interpolated, defaults to 45
        (filling in a gap with a longer duration would be considered extrapolation)
    :type max_gap: int, optional
    :param output: callable that receives progress messages when a directory or .zip file is imported, defaults to print
    :type output: callable, optional
    :raises ValueError: if the path does not exist or does not point to a directory, .csv file, or .zip file
    :return: A Pandas DataFrame containing the preprocessed data found at the given path. This DataFrame holds columns for timestamps, glucose values, weekday/weekend chunking, and waking/sleeping time chunking.
    :rtype: pandas.DataFrame

    :Example:
    >>> path_to_data = "datasets/patient_data.csv"
    >>> df = import_data(path_to_data)
    """
    # update the config with the resampling interval the user chose
    # NOTE(review): this writes 'config.ini' into the current working directory,
    # not next to the module (config_path) -- confirm that this is intended.
    config["variables"]["interval"] = str(interval)
    with open('config.ini', 'w') as configfile:
        config.write(configfile)

    # an existing directory is accepted whatever its name looks like (e.g. "cohort.v2")
    if os.path.isdir(path):
        return _import_directory(path, sensor, id_template, glucose, time, interval, max_gap, output)

    # get file extension of where the given path points
    ext = os.path.splitext(path)[1].lower()

    # no extension means a directory was intended, but it is not there
    if ext == "":
        raise ValueError("Directory does not exist")

    # check if path leads to .zip or .csv
    if ext not in (".csv", ".zip"):
        raise ValueError("Invalid file type")
    if not os.path.isfile(path):
        raise ValueError("File does not exist")

    # path leads to .csv
    if ext == ".csv":
        return _import_csv(path, sensor, id_template, glucose, time, interval, max_gap)

    # otherwise has to be a .zip file; extract it into a temporary directory to pull from
    with zipfile.ZipFile(path, 'r') as zip_ref, tempfile.TemporaryDirectory() as temp_dir:
        zip_ref.extractall(temp_dir)

        if name:
            # an explicitly requested folder is used as-is
            extracted = os.path.join(temp_dir, name)
        else:
            # os.path handles both "/" and "\\" separators, unlike splitting on "/"
            stem = os.path.splitext(os.path.basename(path))[0]
            # try the full stem first, then the text before the first dot (the
            # historical behaviour), and finally the root of the archive
            extracted = temp_dir
            for candidate in (stem, stem.split(".")[0]):
                candidate_dir = os.path.join(temp_dir, candidate)
                if candidate and os.path.isdir(candidate_dir):
                    extracted = candidate_dir
                    break

        return _import_directory(extracted, sensor, id_template, glucose, time, interval, max_gap, output)
|
90
|
+
|
91
|
+
def _import_directory(
|
92
|
+
path: str,
|
93
|
+
sensor: str = "dexcom",
|
94
|
+
id_template: str = None,
|
95
|
+
glucose: str = None,
|
96
|
+
time: str = None,
|
97
|
+
interval: int = 5,
|
98
|
+
max_gap: int = 45,
|
99
|
+
output = print
|
100
|
+
) -> pd.DataFrame:
|
101
|
+
"""Returns a Multiindexed Pandas DataFrame containing all of the csv data found at the given path.
|
102
|
+
The path must lead to a directory containing .csv files. The returned DataFrame holds columns
|
103
|
+
for timestamps and glucose values, and is indexed by patient identifications
|
104
|
+
|
105
|
+
:param path: the path of the directory to be parsed through
|
106
|
+
:type path: str
|
107
|
+
:param sensor: the CGM device model used (either dexcom, freestyle libre pro, or freestyle libre 2 / freestyle libre 3), defaults to 'dexcom'
|
108
|
+
:type sensor: str, optional
|
109
|
+
:param id_template: regex dictating how to parse each CSV file's name for the proper patient identification, defaults to None
|
110
|
+
:type id_template: str, optional
|
111
|
+
:param glucose: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
|
112
|
+
:type glucose: str, optional
|
113
|
+
:param time: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
|
114
|
+
:type time: str, optional
|
115
|
+
:param interval: the resampling interval (in minutes) that the data should follow, defaults to 5
|
116
|
+
:type interval: int, optional
|
117
|
+
:param max_gap: the maximum amount of minutes a gap in the data can be interpolated, defaults to 45
|
118
|
+
(filling in a gap with a longer duration would be considered extrapolation)
|
119
|
+
:type max_gap: int, optional
|
120
|
+
:return: A Pandas DataFrame containing the preprocessed data found at the given path. This DataFrame holds columns for timestamps, glucose values, weekday/weekend chunking, and waking/sleeping time chunking.
|
121
|
+
:rtype: pandas.DataFrame
|
122
|
+
"""
|
123
|
+
csv_files = glob.glob(path + "/*.csv")
|
124
|
+
num_files = len(csv_files)
|
125
|
+
|
126
|
+
if num_files == 0:
|
127
|
+
raise Exception("No CSV files found.")
|
128
|
+
|
129
|
+
output(f"{num_files} .csv files were found in the specified directory.")
|
130
|
+
|
131
|
+
data: list[pd.DataFrame] = []
|
132
|
+
num_valid_files = num_files
|
133
|
+
for file in csv_files:
|
134
|
+
try:
|
135
|
+
data.append(_import_csv(file, sensor, id_template, glucose, time, interval, max_gap))
|
136
|
+
except:
|
137
|
+
num_valid_files -= 1
|
138
|
+
|
139
|
+
output(f"{num_valid_files} .csv files were successfully imported.")
|
140
|
+
|
141
|
+
if len(data) == 0: raise Exception("CSV files found, but none were valid.")
|
142
|
+
df = pd.concat(data)
|
143
|
+
|
144
|
+
output(f"{df.index.unique().size} sections were found in the imported data.")
|
145
|
+
|
146
|
+
return df
|
147
|
+
|
148
|
+
def _import_csv(
|
149
|
+
path: str,
|
150
|
+
sensor: str = "dexcom",
|
151
|
+
id_template: str = None,
|
152
|
+
glucose: str = None,
|
153
|
+
time: str = None,
|
154
|
+
interval: int = 5,
|
155
|
+
max_gap: int = 45
|
156
|
+
) -> pd.DataFrame:
|
157
|
+
"""Returns a Multiindexed Pandas DataFrame containing all of the csv data found at the given path.
|
158
|
+
The path must lead to a .csv file. The returned DataFrame holds columns
|
159
|
+
for timestamps and glucose values, and is indexed by patient identifications
|
160
|
+
|
161
|
+
:param path: the path of the csv file to be parsed through
|
162
|
+
:type path: str
|
163
|
+
:param sensor: the CGM device model used (either 'dexcom', 'freestyle libre pro', 'freestyle libre 2', 'freestyle libre 3', or 'columns'), defaults to 'dexcom'
|
164
|
+
:type sensor: str, optional
|
165
|
+
:param id_template: regex dictating how to parse the CSV file's name for the proper patient identification,
|
166
|
+
or the name of the patient identification column if using the 'columns' sensor, defaults to None
|
167
|
+
:type id_template: str, optional
|
168
|
+
:param glucose: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
|
169
|
+
:type glucose: str, optional
|
170
|
+
:param time: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
|
171
|
+
:type time: str, optional
|
172
|
+
:param interval: the resampling interval (in minutes) that the data should follow, defaults to 5
|
173
|
+
:type interval: int, optional
|
174
|
+
:param max_gap: the maximum amount of minutes a gap in the data can be interpolated, defaults to 45
|
175
|
+
(filling in a gap with a longer duration would be considered extrapolation)
|
176
|
+
:type max_gap: int, optional
|
177
|
+
:return: A Pandas DataFrame containing the preprocessed data found at the given path. This DataFrame holds columns for timestamps, glucose values, weekday/weekend chunking, and waking/sleeping time chunking.
|
178
|
+
:rtype: pandas.DataFrame
|
179
|
+
"""
|
180
|
+
data = pd.DataFrame()
|
181
|
+
if sensor == "dexcom":
|
182
|
+
data = _import_csv_dexcom(path, id_template, glucose, time)
|
183
|
+
elif sensor == "freestyle libre 2" or sensor == "freestyle libre 3":
|
184
|
+
data = _import_csv_freestyle_libre_23(path, id_template, glucose, time)
|
185
|
+
elif sensor == "freestyle libre pro":
|
186
|
+
data = _import_csv_freestyle_libre_pro(path, id_template, glucose, time)
|
187
|
+
elif sensor == "columns":
|
188
|
+
data = _import_csv_columns(path, id_template, glucose, time)
|
189
|
+
|
190
|
+
preprocessed_data = preprocess_data(data, interval, max_gap)
|
191
|
+
return preprocessed_data
|
192
|
+
|
193
|
+
def _import_csv_columns(
    path: str,
    id_col: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Loads a generic three-column CGM .csv file and renames its columns to the configured names.
    The file at the given path is expected to hold one column each for the patient identification,
    the timestamps, and the glucose values.

    :param path: the path of the csv file to be parsed through
    :type path: str
    :param id_col: the name of the column containing the patient identification(s), defaults to None (meaning "ID")
    :type id_col: str, optional
    :param glucose_col: the name of the column containing the glucose values, defaults to None (meaning "Glucose Value (mg/dL)")
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps, defaults to None (meaning "Timestamp (YYYY-MM-DDThh:mm:ss)")
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path, with the glucose, timestamp, and identification columns renamed.
    :rtype: pandas.DataFrame
    """
    # fall back to the default headers for any column name that was not supplied
    source_glucose = glucose_col if glucose_col else "Glucose Value (mg/dL)"
    source_time = time_col if time_col else "Timestamp (YYYY-MM-DDThh:mm:ss)"
    source_id = id_col if id_col else "ID"

    column_map = {source_glucose: GLUCOSE, source_time: TIME, source_id: ID}

    raw = pd.read_csv(path)
    return raw.rename(columns=column_map)
|
221
|
+
|
222
|
+
def _import_csv_dexcom(
    path: str,
    id_template: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the Dexcom csv data found at the given path.
    The path must lead to a .csv file containing CGM data from a Dexcom device. The returned DataFrame holds columns
    for timestamps, glucose values, and the patient identification

    :param path: the path of the Dexcom csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose_col: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    df = pd.read_csv(path)
    glucose = glucose_col or "Glucose Value (mg/dL)"
    time = time_col or "Timestamp (YYYY-MM-DDThh:mm:ss)"
    # os.path.basename understands the platform's separators, so only the file
    # name (never a directory component) is matched against the ID template
    patient_id = _retrieve_id_dexcom(os.path.basename(path), df, id_template)

    df.rename(columns={glucose: GLUCOSE, time: TIME}, inplace=True)
    # every row of the file belongs to the same patient
    df[ID] = patient_id
    return df
|
251
|
+
|
252
|
+
def _import_csv_freestyle_libre_23(
    path: str,
    id_template: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Imports a FreeStyle Libre 2 or FreeStyle Libre 3 .csv file.
    Fills in the default column names for these devices where none were given and
    delegates the actual parsing to _import_csv_freestyle_libre().

    :param path: the path of the FreeStyle Libre 2 or 3 csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose_col: the name of the column containing the glucose values, defaults to None (meaning "Historic Glucose mg/dL")
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps, defaults to None (meaning "Device Timestamp")
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    glucose_name = glucose_col if glucose_col else "Historic Glucose mg/dL"
    time_name = time_col if time_col else "Device Timestamp"
    return _import_csv_freestyle_libre(path, id_template, glucose_name, time_name)
|
276
|
+
|
277
|
+
def _import_csv_freestyle_libre_pro(
    path: str,
    id_template: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Imports a FreeStyle Libre Pro .csv file.
    Fills in the default column names for this device where none were given and
    delegates the actual parsing to _import_csv_freestyle_libre().

    :param path: the path of the FreeStyle Libre Pro csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose_col: the name of the column containing the glucose values, defaults to None (meaning "Historic Glucose(mg/dL)")
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps, defaults to None (meaning "Meter Timestamp")
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    glucose_name = glucose_col if glucose_col else "Historic Glucose(mg/dL)"
    time_name = time_col if time_col else "Meter Timestamp"
    return _import_csv_freestyle_libre(path, id_template, glucose_name, time_name)
|
301
|
+
|
302
|
+
def _import_csv_freestyle_libre(
    path: str,
    id_template: str,
    glucose_col: str,
    time_col: str
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the FreeStyle Libre csv data found at the given path.
    The path must lead to a .csv file containing CGM data from FreeStyle Libre 2/3/Pro devices. The returned DataFrame holds columns
    for timestamps, glucose values, and the patient identification.

    :param path: the path of the FreeStyle Libre csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification;
        if empty or None, the identification is read from the "Patient report" column at the top of the file
    :type id_template: str
    :param glucose_col: the name of the column containing the glucose values in the .csv files
    :type glucose_col: str
    :param time_col: the name of the column containing the timestamps in the .csv files
    :type time_col: str
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    if id_template:
        # os.path.basename understands the platform's separators, so only the
        # file name (never a directory component) is matched against the template
        patient_id = _id_from_filename(os.path.basename(path), id_template)
    else:
        # the first data row under the "Patient report" header holds the identification
        patient_id = pd.read_csv(path, nrows=1)["Patient report"].iloc[0]

    # the first two lines of the file are skipped; the readings' header row follows them
    df = pd.read_csv(path, skiprows=2)

    df.rename(columns={glucose_col: GLUCOSE, time_col: TIME}, inplace=True)
    df[ID] = patient_id
    return df
|
329
|
+
|
330
|
+
def _retrieve_id_dexcom(name: str, df: pd.DataFrame, id_template: str = None) -> str:
|
331
|
+
"""Returns the appropriate identification for the given raw Dexcom CGM data based on the given template.
|
332
|
+
If the template is None, the identification will be pulled from the patient information fields from within the CSV.
|
333
|
+
Otherwise, the filename will be parsed accordingly.
|
334
|
+
|
335
|
+
:param name: the name of the file to parse for an identification
|
336
|
+
:type name: str
|
337
|
+
:param df: a Pandas DataFrame containing the raw data from a Dexcom CSV file
|
338
|
+
:type df: pandas.DataFrame
|
339
|
+
:param id_template: regex indicating how to parse the filename for the identification, defaults to None
|
340
|
+
:type id_template: str, optional
|
341
|
+
:return: the proper identification for the raw data in the given dataframe
|
342
|
+
:rtype: str
|
343
|
+
"""
|
344
|
+
if id_template and "first" not in id_template and "last" not in id_template and "patient_identifier" not in id_template:
|
345
|
+
# need to parse file name for id
|
346
|
+
return _id_from_filename(name, id_template)
|
347
|
+
|
348
|
+
# use Dexcom fields for id instead
|
349
|
+
first = df["Patient Info"].iloc[0]
|
350
|
+
last = df["Patient Info"].iloc[1]
|
351
|
+
patient_identifier = df["Patient Info"].iloc[2]
|
352
|
+
id = df["Patient Info"].iloc[0] + df["Patient Info"].iloc[1]
|
353
|
+
if id_template: id = id_template.format(first=first, last=last, patient_identifier=patient_identifier)
|
354
|
+
return id
|
355
|
+
|
356
|
+
def _id_from_filename(name: str, id_template: str):
|
357
|
+
"""Parses the given filename for an identification using a regex template.
|
358
|
+
|
359
|
+
:param name: the filename to parse for an identification
|
360
|
+
:type name: str
|
361
|
+
:param id_template: regex indicating how to parse the filename for the identification
|
362
|
+
:type id_template: str
|
363
|
+
:return: the identification from the filename
|
364
|
+
:rtype: str
|
365
|
+
"""
|
366
|
+
import re
|
367
|
+
pattern = re.compile(fr"{id_template}")
|
368
|
+
match = pattern.search(name)
|
369
|
+
if match is None:
|
370
|
+
raise Exception("The RegEx ID template passed does not match the file name.")
|
371
|
+
id = str(match.group("id"))
|
372
|
+
try:
|
373
|
+
section = str(match.group("section"))
|
374
|
+
id += f" ({section})"
|
375
|
+
except:
|
376
|
+
print(f"'Section' not defined for patient {id}.")
|
377
|
+
return id
|
378
|
+
|
379
|
+
def preprocess_data(
    df: pd.DataFrame,
    interval: int = 5,
    max_gap: int = 45
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing the preprocessed CGM data within the given dataframe.
    As part of the preprocessing phase, the data will be converted into the proper data types, resampled, interpolated, chunked, and
    indexed by identification (alongside all 'Low's and 'High's being replaced and all edge null values being dropped)

    :param df: the Pandas DataFrame containing the CGM data to preprocess
    :type df: pandas.DataFrame
    :param interval: the resampling interval (in minutes) the CGM data should follow, defaults to 5
    :type interval: int, optional
    :param max_gap: the maximum duration (in minutes) of a gap in the data that should be interpolated, defaults to 45
    :type max_gap: int, optional
    :return: A Pandas DataFrame containing the preprocessed CGM data. This DataFrame is indexed by identification and holds columns for
        timestamps, glucose values, day chunking, and time chunking.
    :rtype: pandas.DataFrame

    :Example:
    >>> # 'df' is a Pandas DataFrame already containing your CGM data, with columns for glucose values, timestamps, and identification
    >>> preprocessed_df = preprocess_data(df)
    """
    # keep only the columns that are used, and only the rows that hold a reading
    df = df[[TIME, GLUCOSE, ID]].dropna(subset=[GLUCOSE]).reset_index(drop=True)

    # swap the "Low"/"High" markers for numbers in the glucose column only, so an
    # identification or timestamp that happens to read "Low"/"High" is left alone
    df[GLUCOSE] = df[GLUCOSE].replace({"Low": LOW, "High": HIGH})

    df[TIME] = pd.to_datetime(df[TIME])
    df[GLUCOSE] = pd.to_numeric(df[GLUCOSE])

    df = _resample_data(df, interval, max_gap)
    # trim leading/trailing rows without a glucose value; copy so that the chunking
    # columns below are added to an independent frame rather than to a slice
    df = df.loc[df[GLUCOSE].first_valid_index():df[GLUCOSE].last_valid_index()].copy()
    df = _chunk_day(_chunk_time(df))
    df.set_index(ID, inplace=True)
    return df
|
416
|
+
|
417
|
+
def _resample_data(df: pd.DataFrame, minutes: int = 5, max_gap: int = 45) -> pd.DataFrame:
    """Resamples and (if needed) interpolates the given DataFrame.
    Used mostly to preprocess the data in the csv files being imported in _import_csv().

    :param df: the DataFrame to be resampled and interpolated (all rows are expected to share one identification)
    :type df: pandas.DataFrame
    :param minutes: the length of the interval to be resampled into (in minutes), defaults to 5
    :type minutes: int
    :param max_gap: the maximum duration (in minutes) of gaps that should be interpolated, defaults to 45
    :type max_gap: int
    :return: A Pandas DataFrame containing the resampled and interpolated data. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    # positional lookup, so the frame does not need to carry a default 0..n-1 index
    patient_id = df[ID].iloc[0]

    # "min" is the minute alias accepted by every pandas version ("T" is deprecated)
    freq = f"{minutes}min"

    # Sort the DataFrame by datetime; repeated timestamps are dropped because
    # asfreq() cannot reindex an axis that contains duplicate labels
    ordered = df.sort_values(by=[TIME]).drop_duplicates(subset=[TIME])

    # generate the times that match the frequency
    on_grid = ordered.set_index(TIME).asfreq(freq).reset_index()

    # add in the original points that don't match the frequency (just for linear time-based interpolation)
    combined = pd.concat([on_grid, df]).drop_duplicates(subset=[TIME])
    combined = combined.sort_values(by=[TIME]).set_index(TIME)

    # interpolate the missing values
    combined = _interpolate_data(combined, max_gap)

    # now that the values have been interpolated, remove the points that don't match the frequency
    combined = combined.asfreq(freq)
    combined[ID] = patient_id  # resampled data points might have empty ID values
    combined.reset_index(inplace=True)

    return combined
|
454
|
+
|
455
|
+
def _interpolate_data(df: pd.DataFrame, max_gap: int) -> pd.DataFrame:
    """Linearly (time-based) interpolates missing glucose values, leaving long gaps empty.
    A run of consecutive missing values is left untouched when it spans at least
    max_gap / interval rows, where the interval is read from the 'config.ini' file.
    Used mainly in preprocessing for csv files that are being imported in _import_csv().

    :param df: the Pandas DataFrame (indexed by timestamp) containing the CGM data to interpolate
    :type df: pandas.DataFrame
    :param max_gap: the maximum minute length of gaps that should be interpolated
    :type max_gap: int
    :return: a Pandas DataFrame with interpolated CGM data
    :rtype: pandas.DataFrame
    """
    config.read('config.ini')
    interval = int(config["variables"]["interval"])
    max_rows = int(max_gap / interval)

    # based heavily on https://stackoverflow.com/questions/67128364/how-to-limit-pandas-interpolation-when-there-is-more-nan-than-the-limit

    is_missing = df[GLUCOSE].isnull()
    is_present = df[GLUCOSE].notnull()

    # label each run of consecutive present / missing values with its own number
    run_label = is_present.ne(is_present.shift()).cumsum()

    # length of the run each row belongs to, kept only for rows that are missing
    run_length = df.groupby([run_label, is_missing])[GLUCOSE].transform('size')
    gap_length = run_length.where(is_missing)

    # fill every interior gap, then blank out the rows that belong to a gap that is too long
    filled = df.interpolate(method="time", limit_area="inside")
    return filled.mask(gap_length >= max_rows)
|
478
|
+
|
479
|
+
def _chunk_time(df: pd.DataFrame) -> pd.DataFrame:
    """Labels every row of the given DataFrame as "Waking" or "Sleeping" in a new "Time Chunking" column.
    Rows whose time of day lies between 08:00 and 22:00 (both inclusive) are "Waking". The given DataFrame is modified in place.

    :param df: the Pandas DataFrame to add the new column to (must contain a column for timestamps)
    :type df: pandas.DataFrame
    :return: the Pandas DataFrame with the added column for time chunking
    :rtype: pandas.DataFrame
    """
    # time elapsed since midnight of each timestamp's own day
    since_midnight = df[TIME] - df[TIME].dt.normalize()

    waking_start = pd.Timedelta(hours=8)
    waking_end = pd.Timedelta(hours=22)
    awake = since_midnight.between(waking_start, waking_end)

    labels = {True: "Waking", False: "Sleeping"}
    df["Time Chunking"] = awake.replace(labels)
    return df
|
491
|
+
|
492
|
+
def _chunk_day(df: pd.DataFrame) -> pd.DataFrame:
    """Labels every row of the given DataFrame as "Weekday" or "Weekend" in a new "Day Chunking" column.
    Rows whose day-of-week number is greater than 4 are "Weekend". The given DataFrame is modified in place.

    :param df: the Pandas DataFrame to add the new column to (must contain a column for timestamps)
    :type df: pandas.DataFrame
    :return: the Pandas DataFrame with the added column for day chunking
    :rtype: pandas.DataFrame
    """
    day_number = df[TIME].dt.dayofweek
    on_weekend = day_number.gt(4)

    labels = {True: "Weekend", False: "Weekday"}
    df["Day Chunking"] = on_weekend.replace(labels)
    return df
|
503
|
+
|
504
|
+
def segment_data(path: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Splits patients' data into multiple segments based on a given .csv file containing ID's and DateTimes.
    Each timestamp listed for a patient ends one segment: rows before the first timestamp form segment 1,
    rows before the second form segment 2, and so on, with the remaining rows forming the last segment.
    The segment number is appended to the identification (e.g. "<id>_1"). Patients that are not listed
    in the .csv file receive the suffix "_0".

    :param path: path of the .csv file containing identifications and timestamps indicating where to split the given DataFrame
        (its columns must use the configured identification and timestamp column names)
    :type path: str
    :param df: the DataFrame to split based on the given .csv file
    :type df: pandas.DataFrame
    :return: a Pandas DataFrame with the data split accordingly; the given DataFrame is left unmodified and
        the result carries the identification as a regular column (the index is reset)
    :rtype: pandas.DataFrame
    """
    # Read the segments CSV file
    segments = pd.read_csv(path)
    segments[TIME] = pd.to_datetime(segments[TIME])

    # Sort segments by patient and TIME; the configured ID column name is used
    # (rather than a hard-coded 'ID') to stay consistent with the rest of the module
    segments.sort_values([ID, TIME], inplace=True)

    # Create a copy of the original dataframe to avoid modifying it directly
    df_copy = df.copy()
    df_copy[TIME] = pd.to_datetime(df_copy[TIME])
    df_copy = df_copy.reset_index()

    # Initialize a Segment column; 0 means "not assigned to a segment yet"
    df_copy['Segment'] = 0

    # Use a dictionary to keep track of the next segment number of each patient
    segment_counters = {patient: 1 for patient in segments[ID].unique()}

    # Iterate over each split point, in chronological order per patient
    for _, segment_row in segments.iterrows():
        patient = segment_row[ID]
        split_time = segment_row[TIME]

        # Rows of this patient before the split point that are still unassigned
        mask = (df_copy[ID] == patient) & (df_copy[TIME] < split_time) & (df_copy['Segment'] == 0)
        df_copy.loc[mask, 'Segment'] = segment_counters[patient]

        # Increment the segment counter
        segment_counters[patient] += 1

    # Assign the segment counter to rows after the last date
    for patient, counter in segment_counters.items():
        mask = (df_copy[ID] == patient) & (df_copy['Segment'] == 0)
        df_copy.loc[mask, 'Segment'] = counter

    # Combine ID and Segment to form the new ID
    df_copy[ID] = df_copy[ID].astype(str) + '_' + df_copy['Segment'].astype(str)

    # Drop the Segment column as it's no longer needed
    df_copy.drop(columns=['Segment'], inplace=True)

    return df_copy
|