glucose360 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,558 @@
1
+ import os, glob
2
+ import pandas as pd
3
+ import configparser
4
+ import zipfile, tempfile
5
+
6
# Load the config.ini that sits in the same directory as this module.
dir_path = os.path.dirname(os.path.realpath(__file__))
config_path = os.path.join(dir_path, "config.ini")
config = configparser.ConfigParser()
config.read(config_path)
# Canonical column names (identification, glucose value, timestamp) taken from the
# [variables] section; every importer below renames sensor-specific columns to these.
ID = config['variables']['id']
GLUCOSE = config['variables']['glucose']
TIME = config['variables']['time']

# globals for glucose values to replace "Low" and "High" with in the CGM data
LOW = 40
HIGH = 400
17
+
18
def import_data(
    path: str,
    name: str = None,
    sensor: str = "dexcom",
    id_template: str = None,
    glucose: str = None,
    time: str = None,
    interval: int = 5,
    max_gap: int = 45,
    output = print
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the csv data found at the given path.
    The path can lead to a directory, .zip file, or a .csv file. The returned DataFrame holds columns
    for timestamps and glucose values, and is indexed by patient identifications

    As a side effect, the chosen resampling interval is stored in the in-memory configuration and
    written to a 'config.ini' file in the current working directory.

    :param path: the path of the directory/zip/csv to be parsed through
    :type path: str
    :param name: the name of the folder inside the .zip file that holds the .csv files; when omitted, the
        part of the .zip file's name before its first '.' is used (and, if no such folder was extracted,
        the root of the archive is searched instead), defaults to None
    :type name: str, optional
    :param sensor: the CGM device model used (either dexcom, freestyle libre pro, or freestyle libre 2 / freestyle libre 3), defaults to 'dexcom'
    :type sensor: str, optional
    :param id_template: regex dictating how to parse each CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type glucose: str, optional
    :param time: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type time: str, optional
    :param interval: the resampling interval (in minutes) that the data should follow, defaults to 5
    :type interval: int, optional
    :param max_gap: the maximum amount of minutes a gap in the data can be interpolated, defaults to 45
        (filling in a gap with a longer duration would be considered extrapolation)
    :type max_gap: int, optional
    :param output: callable that receives progress messages when a directory or .zip file is imported, defaults to print
    :type output: callable, optional
    :raises ValueError: if the path does not exist or does not lead to a directory, .csv file, or .zip file
    :return: A Pandas DataFrame containing the preprocessed data found at the given path. This DataFrame holds columns for timestamps, glucose values, weekday/weekend chunking, and waking/sleeping time chunking.
    :rtype: pandas.DataFrame

    :Example:
    >>> path_to_data = "datasets/patient_data.csv"
    >>> df = import_data(path_to_data)
    """
    # update the config with the resampling interval the user chose
    config["variables"]["interval"] = str(interval)
    # NOTE: deliberately written to the working directory; _interpolate_data() re-reads
    # 'config.ini' from there, so the location must not change independently.
    with open('config.ini', 'w') as configfile:
        config.write(configfile)

    # an existing directory is always imported as one, even if its name contains a '.'
    if os.path.isdir(path):
        return _import_directory(path, sensor, id_template, glucose, time, interval, max_gap, output)

    # get file extension of where the given path points
    ext = os.path.splitext(path)[1].lower()
    if ext == "":
        raise ValueError("Directory does not exist")
    if ext not in (".csv", ".zip"):
        raise ValueError("Invalid file type")
    if not os.path.isfile(path):
        raise ValueError("File does not exist")

    # path leads to .csv
    if ext == ".csv":
        return _import_csv(path, sensor, id_template, glucose, time, interval, max_gap)

    # otherwise has to be a .zip file
    with zipfile.ZipFile(path, 'r') as zip_ref:
        # create a temporary directory to pull from
        with tempfile.TemporaryDirectory() as temp_dir:
            zip_ref.extractall(temp_dir)
            # basename() copes with both '/' and the platform's own separator
            folder = name or os.path.basename(path).split(".")[0]
            extracted = os.path.join(temp_dir, folder)
            if name is None and not os.path.isdir(extracted):
                # the archive has no wrapping folder, so look for the .csv files at its root
                extracted = temp_dir
            # the DataFrame is fully loaded before the temporary directory is removed
            return _import_directory(extracted, sensor, id_template, glucose, time, interval, max_gap, output)
90
+
91
def _import_directory(
    path: str,
    sensor: str = "dexcom",
    id_template: str = None,
    glucose: str = None,
    time: str = None,
    interval: int = 5,
    max_gap: int = 45,
    output = print
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the csv data found at the given path.
    The path must lead to a directory containing .csv files. The returned DataFrame holds columns
    for timestamps and glucose values, and is indexed by patient identifications

    Files that cannot be imported are skipped; the reason is reported through ``output``.

    :param path: the path of the directory to be parsed through
    :type path: str
    :param sensor: the CGM device model used (either dexcom, freestyle libre pro, or freestyle libre 2 / freestyle libre 3), defaults to 'dexcom'
    :type sensor: str, optional
    :param id_template: regex dictating how to parse each CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type glucose: str, optional
    :param time: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type time: str, optional
    :param interval: the resampling interval (in minutes) that the data should follow, defaults to 5
    :type interval: int, optional
    :param max_gap: the maximum amount of minutes a gap in the data can be interpolated, defaults to 45
        (filling in a gap with a longer duration would be considered extrapolation)
    :type max_gap: int, optional
    :param output: callable that receives progress messages, defaults to print
    :type output: callable, optional
    :raises Exception: if the directory holds no .csv files, or none of them could be imported
    :return: A Pandas DataFrame containing the preprocessed data found at the given path. This DataFrame holds columns for timestamps, glucose values, weekday/weekend chunking, and waking/sleeping time chunking.
    :rtype: pandas.DataFrame
    """
    # sorted so the row order of the combined DataFrame does not depend on the filesystem
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    num_files = len(csv_files)

    if num_files == 0:
        raise Exception("No CSV files found.")

    output(f"{num_files} .csv files were found in the specified directory.")

    data: list[pd.DataFrame] = []
    for file in csv_files:
        try:
            data.append(_import_csv(file, sensor, id_template, glucose, time, interval, max_gap))
        except Exception as error:
            # best effort: one unreadable file must not abort the whole import, but
            # KeyboardInterrupt/SystemExit are no longer swallowed and the reason is reported
            output(f"Skipped '{os.path.basename(file)}': {error}")

    output(f"{len(data)} .csv files were successfully imported.")

    if not data:
        raise Exception("CSV files found, but none were valid.")
    df = pd.concat(data)

    output(f"{df.index.unique().size} sections were found in the imported data.")

    return df
147
+
148
def _import_csv(
    path: str,
    sensor: str = "dexcom",
    id_template: str = None,
    glucose: str = None,
    time: str = None,
    interval: int = 5,
    max_gap: int = 45
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the csv data found at the given path.
    The path must lead to a .csv file. The returned DataFrame holds columns
    for timestamps and glucose values, and is indexed by patient identifications

    :param path: the path of the csv file to be parsed through
    :type path: str
    :param sensor: the CGM device model used (either 'dexcom', 'freestyle libre pro', 'freestyle libre 2', 'freestyle libre 3', or 'columns');
        matched without regard to letter case or surrounding whitespace, defaults to 'dexcom'
    :type sensor: str, optional
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification,
        or the name of the patient identification column if using the 'columns' sensor, defaults to None
    :type id_template: str, optional
    :param glucose: the name of the column containing the glucose values in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type glucose: str, optional
    :param time: the name of the column containing the timestamps in the .csv files (if different than the default for the CGM sensor being used), defaults to None
    :type time: str, optional
    :param interval: the resampling interval (in minutes) that the data should follow, defaults to 5
    :type interval: int, optional
    :param max_gap: the maximum amount of minutes a gap in the data can be interpolated, defaults to 45
        (filling in a gap with a longer duration would be considered extrapolation)
    :type max_gap: int, optional
    :raises ValueError: if the given sensor is not one of the supported options
    :return: A Pandas DataFrame containing the preprocessed data found at the given path. This DataFrame holds columns for timestamps, glucose values, weekday/weekend chunking, and waking/sleeping time chunking.
    :rtype: pandas.DataFrame
    """
    sensor_key = sensor.strip().lower() if isinstance(sensor, str) else sensor

    if sensor_key == "dexcom":
        data = _import_csv_dexcom(path, id_template, glucose, time)
    elif sensor_key in ("freestyle libre 2", "freestyle libre 3"):
        data = _import_csv_freestyle_libre_23(path, id_template, glucose, time)
    elif sensor_key == "freestyle libre pro":
        data = _import_csv_freestyle_libre_pro(path, id_template, glucose, time)
    elif sensor_key == "columns":
        data = _import_csv_columns(path, id_template, glucose, time)
    else:
        # fail with a clear message instead of preprocessing an empty DataFrame
        raise ValueError(
            f"Unsupported sensor '{sensor}'. Expected 'dexcom', 'freestyle libre 2', "
            "'freestyle libre 3', 'freestyle libre pro', or 'columns'."
        )

    return preprocess_data(data, interval, max_gap)
192
+
193
def _import_csv_columns(
    path: str,
    id_col: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Reads a generic three-column CGM .csv file and renames its columns to the configured names.
    The file is expected to hold identification, timestamp, and glucose value columns; every other
    column is left as it is.

    :param path: the path of the csv file to be parsed through
    :type path: str
    :param id_col: the name of the column containing the patient identification(s); 'ID' is used when omitted, defaults to None
    :type id_col: str, optional
    :param glucose_col: the name of the column containing the glucose values; 'Glucose Value (mg/dL)' is used when omitted, defaults to None
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps; 'Timestamp (YYYY-MM-DDThh:mm:ss)' is used when omitted, defaults to None
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path, with the identification, timestamp, and glucose columns renamed.
    :rtype: pandas.DataFrame
    """
    frame = pd.read_csv(path)
    # source column name -> configured column name
    column_map = {
        glucose_col or "Glucose Value (mg/dL)": GLUCOSE,
        time_col or "Timestamp (YYYY-MM-DDThh:mm:ss)": TIME,
        id_col or "ID": ID,
    }
    frame.rename(columns=column_map, inplace=True)
    return frame
221
+
222
def _import_csv_dexcom(
    path: str,
    id_template: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the Dexcom csv data found at the given path.
    The path must lead to a .csv file containing CGM data from a Dexcom device. The returned DataFrame holds columns
    for timestamps, glucose values, and the patient identification

    :param path: the path of the Dexcom csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose_col: the name of the column containing the glucose values; 'Glucose Value (mg/dL)' is used when omitted, defaults to None
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps; 'Timestamp (YYYY-MM-DDThh:mm:ss)' is used when omitted, defaults to None
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    df = pd.read_csv(path)
    glucose = glucose_col or "Glucose Value (mg/dL)"
    time = time_col or "Timestamp (YYYY-MM-DDThh:mm:ss)"
    # basename() strips the directory part on every platform (a manual split on "/"
    # leaves it in place for Windows-style paths)
    patient_id = _retrieve_id_dexcom(os.path.basename(path), df, id_template)

    df.rename(columns={glucose: GLUCOSE, time: TIME}, inplace=True)
    # the same identification is assigned to every row of the file
    df[ID] = patient_id
    return df
251
+
252
def _import_csv_freestyle_libre_23(
    path: str,
    id_template: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Imports a FreeStyle Libre 2 or FreeStyle Libre 3 .csv file.
    Fills in the column names used for these devices and hands the actual parsing over to
    _import_csv_freestyle_libre().

    :param path: the path of the FreeStyle Libre 2 or 3 csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose_col: the name of the column containing the glucose values; 'Historic Glucose mg/dL' is used when omitted, defaults to None
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps; 'Device Timestamp' is used when omitted, defaults to None
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    return _import_csv_freestyle_libre(
        path,
        id_template,
        glucose_col or "Historic Glucose mg/dL",
        time_col or "Device Timestamp",
    )
276
+
277
def _import_csv_freestyle_libre_pro(
    path: str,
    id_template: str = None,
    glucose_col: str = None,
    time_col: str = None,
) -> pd.DataFrame:
    """Imports a FreeStyle Libre Pro .csv file.
    Fills in the column names used for this device and hands the actual parsing over to
    _import_csv_freestyle_libre().

    :param path: the path of the FreeStyle Libre Pro csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification, defaults to None
    :type id_template: str, optional
    :param glucose_col: the name of the column containing the glucose values; 'Historic Glucose(mg/dL)' is used when omitted, defaults to None
    :type glucose_col: str, optional
    :param time_col: the name of the column containing the timestamps; 'Meter Timestamp' is used when omitted, defaults to None
    :type time_col: str, optional
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    return _import_csv_freestyle_libre(
        path,
        id_template,
        glucose_col or "Historic Glucose(mg/dL)",
        time_col or "Meter Timestamp",
    )
301
+
302
def _import_csv_freestyle_libre(
    path: str,
    id_template: str,
    glucose_col: str,
    time_col: str
) -> pd.DataFrame:
    """Returns a Pandas DataFrame containing all of the FreeStyle Libre csv data found at the given path.
    The path must lead to a .csv file containing CGM data from FreeStyle Libre 2/3/Pro devices. The returned DataFrame holds columns
    for timestamps, glucose values, and the patient identification.

    :param path: the path of the FreeStyle Libre csv file to be parsed through
    :type path: str
    :param id_template: regex dictating how to parse the CSV file's name for the proper patient identification;
        when empty or None, the identification is read from the file's 'Patient report' column instead
    :type id_template: str
    :param glucose_col: the name of the column containing the glucose values in the .csv file
    :type glucose_col: str
    :param time_col: the name of the column containing the timestamps in the .csv file
    :type time_col: str
    :return: A Pandas DataFrame containing the raw data found at the given path. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    if id_template:
        # basename() strips the directory part on every platform (a manual split on "/"
        # leaves it in place for Windows-style paths)
        patient_id = _id_from_filename(os.path.basename(path), id_template)
    else:
        # the first data row of the file carries the identification under 'Patient report'
        patient_id = pd.read_csv(path, nrows=1)["Patient report"].iloc[0]

    # the first two lines are skipped, so the third line is used as the column header
    df = pd.read_csv(path, skiprows=2)

    df.rename(columns={glucose_col: GLUCOSE, time_col: TIME}, inplace=True)
    df[ID] = patient_id
    return df
329
+
330
def _retrieve_id_dexcom(name: str, df: pd.DataFrame, id_template: str = None) -> str:
    """Determines the identification to use for the given raw Dexcom CGM data.
    A template that mentions none of 'first', 'last', or 'patient_identifier' is treated as a regex and
    applied to the file name. Otherwise the first three entries of the 'Patient Info' column are used:
    they are substituted into the template if one was given, or the first two are concatenated if not.

    :param name: the name of the file to parse for an identification
    :type name: str
    :param df: a Pandas DataFrame containing the raw data from a Dexcom CSV file
    :type df: pandas.DataFrame
    :param id_template: regex for parsing the file name, or a format string using the 'first', 'last',
        and/or 'patient_identifier' fields, defaults to None
    :type id_template: str, optional
    :return: the proper identification for the raw data in the given dataframe
    :rtype: str
    """
    dexcom_fields = ("first", "last", "patient_identifier")
    if id_template and not any(field in id_template for field in dexcom_fields):
        # need to parse file name for id
        return _id_from_filename(name, id_template)

    # use Dexcom fields for id instead
    patient_info = df["Patient Info"]
    first = patient_info.iloc[0]
    last = patient_info.iloc[1]
    patient_identifier = patient_info.iloc[2]
    default_id = first + last
    if not id_template:
        return default_id
    return id_template.format(first=first, last=last, patient_identifier=patient_identifier)
355
+
356
def _id_from_filename(name: str, id_template: str) -> str:
    """Parses the given filename for an identification using a regex template.
    The template must define a named group 'id'. It may also define a named group 'section'; when that
    group matched, its text is appended to the identification in parentheses.

    :param name: the filename to parse for an identification
    :type name: str
    :param id_template: regex indicating how to parse the filename for the identification
    :type id_template: str
    :raises Exception: if the template does not match the filename
    :raises IndexError: if the template does not define an 'id' group
    :return: the identification from the filename
    :rtype: str
    """
    import re

    match = re.search(id_template, name)
    if match is None:
        raise Exception("The RegEx ID template passed does not match the file name.")

    identification = str(match.group("id"))
    # groupdict() yields None both when the template has no 'section' group and when an
    # optional 'section' group did not take part in the match, so neither case can end
    # up as the literal text "(None)" in the identification
    section = match.groupdict().get("section")
    if section is None:
        print(f"'Section' not defined for patient {identification}.")
        return identification
    return f"{identification} ({section})"
378
+
379
def preprocess_data(
    df: pd.DataFrame,
    interval: int = 5,
    max_gap: int = 45
) -> pd.DataFrame:
    """Cleans, resamples, and labels the CGM data within the given dataframe.
    Rows without a glucose value are dropped, 'Low' and 'High' entries are replaced with the module's
    LOW and HIGH values, timestamps and glucose values are converted to datetime and numeric types, the
    data is resampled and interpolated, leading and trailing rows without a glucose value are trimmed,
    the time/day chunking columns are added, and the result is indexed by identification.

    :param df: the Pandas DataFrame containing the CGM data to preprocess
    :type df: pandas.DataFrame
    :param interval: the resampling interval (in minutes) the CGM data should follow, defaults to 5
    :type interval: int, optional
    :param max_gap: the maximum duration (in minutes) of a gap in the data that should be interpolated, defaults to 45
    :type max_gap: int, optional
    :return: A Pandas DataFrame containing the preprocessed CGM data. This DataFrame is indexed by identification and holds columns for
             timestamps, glucose values, day chunking, and time chunking.
    :rtype: pandas.DataFrame

    :Example:
    >>> # 'df' is a Pandas DataFrame already containing your CGM data, with columns for glucose values, timestamps, and identification
    >>> preprocessed_df = preprocess_data(df)
    """
    cleaned = (
        df.dropna(subset=[GLUCOSE])
        .replace("Low", LOW)
        .replace("High", HIGH)
        .reset_index(drop=True)
    )

    cleaned[TIME] = pd.to_datetime(cleaned[TIME])
    cleaned[GLUCOSE] = pd.to_numeric(cleaned[GLUCOSE])

    # only the three canonical columns are carried through resampling
    resampled = _resample_data(cleaned[[TIME, GLUCOSE, ID]].copy(), interval, max_gap)

    # cut off the rows before the first and after the last valid glucose value
    glucose_values = resampled[GLUCOSE]
    trimmed = resampled.loc[glucose_values.first_valid_index():glucose_values.last_valid_index()]

    chunked = _chunk_day(_chunk_time(trimmed))
    return chunked.set_index(ID)
416
+
417
def _resample_data(df: pd.DataFrame, minutes: int = 5, max_gap: int = 45) -> pd.DataFrame:
    """Resamples and (if needed) interpolates the given DataFrame.
    Used mostly to preprocess the data in the csv files being imported in _import_csv().

    The identification of the first row is applied to every row of the result, so the given
    DataFrame is expected to hold the data of a single identification.

    :param df: the DataFrame to be resampled and interpolated
    :type df: pandas.DataFrame
    :param minutes: the length of the interval to be resampled into (in minutes), defaults to 5
    :type minutes: int
    :param max_gap: the maximum duration (in minutes) of gaps that should be interpolated, defaults to 45
    :type max_gap: int
    :return: A Pandas DataFrame containing the resampled and interpolated data. This DataFrame holds columns for timestamps, glucose values, and the patient identification.
    :rtype: pandas.DataFrame
    """
    # positional lookup, so the frame does not need an index label of exactly 0
    patient_id = df[ID].iloc[0]

    # "min" is the supported spelling of the minute frequency; the single-letter "T"
    # alias is deprecated in recent pandas releases
    frequency = f"{minutes}min"

    # Sort the DataFrame by datetime and generate the times that match the frequency
    on_grid = df.sort_values(by=[TIME]).set_index(TIME).asfreq(frequency)

    # add in the original points that don't match the frequency (just for linear time-based interpolation)
    on_grid.reset_index(inplace=True)
    combined = pd.concat([on_grid, df]).drop_duplicates(subset=[TIME])
    combined.sort_values(by=[TIME], inplace=True)

    # interpolate the missing values
    combined.set_index(TIME, inplace=True)
    interpolated = _interpolate_data(combined, max_gap)

    # now that the values have been interpolated, remove the points that don't match the frequency
    resampled_df = interpolated.asfreq(frequency)
    resampled_df[ID] = patient_id  # resampled data points might have empty ID values
    resampled_df.reset_index(inplace=True)

    return resampled_df
454
+
455
def _interpolate_data(df: pd.DataFrame, max_gap: int) -> pd.DataFrame:
    """Interpolates missing glucose values linearly in time, except within gaps that are too long.
    A run of consecutive missing glucose values is left empty when it contains at least
    int(max_gap / interval) rows, where interval is the resampling interval stored in the configuration.
    Used mainly in preprocessing for csv files that are being imported in _import_csv().

    :param df: the Pandas DataFrame containing the CGM data to interpolate
    :type df: pandas.DataFrame
    :param max_gap: the maximum minute length of gaps that should be interpolated
    :type max_gap: int
    :return: a Pandas DataFrame with interpolated CGM data
    :rtype: pandas.DataFrame
    """
    # pick up the interval from a 'config.ini' in the working directory, if one is there
    config.read('config.ini')
    interval = int(config["variables"]["interval"])

    # based heavily on https://stackoverflow.com/questions/67128364/how-to-limit-pandas-interpolation-when-there-is-more-nan-than-the-limit

    is_missing = df[GLUCOSE].isnull()
    has_value = df[GLUCOSE].notnull()

    # the label increases by one whenever a row switches between "has a value" and "missing"
    run_label = has_value.ne(has_value.shift()).cumsum()

    # length of the run each row belongs to, kept only on the rows that are missing
    gap_length = df.groupby([run_label, is_missing])[GLUCOSE].transform('size').where(is_missing)

    longest_allowed = int(max_gap / interval)
    filled = df.interpolate(method="time", limit_area="inside")

    # blank out again whatever was filled inside a gap that is too long
    return filled.mask(gap_length >= longest_allowed)
478
+
479
def _chunk_time(df: pd.DataFrame) -> pd.DataFrame:
    """Labels each row of the given DataFrame as 'Waking' or 'Sleeping' in a new 'Time Chunking' column.
    Timestamps from 08:00 up to and including 22:00 count as waking; all others count as sleeping.
    The given DataFrame is modified and returned.

    :param df: the Pandas DataFrame to add the new column to (must contain a column for timestamps)
    :type df: pandas.DataFrame
    :return: the Pandas DataFrame with the added column for time chunking
    :rtype: pandas.DataFrame
    """
    # time elapsed since midnight of the same day
    since_midnight = df[TIME] - df[TIME].dt.normalize()
    awake = since_midnight.between(pd.Timedelta(hours=8), pd.Timedelta(hours=22))
    df["Time Chunking"] = awake.replace({True: "Waking", False: "Sleeping"})
    return df
491
+
492
def _chunk_day(df: pd.DataFrame) -> pd.DataFrame:
    """Labels each row of the given DataFrame as 'Weekday' or 'Weekend' in a new 'Day Chunking' column.
    Rows whose day-of-week number is greater than 4 count as weekend. The given DataFrame is modified
    and returned.

    :param df: the Pandas DataFrame to add the new column to (must contain a column for timestamps)
    :type df: pandas.DataFrame
    :return: the Pandas DataFrame with the added column for day chunking
    :rtype: pandas.DataFrame
    """
    day_numbers = df[TIME].dt.dayofweek
    on_weekend = day_numbers.gt(4)
    df["Day Chunking"] = on_weekend.replace({True: "Weekend", False: "Weekday"})
    return df
503
+
504
def segment_data(path: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Splits patients' data into multiple segments based on a given .csv file containing ID's and DateTimes.

    Each row of the .csv file marks a split point for one identification. The rows of that identification
    are numbered 1, 2, 3, ... by the split points they fall before, and the segment number is appended to
    the identification as '<id>_<segment>'. Identifications that do not appear in the .csv file end up
    with the segment number 0.

    The identification and timestamp columns of both the .csv file and the DataFrame are looked up
    through the configured column names, the same as everywhere else in this module.

    :param path: path of the .csv file containing identifications and timestamps indicating where to split the given DataFrame
    :type path: str
    :param df: the DataFrame to split based on the given .csv file
    :type df: pandas.DataFrame
    :return: a Pandas DataFrame with the data split accordingly; the given DataFrame is not modified, and the
        result carries a fresh default index with the former index turned into a column
    :rtype: pandas.DataFrame
    """
    # Read the segments CSV file
    segments = pd.read_csv(path)
    segments[TIME] = pd.to_datetime(segments[TIME])

    # Sort segments by identification and TIME so split points are visited in chronological order
    segments.sort_values([ID, TIME], inplace=True)

    # Create a copy of the original dataframe to avoid modifying it directly
    df_copy = df.copy()
    df_copy[TIME] = pd.to_datetime(df_copy[TIME])
    df_copy = df_copy.reset_index()

    # Initialize a Segment column; 0 means "not assigned to a segment yet"
    df_copy['Segment'] = 0

    # Use a dictionary to keep track of segment counters
    segment_counters = {patient: 1 for patient in segments[ID].unique()}

    # Iterate over each row in the segments dataframe
    for _, segment_row in segments.iterrows():
        patient = segment_row[ID]
        split_time = segment_row[TIME]

        # Rows of this patient before the split point that no earlier split point has claimed
        mask = (df_copy[ID] == patient) & (df_copy[TIME] < split_time) & (df_copy['Segment'] == 0)
        df_copy.loc[mask, 'Segment'] = segment_counters[patient]

        # Increment the segment counter
        segment_counters[patient] += 1

    # Assign the segment counter to rows after the last date
    for patient, counter in segment_counters.items():
        mask = (df_copy[ID] == patient) & (df_copy['Segment'] == 0)
        df_copy.loc[mask, 'Segment'] = counter

    # Combine ID and Segment to form the new ID
    df_copy[ID] = df_copy[ID].astype(str) + '_' + df_copy['Segment'].astype(str)

    # Drop the Segment column as it's no longer needed
    df_copy.drop(columns=['Segment'], inplace=True)

    return df_copy