imputeCGM 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imputecgm-0.1.0/PKG-INFO +75 -0
- imputecgm-0.1.0/README.md +59 -0
- imputecgm-0.1.0/imputeCGM/__init__.py +3 -0
- imputecgm-0.1.0/imputeCGM/cli.py +49 -0
- imputecgm-0.1.0/imputeCGM/features.py +30 -0
- imputecgm-0.1.0/imputeCGM/gain.py +156 -0
- imputecgm-0.1.0/imputeCGM/gaps.py +5 -0
- imputecgm-0.1.0/imputeCGM/impute.py +168 -0
- imputecgm-0.1.0/imputeCGM/models.py +76 -0
- imputecgm-0.1.0/imputeCGM/pipeline.py +241 -0
- imputecgm-0.1.0/imputeCGM/timeseries_r.py +191 -0
- imputecgm-0.1.0/imputeCGM.egg-info/PKG-INFO +75 -0
- imputecgm-0.1.0/imputeCGM.egg-info/SOURCES.txt +17 -0
- imputecgm-0.1.0/imputeCGM.egg-info/dependency_links.txt +1 -0
- imputecgm-0.1.0/imputeCGM.egg-info/entry_points.txt +2 -0
- imputecgm-0.1.0/imputeCGM.egg-info/requires.txt +9 -0
- imputecgm-0.1.0/imputeCGM.egg-info/top_level.txt +1 -0
- imputecgm-0.1.0/pyproject.toml +25 -0
- imputecgm-0.1.0/setup.cfg +4 -0
imputecgm-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: imputeCGM
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CGM missing glucose imputation with MICE, ARIMA, XGBoost, and optional GAIN-based methods
|
|
5
|
+
Author: Hasin Shad, Shubh Saraswat, Dr. Xiaohua Douglas Zhang
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: scikit-learn
|
|
11
|
+
Requires-Dist: statsmodels
|
|
12
|
+
Requires-Dist: xgboost
|
|
13
|
+
Requires-Dist: rpy2
|
|
14
|
+
Provides-Extra: gain
|
|
15
|
+
Requires-Dist: tensorflow; extra == "gain"
|
|
16
|
+
|
|
17
|
+
# imputeCGM
|
|
18
|
+
|
|
19
|
+
`imputeCGM` is a Python package for imputing missing glucose values in continuous glucose monitoring (CGM) data.
|
|
20
|
+
|
|
21
|
+
The package supports:
|
|
22
|
+
|
|
23
|
+
- Automatic method selection: MICE+ARIMA when missingness is low and MICE+XGBoost when missingness is higher
|
|
24
|
+
- Forced MICE+ARIMA
|
|
25
|
+
- Forced MICE+XGBoost
|
|
26
|
+
- GAIN-only imputation
|
|
27
|
+
- GAIN+ARIMA
|
|
28
|
+
- GAIN+XGBoost
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
For MICE, ARIMA, and XGBoost methods:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python -m pip install -e .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For GAIN-based methods, install the optional TensorFlow dependency:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
python -m pip install -e .[gain]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Python usage
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from imputeCGM import impute_cgm
|
|
48
|
+
|
|
49
|
+
out = impute_cgm(
|
|
50
|
+
df,
|
|
51
|
+
timestamp_col="timestamp",
|
|
52
|
+
subjectid_col="subjectid",
|
|
53
|
+
glucose_col="glucose_value",
|
|
54
|
+
interval_minutes=5,
|
|
55
|
+
time_gap_tolerance_minutes=3,
|
|
56
|
+
model_method="auto",
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Valid `model_method` values are:
|
|
61
|
+
|
|
62
|
+
```text
|
|
63
|
+
auto
|
|
64
|
+
mice_arima
|
|
65
|
+
mice_xgboost
|
|
66
|
+
gain
|
|
67
|
+
gain_arima
|
|
68
|
+
gain_xgboost
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## CLI usage
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
imputeCGM --input ExampleData.csv --output ExampleData_imputed.csv --model-method auto
|
|
75
|
+
```
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# imputeCGM
|
|
2
|
+
|
|
3
|
+
`imputeCGM` is a Python package for imputing missing glucose values in continuous glucose monitoring (CGM) data.
|
|
4
|
+
|
|
5
|
+
The package supports:
|
|
6
|
+
|
|
7
|
+
- Automatic method selection: MICE+ARIMA when missingness is low and MICE+XGBoost when missingness is higher
|
|
8
|
+
- Forced MICE+ARIMA
|
|
9
|
+
- Forced MICE+XGBoost
|
|
10
|
+
- GAIN-only imputation
|
|
11
|
+
- GAIN+ARIMA
|
|
12
|
+
- GAIN+XGBoost
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
For MICE, ARIMA, and XGBoost methods:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
python -m pip install -e .
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
For GAIN-based methods, install the optional TensorFlow dependency:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
python -m pip install -e .[gain]
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Python usage
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from imputeCGM import impute_cgm
|
|
32
|
+
|
|
33
|
+
out = impute_cgm(
|
|
34
|
+
df,
|
|
35
|
+
timestamp_col="timestamp",
|
|
36
|
+
subjectid_col="subjectid",
|
|
37
|
+
glucose_col="glucose_value",
|
|
38
|
+
interval_minutes=5,
|
|
39
|
+
time_gap_tolerance_minutes=3,
|
|
40
|
+
model_method="auto",
|
|
41
|
+
)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Valid `model_method` values are:
|
|
45
|
+
|
|
46
|
+
```text
|
|
47
|
+
auto
|
|
48
|
+
mice_arima
|
|
49
|
+
mice_xgboost
|
|
50
|
+
gain
|
|
51
|
+
gain_arima
|
|
52
|
+
gain_xgboost
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## CLI usage
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
imputeCGM --input ExampleData.csv --output ExampleData_imputed.csv --model-method auto
|
|
59
|
+
```
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from .pipeline import impute_cgm
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main():
    """Command-line entry point for the imputation pipeline.

    Parses the command-line flags, reads the CSV given by ``--input``,
    runs ``impute_cgm`` with the parsed settings, writes the result to
    ``--output`` (without the index) and prints the output path, the
    method label and the missing rate taken from the first output row.
    """
    parser = argparse.ArgumentParser()

    # (flag, keyword options) pairs, registered in declaration order.
    option_specs = [
        ("--input", {"required": True}),
        ("--output", {"required": True}),
        ("--timestamp-col", {"default": "timestamp"}),
        ("--subjectid-col", {"default": "subjectid"}),
        ("--glucose-col", {"default": "glucose_value"}),
        ("--interval", {"type": int, "default": 5}),
        ("--arima-threshold", {"type": float, "default": 0.05}),
        ("--time-gap-tolerance", {"type": int, "default": 3}),
        (
            "--model-method",
            {
                "default": "auto",
                "choices": [
                    "auto",
                    "mice_arima",
                    "mice_xgboost",
                    "gain",
                    "gain_arima",
                    "gain_xgboost",
                ],
            },
        ),
        ("--gain-epochs", {"type": int, "default": 500}),
        ("--gain-batch-size", {"type": int, "default": 128}),
        ("--gain-hint-rate", {"type": float, "default": 0.90}),
        ("--gain-alpha", {"type": float, "default": 10.0}),
        ("--gain-learning-rate", {"type": float, "default": 1e-3}),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    frame = pd.read_csv(args.input)

    result = impute_cgm(
        df=frame,
        timestamp_col=args.timestamp_col,
        subjectid_col=args.subjectid_col,
        glucose_col=args.glucose_col,
        interval_minutes=args.interval,
        time_gap_tolerance_minutes=args.time_gap_tolerance,
        use_arima_if_missing_leq=args.arima_threshold,
        model_method=args.model_method,
        gain_epochs=args.gain_epochs,
        gain_batch_size=args.gain_batch_size,
        gain_hint_rate=args.gain_hint_rate,
        gain_alpha=args.gain_alpha,
        gain_learning_rate=args.gain_learning_rate,
    )

    result.to_csv(args.output, index=False)

    # Method label and missing rate are constant per run; report row 0.
    chosen_method = result["imputation_method"].iloc[0]
    missing_rate = result["missing_rate"].iloc[0]
    print("Saved:", args.output)
    print("Chosen method:", chosen_method)
    print("Missing rate:", missing_rate)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def encode_sex(df: pd.DataFrame, col: str = "SEX") -> pd.DataFrame:
    """Return a copy of *df* with the sex column encoded as 1 (male) / 0 (female).

    Values are compared case-insensitively after stripping whitespace.
    Recognised spellings are ``M``/``MALE``/``1`` (-> 1) and
    ``F``/``FEMALE``/``0`` (-> 0). Anything else, including missing values,
    becomes NaN. If *col* is not present the copy is returned unchanged.

    Parameters
    ----------
    df:
        Input frame; it is not modified.
    col:
        Name of the column holding the sex labels.

    Returns
    -------
    pd.DataFrame
        Copy of *df* with *col* replaced by its numeric encoding.
    """
    out = df.copy()
    if col not in out.columns:
        return out

    # "1.0"/"0.0" cover numeric columns stored as float (pandas' dtype for
    # integer codes as soon as one value is missing): astype(str) renders
    # those as "1.0"/"0.0", which would otherwise all be mapped to NaN.
    codes = {
        "M": 1, "MALE": 1, "1": 1, "1.0": 1,
        "F": 0, "FEMALE": 0, "0": 0, "0.0": 0,
    }
    labels = out[col].astype(str).str.strip().str.upper()
    out[col] = labels.map(codes)
    return out
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def add_lag_features(
    df: pd.DataFrame,
    target_col: str = "glucose_value",
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    lag_k=(1, 2, 3),
    roll_window: int = 3
) -> pd.DataFrame:
    """Add per-subject lag and rolling-mean columns for *target_col*.

    The frame is sorted by ``[id_col, time_col]`` and re-indexed from 0.
    For every ``k`` in *lag_k* a column ``lag{k}`` holds the target value
    ``k`` rows earlier within the same subject. ``rollmean`` is the mean
    of the previous *roll_window* target values within the subject (the
    current row is excluded because the series is shifted by one first).

    Returns
    -------
    pd.DataFrame
        Sorted copy of *df* with the new feature columns appended.
    """
    ordered = df.sort_values([id_col, time_col]).reset_index(drop=True).copy()

    for lag in lag_k:
        column_name = f"lag{lag}"
        ordered[column_name] = ordered.groupby(id_col)[target_col].shift(lag)

    # Shift by one so the rolling window only sees strictly earlier values.
    previous_values = ordered.groupby(id_col)[target_col].shift(1)
    per_subject_windows = previous_values.groupby(ordered[id_col]).rolling(roll_window)
    rolling_mean = per_subject_windows.mean()
    # Drop the subject level so the result aligns on the row index again.
    ordered["rollmean"] = rolling_mean.reset_index(level=0, drop=True)

    return ordered
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def set_all_seeds(seed: int = 42) -> None:
    """Seed Python's ``random``, NumPy's global RNG and, if usable, TensorFlow.

    TensorFlow seeding is best effort: any failure while importing or
    seeding it is ignored so the function also works without TensorFlow.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    try:
        import tensorflow

        tensorflow.random.set_seed(seed)
    except Exception:
        # TensorFlow is optional; skip its seeding when unavailable.
        pass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def minmax_normalize(data_nan):
    """Min-max scale each column to [0, 1], ignoring NaN entries.

    Parameters
    ----------
    data_nan:
        Array-like of numbers; missing entries are ``np.nan``. It is
        converted to a float32 array.

    Returns
    -------
    tuple of np.ndarray
        ``(data_norm, col_min, denom)`` as float32, where
        ``data_norm = (data - col_min) / denom``. NaN entries stay NaN.
        Columns without a finite minimum/maximum (for example all-NaN or
        zero-row input) use ``col_min = 0`` and ``col_max = 1``; a column
        whose range is below ``1e-8`` uses ``denom = 1`` so the division
        is always defined.
    """
    data_nan = np.asarray(data_nan, dtype=np.float32)
    missing = np.isnan(data_nan)

    # Masked min/max instead of nanmin/nanmax: same values, but no
    # "All-NaN slice" RuntimeWarning for empty columns, and ``initial``
    # keeps a zero-row input from raising ValueError.
    col_min = np.min(np.where(missing, np.inf, data_nan), axis=0, initial=np.inf)
    col_max = np.max(np.where(missing, -np.inf, data_nan), axis=0, initial=-np.inf)

    col_min = np.where(np.isfinite(col_min), col_min, 0.0)
    col_max = np.where(np.isfinite(col_max), col_max, 1.0)
    denom = col_max - col_min
    # Constant columns would otherwise divide by (almost) zero.
    denom = np.where(np.abs(denom) < 1e-8, 1.0, denom)

    data_norm = (data_nan - col_min) / denom
    return data_norm.astype(np.float32), col_min.astype(np.float32), denom.astype(np.float32)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def minmax_denormalize(data_norm, col_min, denom):
    """Undo ``minmax_normalize``: map scaled values back to the original scale.

    Computes ``data_norm * denom + col_min`` with the per-column offset and
    range returned by the normalisation step.
    """
    rescaled = data_norm * denom
    return rescaled + col_min
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _make_mlp(input_dim, output_dim, hidden_dim=128, output_activation="sigmoid", name="mlp"):
|
|
34
|
+
try:
|
|
35
|
+
from tensorflow.keras import layers, Model
|
|
36
|
+
except Exception as exc:
|
|
37
|
+
raise ImportError(
|
|
38
|
+
"GAIN methods require TensorFlow. Install this package with: "
|
|
39
|
+
"python -m pip install -e .[gain]"
|
|
40
|
+
) from exc
|
|
41
|
+
|
|
42
|
+
inp = layers.Input(shape=(input_dim,))
|
|
43
|
+
x = layers.Dense(hidden_dim, activation="relu")(inp)
|
|
44
|
+
x = layers.Dense(hidden_dim, activation="relu")(x)
|
|
45
|
+
out = layers.Dense(output_dim, activation=output_activation)(x)
|
|
46
|
+
return Model(inp, out, name=name)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def gain_impute(
    data_nan,
    epochs: int = 500,
    batch_size: int = 128,
    hint_rate: float = 0.90,
    alpha: float = 10.0,
    learning_rate: float = 1e-3,
    seed: int = 42,
    verbose: bool = False,
):
    """
    Impute a numeric matrix with a GAIN-style generator/discriminator pair.

    Parameters
    ----------
    data_nan:
        Numeric 2-D matrix with missing entries represented as np.nan.
    epochs:
        Number of training iterations; one mini-batch is drawn per iteration.
    batch_size:
        Rows per mini-batch (sampled with replacement only when the matrix
        has fewer rows than this).
    hint_rate:
        Probability that a cell of the hint matrix reveals the true mask;
        the remaining cells are set to 0.5.
    alpha:
        Weight of the reconstruction (MSE on observed cells) term in the
        generator loss.
    learning_rate:
        Learning rate for both Adam optimizers.
    seed:
        Seed passed to ``set_all_seeds`` before training.
    verbose:
        If true, print losses on the first and every 100th iteration.

    Returns
    -------
    np.ndarray
        Full imputed matrix in the original scale. Observed values are preserved.

    Raises
    ------
    ImportError
        If TensorFlow cannot be imported.
    """
    try:
        import tensorflow as tf
    except Exception as exc:
        raise ImportError(
            "GAIN methods require TensorFlow. Install this package with: "
            "python -m pip install -e .[gain]"
        ) from exc

    def _blend(kept, observed, filler):
        # Keep `kept` where observed == 1, use `filler` where observed == 0.
        return kept * observed + filler * (1.0 - observed)

    set_all_seeds(seed)

    scaled, col_min, denom = minmax_normalize(data_nan)
    observed_mask = np.logical_not(np.isnan(scaled)).astype(np.float32)
    # Missing cells are zero-filled; they are overwritten by noise below.
    x_filled = np.nan_to_num(scaled, nan=0.0).astype(np.float32)

    n_rows, n_cols = x_filled.shape
    # Generator input: [data, mask]; discriminator input: [data, hint].
    generator = _make_mlp(
        input_dim=2 * n_cols,
        output_dim=n_cols,
        hidden_dim=128,
        output_activation="sigmoid",
        name="gain_generator",
    )
    discriminator = _make_mlp(
        input_dim=2 * n_cols,
        output_dim=n_cols,
        hidden_dim=128,
        output_activation="sigmoid",
        name="gain_discriminator",
    )

    g_opt = tf.keras.optimizers.Adam(learning_rate)
    d_opt = tf.keras.optimizers.Adam(learning_rate)
    eps = 1e-8  # keeps the logarithms finite

    for epoch in range(int(epochs)):
        rows = np.random.choice(
            n_rows, size=min(batch_size, n_rows), replace=n_rows < batch_size
        )
        x_batch = tf.convert_to_tensor(x_filled[rows], dtype=tf.float32)
        m_batch = tf.convert_to_tensor(observed_mask[rows], dtype=tf.float32)

        # Small uniform noise stands in for the missing cells.
        noise = tf.random.uniform(
            shape=tf.shape(x_batch), minval=0.0, maxval=0.01, dtype=tf.float32
        )
        x_noisy = _blend(x_batch, m_batch, noise)

        # Hint: true mask where revealed, 0.5 elsewhere.
        revealed = tf.cast(
            tf.random.uniform(shape=tf.shape(m_batch)) < hint_rate, tf.float32
        )
        hint = m_batch * revealed + 0.5 * (1.0 - revealed)

        # --- discriminator update ---
        with tf.GradientTape() as d_tape:
            generated = generator(tf.concat([x_noisy, m_batch], axis=1), training=True)
            completed = _blend(x_noisy, m_batch, generated)
            d_prob = discriminator(tf.concat([completed, hint], axis=1), training=True)
            d_loss = -tf.reduce_mean(
                m_batch * tf.math.log(d_prob + eps)
                + (1.0 - m_batch) * tf.math.log(1.0 - d_prob + eps)
            )
        d_grads = d_tape.gradient(d_loss, discriminator.trainable_variables)
        d_opt.apply_gradients(zip(d_grads, discriminator.trainable_variables))

        # --- generator update (fresh forward pass, updated discriminator) ---
        with tf.GradientTape() as g_tape:
            generated = generator(tf.concat([x_noisy, m_batch], axis=1), training=True)
            completed = _blend(x_noisy, m_batch, generated)
            d_prob = discriminator(tf.concat([completed, hint], axis=1), training=False)

            adversarial = -tf.reduce_mean((1.0 - m_batch) * tf.math.log(d_prob + eps))
            sq_error = tf.reduce_sum((m_batch * x_batch - m_batch * generated) ** 2)
            n_observed = tf.reduce_sum(m_batch) + eps
            reconstruction = sq_error / n_observed
            g_loss = adversarial + alpha * reconstruction
        g_grads = g_tape.gradient(g_loss, generator.trainable_variables)
        g_opt.apply_gradients(zip(g_grads, generator.trainable_variables))

        if verbose and ((epoch + 1) % 100 == 0 or epoch == 0):
            print(
                f"GAIN epoch {epoch + 1:4d}/{epochs} | "
                f"D_loss={float(d_loss):.5f} | G_loss={float(g_loss):.5f}"
            )

    # Final pass over the whole matrix with the trained generator.
    x_all = tf.convert_to_tensor(x_filled, dtype=tf.float32)
    m_all = tf.convert_to_tensor(observed_mask, dtype=tf.float32)
    noise_all = tf.random.uniform(
        shape=tf.shape(x_all), minval=0.0, maxval=0.01, dtype=tf.float32
    )
    x_noisy_all = _blend(x_all, m_all, noise_all)
    generated_all = generator(
        tf.concat([x_noisy_all, m_all], axis=1), training=False
    ).numpy()

    # Observed cells keep their own (scaled) value; only gaps are generated.
    imputed_norm = _blend(x_filled, observed_mask, generated_all)
    imputed = minmax_denormalize(imputed_norm, col_min, denom)
    return imputed.astype(float)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.experimental import enable_iterative_imputer # noqa: F401
|
|
4
|
+
from sklearn.impute import IterativeImputer
|
|
5
|
+
|
|
6
|
+
from .models import arima_segmentwise_on_mice, fit_xgb_predict_missing
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Accepted values for the ``model_method`` argument of ``impute_values``.
VALID_MODEL_METHODS = {
    "auto",
    "mice_arima", "mice_xgboost",
    "gain", "gain_arima", "gain_xgboost",
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _run_mice_matrix(imp_df: pd.DataFrame):
    """Complete *imp_df* with scikit-learn's ``IterativeImputer`` (MICE-style).

    The frame is converted to a float matrix and imputed with
    ``random_state=42`` and ``max_iter=10``.

    Returns
    -------
    tuple
        ``(first_column, remaining_columns)`` of the imputed matrix; callers
        place the target in the first column and the features after it.
    """
    imputer = IterativeImputer(random_state=42, max_iter=10)
    completed = imputer.fit_transform(imp_df.to_numpy(dtype=float))

    target_filled = completed[:, 0]
    features_filled = completed[:, 1:]
    return target_filled, features_filled
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _run_gain_matrix(
    imp_df: pd.DataFrame,
    gain_epochs: int = 500,
    gain_batch_size: int = 128,
    gain_hint_rate: float = 0.90,
    gain_alpha: float = 10.0,
    gain_learning_rate: float = 1e-3,
    gain_seed: int = 42,
    gain_verbose: bool = False,
):
    """Complete *imp_df* with ``gain_impute`` and split target from features.

    The frame is passed as a float32 matrix; the ``gain_*`` arguments are
    forwarded to the correspondingly named ``gain_impute`` parameters.

    Returns
    -------
    tuple
        ``(first_column, remaining_columns)`` of the imputed matrix.
    """
    # Imported lazily so TensorFlow is only needed for GAIN methods.
    from .gain import gain_impute

    settings = {
        "epochs": gain_epochs,
        "batch_size": gain_batch_size,
        "hint_rate": gain_hint_rate,
        "alpha": gain_alpha,
        "learning_rate": gain_learning_rate,
        "seed": gain_seed,
        "verbose": gain_verbose,
    }
    completed = gain_impute(data_nan=imp_df.to_numpy(dtype=np.float32), **settings)

    return completed[:, 0], completed[:, 1:]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def impute_values(
    df: pd.DataFrame,
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    target_col: str = "glucose_value",
    feature_cols=None,
    use_arima_if_missing_leq: float = 0.05,
    model_method: str = "auto",
    gain_epochs: int = 500,
    gain_batch_size: int = 128,
    gain_hint_rate: float = 0.90,
    gain_alpha: float = 10.0,
    gain_learning_rate: float = 1e-3,
    gain_seed: int = 42,
    gain_verbose: bool = False,
) -> pd.DataFrame:
    """
    Impute missing CGM glucose values.

    model_method options:
    auto -> MICE+ARIMA if missing rate <= threshold; otherwise MICE+XGBoost
    mice_arima -> force MICE+ARIMA
    mice_xgboost -> force MICE+XGBoost
    gain -> force GAIN-only imputation
    gain_arima -> use GAIN initial imputation, then ARIMA on missing segments
    gain_xgboost -> use GAIN-completed features, then XGBoost for missing glucose

    Parameters
    ----------
    df:
        Input frame; it is sorted by ``[id_col, time_col]`` into a copy.
    id_col, time_col, target_col:
        Names of the subject, time and glucose columns.
    feature_cols:
        Columns used as model features. Defaults to every column except
        *target_col*; all of them must be convertible to float.
    use_arima_if_missing_leq:
        Missing-rate threshold used by ``model_method="auto"``.
    gain_*:
        Forwarded to the GAIN imputer when a GAIN method is selected.

    Returns
    -------
    pd.DataFrame
        Sorted copy of *df* with ``imputed_glucose_value``,
        ``imputation_method``, ``missing_rate`` and ``model_method``
        columns. When nothing is missing, ``model_method`` is ``"none"``.

    Raises
    ------
    ValueError
        If *model_method* is not one of the valid options.
    """
    model_method = str(model_method).lower().strip()
    if model_method not in VALID_MODEL_METHODS:
        raise ValueError(
            f"Unknown model_method '{model_method}'. Valid options are: "
            + ", ".join(sorted(VALID_MODEL_METHODS))
        )

    out = df.sort_values([id_col, time_col]).reset_index(drop=True).copy()

    if feature_cols is None:
        feature_cols = [c for c in out.columns if c != target_col]

    mask_pos = out[target_col].isna().to_numpy()
    miss_rate = float(mask_pos.mean())

    if not np.any(mask_pos):
        out["imputed_glucose_value"] = out[target_col].to_numpy(dtype=float)
        out["imputation_method"] = "No missing glucose values"
        out["missing_rate"] = miss_rate
        # Keep the output schema identical to the imputing path below.
        out["model_method"] = "none"
        return out

    # Unpacking accepts any iterable of names (list, tuple, Index, ...).
    imp_df = out[[target_col, *feature_cols]].copy()

    if model_method == "auto":
        resolved_method = "mice_arima" if miss_rate <= use_arima_if_missing_leq else "mice_xgboost"
    else:
        resolved_method = model_method

    # Stage 1: complete target and features with MICE or GAIN.
    if resolved_method.startswith("mice"):
        y_base_full, X_imp = _run_mice_matrix(imp_df)
        base_label = "MICE"
    else:
        y_base_full, X_imp = _run_gain_matrix(
            imp_df,
            gain_epochs=gain_epochs,
            gain_batch_size=gain_batch_size,
            gain_hint_rate=gain_hint_rate,
            gain_alpha=gain_alpha,
            gain_learning_rate=gain_learning_rate,
            gain_seed=gain_seed,
            gain_verbose=gain_verbose,
        )
        base_label = "GAIN"

    y_final = np.asarray(y_base_full, dtype=float).copy()

    # Stage 2: optionally refine the missing positions.
    if resolved_method in {"mice_arima", "gain_arima"}:
        y_final = arima_segmentwise_on_mice(
            df_sorted=out,
            id_col=id_col,
            y_mice_full=y_base_full,
            mask_pos=mask_pos,
            order=(4, 1, 0),
            min_history=20,
        )
        method = f"{base_label}+ARIMA"

    elif resolved_method in {"mice_xgboost", "gain_xgboost"}:
        # Train on rows with an observed target, predict the missing rows.
        train_idx = ~mask_pos
        X_train = X_imp[train_idx]
        y_train = out.loc[train_idx, target_col].to_numpy(dtype=float)
        X_missing = X_imp[mask_pos]

        y_pred_missing = fit_xgb_predict_missing(
            X_train,
            y_train,
            X_missing,
        )

        y_final[mask_pos] = y_pred_missing
        method = f"{base_label}+XGBoost"

    elif resolved_method == "gain":
        method = "GAIN"

    else:
        raise RuntimeError(f"Could not resolve imputation method: {resolved_method}")

    out["imputed_glucose_value"] = y_final
    out["imputation_method"] = method
    out["missing_rate"] = miss_rate
    out["model_method"] = resolved_method

    return out
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from statsmodels.tsa.arima.model import ARIMA
|
|
3
|
+
import xgboost as xgb
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Keyword arguments used for every XGBRegressor built in this module.
XGB_PARAMS = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "n_jobs": 1,
    "random_state": 42,
    "tree_method": "hist",
    "eval_metric": "rmse",
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _segments(mask: np.ndarray):
|
|
21
|
+
segs = []
|
|
22
|
+
i = 0
|
|
23
|
+
n = len(mask)
|
|
24
|
+
while i < n:
|
|
25
|
+
if mask[i]:
|
|
26
|
+
j = i
|
|
27
|
+
while j + 1 < n and mask[j + 1]:
|
|
28
|
+
j += 1
|
|
29
|
+
segs.append((i, j))
|
|
30
|
+
i = j + 1
|
|
31
|
+
else:
|
|
32
|
+
i += 1
|
|
33
|
+
return segs
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def arima_segmentwise_on_mice(
    df_sorted,
    id_col: str,
    y_mice_full: np.ndarray,
    mask_pos: np.ndarray,
    order=(4, 1, 0),
    min_history: int = 20
) -> np.ndarray:
    """Replace missing-value segments with per-subject ARIMA forecasts.

    For every subject, each run of consecutive missing positions is
    forecast from the values that precede it in that subject's series.
    The history is taken from the initial imputation in *y_mice_full*,
    so earlier gaps contribute their initial imputed values rather than
    their ARIMA forecasts. A segment keeps its initial value when fewer
    than *min_history* values precede it, when that history contains
    non-finite values, or when fitting/forecasting fails.

    Parameters
    ----------
    df_sorted:
        Frame whose row order matches *y_mice_full* and *mask_pos*. Its
        index labels are used as positions into those arrays.
        NOTE(review): this assumes a 0..n-1 index (the visible caller
        resets the index) -- confirm for other callers.
    id_col:
        Subject identifier column used for grouping.
    y_mice_full:
        Fully imputed target series (from MICE or GAIN).
    mask_pos:
        Boolean array, true where the target was originally missing.
    order:
        ARIMA ``(p, d, q)`` order.
    min_history:
        Minimum number of preceding values required to fit a model.

    Returns
    -------
    np.ndarray
        Copy of *y_mice_full* with forecasts written into the segments
        that could be modelled.

    Warns
    -----
    RuntimeWarning
        Once, if ARIMA failed on at least one segment.
    """
    import warnings  # local import: only needed to report ARIMA failures

    pred_full = np.asarray(y_mice_full, dtype=float).copy()
    mask_pos = np.asarray(mask_pos, dtype=bool)
    failed_segments = 0

    for _, grp in df_sorted.groupby(id_col, sort=False):
        idx = grp.index.to_numpy()
        local_mask = mask_pos[idx]
        segs = _segments(local_mask)
        if not segs:
            continue

        # Fancy indexing copies, so forecasts written to pred_full below
        # do not feed into the history of later segments.
        local_series = pred_full[idx]

        for s, e in segs:
            block_len = e - s + 1
            history = local_series[:s]
            if len(history) < min_history:
                continue
            if not np.all(np.isfinite(history)):
                continue
            try:
                fit = ARIMA(history, order=order).fit()
                fcst = fit.forecast(steps=block_len)
                pred_full[idx[s:e+1]] = np.asarray(fcst, dtype=float)
            except Exception:
                # Best effort: keep the initial imputation for this
                # segment, but count it so the caller is told.
                failed_segments += 1
                continue

    if failed_segments:
        warnings.warn(
            f"ARIMA failed on {failed_segments} missing segment(s); "
            "the initial imputed values were kept for those segments.",
            RuntimeWarning,
            stacklevel=2,
        )

    return pred_full
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def fit_xgb_predict_missing(X_train, y_train, X_missing) -> np.ndarray:
    """Fit an XGBoost regressor on the observed rows and predict the missing ones.

    The model is built from ``XGB_PARAMS``, trained on ``(X_train, y_train)``
    and its predictions for ``X_missing`` are returned.
    """
    regressor = xgb.XGBRegressor(**XGB_PARAMS)
    regressor.fit(X_train, y_train, verbose=False)
    predictions = regressor.predict(X_missing)
    return predictions
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
from .timeseries_r import add_timeseries_column
|
|
5
|
+
from .features import encode_sex, add_lag_features
|
|
6
|
+
from .impute import impute_values
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def insert_missing_timeseries_rows(
    df: pd.DataFrame,
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    timestamp_col: str = "timestamp",
    glucose_col: str = "glucose_value",
    interval_minutes: int = 5,
    tolerance_minutes: int = 3,
    static_cols: "list[str] | None" = None,
) -> pd.DataFrame:
    """
    Insert synthetic rows when TimeSeries gaps exceed the expected interval.

    Between two consecutive observations of a subject, a row is inserted at
    ``current + k * interval_minutes`` for as long as that time is strictly
    earlier than ``next - tolerance_minutes``.

    Examples with interval=5 and tolerance=3:

    200, 205, 220
    -> 200, 205, 210, 215, 220

    200, 205, 210, 224
    -> 200, 205, 210, 215, 220, 224

    200, 205, 210, 223
    -> 200, 205, 210, 215, 223
    (220 is within the tolerance of 223, so it is not inserted)

    The observed row is preserved. Inserted rows receive missing glucose values
    and inherit static subject-level fields such as AGE, SEX, and HBA1C.

    Parameters
    ----------
    df:
        Input frame; it is not modified.
    id_col, time_col, glucose_col:
        Required columns: subject id, numeric time axis, glucose value.
    timestamp_col:
        Optional datetime column; when present, inserted rows get the
        preceding row's timestamp shifted by the time offset in minutes
        (``NaT`` if that timestamp cannot be parsed).
    interval_minutes:
        Expected spacing between readings; must be positive.
    tolerance_minutes:
        Slack before the next observation within which no row is inserted.
    static_cols:
        Columns copied from the preceding observed row into inserted rows.
        Defaults to ``["AGE", "SEX", "HBA1C"]``.

    Returns
    -------
    pd.DataFrame
        Rows sorted by ``[id_col, time_col]`` with a boolean
        ``inserted_missing_row`` column. An existing
        ``TimeDifferenceMinutes`` column is recomputed per subject.

    Raises
    ------
    ValueError
        If a required column is missing, *time_col* has non-numeric values,
        or *interval_minutes* is not positive.
    """
    # The annotation of static_cols is a string so that importing this
    # module also works on Python 3.9, where ``list[str] | None`` cannot
    # be evaluated at definition time.

    if static_cols is None:
        static_cols = ["AGE", "SEX", "HBA1C"]

    for col in (id_col, time_col, glucose_col):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' was not found.")

    # A zero or negative step would never reach the next observation and
    # the insertion loop below would not terminate.
    if not interval_minutes > 0:
        raise ValueError("interval_minutes must be a positive number.")

    out = df.copy()

    out[time_col] = pd.to_numeric(out[time_col], errors="coerce")

    if out[time_col].isna().any():
        raise ValueError(
            f"Some values in '{time_col}' could not be converted to numeric."
        )

    expanded_groups = []

    for subject_id, group in out.groupby(id_col, sort=False):
        group = group.sort_values(time_col).reset_index(drop=True).copy()

        has_timestamp = timestamp_col in group.columns
        inherited_cols = [col for col in static_cols if col in group.columns]
        last_position = len(group) - 1
        expanded_rows = []

        for i in range(len(group)):
            current_row = group.iloc[i].copy()
            current_row["inserted_missing_row"] = False
            expanded_rows.append(current_row)

            if i == last_position:
                continue

            current_ts = float(group.loc[i, time_col])
            next_ts = float(group.loc[i + 1, time_col])

            candidate_ts = current_ts + interval_minutes
            insert_before = next_ts - tolerance_minutes
            if not candidate_ts < insert_before:
                continue

            # Parsed once per gap; every inserted row is offset from it.
            current_timestamp = (
                pd.to_datetime(current_row[timestamp_col], errors="coerce")
                if has_timestamp
                else None
            )

            while candidate_ts < insert_before:
                new_row = pd.Series(index=group.columns, dtype="object")

                new_row[id_col] = subject_id
                new_row[time_col] = candidate_ts
                new_row[glucose_col] = np.nan
                new_row["inserted_missing_row"] = True

                for col in inherited_cols:
                    new_row[col] = current_row[col]

                if has_timestamp:
                    if pd.notna(current_timestamp):
                        offset_minutes = candidate_ts - current_ts
                        new_row[timestamp_col] = (
                            current_timestamp
                            + pd.to_timedelta(offset_minutes, unit="m")
                        )
                    else:
                        new_row[timestamp_col] = pd.NaT

                expanded_rows.append(new_row)
                candidate_ts += interval_minutes

        expanded_groups.append(pd.DataFrame(expanded_rows))

    if not expanded_groups:
        # Nothing to expand (e.g. an empty frame): pd.concat([]) would raise.
        empty = out.iloc[0:0].copy()
        empty["inserted_missing_row"] = np.zeros(0, dtype=bool)
        return empty

    expanded = pd.concat(expanded_groups, ignore_index=True)

    expanded = expanded.sort_values([id_col, time_col]).reset_index(drop=True)

    if "TimeDifferenceMinutes" in expanded.columns:
        # Inserted rows changed the spacing, so recompute it per subject.
        expanded["TimeDifferenceMinutes"] = (
            expanded.groupby(id_col)[time_col]
            .diff()
            .fillna(0)
        )

    expanded["inserted_missing_row"] = (
        expanded["inserted_missing_row"]
        .fillna(False)
        .astype(bool)
    )

    return expanded
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def impute_cgm(
    df: pd.DataFrame,
    timestamp_col: str = "timestamp",
    subjectid_col: str = "subjectid",
    glucose_col: str = "glucose_value",
    interval_minutes: int = 5,
    time_gap_tolerance_minutes: int = 3,
    use_arima_if_missing_leq: float = 0.05,
    model_method: str = "auto",
    gain_epochs: int = 500,
    gain_batch_size: int = 128,
    gain_hint_rate: float = 0.90,
    gain_alpha: float = 10.0,
    gain_learning_rate: float = 1e-3,
    gain_seed: int = 42,
    gain_verbose: bool = False,
) -> pd.DataFrame:
    """
    Full CGM missing-glucose imputation pipeline.

    Steps:
    1. Convert timestamp column into TimeSeries.
    2. Encode SEX as numeric.
    3. Insert missing time rows when CGM time gaps exceed the expected interval.
    4. Convert modeling columns to numeric.
    5. Create lag and rolling mean features.
    6. Run the selected imputation method: automatic MICE-based selection, forced MICE+ARIMA, forced MICE+XGBoost, GAIN, GAIN+ARIMA, or GAIN+XGBoost.

    Parameters
    ----------
    df:
        Input CGM frame; it is not modified.
    timestamp_col, subjectid_col, glucose_col:
        Names of the timestamp, subject id and glucose columns.
    interval_minutes:
        Expected spacing between readings.
    time_gap_tolerance_minutes:
        Tolerance used when deciding whether a reading is missing.
    use_arima_if_missing_leq:
        Missing-rate threshold used by ``model_method="auto"``.
    model_method:
        One of the methods accepted by ``impute_values``.
    gain_*:
        Settings forwarded to the GAIN imputer.

    Returns
    -------
    pd.DataFrame
        The frame returned by ``impute_values`` for the expanded data.

    Notes
    -----
    Subject ids are converted to numbers, and used as a model feature, only
    when every non-missing id is numeric. Otherwise the ids are left as they
    are, so per-subject grouping keeps working, and they are not used as a
    feature.
    """

    out = df.copy()

    # convert timestamp into TimeSeries
    out = add_timeseries_column(
        out,
        ts_col=timestamp_col,
        id_col=subjectid_col,
        interval_minutes=interval_minutes,
    )

    # Encoding follows encode_sex: male -> 1, female -> 0.
    out = encode_sex(out, "SEX")

    out = insert_missing_timeseries_rows(
        out,
        id_col=subjectid_col,
        time_col="TimeSeries",
        timestamp_col=timestamp_col,
        glucose_col=glucose_col,
        interval_minutes=interval_minutes,
        tolerance_minutes=time_gap_tolerance_minutes,
        static_cols=["AGE", "SEX", "HBA1C"],
    )

    numeric_cols = [
        glucose_col,
        "TimeSeries",
        "TimeDifferenceMinutes",
        "AGE",
        "HBA1C",
        "SEX",
    ]

    for col in numeric_cols:
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")

    # Coercing ids such as "S01" would turn every id into NaN and collapse
    # all per-subject grouping, so convert only when nothing is lost.
    id_is_numeric = False
    if subjectid_col in out.columns:
        id_as_number = pd.to_numeric(out[subjectid_col], errors="coerce")
        id_is_numeric = bool(
            (id_as_number.isna() == out[subjectid_col].isna()).all()
        )
        if id_is_numeric:
            out[subjectid_col] = id_as_number

    # create lag and rolling features.
    out = add_lag_features(
        out,
        target_col=glucose_col,
        id_col=subjectid_col,
        time_col="TimeSeries",
        lag_k=(1, 2, 3),
        roll_window=3,
    )

    # select model features.
    feature_cols = [
        "TimeSeries",
        "TimeDifferenceMinutes",
        subjectid_col,
        "AGE",
        "SEX",
        "HBA1C",
        "lag1",
        "lag2",
        "lag3",
        "rollmean",
    ]

    feature_cols = [
        col
        for col in feature_cols
        if col in out.columns and (col != subjectid_col or id_is_numeric)
    ]

    # Run imputation
    out = impute_values(
        out,
        id_col=subjectid_col,
        time_col="TimeSeries",
        target_col=glucose_col,
        feature_cols=feature_cols,
        use_arima_if_missing_leq=use_arima_if_missing_leq,
        model_method=model_method,
        gain_epochs=gain_epochs,
        gain_batch_size=gain_batch_size,
        gain_hint_rate=gain_hint_rate,
        gain_alpha=gain_alpha,
        gain_learning_rate=gain_learning_rate,
        gain_seed=gain_seed,
        gain_verbose=gain_verbose,
    )

    return out
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def insert_missing_timeseries_rows(
    df: pd.DataFrame,
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    timestamp_col: str = "timestamp",
    glucose_col: str = "glucose_value",
    interval_minutes: int = 5,
    tolerance_minutes: int = 3,
    # Quoted so the annotation is not evaluated at definition time:
    # ``list[str] | None`` raises TypeError on Python 3.9.
    static_cols: "list[str] | None" = None,
) -> pd.DataFrame:
    """
    Insert synthetic rows when TimeSeries gaps exceed the expected interval.

    Each subject is sorted by ``time_col``. Between two consecutive observed
    rows, candidate times ``current + k * interval_minutes`` are inserted for
    as long as the candidate is strictly less than
    ``next - tolerance_minutes``.

    Examples with interval=5 and tolerance=3:
        200, 205, 220      -> 200, 205, 210, 215, 220
        200, 205, 210, 224 -> 200, 205, 210, 215, 220, 224
        200, 205, 210, 223 -> 200, 205, 210, 215, 223
            (220 is not strictly below 223 - 3, so it is not inserted)

    The actual observed TimeSeries value is preserved.
    Inserted rows receive missing glucose values and inherit static subject
    fields from the observed row that precedes the gap. When ``timestamp_col``
    is present, inserted rows get the preceding row's timestamp shifted by the
    TimeSeries offset (``NaT`` if that timestamp cannot be parsed).

    Args:
        df: Input data; it is copied, not modified.
        id_col: Column identifying the subject.
        time_col: Numeric time column (minutes) used to detect gaps.
        timestamp_col: Optional timestamp column to extrapolate into new rows.
        glucose_col: Glucose column; set to NaN on inserted rows.
        interval_minutes: Expected spacing between rows; must be positive.
        tolerance_minutes: Slack before the next observation within which no
            row is inserted.
        static_cols: Columns copied onto inserted rows. Defaults to
            ``["AGE", "SEX", "HBA1C"]``; names absent from ``df`` are ignored.

    Returns:
        A new frame sorted by ``[id_col, time_col]`` with a boolean
        ``inserted_missing_row`` column. ``TimeDifferenceMinutes`` is
        recomputed per subject when that column exists.

    Raises:
        ValueError: If ``interval_minutes`` is not positive, a required column
            is missing, or ``time_col`` holds non-numeric values.
    """

    # A non-positive step never advances the candidate time, so the insertion
    # loop below would never terminate.
    if interval_minutes <= 0:
        raise ValueError("interval_minutes must be greater than 0.")

    if static_cols is None:
        static_cols = ["AGE", "SEX", "HBA1C"]

    for col in (id_col, time_col, glucose_col):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' was not found.")

    out = df.copy()
    out[time_col] = pd.to_numeric(out[time_col], errors="coerce")

    if out[time_col].isna().any():
        raise ValueError(f"Some values in '{time_col}' could not be converted to numeric.")

    # Column membership is the same for every group, so resolve it once.
    has_timestamp = timestamp_col in out.columns
    inherited_cols = [col for col in static_cols if col in out.columns]

    expanded_groups = []

    for subject_id, group in out.groupby(id_col, sort=False):
        group = group.sort_values(time_col).reset_index(drop=True)
        times = group[time_col].to_numpy(dtype=float)
        last_pos = len(group) - 1
        expanded_rows = []

        for pos in range(len(group)):
            current_row = group.iloc[pos].copy()
            current_row["inserted_missing_row"] = False
            expanded_rows.append(current_row)

            if pos == last_pos:
                continue

            current_ts = float(times[pos])
            gap_limit = float(times[pos + 1]) - tolerance_minutes
            candidate_ts = current_ts + interval_minutes

            if candidate_ts >= gap_limit:
                continue

            # Parsed once per gap; it does not change while filling the gap.
            base_timestamp = (
                pd.to_datetime(current_row[timestamp_col], errors="coerce")
                if has_timestamp
                else None
            )

            while candidate_ts < gap_limit:
                new_row = pd.Series(index=group.columns, dtype="object")

                new_row[id_col] = subject_id
                new_row[time_col] = candidate_ts
                new_row[glucose_col] = np.nan
                new_row["inserted_missing_row"] = True

                for col in inherited_cols:
                    new_row[col] = current_row[col]

                if has_timestamp:
                    if pd.notna(base_timestamp):
                        new_row[timestamp_col] = base_timestamp + pd.to_timedelta(
                            candidate_ts - current_ts,
                            unit="m",
                        )
                    else:
                        new_row[timestamp_col] = pd.NaT

                expanded_rows.append(new_row)
                candidate_ts += interval_minutes

        expanded_groups.append(pd.DataFrame(expanded_rows))

    # pd.concat raises on an empty list (empty input, or every id is NaN and
    # therefore dropped by groupby); return an empty, well-formed frame.
    if not expanded_groups:
        empty = out.iloc[0:0].copy()
        empty["inserted_missing_row"] = np.zeros(0, dtype=bool)
        return empty

    expanded = pd.concat(expanded_groups, ignore_index=True)
    expanded = expanded.sort_values([id_col, time_col]).reset_index(drop=True)

    if "TimeDifferenceMinutes" in expanded.columns:
        # Spacing changed where rows were inserted, so recompute it.
        expanded["TimeDifferenceMinutes"] = (
            expanded.groupby(id_col)[time_col]
            .diff()
            .fillna(0)
        )

    expanded["inserted_missing_row"] = expanded["inserted_missing_row"].fillna(False)

    return expanded
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _timeseries_fallback_python(df: pd.DataFrame, ts_col: str, id_col: str) -> pd.DataFrame:
    """
    Add a ``TimeSeries`` column of elapsed minutes using pandas only.

    Works on a copy of ``df``. Timestamps in ``ts_col`` are parsed, rows are
    sorted chronologically (within each ``id_col`` group when that column
    exists) and the index is reset. ``TimeSeries`` is the number of minutes
    since the earliest timestamp of the group, or of the whole frame when
    ``id_col`` is absent.

    Raises:
        ValueError: If any value in ``ts_col`` cannot be parsed as a datetime.
    """
    frame = df.copy()
    frame["_ts_parsed"] = pd.to_datetime(frame[ts_col], errors="coerce")
    if frame["_ts_parsed"].isna().any():
        raise ValueError("Some timestamp values could not be parsed.")

    per_subject = id_col in frame.columns
    sort_keys = [id_col, "_ts_parsed"] if per_subject else "_ts_parsed"
    frame = frame.sort_values(sort_keys).reset_index(drop=True)

    if per_subject:
        # Minutes elapsed since each subject's own first reading.
        elapsed = frame.groupby(id_col)["_ts_parsed"].transform(
            lambda stamps: (stamps - stamps.min()).dt.total_seconds() / 60.0
        )
    else:
        stamps = frame["_ts_parsed"]
        elapsed = (stamps - stamps.min()).dt.total_seconds() / 60.0

    frame["TimeSeries"] = elapsed

    # The parsed helper column is internal only.
    return frame.drop(columns=["_ts_parsed"])
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def add_timeseries_column(
    df: pd.DataFrame,
    ts_col: str = "timestamp",
    id_col: str = "subjectid",
    interval_minutes: int = 5,
    prefer_r: bool = False
) -> pd.DataFrame:
    """
    Ensure the frame has a ``TimeSeries`` column.

    If ``df`` already has a ``TimeSeries`` column with at least one non-null
    value, ``df`` itself is returned unchanged. Otherwise the column is
    computed by the pandas fallback, or, when ``prefer_r`` is true, by the R
    function ``equalInterval.fn`` from CGManalyzer through rpy2. Any exception
    on the R path causes the pandas fallback to be used instead.

    Raises:
        ValueError: If ``ts_col`` is not a column of ``df``.
    """
    if ts_col not in df.columns:
        raise ValueError(f"Column '{ts_col}' was not found.")

    # An existing, populated TimeSeries column is left alone.
    already_populated = "TimeSeries" in df.columns and df["TimeSeries"].notna().any()
    if already_populated:
        return df

    if prefer_r:
        # Attempt CGManalyzer through rpy2; every failure routes to the fallback.
        try:
            import rpy2.robjects as ro
            from rpy2.robjects import FloatVector
            from rpy2.robjects.packages import importr, isinstalled

            if not isinstalled("CGManalyzer"):
                raise ImportError("CGManalyzer is not installed in R; using Python fallback.")

            importr("CGManalyzer")
            equal_interval = ro.r["equalInterval.fn"]

            work = df.copy()
            work["_ts_parsed"] = pd.to_datetime(work[ts_col], errors="coerce")
            if work["_ts_parsed"].isna().any():
                raise ValueError("Some timestamp values could not be parsed.")

            def _with_r_times(block: pd.DataFrame) -> pd.DataFrame:
                # Remember the incoming row order so it can be restored.
                row_order = block.index
                ordered = block.copy().sort_values("_ts_parsed")

                stamps = ordered["_ts_parsed"]
                elapsed = (stamps - stamps.min()).dt.total_seconds() / 60.0
                n_rows = len(ordered)
                # Only the time axis of the R result is read below; y is a
                # placeholder series of row positions.
                placeholder_y = list(range(n_rows))

                r_result = equal_interval(
                    x=FloatVector(elapsed.to_numpy()),
                    y=FloatVector(placeholder_y),
                    Interval=int(interval_minutes)
                )
                # NOTE(review): presumably the first column of the result is
                # the time grid — confirm against CGManalyzer.
                grid = list(ro.r["as.data.frame"](r_result).rx2(1))

                # Use the first n_rows R values when there are enough of
                # them, otherwise keep the plain elapsed minutes.
                ordered["TimeSeries"] = (
                    grid[:n_rows] if len(grid) >= n_rows else elapsed.to_numpy()
                )

                return ordered.loc[row_order]

            if id_col in work.columns:
                work = work.groupby(id_col, group_keys=False).apply(_with_r_times)
            else:
                work = _with_r_times(work)

            return work.drop(columns=["_ts_parsed"])

        except Exception:
            return _timeseries_fallback_python(df, ts_col, id_col)

    return _timeseries_fallback_python(df, ts_col, id_col)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: imputeCGM
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CGM missing glucose imputation with MICE, ARIMA, XGBoost, and optional GAIN-based methods
|
|
5
|
+
Author: Hasin Shad, Shubh Saraswat, Dr. Xiaohua Douglas Zhang
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: scikit-learn
|
|
11
|
+
Requires-Dist: statsmodels
|
|
12
|
+
Requires-Dist: xgboost
|
|
13
|
+
Requires-Dist: rpy2
|
|
14
|
+
Provides-Extra: gain
|
|
15
|
+
Requires-Dist: tensorflow; extra == "gain"
|
|
16
|
+
|
|
17
|
+
# imputeCGM
|
|
18
|
+
|
|
19
|
+
`imputeCGM` is a Python package for imputing missing glucose values in continuous glucose monitoring (CGM) data.
|
|
20
|
+
|
|
21
|
+
The package supports:
|
|
22
|
+
|
|
23
|
+
- Automatic method selection: MICE+ARIMA when missingness is low and MICE+XGBoost when missingness is higher
|
|
24
|
+
- Forced MICE+ARIMA
|
|
25
|
+
- Forced MICE+XGBoost
|
|
26
|
+
- GAIN-only imputation
|
|
27
|
+
- GAIN+ARIMA
|
|
28
|
+
- GAIN+XGBoost
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
For MICE, ARIMA, and XGBoost methods:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python -m pip install -e .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For GAIN-based methods, install the optional TensorFlow dependency:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
python -m pip install -e .[gain]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Python usage
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from imputeCGM import impute_cgm
|
|
48
|
+
|
|
49
|
+
out = impute_cgm(
|
|
50
|
+
df,
|
|
51
|
+
timestamp_col="timestamp",
|
|
52
|
+
subjectid_col="subjectid",
|
|
53
|
+
glucose_col="glucose_value",
|
|
54
|
+
interval_minutes=5,
|
|
55
|
+
time_gap_tolerance_minutes=3,
|
|
56
|
+
model_method="auto",
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Valid `model_method` values are:
|
|
61
|
+
|
|
62
|
+
```text
|
|
63
|
+
auto
|
|
64
|
+
mice_arima
|
|
65
|
+
mice_xgboost
|
|
66
|
+
gain
|
|
67
|
+
gain_arima
|
|
68
|
+
gain_xgboost
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## CLI usage
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
imputeCGM --input ExampleData.csv --output ExampleData_imputed.csv --model-method auto
|
|
75
|
+
```
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
imputeCGM/__init__.py
|
|
4
|
+
imputeCGM/cli.py
|
|
5
|
+
imputeCGM/features.py
|
|
6
|
+
imputeCGM/gain.py
|
|
7
|
+
imputeCGM/gaps.py
|
|
8
|
+
imputeCGM/impute.py
|
|
9
|
+
imputeCGM/models.py
|
|
10
|
+
imputeCGM/pipeline.py
|
|
11
|
+
imputeCGM/timeseries_r.py
|
|
12
|
+
imputeCGM.egg-info/PKG-INFO
|
|
13
|
+
imputeCGM.egg-info/SOURCES.txt
|
|
14
|
+
imputeCGM.egg-info/dependency_links.txt
|
|
15
|
+
imputeCGM.egg-info/entry_points.txt
|
|
16
|
+
imputeCGM.egg-info/requires.txt
|
|
17
|
+
imputeCGM.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
imputeCGM
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "imputeCGM"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "CGM missing glucose imputation with MICE, ARIMA, XGBoost, and optional GAIN-based methods"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
authors = [{name = "Hasin Shad, Shubh Saraswat, Dr. Xiaohua Douglas Zhang"}]
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy",
|
|
14
|
+
"pandas",
|
|
15
|
+
"scikit-learn",
|
|
16
|
+
"statsmodels",
|
|
17
|
+
"xgboost",
|
|
18
|
+
"rpy2"
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
gain = ["tensorflow"]
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
imputeCGM = "imputeCGM.cli:main"
|