imputeCGM 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ Metadata-Version: 2.4
2
+ Name: imputeCGM
3
+ Version: 0.1.0
4
+ Summary: CGM missing glucose imputation with MICE, ARIMA, XGBoost, and optional GAIN-based methods
5
+ Author: Hasin Shad, Shubh Saraswat, Dr. Xiaohua Douglas Zhang
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas
10
+ Requires-Dist: scikit-learn
11
+ Requires-Dist: statsmodels
12
+ Requires-Dist: xgboost
13
+ Requires-Dist: rpy2
14
+ Provides-Extra: gain
15
+ Requires-Dist: tensorflow; extra == "gain"
16
+
17
+ # imputeCGM
18
+
19
+ `imputeCGM` is a Python package for imputing missing glucose values in continuous glucose monitoring (CGM) data.
20
+
21
+ The package supports:
22
+
23
+ - Automatic method selection: MICE+ARIMA when missingness is low and MICE+XGBoost when missingness is higher
24
+ - Forced MICE+ARIMA
25
+ - Forced MICE+XGBoost
26
+ - GAIN-only imputation
27
+ - GAIN+ARIMA
28
+ - GAIN+XGBoost
29
+
30
+ ## Installation
31
+
32
+ For MICE, ARIMA, and XGBoost methods:
33
+
34
+ ```bash
35
+ python -m pip install -e .
36
+ ```
37
+
38
+ For GAIN-based methods, install the optional TensorFlow dependency:
39
+
40
+ ```bash
41
+ python -m pip install -e .[gain]
42
+ ```
43
+
44
+ ## Python usage
45
+
46
+ ```python
47
+ from imputeCGM import impute_cgm
48
+
49
+ out = impute_cgm(
50
+ df,
51
+ timestamp_col="timestamp",
52
+ subjectid_col="subjectid",
53
+ glucose_col="glucose_value",
54
+ interval_minutes=5,
55
+ time_gap_tolerance_minutes=3,
56
+ model_method="auto",
57
+ )
58
+ ```
59
+
60
+ Valid `model_method` values are:
61
+
62
+ ```text
63
+ auto
64
+ mice_arima
65
+ mice_xgboost
66
+ gain
67
+ gain_arima
68
+ gain_xgboost
69
+ ```
70
+
71
+ ## CLI usage
72
+
73
+ ```bash
74
+ imputeCGM --input ExampleData.csv --output ExampleData_imputed.csv --model-method auto
75
+ ```
@@ -0,0 +1,59 @@
1
+ # imputeCGM
2
+
3
+ `imputeCGM` is a Python package for imputing missing glucose values in continuous glucose monitoring (CGM) data.
4
+
5
+ The package supports:
6
+
7
+ - Automatic method selection: MICE+ARIMA when missingness is low and MICE+XGBoost when missingness is higher
8
+ - Forced MICE+ARIMA
9
+ - Forced MICE+XGBoost
10
+ - GAIN-only imputation
11
+ - GAIN+ARIMA
12
+ - GAIN+XGBoost
13
+
14
+ ## Installation
15
+
16
+ For MICE, ARIMA, and XGBoost methods:
17
+
18
+ ```bash
19
+ python -m pip install -e .
20
+ ```
21
+
22
+ For GAIN-based methods, install the optional TensorFlow dependency:
23
+
24
+ ```bash
25
+ python -m pip install -e ".[gain]"
26
+ ```
27
+
28
+ ## Python usage
29
+
30
+ ```python
31
+ from imputeCGM import impute_cgm
32
+
33
+ out = impute_cgm(
34
+ df,
35
+ timestamp_col="timestamp",
36
+ subjectid_col="subjectid",
37
+ glucose_col="glucose_value",
38
+ interval_minutes=5,
39
+ time_gap_tolerance_minutes=3,
40
+ model_method="auto",
41
+ )
42
+ ```
43
+
44
+ Valid `model_method` values are:
45
+
46
+ ```text
47
+ auto
48
+ mice_arima
49
+ mice_xgboost
50
+ gain
51
+ gain_arima
52
+ gain_xgboost
53
+ ```
54
+
55
+ ## CLI usage
56
+
57
+ ```bash
58
+ imputeCGM --input ExampleData.csv --output ExampleData_imputed.csv --model-method auto
59
+ ```
@@ -0,0 +1,3 @@
1
+ from .pipeline import impute_cgm
2
+
3
+ __all__ = ["impute_cgm"]
@@ -0,0 +1,49 @@
1
+ import argparse
2
+ import pandas as pd
3
+ from .pipeline import impute_cgm
4
+
5
+
6
def main():
    """Command-line entry point: read a CSV, impute glucose, write a CSV.

    Parses the command-line flags, loads ``--input`` with pandas, runs
    ``impute_cgm`` with the parsed options, writes the result to
    ``--output`` and prints the chosen method and missing rate taken from
    the first output row.
    """
    parser = argparse.ArgumentParser()

    # Required file locations.
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)

    # Column-name overrides.
    column_flags = (
        ("--timestamp-col", "timestamp"),
        ("--subjectid-col", "subjectid"),
        ("--glucose-col", "glucose_value"),
    )
    for flag, fallback in column_flags:
        parser.add_argument(flag, default=fallback)

    # Sampling grid and automatic method selection.
    grid_flags = (
        ("--interval", int, 5),
        ("--arima-threshold", float, 0.05),
        ("--time-gap-tolerance", int, 3),
    )
    for flag, caster, fallback in grid_flags:
        parser.add_argument(flag, type=caster, default=fallback)

    parser.add_argument(
        "--model-method",
        default="auto",
        choices=["auto", "mice_arima", "mice_xgboost", "gain", "gain_arima", "gain_xgboost"],
    )

    # GAIN training hyperparameters.
    gain_flags = (
        ("--gain-epochs", int, 500),
        ("--gain-batch-size", int, 128),
        ("--gain-hint-rate", float, 0.90),
        ("--gain-alpha", float, 10.0),
        ("--gain-learning-rate", float, 1e-3),
    )
    for flag, caster, fallback in gain_flags:
        parser.add_argument(flag, type=caster, default=fallback)

    options = parser.parse_args()

    frame = pd.read_csv(options.input)

    pipeline_kwargs = {
        "timestamp_col": options.timestamp_col,
        "subjectid_col": options.subjectid_col,
        "glucose_col": options.glucose_col,
        "interval_minutes": options.interval,
        "time_gap_tolerance_minutes": options.time_gap_tolerance,
        "use_arima_if_missing_leq": options.arima_threshold,
        "model_method": options.model_method,
        "gain_epochs": options.gain_epochs,
        "gain_batch_size": options.gain_batch_size,
        "gain_hint_rate": options.gain_hint_rate,
        "gain_alpha": options.gain_alpha,
        "gain_learning_rate": options.gain_learning_rate,
    }
    result = impute_cgm(df=frame, **pipeline_kwargs)

    result.to_csv(options.output, index=False)
    print("Saved:", options.output)
    print("Chosen method:", result["imputation_method"].iloc[0])
    print("Missing rate:", result["missing_rate"].iloc[0])
@@ -0,0 +1,30 @@
1
+ import pandas as pd
2
+
3
+
4
def encode_sex(df: pd.DataFrame, col: str = "SEX") -> pd.DataFrame:
    """Return a copy of ``df`` with the sex column encoded as 1 (male) / 0 (female).

    Values are compared case-insensitively after trimming whitespace.
    Recognised spellings are ``M``/``MALE``/``1`` (-> 1) and
    ``F``/``FEMALE``/``0`` (-> 0). Float-coded values such as ``1.0`` and
    ``0.0`` are accepted too: a numeric column that contains a NaN is stored
    as float, so its string form is ``"1.0"`` rather than ``"1"``.
    Anything else, including missing values, becomes NaN.

    Parameters
    ----------
    df:
        Input frame; it is not modified.
    col:
        Name of the sex column. If the column is absent the copy is returned
        unchanged.
    """
    out = df.copy()
    if col not in out.columns:
        return out

    codes = {
        "M": 1, "MALE": 1, "1": 1, "1.0": 1,
        "F": 0, "FEMALE": 0, "0": 0, "0.0": 0,
    }
    labels = out[col].astype(str).str.strip().str.upper()
    out[col] = labels.map(codes)
    return out
10
+
11
+
12
def add_lag_features(
    df: pd.DataFrame,
    target_col: str = "glucose_value",
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    lag_k=(1, 2, 3),
    roll_window: int = 3
) -> pd.DataFrame:
    """Add per-subject lag and rolling-mean columns of the target.

    The frame is sorted by ``id_col`` then ``time_col`` and re-indexed. For
    every ``k`` in ``lag_k`` a column ``lag{k}`` holds the target shifted by
    ``k`` rows within the subject. ``rollmean`` is the rolling mean over
    ``roll_window`` rows of the target shifted by one row, so the current
    row's own value is never part of its rolling mean.

    Returns a new, sorted DataFrame; the input is not modified.
    """
    ordered = df.sort_values([id_col, time_col]).reset_index(drop=True).copy()

    # Compute every derived series from the untouched target first ...
    lag_columns = {
        f"lag{step}": ordered.groupby(id_col)[target_col].shift(step)
        for step in lag_k
    }
    previous_value = ordered.groupby(id_col)[target_col].shift(1)
    windowed = previous_value.groupby(ordered[id_col]).rolling(roll_window).mean()

    # ... then attach them in the original column order.
    for column_name, column_values in lag_columns.items():
        ordered[column_name] = column_values
    # Drop the group level so the result aligns with the frame's row index.
    ordered["rollmean"] = windowed.reset_index(level=0, drop=True)

    return ordered
@@ -0,0 +1,156 @@
1
+ import random
2
+ import numpy as np
3
+
4
+
5
def set_all_seeds(seed: int = 42) -> None:
    """Seed Python's ``random``, NumPy's global RNG and, if usable, TensorFlow.

    TensorFlow is optional: any failure while importing it or setting its
    seed is ignored so that the other generators are still seeded.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    try:
        import tensorflow
        tensorflow.random.set_seed(seed)
    except Exception:
        # Best effort only: TensorFlow may be missing or broken.
        return None
13
+
14
+
15
def minmax_normalize(data_nan):
    """Column-wise min-max scale a matrix while ignoring NaN entries.

    Returns
    -------
    tuple
        ``(scaled, col_min, denom)`` as float32 arrays, where
        ``scaled = (data - col_min) / denom``. Columns without any finite
        value use min 0 and max 1; a range smaller than 1e-8 is replaced by
        1 so constant columns do not divide by zero. NaN entries stay NaN.
    """
    values = np.asarray(data_nan, dtype=np.float32)
    lows = np.nanmin(values, axis=0)
    highs = np.nanmax(values, axis=0)

    # All-NaN columns produce NaN extremes; fall back to the unit interval.
    lows = np.where(~np.isfinite(lows), 0.0, lows)
    highs = np.where(~np.isfinite(highs), 1.0, highs)

    span = highs - lows
    span = np.where(np.abs(span) >= 1e-8, span, 1.0)

    scaled = (values - lows) / span
    return (
        scaled.astype(np.float32),
        lows.astype(np.float32),
        span.astype(np.float32),
    )
27
+
28
+
29
def minmax_denormalize(data_norm, col_min, denom):
    """Invert ``minmax_normalize``: map scaled values back to the original scale.

    Computes ``data_norm * denom + col_min`` with the usual broadcasting.
    """
    rescaled = data_norm * denom
    return rescaled + col_min
31
+
32
+
33
+ def _make_mlp(input_dim, output_dim, hidden_dim=128, output_activation="sigmoid", name="mlp"):
34
+ try:
35
+ from tensorflow.keras import layers, Model
36
+ except Exception as exc:
37
+ raise ImportError(
38
+ "GAIN methods require TensorFlow. Install this package with: "
39
+ "python -m pip install -e .[gain]"
40
+ ) from exc
41
+
42
+ inp = layers.Input(shape=(input_dim,))
43
+ x = layers.Dense(hidden_dim, activation="relu")(inp)
44
+ x = layers.Dense(hidden_dim, activation="relu")(x)
45
+ out = layers.Dense(output_dim, activation=output_activation)(x)
46
+ return Model(inp, out, name=name)
47
+
48
+
49
def gain_impute(
    data_nan,
    epochs: int = 500,
    batch_size: int = 128,
    hint_rate: float = 0.90,
    alpha: float = 10.0,
    learning_rate: float = 1e-3,
    seed: int = 42,
    verbose: bool = False,
):
    """
    GAIN imputation for numeric tabular data.

    The data are min-max scaled per column, a generator and a discriminator
    (both two-hidden-layer MLPs) are trained adversarially, and the generator
    output fills the missing cells.

    Parameters
    ----------
    data_nan:
        Numeric 2-D matrix (rows x columns) with missing entries represented
        as np.nan.
    epochs:
        Number of training steps. Each step draws ONE random mini-batch; it
        is not a full pass over the data.
    batch_size:
        Rows per mini-batch. When the data have fewer rows, that many rows
        are drawn with replacement.
    hint_rate:
        Probability that a cell's true mask is revealed to the discriminator.
        Unrevealed cells receive the hint value 0.5.
    alpha:
        Weight of the reconstruction (MSE on observed cells) term in the
        generator loss.
    learning_rate:
        Adam learning rate used for both networks.
    seed:
        Seed passed to ``set_all_seeds`` before training.
    verbose:
        Print losses on the first step and every 100th step.

    Returns
    -------
    np.ndarray
        Full imputed matrix in the original scale (float64). Observed values
        are copied back from the input, so they are preserved exactly.

    Raises
    ------
    ImportError
        If TensorFlow is not installed.
    ValueError
        If ``data_nan`` is not two-dimensional.
    """
    try:
        import tensorflow as tf
    except Exception as exc:
        raise ImportError(
            "GAIN methods require TensorFlow. Install this package with: "
            "python -m pip install -e .[gain]"
        ) from exc

    if np.ndim(data_nan) != 2:
        raise ValueError("data_nan must be a 2-D matrix (rows x columns).")

    set_all_seeds(seed)

    data_norm, col_min, denom = minmax_normalize(data_nan)
    # mask == 1 where a value was observed, 0 where it is missing.
    mask = (~np.isnan(data_norm)).astype(np.float32)
    x_filled = np.nan_to_num(data_norm, nan=0.0).astype(np.float32)

    n, d = x_filled.shape
    generator = _make_mlp(
        input_dim=2 * d,
        output_dim=d,
        hidden_dim=128,
        output_activation="sigmoid",
        name="gain_generator",
    )
    discriminator = _make_mlp(
        input_dim=2 * d,
        output_dim=d,
        hidden_dim=128,
        output_activation="sigmoid",
        name="gain_discriminator",
    )

    g_opt = tf.keras.optimizers.Adam(learning_rate)
    d_opt = tf.keras.optimizers.Adam(learning_rate)
    eps = 1e-8  # keeps log() away from zero

    for epoch in range(int(epochs)):
        batch_idx = np.random.choice(n, size=min(batch_size, n), replace=n < batch_size)
        x_mb = tf.convert_to_tensor(x_filled[batch_idx], dtype=tf.float32)
        m_mb = tf.convert_to_tensor(mask[batch_idx], dtype=tf.float32)

        # Missing cells are fed to the generator as small uniform noise.
        z_mb = tf.random.uniform(shape=tf.shape(x_mb), minval=0.0, maxval=0.01, dtype=tf.float32)
        x_hat = x_mb * m_mb + z_mb * (1.0 - m_mb)

        # Hint: the true mask where h_binary == 1, otherwise 0.5.
        h_binary = tf.cast(tf.random.uniform(shape=tf.shape(m_mb)) < hint_rate, tf.float32)
        h_mb = m_mb * h_binary + 0.5 * (1.0 - h_binary)

        # Discriminator step: predict per cell whether it was observed.
        with tf.GradientTape() as d_tape:
            g_sample = generator(tf.concat([x_hat, m_mb], axis=1), training=True)
            hat_x = x_hat * m_mb + g_sample * (1.0 - m_mb)
            d_prob = discriminator(tf.concat([hat_x, h_mb], axis=1), training=True)
            d_loss = -tf.reduce_mean(
                m_mb * tf.math.log(d_prob + eps)
                + (1.0 - m_mb) * tf.math.log(1.0 - d_prob + eps)
            )
        d_grads = d_tape.gradient(d_loss, discriminator.trainable_variables)
        d_opt.apply_gradients(zip(d_grads, discriminator.trainable_variables))

        # Generator step: fool the discriminator on missing cells and
        # reconstruct the observed cells.
        with tf.GradientTape() as g_tape:
            g_sample = generator(tf.concat([x_hat, m_mb], axis=1), training=True)
            hat_x = x_hat * m_mb + g_sample * (1.0 - m_mb)
            d_prob = discriminator(tf.concat([hat_x, h_mb], axis=1), training=False)

            g_loss_adv = -tf.reduce_mean((1.0 - m_mb) * tf.math.log(d_prob + eps))
            mse_num = tf.reduce_sum((m_mb * x_mb - m_mb * g_sample) ** 2)
            mse_den = tf.reduce_sum(m_mb) + eps
            g_loss_mse = mse_num / mse_den
            g_loss = g_loss_adv + alpha * g_loss_mse
        g_grads = g_tape.gradient(g_loss, generator.trainable_variables)
        g_opt.apply_gradients(zip(g_grads, generator.trainable_variables))

        if verbose and ((epoch + 1) % 100 == 0 or epoch == 0):
            print(
                f"GAIN epoch {epoch + 1:4d}/{epochs} | "
                f"D_loss={float(d_loss):.5f} | G_loss={float(g_loss):.5f}"
            )

    # Final pass over the whole matrix with the trained generator.
    x_tensor = tf.convert_to_tensor(x_filled, dtype=tf.float32)
    m_tensor = tf.convert_to_tensor(mask, dtype=tf.float32)
    z_all = tf.random.uniform(shape=tf.shape(x_tensor), minval=0.0, maxval=0.01, dtype=tf.float32)
    x_hat_all = x_tensor * m_tensor + z_all * (1.0 - m_tensor)
    g_all = generator(tf.concat([x_hat_all, m_tensor], axis=1), training=False).numpy()

    observed_norm = np.nan_to_num(data_norm, nan=0.0).astype(np.float32)
    imputed_norm = mask * observed_norm + (1.0 - mask) * g_all
    imputed = np.array(minmax_denormalize(imputed_norm, col_min, denom), dtype=float)

    # The float32 scale/unscale round trip can move observed values by a few
    # ulps; copy them back from the input so they are returned unchanged.
    observed = mask.astype(bool)
    imputed[observed] = np.asarray(data_nan, dtype=float)[observed]
    return imputed
@@ -0,0 +1,5 @@
1
+ import pandas as pd
2
+
3
+
4
def missing_rate(df: pd.DataFrame, target_col: str = "glucose_value") -> float:
    """Return the fraction of rows whose ``target_col`` value is missing."""
    is_missing = pd.isna(df[target_col])
    return float(is_missing.mean())
@@ -0,0 +1,168 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.experimental import enable_iterative_imputer # noqa: F401
4
+ from sklearn.impute import IterativeImputer
5
+
6
+ from .models import arima_segmentwise_on_mice, fit_xgb_predict_missing
7
+
8
+
9
# Accepted ``model_method`` keys. ``impute_values`` lower-cases and strips the
# caller's value before checking membership in this set.
VALID_MODEL_METHODS = {
    "auto",
    "mice_arima",
    "mice_xgboost",
    "gain",
    "gain_arima",
    "gain_xgboost",
}
17
+
18
+
19
def _run_mice_matrix(imp_df: pd.DataFrame):
    """Complete ``imp_df`` with scikit-learn's IterativeImputer (MICE).

    Returns ``(target, features)``: column 0 of the completed matrix and the
    remaining columns. Column 0 of ``imp_df`` is expected to be the target.
    """
    imputer = IterativeImputer(
        random_state=42,
        max_iter=10,
    )
    completed = imputer.fit_transform(imp_df.to_numpy(dtype=float))
    return completed[:, 0], completed[:, 1:]
28
+
29
+
30
def _run_gain_matrix(
    imp_df: pd.DataFrame,
    gain_epochs: int = 500,
    gain_batch_size: int = 128,
    gain_hint_rate: float = 0.90,
    gain_alpha: float = 10.0,
    gain_learning_rate: float = 1e-3,
    gain_seed: int = 42,
    gain_verbose: bool = False,
):
    """Complete ``imp_df`` with GAIN and split it into target and features.

    ``gain_impute`` is imported lazily so TensorFlow is only needed when a
    GAIN method is actually requested. Returns ``(target, features)``:
    column 0 of the completed matrix and the remaining columns.
    """
    from .gain import gain_impute

    gain_settings = {
        "epochs": gain_epochs,
        "batch_size": gain_batch_size,
        "hint_rate": gain_hint_rate,
        "alpha": gain_alpha,
        "learning_rate": gain_learning_rate,
        "seed": gain_seed,
        "verbose": gain_verbose,
    }
    completed = gain_impute(
        data_nan=imp_df.to_numpy(dtype=np.float32),
        **gain_settings,
    )
    return completed[:, 0], completed[:, 1:]
56
+
57
+
58
def impute_values(
    df: pd.DataFrame,
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    target_col: str = "glucose_value",
    feature_cols=None,
    use_arima_if_missing_leq: float = 0.05,
    model_method: str = "auto",
    gain_epochs: int = 500,
    gain_batch_size: int = 128,
    gain_hint_rate: float = 0.90,
    gain_alpha: float = 10.0,
    gain_learning_rate: float = 1e-3,
    gain_seed: int = 42,
    gain_verbose: bool = False,
) -> pd.DataFrame:
    """
    Impute missing CGM glucose values.

    model_method options:
        auto         -> MICE+ARIMA if missing rate <= threshold; otherwise MICE+XGBoost
        mice_arima   -> force MICE+ARIMA
        mice_xgboost -> force MICE+XGBoost
        gain         -> force GAIN-only imputation
        gain_arima   -> use GAIN initial imputation, then ARIMA on missing segments
        gain_xgboost -> use GAIN-completed features, then XGBoost for missing glucose

    Parameters
    ----------
    df:
        Input frame; it is sorted by ``id_col`` and ``time_col`` and copied.
    feature_cols:
        Predictor columns. ``None`` means every column except ``target_col``.
    use_arima_if_missing_leq:
        Missing-rate threshold used only by ``model_method="auto"``.
    gain_*:
        Forwarded to the GAIN trainer for the ``gain*`` methods.

    Returns
    -------
    pd.DataFrame
        The sorted frame plus ``imputed_glucose_value``, ``imputation_method``
        (human-readable label), ``missing_rate`` and ``model_method``. When
        the target has no missing values nothing is imputed and
        ``model_method`` holds the requested (normalised) method, because no
        method was resolved.

    Raises
    ------
    ValueError
        If ``model_method`` is unknown, or if ``target_col`` has no observed
        value at all.
    """
    model_method = str(model_method).lower().strip()
    if model_method not in VALID_MODEL_METHODS:
        raise ValueError(
            f"Unknown model_method '{model_method}'. Valid options are: "
            + ", ".join(sorted(VALID_MODEL_METHODS))
        )

    out = df.sort_values([id_col, time_col]).reset_index(drop=True).copy()

    if feature_cols is None:
        feature_cols = [c for c in out.columns if c != target_col]

    mask_pos = out[target_col].isna().to_numpy()
    miss_rate = float(mask_pos.mean())

    if not np.any(mask_pos):
        out["imputed_glucose_value"] = out[target_col].to_numpy(dtype=float)
        out["imputation_method"] = "No missing glucose values"
        out["missing_rate"] = miss_rate
        # Keep the output schema identical to the imputing path.
        out["model_method"] = model_method
        return out

    if mask_pos.all():
        # Without any observed target there is nothing to learn from, and
        # IterativeImputer would drop the all-NaN target column so that
        # column 0 of its output would no longer be glucose.
        raise ValueError(
            f"Column '{target_col}' has no observed values; cannot impute."
        )

    # The target must stay in column 0: the matrix helpers split on it.
    imp_df = out[[target_col] + feature_cols].copy()

    if model_method == "auto":
        resolved_method = "mice_arima" if miss_rate <= use_arima_if_missing_leq else "mice_xgboost"
    else:
        resolved_method = model_method

    if resolved_method.startswith("mice"):
        y_base_full, X_imp = _run_mice_matrix(imp_df)
        base_label = "MICE"
    else:
        y_base_full, X_imp = _run_gain_matrix(
            imp_df,
            gain_epochs=gain_epochs,
            gain_batch_size=gain_batch_size,
            gain_hint_rate=gain_hint_rate,
            gain_alpha=gain_alpha,
            gain_learning_rate=gain_learning_rate,
            gain_seed=gain_seed,
            gain_verbose=gain_verbose,
        )
        base_label = "GAIN"

    y_final = np.asarray(y_base_full, dtype=float).copy()

    if resolved_method in {"mice_arima", "gain_arima"}:
        # ARIMA overwrites missing segments where enough history exists;
        # elsewhere the base (MICE/GAIN) values are kept.
        y_final = arima_segmentwise_on_mice(
            df_sorted=out,
            id_col=id_col,
            y_mice_full=y_base_full,
            mask_pos=mask_pos,
            order=(4, 1, 0),
            min_history=20,
        )
        method = f"{base_label}+ARIMA"

    elif resolved_method in {"mice_xgboost", "gain_xgboost"}:
        # Train on rows with observed glucose, using the completed features.
        train_idx = ~mask_pos
        X_train = X_imp[train_idx]
        y_train = out.loc[train_idx, target_col].to_numpy(dtype=float)
        X_missing = X_imp[mask_pos]

        y_pred_missing = fit_xgb_predict_missing(
            X_train,
            y_train,
            X_missing,
        )

        y_final[mask_pos] = y_pred_missing
        method = f"{base_label}+XGBoost"

    elif resolved_method == "gain":
        method = "GAIN"

    else:
        raise RuntimeError(f"Could not resolve imputation method: {resolved_method}")

    out["imputed_glucose_value"] = y_final
    out["imputation_method"] = method
    out["missing_rate"] = miss_rate
    out["model_method"] = resolved_method

    return out
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ from statsmodels.tsa.arima.model import ARIMA
3
+ import xgboost as xgb
4
+
5
+
6
# Keyword arguments passed to ``xgb.XGBRegressor`` by
# ``fit_xgb_predict_missing``. ``random_state`` is fixed at 42.
XGB_PARAMS = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    n_jobs=1,
    random_state=42,
    tree_method="hist",
    eval_metric="rmse",
)
18
+
19
+
20
+ def _segments(mask: np.ndarray):
21
+ segs = []
22
+ i = 0
23
+ n = len(mask)
24
+ while i < n:
25
+ if mask[i]:
26
+ j = i
27
+ while j + 1 < n and mask[j + 1]:
28
+ j += 1
29
+ segs.append((i, j))
30
+ i = j + 1
31
+ else:
32
+ i += 1
33
+ return segs
34
+
35
+
36
def arima_segmentwise_on_mice(
    df_sorted,
    id_col: str,
    y_mice_full: np.ndarray,
    mask_pos: np.ndarray,
    order=(4, 1, 0),
    min_history: int = 20
) -> np.ndarray:
    """Replace missing segments of a completed series with ARIMA forecasts.

    For every subject, each run of originally-missing positions is forecast
    from all values that precede it in that subject's series. A segment
    keeps its base-imputed values when fewer than ``min_history`` values
    precede it, when that history contains non-finite values, or when
    fitting/forecasting fails.

    Parameters
    ----------
    df_sorted:
        Frame whose index labels are used as positions into ``y_mice_full``
        and ``mask_pos``.
    y_mice_full:
        Completed target series (base imputation) for all rows.
    mask_pos:
        Boolean array, True where the target was originally missing.

    Returns
    -------
    np.ndarray
        A float copy of ``y_mice_full`` with forecast segments overwritten.
    """
    result = np.asarray(y_mice_full, dtype=float).copy()
    was_missing = np.asarray(mask_pos, dtype=bool)

    for _, subject_rows in df_sorted.groupby(id_col, sort=False):
        positions = subject_rows.index.to_numpy()
        gaps = _segments(was_missing[positions])
        if not gaps:
            continue

        # Fancy indexing copies: histories are read from this snapshot, so
        # forecasts written for earlier gaps do not feed later ones.
        subject_values = result[positions]

        for start, stop in gaps:
            past = subject_values[:start]
            if len(past) < min_history:
                continue
            if not np.all(np.isfinite(past)):
                continue

            horizon = stop - start + 1
            try:
                fitted = ARIMA(past, order=order).fit()
                forecast = fitted.forecast(steps=horizon)
                result[positions[start:stop + 1]] = np.asarray(forecast, dtype=float)
            except Exception:
                # Best effort: keep the base-imputed values for this gap.
                continue

    return result
71
+
72
+
73
def fit_xgb_predict_missing(X_train, y_train, X_missing) -> np.ndarray:
    """Fit an XGBoost regressor with ``XGB_PARAMS`` and predict ``X_missing``."""
    regressor = xgb.XGBRegressor(**XGB_PARAMS)
    regressor.fit(X_train, y_train, verbose=False)
    predictions = regressor.predict(X_missing)
    return predictions
@@ -0,0 +1,241 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from .timeseries_r import add_timeseries_column
5
+ from .features import encode_sex, add_lag_features
6
+ from .impute import impute_values
7
+
8
+
9
def insert_missing_timeseries_rows(
    df: pd.DataFrame,
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    timestamp_col: str = "timestamp",
    glucose_col: str = "glucose_value",
    interval_minutes: int = 5,
    tolerance_minutes: int = 3,
    # Quoted so the annotation is not evaluated at definition time: the
    # ``X | None`` form raises TypeError on Python 3.9.
    static_cols: "list[str] | None" = None,
) -> pd.DataFrame:
    """
    Insert synthetic rows when TimeSeries gaps exceed the expected interval.

    Between two consecutive observed rows of a subject, a row is inserted at
    ``current + k * interval_minutes`` (k = 1, 2, ...) for as long as that
    time is strictly less than ``next - tolerance_minutes``.

    Examples with interval=5 and tolerance=3:

        200, 205, 220
        -> 200, 205, 210, 215, 220

        200, 205, 210, 224
        -> 200, 205, 210, 215, 220, 224

        200, 205, 210, 223
        -> 200, 205, 210, 215, 223
           (220 is not inserted: it is not more than 3 minutes before 223)

    Observed rows are preserved. Inserted rows receive missing glucose values
    and inherit the ``static_cols`` values (default AGE, SEX, HBA1C) of the
    preceding observed row. If ``timestamp_col`` exists, inserted rows get the
    preceding row's timestamp plus the time offset (NaT if it cannot be
    parsed). Every row gets a boolean ``inserted_missing_row`` flag, and an
    existing ``TimeDifferenceMinutes`` column is recomputed per subject.

    Raises
    ------
    ValueError
        If a required column is missing, ``interval_minutes`` is not
        positive, or ``time_col`` contains non-numeric values.
    """

    if static_cols is None:
        static_cols = ["AGE", "SEX", "HBA1C"]

    for col in (id_col, time_col, glucose_col):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' was not found.")

    if interval_minutes <= 0:
        # A non-positive step would never advance past the gap.
        raise ValueError("interval_minutes must be a positive number.")

    out = df.copy()

    out[time_col] = pd.to_numeric(out[time_col], errors="coerce")

    if out[time_col].isna().any():
        raise ValueError(
            f"Some values in '{time_col}' could not be converted to numeric."
        )

    expanded_groups = []

    for subject_id, group in out.groupby(id_col, sort=False):
        group = group.sort_values(time_col).reset_index(drop=True).copy()

        expanded_rows = []

        for i in range(len(group)):
            current_row = group.iloc[i].copy()
            current_row["inserted_missing_row"] = False
            expanded_rows.append(current_row)

            if i == len(group) - 1:
                continue

            current_ts = float(group.loc[i, time_col])
            next_ts = float(group.loc[i + 1, time_col])

            candidate_ts = current_ts + interval_minutes

            while candidate_ts < next_ts - tolerance_minutes:
                new_row = pd.Series(index=group.columns, dtype="object")

                new_row[id_col] = subject_id
                new_row[time_col] = candidate_ts
                new_row[glucose_col] = np.nan
                new_row["inserted_missing_row"] = True

                for col in static_cols:
                    if col in group.columns:
                        new_row[col] = current_row[col]

                if timestamp_col in group.columns:
                    current_timestamp = pd.to_datetime(
                        current_row[timestamp_col],
                        errors="coerce",
                    )

                    if pd.notna(current_timestamp):
                        offset_minutes = candidate_ts - current_ts

                        new_row[timestamp_col] = (
                            current_timestamp
                            + pd.to_timedelta(offset_minutes, unit="m")
                        )
                    else:
                        new_row[timestamp_col] = pd.NaT

                expanded_rows.append(new_row)
                candidate_ts += interval_minutes

        expanded_group = pd.DataFrame(expanded_rows)
        expanded_groups.append(expanded_group)

    if not expanded_groups:
        # No groups (e.g. empty input): pd.concat would raise on an empty list.
        empty = out.iloc[0:0].copy()
        empty["inserted_missing_row"] = pd.Series(dtype=bool)
        return empty

    expanded = pd.concat(expanded_groups, ignore_index=True)

    expanded = expanded.sort_values([id_col, time_col]).reset_index(drop=True)

    if "TimeDifferenceMinutes" in expanded.columns:
        expanded["TimeDifferenceMinutes"] = (
            expanded.groupby(id_col)[time_col]
            .diff()
            .fillna(0)
        )

    expanded["inserted_missing_row"] = (
        expanded["inserted_missing_row"]
        .fillna(False)
        .astype(bool)
    )

    return expanded
127
+
128
+
129
def impute_cgm(
    df: pd.DataFrame,
    timestamp_col: str = "timestamp",
    subjectid_col: str = "subjectid",
    glucose_col: str = "glucose_value",
    interval_minutes: int = 5,
    time_gap_tolerance_minutes: int = 3,
    use_arima_if_missing_leq: float = 0.05,
    model_method: str = "auto",
    gain_epochs: int = 500,
    gain_batch_size: int = 128,
    gain_hint_rate: float = 0.90,
    gain_alpha: float = 10.0,
    gain_learning_rate: float = 1e-3,
    gain_seed: int = 42,
    gain_verbose: bool = False,
) -> pd.DataFrame:
    """
    Full CGM missing-glucose imputation pipeline.

    Steps:
    1. Convert timestamp column into TimeSeries.
    2. Encode SEX as numeric.
    3. Insert missing time rows when CGM time gaps exceed the expected interval.
    4. Convert modeling columns to numeric. Subject ids that are not numeric
       are replaced by integer codes for modelling and restored afterwards.
    5. Create lag and rolling mean features.
    6. Run the selected imputation method: automatic MICE-based selection, forced MICE+ARIMA, forced MICE+XGBoost, GAIN, GAIN+ARIMA, or GAIN+XGBoost.

    Parameters
    ----------
    df:
        Input data; it is copied, not modified.
    timestamp_col, subjectid_col, glucose_col:
        Names of the timestamp, subject-id and glucose columns.
    interval_minutes, time_gap_tolerance_minutes:
        Expected sampling interval and the tolerance used when inserting
        rows for time gaps.
    use_arima_if_missing_leq, model_method, gain_*:
        Forwarded to ``impute_values``.

    Returns
    -------
    pd.DataFrame
        The frame returned by ``impute_values`` (sorted by subject and
        TimeSeries, with the imputation result columns added).
    """

    out = df.copy()

    # convert timestamp into TimeSeries
    out = add_timeseries_column(
        out,
        ts_col=timestamp_col,
        id_col=subjectid_col,
        interval_minutes=interval_minutes,
    )

    # encode_sex maps male -> 1 and female -> 0
    out = encode_sex(out, "SEX")

    out = insert_missing_timeseries_rows(
        out,
        id_col=subjectid_col,
        time_col="TimeSeries",
        timestamp_col=timestamp_col,
        glucose_col=glucose_col,
        interval_minutes=interval_minutes,
        tolerance_minutes=time_gap_tolerance_minutes,
        static_cols=["AGE", "SEX", "HBA1C"],
    )

    # Non-numeric subject ids (e.g. "P001") would be coerced to NaN by the
    # numeric conversion below, which destroys the per-subject grouping and
    # the ids themselves. Model on integer codes instead and map them back
    # at the end. Ids that already parse as numbers are left untouched.
    id_labels = None
    if subjectid_col in out.columns:
        ids_as_numbers = pd.to_numeric(out[subjectid_col], errors="coerce")
        if ids_as_numbers.isna().any():
            id_codes, id_labels = pd.factorize(out[subjectid_col], sort=True)
            out[subjectid_col] = id_codes

    numeric_cols = [
        glucose_col,
        "TimeSeries",
        "TimeDifferenceMinutes",
        subjectid_col,
        "AGE",
        "HBA1C",
        "SEX",
    ]

    for col in numeric_cols:
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")

    # create lag and rolling features.
    out = add_lag_features(
        out,
        target_col=glucose_col,
        id_col=subjectid_col,
        time_col="TimeSeries",
        lag_k=(1, 2, 3),
        roll_window=3,
    )

    # select model features.
    feature_cols = [
        "TimeSeries",
        "TimeDifferenceMinutes",
        subjectid_col,
        "AGE",
        "SEX",
        "HBA1C",
        "lag1",
        "lag2",
        "lag3",
        "rollmean",
    ]

    feature_cols = [col for col in feature_cols if col in out.columns]

    # Run imputation
    out = impute_values(
        out,
        id_col=subjectid_col,
        time_col="TimeSeries",
        target_col=glucose_col,
        feature_cols=feature_cols,
        use_arima_if_missing_leq=use_arima_if_missing_leq,
        model_method=model_method,
        gain_epochs=gain_epochs,
        gain_batch_size=gain_batch_size,
        gain_hint_rate=gain_hint_rate,
        gain_alpha=gain_alpha,
        gain_learning_rate=gain_learning_rate,
        gain_seed=gain_seed,
        gain_verbose=gain_verbose,
    )

    if id_labels is not None:
        # Restore the caller's original subject ids.
        codes = out[subjectid_col].to_numpy(dtype=int)
        out[subjectid_col] = id_labels.take(codes).to_numpy()

    return out
@@ -0,0 +1,191 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+
5
def insert_missing_timeseries_rows(
    df: pd.DataFrame,
    id_col: str = "subjectid",
    time_col: str = "TimeSeries",
    timestamp_col: str = "timestamp",
    glucose_col: str = "glucose_value",
    interval_minutes: int = 5,
    tolerance_minutes: int = 3,
    # Quoted so the annotation is not evaluated at definition time: the
    # ``X | None`` form raises TypeError on Python 3.9.
    static_cols: "list[str] | None" = None,
) -> pd.DataFrame:
    """
    Insert synthetic rows when TimeSeries gaps exceed the expected interval.

    A row is inserted at ``current + k * interval_minutes`` (k = 1, 2, ...)
    while that time is strictly less than ``next - tolerance_minutes``.

    Examples with interval=5 and tolerance=3:
        200, 205, 220      -> 200, 205, 210, 215, 220
        200, 205, 210, 224 -> 200, 205, 210, 215, 220, 224
        200, 205, 210, 223 -> 200, 205, 210, 215, 223
            (220 is not inserted: it is not more than 3 minutes before 223)

    The actual observed TimeSeries value is preserved.
    Inserted rows receive missing glucose values and inherit static subject
    fields from the preceding observed row. If ``timestamp_col`` exists,
    inserted rows get the preceding row's timestamp plus the time offset
    (NaT if it cannot be parsed). Every row gets a boolean
    ``inserted_missing_row`` flag, and an existing ``TimeDifferenceMinutes``
    column is recomputed per subject.

    Raises
    ------
    ValueError
        If a required column is missing, ``interval_minutes`` is not
        positive, or ``time_col`` contains non-numeric values.
    """

    if static_cols is None:
        static_cols = ["AGE", "SEX", "HBA1C"]

    required_cols = [id_col, time_col, glucose_col]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' was not found.")

    if interval_minutes <= 0:
        # A non-positive step would never advance past the gap.
        raise ValueError("interval_minutes must be a positive number.")

    out = df.copy()
    out[time_col] = pd.to_numeric(out[time_col], errors="coerce")

    if out[time_col].isna().any():
        raise ValueError(f"Some values in '{time_col}' could not be converted to numeric.")

    expanded_groups = []

    for subject_id, group in out.groupby(id_col, sort=False):
        group = group.sort_values(time_col).reset_index(drop=True).copy()
        expanded_rows = []

        for i in range(len(group)):
            current_row = group.iloc[i].copy()
            current_row["inserted_missing_row"] = False
            expanded_rows.append(current_row)

            if i == len(group) - 1:
                continue

            current_ts = float(group.loc[i, time_col])
            next_ts = float(group.loc[i + 1, time_col])

            candidate_ts = current_ts + interval_minutes

            while candidate_ts < next_ts - tolerance_minutes:
                new_row = pd.Series(index=group.columns, dtype="object")

                new_row[id_col] = subject_id
                new_row[time_col] = candidate_ts
                new_row[glucose_col] = np.nan
                new_row["inserted_missing_row"] = True

                for col in static_cols:
                    if col in group.columns:
                        new_row[col] = current_row[col]

                if timestamp_col in group.columns:
                    current_timestamp = pd.to_datetime(
                        current_row[timestamp_col],
                        errors="coerce",
                    )

                    if pd.notna(current_timestamp):
                        offset_minutes = candidate_ts - current_ts
                        new_row[timestamp_col] = current_timestamp + pd.to_timedelta(
                            offset_minutes,
                            unit="m",
                        )
                    else:
                        new_row[timestamp_col] = pd.NaT

                expanded_rows.append(new_row)
                candidate_ts += interval_minutes

        expanded_groups.append(pd.DataFrame(expanded_rows))

    if not expanded_groups:
        # No groups (e.g. empty input): pd.concat would raise on an empty list.
        empty = out.iloc[0:0].copy()
        empty["inserted_missing_row"] = pd.Series(dtype=bool)
        return empty

    expanded = pd.concat(expanded_groups, ignore_index=True)
    expanded = expanded.sort_values([id_col, time_col]).reset_index(drop=True)

    if "TimeDifferenceMinutes" in expanded.columns:
        expanded["TimeDifferenceMinutes"] = (
            expanded.groupby(id_col)[time_col]
            .diff()
            .fillna(0)
        )

    # Cast so the flag is a real boolean column rather than object dtype.
    expanded["inserted_missing_row"] = (
        expanded["inserted_missing_row"].fillna(False).astype(bool)
    )

    return expanded
105
+
106
+
107
+ def _timeseries_fallback_python(df: pd.DataFrame, ts_col: str, id_col: str) -> pd.DataFrame:
108
+ out = df.copy()
109
+ out["_ts_parsed"] = pd.to_datetime(out[ts_col], errors="coerce")
110
+ if out["_ts_parsed"].isna().any():
111
+ raise ValueError("Some timestamp values could not be parsed.")
112
+
113
+ if id_col in out.columns:
114
+ out = out.sort_values([id_col, "_ts_parsed"]).reset_index(drop=True)
115
+ out["TimeSeries"] = (
116
+ out.groupby(id_col)["_ts_parsed"]
117
+ .transform(lambda s: (s - s.min()).dt.total_seconds() / 60.0)
118
+ )
119
+ else:
120
+ out = out.sort_values("_ts_parsed").reset_index(drop=True)
121
+ out["TimeSeries"] = (out["_ts_parsed"] - out["_ts_parsed"].min()).dt.total_seconds() / 60.0
122
+
123
+ return out.drop(columns=["_ts_parsed"])
124
+
125
+
126
def add_timeseries_column(
    df: pd.DataFrame,
    ts_col: str = "timestamp",
    id_col: str = "subjectid",
    interval_minutes: int = 5,
    prefer_r: bool = False
) -> pd.DataFrame:
    """Ensure the frame has a ``TimeSeries`` column.

    * If ``TimeSeries`` already exists with at least one non-null value, the
      input frame itself is returned unchanged.
    * With ``prefer_r=False`` (default) the pure-pandas fallback is used.
    * With ``prefer_r=True`` the R function ``equalInterval.fn`` from the
      ``CGManalyzer`` package is called through rpy2 for each subject; any
      failure along that path falls back to the pandas implementation.

    Raises
    ------
    ValueError
        If ``ts_col`` is not a column of ``df``.
    """
    if ts_col not in df.columns:
        raise ValueError(f"Column '{ts_col}' was not found.")

    # If TimeSeries already exists, do nothing
    already_present = "TimeSeries" in df.columns and df["TimeSeries"].notna().any()
    if already_present:
        return df

    if prefer_r:
        # Try CGManalyzer via rpy2; if it fails, fall back
        try:
            import rpy2.robjects as ro
            from rpy2.robjects import FloatVector
            from rpy2.robjects.packages import importr, isinstalled

            if not isinstalled("CGManalyzer"):
                raise ImportError("CGManalyzer is not installed in R; using Python fallback.")

            importr("CGManalyzer")
            equal_interval = ro.r["equalInterval.fn"]

            work = df.copy()
            work["_ts_parsed"] = pd.to_datetime(work[ts_col], errors="coerce")
            if work["_ts_parsed"].isna().any():
                raise ValueError("Some timestamp values could not be parsed.")

            def _add(group: pd.DataFrame) -> pd.DataFrame:
                # Work on a time-sorted copy, then restore the caller's order.
                group = group.copy()
                incoming_order = group.index
                group = group.sort_values("_ts_parsed")

                start_time = group["_ts_parsed"].min()
                elapsed = (group["_ts_parsed"] - start_time).dt.total_seconds() / 60.0
                placeholder_y = list(range(len(group)))

                r_output = equal_interval(
                    x=FloatVector(elapsed.to_numpy()),
                    y=FloatVector(placeholder_y),
                    Interval=int(interval_minutes)
                )
                grid_times = list(ro.r["as.data.frame"](r_output).rx2(1))

                # Use the R times only when there are enough of them;
                # otherwise keep the raw elapsed minutes.
                if len(grid_times) < len(group):
                    group["TimeSeries"] = elapsed.to_numpy()
                else:
                    group["TimeSeries"] = grid_times[:len(group)]

                return group.loc[incoming_order]

            if id_col in work.columns:
                work = work.groupby(id_col, group_keys=False).apply(_add)
            else:
                work = _add(work)

            return work.drop(columns=["_ts_parsed"])

        except Exception:
            return _timeseries_fallback_python(df, ts_col, id_col)

    return _timeseries_fallback_python(df, ts_col, id_col)
@@ -0,0 +1,75 @@
1
+ Metadata-Version: 2.4
2
+ Name: imputeCGM
3
+ Version: 0.1.0
4
+ Summary: CGM missing glucose imputation with MICE, ARIMA, XGBoost, and optional GAIN-based methods
5
+ Author: Hasin Shad, Shubh Saraswat, Dr. Xiaohua Douglas Zhang
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas
10
+ Requires-Dist: scikit-learn
11
+ Requires-Dist: statsmodels
12
+ Requires-Dist: xgboost
13
+ Requires-Dist: rpy2
14
+ Provides-Extra: gain
15
+ Requires-Dist: tensorflow; extra == "gain"
16
+
17
+ # imputeCGM
18
+
19
+ `imputeCGM` is a Python package for imputing missing glucose values in continuous glucose monitoring (CGM) data.
20
+
21
+ The package supports:
22
+
23
+ - Automatic method selection: MICE+ARIMA when missingness is low and MICE+XGBoost when missingness is higher
24
+ - Forced MICE+ARIMA
25
+ - Forced MICE+XGBoost
26
+ - GAIN-only imputation
27
+ - GAIN+ARIMA
28
+ - GAIN+XGBoost
29
+
30
+ ## Installation
31
+
32
+ For MICE, ARIMA, and XGBoost methods:
33
+
34
+ ```bash
35
+ python -m pip install -e .
36
+ ```
37
+
38
+ For GAIN-based methods, install the optional TensorFlow dependency:
39
+
40
+ ```bash
41
+ python -m pip install -e .[gain]
42
+ ```
43
+
44
+ ## Python usage
45
+
46
+ ```python
47
+ from imputeCGM import impute_cgm
48
+
49
+ out = impute_cgm(
50
+ df,
51
+ timestamp_col="timestamp",
52
+ subjectid_col="subjectid",
53
+ glucose_col="glucose_value",
54
+ interval_minutes=5,
55
+ time_gap_tolerance_minutes=3,
56
+ model_method="auto",
57
+ )
58
+ ```
59
+
60
+ Valid `model_method` values are:
61
+
62
+ ```text
63
+ auto
64
+ mice_arima
65
+ mice_xgboost
66
+ gain
67
+ gain_arima
68
+ gain_xgboost
69
+ ```
70
+
71
+ ## CLI usage
72
+
73
+ ```bash
74
+ imputeCGM --input ExampleData.csv --output ExampleData_imputed.csv --model-method auto
75
+ ```
@@ -0,0 +1,17 @@
1
+ README.md
2
+ pyproject.toml
3
+ imputeCGM/__init__.py
4
+ imputeCGM/cli.py
5
+ imputeCGM/features.py
6
+ imputeCGM/gain.py
7
+ imputeCGM/gaps.py
8
+ imputeCGM/impute.py
9
+ imputeCGM/models.py
10
+ imputeCGM/pipeline.py
11
+ imputeCGM/timeseries_r.py
12
+ imputeCGM.egg-info/PKG-INFO
13
+ imputeCGM.egg-info/SOURCES.txt
14
+ imputeCGM.egg-info/dependency_links.txt
15
+ imputeCGM.egg-info/entry_points.txt
16
+ imputeCGM.egg-info/requires.txt
17
+ imputeCGM.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ imputeCGM = imputeCGM.cli:main
@@ -0,0 +1,9 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ statsmodels
5
+ xgboost
6
+ rpy2
7
+
8
+ [gain]
9
+ tensorflow
@@ -0,0 +1 @@
1
+ imputeCGM
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "imputeCGM"
7
+ version = "0.1.0"
8
+ description = "CGM missing glucose imputation with MICE, ARIMA, XGBoost, and optional GAIN-based methods"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ authors = [{name = "Hasin Shad, Shubh Saraswat, Dr. Xiaohua Douglas Zhang"}]
12
+ dependencies = [
13
+ "numpy",
14
+ "pandas",
15
+ "scikit-learn",
16
+ "statsmodels",
17
+ "xgboost",
18
+ "rpy2"
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ gain = ["tensorflow"]
23
+
24
+ [project.scripts]
25
+ imputeCGM = "imputeCGM.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+