tsgap 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsgap/__init__.py +22 -0
- tsgap/core.py +264 -0
- tsgap/mechanisms.py +521 -0
- tsgap/patterns.py +528 -0
- tsgap/tests/__init__.py +0 -0
- tsgap/tests/test_missingness.py +961 -0
- tsgap-0.3.0.dist-info/METADATA +681 -0
- tsgap-0.3.0.dist-info/RECORD +11 -0
- tsgap-0.3.0.dist-info/WHEEL +5 -0
- tsgap-0.3.0.dist-info/licenses/LICENSE +21 -0
- tsgap-0.3.0.dist-info/top_level.txt +1 -0
tsgap/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""TSGap — Composable Time-Series Missingness Simulation
|
|
2
|
+
|
|
3
|
+
A library for simulating realistic missingness patterns in time-series data
|
|
4
|
+
for imputation benchmarking.
|
|
5
|
+
|
|
6
|
+
Separates two concepts:
|
|
7
|
+
1. MECHANISMS (why data is missing): MCAR, MAR, MNAR
|
|
8
|
+
2. PATTERNS (how data is missing): pointwise, block, monotone, decay, markov
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .core import simulate_missingness, simulate_many_rates, MissingnessSimulator
|
|
12
|
+
from .mechanisms import MECHANISMS
|
|
13
|
+
from .patterns import PATTERNS
|
|
14
|
+
|
|
15
|
+
__version__ = "0.3.0"
|
|
16
|
+
__all__ = [
|
|
17
|
+
"simulate_missingness",
|
|
18
|
+
"simulate_many_rates",
|
|
19
|
+
"MissingnessSimulator",
|
|
20
|
+
"MECHANISMS",
|
|
21
|
+
"PATTERNS",
|
|
22
|
+
]
|
tsgap/core.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Core API for missingness simulation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from .mechanisms import MECHANISMS
|
|
7
|
+
from .patterns import PATTERNS
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def simulate_missingness(
    X: np.ndarray,
    mechanism: str,
    missing_rate: float,
    seed: int | None = None,
    pattern: str = "pointwise",
    **kwargs
) -> tuple[np.ndarray, np.ndarray]:
    """Simulate missingness in time-series data.

    This function separates two concepts:
    1. MECHANISM (why data is missing): MCAR, MAR, MNAR
    2. PATTERN (how data is missing): pointwise, block, monotone, decay, markov

    The mechanism callable is looked up in ``MECHANISMS`` and produces an
    initial mask; the pattern callable is looked up in ``PATTERNS`` and
    reshapes that mask. All ``**kwargs`` are forwarded to both callables.

    Parameters
    ----------
    X : np.ndarray
        Input data of shape (T, D) or (N, T, D). Integer and boolean arrays
        are accepted; because NaN cannot be stored in those dtypes the
        returned ``X_missing`` is float64 in that case.
    mechanism : str
        Missingness mechanism (WHY data is missing), case-insensitive:
        - "mcar": Missing Completely At Random
        - "mar": Missing At Random (depends on other variables)
        - "mnar": Missing Not At Random (depends on value itself)
    missing_rate : float
        Target fraction of missing values (0.0 to 1.0).
        Applied to eligible (non-NaN) entries.
        Out-of-range values are clipped into [0.0, 1.0]; NaN is rejected.
    seed : int, optional
        Random seed for reproducibility
    pattern : str, optional
        Missingness pattern (HOW data is missing), case-insensitive:
        - "pointwise" (default): Scattered individual points
        - "block": Contiguous segments (sensor dropout)
        - "monotone": Once missing, stays missing (participant dropout)
        - "decay": Missingness increases over time (sensor degradation)
        - "markov": Temporally dependent flickering (intermittent sensor failure)
        Aliases: "point"/"scattered" for pointwise; "contiguous" for block;
        "dropout" for monotone; "degradation" for decay;
        "flickering" for markov
    **kwargs : dict
        Mechanism-specific parameters:

        MCAR:
            target : str or list[int]
                "all" (default) or list of dimension indices

        MAR:
            driver_dims : list[int], required
                Dimensions that drive missingness
            driver_weights : list[float], optional
                Weights for each driver dimension (normalized to sum to 1).
                Allows different drivers to contribute differently.
                Default: equal weights (simple mean).
            target : str or list[int]
                "all" (default) or list of dimension indices to mask
            strength : float, default=2.0
                Dependency strength
            base_rate : float, default=0.01
                Minimum probability
            direction : str, default="positive"
                "positive" or "negative"

        MNAR:
            mnar_mode : str, default="extreme"
                "high", "low", or "extreme"
            target : str or list[int]
                "all" (default) or list of dimension indices to mask
            strength : float, default=2.0
                Dependency strength

        Pattern-specific parameters:

        Block pattern:
            block_len : int, default=10
                Length of each missing block (in timesteps)
            block_density : float, default=0.7
                Fraction of missingness in blocks (0.0 to 1.0)

        Decay pattern:
            decay_rate : float, default=3.0
                Steepness of temporal ramp (higher = sharper transition)
            decay_center : float, default=0.7
                Normalized time (0-1) where missingness reaches 50%

        Markov pattern:
            persist : float, default=0.8
                Probability of staying missing once entered [0, 1).
                Higher = longer bursts.

        Legacy:
            block : bool
                Old API flag. When truthy, ``pattern`` is overridden with
                "block" regardless of the value passed for ``pattern``.

    Returns
    -------
    X_missing : np.ndarray
        Data with NaNs inserted (same shape as X)
    mask : np.ndarray
        Boolean mask (True=observed, False=missing)

    Raises
    ------
    TypeError
        If ``X`` is not a numpy array.
    ValueError
        If ``X`` is not 2D or 3D, if ``missing_rate`` is NaN, or if
        ``mechanism`` / ``pattern`` is not a registered name.

    Examples
    --------
    >>> # MCAR with point-wise pattern (default)
    >>> X_missing, mask = simulate_missingness(X, "mcar", 0.15, seed=42)

    >>> # MAR with block pattern (sensor dropout depends on activity)
    >>> X_missing, mask = simulate_missingness(
    ...     X, "mar", 0.25, seed=42,
    ...     driver_dims=[0], pattern="block", block_len=10
    ... )

    >>> # MNAR with block pattern (extreme values cause sensor failure)
    >>> X_missing, mask = simulate_missingness(
    ...     X, "mnar", 0.20, seed=42,
    ...     mnar_mode="extreme", pattern="block"
    ... )
    """
    # Create RNG for reproducibility; the same generator is shared by the
    # mechanism and pattern steps so one seed fixes the whole pipeline.
    rng = np.random.default_rng(seed)

    # Validate inputs
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a numpy array")
    if X.ndim not in (2, 3):
        raise ValueError("X must be 2D (T, D) or 3D (N, T, D)")

    # Clip missing_rate to valid range (allow out-of-range for convenience).
    # np.clip propagates NaN unchanged, so it has to be rejected explicitly
    # instead of being handed to the mechanism as a meaningless rate.
    missing_rate = float(np.clip(missing_rate, 0.0, 1.0))
    if np.isnan(missing_rate):
        raise ValueError("missing_rate must be a number in [0.0, 1.0], got NaN")

    # Validate mechanism
    mechanism = mechanism.lower()
    if mechanism not in MECHANISMS:
        raise ValueError(
            f"Unknown mechanism: {mechanism}. "
            f"Must be one of: {list(MECHANISMS.keys())}"
        )

    # Validate pattern
    pattern = pattern.lower()

    # Backward compatibility: handle old block=True API
    if kwargs.get("block", False):
        pattern = "block"

    if pattern not in PATTERNS:
        raise ValueError(
            f"Unknown pattern: {pattern}. "
            f"Must be one of: {list(PATTERNS.keys())}"
        )

    # Copy input. Integer/boolean dtypes cannot represent NaN (assigning it
    # raises ValueError), so those are promoted to float64 for the output.
    if X.dtype.kind in "biu":
        X_missing = X.astype(np.float64)
    else:
        X_missing = X.copy()

    # Identify existing NaNs
    existing_nans = np.isnan(X)

    # Step 1: Generate mechanism-specific mask (WHY missing)
    mask = MECHANISMS[mechanism](X, missing_rate, existing_nans, rng=rng, **kwargs)

    # Step 2: Apply pattern (HOW missing)
    mask = PATTERNS[pattern](mask, X.shape, rng=rng, **kwargs)

    # Apply mask: every position flagged False becomes NaN in the output
    X_missing[~mask] = np.nan

    return X_missing, mask
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def simulate_many_rates(
    X: np.ndarray,
    mechanism: str,
    rates: list[float],
    seed: int | None = None,
    **kwargs
) -> dict[float, tuple[np.ndarray, np.ndarray]]:
    """Run ``simulate_missingness`` once for every rate in ``rates``.

    Parameters
    ----------
    X : np.ndarray
        Input data, passed unchanged to every run
    mechanism : str
        Missingness mechanism, passed unchanged to every run
    rates : list[float]
        Missing rates to simulate, processed in order
    seed : int, optional
        Base random seed. When given, the run at position ``i`` uses
        ``seed + i``; when omitted, every run is unseeded.
    **kwargs : dict
        Extra keyword arguments forwarded verbatim to
        ``simulate_missingness`` (e.g. ``pattern`` and mechanism options)

    Returns
    -------
    dict
        Mapping rate -> (X_missing, mask). A rate that appears more than
        once in ``rates`` keeps only the result of its last occurrence.
    """
    by_rate = {}
    for position, rate in enumerate(rates):
        # Derive a distinct seed per position so runs do not share a stream.
        if seed is None:
            run_seed = None
        else:
            run_seed = seed + position

        data, observed = simulate_missingness(
            X, mechanism, rate, seed=run_seed, **kwargs
        )
        by_rate[rate] = (data, observed)
    return by_rate
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class MissingnessSimulator:
    """Reusable, pre-configured wrapper around ``simulate_missingness``.

    The constructor only stores its arguments; nothing is validated or
    computed until ``generate`` is called.

    Parameters
    ----------
    mechanism : str
        Missingness mechanism ("mcar", "mar", "mnar")
    missing_rate : float
        Target missing rate
    seed : int, optional
        Random seed
    **config : dict
        Additional keyword arguments stored as-is and forwarded to
        ``simulate_missingness`` on every ``generate`` call

    Examples
    --------
    >>> sim = MissingnessSimulator("mcar", missing_rate=0.15, seed=42)
    >>> X_missing, mask = sim.generate(X)
    """

    def __init__(
        self,
        mechanism: str,
        missing_rate: float,
        seed: int | None = None,
        **config
    ):
        # Plain attribute storage; all four stay publicly readable/writable.
        self.config = config
        self.seed = seed
        self.missing_rate = missing_rate
        self.mechanism = mechanism

    def generate(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Apply the stored configuration to ``X``.

        Parameters
        ----------
        X : np.ndarray
            Input data

        Returns
        -------
        X_missing : np.ndarray
            Data with missingness
        mask : np.ndarray
            Boolean mask
        """
        # The stored seed is passed on every call, so it is reused each time.
        outcome = simulate_missingness(
            X,
            mechanism=self.mechanism,
            missing_rate=self.missing_rate,
            seed=self.seed,
            **self.config
        )
        return outcome
|