tsgap 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tsgap/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ """TSGap — Composable Time-Series Missingness Simulation
2
+
3
+ A library for simulating realistic missingness patterns in time-series data
4
+ for imputation benchmarking.
5
+
6
+ Separates two concepts:
7
+ 1. MECHANISMS (why data is missing): MCAR, MAR, MNAR
8
+ 2. PATTERNS (how data is missing): pointwise, block, monotone, decay, markov
9
+ """
10
+
11
+ from .core import simulate_missingness, simulate_many_rates, MissingnessSimulator
12
+ from .mechanisms import MECHANISMS
13
+ from .patterns import PATTERNS
14
+
15
# Package version string exposed as ``tsgap.__version__``.
__version__ = "0.3.0"

# Public API of the package: the names exported by ``from tsgap import *``.
# Each entry is imported above from ``.core``, ``.mechanisms`` or ``.patterns``.
__all__ = [
    "simulate_missingness",
    "simulate_many_rates",
    "MissingnessSimulator",
    "MECHANISMS",
    "PATTERNS",
]
tsgap/core.py ADDED
@@ -0,0 +1,264 @@
1
+ """Core API for missingness simulation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ from .mechanisms import MECHANISMS
7
+ from .patterns import PATTERNS
8
+
9
+
10
def simulate_missingness(
    X: np.ndarray,
    mechanism: str,
    missing_rate: float,
    seed: int | None = None,
    pattern: str = "pointwise",
    **kwargs
) -> tuple[np.ndarray, np.ndarray]:
    """Simulate missingness in time-series data.

    This function separates two concepts:
    1. MECHANISM (why data is missing): MCAR, MAR, MNAR
    2. PATTERN (how data is missing): pointwise, block, monotone, decay, markov

    The mechanism callable is looked up in ``MECHANISMS`` and produces an
    initial mask; the pattern callable is looked up in ``PATTERNS`` and
    reshapes that mask. Both receive the same ``rng`` and the same
    ``**kwargs``.

    Parameters
    ----------
    X : np.ndarray
        Input data of shape (T, D) or (N, T, D). The input is never modified.
        Arrays whose dtype cannot represent NaN (integer, bool, ...) are
        converted to float in the returned copy.
    mechanism : str
        Missingness mechanism (WHY data is missing), case-insensitive:
        - "mcar": Missing Completely At Random
        - "mar": Missing At Random (depends on other variables)
        - "mnar": Missing Not At Random (depends on value itself)
    missing_rate : float
        Target fraction of missing values (0.0 to 1.0).
        Applied to eligible (non-NaN) entries.
        Values outside [0, 1] are clipped into range; NaN is rejected.
    seed : int, optional
        Random seed for reproducibility
    pattern : str, optional
        Missingness pattern (HOW data is missing), case-insensitive:
        - "pointwise" (default): Scattered individual points
        - "block": Contiguous segments (sensor dropout)
        - "monotone": Once missing, stays missing (participant dropout)
        - "decay": Missingness increases over time (sensor degradation)
        - "markov": Temporally dependent flickering (intermittent sensor failure)
        Aliases: "point"/"scattered" for pointwise; "contiguous" for block;
        "dropout" for monotone; "degradation" for decay;
        "flickering" for markov
    **kwargs : dict
        Forwarded unchanged to both the mechanism and the pattern callable.
        The legacy flag ``block=True`` forces ``pattern="block"``.

        Mechanism-specific parameters:

        MCAR:
            target : str or list[int]
                "all" (default) or list of dimension indices

        MAR:
            driver_dims : list[int], required
                Dimensions that drive missingness
            driver_weights : list[float], optional
                Weights for each driver dimension (normalized to sum to 1).
                Allows different drivers to contribute differently.
                Default: equal weights (simple mean).
            target : str or list[int]
                "all" (default) or list of dimension indices to mask
            strength : float, default=2.0
                Dependency strength
            base_rate : float, default=0.01
                Minimum probability
            direction : str, default="positive"
                "positive" or "negative"

        MNAR:
            mnar_mode : str, default="extreme"
                "high", "low", or "extreme"
            target : str or list[int]
                "all" (default) or list of dimension indices to mask
            strength : float, default=2.0
                Dependency strength

        Pattern-specific parameters:

        Block pattern:
            block_len : int, default=10
                Length of each missing block (in timesteps)
            block_density : float, default=0.7
                Fraction of missingness in blocks (0.0 to 1.0)

        Decay pattern:
            decay_rate : float, default=3.0
                Steepness of temporal ramp (higher = sharper transition)
            decay_center : float, default=0.7
                Normalized time (0-1) where missingness reaches 50%

        Markov pattern:
            persist : float, default=0.8
                Probability of staying missing once entered [0, 1).
                Higher = longer bursts.

    Returns
    -------
    X_missing : np.ndarray
        Data with NaNs inserted (same shape as X)
    mask : np.ndarray
        Boolean mask (True=observed, False=missing)

    Raises
    ------
    TypeError
        If ``X`` is not a numpy array.
    ValueError
        If ``X`` is not 2D or 3D, if ``missing_rate`` is NaN, or if
        ``mechanism`` / ``pattern`` is not a registered name.

    Examples
    --------
    >>> # MCAR with point-wise pattern (default)
    >>> X_missing, mask = simulate_missingness(X, "mcar", 0.15, seed=42)

    >>> # MAR with block pattern (sensor dropout depends on activity)
    >>> X_missing, mask = simulate_missingness(
    ...     X, "mar", 0.25, seed=42,
    ...     driver_dims=[0], pattern="block", block_len=10
    ... )

    >>> # MNAR with block pattern (extreme values cause sensor failure)
    >>> X_missing, mask = simulate_missingness(
    ...     X, "mnar", 0.20, seed=42,
    ...     mnar_mode="extreme", pattern="block"
    ... )
    """
    # One generator is shared by the mechanism and the pattern step so a
    # single seed reproduces the whole simulation.
    rng = np.random.default_rng(seed)

    # Validate inputs
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a numpy array")
    if X.ndim not in (2, 3):
        raise ValueError("X must be 2D (T, D) or 3D (N, T, D)")

    # Clip missing_rate to valid range (allow out-of-range for convenience).
    # np.clip lets NaN through untouched, so reject it explicitly instead of
    # handing an undefined rate to the mechanism.
    missing_rate = float(np.clip(missing_rate, 0.0, 1.0))
    if np.isnan(missing_rate):
        raise ValueError("missing_rate must be a number in [0, 1], got NaN")

    # Validate mechanism
    mechanism = mechanism.lower()
    if mechanism not in MECHANISMS:
        raise ValueError(
            f"Unknown mechanism: {mechanism}. "
            f"Must be one of: {list(MECHANISMS.keys())}"
        )

    # Validate pattern
    pattern = pattern.lower()

    # Backward compatibility: handle old block=True API
    if kwargs.get("block", False):
        pattern = "block"

    if pattern not in PATTERNS:
        raise ValueError(
            f"Unknown pattern: {pattern}. "
            f"Must be one of: {list(PATTERNS.keys())}"
        )

    # Work on a copy so the caller's array is untouched. NaN only exists in
    # inexact (float/complex) dtypes; assigning it into an integer or bool
    # array raises, so promote those inputs to float for the output.
    if np.issubdtype(X.dtype, np.inexact):
        X_missing = X.copy()
    else:
        X_missing = X.astype(float)

    # Identify existing NaNs
    existing_nans = np.isnan(X)

    # Step 1: Generate mechanism-specific mask (WHY missing)
    mask = MECHANISMS[mechanism](X, missing_rate, existing_nans, rng=rng, **kwargs)

    # Step 2: Apply pattern (HOW missing)
    mask = PATTERNS[pattern](mask, X.shape, rng=rng, **kwargs)

    # Apply mask: every position where the mask is False becomes NaN
    X_missing[~mask] = np.nan

    return X_missing, mask
171
+
172
+
173
def simulate_many_rates(
    X: np.ndarray,
    mechanism: str,
    rates: list[float],
    seed: int | None = None,
    **kwargs
) -> dict[float, tuple[np.ndarray, np.ndarray]]:
    """Run ``simulate_missingness`` once per requested missing rate.

    Parameters
    ----------
    X : np.ndarray
        Input data, passed through to ``simulate_missingness``
    mechanism : str
        Missingness mechanism, passed through to ``simulate_missingness``
    rates : list[float]
        Missing rates to simulate, processed in order
    seed : int, optional
        Base random seed. The rate at position ``i`` is simulated with
        ``seed + i``; when omitted every run is unseeded.
    **kwargs : dict
        Extra keyword arguments forwarded to every ``simulate_missingness``
        call

    Returns
    -------
    dict
        Mapping rate -> (X_missing, mask). A rate listed more than once
        keeps only the result of its last occurrence.
    """

    def _seed_at(position):
        # Offset the base seed so each rate draws from its own stream.
        return seed + position if seed is not None else None

    by_rate = {}
    for position, rate in enumerate(rates):
        data_with_gaps, observed = simulate_missingness(
            X, mechanism, rate, seed=_seed_at(position), **kwargs
        )
        by_rate[rate] = (data_with_gaps, observed)
    return by_rate
209
+
210
+
211
class MissingnessSimulator:
    """Reusable, pre-configured wrapper around ``simulate_missingness``.

    The constructor only stores its arguments; all work happens in
    :meth:`generate`.

    Parameters
    ----------
    mechanism : str
        Missingness mechanism ("mcar", "mar", "mnar")
    missing_rate : float
        Target missing rate
    seed : int, optional
        Random seed
    **config : dict
        Additional keyword arguments, kept in ``self.config`` and forwarded
        to ``simulate_missingness`` on every ``generate`` call

    Examples
    --------
    >>> sim = MissingnessSimulator("mcar", missing_rate=0.15, seed=42)
    >>> X_missing, mask = sim.generate(X)
    """

    def __init__(
        self,
        mechanism: str,
        missing_rate: float,
        seed: int | None = None,
        **config
    ):
        # Stored verbatim; validation is left to simulate_missingness.
        self.mechanism, self.missing_rate = mechanism, missing_rate
        self.seed = seed
        self.config = config

    def generate(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Apply the stored configuration to ``X``.

        Parameters
        ----------
        X : np.ndarray
            Input data

        Returns
        -------
        X_missing : np.ndarray
            Data with missingness
        mask : np.ndarray
            Boolean mask
        """
        result = simulate_missingness(
            X, self.mechanism, self.missing_rate, seed=self.seed, **self.config
        )
        return result