datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,442 @@
1
+ """Built-in failure modes (05 §4, 04 §7).
2
+
3
+ Each mode mutates the working *injected* frame in place and returns a diff
4
+ summary. The clean baseline is captured before any of these run, so it is always
5
+ recoverable. All draws come from the injected ``RNG(failure:i)``.
6
+
7
+ Honest definitions (no hidden refitting):
8
+
9
+ * **mcar** — mask ``mᵢ ~ Bernoulli(rate)`` independent of the data.
10
+ * **mar** — ``P(M=1 | driver) = σ(a + s·z(driver))``; the intercept ``a`` is
11
+ calibrated so the *expected* missing rate equals ``rate`` while missingness
12
+ still depends on the **observed** driver.
13
+ * **mnar** — same logistic mechanism but on the column's **own value** (or an
14
+ unobserved driver): missingness depends on the value itself.
15
+ * **label_noise** — flip a boolean label / reassign a categorical label to a
16
+ *different* class with probability ``rate``.
17
+ * **feature_noise** — additive ``x' = x + ε``, ``ε ~ dist(params)``.
18
+ * **drift** — concept drift over the row index: ``x'[t] = x[t] + magnitude·g(t)``
19
+ (``g`` linear ``t/(n-1)`` or a step).
20
+ * **covariate_shift** — affine moment-match toward a target ``{mean, std}``.
21
+ * **leakage** — plant ``into = target + small noise``: a high-MI proxy for the
22
+ label.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from collections.abc import Mapping
28
+ from typing import Any
29
+
30
+ import numpy as np
31
+ import pandas as pd
32
+
33
+ from ..dist.builtins import REGISTRY
34
+ from ..errors import SpecValidationError
35
+ from ..spec.models import (
36
+ BooleanFeature,
37
+ CategoricalFeature,
38
+ Feature,
39
+ NumericFeature,
40
+ )
41
+ from .base import (
42
+ FailureMode,
43
+ calibrate_logistic_intercept,
44
+ require_feature,
45
+ require_rate,
46
+ sigmoid,
47
+ standardize,
48
+ )
49
+
50
+ # --- shared helpers ------------------------------------------------------------------
51
+
52
+
53
+ def _nullify(frame: pd.DataFrame, col: str, mask: np.ndarray) -> int:
54
+ """Set ``NaN`` where ``mask`` is true, upcasting int/bool columns as needed."""
55
+ series = frame[col]
56
+ if pd.api.types.is_integer_dtype(series):
57
+ frame[col] = series.astype("float64")
58
+ elif pd.api.types.is_bool_dtype(series):
59
+ frame[col] = series.astype("object")
60
+ frame.loc[mask, col] = np.nan
61
+ return int(mask.sum())
62
+
63
+
64
def _is_int_feature(feat: Feature | None) -> bool:
    """Report whether ``feat`` is a ``NumericFeature`` whose ``dtype`` is ``"int"``."""
    if not isinstance(feat, NumericFeature):
        # Covers ``None`` as well as every non-numeric feature kind.
        return False
    return feat.dtype == "int"
66
+
67
+
68
+ def _assign_numeric(frame: pd.DataFrame, col: str, values: np.ndarray, is_int: bool) -> None:
69
+ """Write a numeric result, rounding to int only when safe.
70
+
71
+ A prior failure may have already nullified some cells; ``NaN`` cannot be cast
72
+ to ``int64``, so we keep the column float in that case to preserve the
73
+ injected missingness rather than corrupting it into garbage integers.
74
+ """
75
+ if is_int and not np.isnan(values).any():
76
+ frame[col] = np.rint(values).astype("int64")
77
+ else:
78
+ frame[col] = values
79
+
80
+
81
def _require_float_coercible(
    name: str, features: Mapping[str, Feature], role: str, locator: str
) -> None:
    """Reject a logistic driver that is not a numeric or boolean feature.

    Args:
        name: Feature name to look up in ``features``.
        features: Declared features keyed by name.
        role: Human-readable label for the parameter, used in the error text.
        locator: Spec location attached to the raised error.

    Raises:
        SpecValidationError: If the feature is neither ``NumericFeature`` nor
            ``BooleanFeature``.
    """
    candidate = features[name]
    if isinstance(candidate, (NumericFeature, BooleanFeature)):
        return
    message = f"{role} {name!r} must be numeric/boolean; it is type {candidate.type!r}"
    raise SpecValidationError(message, locator=locator)
91
+
92
+
93
def _logistic_missing(
    rng: np.random.Generator,
    driver_values: np.ndarray,
    rate: float,
    strength: float,
) -> np.ndarray:
    """Sample a boolean missingness mask whose probability follows the driver.

    The per-row logit is ``intercept + strength * standardize(driver_values)``,
    where the intercept comes from ``calibrate_logistic_intercept`` for the
    requested ``rate``. One uniform draw per row is compared with the resulting
    probability.

    Returns:
        Boolean array with one entry per element of ``driver_values``.
    """
    logits = strength * standardize(driver_values)
    bias = calibrate_logistic_intercept(logits, rate)
    missing_prob = sigmoid(bias + logits)
    uniform = rng.random(len(driver_values))
    return uniform < missing_prob
104
+
105
+
106
+ # --- missingness ---------------------------------------------------------------------
107
+
108
+
109
class MCAR(FailureMode):
    """Missing completely at random: each cell is blanked with probability ``rate``."""

    name = "mcar"

    def _columns(self, params: Mapping[str, Any]) -> list[str]:
        """Resolve the target columns from ``columns`` (a list) or ``column``."""
        listed = params.get("columns")
        if isinstance(listed, list):
            return list(map(str, listed))
        single = params.get("column")
        if single is None:
            return []
        return [str(single)]

    def validate(self, params, features, locator):
        """Check ``rate`` and that every target column is a declared feature."""
        require_rate(params, locator)
        where = f"{locator}.columns"
        targets = self._columns(params)
        if not targets:
            raise SpecValidationError("mcar requires 'column' or 'columns'", locator=where)
        # Report the first undeclared column, in the order they were listed.
        unknown = next((c for c in targets if c not in features), None)
        if unknown is not None:
            raise SpecValidationError(
                f"column {unknown!r} is not a declared feature", locator=where
            )

    def apply(self, rng, frame, params, features):
        """Blank each target column with an independent mask; report realized fractions."""
        rate = float(params["rate"])
        total = len(frame)
        fractions: dict[str, float] = {}
        for target in self._columns(params):
            # One fresh uniform draw per column keeps the masks independent.
            hits = rng.random(total) < rate
            removed = _nullify(frame, target, hits)
            fractions[target] = removed / total if total else 0.0
        return {"mechanism": "mcar", "rate": rate, "nullified_fraction": fractions}
141
+
142
+
143
class MAR(FailureMode):
    """Missing at random: missingness in ``column`` depends on an observed ``driver``."""

    name = "mar"

    def validate(self, params, features, locator):
        """Check ``rate``, ``column`` and a numeric/boolean ``driver``."""
        require_rate(params, locator)
        require_feature(params, "column", features, locator)
        driver_name = require_feature(params, "driver", features, locator)
        _require_float_coercible(
            driver_name, features, "mar 'driver'", f"{locator}.driver"
        )

    def apply(self, rng, frame, params, features):
        """Nullify ``column`` with a driver-dependent mask; report target vs realized rate."""
        rate = float(params["rate"])
        column = str(params["column"])
        driver = str(params["driver"])
        strength = float(params.get("strength", 2.0))

        driver_values = frame[driver].to_numpy()
        missing = _logistic_missing(rng, driver_values, rate, strength)
        rows = len(frame)
        removed = _nullify(frame, column, missing)
        realized = removed / rows if rows else 0.0
        return {
            "mechanism": "mar",
            "column": column,
            "driver": driver,
            "target_rate": rate,
            "realized_rate": realized,
        }
167
+
168
+
169
class MNAR(FailureMode):
    """Missing not at random: missingness depends on the column's own value by default."""

    name = "mnar"

    def validate(self, params, features, locator):
        """Check ``rate``, ``column`` and that the effective driver is numeric/boolean."""
        require_rate(params, locator)
        column = require_feature(params, "column", features, locator)
        # ``driver`` is optional here; without it the column drives itself.
        driver = None
        if params.get("driver") is not None:
            driver = require_feature(params, "driver", features, locator)
        effective = str(driver or column)
        _require_float_coercible(effective, features, "mnar driver", f"{locator}.driver")

    def apply(self, rng, frame, params, features):
        """Nullify ``column`` with a mask driven by ``driver`` (or the column itself)."""
        rate = float(params["rate"])
        column = str(params["column"])
        explicit = params.get("driver")
        driver = str(explicit) if explicit else column
        strength = float(params.get("strength", 2.0))

        driver_values = frame[driver].to_numpy()
        missing = _logistic_missing(rng, driver_values, rate, strength)
        rows = len(frame)
        removed = _nullify(frame, column, missing)
        realized = removed / rows if rows else 0.0
        return {
            "mechanism": "mnar",
            "column": column,
            "driver": driver,
            "self_dependent": driver == column,
            "target_rate": rate,
            "realized_rate": realized,
        }
198
+
199
+
200
+ # --- label / feature corruption ------------------------------------------------------
201
+
202
+
203
class LabelNoise(FailureMode):
    """Flip a boolean label, or reassign a categorical label to a different class.

    Each row is selected with probability ``rate``. Cells that are already
    missing (for example after an earlier missingness failure) are left missing
    and are not counted as flipped.
    """

    name = "label_noise"

    def validate(self, params, features, locator):
        """Check ``rate`` and that ``column`` is a boolean or categorical feature."""
        require_rate(params, locator)
        col = require_feature(params, "column", features, locator)
        feat = features[col]
        if not isinstance(feat, (BooleanFeature, CategoricalFeature)):
            raise SpecValidationError(
                f"label_noise requires a boolean/categorical 'column'; {col!r} is "
                f"type {feat.type!r}",
                locator=f"{locator}.column",
            )

    def apply(self, rng, frame, params, features):
        """Corrupt ``column`` in place and return the realized flipped fraction."""
        rate = float(params["rate"])
        column = str(params["column"])
        n = len(frame)
        flip = rng.random(n) < rate
        feat = features.get(column)
        series = frame[column]

        if pd.api.types.is_bool_dtype(series) or isinstance(feat, BooleanFeature):
            missing = series.isna().to_numpy()
            changed = flip & ~missing
            if missing.any():
                # Casting straight to bool would turn NaN into True and erase
                # the injected missingness, so flip only the observed cells.
                arr = series.to_numpy(dtype=object).copy()
                arr[changed] = ~arr[changed].astype(bool)
                frame[column] = arr
            else:
                vals = series.to_numpy(dtype=bool)
                frame[column] = np.where(flip, ~vals, vals)
            flipped = int(changed.sum())
        else:
            cats = (
                list(feat.categories)
                if isinstance(feat, CategoricalFeature)
                else sorted(series.dropna().unique().tolist())
            )
            k = len(cats)
            if k < 2:
                # With fewer than two classes there is no "different" class.
                return {
                    "mechanism": "label_noise",
                    "column": column,
                    "rate": rate,
                    "flipped_fraction": 0.0,
                }
            index = {c: i for i, c in enumerate(cats)}
            codes = series.map(index).to_numpy()
            # Reassign flipped rows to a *different* class (offset in 1..k-1).
            offset = rng.integers(1, k, size=n)
            new_codes = (codes + offset) % k
            arr = series.to_numpy(dtype=object).copy()
            cats_arr = np.array(cats, dtype=object)
            # Rows whose value has no code (missing or unknown) are left as-is.
            valid = flip & ~pd.isna(codes)
            arr[valid] = cats_arr[new_codes[valid].astype(int)]
            frame[column] = arr
            flipped = int(valid.sum())
        return {
            "mechanism": "label_noise",
            "column": column,
            "rate": rate,
            "flipped_fraction": flipped / n if n else 0.0,
        }
255
+
256
+
257
class FeatureNoise(FailureMode):
    """Additive noise: ``x' = x + eps`` with ``eps`` sampled from a registered distribution."""

    name = "feature_noise"

    def validate(self, params, features, locator):
        """Check that ``column`` is numeric and ``dist`` names a registered distribution."""
        col = require_feature(params, "column", features, locator)
        feat = features[col]
        if not isinstance(feat, NumericFeature):
            raise SpecValidationError(
                f"feature_noise requires a numeric 'column'; {col!r} is type {feat.type!r}",
                locator=f"{locator}.column",
            )

        dist_locator = f"{locator}.dist"
        requested = params.get("dist")
        if requested is None:
            raise SpecValidationError("feature_noise requires 'dist'", locator=dist_locator)
        sampler = REGISTRY.get(str(requested))
        if sampler is None:
            raise SpecValidationError(
                f"unknown noise distribution {requested!r}", locator=dist_locator
            )
        # The distribution validates its own parameter block.
        sampler.validate(params.get("params", {}), locator=f"{locator}.params")

    def apply(self, rng, frame, params, features):
        """Add sampled noise to ``column`` in place and report its realized std and mean shift."""
        column = str(params["column"])
        rows = len(frame)
        dist_key = str(params["dist"])
        sampler = REGISTRY[dist_key]
        noise = sampler.sample(rng, rows, params.get("params", {}))
        before = frame[column].to_numpy(dtype=float)
        after = before + noise
        _assign_numeric(frame, column, after, _is_int_feature(features.get(column)))
        mean_shift = float(np.nanmean(after) - np.nanmean(before))
        return {
            "mechanism": "feature_noise",
            "column": column,
            "dist": dist_key,
            "realized_noise_std": float(np.std(noise)),
            "realized_mean_shift": mean_shift,
        }
293
+
294
+
295
+ # --- distributional shifts -----------------------------------------------------------
296
+
297
+
298
class Drift(FailureMode):
    """Drift over the row index: ``x'[t] = x[t] + magnitude * g(t)``.

    ``g`` is either linear (``t / (n - 1)``) or a step that switches from 0 to 1
    at the fraction ``at`` of the rows. When ``magnitude`` is not given it is
    derived from ``rate`` as ``rate * (n - 1)``.
    """

    name = "drift"

    def validate(self, params, features, locator):
        """Check that ``column`` is numeric and that ``schedule`` is well-formed.

        Raises:
            SpecValidationError: If the column is not numeric, ``schedule`` is
                not a mapping, its ``kind`` is unknown, or one of ``at`` /
                ``magnitude`` / ``rate`` is present but not a number.
        """
        col = require_feature(params, "column", features, locator)
        feat = features[col]
        if not isinstance(feat, NumericFeature):
            raise SpecValidationError(
                f"drift requires a numeric 'column'; {col!r} is type {feat.type!r}",
                locator=f"{locator}.column",
            )
        sched = params.get("schedule", {})
        if not isinstance(sched, Mapping):
            raise SpecValidationError(
                "drift 'schedule' must be a mapping", locator=f"{locator}.schedule"
            )
        kind = sched.get("kind", "linear")
        if kind not in ("linear", "step"):
            raise SpecValidationError(
                f"unknown drift schedule kind {kind!r} (expected 'linear' or 'step')",
                locator=f"{locator}.schedule.kind",
            )
        # Surface non-numeric knobs here rather than as a bare float() failure
        # inside apply(). A null value means "use the default".
        for key in ("at", "magnitude", "rate"):
            value = sched.get(key)
            if value is None:
                continue
            try:
                float(value)
            except (TypeError, ValueError):
                raise SpecValidationError(
                    f"drift schedule {key!r} must be a number; got {value!r}",
                    locator=f"{locator}.schedule.{key}",
                ) from None

    def apply(self, rng, frame, params, features):
        """Add the scheduled shift to ``column`` in place and summarize it."""
        column = str(params["column"])
        n = len(frame)
        sched = dict(params.get("schedule", {}))
        kind = sched.get("kind", "linear")
        idx = np.arange(n, dtype=float)
        if kind == "step":
            at = sched.get("at")
            at = 0.5 if at is None else float(at)
            g = (idx >= at * n).astype(float)
        else:  # linear
            g = idx / (n - 1) if n > 1 else np.zeros(n)
        magnitude = sched.get("magnitude")
        if magnitude is None:
            # `rate` reads as a per-row slope: total end-to-start shift = rate·(n-1).
            # The span is clamped at 0 so an empty frame does not report a
            # negative total shift.
            rate = sched.get("rate")
            rate = 0.0 if rate is None else float(rate)
            magnitude = rate * max(n - 1, 0)
        magnitude = float(magnitude)
        delta = magnitude * g
        shifted = frame[column].to_numpy(dtype=float) + delta
        _assign_numeric(frame, column, shifted, _is_int_feature(features.get(column)))
        half = n // 2
        first = float(np.nanmean(delta[:half])) if half else 0.0
        second = float(np.nanmean(delta[half:])) if n - half else 0.0
        return {
            "mechanism": "drift",
            "column": column,
            "kind": kind,
            "total_shift": magnitude,
            "mean_shift_second_vs_first_half": second - first,
        }
348
+
349
+
350
class CovariateShift(FailureMode):
    """Affine moment-match of a numeric column toward a target ``{mean, std}``."""

    name = "covariate_shift"

    def validate(self, params, features, locator):
        """Check that ``column`` is numeric and ``target`` carries usable moments.

        Raises:
            SpecValidationError: If the column is not numeric, ``target`` is not
                a mapping with ``mean`` and/or ``std``, or a supplied moment is
                not a number.
        """
        col = require_feature(params, "column", features, locator)
        feat = features[col]
        if not isinstance(feat, NumericFeature):
            raise SpecValidationError(
                f"covariate_shift requires a numeric 'column'; {col!r} is type {feat.type!r}",
                locator=f"{locator}.column",
            )
        target = params.get("target")
        if not isinstance(target, Mapping) or not ({"mean", "std"} & set(target)):
            raise SpecValidationError(
                "covariate_shift requires a 'target' with 'mean' and/or 'std'",
                locator=f"{locator}.target",
            )
        # Surface non-numeric moments here rather than as a bare float() failure
        # inside apply(). A null moment is treated as "not requested".
        for key in ("mean", "std"):
            value = target.get(key)
            if value is None:
                continue
            try:
                float(value)
            except (TypeError, ValueError):
                raise SpecValidationError(
                    f"covariate_shift target {key!r} must be a number; got {value!r}",
                    locator=f"{locator}.target.{key}",
                ) from None

    def apply(self, rng, frame, params, features):
        """Rescale/shift ``column`` in place and report its moments before and after."""
        column = str(params["column"])
        target = dict(params.get("target", {}))
        x = frame[column].to_numpy(dtype=float)
        mu = float(np.nanmean(x))
        sd = float(np.nanstd(x))
        tmean = target.get("mean")
        # A missing or null target mean keeps the current mean.
        tmean = mu if tmean is None else float(tmean)
        tstd = target.get("std")
        if tstd is not None and sd > 0.0:
            shifted = (x - mu) * (float(tstd) / sd) + tmean
        else:
            # No target std, or a constant column that cannot be rescaled:
            # only move the mean.
            shifted = x + (tmean - mu)
        _assign_numeric(frame, column, shifted, _is_int_feature(features.get(column)))
        return {
            "mechanism": "covariate_shift",
            "column": column,
            "before": {"mean": mu, "std": sd},
            "after": {"mean": float(np.nanmean(shifted)), "std": float(np.nanstd(shifted))},
        }
387
+
388
+
389
class Leakage(FailureMode):
    """Plant ``into = target + small noise``: a proxy column that leaks the label."""

    name = "leakage"

    def validate(self, params, features, locator):
        """Check ``target``, ``into`` and the optional ``noise`` level.

        Raises:
            SpecValidationError: If ``target`` is not a numeric/boolean feature,
                ``into`` is missing or equal to ``target``, or ``noise`` is not
                a finite non-negative number.
        """
        target = require_feature(params, "target", features, locator)
        feat = features[target]
        if not isinstance(feat, (NumericFeature, BooleanFeature)):
            raise SpecValidationError(
                f"leakage 'target' must be numeric/boolean; {target!r} is type {feat.type!r}",
                locator=f"{locator}.target",
            )
        into = params.get("into")
        if not isinstance(into, str):
            raise SpecValidationError("leakage requires 'into'", locator=f"{locator}.into")
        # `into` is the planted proxy column; it may be a *new* column. It must
        # not collide with the target itself.
        if into == target:
            raise SpecValidationError(
                "leakage 'into' must differ from 'target'", locator=f"{locator}.into"
            )
        noise = params.get("noise")
        if noise is not None:
            # The noise level is used as a standard-deviation multiplier, so it
            # must be a finite, non-negative number.
            try:
                level = float(noise)
            except (TypeError, ValueError):
                level = float("nan")
            if not (np.isfinite(level) and level >= 0.0):
                raise SpecValidationError(
                    f"leakage 'noise' must be a non-negative number; got {noise!r}",
                    locator=f"{locator}.noise",
                )

    def apply(self, rng, frame, params, features):
        """Write the proxy column and report its realized correlation with the target."""
        target = str(params["target"])
        into = str(params["into"])
        tgt = frame[target].to_numpy(dtype=float)
        noise = params.get("noise")
        noise_level = 0.05 if noise is None else float(noise)
        sd = float(np.nanstd(tgt))
        if not np.isfinite(sd) or sd == 0.0:
            # Constant or all-missing target: fall back to unit scale.
            sd = 1.0
        proxy = tgt + rng.normal(0.0, noise_level * sd, len(frame))
        frame[into] = proxy
        # Realized leakage strength: Pearson correlation between proxy and target,
        # computed only over rows where both are observed. Replacing NaN with 0
        # would add artificial (0, 0) points and distort the estimate.
        observed = np.isfinite(proxy) & np.isfinite(tgt)
        corr = float("nan")
        if int(observed.sum()) >= 2:
            with np.errstate(invalid="ignore", divide="ignore"):
                corr = np.corrcoef(proxy[observed], tgt[observed])[0, 1]
        return {
            "mechanism": "leakage",
            "target": target,
            "into": into,
            "noise_level": noise_level,
            "realized_correlation": float(corr) if np.isfinite(corr) else None,
        }
428
+
429
+
430
# Built-in failure modes, one shared instance each, keyed by the mode's ``name``.
FAILURE_MODES: dict[str, FailureMode] = {
    mode.name: mode
    for mode in [
        MCAR(),
        MAR(),
        MNAR(),
        LabelNoise(),
        FeatureNoise(),
        Drift(),
        CovariateShift(),
        Leakage(),
    ]
}