metamorphic-guard 1.1.0__py3-none-any.whl

@@ -0,0 +1,465 @@
+ """
+ Test harness for running evaluations and computing bootstrap confidence intervals.
+ """
+
+ import math
+ import random
+ from concurrent.futures import ThreadPoolExecutor
+ from statistics import NormalDist
+ from typing import Any, Callable, Dict, List, Sequence, Tuple
+
+ from .sandbox import run_in_sandbox
+ from .specs import Spec, get_task
+ from .util import (
+     compute_spec_fingerprint,
+     get_environment_fingerprint,
+     sha256_file,
+ )
+
+
+ def run_eval(
+     task_name: str,
+     baseline_path: str,
+     candidate_path: str,
+     n: int = 400,
+     seed: int = 42,
+     timeout_s: float = 2.0,
+     mem_mb: int = 512,
+     alpha: float = 0.05,
+     violation_cap: int = 25,
+     parallel: int | None = None,
+     improve_delta: float = 0.02,
+     bootstrap_samples: int = 1000,
+     ci_method: str = "bootstrap",
+     rr_ci_method: str = "log",
+ ) -> Dict[str, Any]:
+     """
+     Run evaluation comparing baseline and candidate implementations.
+
+     Returns comprehensive metrics including bootstrap confidence intervals.
+     """
+     spec = get_task(task_name)
+     test_inputs = spec.gen_inputs(n, seed)
+
+     worker_count = max(1, parallel or 1)
+     baseline_results = _execute_suite(
+         baseline_path,
+         test_inputs,
+         timeout_s=timeout_s,
+         mem_mb=mem_mb,
+         workers=worker_count,
+     )
+     candidate_results = _execute_suite(
+         candidate_path,
+         test_inputs,
+         timeout_s=timeout_s,
+         mem_mb=mem_mb,
+         workers=worker_count,
+     )
+
+     baseline_metrics = _evaluate_results(
+         baseline_results,
+         spec,
+         test_inputs,
+         violation_cap,
+         rerun=lambda call_args: run_in_sandbox(
+             baseline_path,
+             "solve",
+             call_args,
+             timeout_s,
+             mem_mb,
+         ),
+     )
+     candidate_metrics = _evaluate_results(
+         candidate_results,
+         spec,
+         test_inputs,
+         violation_cap,
+         rerun=lambda call_args: run_in_sandbox(
+             candidate_path,
+             "solve",
+             call_args,
+             timeout_s,
+             mem_mb,
+         ),
+     )
+
+     delta_ci = _compute_delta_ci(
+         baseline_metrics,
+         candidate_metrics,
+         alpha=alpha,
+         seed=seed,
+         samples=bootstrap_samples,
+         method=ci_method,
+     )
+
+     baseline_hash = sha256_file(baseline_path)
+     candidate_hash = sha256_file(candidate_path)
+     spec_fingerprint = compute_spec_fingerprint(spec)
+     rr_value, rr_ci = _compute_relative_risk(
+         baseline_metrics,
+         candidate_metrics,
+         alpha=alpha,
+         method=rr_ci_method,
+     )
+
+     result = {
+         "task": task_name,
+         "n": n,
+         "seed": seed,
+         "config": {
+             "timeout_s": timeout_s,
+             "mem_mb": mem_mb,
+             "alpha": alpha,
+             "improve_delta": improve_delta,
+             "violation_cap": violation_cap,
+             "parallel": worker_count,
+             "bootstrap_samples": bootstrap_samples,
+             "ci_method": ci_method,
+             "rr_ci_method": rr_ci_method,
+         },
+         "hashes": {
+             "baseline": baseline_hash,
+             "candidate": candidate_hash,
+         },
+         "spec_fingerprint": spec_fingerprint,
+         "baseline": {
+             "passes": baseline_metrics["passes"],
+             "total": baseline_metrics["total"],
+             "pass_rate": baseline_metrics["pass_rate"],
+         },
+         "candidate": {
+             "passes": candidate_metrics["passes"],
+             "total": candidate_metrics["total"],
+             "pass_rate": candidate_metrics["pass_rate"],
+             "prop_violations": candidate_metrics["prop_violations"],
+             "mr_violations": candidate_metrics["mr_violations"],
+         },
+         "delta_pass_rate": candidate_metrics["pass_rate"] - baseline_metrics["pass_rate"],
+         "delta_ci": delta_ci,
+         "relative_risk": rr_value,
+         "relative_risk_ci": rr_ci,
+         "environment": get_environment_fingerprint(),
+     }
+
+     return result
+
+
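+ # Editor's usage sketch (the task name and file paths here are hypothetical,
+ # not taken from this package):
+ #
+ #     report = run_eval(
+ #         "top_k",
+ #         baseline_path="baseline.py",
+ #         candidate_path="candidate.py",
+ #         n=200,
+ #         ci_method="newcombe",
+ #     )
+ #     print(report["delta_pass_rate"], report["delta_ci"], report["relative_risk"])
+
+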
+ def _execute_suite(
+     file_path: str,
+     test_inputs: Sequence[Tuple[Any, ...]],
+     *,
+     timeout_s: float,
+     mem_mb: int,
+     workers: int,
+ ) -> List[Dict[str, Any]]:
+     """Run the candidate/baseline across the generated inputs."""
+     if workers <= 1:
+         return [
+             run_in_sandbox(file_path, "solve", args, timeout_s, mem_mb)
+             for args in test_inputs
+         ]
+
+     with ThreadPoolExecutor(max_workers=workers) as executor:
+         futures = [
+             executor.submit(
+                 run_in_sandbox,
+                 file_path,
+                 "solve",
+                 args,
+                 timeout_s,
+                 mem_mb,
+             )
+             for args in test_inputs
+         ]
+         # Collect in submission order so results stay aligned with test_inputs.
+         return [future.result() for future in futures]
+
+
+ def _evaluate_results(
+     results: Sequence[Dict[str, Any]],
+     spec: Spec,
+     test_inputs: Sequence[Tuple[Any, ...]],
+     violation_cap: int,
+     rerun: Callable[[Tuple[Any, ...]], Dict[str, Any]],
+ ) -> Dict[str, Any]:
+     """Evaluate results against hard properties and metamorphic relations."""
+     passes = 0
+     total = len(results)
+     prop_violations: list[Dict[str, Any]] = []
+     mr_violations: list[Dict[str, Any]] = []
+     pass_indicators: list[int] = []
+
+     for idx, (result, args) in enumerate(zip(results, test_inputs)):
+         # Phase 1: the run itself must succeed.
+         if not result["success"]:
+             pass_indicators.append(0)
+             if len(prop_violations) < violation_cap:
+                 prop_violations.append(
+                     {
+                         "test_case": idx,
+                         "property": "execution",
+                         "input": spec.fmt_in(args),
+                         "output": "",
+                         "error": result.get("error") or "Execution failed",
+                     }
+                 )
+             continue
+
+         output = result["result"]
+         # Phase 2: check hard properties of the output.
+         prop_passed = True
+         for prop in spec.properties:
+             if prop.mode != "hard":
+                 continue
+             try:
+                 if not prop.check(output, *args):
+                     prop_passed = False
+                     if len(prop_violations) < violation_cap:
+                         prop_violations.append(
+                             {
+                                 "test_case": idx,
+                                 "property": prop.description,
+                                 "input": spec.fmt_in(args),
+                                 "output": spec.fmt_out(output),
+                             }
+                         )
+             except Exception as exc:  # pragma: no cover - defensive logging
+                 prop_passed = False
+                 if len(prop_violations) < violation_cap:
+                     prop_violations.append(
+                         {
+                             "test_case": idx,
+                             "property": prop.description,
+                             "input": spec.fmt_in(args),
+                             "output": spec.fmt_out(output),
+                             "error": str(exc),
+                         }
+                     )
+
+         if not prop_passed:
+             pass_indicators.append(0)
+             continue
+
+         # Phase 3: re-run on transformed inputs and check metamorphic relations.
+         mr_passed = True
+         for relation in spec.relations:
+             try:
+                 transformed_args = relation.transform(*args)
+             except Exception as exc:
+                 mr_passed = False
+                 if len(mr_violations) < violation_cap:
+                     mr_violations.append(
+                         {
+                             "test_case": idx,
+                             "relation": relation.name,
+                             "input": spec.fmt_in(args),
+                             "output": spec.fmt_out(output),
+                             "error": str(exc),
+                         }
+                     )
+                 break
+
+             relation_result = rerun(transformed_args)
+             if not relation_result["success"]:
+                 mr_passed = False
+                 if len(mr_violations) < violation_cap:
+                     mr_violations.append(
+                         {
+                             "test_case": idx,
+                             "relation": relation.name,
+                             "input": spec.fmt_in(transformed_args),
+                             "output": "",
+                             "error": relation_result.get("error") or "Execution failed",
+                         }
+                     )
+                 break
+
+             relation_output = relation_result["result"]
+             if relation.expect == "equal":
+                 equivalent = spec.equivalence(output, relation_output)
+             else:  # pragma: no cover - placeholder for future relation modes
+                 raise ValueError(f"Unsupported relation expectation: {relation.expect}")
+
+             if not equivalent:
+                 mr_passed = False
+                 if len(mr_violations) < violation_cap:
+                     mr_violations.append(
+                         {
+                             "test_case": idx,
+                             "relation": relation.name,
+                             "input": spec.fmt_in(args),
+                             "output": spec.fmt_out(output),
+                             "relation_output": spec.fmt_out(relation_output),
+                         }
+                     )
+                 break
+
+         if mr_passed:
+             passes += 1
+             pass_indicators.append(1)
+         else:
+             pass_indicators.append(0)
+
+     return {
+         "passes": passes,
+         "total": total,
+         "pass_rate": passes / total if total else 0.0,
+         "prop_violations": prop_violations,
+         "mr_violations": mr_violations,
+         "pass_indicators": pass_indicators,
+     }
+
+
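+ # Editor's note: a sketch of the duck-typed interfaces _evaluate_results relies
+ # on, inferred from usage above (the real classes live in .specs):
+ #
+ #     prop.mode        -> "hard" (other modes are skipped here)
+ #     prop.check(output, *args) -> bool; prop.description -> str
+ #     relation.transform(*args) -> transformed argument tuple
+ #     relation.expect  -> "equal" (the only mode supported here)
+ #     spec.equivalence(a, b) -> bool; spec.fmt_in / spec.fmt_out -> str
+
+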
+ def _compute_delta_ci(
+     baseline_metrics: Dict[str, Any],
+     candidate_metrics: Dict[str, Any],
+     *,
+     alpha: float,
+     seed: int,
+     samples: int,
+     method: str,
+ ) -> List[float]:
+     """Compute the pass-rate delta confidence interval using the requested method."""
+     method = method.lower()
+     if method == "bootstrap":
+         return _compute_bootstrap_ci(
+             baseline_metrics["pass_indicators"],
+             candidate_metrics["pass_indicators"],
+             alpha=alpha,
+             seed=seed,
+             samples=samples,
+         )
+     if method in {"newcombe", "wilson"}:
+         return _compute_newcombe_ci(
+             baseline_metrics["passes"],
+             baseline_metrics["total"],
+             candidate_metrics["passes"],
+             candidate_metrics["total"],
+             alpha=alpha,
+         )
+     raise ValueError(f"Unsupported CI method: {method}")
+
+
+ def _compute_bootstrap_ci(
+     baseline_indicators: Sequence[int],
+     candidate_indicators: Sequence[int],
+     *,
+     alpha: float,
+     seed: int,
+     samples: int,
+ ) -> List[float]:
+     """Compute a percentile bootstrap confidence interval for the pass-rate delta."""
+     n = len(baseline_indicators)
+     if n == 0 or len(candidate_indicators) != n:
+         return [0.0, 0.0]
+
+     rng = random.Random(seed)
+     deltas: list[float] = []
+
+     for _ in range(max(1, samples)):
+         baseline_sample = [baseline_indicators[rng.randrange(n)] for _ in range(n)]
+         candidate_sample = [candidate_indicators[rng.randrange(n)] for _ in range(n)]
+
+         p_baseline = sum(baseline_sample) / n
+         p_candidate = sum(candidate_sample) / n
+         deltas.append(p_candidate - p_baseline)
+
+     lower_quantile = alpha / 2
+     upper_quantile = 1 - alpha / 2
+     ci_lower = _percentile(deltas, lower_quantile)
+     ci_upper = _percentile(deltas, upper_quantile)
+     return [float(ci_lower), float(ci_upper)]
+
+
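+ # Editor's note: the resampling above draws baseline and candidate indicators
+ # independently, even though both suites run on the same generated inputs. A
+ # paired variant would resample shared indices once per replicate, e.g.:
+ #
+ #     idx = [rng.randrange(n) for _ in range(n)]
+ #     delta = (sum(candidate_indicators[i] for i in idx)
+ #              - sum(baseline_indicators[i] for i in idx)) / n
+ #
+ # which typically tightens the interval when the two suites' outcomes correlate.
+
+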
+ def _compute_newcombe_ci(
+     baseline_passes: int,
+     baseline_total: int,
+     candidate_passes: int,
+     candidate_total: int,
+     *,
+     alpha: float,
+ ) -> List[float]:
+     """Compute the score CI for the difference in proportions using Newcombe's method."""
+     if baseline_total == 0 or candidate_total == 0:
+         return [0.0, 0.0]
+
+     lower_b, upper_b = _wilson_interval(baseline_passes, baseline_total, alpha)
+     lower_c, upper_c = _wilson_interval(candidate_passes, candidate_total, alpha)
+
+     p_b = baseline_passes / baseline_total
+     p_c = candidate_passes / candidate_total
+     delta = p_c - p_b
+     # Newcombe's hybrid score combination of the two Wilson intervals (MOVER),
+     # rather than the wider naive subtraction of interval endpoints.
+     delta_lower = delta - math.sqrt((p_c - lower_c) ** 2 + (upper_b - p_b) ** 2)
+     delta_upper = delta + math.sqrt((upper_c - p_c) ** 2 + (p_b - lower_b) ** 2)
+     return [float(delta_lower), float(delta_upper)]
+
+
+ def _wilson_interval(successes: int, total: int, alpha: float) -> Tuple[float, float]:
+     """Wilson score interval for a binomial proportion, clipped to [0, 1]."""
+     if total == 0:
+         return (0.0, 0.0)
+
+     z = NormalDist().inv_cdf(1 - alpha / 2)
+     phat = successes / total
+     denom = 1 + (z ** 2) / total
+     center = phat + (z ** 2) / (2 * total)
+     margin = z * math.sqrt((phat * (1 - phat) + (z ** 2) / (4 * total)) / total)
+     lower = (center - margin) / denom
+     upper = (center + margin) / denom
+     return (max(0.0, lower), min(1.0, upper))
+
+
+ def _compute_relative_risk(
+     baseline_metrics: Dict[str, Any],
+     candidate_metrics: Dict[str, Any],
+     *,
+     alpha: float,
+     method: str,
+ ) -> Tuple[float, List[float]]:
+     """Compute relative risk (candidate/baseline pass rate) with a confidence interval."""
+     p_b = baseline_metrics.get("pass_rate")
+     if p_b is None:
+         total_b = baseline_metrics.get("total", 0)
+         p_b = baseline_metrics.get("passes", 0) / total_b if total_b else 0.0
+
+     p_c = candidate_metrics.get("pass_rate")
+     if p_c is None:
+         total_c = candidate_metrics.get("total", 0)
+         p_c = candidate_metrics.get("passes", 0) / total_c if total_c else 0.0
+
+     if p_b == 0:
+         return float("inf"), [float("inf"), float("inf")]
+
+     rr = p_c / p_b
+     method = method.lower()
+     if method != "log":
+         raise ValueError(f"Unsupported relative risk CI method: {method}")
+
+     # Katz log method: SE(ln RR) = sqrt(1/x_c - 1/n_c + 1/x_b - 1/n_b).
+     # Counts are clamped to at least 1 as a crude guard against division by zero.
+     total_b = max(1, baseline_metrics.get("total", 0))
+     total_c = max(1, candidate_metrics.get("total", 0))
+     successes_b = max(1, baseline_metrics.get("passes", 0))
+     successes_c = max(1, candidate_metrics.get("passes", 0))
+
+     ln_rr = math.log(rr) if rr > 0 else float("-inf")
+     se = math.sqrt((1 / successes_c) - (1 / total_c) +
+                    (1 / successes_b) - (1 / total_b))
+     z = NormalDist().inv_cdf(1 - alpha / 2)
+     lower = math.exp(ln_rr - z * se)
+     upper = math.exp(ln_rr + z * se)
+     return rr, [float(lower), float(upper)]
+
+
+ def _percentile(values: Sequence[float], q: float) -> float:
+     """Compute the q-th quantile (0 <= q <= 1) using linear interpolation."""
+     if not values:
+         return 0.0
+     if q <= 0:
+         return float(min(values))
+     if q >= 1:
+         return float(max(values))
+
+     sorted_vals = sorted(values)
+     k = (len(sorted_vals) - 1) * q
+     f = math.floor(k)
+     c = math.ceil(k)
+     if f == c:
+         return float(sorted_vals[int(k)])
+     d0 = sorted_vals[f] * (c - k)
+     d1 = sorted_vals[c] * (k - f)
+     return float(d0 + d1)
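+
+
+ # --- Editor's smoke-test sketch (not part of the released wheel) ---
+ # Exercises the pure statistics helpers, which need no sandbox or task specs.
+ if __name__ == "__main__":
+     # Wilson 95% interval for 180/200 successes; roughly (0.851, 0.934).
+     print(_wilson_interval(180, 200, alpha=0.05))
+     # Percentile bootstrap CI for the pass-rate delta of two synthetic suites.
+     base = [1] * 150 + [0] * 50
+     cand = [1] * 165 + [0] * 35
+     print(_compute_bootstrap_ci(base, cand, alpha=0.05, seed=0, samples=1000))
+     # Newcombe-style interval for the same counts.
+     print(_compute_newcombe_ci(150, 200, 165, 200, alpha=0.05))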
@@ -0,0 +1,31 @@
+ """
+ Metamorphic relations for input transformations.
+ """
+
+ import random
+ from typing import List, Tuple
+
+
+ def permute_input(L: List[int], k: int) -> Tuple[List[int], int]:
+     """
+     Permute the input list while keeping k the same.
+     The output should be equivalent (same multiset of results).
+     """
+     L_permuted = L.copy()
+     # Uses the module-level RNG; call random.seed() for reproducible shuffles.
+     random.shuffle(L_permuted)
+     return L_permuted, k
+
+
+ def add_noise_below_min(L: List[int], k: int) -> Tuple[List[int], int]:
+     """
+     Append values strictly below the minimum of L.
+     The output should be equivalent (same results).
+     """
+     if not L:
+         return L, k
+
+     min_val = min(L)
+     noise = [min_val - 1 - i for i in range(5)]  # five values below the minimum
+     L_with_noise = L + noise
+     # Cap k at the original length so the top-k still comes from L alone.
+     adjusted_k = min(k, len(L))
+     return L_with_noise, adjusted_k
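+
+
+ # --- Editor's demo sketch (not part of the released wheel) ---
+ # Checks that both relations preserve the top-k multiset for a simple
+ # reference solver; `solve` here is hypothetical.
+ if __name__ == "__main__":
+     import heapq
+
+     def solve(L: List[int], k: int) -> List[int]:
+         # Hypothetical reference implementation of the task under test.
+         return heapq.nlargest(k, L)
+
+     L, k = [5, 1, 9, 3, 7], 3
+     expected = sorted(solve(L, k))
+     for transform in (permute_input, add_noise_below_min):
+         L2, k2 = transform(L, k)
+         assert sorted(solve(L2, k2)) == expected, transform.__name__
+     print("both relations preserve the top-k multiset")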