metamorphic-guard 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. metamorphic_guard-2.0.0/LICENSE +21 -0
  2. metamorphic_guard-2.0.0/PKG-INFO +439 -0
  3. metamorphic_guard-2.0.0/README.md +418 -0
  4. metamorphic_guard-2.0.0/metamorphic_guard/__init__.py +50 -0
  5. metamorphic_guard-2.0.0/metamorphic_guard/cli.py +763 -0
  6. metamorphic_guard-2.0.0/metamorphic_guard/config.py +91 -0
  7. metamorphic_guard-2.0.0/metamorphic_guard/dispatch.py +110 -0
  8. metamorphic_guard-2.0.0/metamorphic_guard/dispatch_queue.py +604 -0
  9. metamorphic_guard-2.0.0/metamorphic_guard/executors/__init__.py +87 -0
  10. metamorphic_guard-2.0.0/metamorphic_guard/executors/anthropic.py +243 -0
  11. metamorphic_guard-2.0.0/metamorphic_guard/executors/openai.py +238 -0
  12. metamorphic_guard-2.0.0/metamorphic_guard/gate.py +59 -0
  13. metamorphic_guard-2.0.0/metamorphic_guard/generators.py +126 -0
  14. metamorphic_guard-2.0.0/metamorphic_guard/harness.py +604 -0
  15. metamorphic_guard-2.0.0/metamorphic_guard/judges/__init__.py +66 -0
  16. metamorphic_guard-2.0.0/metamorphic_guard/judges/builtin.py +116 -0
  17. metamorphic_guard-2.0.0/metamorphic_guard/judges/structured.py +166 -0
  18. metamorphic_guard-2.0.0/metamorphic_guard/llm_harness.py +200 -0
  19. metamorphic_guard-2.0.0/metamorphic_guard/llm_specs.py +152 -0
  20. metamorphic_guard-2.0.0/metamorphic_guard/monitoring.py +635 -0
  21. metamorphic_guard-2.0.0/metamorphic_guard/mutants/__init__.py +49 -0
  22. metamorphic_guard-2.0.0/metamorphic_guard/mutants/advanced.py +118 -0
  23. metamorphic_guard-2.0.0/metamorphic_guard/mutants/builtin.py +102 -0
  24. metamorphic_guard-2.0.0/metamorphic_guard/notifications.py +60 -0
  25. metamorphic_guard-2.0.0/metamorphic_guard/observability.py +293 -0
  26. metamorphic_guard-2.0.0/metamorphic_guard/plugins.py +158 -0
  27. metamorphic_guard-2.0.0/metamorphic_guard/redaction.py +82 -0
  28. metamorphic_guard-2.0.0/metamorphic_guard/relations.py +45 -0
  29. metamorphic_guard-2.0.0/metamorphic_guard/reporting.py +356 -0
  30. metamorphic_guard-2.0.0/metamorphic_guard/sandbox.py +851 -0
  31. metamorphic_guard-2.0.0/metamorphic_guard/specs.py +66 -0
  32. metamorphic_guard-2.0.0/metamorphic_guard/stability.py +23 -0
  33. metamorphic_guard-2.0.0/metamorphic_guard/util.py +305 -0
  34. metamorphic_guard-2.0.0/metamorphic_guard/worker.py +206 -0
  35. metamorphic_guard-2.0.0/metamorphic_guard.egg-info/PKG-INFO +439 -0
  36. metamorphic_guard-2.0.0/metamorphic_guard.egg-info/SOURCES.txt +49 -0
  37. metamorphic_guard-2.0.0/metamorphic_guard.egg-info/dependency_links.txt +1 -0
  38. metamorphic_guard-2.0.0/metamorphic_guard.egg-info/entry_points.txt +22 -0
  39. metamorphic_guard-2.0.0/metamorphic_guard.egg-info/requires.txt +9 -0
  40. metamorphic_guard-2.0.0/metamorphic_guard.egg-info/top_level.txt +2 -0
  41. metamorphic_guard-2.0.0/pyproject.toml +52 -0
  42. metamorphic_guard-2.0.0/setup.cfg +4 -0
  43. metamorphic_guard-2.0.0/setup.py +24 -0
  44. metamorphic_guard-2.0.0/tests/__init__.py +1 -0
  45. metamorphic_guard-2.0.0/tests/test_cli.py +489 -0
  46. metamorphic_guard-2.0.0/tests/test_dispatch.py +151 -0
  47. metamorphic_guard-2.0.0/tests/test_gate.py +109 -0
  48. metamorphic_guard-2.0.0/tests/test_harness.py +291 -0
  49. metamorphic_guard-2.0.0/tests/test_plugins.py +163 -0
  50. metamorphic_guard-2.0.0/tests/test_sandbox.py +237 -0
  51. metamorphic_guard-2.0.0/tests/test_utilities.py +281 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 duhboto
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,439 @@
1
+ Metadata-Version: 2.4
2
+ Name: metamorphic_guard
3
+ Version: 2.0.0
4
+ Summary: A Python library for comparing program versions using metamorphic testing
5
+ Author: Spencer Duh
6
+ Project-URL: Homepage, https://github.com/duhboto/MetamorphicGuard
7
+ Project-URL: Bug Tracker, https://github.com/duhboto/MetamorphicGuard/issues
8
+ Project-URL: Documentation, https://pypi.org/project/metamorphic-guard/
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: click>=8.1
13
+ Requires-Dist: pydantic>=2.0
14
+ Provides-Extra: dev
15
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
16
+ Provides-Extra: llm
17
+ Requires-Dist: openai>=1.0.0; extra == "llm"
18
+ Requires-Dist: anthropic>=0.18.0; extra == "llm"
19
+ Dynamic: license-file
20
+ Dynamic: requires-python
21
+
22
+ # Metamorphic Guard
23
+
24
+ [![PyPI](https://img.shields.io/pypi/v/metamorphic-guard.svg)](https://pypi.org/project/metamorphic-guard/) [![Python](https://img.shields.io/pypi/pyversions/metamorphic-guard.svg?label=python)](https://pypi.org/project/metamorphic-guard/) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Build](https://github.com/duhboto/MetamorphicGuard/actions/workflows/test.yml/badge.svg)](https://github.com/duhboto/MetamorphicGuard/actions/workflows/test.yml)
25
+
26
+ A Python library that compares two program versions—*baseline* and *candidate*—by running property and metamorphic tests, computing confidence intervals on pass-rate differences, and deciding whether to adopt the candidate.
27
+
28
+ ```
29
+ +-------------------+
30
+ search queries | Property & MR | candidate results
31
+ ─────────────▶ | test harness | ────────────────▶ adoption gate
32
+ +---------┬---------+
33
+
34
+
35
+ +-------------------+
36
+ | Bootstrap stats |
37
+ | Δ pass-rate CI |
38
+ +---------┬---------+
39
+
40
+
41
+ ranking-guard evaluate --candidate implementations/candidate_heap.py
42
+ ```
43
+
44
+ Sample CLI decision:
45
+
46
+ ```bash
47
+ $ ranking-guard evaluate --candidate implementations/candidate_heap.py
48
+ Candidate implementations/candidate_heap.py
49
+ Adopt? ✅ Yes
50
+ Reason meets_gate
51
+ Δ Pass Rate 0.0125
52
+ Δ 95% CI [0.0040, 0.0210]
53
+ Report reports/report_2025-11-02T12-00-00.json
54
+ ```
55
+
56
+ ## Overview
57
+
58
+ Metamorphic Guard evaluates candidate implementations against baseline versions by:
59
+
60
+ 1. **Property Testing**: Verifying that outputs satisfy required properties
61
+ 2. **Metamorphic Testing**: Checking that input transformations produce equivalent outputs
62
+ 3. **Statistical Analysis**: Computing bootstrap confidence intervals on pass-rate differences
63
+ 4. **Adoption Gating**: Making data-driven decisions about whether to adopt candidates
64
+
65
+ ## Reference Projects in This Repository
66
+
67
+ Metamorphic Guard ships with three companion projects that demonstrate how teams can fold the library into their delivery workflows and produce auditable evidence:
68
+
69
+ - **Ranking Guard Project** (`ranking_guard_project/`): A realistic release gate for search ranking algorithms. It compares a production baseline to new candidates, enforces metamorphic relations, and surfaces adoption decisions that teams can wire into CI/CD or release dashboards. The bundled CLI (`ranking-guard evaluate ...`) saves JSON reports under `reports/` so stakeholders can review the statistical lift before promoting changes.
70
+ - **Fairness Guard Project** (`fairness_guard_project/`): A responsibility-focused workflow for credit approval models. It uses a fairness-aware task specification with parity checks and transformation invariants to catch regressions before they reach borrowers. The CLI (`fairness-guard evaluate ...`) exports JSON evidence, including observed fairness gaps and group approval rates, that can populate governance dashboards or compliance reviews.
71
+ - **Minimal Demo** (`demo_project/`): A concise script that runs the same evaluation logic programmatically. It is ideal for teams who want to experiment in a notebook, wire Metamorphic Guard into existing automation, or share a lightweight proof-of-concept with stakeholders.
72
+
73
+ Together these examples highlight how the project supports the broader IT community: they provide reproducible workflows, confidence intervals that quantify risk, and machine-readable reports that serve as proof when auditing model or algorithm upgrades.
74
+
75
+ ## Installation
76
+
77
+ ```bash
78
+ pip install -e .
79
+ ```
80
+
81
+ ## Quick Start
82
+
83
+ ### Basic Usage
84
+
85
+ ```bash
86
+ metamorphic-guard --task top_k \
87
+ --baseline examples/top_k_baseline.py \
88
+ --candidate examples/top_k_improved.py
89
+ ```
90
+
91
+ > Tip: If the `metamorphic-guard` command collides with a system binary,
92
+ > use `python -m metamorphic_guard.cli` or the alternative console script
93
+ > `metaguard`.
94
+
95
+ ### Command Line Options
96
+
97
+ ```bash
98
+ metamorphic-guard --help
99
+ ```
100
+
101
+ **Required Options:**
102
+ - `--task`: Task name to evaluate (e.g., "top_k")
103
+ - `--baseline`: Path to baseline implementation
104
+ - `--candidate`: Path to candidate implementation
105
+
106
+ **Additional Options:**
107
+ - `--n`: Number of test cases (default: 400)
108
+ - `--seed`: Random seed for reproducibility (default: 42)
109
+ - `--timeout-s`: Timeout per test in seconds (default: 2.0)
110
+ - `--mem-mb`: Memory limit in MB (default: 512)
111
+ - `--alpha`: Significance level for confidence intervals (default: 0.05)
112
+ - `--improve-delta`: Minimum improvement threshold (default: 0.02)
113
+ - `--violation-cap`: Maximum violations to report (default: 25)
114
+ - `--parallel`: Number of worker processes used to drive the sandbox (default: 1)
115
+ - `--bootstrap-samples`: Resamples used for percentile bootstrap CI (default: 1000)
116
+ - `--ci-method`: Confidence interval method for pass-rate delta (`bootstrap`, `newcombe`, `wilson`)
117
+ - `--rr-ci-method`: Confidence interval method for relative risk (`log`)
118
+ - `--ci-method`: Confidence interval method for pass-rate delta (`bootstrap`, `newcombe`, or `wilson`)
119
+ - `--report-dir`: Destination directory for JSON reports (defaults to auto-discovery)
120
+ - `--executor`: Sandbox backend (`local`, `docker`, or `module:callable`)
121
+ - `--executor-config`: JSON object with executor-specific settings (e.g. CPU, image)
122
+ - `--config`: Path to a TOML file providing defaults for the above options
123
+ - `--export-violations`: Emit a JSON summary of property/MR failures to a given path
124
+ - `--html-report`: Write an HTML summary (with embedded charts when monitors are enabled) alongside the JSON report
125
+ - `--dispatcher`: Execution dispatcher (`local` threads or experimental `queue`)
126
+ - `--queue-config`: JSON configuration for queue-backed dispatchers (experimental)
127
+ - `--monitor`: Enable built-in monitors such as `latency`
128
+
129
+ ## Example Implementations
130
+
131
+ The `examples/` directory contains sample implementations for the `top_k` task:
132
+
133
+ - **`top_k_baseline.py`**: Correct baseline implementation
134
+ - **`top_k_bad.py`**: Buggy implementation (should be rejected)
135
+ - **`top_k_improved.py`**: Improved implementation (should be accepted)
136
+
137
+ ## Task Specification
138
+
139
+ ### Top-K Task
140
+
141
+ The `top_k` task finds the k largest elements from a list:
142
+
143
+ **Input**: `(L: List[int], k: int)`
144
+ **Output**: `List[int]` - k largest elements, sorted in descending order
145
+
146
+ **Properties**:
147
+ 1. Output length equals `min(k, len(L))`
148
+ 2. Output is sorted in descending order
149
+ 3. All output elements are from the input list
150
+
151
+ **Metamorphic Relations**:
152
+ 1. **Permute Input**: Shuffling the input list should produce equivalent results
153
+ 2. **Add Noise Below Min**: Adding small values below the minimum should not affect results
154
+
155
+ ### Designing Effective Properties & Relations
156
+
157
+ Metamorphic Guard is only as strong as the properties and relations you write. When
158
+ modeling real ranking or pricing systems:
159
+
160
+ - **Separate invariants and tolerances** – keep hard invariants in `mode="hard"`
161
+ properties and express tolerance-based expectations (e.g., floating point) as
162
+ soft checks where near-misses are acceptable.
163
+ - **Explore symmetry & monotonicity** – swapping equivalent features, shuffling
164
+ inputs, or scaling features by positive constants are high-signal relations for
165
+ recommender systems.
166
+ - **Inject dominated noise** – append low-utility items to ensure the top results
167
+ remain stable under additional clutter.
168
+ - **Idempotence & projection** – running the algorithm twice should yield the same
169
+ output for deterministic tasks; encode this where appropriate.
170
+ - **Control randomness** – expose seed parameters and re-run stochastic algorithms
171
+ with fixed seeds inside your relations for reproducibility.
172
+
173
+ Each report now includes hashes for the generator function, properties, metamorphic
174
+ relations, and formatter callables (`spec_fingerprint`). This makes it possible to
175
+ prove precisely which oracles were active during a run.
176
+
177
+ ### Config Files
178
+
179
+ Store frequently used defaults in a TOML file and pass it via `--config`:
180
+
181
+ ```toml
182
+ task = "top_k"
183
+ baseline = "examples/top_k_baseline.py"
184
+ candidate = "examples/top_k_improved.py"
185
+ n = 600
186
+ seed = 1337
187
+ executor = "docker"
188
+ executor_config = { image = "python:3.11-slim", cpus = 2, memory_mb = 1024 }
189
+ policy_version = "policy-2025-11-09"
190
+
191
+ [metamorphic_guard.queue]
192
+ backend = "redis"
193
+ url = "redis://localhost:6379/0"
194
+
195
+ [metamorphic_guard.alerts]
196
+ webhooks = ["https://hooks.example.dev/metaguard"]
197
+ ```
198
+
199
+ Run with:
200
+
201
+ ```bash
202
+ metamorphic-guard --config metaguard.toml --report-dir reports/
203
+ ```
204
+
205
+ CLI arguments still override config values when provided.
206
+
207
+ Configuration files are validated via a Pydantic schema; malformed values (e.g.
208
+ negative `n`, unknown dispatchers) raise actionable CLI errors before a run starts.
209
+ The optional `policy_version` propagates into reports/metadata, making it easy to
210
+ track changes to guard rails across deployments.
211
+
212
+ ### Monitors & Alerts
213
+
214
+ Monitors provide higher-order statistical invariants beyond per-test properties.
215
+ Enable them via `--monitor latency` to capture latency distributions and flag
216
+ regressions, add `--monitor fairness` to track per-group success deltas, or
217
+ `--monitor resource:metric=cpu_ms,alert_ratio=1.3` to watch resource budgets.
218
+ Monitor output is written under the `monitors` key in the JSON report and
219
+ surfaced in the optional HTML report. Combine monitors by repeating
220
+ `--monitor …` on the CLI or programmatically via the Python API.
221
+
222
+ Alerts can be pushed to downstream systems by wiring `--alert-webhook
223
+ https://hooks.example.dev/guard`. The payload contains the flattened monitor
224
+ alerts together with run metadata (task, decision, run_id) for correlation.
225
+
226
+ ## Implementation Requirements
227
+
228
+ ### Candidate Function Contract
229
+
230
+ Each candidate file must export a callable function:
231
+
232
+ ```python
233
+ def solve(*args):
234
+ """
235
+ Your implementation here.
236
+ Must handle the same input format as the task specification.
237
+ """
238
+ return result
239
+ ```
240
+
241
+ ### Sandbox Execution
242
+
243
+ - All candidate code runs in isolated subprocesses
244
+ - Resource limits: CPU time, memory usage
245
+ - Network access is disabled by stubbing socket primitives and import hooks
246
+ - Subprocess creation (`os.system`, `subprocess.Popen`, etc.) is denied inside the sandbox
247
+ - Native FFI (`ctypes`, `cffi`), multiprocessing forks, and user site-packages are blocked at import time
248
+ - Timeout enforcement per test case
249
+ - Deterministic execution with fixed seeds
250
+ - Structured failures: sandbox responses include `error_type` / `error_code` fields (e.g., `timeout`, `process_exit`) and diagnostics for easier automation.
251
+ - Secret redaction: configure `METAMORPHIC_GUARD_REDACT` or `executor_config.redact_patterns` to scrub sensitive values from stdout/stderr/results before they leave the sandbox. Default patterns catch common API keys and tokens.
252
+ - Optional executors: set `--executor` / `METAMORPHIC_GUARD_EXECUTOR` to run evaluations inside Docker (`docker`) or a custom plugin (`package.module:callable`). Pass JSON tunables via `--executor-config` / `METAMORPHIC_GUARD_EXECUTOR_CONFIG` and override the Docker image with `METAMORPHIC_GUARD_DOCKER_IMAGE`.
253
+
254
+ Example Docker run:
255
+
256
+ ```bash
257
+ metamorphic-guard \
258
+ --task top_k \
259
+ --baseline examples/top_k_baseline.py \
260
+ --candidate examples/top_k_improved.py \
261
+ --executor docker \
262
+ --executor-config '{"image":"python:3.11-slim","cpus":1.5,"memory_mb":768}'
263
+ ```
264
+
265
+ > **Deployment tip:** For untrusted code, run the sandbox worker inside an OS-level
266
+ > container or VM (e.g., Docker with seccomp/AppArmor or Firejail) and drop Linux
267
+ > capabilities. The built-in guardrails reduce attack surface, but pairing them with
268
+ > kernel isolation provides a stronger security boundary.
269
+
270
+ See `deploy/docker-compose.worker.yml` for a hardened reference stack (Redis + containerised worker with read-only root filesystem and disabled privileges).
271
+
272
+ ### Distributed Execution (Preview)
273
+
274
+ The queue dispatcher (`--dispatcher queue`) enables distributed execution. In-memory
275
+ queues are available for local experimentation, while a Redis-backed adapter lets
276
+ you scale out with remote workers:
277
+
278
+ ```bash
279
+ metamorphic-guard --dispatcher queue \
280
+ --queue-config '{"backend":"redis","url":"redis://localhost:6379/0"}' \
281
+ --monitor latency \
282
+ --task top_k --baseline baseline.py --candidate candidate.py --improve-delta 0.0
283
+
284
+ # On worker machines
285
+ metamorphic-guard-worker --backend redis --queue-config '{"url":"redis://localhost:6379/0"}'
286
+ ```
287
+
288
+ Workers fetch tasks, run sandboxed evaluations, and stream results back to the
289
+ coordinator. Memory backend workers remain in-process and are best suited for tests.
290
+
291
+ Adaptive queue controls:
292
+ - `adaptive_batching` (default `true`) grows/shrinks batch sizes based on observed
293
+ duration and queue pressure. Override `initial_batch_size`, `max_batch_size`, or
294
+ `adaptive_fast_threshold_ms` / `adaptive_slow_threshold_ms` to tune behaviour.
295
+ - `adaptive_compress` automatically avoids gzip when payloads are already tiny or
296
+ compression fails to win over raw JSON, cutting CPU for short test cases.
297
+ - `inflight_factor` governs how many cases are kept in-flight (per worker) before
298
+ backpressure kicks in; lower it for heavyweight candidates, raise it for latency-sensitive smoke tests.
299
+
300
+ ### Plugin Ecosystem
301
+
302
+ Metamorphic Guard supports external extensions via Python entry points:
303
+
304
+ - `metamorphic_guard.monitors`: register additional monitor factories
305
+ - `metamorphic_guard.dispatchers`: provide custom dispatcher implementations
306
+ - Inspect installed plugins with `metamorphic-guard plugin list` (append `--json` for machine-readable output) and view rich metadata via `metamorphic-guard plugin info <name>`.
307
+ - Third-party packages should expose a `PLUGIN_METADATA` mapping (name, version, guard_min/guard_max, sandbox flag, etc.) so compatibility is surfaced in the registry.
308
+
309
+ Example `pyproject.toml` snippet:
310
+
311
+ ```toml
312
+ [project.entry-points."metamorphic_guard.monitors"]
313
+ latency99 = "my_package.monitors:Latency99Monitor"
314
+ ```
315
+
316
+ Once installed, the new monitor can be referenced on the CLI:
317
+
318
+ ```bash
319
+ metamorphic-guard --monitor latency99
320
+ ```
321
+
322
+ Programmatic APIs (`metamorphic_guard.monitoring.resolve_monitors`) also pick up
323
+ registered plugins, enabling teams to share bespoke invariants, dispatchers, and
324
+ workflows across services.
325
+ Pass `--sandbox-plugins` during evaluation (or set `sandbox_plugins = true` in config) to execute third-party monitors inside per-plugin subprocesses. Plugins can set `sandbox = true` in their metadata to request isolation by default.
326
+
327
+ ### Observability & Artifacts
328
+
329
+ - Set `METAMORPHIC_GUARD_LOG_JSON=1` to stream structured JSON logs (start/complete events,
330
+ worker task telemetry) to stdout for ingestion by log pipelines.
331
+ - Prefer the CLI toggles `--log-json` / `--no-log-json` and `--metrics` / `--no-metrics` for one-off runs; pair with `--metrics-port` to expose a Prometheus endpoint directly from the coordinator or worker.
332
+ - Capture structured logs to disk with `--log-file observability/run.jsonl`; the coordinator/worker
333
+ will append JSON events and handle file lifecycle automatically.
334
+ - Enable Prometheus counters by exporting `METAMORPHIC_GUARD_PROMETHEUS=1` and register the
335
+ exposed registry (`metamorphic_guard.observability.prometheus_registry()`) with your HTTP exporter.
336
+ - Persist failing case artifacts either by providing `METAMORPHIC_GUARD_FAILED_DIR` or letting the
337
+ harness default to `reports/failed_cases/`; these JSON snapshots capture violations and config for debugging.
338
+ - Retention controls: `--failed-artifact-limit` caps how many snapshots are retained and
339
+ `--failed-artifact-ttl-days` prunes entries older than the configured horizon.
340
+ - Queue telemetry ships out-of-the-box: `metamorphic_queue_pending_tasks` (tasks waiting),
341
+ `metamorphic_queue_inflight_cases` (cases outstanding), and `metamorphic_queue_active_workers`
342
+ (live heartbeat count) alongside throughput counters (`*_cases_dispatched_total`, `*_cases_completed_total`,
343
+ `*_cases_requeued_total`).
344
+ - A starter Grafana dashboard lives at `docs/grafana/metamorphic-guard-dashboard.json` – import it
345
+ into Grafana and point the Prometheus datasource at the Guard metrics endpoint for live telemetry.
346
+ - HTML reports embed Chart.js dashboards summarising pass rates, fairness gaps, and resource usage
347
+ whenever the relevant monitors are enabled, making it easy to eyeball regressions without leaving the report.
348
+
349
+ ### Quick Start Wizard & Cookbook
350
+
351
+ - Run `metamorphic-guard init` to scaffold a `metamorphic_guard.toml` configuration (supports distributed
352
+ queue defaults and monitor presets).
353
+ - Prefer `metamorphic-guard init --interactive` for a guided wizard that prompts for baseline/candidate paths,
354
+ distributed mode, and default monitors.
355
+ - Generate reusable plugin templates with `metamorphic-guard scaffold-plugin --kind monitor --name MyMonitor` and
356
+ wire them into your project via entry points.
357
+ - Explore `docs/cookbook.md` for recipes covering distributed evaluations, advanced monitors, and CI pipelines.
358
+
359
+ ## Output Format
360
+
361
+ The system generates JSON reports in `reports/report_<timestamp>.json`:
362
+
363
+ ```json
364
+ {
365
+ "task": "top_k",
366
+ "n": 400,
367
+ "seed": 42,
368
+ "config": {
369
+ "timeout_s": 2.0,
370
+ "mem_mb": 512,
371
+ "alpha": 0.05,
372
+ "improve_delta": 0.02,
373
+ "violation_cap": 25,
374
+ "parallel": 1,
375
+ "bootstrap_samples": 1000,
376
+ "ci_method": "bootstrap",
377
+ "rr_ci_method": "log"
378
+ },
379
+ "hashes": {
380
+ "baseline": "sha256...",
381
+ "candidate": "sha256..."
382
+ },
383
+ "spec_fingerprint": {
384
+ "gen_inputs": "sha256...",
385
+ "properties": [
386
+ { "description": "Output length equals min(k, len(L))", "mode": "hard", "hash": "sha256..." }
387
+ ],
388
+ "relations": [
389
+ { "name": "permute_input", "expect": "equal", "hash": "sha256..." }
390
+ ],
391
+ "equivalence": "sha256...",
392
+ "formatters": { "fmt_in": "sha256...", "fmt_out": "sha256..." }
393
+ },
394
+ "baseline": {
395
+ "passes": 388,
396
+ "total": 400,
397
+ "pass_rate": 0.97
398
+ },
399
+ "candidate": {
400
+ "passes": 396,
401
+ "total": 400,
402
+ "pass_rate": 0.99,
403
+ "prop_violations": [],
404
+ "mr_violations": []
405
+ },
406
+ "delta_pass_rate": 0.02,
407
+ "delta_ci": [0.015, 0.035],
408
+ "relative_risk": 1.021,
409
+ "relative_risk_ci": [0.998, 1.045],
410
+ "decision": {
411
+ "adopt": true,
412
+ "reason": "meets_gate"
413
+ },
414
+ "job_metadata": {
415
+ "hostname": "build-agent-01",
416
+ "python_version": "3.11.8",
417
+ "git_commit": "d1e5f8...",
418
+ "git_dirty": false
419
+ },
420
+ "monitors": {
421
+ "LatencyMonitor": {
422
+ "id": "LatencyMonitor",
423
+ "type": "latency",
424
+ "percentile": 0.95,
425
+ "summary": {
426
+ "baseline": {"count": 400, "mean_ms": 1.21, "p95_ms": 1.89},
427
+ "candidate": {"count": 400, "mean_ms": 1.05, "p95_ms": 1.61}
428
+ },
429
+ "alerts": []
430
+ }
431
+ },
432
+ "environment": {
433
+ "python_version": "3.11.8",
434
+ "implementation": "CPython",
435
+ "platform": "macOS-14-arm64-arm-64bit",
436
+ "executable": "/usr/bin/python3"
437
+ }
438
+ }
439
+ ```