metamorphic-guard 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metamorphic_guard-2.0.0/LICENSE +21 -0
- metamorphic_guard-2.0.0/PKG-INFO +439 -0
- metamorphic_guard-2.0.0/README.md +418 -0
- metamorphic_guard-2.0.0/metamorphic_guard/__init__.py +50 -0
- metamorphic_guard-2.0.0/metamorphic_guard/cli.py +763 -0
- metamorphic_guard-2.0.0/metamorphic_guard/config.py +91 -0
- metamorphic_guard-2.0.0/metamorphic_guard/dispatch.py +110 -0
- metamorphic_guard-2.0.0/metamorphic_guard/dispatch_queue.py +604 -0
- metamorphic_guard-2.0.0/metamorphic_guard/executors/__init__.py +87 -0
- metamorphic_guard-2.0.0/metamorphic_guard/executors/anthropic.py +243 -0
- metamorphic_guard-2.0.0/metamorphic_guard/executors/openai.py +238 -0
- metamorphic_guard-2.0.0/metamorphic_guard/gate.py +59 -0
- metamorphic_guard-2.0.0/metamorphic_guard/generators.py +126 -0
- metamorphic_guard-2.0.0/metamorphic_guard/harness.py +604 -0
- metamorphic_guard-2.0.0/metamorphic_guard/judges/__init__.py +66 -0
- metamorphic_guard-2.0.0/metamorphic_guard/judges/builtin.py +116 -0
- metamorphic_guard-2.0.0/metamorphic_guard/judges/structured.py +166 -0
- metamorphic_guard-2.0.0/metamorphic_guard/llm_harness.py +200 -0
- metamorphic_guard-2.0.0/metamorphic_guard/llm_specs.py +152 -0
- metamorphic_guard-2.0.0/metamorphic_guard/monitoring.py +635 -0
- metamorphic_guard-2.0.0/metamorphic_guard/mutants/__init__.py +49 -0
- metamorphic_guard-2.0.0/metamorphic_guard/mutants/advanced.py +118 -0
- metamorphic_guard-2.0.0/metamorphic_guard/mutants/builtin.py +102 -0
- metamorphic_guard-2.0.0/metamorphic_guard/notifications.py +60 -0
- metamorphic_guard-2.0.0/metamorphic_guard/observability.py +293 -0
- metamorphic_guard-2.0.0/metamorphic_guard/plugins.py +158 -0
- metamorphic_guard-2.0.0/metamorphic_guard/redaction.py +82 -0
- metamorphic_guard-2.0.0/metamorphic_guard/relations.py +45 -0
- metamorphic_guard-2.0.0/metamorphic_guard/reporting.py +356 -0
- metamorphic_guard-2.0.0/metamorphic_guard/sandbox.py +851 -0
- metamorphic_guard-2.0.0/metamorphic_guard/specs.py +66 -0
- metamorphic_guard-2.0.0/metamorphic_guard/stability.py +23 -0
- metamorphic_guard-2.0.0/metamorphic_guard/util.py +305 -0
- metamorphic_guard-2.0.0/metamorphic_guard/worker.py +206 -0
- metamorphic_guard-2.0.0/metamorphic_guard.egg-info/PKG-INFO +439 -0
- metamorphic_guard-2.0.0/metamorphic_guard.egg-info/SOURCES.txt +49 -0
- metamorphic_guard-2.0.0/metamorphic_guard.egg-info/dependency_links.txt +1 -0
- metamorphic_guard-2.0.0/metamorphic_guard.egg-info/entry_points.txt +22 -0
- metamorphic_guard-2.0.0/metamorphic_guard.egg-info/requires.txt +9 -0
- metamorphic_guard-2.0.0/metamorphic_guard.egg-info/top_level.txt +2 -0
- metamorphic_guard-2.0.0/pyproject.toml +52 -0
- metamorphic_guard-2.0.0/setup.cfg +4 -0
- metamorphic_guard-2.0.0/setup.py +24 -0
- metamorphic_guard-2.0.0/tests/__init__.py +1 -0
- metamorphic_guard-2.0.0/tests/test_cli.py +489 -0
- metamorphic_guard-2.0.0/tests/test_dispatch.py +151 -0
- metamorphic_guard-2.0.0/tests/test_gate.py +109 -0
- metamorphic_guard-2.0.0/tests/test_harness.py +291 -0
- metamorphic_guard-2.0.0/tests/test_plugins.py +163 -0
- metamorphic_guard-2.0.0/tests/test_sandbox.py +237 -0
- metamorphic_guard-2.0.0/tests/test_utilities.py +281 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 duhboto
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: metamorphic_guard
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: A Python library for comparing program versions using metamorphic testing
|
|
5
|
+
Author: Spencer Duh
|
|
6
|
+
Project-URL: Homepage, https://github.com/duhboto/MetamorphicGuard
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/duhboto/MetamorphicGuard/issues
|
|
8
|
+
Project-URL: Documentation, https://pypi.org/project/metamorphic-guard/
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: click>=8.1
|
|
13
|
+
Requires-Dist: pydantic>=2.0
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
16
|
+
Provides-Extra: llm
|
|
17
|
+
Requires-Dist: openai>=1.0.0; extra == "llm"
|
|
18
|
+
Requires-Dist: anthropic>=0.18.0; extra == "llm"
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
|
|
22
|
+
# Metamorphic Guard
|
|
23
|
+
|
|
24
|
+
[PyPI](https://pypi.org/project/metamorphic-guard/) [Python versions](https://pypi.org/project/metamorphic-guard/) [License: MIT](https://opensource.org/licenses/MIT) [Tests](https://github.com/duhboto/MetamorphicGuard/actions/workflows/test.yml)
|
|
25
|
+
|
|
26
|
+
A Python library that compares two program versions—*baseline* and *candidate*—by running property and metamorphic tests, computing confidence intervals on pass-rate differences, and deciding whether to adopt the candidate.
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
+-------------------+
|
|
30
|
+
search queries | Property & MR | candidate results
|
|
31
|
+
─────────────▶ | test harness | ────────────────▶ adoption gate
|
|
32
|
+
+---------┬---------+
|
|
33
|
+
│
|
|
34
|
+
▼
|
|
35
|
+
+-------------------+
|
|
36
|
+
| Bootstrap stats |
|
|
37
|
+
| Δ pass-rate CI |
|
|
38
|
+
+---------┬---------+
|
|
39
|
+
│
|
|
40
|
+
▼
|
|
41
|
+
ranking-guard evaluate --candidate implementations/candidate_heap.py
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Sample CLI decision:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
$ ranking-guard evaluate --candidate implementations/candidate_heap.py
|
|
48
|
+
Candidate implementations/candidate_heap.py
|
|
49
|
+
Adopt? ✅ Yes
|
|
50
|
+
Reason meets_gate
|
|
51
|
+
Δ Pass Rate 0.0125
|
|
52
|
+
Δ 95% CI [0.0040, 0.0210]
|
|
53
|
+
Report reports/report_2025-11-02T12-00-00.json
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Overview
|
|
57
|
+
|
|
58
|
+
Metamorphic Guard evaluates candidate implementations against baseline versions by:
|
|
59
|
+
|
|
60
|
+
1. **Property Testing**: Verifying that outputs satisfy required properties
|
|
61
|
+
2. **Metamorphic Testing**: Checking that input transformations produce equivalent outputs
|
|
62
|
+
3. **Statistical Analysis**: Computing bootstrap confidence intervals on pass-rate differences
|
|
63
|
+
4. **Adoption Gating**: Making data-driven decisions about whether to adopt candidates
|
|
64
|
+
|
|
65
|
+
## Reference Projects in This Repository
|
|
66
|
+
|
|
67
|
+
Metamorphic Guard ships with three companion projects that demonstrate how teams can fold the library into their delivery workflows and produce auditable evidence:
|
|
68
|
+
|
|
69
|
+
- **Ranking Guard Project** (`ranking_guard_project/`): A realistic release gate for search ranking algorithms. It compares a production baseline to new candidates, enforces metamorphic relations, and surfaces adoption decisions that teams can wire into CI/CD or release dashboards. The bundled CLI (`ranking-guard evaluate ...`) saves JSON reports under `reports/` so stakeholders can review the statistical lift before promoting changes.
|
|
70
|
+
- **Fairness Guard Project** (`fairness_guard_project/`): A responsibility-focused workflow for credit approval models. It uses a fairness-aware task specification with parity checks and transformation invariants to catch regressions before they reach borrowers. The CLI (`fairness-guard evaluate ...`) exports JSON evidence, including observed fairness gaps and group approval rates, that can populate governance dashboards or compliance reviews.
|
|
71
|
+
- **Minimal Demo** (`demo_project/`): A concise script that runs the same evaluation logic programmatically. It is ideal for teams who want to experiment in a notebook, wire Metamorphic Guard into existing automation, or share a lightweight proof-of-concept with stakeholders.
|
|
72
|
+
|
|
73
|
+
Together these examples highlight how the project supports the broader IT community: they provide reproducible workflows, confidence intervals that quantify risk, and machine-readable reports that serve as proof when auditing model or algorithm upgrades.
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install -e .
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
### Basic Usage
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
metamorphic-guard --task top_k \
|
|
87
|
+
--baseline examples/top_k_baseline.py \
|
|
88
|
+
--candidate examples/top_k_improved.py
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
> Tip: If the shorter `metamorphic-guard` alias collides with a system binary,
|
|
92
|
+
> use `python -m metamorphic_guard.cli` or the alternative console script
|
|
93
|
+
> `metaguard`.
|
|
94
|
+
|
|
95
|
+
### Command Line Options
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
metamorphic-guard --help
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Required Options:**
|
|
102
|
+
- `--task`: Task name to evaluate (e.g., "top_k")
|
|
103
|
+
- `--baseline`: Path to baseline implementation
|
|
104
|
+
- `--candidate`: Path to candidate implementation
|
|
105
|
+
|
|
106
|
+
**Additional Options:**
|
|
107
|
+
- `--n`: Number of test cases (default: 400)
|
|
108
|
+
- `--seed`: Random seed for reproducibility (default: 42)
|
|
109
|
+
- `--timeout-s`: Timeout per test in seconds (default: 2.0)
|
|
110
|
+
- `--mem-mb`: Memory limit in MB (default: 512)
|
|
111
|
+
- `--alpha`: Significance level for confidence intervals (default: 0.05)
|
|
112
|
+
- `--improve-delta`: Minimum improvement threshold (default: 0.02)
|
|
113
|
+
- `--violation-cap`: Maximum violations to report (default: 25)
|
|
114
|
+
- `--parallel`: Number of worker processes used to drive the sandbox (default: 1)
|
|
115
|
+
- `--bootstrap-samples`: Resamples used for percentile bootstrap CI (default: 1000)
|
|
116
|
+
- `--ci-method`: Confidence interval method for pass-rate delta (`bootstrap`, `newcombe`, `wilson`)
|
|
117
|
+
- `--rr-ci-method`: Confidence interval method for relative risk (`log`)
|
|
118
|
+
- `--ci-method`: Confidence interval method for pass-rate delta (`bootstrap`, `newcombe`, or `wilson`)
|
|
119
|
+
- `--report-dir`: Destination directory for JSON reports (defaults to auto-discovery)
|
|
120
|
+
- `--executor`: Sandbox backend (`local`, `docker`, or `module:callable`)
|
|
121
|
+
- `--executor-config`: JSON object with executor-specific settings (e.g. CPU, image)
|
|
122
|
+
- `--config`: Path to a TOML file providing defaults for the above options
|
|
123
|
+
- `--export-violations`: Emit a JSON summary of property/MR failures to a given path
|
|
124
|
+
- `--html-report`: Write an interactive-ready HTML summary alongside the JSON report
|
|
125
|
+
- `--dispatcher`: Execution dispatcher (`local` threads or experimental `queue`)
|
|
126
|
+
- `--queue-config`: JSON configuration for queue-backed dispatchers (experimental)
|
|
127
|
+
- `--monitor`: Enable built-in monitors such as `latency`
|
|
128
|
+
|
|
129
|
+
## Example Implementations
|
|
130
|
+
|
|
131
|
+
The `examples/` directory contains sample implementations for the `top_k` task:
|
|
132
|
+
|
|
133
|
+
- **`top_k_baseline.py`**: Correct baseline implementation
|
|
134
|
+
- **`top_k_bad.py`**: Buggy implementation (should be rejected)
|
|
135
|
+
- **`top_k_improved.py`**: Improved implementation (should be accepted)
|
|
136
|
+
|
|
137
|
+
## Task Specification
|
|
138
|
+
|
|
139
|
+
### Top-K Task
|
|
140
|
+
|
|
141
|
+
The `top_k` task finds the k largest elements from a list:
|
|
142
|
+
|
|
143
|
+
**Input**: `(L: List[int], k: int)`
|
|
144
|
+
**Output**: `List[int]` - k largest elements, sorted in descending order
|
|
145
|
+
|
|
146
|
+
**Properties**:
|
|
147
|
+
1. Output length equals `min(k, len(L))`
|
|
148
|
+
2. Output is sorted in descending order
|
|
149
|
+
3. All output elements are from the input list
|
|
150
|
+
|
|
151
|
+
**Metamorphic Relations**:
|
|
152
|
+
1. **Permute Input**: Shuffling the input list should produce equivalent results
|
|
153
|
+
2. **Add Noise Below Min**: Adding small values below the minimum should not affect results
|
|
154
|
+
|
|
155
|
+
### Designing Effective Properties & Relations
|
|
156
|
+
|
|
157
|
+
Metamorphic Guard is only as strong as the properties and relations you write. When
|
|
158
|
+
modeling real ranking or pricing systems:
|
|
159
|
+
|
|
160
|
+
- **Separate invariants and tolerances** – keep hard invariants in `mode="hard"`
|
|
161
|
+
properties and express tolerance-based expectations (e.g., floating point) as
|
|
162
|
+
soft checks where near-misses are acceptable.
|
|
163
|
+
- **Explore symmetry & monotonicity** – swapping equivalent features, shuffling
|
|
164
|
+
inputs, or scaling features by positive constants are high-signal relations for
|
|
165
|
+
recommender systems.
|
|
166
|
+
- **Inject dominated noise** – append low-utility items to ensure the top results
|
|
167
|
+
remain stable under additional clutter.
|
|
168
|
+
- **Idempotence & projection** – running the algorithm twice should yield the same
|
|
169
|
+
output for deterministic tasks; encode this where appropriate.
|
|
170
|
+
- **Control randomness** – expose seed parameters and re-run stochastic algorithms
|
|
171
|
+
with fixed seeds inside your relations for reproducibility.
|
|
172
|
+
|
|
173
|
+
Each report now includes hashes for the generator function, properties, metamorphic
|
|
174
|
+
relations, and formatter callables (`spec_fingerprint`). This makes it possible to
|
|
175
|
+
prove precisely which oracles were active during a run.
|
|
176
|
+
|
|
177
|
+
### Config Files
|
|
178
|
+
|
|
179
|
+
Store frequently used defaults in a TOML file and pass it via `--config`:
|
|
180
|
+
|
|
181
|
+
```toml
|
|
182
|
+
task = "top_k"
|
|
183
|
+
baseline = "examples/top_k_baseline.py"
|
|
184
|
+
candidate = "examples/top_k_improved.py"
|
|
185
|
+
n = 600
|
|
186
|
+
seed = 1337
|
|
187
|
+
executor = "docker"
|
|
188
|
+
executor_config = { image = "python:3.11-slim", cpus = 2, memory_mb = 1024 }
|
|
189
|
+
policy_version = "policy-2025-11-09"
|
|
190
|
+
|
|
191
|
+
[metamorphic_guard.queue]
|
|
192
|
+
backend = "redis"
|
|
193
|
+
url = "redis://localhost:6379/0"
|
|
194
|
+
|
|
195
|
+
[metamorphic_guard.alerts]
|
|
196
|
+
webhooks = ["https://hooks.example.dev/metaguard"]
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Run with:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
metamorphic-guard --config metaguard.toml --report-dir reports/
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
CLI arguments still override config values when provided.
|
|
206
|
+
|
|
207
|
+
Configuration files are validated via a Pydantic schema; malformed values (e.g.
|
|
208
|
+
negative `n`, unknown dispatchers) raise actionable CLI errors before a run starts.
|
|
209
|
+
The optional `policy_version` propagates into reports/metadata, making it easy to
|
|
210
|
+
track changes to guard rails across deployments.
|
|
211
|
+
|
|
212
|
+
### Monitors & Alerts
|
|
213
|
+
|
|
214
|
+
Monitors provide higher-order statistical invariants beyond per-test properties.
|
|
215
|
+
Enable them via `--monitor latency` to capture latency distributions and flag
|
|
216
|
+
regressions, add `--monitor fairness` to track per-group success deltas, or
|
|
217
|
+
`--monitor resource:metric=cpu_ms,alert_ratio=1.3` to watch resource budgets.
|
|
218
|
+
Monitor output is written under the `monitors` key in the JSON report and
|
|
219
|
+
surfaced in the optional HTML report. Combine monitors by repeating
|
|
220
|
+
`--monitor …` on the CLI or programmatically via the Python API.
|
|
221
|
+
|
|
222
|
+
Alerts can be pushed to downstream systems by wiring `--alert-webhook
|
|
223
|
+
https://hooks.example.dev/guard`. The payload contains the flattened monitor
|
|
224
|
+
alerts together with run metadata (task, decision, run_id) for correlation.
|
|
225
|
+
|
|
226
|
+
## Implementation Requirements
|
|
227
|
+
|
|
228
|
+
### Candidate Function Contract
|
|
229
|
+
|
|
230
|
+
Each candidate file must export a callable function:
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
def solve(*args):
|
|
234
|
+
"""
|
|
235
|
+
Your implementation here.
|
|
236
|
+
Must handle the same input format as the task specification.
|
|
237
|
+
"""
|
|
238
|
+
return result
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Sandbox Execution
|
|
242
|
+
|
|
243
|
+
- All candidate code runs in isolated subprocesses
|
|
244
|
+
- Resource limits: CPU time, memory usage
|
|
245
|
+
- Network access is disabled by stubbing socket primitives and installing import hooks
|
|
246
|
+
- Subprocess creation (`os.system`, `subprocess.Popen`, etc.) is denied inside the sandbox
|
|
247
|
+
- Native FFI (`ctypes`, `cffi`), multiprocessing forks, and user site-packages are blocked at import time
|
|
248
|
+
- Timeout enforcement per test case
|
|
249
|
+
- Deterministic execution with fixed seeds
|
|
250
|
+
- Structured failures: sandbox responses include `error_type` / `error_code` fields (e.g., `timeout`, `process_exit`) and diagnostics for easier automation.
|
|
251
|
+
- Secret redaction: configure `METAMORPHIC_GUARD_REDACT` or `executor_config.redact_patterns` to scrub sensitive values from stdout/stderr/results before they leave the sandbox. Default patterns catch common API keys and tokens.
|
|
252
|
+
- Optional executors: set `--executor` / `METAMORPHIC_GUARD_EXECUTOR` to run evaluations inside Docker (`docker`) or a custom plugin (`package.module:callable`). Pass JSON tunables via `--executor-config` / `METAMORPHIC_GUARD_EXECUTOR_CONFIG` and override the Docker image with `METAMORPHIC_GUARD_DOCKER_IMAGE`.
|
|
253
|
+
|
|
254
|
+
Example Docker run:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
metamorphic-guard \
|
|
258
|
+
--task top_k \
|
|
259
|
+
--baseline examples/top_k_baseline.py \
|
|
260
|
+
--candidate examples/top_k_improved.py \
|
|
261
|
+
--executor docker \
|
|
262
|
+
--executor-config '{"image":"python:3.11-slim","cpus":1.5,"memory_mb":768}'
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
> **Deployment tip:** For untrusted code, run the sandbox worker inside an OS-level
|
|
266
|
+
> container or VM (e.g., Docker with seccomp/AppArmor or Firejail) and drop Linux
|
|
267
|
+
> capabilities. The built-in guardrails reduce attack surface, but pairing them with
|
|
268
|
+
> kernel isolation provides a stronger security boundary.
|
|
269
|
+
|
|
270
|
+
See `deploy/docker-compose.worker.yml` for a hardened reference stack (Redis + containerised worker with read-only root filesystem and disabled privileges).
|
|
271
|
+
|
|
272
|
+
### Distributed Execution (Preview)
|
|
273
|
+
|
|
274
|
+
The queue dispatcher (`--dispatcher queue`) enables distributed execution. In-memory
|
|
275
|
+
queues are available for local experimentation, while a Redis-backed adapter lets
|
|
276
|
+
you scale out with remote workers:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
metamorphic-guard --dispatcher queue \
|
|
280
|
+
--queue-config '{"backend":"redis","url":"redis://localhost:6379/0"}' \
|
|
281
|
+
--monitor latency \
|
|
282
|
+
--task top_k --baseline baseline.py --candidate candidate.py --improve-delta 0.0
|
|
283
|
+
|
|
284
|
+
# On worker machines
|
|
285
|
+
metamorphic-guard-worker --backend redis --queue-config '{"url":"redis://localhost:6379/0"}'
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
Workers fetch tasks, run sandboxed evaluations, and stream results back to the
|
|
289
|
+
coordinator. Memory backend workers remain in-process and are best suited for tests.
|
|
290
|
+
|
|
291
|
+
Adaptive queue controls:
|
|
292
|
+
- `adaptive_batching` (default `true`) grows/shrinks batch sizes based on observed
|
|
293
|
+
duration and queue pressure. Override `initial_batch_size`, `max_batch_size`, or
|
|
294
|
+
`adaptive_fast_threshold_ms` / `adaptive_slow_threshold_ms` to tune behaviour.
|
|
295
|
+
- `adaptive_compress` automatically avoids gzip when payloads are already tiny or
|
|
296
|
+
compression fails to win over raw JSON, cutting CPU for short test cases.
|
|
297
|
+
- `inflight_factor` governs how many cases are kept in-flight (per worker) before
|
|
298
|
+
backpressure kicks in; lower it for heavyweight candidates, raise it for latency-sensitive smoke tests.
|
|
299
|
+
|
|
300
|
+
### Plugin Ecosystem
|
|
301
|
+
|
|
302
|
+
Metamorphic Guard supports external extensions via Python entry points:
|
|
303
|
+
|
|
304
|
+
- `metamorphic_guard.monitors`: register additional monitor factories
|
|
305
|
+
- `metamorphic_guard.dispatchers`: provide custom dispatcher implementations
|
|
306
|
+
- Inspect installed plugins with `metamorphic-guard plugin list` (append `--json` for machine-readable output) and view rich metadata via `metamorphic-guard plugin info <name>`.
|
|
307
|
+
- Third-party packages should expose a `PLUGIN_METADATA` mapping (name, version, guard_min/guard_max, sandbox flag, etc.) so compatibility is surfaced in the registry.
|
|
308
|
+
|
|
309
|
+
Example `pyproject.toml` snippet:
|
|
310
|
+
|
|
311
|
+
```toml
|
|
312
|
+
[project.entry-points."metamorphic_guard.monitors"]
|
|
313
|
+
latency99 = "my_package.monitors:Latency99Monitor"
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
Once installed, the new monitor can be referenced on the CLI:
|
|
317
|
+
|
|
318
|
+
```bash
|
|
319
|
+
metamorphic-guard --monitor latency99
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
Programmatic APIs (`metamorphic_guard.monitoring.resolve_monitors`) also pick up
|
|
323
|
+
registered plugins, enabling teams to share bespoke invariants, dispatchers, and
|
|
324
|
+
workflows across services.
|
|
325
|
+
Pass `--sandbox-plugins` during evaluation (or set `sandbox_plugins = true` in config) to execute third-party monitors inside per-plugin subprocesses. Plugins can set `sandbox = true` in their metadata to request isolation by default.
|
|
326
|
+
|
|
327
|
+
### Observability & Artifacts
|
|
328
|
+
|
|
329
|
+
- Set `METAMORPHIC_GUARD_LOG_JSON=1` to stream structured JSON logs (start/complete events,
|
|
330
|
+
worker task telemetry) to stdout for ingestion by log pipelines.
|
|
331
|
+
- Prefer the CLI toggles `--log-json` / `--no-log-json` and `--metrics` / `--no-metrics` for one-off runs; pair with `--metrics-port` to expose a Prometheus endpoint directly from the coordinator or worker.
|
|
332
|
+
- Capture structured logs to disk with `--log-file observability/run.jsonl`; the coordinator/worker
|
|
333
|
+
will append JSON events and handle file lifecycle automatically.
|
|
334
|
+
- Enable Prometheus counters by exporting `METAMORPHIC_GUARD_PROMETHEUS=1` and register the
|
|
335
|
+
exposed registry (`metamorphic_guard.observability.prometheus_registry()`) with your HTTP exporter.
|
|
336
|
+
- Persist failing case artifacts either by providing `METAMORPHIC_GUARD_FAILED_DIR` or letting the
|
|
337
|
+
harness default to `reports/failed_cases/`; these JSON snapshots capture violations and config for debugging.
|
|
338
|
+
- Retention controls: `--failed-artifact-limit` caps how many snapshots are retained and
|
|
339
|
+
`--failed-artifact-ttl-days` prunes entries older than the configured horizon.
|
|
340
|
+
- Queue telemetry ships out-of-the-box: `metamorphic_queue_pending_tasks` (tasks waiting),
|
|
341
|
+
`metamorphic_queue_inflight_cases` (cases outstanding), and `metamorphic_queue_active_workers`
|
|
342
|
+
(live heartbeat count) alongside throughput counters (`*_cases_dispatched_total`, `*_cases_completed_total`,
|
|
343
|
+
`*_cases_requeued_total`).
|
|
344
|
+
- A starter Grafana dashboard lives at `docs/grafana/metamorphic-guard-dashboard.json` – import it
|
|
345
|
+
into Grafana and point the Prometheus datasource at the Guard metrics endpoint for live telemetry.
|
|
346
|
+
- HTML reports embed Chart.js dashboards summarising pass rates, fairness gaps, and resource usage
|
|
347
|
+
whenever the relevant monitors are enabled, making it easy to eyeball regressions without leaving the report.
|
|
348
|
+
|
|
349
|
+
### Quick Start Wizard & Cookbook
|
|
350
|
+
|
|
351
|
+
- Run `metamorphic-guard init` to scaffold a `metamorphic_guard.toml` configuration (supports distributed
|
|
352
|
+
queue defaults and monitor presets).
|
|
353
|
+
- Prefer `metamorphic-guard init --interactive` for a guided wizard that prompts for baseline/candidate paths,
|
|
354
|
+
distributed mode, and default monitors.
|
|
355
|
+
- Generate reusable plugin templates with `metamorphic-guard scaffold-plugin --kind monitor --name MyMonitor` and
|
|
356
|
+
wire them into your project via entry points.
|
|
357
|
+
- Explore `docs/cookbook.md` for recipes covering distributed evaluations, advanced monitors, and CI pipelines.
|
|
358
|
+
|
|
359
|
+
## Output Format
|
|
360
|
+
|
|
361
|
+
The system generates JSON reports in `reports/report_<timestamp>.json`:
|
|
362
|
+
|
|
363
|
+
```json
|
|
364
|
+
{
|
|
365
|
+
"task": "top_k",
|
|
366
|
+
"n": 400,
|
|
367
|
+
"seed": 42,
|
|
368
|
+
"config": {
|
|
369
|
+
"timeout_s": 2.0,
|
|
370
|
+
"mem_mb": 512,
|
|
371
|
+
"alpha": 0.05,
|
|
372
|
+
"improve_delta": 0.02,
|
|
373
|
+
"violation_cap": 25,
|
|
374
|
+
"parallel": 1,
|
|
375
|
+
"bootstrap_samples": 1000,
|
|
376
|
+
"ci_method": "bootstrap",
|
|
377
|
+
"rr_ci_method": "log"
|
|
378
|
+
},
|
|
379
|
+
"hashes": {
|
|
380
|
+
"baseline": "sha256...",
|
|
381
|
+
"candidate": "sha256..."
|
|
382
|
+
},
|
|
383
|
+
"spec_fingerprint": {
|
|
384
|
+
"gen_inputs": "sha256...",
|
|
385
|
+
"properties": [
|
|
386
|
+
{ "description": "Output length equals min(k, len(L))", "mode": "hard", "hash": "sha256..." }
|
|
387
|
+
],
|
|
388
|
+
"relations": [
|
|
389
|
+
{ "name": "permute_input", "expect": "equal", "hash": "sha256..." }
|
|
390
|
+
],
|
|
391
|
+
"equivalence": "sha256...",
|
|
392
|
+
"formatters": { "fmt_in": "sha256...", "fmt_out": "sha256..." }
|
|
393
|
+
},
|
|
394
|
+
"baseline": {
|
|
395
|
+
"passes": 388,
|
|
396
|
+
"total": 400,
|
|
397
|
+
"pass_rate": 0.97
|
|
398
|
+
},
|
|
399
|
+
"candidate": {
|
|
400
|
+
"passes": 396,
|
|
401
|
+
"total": 400,
|
|
402
|
+
"pass_rate": 0.99,
|
|
403
|
+
"prop_violations": [],
|
|
404
|
+
"mr_violations": []
|
|
405
|
+
},
|
|
406
|
+
"delta_pass_rate": 0.02,
|
|
407
|
+
"delta_ci": [0.015, 0.035],
|
|
408
|
+
"relative_risk": 1.021,
|
|
409
|
+
"relative_risk_ci": [0.998, 1.045],
|
|
410
|
+
"decision": {
|
|
411
|
+
"adopt": true,
|
|
412
|
+
"reason": "meets_gate"
|
|
413
|
+
},
|
|
414
|
+
"job_metadata": {
|
|
415
|
+
"hostname": "build-agent-01",
|
|
416
|
+
"python_version": "3.11.8",
|
|
417
|
+
"git_commit": "d1e5f8...",
|
|
418
|
+
"git_dirty": false
|
|
419
|
+
},
|
|
420
|
+
"monitors": {
|
|
421
|
+
"LatencyMonitor": {
|
|
422
|
+
"id": "LatencyMonitor",
|
|
423
|
+
"type": "latency",
|
|
424
|
+
"percentile": 0.95,
|
|
425
|
+
"summary": {
|
|
426
|
+
"baseline": {"count": 400, "mean_ms": 1.21, "p95_ms": 1.89},
|
|
427
|
+
"candidate": {"count": 400, "mean_ms": 1.05, "p95_ms": 1.61}
|
|
428
|
+
},
|
|
429
|
+
"alerts": []
|
|
430
|
+
}
|
|
431
|
+
},
|
|
432
|
+
"environment": {
|
|
433
|
+
"python_version": "3.11.8",
|
|
434
|
+
"implementation": "CPython",
|
|
435
|
+
"platform": "macOS-14-arm64-arm-64bit",
|
|
436
|
+
"executable": "/usr/bin/python3"
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
```
|