metamorphic-guard 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metamorphic_guard/__init__.py +47 -0
- metamorphic_guard/cli.py +163 -0
- metamorphic_guard/gate.py +59 -0
- metamorphic_guard/generators.py +126 -0
- metamorphic_guard/harness.py +465 -0
- metamorphic_guard/relations.py +31 -0
- metamorphic_guard/sandbox.py +417 -0
- metamorphic_guard/specs.py +65 -0
- metamorphic_guard/stability.py +23 -0
- metamorphic_guard/util.py +114 -0
- metamorphic_guard-1.1.0.dist-info/LICENSE +21 -0
- metamorphic_guard-1.1.0.dist-info/METADATA +295 -0
- metamorphic_guard-1.1.0.dist-info/RECORD +21 -0
- metamorphic_guard-1.1.0.dist-info/WHEEL +5 -0
- metamorphic_guard-1.1.0.dist-info/entry_points.txt +3 -0
- metamorphic_guard-1.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_cli.py +114 -0
- tests/test_gate.py +109 -0
- tests/test_harness.py +186 -0
- tests/test_sandbox.py +178 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Metamorphic Guard v1 - A Python library for comparing program versions using metamorphic testing.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .specs import task, Spec, Property, MetamorphicRelation
|
|
6
|
+
from .generators import gen_top_k_inputs
|
|
7
|
+
from .relations import permute_input, add_noise_below_min
|
|
8
|
+
from .stability import multiset_equal
|
|
9
|
+
|
|
10
|
+
# Version string of the metamorphic_guard package, exposed to importers.
__version__ = "1.1.0"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@task("top_k")
def top_k_spec() -> Spec:
    """Build the ``Spec`` registered under the task name ``top_k``.

    The spec bundles the input generator, three output properties, two
    metamorphic relations (both expecting "equal" results), the equivalence
    function used to compare outputs, and formatters for inputs and outputs.
    """
    # Each check receives the candidate output plus the original (L, k) input.
    output_properties = [
        Property(
            check=lambda out, L, k: len(out) == min(k, len(L)),
            description="Output length equals min(k, len(L))",
        ),
        Property(
            check=lambda out, L, k: sorted(out, reverse=True) == out,
            description="Output is sorted in descending order",
        ),
        Property(
            check=lambda out, L, k: all(x in L for x in out),
            description="All output elements are from input list",
        ),
    ]

    # Both relations share the same expectation, so build them from
    # (name, transform) pairs.
    input_relations = [
        MetamorphicRelation(name=label, transform=fn, expect="equal")
        for label, fn in (
            ("permute_input", permute_input),
            ("add_noise_below_min", add_noise_below_min),
        )
    ]

    return Spec(
        gen_inputs=gen_top_k_inputs,
        properties=output_properties,
        relations=input_relations,
        equivalence=multiset_equal,
        fmt_in=lambda args: f"L={args[0]}, k={args[1]}",
        fmt_out=lambda result: f"top_k={result}",
    )
|
metamorphic_guard/cli.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for Metamorphic Guard.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
|
|
9
|
+
from .gate import decide_adopt
|
|
10
|
+
from .harness import run_eval
|
|
11
|
+
from .specs import list_tasks
|
|
12
|
+
from .util import write_report
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.command()
@click.option("--task", required=True, help="Task name to evaluate")
@click.option("--baseline", required=True, help="Path to baseline implementation")
@click.option("--candidate", required=True, help="Path to candidate implementation")
@click.option("--n", default=400, show_default=True, help="Number of test cases to generate")
@click.option("--seed", default=42, show_default=True, help="Random seed for generators")
@click.option("--timeout-s", default=2.0, show_default=True, help="Timeout per test (seconds)")
@click.option("--mem-mb", default=512, show_default=True, help="Memory limit per test (MB)")
@click.option("--alpha", default=0.05, show_default=True, help="Significance level for bootstrap CI")
@click.option(
    "--improve-delta",
    default=0.02,
    show_default=True,
    help="Minimum improvement threshold for adoption",
)
@click.option("--violation-cap", default=25, show_default=True, help="Maximum violations to record")
@click.option(
    "--parallel",
    type=int,
    default=1,
    show_default=True,
    help="Number of concurrent workers for sandbox execution",
)
@click.option(
    "--bootstrap-samples",
    type=int,
    default=1000,
    show_default=True,
    help="Bootstrap resamples for confidence interval estimation",
)
@click.option(
    "--ci-method",
    type=click.Choice(["bootstrap", "newcombe", "wilson"], case_sensitive=False),
    default="bootstrap",
    show_default=True,
    help="Method for the pass-rate delta confidence interval",
)
@click.option(
    "--rr-ci-method",
    type=click.Choice(["log"], case_sensitive=False),
    default="log",
    show_default=True,
    help="Method for relative risk confidence interval",
)
def main(
    task: str,
    baseline: str,
    candidate: str,
    n: int,
    seed: int,
    timeout_s: float,
    mem_mb: int,
    alpha: float,
    improve_delta: float,
    violation_cap: int,
    parallel: int,
    bootstrap_samples: int,
    ci_method: str,
    rr_ci_method: str,
) -> None:
    """Compare baseline and candidate implementations using metamorphic testing."""
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # kept short; the details live in these comments.
    #
    # Flow: validate the task name, run the evaluation, apply the adoption
    # gate, write the report, print a summary, then exit with status 0 when
    # the candidate is adopted and 1 otherwise (unknown task, rejection, or
    # any error raised during evaluation).

    available_tasks = list_tasks()
    if task not in available_tasks:
        click.echo(
            f"Error: Task '{task}' not found. Available tasks: {available_tasks}",
            err=True,
        )
        sys.exit(1)

    try:
        click.echo(f"Running evaluation: {task}")
        click.echo(f"Baseline: {baseline}")
        click.echo(f"Candidate: {candidate}")
        click.echo(f"Test cases: {n}, Seed: {seed}")
        click.echo(f"Parallel workers: {parallel}")
        click.echo(f"CI method: {ci_method}")
        click.echo(f"RR CI method: {rr_ci_method}")

        result = run_eval(
            task_name=task,
            baseline_path=baseline,
            candidate_path=candidate,
            n=n,
            seed=seed,
            timeout_s=timeout_s,
            mem_mb=mem_mb,
            alpha=alpha,
            violation_cap=violation_cap,
            parallel=parallel,
            improve_delta=improve_delta,
            bootstrap_samples=bootstrap_samples,
            ci_method=ci_method,
            rr_ci_method=rr_ci_method,
        )

        # The gate decision is stored on the result so it ends up in the report.
        decision = decide_adopt(result, improve_delta)
        result["decision"] = decision

        report_path = write_report(result)

        # The confidence level is controlled by --alpha, so derive the label
        # from it instead of hard-coding 95% (which is only right for the
        # default alpha of 0.05). ":g" renders 95.0 as "95" and 99.9 as "99.9".
        ci_level = f"{(1.0 - alpha) * 100:g}%"

        click.echo("\n" + "=" * 60)
        click.echo("EVALUATION SUMMARY")
        click.echo("=" * 60)
        click.echo(f"Task: {result['task']}")
        click.echo(f"Test cases: {result['n']}")
        click.echo(f"Seed: {result['seed']}")
        click.echo()
        click.echo("BASELINE:")
        click.echo(
            f" Pass rate: {result['baseline']['pass_rate']:.3f} "
            f"({result['baseline']['passes']}/{result['baseline']['total']})"
        )
        click.echo()
        click.echo("CANDIDATE:")
        click.echo(
            f" Pass rate: {result['candidate']['pass_rate']:.3f} "
            f"({result['candidate']['passes']}/{result['candidate']['total']})"
        )
        click.echo(f" Property violations: {len(result['candidate']['prop_violations'])}")
        click.echo(f" MR violations: {len(result['candidate']['mr_violations'])}")
        click.echo()
        click.echo("IMPROVEMENT:")
        click.echo(f" Delta: {result['delta_pass_rate']:.3f}")
        click.echo(f" {ci_level} CI: [{result['delta_ci'][0]:.3f}, {result['delta_ci'][1]:.3f}]")
        click.echo(f" Relative risk: {result['relative_risk']:.3f}")
        rr_ci = result["relative_risk_ci"]
        # NOTE(review): label left as-is; confirm whether the harness computes
        # the relative-risk interval at the --alpha level before relabelling.
        click.echo(f" RR 95% CI: [{rr_ci[0]:.3f}, {rr_ci[1]:.3f}]")
        click.echo()
        click.echo("DECISION:")
        click.echo(f" Adopt: {decision['adopt']}")
        click.echo(f" Reason: {decision['reason']}")
        click.echo()
        click.echo(f"Report saved to: {report_path}")

        # sys.exit raises SystemExit, which is not an Exception subclass, so
        # these exits are not intercepted by the handler below.
        if decision["adopt"]:
            click.echo("✅ Candidate accepted!")
            sys.exit(0)

        click.echo("❌ Candidate rejected!")
        sys.exit(1)

    except Exception as exc:  # pragma: no cover - defensive surface
        # Top-level boundary: report the failure on stderr and signal it via
        # the exit status rather than dumping a traceback.
        click.echo(f"Error during evaluation: {exc}", err=True)
        sys.exit(1)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# Invoke the click command when this module is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Adoption gate logic for deciding whether to accept a candidate implementation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def decide_adopt(
    result: Dict[str, Any],
    improve_delta: float = 0.02,
    min_pass_rate: float = 0.80,
) -> Dict[str, Any]:
    """
    Apply the adoption gate to an evaluation result.

    The checks run in a fixed order and the first failing one determines the
    rejection reason: property violations, metamorphic relation violations,
    candidate pass rate below ``min_pass_rate``, and finally the lower bound
    of the delta confidence interval falling below ``improve_delta``.

    Args:
        result: Evaluation result; must provide ``result["candidate"]`` (with
            ``prop_violations``, ``mr_violations`` and ``pass_rate``) and
            ``result["delta_ci"]`` (indexable, lower bound first).
        improve_delta: Smallest acceptable lower bound of the delta CI.
        min_pass_rate: Smallest acceptable candidate pass rate.

    Returns:
        A dict with a boolean ``"adopt"`` and a string ``"reason"``.
    """

    def _reject(why: str) -> Dict[str, Any]:
        # Every rejection has the same shape; only the explanation differs.
        return {"adopt": False, "reason": why}

    cand = result["candidate"]
    ci = result["delta_ci"]

    # Any recorded property violation is disqualifying.
    prop_hits = cand["prop_violations"]
    if prop_hits:
        return _reject(f"Property violations: {len(prop_hits)} violations found")

    # Likewise for metamorphic relation violations.
    mr_hits = cand["mr_violations"]
    if mr_hits:
        return _reject(
            f"Metamorphic relation violations: {len(mr_hits)} violations found"
        )

    # The candidate must clear the absolute pass-rate floor.
    rate = cand["pass_rate"]
    if rate < min_pass_rate:
        return _reject(f"Pass rate too low: {rate:.3f} < {min_pass_rate}")

    # The improvement must hold even at the lower end of the interval.
    lower = ci[0]
    if lower < improve_delta:
        return _reject(
            f"Improvement insufficient: CI lower bound {lower:.3f} < {improve_delta}"
        )

    return {"adopt": True, "reason": "meets_gate"}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Input generators for test cases.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from typing import Callable, List, Sequence, Tuple
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def gen_top_k_inputs(n: int, seed: int) -> List[Tuple[List[int], int]]:
    """
    Produce ``n`` seeded ``(values, k)`` test cases for the top_k task.

    Each case is built by one scenario builder picked at random from a fixed
    pool, so the mix of scenarios varies with ``seed`` and a given scenario is
    not guaranteed to appear for small ``n``. The pool covers:
      * Empty and single-element lists.
      * Lists dominated by duplicates (wide and narrow value ranges).
      * Inputs already sorted ascending or descending.
      * Long lists (100 to 200 elements) and dense random lists.
      * k == 0 and k larger than the list length.
      * Negative-only values and extreme magnitudes.

    The same ``(n, seed)`` pair always yields the same cases because all
    randomness flows through a single ``random.Random(seed)`` instance.
    """
    rng = random.Random(seed)
    builders: Sequence[Callable[[random.Random], Tuple[List[int], int]]] = (
        _case_empty,
        _case_single,
        _case_duplicates,
        _case_sorted_ascending,
        _case_sorted_descending,
        _case_long,
        _case_random_dense,
        _case_k_zero,
        _case_k_bigger_than_len,
        _case_negatives_only,
        _case_extreme_values,
        _case_small_range_many_duplicates,
    )

    # Pick a builder, then let it draw from the same rng to build the case.
    return [rng.choice(builders)(rng) for _ in range(n)]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _case_empty(rng: random.Random) -> Tuple[List[int], int]:
    """Return the degenerate case: an empty list with k == 0.

    ``rng`` is accepted for signature parity with the other scenario
    builders and is never consulted.
    """
    values: List[int] = []
    return values, 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _case_single(rng: random.Random) -> Tuple[List[int], int]:
    """Build a one-element list with k drawn from 0, 1 or 2.

    The element lies in [-1000, 1000]; it is drawn before k.
    """
    only = rng.randint(-1000, 1000)
    k = rng.choice([0, 1, 2])
    return [only], k
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _case_duplicates(rng: random.Random) -> Tuple[List[int], int]:
    """Build a shuffled list dominated by one repeated value.

    One value from [-100, 100] is repeated 3 to 15 times, 1 to 6 noise values
    from [-150, 150] are added, and the result is shuffled. k is drawn from
    1 up to the list length.
    """
    repeated = rng.randint(-100, 100)
    copies = rng.randint(3, 15)
    noise_len = rng.randint(1, 6)

    values = [repeated] * copies
    values.extend(rng.randint(-150, 150) for _ in range(noise_len))
    rng.shuffle(values)

    upper = max(1, len(values))
    return values, rng.randint(1, upper)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _case_sorted_ascending(rng: random.Random) -> Tuple[List[int], int]:
    """Build an ascending-sorted list of 2 to 40 values from [-500, 500].

    k is drawn from 1 up to the list length, after the values.
    """
    size = rng.randint(2, 40)
    values = [rng.randint(-500, 500) for _ in range(size)]
    values.sort()
    return values, rng.randint(1, size)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _case_sorted_descending(rng: random.Random) -> Tuple[List[int], int]:
    """Build a descending-sorted list of 2 to 40 values from [-500, 500].

    k is drawn from 1 up to the list length, after the values.
    """
    size = rng.randint(2, 40)
    values = [rng.randint(-500, 500) for _ in range(size)]
    values.sort(reverse=True)
    return values, rng.randint(1, size)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _case_long(rng: random.Random) -> Tuple[List[int], int]:
    """Build a long list of 100 to 200 values from [-10**4, 10**4].

    k is drawn from 1 up to the list length, after the values.
    """
    size = rng.randint(100, 200)
    low, high = -10**4, 10**4
    values = [rng.randint(low, high) for _ in range(size)]
    return values, rng.randint(1, size)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _case_random_dense(rng: random.Random) -> Tuple[List[int], int]:
    """Build an unsorted list of 5 to 80 values from [-500, 500].

    k ranges from 0 to five past the list length, so it may be zero or
    exceed the number of elements.
    """
    size = rng.randint(5, 80)
    values = [rng.randint(-500, 500) for _ in range(size)]
    k_upper = size + 5
    return values, rng.randint(0, k_upper)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _case_k_zero(rng: random.Random) -> Tuple[List[int], int]:
    """Build a non-empty list (1 to 25 values from [-200, 200]) with k == 0."""
    size = rng.randint(1, 25)
    values = [rng.randint(-200, 200) for _ in range(size)]
    k = 0
    return values, k
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _case_k_bigger_than_len(rng: random.Random) -> Tuple[List[int], int]:
    """Build a list of 1 to 25 values from [-100, 100] with k past its length.

    k exceeds the list length by 1 to 20.
    """
    size = rng.randint(1, 25)
    values = [rng.randint(-100, 100) for _ in range(size)]
    overshoot = rng.randint(1, 20)
    return values, size + overshoot
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _case_negatives_only(rng: random.Random) -> Tuple[List[int], int]:
    """Build a list of 3 to 30 strictly negative values from [-10**5, -1].

    k is drawn from 1 up to the list length, after the values.
    """
    size = rng.randint(3, 30)
    most_negative = -10**5
    values = [rng.randint(most_negative, -1) for _ in range(size)]
    return values, rng.randint(1, size)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _case_extreme_values(rng: random.Random) -> Tuple[List[int], int]:
    """Build a list of 5 to 30 values sampled from a fixed palette.

    The palette holds zero plus +/- 10**3, 10**6 and 10**9, so duplicates
    are likely. k is drawn from 1 up to the list length.
    """
    magnitudes = (10**3, 10**6, 10**9)
    # Same ordering as the literal palette: most negative first, zero in the
    # middle, largest last.
    palette = [-m for m in reversed(magnitudes)] + [0] + list(magnitudes)

    size = rng.randint(5, 30)
    values = [rng.choice(palette) for _ in range(size)]
    return values, rng.randint(1, size)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _case_small_range_many_duplicates(rng: random.Random) -> Tuple[List[int], int]:
    """Build a shuffled list drawn from the narrow range [-3, 3].

    With a base length of 10 to 60, the list holds that many values plus half
    as many again (integer division), so heavy duplication is unavoidable.
    k is drawn from 1 up to the final list length.
    """
    size = rng.randint(10, 60)
    # One pass draws the base values and the extra half in the same order.
    total = size + size // 2
    values = [rng.randint(-3, 3) for _ in range(total)]
    rng.shuffle(values)

    upper = max(1, len(values))
    return values, rng.randint(1, upper)
|
|
126
|
+
|