metamorphic-guard 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ """
2
+ Metamorphic Guard v1 - A Python library for comparing program versions using metamorphic testing.
3
+ """
4
+
5
+ from .specs import task, Spec, Property, MetamorphicRelation
6
+ from .generators import gen_top_k_inputs
7
+ from .relations import permute_input, add_noise_below_min
8
+ from .stability import multiset_equal
9
+
10
+ __version__ = "1.1.0"
11
+
12
+
@task("top_k")
def top_k_spec() -> Spec:
    """Build the ``Spec`` for the top_k task (decorated with ``@task("top_k")``).

    The spec bundles:
      * the input generator ``gen_top_k_inputs``;
      * three output properties (length, descending order, membership);
      * two metamorphic relations, both expecting ``"equal"`` results;
      * ``multiset_equal`` as the equivalence check;
      * formatters for rendering inputs and outputs.
    """

    def _has_expected_length(out, L, k):
        # The result can never be longer than the input or than k.
        return len(out) == min(k, len(L))

    def _is_descending(out, L, k):
        # Compared against a freshly sorted list, exactly as the check is defined.
        return sorted(out, reverse=True) == out

    def _drawn_from_input(out, L, k):
        return all(x in L for x in out)

    def _render_input(args):
        return f"L={args[0]}, k={args[1]}"

    def _render_output(result):
        return f"top_k={result}"

    output_properties = [
        Property(
            check=_has_expected_length,
            description="Output length equals min(k, len(L))",
        ),
        Property(
            check=_is_descending,
            description="Output is sorted in descending order",
        ),
        Property(
            check=_drawn_from_input,
            description="All output elements are from input list",
        ),
    ]

    metamorphic_relations = [
        MetamorphicRelation(
            name="permute_input",
            transform=permute_input,
            expect="equal",
        ),
        MetamorphicRelation(
            name="add_noise_below_min",
            transform=add_noise_below_min,
            expect="equal",
        ),
    ]

    return Spec(
        gen_inputs=gen_top_k_inputs,
        properties=output_properties,
        relations=metamorphic_relations,
        equivalence=multiset_equal,
        fmt_in=_render_input,
        fmt_out=_render_output,
    )
@@ -0,0 +1,163 @@
1
+ """
2
+ Command-line interface for Metamorphic Guard.
3
+ """
4
+
5
+ import sys
6
+
7
+ import click
8
+
9
+ from .gate import decide_adopt
10
+ from .harness import run_eval
11
+ from .specs import list_tasks
12
+ from .util import write_report
13
+
14
+
@click.command()
@click.option("--task", required=True, help="Task name to evaluate")
@click.option("--baseline", required=True, help="Path to baseline implementation")
@click.option("--candidate", required=True, help="Path to candidate implementation")
@click.option("--n", default=400, show_default=True, help="Number of test cases to generate")
@click.option("--seed", default=42, show_default=True, help="Random seed for generators")
@click.option("--timeout-s", default=2.0, show_default=True, help="Timeout per test (seconds)")
@click.option("--mem-mb", default=512, show_default=True, help="Memory limit per test (MB)")
@click.option("--alpha", default=0.05, show_default=True, help="Significance level for bootstrap CI")
@click.option(
    "--improve-delta",
    default=0.02,
    show_default=True,
    help="Minimum improvement threshold for adoption",
)
@click.option("--violation-cap", default=25, show_default=True, help="Maximum violations to record")
@click.option(
    "--parallel",
    type=int,
    default=1,
    show_default=True,
    help="Number of concurrent workers for sandbox execution",
)
@click.option(
    "--bootstrap-samples",
    type=int,
    default=1000,
    show_default=True,
    help="Bootstrap resamples for confidence interval estimation",
)
@click.option(
    "--ci-method",
    type=click.Choice(["bootstrap", "newcombe", "wilson"], case_sensitive=False),
    default="bootstrap",
    show_default=True,
    help="Method for the pass-rate delta confidence interval",
)
@click.option(
    "--rr-ci-method",
    type=click.Choice(["log"], case_sensitive=False),
    default="log",
    show_default=True,
    help="Method for relative risk confidence interval",
)
def main(
    task: str,
    baseline: str,
    candidate: str,
    n: int,
    seed: int,
    timeout_s: float,
    mem_mb: int,
    alpha: float,
    improve_delta: float,
    violation_cap: int,
    parallel: int,
    bootstrap_samples: int,
    ci_method: str,
    rr_ci_method: str,
) -> None:
    """Compare baseline and candidate implementations using metamorphic testing."""
    # NOTE: the docstring above is kept to a single line on purpose; click
    # uses it as the command's --help text.
    #
    # Flow: validate the task name, run the evaluation, apply the adoption
    # gate, write the report, print a summary, then exit with status 0 when
    # the candidate is adopted and 1 otherwise (including on any error).

    # Reject unknown task names before doing any work.
    available_tasks = list_tasks()
    if task not in available_tasks:
        click.echo(
            f"Error: Task '{task}' not found. Available tasks: {available_tasks}",
            err=True,
        )
        sys.exit(1)

    # Confidence level shown in the summary. It is derived from --alpha so the
    # label stays truthful when alpha is not 0.05 (e.g. alpha=0.01 -> "99% CI").
    ci_label = f"{(1 - alpha) * 100:g}% CI"

    try:
        click.echo(f"Running evaluation: {task}")
        click.echo(f"Baseline: {baseline}")
        click.echo(f"Candidate: {candidate}")
        click.echo(f"Test cases: {n}, Seed: {seed}")
        click.echo(f"Parallel workers: {parallel}")
        click.echo(f"CI method: {ci_method}")
        click.echo(f"RR CI method: {rr_ci_method}")

        result = run_eval(
            task_name=task,
            baseline_path=baseline,
            candidate_path=candidate,
            n=n,
            seed=seed,
            timeout_s=timeout_s,
            mem_mb=mem_mb,
            alpha=alpha,
            violation_cap=violation_cap,
            parallel=parallel,
            improve_delta=improve_delta,
            bootstrap_samples=bootstrap_samples,
            ci_method=ci_method,
            rr_ci_method=rr_ci_method,
        )

        # The decision is stored on the result so it is part of the report.
        decision = decide_adopt(result, improve_delta)
        result["decision"] = decision

        report_path = write_report(result)

        click.echo("\n" + "=" * 60)
        click.echo("EVALUATION SUMMARY")
        click.echo("=" * 60)
        click.echo(f"Task: {result['task']}")
        click.echo(f"Test cases: {result['n']}")
        click.echo(f"Seed: {result['seed']}")
        click.echo()
        click.echo("BASELINE:")
        click.echo(
            f" Pass rate: {result['baseline']['pass_rate']:.3f} "
            f"({result['baseline']['passes']}/{result['baseline']['total']})"
        )
        click.echo()
        click.echo("CANDIDATE:")
        click.echo(
            f" Pass rate: {result['candidate']['pass_rate']:.3f} "
            f"({result['candidate']['passes']}/{result['candidate']['total']})"
        )
        click.echo(f" Property violations: {len(result['candidate']['prop_violations'])}")
        click.echo(f" MR violations: {len(result['candidate']['mr_violations'])}")
        click.echo()
        click.echo("IMPROVEMENT:")
        click.echo(f" Delta: {result['delta_pass_rate']:.3f}")
        click.echo(f" {ci_label}: [{result['delta_ci'][0]:.3f}, {result['delta_ci'][1]:.3f}]")
        click.echo(f" Relative risk: {result['relative_risk']:.3f}")
        rr_ci = result["relative_risk_ci"]
        click.echo(f" RR {ci_label}: [{rr_ci[0]:.3f}, {rr_ci[1]:.3f}]")
        click.echo()
        click.echo("DECISION:")
        click.echo(f" Adopt: {decision['adopt']}")
        click.echo(f" Reason: {decision['reason']}")
        click.echo()
        click.echo(f"Report saved to: {report_path}")

        # sys.exit raises SystemExit, which is not an Exception subclass, so
        # the handler below does not intercept these deliberate exits.
        if decision["adopt"]:
            click.echo("✅ Candidate accepted!")
            sys.exit(0)

        click.echo("❌ Candidate rejected!")
        sys.exit(1)

    except Exception as exc:  # pragma: no cover - defensive surface
        click.echo(f"Error during evaluation: {exc}", err=True)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,59 @@
1
+ """
2
+ Adoption gate logic for deciding whether to accept a candidate implementation.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+
7
+
def decide_adopt(
    result: Dict[str, Any],
    improve_delta: float = 0.02,
    min_pass_rate: float = 0.80,
) -> Dict[str, Any]:
    """
    Decide whether to adopt the candidate based on evaluation results.

    The checks run in a fixed order and the first one that fails determines
    the returned reason:

    1. any recorded property violations,
    2. any recorded metamorphic relation violations,
    3. candidate pass rate below ``min_pass_rate``,
    4. lower bound of the delta confidence interval below ``improve_delta``.

    The numeric checks fail closed: a NaN pass rate or NaN CI lower bound is
    rejected instead of slipping through the comparison.

    Args:
        result: Evaluation result. Must contain ``result["candidate"]`` with
            ``"prop_violations"``, ``"mr_violations"`` and ``"pass_rate"``,
            and ``result["delta_ci"]`` whose first element is the lower bound.
        improve_delta: Minimum improvement threshold for CI lower bound
        min_pass_rate: Minimum pass rate required for candidate

    Returns:
        Dict with 'adopt' boolean and 'reason' string

    Raises:
        KeyError: If one of the required keys is missing from ``result``.
    """
    candidate = result["candidate"]
    delta_ci = result["delta_ci"]

    # Check for property violations
    prop_violations = candidate["prop_violations"]
    if prop_violations:
        return {
            "adopt": False,
            "reason": f"Property violations: {len(prop_violations)} violations found",
        }

    # Check for metamorphic relation violations
    mr_violations = candidate["mr_violations"]
    if mr_violations:
        return {
            "adopt": False,
            "reason": f"Metamorphic relation violations: {len(mr_violations)} violations found",
        }

    # Check minimum pass rate. Written as "not >=" rather than "<" so that a
    # NaN pass rate (every comparison with NaN is False) is rejected.
    pass_rate = candidate["pass_rate"]
    if not pass_rate >= min_pass_rate:
        return {
            "adopt": False,
            "reason": f"Pass rate too low: {pass_rate:.3f} < {min_pass_rate}",
        }

    # Check improvement threshold, again failing closed on NaN.
    ci_lower = delta_ci[0]
    if not ci_lower >= improve_delta:
        return {
            "adopt": False,
            "reason": f"Improvement insufficient: CI lower bound {ci_lower:.3f} < {improve_delta}",
        }

    # All conditions met
    return {
        "adopt": True,
        "reason": "meets_gate",
    }
@@ -0,0 +1,126 @@
1
+ """
2
+ Input generators for test cases.
3
+ """
4
+
5
+ import random
6
+ from typing import Callable, List, Sequence, Tuple
7
+
8
+
def gen_top_k_inputs(n: int, seed: int) -> List[Tuple[List[int], int]]:
    """
    Produce ``n`` pseudo-random ``(values, k)`` inputs for the top_k task.

    One ``random.Random(seed)`` instance drives both the choice of scenario
    and the scenario's own draws, so the same ``(n, seed)`` pair always
    yields the same list of cases.

    Each case is built by a scenario picked with ``rng.choice``; together the
    scenarios aim to cover:
    * Empty lists and single-element lists.
    * Lists with heavy duplication.
    * Already sorted ascending/descending inputs.
    * Very long lists (up to ~200 elements).
    * k larger than the list length and k == 0.
    * Negative-only, mixed-sign, and extreme magnitude values.
    """
    rng = random.Random(seed)
    builders: Sequence[Callable[[random.Random], Tuple[List[int], int]]] = (
        _case_empty,
        _case_single,
        _case_duplicates,
        _case_sorted_ascending,
        _case_sorted_descending,
        _case_long,
        _case_random_dense,
        _case_k_zero,
        _case_k_bigger_than_len,
        _case_negatives_only,
        _case_extreme_values,
        _case_small_range_many_duplicates,
    )

    # Pick a builder, then immediately let it draw from the same generator.
    return [rng.choice(builders)(rng) for _ in range(n)]
42
+
43
+
44
+ def _case_empty(rng: random.Random) -> Tuple[List[int], int]:
45
+ return [], 0
46
+
47
+
48
+ def _case_single(rng: random.Random) -> Tuple[List[int], int]:
49
+ value = rng.randint(-1000, 1000)
50
+ return [value], rng.choice([0, 1, 2])
51
+
52
+
53
+ def _case_duplicates(rng: random.Random) -> Tuple[List[int], int]:
54
+ base = rng.randint(-100, 100)
55
+ dup_count = rng.randint(3, 15)
56
+ noise = [rng.randint(-150, 150) for _ in range(rng.randint(1, 6))]
57
+ values = [base] * dup_count + noise
58
+ rng.shuffle(values)
59
+ k = rng.randint(1, max(1, len(values)))
60
+ return values, k
61
+
62
+
63
+ def _case_sorted_ascending(rng: random.Random) -> Tuple[List[int], int]:
64
+ length = rng.randint(2, 40)
65
+ values = sorted(rng.randint(-500, 500) for _ in range(length))
66
+ k = rng.randint(1, length)
67
+ return values, k
68
+
69
+
70
+ def _case_sorted_descending(rng: random.Random) -> Tuple[List[int], int]:
71
+ length = rng.randint(2, 40)
72
+ values = sorted((rng.randint(-500, 500) for _ in range(length)), reverse=True)
73
+ k = rng.randint(1, length)
74
+ return values, k
75
+
76
+
77
+ def _case_long(rng: random.Random) -> Tuple[List[int], int]:
78
+ length = rng.randint(100, 200)
79
+ values = [rng.randint(-10**4, 10**4) for _ in range(length)]
80
+ k = rng.randint(1, length)
81
+ return values, k
82
+
83
+
84
+ def _case_random_dense(rng: random.Random) -> Tuple[List[int], int]:
85
+ length = rng.randint(5, 80)
86
+ values = [rng.randint(-500, 500) for _ in range(length)]
87
+ k = rng.randint(0, length + 5)
88
+ return values, k
89
+
90
+
91
+ def _case_k_zero(rng: random.Random) -> Tuple[List[int], int]:
92
+ length = rng.randint(1, 25)
93
+ values = [rng.randint(-200, 200) for _ in range(length)]
94
+ return values, 0
95
+
96
+
97
+ def _case_k_bigger_than_len(rng: random.Random) -> Tuple[List[int], int]:
98
+ length = rng.randint(1, 25)
99
+ values = [rng.randint(-100, 100) for _ in range(length)]
100
+ k = length + rng.randint(1, 20)
101
+ return values, k
102
+
103
+
104
+ def _case_negatives_only(rng: random.Random) -> Tuple[List[int], int]:
105
+ length = rng.randint(3, 30)
106
+ values = [rng.randint(-10**5, -1) for _ in range(length)]
107
+ k = rng.randint(1, length)
108
+ return values, k
109
+
110
+
111
+ def _case_extreme_values(rng: random.Random) -> Tuple[List[int], int]:
112
+ palette = [-10**9, -10**6, -10**3, 0, 10**3, 10**6, 10**9]
113
+ length = rng.randint(5, 30)
114
+ values = [rng.choice(palette) for _ in range(length)]
115
+ k = rng.randint(1, length)
116
+ return values, k
117
+
118
+
119
+ def _case_small_range_many_duplicates(rng: random.Random) -> Tuple[List[int], int]:
120
+ length = rng.randint(10, 60)
121
+ base_values = [rng.randint(-3, 3) for _ in range(length)]
122
+ values = base_values + [rng.randint(-3, 3) for _ in range(length // 2)]
123
+ rng.shuffle(values)
124
+ k = rng.randint(1, max(1, len(values)))
125
+ return values, k
126
+