janus-labs 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/__main__.py +7 -0
- cli/clipboard.py +113 -0
- cli/main.py +690 -0
- cli/output.py +97 -0
- cli/submit.py +270 -0
- config/__init__.py +1 -0
- config/detection.py +72 -0
- forge/__init__.py +5 -0
- forge/behavior.py +35 -0
- forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
- forge/behaviors/BHV-003-error-handling.yaml +28 -0
- gauge/__init__.py +17 -0
- gauge/adapter.py +134 -0
- gauge/behaviors/__init__.py +11 -0
- gauge/behaviors/code_quality.py +73 -0
- gauge/behaviors/instruction_adherence.py +52 -0
- gauge/behaviors/test_cheating.py +178 -0
- gauge/governed_rollout.py +107 -0
- gauge/judge.py +179 -0
- gauge/qualitative.py +271 -0
- gauge/report.py +210 -0
- gauge/trust_elasticity.py +172 -0
- governance/__init__.py +14 -0
- governance/bridge.py +124 -0
- governance/memory.py +116 -0
- harness/__init__.py +1 -0
- harness/artifacts.py +195 -0
- harness/executor.py +51 -0
- harness/sandbox.py +40 -0
- harness/types.py +46 -0
- janus_labs/__init__.py +16 -0
- janus_labs/__main__.py +37 -0
- janus_labs-0.2.0.dist-info/METADATA +316 -0
- janus_labs-0.2.0.dist-info/RECORD +80 -0
- janus_labs-0.2.0.dist-info/WHEEL +5 -0
- janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
- janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
- janus_labs-0.2.0.dist-info/top_level.txt +11 -0
- janus_types.py +140 -0
- probe/__init__.py +19 -0
- probe/discovery.py +194 -0
- probe/explorer.py +236 -0
- probe/mutations.py +196 -0
- probe/tracer.py +193 -0
- scaffold/__init__.py +1 -0
- scaffold/scorer.py +321 -0
- scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
- scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
- scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
- scaffold/templates/default/.gitignore +4 -0
- scaffold/templates/default/src/__init__.py +0 -0
- scaffold/templates/default/src/main.py +23 -0
- scaffold/templates/default/tests/__init__.py +0 -0
- scaffold/templates/default/tests/test_main.py +32 -0
- scaffold/workspace.py +202 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
- scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
- scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
- suite/__init__.py +16 -0
- suite/builtin/__init__.py +13 -0
- suite/builtin/hello_world.py +28 -0
- suite/builtin/refactor_storm.py +92 -0
- suite/comparison.py +274 -0
- suite/definition.py +51 -0
- suite/export/__init__.py +6 -0
- suite/export/github.py +58 -0
- suite/export/html.py +160 -0
- suite/export/json_export.py +65 -0
- suite/registry.py +20 -0
- suite/result.py +133 -0
- suite/runner.py +110 -0
- suite/thresholds.py +80 -0
suite/export/html.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""HTML export for SuiteResult."""
|
|
2
|
+
|
|
3
|
+
from html import escape
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from suite.result import SuiteResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def export_html(result: SuiteResult, output_path: str) -> str:
    """
    Generate a self-contained HTML report.

    Requirements:
    - No external dependencies (inline CSS/JS)
    - Viewable offline
    - Shows: headline, breakdown table, governance summary
    - Professional styling (dark theme preferred)

    Args:
        result: Completed suite result to render.
        output_path: Destination path for the HTML file.

    Returns:
        Path to generated HTML file
    """
    # Escape all externally supplied text before interpolating it into markup;
    # behavior names/IDs and suite metadata may contain HTML metacharacters.
    # (Previously only the config-file list was escaped.)
    rows = "\n".join(
        (
            "<tr>"
            f"<td>{escape(score.behavior_id)}</td>"
            f"<td>{escape(score.name)}</td>"
            f"<td>{score.score:.1f}</td>"
            f"<td>{escape(score.grade)}</td>"
            f"<td>{'yes' if score.passed else 'no'}</td>"
            f"<td>{'yes' if score.halted else 'no'}</td>"
            "</tr>"
        )
        for score in result.behavior_scores
    )

    halted_behaviors = escape(", ".join(result.governance_flags.halted_behaviors) or "none")
    config_badge = ""
    if result.config_metadata:
        if result.config_metadata.config_source == "custom":
            files = ", ".join(result.config_metadata.config_files)
            config_badge = (
                '<span class="badge badge-custom" '
                f'title="Custom config: {escape(files, quote=True)}">'
                "⚙️ Custom</span>"
            )
        else:
            config_badge = '<span class="badge badge-default">📦 Default</span>'

    html = f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Janus Labs Suite Report</title>
<style>
body {{
  font-family: "Segoe UI", Arial, sans-serif;
  background: #0f1117;
  color: #e6e6e6;
  margin: 0;
  padding: 32px;
}}
.card {{
  background: #151922;
  border: 1px solid #2a2f3a;
  border-radius: 12px;
  padding: 24px;
  margin-bottom: 24px;
  box-shadow: 0 8px 24px rgba(0,0,0,0.35);
}}
h1, h2 {{
  margin: 0 0 12px 0;
}}
.headline {{
  font-size: 48px;
  font-weight: 700;
}}
.grade {{
  font-size: 24px;
  font-weight: 600;
  color: #8dd18f;
}}
table {{
  width: 100%;
  border-collapse: collapse;
}}
th, td {{
  border-bottom: 1px solid #2a2f3a;
  padding: 10px 8px;
  text-align: left;
}}
th {{
  color: #9aa3b2;
  font-size: 12px;
  letter-spacing: 0.08em;
  text-transform: uppercase;
}}
.muted {{
  color: #9aa3b2;
}}
.badge {{
  display: inline-block;
  padding: 2px 8px;
  border-radius: 12px;
  font-size: 0.75rem;
  font-weight: 500;
  margin-left: 8px;
}}
.badge-custom {{
  background: #0d9488;
  color: white;
}}
.badge-default {{
  background: #6b7280;
  color: white;
}}
</style>
</head>
<body>
<div class="card">
<h1>{escape(result.suite_id)} ({escape(result.suite_version)}) {config_badge}</h1>
<div class="headline">{result.headline_score:.1f}</div>
<div class="grade">Grade {escape(result.grade)}</div>
<div class="muted">Comparability key: {escape(result.comparability_key)}</div>
</div>

<div class="card">
<h2>Behavior Breakdown</h2>
<table>
<thead>
<tr>
<th>ID</th>
<th>Name</th>
<th>Score</th>
<th>Grade</th>
<th>Passed</th>
<th>Halted</th>
</tr>
</thead>
<tbody>
{rows}
</tbody>
</table>
</div>

<div class="card">
<h2>Governance Summary</h2>
<p>Total rollouts: <strong>{result.total_rollouts}</strong></p>
<p>Any halted: <strong>{'yes' if result.governance_flags.any_halted else 'no'}</strong></p>
<p>Halted count: <strong>{result.governance_flags.halted_count}</strong></p>
<p>Halted behaviors: <strong>{halted_behaviors}</strong></p>
<p>Foundation check rate: <strong>{result.governance_flags.foundation_check_rate:.2f}</strong></p>
</div>
</body>
</html>
"""

    output = Path(output_path)
    output.write_text(html, encoding="utf-8")
    return str(output)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""JSON export and load for SuiteResult."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from config.detection import ConfigMetadata
|
|
7
|
+
from suite.result import BehaviorScore, GovernanceFlags, SuiteResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def export_json(result: SuiteResult, output_path: str) -> str:
    """
    Serialize a SuiteResult to a JSON file.

    Args:
        result: Suite result to serialize.
        output_path: Destination path for the JSON file.

    Returns:
        Path to generated JSON file
    """
    destination = Path(output_path)
    document = dict(
        suite_id=result.suite_id,
        suite_version=result.suite_version,
        config_fingerprint=result.config_fingerprint,
        timestamp=result.timestamp,
        headline_score=result.headline_score,
        grade=result.grade,
        behavior_scores=[vars(score) for score in result.behavior_scores],
        governance_flags=vars(result.governance_flags),
        comparability_key=result.comparability_key,
        total_rollouts=result.total_rollouts,
        total_duration_ms=result.total_duration_ms,
    )
    meta = result.config_metadata
    if meta:
        # Only serialized when present; load_json restores it symmetrically.
        document["config_metadata"] = dict(
            config_source=meta.config_source,
            config_hash=meta.config_hash,
            config_files=meta.config_files,
            captured_at=meta.captured_at,
        )
    destination.write_text(json.dumps(document, indent=2), encoding="utf-8")
    return str(destination)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_json(path: str) -> SuiteResult:
    """
    Reconstruct a SuiteResult from a JSON file produced by export_json.

    Args:
        path: Location of the JSON report.

    Returns:
        Deserialized SuiteResult.
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    scores = [BehaviorScore(**entry) for entry in raw.get("behavior_scores", [])]
    flags = GovernanceFlags(**raw["governance_flags"])
    # config_metadata is optional in the payload; a missing key and an
    # explicit null are treated identically.
    meta_payload = raw.get("config_metadata")
    metadata = ConfigMetadata(**meta_payload) if meta_payload is not None else None
    return SuiteResult(
        suite_id=raw["suite_id"],
        suite_version=raw["suite_version"],
        config_fingerprint=raw["config_fingerprint"],
        timestamp=raw["timestamp"],
        headline_score=raw["headline_score"],
        grade=raw["grade"],
        behavior_scores=scores,
        governance_flags=flags,
        comparability_key=raw["comparability_key"],
        total_rollouts=raw["total_rollouts"],
        total_duration_ms=raw["total_duration_ms"],
        config_metadata=metadata,
    )
|
suite/registry.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Registry of built-in benchmark suites."""
|
|
2
|
+
|
|
3
|
+
from suite.builtin import REFACTOR_STORM
|
|
4
|
+
from suite.builtin.hello_world import HELLO_WORLD
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Map of suite_id -> suite definition for every built-in suite.
SUITES = {
    suite.suite_id: suite
    for suite in (REFACTOR_STORM, HELLO_WORLD)
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_suite(suite_id: str):
    """
    Look up a built-in suite by its identifier.

    Args:
        suite_id: Suite identifier to resolve.

    Returns:
        The matching suite, or None when no suite has that ID.
    """
    try:
        return SUITES[suite_id]
    except KeyError:
        return None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def list_suites() -> list[str]:
    """Return the IDs of all registered suites, sorted alphabetically."""
    return sorted(SUITES)
|
suite/result.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""SuiteResult generation for benchmark suite runs."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from config.detection import ConfigMetadata
|
|
8
|
+
from gauge.report import BenchmarkReport
|
|
9
|
+
from gauge.trust_elasticity import TrustElasticityMetric
|
|
10
|
+
from suite.definition import BenchmarkSuite
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class BehaviorScore:
    """Score for a single behavior in a suite."""
    # Identifier of the behavior (e.g. "BHV-002-refactor-complexity").
    behavior_id: str
    # Human-readable behavior name.
    name: str
    # Headline score for the behavior; generate_suite_result() sets this to
    # the trust-elasticity value.
    score: float
    # Trust-elasticity score (0-100 scale, per the runner's score coercion).
    trust_elasticity: float
    # Letter grade for the behavior.
    grade: str
    # True when the mean rollout score met the behavior's threshold.
    passed: bool
    # True when any rollout for this behavior was halted by governance.
    halted: bool
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class GovernanceFlags:
    """Suite-level governance summary."""
    # True when at least one behavior had a halted rollout.
    any_halted: bool
    # Number of behaviors with at least one halted rollout.
    halted_count: int
    # IDs of the behaviors that were halted.
    halted_behaviors: List[str]
    # Rollout-weighted mean of the per-report foundation_check_rate values.
    foundation_check_rate: float
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SuiteResult:
    """Complete result of a benchmark suite run."""
    # Identifier of the suite that was executed.
    suite_id: str
    # Version string of the suite definition.
    suite_version: str
    # Fingerprint identifying the run configuration.
    config_fingerprint: str
    # ISO-8601 UTC timestamp of when the result was generated.
    timestamp: str
    # Mean trust-elasticity score across all scored behaviors.
    headline_score: float
    # Letter grade derived from headline_score.
    grade: str
    # Per-behavior score breakdown.
    behavior_scores: List[BehaviorScore]
    # Suite-level governance summary.
    governance_flags: GovernanceFlags
    # Key used to group results that are comparable with one another.
    comparability_key: str
    # Total number of rollouts across all behaviors.
    total_rollouts: int
    # Wall-clock duration of the whole run in milliseconds.
    total_duration_ms: int
    # Optional configuration-detection metadata attached to the run.
    config_metadata: Optional[ConfigMetadata] = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _calculate_foundation_check_rate(reports: List[BenchmarkReport]) -> float:
|
|
52
|
+
total_rollouts = 0
|
|
53
|
+
total_checks = 0.0
|
|
54
|
+
for report in reports:
|
|
55
|
+
rollouts = report.get("total_rollouts", 0)
|
|
56
|
+
rate = report.get("aggregate_metrics", {}).get("foundation_check_rate", 0.0)
|
|
57
|
+
total_rollouts += rollouts
|
|
58
|
+
total_checks += rate * rollouts
|
|
59
|
+
if total_rollouts == 0:
|
|
60
|
+
return 0.0
|
|
61
|
+
return total_checks / total_rollouts
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _safe_behavior_result(report: BenchmarkReport) -> dict:
|
|
65
|
+
behaviors = report.get("behaviors", [])
|
|
66
|
+
return behaviors[0] if behaviors else {}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def generate_suite_result(
    suite: BenchmarkSuite,
    behavior_results: Dict[str, BenchmarkReport],
    config_fingerprint: str,
    duration_ms: int,
    config_metadata: Optional[ConfigMetadata] = None,
) -> SuiteResult:
    """
    Generate SuiteResult from individual behavior reports.

    Args:
        suite: Suite definition whose behaviors are being aggregated.
        behavior_results: Mapping of behavior_id -> BenchmarkReport. Behaviors
            without an entry are skipped and contribute nothing to the
            headline average.
        config_fingerprint: Fingerprint identifying the run configuration.
        duration_ms: Total wall-clock duration of the run in milliseconds.
        config_metadata: Optional config-detection metadata to attach.

    Returns:
        Aggregated SuiteResult with per-behavior scores and governance flags.
    """
    behavior_scores: List[BehaviorScore] = []
    trust_elasticities: List[float] = []
    halted_behaviors: List[str] = []
    total_rollouts = 0

    for behavior in suite.behaviors:
        report = behavior_results.get(behavior.behavior_id)
        if not report:
            # Missing (or empty) report: skip this behavior entirely.
            continue

        total_rollouts += report.get("total_rollouts", 0)
        # Each report covers a single behavior here, so only the first
        # entry of its "behaviors" list is read.
        result = _safe_behavior_result(report)
        trust_elasticity = float(result.get("trust_elasticity", 0.0))
        trust_elasticities.append(trust_elasticity)
        mean_score = float(result.get("mean_score", 0.0))
        # threshold / 10.0: presumably converts a 0-10 threshold to the 0-1
        # mean_score scale — TODO confirm against the behavior definitions.
        passed = mean_score >= (behavior.threshold / 10.0)

        governance = report.get("governance", {})
        halted = bool(governance.get("halted_rollouts", 0))
        if halted:
            halted_behaviors.append(behavior.behavior_id)

        behavior_scores.append(
            BehaviorScore(
                behavior_id=behavior.behavior_id,
                name=behavior.name,
                # Trust elasticity doubles as the behavior's headline score.
                score=trust_elasticity,
                trust_elasticity=trust_elasticity,
                # NOTE(review): the fallback grade is computed eagerly even
                # when the report already provides one.
                grade=result.get("grade", TrustElasticityMetric.score_to_grade(trust_elasticity)),
                passed=passed,
                halted=halted,
            )
        )

    # Headline is the unweighted mean of per-behavior trust elasticity.
    headline = sum(trust_elasticities) / len(trust_elasticities) if trust_elasticities else 0.0
    grade = TrustElasticityMetric.score_to_grade(headline)
    governance_flags = GovernanceFlags(
        any_halted=bool(halted_behaviors),
        halted_count=len(halted_behaviors),
        halted_behaviors=halted_behaviors,
        foundation_check_rate=_calculate_foundation_check_rate(list(behavior_results.values())),
    )

    return SuiteResult(
        suite_id=suite.suite_id,
        suite_version=suite.version,
        config_fingerprint=config_fingerprint,
        timestamp=datetime.now(timezone.utc).isoformat(),
        headline_score=headline,
        grade=grade,
        behavior_scores=behavior_scores,
        governance_flags=governance_flags,
        comparability_key=suite.comparability_key,
        total_rollouts=total_rollouts,
        total_duration_ms=duration_ms,
        config_metadata=config_metadata,
    )
|
suite/runner.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Suite runner for benchmark execution."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
import hashlib
|
|
5
|
+
from typing import Callable, Optional
|
|
6
|
+
|
|
7
|
+
from config.detection import ConfigMetadata
|
|
8
|
+
from gauge.governed_rollout import GovernedRolloutConfig, execute_governed_rollouts
|
|
9
|
+
from gauge.report import generate_benchmark_report, extract_governance_flags
|
|
10
|
+
from suite.definition import BenchmarkSuite
|
|
11
|
+
from suite.result import SuiteResult, generate_suite_result
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class SuiteRunConfig:
    """Configuration for suite execution."""
    # Suite definition to execute.
    suite: BenchmarkSuite
    # Directory the rollouts operate on.
    target_dir: str = "."
    # Seed recorded in the suite fingerprint.
    seed: Optional[int] = 42
    # Optional detected-config metadata, propagated into the SuiteResult.
    config_metadata: Optional[ConfigMetadata] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _coerce_score(value: float) -> tuple[float, float]:
|
|
24
|
+
"""
|
|
25
|
+
Coerce a score into (score_0_1, score_0_100).
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
value: Raw score from execution output.
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
Tuple of (0-1 score, 0-100 trust elasticity score).
|
|
32
|
+
"""
|
|
33
|
+
if value <= 1.0:
|
|
34
|
+
return value, value * 100.0
|
|
35
|
+
if value <= 100.0:
|
|
36
|
+
return value / 100.0, value
|
|
37
|
+
return 1.0, 100.0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _extract_score(output: dict) -> tuple[float, float]:
|
|
41
|
+
if not isinstance(output, dict):
|
|
42
|
+
return 0.0, 0.0
|
|
43
|
+
if "trust_elasticity" in output:
|
|
44
|
+
return _coerce_score(float(output["trust_elasticity"]))
|
|
45
|
+
if "score" in output:
|
|
46
|
+
return _coerce_score(float(output["score"]))
|
|
47
|
+
return 0.0, 0.0
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def run_suite(
    config: SuiteRunConfig,
    execute_fn: Callable[[int, str], dict],
) -> SuiteResult:
    """
    Execute all behaviors in a suite.

    1. For each behavior in suite.behaviors:
       a. Create GovernedRolloutConfig
       b. Execute rollouts via execute_governed_rollouts()
       c. Collect scores and governance results
    2. Aggregate into SuiteResult

    Args:
        config: Suite definition plus execution settings.
        execute_fn: Callback invoked as execute_fn(rollout_index, behavior_id);
            expected to return the execution-output dict for one rollout.

    Returns:
        Aggregated SuiteResult covering every behavior in the suite.
    """
    # Fail fast on a malformed suite before running anything.
    config.suite.ensure_valid()

    behavior_reports = {}
    total_duration_ms = 0

    for behavior in config.suite.behaviors:
        rollout_config = GovernedRolloutConfig(
            behavior_id=behavior.behavior_id,
            max_rollouts=config.suite.rollouts_per_behavior,
            halt_on_governance=True,
            target_dir=config.target_dir,
        )

        # Binds the current behavior ID into the per-rollout callback. Safe
        # despite Python's late-binding closures: the callback is consumed by
        # execute_governed_rollouts() within this same loop iteration.
        def _execute(index: int):
            return execute_fn(index, behavior.behavior_id)

        rollouts = execute_governed_rollouts(rollout_config, _execute)
        governance_flags = extract_governance_flags(rollouts)

        # Each rollout yields both a 0-1 score and a 0-100 trust-elasticity
        # score (see _extract_score/_coerce_score).
        scores_0_1 = []
        te_scores = []
        for run in rollouts:
            score_0_1, score_0_100 = _extract_score(run.execution_output)
            scores_0_1.append(score_0_1)
            te_scores.append(score_0_100)
            total_duration_ms += run.duration_ms

        report = generate_benchmark_report(
            behaviors=[behavior],
            behavior_scores={behavior.behavior_id: scores_0_1},
            trust_elasticity_scores={behavior.behavior_id: te_scores},
            config_fingerprint=_suite_fingerprint(config.suite, config.seed),
            governance_flags=governance_flags,
        )
        behavior_reports[behavior.behavior_id] = report

    return generate_suite_result(
        suite=config.suite,
        behavior_results=behavior_reports,
        config_fingerprint=_suite_fingerprint(config.suite, config.seed),
        duration_ms=total_duration_ms,
        config_metadata=config.config_metadata,
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _suite_fingerprint(suite: BenchmarkSuite, seed: Optional[int]) -> str:
|
|
109
|
+
content = f"{suite.comparability_key}:{seed}:{suite.judge_model}"
|
|
110
|
+
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
suite/thresholds.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Threshold configuration for regression gating."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Optional
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from suite.registry import get_suite
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class BehaviorThreshold:
    """Threshold configuration for a single behavior."""
    # Behavior this threshold applies to.
    behavior_id: str
    # Maximum allowed score regression, in percent (gating logic lives
    # outside this module — confirm exact semantics there).
    max_regression_pct: float = 5.0
    # Absolute score floor; None means no floor is enforced.
    min_score: Optional[float] = None
    # Whether this behavior is required by the gate.
    required: bool = True
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ThresholdConfig:
    """Suite-level threshold configuration."""
    # Suite these thresholds apply to.
    suite_id: str
    # Fallback max regression (percent) for behaviors without an override.
    default_max_regression_pct: float = 5.0
    # Fallback score floor; None means no floor by default.
    default_min_score: Optional[float] = None
    # Per-behavior overrides, keyed by behavior_id.
    behaviors: Dict[str, BehaviorThreshold] = field(default_factory=dict)
    # If True, any governance-halted behavior fails the gate.
    fail_on_any_halt: bool = True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_thresholds(path: str) -> ThresholdConfig:
    """
    Load threshold config from a YAML file.

    Args:
        path: Location of the YAML threshold file.

    Returns:
        Parsed ThresholdConfig.

    Raises:
        ValueError: when the file does not declare a suite_id.
    """
    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    suite = raw.get("suite_id")
    if not suite:
        raise ValueError("threshold config missing suite_id")

    fallback_max = float(raw.get("default_max_regression_pct", 5.0))
    fallback_min_raw = raw.get("default_min_score", None)
    fallback_min = None if fallback_min_raw is None else float(fallback_min_raw)

    def _parse_behavior(bid: str, spec: dict) -> BehaviorThreshold:
        # Per-behavior values fall back to the suite-level defaults.
        raw_min = spec.get("min_score", fallback_min)
        return BehaviorThreshold(
            behavior_id=bid,
            max_regression_pct=float(spec.get("max_regression_pct", fallback_max)),
            min_score=None if raw_min is None else float(raw_min),
            required=bool(spec.get("required", True)),
        )

    parsed = {
        bid: _parse_behavior(bid, spec)
        for bid, spec in (raw.get("behaviors") or {}).items()
    }

    return ThresholdConfig(
        suite_id=suite,
        default_max_regression_pct=fallback_max,
        default_min_score=fallback_min,
        behaviors=parsed,
        fail_on_any_halt=bool(raw.get("fail_on_any_halt", True)),
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def default_thresholds(suite_id: str) -> ThresholdConfig:
    """Build a ThresholdConfig with default gates for each suite behavior.

    Unknown suite IDs yield a config with no per-behavior entries.
    """
    config = ThresholdConfig(suite_id=suite_id)
    suite = get_suite(suite_id)
    if suite is not None:
        for entry in suite.behaviors:
            config.behaviors[entry.behavior_id] = BehaviorThreshold(
                behavior_id=entry.behavior_id,
                max_regression_pct=config.default_max_regression_pct,
                min_score=config.default_min_score,
                required=True,
            )
    return config
|