janus_labs-0.2.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as published in the public registry.
- cli/__init__.py +1 -0
- cli/__main__.py +7 -0
- cli/clipboard.py +113 -0
- cli/main.py +690 -0
- cli/output.py +97 -0
- cli/submit.py +270 -0
- config/__init__.py +1 -0
- config/detection.py +72 -0
- forge/__init__.py +5 -0
- forge/behavior.py +35 -0
- forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
- forge/behaviors/BHV-003-error-handling.yaml +28 -0
- gauge/__init__.py +17 -0
- gauge/adapter.py +134 -0
- gauge/behaviors/__init__.py +11 -0
- gauge/behaviors/code_quality.py +73 -0
- gauge/behaviors/instruction_adherence.py +52 -0
- gauge/behaviors/test_cheating.py +178 -0
- gauge/governed_rollout.py +107 -0
- gauge/judge.py +179 -0
- gauge/qualitative.py +271 -0
- gauge/report.py +210 -0
- gauge/trust_elasticity.py +172 -0
- governance/__init__.py +14 -0
- governance/bridge.py +124 -0
- governance/memory.py +116 -0
- harness/__init__.py +1 -0
- harness/artifacts.py +195 -0
- harness/executor.py +51 -0
- harness/sandbox.py +40 -0
- harness/types.py +46 -0
- janus_labs/__init__.py +16 -0
- janus_labs/__main__.py +37 -0
- janus_labs-0.2.0.dist-info/METADATA +316 -0
- janus_labs-0.2.0.dist-info/RECORD +80 -0
- janus_labs-0.2.0.dist-info/WHEEL +5 -0
- janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
- janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
- janus_labs-0.2.0.dist-info/top_level.txt +11 -0
- janus_types.py +140 -0
- probe/__init__.py +19 -0
- probe/discovery.py +194 -0
- probe/explorer.py +236 -0
- probe/mutations.py +196 -0
- probe/tracer.py +193 -0
- scaffold/__init__.py +1 -0
- scaffold/scorer.py +321 -0
- scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
- scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
- scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
- scaffold/templates/default/.gitignore +4 -0
- scaffold/templates/default/src/__init__.py +0 -0
- scaffold/templates/default/src/main.py +23 -0
- scaffold/templates/default/tests/__init__.py +0 -0
- scaffold/templates/default/tests/test_main.py +32 -0
- scaffold/workspace.py +202 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
- scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
- scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
- suite/__init__.py +16 -0
- suite/builtin/__init__.py +13 -0
- suite/builtin/hello_world.py +28 -0
- suite/builtin/refactor_storm.py +92 -0
- suite/comparison.py +274 -0
- suite/definition.py +51 -0
- suite/export/__init__.py +6 -0
- suite/export/github.py +58 -0
- suite/export/html.py +160 -0
- suite/export/json_export.py +65 -0
- suite/registry.py +20 -0
- suite/result.py +133 -0
- suite/runner.py +110 -0
- suite/thresholds.py +80 -0
cli/main.py
ADDED
@@ -0,0 +1,690 @@
"""Command-line interface for Janus Labs."""

import argparse
import json
import os
import tempfile
from pathlib import Path
import sys

from config.detection import detect_config
from suite.comparison import (
    compare_results,
    comparison_to_dict,
    export_comparison_json,
    print_comparison_text,
)
from suite.export.github import generate_github_summary, print_github_annotations
from suite.export.html import export_html
from suite.export.json_export import export_json, load_json
from suite.registry import get_suite
from suite.runner import SuiteRunConfig, run_suite
from suite.thresholds import default_thresholds, load_thresholds

from cli.submit import cmd_submit, submit_result
from cli.output import print_benchmark_result, print_step, print_error, print_warning
from cli.clipboard import copy_to_clipboard, is_clipboard_available


def main():
    parser = argparse.ArgumentParser(
        prog="janus-labs",
        description="Janus Labs - 3DMark for AI Agents",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    run_parser = subparsers.add_parser("run", help="Run a benchmark suite")
    run_parser.add_argument("--suite", required=True, help="Suite ID to run")
    run_parser.add_argument("--output", "-o", default="result.json", help="Output file")
    run_parser.add_argument(
        "--format",
        choices=["json", "html", "both"],
        default="json",
    )
    run_parser.add_argument(
        "--judge",
        action="store_true",
        help="Use LLM-as-judge scoring via GEval (requires API key, slower but differentiates)",
    )
    run_parser.add_argument(
        "--model",
        default="gpt-4o",
        help="LLM model for judge scoring (default: gpt-4o)",
    )

    compare_parser = subparsers.add_parser("compare", help="Compare two results")
    compare_parser.add_argument("baseline", help="Baseline result JSON")
    compare_parser.add_argument("current", help="Current result JSON")
    compare_parser.add_argument(
        "--threshold",
        type=float,
        default=5.0,
        help="Regression threshold (%%)",
    )
    compare_parser.add_argument(
        "--config",
        "-c",
        help="Threshold config YAML file (default: use suite defaults)",
    )
    compare_parser.add_argument(
        "--output",
        "-o",
        help="Output comparison result to JSON file",
    )
    compare_parser.add_argument(
        "--format",
        choices=["text", "json", "github"],
        default="text",
        help="Output format (github = GitHub Actions annotations)",
    )

    export_parser = subparsers.add_parser("export", help="Export result to format")
    export_parser.add_argument("input", help="Input JSON result")
    export_parser.add_argument("--format", choices=["html", "json"], required=True)
    export_parser.add_argument("--output", "-o", help="Output file")

    baseline_parser = subparsers.add_parser("baseline", help="Manage baselines")
    baseline_sub = baseline_parser.add_subparsers(dest="baseline_command", required=True)
    update_parser = baseline_sub.add_parser("update", help="Update baseline from current result")
    update_parser.add_argument("result", help="Current result JSON to promote to baseline")
    update_parser.add_argument("--output", "-o", default="baseline.json", help="Baseline output path")
    update_parser.add_argument("--force", "-f", action="store_true", help="Overwrite existing baseline")

    show_parser = baseline_sub.add_parser("show", help="Show baseline info")
    show_parser.add_argument("baseline", help="Baseline JSON file")

    # Submit command (E6 Community Platform)
    submit_parser = subparsers.add_parser("submit", help="Submit results to leaderboard")
    submit_parser.add_argument("result_file", help="Path to result.json")
    submit_parser.add_argument(
        "--dry-run", action="store_true", help="Show payload without submitting"
    )
    submit_parser.add_argument(
        "--github", type=str, help="GitHub handle for attribution"
    )

    # Init command (E7 Agent Execution)
    init_parser = subparsers.add_parser("init", help="Initialize task workspace")
    init_parser.add_argument("--suite", required=True, help="Suite ID")
    init_parser.add_argument("--behavior", required=True, help="Behavior ID")
    init_parser.add_argument(
        "--output", "-o",
        default="./janus-task",
        help="Output directory for workspace",
    )

    # Score command (E7 Agent Execution)
    score_parser = subparsers.add_parser("score", help="Score completed task")
    score_parser.add_argument(
        "--workspace", "-w",
        default=".",
        help="Path to task workspace (default: current directory)",
    )
    score_parser.add_argument(
        "--output", "-o",
        help="Output result to JSON file",
    )
    score_parser.add_argument(
        "--judge",
        action="store_true",
        help="Enable LLM-as-judge scoring via DeepEval GEval (requires API key)",
    )
    score_parser.add_argument(
        "--model",
        default="gpt-4o",
        help="LLM model for judge scoring (default: gpt-4o)",
    )
    score_parser.add_argument(
        "--bundle",
        help="Path to bundle.json for judge scoring (optional, uses mock if not provided)",
    )

    # Bench command (Tinkerer-First P0) - Zero-friction benchmark
    bench_parser = subparsers.add_parser(
        "bench",
        help="Run benchmark with zero friction (detect config, score, optionally submit)",
    )
    bench_parser.add_argument(
        "--suite",
        default="refactor-storm",
        help="Suite ID (default: refactor-storm)",
    )
    bench_parser.add_argument(
        "--behavior",
        default="BHV-001-test-cheating",
        help="Behavior ID (default: BHV-001-test-cheating)",
    )
    bench_parser.add_argument(
        "--submit",
        action="store_true",
        help="Submit results to public leaderboard",
    )
    bench_parser.add_argument(
        "--github",
        type=str,
        help="GitHub handle for attribution (requires --submit)",
    )
    bench_parser.add_argument(
        "--model",
        default="gpt-4o",
        help="LLM model for judge scoring (default: gpt-4o)",
    )
    bench_parser.add_argument(
        "--no-copy",
        dest="copy",
        action="store_false",
        help="Don't copy share URL to clipboard after submit",
    )
    bench_parser.add_argument(
        "--output", "-o",
        help="Output result to JSON file (default: temp file)",
    )

    args = parser.parse_args()

    if args.command == "run":
        return cmd_run(args)
    if args.command == "compare":
        return cmd_compare(args)
    if args.command == "export":
        return cmd_export(args)
    if args.command == "baseline":
        return cmd_baseline(args)
    if args.command == "submit":
        return cmd_submit(args)
    if args.command == "init":
        return cmd_init(args)
    if args.command == "score":
        return cmd_score(args)
    if args.command == "bench":
        return cmd_bench(args)
    return 1


def _default_execute_fn(rollout_index: int, behavior_id: str) -> dict:
    """Hash-based stub for testing without LLM. Produces deterministic fake scores."""
    value = abs(hash(f"{behavior_id}:{rollout_index}")) % 100
    score = 0.6 + (value / 250.0)
    return {"score": min(score, 0.99)}


def _create_geval_execute_fn(suite, model: str):
    """
    Create an execute function that uses GEval LLM-as-judge scoring.

    This replaces the hash stub with real LLM evaluation, providing
    differentiated scores based on qualitative assessment.

    Args:
        suite: BenchmarkSuite containing behavior specs
        model: LLM model for judging (e.g., gpt-4o, claude-3-5-sonnet)

    Returns:
        Execute function compatible with run_suite()
    """
    from gauge.judge import score_with_judge, create_mock_bundle

    # Build behavior lookup for O(1) access
    behavior_map = {b.behavior_id: b for b in suite.behaviors}

    def execute_fn(rollout_index: int, behavior_id: str) -> dict:
        behavior = behavior_map.get(behavior_id)
        if behavior is None:
            print(f"Warning: Unknown behavior {behavior_id}", file=sys.stderr)
            return {"score": 0.0}

        # Create mock bundle for GEval evaluation
        # In real usage, this would contain actual agent execution artifacts
        mock_diff = f"""
# Mock solution for {behavior.name}
# Rollout {rollout_index}

def solution():
    '''Implementation of {behavior_id}'''
    # Code would be captured from actual agent execution
    pass
"""
        bundle = create_mock_bundle(
            code_diff=mock_diff,
            test_output="All tests passed (mock)",
            exit_code="success",
        )

        # Use outcome score of 0.7 as baseline (reasonable mock performance)
        # Real implementation would use actual outcome-based scoring
        baseline_outcome = 0.7

        try:
            result = score_with_judge(
                behavior=behavior,
                bundle=bundle,
                outcome_score=baseline_outcome,
                model=model,
            )
            return {"score": result.combined_score}
        except ValueError as e:
            print(f"Judge error: {e}", file=sys.stderr)
            return {"score": 0.0}

    return execute_fn


def cmd_init(args) -> int:
    """Initialize a task workspace for outcome-based benchmarking."""
    from scaffold.workspace import init_workspace

    suite = get_suite(args.suite)
    if suite is None:
        print(f"Unknown suite: {args.suite}", file=sys.stderr)
        return 1

    # Find the behavior
    behavior = None
    for b in suite.behaviors:
        if b.behavior_id == args.behavior:
            behavior = b
            break

    if behavior is None:
        print(f"Unknown behavior: {args.behavior}", file=sys.stderr)
        print(f"Available behaviors in {args.suite}:")
        for b in suite.behaviors:
            print(f" - {b.behavior_id}: {b.name}")
        return 1

    output_dir = Path(args.output)
    if output_dir.exists() and any(output_dir.iterdir()):
        print(f"Directory not empty: {output_dir}", file=sys.stderr)
        print("Use a new directory or clear the existing one.")
        return 1

    metadata = init_workspace(output_dir, suite, behavior)

    print(f"Task workspace initialized: {output_dir}")
    print(f" Suite: {suite.suite_id}")
    print(f" Behavior: {behavior.behavior_id} - {behavior.name}")
    print()
    print("Next steps:")
    print(f" 1. cd {output_dir}")
    print(" 2. Open in VS Code and run your AI agent")
    print(" 3. Run: janus score")

    return 0


def cmd_score(args) -> int:
    """Score a completed task workspace."""
    from scaffold.workspace import load_task_metadata
    from scaffold.scorer import score_outcome

    workspace = Path(args.workspace).resolve()

    metadata = load_task_metadata(workspace)
    if metadata is None:
        print("Not a Janus Labs task workspace.", file=sys.stderr)
        print("No .janus-task.json found.")
        print()
        print("To create a task workspace: janus init --suite <suite> --behavior <behavior>")
        return 1

    print(f"Scoring task: {metadata.behavior_id}")
    print(f" Suite: {metadata.suite_id}")
    print(f" Behavior: {metadata.behavior_name}")
    print()

    # Outcome-based scoring (always runs)
    result = score_outcome(
        workspace_dir=workspace,
        behavior_id=metadata.behavior_id,
        threshold=metadata.threshold,
        rubric=metadata.rubric,
    )

    # Display outcome results
    status = "PASS" if result.passed_threshold else "FAIL"
    print(f"Outcome Score: {result.raw_score:.1f}/10 ({status})")
    print(f"Threshold: {metadata.threshold}")
    print()
    print("Scoring notes:")
    for note in result.scoring_notes:
        print(f" - {note}")
    print()
    print(f"Files changed: {len(result.git_diff['files_changed'])}")
    for f in result.git_diff["files_changed"]:
        print(f" - {f}")
    print()
    print(f"Tests: {result.test_results['passed']} passed, {result.test_results['failed']} failed")

    # Judge scoring (optional, requires --judge flag)
    judge_result = None
    if args.judge:
        print()
        print("=" * 60)
        print("LLM-as-Judge Scoring (GEval)")
        print("=" * 60)

        try:
            from gauge.judge import score_with_judge, load_bundle_from_file, create_mock_bundle
            from forge.behavior import BehaviorSpec

            # Reconstruct BehaviorSpec from metadata
            behavior = BehaviorSpec(
                behavior_id=metadata.behavior_id,
                name=metadata.behavior_name,
                description=metadata.behavior_description,
                rubric=metadata.rubric,
                threshold=metadata.threshold,
                disconfirmers=metadata.disconfirmers,
                taxonomy_code=metadata.taxonomy_code,
            )

            # Load bundle: explicit file > captured bundle > mock bundle
            if args.bundle:
                print(f"Loading bundle from: {args.bundle}")
                bundle = load_bundle_from_file(args.bundle)
            elif result.bundle:
                # E8-S2: Use real captured bundle from workspace
                print("Using captured workspace bundle")
                bundle = result.bundle
            else:
                print("Using mock bundle (no bundle captured)")
                bundle = create_mock_bundle(
                    code_diff=result.git_diff.get("patch", ""),
                    test_output=result.test_results.get("output", ""),
                    exit_code="success" if result.passed_threshold else "error",
                )

            print(f"Model: {args.model}")
            print()

            judge_result = score_with_judge(
                behavior=behavior,
                bundle=bundle,
                outcome_score=result.normalized_score,
                model=args.model,
            )

            print(f"GEval Score: {judge_result.geval_score_10:.1f}/10")
            print(f"Reason: {judge_result.reason}")
            print()
            print(f"Combined Score: {judge_result.combined_score_10:.1f}/10")
            print(" (40% outcome + 60% qualitative)")

        except ValueError as e:
            print(f"Judge scoring failed: {e}", file=sys.stderr)
            return 1
        except Exception as e:
            print(f"Judge scoring error: {e}", file=sys.stderr)
            # Continue without judge score

    # Output results
    if args.output:
        output_data = {
            "behavior_id": result.behavior_id,
            "outcome_score": result.raw_score,
            "normalized_score": result.normalized_score,
            "passed": result.passed_threshold,
            "threshold": metadata.threshold,
            "notes": result.scoring_notes,
            "git_diff": result.git_diff,
            "test_results": result.test_results,
        }

        if judge_result:
            output_data["judge"] = {
                "geval_score": judge_result.geval_score_10,
                "combined_score": judge_result.combined_score_10,
                "reason": judge_result.reason,
                "model": judge_result.model,
            }

        Path(args.output).write_text(json.dumps(output_data, indent=2))
        print()
        print(f"Results saved to: {args.output}")

        # E8-S2: Save bundle for future GEval evaluation
        if result.bundle:
            bundle_path = Path(args.output).with_suffix(".bundle.json")
            bundle_path.write_text(json.dumps(result.bundle, indent=2))
            print(f"Bundle saved to: {bundle_path}")

    return 0 if result.passed_threshold else 1


def cmd_bench(args) -> int:
    """
    Zero-friction benchmark command for tinkerers.

    Detects config, runs GEval scoring, and optionally submits to leaderboard.
    """
    from scaffold.workspace import init_workspace

    # Step 1: Detect config
    print_step(1, 4, "Detecting config")
    config_metadata = detect_config(Path.cwd())
    if config_metadata.config_source == "custom":
        print(f" Found: {', '.join(config_metadata.config_files)} (hash: {config_metadata.config_hash})")
    else:
        print(" Using default config")

    # Step 2: Get suite and behavior
    print_step(2, 4, "Loading behavior")
    suite = get_suite(args.suite)
    if suite is None:
        print_error(f"Unknown suite: {args.suite}")
        return 1

    behavior = None
    for b in suite.behaviors:
        if b.behavior_id == args.behavior:
            behavior = b
            break

    if behavior is None:
        print_error(f"Unknown behavior: {args.behavior}")
        print(f"Available behaviors in {args.suite}:")
        for b in suite.behaviors:
            print(f" - {b.behavior_id}: {b.name}")
        return 1

    print(f" Behavior: {behavior.behavior_id} - {behavior.name}")

    # Step 3: Score with GEval
    print_step(3, 4, f"Scoring with GEval ({args.model})")

    # Run the suite for single behavior
    config = SuiteRunConfig(suite=suite, config_metadata=config_metadata)
    execute_fn = _create_geval_execute_fn(suite, args.model)

    try:
        result = run_suite(config, execute_fn)
    except Exception as e:
        print_error(f"Scoring failed: {e}")
        return 1

    # Save result to file
    if args.output:
        output_path = Path(args.output)
    else:
        # Use temp file if no output specified
        fd, tmp_path = tempfile.mkstemp(suffix=".json", prefix="janus-bench-")
        os.close(fd)
        output_path = Path(tmp_path)

    export_json(result, str(output_path))
    print(f" Score: {result.headline_score:.1f} (Grade {result.grade})")

    # Step 4: Submit if requested
    if args.submit:
        print_step(4, 4, "Submitting to leaderboard")
        try:
            submit_data = submit_result(
                str(output_path),
                github_handle=args.github,
                dry_run=False,
            )

            if submit_data.get("status") == "success":
                # Print rich result
                print_benchmark_result(
                    score=submit_data["score"],
                    rank=submit_data.get("rank"),
                    percentile=submit_data.get("percentile"),
                    share_url=submit_data.get("share_url"),
                )

                # Copy to clipboard if available and not disabled
                if args.copy and submit_data.get("share_url"):
                    if is_clipboard_available():
                        if copy_to_clipboard(submit_data["share_url"]):
                            print("\nCopied to clipboard!")
                        else:
                            print_warning("Could not copy to clipboard")

            return 0

        except RuntimeError as e:
            print_error(f"Submit failed: {e}")
            # Still show local result
            print_benchmark_result(
                score=result.headline_score,
                grade=result.grade,
            )
            print(f"\nResult saved to: {output_path}")
            return 1
    else:
        # Not submitting - show local result only
        print_step(4, 4, "Complete (not submitting)")
        print_benchmark_result(
            score=result.headline_score,
            grade=result.grade,
        )
        print(f"\nResult saved to: {output_path}")
        print("\nTo submit to leaderboard, run:")
        print(f" janus submit {output_path}")
        print("Or use: janus bench --submit")
        return 0


def cmd_run(args) -> int:
    """Run a benchmark suite."""
    suite = get_suite(args.suite)
    if suite is None:
        print(f"Unknown suite: {args.suite}", file=sys.stderr)
        return 1

    config_metadata = detect_config(Path.cwd())
    config = SuiteRunConfig(suite=suite, config_metadata=config_metadata)

    # Select execute function based on --judge flag
    if args.judge:
        print(f"Using LLM-as-judge scoring (model: {args.model})")
        print("This requires an API key and will be slower but provides differentiated scores.")
        print()
        execute_fn = _create_geval_execute_fn(suite, args.model)
    else:
        execute_fn = _default_execute_fn

    result = run_suite(config, execute_fn)

    output_path = Path(args.output)
    if args.format in ("json", "both"):
        export_json(result, str(output_path))
    if args.format in ("html", "both"):
        html_path = output_path.with_suffix(".html")
        export_html(result, str(html_path))

    print(f"Suite {suite.suite_id} complete. Headline score: {result.headline_score:.1f} ({result.grade})")
    if args.judge:
        print(f"Scored with GEval ({args.model})")
    return 0


def cmd_compare(args) -> int:
    """Compare two results for regression."""
    baseline = load_json(args.baseline)
    current = load_json(args.current)

    if args.config:
        config = load_thresholds(args.config)
    else:
        config = default_thresholds(baseline.suite_id)

    if config.suite_id != baseline.suite_id:
        print("Threshold suite_id does not match baseline suite.", file=sys.stderr)
        return 2

    result = compare_results(baseline, current, config)

    if args.output:
        export_comparison_json(result, args.output)

    if args.format == "text":
        print_comparison_text(result)
    elif args.format == "json":
        print(json.dumps(comparison_to_dict(result), indent=2))
    elif args.format == "github":
        print_github_annotations(result)
        summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
        if summary_path:
            Path(summary_path).write_text(generate_github_summary(result), encoding="utf-8")

    return result.exit_code


def cmd_export(args) -> int:
    """Export result to different format."""
    result = load_json(args.input)
    output = args.output

    if args.format == "html":
        if output is None:
            output = str(Path(args.input).with_suffix(".html"))
        export_html(result, output)
        return 0

    if args.format == "json":
        if output is None:
            output = str(Path(args.input).with_suffix(".json"))
        export_json(result, output)
        return 0

    return 1


def cmd_baseline(args) -> int:
    """Baseline management command."""
    if args.baseline_command == "update":
        return cmd_baseline_update(args)
    if args.baseline_command == "show":
        return cmd_baseline_show(args)
    return 1


def cmd_baseline_update(args) -> int:
    """Promote result to baseline."""
    result = load_json(args.result)
    output = Path(args.output)

    if output.exists() and not args.force:
        print(f"Baseline exists: {output}. Use --force to overwrite.", file=sys.stderr)
        return 1

    export_json(result, str(output))
    print(f"Baseline updated: {output}")
    print(f" Suite: {result.suite_id} v{result.suite_version}")
    print(f" Score: {result.headline_score:.1f} ({result.grade})")
    return 0


def cmd_baseline_show(args) -> int:
    """Show baseline info."""
    result = load_json(args.baseline)
    print(f"Suite: {result.suite_id} v{result.suite_version}")
    print(f"Score: {result.headline_score:.1f} ({result.grade})")
    print(f"Comparability key: {result.comparability_key}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
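
Note (not part of the package): the sketch below illustrates the execute-function contract this file wires into run_suite(): a SuiteRunConfig plus a callable with the signature (rollout_index: int, behavior_id: str) -> dict that returns a "score" value, the same contract satisfied by _default_execute_fn and _create_geval_execute_fn above. The suite ID, constant score, and output path are assumptions chosen to mirror the defaults visible in cmd_bench; this is a minimal sketch, not a supported API example.

# Minimal sketch (assumption-laden, not from the package): plug a custom
# execute function into the same run_suite() entry point used by cmd_run/cmd_bench.
from pathlib import Path

from config.detection import detect_config
from suite.export.json_export import export_json
from suite.registry import get_suite
from suite.runner import SuiteRunConfig, run_suite


def my_execute_fn(rollout_index: int, behavior_id: str) -> dict:
    # Same contract as _default_execute_fn: return a dict with a "score" value.
    # A real integration would run an agent for this behavior and score its output.
    return {"score": 0.75}  # placeholder value, assumed for illustration


suite = get_suite("refactor-storm")  # default suite ID taken from the bench command above
config = SuiteRunConfig(suite=suite, config_metadata=detect_config(Path.cwd()))
result = run_suite(config, my_execute_fn)
export_json(result, "result.json")  # assumed output path
print(f"Headline score: {result.headline_score:.1f} ({result.grade})")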