janus_labs-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. cli/__init__.py +1 -0
  2. cli/__main__.py +7 -0
  3. cli/clipboard.py +113 -0
  4. cli/main.py +690 -0
  5. cli/output.py +97 -0
  6. cli/submit.py +270 -0
  7. config/__init__.py +1 -0
  8. config/detection.py +72 -0
  9. forge/__init__.py +5 -0
  10. forge/behavior.py +35 -0
  11. forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
  12. forge/behaviors/BHV-003-error-handling.yaml +28 -0
  13. gauge/__init__.py +17 -0
  14. gauge/adapter.py +134 -0
  15. gauge/behaviors/__init__.py +11 -0
  16. gauge/behaviors/code_quality.py +73 -0
  17. gauge/behaviors/instruction_adherence.py +52 -0
  18. gauge/behaviors/test_cheating.py +178 -0
  19. gauge/governed_rollout.py +107 -0
  20. gauge/judge.py +179 -0
  21. gauge/qualitative.py +271 -0
  22. gauge/report.py +210 -0
  23. gauge/trust_elasticity.py +172 -0
  24. governance/__init__.py +14 -0
  25. governance/bridge.py +124 -0
  26. governance/memory.py +116 -0
  27. harness/__init__.py +1 -0
  28. harness/artifacts.py +195 -0
  29. harness/executor.py +51 -0
  30. harness/sandbox.py +40 -0
  31. harness/types.py +46 -0
  32. janus_labs/__init__.py +16 -0
  33. janus_labs/__main__.py +37 -0
  34. janus_labs-0.2.0.dist-info/METADATA +316 -0
  35. janus_labs-0.2.0.dist-info/RECORD +80 -0
  36. janus_labs-0.2.0.dist-info/WHEEL +5 -0
  37. janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
  38. janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
  39. janus_labs-0.2.0.dist-info/top_level.txt +11 -0
  40. janus_types.py +140 -0
  41. probe/__init__.py +19 -0
  42. probe/discovery.py +194 -0
  43. probe/explorer.py +236 -0
  44. probe/mutations.py +196 -0
  45. probe/tracer.py +193 -0
  46. scaffold/__init__.py +1 -0
  47. scaffold/scorer.py +321 -0
  48. scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
  49. scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
  50. scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
  51. scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
  52. scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
  53. scaffold/templates/default/.gitignore +4 -0
  54. scaffold/templates/default/src/__init__.py +0 -0
  55. scaffold/templates/default/src/main.py +23 -0
  56. scaffold/templates/default/tests/__init__.py +0 -0
  57. scaffold/templates/default/tests/test_main.py +32 -0
  58. scaffold/workspace.py +202 -0
  59. scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
  60. scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
  61. scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
  62. scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
  63. scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
  64. scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
  65. scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
  66. scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
  67. suite/__init__.py +16 -0
  68. suite/builtin/__init__.py +13 -0
  69. suite/builtin/hello_world.py +28 -0
  70. suite/builtin/refactor_storm.py +92 -0
  71. suite/comparison.py +274 -0
  72. suite/definition.py +51 -0
  73. suite/export/__init__.py +6 -0
  74. suite/export/github.py +58 -0
  75. suite/export/html.py +160 -0
  76. suite/export/json_export.py +65 -0
  77. suite/registry.py +20 -0
  78. suite/result.py +133 -0
  79. suite/runner.py +110 -0
  80. suite/thresholds.py +80 -0
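For orientation before the full diff: the modules listed above expose a small programmatic surface that cli/main.py itself drives (get_suite, run_suite, SuiteRunConfig, detect_config, export_json). Below is a minimal sketch of calling that surface directly from Python, assuming only the imports visible in cli/main.py; the stand-in execute function mirrors the CLI's hash stub and is illustrative, not part of the package.

from pathlib import Path

from config.detection import detect_config
from suite.export.json_export import export_json
from suite.registry import get_suite
from suite.runner import SuiteRunConfig, run_suite

# Look up a built-in suite and run it with a stand-in execute function.
suite = get_suite("refactor-storm")
config = SuiteRunConfig(suite=suite, config_metadata=detect_config(Path.cwd()))

def execute_fn(rollout_index: int, behavior_id: str) -> dict:
    # Placeholder scorer; real runs swap in LLM-as-judge scoring (see cli/main.py below).
    return {"score": 0.7}

result = run_suite(config, execute_fn)
export_json(result, "result.json")
print(f"{result.headline_score:.1f} ({result.grade})")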
cli/main.py ADDED
@@ -0,0 +1,690 @@
+ """Command-line interface for Janus Labs."""
+
+ import argparse
+ import json
+ import os
+ import tempfile
+ from pathlib import Path
+ import sys
+
+ from config.detection import detect_config
+ from suite.comparison import (
+     compare_results,
+     comparison_to_dict,
+     export_comparison_json,
+     print_comparison_text,
+ )
+ from suite.export.github import generate_github_summary, print_github_annotations
+ from suite.export.html import export_html
+ from suite.export.json_export import export_json, load_json
+ from suite.registry import get_suite
+ from suite.runner import SuiteRunConfig, run_suite
+ from suite.thresholds import default_thresholds, load_thresholds
+
+ from cli.submit import cmd_submit, submit_result
+ from cli.output import print_benchmark_result, print_step, print_error, print_warning
+ from cli.clipboard import copy_to_clipboard, is_clipboard_available
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         prog="janus-labs",
+         description="Janus Labs - 3DMark for AI Agents",
+     )
+     subparsers = parser.add_subparsers(dest="command", required=True)
+
+     run_parser = subparsers.add_parser("run", help="Run a benchmark suite")
+     run_parser.add_argument("--suite", required=True, help="Suite ID to run")
+     run_parser.add_argument("--output", "-o", default="result.json", help="Output file")
+     run_parser.add_argument(
+         "--format",
+         choices=["json", "html", "both"],
+         default="json",
+     )
+     run_parser.add_argument(
+         "--judge",
+         action="store_true",
+         help="Use LLM-as-judge scoring via GEval (requires API key, slower but differentiates)",
+     )
+     run_parser.add_argument(
+         "--model",
+         default="gpt-4o",
+         help="LLM model for judge scoring (default: gpt-4o)",
+     )
+
+     compare_parser = subparsers.add_parser("compare", help="Compare two results")
+     compare_parser.add_argument("baseline", help="Baseline result JSON")
+     compare_parser.add_argument("current", help="Current result JSON")
+     compare_parser.add_argument(
+         "--threshold",
+         type=float,
+         default=5.0,
+         help="Regression threshold (%%)",
+     )
+     compare_parser.add_argument(
+         "--config",
+         "-c",
+         help="Threshold config YAML file (default: use suite defaults)",
+     )
+     compare_parser.add_argument(
+         "--output",
+         "-o",
+         help="Output comparison result to JSON file",
+     )
+     compare_parser.add_argument(
+         "--format",
+         choices=["text", "json", "github"],
+         default="text",
+         help="Output format (github = GitHub Actions annotations)",
+     )
+
+     export_parser = subparsers.add_parser("export", help="Export result to format")
+     export_parser.add_argument("input", help="Input JSON result")
+     export_parser.add_argument("--format", choices=["html", "json"], required=True)
+     export_parser.add_argument("--output", "-o", help="Output file")
+
+     baseline_parser = subparsers.add_parser("baseline", help="Manage baselines")
+     baseline_sub = baseline_parser.add_subparsers(dest="baseline_command", required=True)
+     update_parser = baseline_sub.add_parser("update", help="Update baseline from current result")
+     update_parser.add_argument("result", help="Current result JSON to promote to baseline")
+     update_parser.add_argument("--output", "-o", default="baseline.json", help="Baseline output path")
+     update_parser.add_argument("--force", "-f", action="store_true", help="Overwrite existing baseline")
+
+     show_parser = baseline_sub.add_parser("show", help="Show baseline info")
+     show_parser.add_argument("baseline", help="Baseline JSON file")
+
+     # Submit command (E6 Community Platform)
+     submit_parser = subparsers.add_parser("submit", help="Submit results to leaderboard")
+     submit_parser.add_argument("result_file", help="Path to result.json")
+     submit_parser.add_argument(
+         "--dry-run", action="store_true", help="Show payload without submitting"
+     )
+     submit_parser.add_argument(
+         "--github", type=str, help="GitHub handle for attribution"
+     )
+
+     # Init command (E7 Agent Execution)
+     init_parser = subparsers.add_parser("init", help="Initialize task workspace")
+     init_parser.add_argument("--suite", required=True, help="Suite ID")
+     init_parser.add_argument("--behavior", required=True, help="Behavior ID")
+     init_parser.add_argument(
+         "--output", "-o",
+         default="./janus-task",
+         help="Output directory for workspace",
+     )
+
+     # Score command (E7 Agent Execution)
+     score_parser = subparsers.add_parser("score", help="Score completed task")
+     score_parser.add_argument(
+         "--workspace", "-w",
+         default=".",
+         help="Path to task workspace (default: current directory)",
+     )
+     score_parser.add_argument(
+         "--output", "-o",
+         help="Output result to JSON file",
+     )
+     score_parser.add_argument(
+         "--judge",
+         action="store_true",
+         help="Enable LLM-as-judge scoring via DeepEval GEval (requires API key)",
+     )
+     score_parser.add_argument(
+         "--model",
+         default="gpt-4o",
+         help="LLM model for judge scoring (default: gpt-4o)",
+     )
+     score_parser.add_argument(
+         "--bundle",
+         help="Path to bundle.json for judge scoring (optional, uses mock if not provided)",
+     )
+
+     # Bench command (Tinkerer-First P0) - Zero-friction benchmark
+     bench_parser = subparsers.add_parser(
+         "bench",
+         help="Run benchmark with zero friction (detect config, score, optionally submit)",
+     )
+     bench_parser.add_argument(
+         "--suite",
+         default="refactor-storm",
+         help="Suite ID (default: refactor-storm)",
+     )
+     bench_parser.add_argument(
+         "--behavior",
+         default="BHV-001-test-cheating",
+         help="Behavior ID (default: BHV-001-test-cheating)",
+     )
+     bench_parser.add_argument(
+         "--submit",
+         action="store_true",
+         help="Submit results to public leaderboard",
+     )
+     bench_parser.add_argument(
+         "--github",
+         type=str,
+         help="GitHub handle for attribution (requires --submit)",
+     )
+     bench_parser.add_argument(
+         "--model",
+         default="gpt-4o",
+         help="LLM model for judge scoring (default: gpt-4o)",
+     )
+     bench_parser.add_argument(
+         "--no-copy",
+         dest="copy",
+         action="store_false",
+         help="Don't copy share URL to clipboard after submit",
+     )
+     bench_parser.add_argument(
+         "--output", "-o",
+         help="Output result to JSON file (default: temp file)",
+     )
+
+     args = parser.parse_args()
+
+     if args.command == "run":
+         return cmd_run(args)
+     if args.command == "compare":
+         return cmd_compare(args)
+     if args.command == "export":
+         return cmd_export(args)
+     if args.command == "baseline":
+         return cmd_baseline(args)
+     if args.command == "submit":
+         return cmd_submit(args)
+     if args.command == "init":
+         return cmd_init(args)
+     if args.command == "score":
+         return cmd_score(args)
+     if args.command == "bench":
+         return cmd_bench(args)
+     return 1
+
+
+ def _default_execute_fn(rollout_index: int, behavior_id: str) -> dict:
+     """Hash-based stub for testing without LLM. Produces deterministic fake scores."""
+     value = abs(hash(f"{behavior_id}:{rollout_index}")) % 100
+     score = 0.6 + (value / 250.0)
+     return {"score": min(score, 0.99)}
+
+
+ def _create_geval_execute_fn(suite, model: str):
+     """
+     Create an execute function that uses GEval LLM-as-judge scoring.
+
+     This replaces the hash stub with real LLM evaluation, providing
+     differentiated scores based on qualitative assessment.
+
+     Args:
+         suite: BenchmarkSuite containing behavior specs
+         model: LLM model for judging (e.g., gpt-4o, claude-3-5-sonnet)
+
+     Returns:
+         Execute function compatible with run_suite()
+     """
+     from gauge.judge import score_with_judge, create_mock_bundle
+
+     # Build behavior lookup for O(1) access
+     behavior_map = {b.behavior_id: b for b in suite.behaviors}
+
+     def execute_fn(rollout_index: int, behavior_id: str) -> dict:
+         behavior = behavior_map.get(behavior_id)
+         if behavior is None:
+             print(f"Warning: Unknown behavior {behavior_id}", file=sys.stderr)
+             return {"score": 0.0}
+
+         # Create mock bundle for GEval evaluation
+         # In real usage, this would contain actual agent execution artifacts
+         mock_diff = f"""
+ # Mock solution for {behavior.name}
+ # Rollout {rollout_index}
+
+ def solution():
+     '''Implementation of {behavior_id}'''
+     # Code would be captured from actual agent execution
+     pass
+ """
+         bundle = create_mock_bundle(
+             code_diff=mock_diff,
+             test_output="All tests passed (mock)",
+             exit_code="success",
+         )
+
+         # Use outcome score of 0.7 as baseline (reasonable mock performance)
+         # Real implementation would use actual outcome-based scoring
+         baseline_outcome = 0.7
+
+         try:
+             result = score_with_judge(
+                 behavior=behavior,
+                 bundle=bundle,
+                 outcome_score=baseline_outcome,
+                 model=model,
+             )
+             return {"score": result.combined_score}
+         except ValueError as e:
+             print(f"Judge error: {e}", file=sys.stderr)
+             return {"score": 0.0}
+
+     return execute_fn
+
+
+ def cmd_init(args) -> int:
+     """Initialize a task workspace for outcome-based benchmarking."""
+     from scaffold.workspace import init_workspace
+
+     suite = get_suite(args.suite)
+     if suite is None:
+         print(f"Unknown suite: {args.suite}", file=sys.stderr)
+         return 1
+
+     # Find the behavior
+     behavior = None
+     for b in suite.behaviors:
+         if b.behavior_id == args.behavior:
+             behavior = b
+             break
+
+     if behavior is None:
+         print(f"Unknown behavior: {args.behavior}", file=sys.stderr)
+         print(f"Available behaviors in {args.suite}:")
+         for b in suite.behaviors:
+             print(f" - {b.behavior_id}: {b.name}")
+         return 1
+
+     output_dir = Path(args.output)
+     if output_dir.exists() and any(output_dir.iterdir()):
+         print(f"Directory not empty: {output_dir}", file=sys.stderr)
+         print("Use a new directory or clear the existing one.")
+         return 1
+
+     metadata = init_workspace(output_dir, suite, behavior)
+
+     print(f"Task workspace initialized: {output_dir}")
+     print(f" Suite: {suite.suite_id}")
+     print(f" Behavior: {behavior.behavior_id} - {behavior.name}")
+     print()
+     print("Next steps:")
+     print(f" 1. cd {output_dir}")
+     print(" 2. Open in VS Code and run your AI agent")
+     print(" 3. Run: janus score")
+
+     return 0
+
+
+ def cmd_score(args) -> int:
+     """Score a completed task workspace."""
+     from scaffold.workspace import load_task_metadata
+     from scaffold.scorer import score_outcome
+
+     workspace = Path(args.workspace).resolve()
+
+     metadata = load_task_metadata(workspace)
+     if metadata is None:
+         print("Not a Janus Labs task workspace.", file=sys.stderr)
+         print("No .janus-task.json found.")
+         print()
+         print("To create a task workspace: janus init --suite <suite> --behavior <behavior>")
+         return 1
+
+     print(f"Scoring task: {metadata.behavior_id}")
+     print(f" Suite: {metadata.suite_id}")
+     print(f" Behavior: {metadata.behavior_name}")
+     print()
+
+     # Outcome-based scoring (always runs)
+     result = score_outcome(
+         workspace_dir=workspace,
+         behavior_id=metadata.behavior_id,
+         threshold=metadata.threshold,
+         rubric=metadata.rubric,
+     )
+
+     # Display outcome results
+     status = "PASS" if result.passed_threshold else "FAIL"
+     print(f"Outcome Score: {result.raw_score:.1f}/10 ({status})")
+     print(f"Threshold: {metadata.threshold}")
+     print()
+     print("Scoring notes:")
+     for note in result.scoring_notes:
+         print(f" - {note}")
+     print()
+     print(f"Files changed: {len(result.git_diff['files_changed'])}")
+     for f in result.git_diff["files_changed"]:
+         print(f" - {f}")
+     print()
+     print(f"Tests: {result.test_results['passed']} passed, {result.test_results['failed']} failed")
+
+     # Judge scoring (optional, requires --judge flag)
+     judge_result = None
+     if args.judge:
+         print()
+         print("=" * 60)
+         print("LLM-as-Judge Scoring (GEval)")
+         print("=" * 60)
+
+         try:
+             from gauge.judge import score_with_judge, load_bundle_from_file, create_mock_bundle
+             from forge.behavior import BehaviorSpec
+
+             # Reconstruct BehaviorSpec from metadata
+             behavior = BehaviorSpec(
+                 behavior_id=metadata.behavior_id,
+                 name=metadata.behavior_name,
+                 description=metadata.behavior_description,
+                 rubric=metadata.rubric,
+                 threshold=metadata.threshold,
+                 disconfirmers=metadata.disconfirmers,
+                 taxonomy_code=metadata.taxonomy_code,
+             )
+
+             # Load bundle: explicit file > captured bundle > mock bundle
+             if args.bundle:
+                 print(f"Loading bundle from: {args.bundle}")
+                 bundle = load_bundle_from_file(args.bundle)
+             elif result.bundle:
+                 # E8-S2: Use real captured bundle from workspace
+                 print("Using captured workspace bundle")
+                 bundle = result.bundle
+             else:
+                 print("Using mock bundle (no bundle captured)")
+                 bundle = create_mock_bundle(
+                     code_diff=result.git_diff.get("patch", ""),
+                     test_output=result.test_results.get("output", ""),
+                     exit_code="success" if result.passed_threshold else "error",
+                 )
+
+             print(f"Model: {args.model}")
+             print()
+
+             judge_result = score_with_judge(
+                 behavior=behavior,
+                 bundle=bundle,
+                 outcome_score=result.normalized_score,
+                 model=args.model,
+             )
+
+             print(f"GEval Score: {judge_result.geval_score_10:.1f}/10")
+             print(f"Reason: {judge_result.reason}")
+             print()
+             print(f"Combined Score: {judge_result.combined_score_10:.1f}/10")
+             print(" (40% outcome + 60% qualitative)")
+
+         except ValueError as e:
+             print(f"Judge scoring failed: {e}", file=sys.stderr)
+             return 1
+         except Exception as e:
+             print(f"Judge scoring error: {e}", file=sys.stderr)
+             # Continue without judge score
+
+     # Output results
+     if args.output:
+         output_data = {
+             "behavior_id": result.behavior_id,
+             "outcome_score": result.raw_score,
+             "normalized_score": result.normalized_score,
+             "passed": result.passed_threshold,
+             "threshold": metadata.threshold,
+             "notes": result.scoring_notes,
+             "git_diff": result.git_diff,
+             "test_results": result.test_results,
+         }
+
+         if judge_result:
+             output_data["judge"] = {
+                 "geval_score": judge_result.geval_score_10,
+                 "combined_score": judge_result.combined_score_10,
+                 "reason": judge_result.reason,
+                 "model": judge_result.model,
+             }
+
+         Path(args.output).write_text(json.dumps(output_data, indent=2))
+         print()
+         print(f"Results saved to: {args.output}")
+
+         # E8-S2: Save bundle for future GEval evaluation
+         if result.bundle:
+             bundle_path = Path(args.output).with_suffix(".bundle.json")
+             bundle_path.write_text(json.dumps(result.bundle, indent=2))
+             print(f"Bundle saved to: {bundle_path}")
+
+     return 0 if result.passed_threshold else 1
+
+
+ def cmd_bench(args) -> int:
+     """
+     Zero-friction benchmark command for tinkerers.
+
+     Detects config, runs GEval scoring, and optionally submits to leaderboard.
+     """
+     from scaffold.workspace import init_workspace
+
+     # Step 1: Detect config
+     print_step(1, 4, "Detecting config")
+     config_metadata = detect_config(Path.cwd())
+     if config_metadata.config_source == "custom":
+         print(f" Found: {', '.join(config_metadata.config_files)} (hash: {config_metadata.config_hash})")
+     else:
+         print(" Using default config")
+
+     # Step 2: Get suite and behavior
+     print_step(2, 4, "Loading behavior")
+     suite = get_suite(args.suite)
+     if suite is None:
+         print_error(f"Unknown suite: {args.suite}")
+         return 1
+
+     behavior = None
+     for b in suite.behaviors:
+         if b.behavior_id == args.behavior:
+             behavior = b
+             break
+
+     if behavior is None:
+         print_error(f"Unknown behavior: {args.behavior}")
+         print(f"Available behaviors in {args.suite}:")
+         for b in suite.behaviors:
+             print(f" - {b.behavior_id}: {b.name}")
+         return 1
+
+     print(f" Behavior: {behavior.behavior_id} - {behavior.name}")
+
+     # Step 3: Score with GEval
+     print_step(3, 4, f"Scoring with GEval ({args.model})")
+
+     # Run the suite for single behavior
+     config = SuiteRunConfig(suite=suite, config_metadata=config_metadata)
+     execute_fn = _create_geval_execute_fn(suite, args.model)
+
+     try:
+         result = run_suite(config, execute_fn)
+     except Exception as e:
+         print_error(f"Scoring failed: {e}")
+         return 1
+
+     # Save result to file
+     if args.output:
+         output_path = Path(args.output)
+     else:
+         # Use temp file if no output specified
+         fd, tmp_path = tempfile.mkstemp(suffix=".json", prefix="janus-bench-")
+         os.close(fd)
+         output_path = Path(tmp_path)
+
+     export_json(result, str(output_path))
+     print(f" Score: {result.headline_score:.1f} (Grade {result.grade})")
+
+     # Step 4: Submit if requested
+     if args.submit:
+         print_step(4, 4, "Submitting to leaderboard")
+         try:
+             submit_data = submit_result(
+                 str(output_path),
+                 github_handle=args.github,
+                 dry_run=False,
+             )
+
+             if submit_data.get("status") == "success":
+                 # Print rich result
+                 print_benchmark_result(
+                     score=submit_data["score"],
+                     rank=submit_data.get("rank"),
+                     percentile=submit_data.get("percentile"),
+                     share_url=submit_data.get("share_url"),
+                 )
+
+                 # Copy to clipboard if available and not disabled
+                 if args.copy and submit_data.get("share_url"):
+                     if is_clipboard_available():
+                         if copy_to_clipboard(submit_data["share_url"]):
+                             print("\nCopied to clipboard!")
+                         else:
+                             print_warning("Could not copy to clipboard")
+
+             return 0
+
+         except RuntimeError as e:
+             print_error(f"Submit failed: {e}")
+             # Still show local result
+             print_benchmark_result(
+                 score=result.headline_score,
+                 grade=result.grade,
+             )
+             print(f"\nResult saved to: {output_path}")
+             return 1
+     else:
+         # Not submitting - show local result only
+         print_step(4, 4, "Complete (not submitting)")
+         print_benchmark_result(
+             score=result.headline_score,
+             grade=result.grade,
+         )
+         print(f"\nResult saved to: {output_path}")
+         print("\nTo submit to leaderboard, run:")
+         print(f" janus submit {output_path}")
+         print("Or use: janus bench --submit")
+         return 0
+
+
+ def cmd_run(args) -> int:
+     """Run a benchmark suite."""
+     suite = get_suite(args.suite)
+     if suite is None:
+         print(f"Unknown suite: {args.suite}", file=sys.stderr)
+         return 1
+
+     config_metadata = detect_config(Path.cwd())
+     config = SuiteRunConfig(suite=suite, config_metadata=config_metadata)
+
+     # Select execute function based on --judge flag
+     if args.judge:
+         print(f"Using LLM-as-judge scoring (model: {args.model})")
+         print("This requires an API key and will be slower but provides differentiated scores.")
+         print()
+         execute_fn = _create_geval_execute_fn(suite, args.model)
+     else:
+         execute_fn = _default_execute_fn
+
+     result = run_suite(config, execute_fn)
+
+     output_path = Path(args.output)
+     if args.format in ("json", "both"):
+         export_json(result, str(output_path))
+     if args.format in ("html", "both"):
+         html_path = output_path.with_suffix(".html")
+         export_html(result, str(html_path))
+
+     print(f"Suite {suite.suite_id} complete. Headline score: {result.headline_score:.1f} ({result.grade})")
+     if args.judge:
+         print(f"Scored with GEval ({args.model})")
+     return 0
+
+
+ def cmd_compare(args) -> int:
+     """Compare two results for regression."""
+     baseline = load_json(args.baseline)
+     current = load_json(args.current)
+
+     if args.config:
+         config = load_thresholds(args.config)
+     else:
+         config = default_thresholds(baseline.suite_id)
+
+     if config.suite_id != baseline.suite_id:
+         print("Threshold suite_id does not match baseline suite.", file=sys.stderr)
+         return 2
+
+     result = compare_results(baseline, current, config)
+
+     if args.output:
+         export_comparison_json(result, args.output)
+
+     if args.format == "text":
+         print_comparison_text(result)
+     elif args.format == "json":
+         print(json.dumps(comparison_to_dict(result), indent=2))
+     elif args.format == "github":
+         print_github_annotations(result)
+         summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+         if summary_path:
+             Path(summary_path).write_text(generate_github_summary(result), encoding="utf-8")
+
+     return result.exit_code
+
+
+ def cmd_export(args) -> int:
+     """Export result to different format."""
+     result = load_json(args.input)
+     output = args.output
+
+     if args.format == "html":
+         if output is None:
+             output = str(Path(args.input).with_suffix(".html"))
+         export_html(result, output)
+         return 0
+
+     if args.format == "json":
+         if output is None:
+             output = str(Path(args.input).with_suffix(".json"))
+         export_json(result, output)
+         return 0
+
+     return 1
+
+
+ def cmd_baseline(args) -> int:
+     """Baseline management command."""
+     if args.baseline_command == "update":
+         return cmd_baseline_update(args)
+     if args.baseline_command == "show":
+         return cmd_baseline_show(args)
+     return 1
+
+
+ def cmd_baseline_update(args) -> int:
+     """Promote result to baseline."""
+     result = load_json(args.result)
+     output = Path(args.output)
+
+     if output.exists() and not args.force:
+         print(f"Baseline exists: {output}. Use --force to overwrite.", file=sys.stderr)
+         return 1
+
+     export_json(result, str(output))
+     print(f"Baseline updated: {output}")
+     print(f" Suite: {result.suite_id} v{result.suite_version}")
+     print(f" Score: {result.headline_score:.1f} ({result.grade})")
+     return 0
+
+
+ def cmd_baseline_show(args) -> int:
+     """Show baseline info."""
+     result = load_json(args.baseline)
+     print(f"Suite: {result.suite_id} v{result.suite_version}")
+     print(f"Score: {result.headline_score:.1f} ({result.grade})")
+     print(f"Comparability key: {result.comparability_key}")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
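
For reference, the "(40% outcome + 60% qualitative)" line printed by janus score --judge corresponds to a weighted average of the outcome score and the GEval score on the 0-10 scale. The actual blending happens inside gauge.judge.score_with_judge; the helper below is a minimal, hypothetical sketch of that arithmetic, with weights taken only from the CLI output shown in cmd_score.

def blend_scores(outcome_score_10: float, geval_score_10: float) -> float:
    # Weights inferred from the CLI output in cmd_score: 40% outcome, 60% qualitative.
    return 0.4 * outcome_score_10 + 0.6 * geval_score_10

# Example: an outcome score of 7.0 and a GEval score of 8.5 blend to 7.9.
assert abs(blend_scores(7.0, 8.5) - 7.9) < 1e-9

The same weighting applies to the combined_score that score_with_judge returns, which _create_geval_execute_fn feeds back into run_suite as the per-rollout score.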