jfl 0.8.1 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -4
- package/dist/commands/digest.d.ts +6 -0
- package/dist/commands/digest.d.ts.map +1 -1
- package/dist/commands/digest.js +70 -69
- package/dist/commands/digest.js.map +1 -1
- package/dist/commands/doctor.d.ts +1 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +30 -1
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/eval.d.ts +40 -0
- package/dist/commands/eval.d.ts.map +1 -1
- package/dist/commands/eval.js +8 -8
- package/dist/commands/eval.js.map +1 -1
- package/dist/commands/findings.d.ts +7 -0
- package/dist/commands/findings.d.ts.map +1 -1
- package/dist/commands/findings.js +4 -4
- package/dist/commands/findings.js.map +1 -1
- package/dist/commands/ide.d.ts +2 -1
- package/dist/commands/ide.d.ts.map +1 -1
- package/dist/commands/ide.js +61 -1
- package/dist/commands/ide.js.map +1 -1
- package/dist/commands/init-from-service.d.ts +15 -0
- package/dist/commands/init-from-service.d.ts.map +1 -0
- package/dist/commands/init-from-service.js +541 -0
- package/dist/commands/init-from-service.js.map +1 -0
- package/dist/commands/init.d.ts +1 -0
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +32 -1
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/kanban.d.ts.map +1 -1
- package/dist/commands/kanban.js +13 -4
- package/dist/commands/kanban.js.map +1 -1
- package/dist/commands/linear.d.ts +41 -0
- package/dist/commands/linear.d.ts.map +1 -0
- package/dist/commands/linear.js +715 -0
- package/dist/commands/linear.js.map +1 -0
- package/dist/commands/peter.d.ts.map +1 -1
- package/dist/commands/peter.js +232 -25
- package/dist/commands/peter.js.map +1 -1
- package/dist/commands/portfolio.d.ts +5 -0
- package/dist/commands/portfolio.d.ts.map +1 -1
- package/dist/commands/portfolio.js +193 -203
- package/dist/commands/portfolio.js.map +1 -1
- package/dist/commands/predict.d.ts +19 -0
- package/dist/commands/predict.d.ts.map +1 -1
- package/dist/commands/predict.js +4 -4
- package/dist/commands/predict.js.map +1 -1
- package/dist/commands/services.d.ts.map +1 -1
- package/dist/commands/services.js +146 -0
- package/dist/commands/services.js.map +1 -1
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +279 -20
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/start.d.ts +25 -0
- package/dist/commands/start.d.ts.map +1 -0
- package/dist/commands/start.js +191 -0
- package/dist/commands/start.js.map +1 -0
- package/dist/commands/telemetry-monitor.d.ts +11 -0
- package/dist/commands/telemetry-monitor.d.ts.map +1 -0
- package/dist/commands/telemetry-monitor.js +224 -0
- package/dist/commands/telemetry-monitor.js.map +1 -0
- package/dist/commands/telemetry-test.d.ts +11 -0
- package/dist/commands/telemetry-test.d.ts.map +1 -0
- package/dist/commands/telemetry-test.js +67 -0
- package/dist/commands/telemetry-test.js.map +1 -0
- package/dist/commands/tenet-agents.d.ts +13 -0
- package/dist/commands/tenet-agents.d.ts.map +1 -0
- package/dist/commands/tenet-agents.js +191 -0
- package/dist/commands/tenet-agents.js.map +1 -0
- package/dist/commands/tenet-setup.d.ts +20 -0
- package/dist/commands/tenet-setup.d.ts.map +1 -0
- package/dist/commands/tenet-setup.js +135 -0
- package/dist/commands/tenet-setup.js.map +1 -0
- package/dist/commands/train.d.ts +18 -0
- package/dist/commands/train.d.ts.map +1 -1
- package/dist/commands/train.js +182 -0
- package/dist/commands/train.js.map +1 -1
- package/dist/commands/viz.d.ts +33 -0
- package/dist/commands/viz.d.ts.map +1 -1
- package/dist/commands/viz.js +9 -9
- package/dist/commands/viz.js.map +1 -1
- package/dist/commands/whoami.d.ts +2 -0
- package/dist/commands/whoami.d.ts.map +1 -0
- package/dist/commands/whoami.js +24 -0
- package/dist/commands/whoami.js.map +1 -0
- package/dist/index.js +230 -30
- package/dist/index.js.map +1 -1
- package/dist/lib/advanced-setup.d.ts +78 -0
- package/dist/lib/advanced-setup.d.ts.map +1 -0
- package/dist/lib/advanced-setup.js +433 -0
- package/dist/lib/advanced-setup.js.map +1 -0
- package/dist/lib/agent-config.d.ts +33 -0
- package/dist/lib/agent-config.d.ts.map +1 -1
- package/dist/lib/agent-config.js +26 -0
- package/dist/lib/agent-config.js.map +1 -1
- package/dist/lib/counterfactual-training-bridge.d.ts +114 -0
- package/dist/lib/counterfactual-training-bridge.d.ts.map +1 -0
- package/dist/lib/counterfactual-training-bridge.js +322 -0
- package/dist/lib/counterfactual-training-bridge.js.map +1 -0
- package/dist/lib/discovery-agent.d.ts +48 -0
- package/dist/lib/discovery-agent.d.ts.map +1 -0
- package/dist/lib/discovery-agent.js +111 -0
- package/dist/lib/discovery-agent.js.map +1 -0
- package/dist/lib/flow-engine.d.ts.map +1 -1
- package/dist/lib/flow-engine.js +46 -8
- package/dist/lib/flow-engine.js.map +1 -1
- package/dist/lib/gtm-generator.d.ts +29 -0
- package/dist/lib/gtm-generator.d.ts.map +1 -0
- package/dist/lib/gtm-generator.js +252 -0
- package/dist/lib/gtm-generator.js.map +1 -0
- package/dist/lib/hub-health.d.ts +40 -0
- package/dist/lib/hub-health.d.ts.map +1 -0
- package/dist/lib/hub-health.js +89 -0
- package/dist/lib/hub-health.js.map +1 -0
- package/dist/lib/invariant-monitor.d.ts +6 -2
- package/dist/lib/invariant-monitor.d.ts.map +1 -1
- package/dist/lib/invariant-monitor.js +89 -2
- package/dist/lib/invariant-monitor.js.map +1 -1
- package/dist/lib/journal-analyzer.d.ts +71 -0
- package/dist/lib/journal-analyzer.d.ts.map +1 -0
- package/dist/lib/journal-analyzer.js +306 -0
- package/dist/lib/journal-analyzer.js.map +1 -0
- package/dist/lib/linear-client.d.ts +73 -0
- package/dist/lib/linear-client.d.ts.map +1 -0
- package/dist/lib/linear-client.js +112 -0
- package/dist/lib/linear-client.js.map +1 -0
- package/dist/lib/linear-id-map.d.ts +20 -0
- package/dist/lib/linear-id-map.d.ts.map +1 -0
- package/dist/lib/linear-id-map.js +59 -0
- package/dist/lib/linear-id-map.js.map +1 -0
- package/dist/lib/linear-kanban.d.ts +66 -0
- package/dist/lib/linear-kanban.d.ts.map +1 -0
- package/dist/lib/linear-kanban.js +175 -0
- package/dist/lib/linear-kanban.js.map +1 -0
- package/dist/lib/onboarding.d.ts +40 -0
- package/dist/lib/onboarding.d.ts.map +1 -0
- package/dist/lib/onboarding.js +213 -0
- package/dist/lib/onboarding.js.map +1 -0
- package/dist/lib/physical-world-model.d.ts +50 -0
- package/dist/lib/physical-world-model.d.ts.map +1 -0
- package/dist/lib/physical-world-model.js +251 -0
- package/dist/lib/physical-world-model.js.map +1 -0
- package/dist/lib/planning-loop.d.ts +157 -0
- package/dist/lib/planning-loop.d.ts.map +1 -0
- package/dist/lib/planning-loop.js +537 -0
- package/dist/lib/planning-loop.js.map +1 -0
- package/dist/lib/policy-head.d.ts +13 -0
- package/dist/lib/policy-head.d.ts.map +1 -1
- package/dist/lib/policy-head.js +168 -2
- package/dist/lib/policy-head.js.map +1 -1
- package/dist/lib/resource-optimizer-middleware.d.ts +39 -0
- package/dist/lib/resource-optimizer-middleware.d.ts.map +1 -0
- package/dist/lib/resource-optimizer-middleware.js +222 -0
- package/dist/lib/resource-optimizer-middleware.js.map +1 -0
- package/dist/lib/resource-optimizer.d.ts +71 -0
- package/dist/lib/resource-optimizer.d.ts.map +1 -0
- package/dist/lib/resource-optimizer.js +228 -0
- package/dist/lib/resource-optimizer.js.map +1 -0
- package/dist/lib/rl-manager.d.ts +74 -0
- package/dist/lib/rl-manager.d.ts.map +1 -0
- package/dist/lib/rl-manager.js +245 -0
- package/dist/lib/rl-manager.js.map +1 -0
- package/dist/lib/service-analyzer.d.ts +76 -0
- package/dist/lib/service-analyzer.d.ts.map +1 -0
- package/dist/lib/service-analyzer.js +704 -0
- package/dist/lib/service-analyzer.js.map +1 -0
- package/dist/lib/service-gtm.js +2 -2
- package/dist/lib/service-gtm.js.map +1 -1
- package/dist/lib/service-questionnaire.d.ts +11 -0
- package/dist/lib/service-questionnaire.d.ts.map +1 -0
- package/dist/lib/service-questionnaire.js +89 -0
- package/dist/lib/service-questionnaire.js.map +1 -0
- package/dist/lib/setup/agent-generator.d.ts +2 -0
- package/dist/lib/setup/agent-generator.d.ts.map +1 -1
- package/dist/lib/setup/agent-generator.js +128 -4
- package/dist/lib/setup/agent-generator.js.map +1 -1
- package/dist/lib/setup/flow-generator.d.ts +10 -0
- package/dist/lib/setup/flow-generator.d.ts.map +1 -0
- package/dist/lib/setup/flow-generator.js +113 -0
- package/dist/lib/setup/flow-generator.js.map +1 -0
- package/dist/lib/setup/invariant-bridge.d.ts +91 -0
- package/dist/lib/setup/invariant-bridge.d.ts.map +1 -0
- package/dist/lib/setup/invariant-bridge.js +384 -0
- package/dist/lib/setup/invariant-bridge.js.map +1 -0
- package/dist/lib/setup/spec-generator.d.ts +41 -5
- package/dist/lib/setup/spec-generator.d.ts.map +1 -1
- package/dist/lib/setup/spec-generator.js +503 -29
- package/dist/lib/setup/spec-generator.js.map +1 -1
- package/dist/lib/setup/starter-intelligence.d.ts +25 -0
- package/dist/lib/setup/starter-intelligence.d.ts.map +1 -0
- package/dist/lib/setup/starter-intelligence.js +309 -0
- package/dist/lib/setup/starter-intelligence.js.map +1 -0
- package/dist/lib/stratus-client.js +1 -1
- package/dist/lib/stratus-client.js.map +1 -1
- package/dist/lib/surface-agent.d.ts +78 -0
- package/dist/lib/surface-agent.d.ts.map +1 -0
- package/dist/lib/surface-agent.js +105 -0
- package/dist/lib/surface-agent.js.map +1 -0
- package/dist/lib/surface-coordination-example.d.ts +30 -0
- package/dist/lib/surface-coordination-example.d.ts.map +1 -0
- package/dist/lib/surface-coordination-example.js +164 -0
- package/dist/lib/surface-coordination-example.js.map +1 -0
- package/dist/lib/telemetry/physical-world-collector.d.ts +15 -0
- package/dist/lib/telemetry/physical-world-collector.d.ts.map +1 -0
- package/dist/lib/telemetry/physical-world-collector.js +177 -0
- package/dist/lib/telemetry/physical-world-collector.js.map +1 -0
- package/dist/lib/telemetry/training-bridge.d.ts +51 -0
- package/dist/lib/telemetry/training-bridge.d.ts.map +1 -0
- package/dist/lib/telemetry/training-bridge.js +185 -0
- package/dist/lib/telemetry/training-bridge.js.map +1 -0
- package/dist/lib/telemetry.d.ts +2 -1
- package/dist/lib/telemetry.d.ts.map +1 -1
- package/dist/lib/telemetry.js +23 -2
- package/dist/lib/telemetry.js.map +1 -1
- package/dist/lib/tenet-board-agent.d.ts +52 -0
- package/dist/lib/tenet-board-agent.d.ts.map +1 -0
- package/dist/lib/tenet-board-agent.js +226 -0
- package/dist/lib/tenet-board-agent.js.map +1 -0
- package/dist/lib/tenet-ide-agent.d.ts +40 -0
- package/dist/lib/tenet-ide-agent.d.ts.map +1 -0
- package/dist/lib/tenet-ide-agent.js +199 -0
- package/dist/lib/tenet-ide-agent.js.map +1 -0
- package/dist/lib/workspace/data-pipeline.d.ts.map +1 -1
- package/dist/lib/workspace/data-pipeline.js +27 -5
- package/dist/lib/workspace/data-pipeline.js.map +1 -1
- package/dist/lib/workspace/sidebar-runner.d.ts +13 -0
- package/dist/lib/workspace/sidebar-runner.d.ts.map +1 -0
- package/dist/lib/workspace/sidebar-runner.js +419 -0
- package/dist/lib/workspace/sidebar-runner.js.map +1 -0
- package/dist/lib/workspace/surface-registry.d.ts.map +1 -1
- package/dist/lib/workspace/surface-registry.js +9 -1
- package/dist/lib/workspace/surface-registry.js.map +1 -1
- package/dist/lib/workspace/surfaces/agent-overview.d.ts +3 -3
- package/dist/lib/workspace/surfaces/agent-overview.d.ts.map +1 -1
- package/dist/lib/workspace/surfaces/agent-overview.js +3 -3
- package/dist/lib/workspace/surfaces/agent-overview.js.map +1 -1
- package/dist/lib/workspace/surfaces/index.d.ts +3 -0
- package/dist/lib/workspace/surfaces/index.d.ts.map +1 -1
- package/dist/lib/workspace/surfaces/index.js +3 -0
- package/dist/lib/workspace/surfaces/index.js.map +1 -1
- package/dist/lib/workspace/surfaces/kanban.d.ts +15 -0
- package/dist/lib/workspace/surfaces/kanban.d.ts.map +1 -0
- package/dist/lib/workspace/surfaces/kanban.js +43 -0
- package/dist/lib/workspace/surfaces/kanban.js.map +1 -0
- package/dist/lib/workspace/surfaces/physical-world.d.ts +15 -0
- package/dist/lib/workspace/surfaces/physical-world.d.ts.map +1 -0
- package/dist/lib/workspace/surfaces/physical-world.js +37 -0
- package/dist/lib/workspace/surfaces/physical-world.js.map +1 -0
- package/dist/lib/workspace/surfaces/sidebar.d.ts +22 -0
- package/dist/lib/workspace/surfaces/sidebar.d.ts.map +1 -0
- package/dist/lib/workspace/surfaces/sidebar.js +94 -0
- package/dist/lib/workspace/surfaces/sidebar.js.map +1 -0
- package/dist/lib/workspace/tmux-adapter.d.ts +8 -5
- package/dist/lib/workspace/tmux-adapter.d.ts.map +1 -1
- package/dist/lib/workspace/tmux-adapter.js +38 -7
- package/dist/lib/workspace/tmux-adapter.js.map +1 -1
- package/dist/lib/workspace/tmux-sidebar.d.ts +14 -0
- package/dist/lib/workspace/tmux-sidebar.d.ts.map +1 -0
- package/dist/lib/workspace/tmux-sidebar.js +230 -0
- package/dist/lib/workspace/tmux-sidebar.js.map +1 -0
- package/dist/types/flows.d.ts +2 -1
- package/dist/types/flows.d.ts.map +1 -1
- package/dist/types/physical-world-model.d.ts +65 -0
- package/dist/types/physical-world-model.d.ts.map +1 -0
- package/dist/types/physical-world-model.js +43 -0
- package/dist/types/physical-world-model.js.map +1 -0
- package/dist/types/telemetry.d.ts +37 -0
- package/dist/types/telemetry.d.ts.map +1 -1
- package/dist/types/world-model.d.ts.map +1 -1
- package/dist/types/world-model.js +14 -7
- package/dist/types/world-model.js.map +1 -1
- package/dist/utils/context-hub-port.d.ts.map +1 -1
- package/dist/utils/context-hub-port.js +6 -1
- package/dist/utils/context-hub-port.js.map +1 -1
- package/dist/utils/jfl-config.d.ts +7 -2
- package/dist/utils/jfl-config.d.ts.map +1 -1
- package/dist/utils/jfl-config.js +14 -4
- package/dist/utils/jfl-config.js.map +1 -1
- package/package.json +3 -2
- package/packages/pi/extensions/context.ts +51 -1
- package/packages/pi/extensions/hub-tools.ts +247 -0
- package/packages/pi/extensions/index.ts +38 -6
- package/packages/pi/extensions/memory-tool.ts +84 -4
- package/packages/pi/extensions/service-skills.ts +214 -0
- package/scripts/telemetry-dashboard.sh +44 -0
- package/scripts/test-planning-loop-e2e.ts +181 -0
- package/scripts/test-server-inference.ts +49 -0
- package/scripts/test-state-sensitivity.ts +32 -0
- package/scripts/train/v2/benchmark.py +661 -0
- package/scripts/train/v2/generate_balanced.py +439 -0
- package/scripts/train/v2/generate_hard_negatives.py +219 -0
- package/scripts/train/v2/infer.py +149 -36
- package/scripts/train/v2/infer_server.py +224 -0
- package/scripts/train/v2/online_train.py +576 -0
- package/scripts/train/v2/precompute.py +24 -6
- package/template/CLAUDE.md +74 -132
|
@@ -0,0 +1,661 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Benchmark Scenarios for PolicyHead Evaluation.
|
|
3
|
+
|
|
4
|
+
Creates synthetic evaluation environments with known "right answers"
|
|
5
|
+
for rapid PolicyHead iteration. Test in minutes, not weeks.
|
|
6
|
+
|
|
7
|
+
Drew's recommendation (section 8.6):
|
|
8
|
+
- Create benchmark agent scenarios for rapid iteration
|
|
9
|
+
- Use World Model to create synthetic state snapshots
|
|
10
|
+
- Counterfactual Engine generates ground truth outcomes
|
|
11
|
+
- PolicyHead evaluated on these scenarios before deploying to real agents
|
|
12
|
+
|
|
13
|
+
Scenarios:
|
|
14
|
+
1. Fix failing test — agent sees failing tests, should select fix_bug
|
|
15
|
+
2. Refactor messy code — high code churn, should select refactor_code
|
|
16
|
+
3. Optimize slow build — build failing/slow, should select optimize_performance
|
|
17
|
+
4. Handle hub crash — hub down, agents stranded, should select fix_bug
|
|
18
|
+
5. Improve coverage — low test counts, should select add_tests
|
|
19
|
+
6. Stale dependencies — security issues, should select dependency_update
|
|
20
|
+
7. Missing docs — new features without docs, should select update_docs
|
|
21
|
+
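8. Data pipeline broken — ETL failures, should select data_pipeline
9. Add monitoring — no observability, should select add_monitoring
10. New feature request — feature gap, should select add_feature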
|
|
22
|
+
|
|
23
|
+
Each scenario defines:
|
|
24
|
+
- A synthetic WorldState snapshot (state text for embedding)
|
|
25
|
+
- A goal string
|
|
26
|
+
- The expected correct tool
|
|
27
|
+
- Difficulty level (easy/medium/hard)
|
|
28
|
+
- Variations (to test generalization)
|
|
29
|
+
|
|
30
|
+
Usage:
|
|
31
|
+
python benchmark.py --checkpoint .jfl/checkpoints/best_policy_head.pt
|
|
32
|
+
python benchmark.py --checkpoint .jfl/checkpoints/best_policy_head.pt --json
|
|
33
|
+
python benchmark.py --generate # Write benchmark.jsonl for offline eval
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
import json
|
|
37
|
+
import os
|
|
38
|
+
import sys
|
|
39
|
+
import argparse
|
|
40
|
+
import time
|
|
41
|
+
from dataclasses import dataclass, asdict
|
|
42
|
+
|
|
43
|
+
# ============================================================================
|
|
44
|
+
# Scenario Definitions
|
|
45
|
+
# ============================================================================
|
|
46
|
+
|
|
47
|
+
@dataclass
class BenchmarkScenario:
    """One synthetic evaluation case: a state/goal pair and the tool expected for it.

    Instances are built by ``generate_scenarios`` and consumed by the
    benchmark runner, which compares the predicted tool to ``correct_tool``.
    """

    id: str  # unique slug including the variant index, e.g. "fix-failing-test-v0"
    name: str  # human-readable title, e.g. "Fix Failing Test (variant 0)"
    description: str  # one-line explanation of what the scenario exercises
    difficulty: str  # easy, medium, hard
    state_text: str  # newline-joined synthetic state snapshot (text that gets embedded)
    goal: str  # goal string that is embedded alongside the state
    correct_tool: str  # tool name the policy is expected to rank first
    category: str  # diagnostic, scheduling, recovery, optimization
    tags: list[str]  # free-form labels for the scenario
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _format_state_text(
    agent: str,
    composite: float,
    tests_passed: int,
    tests_total: int,
    trajectory: int,
    dimensions: str,
    deltas: str,
) -> str:
    """Render the six-line synthetic state snapshot shared by every scenario.

    ``composite`` is formatted with four decimals; ``dimensions`` and
    ``deltas`` are inserted verbatim, so callers pre-format those.
    """
    return "\n".join([
        f"Agent: {agent}",
        f"Composite: {composite:.4f}",
        f"Tests: {tests_passed}/{tests_total}",
        f"Trajectory: {trajectory}",
        f"Dimensions: {dimensions}",
        f"Recent deltas: {deltas}",
    ])


def generate_scenarios() -> list[BenchmarkScenario]:
    """Generate all benchmark scenarios with variations.

    Ten scenario groups are emitted in a fixed order, each with three or
    four variants. Variant indices start at 0 and are embedded in the
    scenario ``id`` and ``name``.

    Returns:
        The scenarios in generation order (group 1 variants first).
    """
    scenarios = []

    # -- Scenario 1: Fix Failing Test (easy for variants 0-1, medium after) --
    for variant, (tests_failing, composite, goal_text) in enumerate([
        (3, 0.72, "Fix 3 failing tests in auth module"),
        (1, 0.91, "Fix flaky test in session-manager.test.ts"),
        (7, 0.45, "Multiple test failures after refactor — fix regressions"),
        (1, 0.88, "TypeError in user-service.test.ts line 42"),
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"fix-failing-test-v{variant}",
            name=f"Fix Failing Test (variant {variant})",
            description="Agent sees failing tests, should identify fix_bug as the right action",
            difficulty="easy" if variant < 2 else "medium",
            state_text=_format_state_text(
                agent="test-fixer",
                composite=composite,
                # The synthetic suite always has 20 tests.
                tests_passed=20 - tests_failing,
                tests_total=20,
                trajectory=variant + 1,
                dimensions=(
                    f"test_pass_rate={1 - tests_failing/20:.4f}, "
                    "build_health=0.9000, code_quality=0.8500"
                ),
                deltas="-0.0300, -0.0100",
            ),
            goal=goal_text,
            correct_tool="fix_bug",
            category="diagnostic",
            tags=["tests", "regression", "bug"],
        ))

    # -- Scenario 2: Refactor Messy Code (medium) --
    # The complexity label is carried in the table for readability only;
    # it is not rendered into the state text.
    for variant, (churn, _complexity, goal_text) in enumerate([
        (450, "high", "Reduce complexity in orchestrator module — too many responsibilities"),
        (200, "medium", "Extract shared logic from peter.ts and eval.ts into utils"),
        (800, "critical", "Module has 2400 lines, 15 functions over 100 lines — needs decomposition"),
        (100, "low", "Clean up unused imports and dead code paths in training pipeline"),
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"refactor-messy-code-v{variant}",
            name=f"Refactor Messy Code (variant {variant})",
            description="High complexity or code churn indicates refactoring needed",
            difficulty="medium",
            state_text=_format_state_text(
                agent="code-quality",
                composite=0.72,
                tests_passed=45,
                tests_total=45,
                trajectory=variant + 3,
                dimensions=(
                    "code_quality=0.5500, test_pass_rate=1.0000, "
                    f"build_health=1.0000, code_churn={churn}"
                ),
                deltas="+0.0050, +0.0020, -0.0010",
            ),
            goal=goal_text,
            correct_tool="refactor_code",
            category="optimization",
            tags=["complexity", "maintainability", "code-quality"],
        ))

    # -- Scenario 3: Optimize Slow Build (medium) --
    # The build time only appears inside the goal text; the tuple element
    # itself is unused.
    for variant, (_build_time, goal_text) in enumerate([
        ("45s", "Build taking 45s — optimize TypeScript compilation"),
        ("120s", "CI pipeline runs 2 minutes — find bottleneck"),
        ("30s", "Bundle size 2.4MB — tree-shake unused dependencies"),
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"optimize-build-v{variant}",
            name=f"Optimize Slow Build (variant {variant})",
            description="Build performance degraded, needs optimization",
            difficulty="medium",
            state_text=_format_state_text(
                agent="build-optimizer",
                composite=0.68,
                tests_passed=30,
                tests_total=30,
                trajectory=variant + 2,
                dimensions="build_health=0.4000, test_pass_rate=1.0000, code_quality=0.8000",
                deltas="-0.0200, -0.0150",
            ),
            goal=goal_text,
            correct_tool="optimize_performance",
            category="optimization",
            tags=["build", "performance", "ci"],
        ))

    # -- Scenario 4: Handle Hub Crash (hard) --
    for variant, (agents_stranded, goal_text) in enumerate([
        (3, "Hub crashed — 3 agents stranded, need to restore connectivity"),
        (6, "100% agent stranding — hub process died, restart and recover"),
        (1, "Hub WebSocket connection dropped — single agent lost progress"),
        (4, "Hub OOM killed — reduce memory usage and restart with safeguards"),
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"hub-crash-v{variant}",
            name=f"Handle Hub Crash (variant {variant})",
            description="Hub crash causing agent stranding — critical infrastructure fix",
            difficulty="hard",
            state_text=_format_state_text(
                agent="hub-sentinel",
                composite=0.30,
                tests_passed=10,
                tests_total=15,
                trajectory=variant + 5,
                # Stranding is expressed as a fraction of a 6-agent fleet.
                dimensions=(
                    f"hub_health=0.0000, agent_stranding={agents_stranded/6:.4f}, "
                    "build_health=0.5000"
                ),
                deltas="-0.1500, -0.2000, -0.0500",
            ),
            goal=goal_text,
            correct_tool="fix_bug",
            category="recovery",
            tags=["hub", "crash", "infrastructure", "critical"],
        ))

    # -- Scenario 5: Improve Test Coverage (easy) --
    for variant, (coverage, goal_text) in enumerate([
        (0.45, "Test coverage at 45% — add tests for uncovered modules"),
        (0.60, "New feature shipped without tests — add unit tests"),
        (0.72, "Coverage dropped after refactor — restore test coverage"),
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"improve-coverage-v{variant}",
            name=f"Improve Test Coverage (variant {variant})",
            description="Low test coverage, should add tests",
            difficulty="easy",
            state_text=_format_state_text(
                agent="test-coverage",
                # Coverage doubles as the composite score here.
                composite=coverage,
                tests_passed=int(coverage * 50),
                tests_total=50,
                trajectory=variant + 1,
                dimensions=(
                    f"test_coverage={coverage:.4f}, "
                    "test_pass_rate=1.0000, code_quality=0.8000"
                ),
                deltas="+0.0100, +0.0050",
            ),
            goal=goal_text,
            correct_tool="add_tests",
            category="diagnostic",
            tags=["tests", "coverage", "quality"],
        ))

    # -- Scenario 6: Stale Dependencies (medium) --
    for variant, goal_text in enumerate([
        "3 critical CVEs in dependencies — update packages",
        "Node.js version EOL next month — migrate dependencies",
        "Lockfile conflicts — resolve dependency tree issues",
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"stale-deps-v{variant}",
            name=f"Stale Dependencies (variant {variant})",
            description="Security or compatibility issues from outdated dependencies",
            difficulty="medium",
            state_text=_format_state_text(
                agent="dependency-updater",
                composite=0.75,
                tests_passed=40,
                tests_total=40,
                trajectory=variant + 1,
                dimensions="security_score=0.4000, test_pass_rate=1.0000, build_health=0.9000",
                deltas="-0.0050, +0.0020",
            ),
            goal=goal_text,
            correct_tool="dependency_update",
            category="optimization",
            tags=["security", "dependencies", "maintenance"],
        ))

    # -- Scenario 7: Missing Documentation (easy) --
    for variant, goal_text in enumerate([
        "3 new CLI commands have no documentation — write usage docs",
        "API changed but README is stale — update docs",
        "SPEC.md doesn't reflect recent architecture decisions — update",
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"missing-docs-v{variant}",
            name=f"Missing Documentation (variant {variant})",
            description="Documentation gaps need filling",
            difficulty="easy",
            state_text=_format_state_text(
                agent="docs-updater",
                composite=0.82,
                tests_passed=35,
                tests_total=35,
                trajectory=variant + 1,
                dimensions="doc_coverage=0.4000, test_pass_rate=1.0000, code_quality=0.9000",
                deltas="+0.0100, +0.0200",
            ),
            goal=goal_text,
            correct_tool="update_docs",
            category="diagnostic",
            tags=["docs", "documentation", "communication"],
        ))

    # -- Scenario 8: Data Pipeline Broken (hard) --
    for variant, goal_text in enumerate([
        "Training buffer transform failing — v1 to v2 conversion errors",
        "Embedding precompute script OOM on large dataset — fix pipeline",
        "Counterfactual generation producing invalid scenarios — debug pipeline",
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"data-pipeline-v{variant}",
            name=f"Data Pipeline Broken (variant {variant})",
            description="Data processing pipeline needs fixing",
            difficulty="hard",
            state_text=_format_state_text(
                agent="data-engineer",
                composite=0.55,
                tests_passed=20,
                tests_total=25,
                trajectory=variant + 4,
                dimensions="pipeline_health=0.2000, data_quality=0.4000, test_pass_rate=0.8000",
                deltas="-0.0500, -0.0300, -0.0200",
            ),
            goal=goal_text,
            correct_tool="data_pipeline",
            category="recovery",
            tags=["data", "pipeline", "etl", "training"],
        ))

    # -- Scenario 9: Add Monitoring (medium) --
    for variant, goal_text in enumerate([
        "No visibility into agent performance — add telemetry",
        "Hub crashes go undetected — add health check endpoint",
        "Training pipeline has no metrics — add loss/accuracy logging",
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"add-monitoring-v{variant}",
            name=f"Add Monitoring (variant {variant})",
            description="System lacks observability, needs monitoring",
            difficulty="medium",
            state_text=_format_state_text(
                agent="observability",
                composite=0.70,
                tests_passed=30,
                tests_total=30,
                trajectory=variant + 2,
                dimensions="observability=0.2000, test_pass_rate=1.0000, build_health=0.8500",
                deltas="+0.0050, +0.0030",
            ),
            goal=goal_text,
            correct_tool="add_monitoring",
            category="optimization",
            tags=["monitoring", "observability", "telemetry"],
        ))

    # -- Scenario 10: New Feature Request (hard) --
    # NOTE(review): the original section header called this group
    # "ambiguous" while the description string says "Clear feature
    # request" — confirm which is intended; the string is left untouched.
    for variant, goal_text in enumerate([
        "Add planning loop that connects PolicyHead to DynamicsModel",
        "Implement experience replay buffer for online learning",
        "Build counterfactual training bridge for synthetic data generation",
    ]):
        scenarios.append(BenchmarkScenario(
            id=f"new-feature-v{variant}",
            name=f"New Feature Request (variant {variant})",
            description="Clear feature request — should select add_feature",
            difficulty="hard",
            state_text=_format_state_text(
                agent="feature-builder",
                composite=0.80,
                tests_passed=40,
                tests_total=40,
                trajectory=variant + 1,
                dimensions="feature_completeness=0.6000, test_pass_rate=1.0000, code_quality=0.8500",
                deltas="+0.0200, +0.0100, +0.0150",
            ),
            goal=goal_text,
            correct_tool="add_feature",
            category="diagnostic",
            tags=["feature", "implementation", "new"],
        ))

    return scenarios
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
# ============================================================================
|
|
321
|
+
# Benchmark Runner
|
|
322
|
+
# ============================================================================
|
|
323
|
+
|
|
324
|
+
def run_benchmark(checkpoint_path: str, scenarios: list[BenchmarkScenario], device: str = "cpu") -> dict:
    """Evaluate every scenario against the PolicyHead stored in a checkpoint.

    Args:
        checkpoint_path: Path to a checkpoint file; it must provide the keys
            ``config``, ``tool_to_index``, ``num_tools`` and
            ``model_state_dict``.
        scenarios: Scenarios to evaluate, in order.
        device: Torch device string used for loading and inference.

    Returns:
        Whatever ``aggregate_results`` produces for the per-scenario results.
    """
    import torch
    from model import PolicyHead
    from dataset import load_embedding_cache

    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects — only point this at checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    cfg = checkpoint["config"]
    tool_to_index = checkpoint["tool_to_index"]
    # Reverse lookup: class index -> tool name.
    index_to_tool = dict((idx, tool) for tool, idx in tool_to_index.items())

    # Rebuild the network with the hyperparameters saved next to the weights.
    policy = PolicyHead(
        embedding_dim=cfg["embedding_dim"],
        hidden_dim=cfg["hidden_dim"],
        num_tools=checkpoint["num_tools"],
        num_layers=cfg["num_layers"],
        num_heads=cfg["num_heads"],
        dropout=cfg.get("dropout", 0.1),
    ).to(device)
    policy.load_state_dict(checkpoint["model_state_dict"])
    policy.eval()

    # The embedding cache directory is derived from the checkpoint directory
    # by swapping "checkpoints" for "v2-data".
    # NOTE(review): str.replace rewrites every occurrence of "checkpoints"
    # in the directory path, not just the final component — confirm layout.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    data_dir = checkpoint_dir.replace("checkpoints", "v2-data")
    embeddings_matrix, text_to_idx = load_embedding_cache(data_dir)

    if embeddings_matrix is not None:
        embedding_dim = embeddings_matrix.shape[1]
    else:
        # Without a cache, fall back to the dimension recorded in the config.
        print("WARNING: No embedding cache — using zero vectors")
        embedding_dim = cfg["embedding_dim"]

    per_scenario = [
        evaluate_scenario(
            policy, scenario, tool_to_index, index_to_tool,
            embeddings_matrix, text_to_idx, embedding_dim, device
        )
        for scenario in scenarios
    ]

    return aggregate_results(per_scenario, scenarios)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def evaluate_scenario(
    model, scenario, tool_to_index, index_to_tool,
    embeddings_matrix, text_to_idx, embedding_dim, device
) -> dict:
    """Score one scenario against the model's top-5 prediction.

    The scenario's ``state_text`` and ``goal`` are turned into embeddings via
    ``get_embedding``, wrapped as single-row float32 tensors on ``device`` and
    passed to ``model.predict(..., top_k=5)``. The prediction is read through
    its ``"top_k_indices"``, ``"top_k_probs"`` and ``"all_probs"`` entries.

    Returns:
        A dict holding the scenario metadata, the predicted tool, whether it
        matched, top-3/top-5 membership of the expected tool, the expected
        tool's 1-based rank (``-1`` when it is not in ``tool_to_index`` or is
        not found in the ranking), the top confidence and the top-5 list.
    """
    import torch

    def as_batch(text):
        # Embedding lookup followed by conversion to a (1, dim) tensor on device.
        vector = get_embedding(text, embeddings_matrix, text_to_idx, embedding_dim)
        return torch.tensor(vector, dtype=torch.float32).unsqueeze(0).to(device)

    state_tensor = as_batch(scenario.state_text)
    goal_tensor = as_batch(scenario.goal)

    with torch.no_grad():
        prediction = model.predict(state_tensor, goal_tensor, top_k=5)

    ranked_ids = prediction["top_k_indices"][0].cpu().tolist()
    ranked_probs = prediction["top_k_probs"][0].cpu().tolist()

    best_guess = index_to_tool.get(ranked_ids[0], "unknown")
    target_id = tool_to_index.get(scenario.correct_tool)
    target_known = target_id is not None

    # Rank of the expected tool over the full distribution (1 = most likely).
    rank_of_target = -1
    if target_known:
        full_probs = prediction["all_probs"][0].cpu().tolist()
        by_descending_prob = sorted(range(len(full_probs)), key=lambda i: -full_probs[i])
        rank_of_target = next(
            (place for place, idx in enumerate(by_descending_prob, start=1) if idx == target_id),
            -1,
        )

    top5 = []
    for idx, prob in zip(ranked_ids, ranked_probs):
        top5.append({"tool": index_to_tool.get(idx, "?"), "prob": prob})

    return {
        "scenario_id": scenario.id,
        "correct_tool": scenario.correct_tool,
        "predicted_tool": best_guess,
        "correct": best_guess == scenario.correct_tool,
        "in_top3": target_known and target_id in ranked_ids[:3],
        "in_top5": target_known and target_id in ranked_ids[:5],
        "correct_rank": rank_of_target,
        "confidence": ranked_probs[0],
        "difficulty": scenario.difficulty,
        "category": scenario.category,
        "tags": scenario.tags,
        "top5": top5,
    }
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def get_embedding(text: str, embeddings_matrix, text_to_idx, embedding_dim: int):
    """Return the cached embedding for ``text``, or a zero vector as fallback.

    Args:
        text: Text whose embedding is wanted.
        embeddings_matrix: Object indexed by row number whose rows expose
            ``.tolist()``; may be ``None`` when no cache is available.
        text_to_idx: Mapping from text to a row index of ``embeddings_matrix``;
            may be ``None`` or empty.
        embedding_dim: Length of the zero vector returned when there is no
            cached row for ``text``.

    Returns:
        The cached row as a list, or ``[0.0] * embedding_dim``.
    """
    # Require both the matrix and a mapping hit: a mapping without a matrix
    # would otherwise fail on the row lookup instead of falling back.
    if embeddings_matrix is not None and text_to_idx and text in text_to_idx:
        return embeddings_matrix[text_to_idx[text]].tolist()

    # Fallback: zero vector (will degrade accuracy but won't crash)
    return [0.0] * embedding_dim
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _summarize_group(rows: list[dict], include_top3: bool = False) -> dict:
    """Return total/correct/accuracy (and optionally top-3 rate) for non-empty rows."""
    size = len(rows)
    hits = sum(1 for r in rows if r["correct"])
    stats = {"total": size, "correct": hits, "accuracy": hits / size}
    if include_top3:
        stats["top3"] = sum(1 for r in rows if r["in_top3"]) / size
    return stats


def _group_rows(rows: list[dict], key: str) -> dict:
    """Bucket rows by ``row[key]`` in a single pass, keeping first-seen key order."""
    groups: dict = {}
    for row in rows:
        groups.setdefault(row[key], []).append(row)
    return groups


def aggregate_results(results: list[dict], scenarios: list[BenchmarkScenario]) -> dict:
    """Aggregate individual scenario results into a report.

    Args:
        results: Per-scenario dicts as returned by ``evaluate_scenario``.
        scenarios: Accepted for interface compatibility; not read here.

    Returns:
        A dict with ``"summary"`` (overall top-1/3/5 accuracy), breakdowns
        ``"by_difficulty"`` (only ``easy``/``medium``/``hard``, in that order),
        ``"by_category"`` and ``"by_tool"`` (keys in first-seen order, so the
        output is reproducible between runs), a trimmed ``"failures"`` list and
        the untouched ``"all_results"`` list. Accuracies are ``0`` when
        ``results`` is empty.
    """
    total = len(results)
    correct = sum(1 for r in results if r["correct"])
    in_top3 = sum(1 for r in results if r["in_top3"])
    in_top5 = sum(1 for r in results if r["in_top5"])

    def rate(count):
        # Avoid dividing by zero when every scenario was filtered out.
        return count / total if total > 0 else 0

    # By difficulty: fixed order; unknown difficulty labels are not reported.
    rows_by_difficulty = _group_rows(results, "difficulty")
    by_difficulty = {
        diff: _summarize_group(rows_by_difficulty[diff], include_top3=True)
        for diff in ["easy", "medium", "hard"]
        if diff in rows_by_difficulty
    }

    # By category
    by_category = {
        cat: _summarize_group(rows)
        for cat, rows in _group_rows(results, "category").items()
    }

    # By correct_tool
    by_tool = {
        tool: _summarize_group(rows)
        for tool, rows in _group_rows(results, "correct_tool").items()
    }

    # Failures for debugging
    failures = [
        {
            "id": r["scenario_id"],
            "expected": r["correct_tool"],
            "got": r["predicted_tool"],
            "rank": r["correct_rank"],
            "confidence": r["confidence"],
            "top5": r["top5"],
        }
        for r in results
        if not r["correct"]
    ]

    return {
        "summary": {
            "total_scenarios": total,
            "top1_accuracy": rate(correct),
            "top3_accuracy": rate(in_top3),
            "top5_accuracy": rate(in_top5),
            "correct": correct,
        },
        "by_difficulty": by_difficulty,
        "by_category": by_category,
        "by_tool": by_tool,
        "failures": failures,
        "all_results": results,
    }
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
# ============================================================================
|
|
507
|
+
# Output
|
|
508
|
+
# ============================================================================
|
|
509
|
+
|
|
510
|
+
def print_report(report: dict):
    """Write the aggregated benchmark report to stdout as plain text.

    Reads ``report["summary"]``, ``report["by_difficulty"]``,
    ``report["by_category"]``, ``report["by_tool"]`` and ``report["failures"]``
    and prints, in order: the headline accuracies, a per-difficulty table, a
    per-category table sorted by name, a per-tool table sorted by descending
    accuracy with a ten-cell bar, the failure list (only when non-empty) and a
    letter grade derived from the top-1 accuracy.
    """
    s = report["summary"]
    banner = "=" * 70

    print(banner)
    print(" BENCHMARK EVALUATION REPORT")
    print(banner)
    print(f"\n Scenarios: {s['total_scenarios']}")
    print(f" Top-1 Accuracy: {s['top1_accuracy']:.1%} ({s['correct']}/{s['total_scenarios']})")
    print(f" Top-3 Accuracy: {s['top3_accuracy']:.1%}")
    print(f" Top-5 Accuracy: {s['top5_accuracy']:.1%}")

    # Difficulty table: fixed row order, rows without data are skipped.
    print(f"\n {'Difficulty':<12} {'Acc':>8} {'Top-3':>8} {'N':>5}")
    print(" " + "-" * 35)
    difficulty_table = report["by_difficulty"]
    for diff in ("easy", "medium", "hard"):
        if diff not in difficulty_table:
            continue
        d = difficulty_table[diff]
        print(f" {diff:<12} {d['accuracy']:>7.1%} {d['top3']:>7.1%} {d['total']:>5}")

    # Category table, alphabetical.
    print(f"\n {'Category':<18} {'Acc':>8} {'N':>5}")
    print(" " + "-" * 33)
    for cat, stats in sorted(report["by_category"].items()):
        print(f" {cat:<18} {stats['accuracy']:>7.1%} {stats['total']:>5}")

    # Tool table, best accuracy first, with a 10-cell bar.
    print(f"\n {'Tool':<25} {'Acc':>8} {'N':>5}")
    print(" " + "-" * 40)
    tools_best_first = sorted(report["by_tool"].items(), key=lambda pair: -pair[1]["accuracy"])
    for tool, stats in tools_best_first:
        filled = int(stats["accuracy"] * 10)
        bar = "█" * filled + "░" * (10 - filled)
        print(f" {tool:<25} {stats['accuracy']:>7.1%} {stats['total']:>5} {bar}")

    failed = report["failures"]
    if failed:
        print(f"\n Failures ({len(report['failures'])}):")
        print(f" {'ID':<30} {'Expected':<20} {'Got':<20} {'Rank':>5}")
        print(" " + "-" * 78)
        for f in failed:
            print(f" {f['id']:<30} {f['expected']:<20} {f['got']:<20} {f['rank']:>5}")

    # Overall grade: first band whose floor the top-1 accuracy reaches.
    acc = s["top1_accuracy"]
    grade_bands = (
        (0.90, "A — Ready for production"),
        (0.80, "B — Good, minor gaps"),
        (0.70, "C — Needs improvement"),
        (0.50, "D — Significant gaps"),
    )
    grade = next(
        (label for floor, label in grade_bands if acc >= floor),
        "F — Major retraining needed",
    )

    print(f"\n Grade: {grade}")
    print(banner)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def generate_benchmark_jsonl(scenarios: list[BenchmarkScenario], output_path: str):
    """Write benchmark scenarios as JSONL for use with eval.py.

    One JSON object is written per scenario, built from its ``state_text``,
    ``goal``, ``correct_tool``, ``id``, ``difficulty`` and ``category``
    attributes, plus a constant ``"source": "benchmark"`` field.

    Args:
        scenarios: Scenarios to serialise, in output order.
        output_path: Destination file; it is overwritten if it exists. Missing
            parent directories are created first so a fresh checkout can write
            to the default location.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for s in scenarios:
            entry = {
                "current_state": s.state_text,
                "goal": s.goal,
                "correct_tool": s.correct_tool,
                "source": "benchmark",
                "scenario_id": s.id,
                "difficulty": s.difficulty,
                "category": s.category,
            }
            f.write(json.dumps(entry) + "\n")
    print(f"Wrote {len(scenarios)} scenarios to {output_path}")
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
# ============================================================================
|
|
582
|
+
# Main
|
|
583
|
+
# ============================================================================
|
|
584
|
+
|
|
585
|
+
def main():
    """Command-line entry point for the benchmark.

    Parses the CLI flags, builds the scenario list with ``generate_scenarios``
    and applies the optional ``--difficulty`` / ``--category`` filters. With
    ``--generate`` the scenarios are written as JSONL and the function returns.
    Otherwise a checkpoint is resolved (``--checkpoint`` or the default path),
    the benchmark is run on the first available of cuda / mps / cpu, the report
    is printed and the full report is saved as ``benchmark-results.json`` next
    to the checkpoint.

    With ``--json`` the report (minus ``all_results``) is the only thing
    written to stdout; all status lines go to stderr so the output can be
    piped straight into a JSON parser. Exits with status 1 when no usable
    checkpoint is found.
    """
    parser = argparse.ArgumentParser(description="Benchmark scenarios for PolicyHead evaluation")
    parser.add_argument("--checkpoint", default=None, help="Path to PolicyHead checkpoint")
    parser.add_argument("--generate", action="store_true", help="Generate benchmark.jsonl for offline eval")
    parser.add_argument("--output", default=".jfl/v2-data/benchmark.jsonl", help="Output path for generated JSONL")
    parser.add_argument("--json", action="store_true", help="Output results as JSON")
    parser.add_argument("--difficulty", default=None, help="Filter by difficulty: easy/medium/hard")
    parser.add_argument("--category", default=None, help="Filter by category: diagnostic/scheduling/recovery/optimization")
    args = parser.parse_args()

    # Keep stdout machine-readable in --json mode: status text goes to stderr.
    status = sys.stderr if args.json else sys.stdout

    scenarios = generate_scenarios()

    # Apply filters
    if args.difficulty:
        scenarios = [s for s in scenarios if s.difficulty == args.difficulty]
    if args.category:
        scenarios = [s for s in scenarios if s.category == args.category]

    print(f"Loaded {len(scenarios)} benchmark scenarios", file=status)

    if args.generate:
        generate_benchmark_jsonl(scenarios, args.output)
        return

    if args.checkpoint is None:
        # Try default
        default_ckpt = ".jfl/checkpoints/best_policy_head.pt"
        if os.path.exists(default_ckpt):
            args.checkpoint = default_ckpt
        else:
            print("No checkpoint specified. Use --checkpoint or --generate", file=status)
            print("\n Available commands:", file=status)
            print(" python benchmark.py --generate # Create benchmark.jsonl", file=status)
            print(" python benchmark.py --checkpoint path/to/model.pt # Run evaluation", file=status)
            sys.exit(1)

    if not os.path.exists(args.checkpoint):
        print(f"Checkpoint not found: {args.checkpoint}", file=status)
        sys.exit(1)

    # Detect device
    import torch
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    print(f"Device: {device}", file=status)
    print(f"Checkpoint: {args.checkpoint}", file=status)
    print(file=status)

    t0 = time.time()
    report = run_benchmark(args.checkpoint, scenarios, device)
    elapsed = time.time() - t0

    if args.json:
        # Remove all_results for cleaner JSON output
        output = {k: v for k, v in report.items() if k != "all_results"}
        print(json.dumps(output, indent=2))
    else:
        print_report(report)

    print(f"\n Benchmark completed in {elapsed:.1f}s", file=status)

    # Save results (the saved file keeps all_results, unlike the --json output)
    results_path = os.path.join(
        os.path.dirname(args.checkpoint),
        "benchmark-results.json"
    )
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f" Results saved to {results_path}", file=status)
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|