jfl 0.8.1 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (296) hide show
  1. package/README.md +35 -4
  2. package/dist/commands/digest.d.ts +6 -0
  3. package/dist/commands/digest.d.ts.map +1 -1
  4. package/dist/commands/digest.js +70 -69
  5. package/dist/commands/digest.js.map +1 -1
  6. package/dist/commands/doctor.d.ts +1 -0
  7. package/dist/commands/doctor.d.ts.map +1 -1
  8. package/dist/commands/doctor.js +30 -1
  9. package/dist/commands/doctor.js.map +1 -1
  10. package/dist/commands/eval.d.ts +40 -0
  11. package/dist/commands/eval.d.ts.map +1 -1
  12. package/dist/commands/eval.js +8 -8
  13. package/dist/commands/eval.js.map +1 -1
  14. package/dist/commands/findings.d.ts +7 -0
  15. package/dist/commands/findings.d.ts.map +1 -1
  16. package/dist/commands/findings.js +4 -4
  17. package/dist/commands/findings.js.map +1 -1
  18. package/dist/commands/ide.d.ts +2 -1
  19. package/dist/commands/ide.d.ts.map +1 -1
  20. package/dist/commands/ide.js +61 -1
  21. package/dist/commands/ide.js.map +1 -1
  22. package/dist/commands/init-from-service.d.ts +15 -0
  23. package/dist/commands/init-from-service.d.ts.map +1 -0
  24. package/dist/commands/init-from-service.js +541 -0
  25. package/dist/commands/init-from-service.js.map +1 -0
  26. package/dist/commands/init.d.ts +1 -0
  27. package/dist/commands/init.d.ts.map +1 -1
  28. package/dist/commands/init.js +32 -1
  29. package/dist/commands/init.js.map +1 -1
  30. package/dist/commands/kanban.d.ts.map +1 -1
  31. package/dist/commands/kanban.js +13 -4
  32. package/dist/commands/kanban.js.map +1 -1
  33. package/dist/commands/linear.d.ts +41 -0
  34. package/dist/commands/linear.d.ts.map +1 -0
  35. package/dist/commands/linear.js +715 -0
  36. package/dist/commands/linear.js.map +1 -0
  37. package/dist/commands/peter.d.ts.map +1 -1
  38. package/dist/commands/peter.js +232 -25
  39. package/dist/commands/peter.js.map +1 -1
  40. package/dist/commands/portfolio.d.ts +5 -0
  41. package/dist/commands/portfolio.d.ts.map +1 -1
  42. package/dist/commands/portfolio.js +193 -203
  43. package/dist/commands/portfolio.js.map +1 -1
  44. package/dist/commands/predict.d.ts +19 -0
  45. package/dist/commands/predict.d.ts.map +1 -1
  46. package/dist/commands/predict.js +4 -4
  47. package/dist/commands/predict.js.map +1 -1
  48. package/dist/commands/services.d.ts.map +1 -1
  49. package/dist/commands/services.js +146 -0
  50. package/dist/commands/services.js.map +1 -1
  51. package/dist/commands/setup.d.ts.map +1 -1
  52. package/dist/commands/setup.js +279 -20
  53. package/dist/commands/setup.js.map +1 -1
  54. package/dist/commands/start.d.ts +25 -0
  55. package/dist/commands/start.d.ts.map +1 -0
  56. package/dist/commands/start.js +191 -0
  57. package/dist/commands/start.js.map +1 -0
  58. package/dist/commands/telemetry-monitor.d.ts +11 -0
  59. package/dist/commands/telemetry-monitor.d.ts.map +1 -0
  60. package/dist/commands/telemetry-monitor.js +224 -0
  61. package/dist/commands/telemetry-monitor.js.map +1 -0
  62. package/dist/commands/telemetry-test.d.ts +11 -0
  63. package/dist/commands/telemetry-test.d.ts.map +1 -0
  64. package/dist/commands/telemetry-test.js +67 -0
  65. package/dist/commands/telemetry-test.js.map +1 -0
  66. package/dist/commands/tenet-agents.d.ts +13 -0
  67. package/dist/commands/tenet-agents.d.ts.map +1 -0
  68. package/dist/commands/tenet-agents.js +191 -0
  69. package/dist/commands/tenet-agents.js.map +1 -0
  70. package/dist/commands/tenet-setup.d.ts +20 -0
  71. package/dist/commands/tenet-setup.d.ts.map +1 -0
  72. package/dist/commands/tenet-setup.js +135 -0
  73. package/dist/commands/tenet-setup.js.map +1 -0
  74. package/dist/commands/train.d.ts +18 -0
  75. package/dist/commands/train.d.ts.map +1 -1
  76. package/dist/commands/train.js +182 -0
  77. package/dist/commands/train.js.map +1 -1
  78. package/dist/commands/viz.d.ts +33 -0
  79. package/dist/commands/viz.d.ts.map +1 -1
  80. package/dist/commands/viz.js +9 -9
  81. package/dist/commands/viz.js.map +1 -1
  82. package/dist/commands/whoami.d.ts +2 -0
  83. package/dist/commands/whoami.d.ts.map +1 -0
  84. package/dist/commands/whoami.js +24 -0
  85. package/dist/commands/whoami.js.map +1 -0
  86. package/dist/index.js +230 -30
  87. package/dist/index.js.map +1 -1
  88. package/dist/lib/advanced-setup.d.ts +78 -0
  89. package/dist/lib/advanced-setup.d.ts.map +1 -0
  90. package/dist/lib/advanced-setup.js +433 -0
  91. package/dist/lib/advanced-setup.js.map +1 -0
  92. package/dist/lib/agent-config.d.ts +33 -0
  93. package/dist/lib/agent-config.d.ts.map +1 -1
  94. package/dist/lib/agent-config.js +26 -0
  95. package/dist/lib/agent-config.js.map +1 -1
  96. package/dist/lib/counterfactual-training-bridge.d.ts +114 -0
  97. package/dist/lib/counterfactual-training-bridge.d.ts.map +1 -0
  98. package/dist/lib/counterfactual-training-bridge.js +322 -0
  99. package/dist/lib/counterfactual-training-bridge.js.map +1 -0
  100. package/dist/lib/discovery-agent.d.ts +48 -0
  101. package/dist/lib/discovery-agent.d.ts.map +1 -0
  102. package/dist/lib/discovery-agent.js +111 -0
  103. package/dist/lib/discovery-agent.js.map +1 -0
  104. package/dist/lib/flow-engine.d.ts.map +1 -1
  105. package/dist/lib/flow-engine.js +46 -8
  106. package/dist/lib/flow-engine.js.map +1 -1
  107. package/dist/lib/gtm-generator.d.ts +29 -0
  108. package/dist/lib/gtm-generator.d.ts.map +1 -0
  109. package/dist/lib/gtm-generator.js +252 -0
  110. package/dist/lib/gtm-generator.js.map +1 -0
  111. package/dist/lib/hub-health.d.ts +40 -0
  112. package/dist/lib/hub-health.d.ts.map +1 -0
  113. package/dist/lib/hub-health.js +89 -0
  114. package/dist/lib/hub-health.js.map +1 -0
  115. package/dist/lib/invariant-monitor.d.ts +6 -2
  116. package/dist/lib/invariant-monitor.d.ts.map +1 -1
  117. package/dist/lib/invariant-monitor.js +89 -2
  118. package/dist/lib/invariant-monitor.js.map +1 -1
  119. package/dist/lib/journal-analyzer.d.ts +71 -0
  120. package/dist/lib/journal-analyzer.d.ts.map +1 -0
  121. package/dist/lib/journal-analyzer.js +306 -0
  122. package/dist/lib/journal-analyzer.js.map +1 -0
  123. package/dist/lib/linear-client.d.ts +73 -0
  124. package/dist/lib/linear-client.d.ts.map +1 -0
  125. package/dist/lib/linear-client.js +112 -0
  126. package/dist/lib/linear-client.js.map +1 -0
  127. package/dist/lib/linear-id-map.d.ts +20 -0
  128. package/dist/lib/linear-id-map.d.ts.map +1 -0
  129. package/dist/lib/linear-id-map.js +59 -0
  130. package/dist/lib/linear-id-map.js.map +1 -0
  131. package/dist/lib/linear-kanban.d.ts +66 -0
  132. package/dist/lib/linear-kanban.d.ts.map +1 -0
  133. package/dist/lib/linear-kanban.js +175 -0
  134. package/dist/lib/linear-kanban.js.map +1 -0
  135. package/dist/lib/onboarding.d.ts +40 -0
  136. package/dist/lib/onboarding.d.ts.map +1 -0
  137. package/dist/lib/onboarding.js +213 -0
  138. package/dist/lib/onboarding.js.map +1 -0
  139. package/dist/lib/physical-world-model.d.ts +50 -0
  140. package/dist/lib/physical-world-model.d.ts.map +1 -0
  141. package/dist/lib/physical-world-model.js +251 -0
  142. package/dist/lib/physical-world-model.js.map +1 -0
  143. package/dist/lib/planning-loop.d.ts +157 -0
  144. package/dist/lib/planning-loop.d.ts.map +1 -0
  145. package/dist/lib/planning-loop.js +537 -0
  146. package/dist/lib/planning-loop.js.map +1 -0
  147. package/dist/lib/policy-head.d.ts +13 -0
  148. package/dist/lib/policy-head.d.ts.map +1 -1
  149. package/dist/lib/policy-head.js +168 -2
  150. package/dist/lib/policy-head.js.map +1 -1
  151. package/dist/lib/resource-optimizer-middleware.d.ts +39 -0
  152. package/dist/lib/resource-optimizer-middleware.d.ts.map +1 -0
  153. package/dist/lib/resource-optimizer-middleware.js +222 -0
  154. package/dist/lib/resource-optimizer-middleware.js.map +1 -0
  155. package/dist/lib/resource-optimizer.d.ts +71 -0
  156. package/dist/lib/resource-optimizer.d.ts.map +1 -0
  157. package/dist/lib/resource-optimizer.js +228 -0
  158. package/dist/lib/resource-optimizer.js.map +1 -0
  159. package/dist/lib/rl-manager.d.ts +74 -0
  160. package/dist/lib/rl-manager.d.ts.map +1 -0
  161. package/dist/lib/rl-manager.js +245 -0
  162. package/dist/lib/rl-manager.js.map +1 -0
  163. package/dist/lib/service-analyzer.d.ts +76 -0
  164. package/dist/lib/service-analyzer.d.ts.map +1 -0
  165. package/dist/lib/service-analyzer.js +704 -0
  166. package/dist/lib/service-analyzer.js.map +1 -0
  167. package/dist/lib/service-gtm.js +2 -2
  168. package/dist/lib/service-gtm.js.map +1 -1
  169. package/dist/lib/service-questionnaire.d.ts +11 -0
  170. package/dist/lib/service-questionnaire.d.ts.map +1 -0
  171. package/dist/lib/service-questionnaire.js +89 -0
  172. package/dist/lib/service-questionnaire.js.map +1 -0
  173. package/dist/lib/setup/agent-generator.d.ts +2 -0
  174. package/dist/lib/setup/agent-generator.d.ts.map +1 -1
  175. package/dist/lib/setup/agent-generator.js +128 -4
  176. package/dist/lib/setup/agent-generator.js.map +1 -1
  177. package/dist/lib/setup/flow-generator.d.ts +10 -0
  178. package/dist/lib/setup/flow-generator.d.ts.map +1 -0
  179. package/dist/lib/setup/flow-generator.js +113 -0
  180. package/dist/lib/setup/flow-generator.js.map +1 -0
  181. package/dist/lib/setup/invariant-bridge.d.ts +91 -0
  182. package/dist/lib/setup/invariant-bridge.d.ts.map +1 -0
  183. package/dist/lib/setup/invariant-bridge.js +384 -0
  184. package/dist/lib/setup/invariant-bridge.js.map +1 -0
  185. package/dist/lib/setup/spec-generator.d.ts +41 -5
  186. package/dist/lib/setup/spec-generator.d.ts.map +1 -1
  187. package/dist/lib/setup/spec-generator.js +503 -29
  188. package/dist/lib/setup/spec-generator.js.map +1 -1
  189. package/dist/lib/setup/starter-intelligence.d.ts +25 -0
  190. package/dist/lib/setup/starter-intelligence.d.ts.map +1 -0
  191. package/dist/lib/setup/starter-intelligence.js +309 -0
  192. package/dist/lib/setup/starter-intelligence.js.map +1 -0
  193. package/dist/lib/stratus-client.js +1 -1
  194. package/dist/lib/stratus-client.js.map +1 -1
  195. package/dist/lib/surface-agent.d.ts +78 -0
  196. package/dist/lib/surface-agent.d.ts.map +1 -0
  197. package/dist/lib/surface-agent.js +105 -0
  198. package/dist/lib/surface-agent.js.map +1 -0
  199. package/dist/lib/surface-coordination-example.d.ts +30 -0
  200. package/dist/lib/surface-coordination-example.d.ts.map +1 -0
  201. package/dist/lib/surface-coordination-example.js +164 -0
  202. package/dist/lib/surface-coordination-example.js.map +1 -0
  203. package/dist/lib/telemetry/physical-world-collector.d.ts +15 -0
  204. package/dist/lib/telemetry/physical-world-collector.d.ts.map +1 -0
  205. package/dist/lib/telemetry/physical-world-collector.js +177 -0
  206. package/dist/lib/telemetry/physical-world-collector.js.map +1 -0
  207. package/dist/lib/telemetry/training-bridge.d.ts +51 -0
  208. package/dist/lib/telemetry/training-bridge.d.ts.map +1 -0
  209. package/dist/lib/telemetry/training-bridge.js +185 -0
  210. package/dist/lib/telemetry/training-bridge.js.map +1 -0
  211. package/dist/lib/telemetry.d.ts +2 -1
  212. package/dist/lib/telemetry.d.ts.map +1 -1
  213. package/dist/lib/telemetry.js +23 -2
  214. package/dist/lib/telemetry.js.map +1 -1
  215. package/dist/lib/tenet-board-agent.d.ts +52 -0
  216. package/dist/lib/tenet-board-agent.d.ts.map +1 -0
  217. package/dist/lib/tenet-board-agent.js +226 -0
  218. package/dist/lib/tenet-board-agent.js.map +1 -0
  219. package/dist/lib/tenet-ide-agent.d.ts +40 -0
  220. package/dist/lib/tenet-ide-agent.d.ts.map +1 -0
  221. package/dist/lib/tenet-ide-agent.js +199 -0
  222. package/dist/lib/tenet-ide-agent.js.map +1 -0
  223. package/dist/lib/workspace/data-pipeline.d.ts.map +1 -1
  224. package/dist/lib/workspace/data-pipeline.js +27 -5
  225. package/dist/lib/workspace/data-pipeline.js.map +1 -1
  226. package/dist/lib/workspace/sidebar-runner.d.ts +13 -0
  227. package/dist/lib/workspace/sidebar-runner.d.ts.map +1 -0
  228. package/dist/lib/workspace/sidebar-runner.js +419 -0
  229. package/dist/lib/workspace/sidebar-runner.js.map +1 -0
  230. package/dist/lib/workspace/surface-registry.d.ts.map +1 -1
  231. package/dist/lib/workspace/surface-registry.js +9 -1
  232. package/dist/lib/workspace/surface-registry.js.map +1 -1
  233. package/dist/lib/workspace/surfaces/agent-overview.d.ts +3 -3
  234. package/dist/lib/workspace/surfaces/agent-overview.d.ts.map +1 -1
  235. package/dist/lib/workspace/surfaces/agent-overview.js +3 -3
  236. package/dist/lib/workspace/surfaces/agent-overview.js.map +1 -1
  237. package/dist/lib/workspace/surfaces/index.d.ts +3 -0
  238. package/dist/lib/workspace/surfaces/index.d.ts.map +1 -1
  239. package/dist/lib/workspace/surfaces/index.js +3 -0
  240. package/dist/lib/workspace/surfaces/index.js.map +1 -1
  241. package/dist/lib/workspace/surfaces/kanban.d.ts +15 -0
  242. package/dist/lib/workspace/surfaces/kanban.d.ts.map +1 -0
  243. package/dist/lib/workspace/surfaces/kanban.js +43 -0
  244. package/dist/lib/workspace/surfaces/kanban.js.map +1 -0
  245. package/dist/lib/workspace/surfaces/physical-world.d.ts +15 -0
  246. package/dist/lib/workspace/surfaces/physical-world.d.ts.map +1 -0
  247. package/dist/lib/workspace/surfaces/physical-world.js +37 -0
  248. package/dist/lib/workspace/surfaces/physical-world.js.map +1 -0
  249. package/dist/lib/workspace/surfaces/sidebar.d.ts +22 -0
  250. package/dist/lib/workspace/surfaces/sidebar.d.ts.map +1 -0
  251. package/dist/lib/workspace/surfaces/sidebar.js +94 -0
  252. package/dist/lib/workspace/surfaces/sidebar.js.map +1 -0
  253. package/dist/lib/workspace/tmux-adapter.d.ts +8 -5
  254. package/dist/lib/workspace/tmux-adapter.d.ts.map +1 -1
  255. package/dist/lib/workspace/tmux-adapter.js +38 -7
  256. package/dist/lib/workspace/tmux-adapter.js.map +1 -1
  257. package/dist/lib/workspace/tmux-sidebar.d.ts +14 -0
  258. package/dist/lib/workspace/tmux-sidebar.d.ts.map +1 -0
  259. package/dist/lib/workspace/tmux-sidebar.js +230 -0
  260. package/dist/lib/workspace/tmux-sidebar.js.map +1 -0
  261. package/dist/types/flows.d.ts +2 -1
  262. package/dist/types/flows.d.ts.map +1 -1
  263. package/dist/types/physical-world-model.d.ts +65 -0
  264. package/dist/types/physical-world-model.d.ts.map +1 -0
  265. package/dist/types/physical-world-model.js +43 -0
  266. package/dist/types/physical-world-model.js.map +1 -0
  267. package/dist/types/telemetry.d.ts +37 -0
  268. package/dist/types/telemetry.d.ts.map +1 -1
  269. package/dist/types/world-model.d.ts.map +1 -1
  270. package/dist/types/world-model.js +14 -7
  271. package/dist/types/world-model.js.map +1 -1
  272. package/dist/utils/context-hub-port.d.ts.map +1 -1
  273. package/dist/utils/context-hub-port.js +6 -1
  274. package/dist/utils/context-hub-port.js.map +1 -1
  275. package/dist/utils/jfl-config.d.ts +7 -2
  276. package/dist/utils/jfl-config.d.ts.map +1 -1
  277. package/dist/utils/jfl-config.js +14 -4
  278. package/dist/utils/jfl-config.js.map +1 -1
  279. package/package.json +3 -2
  280. package/packages/pi/extensions/context.ts +51 -1
  281. package/packages/pi/extensions/hub-tools.ts +247 -0
  282. package/packages/pi/extensions/index.ts +38 -6
  283. package/packages/pi/extensions/memory-tool.ts +84 -4
  284. package/packages/pi/extensions/service-skills.ts +214 -0
  285. package/scripts/telemetry-dashboard.sh +44 -0
  286. package/scripts/test-planning-loop-e2e.ts +181 -0
  287. package/scripts/test-server-inference.ts +49 -0
  288. package/scripts/test-state-sensitivity.ts +32 -0
  289. package/scripts/train/v2/benchmark.py +661 -0
  290. package/scripts/train/v2/generate_balanced.py +439 -0
  291. package/scripts/train/v2/generate_hard_negatives.py +219 -0
  292. package/scripts/train/v2/infer.py +149 -36
  293. package/scripts/train/v2/infer_server.py +224 -0
  294. package/scripts/train/v2/online_train.py +576 -0
  295. package/scripts/train/v2/precompute.py +24 -6
  296. package/template/CLAUDE.md +74 -132
@@ -0,0 +1,661 @@
1
+ """
2
+ Benchmark Scenarios for PolicyHead Evaluation.
3
+
4
+ Creates synthetic evaluation environments with known "right answers"
5
+ for rapid PolicyHead iteration. Test in minutes, not weeks.
6
+
7
+ Drew's recommendation (section 8.6):
8
+ - Create benchmark agent scenarios for rapid iteration
9
+ - Use World Model to create synthetic state snapshots
10
+ - Counterfactual Engine generates ground truth outcomes
11
+ - PolicyHead evaluated on these scenarios before deploying to real agents
12
+
13
+ Scenarios:
14
+ 1. Fix failing test — agent sees failing tests, should select fix_bug
15
+ 2. Refactor messy code — high code churn, should select refactor_code
16
+ 3. Optimize slow build — build failing/slow, should select optimize_performance
17
+ 4. Handle hub crash — hub down, agents stranded, should select fix_bug
18
+ 5. Improve coverage — low test counts, should select add_tests
19
+ 6. Stale dependencies — security issues, should select dependency_update
20
+ 7. Missing docs — new features without docs, should select update_docs
21
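+ 8. Data pipeline broken — ETL failures, should select data_pipeline
+ 9. Add monitoring — system lacks observability, should select add_monitoring
+ 10. New feature request — clear feature ask, should select add_feature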
22
+
23
+ Each scenario defines:
24
+ - A synthetic WorldState snapshot (state text for embedding)
25
+ - A goal string
26
+ - The expected correct tool
27
+ - Difficulty level (easy/medium/hard)
28
+ - Variations (to test generalization)
29
+
30
+ Usage:
31
+ python benchmark.py --checkpoint .jfl/checkpoints/best_policy_head.pt
32
+ python benchmark.py --checkpoint .jfl/checkpoints/best_policy_head.pt --json
33
+ python benchmark.py --generate # Write benchmark.jsonl for offline eval
34
+ """
35
+
36
+ import json
37
+ import os
38
+ import sys
39
+ import argparse
40
+ import time
41
+ from dataclasses import dataclass, asdict
42
+
43
+ # ============================================================================
44
+ # Scenario Definitions
45
+ # ============================================================================
46
+
47
@dataclass
class BenchmarkScenario:
    """One synthetic evaluation case with a known correct tool.

    Instances are plain records: the benchmark embeds ``state_text`` and
    ``goal``, asks the model for a tool, and compares the answer with
    ``correct_tool``.
    """

    # Unique identifier, e.g. "fix-failing-test-v0".
    id: str
    # Human-readable title including the variant number.
    name: str
    # Short explanation of what the scenario is probing.
    description: str
    # One of: easy, medium, hard.
    difficulty: str
    # Newline-joined synthetic state snapshot used for the state embedding.
    state_text: str
    # Goal string used for the goal embedding.
    goal: str
    # Name of the tool the model is expected to pick.
    correct_tool: str
    # One of: diagnostic, scheduling, recovery, optimization.
    category: str
    # Free-form labels attached to the scenario.
    tags: list[str]
58
+
59
+
60
def generate_scenarios() -> list[BenchmarkScenario]:
    """Build the complete benchmark suite.

    Ten scenario families are emitted in a fixed order, each with three or
    four variants. A variant's id is ``"<slug>-v<variant>"`` and its state
    text is six newline-joined lines (Agent, Composite, Tests, Trajectory,
    Dimensions, Recent deltas).

    Returns:
        The scenarios in family order, variants in ascending order.
    """
    suite: list[BenchmarkScenario] = []

    def add(slug, title, variant, *, description, difficulty, agent,
            composite, tests, trajectory, dimensions, deltas, goal, tool,
            category, tags):
        """Assemble one scenario from its parts and append it to the suite."""
        snapshot = "\n".join([
            f"Agent: {agent}",
            f"Composite: {composite}",
            f"Tests: {tests}",
            f"Trajectory: {trajectory}",
            f"Dimensions: {dimensions}",
            f"Recent deltas: {deltas}",
        ])
        suite.append(BenchmarkScenario(
            id=f"{slug}-v{variant}",
            name=f"{title} (variant {variant})",
            description=description,
            difficulty=difficulty,
            state_text=snapshot,
            goal=goal,
            correct_tool=tool,
            category=category,
            tags=tags,
        ))

    # ── Scenario 1: Fix Failing Test (Easy) ──────────────────────────
    failing_test_cases = [
        (3, 0.72, "Fix 3 failing tests in auth module"),
        (1, 0.91, "Fix flaky test in session-manager.test.ts"),
        (7, 0.45, "Multiple test failures after refactor — fix regressions"),
        (1, 0.88, "TypeError in user-service.test.ts line 42"),
    ]
    for variant, (tests_failing, composite, goal_text) in enumerate(failing_test_cases):
        add(
            "fix-failing-test", "Fix Failing Test", variant,
            description="Agent sees failing tests, should identify fix_bug as the right action",
            # The first two variants are the easy ones.
            difficulty="medium" if variant >= 2 else "easy",
            agent="test-fixer",
            composite=f"{composite:.4f}",
            tests=f"{20 - tests_failing}/20",
            trajectory=variant + 1,
            dimensions=(
                f"test_pass_rate={1 - tests_failing/20:.4f}, "
                "build_health=0.9000, code_quality=0.8500"
            ),
            deltas="-0.0300, -0.0100",
            goal=goal_text,
            tool="fix_bug",
            category="diagnostic",
            tags=["tests", "regression", "bug"],
        )

    # ── Scenario 2: Refactor Messy Code (Medium) ─────────────────────
    refactor_cases = [
        (450, "high", "Reduce complexity in orchestrator module — too many responsibilities"),
        (200, "medium", "Extract shared logic from peter.ts and eval.ts into utils"),
        (800, "critical", "Module has 2400 lines, 15 functions over 100 lines — needs decomposition"),
        (100, "low", "Clean up unused imports and dead code paths in training pipeline"),
    ]
    # The complexity label is carried in the table but not rendered anywhere.
    for variant, (churn, _complexity, goal_text) in enumerate(refactor_cases):
        add(
            "refactor-messy-code", "Refactor Messy Code", variant,
            description="High complexity or code churn indicates refactoring needed",
            difficulty="medium",
            agent="code-quality",
            composite="0.7200",
            tests="45/45",
            trajectory=variant + 3,
            dimensions=(
                "code_quality=0.5500, test_pass_rate=1.0000, "
                f"build_health=1.0000, code_churn={churn}"
            ),
            deltas="+0.0050, +0.0020, -0.0010",
            goal=goal_text,
            tool="refactor_code",
            category="optimization",
            tags=["complexity", "maintainability", "code-quality"],
        )

    # ── Scenario 3: Optimize Slow Build (Medium) ─────────────────────
    slow_build_cases = [
        ("45s", "Build taking 45s — optimize TypeScript compilation"),
        ("120s", "CI pipeline runs 2 minutes — find bottleneck"),
        ("30s", "Bundle size 2.4MB — tree-shake unused dependencies"),
    ]
    # The build time is carried in the table but not rendered anywhere.
    for variant, (_build_time, goal_text) in enumerate(slow_build_cases):
        add(
            "optimize-build", "Optimize Slow Build", variant,
            description="Build performance degraded, needs optimization",
            difficulty="medium",
            agent="build-optimizer",
            composite="0.6800",
            tests="30/30",
            trajectory=variant + 2,
            dimensions="build_health=0.4000, test_pass_rate=1.0000, code_quality=0.8000",
            deltas="-0.0200, -0.0150",
            goal=goal_text,
            tool="optimize_performance",
            category="optimization",
            tags=["build", "performance", "ci"],
        )

    # ── Scenario 4: Handle Hub Crash (Hard) ──────────────────────────
    hub_crash_cases = [
        (3, "Hub crashed — 3 agents stranded, need to restore connectivity"),
        (6, "100% agent stranding — hub process died, restart and recover"),
        (1, "Hub WebSocket connection dropped — single agent lost progress"),
        (4, "Hub OOM killed — reduce memory usage and restart with safeguards"),
    ]
    for variant, (agents_stranded, goal_text) in enumerate(hub_crash_cases):
        add(
            "hub-crash", "Handle Hub Crash", variant,
            description="Hub crash causing agent stranding — critical infrastructure fix",
            difficulty="hard",
            agent="hub-sentinel",
            composite="0.3000",
            tests="10/15",
            trajectory=variant + 5,
            dimensions=(
                "hub_health=0.0000, "
                f"agent_stranding={agents_stranded/6:.4f}, build_health=0.5000"
            ),
            deltas="-0.1500, -0.2000, -0.0500",
            goal=goal_text,
            tool="fix_bug",
            category="recovery",
            tags=["hub", "crash", "infrastructure", "critical"],
        )

    # ── Scenario 5: Improve Test Coverage (Easy) ─────────────────────
    coverage_cases = [
        (0.45, "Test coverage at 45% — add tests for uncovered modules"),
        (0.60, "New feature shipped without tests — add unit tests"),
        (0.72, "Coverage dropped after refactor — restore test coverage"),
    ]
    for variant, (coverage, goal_text) in enumerate(coverage_cases):
        add(
            "improve-coverage", "Improve Test Coverage", variant,
            description="Low test coverage, should add tests",
            difficulty="easy",
            agent="test-coverage",
            composite=f"{coverage:.4f}",
            tests=f"{int(coverage * 50)}/50",
            trajectory=variant + 1,
            dimensions=(
                f"test_coverage={coverage:.4f}, "
                "test_pass_rate=1.0000, code_quality=0.8000"
            ),
            deltas="+0.0100, +0.0050",
            goal=goal_text,
            tool="add_tests",
            category="diagnostic",
            tags=["tests", "coverage", "quality"],
        )

    # ── Scenarios 6-10: families whose variants differ only in the goal ──
    goal_only_families = [
        # Scenario 6: Stale Dependencies (Medium)
        {
            "slug": "stale-deps",
            "title": "Stale Dependencies",
            "description": "Security or compatibility issues from outdated dependencies",
            "difficulty": "medium",
            "agent": "dependency-updater",
            "composite": "0.7500",
            "tests": "40/40",
            "trajectory_base": 1,
            "dimensions": "security_score=0.4000, test_pass_rate=1.0000, build_health=0.9000",
            "deltas": "-0.0050, +0.0020",
            "tool": "dependency_update",
            "category": "optimization",
            "tags": ("security", "dependencies", "maintenance"),
            "goals": (
                "3 critical CVEs in dependencies — update packages",
                "Node.js version EOL next month — migrate dependencies",
                "Lockfile conflicts — resolve dependency tree issues",
            ),
        },
        # Scenario 7: Missing Documentation (Easy)
        {
            "slug": "missing-docs",
            "title": "Missing Documentation",
            "description": "Documentation gaps need filling",
            "difficulty": "easy",
            "agent": "docs-updater",
            "composite": "0.8200",
            "tests": "35/35",
            "trajectory_base": 1,
            "dimensions": "doc_coverage=0.4000, test_pass_rate=1.0000, code_quality=0.9000",
            "deltas": "+0.0100, +0.0200",
            "tool": "update_docs",
            "category": "diagnostic",
            "tags": ("docs", "documentation", "communication"),
            "goals": (
                "3 new CLI commands have no documentation — write usage docs",
                "API changed but README is stale — update docs",
                "SPEC.md doesn't reflect recent architecture decisions — update",
            ),
        },
        # Scenario 8: Data Pipeline Broken (Hard)
        {
            "slug": "data-pipeline",
            "title": "Data Pipeline Broken",
            "description": "Data processing pipeline needs fixing",
            "difficulty": "hard",
            "agent": "data-engineer",
            "composite": "0.5500",
            "tests": "20/25",
            "trajectory_base": 4,
            "dimensions": "pipeline_health=0.2000, data_quality=0.4000, test_pass_rate=0.8000",
            "deltas": "-0.0500, -0.0300, -0.0200",
            "tool": "data_pipeline",
            "category": "recovery",
            "tags": ("data", "pipeline", "etl", "training"),
            "goals": (
                "Training buffer transform failing — v1 to v2 conversion errors",
                "Embedding precompute script OOM on large dataset — fix pipeline",
                "Counterfactual generation producing invalid scenarios — debug pipeline",
            ),
        },
        # Scenario 9: Add Monitoring (Medium)
        {
            "slug": "add-monitoring",
            "title": "Add Monitoring",
            "description": "System lacks observability, needs monitoring",
            "difficulty": "medium",
            "agent": "observability",
            "composite": "0.7000",
            "tests": "30/30",
            "trajectory_base": 2,
            "dimensions": "observability=0.2000, test_pass_rate=1.0000, build_health=0.8500",
            "deltas": "+0.0050, +0.0030",
            "tool": "add_monitoring",
            "category": "optimization",
            "tags": ("monitoring", "observability", "telemetry"),
            "goals": (
                "No visibility into agent performance — add telemetry",
                "Hub crashes go undetected — add health check endpoint",
                "Training pipeline has no metrics — add loss/accuracy logging",
            ),
        },
        # Scenario 10: New Feature Request (Hard — ambiguous)
        {
            "slug": "new-feature",
            "title": "New Feature Request",
            "description": "Clear feature request — should select add_feature",
            "difficulty": "hard",
            "agent": "feature-builder",
            "composite": "0.8000",
            "tests": "40/40",
            "trajectory_base": 1,
            "dimensions": "feature_completeness=0.6000, test_pass_rate=1.0000, code_quality=0.8500",
            "deltas": "+0.0200, +0.0100, +0.0150",
            "tool": "add_feature",
            "category": "diagnostic",
            "tags": ("feature", "implementation", "new"),
            "goals": (
                "Add planning loop that connects PolicyHead to DynamicsModel",
                "Implement experience replay buffer for online learning",
                "Build counterfactual training bridge for synthetic data generation",
            ),
        },
    ]
    for family in goal_only_families:
        for variant, goal_text in enumerate(family["goals"]):
            add(
                family["slug"], family["title"], variant,
                description=family["description"],
                difficulty=family["difficulty"],
                agent=family["agent"],
                composite=family["composite"],
                tests=family["tests"],
                trajectory=variant + family["trajectory_base"],
                dimensions=family["dimensions"],
                deltas=family["deltas"],
                goal=goal_text,
                tool=family["tool"],
                category=family["category"],
                # Fresh list per scenario so instances never share a tags object.
                tags=list(family["tags"]),
            )

    return suite
318
+
319
+
320
+ # ============================================================================
321
+ # Benchmark Runner
322
+ # ============================================================================
323
+
324
def run_benchmark(checkpoint_path: str, scenarios: list[BenchmarkScenario], device: str = "cpu") -> dict:
    """Evaluate every scenario against a saved PolicyHead checkpoint.

    Args:
        checkpoint_path: Path to a checkpoint file. It must contain the keys
            "config", "tool_to_index", "num_tools" and "model_state_dict".
        scenarios: Scenarios to evaluate, in order.
        device: Torch device string used for loading and inference.

    Returns:
        Whatever ``aggregate_results`` produces for the per-scenario results.
    """
    import torch
    from model import PolicyHead
    from dataset import load_embedding_cache

    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects; only point this at checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    cfg = checkpoint["config"]
    tool_to_index = checkpoint["tool_to_index"]
    # Reverse mapping: predicted index -> tool name.
    index_to_tool = {index: tool for tool, index in tool_to_index.items()}

    # Rebuild the network with the hyperparameters stored in the checkpoint.
    policy = PolicyHead(
        embedding_dim=cfg["embedding_dim"],
        hidden_dim=cfg["hidden_dim"],
        num_tools=checkpoint["num_tools"],
        num_layers=cfg["num_layers"],
        num_heads=cfg["num_heads"],
        dropout=cfg.get("dropout", 0.1),
    ).to(device)
    policy.load_state_dict(checkpoint["model_state_dict"])
    policy.eval()

    # The embedding cache directory is derived from the checkpoint directory
    # by substituting "checkpoints" with "v2-data" in its path.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    cache_dir = checkpoint_dir.replace("checkpoints", "v2-data")
    embeddings_matrix, text_to_idx = load_embedding_cache(cache_dir)

    if embeddings_matrix is not None:
        embedding_dim = embeddings_matrix.shape[1]
    else:
        print("WARNING: No embedding cache — using zero vectors")
        embedding_dim = cfg["embedding_dim"]

    # One result dict per scenario, in input order.
    per_scenario = [
        evaluate_scenario(
            policy, item, tool_to_index, index_to_tool,
            embeddings_matrix, text_to_idx, embedding_dim, device
        )
        for item in scenarios
    ]

    return aggregate_results(per_scenario, scenarios)
367
+
368
+
369
def evaluate_scenario(
    model, scenario, tool_to_index, index_to_tool,
    embeddings_matrix, text_to_idx, embedding_dim, device
) -> dict:
    """Score one scenario and report how the expected tool ranked.

    Args:
        model: Object exposing ``predict(state, goal, top_k=...)`` that returns
            a mapping with "top_k_indices", "top_k_probs" and "all_probs".
        scenario: Scenario providing ``state_text``, ``goal``, ``correct_tool``,
            ``id``, ``difficulty``, ``category`` and ``tags``.
        tool_to_index: Tool name -> class index.
        index_to_tool: Class index -> tool name.
        embeddings_matrix: Embedding cache passed through to ``get_embedding``.
        text_to_idx: Text -> row lookup passed through to ``get_embedding``.
        embedding_dim: Embedding width passed through to ``get_embedding``.
        device: Torch device the input tensors are moved to.

    Returns:
        A dict with the prediction, correctness flags, the 1-based rank of the
        expected tool (-1 when it cannot be ranked), the top probability and
        the top-5 tool/probability pairs.
    """
    import torch

    def embed(text):
        """Look up the embedding for *text* and turn it into a batch of one."""
        vector = get_embedding(text, embeddings_matrix, text_to_idx, embedding_dim)
        return torch.tensor(vector, dtype=torch.float32).unsqueeze(0).to(device)

    state_tensor = embed(scenario.state_text)
    goal_tensor = embed(scenario.goal)

    with torch.no_grad():
        prediction = model.predict(state_tensor, goal_tensor, top_k=5)

    # Only the first (and only) batch row is inspected.
    top_indices = prediction["top_k_indices"][0].cpu().tolist()
    top_probs = prediction["top_k_probs"][0].cpu().tolist()

    predicted_tool = index_to_tool.get(top_indices[0], "unknown")
    expected_index = tool_to_index.get(scenario.correct_tool)
    is_known_tool = expected_index is not None

    # A tool missing from the checkpoint's vocabulary can never be a hit.
    in_top3 = is_known_tool and expected_index in top_indices[:3]
    in_top5 = is_known_tool and expected_index in top_indices[:5]

    correct_rank = -1
    if is_known_tool:
        all_probs = prediction["all_probs"][0].cpu().tolist()
        # Class indices ordered from most to least probable.
        ranking = sorted(range(len(all_probs)), key=lambda i: -all_probs[i])
        correct_rank = next(
            (position for position, idx in enumerate(ranking, start=1)
             if idx == expected_index),
            -1,
        )

    top5 = []
    for idx, prob in zip(top_indices, top_probs):
        top5.append({"tool": index_to_tool.get(idx, "?"), "prob": prob})

    return {
        "scenario_id": scenario.id,
        "correct_tool": scenario.correct_tool,
        "predicted_tool": predicted_tool,
        "correct": predicted_tool == scenario.correct_tool,
        "in_top3": in_top3,
        "in_top5": in_top5,
        "correct_rank": correct_rank,
        "confidence": top_probs[0],
        "difficulty": scenario.difficulty,
        "category": scenario.category,
        "tags": scenario.tags,
        "top5": top5,
    }
424
+
425
+
426
def get_embedding(text: str, embeddings_matrix, text_to_idx, embedding_dim: int):
    """Return the cached embedding for ``text``, or a zero vector.

    Args:
        text: The string to look up.
        embeddings_matrix: Indexable collection of embedding rows, each
            exposing ``.tolist()`` (presumably a NumPy array — confirm against
            ``load_embedding_cache``). May be ``None`` when no cache exists.
        text_to_idx: Mapping from text to row index in ``embeddings_matrix``.
            May be ``None`` or empty.
        embedding_dim: Length of the zero vector used as the fallback.

    Returns:
        The matching row as a plain list, or ``[0.0] * embedding_dim`` when
        there is no cache or the text is not in it.
    """
    # No NumPy import is needed here: only indexing and ``.tolist()`` are
    # used, so the zero-vector fallback keeps working without NumPy.
    has_cache = embeddings_matrix is not None and bool(text_to_idx)
    if has_cache and text in text_to_idx:
        return embeddings_matrix[text_to_idx[text]].tolist()

    # Fallback: zero vector (will degrade accuracy but won't crash)
    return [0.0] * embedding_dim
436
+
437
+
438
def aggregate_results(results: list[dict], scenarios: list[BenchmarkScenario]) -> dict:
    """Aggregate individual scenario results into a report.

    Args:
        results: Per-scenario dicts as produced by ``evaluate_scenario``; each
            must carry ``scenario_id``, ``correct_tool``, ``predicted_tool``,
            ``correct``, ``in_top3``, ``in_top5``, ``correct_rank``,
            ``confidence``, ``difficulty``, ``category`` and ``top5``.
        scenarios: Not read by this function; kept so existing callers keep
            working.

    Returns:
        A dict with ``summary`` (overall top-1/3/5 accuracy),
        ``by_difficulty`` (only the ``easy`` / ``medium`` / ``hard`` levels
        that occur), ``by_category``, ``by_tool``, ``failures`` (one entry per
        incorrect result) and ``all_results`` (the input list itself).
        ``by_category`` and ``by_tool`` list their keys in first-seen order so
        the report is reproducible from run to run.
    """
    total = len(results)
    correct = sum(1 for r in results if r["correct"])
    in_top3 = sum(1 for r in results if r["in_top3"])
    in_top5 = sum(1 for r in results if r["in_top5"])

    # By difficulty: fixed level order, levels with no results are left out.
    by_difficulty = {}
    for level in ("easy", "medium", "hard"):
        subset = [r for r in results if r["difficulty"] == level]
        if subset:
            by_difficulty[level] = _accuracy_stats(subset, include_top3=True)

    # By category and by expected tool: grouped in one pass each. Iterating a
    # set of strings here would make the key order vary between runs.
    by_category = {
        category: _accuracy_stats(subset)
        for category, subset in _group_by_field(results, "category").items()
    }
    by_tool = {
        tool: _accuracy_stats(subset)
        for tool, subset in _group_by_field(results, "correct_tool").items()
    }

    # Failures for debugging
    failures = [
        {
            "id": r["scenario_id"],
            "expected": r["correct_tool"],
            "got": r["predicted_tool"],
            "rank": r["correct_rank"],
            "confidence": r["confidence"],
            "top5": r["top5"],
        }
        for r in results
        if not r["correct"]
    ]

    return {
        "summary": {
            "total_scenarios": total,
            "top1_accuracy": correct / total if total > 0 else 0,
            "top3_accuracy": in_top3 / total if total > 0 else 0,
            "top5_accuracy": in_top5 / total if total > 0 else 0,
            "correct": correct,
        },
        "by_difficulty": by_difficulty,
        "by_category": by_category,
        "by_tool": by_tool,
        "failures": failures,
        "all_results": results,
    }


def _group_by_field(results, field):
    """Group result dicts by ``r[field]``, keeping first-seen key order."""
    groups = {}
    for r in results:
        groups.setdefault(r[field], []).append(r)
    return groups


def _accuracy_stats(subset, include_top3=False):
    """Return total / correct / accuracy (and optionally top3) for a non-empty subset."""
    size = len(subset)
    hits = sum(1 for r in subset if r["correct"])
    stats = {"total": size, "correct": hits, "accuracy": hits / size}
    if include_top3:
        stats["top3"] = sum(1 for r in subset if r["in_top3"]) / size
    return stats
504
+
505
+
506
+ # ============================================================================
507
+ # Output
508
+ # ============================================================================
509
+
510
def print_report(report: dict):
    """Print a human-readable benchmark report to stdout.

    Reads ``summary``, ``by_difficulty``, ``by_category``, ``by_tool`` and
    ``failures`` from ``report`` and prints, in order: the overall accuracy
    figures, a per-difficulty table (easy / medium / hard, when present), a
    per-category table sorted by name, a per-tool table sorted by descending
    accuracy with a ten-cell bar, the failure list (only when non-empty) and
    a letter grade derived from the top-1 accuracy.
    """
    summary = report["summary"]
    banner = "=" * 70

    print(banner)
    print(" BENCHMARK EVALUATION REPORT")
    print(banner)
    print(f"\n Scenarios: {summary['total_scenarios']}")
    print(f" Top-1 Accuracy: {summary['top1_accuracy']:.1%} ({summary['correct']}/{summary['total_scenarios']})")
    print(f" Top-3 Accuracy: {summary['top3_accuracy']:.1%}")
    print(f" Top-5 Accuracy: {summary['top5_accuracy']:.1%}")

    # Per-difficulty table: fixed level order, absent levels are skipped.
    print(f"\n {'Difficulty':<12} {'Acc':>8} {'Top-3':>8} {'N':>5}")
    print(" " + "-" * 35)
    difficulty_stats = report["by_difficulty"]
    for level in ("easy", "medium", "hard"):
        if level not in difficulty_stats:
            continue
        row = difficulty_stats[level]
        print(f" {level:<12} {row['accuracy']:>7.1%} {row['top3']:>7.1%} {row['total']:>5}")

    # Per-category table, alphabetical.
    print(f"\n {'Category':<18} {'Acc':>8} {'N':>5}")
    print(" " + "-" * 33)
    for name, row in sorted(report["by_category"].items()):
        print(f" {name:<18} {row['accuracy']:>7.1%} {row['total']:>5}")

    # Per-tool table, best accuracy first; ties keep their existing order.
    print(f"\n {'Tool':<25} {'Acc':>8} {'N':>5}")
    print(" " + "-" * 40)
    tools_best_first = sorted(
        report["by_tool"].items(),
        key=lambda item: item[1]["accuracy"],
        reverse=True,
    )
    for name, row in tools_best_first:
        filled = int(row["accuracy"] * 10)
        bar = "█" * filled + "░" * (10 - filled)
        print(f" {name:<25} {row['accuracy']:>7.1%} {row['total']:>5} {bar}")

    failed = report["failures"]
    if failed:
        print(f"\n Failures ({len(failed)}):")
        print(f" {'ID':<30} {'Expected':<20} {'Got':<20} {'Rank':>5}")
        print(" " + "-" * 78)
        for miss in failed:
            print(f" {miss['id']:<30} {miss['expected']:<20} {miss['got']:<20} {miss['rank']:>5}")

    # Overall grade: first band whose floor the top-1 accuracy reaches.
    top1 = summary["top1_accuracy"]
    grade_bands = (
        (0.90, "A — Ready for production"),
        (0.80, "B — Good, minor gaps"),
        (0.70, "C — Needs improvement"),
        (0.50, "D — Significant gaps"),
    )
    grade = next(
        (label for floor, label in grade_bands if top1 >= floor),
        "F — Major retraining needed",
    )

    print(f"\n Grade: {grade}")
    print(banner)
562
+
563
+
564
def generate_benchmark_jsonl(scenarios: list[BenchmarkScenario], output_path: str):
    """Write benchmark scenarios as JSONL for use with eval.py.

    One JSON object is written per scenario, one per line, with the keys
    ``current_state``, ``goal``, ``correct_tool``, ``source`` (always
    ``"benchmark"``), ``scenario_id``, ``difficulty`` and ``category``. A
    confirmation line is printed to stdout afterwards.

    Args:
        scenarios: Objects exposing ``state_text``, ``goal``, ``correct_tool``,
            ``id``, ``difficulty`` and ``category``.
        output_path: Destination file; it is overwritten if it exists. Missing
            parent directories are created.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    # Create the parent directory first so a fresh checkout, where the output
    # folder does not exist yet, does not fail with FileNotFoundError.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for s in scenarios:
            entry = {
                "current_state": s.state_text,
                "goal": s.goal,
                "correct_tool": s.correct_tool,
                "source": "benchmark",
                "scenario_id": s.id,
                "difficulty": s.difficulty,
                "category": s.category,
            }
            f.write(json.dumps(entry) + "\n")
    print(f"Wrote {len(scenarios)} scenarios to {output_path}")
579
+
580
+
581
+ # ============================================================================
582
+ # Main
583
+ # ============================================================================
584
+
585
def main():
    """Command-line entry point: generate the benchmark JSONL or evaluate a checkpoint.

    Flow:
      1. Parse arguments and build the scenario list with
         ``generate_scenarios``, optionally filtered by ``--difficulty`` and
         ``--category``.
      2. With ``--generate``, write the scenarios via
         ``generate_benchmark_jsonl`` and return.
      3. Otherwise resolve the checkpoint (``--checkpoint`` or the default
         ``.jfl/checkpoints/best_policy_head.pt``), pick a torch device
         (cuda, then mps, then cpu), run ``run_benchmark`` and print either
         the JSON report (``--json``) or the human-readable report.
      4. Save the full report as ``benchmark-results.json`` next to the
         checkpoint.

    With ``--json``, stdout carries only the JSON document; progress and
    status lines go to stderr so the output can be piped into a JSON parser.

    Exits with status 1 when no scenario matches the filters, when no
    checkpoint can be resolved, or when the checkpoint path does not exist.
    """
    parser = argparse.ArgumentParser(description="Benchmark scenarios for PolicyHead evaluation")
    parser.add_argument("--checkpoint", default=None, help="Path to PolicyHead checkpoint")
    parser.add_argument("--generate", action="store_true", help="Generate benchmark.jsonl for offline eval")
    parser.add_argument("--output", default=".jfl/v2-data/benchmark.jsonl", help="Output path for generated JSONL")
    parser.add_argument("--json", action="store_true", help="Output results as JSON")
    parser.add_argument("--difficulty", default=None, help="Filter by difficulty: easy/medium/hard")
    parser.add_argument("--category", default=None, help="Filter by category: diagnostic/scheduling/recovery/optimization")
    args = parser.parse_args()

    # Keep stdout machine-readable when a JSON report is going to be printed.
    # --generate never prints JSON, so its messages stay on stdout.
    json_report_mode = args.json and not args.generate
    status_stream = sys.stderr if json_report_mode else sys.stdout

    def status(message=""):
        print(message, file=status_stream)

    scenarios = generate_scenarios()

    # Apply filters
    if args.difficulty:
        scenarios = [s for s in scenarios if s.difficulty == args.difficulty]
    if args.category:
        scenarios = [s for s in scenarios if s.category == args.category]

    status(f"Loaded {len(scenarios)} benchmark scenarios")

    # Evaluating zero scenarios would report 0% accuracy and a failing grade,
    # which says nothing about the model; stop instead.
    if not scenarios:
        status("No benchmark scenarios to run (check --difficulty / --category filters)")
        sys.exit(1)

    if args.generate:
        generate_benchmark_jsonl(scenarios, args.output)
        return

    if args.checkpoint is None:
        # Try default
        default_ckpt = ".jfl/checkpoints/best_policy_head.pt"
        if os.path.exists(default_ckpt):
            args.checkpoint = default_ckpt
        else:
            status("No checkpoint specified. Use --checkpoint or --generate")
            status("\n Available commands:")
            status(" python benchmark.py --generate # Create benchmark.jsonl")
            status(" python benchmark.py --checkpoint path/to/model.pt # Run evaluation")
            sys.exit(1)

    if not os.path.exists(args.checkpoint):
        status(f"Checkpoint not found: {args.checkpoint}")
        sys.exit(1)

    # Detect device
    import torch
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    status(f"Device: {device}")
    status(f"Checkpoint: {args.checkpoint}")
    status()

    t0 = time.time()
    report = run_benchmark(args.checkpoint, scenarios, device)
    elapsed = time.time() - t0

    if args.json:
        # Remove all_results for cleaner JSON output
        output = {k: v for k, v in report.items() if k != "all_results"}
        print(json.dumps(output, indent=2))
    else:
        print_report(report)
    status(f"\n Benchmark completed in {elapsed:.1f}s")

    # Save results (the full report, including all_results) next to the checkpoint
    results_path = os.path.join(
        os.path.dirname(args.checkpoint),
        "benchmark-results.json"
    )
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    status(f" Results saved to {results_path}")
658
+
659
+
660
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()