crewswarm 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. package/README.md +22 -9
  2. package/apps/dashboard/dist/assets/chat-core-uXb_C0GM.js +1 -0
  3. package/apps/dashboard/dist/assets/chat-core-uXb_C0GM.js.br +0 -0
  4. package/apps/dashboard/dist/assets/cli-process-CNZ_UBCt.js +1 -0
  5. package/apps/dashboard/dist/assets/cli-process-CNZ_UBCt.js.br +0 -0
  6. package/apps/dashboard/dist/assets/index-BeVllEj_.js +2 -0
  7. package/apps/dashboard/dist/assets/index-BeVllEj_.js.br +0 -0
  8. package/apps/dashboard/dist/assets/{index-CF0aJRtC.css → index-D-sRshvg.css} +1 -1
  9. package/apps/dashboard/dist/assets/index-D-sRshvg.css.br +0 -0
  10. package/apps/dashboard/dist/assets/tab-benchmarks-tab-BHjKCPm3.js.br +0 -0
  11. package/apps/dashboard/dist/assets/tab-models-tab-dNRgsTOO.js +1 -0
  12. package/apps/dashboard/dist/assets/tab-models-tab-dNRgsTOO.js.br +0 -0
  13. package/apps/dashboard/dist/assets/{tab-pm-loop-tab-Bfd449B4.js → tab-pm-loop-tab-DiAPTJXu.js} +1 -1
  14. package/apps/dashboard/dist/assets/tab-pm-loop-tab-DiAPTJXu.js.br +0 -0
  15. package/apps/dashboard/dist/assets/{tab-projects-tab-DhNWnlzt.js → tab-projects-tab-SFH4E--a.js} +1 -1
  16. package/apps/dashboard/dist/assets/tab-projects-tab-SFH4E--a.js.br +0 -0
  17. package/apps/dashboard/dist/assets/tab-settings-tab-CuvH_Fj_.js +1 -0
  18. package/apps/dashboard/dist/assets/tab-settings-tab-CuvH_Fj_.js.br +0 -0
  19. package/apps/dashboard/dist/assets/tab-skills-tab-DR7PJ7NB.js +1 -0
  20. package/apps/dashboard/dist/assets/tab-skills-tab-DR7PJ7NB.js.br +0 -0
  21. package/apps/dashboard/dist/assets/tab-testing-tab-CezZOZcJ.js +1 -0
  22. package/apps/dashboard/dist/assets/tab-testing-tab-CezZOZcJ.js.br +0 -0
  23. package/apps/dashboard/dist/index.html +135 -15
  24. package/apps/dashboard/dist/index.html.br +0 -0
  25. package/apps/dashboard/dist/index.html.gz +0 -0
  26. package/apps/vibe/README.md +2 -2
  27. package/apps/vibe/package.json +1 -1
  28. package/apps/vibe/server.mjs +101 -56
  29. package/crew-lead.mjs +34 -4
  30. package/lib/bridges/cli-executor.mjs +1 -1
  31. package/lib/bridges/gateway-ws.mjs +4 -0
  32. package/lib/browser/passthrough-stderr.js +1 -0
  33. package/lib/chat/project-messages.mjs +3 -5
  34. package/lib/cli-process-tracker.mjs +3 -2
  35. package/lib/contacts/identity-linker.mjs +1 -0
  36. package/lib/crew-judge/judge.mjs +19 -18
  37. package/lib/crew-lead/agent-manager.mjs +1 -1
  38. package/lib/crew-lead/background.mjs +14 -1
  39. package/lib/crew-lead/chat-handler.mjs +38 -1
  40. package/lib/crew-lead/http-server.mjs +106 -57
  41. package/lib/crew-lead/llm-caller.mjs +24 -8
  42. package/lib/crew-lead/prompts.mjs +14 -1
  43. package/lib/crew-lead/tools.mjs +3 -2
  44. package/lib/crew-lead/wave-dispatcher.mjs +19 -5
  45. package/lib/crew-lead/ws-router.mjs +219 -27
  46. package/lib/engines/crew-cli.mjs +1 -1
  47. package/lib/engines/engine-registry.mjs +14 -3
  48. package/lib/engines/rt-envelope.mjs +1 -0
  49. package/lib/engines/runners.mjs +28 -4
  50. package/lib/gemini-cli-passthrough-noise.mjs +1 -1
  51. package/lib/integrations/code-search.mjs +4 -3
  52. package/lib/memory/shared-adapter.mjs +23 -10
  53. package/lib/pipeline/manager.mjs +2 -1
  54. package/lib/runtime/config.mjs +1 -1
  55. package/lib/runtime/paths.mjs +12 -8
  56. package/lib/runtime/spending.mjs +2 -1
  57. package/package.json +42 -14
  58. package/scripts/capture-build-flow.mjs +118 -0
  59. package/scripts/coverage-report.mjs +209 -0
  60. package/scripts/coverage-summary.mjs +47 -0
  61. package/scripts/dashboard-validation.mjs +76 -0
  62. package/scripts/dashboard.mjs +1667 -551
  63. package/scripts/generate-openapi.mjs +683 -277
  64. package/scripts/live-bridge-matrix.mjs +79 -0
  65. package/scripts/live-cli-matrix.mjs +166 -0
  66. package/scripts/live-crewchat-check.mjs +42 -0
  67. package/scripts/live-engine-matrix.mjs +50 -0
  68. package/scripts/live-provider-failover-matrix.mjs +107 -0
  69. package/scripts/live-provider-matrix.mjs +228 -0
  70. package/scripts/restart-all-from-repo.sh +4 -4
  71. package/scripts/restart-service.sh +12 -9
  72. package/scripts/smoke-dispatch.mjs +4 -1
  73. package/scripts/test-blast-radius.mjs +204 -0
  74. package/scripts/test-report-summary.mjs +88 -0
  75. package/scripts/test-reporter.mjs +651 -0
  76. package/scripts/test-rerun.mjs +136 -0
  77. package/scripts/tmux-bridge +130 -0
  78. package/apps/dashboard/dist/assets/chat-core-Cx4sTxDd.js +0 -1
  79. package/apps/dashboard/dist/assets/chat-core-Cx4sTxDd.js.br +0 -0
  80. package/apps/dashboard/dist/assets/cli-process-COMRNPqr.js +0 -1
  81. package/apps/dashboard/dist/assets/cli-process-COMRNPqr.js.br +0 -0
  82. package/apps/dashboard/dist/assets/index-CF0aJRtC.css.br +0 -0
  83. package/apps/dashboard/dist/assets/index-DnClJ1ee.js +0 -2
  84. package/apps/dashboard/dist/assets/index-DnClJ1ee.js.br +0 -0
  85. package/apps/dashboard/dist/assets/tab-models-tab-BLEjmd19.js +0 -1
  86. package/apps/dashboard/dist/assets/tab-models-tab-BLEjmd19.js.br +0 -0
  87. package/apps/dashboard/dist/assets/tab-pm-loop-tab-Bfd449B4.js.br +0 -0
  88. package/apps/dashboard/dist/assets/tab-projects-tab-DhNWnlzt.js.br +0 -0
  89. package/apps/dashboard/dist/assets/tab-settings-tab-Bn4nXtDe.js +0 -1
  90. package/apps/dashboard/dist/assets/tab-settings-tab-Bn4nXtDe.js.br +0 -0
  91. package/apps/dashboard/dist/assets/tab-skills-tab-BpY0uZHW.js +0 -1
  92. package/apps/dashboard/dist/assets/tab-skills-tab-BpY0uZHW.js.br +0 -0
  93. package/apps/dashboard/index.html +0 -6529
  94. package/apps/dashboard/package.json +0 -15
  95. package/apps/dashboard/src/app.js +0 -2828
  96. package/apps/dashboard/src/app.js.br +0 -0
  97. package/apps/dashboard/src/app.js.gz +0 -0
  98. package/apps/dashboard/src/chat/chat-actions.js +0 -1847
  99. package/apps/dashboard/src/chat/chat-actions.js.br +0 -0
  100. package/apps/dashboard/src/chat/unified-messages.js +0 -327
  101. package/apps/dashboard/src/chat/unified-messages.js.br +0 -0
  102. package/apps/dashboard/src/cli-process.js +0 -208
  103. package/apps/dashboard/src/cli-process.js.br +0 -0
  104. package/apps/dashboard/src/cli-process.js.gz +0 -0
  105. package/apps/dashboard/src/components/active-tasks-panel.js +0 -175
  106. package/apps/dashboard/src/components/active-tasks-panel.js.br +0 -0
  107. package/apps/dashboard/src/core/api.js +0 -18
  108. package/apps/dashboard/src/core/api.js.br +0 -0
  109. package/apps/dashboard/src/core/dom.js +0 -228
  110. package/apps/dashboard/src/core/dom.js.br +0 -0
  111. package/apps/dashboard/src/core/state.js +0 -91
  112. package/apps/dashboard/src/core/state.js.br +0 -0
  113. package/apps/dashboard/src/core/task-manager.js +0 -134
  114. package/apps/dashboard/src/core/task-manager.js.br +0 -0
  115. package/apps/dashboard/src/orchestration-status.js +0 -127
  116. package/apps/dashboard/src/orchestration-status.js.br +0 -0
  117. package/apps/dashboard/src/setup-wizard.js +0 -562
  118. package/apps/dashboard/src/setup-wizard.js.br +0 -0
  119. package/apps/dashboard/src/styles.css +0 -2085
  120. package/apps/dashboard/src/styles.css.br +0 -0
  121. package/apps/dashboard/src/styles.css.gz +0 -0
  122. package/apps/dashboard/src/tabs/agents-tab.js +0 -2237
  123. package/apps/dashboard/src/tabs/agents-tab.js.br +0 -0
  124. package/apps/dashboard/src/tabs/benchmarks-tab.js +0 -229
  125. package/apps/dashboard/src/tabs/benchmarks-tab.js.br +0 -0
  126. package/apps/dashboard/src/tabs/comms-tab.js +0 -955
  127. package/apps/dashboard/src/tabs/comms-tab.js.br +0 -0
  128. package/apps/dashboard/src/tabs/contacts-tab.js +0 -654
  129. package/apps/dashboard/src/tabs/contacts-tab.js.br +0 -0
  130. package/apps/dashboard/src/tabs/engines-tab.js +0 -175
  131. package/apps/dashboard/src/tabs/engines-tab.js.br +0 -0
  132. package/apps/dashboard/src/tabs/memory-tab.js +0 -182
  133. package/apps/dashboard/src/tabs/memory-tab.js.br +0 -0
  134. package/apps/dashboard/src/tabs/models-tab.js +0 -450
  135. package/apps/dashboard/src/tabs/models-tab.js.br +0 -0
  136. package/apps/dashboard/src/tabs/pm-loop-tab.js +0 -185
  137. package/apps/dashboard/src/tabs/pm-loop-tab.js.br +0 -0
  138. package/apps/dashboard/src/tabs/projects-tab.js +0 -663
  139. package/apps/dashboard/src/tabs/projects-tab.js.br +0 -0
  140. package/apps/dashboard/src/tabs/projects-tab.js.gz +0 -0
  141. package/apps/dashboard/src/tabs/prompts-tab.js +0 -160
  142. package/apps/dashboard/src/tabs/prompts-tab.js.br +0 -0
  143. package/apps/dashboard/src/tabs/services-tab.js +0 -202
  144. package/apps/dashboard/src/tabs/services-tab.js.br +0 -0
  145. package/apps/dashboard/src/tabs/settings-tab.js +0 -861
  146. package/apps/dashboard/src/tabs/settings-tab.js.br +0 -0
  147. package/apps/dashboard/src/tabs/skills-tab.js +0 -284
  148. package/apps/dashboard/src/tabs/skills-tab.js.br +0 -0
  149. package/apps/dashboard/src/tabs/spending-tab.js +0 -173
  150. package/apps/dashboard/src/tabs/spending-tab.js.br +0 -0
  151. package/apps/dashboard/src/tabs/swarm-chat-tab.js +0 -660
  152. package/apps/dashboard/src/tabs/swarm-chat-tab.js.br +0 -0
  153. package/apps/dashboard/src/tabs/swarm-tab.js +0 -538
  154. package/apps/dashboard/src/tabs/swarm-tab.js.br +0 -0
  155. package/apps/dashboard/src/tabs/usage-tab.js +0 -390
  156. package/apps/dashboard/src/tabs/usage-tab.js.br +0 -0
  157. package/apps/dashboard/src/tabs/waves-tab.js +0 -238
  158. package/apps/dashboard/src/tabs/waves-tab.js.br +0 -0
  159. package/apps/dashboard/src/tabs/workflows-tab.js +0 -747
  160. package/apps/dashboard/src/tabs/workflows-tab.js.br +0 -0
  161. package/apps/vibe/.crew/agent-memory/pipeline.json +0 -304
  162. package/apps/vibe/.crew/cost.json +0 -17
  163. package/apps/vibe/.crew/json-parse-metrics.jsonl +0 -27
  164. package/apps/vibe/.crew/pipeline-metrics.jsonl +0 -27
  165. package/apps/vibe/.crew/pipeline-runs/pipeline-0f90c392-2425-4ae5-850c-bd9d17b1d690.jsonl +0 -5
  166. package/apps/vibe/.crew/pipeline-runs/pipeline-1c269dd9-a63f-4fba-af81-5cf08048ef06.jsonl +0 -5
  167. package/apps/vibe/.crew/pipeline-runs/pipeline-288a7765-da24-4a22-89bc-1f3cc9b0562c.jsonl +0 -5
  168. package/apps/vibe/.crew/pipeline-runs/pipeline-2c78fd22-a657-4bd1-bc49-0679fb384409.jsonl +0 -5
  169. package/apps/vibe/.crew/pipeline-runs/pipeline-3da23550-22ed-4904-9a0a-8e79c1f3024c.jsonl +0 -5
  170. package/apps/vibe/.crew/pipeline-runs/pipeline-3e6fe08d-3264-404a-8df3-aab7efef10e7.jsonl +0 -5
  171. package/apps/vibe/.crew/pipeline-runs/pipeline-42eec610-57fe-4e09-9e7e-b315038495c2.jsonl +0 -5
  172. package/apps/vibe/.crew/pipeline-runs/pipeline-4438eb4c-ae13-42b1-90e2-b043d8983be8.jsonl +0 -5
  173. package/apps/vibe/.crew/pipeline-runs/pipeline-4740a9f5-86e7-44b6-a394-de433e291727.jsonl +0 -5
  174. package/apps/vibe/.crew/pipeline-runs/pipeline-49e1da6a-957e-48fd-9220-415019e4f8e2.jsonl +0 -5
  175. package/apps/vibe/.crew/pipeline-runs/pipeline-4c9251db-be68-427b-a3fc-a264f2b5778d.jsonl +0 -5
  176. package/apps/vibe/.crew/pipeline-runs/pipeline-6413fa33-a802-4b57-a8c0-a9056ad67842.jsonl +0 -5
  177. package/apps/vibe/.crew/pipeline-runs/pipeline-65e29a57-664d-4196-8109-017e364f182e.jsonl +0 -5
  178. package/apps/vibe/.crew/pipeline-runs/pipeline-6aa04bc5-9593-4b1f-b58d-3bf2978cb602.jsonl +0 -5
  179. package/apps/vibe/.crew/pipeline-runs/pipeline-6e1cba53-9b70-457e-99e0-59199149dd21.jsonl +0 -5
  180. package/apps/vibe/.crew/pipeline-runs/pipeline-749f41cc-4dac-4204-be64-873a6080a0d2.jsonl +0 -5
  181. package/apps/vibe/.crew/pipeline-runs/pipeline-74d68121-e181-4864-bd9a-c3211341dfaf.jsonl +0 -5
  182. package/apps/vibe/.crew/pipeline-runs/pipeline-8509bc24-142d-4e07-b44a-a50bf99d1103.jsonl +0 -5
  183. package/apps/vibe/.crew/pipeline-runs/pipeline-960339c6-07ca-43ce-9900-f6e1702b39b9.jsonl +0 -5
  184. package/apps/vibe/.crew/pipeline-runs/pipeline-9bef2dd2-6122-42e5-b3d9-19f4d80f9e40.jsonl +0 -5
  185. package/apps/vibe/.crew/pipeline-runs/pipeline-9c6480a9-7031-4146-b241-825b9a2d1de1.jsonl +0 -5
  186. package/apps/vibe/.crew/pipeline-runs/pipeline-9fd42426-8492-4157-9d5f-e1537c060489.jsonl +0 -2
  187. package/apps/vibe/.crew/pipeline-runs/pipeline-ad6d40a3-2f5e-46a9-a345-47caaccc51aa.jsonl +0 -5
  188. package/apps/vibe/.crew/pipeline-runs/pipeline-bc606133-8d5b-4535-8d85-f1a29cdaa981.jsonl +0 -5
  189. package/apps/vibe/.crew/pipeline-runs/pipeline-c1418f4e-b773-4ca1-84a3-216acf36e2f2.jsonl +0 -5
  190. package/apps/vibe/.crew/pipeline-runs/pipeline-c1a13ccd-634a-4d01-a4a7-1177b8a752ff.jsonl +0 -5
  191. package/apps/vibe/.crew/pipeline-runs/pipeline-c7d27b42-249e-4bd4-8f26-6aa998110b8a.jsonl +0 -5
  192. package/apps/vibe/.crew/pipeline-runs/pipeline-cca2e9b9-4a34-4d25-a311-5c793fa7e91e.jsonl +0 -5
  193. package/apps/vibe/.crew/sandbox.json +0 -7
  194. package/apps/vibe/.crew/session.json +0 -330
  195. package/apps/vibe/.crew/training-data.jsonl +0 -0
  196. package/apps/vibe/.github/workflows/studio-quality.yml +0 -37
  197. package/apps/vibe/.studio-data/project-messages/chuck-norris.jsonl +0 -18
  198. package/apps/vibe/.studio-data/project-messages/general.jsonl +0 -81
  199. package/apps/vibe/.studio-data/project-messages/studio-local.jsonl +0 -18
  200. package/apps/vibe/ARCHITECTURE.md +0 -3393
  201. package/apps/vibe/QUICK-REFERENCE.md +0 -211
  202. package/apps/vibe/ROADMAP.md +0 -41
  203. package/apps/vibe/STUDIO-SETUP-COMPLETE.md +0 -35
  204. package/apps/vibe/VISUAL-GUIDE.md +0 -378
  205. package/apps/vibe/capture-demo.mjs +0 -160
  206. package/apps/vibe/capture-full-demo.mjs +0 -255
  207. package/apps/vibe/capture-quickstart.mjs +0 -256
  208. package/apps/vibe/capture-vibe-assets.mjs +0 -71
  209. package/apps/vibe/capture-vibe-video.mjs +0 -260
  210. package/apps/vibe/check-buttons.js +0 -41
  211. package/apps/vibe/diagnose.html +0 -106
  212. package/apps/vibe/fix-buttons.js +0 -103
  213. package/apps/vibe/index.html +0 -3404
  214. package/apps/vibe/package-lock.json +0 -920
  215. package/apps/vibe/scripts/studio-pty-host.py +0 -117
  216. package/apps/vibe/src/main.js +0 -2940
  217. package/apps/vibe/src/register-all-languages.js +0 -98
  218. package/apps/vibe/start-studio.sh +0 -11
  219. package/apps/vibe/test/accessibility-tests.js +0 -77
  220. package/apps/vibe/test/browser-performance-audit.mjs +0 -205
  221. package/apps/vibe/test/performance-tests.js +0 -120
  222. package/apps/vibe/test/security-tests.js +0 -213
  223. package/apps/vibe/tests/e2e.local.mjs +0 -54
  224. package/apps/vibe/tests/server.smoke.mjs +0 -106
  225. package/apps/vibe/update_website.mjs +0 -74
  226. package/apps/vibe/vite.config.js +0 -19
  227. package/lib/crew-lead/chat-handler.mjs.bak +0 -1274
  228. package/lib/engines/rt-envelope.mjs.backup-current +0 -870
@@ -1,229 +0,0 @@
1
- import { escHtml } from '../core/dom.js';
2
-
3
- // ── Benchmark task runner state ──────────────────────────────────────────────
4
- let _runnerTasks = []; // task rows cached by loadBenchmarkTasks(); the task <select> option values are indexes into this array (upstream data source presumably HuggingFace — not verifiable from this file)
5
- let _runnerAbort = null; // AbortController for the in-flight /api/benchmark-run stream; null when no run is active
6
-
7
/**
 * Activate the Benchmarks view and kick off loading of its data.
 *
 * Runs the optional view/nav callbacks, marks `#benchmarksView` active, then
 * loads the benchmark picker; once that settles, the leaderboard for the
 * selected benchmark (if any) is requested.
 *
 * @param {object} [hooks]
 * @param {Function} [hooks.hideAllViews] - Invoked with no arguments before the view is shown.
 * @param {Function} [hooks.setNavActive] - Invoked with `'navBenchmarks'`.
 * @returns {void} Data loading continues in the background.
 */
export function showBenchmarks({ hideAllViews, setNavActive } = {}) {
  const callIfFunction = (fn, ...args) => {
    if (typeof fn === 'function') fn(...args);
  };

  callIfFunction(hideAllViews);
  document.getElementById('benchmarksView')?.classList.add('active');
  callIfFunction(setNavActive, 'navBenchmarks');

  const refreshLeaderboard = () => {
    const picker = document.getElementById('benchmarkSelect');
    const selected = picker?.value;
    if (selected) loadBenchmarkLeaderboard(selected);
  };
  loadBenchmarkOptions().then(refreshLeaderboard);
}
17
-
18
/**
 * Populate the `#benchmarkSelect` dropdown from `/api/zeroeval/benchmarks`.
 *
 * Each entry may be a plain id or an object carrying `benchmark_id` / `id`
 * and an optional `name`. The previously selected benchmark is restored when
 * it is still offered; otherwise `swe-bench-verified` is selected if present.
 *
 * @returns {Promise<string|undefined>} The select's value after loading, or
 *   `undefined` when the select element is missing or loading failed (the
 *   dropdown then shows a "Failed to load" placeholder).
 */
export async function loadBenchmarkOptions() {
  const DEFAULT_BENCHMARK = 'swe-bench-verified';
  const sel = document.getElementById('benchmarkSelect');
  if (!sel) return;

  const isRecord = (b) => b !== null && typeof b === 'object';
  // One id rule for both the <option> values and the selection checks, so an
  // entry identified only by `id` can still be restored or chosen as default.
  const idOf = (b) => (isRecord(b) ? (b.benchmark_id || b.id) : b);

  const cur = sel.value;
  sel.innerHTML = '<option value="">— Loading… —</option>';
  try {
    const r = await fetch('/api/zeroeval/benchmarks');
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const arr = await r.json();
    if (!Array.isArray(arr)) throw new Error('Expected array');

    sel.innerHTML = '<option value="">— Pick benchmark —</option>';
    const offered = new Set();
    for (const b of arr) {
      const id = idOf(b);
      // Skip null entries and entries without an id instead of failing the whole list.
      if (id == null || id === '') continue;
      const opt = document.createElement('option');
      opt.value = id;
      opt.textContent = (isRecord(b) && b.name) || id;
      sel.appendChild(opt);
      offered.add(String(id));
    }

    if (cur && offered.has(cur)) {
      sel.value = cur;
    } else if (offered.has(DEFAULT_BENCHMARK)) {
      sel.value = DEFAULT_BENCHMARK;
    }
    return sel.value;
  } catch (e) {
    console.warn('loadBenchmarkOptions failed:', e);
    sel.innerHTML = '<option value="">— Failed to load —</option>';
  }
}
49
-
50
/**
 * Reload the benchmark picker, then request the leaderboard for whichever
 * benchmark is selected afterwards (nothing is requested when none is).
 *
 * @returns {Promise<void>} Resolves once the picker has loaded; the
 *   leaderboard request is started but not awaited.
 */
export async function loadBenchmarks() {
  await loadBenchmarkOptions();
  const picker = document.getElementById('benchmarkSelect');
  const selected = picker?.value;
  if (selected) loadBenchmarkLeaderboard(selected);
}
55
-
56
/**
 * Fetch one benchmark's leaderboard and render it into `#benchmarkTable`,
 * with a summary line in `#benchmarkMeta`.
 *
 * At most the first 100 models are rendered. Every value taken from the API
 * response is HTML-escaped before it is placed in `innerHTML`. Failures are
 * rendered as an inline error message instead of being thrown.
 *
 * @param {string} benchmarkId - Benchmark to load; a falsy value clears the
 *   table and hides the summary.
 * @returns {Promise<void>}
 */
export async function loadBenchmarkLeaderboard(benchmarkId) {
  const tableEl = document.getElementById('benchmarkTable');
  const metaEl = document.getElementById('benchmarkMeta');
  if (!tableEl || !metaEl) return;
  if (!benchmarkId) {
    tableEl.innerHTML = '';
    metaEl.style.display = 'none';
    return;
  }

  // Coerce to string first so numbers/null from the API cannot break escaping.
  const safe = (value) => escHtml(String(value ?? ''));
  // Score precedence: normalized score, then benchmark score, then raw score.
  const scoreOf = (m) => m.normalized_score ?? m.benchmark_score ?? m.score ?? 0;
  // Cost fields are multiplied by 100 and labelled ¢ (presumably dollars per 1M tokens — confirm against the API).
  const centsLabel = (cost) => (cost != null ? Math.round(cost * 100) + '¢' : '—');
  const cell = (content, extraStyle = '', title = '') =>
    `<td style="padding:6px 10px;${extraStyle}"${title ? ` title="${title}"` : ''}>${content}</td>`;
  const headCell = (label, title = '') =>
    `<th style="text-align:left;padding:6px 10px;"${title ? ` title="${title}"` : ''}>${label}</th>`;

  tableEl.innerHTML = '<div class="meta" style="padding:20px;">Loading…</div>';
  metaEl.style.display = 'none';
  try {
    const r = await fetch('/api/zeroeval/benchmarks/' + encodeURIComponent(benchmarkId));
    const data = await r.json();
    if (!r.ok) throw new Error(data.error || data.detail || 'Failed to load');

    const listed = data.entries || data.models || [];
    // Ignore malformed payloads/entries rather than failing the whole table.
    const models = Array.isArray(listed)
      ? listed.filter((m) => m !== null && typeof m === 'object')
      : [];
    const totalModels = data.total_models ?? data.statistics?.total_models ?? models.length;
    const avgScore = data.statistics?.average_score
      ?? (models.length ? models.reduce((sum, m) => sum + scoreOf(m), 0) / models.length : 0);
    const displayName = data.benchmark_name || data.name || benchmarkId;
    const displayDesc = data.benchmark_description || data.description || '';

    metaEl.innerHTML = '<b>' + safe(displayName) + '</b>'
      + (displayDesc ? ': ' + safe(String(displayDesc).slice(0, 200)) : '')
      + ' | ' + safe(totalModels) + ' models, avg ' + (avgScore * 100).toFixed(1) + '%';
    metaEl.style.display = 'block';

    if (!models.length) {
      tableEl.innerHTML = '<div class="meta" style="padding:20px;">No model scores for this benchmark.</div>';
      return;
    }

    const rows = models.slice(0, 100).map((m) => {
      const score = scoreOf(m);
      const pct = (score * 100).toFixed(1);
      const inC = m.input_cost_per_million ?? 0;
      const outC = m.output_cost_per_million ?? 0;
      const centsPerPt = (inC + outC) > 0 && score > 0
        ? ((inC + outC) * 100 / (score * 100)).toFixed(1) + '¢/pt'
        : '—';
      return '<tr>'
        + cell(safe(m.rank || '-'))
        + cell(safe(m.model_name || m.model_id))
        + cell(safe(m.organization_name || ''))
        + cell(pct + '%', 'font-weight:600;')
        + cell(centsLabel(m.input_cost_per_million), 'font-size:11px;', '¢ per 1M input tokens')
        + cell(centsLabel(m.output_cost_per_million), 'font-size:11px;', '¢ per 1M output tokens')
        + cell(centsPerPt, 'font-size:11px;', '¢ per score point (1M in+out / score%)')
        + cell(safe(String(m.analysis_method || '-').slice(0, 40)), 'font-size:11px;')
        + '</tr>';
    }).join('');

    tableEl.innerHTML = '<table style="width:100%;border-collapse:collapse;font-size:12px;">'
      + '<thead><tr style="border-bottom:1px solid var(--border);">'
      + headCell('Rank')
      + headCell('Model')
      + headCell('Org')
      + headCell('Score')
      + headCell('in ¢', '¢ per 1M input')
      + headCell('out ¢', '¢ per 1M output')
      + headCell('¢/pt', '¢ per score point')
      + headCell('Method')
      + '</tr></thead><tbody>' + rows + '</tbody></table>';
  } catch (e) {
    tableEl.innerHTML = '<div style="color:var(--red);padding:20px;">Error: ' + escHtml(e.message) + '</div>';
  }
}
97
-
98
// ── Custom runner — load SWE-Bench tasks into the task picker ────────────────
/**
 * Fetch the first 50 `swe-bench-verified` tasks from `/api/benchmark-tasks`,
 * cache them in `_runnerTasks`, and fill `#benchmarkTaskSelect` with one
 * option per task (option value = index into the cache).
 *
 * On failure the select shows a "Failed: …" placeholder with the error text.
 *
 * @returns {Promise<void>}
 */
export async function loadBenchmarkTasks() {
  const picker = document.getElementById('benchmarkTaskSelect');
  if (!picker) return;

  picker.innerHTML = '<option value="">— Loading tasks… —</option>';
  try {
    const response = await fetch('/api/benchmark-tasks?benchmark=swe-bench-verified&offset=0&length=50');
    const payload = await response.json();
    if (!response.ok) throw new Error(payload.error || 'Failed to load tasks');

    // Rows may arrive wrapped as `{ row: {...} }` or as the bare task object.
    _runnerTasks = (payload.rows || []).map((entry) => entry.row || entry);

    picker.innerHTML = '<option value="">— Pick a task —</option>';
    for (const [index, task] of _runnerTasks.entries()) {
      const label = task.instance_id || task.id || `task-${index}`;
      const option = document.createElement('option');
      option.value = index;
      option.textContent = task.repo ? `${label} (${task.repo})` : `${label}`;
      picker.appendChild(option);
    }
  } catch (err) {
    picker.innerHTML = '<option value="">— Failed: ' + escHtml(err.message) + ' —</option>';
  }
}
122
-
123
/**
 * Show a preview of the selected task's problem statement in
 * `#benchmarkTaskPreview`, truncated to 800 characters; the preview is
 * hidden when nothing (or an unknown index) is selected.
 *
 * @param {string|number|null|undefined} idx - Index into the cached task list.
 * @returns {void}
 */
export function onBenchmarkTaskSelect(idx) {
  const previewEl = document.getElementById('benchmarkTaskPreview');
  if (!previewEl) return;

  const hasSelection = idx !== '' && idx != null;
  const selectedTask = hasSelection ? _runnerTasks[idx] : undefined;
  if (!selectedTask) {
    previewEl.style.display = 'none';
    return;
  }

  const PREVIEW_LIMIT = 800;
  const statement = selectedTask.problem_statement || selectedTask.description || '(no problem statement)';
  const ellipsis = statement.length > PREVIEW_LIMIT ? '\n…' : '';
  previewEl.textContent = statement.slice(0, PREVIEW_LIMIT) + ellipsis;
  previewEl.style.display = 'block';
}
133
-
134
// ── Stream a benchmark task through an engine ────────────────────────────────
/**
 * POST the selected task to `/api/benchmark-run` and stream the response into
 * the run output panel.
 *
 * The response is read as blank-line-separated frames, each optionally
 * prefixed with `data:` and carrying a JSON event:
 *   - `{type: 'chunk', text}` appends text to `#benchmarkRunStream`
 *   - `{type: 'done', exitCode}` sets the final status
 *   - `{type: 'error'}` or any event with `error` appends an error line
 *
 * Starting a run aborts any run already in flight. A run that was superseded
 * this way leaves the shared UI and `_runnerAbort` untouched, so it cannot
 * clobber the state of the run that replaced it.
 *
 * @returns {Promise<void>} Resolves when the stream ends, fails, or is aborted.
 */
export async function runBenchmarkTask() {
  const sel = document.getElementById('benchmarkTaskSelect');
  const engineSel = document.getElementById('benchmarkRunEngine');
  const modelInput = document.getElementById('benchmarkRunModel');
  const outputEl = document.getElementById('benchmarkRunOutput');
  const streamEl = document.getElementById('benchmarkRunStream');
  const statusEl = document.getElementById('benchmarkRunStatus');
  const stopBtn = document.getElementById('benchmarkRunStop');
  const runBtn = document.getElementById('benchmarkRunBtn');
  if (!sel || !engineSel || !outputEl || !streamEl) return;

  const idx = sel.value;
  if (idx === '' || idx == null || !_runnerTasks[idx]) {
    alert('Pick a task first — click "↻ Load Tasks" if the list is empty.');
    return;
  }
  const task = _runnerTasks[idx];
  const engine = engineSel.value;
  const model = (modelInput?.value || '').trim() || undefined;

  // The status element is optional, like the buttons; never dereference it blindly.
  const setStatus = (text, color) => {
    if (!statusEl) return;
    statusEl.textContent = text;
    statusEl.style.color = color;
  };

  // Cancel any existing run
  if (_runnerAbort) { try { _runnerAbort.abort(); } catch {} }
  const controller = new AbortController();
  _runnerAbort = controller;
  // True once a newer run has installed its own controller. After a plain
  // stop `_runnerAbort` is null, which still counts as "ours".
  const isSuperseded = () => _runnerAbort !== null && _runnerAbort !== controller;

  outputEl.style.display = 'flex';
  streamEl.textContent = '';
  setStatus(`Running on ${engine}…`, ''); // also clears the colour left by a previous run
  if (stopBtn) stopBtn.style.display = 'inline-block';
  if (runBtn) runBtn.disabled = true;

  // Apply one frame of the stream to the UI.
  const handleFrame = (frame) => {
    const line = frame.replace(/^data:\s*/, '').trim();
    if (!line) return;
    let ev;
    try {
      ev = JSON.parse(line);
    } catch {
      return; // non-JSON frames (comments / keep-alives) are deliberately ignored
    }
    if (ev === null || typeof ev !== 'object') return;
    if (ev.type === 'chunk' && ev.text) {
      streamEl.textContent += ev.text;
      streamEl.scrollTop = streamEl.scrollHeight;
    } else if (ev.type === 'done') {
      const ok = ev.exitCode === 0 || ev.exitCode == null;
      setStatus(ok ? '✓ Done' : `✗ Exit ${ev.exitCode}`, ok ? 'var(--green)' : 'var(--red)');
    } else if (ev.type === 'error' || ev.error) {
      streamEl.textContent += '\n[error] ' + (ev.error || ev.message || JSON.stringify(ev));
    }
  };

  try {
    const resp = await fetch('/api/benchmark-run', {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({
        instanceId: task.instance_id || task.id,
        problemStatement: task.problem_statement || task.description || '',
        repo: task.repo || '',
        hints: task.hints_text || '',
        engine,
        ...(model ? { model } : {}),
      }),
      signal: controller.signal,
    });

    if (!resp.ok) {
      const detail = await resp.text().catch(() => '');
      throw new Error(`HTTP ${resp.status}${detail ? ': ' + detail.slice(0, 200) : ''}`);
    }
    if (!resp.body) throw new Error('Response has no body to stream');

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buf += decoder.decode(value, { stream: true });
      const parts = buf.split(/\r?\n\r?\n/);
      buf = parts.pop(); // keep the incomplete trailing frame for the next read
      for (const part of parts) handleFrame(part);
    }
    // Flush the decoder and process a final frame that was not blank-line terminated.
    buf += decoder.decode();
    handleFrame(buf);
  } catch (e) {
    if (!isSuperseded()) {
      if (e?.name !== 'AbortError') {
        streamEl.textContent += '\n[stream error] ' + (e?.message ?? String(e));
        setStatus('✗ Error', 'var(--red)');
      } else {
        setStatus('⏹ Stopped', 'var(--text-2)');
      }
    }
  } finally {
    if (!isSuperseded()) {
      if (stopBtn) stopBtn.style.display = 'none';
      if (runBtn) runBtn.disabled = false;
      _runnerAbort = null;
    }
  }
}
223
-
224
/**
 * Abort the benchmark run currently in flight, if any, and clear the stored
 * controller. Errors raised by `abort()` are ignored.
 *
 * @returns {void}
 */
export function stopBenchmarkRun() {
  const active = _runnerAbort;
  if (!active) return;
  try {
    active.abort();
  } catch {
    // best-effort: a failed abort must not prevent clearing the controller
  }
  _runnerAbort = null;
}