crewswarm 0.9.1 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210)
  1. package/README.md +22 -9
  2. package/apps/dashboard/dist/assets/{chat-core-Cx4sTxDd.js → chat-core-3KirthZA.js} +1 -1
  3. package/apps/dashboard/dist/assets/index-GSWxxEPO.js +2 -0
  4. package/apps/dashboard/dist/assets/{tab-pm-loop-tab-Bfd449B4.js → tab-pm-loop-tab-DiAPTJXu.js} +1 -1
  5. package/apps/dashboard/dist/assets/{tab-projects-tab-DhNWnlzt.js → tab-projects-tab-SFH4E--a.js} +1 -1
  6. package/apps/dashboard/dist/assets/tab-settings-tab-BselH1c0.js +1 -0
  7. package/apps/dashboard/dist/index.html +82 -11
  8. package/apps/vibe/README.md +2 -2
  9. package/apps/vibe/package.json +1 -1
  10. package/apps/vibe/server.mjs +3 -3
  11. package/crew-lead.mjs +48 -5
  12. package/lib/bridges/gateway-ws.mjs +4 -0
  13. package/lib/bridges/tmux-bridge.mjs +200 -0
  14. package/lib/cli-process-tracker.mjs +2 -1
  15. package/lib/crew-lead/chat-handler.mjs +34 -0
  16. package/lib/crew-lead/http-server.mjs +340 -14
  17. package/lib/crew-lead/llm-caller.mjs +24 -8
  18. package/lib/crew-lead/prompts.mjs +7 -0
  19. package/lib/crew-lead/wave-dispatcher.mjs +53 -3
  20. package/lib/crew-lead/ws-router.mjs +219 -27
  21. package/lib/engines/engine-registry.mjs +9 -0
  22. package/lib/engines/rt-envelope.mjs +1 -0
  23. package/lib/engines/runners.mjs +26 -2
  24. package/lib/runtime/config.mjs +7 -0
  25. package/lib/runtime/paths.mjs +12 -8
  26. package/lib/sessions/session-manager.mjs +287 -0
  27. package/package.json +35 -15
  28. package/scripts/capture-build-flow.mjs +118 -0
  29. package/scripts/coverage-report.mjs +209 -0
  30. package/scripts/coverage-summary.mjs +47 -0
  31. package/scripts/dashboard-validation.mjs +74 -0
  32. package/scripts/dashboard.mjs +560 -70
  33. package/scripts/live-bridge-matrix.mjs +79 -0
  34. package/scripts/live-cli-matrix.mjs +166 -0
  35. package/scripts/live-crewchat-check.mjs +42 -0
  36. package/scripts/live-engine-matrix.mjs +50 -0
  37. package/scripts/live-provider-failover-matrix.mjs +107 -0
  38. package/scripts/live-provider-matrix.mjs +228 -0
  39. package/scripts/restart-all-from-repo.sh +4 -4
  40. package/scripts/smoke-dispatch.mjs +4 -1
  41. package/scripts/test-blast-radius.mjs +204 -0
  42. package/scripts/test-report-summary.mjs +88 -0
  43. package/scripts/test-reporter.mjs +651 -0
  44. package/scripts/test-rerun.mjs +136 -0
  45. package/scripts/tmux-bridge +130 -0
  46. package/apps/dashboard/dist/assets/chat-core-Cx4sTxDd.js.br +0 -0
  47. package/apps/dashboard/dist/assets/cli-process-COMRNPqr.js.br +0 -0
  48. package/apps/dashboard/dist/assets/components-BS9fQjE_.js.br +0 -0
  49. package/apps/dashboard/dist/assets/core-utils-CmOkXgzi.js.br +0 -0
  50. package/apps/dashboard/dist/assets/index-CF0aJRtC.css.br +0 -0
  51. package/apps/dashboard/dist/assets/index-DnClJ1ee.js +0 -2
  52. package/apps/dashboard/dist/assets/index-DnClJ1ee.js.br +0 -0
  53. package/apps/dashboard/dist/assets/orchestration-Ca2DLWN-.js.br +0 -0
  54. package/apps/dashboard/dist/assets/setup-wizard-CA0Or47w.js.br +0 -0
  55. package/apps/dashboard/dist/assets/tab-agents-tab-BgpIsjkw.js.br +0 -0
  56. package/apps/dashboard/dist/assets/tab-comms-tab-kguqTIzD.js.br +0 -0
  57. package/apps/dashboard/dist/assets/tab-contacts-tab-DiOyMYth.js.br +0 -0
  58. package/apps/dashboard/dist/assets/tab-engines-tab-BsdZVvU0.js.br +0 -0
  59. package/apps/dashboard/dist/assets/tab-memory-tab-Cu6u13EQ.js.br +0 -0
  60. package/apps/dashboard/dist/assets/tab-models-tab-BLEjmd19.js.br +0 -0
  61. package/apps/dashboard/dist/assets/tab-pm-loop-tab-Bfd449B4.js.br +0 -0
  62. package/apps/dashboard/dist/assets/tab-projects-tab-DhNWnlzt.js.br +0 -0
  63. package/apps/dashboard/dist/assets/tab-prompts-tab-DVkUNaJd.js.br +0 -0
  64. package/apps/dashboard/dist/assets/tab-services-tab-DU_LH3uG.js.br +0 -0
  65. package/apps/dashboard/dist/assets/tab-settings-tab-Bn4nXtDe.js +0 -1
  66. package/apps/dashboard/dist/assets/tab-settings-tab-Bn4nXtDe.js.br +0 -0
  67. package/apps/dashboard/dist/assets/tab-skills-tab-BpY0uZHW.js.br +0 -0
  68. package/apps/dashboard/dist/assets/tab-spending-tab-DEccQHnt.js.br +0 -0
  69. package/apps/dashboard/dist/assets/tab-swarm-chat-tab-BNrd88-r.js.br +0 -0
  70. package/apps/dashboard/dist/assets/tab-swarm-tab-B1AcjL1W.js.br +0 -0
  71. package/apps/dashboard/dist/assets/tab-usage-tab-BIOOnB-Y.js.br +0 -0
  72. package/apps/dashboard/dist/assets/tab-waves-tab-SaJDkb4x.js.br +0 -0
  73. package/apps/dashboard/dist/assets/tab-workflows-tab-B-soSy1k.js.br +0 -0
  74. package/apps/dashboard/dist/index.html.br +0 -0
  75. package/apps/dashboard/index.html +0 -6459
  76. package/apps/dashboard/package.json +0 -15
  77. package/apps/dashboard/src/app.js +0 -2823
  78. package/apps/dashboard/src/app.js.br +0 -0
  79. package/apps/dashboard/src/app.js.gz +0 -0
  80. package/apps/dashboard/src/chat/chat-actions.js +0 -1847
  81. package/apps/dashboard/src/chat/chat-actions.js.br +0 -0
  82. package/apps/dashboard/src/chat/unified-messages.js +0 -327
  83. package/apps/dashboard/src/chat/unified-messages.js.br +0 -0
  84. package/apps/dashboard/src/cli-process.js +0 -208
  85. package/apps/dashboard/src/cli-process.js.br +0 -0
  86. package/apps/dashboard/src/cli-process.js.gz +0 -0
  87. package/apps/dashboard/src/components/active-tasks-panel.js +0 -175
  88. package/apps/dashboard/src/components/active-tasks-panel.js.br +0 -0
  89. package/apps/dashboard/src/core/api.js +0 -18
  90. package/apps/dashboard/src/core/api.js.br +0 -0
  91. package/apps/dashboard/src/core/dom.js +0 -228
  92. package/apps/dashboard/src/core/dom.js.br +0 -0
  93. package/apps/dashboard/src/core/state.js +0 -91
  94. package/apps/dashboard/src/core/state.js.br +0 -0
  95. package/apps/dashboard/src/core/task-manager.js +0 -134
  96. package/apps/dashboard/src/core/task-manager.js.br +0 -0
  97. package/apps/dashboard/src/orchestration-status.js +0 -127
  98. package/apps/dashboard/src/orchestration-status.js.br +0 -0
  99. package/apps/dashboard/src/setup-wizard.js +0 -562
  100. package/apps/dashboard/src/setup-wizard.js.br +0 -0
  101. package/apps/dashboard/src/styles.css +0 -2085
  102. package/apps/dashboard/src/styles.css.br +0 -0
  103. package/apps/dashboard/src/styles.css.gz +0 -0
  104. package/apps/dashboard/src/tabs/agents-tab.js +0 -2237
  105. package/apps/dashboard/src/tabs/agents-tab.js.br +0 -0
  106. package/apps/dashboard/src/tabs/benchmarks-tab.js +0 -229
  107. package/apps/dashboard/src/tabs/benchmarks-tab.js.br +0 -0
  108. package/apps/dashboard/src/tabs/comms-tab.js +0 -955
  109. package/apps/dashboard/src/tabs/comms-tab.js.br +0 -0
  110. package/apps/dashboard/src/tabs/contacts-tab.js +0 -654
  111. package/apps/dashboard/src/tabs/contacts-tab.js.br +0 -0
  112. package/apps/dashboard/src/tabs/engines-tab.js +0 -175
  113. package/apps/dashboard/src/tabs/engines-tab.js.br +0 -0
  114. package/apps/dashboard/src/tabs/memory-tab.js +0 -182
  115. package/apps/dashboard/src/tabs/memory-tab.js.br +0 -0
  116. package/apps/dashboard/src/tabs/models-tab.js +0 -450
  117. package/apps/dashboard/src/tabs/models-tab.js.br +0 -0
  118. package/apps/dashboard/src/tabs/pm-loop-tab.js +0 -185
  119. package/apps/dashboard/src/tabs/pm-loop-tab.js.br +0 -0
  120. package/apps/dashboard/src/tabs/projects-tab.js +0 -663
  121. package/apps/dashboard/src/tabs/projects-tab.js.br +0 -0
  122. package/apps/dashboard/src/tabs/projects-tab.js.gz +0 -0
  123. package/apps/dashboard/src/tabs/prompts-tab.js +0 -160
  124. package/apps/dashboard/src/tabs/prompts-tab.js.br +0 -0
  125. package/apps/dashboard/src/tabs/services-tab.js +0 -202
  126. package/apps/dashboard/src/tabs/services-tab.js.br +0 -0
  127. package/apps/dashboard/src/tabs/settings-tab.js +0 -803
  128. package/apps/dashboard/src/tabs/settings-tab.js.br +0 -0
  129. package/apps/dashboard/src/tabs/skills-tab.js +0 -284
  130. package/apps/dashboard/src/tabs/skills-tab.js.br +0 -0
  131. package/apps/dashboard/src/tabs/spending-tab.js +0 -173
  132. package/apps/dashboard/src/tabs/spending-tab.js.br +0 -0
  133. package/apps/dashboard/src/tabs/swarm-chat-tab.js +0 -660
  134. package/apps/dashboard/src/tabs/swarm-chat-tab.js.br +0 -0
  135. package/apps/dashboard/src/tabs/swarm-tab.js +0 -538
  136. package/apps/dashboard/src/tabs/swarm-tab.js.br +0 -0
  137. package/apps/dashboard/src/tabs/usage-tab.js +0 -390
  138. package/apps/dashboard/src/tabs/usage-tab.js.br +0 -0
  139. package/apps/dashboard/src/tabs/waves-tab.js +0 -238
  140. package/apps/dashboard/src/tabs/waves-tab.js.br +0 -0
  141. package/apps/dashboard/src/tabs/workflows-tab.js +0 -747
  142. package/apps/dashboard/src/tabs/workflows-tab.js.br +0 -0
  143. package/apps/vibe/.crew/agent-memory/pipeline.json +0 -304
  144. package/apps/vibe/.crew/cost.json +0 -17
  145. package/apps/vibe/.crew/json-parse-metrics.jsonl +0 -27
  146. package/apps/vibe/.crew/pipeline-metrics.jsonl +0 -27
  147. package/apps/vibe/.crew/pipeline-runs/pipeline-0f90c392-2425-4ae5-850c-bd9d17b1d690.jsonl +0 -5
  148. package/apps/vibe/.crew/pipeline-runs/pipeline-1c269dd9-a63f-4fba-af81-5cf08048ef06.jsonl +0 -5
  149. package/apps/vibe/.crew/pipeline-runs/pipeline-288a7765-da24-4a22-89bc-1f3cc9b0562c.jsonl +0 -5
  150. package/apps/vibe/.crew/pipeline-runs/pipeline-2c78fd22-a657-4bd1-bc49-0679fb384409.jsonl +0 -5
  151. package/apps/vibe/.crew/pipeline-runs/pipeline-3da23550-22ed-4904-9a0a-8e79c1f3024c.jsonl +0 -5
  152. package/apps/vibe/.crew/pipeline-runs/pipeline-3e6fe08d-3264-404a-8df3-aab7efef10e7.jsonl +0 -5
  153. package/apps/vibe/.crew/pipeline-runs/pipeline-42eec610-57fe-4e09-9e7e-b315038495c2.jsonl +0 -5
  154. package/apps/vibe/.crew/pipeline-runs/pipeline-4438eb4c-ae13-42b1-90e2-b043d8983be8.jsonl +0 -5
  155. package/apps/vibe/.crew/pipeline-runs/pipeline-4740a9f5-86e7-44b6-a394-de433e291727.jsonl +0 -5
  156. package/apps/vibe/.crew/pipeline-runs/pipeline-49e1da6a-957e-48fd-9220-415019e4f8e2.jsonl +0 -5
  157. package/apps/vibe/.crew/pipeline-runs/pipeline-4c9251db-be68-427b-a3fc-a264f2b5778d.jsonl +0 -5
  158. package/apps/vibe/.crew/pipeline-runs/pipeline-6413fa33-a802-4b57-a8c0-a9056ad67842.jsonl +0 -5
  159. package/apps/vibe/.crew/pipeline-runs/pipeline-65e29a57-664d-4196-8109-017e364f182e.jsonl +0 -5
  160. package/apps/vibe/.crew/pipeline-runs/pipeline-6aa04bc5-9593-4b1f-b58d-3bf2978cb602.jsonl +0 -5
  161. package/apps/vibe/.crew/pipeline-runs/pipeline-6e1cba53-9b70-457e-99e0-59199149dd21.jsonl +0 -5
  162. package/apps/vibe/.crew/pipeline-runs/pipeline-749f41cc-4dac-4204-be64-873a6080a0d2.jsonl +0 -5
  163. package/apps/vibe/.crew/pipeline-runs/pipeline-74d68121-e181-4864-bd9a-c3211341dfaf.jsonl +0 -5
  164. package/apps/vibe/.crew/pipeline-runs/pipeline-8509bc24-142d-4e07-b44a-a50bf99d1103.jsonl +0 -5
  165. package/apps/vibe/.crew/pipeline-runs/pipeline-960339c6-07ca-43ce-9900-f6e1702b39b9.jsonl +0 -5
  166. package/apps/vibe/.crew/pipeline-runs/pipeline-9bef2dd2-6122-42e5-b3d9-19f4d80f9e40.jsonl +0 -5
  167. package/apps/vibe/.crew/pipeline-runs/pipeline-9c6480a9-7031-4146-b241-825b9a2d1de1.jsonl +0 -5
  168. package/apps/vibe/.crew/pipeline-runs/pipeline-9fd42426-8492-4157-9d5f-e1537c060489.jsonl +0 -2
  169. package/apps/vibe/.crew/pipeline-runs/pipeline-ad6d40a3-2f5e-46a9-a345-47caaccc51aa.jsonl +0 -5
  170. package/apps/vibe/.crew/pipeline-runs/pipeline-bc606133-8d5b-4535-8d85-f1a29cdaa981.jsonl +0 -5
  171. package/apps/vibe/.crew/pipeline-runs/pipeline-c1418f4e-b773-4ca1-84a3-216acf36e2f2.jsonl +0 -5
  172. package/apps/vibe/.crew/pipeline-runs/pipeline-c1a13ccd-634a-4d01-a4a7-1177b8a752ff.jsonl +0 -5
  173. package/apps/vibe/.crew/pipeline-runs/pipeline-c7d27b42-249e-4bd4-8f26-6aa998110b8a.jsonl +0 -5
  174. package/apps/vibe/.crew/pipeline-runs/pipeline-cca2e9b9-4a34-4d25-a311-5c793fa7e91e.jsonl +0 -5
  175. package/apps/vibe/.crew/sandbox.json +0 -7
  176. package/apps/vibe/.crew/session.json +0 -330
  177. package/apps/vibe/.crew/training-data.jsonl +0 -0
  178. package/apps/vibe/.github/workflows/studio-quality.yml +0 -37
  179. package/apps/vibe/.studio-data/project-messages/chuck-norris.jsonl +0 -18
  180. package/apps/vibe/.studio-data/project-messages/general.jsonl +0 -81
  181. package/apps/vibe/.studio-data/project-messages/studio-local.jsonl +0 -18
  182. package/apps/vibe/ARCHITECTURE.md +0 -3393
  183. package/apps/vibe/QUICK-REFERENCE.md +0 -211
  184. package/apps/vibe/ROADMAP.md +0 -41
  185. package/apps/vibe/STUDIO-SETUP-COMPLETE.md +0 -35
  186. package/apps/vibe/VISUAL-GUIDE.md +0 -378
  187. package/apps/vibe/capture-demo.mjs +0 -160
  188. package/apps/vibe/capture-full-demo.mjs +0 -255
  189. package/apps/vibe/capture-quickstart.mjs +0 -256
  190. package/apps/vibe/capture-vibe-assets.mjs +0 -71
  191. package/apps/vibe/capture-vibe-video.mjs +0 -260
  192. package/apps/vibe/check-buttons.js +0 -41
  193. package/apps/vibe/diagnose.html +0 -106
  194. package/apps/vibe/fix-buttons.js +0 -103
  195. package/apps/vibe/index.html +0 -3404
  196. package/apps/vibe/package-lock.json +0 -920
  197. package/apps/vibe/scripts/studio-pty-host.py +0 -117
  198. package/apps/vibe/src/main.js +0 -2940
  199. package/apps/vibe/src/register-all-languages.js +0 -98
  200. package/apps/vibe/start-studio.sh +0 -11
  201. package/apps/vibe/test/accessibility-tests.js +0 -77
  202. package/apps/vibe/test/browser-performance-audit.mjs +0 -205
  203. package/apps/vibe/test/performance-tests.js +0 -120
  204. package/apps/vibe/test/security-tests.js +0 -213
  205. package/apps/vibe/tests/e2e.local.mjs +0 -54
  206. package/apps/vibe/tests/server.smoke.mjs +0 -106
  207. package/apps/vibe/update_website.mjs +0 -74
  208. package/apps/vibe/vite.config.js +0 -19
  209. package/lib/crew-lead/chat-handler.mjs.bak +0 -1274
  210. package/lib/engines/rt-envelope.mjs.backup-current +0 -870
@@ -1,229 +0,0 @@
1
- import { escHtml } from '../core/dom.js';
2
-
3
// ── Benchmark task runner state ──────────────────────────────────────────────

/**
 * Task rows cached by the most recent loadBenchmarkTasks() call (originally
 * noted as coming from HuggingFace). The task picker stores indexes into this
 * array as its option values.
 */
let _runnerTasks = [];

/** AbortController for the SSE stream of the active benchmark run, or null when idle. */
let _runnerAbort = null;
6
-
7
/**
 * Switch the dashboard to the Benchmarks view and refresh its contents.
 *
 * @param {object} [hooks]
 * @param {Function} [hooks.hideAllViews] - Called first (when it is a function) to hide other views.
 * @param {Function} [hooks.setNavActive] - Called with 'navBenchmarks' (when it is a function).
 * @returns {void} The option/leaderboard refresh continues asynchronously.
 */
export function showBenchmarks({ hideAllViews, setNavActive } = {}) {
  if (typeof hideAllViews === 'function') {
    hideAllViews();
  }

  document.getElementById('benchmarksView')?.classList.add('active');

  if (typeof setNavActive === 'function') {
    setNavActive('navBenchmarks');
  }

  // Once the dropdown is populated, render the leaderboard for whatever
  // benchmark ended up selected (if any).
  const renderSelectedLeaderboard = () => {
    const picker = document.getElementById('benchmarkSelect');
    const chosen = picker ? picker.value : '';
    if (chosen) {
      loadBenchmarkLeaderboard(chosen);
    }
  };
  loadBenchmarkOptions().then(renderSelectedLeaderboard);
}
17
-
18
/**
 * Populate the #benchmarkSelect dropdown from /api/zeroeval/benchmarks.
 *
 * Entries may be plain ids or objects carrying `benchmark_id`/`id` and an
 * optional `name`. The previously selected benchmark is restored when it is
 * still offered; otherwise 'swe-bench-verified' is selected when available.
 *
 * @returns {Promise<string|undefined>} The selected value after loading, or
 *   undefined when the select element is missing or loading failed (the
 *   dropdown then shows a "Failed to load" placeholder).
 */
export async function loadBenchmarkOptions() {
  const DEFAULT_BENCHMARK = 'swe-bench-verified';

  const sel = document.getElementById('benchmarkSelect');
  if (!sel) return;

  // Remember the selection before replacing the options wipes it.
  const cur = sel.value;
  sel.innerHTML = '<option value="">— Loading… —</option>';

  try {
    const r = await fetch('/api/zeroeval/benchmarks');
    if (!r.ok) throw new Error('HTTP ' + r.status);
    const arr = await r.json();
    if (!Array.isArray(arr)) throw new Error('Expected array');

    sel.innerHTML = '<option value="">— Pick benchmark —</option>';

    // Track the ids exactly as they are written to the options so the
    // restore/default checks below agree with what the dropdown offers.
    const offeredIds = new Set();
    for (const b of arr) {
      // typeof null === 'object', so null must be excluded explicitly.
      const isRecord = b !== null && typeof b === 'object';
      const id = isRecord ? (b.benchmark_id || b.id) : b;
      if (id == null) continue; // nothing usable to select by

      const opt = document.createElement('option');
      opt.value = String(id);
      opt.textContent = isRecord ? (b.name || id) : id;
      sel.appendChild(opt);
      offeredIds.add(String(id));
    }

    if (cur && offeredIds.has(cur)) {
      sel.value = cur;
    } else if (offeredIds.has(DEFAULT_BENCHMARK)) {
      sel.value = DEFAULT_BENCHMARK;
    }
    return sel.value;
  } catch (e) {
    // Best effort: the UI shows a placeholder, the console keeps the cause.
    console.warn('Failed to load benchmark list:', e);
    sel.innerHTML = '<option value="">— Failed to load —</option>';
  }
}
49
-
50
/**
 * Reload the benchmark dropdown, then kick off rendering of the leaderboard
 * for the selected benchmark (if one is selected).
 *
 * @returns {Promise<void>} Resolves once the options are loaded; the
 *   leaderboard request is started but not awaited.
 */
export async function loadBenchmarks() {
  await loadBenchmarkOptions();

  const picker = document.getElementById('benchmarkSelect');
  const chosen = picker ? picker.value : '';
  if (!chosen) return;

  loadBenchmarkLeaderboard(chosen);
}
55
-
56
/**
 * Fetch one benchmark from /api/zeroeval/benchmarks/<id> and render its
 * summary line into #benchmarkMeta and its top 100 models into
 * #benchmarkTable.
 *
 * All API-provided text (name, description, rank, model, organisation,
 * method, totals) is escaped before being written via innerHTML.
 *
 * @param {string} benchmarkId - Benchmark to show; a falsy value clears the view.
 * @returns {Promise<void>} Never rejects for fetch/render failures; those are
 *   rendered as an error message inside the table container.
 */
export async function loadBenchmarkLeaderboard(benchmarkId) {
  const tableEl = document.getElementById('benchmarkTable');
  const metaEl = document.getElementById('benchmarkMeta');
  if (!tableEl || !metaEl) return;

  if (!benchmarkId) {
    tableEl.innerHTML = '';
    metaEl.style.display = 'none';
    return;
  }

  tableEl.innerHTML = '<div class="meta" style="padding:20px;">Loading…</div>';
  metaEl.style.display = 'none';

  try {
    const r = await fetch('/api/zeroeval/benchmarks/' + encodeURIComponent(benchmarkId));
    const data = await r.json();
    if (!r.ok) throw new Error(data.error || data.detail || 'Failed to load');

    // Accept either key, but only real arrays of records.
    const listed = data.entries || data.models;
    const models = Array.isArray(listed)
      ? listed.filter((m) => m !== null && typeof m === 'object')
      : [];

    const totalModels = data.total_models ?? data.statistics?.total_models ?? models.length;
    const avgScore = data.statistics?.average_score
      ?? (models.length
        ? models.reduce((sum, m) => sum + benchmarkScoreOf(m), 0) / models.length
        : 0);
    const displayName = String(data.benchmark_name || data.name || benchmarkId);
    const displayDesc = String(data.benchmark_description || data.description || '');

    metaEl.innerHTML = '<b>' + escHtml(displayName) + '</b>'
      + (displayDesc ? ': ' + escHtml(displayDesc.slice(0, 200)) : '')
      + ' | ' + escHtml(String(totalModels)) + ' models, avg ' + (avgScore * 100).toFixed(1) + '%';
    metaEl.style.display = 'block';

    if (!models.length) {
      tableEl.innerHTML = '<div class="meta" style="padding:20px;">No model scores for this benchmark.</div>';
      return;
    }

    const rows = models.slice(0, 100).map(renderBenchmarkRow).join('');
    tableEl.innerHTML = '<table style="width:100%;border-collapse:collapse;font-size:12px;"><thead><tr style="border-bottom:1px solid var(--border);"><th style="text-align:left;padding:6px 10px;">Rank</th><th style="text-align:left;padding:6px 10px;">Model</th><th style="text-align:left;padding:6px 10px;">Org</th><th style="text-align:left;padding:6px 10px;">Score</th><th style="text-align:left;padding:6px 10px;" title="¢ per 1M input">in ¢</th><th style="text-align:left;padding:6px 10px;" title="¢ per 1M output">out ¢</th><th style="text-align:left;padding:6px 10px;" title="¢ per score point">¢/pt</th><th style="text-align:left;padding:6px 10px;">Method</th></tr></thead><tbody>' + rows + '</tbody></table>';
  } catch (e) {
    tableEl.innerHTML = '<div style="color:var(--red);padding:20px;">Error: ' + escHtml(e.message) + '</div>';
  }
}

/** Pick a model's score: normalized, then benchmark, then raw score, else 0. */
function benchmarkScoreOf(m) {
  return m.normalized_score ?? m.benchmark_score ?? m.score ?? 0;
}

/** Render one leaderboard <tr>, escaping every API-provided text field. */
function renderBenchmarkRow(m) {
  const score = benchmarkScoreOf(m);
  const pct = (score * 100).toFixed(1);

  const inp = m.input_cost_per_million != null ? Math.round(m.input_cost_per_million * 100) + '¢' : '—';
  const out = m.output_cost_per_million != null ? Math.round(m.output_cost_per_million * 100) + '¢' : '—';

  // Combined in+out price (in cents) divided by the score in percentage points.
  const inC = m.input_cost_per_million ?? 0;
  const outC = m.output_cost_per_million ?? 0;
  const centsPerPt = (inC + outC) > 0 && score > 0
    ? ((inC + outC) * 100 / (score * 100)).toFixed(1) + '¢/pt'
    : '—';

  const rank = escHtml(String(m.rank || '-'));
  const modelName = escHtml(String(m.model_name || m.model_id || ''));
  const orgName = escHtml(String(m.organization_name || ''));
  // String() first so a non-string method cannot break .slice().
  const method = escHtml(String(m.analysis_method || '-').slice(0, 40));

  return '<tr>'
    + '<td style="padding:6px 10px;">' + rank + '</td>'
    + '<td style="padding:6px 10px;">' + modelName + '</td>'
    + '<td style="padding:6px 10px;">' + orgName + '</td>'
    + '<td style="padding:6px 10px;font-weight:600;">' + pct + '%</td>'
    + '<td style="padding:6px 10px;font-size:11px;" title="¢ per 1M input tokens">' + inp + '</td>'
    + '<td style="padding:6px 10px;font-size:11px;" title="¢ per 1M output tokens">' + out + '</td>'
    + '<td style="padding:6px 10px;font-size:11px;" title="¢ per score point (1M in+out / score%)">' + centsPerPt + '</td>'
    + '<td style="padding:6px 10px;font-size:11px;">' + method + '</td>'
    + '</tr>';
}
97
-
98
- // ── Custom runner — load SWE-Bench tasks into the task picker ────────────────
99
/**
 * Fetch the first 50 'swe-bench-verified' tasks from /api/benchmark-tasks,
 * cache them in `_runnerTasks`, and list them in #benchmarkTaskSelect.
 *
 * Each option's value is the task's index in `_runnerTasks`; its label is the
 * task id followed by the repo in parentheses when one is present.
 *
 * @returns {Promise<void>} Failures are shown as a placeholder option.
 */
export async function loadBenchmarkTasks() {
  const picker = document.getElementById('benchmarkTaskSelect');
  if (!picker) return;

  picker.innerHTML = '<option value="">— Loading tasks… —</option>';

  try {
    const response = await fetch('/api/benchmark-tasks?benchmark=swe-bench-verified&offset=0&length=50');
    const payload = await response.json();
    if (!response.ok) {
      throw new Error(payload.error || 'Failed to load tasks');
    }

    // Rows may arrive wrapped as { row: {...} }; unwrap when they are.
    const rawRows = payload.rows || [];
    _runnerTasks = rawRows.map((entry) => entry.row || entry);

    picker.innerHTML = '<option value="">— Pick a task —</option>';
    for (const [position, task] of _runnerTasks.entries()) {
      const label = task.instance_id || task.id || `task-${position}`;
      const repoName = task.repo || '';
      const suffix = repoName ? ` (${repoName})` : '';

      const option = document.createElement('option');
      option.value = position;
      option.textContent = `${label}${suffix}`;
      picker.appendChild(option);
    }
  } catch (err) {
    picker.innerHTML = '<option value="">— Failed: ' + escHtml(err.message) + ' —</option>';
  }
}
122
-
123
- // Show problem statement preview when a task is selected
124
/**
 * Show the first 800 characters of the selected task's problem statement in
 * #benchmarkTaskPreview, or hide the preview when no cached task matches.
 *
 * @param {string|number|null|undefined} idx - Index into `_runnerTasks`.
 * @returns {void}
 */
export function onBenchmarkTaskSelect(idx) {
  const panel = document.getElementById('benchmarkTaskPreview');
  if (!panel) return;

  const hasIndex = !(idx === '' || idx == null);
  const task = hasIndex ? _runnerTasks[idx] : undefined;
  if (!task) {
    panel.style.display = 'none';
    return;
  }

  const statement = task.problem_statement || task.description || '(no problem statement)';
  const excerpt = statement.slice(0, 800);
  // Mark truncated statements with an ellipsis on its own line.
  const suffix = statement.length > 800 ? '\n…' : '';

  panel.textContent = `${excerpt}${suffix}`;
  panel.style.display = 'block';
}
133
-
134
- // ── Stream a benchmark task through an engine ────────────────────────────────
135
/**
 * Run the task selected in #benchmarkTaskSelect through the chosen engine by
 * POSTing to /api/benchmark-run and streaming the response into the UI.
 *
 * The response is read as "data: <json>" frames separated by blank lines:
 *   - { type: 'chunk', text }      → appended to #benchmarkRunStream
 *   - { type: 'done', exitCode }   → final status (exit code 0 or absent = success)
 *   - { type: 'error' } / { error } → appended as an "[error]" line
 * Frames that are not valid JSON are ignored.
 *
 * Starting a run aborts any run still in flight. A run that has been
 * superseded by a newer one leaves the shared UI and `_runnerAbort` alone.
 *
 * @returns {Promise<void>} Never rejects for network/stream failures; those
 *   are reported in the stream and status elements.
 */
export async function runBenchmarkTask() {
  const sel = document.getElementById('benchmarkTaskSelect');
  const engineSel = document.getElementById('benchmarkRunEngine');
  const modelInput = document.getElementById('benchmarkRunModel');
  const outputEl = document.getElementById('benchmarkRunOutput');
  const streamEl = document.getElementById('benchmarkRunStream');
  const statusEl = document.getElementById('benchmarkRunStatus');
  const stopBtn = document.getElementById('benchmarkRunStop');
  const runBtn = document.getElementById('benchmarkRunBtn');
  if (!sel || !engineSel || !outputEl || !streamEl) return;

  const idx = sel.value;
  if (idx === '' || idx == null || !_runnerTasks[idx]) {
    alert('Pick a task first — click "↻ Load Tasks" if the list is empty.');
    return;
  }
  const task = _runnerTasks[idx];
  const engine = engineSel.value;
  const model = (modelInput?.value || '').trim() || undefined;

  // The status element is optional; an empty colour clears any inline colour
  // left behind by a previous run.
  const setStatus = (text, color = '') => {
    if (!statusEl) return;
    statusEl.textContent = text;
    statusEl.style.color = color;
  };

  // Cancel any existing run, then register this run's controller.
  if (_runnerAbort) {
    try { _runnerAbort.abort(); } catch { /* nothing to cancel */ }
  }
  const controller = new AbortController();
  _runnerAbort = controller;
  // True once a newer run has installed its own controller. A user-initiated
  // stop sets _runnerAbort to null instead, which still belongs to this run.
  const isSuperseded = () => _runnerAbort !== null && _runnerAbort !== controller;

  outputEl.style.display = 'flex';
  streamEl.textContent = '';
  setStatus(`Running on ${engine}…`);
  if (stopBtn) stopBtn.style.display = 'inline-block';
  if (runBtn) runBtn.disabled = true;

  // Apply one event frame to the UI.
  const handleFrame = (frame) => {
    const payload = frame.trim().replace(/^data:\s*/, '');
    if (!payload) return;
    let ev;
    try {
      ev = JSON.parse(payload);
    } catch {
      return; // not a JSON frame — deliberately ignored
    }
    if (!ev) return;
    if (ev.type === 'chunk' && ev.text) {
      streamEl.textContent += ev.text;
      streamEl.scrollTop = streamEl.scrollHeight;
    } else if (ev.type === 'done') {
      const ok = ev.exitCode === 0 || ev.exitCode == null;
      setStatus(ok ? '✓ Done' : `✗ Exit ${ev.exitCode}`, ok ? 'var(--green)' : 'var(--red)');
    } else if (ev.type === 'error' || ev.error) {
      streamEl.textContent += '\n[error] ' + (ev.error || ev.message || JSON.stringify(ev));
    }
  };

  try {
    const resp = await fetch('/api/benchmark-run', {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({
        instanceId: task.instance_id || task.id,
        problemStatement: task.problem_statement || task.description || '',
        repo: task.repo || '',
        hints: task.hints_text || '',
        engine,
        ...(model ? { model } : {}),
      }),
      signal: controller.signal,
    });

    if (!resp.ok) {
      // Surface HTTP failures instead of silently parsing an error page.
      let detail = '';
      try { detail = (await resp.text()).slice(0, 200); } catch { /* body unavailable */ }
      throw new Error(`HTTP ${resp.status}` + (detail ? `: ${detail}` : ''));
    }
    if (!resp.body) throw new Error('Response has no body');

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buf += decoder.decode(value, { stream: true });
      const parts = buf.split('\n\n');
      buf = parts.pop(); // last piece may be an incomplete frame
      for (const part of parts) handleFrame(part);
    }

    // Flush the decoder and process a final frame that arrived without a
    // trailing blank line.
    buf += decoder.decode();
    if (buf.trim()) handleFrame(buf);
  } catch (e) {
    if (!isSuperseded()) {
      if (e.name !== 'AbortError') {
        streamEl.textContent += '\n[stream error] ' + e.message;
        setStatus('✗ Error', 'var(--red)');
      } else {
        setStatus('⏹ Stopped', 'var(--text-2)');
      }
    }
  } finally {
    // Only the run that still owns the UI may reset the controls.
    if (!isSuperseded()) {
      if (stopBtn) stopBtn.style.display = 'none';
      if (runBtn) runBtn.disabled = false;
    }
    if (_runnerAbort === controller) _runnerAbort = null;
  }
}
223
-
224
/**
 * Abort the benchmark run tracked in `_runnerAbort`, if any, and clear the
 * reference. Does nothing when no run is tracked.
 *
 * @returns {void}
 */
export function stopBenchmarkRun() {
  const active = _runnerAbort;
  if (!active) return;

  try {
    active.abort();
  } catch {}
  _runnerAbort = null;
}