@wingman-ai/gateway 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/config/agentConfig.cjs +4 -0
- package/dist/agent/config/agentConfig.d.ts +12 -0
- package/dist/agent/config/agentConfig.js +4 -0
- package/dist/agent/config/toolRegistry.cjs +75 -1
- package/dist/agent/config/toolRegistry.d.ts +3 -0
- package/dist/agent/config/toolRegistry.js +75 -1
- package/dist/agent/middleware/large-tool-results.cjs +207 -0
- package/dist/agent/middleware/large-tool-results.d.ts +16 -0
- package/dist/agent/middleware/large-tool-results.js +173 -0
- package/dist/agent/tools/browser_control.cjs +9 -1231
- package/dist/agent/tools/browser_control.d.ts +126 -234
- package/dist/agent/tools/browser_control.js +7 -1226
- package/dist/agent/tools/browser_runtime.cjs +1189 -0
- package/dist/agent/tools/browser_runtime.d.ts +560 -0
- package/dist/agent/tools/browser_runtime.js +1122 -0
- package/dist/agent/tools/browser_session.cjs +153 -0
- package/dist/agent/tools/browser_session.d.ts +741 -0
- package/dist/agent/tools/browser_session.js +110 -0
- package/dist/agent/tools/browser_session_manager.cjs +202 -0
- package/dist/agent/tools/browser_session_manager.d.ts +64 -0
- package/dist/agent/tools/browser_session_manager.js +165 -0
- package/dist/cli/commands/init.cjs +5 -1
- package/dist/cli/commands/init.js +5 -1
- package/dist/cli/config/loader.cjs +0 -5
- package/dist/cli/config/loader.js +0 -5
- package/dist/cli/config/schema.cjs +3 -7
- package/dist/cli/config/schema.d.ts +6 -6
- package/dist/cli/config/schema.js +3 -7
- package/dist/cli/core/agentInvoker.cjs +86 -22
- package/dist/cli/core/agentInvoker.d.ts +10 -3
- package/dist/cli/core/agentInvoker.js +86 -25
- package/dist/cli/core/outputManager.cjs +7 -2
- package/dist/cli/core/outputManager.d.ts +2 -2
- package/dist/cli/core/outputManager.js +7 -2
- package/dist/cli/types.d.ts +2 -1
- package/dist/webui/assets/index-XrEnkZiq.css +11 -0
- package/dist/webui/assets/index-mDs6HbKM.js +215 -0
- package/dist/webui/index.html +2 -2
- package/package.json +10 -10
- package/templates/agents/README.md +2 -1
- package/templates/agents/coding/agent.md +6 -13
- package/templates/agents/coding-v2/agent.md +6 -1
- package/templates/agents/game-dev/agent.md +8 -2
- package/templates/agents/game-dev/game-designer.md +4 -0
- package/templates/agents/game-dev/scene-engineer.md +4 -0
- package/templates/agents/main/agent.md +5 -0
- package/templates/agents/researcher/agent.md +11 -0
- package/templates/agents/stock-trader/agent.md +4 -0
- package/dist/agent/tests/agentConfig.test.cjs +0 -224
- package/dist/agent/tests/agentConfig.test.d.ts +0 -1
- package/dist/agent/tests/agentConfig.test.js +0 -218
- package/dist/agent/tests/agentLoader.test.cjs +0 -335
- package/dist/agent/tests/agentLoader.test.d.ts +0 -1
- package/dist/agent/tests/agentLoader.test.js +0 -329
- package/dist/agent/tests/backgroundTerminal.test.cjs +0 -70
- package/dist/agent/tests/backgroundTerminal.test.d.ts +0 -1
- package/dist/agent/tests/backgroundTerminal.test.js +0 -64
- package/dist/agent/tests/browserControlHelpers.test.cjs +0 -35
- package/dist/agent/tests/browserControlHelpers.test.d.ts +0 -1
- package/dist/agent/tests/browserControlHelpers.test.js +0 -29
- package/dist/agent/tests/browserControlTool.test.cjs +0 -2117
- package/dist/agent/tests/browserControlTool.test.d.ts +0 -1
- package/dist/agent/tests/browserControlTool.test.js +0 -2111
- package/dist/agent/tests/commandExecuteTool.test.cjs +0 -29
- package/dist/agent/tests/commandExecuteTool.test.d.ts +0 -1
- package/dist/agent/tests/commandExecuteTool.test.js +0 -23
- package/dist/agent/tests/internet_search.test.cjs +0 -107
- package/dist/agent/tests/internet_search.test.d.ts +0 -1
- package/dist/agent/tests/internet_search.test.js +0 -101
- package/dist/agent/tests/mcpClientManager.test.cjs +0 -290
- package/dist/agent/tests/mcpClientManager.test.d.ts +0 -1
- package/dist/agent/tests/mcpClientManager.test.js +0 -284
- package/dist/agent/tests/mcpResourceTools.test.cjs +0 -101
- package/dist/agent/tests/mcpResourceTools.test.d.ts +0 -1
- package/dist/agent/tests/mcpResourceTools.test.js +0 -95
- package/dist/agent/tests/modelFactory.test.cjs +0 -190
- package/dist/agent/tests/modelFactory.test.d.ts +0 -1
- package/dist/agent/tests/modelFactory.test.js +0 -184
- package/dist/agent/tests/terminalSessionManager.test.cjs +0 -121
- package/dist/agent/tests/terminalSessionManager.test.d.ts +0 -1
- package/dist/agent/tests/terminalSessionManager.test.js +0 -115
- package/dist/agent/tests/test-agent-loader.cjs +0 -33
- package/dist/agent/tests/test-agent-loader.d.ts +0 -1
- package/dist/agent/tests/test-agent-loader.js +0 -27
- package/dist/agent/tests/test-subagent-loading.cjs +0 -99
- package/dist/agent/tests/test-subagent-loading.d.ts +0 -1
- package/dist/agent/tests/test-subagent-loading.js +0 -93
- package/dist/agent/tests/toolRegistry.test.cjs +0 -147
- package/dist/agent/tests/toolRegistry.test.d.ts +0 -1
- package/dist/agent/tests/toolRegistry.test.js +0 -141
- package/dist/agent/tests/uiRegistryTools.test.cjs +0 -114
- package/dist/agent/tests/uiRegistryTools.test.d.ts +0 -1
- package/dist/agent/tests/uiRegistryTools.test.js +0 -105
- package/dist/agent/tests/xaiImageModel.test.cjs +0 -194
- package/dist/agent/tests/xaiImageModel.test.d.ts +0 -1
- package/dist/agent/tests/xaiImageModel.test.js +0 -188
- package/dist/tests/additionalMessageMiddleware.test.cjs +0 -216
- package/dist/tests/additionalMessageMiddleware.test.d.ts +0 -1
- package/dist/tests/additionalMessageMiddleware.test.js +0 -188
- package/dist/tests/agent-config-voice.test.cjs +0 -25
- package/dist/tests/agent-config-voice.test.d.ts +0 -1
- package/dist/tests/agent-config-voice.test.js +0 -19
- package/dist/tests/agentInvokerAttachments.test.cjs +0 -190
- package/dist/tests/agentInvokerAttachments.test.d.ts +0 -1
- package/dist/tests/agentInvokerAttachments.test.js +0 -184
- package/dist/tests/agentInvokerSummarization.test.cjs +0 -613
- package/dist/tests/agentInvokerSummarization.test.d.ts +0 -1
- package/dist/tests/agentInvokerSummarization.test.js +0 -607
- package/dist/tests/agentInvokerTokenUsage.test.cjs +0 -124
- package/dist/tests/agentInvokerTokenUsage.test.d.ts +0 -1
- package/dist/tests/agentInvokerTokenUsage.test.js +0 -118
- package/dist/tests/agentInvokerWorkdir.test.cjs +0 -150
- package/dist/tests/agentInvokerWorkdir.test.d.ts +0 -1
- package/dist/tests/agentInvokerWorkdir.test.js +0 -122
- package/dist/tests/agents-api.test.cjs +0 -324
- package/dist/tests/agents-api.test.d.ts +0 -1
- package/dist/tests/agents-api.test.js +0 -318
- package/dist/tests/attachments-utils.test.cjs +0 -46
- package/dist/tests/attachments-utils.test.d.ts +0 -1
- package/dist/tests/attachments-utils.test.js +0 -40
- package/dist/tests/browser-command.test.cjs +0 -264
- package/dist/tests/browser-command.test.d.ts +0 -1
- package/dist/tests/browser-command.test.js +0 -258
- package/dist/tests/browser-relay-server.test.cjs +0 -20
- package/dist/tests/browser-relay-server.test.d.ts +0 -1
- package/dist/tests/browser-relay-server.test.js +0 -14
- package/dist/tests/bunSqliteAdapter.test.cjs +0 -265
- package/dist/tests/bunSqliteAdapter.test.d.ts +0 -1
- package/dist/tests/bunSqliteAdapter.test.js +0 -259
- package/dist/tests/candleRange.test.cjs +0 -48
- package/dist/tests/candleRange.test.d.ts +0 -1
- package/dist/tests/candleRange.test.js +0 -42
- package/dist/tests/cli-config-loader.test.cjs +0 -532
- package/dist/tests/cli-config-loader.test.d.ts +0 -1
- package/dist/tests/cli-config-loader.test.js +0 -526
- package/dist/tests/cli-config-warnings.test.cjs +0 -94
- package/dist/tests/cli-config-warnings.test.d.ts +0 -1
- package/dist/tests/cli-config-warnings.test.js +0 -88
- package/dist/tests/cli-init.test.cjs +0 -225
- package/dist/tests/cli-init.test.d.ts +0 -1
- package/dist/tests/cli-init.test.js +0 -219
- package/dist/tests/cli-workspace-root.test.cjs +0 -114
- package/dist/tests/cli-workspace-root.test.d.ts +0 -1
- package/dist/tests/cli-workspace-root.test.js +0 -108
- package/dist/tests/codex-credentials-precedence.test.cjs +0 -94
- package/dist/tests/codex-credentials-precedence.test.d.ts +0 -1
- package/dist/tests/codex-credentials-precedence.test.js +0 -88
- package/dist/tests/codex-provider.test.cjs +0 -383
- package/dist/tests/codex-provider.test.d.ts +0 -1
- package/dist/tests/codex-provider.test.js +0 -377
- package/dist/tests/config-json-schema.test.cjs +0 -37
- package/dist/tests/config-json-schema.test.d.ts +0 -1
- package/dist/tests/config-json-schema.test.js +0 -31
- package/dist/tests/discord-adapter.test.cjs +0 -89
- package/dist/tests/discord-adapter.test.d.ts +0 -1
- package/dist/tests/discord-adapter.test.js +0 -83
- package/dist/tests/falRuntime.test.cjs +0 -78
- package/dist/tests/falRuntime.test.d.ts +0 -1
- package/dist/tests/falRuntime.test.js +0 -72
- package/dist/tests/falSummary.test.cjs +0 -51
- package/dist/tests/falSummary.test.d.ts +0 -1
- package/dist/tests/falSummary.test.js +0 -45
- package/dist/tests/fs-api.test.cjs +0 -138
- package/dist/tests/fs-api.test.d.ts +0 -1
- package/dist/tests/fs-api.test.js +0 -132
- package/dist/tests/gateway-command-workspace.test.cjs +0 -150
- package/dist/tests/gateway-command-workspace.test.d.ts +0 -1
- package/dist/tests/gateway-command-workspace.test.js +0 -144
- package/dist/tests/gateway-http-security.test.cjs +0 -318
- package/dist/tests/gateway-http-security.test.d.ts +0 -1
- package/dist/tests/gateway-http-security.test.js +0 -312
- package/dist/tests/gateway-node-mode.test.cjs +0 -174
- package/dist/tests/gateway-node-mode.test.d.ts +0 -1
- package/dist/tests/gateway-node-mode.test.js +0 -168
- package/dist/tests/gateway-origin-policy.test.cjs +0 -82
- package/dist/tests/gateway-origin-policy.test.d.ts +0 -1
- package/dist/tests/gateway-origin-policy.test.js +0 -76
- package/dist/tests/gateway-request-execution-overrides.test.cjs +0 -42
- package/dist/tests/gateway-request-execution-overrides.test.d.ts +0 -1
- package/dist/tests/gateway-request-execution-overrides.test.js +0 -36
- package/dist/tests/gateway.test.cjs +0 -700
- package/dist/tests/gateway.test.d.ts +0 -1
- package/dist/tests/gateway.test.js +0 -694
- package/dist/tests/hooks-matcher.test.cjs +0 -309
- package/dist/tests/hooks-matcher.test.d.ts +0 -1
- package/dist/tests/hooks-matcher.test.js +0 -303
- package/dist/tests/hooks-merger.test.cjs +0 -528
- package/dist/tests/hooks-merger.test.d.ts +0 -1
- package/dist/tests/hooks-merger.test.js +0 -522
- package/dist/tests/imagePersistence.test.cjs +0 -169
- package/dist/tests/imagePersistence.test.d.ts +0 -1
- package/dist/tests/imagePersistence.test.js +0 -163
- package/dist/tests/integration/agent-invocation.integration.test.cjs +0 -264
- package/dist/tests/integration/agent-invocation.integration.test.d.ts +0 -1
- package/dist/tests/integration/agent-invocation.integration.test.js +0 -258
- package/dist/tests/integration/finnhub-candles.integration.test.cjs +0 -98
- package/dist/tests/integration/finnhub-candles.integration.test.d.ts +0 -1
- package/dist/tests/integration/finnhub-candles.integration.test.js +0 -92
- package/dist/tests/integration/summarization-e2e.integration.test.cjs +0 -127
- package/dist/tests/integration/summarization-e2e.integration.test.d.ts +0 -1
- package/dist/tests/integration/summarization-e2e.integration.test.js +0 -121
- package/dist/tests/logger.test.cjs +0 -353
- package/dist/tests/logger.test.d.ts +0 -1
- package/dist/tests/logger.test.js +0 -347
- package/dist/tests/mediaCompatibilityMiddleware.test.cjs +0 -106
- package/dist/tests/mediaCompatibilityMiddleware.test.d.ts +0 -1
- package/dist/tests/mediaCompatibilityMiddleware.test.js +0 -100
- package/dist/tests/node-tools.test.cjs +0 -77
- package/dist/tests/node-tools.test.d.ts +0 -1
- package/dist/tests/node-tools.test.js +0 -71
- package/dist/tests/nodes-api.test.cjs +0 -86
- package/dist/tests/nodes-api.test.d.ts +0 -1
- package/dist/tests/nodes-api.test.js +0 -80
- package/dist/tests/outputManagerContextSummarized.test.cjs +0 -43
- package/dist/tests/outputManagerContextSummarized.test.d.ts +0 -1
- package/dist/tests/outputManagerContextSummarized.test.js +0 -37
- package/dist/tests/provider-command-codex.test.cjs +0 -57
- package/dist/tests/provider-command-codex.test.d.ts +0 -1
- package/dist/tests/provider-command-codex.test.js +0 -51
- package/dist/tests/routines-api.test.cjs +0 -107
- package/dist/tests/routines-api.test.d.ts +0 -1
- package/dist/tests/routines-api.test.js +0 -101
- package/dist/tests/run-terminal-bench-official-script.test.cjs +0 -61
- package/dist/tests/run-terminal-bench-official-script.test.d.ts +0 -1
- package/dist/tests/run-terminal-bench-official-script.test.js +0 -55
- package/dist/tests/sessionManager-uionly.test.cjs +0 -50
- package/dist/tests/sessionManager-uionly.test.d.ts +0 -1
- package/dist/tests/sessionManager-uionly.test.js +0 -44
- package/dist/tests/sessionMessageAttachments.test.cjs +0 -197
- package/dist/tests/sessionMessageAttachments.test.d.ts +0 -1
- package/dist/tests/sessionMessageAttachments.test.js +0 -191
- package/dist/tests/sessionMessageRole.test.cjs +0 -44
- package/dist/tests/sessionMessageRole.test.d.ts +0 -1
- package/dist/tests/sessionMessageRole.test.js +0 -38
- package/dist/tests/sessionStateMessages.test.cjs +0 -236
- package/dist/tests/sessionStateMessages.test.d.ts +0 -1
- package/dist/tests/sessionStateMessages.test.js +0 -230
- package/dist/tests/sessions-api.test.cjs +0 -250
- package/dist/tests/sessions-api.test.d.ts +0 -1
- package/dist/tests/sessions-api.test.js +0 -244
- package/dist/tests/skill-activation.test.cjs +0 -86
- package/dist/tests/skill-activation.test.d.ts +0 -1
- package/dist/tests/skill-activation.test.js +0 -80
- package/dist/tests/skill-metadata.test.cjs +0 -119
- package/dist/tests/skill-metadata.test.d.ts +0 -1
- package/dist/tests/skill-metadata.test.js +0 -113
- package/dist/tests/skill-repository.test.cjs +0 -469
- package/dist/tests/skill-repository.test.d.ts +0 -1
- package/dist/tests/skill-repository.test.js +0 -463
- package/dist/tests/skill-security-scanner.test.cjs +0 -126
- package/dist/tests/skill-security-scanner.test.d.ts +0 -1
- package/dist/tests/skill-security-scanner.test.js +0 -120
- package/dist/tests/sms-api.test.cjs +0 -183
- package/dist/tests/sms-api.test.d.ts +0 -1
- package/dist/tests/sms-api.test.js +0 -177
- package/dist/tests/sms-commands.test.cjs +0 -90
- package/dist/tests/sms-commands.test.d.ts +0 -1
- package/dist/tests/sms-commands.test.js +0 -84
- package/dist/tests/sms-policy-store.test.cjs +0 -69
- package/dist/tests/sms-policy-store.test.d.ts +0 -1
- package/dist/tests/sms-policy-store.test.js +0 -63
- package/dist/tests/teams-adapter.test.cjs +0 -58
- package/dist/tests/teams-adapter.test.d.ts +0 -1
- package/dist/tests/teams-adapter.test.js +0 -52
- package/dist/tests/technicalIndicators.test.cjs +0 -82
- package/dist/tests/technicalIndicators.test.d.ts +0 -1
- package/dist/tests/technicalIndicators.test.js +0 -76
- package/dist/tests/terminal-bench-adapters-helpers.test.cjs +0 -64
- package/dist/tests/terminal-bench-adapters-helpers.test.d.ts +0 -1
- package/dist/tests/terminal-bench-adapters-helpers.test.js +0 -58
- package/dist/tests/terminal-bench-cleanup.test.cjs +0 -93
- package/dist/tests/terminal-bench-cleanup.test.d.ts +0 -1
- package/dist/tests/terminal-bench-cleanup.test.js +0 -87
- package/dist/tests/terminal-bench-config.test.cjs +0 -62
- package/dist/tests/terminal-bench-config.test.d.ts +0 -1
- package/dist/tests/terminal-bench-config.test.js +0 -56
- package/dist/tests/terminal-bench-official.test.cjs +0 -194
- package/dist/tests/terminal-bench-official.test.d.ts +0 -1
- package/dist/tests/terminal-bench-official.test.js +0 -188
- package/dist/tests/terminal-bench-runner.test.cjs +0 -82
- package/dist/tests/terminal-bench-runner.test.d.ts +0 -1
- package/dist/tests/terminal-bench-runner.test.js +0 -76
- package/dist/tests/terminal-bench-scoring.test.cjs +0 -128
- package/dist/tests/terminal-bench-scoring.test.d.ts +0 -1
- package/dist/tests/terminal-bench-scoring.test.js +0 -122
- package/dist/tests/terminalProbe.test.cjs +0 -45
- package/dist/tests/terminalProbe.test.d.ts +0 -1
- package/dist/tests/terminalProbe.test.js +0 -39
- package/dist/tests/terminalProbeAuth.test.cjs +0 -85
- package/dist/tests/terminalProbeAuth.test.d.ts +0 -1
- package/dist/tests/terminalProbeAuth.test.js +0 -79
- package/dist/tests/toolDisplayHelpers.test.cjs +0 -46
- package/dist/tests/toolDisplayHelpers.test.d.ts +0 -1
- package/dist/tests/toolDisplayHelpers.test.js +0 -40
- package/dist/tests/uv.test.cjs +0 -47
- package/dist/tests/uv.test.d.ts +0 -1
- package/dist/tests/uv.test.js +0 -41
- package/dist/tests/voice-config.test.cjs +0 -35
- package/dist/tests/voice-config.test.d.ts +0 -1
- package/dist/tests/voice-config.test.js +0 -29
- package/dist/tests/websocket-transport.test.cjs +0 -31
- package/dist/tests/websocket-transport.test.d.ts +0 -1
- package/dist/tests/websocket-transport.test.js +0 -25
- package/dist/tests/yahooCandles.test.cjs +0 -111
- package/dist/tests/yahooCandles.test.d.ts +0 -1
- package/dist/tests/yahooCandles.test.js +0 -105
- package/dist/tools/finance/optionsAnalytics.test.cjs +0 -128
- package/dist/tools/finance/optionsAnalytics.test.d.ts +0 -1
- package/dist/tools/finance/optionsAnalytics.test.js +0 -122
- package/dist/webui/assets/index-D07GBGp0.js +0 -215
- package/dist/webui/assets/index-DV8IYeOw.css +0 -11
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
import { mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
-
import { access, mkdtemp, readFile, rm } from "node:fs/promises";
|
|
3
|
-
import { tmpdir } from "node:os";
|
|
4
|
-
import { join } from "node:path";
|
|
5
|
-
import { afterEach, describe, expect, it } from "vitest";
|
|
6
|
-
import { cleanBenchArtifacts, getBenchCleanupTargets } from "../bench/cleanup.js";
|
|
7
|
-
const tempDirs = [];
|
|
8
|
-
async function pathExists(path) {
|
|
9
|
-
try {
|
|
10
|
-
await access(path);
|
|
11
|
-
return true;
|
|
12
|
-
} catch {
|
|
13
|
-
return false;
|
|
14
|
-
}
|
|
15
|
-
}
|
|
16
|
-
describe("terminal bench cleanup", ()=>{
|
|
17
|
-
afterEach(async ()=>{
|
|
18
|
-
for (const dir of tempDirs)await rm(dir, {
|
|
19
|
-
recursive: true,
|
|
20
|
-
force: true
|
|
21
|
-
});
|
|
22
|
-
tempDirs.length = 0;
|
|
23
|
-
});
|
|
24
|
-
it("targets generated bench artifacts only", async ()=>{
|
|
25
|
-
const root = await mkdtemp(join(tmpdir(), "wingman-bench-cleanup-"));
|
|
26
|
-
tempDirs.push(root);
|
|
27
|
-
const jobsRun = join(root, "jobs", "2026-01-01__00-00-00");
|
|
28
|
-
const officialRun = join(root, "bench", "results", "official", "2026-01-01__00-00-00");
|
|
29
|
-
const wrapperRun = join(root, "bench", "results", "official-wrapper", "2026-01-01T00-00-00-000Z");
|
|
30
|
-
const pycacheDir = join(root, "bench", "harbor_agents", "__pycache__");
|
|
31
|
-
const configPath = join(root, "bench", "config.tb2-wingman.json");
|
|
32
|
-
mkdirSync(jobsRun, {
|
|
33
|
-
recursive: true
|
|
34
|
-
});
|
|
35
|
-
mkdirSync(officialRun, {
|
|
36
|
-
recursive: true
|
|
37
|
-
});
|
|
38
|
-
mkdirSync(wrapperRun, {
|
|
39
|
-
recursive: true
|
|
40
|
-
});
|
|
41
|
-
mkdirSync(pycacheDir, {
|
|
42
|
-
recursive: true
|
|
43
|
-
});
|
|
44
|
-
writeFileSync(join(jobsRun, "result.json"), "{}");
|
|
45
|
-
writeFileSync(join(officialRun, "result.json"), "{}");
|
|
46
|
-
writeFileSync(join(wrapperRun, "summary.json"), "{}");
|
|
47
|
-
writeFileSync(join(pycacheDir, "cache.pyc"), "x");
|
|
48
|
-
writeFileSync(configPath, "{}");
|
|
49
|
-
const targets = await getBenchCleanupTargets(root);
|
|
50
|
-
expect(targets).toContain(jobsRun);
|
|
51
|
-
expect(targets).toContain(officialRun);
|
|
52
|
-
expect(targets).toContain(wrapperRun);
|
|
53
|
-
expect(targets).toContain(pycacheDir);
|
|
54
|
-
expect(targets).not.toContain(configPath);
|
|
55
|
-
});
|
|
56
|
-
it("removes generated artifacts and keeps config files", async ()=>{
|
|
57
|
-
const root = await mkdtemp(join(tmpdir(), "wingman-bench-cleanup-"));
|
|
58
|
-
tempDirs.push(root);
|
|
59
|
-
const jobsRun = join(root, "jobs", "2026-01-01__00-00-00");
|
|
60
|
-
const officialRun = join(root, "bench", "results", "official", "2026-01-01__00-00-00");
|
|
61
|
-
const wrapperRun = join(root, "bench", "results", "official-wrapper", "2026-01-01T00-00-00-000Z");
|
|
62
|
-
const pycacheDir = join(root, "bench", "harbor_agents", "__pycache__");
|
|
63
|
-
const configPath = join(root, "bench", "config.tb2-wingman.json");
|
|
64
|
-
mkdirSync(jobsRun, {
|
|
65
|
-
recursive: true
|
|
66
|
-
});
|
|
67
|
-
mkdirSync(officialRun, {
|
|
68
|
-
recursive: true
|
|
69
|
-
});
|
|
70
|
-
mkdirSync(wrapperRun, {
|
|
71
|
-
recursive: true
|
|
72
|
-
});
|
|
73
|
-
mkdirSync(pycacheDir, {
|
|
74
|
-
recursive: true
|
|
75
|
-
});
|
|
76
|
-
writeFileSync(configPath, '{"dataset":"terminal-bench@2.0"}');
|
|
77
|
-
const result = await cleanBenchArtifacts(root);
|
|
78
|
-
expect(result.missingPaths).toHaveLength(0);
|
|
79
|
-
expect(result.removedPaths.length).toBeGreaterThanOrEqual(4);
|
|
80
|
-
expect(await pathExists(jobsRun)).toBe(false);
|
|
81
|
-
expect(await pathExists(officialRun)).toBe(false);
|
|
82
|
-
expect(await pathExists(wrapperRun)).toBe(false);
|
|
83
|
-
expect(await pathExists(pycacheDir)).toBe(false);
|
|
84
|
-
expect(await pathExists(configPath)).toBe(true);
|
|
85
|
-
expect(await readFile(configPath, "utf-8")).toContain("terminal-bench@2.0");
|
|
86
|
-
});
|
|
87
|
-
});
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __webpack_exports__ = {};
|
|
3
|
-
const external_node_fs_namespaceObject = require("node:fs");
|
|
4
|
-
const external_node_os_namespaceObject = require("node:os");
|
|
5
|
-
const external_node_path_namespaceObject = require("node:path");
|
|
6
|
-
const external_vitest_namespaceObject = require("vitest");
|
|
7
|
-
const config_cjs_namespaceObject = require("../bench/config.cjs");
|
|
8
|
-
(0, external_vitest_namespaceObject.describe)("terminal bench config", ()=>{
|
|
9
|
-
const workdirs = [];
|
|
10
|
-
(0, external_vitest_namespaceObject.afterEach)(()=>{
|
|
11
|
-
for (const workdir of workdirs)(0, external_node_fs_namespaceObject.rmSync)(workdir, {
|
|
12
|
-
recursive: true,
|
|
13
|
-
force: true
|
|
14
|
-
});
|
|
15
|
-
workdirs.length = 0;
|
|
16
|
-
});
|
|
17
|
-
(0, external_vitest_namespaceObject.it)("loads config/tasks with defaults and resolves relative paths", async ()=>{
|
|
18
|
-
const workdir = (0, external_node_fs_namespaceObject.mkdtempSync)((0, external_node_path_namespaceObject.join)((0, external_node_os_namespaceObject.tmpdir)(), "wingman-bench-config-"));
|
|
19
|
-
workdirs.push(workdir);
|
|
20
|
-
const taskFilePath = (0, external_node_path_namespaceObject.join)(workdir, "tasks.json");
|
|
21
|
-
(0, external_node_fs_namespaceObject.writeFileSync)(taskFilePath, JSON.stringify({
|
|
22
|
-
tasks: [
|
|
23
|
-
{
|
|
24
|
-
id: "t1",
|
|
25
|
-
prompt: "hello",
|
|
26
|
-
validator: {
|
|
27
|
-
type: "assistant_contains",
|
|
28
|
-
includes: "ok"
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
]
|
|
32
|
-
}, null, 2));
|
|
33
|
-
const configPath = (0, external_node_path_namespaceObject.join)(workdir, "config.json");
|
|
34
|
-
(0, external_node_fs_namespaceObject.writeFileSync)(configPath, JSON.stringify({
|
|
35
|
-
taskFile: "./tasks.json",
|
|
36
|
-
adapter: {
|
|
37
|
-
type: "command",
|
|
38
|
-
command: {
|
|
39
|
-
command: "echo",
|
|
40
|
-
args: [
|
|
41
|
-
"ok"
|
|
42
|
-
]
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
}, null, 2));
|
|
46
|
-
const config = await (0, config_cjs_namespaceObject.loadTerminalBenchConfig)(configPath);
|
|
47
|
-
(0, external_vitest_namespaceObject.expect)(config.version).toBe(1);
|
|
48
|
-
(0, external_vitest_namespaceObject.expect)(config.taskFilePath).toBe(taskFilePath);
|
|
49
|
-
(0, external_vitest_namespaceObject.expect)(config.resultsDir).toBe((0, external_node_path_namespaceObject.join)(workdir, "bench/results"));
|
|
50
|
-
(0, external_vitest_namespaceObject.expect)(config.run.defaultTimeoutMs).toBe(300000);
|
|
51
|
-
(0, external_vitest_namespaceObject.expect)(config.tasks).toHaveLength(1);
|
|
52
|
-
(0, external_vitest_namespaceObject.expect)(config.tasks[0].validator.type).toBe("assistant_contains");
|
|
53
|
-
if ("assistant_contains" !== config.tasks[0].validator.type) throw new Error("Unexpected validator type");
|
|
54
|
-
(0, external_vitest_namespaceObject.expect)(config.tasks[0].validator.includes).toEqual([
|
|
55
|
-
"ok"
|
|
56
|
-
]);
|
|
57
|
-
});
|
|
58
|
-
});
|
|
59
|
-
for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
60
|
-
Object.defineProperty(exports, '__esModule', {
|
|
61
|
-
value: true
|
|
62
|
-
});
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|
2
|
-
import { tmpdir } from "node:os";
|
|
3
|
-
import { join } from "node:path";
|
|
4
|
-
import { afterEach, describe, expect, it } from "vitest";
|
|
5
|
-
import { loadTerminalBenchConfig } from "../bench/config.js";
|
|
6
|
-
describe("terminal bench config", ()=>{
|
|
7
|
-
const workdirs = [];
|
|
8
|
-
afterEach(()=>{
|
|
9
|
-
for (const workdir of workdirs)rmSync(workdir, {
|
|
10
|
-
recursive: true,
|
|
11
|
-
force: true
|
|
12
|
-
});
|
|
13
|
-
workdirs.length = 0;
|
|
14
|
-
});
|
|
15
|
-
it("loads config/tasks with defaults and resolves relative paths", async ()=>{
|
|
16
|
-
const workdir = mkdtempSync(join(tmpdir(), "wingman-bench-config-"));
|
|
17
|
-
workdirs.push(workdir);
|
|
18
|
-
const taskFilePath = join(workdir, "tasks.json");
|
|
19
|
-
writeFileSync(taskFilePath, JSON.stringify({
|
|
20
|
-
tasks: [
|
|
21
|
-
{
|
|
22
|
-
id: "t1",
|
|
23
|
-
prompt: "hello",
|
|
24
|
-
validator: {
|
|
25
|
-
type: "assistant_contains",
|
|
26
|
-
includes: "ok"
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}, null, 2));
|
|
31
|
-
const configPath = join(workdir, "config.json");
|
|
32
|
-
writeFileSync(configPath, JSON.stringify({
|
|
33
|
-
taskFile: "./tasks.json",
|
|
34
|
-
adapter: {
|
|
35
|
-
type: "command",
|
|
36
|
-
command: {
|
|
37
|
-
command: "echo",
|
|
38
|
-
args: [
|
|
39
|
-
"ok"
|
|
40
|
-
]
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
}, null, 2));
|
|
44
|
-
const config = await loadTerminalBenchConfig(configPath);
|
|
45
|
-
expect(config.version).toBe(1);
|
|
46
|
-
expect(config.taskFilePath).toBe(taskFilePath);
|
|
47
|
-
expect(config.resultsDir).toBe(join(workdir, "bench/results"));
|
|
48
|
-
expect(config.run.defaultTimeoutMs).toBe(300000);
|
|
49
|
-
expect(config.tasks).toHaveLength(1);
|
|
50
|
-
expect(config.tasks[0].validator.type).toBe("assistant_contains");
|
|
51
|
-
if ("assistant_contains" !== config.tasks[0].validator.type) throw new Error("Unexpected validator type");
|
|
52
|
-
expect(config.tasks[0].validator.includes).toEqual([
|
|
53
|
-
"ok"
|
|
54
|
-
]);
|
|
55
|
-
});
|
|
56
|
-
});
|
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __webpack_exports__ = {};
|
|
3
|
-
const external_vitest_namespaceObject = require("vitest");
|
|
4
|
-
const official_cjs_namespaceObject = require("../bench/official.cjs");
|
|
5
|
-
(0, external_vitest_namespaceObject.describe)("terminal bench official runner (harbor tb2)", ()=>{
|
|
6
|
-
(0, external_vitest_namespaceObject.it)("builds harbor args with overrides", ()=>{
|
|
7
|
-
const args = (0, official_cjs_namespaceObject.buildHarborRunArgs)({
|
|
8
|
-
dataset: "terminal-bench@2.0",
|
|
9
|
-
taskNames: [
|
|
10
|
-
"a",
|
|
11
|
-
"b"
|
|
12
|
-
],
|
|
13
|
-
agent: "oracle",
|
|
14
|
-
nConcurrent: 1,
|
|
15
|
-
nAttempts: 1
|
|
16
|
-
}, {
|
|
17
|
-
taskNames: [
|
|
18
|
-
"single"
|
|
19
|
-
],
|
|
20
|
-
agent: "codex",
|
|
21
|
-
nConcurrent: 2,
|
|
22
|
-
nAttempts: 3,
|
|
23
|
-
nTasks: 2,
|
|
24
|
-
model: "openai/gpt-4.1-mini",
|
|
25
|
-
agentKwargs: {
|
|
26
|
-
foo: "bar"
|
|
27
|
-
}
|
|
28
|
-
});
|
|
29
|
-
(0, external_vitest_namespaceObject.expect)(args).toEqual([
|
|
30
|
-
"run",
|
|
31
|
-
"--dataset",
|
|
32
|
-
"terminal-bench@2.0",
|
|
33
|
-
"--agent",
|
|
34
|
-
"codex",
|
|
35
|
-
"--model",
|
|
36
|
-
"openai/gpt-4.1-mini",
|
|
37
|
-
"--n-concurrent",
|
|
38
|
-
"2",
|
|
39
|
-
"--n-attempts",
|
|
40
|
-
"3",
|
|
41
|
-
"--n-tasks",
|
|
42
|
-
"2",
|
|
43
|
-
"--agent-kwarg",
|
|
44
|
-
"foo=bar",
|
|
45
|
-
"--task-name",
|
|
46
|
-
"single"
|
|
47
|
-
]);
|
|
48
|
-
});
|
|
49
|
-
(0, external_vitest_namespaceObject.it)("builds harbor args with explicit registry url", ()=>{
|
|
50
|
-
const args = (0, official_cjs_namespaceObject.buildHarborRunArgs)({
|
|
51
|
-
dataset: "terminal-bench@2.0",
|
|
52
|
-
registryUrl: "https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
|
|
53
|
-
agent: "oracle"
|
|
54
|
-
}, {});
|
|
55
|
-
(0, external_vitest_namespaceObject.expect)(args).toEqual([
|
|
56
|
-
"run",
|
|
57
|
-
"--dataset",
|
|
58
|
-
"terminal-bench@2.0",
|
|
59
|
-
"--registry-url",
|
|
60
|
-
"https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
|
|
61
|
-
"--agent",
|
|
62
|
-
"oracle"
|
|
63
|
-
]);
|
|
64
|
-
});
|
|
65
|
-
(0, external_vitest_namespaceObject.it)("builds harbor args without task names when running all dataset tasks", ()=>{
|
|
66
|
-
const args = (0, official_cjs_namespaceObject.buildHarborRunArgs)({
|
|
67
|
-
dataset: "terminal-bench@2.0",
|
|
68
|
-
agent: "oracle",
|
|
69
|
-
nConcurrent: 1
|
|
70
|
-
}, {
|
|
71
|
-
taskNames: []
|
|
72
|
-
});
|
|
73
|
-
(0, external_vitest_namespaceObject.expect)(args).toEqual([
|
|
74
|
-
"run",
|
|
75
|
-
"--dataset",
|
|
76
|
-
"terminal-bench@2.0",
|
|
77
|
-
"--agent",
|
|
78
|
-
"oracle",
|
|
79
|
-
"--n-concurrent",
|
|
80
|
-
"1"
|
|
81
|
-
]);
|
|
82
|
-
});
|
|
83
|
-
(0, external_vitest_namespaceObject.it)("builds harbor args with custom import-path agent", ()=>{
|
|
84
|
-
const args = (0, official_cjs_namespaceObject.buildHarborRunArgs)({
|
|
85
|
-
dataset: "terminal-bench@2.0",
|
|
86
|
-
taskNames: [
|
|
87
|
-
"hello-world"
|
|
88
|
-
],
|
|
89
|
-
agent: "oracle",
|
|
90
|
-
agentImportPath: "my_pkg.my_agent:MyAgent",
|
|
91
|
-
agentKwargs: {
|
|
92
|
-
wingman_agent: "coding",
|
|
93
|
-
model_name: "should-not-pass"
|
|
94
|
-
},
|
|
95
|
-
nConcurrent: 1
|
|
96
|
-
}, {
|
|
97
|
-
agentKwargs: {
|
|
98
|
-
wingman_cli_path: "./bin/wingman"
|
|
99
|
-
}
|
|
100
|
-
});
|
|
101
|
-
(0, external_vitest_namespaceObject.expect)(args).toEqual([
|
|
102
|
-
"run",
|
|
103
|
-
"--dataset",
|
|
104
|
-
"terminal-bench@2.0",
|
|
105
|
-
"--agent-import-path",
|
|
106
|
-
"my_pkg.my_agent:MyAgent",
|
|
107
|
-
"--n-concurrent",
|
|
108
|
-
"1",
|
|
109
|
-
"--agent-kwarg",
|
|
110
|
-
"wingman_agent=coding",
|
|
111
|
-
"--agent-kwarg",
|
|
112
|
-
"wingman_cli_path=./bin/wingman",
|
|
113
|
-
"--task-name",
|
|
114
|
-
"hello-world"
|
|
115
|
-
]);
|
|
116
|
-
});
|
|
117
|
-
(0, external_vitest_namespaceObject.it)("parses resolved/unresolved/accuracy and pass@k", ()=>{
|
|
118
|
-
const parsed = (0, official_cjs_namespaceObject.parseHarborRunOutput)(`
|
|
119
|
-
│ Resolved Trials │ 1 │
|
|
120
|
-
│ Unresolved Trials │ 1 │
|
|
121
|
-
│ Accuracy │ 50.00% │
|
|
122
|
-
│ Pass@1 │ 50.00% │
|
|
123
|
-
Results saved to /tmp/harbor/runs/run-1
|
|
124
|
-
`);
|
|
125
|
-
(0, external_vitest_namespaceObject.expect)(parsed.resolvedTrials).toBe(1);
|
|
126
|
-
(0, external_vitest_namespaceObject.expect)(parsed.unresolvedTrials).toBe(1);
|
|
127
|
-
(0, external_vitest_namespaceObject.expect)(parsed.accuracyPercent).toBe(50);
|
|
128
|
-
(0, external_vitest_namespaceObject.expect)(parsed.passAtK["1"]).toBe(50);
|
|
129
|
-
(0, external_vitest_namespaceObject.expect)(parsed.runOutputPath).toBe("/tmp/harbor/runs/run-1");
|
|
130
|
-
});
|
|
131
|
-
(0, external_vitest_namespaceObject.it)("builds a docker shim script and path for podman fallback", ()=>{
|
|
132
|
-
const script = (0, official_cjs_namespaceObject.createDockerShimScript)("/usr/local/bin/podman");
|
|
133
|
-
(0, external_vitest_namespaceObject.expect)(script).toContain("TARGET_BINARY='/usr/local/bin/podman'");
|
|
134
|
-
(0, external_vitest_namespaceObject.expect)(script).toContain("exec podman-compose");
|
|
135
|
-
(0, external_vitest_namespaceObject.expect)(script).toContain("exec podman cp");
|
|
136
|
-
(0, external_vitest_namespaceObject.expect)(script).toContain("exec podman exec");
|
|
137
|
-
(0, external_vitest_namespaceObject.expect)(script).toContain("label=com.docker.compose.project");
|
|
138
|
-
(0, external_vitest_namespaceObject.expect)(script).toContain("--project-directory");
|
|
139
|
-
(0, external_vitest_namespaceObject.expect)(script.startsWith("#!/bin/bash")).toBe(true);
|
|
140
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.buildRuntimePathEnv)("/tmp/runtime-bin", "/usr/bin")).toBe("/tmp/runtime-bin:/usr/bin");
|
|
141
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.buildPythonPathEnv)("/tmp/repo", "/usr/lib/python")).toBe("/tmp/repo:/usr/lib/python");
|
|
142
|
-
});
|
|
143
|
-
(0, external_vitest_namespaceObject.it)("extracts a concise harbor error message", ()=>{
|
|
144
|
-
const message = (0, official_cjs_namespaceObject.extractHarborErrorMessage)(`
|
|
145
|
-
Traceback...
|
|
146
|
-
ValueError: No tasks found matching pattern: jq-data-processing
|
|
147
|
-
`);
|
|
148
|
-
(0, external_vitest_namespaceObject.expect)(message).toBe("ValueError: No tasks found matching pattern: jq-data-processing");
|
|
149
|
-
});
|
|
150
|
-
(0, external_vitest_namespaceObject.it)("extracts a specific dataset resolution error over generic fallback", ()=>{
|
|
151
|
-
const message = (0, official_cjs_namespaceObject.extractHarborErrorMessage)(`
|
|
152
|
-
Traceback...
|
|
153
|
-
ValueError: Error getting dataset terminal-bench@2.0
|
|
154
|
-
ValueError: Either datasets or tasks must be provided.
|
|
155
|
-
`);
|
|
156
|
-
(0, external_vitest_namespaceObject.expect)(message).toBe("ValueError: Error getting dataset terminal-bench@2.0");
|
|
157
|
-
});
|
|
158
|
-
(0, external_vitest_namespaceObject.it)("rewrites generic empty-task selection error", ()=>{
|
|
159
|
-
const message = (0, official_cjs_namespaceObject.normalizeHarborFailureMessage)({
|
|
160
|
-
rawMessage: "ValueError: Either datasets or tasks must be provided.",
|
|
161
|
-
args: [
|
|
162
|
-
"run",
|
|
163
|
-
"--dataset",
|
|
164
|
-
"terminal-bench@2.0",
|
|
165
|
-
"--task-name",
|
|
166
|
-
"heterogeneous-dates"
|
|
167
|
-
],
|
|
168
|
-
dataset: "terminal-bench@2.0"
|
|
169
|
-
});
|
|
170
|
-
(0, external_vitest_namespaceObject.expect)(message).toBe('No tasks matched "heterogeneous-dates" in dataset "terminal-bench@2.0". Verify task ids for Terminal-Bench 2.0.');
|
|
171
|
-
});
|
|
172
|
-
(0, external_vitest_namespaceObject.it)("normalizes podman docker host candidates", ()=>{
|
|
173
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.parseDockerHostCandidate)("unix:///tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
|
|
174
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.parseDockerHostCandidate)("/tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
|
|
175
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.parseDockerHostCandidate)("'unix:///tmp/podman.sock'")).toBe("unix:///tmp/podman.sock");
|
|
176
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.parseDockerHostCandidate)("<nil>")).toBeUndefined();
|
|
177
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.parseDockerHostCandidate)(void 0)).toBeUndefined();
|
|
178
|
-
});
|
|
179
|
-
(0, external_vitest_namespaceObject.it)("detects missing compose provider errors", ()=>{
|
|
180
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.isMissingComposeProviderError)(`
|
|
181
|
-
Error: looking up compose provider failed
|
|
182
|
-
* exec: "podman-compose": executable file not found in $PATH
|
|
183
|
-
`)).toBe(true);
|
|
184
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.isMissingComposeProviderError)("some other error")).toBe(false);
|
|
185
|
-
});
|
|
186
|
-
(0, external_vitest_namespaceObject.it)("detects podman-backed docker version output", ()=>{
|
|
187
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.isPodmanBackedDockerVersionOutput)("Emulate Docker CLI using podman")).toBe(true);
|
|
188
|
-
(0, external_vitest_namespaceObject.expect)((0, official_cjs_namespaceObject.isPodmanBackedDockerVersionOutput)("Docker version 27.0.0")).toBe(false);
|
|
189
|
-
});
|
|
190
|
-
});
|
|
191
|
-
for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
192
|
-
Object.defineProperty(exports, '__esModule', {
|
|
193
|
-
value: true
|
|
194
|
-
});
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
|
@@ -1,188 +0,0 @@
|
|
|
1
|
-
import { describe, expect, it } from "vitest";
|
|
2
|
-
import { buildHarborRunArgs, buildPythonPathEnv, buildRuntimePathEnv, createDockerShimScript, extractHarborErrorMessage, isMissingComposeProviderError, isPodmanBackedDockerVersionOutput, normalizeHarborFailureMessage, parseDockerHostCandidate, parseHarborRunOutput } from "../bench/official.js";
|
|
3
|
-
describe("terminal bench official runner (harbor tb2)", ()=>{
|
|
4
|
-
it("builds harbor args with overrides", ()=>{
|
|
5
|
-
const args = buildHarborRunArgs({
|
|
6
|
-
dataset: "terminal-bench@2.0",
|
|
7
|
-
taskNames: [
|
|
8
|
-
"a",
|
|
9
|
-
"b"
|
|
10
|
-
],
|
|
11
|
-
agent: "oracle",
|
|
12
|
-
nConcurrent: 1,
|
|
13
|
-
nAttempts: 1
|
|
14
|
-
}, {
|
|
15
|
-
taskNames: [
|
|
16
|
-
"single"
|
|
17
|
-
],
|
|
18
|
-
agent: "codex",
|
|
19
|
-
nConcurrent: 2,
|
|
20
|
-
nAttempts: 3,
|
|
21
|
-
nTasks: 2,
|
|
22
|
-
model: "openai/gpt-4.1-mini",
|
|
23
|
-
agentKwargs: {
|
|
24
|
-
foo: "bar"
|
|
25
|
-
}
|
|
26
|
-
});
|
|
27
|
-
expect(args).toEqual([
|
|
28
|
-
"run",
|
|
29
|
-
"--dataset",
|
|
30
|
-
"terminal-bench@2.0",
|
|
31
|
-
"--agent",
|
|
32
|
-
"codex",
|
|
33
|
-
"--model",
|
|
34
|
-
"openai/gpt-4.1-mini",
|
|
35
|
-
"--n-concurrent",
|
|
36
|
-
"2",
|
|
37
|
-
"--n-attempts",
|
|
38
|
-
"3",
|
|
39
|
-
"--n-tasks",
|
|
40
|
-
"2",
|
|
41
|
-
"--agent-kwarg",
|
|
42
|
-
"foo=bar",
|
|
43
|
-
"--task-name",
|
|
44
|
-
"single"
|
|
45
|
-
]);
|
|
46
|
-
});
|
|
47
|
-
it("builds harbor args with explicit registry url", ()=>{
|
|
48
|
-
const args = buildHarborRunArgs({
|
|
49
|
-
dataset: "terminal-bench@2.0",
|
|
50
|
-
registryUrl: "https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
|
|
51
|
-
agent: "oracle"
|
|
52
|
-
}, {});
|
|
53
|
-
expect(args).toEqual([
|
|
54
|
-
"run",
|
|
55
|
-
"--dataset",
|
|
56
|
-
"terminal-bench@2.0",
|
|
57
|
-
"--registry-url",
|
|
58
|
-
"https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
|
|
59
|
-
"--agent",
|
|
60
|
-
"oracle"
|
|
61
|
-
]);
|
|
62
|
-
});
|
|
63
|
-
it("builds harbor args without task names when running all dataset tasks", ()=>{
|
|
64
|
-
const args = buildHarborRunArgs({
|
|
65
|
-
dataset: "terminal-bench@2.0",
|
|
66
|
-
agent: "oracle",
|
|
67
|
-
nConcurrent: 1
|
|
68
|
-
}, {
|
|
69
|
-
taskNames: []
|
|
70
|
-
});
|
|
71
|
-
expect(args).toEqual([
|
|
72
|
-
"run",
|
|
73
|
-
"--dataset",
|
|
74
|
-
"terminal-bench@2.0",
|
|
75
|
-
"--agent",
|
|
76
|
-
"oracle",
|
|
77
|
-
"--n-concurrent",
|
|
78
|
-
"1"
|
|
79
|
-
]);
|
|
80
|
-
});
|
|
81
|
-
it("builds harbor args with custom import-path agent", ()=>{
|
|
82
|
-
const args = buildHarborRunArgs({
|
|
83
|
-
dataset: "terminal-bench@2.0",
|
|
84
|
-
taskNames: [
|
|
85
|
-
"hello-world"
|
|
86
|
-
],
|
|
87
|
-
agent: "oracle",
|
|
88
|
-
agentImportPath: "my_pkg.my_agent:MyAgent",
|
|
89
|
-
agentKwargs: {
|
|
90
|
-
wingman_agent: "coding",
|
|
91
|
-
model_name: "should-not-pass"
|
|
92
|
-
},
|
|
93
|
-
nConcurrent: 1
|
|
94
|
-
}, {
|
|
95
|
-
agentKwargs: {
|
|
96
|
-
wingman_cli_path: "./bin/wingman"
|
|
97
|
-
}
|
|
98
|
-
});
|
|
99
|
-
expect(args).toEqual([
|
|
100
|
-
"run",
|
|
101
|
-
"--dataset",
|
|
102
|
-
"terminal-bench@2.0",
|
|
103
|
-
"--agent-import-path",
|
|
104
|
-
"my_pkg.my_agent:MyAgent",
|
|
105
|
-
"--n-concurrent",
|
|
106
|
-
"1",
|
|
107
|
-
"--agent-kwarg",
|
|
108
|
-
"wingman_agent=coding",
|
|
109
|
-
"--agent-kwarg",
|
|
110
|
-
"wingman_cli_path=./bin/wingman",
|
|
111
|
-
"--task-name",
|
|
112
|
-
"hello-world"
|
|
113
|
-
]);
|
|
114
|
-
});
|
|
115
|
-
it("parses resolved/unresolved/accuracy and pass@k", ()=>{
|
|
116
|
-
const parsed = parseHarborRunOutput(`
|
|
117
|
-
│ Resolved Trials │ 1 │
|
|
118
|
-
│ Unresolved Trials │ 1 │
|
|
119
|
-
│ Accuracy │ 50.00% │
|
|
120
|
-
│ Pass@1 │ 50.00% │
|
|
121
|
-
Results saved to /tmp/harbor/runs/run-1
|
|
122
|
-
`);
|
|
123
|
-
expect(parsed.resolvedTrials).toBe(1);
|
|
124
|
-
expect(parsed.unresolvedTrials).toBe(1);
|
|
125
|
-
expect(parsed.accuracyPercent).toBe(50);
|
|
126
|
-
expect(parsed.passAtK["1"]).toBe(50);
|
|
127
|
-
expect(parsed.runOutputPath).toBe("/tmp/harbor/runs/run-1");
|
|
128
|
-
});
|
|
129
|
-
it("builds a docker shim script and path for podman fallback", ()=>{
|
|
130
|
-
const script = createDockerShimScript("/usr/local/bin/podman");
|
|
131
|
-
expect(script).toContain("TARGET_BINARY='/usr/local/bin/podman'");
|
|
132
|
-
expect(script).toContain("exec podman-compose");
|
|
133
|
-
expect(script).toContain("exec podman cp");
|
|
134
|
-
expect(script).toContain("exec podman exec");
|
|
135
|
-
expect(script).toContain("label=com.docker.compose.project");
|
|
136
|
-
expect(script).toContain("--project-directory");
|
|
137
|
-
expect(script.startsWith("#!/bin/bash")).toBe(true);
|
|
138
|
-
expect(buildRuntimePathEnv("/tmp/runtime-bin", "/usr/bin")).toBe("/tmp/runtime-bin:/usr/bin");
|
|
139
|
-
expect(buildPythonPathEnv("/tmp/repo", "/usr/lib/python")).toBe("/tmp/repo:/usr/lib/python");
|
|
140
|
-
});
|
|
141
|
-
it("extracts a concise harbor error message", ()=>{
|
|
142
|
-
const message = extractHarborErrorMessage(`
|
|
143
|
-
Traceback...
|
|
144
|
-
ValueError: No tasks found matching pattern: jq-data-processing
|
|
145
|
-
`);
|
|
146
|
-
expect(message).toBe("ValueError: No tasks found matching pattern: jq-data-processing");
|
|
147
|
-
});
|
|
148
|
-
it("extracts a specific dataset resolution error over generic fallback", ()=>{
|
|
149
|
-
const message = extractHarborErrorMessage(`
|
|
150
|
-
Traceback...
|
|
151
|
-
ValueError: Error getting dataset terminal-bench@2.0
|
|
152
|
-
ValueError: Either datasets or tasks must be provided.
|
|
153
|
-
`);
|
|
154
|
-
expect(message).toBe("ValueError: Error getting dataset terminal-bench@2.0");
|
|
155
|
-
});
|
|
156
|
-
it("rewrites generic empty-task selection error", ()=>{
|
|
157
|
-
const message = normalizeHarborFailureMessage({
|
|
158
|
-
rawMessage: "ValueError: Either datasets or tasks must be provided.",
|
|
159
|
-
args: [
|
|
160
|
-
"run",
|
|
161
|
-
"--dataset",
|
|
162
|
-
"terminal-bench@2.0",
|
|
163
|
-
"--task-name",
|
|
164
|
-
"heterogeneous-dates"
|
|
165
|
-
],
|
|
166
|
-
dataset: "terminal-bench@2.0"
|
|
167
|
-
});
|
|
168
|
-
expect(message).toBe('No tasks matched "heterogeneous-dates" in dataset "terminal-bench@2.0". Verify task ids for Terminal-Bench 2.0.');
|
|
169
|
-
});
|
|
170
|
-
it("normalizes podman docker host candidates", ()=>{
|
|
171
|
-
expect(parseDockerHostCandidate("unix:///tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
|
|
172
|
-
expect(parseDockerHostCandidate("/tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
|
|
173
|
-
expect(parseDockerHostCandidate("'unix:///tmp/podman.sock'")).toBe("unix:///tmp/podman.sock");
|
|
174
|
-
expect(parseDockerHostCandidate("<nil>")).toBeUndefined();
|
|
175
|
-
expect(parseDockerHostCandidate(void 0)).toBeUndefined();
|
|
176
|
-
});
|
|
177
|
-
it("detects missing compose provider errors", ()=>{
|
|
178
|
-
expect(isMissingComposeProviderError(`
|
|
179
|
-
Error: looking up compose provider failed
|
|
180
|
-
* exec: "podman-compose": executable file not found in $PATH
|
|
181
|
-
`)).toBe(true);
|
|
182
|
-
expect(isMissingComposeProviderError("some other error")).toBe(false);
|
|
183
|
-
});
|
|
184
|
-
it("detects podman-backed docker version output", ()=>{
|
|
185
|
-
expect(isPodmanBackedDockerVersionOutput("Emulate Docker CLI using podman")).toBe(true);
|
|
186
|
-
expect(isPodmanBackedDockerVersionOutput("Docker version 27.0.0")).toBe(false);
|
|
187
|
-
});
|
|
188
|
-
});
|