@wingman-ai/gateway 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -0
- package/dist/agent/config/mcpClientManager.cjs +104 -1
- package/dist/agent/config/mcpClientManager.d.ts +30 -0
- package/dist/agent/config/mcpClientManager.js +104 -1
- package/dist/agent/config/modelFactory.cjs +10 -0
- package/dist/agent/config/modelFactory.js +10 -0
- package/dist/agent/config/xaiImageModel.cjs +242 -0
- package/dist/agent/config/xaiImageModel.d.ts +33 -0
- package/dist/agent/config/xaiImageModel.js +202 -0
- package/dist/agent/tests/mcpClientManager.test.cjs +116 -0
- package/dist/agent/tests/mcpClientManager.test.js +117 -1
- package/dist/agent/tests/mcpResourceTools.test.cjs +101 -0
- package/dist/agent/tests/mcpResourceTools.test.d.ts +1 -0
- package/dist/agent/tests/mcpResourceTools.test.js +95 -0
- package/dist/agent/tests/modelFactory.test.cjs +16 -2
- package/dist/agent/tests/modelFactory.test.js +16 -2
- package/dist/agent/tests/xaiImageModel.test.cjs +194 -0
- package/dist/agent/tests/xaiImageModel.test.d.ts +1 -0
- package/dist/agent/tests/xaiImageModel.test.js +188 -0
- package/dist/agent/tools/mcp_resources.cjs +111 -0
- package/dist/agent/tools/mcp_resources.d.ts +3 -0
- package/dist/agent/tools/mcp_resources.js +77 -0
- package/dist/bench/adapters/commandAdapter.cjs +93 -0
- package/dist/bench/adapters/commandAdapter.d.ts +6 -0
- package/dist/bench/adapters/commandAdapter.js +59 -0
- package/dist/bench/adapters/helpers.cjs +170 -0
- package/dist/bench/adapters/helpers.d.ts +7 -0
- package/dist/bench/adapters/helpers.js +133 -0
- package/dist/bench/adapters/index.cjs +41 -0
- package/dist/bench/adapters/index.d.ts +2 -0
- package/dist/bench/adapters/index.js +7 -0
- package/dist/bench/adapters/wingmanCliAdapter.cjs +100 -0
- package/dist/bench/adapters/wingmanCliAdapter.d.ts +6 -0
- package/dist/bench/adapters/wingmanCliAdapter.js +66 -0
- package/dist/bench/cleanup.cjs +122 -0
- package/dist/bench/cleanup.d.ts +9 -0
- package/dist/bench/cleanup.js +85 -0
- package/dist/bench/config.cjs +190 -0
- package/dist/bench/config.d.ts +2 -0
- package/dist/bench/config.js +156 -0
- package/dist/bench/index.cjs +43 -0
- package/dist/bench/index.d.ts +3 -0
- package/dist/bench/index.js +3 -0
- package/dist/bench/official.cjs +616 -0
- package/dist/bench/official.d.ts +80 -0
- package/dist/bench/official.js +546 -0
- package/dist/bench/officialCli.cjs +204 -0
- package/dist/bench/officialCli.d.ts +5 -0
- package/dist/bench/officialCli.js +170 -0
- package/dist/bench/process.cjs +78 -0
- package/dist/bench/process.d.ts +14 -0
- package/dist/bench/process.js +44 -0
- package/dist/bench/runner.cjs +237 -0
- package/dist/bench/runner.d.ts +7 -0
- package/dist/bench/runner.js +197 -0
- package/dist/bench/scoring.cjs +171 -0
- package/dist/bench/scoring.d.ts +9 -0
- package/dist/bench/scoring.js +137 -0
- package/dist/bench/types.cjs +18 -0
- package/dist/bench/types.d.ts +200 -0
- package/dist/bench/types.js +0 -0
- package/dist/bench/validator.cjs +92 -0
- package/dist/bench/validator.d.ts +2 -0
- package/dist/bench/validator.js +58 -0
- package/dist/cli/config/schema.cjs +36 -1
- package/dist/cli/config/schema.d.ts +46 -0
- package/dist/cli/config/schema.js +36 -1
- package/dist/cli/config/warnings.cjs +119 -51
- package/dist/cli/config/warnings.js +119 -51
- package/dist/cli/core/agentInvoker.cjs +9 -2
- package/dist/cli/core/agentInvoker.d.ts +1 -0
- package/dist/cli/core/agentInvoker.js +9 -2
- package/dist/cli/core/imagePersistence.cjs +17 -1
- package/dist/cli/core/imagePersistence.d.ts +2 -0
- package/dist/cli/core/imagePersistence.js +13 -3
- package/dist/cli/core/sessionManager.cjs +2 -0
- package/dist/cli/core/sessionManager.js +3 -1
- package/dist/cli/types.d.ts +18 -0
- package/dist/gateway/adapters/teams.cjs +419 -0
- package/dist/gateway/adapters/teams.d.ts +47 -0
- package/dist/gateway/adapters/teams.js +361 -0
- package/dist/gateway/http/sms.cjs +286 -0
- package/dist/gateway/http/sms.d.ts +4 -0
- package/dist/gateway/http/sms.js +249 -0
- package/dist/gateway/server.cjs +54 -3
- package/dist/gateway/server.d.ts +2 -0
- package/dist/gateway/server.js +54 -3
- package/dist/gateway/sms/commands.cjs +116 -0
- package/dist/gateway/sms/commands.d.ts +15 -0
- package/dist/gateway/sms/commands.js +79 -0
- package/dist/gateway/sms/control.cjs +118 -0
- package/dist/gateway/sms/control.d.ts +18 -0
- package/dist/gateway/sms/control.js +84 -0
- package/dist/gateway/sms/policyStore.cjs +198 -0
- package/dist/gateway/sms/policyStore.d.ts +37 -0
- package/dist/gateway/sms/policyStore.js +161 -0
- package/dist/providers/registry.cjs +1 -0
- package/dist/providers/registry.js +1 -0
- package/dist/tests/cli-config-warnings.test.cjs +41 -0
- package/dist/tests/cli-config-warnings.test.js +41 -0
- package/dist/tests/cli-init.test.cjs +32 -26
- package/dist/tests/cli-init.test.js +32 -26
- package/dist/tests/gateway-http-security.test.cjs +21 -0
- package/dist/tests/gateway-http-security.test.js +21 -0
- package/dist/tests/gateway-origin-policy.test.cjs +22 -0
- package/dist/tests/gateway-origin-policy.test.js +22 -0
- package/dist/tests/gateway.test.cjs +57 -0
- package/dist/tests/gateway.test.js +57 -0
- package/dist/tests/imagePersistence.test.cjs +26 -0
- package/dist/tests/imagePersistence.test.js +27 -1
- package/dist/tests/run-terminal-bench-official-script.test.cjs +61 -0
- package/dist/tests/run-terminal-bench-official-script.test.d.ts +1 -0
- package/dist/tests/run-terminal-bench-official-script.test.js +55 -0
- package/dist/tests/sessions-api.test.cjs +69 -1
- package/dist/tests/sessions-api.test.js +70 -2
- package/dist/tests/sms-api.test.cjs +183 -0
- package/dist/tests/sms-api.test.d.ts +1 -0
- package/dist/tests/sms-api.test.js +177 -0
- package/dist/tests/sms-commands.test.cjs +90 -0
- package/dist/tests/sms-commands.test.d.ts +1 -0
- package/dist/tests/sms-commands.test.js +84 -0
- package/dist/tests/sms-policy-store.test.cjs +69 -0
- package/dist/tests/sms-policy-store.test.d.ts +1 -0
- package/dist/tests/sms-policy-store.test.js +63 -0
- package/dist/tests/teams-adapter.test.cjs +58 -0
- package/dist/tests/teams-adapter.test.d.ts +1 -0
- package/dist/tests/teams-adapter.test.js +52 -0
- package/dist/tests/terminal-bench-adapters-helpers.test.cjs +64 -0
- package/dist/tests/terminal-bench-adapters-helpers.test.d.ts +1 -0
- package/dist/tests/terminal-bench-adapters-helpers.test.js +58 -0
- package/dist/tests/terminal-bench-cleanup.test.cjs +93 -0
- package/dist/tests/terminal-bench-cleanup.test.d.ts +1 -0
- package/dist/tests/terminal-bench-cleanup.test.js +87 -0
- package/dist/tests/terminal-bench-config.test.cjs +62 -0
- package/dist/tests/terminal-bench-config.test.d.ts +1 -0
- package/dist/tests/terminal-bench-config.test.js +56 -0
- package/dist/tests/terminal-bench-official.test.cjs +194 -0
- package/dist/tests/terminal-bench-official.test.d.ts +1 -0
- package/dist/tests/terminal-bench-official.test.js +188 -0
- package/dist/tests/terminal-bench-runner.test.cjs +82 -0
- package/dist/tests/terminal-bench-runner.test.d.ts +1 -0
- package/dist/tests/terminal-bench-runner.test.js +76 -0
- package/dist/tests/terminal-bench-scoring.test.cjs +128 -0
- package/dist/tests/terminal-bench-scoring.test.d.ts +1 -0
- package/dist/tests/terminal-bench-scoring.test.js +122 -0
- package/dist/tools/mcp-fal-ai.cjs +1 -1
- package/dist/tools/mcp-fal-ai.js +1 -1
- package/dist/webui/assets/index-Cyg_Hs57.css +11 -0
- package/dist/webui/assets/{index-BMekSELC.js → index-DZXLLjaA.js} +109 -109
- package/dist/webui/index.html +2 -2
- package/package.json +11 -2
- package/templates/agents/game-dev/agent.md +122 -63
- package/templates/agents/game-dev/art-director.md +106 -0
- package/templates/agents/game-dev/game-designer.md +87 -0
- package/templates/agents/game-dev/scene-engineer.md +474 -0
- package/dist/webui/assets/index-Cwkg4DKj.css +0 -11
- package/templates/agents/game-dev/art-generation.md +0 -38
- package/templates/agents/game-dev/asset-refinement.md +0 -17
- package/templates/agents/game-dev/planning-idea.md +0 -17
- package/templates/agents/game-dev/ui-specialist.md +0 -17
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { buildHarborRunArgs, buildPythonPathEnv, buildRuntimePathEnv, createDockerShimScript, extractHarborErrorMessage, isMissingComposeProviderError, isPodmanBackedDockerVersionOutput, normalizeHarborFailureMessage, parseDockerHostCandidate, parseHarborRunOutput } from "../bench/official.js";
|
|
3
|
+
describe("terminal bench official runner (harbor tb2)", ()=>{
|
|
4
|
+
it("builds harbor args with overrides", ()=>{
|
|
5
|
+
const args = buildHarborRunArgs({
|
|
6
|
+
dataset: "terminal-bench@2.0",
|
|
7
|
+
taskNames: [
|
|
8
|
+
"a",
|
|
9
|
+
"b"
|
|
10
|
+
],
|
|
11
|
+
agent: "oracle",
|
|
12
|
+
nConcurrent: 1,
|
|
13
|
+
nAttempts: 1
|
|
14
|
+
}, {
|
|
15
|
+
taskNames: [
|
|
16
|
+
"single"
|
|
17
|
+
],
|
|
18
|
+
agent: "codex",
|
|
19
|
+
nConcurrent: 2,
|
|
20
|
+
nAttempts: 3,
|
|
21
|
+
nTasks: 2,
|
|
22
|
+
model: "openai/gpt-4.1-mini",
|
|
23
|
+
agentKwargs: {
|
|
24
|
+
foo: "bar"
|
|
25
|
+
}
|
|
26
|
+
});
|
|
27
|
+
expect(args).toEqual([
|
|
28
|
+
"run",
|
|
29
|
+
"--dataset",
|
|
30
|
+
"terminal-bench@2.0",
|
|
31
|
+
"--agent",
|
|
32
|
+
"codex",
|
|
33
|
+
"--model",
|
|
34
|
+
"openai/gpt-4.1-mini",
|
|
35
|
+
"--n-concurrent",
|
|
36
|
+
"2",
|
|
37
|
+
"--n-attempts",
|
|
38
|
+
"3",
|
|
39
|
+
"--n-tasks",
|
|
40
|
+
"2",
|
|
41
|
+
"--agent-kwarg",
|
|
42
|
+
"foo=bar",
|
|
43
|
+
"--task-name",
|
|
44
|
+
"single"
|
|
45
|
+
]);
|
|
46
|
+
});
|
|
47
|
+
it("builds harbor args with explicit registry url", ()=>{
|
|
48
|
+
const args = buildHarborRunArgs({
|
|
49
|
+
dataset: "terminal-bench@2.0",
|
|
50
|
+
registryUrl: "https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
|
|
51
|
+
agent: "oracle"
|
|
52
|
+
}, {});
|
|
53
|
+
expect(args).toEqual([
|
|
54
|
+
"run",
|
|
55
|
+
"--dataset",
|
|
56
|
+
"terminal-bench@2.0",
|
|
57
|
+
"--registry-url",
|
|
58
|
+
"https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
|
|
59
|
+
"--agent",
|
|
60
|
+
"oracle"
|
|
61
|
+
]);
|
|
62
|
+
});
|
|
63
|
+
it("builds harbor args without task names when running all dataset tasks", ()=>{
|
|
64
|
+
const args = buildHarborRunArgs({
|
|
65
|
+
dataset: "terminal-bench@2.0",
|
|
66
|
+
agent: "oracle",
|
|
67
|
+
nConcurrent: 1
|
|
68
|
+
}, {
|
|
69
|
+
taskNames: []
|
|
70
|
+
});
|
|
71
|
+
expect(args).toEqual([
|
|
72
|
+
"run",
|
|
73
|
+
"--dataset",
|
|
74
|
+
"terminal-bench@2.0",
|
|
75
|
+
"--agent",
|
|
76
|
+
"oracle",
|
|
77
|
+
"--n-concurrent",
|
|
78
|
+
"1"
|
|
79
|
+
]);
|
|
80
|
+
});
|
|
81
|
+
it("builds harbor args with custom import-path agent", ()=>{
|
|
82
|
+
const args = buildHarborRunArgs({
|
|
83
|
+
dataset: "terminal-bench@2.0",
|
|
84
|
+
taskNames: [
|
|
85
|
+
"hello-world"
|
|
86
|
+
],
|
|
87
|
+
agent: "oracle",
|
|
88
|
+
agentImportPath: "my_pkg.my_agent:MyAgent",
|
|
89
|
+
agentKwargs: {
|
|
90
|
+
wingman_agent: "coding",
|
|
91
|
+
model_name: "should-not-pass"
|
|
92
|
+
},
|
|
93
|
+
nConcurrent: 1
|
|
94
|
+
}, {
|
|
95
|
+
agentKwargs: {
|
|
96
|
+
wingman_cli_path: "./bin/wingman"
|
|
97
|
+
}
|
|
98
|
+
});
|
|
99
|
+
expect(args).toEqual([
|
|
100
|
+
"run",
|
|
101
|
+
"--dataset",
|
|
102
|
+
"terminal-bench@2.0",
|
|
103
|
+
"--agent-import-path",
|
|
104
|
+
"my_pkg.my_agent:MyAgent",
|
|
105
|
+
"--n-concurrent",
|
|
106
|
+
"1",
|
|
107
|
+
"--agent-kwarg",
|
|
108
|
+
"wingman_agent=coding",
|
|
109
|
+
"--agent-kwarg",
|
|
110
|
+
"wingman_cli_path=./bin/wingman",
|
|
111
|
+
"--task-name",
|
|
112
|
+
"hello-world"
|
|
113
|
+
]);
|
|
114
|
+
});
|
|
115
|
+
it("parses resolved/unresolved/accuracy and pass@k", ()=>{
|
|
116
|
+
const parsed = parseHarborRunOutput(`
|
|
117
|
+
│ Resolved Trials │ 1 │
|
|
118
|
+
│ Unresolved Trials │ 1 │
|
|
119
|
+
│ Accuracy │ 50.00% │
|
|
120
|
+
│ Pass@1 │ 50.00% │
|
|
121
|
+
Results saved to /tmp/harbor/runs/run-1
|
|
122
|
+
`);
|
|
123
|
+
expect(parsed.resolvedTrials).toBe(1);
|
|
124
|
+
expect(parsed.unresolvedTrials).toBe(1);
|
|
125
|
+
expect(parsed.accuracyPercent).toBe(50);
|
|
126
|
+
expect(parsed.passAtK["1"]).toBe(50);
|
|
127
|
+
expect(parsed.runOutputPath).toBe("/tmp/harbor/runs/run-1");
|
|
128
|
+
});
|
|
129
|
+
it("builds a docker shim script and path for podman fallback", ()=>{
|
|
130
|
+
const script = createDockerShimScript("/usr/local/bin/podman");
|
|
131
|
+
expect(script).toContain("TARGET_BINARY='/usr/local/bin/podman'");
|
|
132
|
+
expect(script).toContain("exec podman-compose");
|
|
133
|
+
expect(script).toContain("exec podman cp");
|
|
134
|
+
expect(script).toContain("exec podman exec");
|
|
135
|
+
expect(script).toContain("label=com.docker.compose.project");
|
|
136
|
+
expect(script).toContain("--project-directory");
|
|
137
|
+
expect(script.startsWith("#!/bin/bash")).toBe(true);
|
|
138
|
+
expect(buildRuntimePathEnv("/tmp/runtime-bin", "/usr/bin")).toBe("/tmp/runtime-bin:/usr/bin");
|
|
139
|
+
expect(buildPythonPathEnv("/tmp/repo", "/usr/lib/python")).toBe("/tmp/repo:/usr/lib/python");
|
|
140
|
+
});
|
|
141
|
+
it("extracts a concise harbor error message", ()=>{
|
|
142
|
+
const message = extractHarborErrorMessage(`
|
|
143
|
+
Traceback...
|
|
144
|
+
ValueError: No tasks found matching pattern: jq-data-processing
|
|
145
|
+
`);
|
|
146
|
+
expect(message).toBe("ValueError: No tasks found matching pattern: jq-data-processing");
|
|
147
|
+
});
|
|
148
|
+
it("extracts a specific dataset resolution error over generic fallback", ()=>{
|
|
149
|
+
const message = extractHarborErrorMessage(`
|
|
150
|
+
Traceback...
|
|
151
|
+
ValueError: Error getting dataset terminal-bench@2.0
|
|
152
|
+
ValueError: Either datasets or tasks must be provided.
|
|
153
|
+
`);
|
|
154
|
+
expect(message).toBe("ValueError: Error getting dataset terminal-bench@2.0");
|
|
155
|
+
});
|
|
156
|
+
it("rewrites generic empty-task selection error", ()=>{
|
|
157
|
+
const message = normalizeHarborFailureMessage({
|
|
158
|
+
rawMessage: "ValueError: Either datasets or tasks must be provided.",
|
|
159
|
+
args: [
|
|
160
|
+
"run",
|
|
161
|
+
"--dataset",
|
|
162
|
+
"terminal-bench@2.0",
|
|
163
|
+
"--task-name",
|
|
164
|
+
"heterogeneous-dates"
|
|
165
|
+
],
|
|
166
|
+
dataset: "terminal-bench@2.0"
|
|
167
|
+
});
|
|
168
|
+
expect(message).toBe('No tasks matched "heterogeneous-dates" in dataset "terminal-bench@2.0". Verify task ids for Terminal-Bench 2.0.');
|
|
169
|
+
});
|
|
170
|
+
it("normalizes podman docker host candidates", ()=>{
|
|
171
|
+
expect(parseDockerHostCandidate("unix:///tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
|
|
172
|
+
expect(parseDockerHostCandidate("/tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
|
|
173
|
+
expect(parseDockerHostCandidate("'unix:///tmp/podman.sock'")).toBe("unix:///tmp/podman.sock");
|
|
174
|
+
expect(parseDockerHostCandidate("<nil>")).toBeUndefined();
|
|
175
|
+
expect(parseDockerHostCandidate(void 0)).toBeUndefined();
|
|
176
|
+
});
|
|
177
|
+
it("detects missing compose provider errors", ()=>{
|
|
178
|
+
expect(isMissingComposeProviderError(`
|
|
179
|
+
Error: looking up compose provider failed
|
|
180
|
+
* exec: "podman-compose": executable file not found in $PATH
|
|
181
|
+
`)).toBe(true);
|
|
182
|
+
expect(isMissingComposeProviderError("some other error")).toBe(false);
|
|
183
|
+
});
|
|
184
|
+
it("detects podman-backed docker version output", ()=>{
|
|
185
|
+
expect(isPodmanBackedDockerVersionOutput("Emulate Docker CLI using podman")).toBe(true);
|
|
186
|
+
expect(isPodmanBackedDockerVersionOutput("Docker version 27.0.0")).toBe(false);
|
|
187
|
+
});
|
|
188
|
+
});
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __webpack_exports__ = {};
|
|
3
|
+
const external_node_fs_namespaceObject = require("node:fs");
|
|
4
|
+
const external_node_os_namespaceObject = require("node:os");
|
|
5
|
+
const external_node_path_namespaceObject = require("node:path");
|
|
6
|
+
const external_vitest_namespaceObject = require("vitest");
|
|
7
|
+
const runner_cjs_namespaceObject = require("../bench/runner.cjs");
|
|
8
|
+
(0, external_vitest_namespaceObject.describe)("terminal bench runner", ()=>{
|
|
9
|
+
const workdirs = [];
|
|
10
|
+
(0, external_vitest_namespaceObject.afterEach)(()=>{
|
|
11
|
+
for (const workdir of workdirs)(0, external_node_fs_namespaceObject.rmSync)(workdir, {
|
|
12
|
+
recursive: true,
|
|
13
|
+
force: true
|
|
14
|
+
});
|
|
15
|
+
workdirs.length = 0;
|
|
16
|
+
});
|
|
17
|
+
(0, external_vitest_namespaceObject.it)("runs tasks with command adapter and writes artifacts", async ()=>{
|
|
18
|
+
const workdir = (0, external_node_fs_namespaceObject.mkdtempSync)((0, external_node_path_namespaceObject.join)((0, external_node_os_namespaceObject.tmpdir)(), "wingman-bench-runner-"));
|
|
19
|
+
workdirs.push(workdir);
|
|
20
|
+
const benchmarkDir = (0, external_node_path_namespaceObject.join)(workdir, "bench");
|
|
21
|
+
const tasksDir = (0, external_node_path_namespaceObject.join)(benchmarkDir, "tasks");
|
|
22
|
+
const sandboxDir = (0, external_node_path_namespaceObject.join)(benchmarkDir, "sandbox");
|
|
23
|
+
(0, external_node_fs_namespaceObject.mkdirSync)(tasksDir, {
|
|
24
|
+
recursive: true
|
|
25
|
+
});
|
|
26
|
+
(0, external_node_fs_namespaceObject.mkdirSync)(sandboxDir, {
|
|
27
|
+
recursive: true
|
|
28
|
+
});
|
|
29
|
+
(0, external_node_fs_namespaceObject.writeFileSync)((0, external_node_path_namespaceObject.join)(tasksDir, "suite.json"), JSON.stringify({
|
|
30
|
+
tasks: [
|
|
31
|
+
{
|
|
32
|
+
id: "write-output",
|
|
33
|
+
prompt: "FILE_OK",
|
|
34
|
+
workingDirectory: "sandbox",
|
|
35
|
+
setup: [
|
|
36
|
+
{
|
|
37
|
+
command: "rm",
|
|
38
|
+
args: [
|
|
39
|
+
"-f",
|
|
40
|
+
"output.txt"
|
|
41
|
+
]
|
|
42
|
+
}
|
|
43
|
+
],
|
|
44
|
+
validator: {
|
|
45
|
+
type: "file_contains",
|
|
46
|
+
path: "output.txt",
|
|
47
|
+
includes: [
|
|
48
|
+
"FILE_OK"
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
]
|
|
53
|
+
}, null, 2));
|
|
54
|
+
(0, external_node_fs_namespaceObject.writeFileSync)((0, external_node_path_namespaceObject.join)(benchmarkDir, "config.json"), JSON.stringify({
|
|
55
|
+
taskFile: "tasks/suite.json",
|
|
56
|
+
resultsDir: "results",
|
|
57
|
+
adapter: {
|
|
58
|
+
type: "command",
|
|
59
|
+
command: {
|
|
60
|
+
command: "sh",
|
|
61
|
+
args: [
|
|
62
|
+
"-lc",
|
|
63
|
+
"printf '%s\\n' \"$WINGMAN_BENCH_PROMPT\" > output.txt; echo COMPLETE"
|
|
64
|
+
]
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}, null, 2));
|
|
68
|
+
const summary = await (0, runner_cjs_namespaceObject.runTerminalBench)({
|
|
69
|
+
configPath: (0, external_node_path_namespaceObject.join)(benchmarkDir, "config.json")
|
|
70
|
+
});
|
|
71
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.totalTasks).toBe(1);
|
|
72
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.passedTasks).toBe(1);
|
|
73
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.failedTasks).toBe(0);
|
|
74
|
+
(0, external_vitest_namespaceObject.expect)((0, external_node_fs_namespaceObject.existsSync)((0, external_node_path_namespaceObject.join)(summary.resultsDir, "summary.json"))).toBe(true);
|
|
75
|
+
(0, external_vitest_namespaceObject.expect)((0, external_node_fs_namespaceObject.existsSync)((0, external_node_path_namespaceObject.join)(summary.resultsDir, "write-output.assistant.txt"))).toBe(true);
|
|
76
|
+
(0, external_vitest_namespaceObject.expect)((0, external_node_fs_namespaceObject.readFileSync)((0, external_node_path_namespaceObject.join)(sandboxDir, "output.txt"), "utf-8")).toContain("FILE_OK");
|
|
77
|
+
});
|
|
78
|
+
});
|
|
79
|
+
for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
80
|
+
Object.defineProperty(exports, '__esModule', {
|
|
81
|
+
value: true
|
|
82
|
+
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { afterEach, describe, expect, it } from "vitest";
|
|
5
|
+
import { runTerminalBench } from "../bench/runner.js";
|
|
6
|
+
describe("terminal bench runner", ()=>{
|
|
7
|
+
const workdirs = [];
|
|
8
|
+
afterEach(()=>{
|
|
9
|
+
for (const workdir of workdirs)rmSync(workdir, {
|
|
10
|
+
recursive: true,
|
|
11
|
+
force: true
|
|
12
|
+
});
|
|
13
|
+
workdirs.length = 0;
|
|
14
|
+
});
|
|
15
|
+
it("runs tasks with command adapter and writes artifacts", async ()=>{
|
|
16
|
+
const workdir = mkdtempSync(join(tmpdir(), "wingman-bench-runner-"));
|
|
17
|
+
workdirs.push(workdir);
|
|
18
|
+
const benchmarkDir = join(workdir, "bench");
|
|
19
|
+
const tasksDir = join(benchmarkDir, "tasks");
|
|
20
|
+
const sandboxDir = join(benchmarkDir, "sandbox");
|
|
21
|
+
mkdirSync(tasksDir, {
|
|
22
|
+
recursive: true
|
|
23
|
+
});
|
|
24
|
+
mkdirSync(sandboxDir, {
|
|
25
|
+
recursive: true
|
|
26
|
+
});
|
|
27
|
+
writeFileSync(join(tasksDir, "suite.json"), JSON.stringify({
|
|
28
|
+
tasks: [
|
|
29
|
+
{
|
|
30
|
+
id: "write-output",
|
|
31
|
+
prompt: "FILE_OK",
|
|
32
|
+
workingDirectory: "sandbox",
|
|
33
|
+
setup: [
|
|
34
|
+
{
|
|
35
|
+
command: "rm",
|
|
36
|
+
args: [
|
|
37
|
+
"-f",
|
|
38
|
+
"output.txt"
|
|
39
|
+
]
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
validator: {
|
|
43
|
+
type: "file_contains",
|
|
44
|
+
path: "output.txt",
|
|
45
|
+
includes: [
|
|
46
|
+
"FILE_OK"
|
|
47
|
+
]
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
]
|
|
51
|
+
}, null, 2));
|
|
52
|
+
writeFileSync(join(benchmarkDir, "config.json"), JSON.stringify({
|
|
53
|
+
taskFile: "tasks/suite.json",
|
|
54
|
+
resultsDir: "results",
|
|
55
|
+
adapter: {
|
|
56
|
+
type: "command",
|
|
57
|
+
command: {
|
|
58
|
+
command: "sh",
|
|
59
|
+
args: [
|
|
60
|
+
"-lc",
|
|
61
|
+
"printf '%s\\n' \"$WINGMAN_BENCH_PROMPT\" > output.txt; echo COMPLETE"
|
|
62
|
+
]
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}, null, 2));
|
|
66
|
+
const summary = await runTerminalBench({
|
|
67
|
+
configPath: join(benchmarkDir, "config.json")
|
|
68
|
+
});
|
|
69
|
+
expect(summary.metrics.totalTasks).toBe(1);
|
|
70
|
+
expect(summary.metrics.passedTasks).toBe(1);
|
|
71
|
+
expect(summary.metrics.failedTasks).toBe(0);
|
|
72
|
+
expect(existsSync(join(summary.resultsDir, "summary.json"))).toBe(true);
|
|
73
|
+
expect(existsSync(join(summary.resultsDir, "write-output.assistant.txt"))).toBe(true);
|
|
74
|
+
expect(readFileSync(join(sandboxDir, "output.txt"), "utf-8")).toContain("FILE_OK");
|
|
75
|
+
});
|
|
76
|
+
});
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __webpack_exports__ = {};
|
|
3
|
+
const external_node_fs_namespaceObject = require("node:fs");
|
|
4
|
+
const external_node_os_namespaceObject = require("node:os");
|
|
5
|
+
const external_node_path_namespaceObject = require("node:path");
|
|
6
|
+
const external_vitest_namespaceObject = require("vitest");
|
|
7
|
+
const scoring_cjs_namespaceObject = require("../bench/scoring.cjs");
|
|
8
|
+
function createTask(id, status, durationMs) {
|
|
9
|
+
return {
|
|
10
|
+
taskId: id,
|
|
11
|
+
status,
|
|
12
|
+
workingDirectory: "/tmp",
|
|
13
|
+
prompt: "prompt",
|
|
14
|
+
startedAt: new Date().toISOString(),
|
|
15
|
+
endedAt: new Date().toISOString(),
|
|
16
|
+
durationMs,
|
|
17
|
+
setup: {
|
|
18
|
+
runCount: 0
|
|
19
|
+
},
|
|
20
|
+
adapter: {
|
|
21
|
+
exitCode: "passed" === status ? 0 : 1,
|
|
22
|
+
timedOut: false,
|
|
23
|
+
durationMs,
|
|
24
|
+
stdout: "",
|
|
25
|
+
stderr: "",
|
|
26
|
+
assistantText: "ok",
|
|
27
|
+
tokens: {
|
|
28
|
+
inputTokens: 100,
|
|
29
|
+
outputTokens: 200,
|
|
30
|
+
totalTokens: 300
|
|
31
|
+
}
|
|
32
|
+
},
|
|
33
|
+
validator: {
|
|
34
|
+
passed: "passed" === status,
|
|
35
|
+
details: status
|
|
36
|
+
},
|
|
37
|
+
artifacts: {
|
|
38
|
+
stdoutFile: "stdout.log",
|
|
39
|
+
stderrFile: "stderr.log",
|
|
40
|
+
assistantFile: "assistant.txt",
|
|
41
|
+
recordFile: "record.json"
|
|
42
|
+
}
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
function createConfig(workdir) {
|
|
46
|
+
return {
|
|
47
|
+
version: 1,
|
|
48
|
+
configPath: (0, external_node_path_namespaceObject.join)(workdir, "config.json"),
|
|
49
|
+
taskFilePath: (0, external_node_path_namespaceObject.join)(workdir, "tasks.json"),
|
|
50
|
+
resultsDir: (0, external_node_path_namespaceObject.join)(workdir, "results"),
|
|
51
|
+
run: {
|
|
52
|
+
defaultTimeoutMs: 10000,
|
|
53
|
+
continueOnFailure: true
|
|
54
|
+
},
|
|
55
|
+
adapter: {
|
|
56
|
+
type: "command",
|
|
57
|
+
command: {
|
|
58
|
+
command: "echo"
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
tasks: [],
|
|
62
|
+
scoring: {
|
|
63
|
+
weights: {
|
|
64
|
+
passRate: 0.8,
|
|
65
|
+
reliability: 0.2,
|
|
66
|
+
duration: 0,
|
|
67
|
+
cost: 0
|
|
68
|
+
},
|
|
69
|
+
budgets: {},
|
|
70
|
+
pricing: {
|
|
71
|
+
inputPer1kTokensUsd: 0.001,
|
|
72
|
+
outputPer1kTokensUsd: 0.002
|
|
73
|
+
}
|
|
74
|
+
},
|
|
75
|
+
qualityGate: {
|
|
76
|
+
enabled: true,
|
|
77
|
+
baselineFile: (0, external_node_path_namespaceObject.join)(workdir, "baseline.json"),
|
|
78
|
+
minPassRateDelta: -0.1,
|
|
79
|
+
maxCostIncreaseRatio: 1,
|
|
80
|
+
maxAvgDurationIncreaseRatio: 1
|
|
81
|
+
},
|
|
82
|
+
metadata: {}
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
(0, external_vitest_namespaceObject.describe)("terminal bench scoring", ()=>{
|
|
86
|
+
const workdirs = [];
|
|
87
|
+
(0, external_vitest_namespaceObject.afterEach)(()=>{
|
|
88
|
+
for (const workdir of workdirs)(0, external_node_fs_namespaceObject.rmSync)(workdir, {
|
|
89
|
+
recursive: true,
|
|
90
|
+
force: true
|
|
91
|
+
});
|
|
92
|
+
workdirs.length = 0;
|
|
93
|
+
});
|
|
94
|
+
(0, external_vitest_namespaceObject.it)("computes summary metrics and applies quality gate", async ()=>{
|
|
95
|
+
const workdir = (0, external_node_fs_namespaceObject.mkdtempSync)((0, external_node_path_namespaceObject.join)((0, external_node_os_namespaceObject.tmpdir)(), "wingman-bench-score-"));
|
|
96
|
+
workdirs.push(workdir);
|
|
97
|
+
const baseline = {
|
|
98
|
+
metrics: {
|
|
99
|
+
passRate: 1,
|
|
100
|
+
totalCostUsd: 0.001,
|
|
101
|
+
avgDurationMs: 100
|
|
102
|
+
}
|
|
103
|
+
};
|
|
104
|
+
(0, external_node_fs_namespaceObject.writeFileSync)((0, external_node_path_namespaceObject.join)(workdir, "baseline.json"), JSON.stringify(baseline));
|
|
105
|
+
const summary = await (0, scoring_cjs_namespaceObject.buildTerminalBenchSummary)({
|
|
106
|
+
runId: "run-1",
|
|
107
|
+
startedAt: new Date().toISOString(),
|
|
108
|
+
endedAt: new Date().toISOString(),
|
|
109
|
+
config: createConfig(workdir),
|
|
110
|
+
resultsDir: (0, external_node_path_namespaceObject.join)(workdir, "results", "run-1"),
|
|
111
|
+
tasks: [
|
|
112
|
+
createTask("a", "passed", 100),
|
|
113
|
+
createTask("b", "failed", 200)
|
|
114
|
+
]
|
|
115
|
+
});
|
|
116
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.totalTasks).toBe(2);
|
|
117
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.passedTasks).toBe(1);
|
|
118
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.passRate).toBeCloseTo(0.5);
|
|
119
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.totalTokens).toBe(600);
|
|
120
|
+
(0, external_vitest_namespaceObject.expect)(summary.metrics.totalCostUsd).toBeCloseTo(0.001);
|
|
121
|
+
(0, external_vitest_namespaceObject.expect)(summary.qualityGate.passed).toBe(false);
|
|
122
|
+
(0, external_vitest_namespaceObject.expect)(summary.qualityGate.messages.length).toBeGreaterThan(0);
|
|
123
|
+
});
|
|
124
|
+
});
|
|
125
|
+
for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i];
|
|
126
|
+
Object.defineProperty(exports, '__esModule', {
|
|
127
|
+
value: true
|
|
128
|
+
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { tmpdir } from "node:os";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { afterEach, describe, expect, it } from "vitest";
|
|
5
|
+
import { buildTerminalBenchSummary } from "../bench/scoring.js";
|
|
6
|
+
function createTask(id, status, durationMs) {
|
|
7
|
+
return {
|
|
8
|
+
taskId: id,
|
|
9
|
+
status,
|
|
10
|
+
workingDirectory: "/tmp",
|
|
11
|
+
prompt: "prompt",
|
|
12
|
+
startedAt: new Date().toISOString(),
|
|
13
|
+
endedAt: new Date().toISOString(),
|
|
14
|
+
durationMs,
|
|
15
|
+
setup: {
|
|
16
|
+
runCount: 0
|
|
17
|
+
},
|
|
18
|
+
adapter: {
|
|
19
|
+
exitCode: "passed" === status ? 0 : 1,
|
|
20
|
+
timedOut: false,
|
|
21
|
+
durationMs,
|
|
22
|
+
stdout: "",
|
|
23
|
+
stderr: "",
|
|
24
|
+
assistantText: "ok",
|
|
25
|
+
tokens: {
|
|
26
|
+
inputTokens: 100,
|
|
27
|
+
outputTokens: 200,
|
|
28
|
+
totalTokens: 300
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
validator: {
|
|
32
|
+
passed: "passed" === status,
|
|
33
|
+
details: status
|
|
34
|
+
},
|
|
35
|
+
artifacts: {
|
|
36
|
+
stdoutFile: "stdout.log",
|
|
37
|
+
stderrFile: "stderr.log",
|
|
38
|
+
assistantFile: "assistant.txt",
|
|
39
|
+
recordFile: "record.json"
|
|
40
|
+
}
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
function createConfig(workdir) {
|
|
44
|
+
return {
|
|
45
|
+
version: 1,
|
|
46
|
+
configPath: join(workdir, "config.json"),
|
|
47
|
+
taskFilePath: join(workdir, "tasks.json"),
|
|
48
|
+
resultsDir: join(workdir, "results"),
|
|
49
|
+
run: {
|
|
50
|
+
defaultTimeoutMs: 10000,
|
|
51
|
+
continueOnFailure: true
|
|
52
|
+
},
|
|
53
|
+
adapter: {
|
|
54
|
+
type: "command",
|
|
55
|
+
command: {
|
|
56
|
+
command: "echo"
|
|
57
|
+
}
|
|
58
|
+
},
|
|
59
|
+
tasks: [],
|
|
60
|
+
scoring: {
|
|
61
|
+
weights: {
|
|
62
|
+
passRate: 0.8,
|
|
63
|
+
reliability: 0.2,
|
|
64
|
+
duration: 0,
|
|
65
|
+
cost: 0
|
|
66
|
+
},
|
|
67
|
+
budgets: {},
|
|
68
|
+
pricing: {
|
|
69
|
+
inputPer1kTokensUsd: 0.001,
|
|
70
|
+
outputPer1kTokensUsd: 0.002
|
|
71
|
+
}
|
|
72
|
+
},
|
|
73
|
+
qualityGate: {
|
|
74
|
+
enabled: true,
|
|
75
|
+
baselineFile: join(workdir, "baseline.json"),
|
|
76
|
+
minPassRateDelta: -0.1,
|
|
77
|
+
maxCostIncreaseRatio: 1,
|
|
78
|
+
maxAvgDurationIncreaseRatio: 1
|
|
79
|
+
},
|
|
80
|
+
metadata: {}
|
|
81
|
+
};
|
|
82
|
+
}
|
|
83
|
+
describe("terminal bench scoring", ()=>{
|
|
84
|
+
const workdirs = [];
|
|
85
|
+
afterEach(()=>{
|
|
86
|
+
for (const workdir of workdirs)rmSync(workdir, {
|
|
87
|
+
recursive: true,
|
|
88
|
+
force: true
|
|
89
|
+
});
|
|
90
|
+
workdirs.length = 0;
|
|
91
|
+
});
|
|
92
|
+
it("computes summary metrics and applies quality gate", async ()=>{
|
|
93
|
+
const workdir = mkdtempSync(join(tmpdir(), "wingman-bench-score-"));
|
|
94
|
+
workdirs.push(workdir);
|
|
95
|
+
const baseline = {
|
|
96
|
+
metrics: {
|
|
97
|
+
passRate: 1,
|
|
98
|
+
totalCostUsd: 0.001,
|
|
99
|
+
avgDurationMs: 100
|
|
100
|
+
}
|
|
101
|
+
};
|
|
102
|
+
writeFileSync(join(workdir, "baseline.json"), JSON.stringify(baseline));
|
|
103
|
+
const summary = await buildTerminalBenchSummary({
|
|
104
|
+
runId: "run-1",
|
|
105
|
+
startedAt: new Date().toISOString(),
|
|
106
|
+
endedAt: new Date().toISOString(),
|
|
107
|
+
config: createConfig(workdir),
|
|
108
|
+
resultsDir: join(workdir, "results", "run-1"),
|
|
109
|
+
tasks: [
|
|
110
|
+
createTask("a", "passed", 100),
|
|
111
|
+
createTask("b", "failed", 200)
|
|
112
|
+
]
|
|
113
|
+
});
|
|
114
|
+
expect(summary.metrics.totalTasks).toBe(2);
|
|
115
|
+
expect(summary.metrics.passedTasks).toBe(1);
|
|
116
|
+
expect(summary.metrics.passRate).toBeCloseTo(0.5);
|
|
117
|
+
expect(summary.metrics.totalTokens).toBe(600);
|
|
118
|
+
expect(summary.metrics.totalCostUsd).toBeCloseTo(0.001);
|
|
119
|
+
expect(summary.qualityGate.passed).toBe(false);
|
|
120
|
+
expect(summary.qualityGate.messages.length).toBeGreaterThan(0);
|
|
121
|
+
});
|
|
122
|
+
});
|
|
@@ -20,7 +20,7 @@ var __webpack_modules__ = {
|
|
|
20
20
|
const FAL_API_KEY_ENV = process.env.FAL_API_KEY?.trim() || process.env.FAL_KEY?.trim() || "";
|
|
21
21
|
const FAL_REVIEW_MODE = normalizeReviewMode(process.env.FAL_MCP_REVIEW_MODE);
|
|
22
22
|
const FAL_MODELS = {
|
|
23
|
-
imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-
|
|
23
|
+
imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-2",
|
|
24
24
|
imageEdit: process.env.FAL_MODEL_IMAGE_EDIT?.trim() || "fal-ai/kling-image/v3/image-to-image",
|
|
25
25
|
audioMusic: process.env.FAL_MODEL_AUDIO_OR_MUSIC?.trim() || "fal-ai/elevenlabs/music",
|
|
26
26
|
audioSoundEffect: process.env.FAL_MODEL_SOUND_EFFECT?.trim() || "beatoven/sound-effect-generation",
|
package/dist/tools/mcp-fal-ai.js
CHANGED
|
@@ -28,7 +28,7 @@ var __webpack_modules__ = {
|
|
|
28
28
|
const FAL_API_KEY_ENV = process.env.FAL_API_KEY?.trim() || process.env.FAL_KEY?.trim() || "";
|
|
29
29
|
const FAL_REVIEW_MODE = normalizeReviewMode(process.env.FAL_MCP_REVIEW_MODE);
|
|
30
30
|
const FAL_MODELS = {
|
|
31
|
-
imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-
|
|
31
|
+
imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-2",
|
|
32
32
|
imageEdit: process.env.FAL_MODEL_IMAGE_EDIT?.trim() || "fal-ai/kling-image/v3/image-to-image",
|
|
33
33
|
audioMusic: process.env.FAL_MODEL_AUDIO_OR_MUSIC?.trim() || "fal-ai/elevenlabs/music",
|
|
34
34
|
audioSoundEffect: process.env.FAL_MODEL_SOUND_EFFECT?.trim() || "beatoven/sound-effect-generation",
|