@wingman-ai/gateway 0.4.2 → 0.4.3

This diff shows the differences between two publicly released versions of the package, as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (160)
  1. package/README.md +14 -0
  2. package/dist/agent/config/mcpClientManager.cjs +104 -1
  3. package/dist/agent/config/mcpClientManager.d.ts +30 -0
  4. package/dist/agent/config/mcpClientManager.js +104 -1
  5. package/dist/agent/config/modelFactory.cjs +10 -0
  6. package/dist/agent/config/modelFactory.js +10 -0
  7. package/dist/agent/config/xaiImageModel.cjs +242 -0
  8. package/dist/agent/config/xaiImageModel.d.ts +33 -0
  9. package/dist/agent/config/xaiImageModel.js +202 -0
  10. package/dist/agent/tests/mcpClientManager.test.cjs +116 -0
  11. package/dist/agent/tests/mcpClientManager.test.js +117 -1
  12. package/dist/agent/tests/mcpResourceTools.test.cjs +101 -0
  13. package/dist/agent/tests/mcpResourceTools.test.d.ts +1 -0
  14. package/dist/agent/tests/mcpResourceTools.test.js +95 -0
  15. package/dist/agent/tests/modelFactory.test.cjs +16 -2
  16. package/dist/agent/tests/modelFactory.test.js +16 -2
  17. package/dist/agent/tests/xaiImageModel.test.cjs +194 -0
  18. package/dist/agent/tests/xaiImageModel.test.d.ts +1 -0
  19. package/dist/agent/tests/xaiImageModel.test.js +188 -0
  20. package/dist/agent/tools/mcp_resources.cjs +111 -0
  21. package/dist/agent/tools/mcp_resources.d.ts +3 -0
  22. package/dist/agent/tools/mcp_resources.js +77 -0
  23. package/dist/bench/adapters/commandAdapter.cjs +93 -0
  24. package/dist/bench/adapters/commandAdapter.d.ts +6 -0
  25. package/dist/bench/adapters/commandAdapter.js +59 -0
  26. package/dist/bench/adapters/helpers.cjs +170 -0
  27. package/dist/bench/adapters/helpers.d.ts +7 -0
  28. package/dist/bench/adapters/helpers.js +133 -0
  29. package/dist/bench/adapters/index.cjs +41 -0
  30. package/dist/bench/adapters/index.d.ts +2 -0
  31. package/dist/bench/adapters/index.js +7 -0
  32. package/dist/bench/adapters/wingmanCliAdapter.cjs +100 -0
  33. package/dist/bench/adapters/wingmanCliAdapter.d.ts +6 -0
  34. package/dist/bench/adapters/wingmanCliAdapter.js +66 -0
  35. package/dist/bench/cleanup.cjs +122 -0
  36. package/dist/bench/cleanup.d.ts +9 -0
  37. package/dist/bench/cleanup.js +85 -0
  38. package/dist/bench/config.cjs +190 -0
  39. package/dist/bench/config.d.ts +2 -0
  40. package/dist/bench/config.js +156 -0
  41. package/dist/bench/index.cjs +43 -0
  42. package/dist/bench/index.d.ts +3 -0
  43. package/dist/bench/index.js +3 -0
  44. package/dist/bench/official.cjs +616 -0
  45. package/dist/bench/official.d.ts +80 -0
  46. package/dist/bench/official.js +546 -0
  47. package/dist/bench/officialCli.cjs +204 -0
  48. package/dist/bench/officialCli.d.ts +5 -0
  49. package/dist/bench/officialCli.js +170 -0
  50. package/dist/bench/process.cjs +78 -0
  51. package/dist/bench/process.d.ts +14 -0
  52. package/dist/bench/process.js +44 -0
  53. package/dist/bench/runner.cjs +237 -0
  54. package/dist/bench/runner.d.ts +7 -0
  55. package/dist/bench/runner.js +197 -0
  56. package/dist/bench/scoring.cjs +171 -0
  57. package/dist/bench/scoring.d.ts +9 -0
  58. package/dist/bench/scoring.js +137 -0
  59. package/dist/bench/types.cjs +18 -0
  60. package/dist/bench/types.d.ts +200 -0
  61. package/dist/bench/types.js +0 -0
  62. package/dist/bench/validator.cjs +92 -0
  63. package/dist/bench/validator.d.ts +2 -0
  64. package/dist/bench/validator.js +58 -0
  65. package/dist/cli/config/schema.cjs +36 -1
  66. package/dist/cli/config/schema.d.ts +46 -0
  67. package/dist/cli/config/schema.js +36 -1
  68. package/dist/cli/config/warnings.cjs +119 -51
  69. package/dist/cli/config/warnings.js +119 -51
  70. package/dist/cli/core/agentInvoker.cjs +9 -2
  71. package/dist/cli/core/agentInvoker.d.ts +1 -0
  72. package/dist/cli/core/agentInvoker.js +9 -2
  73. package/dist/cli/core/imagePersistence.cjs +17 -1
  74. package/dist/cli/core/imagePersistence.d.ts +2 -0
  75. package/dist/cli/core/imagePersistence.js +13 -3
  76. package/dist/cli/core/sessionManager.cjs +2 -0
  77. package/dist/cli/core/sessionManager.js +3 -1
  78. package/dist/cli/types.d.ts +18 -0
  79. package/dist/gateway/adapters/teams.cjs +419 -0
  80. package/dist/gateway/adapters/teams.d.ts +47 -0
  81. package/dist/gateway/adapters/teams.js +361 -0
  82. package/dist/gateway/http/sms.cjs +286 -0
  83. package/dist/gateway/http/sms.d.ts +4 -0
  84. package/dist/gateway/http/sms.js +249 -0
  85. package/dist/gateway/server.cjs +54 -3
  86. package/dist/gateway/server.d.ts +2 -0
  87. package/dist/gateway/server.js +54 -3
  88. package/dist/gateway/sms/commands.cjs +116 -0
  89. package/dist/gateway/sms/commands.d.ts +15 -0
  90. package/dist/gateway/sms/commands.js +79 -0
  91. package/dist/gateway/sms/control.cjs +118 -0
  92. package/dist/gateway/sms/control.d.ts +18 -0
  93. package/dist/gateway/sms/control.js +84 -0
  94. package/dist/gateway/sms/policyStore.cjs +198 -0
  95. package/dist/gateway/sms/policyStore.d.ts +37 -0
  96. package/dist/gateway/sms/policyStore.js +161 -0
  97. package/dist/providers/registry.cjs +1 -0
  98. package/dist/providers/registry.js +1 -0
  99. package/dist/tests/cli-config-warnings.test.cjs +41 -0
  100. package/dist/tests/cli-config-warnings.test.js +41 -0
  101. package/dist/tests/cli-init.test.cjs +32 -26
  102. package/dist/tests/cli-init.test.js +32 -26
  103. package/dist/tests/gateway-http-security.test.cjs +21 -0
  104. package/dist/tests/gateway-http-security.test.js +21 -0
  105. package/dist/tests/gateway-origin-policy.test.cjs +22 -0
  106. package/dist/tests/gateway-origin-policy.test.js +22 -0
  107. package/dist/tests/gateway.test.cjs +57 -0
  108. package/dist/tests/gateway.test.js +57 -0
  109. package/dist/tests/imagePersistence.test.cjs +26 -0
  110. package/dist/tests/imagePersistence.test.js +27 -1
  111. package/dist/tests/run-terminal-bench-official-script.test.cjs +61 -0
  112. package/dist/tests/run-terminal-bench-official-script.test.d.ts +1 -0
  113. package/dist/tests/run-terminal-bench-official-script.test.js +55 -0
  114. package/dist/tests/sessions-api.test.cjs +69 -1
  115. package/dist/tests/sessions-api.test.js +70 -2
  116. package/dist/tests/sms-api.test.cjs +183 -0
  117. package/dist/tests/sms-api.test.d.ts +1 -0
  118. package/dist/tests/sms-api.test.js +177 -0
  119. package/dist/tests/sms-commands.test.cjs +90 -0
  120. package/dist/tests/sms-commands.test.d.ts +1 -0
  121. package/dist/tests/sms-commands.test.js +84 -0
  122. package/dist/tests/sms-policy-store.test.cjs +69 -0
  123. package/dist/tests/sms-policy-store.test.d.ts +1 -0
  124. package/dist/tests/sms-policy-store.test.js +63 -0
  125. package/dist/tests/teams-adapter.test.cjs +58 -0
  126. package/dist/tests/teams-adapter.test.d.ts +1 -0
  127. package/dist/tests/teams-adapter.test.js +52 -0
  128. package/dist/tests/terminal-bench-adapters-helpers.test.cjs +64 -0
  129. package/dist/tests/terminal-bench-adapters-helpers.test.d.ts +1 -0
  130. package/dist/tests/terminal-bench-adapters-helpers.test.js +58 -0
  131. package/dist/tests/terminal-bench-cleanup.test.cjs +93 -0
  132. package/dist/tests/terminal-bench-cleanup.test.d.ts +1 -0
  133. package/dist/tests/terminal-bench-cleanup.test.js +87 -0
  134. package/dist/tests/terminal-bench-config.test.cjs +62 -0
  135. package/dist/tests/terminal-bench-config.test.d.ts +1 -0
  136. package/dist/tests/terminal-bench-config.test.js +56 -0
  137. package/dist/tests/terminal-bench-official.test.cjs +194 -0
  138. package/dist/tests/terminal-bench-official.test.d.ts +1 -0
  139. package/dist/tests/terminal-bench-official.test.js +188 -0
  140. package/dist/tests/terminal-bench-runner.test.cjs +82 -0
  141. package/dist/tests/terminal-bench-runner.test.d.ts +1 -0
  142. package/dist/tests/terminal-bench-runner.test.js +76 -0
  143. package/dist/tests/terminal-bench-scoring.test.cjs +128 -0
  144. package/dist/tests/terminal-bench-scoring.test.d.ts +1 -0
  145. package/dist/tests/terminal-bench-scoring.test.js +122 -0
  146. package/dist/tools/mcp-fal-ai.cjs +1 -1
  147. package/dist/tools/mcp-fal-ai.js +1 -1
  148. package/dist/webui/assets/index-Cyg_Hs57.css +11 -0
  149. package/dist/webui/assets/{index-BMekSELC.js → index-DZXLLjaA.js} +109 -109
  150. package/dist/webui/index.html +2 -2
  151. package/package.json +11 -2
  152. package/templates/agents/game-dev/agent.md +122 -63
  153. package/templates/agents/game-dev/art-director.md +106 -0
  154. package/templates/agents/game-dev/game-designer.md +87 -0
  155. package/templates/agents/game-dev/scene-engineer.md +474 -0
  156. package/dist/webui/assets/index-Cwkg4DKj.css +0 -11
  157. package/templates/agents/game-dev/art-generation.md +0 -38
  158. package/templates/agents/game-dev/asset-refinement.md +0 -17
  159. package/templates/agents/game-dev/planning-idea.md +0 -17
  160. package/templates/agents/game-dev/ui-specialist.md +0 -17
@@ -0,0 +1,188 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { buildHarborRunArgs, buildPythonPathEnv, buildRuntimePathEnv, createDockerShimScript, extractHarborErrorMessage, isMissingComposeProviderError, isPodmanBackedDockerVersionOutput, normalizeHarborFailureMessage, parseDockerHostCandidate, parseHarborRunOutput } from "../bench/official.js";
3
+ describe("terminal bench official runner (harbor tb2)", ()=>{ // Suite for the helper functions imported from ../bench/official.js.
4
+ it("builds harbor args with overrides", ()=>{ // The second argument carries the values that appear in the expected args below.
5
+ const args = buildHarborRunArgs({
6
+ dataset: "terminal-bench@2.0",
7
+ taskNames: [
8
+ "a",
9
+ "b"
10
+ ],
11
+ agent: "oracle",
12
+ nConcurrent: 1,
13
+ nAttempts: 1
14
+ }, {
15
+ taskNames: [
16
+ "single"
17
+ ],
18
+ agent: "codex",
19
+ nConcurrent: 2,
20
+ nAttempts: 3,
21
+ nTasks: 2,
22
+ model: "openai/gpt-4.1-mini",
23
+ agentKwargs: {
24
+ foo: "bar"
25
+ }
26
+ });
27
+ expect(args).toEqual([ // toEqual on an array also pins the order of the flags.
28
+ "run",
29
+ "--dataset",
30
+ "terminal-bench@2.0",
31
+ "--agent",
32
+ "codex",
33
+ "--model",
34
+ "openai/gpt-4.1-mini",
35
+ "--n-concurrent",
36
+ "2",
37
+ "--n-attempts",
38
+ "3",
39
+ "--n-tasks",
40
+ "2",
41
+ "--agent-kwarg",
42
+ "foo=bar",
43
+ "--task-name",
44
+ "single"
45
+ ]);
46
+ });
47
+ it("builds harbor args with explicit registry url", ()=>{ // registryUrl is expected to surface as a --registry-url flag.
48
+ const args = buildHarborRunArgs({
49
+ dataset: "terminal-bench@2.0",
50
+ registryUrl: "https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
51
+ agent: "oracle"
52
+ }, {});
53
+ expect(args).toEqual([
54
+ "run",
55
+ "--dataset",
56
+ "terminal-bench@2.0",
57
+ "--registry-url",
58
+ "https://raw.githubusercontent.com/laude-institute/harbor/main/registry.json?source=wingman",
59
+ "--agent",
60
+ "oracle"
61
+ ]);
62
+ });
63
+ it("builds harbor args without task names when running all dataset tasks", ()=>{
64
+ const args = buildHarborRunArgs({
65
+ dataset: "terminal-bench@2.0",
66
+ agent: "oracle",
67
+ nConcurrent: 1
68
+ }, {
69
+ taskNames: [] // Empty list: the expected args below contain no --task-name flag.
70
+ });
71
+ expect(args).toEqual([
72
+ "run",
73
+ "--dataset",
74
+ "terminal-bench@2.0",
75
+ "--agent",
76
+ "oracle",
77
+ "--n-concurrent",
78
+ "1"
79
+ ]);
80
+ });
81
+ it("builds harbor args with custom import-path agent", ()=>{
82
+ const args = buildHarborRunArgs({
83
+ dataset: "terminal-bench@2.0",
84
+ taskNames: [
85
+ "hello-world"
86
+ ],
87
+ agent: "oracle", // Expected args below use --agent-import-path and contain no --agent flag.
88
+ agentImportPath: "my_pkg.my_agent:MyAgent",
89
+ agentKwargs: {
90
+ wingman_agent: "coding",
91
+ model_name: "should-not-pass" // Expected args below contain no model_name kwarg.
92
+ },
93
+ nConcurrent: 1
94
+ }, {
95
+ agentKwargs: {
96
+ wingman_cli_path: "./bin/wingman"
97
+ }
98
+ });
99
+ expect(args).toEqual([
100
+ "run",
101
+ "--dataset",
102
+ "terminal-bench@2.0",
103
+ "--agent-import-path",
104
+ "my_pkg.my_agent:MyAgent",
105
+ "--n-concurrent",
106
+ "1",
107
+ "--agent-kwarg",
108
+ "wingman_agent=coding",
109
+ "--agent-kwarg",
110
+ "wingman_cli_path=./bin/wingman",
111
+ "--task-name",
112
+ "hello-world"
113
+ ]);
114
+ });
115
+ it("parses resolved/unresolved/accuracy and pass@k", ()=>{
116
+ const parsed = parseHarborRunOutput(`
117
+ │ Resolved Trials │ 1 │
118
+ │ Unresolved Trials │ 1 │
119
+ │ Accuracy │ 50.00% │
120
+ │ Pass@1 │ 50.00% │
121
+ Results saved to /tmp/harbor/runs/run-1
122
+ `); // Fixture: bordered table rows followed by a "Results saved to" line.
123
+ expect(parsed.resolvedTrials).toBe(1);
124
+ expect(parsed.unresolvedTrials).toBe(1);
125
+ expect(parsed.accuracyPercent).toBe(50); // Percent strings are expected back as plain numbers.
126
+ expect(parsed.passAtK["1"]).toBe(50); // passAtK is indexed with the string "1" for the Pass@1 row.
127
+ expect(parsed.runOutputPath).toBe("/tmp/harbor/runs/run-1");
128
+ });
129
+ it("builds a docker shim script and path for podman fallback", ()=>{
130
+ const script = createDockerShimScript("/usr/local/bin/podman");
131
+ expect(script).toContain("TARGET_BINARY='/usr/local/bin/podman'"); // The path passed in is expected single-quoted in the script.
132
+ expect(script).toContain("exec podman-compose");
133
+ expect(script).toContain("exec podman cp");
134
+ expect(script).toContain("exec podman exec");
135
+ expect(script).toContain("label=com.docker.compose.project");
136
+ expect(script).toContain("--project-directory");
137
+ expect(script.startsWith("#!/bin/bash")).toBe(true);
138
+ expect(buildRuntimePathEnv("/tmp/runtime-bin", "/usr/bin")).toBe("/tmp/runtime-bin:/usr/bin"); // Expected value is the two arguments joined with ":" in argument order.
139
+ expect(buildPythonPathEnv("/tmp/repo", "/usr/lib/python")).toBe("/tmp/repo:/usr/lib/python");
140
+ });
141
+ it("extracts a concise harbor error message", ()=>{
142
+ const message = extractHarborErrorMessage(`
143
+ Traceback...
144
+ ValueError: No tasks found matching pattern: jq-data-processing
145
+ `);
146
+ expect(message).toBe("ValueError: No tasks found matching pattern: jq-data-processing");
147
+ });
148
+ it("extracts a specific dataset resolution error over generic fallback", ()=>{
149
+ const message = extractHarborErrorMessage(`
150
+ Traceback...
151
+ ValueError: Error getting dataset terminal-bench@2.0
152
+ ValueError: Either datasets or tasks must be provided.
153
+ `);
154
+ expect(message).toBe("ValueError: Error getting dataset terminal-bench@2.0"); // The first of the two ValueError lines is the one expected back.
155
+ });
156
+ it("rewrites generic empty-task selection error", ()=>{
157
+ const message = normalizeHarborFailureMessage({
158
+ rawMessage: "ValueError: Either datasets or tasks must be provided.",
159
+ args: [
160
+ "run",
161
+ "--dataset",
162
+ "terminal-bench@2.0",
163
+ "--task-name",
164
+ "heterogeneous-dates"
165
+ ],
166
+ dataset: "terminal-bench@2.0"
167
+ });
168
+ expect(message).toBe('No tasks matched "heterogeneous-dates" in dataset "terminal-bench@2.0". Verify task ids for Terminal-Bench 2.0.'); // Expected message quotes the --task-name value and the dataset from the input.
169
+ });
170
+ it("normalizes podman docker host candidates", ()=>{
171
+ expect(parseDockerHostCandidate("unix:///tmp/podman.sock")).toBe("unix:///tmp/podman.sock");
172
+ expect(parseDockerHostCandidate("/tmp/podman.sock")).toBe("unix:///tmp/podman.sock"); // A bare path is expected to gain the unix:// scheme.
173
+ expect(parseDockerHostCandidate("'unix:///tmp/podman.sock'")).toBe("unix:///tmp/podman.sock"); // Surrounding single quotes are expected to be removed.
174
+ expect(parseDockerHostCandidate("<nil>")).toBeUndefined(); // "<nil>" is expected to count as no candidate.
175
+ expect(parseDockerHostCandidate(void 0)).toBeUndefined();
176
+ });
177
+ it("detects missing compose provider errors", ()=>{
178
+ expect(isMissingComposeProviderError(`
179
+ Error: looking up compose provider failed
180
+ * exec: "podman-compose": executable file not found in $PATH
181
+ `)).toBe(true);
182
+ expect(isMissingComposeProviderError("some other error")).toBe(false);
183
+ });
184
+ it("detects podman-backed docker version output", ()=>{
185
+ expect(isPodmanBackedDockerVersionOutput("Emulate Docker CLI using podman")).toBe(true);
186
+ expect(isPodmanBackedDockerVersionOutput("Docker version 27.0.0")).toBe(false);
187
+ });
188
+ });
@@ -0,0 +1,82 @@
1
+ "use strict";
2
+ var __webpack_exports__ = {};
3
+ const external_node_fs_namespaceObject = require("node:fs");
4
+ const external_node_os_namespaceObject = require("node:os");
5
+ const external_node_path_namespaceObject = require("node:path");
6
+ const external_vitest_namespaceObject = require("vitest");
7
+ const runner_cjs_namespaceObject = require("../bench/runner.cjs");
8
+ (0, external_vitest_namespaceObject.describe)("terminal bench runner", ()=>{ // Suite for runTerminalBench from ../bench/runner.cjs.
9
+ const workdirs = []; // Temp directories created by the tests; removed in afterEach.
10
+ (0, external_vitest_namespaceObject.afterEach)(()=>{
11
+ for (const workdir of workdirs)(0, external_node_fs_namespaceObject.rmSync)(workdir, {
12
+ recursive: true,
13
+ force: true
14
+ });
15
+ workdirs.length = 0; // Empty the list in place so the next test starts clean.
16
+ });
17
+ (0, external_vitest_namespaceObject.it)("runs tasks with command adapter and writes artifacts", async ()=>{
18
+ const workdir = (0, external_node_fs_namespaceObject.mkdtempSync)((0, external_node_path_namespaceObject.join)((0, external_node_os_namespaceObject.tmpdir)(), "wingman-bench-runner-"));
19
+ workdirs.push(workdir); // Register for cleanup before anything else is written.
20
+ const benchmarkDir = (0, external_node_path_namespaceObject.join)(workdir, "bench");
21
+ const tasksDir = (0, external_node_path_namespaceObject.join)(benchmarkDir, "tasks");
22
+ const sandboxDir = (0, external_node_path_namespaceObject.join)(benchmarkDir, "sandbox");
23
+ (0, external_node_fs_namespaceObject.mkdirSync)(tasksDir, {
24
+ recursive: true
25
+ });
26
+ (0, external_node_fs_namespaceObject.mkdirSync)(sandboxDir, {
27
+ recursive: true
28
+ });
29
+ (0, external_node_fs_namespaceObject.writeFileSync)((0, external_node_path_namespaceObject.join)(tasksDir, "suite.json"), JSON.stringify({ // Task suite fixture containing a single task.
30
+ tasks: [
31
+ {
32
+ id: "write-output",
33
+ prompt: "FILE_OK",
34
+ workingDirectory: "sandbox", // Same directory name as sandboxDir above.
35
+ setup: [
36
+ {
37
+ command: "rm",
38
+ args: [
39
+ "-f",
40
+ "output.txt"
41
+ ]
42
+ }
43
+ ],
44
+ validator: {
45
+ type: "file_contains",
46
+ path: "output.txt",
47
+ includes: [
48
+ "FILE_OK"
49
+ ]
50
+ }
51
+ }
52
+ ]
53
+ }, null, 2));
54
+ (0, external_node_fs_namespaceObject.writeFileSync)((0, external_node_path_namespaceObject.join)(benchmarkDir, "config.json"), JSON.stringify({ // Benchmark config fixture pointing at the suite file above.
55
+ taskFile: "tasks/suite.json",
56
+ resultsDir: "results",
57
+ adapter: {
58
+ type: "command",
59
+ command: {
60
+ command: "sh",
61
+ args: [
62
+ "-lc",
63
+ "printf '%s\\n' \"$WINGMAN_BENCH_PROMPT\" > output.txt; echo COMPLETE" // Writes $WINGMAN_BENCH_PROMPT to output.txt, then prints COMPLETE.
64
+ ]
65
+ }
66
+ }
67
+ }, null, 2));
68
+ const summary = await (0, runner_cjs_namespaceObject.runTerminalBench)({
69
+ configPath: (0, external_node_path_namespaceObject.join)(benchmarkDir, "config.json")
70
+ });
71
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.totalTasks).toBe(1);
72
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.passedTasks).toBe(1);
73
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.failedTasks).toBe(0);
74
+ (0, external_vitest_namespaceObject.expect)((0, external_node_fs_namespaceObject.existsSync)((0, external_node_path_namespaceObject.join)(summary.resultsDir, "summary.json"))).toBe(true);
75
+ (0, external_vitest_namespaceObject.expect)((0, external_node_fs_namespaceObject.existsSync)((0, external_node_path_namespaceObject.join)(summary.resultsDir, "write-output.assistant.txt"))).toBe(true); // Artifact file name starts with the task id "write-output".
76
+ (0, external_vitest_namespaceObject.expect)((0, external_node_fs_namespaceObject.readFileSync)((0, external_node_path_namespaceObject.join)(sandboxDir, "output.txt"), "utf-8")).toContain("FILE_OK"); // The task prompt is "FILE_OK"; the adapter command is expected to have written it here.
77
+ });
78
+ });
79
+ for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i]; // Copy every enumerable key of __webpack_exports__ onto the CommonJS exports object.
80
+ Object.defineProperty(exports, '__esModule', { // Define an __esModule property with value true on exports.
81
+ value: true
82
+ });
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,76 @@
1
+ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
2
+ import { tmpdir } from "node:os";
3
+ import { join } from "node:path";
4
+ import { afterEach, describe, expect, it } from "vitest";
5
+ import { runTerminalBench } from "../bench/runner.js";
6
+ describe("terminal bench runner", ()=>{ // Suite for runTerminalBench from ../bench/runner.js.
7
+ const workdirs = []; // Temp directories created by the tests; removed in afterEach.
8
+ afterEach(()=>{
9
+ for (const workdir of workdirs)rmSync(workdir, {
10
+ recursive: true,
11
+ force: true
12
+ });
13
+ workdirs.length = 0; // Empty the list in place so the next test starts clean.
14
+ });
15
+ it("runs tasks with command adapter and writes artifacts", async ()=>{
16
+ const workdir = mkdtempSync(join(tmpdir(), "wingman-bench-runner-"));
17
+ workdirs.push(workdir); // Register for cleanup before anything else is written.
18
+ const benchmarkDir = join(workdir, "bench");
19
+ const tasksDir = join(benchmarkDir, "tasks");
20
+ const sandboxDir = join(benchmarkDir, "sandbox");
21
+ mkdirSync(tasksDir, {
22
+ recursive: true
23
+ });
24
+ mkdirSync(sandboxDir, {
25
+ recursive: true
26
+ });
27
+ writeFileSync(join(tasksDir, "suite.json"), JSON.stringify({ // Task suite fixture containing a single task.
28
+ tasks: [
29
+ {
30
+ id: "write-output",
31
+ prompt: "FILE_OK",
32
+ workingDirectory: "sandbox", // Same directory name as sandboxDir above.
33
+ setup: [
34
+ {
35
+ command: "rm",
36
+ args: [
37
+ "-f",
38
+ "output.txt"
39
+ ]
40
+ }
41
+ ],
42
+ validator: {
43
+ type: "file_contains",
44
+ path: "output.txt",
45
+ includes: [
46
+ "FILE_OK"
47
+ ]
48
+ }
49
+ }
50
+ ]
51
+ }, null, 2));
52
+ writeFileSync(join(benchmarkDir, "config.json"), JSON.stringify({ // Benchmark config fixture pointing at the suite file above.
53
+ taskFile: "tasks/suite.json",
54
+ resultsDir: "results",
55
+ adapter: {
56
+ type: "command",
57
+ command: {
58
+ command: "sh",
59
+ args: [
60
+ "-lc",
61
+ "printf '%s\\n' \"$WINGMAN_BENCH_PROMPT\" > output.txt; echo COMPLETE" // Writes $WINGMAN_BENCH_PROMPT to output.txt, then prints COMPLETE.
62
+ ]
63
+ }
64
+ }
65
+ }, null, 2));
66
+ const summary = await runTerminalBench({
67
+ configPath: join(benchmarkDir, "config.json")
68
+ });
69
+ expect(summary.metrics.totalTasks).toBe(1);
70
+ expect(summary.metrics.passedTasks).toBe(1);
71
+ expect(summary.metrics.failedTasks).toBe(0);
72
+ expect(existsSync(join(summary.resultsDir, "summary.json"))).toBe(true);
73
+ expect(existsSync(join(summary.resultsDir, "write-output.assistant.txt"))).toBe(true); // Artifact file name starts with the task id "write-output".
74
+ expect(readFileSync(join(sandboxDir, "output.txt"), "utf-8")).toContain("FILE_OK"); // The task prompt is "FILE_OK"; the adapter command is expected to have written it here.
75
+ });
76
+ });
@@ -0,0 +1,128 @@
1
+ "use strict";
2
+ var __webpack_exports__ = {};
3
+ const external_node_fs_namespaceObject = require("node:fs");
4
+ const external_node_os_namespaceObject = require("node:os");
5
+ const external_node_path_namespaceObject = require("node:path");
6
+ const external_vitest_namespaceObject = require("vitest");
7
+ const scoring_cjs_namespaceObject = require("../bench/scoring.cjs");
8
+ function createTask(id, status, durationMs) { // Builds a task-result fixture from the given id, status and duration.
9
+ return {
10
+ taskId: id,
11
+ status,
12
+ workingDirectory: "/tmp",
13
+ prompt: "prompt",
14
+ startedAt: new Date().toISOString(), // Both timestamps are taken at call time; neither is derived from durationMs.
15
+ endedAt: new Date().toISOString(),
16
+ durationMs,
17
+ setup: {
18
+ runCount: 0
19
+ },
20
+ adapter: {
21
+ exitCode: "passed" === status ? 0 : 1, // 0 only when status is "passed", otherwise 1.
22
+ timedOut: false,
23
+ durationMs,
24
+ stdout: "",
25
+ stderr: "",
26
+ assistantText: "ok",
27
+ tokens: { // Fixed token counts, identical for every fixture task.
28
+ inputTokens: 100,
29
+ outputTokens: 200,
30
+ totalTokens: 300
31
+ }
32
+ },
33
+ validator: {
34
+ passed: "passed" === status, // true only when status is "passed".
35
+ details: status
36
+ },
37
+ artifacts: {
38
+ stdoutFile: "stdout.log",
39
+ stderrFile: "stderr.log",
40
+ assistantFile: "assistant.txt",
41
+ recordFile: "record.json"
42
+ }
43
+ };
44
+ }
45
+ function createConfig(workdir) { // Builds a config fixture whose file paths are all joined onto workdir.
46
+ return {
47
+ version: 1,
48
+ configPath: (0, external_node_path_namespaceObject.join)(workdir, "config.json"),
49
+ taskFilePath: (0, external_node_path_namespaceObject.join)(workdir, "tasks.json"),
50
+ resultsDir: (0, external_node_path_namespaceObject.join)(workdir, "results"),
51
+ run: {
52
+ defaultTimeoutMs: 10000,
53
+ continueOnFailure: true
54
+ },
55
+ adapter: {
56
+ type: "command",
57
+ command: {
58
+ command: "echo"
59
+ }
60
+ },
61
+ tasks: [], // No tasks in the config itself.
62
+ scoring: {
63
+ weights: { // Only passRate and reliability carry non-zero weight.
64
+ passRate: 0.8,
65
+ reliability: 0.2,
66
+ duration: 0,
67
+ cost: 0
68
+ },
69
+ budgets: {},
70
+ pricing: {
71
+ inputPer1kTokensUsd: 0.001,
72
+ outputPer1kTokensUsd: 0.002
73
+ }
74
+ },
75
+ qualityGate: {
76
+ enabled: true,
77
+ baselineFile: (0, external_node_path_namespaceObject.join)(workdir, "baseline.json"), // File name is "baseline.json" directly under workdir.
78
+ minPassRateDelta: -0.1,
79
+ maxCostIncreaseRatio: 1,
80
+ maxAvgDurationIncreaseRatio: 1
81
+ },
82
+ metadata: {}
83
+ };
84
+ }
85
+ (0, external_vitest_namespaceObject.describe)("terminal bench scoring", ()=>{ // Suite for buildTerminalBenchSummary from ../bench/scoring.cjs.
86
+ const workdirs = []; // Temp directories created by the tests; removed in afterEach.
87
+ (0, external_vitest_namespaceObject.afterEach)(()=>{
88
+ for (const workdir of workdirs)(0, external_node_fs_namespaceObject.rmSync)(workdir, {
89
+ recursive: true,
90
+ force: true
91
+ });
92
+ workdirs.length = 0; // Empty the list in place so the next test starts clean.
93
+ });
94
+ (0, external_vitest_namespaceObject.it)("computes summary metrics and applies quality gate", async ()=>{
95
+ const workdir = (0, external_node_fs_namespaceObject.mkdtempSync)((0, external_node_path_namespaceObject.join)((0, external_node_os_namespaceObject.tmpdir)(), "wingman-bench-score-"));
96
+ workdirs.push(workdir);
97
+ const baseline = { // Baseline metrics written to disk for the quality gate.
98
+ metrics: {
99
+ passRate: 1,
100
+ totalCostUsd: 0.001,
101
+ avgDurationMs: 100
102
+ }
103
+ };
104
+ (0, external_node_fs_namespaceObject.writeFileSync)((0, external_node_path_namespaceObject.join)(workdir, "baseline.json"), JSON.stringify(baseline)); // Written to workdir/baseline.json.
105
+ const summary = await (0, scoring_cjs_namespaceObject.buildTerminalBenchSummary)({
106
+ runId: "run-1",
107
+ startedAt: new Date().toISOString(),
108
+ endedAt: new Date().toISOString(),
109
+ config: createConfig(workdir),
110
+ resultsDir: (0, external_node_path_namespaceObject.join)(workdir, "results", "run-1"),
111
+ tasks: [ // One passed and one failed task.
112
+ createTask("a", "passed", 100),
113
+ createTask("b", "failed", 200)
114
+ ]
115
+ });
116
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.totalTasks).toBe(2);
117
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.passedTasks).toBe(1);
118
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.passRate).toBeCloseTo(0.5);
119
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.totalTokens).toBe(600); // Two tasks, each built by createTask() in this file.
120
+ (0, external_vitest_namespaceObject.expect)(summary.metrics.totalCostUsd).toBeCloseTo(0.001); // NOTE(review): how cost is derived from token counts and pricing is not visible here — confirm in bench/scoring.
121
+ (0, external_vitest_namespaceObject.expect)(summary.qualityGate.passed).toBe(false); // NOTE(review): presumably fails because passRate 0.5 is below the baseline's 1 — confirm in bench/scoring.
122
+ (0, external_vitest_namespaceObject.expect)(summary.qualityGate.messages.length).toBeGreaterThan(0);
123
+ });
124
+ });
125
+ for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i]; // Copy every enumerable key of __webpack_exports__ onto the CommonJS exports object.
126
+ Object.defineProperty(exports, '__esModule', { // Define an __esModule property with value true on exports.
127
+ value: true
128
+ });
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,122 @@
1
+ import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
2
+ import { tmpdir } from "node:os";
3
+ import { join } from "node:path";
4
+ import { afterEach, describe, expect, it } from "vitest";
5
+ import { buildTerminalBenchSummary } from "../bench/scoring.js";
6
+ function createTask(id, status, durationMs) { // Builds a task-result fixture from the given id, status and duration.
7
+ return {
8
+ taskId: id,
9
+ status,
10
+ workingDirectory: "/tmp",
11
+ prompt: "prompt",
12
+ startedAt: new Date().toISOString(), // Both timestamps are taken at call time; neither is derived from durationMs.
13
+ endedAt: new Date().toISOString(),
14
+ durationMs,
15
+ setup: {
16
+ runCount: 0
17
+ },
18
+ adapter: {
19
+ exitCode: "passed" === status ? 0 : 1, // 0 only when status is "passed", otherwise 1.
20
+ timedOut: false,
21
+ durationMs,
22
+ stdout: "",
23
+ stderr: "",
24
+ assistantText: "ok",
25
+ tokens: { // Fixed token counts, identical for every fixture task.
26
+ inputTokens: 100,
27
+ outputTokens: 200,
28
+ totalTokens: 300
29
+ }
30
+ },
31
+ validator: {
32
+ passed: "passed" === status, // true only when status is "passed".
33
+ details: status
34
+ },
35
+ artifacts: {
36
+ stdoutFile: "stdout.log",
37
+ stderrFile: "stderr.log",
38
+ assistantFile: "assistant.txt",
39
+ recordFile: "record.json"
40
+ }
41
+ };
42
+ }
43
+ function createConfig(workdir) { // Builds a config fixture whose file paths are all joined onto workdir.
44
+ return {
45
+ version: 1,
46
+ configPath: join(workdir, "config.json"),
47
+ taskFilePath: join(workdir, "tasks.json"),
48
+ resultsDir: join(workdir, "results"),
49
+ run: {
50
+ defaultTimeoutMs: 10000,
51
+ continueOnFailure: true
52
+ },
53
+ adapter: {
54
+ type: "command",
55
+ command: {
56
+ command: "echo"
57
+ }
58
+ },
59
+ tasks: [], // No tasks in the config itself.
60
+ scoring: {
61
+ weights: { // Only passRate and reliability carry non-zero weight.
62
+ passRate: 0.8,
63
+ reliability: 0.2,
64
+ duration: 0,
65
+ cost: 0
66
+ },
67
+ budgets: {},
68
+ pricing: {
69
+ inputPer1kTokensUsd: 0.001,
70
+ outputPer1kTokensUsd: 0.002
71
+ }
72
+ },
73
+ qualityGate: {
74
+ enabled: true,
75
+ baselineFile: join(workdir, "baseline.json"), // File name is "baseline.json" directly under workdir.
76
+ minPassRateDelta: -0.1,
77
+ maxCostIncreaseRatio: 1,
78
+ maxAvgDurationIncreaseRatio: 1
79
+ },
80
+ metadata: {}
81
+ };
82
+ }
83
+ describe("terminal bench scoring", ()=>{ // Suite for buildTerminalBenchSummary from ../bench/scoring.js.
84
+ const workdirs = []; // Temp directories created by the tests; removed in afterEach.
85
+ afterEach(()=>{
86
+ for (const workdir of workdirs)rmSync(workdir, {
87
+ recursive: true,
88
+ force: true
89
+ });
90
+ workdirs.length = 0; // Empty the list in place so the next test starts clean.
91
+ });
92
+ it("computes summary metrics and applies quality gate", async ()=>{
93
+ const workdir = mkdtempSync(join(tmpdir(), "wingman-bench-score-"));
94
+ workdirs.push(workdir);
95
+ const baseline = { // Baseline metrics written to disk for the quality gate.
96
+ metrics: {
97
+ passRate: 1,
98
+ totalCostUsd: 0.001,
99
+ avgDurationMs: 100
100
+ }
101
+ };
102
+ writeFileSync(join(workdir, "baseline.json"), JSON.stringify(baseline)); // Written to workdir/baseline.json.
103
+ const summary = await buildTerminalBenchSummary({
104
+ runId: "run-1",
105
+ startedAt: new Date().toISOString(),
106
+ endedAt: new Date().toISOString(),
107
+ config: createConfig(workdir),
108
+ resultsDir: join(workdir, "results", "run-1"),
109
+ tasks: [ // One passed and one failed task.
110
+ createTask("a", "passed", 100),
111
+ createTask("b", "failed", 200)
112
+ ]
113
+ });
114
+ expect(summary.metrics.totalTasks).toBe(2);
115
+ expect(summary.metrics.passedTasks).toBe(1);
116
+ expect(summary.metrics.passRate).toBeCloseTo(0.5);
117
+ expect(summary.metrics.totalTokens).toBe(600); // Two tasks, each built by createTask() in this file.
118
+ expect(summary.metrics.totalCostUsd).toBeCloseTo(0.001); // NOTE(review): how cost is derived from token counts and pricing is not visible here — confirm in bench/scoring.
119
+ expect(summary.qualityGate.passed).toBe(false); // NOTE(review): presumably fails because passRate 0.5 is below the baseline's 1 — confirm in bench/scoring.
120
+ expect(summary.qualityGate.messages.length).toBeGreaterThan(0);
121
+ });
122
+ });
@@ -20,7 +20,7 @@ var __webpack_modules__ = {
20
20
  const FAL_API_KEY_ENV = process.env.FAL_API_KEY?.trim() || process.env.FAL_KEY?.trim() || "";
21
21
  const FAL_REVIEW_MODE = normalizeReviewMode(process.env.FAL_MCP_REVIEW_MODE);
22
22
  const FAL_MODELS = {
23
- imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-pro",
23
+ imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-2",
24
24
  imageEdit: process.env.FAL_MODEL_IMAGE_EDIT?.trim() || "fal-ai/kling-image/v3/image-to-image",
25
25
  audioMusic: process.env.FAL_MODEL_AUDIO_OR_MUSIC?.trim() || "fal-ai/elevenlabs/music",
26
26
  audioSoundEffect: process.env.FAL_MODEL_SOUND_EFFECT?.trim() || "beatoven/sound-effect-generation",
@@ -28,7 +28,7 @@ var __webpack_modules__ = {
28
28
  const FAL_API_KEY_ENV = process.env.FAL_API_KEY?.trim() || process.env.FAL_KEY?.trim() || "";
29
29
  const FAL_REVIEW_MODE = normalizeReviewMode(process.env.FAL_MCP_REVIEW_MODE);
30
30
  const FAL_MODELS = {
31
- imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-pro",
31
+ imageOrTexture: process.env.FAL_MODEL_IMAGE_OR_TEXTURE?.trim() || "fal-ai/nano-banana-2",
32
32
  imageEdit: process.env.FAL_MODEL_IMAGE_EDIT?.trim() || "fal-ai/kling-image/v3/image-to-image",
33
33
  audioMusic: process.env.FAL_MODEL_AUDIO_OR_MUSIC?.trim() || "fal-ai/elevenlabs/music",
34
34
  audioSoundEffect: process.env.FAL_MODEL_SOUND_EFFECT?.trim() || "beatoven/sound-effect-generation",