@wingman-ai/gateway 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/README.md +14 -0
  2. package/dist/agent/config/mcpClientManager.cjs +104 -1
  3. package/dist/agent/config/mcpClientManager.d.ts +30 -0
  4. package/dist/agent/config/mcpClientManager.js +104 -1
  5. package/dist/agent/config/modelFactory.cjs +10 -0
  6. package/dist/agent/config/modelFactory.js +10 -0
  7. package/dist/agent/config/xaiImageModel.cjs +242 -0
  8. package/dist/agent/config/xaiImageModel.d.ts +33 -0
  9. package/dist/agent/config/xaiImageModel.js +202 -0
  10. package/dist/agent/tests/mcpClientManager.test.cjs +116 -0
  11. package/dist/agent/tests/mcpClientManager.test.js +117 -1
  12. package/dist/agent/tests/mcpResourceTools.test.cjs +101 -0
  13. package/dist/agent/tests/mcpResourceTools.test.d.ts +1 -0
  14. package/dist/agent/tests/mcpResourceTools.test.js +95 -0
  15. package/dist/agent/tests/modelFactory.test.cjs +16 -2
  16. package/dist/agent/tests/modelFactory.test.js +16 -2
  17. package/dist/agent/tests/xaiImageModel.test.cjs +194 -0
  18. package/dist/agent/tests/xaiImageModel.test.d.ts +1 -0
  19. package/dist/agent/tests/xaiImageModel.test.js +188 -0
  20. package/dist/agent/tools/mcp_resources.cjs +111 -0
  21. package/dist/agent/tools/mcp_resources.d.ts +3 -0
  22. package/dist/agent/tools/mcp_resources.js +77 -0
  23. package/dist/bench/adapters/commandAdapter.cjs +93 -0
  24. package/dist/bench/adapters/commandAdapter.d.ts +6 -0
  25. package/dist/bench/adapters/commandAdapter.js +59 -0
  26. package/dist/bench/adapters/helpers.cjs +170 -0
  27. package/dist/bench/adapters/helpers.d.ts +7 -0
  28. package/dist/bench/adapters/helpers.js +133 -0
  29. package/dist/bench/adapters/index.cjs +41 -0
  30. package/dist/bench/adapters/index.d.ts +2 -0
  31. package/dist/bench/adapters/index.js +7 -0
  32. package/dist/bench/adapters/wingmanCliAdapter.cjs +100 -0
  33. package/dist/bench/adapters/wingmanCliAdapter.d.ts +6 -0
  34. package/dist/bench/adapters/wingmanCliAdapter.js +66 -0
  35. package/dist/bench/cleanup.cjs +122 -0
  36. package/dist/bench/cleanup.d.ts +9 -0
  37. package/dist/bench/cleanup.js +85 -0
  38. package/dist/bench/config.cjs +190 -0
  39. package/dist/bench/config.d.ts +2 -0
  40. package/dist/bench/config.js +156 -0
  41. package/dist/bench/index.cjs +43 -0
  42. package/dist/bench/index.d.ts +3 -0
  43. package/dist/bench/index.js +3 -0
  44. package/dist/bench/official.cjs +616 -0
  45. package/dist/bench/official.d.ts +80 -0
  46. package/dist/bench/official.js +546 -0
  47. package/dist/bench/officialCli.cjs +204 -0
  48. package/dist/bench/officialCli.d.ts +5 -0
  49. package/dist/bench/officialCli.js +170 -0
  50. package/dist/bench/process.cjs +78 -0
  51. package/dist/bench/process.d.ts +14 -0
  52. package/dist/bench/process.js +44 -0
  53. package/dist/bench/runner.cjs +237 -0
  54. package/dist/bench/runner.d.ts +7 -0
  55. package/dist/bench/runner.js +197 -0
  56. package/dist/bench/scoring.cjs +171 -0
  57. package/dist/bench/scoring.d.ts +9 -0
  58. package/dist/bench/scoring.js +137 -0
  59. package/dist/bench/types.cjs +18 -0
  60. package/dist/bench/types.d.ts +200 -0
  61. package/dist/bench/types.js +0 -0
  62. package/dist/bench/validator.cjs +92 -0
  63. package/dist/bench/validator.d.ts +2 -0
  64. package/dist/bench/validator.js +58 -0
  65. package/dist/cli/config/schema.cjs +36 -1
  66. package/dist/cli/config/schema.d.ts +46 -0
  67. package/dist/cli/config/schema.js +36 -1
  68. package/dist/cli/config/warnings.cjs +119 -51
  69. package/dist/cli/config/warnings.js +119 -51
  70. package/dist/cli/core/agentInvoker.cjs +9 -2
  71. package/dist/cli/core/agentInvoker.d.ts +1 -0
  72. package/dist/cli/core/agentInvoker.js +9 -2
  73. package/dist/cli/core/imagePersistence.cjs +17 -1
  74. package/dist/cli/core/imagePersistence.d.ts +2 -0
  75. package/dist/cli/core/imagePersistence.js +13 -3
  76. package/dist/cli/core/sessionManager.cjs +2 -0
  77. package/dist/cli/core/sessionManager.js +3 -1
  78. package/dist/cli/types.d.ts +18 -0
  79. package/dist/gateway/adapters/teams.cjs +419 -0
  80. package/dist/gateway/adapters/teams.d.ts +47 -0
  81. package/dist/gateway/adapters/teams.js +361 -0
  82. package/dist/gateway/http/sms.cjs +286 -0
  83. package/dist/gateway/http/sms.d.ts +4 -0
  84. package/dist/gateway/http/sms.js +249 -0
  85. package/dist/gateway/server.cjs +54 -3
  86. package/dist/gateway/server.d.ts +2 -0
  87. package/dist/gateway/server.js +54 -3
  88. package/dist/gateway/sms/commands.cjs +116 -0
  89. package/dist/gateway/sms/commands.d.ts +15 -0
  90. package/dist/gateway/sms/commands.js +79 -0
  91. package/dist/gateway/sms/control.cjs +118 -0
  92. package/dist/gateway/sms/control.d.ts +18 -0
  93. package/dist/gateway/sms/control.js +84 -0
  94. package/dist/gateway/sms/policyStore.cjs +198 -0
  95. package/dist/gateway/sms/policyStore.d.ts +37 -0
  96. package/dist/gateway/sms/policyStore.js +161 -0
  97. package/dist/providers/registry.cjs +1 -0
  98. package/dist/providers/registry.js +1 -0
  99. package/dist/tests/cli-config-warnings.test.cjs +41 -0
  100. package/dist/tests/cli-config-warnings.test.js +41 -0
  101. package/dist/tests/cli-init.test.cjs +32 -26
  102. package/dist/tests/cli-init.test.js +32 -26
  103. package/dist/tests/gateway-http-security.test.cjs +21 -0
  104. package/dist/tests/gateway-http-security.test.js +21 -0
  105. package/dist/tests/gateway-origin-policy.test.cjs +22 -0
  106. package/dist/tests/gateway-origin-policy.test.js +22 -0
  107. package/dist/tests/gateway.test.cjs +57 -0
  108. package/dist/tests/gateway.test.js +57 -0
  109. package/dist/tests/imagePersistence.test.cjs +26 -0
  110. package/dist/tests/imagePersistence.test.js +27 -1
  111. package/dist/tests/run-terminal-bench-official-script.test.cjs +61 -0
  112. package/dist/tests/run-terminal-bench-official-script.test.d.ts +1 -0
  113. package/dist/tests/run-terminal-bench-official-script.test.js +55 -0
  114. package/dist/tests/sessions-api.test.cjs +69 -1
  115. package/dist/tests/sessions-api.test.js +70 -2
  116. package/dist/tests/sms-api.test.cjs +183 -0
  117. package/dist/tests/sms-api.test.d.ts +1 -0
  118. package/dist/tests/sms-api.test.js +177 -0
  119. package/dist/tests/sms-commands.test.cjs +90 -0
  120. package/dist/tests/sms-commands.test.d.ts +1 -0
  121. package/dist/tests/sms-commands.test.js +84 -0
  122. package/dist/tests/sms-policy-store.test.cjs +69 -0
  123. package/dist/tests/sms-policy-store.test.d.ts +1 -0
  124. package/dist/tests/sms-policy-store.test.js +63 -0
  125. package/dist/tests/teams-adapter.test.cjs +58 -0
  126. package/dist/tests/teams-adapter.test.d.ts +1 -0
  127. package/dist/tests/teams-adapter.test.js +52 -0
  128. package/dist/tests/terminal-bench-adapters-helpers.test.cjs +64 -0
  129. package/dist/tests/terminal-bench-adapters-helpers.test.d.ts +1 -0
  130. package/dist/tests/terminal-bench-adapters-helpers.test.js +58 -0
  131. package/dist/tests/terminal-bench-cleanup.test.cjs +93 -0
  132. package/dist/tests/terminal-bench-cleanup.test.d.ts +1 -0
  133. package/dist/tests/terminal-bench-cleanup.test.js +87 -0
  134. package/dist/tests/terminal-bench-config.test.cjs +62 -0
  135. package/dist/tests/terminal-bench-config.test.d.ts +1 -0
  136. package/dist/tests/terminal-bench-config.test.js +56 -0
  137. package/dist/tests/terminal-bench-official.test.cjs +194 -0
  138. package/dist/tests/terminal-bench-official.test.d.ts +1 -0
  139. package/dist/tests/terminal-bench-official.test.js +188 -0
  140. package/dist/tests/terminal-bench-runner.test.cjs +82 -0
  141. package/dist/tests/terminal-bench-runner.test.d.ts +1 -0
  142. package/dist/tests/terminal-bench-runner.test.js +76 -0
  143. package/dist/tests/terminal-bench-scoring.test.cjs +128 -0
  144. package/dist/tests/terminal-bench-scoring.test.d.ts +1 -0
  145. package/dist/tests/terminal-bench-scoring.test.js +122 -0
  146. package/dist/tools/mcp-fal-ai.cjs +1 -1
  147. package/dist/tools/mcp-fal-ai.js +1 -1
  148. package/dist/webui/assets/index-Cyg_Hs57.css +11 -0
  149. package/dist/webui/assets/{index-BMekSELC.js → index-DZXLLjaA.js} +109 -109
  150. package/dist/webui/index.html +2 -2
  151. package/package.json +11 -2
  152. package/templates/agents/game-dev/agent.md +122 -63
  153. package/templates/agents/game-dev/art-director.md +106 -0
  154. package/templates/agents/game-dev/game-designer.md +87 -0
  155. package/templates/agents/game-dev/scene-engineer.md +474 -0
  156. package/dist/webui/assets/index-Cwkg4DKj.css +0 -11
  157. package/templates/agents/game-dev/art-generation.md +0 -38
  158. package/templates/agents/game-dev/asset-refinement.md +0 -17
  159. package/templates/agents/game-dev/planning-idea.md +0 -17
  160. package/templates/agents/game-dev/ui-specialist.md +0 -17
@@ -0,0 +1,80 @@
1
+ import { z } from "zod";
2
/**
 * Type of the zod schema used to validate the official benchmark config.
 * `dataset` is the only field that is not wrapped in ZodOptional.
 */
+ declare const officialConfigSchema: z.ZodObject<{
3
+ dataset: z.ZodString;
4
+ taskNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
5
+ registryUrl: z.ZodOptional<z.ZodString>;
6
+ registryPath: z.ZodOptional<z.ZodString>;
7
+ agent: z.ZodOptional<z.ZodString>;
8
+ agentImportPath: z.ZodOptional<z.ZodString>;
9
+ agentKwargs: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
10
+ model: z.ZodOptional<z.ZodString>;
11
+ nConcurrent: z.ZodOptional<z.ZodNumber>;
12
+ nAttempts: z.ZodOptional<z.ZodNumber>;
13
+ nTasks: z.ZodOptional<z.ZodNumber>;
14
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
15
+ extraArgs: z.ZodOptional<z.ZodArray<z.ZodString>>;
16
+ }, z.core.$strip>;
17
/** Config object type inferred from officialConfigSchema. */
+ export type OfficialBenchConfig = z.infer<typeof officialConfigSchema>;
18
/**
 * Optional per-run values that can replace the corresponding config fields.
 * Every member is optional; there is no `extraArgs` member here.
 */
+ export interface OfficialBenchOverrides {
19
+ taskNames?: string[];
20
+ registryUrl?: string;
21
+ registryPath?: string;
22
+ agent?: string;
23
+ agentImportPath?: string;
24
+ agentKwargs?: Record<string, string>;
25
+ model?: string;
26
+ dataset?: string;
27
+ nConcurrent?: number;
28
+ nAttempts?: number;
29
+ nTasks?: number;
30
+ timeoutMs?: number;
31
+ }
32
/**
 * Result record returned by runOfficialTerminalBench: the command that was
 * run, the container runtime used, exit status and timing, the metrics parsed
 * from the run output, and the paths of the files written for the run.
 */
+ export interface OfficialBenchSummary {
33
+ timestamp: string;
34
+ command: {
35
+ binary: string;
36
+ args: string[];
37
+ };
38
+ runtime: {
39
+ containerRuntime: "docker" | "podman";
40
+ };
41
+ exitCode: number;
42
+ timedOut: boolean;
43
+ durationMs: number;
44
+ errorMessage?: string;
45
+ metrics: {
46
+ resolvedTrials?: number;
47
+ unresolvedTrials?: number;
48
+ accuracyPercent?: number;
49
// Keyed by the k captured from a "Pass@k" line of the run output.
+ passAtK: Record<string, number>;
50
+ };
51
+ runOutputPath?: string;
52
+ artifacts: {
53
+ rawStdoutPath: string;
54
+ rawStderrPath: string;
55
+ summaryPath: string;
56
+ };
57
+ }
58
/** Exported function signatures of the official benchmark wrapper module. */
+ export declare function extractTaskNamesFromArgs(args: string[]): string[];
59
+ export declare function normalizeHarborFailureMessage(params: {
60
+ rawMessage: string | undefined;
61
+ args: string[];
62
+ dataset: string;
63
+ }): string | undefined;
64
+ export declare function isMissingComposeProviderError(output: string): boolean;
65
+ export declare function isPodmanBackedDockerVersionOutput(output: string): boolean;
66
// Returns the metrics object extended with an optional runOutputPath.
+ export declare function parseHarborRunOutput(output: string): OfficialBenchSummary["metrics"] & {
67
+ runOutputPath?: string;
68
+ };
69
+ export declare function extractHarborErrorMessage(stderr: string): string | undefined;
70
+ export declare function buildHarborRunArgs(config: OfficialBenchConfig, overrides: OfficialBenchOverrides): string[];
71
+ export declare function loadOfficialBenchConfig(configPath: string): Promise<OfficialBenchConfig>;
72
+ export declare function createDockerShimScript(targetBinary: string): string;
73
+ export declare function buildRuntimePathEnv(shimDir: string, basePath?: string): string;
74
+ export declare function buildPythonPathEnv(pathToAdd: string, basePythonPath?: string): string;
75
+ export declare function parseDockerHostCandidate(value: string | undefined): string | undefined;
76
+ export declare function runOfficialTerminalBench(options: {
77
+ configPath: string;
78
+ overrides: OfficialBenchOverrides;
79
+ }): Promise<OfficialBenchSummary>;
80
+ export {};
@@ -0,0 +1,546 @@
1
+ import { chmod, mkdir, writeFile } from "node:fs/promises";
2
+ import { delimiter, join, resolve } from "node:path";
3
+ import { z } from "zod";
4
+ import { runCommand } from "./process.js";
5
/**
 * Validation schema for the official benchmark config file.
 * `dataset` is required; all other fields are optional, and the numeric
 * fields must be positive integers when present.
 */
+ const officialConfigSchema = z.object({
6
+ dataset: z.string().min(1),
7
+ taskNames: z.array(z.string().min(1)).optional(),
8
+ registryUrl: z.string().min(1).optional(),
9
+ registryPath: z.string().min(1).optional(),
10
+ agent: z.string().min(1).optional(),
11
+ agentImportPath: z.string().min(1).optional(),
12
+ agentKwargs: z.record(z.string(), z.string()).optional(),
13
// NOTE(review): unlike the other string fields, model has no min(1), so an empty string validates.
+ model: z.string().optional(),
14
+ nConcurrent: z.number().int().positive().optional(),
15
+ nAttempts: z.number().int().positive().optional(),
16
+ nTasks: z.number().int().positive().optional(),
17
+ timeoutMs: z.number().int().positive().optional(),
18
+ extraArgs: z.array(z.string()).optional()
19
+ });
20
/**
 * Collects every task name passed as `--task-name <name>` or
 * `--task-name=<name>`.
 *
 * Empty values are ignored in both forms, so a bare `--task-name=` does not
 * produce an empty task name.
 *
 * @param {string[]} args - Command-line arguments to scan.
 * @returns {string[]} Task names in the order they appear.
 */
function extractTaskNamesFromArgs(args) {
    const flag = "--task-name";
    const inlinePrefix = `${flag}=`;
    const names = [];
    for (let i = 0; i < args.length; i += 1) {
        const arg = args[i];
        if (flag === arg) {
            // Only consume the next token as the value when it is non-empty.
            if (args[i + 1]) {
                names.push(args[i + 1]);
                i += 1;
            }
            continue;
        }
        if (arg.startsWith(inlinePrefix)) {
            const inlineValue = arg.slice(inlinePrefix.length);
            if (inlineValue) names.push(inlineValue);
        }
    }
    return names;
}
33
/**
 * Rewrites the "Either datasets or tasks must be provided" ValueError into a
 * message naming the requested tasks; any other message is returned as-is.
 *
 * @param {{rawMessage: (string|undefined), args: string[], dataset: string}} params
 * @returns {(string|undefined)} The rewritten message, or `rawMessage` unchanged.
 */
function normalizeHarborFailureMessage(params) {
    const { rawMessage, args, dataset } = params;
    const missingSelectionError = "ValueError: Either datasets or tasks must be provided.";
    if (rawMessage !== missingSelectionError) {
        return rawMessage;
    }
    const requestedTasks = extractTaskNamesFromArgs(args);
    if (requestedTasks.length === 0) {
        return rawMessage;
    }
    const quotedTasks = requestedTasks.map((taskName) => `"${taskName}"`).join(", ");
    return `No tasks matched ${quotedTasks} in dataset "${dataset}". Verify task ids for Terminal-Bench 2.0.`;
}
40
/**
 * Removes ANSI CSI escape sequences (ESC `[` ... letter) from a string.
 *
 * @param {string} value - Text that may contain escape sequences.
 * @returns {string} The text without those sequences.
 */
function stripAnsi(value) {
    const ansiSequence = /\u001B\[[0-9;]*[A-Za-z]/g;
    return value.replace(ansiSequence, "");
}
43
/**
 * Reads the number at the end of a line, allowing a trailing `%` and a
 * trailing table border character.
 *
 * @param {string} line - One line of output.
 * @returns {(number|undefined)} The parsed finite number, or undefined when
 *   the line does not end in a number.
 */
function parseMetricNumber(line) {
    const trailingNumber = line.match(/(-?\d+(?:\.\d+)?)(?:\s*%?)\s*[│|]?\s*$/);
    if (null === trailingNumber) {
        return;
    }
    const parsed = Number.parseFloat(trailingNumber[1]);
    if (!Number.isFinite(parsed)) {
        return;
    }
    return parsed;
}
49
/**
 * Tells whether command output reports that no compose provider binary could
 * be found. Matching is case-insensitive and ignores ANSI escape sequences.
 *
 * @param {string} output - Combined command output.
 * @returns {boolean} True when one of the known error phrases is present.
 */
function isMissingComposeProviderError(output) {
    const haystack = stripAnsi(output).toLowerCase();
    const knownPhrases = [
        "looking up compose provider failed",
        'exec: "docker-compose": executable file not found',
        'exec: "podman-compose": executable file not found'
    ];
    return knownPhrases.some((phrase) => haystack.includes(phrase));
}
53
/**
 * Tells whether `docker --version` style output mentions podman.
 * Matching is case-insensitive and ignores ANSI escape sequences.
 *
 * @param {string} output - Version command output.
 * @returns {boolean} True when a podman marker is present.
 */
function isPodmanBackedDockerVersionOutput(output) {
    const haystack = stripAnsi(output).toLowerCase();
    const podmanMarkers = ["podman", "emulate docker cli using podman"];
    return podmanMarkers.some((marker) => haystack.includes(marker));
}
57
/**
 * Extracts run metrics and the results path from Harbor's console output.
 *
 * Lines are matched by label ("Resolved Trials", "Unresolved Trials",
 * "Accuracy", "Pass@k") and the number at the end of the line is taken as the
 * value. A labelled line without a trailing number is ignored, so it cannot
 * erase a value parsed from an earlier line.
 *
 * @param {string} output - Combined stdout/stderr of the run.
 * @returns {{resolvedTrials: (number|undefined), unresolvedTrials: (number|undefined),
 *   accuracyPercent: (number|undefined), passAtK: Object<string, number>,
 *   runOutputPath: (string|undefined)}} Parsed values; fields not found stay undefined.
 */
function parseHarborRunOutput(output) {
    const lines = stripAnsi(output).split(/\r?\n/);
    let resolvedTrials;
    let unresolvedTrials;
    let accuracyPercent;
    const passAtK = {};
    let runOutputPath;
    for (const rawLine of lines) {
        const line = rawLine.trim();
        if (!line) continue;
        if (line.includes("Resolved Trials")) {
            // Keep the previous value when this line carries no number.
            resolvedTrials = parseMetricNumber(line) ?? resolvedTrials;
            continue;
        }
        if (line.includes("Unresolved Trials")) {
            unresolvedTrials = parseMetricNumber(line) ?? unresolvedTrials;
            continue;
        }
        if (line.includes("Accuracy")) {
            accuracyPercent = parseMetricNumber(line) ?? accuracyPercent;
            continue;
        }
        const passAtKMatch = line.match(/Pass@(\d+)/i);
        if (passAtKMatch) {
            const value = parseMetricNumber(line);
            if (void 0 !== value) passAtK[passAtKMatch[1]] = value;
            continue;
        }
        const pathMatch = line.match(/results written to\s+(.+)$/i) || line.match(/results saved to\s+(.+)$/i) || line.match(/output written to\s+(.+)$/i);
        if (pathMatch?.[1]) runOutputPath = pathMatch[1].trim();
    }
    return {
        resolvedTrials,
        unresolvedTrials,
        accuracyPercent,
        passAtK,
        runOutputPath
    };
}
97
/**
 * Picks the most useful single-line error message out of Harbor's output.
 *
 * Priority: the last "ValueError: Error getting dataset" line, then the last
 * "ConnectError:" line, then a fixed hint for DNS resolution failures, then
 * the last line starting with "ValueError:", then the last line containing
 * any "<Name>Error:", and finally the last non-empty line.
 *
 * @param {string} stderr - Text to search (ANSI escape sequences are ignored).
 * @returns {(string|undefined)} The selected message, or undefined when the
 *   text has no non-empty lines.
 */
function extractHarborErrorMessage(stderr) {
    const normalized = stripAnsi(stderr);
    const datasetErrors = normalized.match(/ValueError: Error getting dataset[^\n]*/g);
    if (datasetErrors) return datasetErrors.at(-1);
    const connectErrors = normalized.match(/ConnectError:[^\n]*/g);
    if (connectErrors) return connectErrors.at(-1);
    // Resolver messages differ in capitalisation between platforms (e.g.
    // "Temporary failure in name resolution"), so compare case-insensitively.
    const lowered = normalized.toLowerCase();
    if (lowered.includes("nodename nor servname provided") || lowered.includes("temporary failure in name resolution")) return "Harbor registry lookup failed due DNS/network error. Verify internet access or pass --registry-url/--registry-path.";
    const lines = normalized.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    if (0 === lines.length) return;
    for (let i = lines.length - 1; i >= 0; i -= 1) {
        if (lines[i].startsWith("ValueError:")) return lines[i];
    }
    for (let i = lines.length - 1; i >= 0; i -= 1) {
        if (/\w+Error:/.test(lines[i])) return lines[i];
    }
    return lines.at(-1);
}
108
/**
 * Builds the argument list for `harbor run` from a config plus overrides.
 *
 * For scalar options a truthy override replaces the config value. Agent
 * kwargs are merged with overrides winning. `taskNames` uses the override
 * whenever it is not null/undefined. `extraArgs` come from the config only and
 * are appended last. When an agent import path is selected, `--agent` is
 * omitted and the `model_name` kwarg is not forwarded.
 *
 * @param {object} config - Parsed benchmark config.
 * @param {object} overrides - Per-run overrides.
 * @returns {string[]} Arguments, starting with "run".
 */
function buildHarborRunArgs(config, overrides) {
    const pick = (key) => overrides[key] || config[key];
    const importPath = pick("agentImportPath");
    const args = ["run", "--dataset", pick("dataset")];

    for (const [flag, key] of [["--registry-url", "registryUrl"], ["--registry-path", "registryPath"]]) {
        const chosen = pick(key);
        if (chosen) args.push(flag, chosen);
    }

    if (importPath) {
        args.push("--agent-import-path", importPath);
    } else {
        args.push("--agent", pick("agent") || "oracle");
    }

    const model = pick("model");
    if (model) args.push("--model", model);

    for (const [flag, key] of [["--n-concurrent", "nConcurrent"], ["--n-attempts", "nAttempts"], ["--n-tasks", "nTasks"]]) {
        const count = pick(key);
        if (count) args.push(flag, String(count));
    }

    const mergedKwargs = {
        ...config.agentKwargs || {},
        ...overrides.agentKwargs || {}
    };
    for (const [key, value] of Object.entries(mergedKwargs)) {
        // model_name is dropped when a custom agent import path is in use.
        if (importPath && "model_name" === key) continue;
        args.push("--agent-kwarg", `${key}=${value}`);
    }

    const selectedTasks = overrides.taskNames ?? config.taskNames ?? [];
    for (const taskName of selectedTasks) {
        args.push("--task-name", taskName);
    }

    if (config.extraArgs && config.extraArgs.length > 0) {
        args.push(...config.extraArgs);
    }
    return args;
}
141
/**
 * Reads a JSON config file and validates it against officialConfigSchema.
 *
 * @param {string} configPath - Path to the config file; resolved to an absolute path.
 * @returns {Promise<object>} The validated config object.
 */
async function loadOfficialBenchConfig(configPath) {
    const absolutePath = resolve(configPath);
    const rawJson = await Bun.file(absolutePath).text();
    const candidate = JSON.parse(rawJson);
    return officialConfigSchema.parse(candidate);
}
146
/**
 * Resolves the path of a binary that must exist, using `command -v` in a
 * login shell.
 *
 * @param {string} name - Binary name to look up.
 * @returns {Promise<string>} The last non-empty line printed by `command -v`.
 * @throws {Error} When the lookup exits non-zero or prints no path. The
 *   message names the binary that was actually requested.
 */
async function resolveRequiredBinary(name) {
    const check = await runCommand("sh", [
        "-lc",
        `command -v ${name}`
    ], {
        cwd: process.cwd(),
        timeoutMs: 5000
    });
    if (0 !== check.exitCode) {
        // Keep the Harbor-specific install hint, but do not claim "harbor" is
        // missing when some other binary was requested.
        const hint = "harbor" === name
            ? "Install Harbor CLI and verify with `harbor --help`."
            : `Install ${name} and verify with \`command -v ${name}\`.`;
        throw new Error(`${name} is not installed or not on PATH. ${hint}`);
    }
    const resolvedPath = check.stdout.trim().split(/\r?\n/).at(-1)?.trim();
    if (!resolvedPath) throw new Error(`Unable to resolve ${name} binary path.`);
    return resolvedPath;
}
159
/**
 * Looks up an optional binary with `command -v` in a login shell.
 *
 * @param {string} name - Binary name to look up.
 * @returns {Promise<(string|null)>} The last line printed by the lookup, or
 *   null when the lookup exits non-zero or prints nothing.
 */
async function resolveBinary(name) {
    const lookup = await runCommand("sh", ["-lc", `command -v ${name}`], {
        cwd: process.cwd(),
        timeoutMs: 5000
    });
    if (lookup.exitCode !== 0) {
        return null;
    }
    const lastLine = lookup.stdout.trim().split(/\r?\n/).at(-1);
    const binaryPath = lastLine?.trim();
    return binaryPath ? binaryPath : null;
}
171
/**
 * Wraps a value in single quotes for a POSIX shell, rewriting each embedded
 * single quote as `'"'"'`.
 *
 * @param {string} value - Raw text.
 * @returns {string} The single-quoted text.
 */
function shellQuote(value) {
    const escaped = value.split("'").join("'\"'\"'");
    return `'${escaped}'`;
}
174
/**
 * Returns the source text of a bash script meant to be installed under the
 * name `docker`.
 *
 * When the script's first argument is `compose` and `podman-compose` is on
 * PATH, it records the -p, -f and --project-directory options, changes into
 * the project directory if one was given, runs `compose cp` as `podman cp`
 * and `compose exec` as `podman exec` (tty/interactive flags are dropped),
 * and passes any other compose invocation to `podman-compose`. Every other
 * invocation is exec'd against TARGET_BINARY with the original arguments.
 *
 * @param {string} targetBinary - Path embedded, shell-quoted, as TARGET_BINARY.
 * @returns {string} The script source.
 */
+ function createDockerShimScript(targetBinary) {
175
+ return `#!/bin/bash
176
+ set -e
177
+ TARGET_BINARY=${shellQuote(targetBinary)}
178
+
179
+ if [[ "$1" == "compose" ]] && command -v podman-compose >/dev/null 2>&1; then
180
+ shift
181
+ PROJECT_DIR=""
182
+ PROJECT_NAME=""
183
+ COMPOSE_FILES=()
184
+ TRANSLATED_ARGS=()
185
+
186
+ while [[ $# -gt 0 ]]; do
187
+ case "$1" in
188
+ -p)
189
+ PROJECT_NAME="$2"
190
+ TRANSLATED_ARGS+=("$1" "$2")
191
+ shift 2
192
+ ;;
193
+ -p=*)
194
+ PROJECT_NAME="\${1#*=}"
195
+ TRANSLATED_ARGS+=("$1")
196
+ shift
197
+ ;;
198
+ -f)
199
+ COMPOSE_FILES+=("$2")
200
+ TRANSLATED_ARGS+=("$1" "$2")
201
+ shift 2
202
+ ;;
203
+ -f=*)
204
+ COMPOSE_FILES+=("\${1#*=}")
205
+ TRANSLATED_ARGS+=("$1")
206
+ shift
207
+ ;;
208
+ --project-directory)
209
+ PROJECT_DIR="$2"
210
+ shift 2
211
+ ;;
212
+ --project-directory=*)
213
+ PROJECT_DIR="\${1#*=}"
214
+ shift
215
+ ;;
216
+ *)
217
+ TRANSLATED_ARGS+=("$1")
218
+ shift
219
+ ;;
220
+ esac
221
+ done
222
+
223
+ if [[ -n "$PROJECT_DIR" ]]; then
224
+ cd "$PROJECT_DIR"
225
+ fi
226
+
227
+ resolve_container_id() {
228
+ local service="$1"
229
+ local container_id=""
230
+
231
+ if [[ -n "$PROJECT_NAME" ]]; then
232
+ container_id=$(podman ps -a \
233
+ --filter "label=com.docker.compose.project=$PROJECT_NAME" \
234
+ --filter "label=com.docker.compose.service=$service" \
235
+ --format "{{.ID}}" | head -n 1 || true)
236
+ fi
237
+
238
+ if [[ -z "$container_id" && -n "$PROJECT_NAME" ]]; then
239
+ local c1="\${PROJECT_NAME}_\${service}_1"
240
+ local c2="\${PROJECT_NAME}-\${service}-1"
241
+ if podman container exists "$c1" >/dev/null 2>&1; then
242
+ container_id="$c1"
243
+ elif podman container exists "$c2" >/dev/null 2>&1; then
244
+ container_id="$c2"
245
+ fi
246
+ fi
247
+
248
+ echo "$container_id"
249
+ }
250
+
251
+ translate_cp_endpoint() {
252
+ local endpoint="$1"
253
+ if [[ "$endpoint" == *:* ]]; then
254
+ local service="\${endpoint%%:*}"
255
+ local inner_path="\${endpoint#*:}"
256
+ local container_id
257
+ container_id=$(resolve_container_id "$service")
258
+ if [[ -z "$container_id" ]]; then
259
+ echo "docker shim: unable to resolve container for service '$service' (project '$PROJECT_NAME')" >&2
260
+ exit 2
261
+ fi
262
+ echo "$container_id:$inner_path"
263
+ return
264
+ fi
265
+ echo "$endpoint"
266
+ }
267
+
268
+ # podman-compose does not implement compose cp, so map it directly to podman cp.
269
+ if [[ "\${#TRANSLATED_ARGS[@]}" -gt 0 ]]; then
270
+ for i in "\${!TRANSLATED_ARGS[@]}"; do
271
+ if [[ "\${TRANSLATED_ARGS[$i]}" == "cp" ]]; then
272
+ cp_index="$i"
273
+ src_idx=$((cp_index + 1))
274
+ dst_idx=$((cp_index + 2))
275
+ if [[ -z "\${TRANSLATED_ARGS[$src_idx]:-}" || -z "\${TRANSLATED_ARGS[$dst_idx]:-}" ]]; then
276
+ echo "docker shim: compose cp requires source and destination" >&2
277
+ exit 2
278
+ fi
279
+ src=""
280
+ dst=""
281
+ src=$(translate_cp_endpoint "\${TRANSLATED_ARGS[$src_idx]}")
282
+ dst=$(translate_cp_endpoint "\${TRANSLATED_ARGS[$dst_idx]}")
283
+ exec podman cp "$src" "$dst"
284
+ fi
285
+
286
+ if [[ "\${TRANSLATED_ARGS[$i]}" == "exec" ]]; then
287
+ exec_idx="$i"
288
+ j=$((exec_idx + 1))
289
+ PODMAN_EXEC_ARGS=()
290
+
291
+ while [[ $j -lt \${#TRANSLATED_ARGS[@]} ]]; do
292
+ tok="\${TRANSLATED_ARGS[$j]}"
293
+ case "$tok" in
294
+ -it|-ti|-i|-t|--interactive|--tty)
295
+ # Skip compose tty/interactive flags to avoid non-tty failures.
296
+ j=$((j + 1))
297
+ ;;
298
+ -w|--workdir|-e|--env)
299
+ if [[ $((j + 1)) -ge \${#TRANSLATED_ARGS[@]} ]]; then
300
+ echo "docker shim: missing value for $tok in compose exec" >&2
301
+ exit 2
302
+ fi
303
+ PODMAN_EXEC_ARGS+=("$tok" "\${TRANSLATED_ARGS[$((j + 1))]}")
304
+ j=$((j + 2))
305
+ ;;
306
+ -w=*|--workdir=*|-e=*|--env=*)
307
+ PODMAN_EXEC_ARGS+=("$tok")
308
+ j=$((j + 1))
309
+ ;;
310
+ --)
311
+ j=$((j + 1))
312
+ break
313
+ ;;
314
+ -*)
315
+ PODMAN_EXEC_ARGS+=("$tok")
316
+ j=$((j + 1))
317
+ ;;
318
+ *)
319
+ service="$tok"
320
+ j=$((j + 1))
321
+ break
322
+ ;;
323
+ esac
324
+ done
325
+
326
+ if [[ -z "\${service:-}" ]]; then
327
+ echo "docker shim: compose exec missing service name" >&2
328
+ exit 2
329
+ fi
330
+
331
+ container_id=$(resolve_container_id "$service")
332
+ if [[ -z "$container_id" ]]; then
333
+ echo "docker shim: unable to resolve container for service '$service' (project '$PROJECT_NAME')" >&2
334
+ exit 2
335
+ fi
336
+
337
+ REMAINDER=("\${TRANSLATED_ARGS[@]:$j}")
338
+ if [[ \${#REMAINDER[@]} -eq 0 ]]; then
339
+ echo "docker shim: compose exec missing command" >&2
340
+ exit 2
341
+ fi
342
+ exec podman exec "\${PODMAN_EXEC_ARGS[@]}" "$container_id" "\${REMAINDER[@]}"
343
+ fi
344
+ done
345
+ fi
346
+
347
+ exec podman-compose "\${TRANSLATED_ARGS[@]}"
348
+ fi
349
+
350
+ exec "$TARGET_BINARY" "$@"
351
+ `;
352
+ }
353
/**
 * Builds a PATH value with the shim directory placed first.
 *
 * Entries are joined with the platform's path-list separator
 * (`path.delimiter`), matching buildPythonPathEnv in this module.
 *
 * @param {string} shimDir - Directory to put at the front of PATH.
 * @param {string} [basePath] - Existing PATH; defaults to `process.env.PATH`, or "" when unset.
 * @returns {string} `shimDir` alone when `basePath` is empty, otherwise both joined.
 */
function buildRuntimePathEnv(shimDir, basePath = process.env.PATH || "") {
    return basePath ? `${shimDir}${delimiter}${basePath}` : shimDir;
}
356
/**
 * Builds a PYTHONPATH value with `pathToAdd` placed first.
 *
 * @param {string} pathToAdd - Directory to put at the front.
 * @param {string} [basePythonPath] - Existing value; defaults to `process.env.PYTHONPATH`, or "" when unset.
 * @returns {string} `pathToAdd` alone when the base is empty, otherwise both
 *   joined with the platform path-list delimiter.
 */
function buildPythonPathEnv(pathToAdd, basePythonPath = process.env.PYTHONPATH || "") {
    if (!basePythonPath) {
        return pathToAdd;
    }
    return `${pathToAdd}${delimiter}${basePythonPath}`;
}
359
/**
 * Returns the last line of a string that is non-empty after trimming.
 *
 * @param {string} value - Multi-line text.
 * @returns {(string|undefined)} The trimmed line, or undefined when every line is blank.
 */
function parseLastNonEmptyLine(value) {
    const candidates = value.split(/\r?\n/);
    for (let index = candidates.length - 1; index >= 0; index -= 1) {
        const trimmed = candidates[index].trim();
        if (trimmed) {
            return trimmed;
        }
    }
    return undefined;
}
363
/**
 * Cleans up a Docker host value: trims it, strips one leading and one
 * trailing quote character, and prefixes absolute paths with `unix://`.
 * Values that already start with a `scheme://` are returned as they are.
 *
 * @param {string} value - Raw host value.
 * @returns {string} The cleaned-up value.
 */
function normalizeDockerHost(value) {
    const unquoted = value.trim().replace(/^['"]|['"]$/g, "");
    const hasScheme = /^[a-zA-Z]+:\/\//.test(unquoted);
    if (!hasScheme && unquoted.startsWith("/")) {
        return `unix://${unquoted}`;
    }
    return unquoted;
}
369
/**
 * Turns a raw value into a usable Docker host, or nothing.
 *
 * @param {(string|undefined)} value - Raw candidate value.
 * @returns {(string|undefined)} The normalized host, or undefined when the
 *   input is falsy or normalizes to an empty or placeholder value
 *   (`null`, `<nil>`, `<no value>`, `[]`).
 */
function parseDockerHostCandidate(value) {
    if (!value) {
        return;
    }
    const placeholders = ["", "null", "<nil>", "<no value>", "[]"];
    const host = normalizeDockerHost(value);
    return placeholders.includes(host) ? void 0 : host;
}
375
/**
 * Works out a DOCKER_HOST value for podman.
 *
 * Sources are tried in order and the first usable one wins: the existing
 * `DOCKER_HOST` environment variable, `podman machine inspect`,
 * `podman machine env`, and finally `podman info`.
 *
 * @returns {Promise<(string|undefined)>} The host value, or undefined when no
 *   source yields one.
 */
async function resolvePodmanDockerHost() {
    const fromEnvironment = parseDockerHostCandidate(process.env.DOCKER_HOST);
    if (fromEnvironment) {
        return fromEnvironment;
    }

    const runPodman = (podmanArgs) => runCommand("podman", podmanArgs, {
        cwd: process.cwd(),
        timeoutMs: 5000
    });

    const inspect = await runPodman(["machine", "inspect", "--format", "{{.ConnectionInfo.PodmanSocket.Path}}"]);
    if (inspect.exitCode === 0) {
        const fromInspect = parseDockerHostCandidate(parseLastNonEmptyLine(inspect.stdout));
        if (fromInspect) {
            return fromInspect;
        }
    }

    const machineEnv = await runPodman(["machine", "env"]);
    if (machineEnv.exitCode === 0) {
        const assignment = machineEnv.stdout.match(/DOCKER_HOST=(['"]?)([^'"\n]+)\1/);
        const fromMachineEnv = parseDockerHostCandidate(assignment?.[2]);
        if (fromMachineEnv) {
            return fromMachineEnv;
        }
    }

    const info = await runPodman(["info", "--format", "{{.Host.RemoteSocket.Path}}"]);
    if (info.exitCode !== 0) {
        return;
    }
    return parseDockerHostCandidate(parseLastNonEmptyLine(info.stdout));
}
413
/**
 * Writes the `docker` shim script into `<wrapperOutputDir>/runtime-bin`,
 * makes it executable, and returns the podman runtime description whose env
 * puts that directory first on PATH (plus DOCKER_HOST when one is resolved).
 */
async function createPodmanShimRuntime(wrapperOutputDir, podmanBinary) {
    const shimDir = join(wrapperOutputDir, "runtime-bin");
    const shimPath = join(shimDir, "docker");
    await mkdir(shimDir, {
        recursive: true
    });
    await writeFile(shimPath, createDockerShimScript(podmanBinary), "utf-8");
    // rwxr-xr-x (decimal 493).
    await chmod(shimPath, 0o755);
    const runtimeEnv = {
        PATH: buildRuntimePathEnv(shimDir)
    };
    const podmanDockerHost = await resolvePodmanDockerHost();
    if (podmanDockerHost) runtimeEnv.DOCKER_HOST = podmanDockerHost;
    return {
        containerRuntime: "podman",
        env: runtimeEnv
    };
}

/**
 * Decides which container runtime to use and prepares its environment.
 *
 * - A `docker` binary that does not look podman-backed, or one found while no
 *   `podman` binary exists, yields `{ containerRuntime: "docker" }`.
 * - A podman-backed `docker`, or podman alone, yields a podman runtime with a
 *   shim script written under `wrapperOutputDir`.
 *
 * @param {string} wrapperOutputDir - Directory in which the shim is created.
 * @returns {Promise<{containerRuntime: string, env: (object|undefined)}>}
 * @throws {Error} When neither docker nor podman can be found.
 */
async function resolveContainerRuntime(wrapperOutputDir) {
    const dockerBinary = await resolveBinary("docker");
    const podmanBinary = await resolveBinary("podman");
    if (dockerBinary) {
        const dockerVersionCheck = await runCommand(dockerBinary, [
            "--version"
        ], {
            cwd: process.cwd(),
            timeoutMs: 5000
        });
        const dockerVersionOutput = `${dockerVersionCheck.stdout}\n${dockerVersionCheck.stderr}`;
        const dockerLooksPodman = dockerBinary.toLowerCase().includes("podman") || isPodmanBackedDockerVersionOutput(dockerVersionOutput);
        if (!dockerLooksPodman || !podmanBinary) return {
            containerRuntime: "docker"
        };
        return createPodmanShimRuntime(wrapperOutputDir, podmanBinary);
    }
    if (!podmanBinary) throw new Error("Neither docker nor podman is installed or on PATH. Install Docker Desktop or Podman, then retry.");
    return createPodmanShimRuntime(wrapperOutputDir, podmanBinary);
}
463
/**
 * For a podman runtime, runs `docker compose version` with the runtime env
 * and fails when the output reports a missing compose provider. Other
 * failures of that command are ignored. Does nothing for non-podman runtimes.
 *
 * @param {{containerRuntime: string, env: (object|undefined)}} runtime
 * @returns {Promise<void>}
 * @throws {Error} When the compose provider is reported missing.
 */
async function ensureComposeAvailableForPodman(runtime) {
    if (runtime.containerRuntime !== "podman") {
        return;
    }
    const probe = await runCommand("docker", ["compose", "version"], {
        cwd: process.cwd(),
        timeoutMs: 10000,
        env: runtime.env
    });
    if (probe.exitCode === 0) {
        return;
    }
    const providerMissing = isMissingComposeProviderError(`${probe.stdout}\n${probe.stderr}`);
    if (!providerMissing) {
        return;
    }
    throw new Error("Podman compose provider is missing. Install `podman-compose` (e.g. `uv tool install podman-compose`) or `docker-compose`, then verify with `docker compose version`.");
}
477
/**
 * Creates a run identifier from the current UTC time: the ISO-8601 timestamp
 * with every `:` and `.` replaced by `-`.
 *
 * @returns {string} For example `2024-01-02T03-04-05-678Z`.
 */
function createRunId() {
    const isoNow = new Date().toISOString();
    return isoNow.split(/[:.]/).join("-");
}
480
/**
 * Runs the Harbor benchmark CLI and records the outcome.
 *
 * Steps: load the config, locate `harbor`, create a per-run directory under
 * `bench/results/official-wrapper/<runId>` in the current working directory,
 * pick the container runtime, check compose availability for podman, run
 * `harbor` with the built arguments, parse its output, and write the raw
 * stdout, raw stderr and a JSON summary into the run directory.
 *
 * @param {{configPath: string, overrides: object}} options
 * @returns {Promise<object>} The summary that was written to `summary.json`.
 */
async function runOfficialTerminalBench(options) {
    const config = await loadOfficialBenchConfig(options.configPath);
    const { overrides } = options;
    const usesImportedAgent = overrides.agentImportPath || config.agentImportPath;
    const harborBinary = await resolveRequiredBinary("harbor");

    const wrapperOutputDir = join(process.cwd(), "bench", "results", "official-wrapper", createRunId());
    await mkdir(wrapperOutputDir, {
        recursive: true
    });

    const runtime = await resolveContainerRuntime(wrapperOutputDir);
    await ensureComposeAvailableForPodman(runtime);

    const args = buildHarborRunArgs(config, overrides);
    const timeoutMs = overrides.timeoutMs || config.timeoutMs || 3600000;
    const env = {
        ...runtime.env || {}
    };
    // A custom agent is imported from the working directory, so expose it to Python.
    if (usesImportedAgent) {
        env.PYTHONPATH = buildPythonPathEnv(process.cwd(), env.PYTHONPATH);
    }

    const result = await runCommand(harborBinary, args, {
        cwd: process.cwd(),
        timeoutMs,
        env
    });

    const parsed = parseHarborRunOutput(`${result.stdout}\n${result.stderr}`);
    const artifacts = {
        rawStdoutPath: join(wrapperOutputDir, "harbor.stdout.log"),
        rawStderrPath: join(wrapperOutputDir, "harbor.stderr.log"),
        summaryPath: join(wrapperOutputDir, "summary.json")
    };

    const summary = {
        timestamp: new Date().toISOString(),
        command: {
            binary: harborBinary,
            args
        },
        runtime: {
            ...runtime,
            env
        },
        exitCode: result.exitCode,
        timedOut: result.timedOut,
        durationMs: result.durationMs,
        errorMessage: normalizeHarborFailureMessage({
            rawMessage: result.exitCode !== 0 ? extractHarborErrorMessage(`${result.stderr}\n${result.stdout}`) : void 0,
            args,
            dataset: overrides.dataset || config.dataset
        }),
        metrics: {
            resolvedTrials: parsed.resolvedTrials,
            unresolvedTrials: parsed.unresolvedTrials,
            accuracyPercent: parsed.accuracyPercent,
            passAtK: parsed.passAtK
        },
        runOutputPath: parsed.runOutputPath,
        artifacts
    };

    await Promise.all([
        writeFile(artifacts.rawStdoutPath, result.stdout, "utf-8"),
        writeFile(artifacts.rawStderrPath, result.stderr, "utf-8"),
        writeFile(artifacts.summaryPath, `${JSON.stringify(summary, null, 2)}\n`, "utf-8")
    ]);
    return summary;
}
546
+ export { buildHarborRunArgs, buildPythonPathEnv, buildRuntimePathEnv, createDockerShimScript, extractHarborErrorMessage, extractTaskNamesFromArgs, isMissingComposeProviderError, isPodmanBackedDockerVersionOutput, loadOfficialBenchConfig, normalizeHarborFailureMessage, parseDockerHostCandidate, parseHarborRunOutput, runOfficialTerminalBench };