@wingman-ai/gateway 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -0
- package/dist/agent/config/mcpClientManager.cjs +104 -1
- package/dist/agent/config/mcpClientManager.d.ts +30 -0
- package/dist/agent/config/mcpClientManager.js +104 -1
- package/dist/agent/config/modelFactory.cjs +10 -0
- package/dist/agent/config/modelFactory.js +10 -0
- package/dist/agent/config/xaiImageModel.cjs +242 -0
- package/dist/agent/config/xaiImageModel.d.ts +33 -0
- package/dist/agent/config/xaiImageModel.js +202 -0
- package/dist/agent/tests/mcpClientManager.test.cjs +116 -0
- package/dist/agent/tests/mcpClientManager.test.js +117 -1
- package/dist/agent/tests/mcpResourceTools.test.cjs +101 -0
- package/dist/agent/tests/mcpResourceTools.test.d.ts +1 -0
- package/dist/agent/tests/mcpResourceTools.test.js +95 -0
- package/dist/agent/tests/modelFactory.test.cjs +16 -2
- package/dist/agent/tests/modelFactory.test.js +16 -2
- package/dist/agent/tests/xaiImageModel.test.cjs +194 -0
- package/dist/agent/tests/xaiImageModel.test.d.ts +1 -0
- package/dist/agent/tests/xaiImageModel.test.js +188 -0
- package/dist/agent/tools/mcp_resources.cjs +111 -0
- package/dist/agent/tools/mcp_resources.d.ts +3 -0
- package/dist/agent/tools/mcp_resources.js +77 -0
- package/dist/bench/adapters/commandAdapter.cjs +93 -0
- package/dist/bench/adapters/commandAdapter.d.ts +6 -0
- package/dist/bench/adapters/commandAdapter.js +59 -0
- package/dist/bench/adapters/helpers.cjs +170 -0
- package/dist/bench/adapters/helpers.d.ts +7 -0
- package/dist/bench/adapters/helpers.js +133 -0
- package/dist/bench/adapters/index.cjs +41 -0
- package/dist/bench/adapters/index.d.ts +2 -0
- package/dist/bench/adapters/index.js +7 -0
- package/dist/bench/adapters/wingmanCliAdapter.cjs +100 -0
- package/dist/bench/adapters/wingmanCliAdapter.d.ts +6 -0
- package/dist/bench/adapters/wingmanCliAdapter.js +66 -0
- package/dist/bench/cleanup.cjs +122 -0
- package/dist/bench/cleanup.d.ts +9 -0
- package/dist/bench/cleanup.js +85 -0
- package/dist/bench/config.cjs +190 -0
- package/dist/bench/config.d.ts +2 -0
- package/dist/bench/config.js +156 -0
- package/dist/bench/index.cjs +43 -0
- package/dist/bench/index.d.ts +3 -0
- package/dist/bench/index.js +3 -0
- package/dist/bench/official.cjs +616 -0
- package/dist/bench/official.d.ts +80 -0
- package/dist/bench/official.js +546 -0
- package/dist/bench/officialCli.cjs +204 -0
- package/dist/bench/officialCli.d.ts +5 -0
- package/dist/bench/officialCli.js +170 -0
- package/dist/bench/process.cjs +78 -0
- package/dist/bench/process.d.ts +14 -0
- package/dist/bench/process.js +44 -0
- package/dist/bench/runner.cjs +237 -0
- package/dist/bench/runner.d.ts +7 -0
- package/dist/bench/runner.js +197 -0
- package/dist/bench/scoring.cjs +171 -0
- package/dist/bench/scoring.d.ts +9 -0
- package/dist/bench/scoring.js +137 -0
- package/dist/bench/types.cjs +18 -0
- package/dist/bench/types.d.ts +200 -0
- package/dist/bench/types.js +0 -0
- package/dist/bench/validator.cjs +92 -0
- package/dist/bench/validator.d.ts +2 -0
- package/dist/bench/validator.js +58 -0
- package/dist/cli/config/schema.cjs +36 -1
- package/dist/cli/config/schema.d.ts +46 -0
- package/dist/cli/config/schema.js +36 -1
- package/dist/cli/config/warnings.cjs +119 -51
- package/dist/cli/config/warnings.js +119 -51
- package/dist/cli/core/agentInvoker.cjs +9 -2
- package/dist/cli/core/agentInvoker.d.ts +1 -0
- package/dist/cli/core/agentInvoker.js +9 -2
- package/dist/cli/core/imagePersistence.cjs +17 -1
- package/dist/cli/core/imagePersistence.d.ts +2 -0
- package/dist/cli/core/imagePersistence.js +13 -3
- package/dist/cli/core/sessionManager.cjs +2 -0
- package/dist/cli/core/sessionManager.js +3 -1
- package/dist/cli/types.d.ts +18 -0
- package/dist/gateway/adapters/teams.cjs +419 -0
- package/dist/gateway/adapters/teams.d.ts +47 -0
- package/dist/gateway/adapters/teams.js +361 -0
- package/dist/gateway/http/sms.cjs +286 -0
- package/dist/gateway/http/sms.d.ts +4 -0
- package/dist/gateway/http/sms.js +249 -0
- package/dist/gateway/server.cjs +54 -3
- package/dist/gateway/server.d.ts +2 -0
- package/dist/gateway/server.js +54 -3
- package/dist/gateway/sms/commands.cjs +116 -0
- package/dist/gateway/sms/commands.d.ts +15 -0
- package/dist/gateway/sms/commands.js +79 -0
- package/dist/gateway/sms/control.cjs +118 -0
- package/dist/gateway/sms/control.d.ts +18 -0
- package/dist/gateway/sms/control.js +84 -0
- package/dist/gateway/sms/policyStore.cjs +198 -0
- package/dist/gateway/sms/policyStore.d.ts +37 -0
- package/dist/gateway/sms/policyStore.js +161 -0
- package/dist/providers/registry.cjs +1 -0
- package/dist/providers/registry.js +1 -0
- package/dist/tests/cli-config-warnings.test.cjs +41 -0
- package/dist/tests/cli-config-warnings.test.js +41 -0
- package/dist/tests/cli-init.test.cjs +32 -26
- package/dist/tests/cli-init.test.js +32 -26
- package/dist/tests/gateway-http-security.test.cjs +21 -0
- package/dist/tests/gateway-http-security.test.js +21 -0
- package/dist/tests/gateway-origin-policy.test.cjs +22 -0
- package/dist/tests/gateway-origin-policy.test.js +22 -0
- package/dist/tests/gateway.test.cjs +57 -0
- package/dist/tests/gateway.test.js +57 -0
- package/dist/tests/imagePersistence.test.cjs +26 -0
- package/dist/tests/imagePersistence.test.js +27 -1
- package/dist/tests/run-terminal-bench-official-script.test.cjs +61 -0
- package/dist/tests/run-terminal-bench-official-script.test.d.ts +1 -0
- package/dist/tests/run-terminal-bench-official-script.test.js +55 -0
- package/dist/tests/sessions-api.test.cjs +69 -1
- package/dist/tests/sessions-api.test.js +70 -2
- package/dist/tests/sms-api.test.cjs +183 -0
- package/dist/tests/sms-api.test.d.ts +1 -0
- package/dist/tests/sms-api.test.js +177 -0
- package/dist/tests/sms-commands.test.cjs +90 -0
- package/dist/tests/sms-commands.test.d.ts +1 -0
- package/dist/tests/sms-commands.test.js +84 -0
- package/dist/tests/sms-policy-store.test.cjs +69 -0
- package/dist/tests/sms-policy-store.test.d.ts +1 -0
- package/dist/tests/sms-policy-store.test.js +63 -0
- package/dist/tests/teams-adapter.test.cjs +58 -0
- package/dist/tests/teams-adapter.test.d.ts +1 -0
- package/dist/tests/teams-adapter.test.js +52 -0
- package/dist/tests/terminal-bench-adapters-helpers.test.cjs +64 -0
- package/dist/tests/terminal-bench-adapters-helpers.test.d.ts +1 -0
- package/dist/tests/terminal-bench-adapters-helpers.test.js +58 -0
- package/dist/tests/terminal-bench-cleanup.test.cjs +93 -0
- package/dist/tests/terminal-bench-cleanup.test.d.ts +1 -0
- package/dist/tests/terminal-bench-cleanup.test.js +87 -0
- package/dist/tests/terminal-bench-config.test.cjs +62 -0
- package/dist/tests/terminal-bench-config.test.d.ts +1 -0
- package/dist/tests/terminal-bench-config.test.js +56 -0
- package/dist/tests/terminal-bench-official.test.cjs +194 -0
- package/dist/tests/terminal-bench-official.test.d.ts +1 -0
- package/dist/tests/terminal-bench-official.test.js +188 -0
- package/dist/tests/terminal-bench-runner.test.cjs +82 -0
- package/dist/tests/terminal-bench-runner.test.d.ts +1 -0
- package/dist/tests/terminal-bench-runner.test.js +76 -0
- package/dist/tests/terminal-bench-scoring.test.cjs +128 -0
- package/dist/tests/terminal-bench-scoring.test.d.ts +1 -0
- package/dist/tests/terminal-bench-scoring.test.js +122 -0
- package/dist/tools/mcp-fal-ai.cjs +1 -1
- package/dist/tools/mcp-fal-ai.js +1 -1
- package/dist/webui/assets/index-Cyg_Hs57.css +11 -0
- package/dist/webui/assets/{index-BMekSELC.js → index-DZXLLjaA.js} +109 -109
- package/dist/webui/index.html +2 -2
- package/package.json +11 -2
- package/templates/agents/game-dev/agent.md +122 -63
- package/templates/agents/game-dev/art-director.md +106 -0
- package/templates/agents/game-dev/game-designer.md +87 -0
- package/templates/agents/game-dev/scene-engineer.md +474 -0
- package/dist/webui/assets/index-Cwkg4DKj.css +0 -11
- package/templates/agents/game-dev/art-generation.md +0 -38
- package/templates/agents/game-dev/asset-refinement.md +0 -17
- package/templates/agents/game-dev/planning-idea.md +0 -17
- package/templates/agents/game-dev/ui-specialist.md +0 -17
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
 * Zod schema describing the official Terminal-Bench wrapper configuration.
 * Only `dataset` is required; every other key is optional.
 */
declare const officialConfigSchema: z.ZodObject<{
    /** Dataset identifier forwarded to Harbor via `--dataset`. */
    dataset: z.ZodString;
    /** Task names forwarded as repeated `--task-name` flags. */
    taskNames: z.ZodOptional<z.ZodArray<z.ZodString>>;
    /** Value for `--registry-url`. */
    registryUrl: z.ZodOptional<z.ZodString>;
    /** Value for `--registry-path`. */
    registryPath: z.ZodOptional<z.ZodString>;
    /** Value for `--agent`; only used when no agent import path is set. */
    agent: z.ZodOptional<z.ZodString>;
    /** Value for `--agent-import-path`; takes the place of `--agent`. */
    agentImportPath: z.ZodOptional<z.ZodString>;
    /** Key/value pairs forwarded as `--agent-kwarg key=value`. */
    agentKwargs: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
    /** Value for `--model`. */
    model: z.ZodOptional<z.ZodString>;
    /** Value for `--n-concurrent`. */
    nConcurrent: z.ZodOptional<z.ZodNumber>;
    /** Value for `--n-attempts`. */
    nAttempts: z.ZodOptional<z.ZodNumber>;
    /** Value for `--n-tasks`. */
    nTasks: z.ZodOptional<z.ZodNumber>;
    /** Timeout for the Harbor run, in milliseconds. */
    timeoutMs: z.ZodOptional<z.ZodNumber>;
    /** Extra arguments appended verbatim after all generated arguments. */
    extraArgs: z.ZodOptional<z.ZodArray<z.ZodString>>;
}, z.core.$strip>;
/** Configuration object shape inferred from `officialConfigSchema`. */
export type OfficialBenchConfig = z.infer<typeof officialConfigSchema>;
|
|
18
|
+
/**
 * Per-run overrides layered on top of the values loaded from the config file.
 * Each field mirrors the config key of the same name.
 */
export interface OfficialBenchOverrides {
    /** Replaces the configured task names when provided. */
    taskNames?: string[];
    /** Overrides the configured registry URL. */
    registryUrl?: string;
    /** Overrides the configured registry path. */
    registryPath?: string;
    /** Overrides the configured agent name. */
    agent?: string;
    /** Overrides the configured agent import path. */
    agentImportPath?: string;
    /** Merged over the configured agent kwargs; override keys win. */
    agentKwargs?: Record<string, string>;
    /** Overrides the configured model. */
    model?: string;
    /** Overrides the configured dataset. */
    dataset?: string;
    /** Overrides the configured concurrency. */
    nConcurrent?: number;
    /** Overrides the configured attempt count. */
    nAttempts?: number;
    /** Overrides the configured task count. */
    nTasks?: number;
    /** Overrides the configured run timeout, in milliseconds. */
    timeoutMs?: number;
}
|
|
32
|
+
/**
 * Result record resolved by `runOfficialTerminalBench`.
 * NOTE(review): field notes marked "presumably" are inferred from names only;
 * confirm against the code that builds the summary.
 */
export interface OfficialBenchSummary {
    /** Presumably the time the run was recorded — TODO confirm format. */
    timestamp: string;
    /** The command that was executed. */
    command: {
        /** Executable that was invoked. */
        binary: string;
        /** Arguments passed to the executable. */
        args: string[];
    };
    /** Container tooling selected for the run. */
    runtime: {
        containerRuntime: "docker" | "podman";
    };
    /** Exit code of the executed command. */
    exitCode: number;
    /** Presumably true when the run hit its timeout — TODO confirm. */
    timedOut: boolean;
    /** Presumably wall-clock duration of the run in milliseconds — TODO confirm. */
    durationMs: number;
    /** Failure description, when one is available. */
    errorMessage?: string;
    /** Metrics with the same shape `parseHarborRunOutput` returns. */
    metrics: {
        resolvedTrials?: number;
        unresolvedTrials?: number;
        accuracyPercent?: number;
        /** Pass@k values keyed by `k`. */
        passAtK: Record<string, number>;
    };
    /** Results location reported in the run output, when one was found. */
    runOutputPath?: string;
    /** Files written by the wrapper for this run. */
    artifacts: {
        rawStdoutPath: string;
        rawStderrPath: string;
        summaryPath: string;
    };
}
|
|
58
|
+
/** Collects the values of `--task-name <name>` and `--task-name=<name>` flags from an argument list. */
export declare function extractTaskNamesFromArgs(args: string[]): string[];
/**
 * Rewrites Harbor's "Either datasets or tasks must be provided." error into a
 * "No tasks matched ..." hint when task names were selected in `args`.
 * Any other message (including `undefined`) is returned unchanged.
 */
export declare function normalizeHarborFailureMessage(params: {
    rawMessage: string | undefined;
    args: string[];
    dataset: string;
}): string | undefined;
/** Reports whether the output contains one of the known "compose provider missing" messages. */
export declare function isMissingComposeProviderError(output: string): boolean;
/** Reports whether `docker --version` style output mentions podman (case-insensitive). */
export declare function isPodmanBackedDockerVersionOutput(output: string): boolean;
/**
 * Scans Harbor output line by line for trial counts, accuracy, Pass@k values
 * and the reported results path.
 */
export declare function parseHarborRunOutput(output: string): OfficialBenchSummary["metrics"] & {
    runOutputPath?: string;
};
/** Picks the most relevant error line from Harbor stderr; `undefined` when stderr has no non-blank lines. */
export declare function extractHarborErrorMessage(stderr: string): string | undefined;
/** Builds the `harbor run ...` argument list; override values take precedence over config values. */
export declare function buildHarborRunArgs(config: OfficialBenchConfig, overrides: OfficialBenchOverrides): string[];
/** Reads the JSON file at `configPath` and validates it; rejects when parsing or validation fails. */
export declare function loadOfficialBenchConfig(configPath: string): Promise<OfficialBenchConfig>;
/**
 * Returns the text of a bash `docker` shim that forwards to `targetBinary` and
 * routes `compose` subcommands to podman-compose when it is available.
 */
export declare function createDockerShimScript(targetBinary: string): string;
/** Prepends `shimDir` to `basePath` (defaults to `process.env.PATH`). */
export declare function buildRuntimePathEnv(shimDir: string, basePath?: string): string;
/** Prepends `pathToAdd` to `basePythonPath` (defaults to `process.env.PYTHONPATH`). */
export declare function buildPythonPathEnv(pathToAdd: string, basePythonPath?: string): string;
/** Normalizes a Docker host value; returns `undefined` for missing, empty or placeholder values. */
export declare function parseDockerHostCandidate(value: string | undefined): string | undefined;
/** Runs the official Terminal-Bench wrapper for the config at `configPath` with `overrides` applied. */
export declare function runOfficialTerminalBench(options: {
    configPath: string;
    overrides: OfficialBenchOverrides;
}): Promise<OfficialBenchSummary>;
|
|
80
|
+
export {};
|
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
import { chmod, mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { delimiter, join, resolve } from "node:path";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
import { runCommand } from "./process.js";
|
|
5
|
+
/**
 * Validation schema for the official Terminal-Bench config file.
 * `dataset` is mandatory; all remaining keys are optional.
 */
const officialConfigSchema = (() => {
    // Small factories so the repeated validators are spelled once.
    const nonEmptyText = () => z.string().min(1);
    const positiveInteger = () => z.number().int().positive();
    return z.object({
        dataset: nonEmptyText(),
        taskNames: z.array(nonEmptyText()).optional(),
        registryUrl: nonEmptyText().optional(),
        registryPath: nonEmptyText().optional(),
        agent: nonEmptyText().optional(),
        agentImportPath: nonEmptyText().optional(),
        agentKwargs: z.record(z.string(), z.string()).optional(),
        // Unlike the other string fields, `model` has no minimum length.
        model: z.string().optional(),
        nConcurrent: positiveInteger().optional(),
        nAttempts: positiveInteger().optional(),
        nTasks: positiveInteger().optional(),
        timeoutMs: positiveInteger().optional(),
        extraArgs: z.array(z.string()).optional()
    });
})();
|
|
20
|
+
/**
 * Collects every task name passed as `--task-name <name>` or `--task-name=<name>`.
 *
 * Empty values are ignored in both forms, so a bare `--task-name=` never
 * produces an empty task name.
 *
 * @param {string[]} args - Harbor CLI arguments to scan.
 * @returns {string[]} Task names in the order they appear; empty when none are present.
 */
function extractTaskNamesFromArgs(args) {
    const flag = "--task-name";
    const inlinePrefix = `${flag}=`;
    const names = [];
    for (let i = 0; i < args.length; i += 1) {
        const arg = args[i];
        if (arg === flag) {
            const next = args[i + 1];
            if (next) {
                names.push(next);
                // The value was consumed together with the flag.
                i += 1;
            }
            continue;
        }
        if (arg.startsWith(inlinePrefix)) {
            const inlineValue = arg.slice(inlinePrefix.length);
            if (inlineValue) names.push(inlineValue);
        }
    }
    return names;
}
|
|
33
|
+
/**
 * Turns Harbor's generic "nothing to run" error into an actionable hint.
 *
 * @param {{ rawMessage: string | undefined, args: string[], dataset: string }} params
 * @returns {string | undefined} A "No tasks matched ..." message when the raw
 *   message is Harbor's missing-datasets/tasks error and task names were
 *   selected in `args`; otherwise `rawMessage` unchanged.
 */
function normalizeHarborFailureMessage(params) {
    const missingSelectionError = "ValueError: Either datasets or tasks must be provided.";
    if (params.rawMessage !== missingSelectionError) {
        return params.rawMessage;
    }
    const requested = extractTaskNamesFromArgs(params.args);
    if (requested.length === 0) {
        return params.rawMessage;
    }
    const quotedNames = requested.map((name) => `"${name}"`).join(", ");
    return `No tasks matched ${quotedNames} in dataset "${params.dataset}". Verify task ids for Terminal-Bench 2.0.`;
}
|
|
40
|
+
/**
 * Removes ANSI CSI escape sequences (ESC `[` ... letter) from a string.
 *
 * @param {string} value - Text that may contain terminal escape codes.
 * @returns {string} The text with those sequences removed.
 */
function stripAnsi(value) {
    const csiSequence = /\u001B\[[0-9;]*[A-Za-z]/g;
    return value.replace(csiSequence, "");
}
|
|
43
|
+
/**
 * Reads the number at the end of a metrics line, allowing a trailing `%`
 * and a trailing table border (`│` or `|`).
 *
 * @param {string} line - A single output line.
 * @returns {number | undefined} The parsed value, or `undefined` when the
 *   line does not end in a number.
 */
function parseMetricNumber(line) {
    const trailingNumber = /(-?\d+(?:\.\d+)?)(?:\s*%?)\s*[│|]?\s*$/.exec(line);
    if (trailingNumber === null) {
        return undefined;
    }
    const parsed = Number.parseFloat(trailingNumber[1]);
    if (!Number.isFinite(parsed)) {
        return undefined;
    }
    return parsed;
}
|
|
49
|
+
/**
 * Detects the messages emitted when no compose provider can be found.
 * Matching is case-insensitive and ignores ANSI escape codes.
 *
 * @param {string} output - Combined command output to inspect.
 * @returns {boolean} True when one of the known messages is present.
 */
function isMissingComposeProviderError(output) {
    const haystack = stripAnsi(output).toLowerCase();
    const markers = [
        "looking up compose provider failed",
        'exec: "docker-compose": executable file not found',
        'exec: "podman-compose": executable file not found'
    ];
    return markers.some((marker) => haystack.includes(marker));
}
|
|
53
|
+
/**
 * Reports whether version output from a `docker` binary mentions podman.
 * Matching is case-insensitive and ignores ANSI escape codes.
 *
 * @param {string} output - Combined stdout/stderr of the version command.
 * @returns {boolean} True when a podman marker is present.
 */
function isPodmanBackedDockerVersionOutput(output) {
    const haystack = stripAnsi(output).toLowerCase();
    // The second marker is a more specific form of the first; both are kept.
    const markers = ["podman", "emulate docker cli using podman"];
    return markers.some((marker) => haystack.includes(marker));
}
|
|
57
|
+
/**
 * Extracts run metrics from Harbor's console output.
 *
 * Each non-blank line is checked, in this order, for: "Resolved Trials",
 * "Unresolved Trials", "Accuracy", a `Pass@<k>` label, and finally a
 * "results written to" / "results saved to" / "output written to" path.
 * Later lines overwrite values captured from earlier ones.
 *
 * @param {string} output - Raw Harbor output (ANSI codes are stripped first).
 * @returns {{ resolvedTrials: number | undefined, unresolvedTrials: number | undefined,
 *   accuracyPercent: number | undefined, passAtK: Record<string, number>,
 *   runOutputPath: string | undefined }} Parsed metrics; fields that were not
 *   found are `undefined` and `passAtK` is empty.
 */
function parseHarborRunOutput(output) {
    const summary = {
        resolvedTrials: undefined,
        unresolvedTrials: undefined,
        accuracyPercent: undefined,
        passAtK: {},
        runOutputPath: undefined
    };
    // Checked in order; the first label contained in the line wins.
    const labelledMetrics = [
        ["Resolved Trials", "resolvedTrials"],
        ["Unresolved Trials", "unresolvedTrials"],
        ["Accuracy", "accuracyPercent"]
    ];
    const outputPathPatterns = [
        /results written to\s+(.+)$/i,
        /results saved to\s+(.+)$/i,
        /output written to\s+(.+)$/i
    ];
    stripAnsi(output).split(/\r?\n/).forEach((rawLine) => {
        const line = rawLine.trim();
        if (line === "") {
            return;
        }
        const labelled = labelledMetrics.find(([label]) => line.includes(label));
        if (labelled) {
            summary[labelled[1]] = parseMetricNumber(line);
            return;
        }
        const passLabel = /Pass@(\d+)/i.exec(line);
        if (passLabel) {
            const score = parseMetricNumber(line);
            if (score !== undefined) {
                summary.passAtK[passLabel[1]] = score;
            }
            return;
        }
        for (const pattern of outputPathPatterns) {
            const found = pattern.exec(line);
            if (found) {
                if (found[1]) {
                    summary.runOutputPath = found[1].trim();
                }
                break;
            }
        }
    });
    return summary;
}
|
|
97
|
+
/**
 * Picks the most useful error line out of Harbor's stderr.
 *
 * Precedence:
 *   1. the last "ValueError: Error getting dataset ..." occurrence,
 *   2. the last "ConnectError: ..." occurrence,
 *   3. a fixed hint when a DNS resolution failure is mentioned,
 *   4. the last line starting with "ValueError:",
 *   5. the last line containing "<Something>Error:",
 *   6. the last non-blank line.
 *
 * @param {string} stderr - Raw stderr text (ANSI codes are stripped first).
 * @returns {string | undefined} The selected message, or `undefined` when
 *   stderr contains no non-blank lines.
 */
function extractHarborErrorMessage(stderr) {
    const normalized = stripAnsi(stderr);
    const datasetErrors = normalized.match(/ValueError: Error getting dataset[^\n]*/g);
    if (datasetErrors) {
        return datasetErrors.at(-1);
    }
    const connectErrors = normalized.match(/ConnectError:[^\n]*/g);
    if (connectErrors) {
        return connectErrors.at(-1);
    }
    // Resolver messages differ in capitalisation between platforms
    // (e.g. "Temporary failure in name resolution"), so compare lowercased.
    const lowered = normalized.toLowerCase();
    if (lowered.includes("nodename nor servname provided") || lowered.includes("temporary failure in name resolution")) {
        return "Harbor registry lookup failed due DNS/network error. Verify internet access or pass --registry-url/--registry-path.";
    }
    const lines = normalized.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    if (lines.length === 0) {
        return undefined;
    }
    for (let i = lines.length - 1; i >= 0; i -= 1) {
        if (lines[i].startsWith("ValueError:")) return lines[i];
    }
    for (let i = lines.length - 1; i >= 0; i -= 1) {
        if (/\w+Error:/.test(lines[i])) return lines[i];
    }
    return lines.at(-1);
}
|
|
108
|
+
/**
 * Assembles the argument list for `harbor run`.
 *
 * Override values win over config values. `agentKwargs` from both sources
 * are merged (override keys win), and `config.extraArgs` is appended last.
 * When an agent import path is selected it replaces `--agent`, and the
 * `model_name` agent kwarg is not forwarded.
 *
 * @param {object} config - Parsed configuration file contents.
 * @param {object} overrides - Per-run overrides.
 * @returns {string[]} Arguments starting with `run --dataset <dataset>`.
 */
function buildHarborRunArgs(config, overrides) {
    const choose = (key) => overrides[key] || config[key];
    const agentImportPath = choose("agentImportPath");
    const args = ["run", "--dataset", choose("dataset")];
    const appendOption = (flag, value) => {
        if (value) args.push(flag, value);
    };
    const appendCount = (flag, count) => {
        if (count) args.push(flag, String(count));
    };

    appendOption("--registry-url", choose("registryUrl"));
    appendOption("--registry-path", choose("registryPath"));
    if (agentImportPath) {
        args.push("--agent-import-path", agentImportPath);
    } else {
        args.push("--agent", choose("agent") || "oracle");
    }
    appendOption("--model", choose("model"));
    appendCount("--n-concurrent", choose("nConcurrent"));
    appendCount("--n-attempts", choose("nAttempts"));
    appendCount("--n-tasks", choose("nTasks"));

    const mergedKwargs = { ...config.agentKwargs, ...overrides.agentKwargs };
    Object.entries(mergedKwargs).forEach(([key, value]) => {
        const droppedForImportPath = Boolean(agentImportPath) && key === "model_name";
        if (!droppedForImportPath) {
            args.push("--agent-kwarg", `${key}=${value}`);
        }
    });

    const taskNames = overrides.taskNames ?? config.taskNames ?? [];
    taskNames.forEach((taskName) => {
        args.push("--task-name", taskName);
    });

    if (config.extraArgs?.length > 0) {
        args.push(...config.extraArgs);
    }
    return args;
}
|
|
141
|
+
/**
 * Loads and validates the official bench configuration file.
 *
 * The file is read through the `Bun.file` API, parsed as JSON and checked
 * against `officialConfigSchema`.
 *
 * @param {string} configPath - Path to the JSON config; resolved to an absolute path first.
 * @returns {Promise<object>} The validated configuration.
 * @throws When the JSON is malformed or the schema validation fails.
 */
async function loadOfficialBenchConfig(configPath) {
    const absolutePath = resolve(configPath);
    const rawJson = await Bun.file(absolutePath).text();
    const candidate = JSON.parse(rawJson);
    return officialConfigSchema.parse(candidate);
}
|
|
146
|
+
/**
 * Resolves the path of a binary that must be present on PATH.
 *
 * The lookup runs `command -v <name>` in a login shell and takes the last
 * output line, so anything printed before the resolved path is ignored.
 *
 * @param {string} name - Binary name to look up.
 * @returns {Promise<string>} The resolved path.
 * @throws {Error} When the binary cannot be found, or the lookup succeeds
 *   but prints no path.
 */
async function resolveRequiredBinary(name) {
    const check = await runCommand("sh", [
        "-lc",
        `command -v ${name}`
    ], {
        cwd: process.cwd(),
        timeoutMs: 5000
    });
    if (check.exitCode !== 0) {
        // Report the binary that was actually requested; the Harbor-specific
        // install hint only applies to the Harbor CLI itself.
        const installHint = name === "harbor" ? " Install Harbor CLI and verify with `harbor --help`." : "";
        throw new Error(`${name} is not installed or not on PATH.${installHint}`);
    }
    const resolvedPath = check.stdout.trim().split(/\r?\n/).at(-1)?.trim();
    if (!resolvedPath) {
        throw new Error(`Unable to resolve ${name} binary path.`);
    }
    return resolvedPath;
}
|
|
159
|
+
/**
 * Looks up an optional binary on PATH via `command -v <name>` in a login shell.
 *
 * @param {string} name - Binary name to look up.
 * @returns {Promise<string | null>} The last line of the lookup output, or
 *   `null` when the lookup fails or prints nothing.
 */
async function resolveBinary(name) {
    const lookupOptions = { cwd: process.cwd(), timeoutMs: 5000 };
    const lookup = await runCommand("sh", ["-lc", `command -v ${name}`], lookupOptions);
    if (lookup.exitCode !== 0) {
        return null;
    }
    // Only the final line is used; earlier lines may come from shell startup output.
    const outputLines = lookup.stdout.trim().split(/\r?\n/);
    const candidate = outputLines[outputLines.length - 1]?.trim();
    return candidate ? candidate : null;
}
|
|
171
|
+
/**
 * Wraps a value in single quotes for use in a shell script.
 * Embedded single quotes are emitted as `'"'"'` (close, quoted quote, reopen).
 *
 * @param {string} value - Text to quote.
 * @returns {string} The single-quoted form.
 */
function shellQuote(value) {
    const escapedQuote = "'\"'\"'";
    const escaped = value.split("'").join(escapedQuote);
    return `'${escaped}'`;
}
|
|
174
|
+
/**
 * Produces the source of a bash script meant to be installed as `docker`.
 *
 * Behaviour of the generated script:
 *   - Anything other than `compose` (or `compose` without `podman-compose`
 *     on PATH) is exec'd on `targetBinary` with the original arguments.
 *   - For `compose`, `-p`, `-f` and `--project-directory` are parsed; the
 *     script changes into the project directory when one is given.
 *   - `cp` is mapped to `podman cp` and `exec` to `podman exec`, resolving
 *     the service name to a container first. Interactive/tty flags on
 *     `exec` are dropped.
 *   - Every other compose subcommand is exec'd on `podman-compose`.
 *
 * @param {string} targetBinary - Path of the binary the shim forwards to.
 * @returns {string} The script text.
 */
function createDockerShimScript(targetBinary) {
    const quotedTarget = shellQuote(targetBinary);
    return `#!/bin/bash
set -e
TARGET_BINARY=${quotedTarget}

if [[ "$1" == "compose" ]] && command -v podman-compose >/dev/null 2>&1; then
shift
PROJECT_DIR=""
PROJECT_NAME=""
COMPOSE_FILES=()
TRANSLATED_ARGS=()

while [[ $# -gt 0 ]]; do
case "$1" in
-p)
PROJECT_NAME="$2"
TRANSLATED_ARGS+=("$1" "$2")
shift 2
;;
-p=*)
PROJECT_NAME="\${1#*=}"
TRANSLATED_ARGS+=("$1")
shift
;;
-f)
COMPOSE_FILES+=("$2")
TRANSLATED_ARGS+=("$1" "$2")
shift 2
;;
-f=*)
COMPOSE_FILES+=("\${1#*=}")
TRANSLATED_ARGS+=("$1")
shift
;;
--project-directory)
PROJECT_DIR="$2"
shift 2
;;
--project-directory=*)
PROJECT_DIR="\${1#*=}"
shift
;;
*)
TRANSLATED_ARGS+=("$1")
shift
;;
esac
done

if [[ -n "$PROJECT_DIR" ]]; then
cd "$PROJECT_DIR"
fi

resolve_container_id() {
local service="$1"
local container_id=""

if [[ -n "$PROJECT_NAME" ]]; then
container_id=$(podman ps -a \
--filter "label=com.docker.compose.project=$PROJECT_NAME" \
--filter "label=com.docker.compose.service=$service" \
--format "{{.ID}}" | head -n 1 || true)
fi

if [[ -z "$container_id" && -n "$PROJECT_NAME" ]]; then
local c1="\${PROJECT_NAME}_\${service}_1"
local c2="\${PROJECT_NAME}-\${service}-1"
if podman container exists "$c1" >/dev/null 2>&1; then
container_id="$c1"
elif podman container exists "$c2" >/dev/null 2>&1; then
container_id="$c2"
fi
fi

echo "$container_id"
}

translate_cp_endpoint() {
local endpoint="$1"
if [[ "$endpoint" == *:* ]]; then
local service="\${endpoint%%:*}"
local inner_path="\${endpoint#*:}"
local container_id
container_id=$(resolve_container_id "$service")
if [[ -z "$container_id" ]]; then
echo "docker shim: unable to resolve container for service '$service' (project '$PROJECT_NAME')" >&2
exit 2
fi
echo "$container_id:$inner_path"
return
fi
echo "$endpoint"
}

# podman-compose does not implement compose cp, so map it directly to podman cp.
if [[ "\${#TRANSLATED_ARGS[@]}" -gt 0 ]]; then
for i in "\${!TRANSLATED_ARGS[@]}"; do
if [[ "\${TRANSLATED_ARGS[$i]}" == "cp" ]]; then
cp_index="$i"
src_idx=$((cp_index + 1))
dst_idx=$((cp_index + 2))
if [[ -z "\${TRANSLATED_ARGS[$src_idx]:-}" || -z "\${TRANSLATED_ARGS[$dst_idx]:-}" ]]; then
echo "docker shim: compose cp requires source and destination" >&2
exit 2
fi
src=""
dst=""
src=$(translate_cp_endpoint "\${TRANSLATED_ARGS[$src_idx]}")
dst=$(translate_cp_endpoint "\${TRANSLATED_ARGS[$dst_idx]}")
exec podman cp "$src" "$dst"
fi

if [[ "\${TRANSLATED_ARGS[$i]}" == "exec" ]]; then
exec_idx="$i"
j=$((exec_idx + 1))
PODMAN_EXEC_ARGS=()

while [[ $j -lt \${#TRANSLATED_ARGS[@]} ]]; do
tok="\${TRANSLATED_ARGS[$j]}"
case "$tok" in
-it|-ti|-i|-t|--interactive|--tty)
# Skip compose tty/interactive flags to avoid non-tty failures.
j=$((j + 1))
;;
-w|--workdir|-e|--env)
if [[ $((j + 1)) -ge \${#TRANSLATED_ARGS[@]} ]]; then
echo "docker shim: missing value for $tok in compose exec" >&2
exit 2
fi
PODMAN_EXEC_ARGS+=("$tok" "\${TRANSLATED_ARGS[$((j + 1))]}")
j=$((j + 2))
;;
-w=*|--workdir=*|-e=*|--env=*)
PODMAN_EXEC_ARGS+=("$tok")
j=$((j + 1))
;;
--)
j=$((j + 1))
break
;;
-*)
PODMAN_EXEC_ARGS+=("$tok")
j=$((j + 1))
;;
*)
service="$tok"
j=$((j + 1))
break
;;
esac
done

if [[ -z "\${service:-}" ]]; then
echo "docker shim: compose exec missing service name" >&2
exit 2
fi

container_id=$(resolve_container_id "$service")
if [[ -z "$container_id" ]]; then
echo "docker shim: unable to resolve container for service '$service' (project '$PROJECT_NAME')" >&2
exit 2
fi

REMAINDER=("\${TRANSLATED_ARGS[@]:$j}")
if [[ \${#REMAINDER[@]} -eq 0 ]]; then
echo "docker shim: compose exec missing command" >&2
exit 2
fi
exec podman exec "\${PODMAN_EXEC_ARGS[@]}" "$container_id" "\${REMAINDER[@]}"
fi
done
fi

exec podman-compose "\${TRANSLATED_ARGS[@]}"
fi

exec "$TARGET_BINARY" "$@"
`;
}
|
|
353
|
+
/**
 * Builds a PATH value with the shim directory placed first.
 *
 * Uses the platform path-list separator (`path.delimiter`), matching
 * `buildPythonPathEnv`, instead of assuming `:`.
 *
 * @param {string} shimDir - Directory to put at the front of PATH.
 * @param {string} [basePath] - Existing PATH; defaults to `process.env.PATH` (or empty).
 * @returns {string} `shimDir` alone when `basePath` is empty, otherwise both joined.
 */
function buildRuntimePathEnv(shimDir, basePath = process.env.PATH || "") {
    if (!basePath) {
        return shimDir;
    }
    return `${shimDir}${delimiter}${basePath}`;
}
|
|
356
|
+
/**
 * Builds a PYTHONPATH value with `pathToAdd` placed first, using the
 * platform path-list separator.
 *
 * @param {string} pathToAdd - Directory to put at the front.
 * @param {string} [basePythonPath] - Existing PYTHONPATH; defaults to
 *   `process.env.PYTHONPATH` (or empty).
 * @returns {string} `pathToAdd` alone when the base is empty, otherwise both joined.
 */
function buildPythonPathEnv(pathToAdd, basePythonPath = process.env.PYTHONPATH || "") {
    if (!basePythonPath) {
        return pathToAdd;
    }
    return `${pathToAdd}${delimiter}${basePythonPath}`;
}
|
|
359
|
+
/**
 * Returns the last line that still has content after trimming.
 *
 * @param {string} value - Multi-line text.
 * @returns {string | undefined} The trimmed line, or `undefined` when every line is blank.
 */
function parseLastNonEmptyLine(value) {
    const rows = value.split(/\r?\n/);
    for (let index = rows.length - 1; index >= 0; index -= 1) {
        const text = rows[index].trim();
        if (text) {
            return text;
        }
    }
    return undefined;
}
|
|
363
|
+
/**
 * Cleans up a Docker host value.
 *
 * Surrounding whitespace and one leading/trailing quote character are
 * removed. A value that has no `scheme://` prefix and starts with `/` is
 * treated as a socket path and prefixed with `unix://`.
 *
 * @param {string} value - Raw host value.
 * @returns {string} The normalized host.
 */
function normalizeDockerHost(value) {
    const unquoted = value.trim().replace(/^['"]|['"]$/g, "");
    const hasScheme = /^[a-zA-Z]+:\/\//.test(unquoted);
    if (!hasScheme && unquoted.startsWith("/")) {
        return `unix://${unquoted}`;
    }
    return unquoted;
}
|
|
369
|
+
/**
 * Normalizes a possible Docker host value and filters out placeholders.
 *
 * @param {string | undefined} value - Raw candidate.
 * @returns {string | undefined} The normalized host, or `undefined` when the
 *   candidate is missing, empty, or one of `null`, `<nil>`, `<no value>`, `[]`.
 */
function parseDockerHostCandidate(value) {
    if (!value) {
        return undefined;
    }
    const host = normalizeDockerHost(value);
    const placeholders = ["", "null", "<nil>", "<no value>", "[]"];
    return placeholders.includes(host) ? undefined : host;
}
|
|
375
|
+
/**
 * Works out a DOCKER_HOST value that points at podman.
 *
 * Sources are tried in order and the first usable value wins:
 *   1. the existing `DOCKER_HOST` environment variable,
 *   2. `podman machine inspect` (socket path),
 *   3. `podman machine env` (exported `DOCKER_HOST`),
 *   4. `podman info` (remote socket path).
 *
 * @returns {Promise<string | undefined>} The host value, or `undefined` when
 *   none of the sources yields one.
 */
async function resolvePodmanDockerHost() {
    const fromEnvironment = parseDockerHostCandidate(process.env.DOCKER_HOST);
    if (fromEnvironment) {
        return fromEnvironment;
    }
    const runPodman = (podmanArgs) => runCommand("podman", podmanArgs, {
        cwd: process.cwd(),
        timeoutMs: 5000
    });

    const machineInspect = await runPodman(["machine", "inspect", "--format", "{{.ConnectionInfo.PodmanSocket.Path}}"]);
    if (machineInspect.exitCode === 0) {
        const socketFromInspect = parseDockerHostCandidate(parseLastNonEmptyLine(machineInspect.stdout));
        if (socketFromInspect) {
            return socketFromInspect;
        }
    }

    const machineEnv = await runPodman(["machine", "env"]);
    if (machineEnv.exitCode === 0) {
        // Accepts DOCKER_HOST=value with optional matching single or double quotes.
        const exported = /DOCKER_HOST=(['"]?)([^'"\n]+)\1/.exec(machineEnv.stdout);
        const hostFromMachineEnv = parseDockerHostCandidate(exported?.[2]);
        if (hostFromMachineEnv) {
            return hostFromMachineEnv;
        }
    }

    const info = await runPodman(["info", "--format", "{{.Host.RemoteSocket.Path}}"]);
    if (info.exitCode !== 0) {
        return undefined;
    }
    return parseDockerHostCandidate(parseLastNonEmptyLine(info.stdout));
}
|
|
413
|
+
/**
 * Writes the `docker` shim for podman and builds the environment that makes
 * child processes pick it up.
 *
 * @param {string} wrapperOutputDir - Run output directory; the shim is
 *   written to `<wrapperOutputDir>/runtime-bin/docker`.
 * @param {string} podmanBinary - Resolved path of the podman executable.
 * @returns {Promise<{ containerRuntime: "podman", env: Record<string, string> }>}
 *   Runtime descriptor whose `env` holds `PATH` and, when one could be
 *   resolved, `DOCKER_HOST`.
 */
async function createPodmanShimRuntime(wrapperOutputDir, podmanBinary) {
    const shimDir = join(wrapperOutputDir, "runtime-bin");
    const shimPath = join(shimDir, "docker");
    await mkdir(shimDir, {
        recursive: true
    });
    await writeFile(shimPath, createDockerShimScript(podmanBinary), "utf-8");
    // rwxr-xr-x: the shim must be executable to be found through PATH.
    await chmod(shimPath, 0o755);
    const runtimeEnv = {
        PATH: buildRuntimePathEnv(shimDir)
    };
    const podmanDockerHost = await resolvePodmanDockerHost();
    if (podmanDockerHost) runtimeEnv.DOCKER_HOST = podmanDockerHost;
    return {
        containerRuntime: "podman",
        env: runtimeEnv
    };
}

/**
 * Decides whether the run should use docker directly or podman through a shim.
 *
 * - A `docker` binary that does not look podman-backed (or is present while
 *   no `podman` binary is found) selects plain docker with no extra env.
 * - A podman-backed `docker`, or no `docker` at all, selects podman and
 *   installs the shim under `wrapperOutputDir`.
 *
 * @param {string} wrapperOutputDir - Directory where the shim may be written.
 * @returns {Promise<{ containerRuntime: "docker" | "podman", env?: Record<string, string> }>}
 * @throws {Error} When neither docker nor podman is found on PATH.
 */
async function resolveContainerRuntime(wrapperOutputDir) {
    const dockerBinary = await resolveBinary("docker");
    const podmanBinary = await resolveBinary("podman");
    if (dockerBinary) {
        const dockerVersionCheck = await runCommand(dockerBinary, [
            "--version"
        ], {
            cwd: process.cwd(),
            timeoutMs: 5000
        });
        const dockerVersionOutput = `${dockerVersionCheck.stdout}\n${dockerVersionCheck.stderr}`;
        const dockerLooksPodman = dockerBinary.toLowerCase().includes("podman") || isPodmanBackedDockerVersionOutput(dockerVersionOutput);
        if (!dockerLooksPodman || !podmanBinary) {
            return {
                containerRuntime: "docker"
            };
        }
    } else if (!podmanBinary) {
        throw new Error("Neither docker nor podman is installed or on PATH. Install Docker Desktop or Podman, then retry.");
    }
    return createPodmanShimRuntime(wrapperOutputDir, podmanBinary);
}
|
|
463
|
+
/**
 * Checks that `docker compose version` succeeds when the resolved container
 * runtime is Podman.
 *
 * Nothing is executed for any other runtime. The probe runs with
 * `runtime.env`, a 10 second timeout, and the current working directory.
 * A non-zero exit only becomes an error when the combined stdout/stderr is
 * recognised by `isMissingComposeProviderError`; any other failure is
 * ignored and the function resolves normally.
 *
 * @param {{ containerRuntime: string, env?: object }} runtime - Resolved runtime descriptor.
 * @returns {Promise<void>}
 * @throws {Error} When the probe output indicates a missing compose provider.
 */
async function ensureComposeAvailableForPodman(runtime) {
    const usesPodman = runtime.containerRuntime === "podman";
    if (!usesPodman) {
        return;
    }
    const probeArgs = ["compose", "version"];
    const probeOptions = {
        cwd: process.cwd(),
        timeoutMs: 10000,
        env: runtime.env
    };
    const probe = await runCommand("docker", probeArgs, probeOptions);
    if (probe.exitCode !== 0) {
        // Only the "no compose provider" failure is actionable here; other
        // failures are left for the later benchmark run to surface.
        const probeOutput = `${probe.stdout}\n${probe.stderr}`;
        if (isMissingComposeProviderError(probeOutput)) {
            throw new Error("Podman compose provider is missing. Install `podman-compose` (e.g. `uv tool install podman-compose`) or `docker-compose`, then verify with `docker compose version`.");
        }
    }
}
|
|
477
|
+
/**
 * Builds a run identifier from the current time.
 *
 * Takes the ISO-8601 timestamp of "now" and replaces every ":" and "."
 * with "-", e.g. `2024-01-02T03-04-05-678Z`.
 *
 * @returns {string} Timestamp-based identifier.
 */
function createRunId() {
    const isoNow = new Date().toISOString();
    // Splitting on ":" / "." and re-joining with "-" swaps every occurrence.
    return isoNow.split(/[:.]/).join("-");
}
|
|
480
|
+
/**
 * Runs the `harbor` binary for the official terminal benchmark and records
 * the outcome on disk.
 *
 * Sequence:
 *  1. Load the bench config from `options.configPath`.
 *  2. Resolve the `harbor` binary.
 *  3. Create `<cwd>/bench/results/official-wrapper/<runId>`.
 *  4. Resolve the container runtime and, for Podman, probe compose support.
 *  5. Run harbor with the runtime env (plus PYTHONPATH when an agent import
 *     path is selected) and a timeout of override → config → 3600000 ms.
 *  6. Parse the combined output and write `harbor.stdout.log`,
 *     `harbor.stderr.log` and `summary.json` into the run directory.
 *
 * @param {object} options
 * @param {string} options.configPath - Passed to `loadOfficialBenchConfig`.
 * @param {object} options.overrides - Read for `agentImportPath`, `timeoutMs`
 *   and `dataset`; also forwarded whole to `buildHarborRunArgs`.
 * @returns {Promise<object>} The summary object that was written to `summary.json`.
 */
async function runOfficialTerminalBench(options) {
    const { configPath, overrides } = options;
    const config = await loadOfficialBenchConfig(configPath);
    const agentImportPath = overrides.agentImportPath || config.agentImportPath;
    const harborBinary = await resolveRequiredBinary("harbor");

    // Per-run output directory; also handed to the runtime resolver.
    const runId = createRunId();
    const wrapperOutputDir = join(process.cwd(), "bench", "results", "official-wrapper", runId);
    await mkdir(wrapperOutputDir, { recursive: true });

    const runtime = await resolveContainerRuntime(wrapperOutputDir);
    await ensureComposeAvailableForPodman(runtime);

    const harborArgs = buildHarborRunArgs(config, overrides);
    const timeoutMs = overrides.timeoutMs || config.timeoutMs || 3600000;

    // Copy the runtime env so the additions below never touch `runtime.env`.
    // Spreading a nullish value is a no-op, so a missing env yields `{}`.
    const harborEnv = { ...runtime.env };
    if (agentImportPath) {
        harborEnv.PYTHONPATH = buildPythonPathEnv(process.cwd(), harborEnv.PYTHONPATH);
    }
    const effectiveRuntime = { ...runtime, env: harborEnv };

    const harborRun = await runCommand(harborBinary, harborArgs, {
        cwd: process.cwd(),
        timeoutMs,
        env: effectiveRuntime.env
    });
    const parsed = parseHarborRunOutput(`${harborRun.stdout}\n${harborRun.stderr}`);

    const artifacts = {
        rawStdoutPath: join(wrapperOutputDir, "harbor.stdout.log"),
        rawStderrPath: join(wrapperOutputDir, "harbor.stderr.log"),
        summaryPath: join(wrapperOutputDir, "summary.json")
    };

    const timestamp = new Date().toISOString();
    // A raw error message is only extracted for non-zero exits (stderr first).
    let rawMessage;
    if (harborRun.exitCode !== 0) {
        rawMessage = extractHarborErrorMessage(`${harborRun.stderr}\n${harborRun.stdout}`);
    }
    const metrics = {
        resolvedTrials: parsed.resolvedTrials,
        unresolvedTrials: parsed.unresolvedTrials,
        accuracyPercent: parsed.accuracyPercent,
        passAtK: parsed.passAtK
    };
    const errorMessage = normalizeHarborFailureMessage({
        rawMessage,
        args: harborArgs,
        dataset: overrides.dataset || config.dataset
    });

    // Key order here is the order emitted into summary.json.
    const summary = {
        timestamp,
        command: {
            binary: harborBinary,
            args: harborArgs
        },
        runtime: effectiveRuntime,
        exitCode: harborRun.exitCode,
        timedOut: harborRun.timedOut,
        durationMs: harborRun.durationMs,
        errorMessage,
        metrics,
        runOutputPath: parsed.runOutputPath,
        artifacts
    };

    const pendingWrites = [
        [artifacts.rawStdoutPath, harborRun.stdout],
        [artifacts.rawStderrPath, harborRun.stderr],
        [artifacts.summaryPath, `${JSON.stringify(summary, null, 2)}\n`]
    ];
    await Promise.all(pendingWrites.map(([filePath, contents]) => writeFile(filePath, contents, "utf-8")));
    return summary;
}
|
|
546
|
+
export { buildHarborRunArgs, buildPythonPathEnv, buildRuntimePathEnv, createDockerShimScript, extractHarborErrorMessage, extractTaskNamesFromArgs, isMissingComposeProviderError, isPodmanBackedDockerVersionOutput, loadOfficialBenchConfig, normalizeHarborFailureMessage, parseDockerHostCandidate, parseHarborRunOutput, runOfficialTerminalBench };
|