create-walle 0.9.11 → 0.9.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +2 -2
- package/template/bin/dev.sh +7 -1
- package/template/bin/setup.js +53 -9
- package/template/bin/sync-images.js +53 -0
- package/template/builder-journal.md +17 -0
- package/template/claude-task-manager/api-prompts.js +98 -13
- package/template/claude-task-manager/api-reviews.js +82 -5
- package/template/claude-task-manager/db.js +32 -5
- package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
- package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
- package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
- package/template/claude-task-manager/lib/session-capture.js +421 -0
- package/template/claude-task-manager/lib/session-history.js +135 -15
- package/template/claude-task-manager/lib/session-jobs.js +10 -5
- package/template/claude-task-manager/lib/session-stream.js +87 -19
- package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
- package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
- package/template/claude-task-manager/lib/walle-session-context.js +61 -0
- package/template/claude-task-manager/lib/walle-transcript.js +176 -0
- package/template/claude-task-manager/public/css/setup.css +35 -8
- package/template/claude-task-manager/public/css/walle-session.css +56 -0
- package/template/claude-task-manager/public/css/walle.css +120 -0
- package/template/claude-task-manager/public/index.html +814 -181
- package/template/claude-task-manager/public/js/message-renderer.js +148 -19
- package/template/claude-task-manager/public/js/reviews.js +120 -62
- package/template/claude-task-manager/public/js/setup.js +75 -31
- package/template/claude-task-manager/public/js/stream-view.js +115 -55
- package/template/claude-task-manager/public/js/walle-session.js +84 -2
- package/template/claude-task-manager/public/js/walle.js +308 -54
- package/template/claude-task-manager/server.js +1092 -146
- package/template/claude-task-manager/session-integrity.js +181 -54
- package/template/claude-task-manager/session-utils.js +123 -41
- package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
- package/template/package.json +1 -1
- package/template/wall-e/adapters/ctm.js +39 -18
- package/template/wall-e/agent-runners/contract.js +17 -0
- package/template/wall-e/agent-runners/index.js +22 -0
- package/template/wall-e/agent-runtime/harness.js +212 -0
- package/template/wall-e/agent-runtime/index.js +8 -0
- package/template/wall-e/agent-runtime/registry.js +67 -0
- package/template/wall-e/agent-runtime/session-store.js +179 -0
- package/template/wall-e/agent-runtime/spawn.js +208 -0
- package/template/wall-e/api-walle.js +174 -7
- package/template/wall-e/brain.js +266 -28
- package/template/wall-e/channels/policy.js +88 -0
- package/template/wall-e/channels/registry.js +15 -1
- package/template/wall-e/channels/reply-dispatcher.js +70 -0
- package/template/wall-e/channels/session-bindings.js +51 -0
- package/template/wall-e/chat/code-review-context.js +29 -0
- package/template/wall-e/chat.js +188 -42
- package/template/wall-e/coding/acp-adapter.js +188 -0
- package/template/wall-e/coding/agent-catalog.js +129 -0
- package/template/wall-e/coding/compaction-service.js +247 -0
- package/template/wall-e/coding/execution-trace.js +3 -0
- package/template/wall-e/coding/instruction-service.js +224 -0
- package/template/wall-e/coding/model-message.js +67 -0
- package/template/wall-e/coding/permission-rules-store.js +111 -0
- package/template/wall-e/coding/permission-service.js +266 -0
- package/template/wall-e/coding/prompt-bundle.js +67 -0
- package/template/wall-e/coding/prompt-runtime.js +243 -0
- package/template/wall-e/coding/provider-transform.js +188 -0
- package/template/wall-e/coding/runtime-mode.js +132 -0
- package/template/wall-e/coding/snapshot-service.js +155 -0
- package/template/wall-e/coding/stream-processor.js +268 -0
- package/template/wall-e/coding/task-tool.js +255 -0
- package/template/wall-e/coding/tool-registry.js +361 -0
- package/template/wall-e/coding/transcript-writer.js +143 -0
- package/template/wall-e/coding/workspace-replay.js +324 -0
- package/template/wall-e/coding-context.js +4 -22
- package/template/wall-e/coding-orchestrator.js +307 -18
- package/template/wall-e/coding-prompts.js +44 -3
- package/template/wall-e/context/context-builder.js +43 -1
- package/template/wall-e/context/topic-matcher.js +1 -1
- package/template/wall-e/eval/agent-runner.js +59 -13
- package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
- package/template/wall-e/eval/benchmarks.js +100 -16
- package/template/wall-e/eval/eval-orchestrator.js +218 -8
- package/template/wall-e/eval/harvester.js +62 -5
- package/template/wall-e/eval/head-to-head.js +23 -2
- package/template/wall-e/eval/humaneval-adapter.js +30 -5
- package/template/wall-e/eval/livecodebench-adapter.js +29 -5
- package/template/wall-e/eval/manifest.js +186 -0
- package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
- package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
- package/template/wall-e/eval/session-transcripts.js +57 -4
- package/template/wall-e/eval/swebench-adapter.js +109 -3
- package/template/wall-e/evaluation/agent-router.js +53 -1
- package/template/wall-e/evaluation/coding-quorum.js +48 -1
- package/template/wall-e/evaluation/router.js +4 -2
- package/template/wall-e/evaluation/tier-selector.js +11 -1
- package/template/wall-e/extraction/contradiction.js +2 -2
- package/template/wall-e/extraction/indexer.js +2 -1
- package/template/wall-e/extraction/knowledge-extractor.js +2 -2
- package/template/wall-e/hooks/cli.js +92 -0
- package/template/wall-e/hooks/discovery.js +119 -0
- package/template/wall-e/hooks/index.js +7 -0
- package/template/wall-e/hooks/manifest.js +55 -0
- package/template/wall-e/hooks/runtime.js +84 -0
- package/template/wall-e/hooks/session-memory.js +225 -0
- package/template/wall-e/http/auth.js +6 -2
- package/template/wall-e/http/chat-api.js +54 -8
- package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
- package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
- package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
- package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
- package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
- package/template/wall-e/listening/calendar.js +3 -1
- package/template/wall-e/llm/client.js +64 -10
- package/template/wall-e/llm/google.js +39 -5
- package/template/wall-e/llm/ollama.js +1 -1
- package/template/wall-e/llm/ollama.plugin.json +1 -1
- package/template/wall-e/llm/provider-availability.js +10 -0
- package/template/wall-e/llm/provider-error.js +269 -0
- package/template/wall-e/llm/tool-adapter.js +48 -12
- package/template/wall-e/loops/boot.js +2 -1
- package/template/wall-e/loops/initiative.js +2 -2
- package/template/wall-e/loops/tasks.js +8 -47
- package/template/wall-e/loops/workspace-prompts.js +20 -0
- package/template/wall-e/mcp-server.js +442 -1
- package/template/wall-e/memory/session-ingest-service.js +159 -0
- package/template/wall-e/memory/source-indexer.js +289 -0
- package/template/wall-e/plugins/discovery.js +83 -0
- package/template/wall-e/plugins/manifest-loader.js +50 -10
- package/template/wall-e/plugins/manifest-schema.js +69 -0
- package/template/wall-e/plugins/model-catalog.js +55 -0
- package/template/wall-e/prompts/coding/base.txt +2 -0
- package/template/wall-e/prompts/coding/deepseek.txt +1 -0
- package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
- package/template/wall-e/prompts/coding/plan.txt +1 -0
- package/template/wall-e/runtime/execution-trace.js +220 -0
- package/template/wall-e/security/audit.js +266 -0
- package/template/wall-e/security/ssrf.js +236 -0
- package/template/wall-e/session-files.js +303 -0
- package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
- package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
- package/template/wall-e/skills/internal-skill-registry.js +2 -2
- package/template/wall-e/skills/script-skill-runner.js +143 -0
- package/template/wall-e/skills/skill-executor.js +5 -6
- package/template/wall-e/skills/skill-fallback.js +3 -1
- package/template/wall-e/skills/skill-harness-registry.js +7 -8
- package/template/wall-e/skills/skill-planner.js +52 -4
- package/template/wall-e/skills/slack-ingest.js +11 -3
- package/template/wall-e/sources/base.js +90 -0
- package/template/wall-e/sources/builtin.js +33 -0
- package/template/wall-e/sources/claude-code-jsonl.js +78 -0
- package/template/wall-e/sources/codex-jsonl.js +125 -0
- package/template/wall-e/sources/coding-session-utils.js +117 -0
- package/template/wall-e/sources/contract-suite.js +59 -0
- package/template/wall-e/sources/gemini-jsonl.js +85 -0
- package/template/wall-e/sources/index.js +9 -0
- package/template/wall-e/sources/jsonl-utils.js +181 -0
- package/template/wall-e/sources/record-types.js +252 -0
- package/template/wall-e/sources/registry.js +92 -0
- package/template/wall-e/sources/transforms.js +100 -0
- package/template/wall-e/sources/walle-jsonl.js +108 -0
- package/template/wall-e/tools/coding-middleware.js +31 -1
- package/template/wall-e/tools/file-tracker.js +25 -1
- package/template/wall-e/tools/local-tools.js +75 -47
- package/template/wall-e/tools/session-sharing.js +68 -1
- package/template/wall-e/tools/shell-analyzer.js +1 -1
- package/template/wall-e/tools/shell-policy.js +47 -0
- package/template/wall-e/tools/snapshot.js +42 -0
- package/template/wall-e/training/harvester.js +62 -5
- package/template/wall-e/utils/repair.js +253 -1
- package/template/website/index.html +3 -3
- package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const crypto = require('crypto');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
const { execFileSync } = require('child_process');
|
|
6
|
+
|
|
7
|
+
const DEFAULT_DATASET_VERSION = 'local-v1';
|
|
8
|
+
const DEFAULT_SCORER_VERSION = 'wall-e-eval-v2';
|
|
9
|
+
const DEFAULT_EVALUATOR_VERSION = 'wall-e-evaluator-v2';
|
|
10
|
+
|
|
11
|
+
let cachedRepoSha;
|
|
12
|
+
|
|
13
|
+
/**
 * Serialize a value to JSON text with object keys sorted, so structurally
 * equal values always produce the same string (and therefore the same hash).
 *
 * Values JSON cannot represent are treated the way JSON.stringify treats
 * them: object properties whose value is undefined, a function or a symbol
 * are omitted, and such values (or holes) inside arrays are written as
 * `null`. Objects exposing `toJSON()` (for example Date) are serialized
 * through it.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} Canonical JSON text, or undefined when the
 *   top-level value itself is not representable (same as JSON.stringify).
 */
function stableStringify(value) {
  if (value === null || typeof value !== 'object') return JSON.stringify(value);

  if (typeof value.toJSON === 'function') {
    const converted = value.toJSON();
    // A toJSON() that hands back the same object would recurse forever.
    if (converted !== value) return stableStringify(converted);
  }

  if (Array.isArray(value)) {
    // Array.from visits holes too, so sparse arrays cannot collide with dense ones.
    const items = Array.from(value, (item) => stableStringify(item) ?? 'null');
    return `[${items.join(',')}]`;
  }

  const members = [];
  for (const key of Object.keys(value).sort()) {
    const encoded = stableStringify(value[key]);
    // Skip unrepresentable members instead of emitting the text "undefined".
    if (encoded !== undefined) members.push(`${JSON.stringify(key)}:${encoded}`);
  }
  return `{${members.join(',')}}`;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Compute the hex-encoded SHA-256 digest of a value's string form.
 *
 * @param {*} value - Value to hash; null and undefined hash as the empty string.
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256(value) {
  const text = String(value ?? '');
  const hasher = crypto.createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
}
|
|
24
|
+
|
|
25
|
+
/**
 * Hash a value by its key-sorted JSON form.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} Result of sha256() applied to stableStringify(value).
 */
function hashObject(value) {
  const canonical = stableStringify(value);
  return sha256(canonical);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Resolve the current git commit (`git rev-parse HEAD`) for a directory.
 *
 * The result for the default directory (the parent of this module's
 * directory) is cached in `cachedRepoSha`, so git is spawned at most once
 * for it. Any other directory is queried on every call: the single cache
 * slot must never hand one repository's SHA to a caller asking about another.
 *
 * @param {string} [cwd] - Directory to run git in.
 * @returns {string|null} Commit SHA, or null when git fails or prints nothing.
 */
function getRepoSha(cwd = path.resolve(__dirname, '..')) {
  const usesDefaultCwd = typeof cwd === 'string'
    && path.resolve(cwd) === path.resolve(__dirname, '..');
  if (usesDefaultCwd && cachedRepoSha !== undefined) return cachedRepoSha;

  let sha;
  try {
    const stdout = execFileSync('git', ['rev-parse', 'HEAD'], {
      cwd,
      encoding: 'utf8',
      // stderr is discarded so "not a git repository" noise stays off the console.
      stdio: ['ignore', 'pipe', 'ignore'],
    });
    sha = stdout.trim() || null;
  } catch {
    // Best effort: a missing git binary, missing directory or non-repo all mean "unknown".
    sha = null;
  }

  if (usesDefaultCwd) cachedRepoSha = sha;
  return sha;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Convert a value to a JSON string without ever throwing.
 *
 * @param {*} value - Value to encode.
 * @returns {string|null|undefined} null for null/undefined input or when
 *   serialization throws; the input itself when it is already a string;
 *   otherwise whatever JSON.stringify returns.
 */
function safeJson(value) {
  if (value === null || value === undefined) return null;
  if (typeof value === 'string') return value;

  let encoded;
  try {
    encoded = JSON.stringify(value);
  } catch {
    // e.g. circular structures or BigInt values
    encoded = null;
  }
  return encoded;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Copy only the allow-listed run settings out of a config object.
 *
 * @param {object} [config] - Arbitrary run configuration.
 * @returns {object} New object holding the allow-listed keys whose value is
 *   not undefined, in allow-list order.
 */
function pickRunConfig(config = {}) {
  const picked = {};
  [
    'temperature', 'seed', 'maxTokens', 'timeoutMs', 'concurrency',
    'budgetDollars', 'suite', 'taskType', 'scoringMethod',
  ].forEach((field) => {
    const setting = config[field];
    if (setting !== undefined) picked[field] = setting;
  });
  return picked;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Build the fixed-shape subset of a benchmark that feeds the dataset hash.
 *
 * @param {object} [benchmark] - Benchmark definition.
 * @returns {object} Object with id, prompt, taskType, difficulty,
 *   expectedTraits, expectedInReply and agentExpectations; falsy fields become
 *   null (prompt becomes the empty string).
 */
function samplePayloadForHash(benchmark = {}) {
  const orNull = (candidate) => candidate || null;
  return {
    id: orNull(benchmark.id || benchmark.promptId || benchmark.benchmark_id),
    prompt: benchmark.prompt || '',
    taskType: orNull(benchmark.taskType),
    difficulty: orNull(benchmark.difficulty),
    expectedTraits: orNull(benchmark.expectedTraits),
    expectedInReply: orNull(benchmark.expectedInReply),
    agentExpectations: orNull(benchmark.agentExpectations),
  };
}
|
|
72
|
+
|
|
73
|
+
/**
 * Assemble the provenance fields recorded alongside one benchmark result.
 *
 * A full manifest object is built and serialized into `evalManifestJson`;
 * the returned object also exposes the main manifest fields individually.
 *
 * @param {object} [options]
 * @param {string} [options.suite] - Suite name; 'unknown' is used in derived values when absent.
 * @param {object} [options.benchmark] - Benchmark definition (ids, prompt, optional datasetVersion/datasetHash/modelSnapshot).
 * @param {string} [options.runId]
 * @param {string} [options.provider]
 * @param {string} [options.model]
 * @param {object} [options.runConfig] - Filtered through pickRunConfig before being stored.
 * @param {string} [options.scorerVersion]
 * @param {string} [options.evaluatorVersion]
 * @param {string} [options.scoringMethod]
 * @param {string} [options.artifactPath]
 * @param {*} [options.trusted] - Coerced to boolean; stays null when undefined.
 * @returns {object} Flat fields plus runConfigJson and evalManifestJson strings.
 */
function buildEvalManifest({
  suite,
  benchmark = {},
  runId,
  provider,
  model,
  runConfig = {},
  scorerVersion = DEFAULT_SCORER_VERSION,
  evaluatorVersion = DEFAULT_EVALUATOR_VERSION,
  scoringMethod,
  artifactPath,
  trusted,
} = {}) {
  const suiteLabel = suite || 'unknown';
  const sampleId = [
    benchmark.sampleId,
    benchmark.id,
    benchmark.promptId,
    benchmark.benchmark_id,
  ].find(Boolean) || null;
  const datasetVersion = benchmark.datasetVersion || `${suiteLabel}:${DEFAULT_DATASET_VERSION}`;

  // Only hash the sample when the benchmark does not carry its own dataset hash.
  let datasetHash = benchmark.datasetHash;
  if (!datasetHash) {
    datasetHash = hashObject({
      suite: suiteLabel,
      datasetVersion,
      sample: samplePayloadForHash(benchmark),
    });
  }

  const promptHash = sha256(benchmark.prompt || '');
  const sanitizedConfig = pickRunConfig({ ...runConfig, suite, scoringMethod });
  const repoSha = getRepoSha();

  const modelSnapshot = benchmark.modelSnapshot || model || null;
  const method = scoringMethod || null;
  const artifact = artifactPath || null;

  const manifest = {
    runId: runId || null,
    suite: suite || null,
    datasetVersion,
    datasetHash,
    sampleId,
    promptHash,
    provider: provider || null,
    model: model || null,
    modelSnapshot,
    scorerVersion,
    evaluatorVersion,
    scoringMethod: method,
    repoSha,
    runConfig: sanitizedConfig,
    artifactPath: artifact,
    trusted: trusted === undefined ? null : Boolean(trusted),
  };

  const runConfigJson = safeJson(sanitizedConfig);
  const evalManifestJson = safeJson(manifest);

  return {
    sampleId,
    datasetVersion,
    datasetHash,
    promptHash,
    repoSha,
    scorerVersion,
    evaluatorVersion,
    scoringMethod: method,
    runConfigJson,
    evalManifestJson,
    artifactPath: artifact,
    modelSnapshot,
    temperature: sanitizedConfig.temperature ?? null,
    seed: sanitizedConfig.seed ?? null,
  };
}
|
|
133
|
+
|
|
134
|
+
/**
 * Add manifest/provenance fields to a benchmark result entry.
 *
 * Values already present on `entry` take precedence; missing ones come from
 * a manifest built out of `entry` and the shared run `context`.
 *
 * @param {object} [entry] - Result row (promptId, prompt, provider, ...).
 * @param {object} [context] - Run-level defaults (suite, benchmark, runId, runConfig, ...).
 * @returns {object} Copy of `entry` with the manifest fields filled in.
 */
function decorateBenchmarkResult(entry = {}, context = {}) {
  const contextBenchmark = context.benchmark;
  const fromEntryOrContext = (field) => entry[field] || context[field];

  const benchmark = {
    ...(contextBenchmark || {}),
    id: entry.promptId || entry.benchmark_id || contextBenchmark?.id,
    prompt: entry.prompt || contextBenchmark?.prompt,
    taskType: entry.taskType || contextBenchmark?.taskType,
    difficulty: entry.difficulty || contextBenchmark?.difficulty,
  };

  const manifest = buildEvalManifest({
    suite: fromEntryOrContext('suite'),
    benchmark,
    runId: fromEntryOrContext('runId'),
    provider: fromEntryOrContext('provider'),
    model: fromEntryOrContext('model'),
    runConfig: context.runConfig || {},
    scorerVersion: fromEntryOrContext('scorerVersion'),
    evaluatorVersion: fromEntryOrContext('evaluatorVersion'),
    scoringMethod: fromEntryOrContext('scoringMethod'),
    artifactPath: fromEntryOrContext('artifactPath'),
    trusted: entry.trusted ?? context.trusted,
  });

  const decorated = { ...entry };

  // Falsy entry values fall back to the manifest for these fields.
  const truthyFallbackFields = [
    'sampleId', 'datasetVersion', 'datasetHash', 'promptHash', 'repoSha',
    'scorerVersion', 'evaluatorVersion', 'scoringMethod', 'runConfigJson',
    'evalManifestJson', 'artifactPath', 'modelSnapshot',
  ];
  for (const field of truthyFallbackFields) {
    decorated[field] = entry[field] || manifest[field];
  }

  // 0 is a meaningful temperature/seed, so only null/undefined fall back here.
  for (const field of ['temperature', 'seed']) {
    decorated[field] = entry[field] ?? manifest[field];
  }

  return decorated;
}
|
|
175
|
+
|
|
176
|
+
module.exports = {
|
|
177
|
+
DEFAULT_DATASET_VERSION,
|
|
178
|
+
DEFAULT_SCORER_VERSION,
|
|
179
|
+
DEFAULT_EVALUATOR_VERSION,
|
|
180
|
+
stableStringify,
|
|
181
|
+
sha256,
|
|
182
|
+
hashObject,
|
|
183
|
+
getRepoSha,
|
|
184
|
+
buildEvalManifest,
|
|
185
|
+
decorateBenchmarkResult,
|
|
186
|
+
};
|
|
@@ -25,9 +25,11 @@ try {
|
|
|
25
25
|
} catch {}
|
|
26
26
|
|
|
27
27
|
const path = require('path');
|
|
28
|
+
const crypto = require('crypto');
|
|
28
29
|
process.chdir(path.join(__dirname, '..'));
|
|
29
30
|
|
|
30
|
-
const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite } = require('./agent-runner');
|
|
31
|
+
const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite, resolveModelName } = require('./agent-runner');
|
|
32
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
31
33
|
const benchmarks = require('./benchmarks/coding-agent.json');
|
|
32
34
|
|
|
33
35
|
/**
|
|
@@ -227,6 +229,7 @@ async function main() {
|
|
|
227
229
|
|
|
228
230
|
console.log(`Running ${selectedBenchmarks.length} benchmarks...\n`);
|
|
229
231
|
|
|
232
|
+
const runId = crypto.randomUUID();
|
|
230
233
|
const results = [];
|
|
231
234
|
for (const benchmark of selectedBenchmarks) {
|
|
232
235
|
const startTime = Date.now();
|
|
@@ -244,7 +247,10 @@ async function main() {
|
|
|
244
247
|
});
|
|
245
248
|
|
|
246
249
|
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
|
|
250
|
+
result.runId = runId;
|
|
251
|
+
result.timestamp = new Date().toISOString();
|
|
247
252
|
results.push(result);
|
|
253
|
+
storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
|
|
248
254
|
|
|
249
255
|
console.log(` Success: ${result.success}`);
|
|
250
256
|
console.log(` Score: ${(result.score?.composite || 0).toFixed(3)}`);
|
|
@@ -263,7 +269,16 @@ async function main() {
|
|
|
263
269
|
if (result.error) console.log(` Error: ${result.error}`);
|
|
264
270
|
} catch (err) {
|
|
265
271
|
console.error(` EXCEPTION: ${err.message}`);
|
|
266
|
-
|
|
272
|
+
const result = {
|
|
273
|
+
benchmarkId: benchmark.id,
|
|
274
|
+
success: false,
|
|
275
|
+
error: err.message,
|
|
276
|
+
score: { composite: 0 },
|
|
277
|
+
runId,
|
|
278
|
+
timestamp: new Date().toISOString(),
|
|
279
|
+
};
|
|
280
|
+
results.push(result);
|
|
281
|
+
storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
|
|
267
282
|
}
|
|
268
283
|
console.log('');
|
|
269
284
|
}
|
|
@@ -312,3 +327,52 @@ main().catch(err => {
|
|
|
312
327
|
console.error('Fatal error:', err);
|
|
313
328
|
process.exit(1);
|
|
314
329
|
});
|
|
330
|
+
|
|
331
|
+
/**
 * Persist one coding-agent benchmark result through `brain.insertBenchmarkResult`.
 *
 * Best effort: does nothing when `brain` has no `insertBenchmarkResult`
 * function, and a failure while building or storing the row is logged as a
 * warning instead of propagating.
 *
 * @param {object} args
 * @param {object} args.brain - Storage facade; must expose insertBenchmarkResult().
 * @param {string} args.runId - Identifier shared by all results of this run.
 * @param {object} args.benchmark - Benchmark definition (id, prompt, difficulty, agentExpectations).
 * @param {object} args.result - Outcome produced for the benchmark.
 * @param {object} [args.provider] - Provider descriptor; `type` is recorded, 'default' when absent.
 * @param {string} [args.modelId] - Passed through resolveModelName().
 * @param {number} [args.timeoutMs] - Recorded in the run config.
 * @returns {void}
 */
function storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs }) {
  if (!brain || typeof brain.insertBenchmarkResult !== 'function') return;
  try {
    const scoringMethod = benchmark.agentExpectations?.testCommand
      ? 'agent-rubric+tests'
      : 'agent-rubric';
    // Computed once so the stored row and the manifest context cannot drift apart.
    const providerLabel = provider?.type || 'default';
    const model = resolveModelName(modelId);
    const trusted = !result.error && result.testsPassed === true;
    const runConfig = { timeoutMs, scoringMethod };

    brain.insertBenchmarkResult(decorateBenchmarkResult({
      runId,
      suite: 'coding-agent',
      promptId: benchmark.id,
      taskType: 'coding-agent',
      difficulty: benchmark.difficulty,
      provider: providerLabel,
      model,
      prompt: benchmark.prompt,
      response: result.output || '',
      traitScore: null,
      matchedTraits: [],
      compositeScore: result.score?.composite || 0,
      latencyMs: result.latencyMs,
      error: result.error,
      timestamp: result.timestamp,
      // `??` keeps a reported cost of 0 instead of turning it into "unknown".
      costDollars: result.costDollars ?? null,
      testsBefore: result.testsBefore ?? null,
      testsAfter: result.testsAfter ?? null,
      totalTests: result.totalTests ?? null,
      dimensionsJson: result.dimensionsJson || null,
      inputTokens: result.inputTokens ?? null,
      outputTokens: result.outputTokens ?? null,
      scorerVersion: DEFAULT_SCORER_VERSION,
      scoringMethod,
      trusted,
      runConfig,
    }, {
      suite: 'coding-agent',
      benchmark,
      runId,
      provider: providerLabel,
      model,
      scoringMethod,
      scorerVersion: DEFAULT_SCORER_VERSION,
      trusted,
      runConfig: { ...runConfig },
    }));
  } catch (err) {
    console.warn(`  [WARN] Failed to store benchmark result: ${err.message}`);
  }
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('node:fs');
|
|
4
|
+
const path = require('node:path');
|
|
5
|
+
const brainDefault = require('../brain');
|
|
6
|
+
const { indexMemory } = require('../memory/source-indexer');
|
|
7
|
+
|
|
8
|
+
const DEFAULT_CASES_PATH = path.join(__dirname, 'benchmarks', 'memory-retrieval.json');
|
|
9
|
+
|
|
10
|
+
/**
 * Read retrieval benchmark cases from a JSON file.
 *
 * @param {string} [filePath] - JSON file to read; defaults to DEFAULT_CASES_PATH.
 * @returns {Array} The parsed array, or an empty array when the file's
 *   top-level JSON value is not an array.
 * @throws When the file cannot be read or does not contain valid JSON.
 */
function loadMemoryRetrievalCases(filePath = DEFAULT_CASES_PATH) {
  const raw = fs.readFileSync(filePath, 'utf8');
  const cases = JSON.parse(raw);
  if (!Array.isArray(cases)) return [];
  return cases;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Insert every case's `retrieval.seedMemories` through `brain.insertMemory`
 * and pass each inserted memory to indexMemory().
 *
 * Fields shared by the stored row and its index entry (source, channel,
 * type, metadata, timestamp) are normalized once per seed, so a defaulted
 * timestamp is identical in both places.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Storage facade exposing insertMemory().
 * @param {Array} [options.cases] - Benchmark cases, each optionally carrying retrieval.seedMemories.
 * @returns {{inserted: number, indexed: number}} How many seeds were stored and indexed.
 */
function seedBenchmarkMemories({ brain = brainDefault, cases = [] } = {}) {
  let inserted = 0;
  let indexed = 0;
  for (const bench of cases) {
    const memories = bench.retrieval?.seedMemories || [];
    for (const seed of memories) {
      const shared = {
        source: seed.source || 'codex-jsonl',
        source_id: seed.source_id,
        source_channel: seed.cwd || '',
        memory_type: seed.memory_type || 'coding_session_exchange',
        content: seed.content,
        metadata: JSON.stringify(seed.metadata || {}),
        timestamp: seed.timestamp || new Date().toISOString(),
      };
      const result = brain.insertMemory({
        ...shared,
        direction: seed.direction || 'exchange',
        subject: seed.subject || seed.source_id,
        content_raw: seed.content,
        importance: seed.importance ?? 0.7,
      });
      // A falsy result means nothing was stored, so there is nothing to index.
      if (!result) continue;
      inserted++;
      indexMemory({ ...seed, ...shared, id: result.id }, { brain });
      indexed++;
    }
  }
  return { inserted, indexed };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Run one retrieval query against both search paths and merge the hits.
 *
 * Each path is asked for three times the requested number of results; the
 * merged list (direct hits first) is then cut back to the requested size.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Facade exposing searchMemories() and searchMemoryIndex().
 * @param {string} [options.query] - Search text.
 * @param {number} [options.limit=10] - Desired result count, clamped to 1..50.
 * @returns {Array} Merged results, at most `limit` entries.
 */
function searchRetrievalCase({ brain = brainDefault, query, limit = 10 } = {}) {
  const requested = Number(limit) || 10;
  const max = Math.min(Math.max(requested, 1), 50);
  const fetchLimit = max * 3;

  const direct = brain.searchMemories({ query, limit: fetchLimit });

  let indexed;
  try {
    const rows = brain.searchMemoryIndex({ query, limit: fetchLimit });
    indexed = hydrateIndexRows(brain, rows);
  } catch {
    // Index lookup is optional; fall back to the direct hits alone.
    indexed = [];
  }

  const merged = mergeById(direct, indexed);
  return merged.slice(0, max);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Score one retrieval case: did any expected source id appear in the top k?
 *
 * @param {object} bench - Case carrying retrieval.expectedSourceIds and retrieval.query.
 * @param {Array} results - Search results, best first.
 * @param {object} [options]
 * @param {number[]} [options.ks=[5, 10]] - Cut-offs to evaluate.
 * @returns {object} id, query, expected ids, returned ids, plus one boolean
 *   `hit_at_<k>` per cut-off.
 */
function scoreRetrievalCase(bench, results, { ks = [5, 10] } = {}) {
  const retrieval = bench.retrieval;
  const expected = new Set(retrieval?.expectedSourceIds || []);
  const returned = results.map((hit) => resultSourceId(hit));

  const scored = {
    id: bench.id,
    query: retrieval?.query || '',
    expected: Array.from(expected),
    returned,
  };
  ks.forEach((k) => {
    const topK = returned.slice(0, k);
    scored[`hit_at_${k}`] = topK.some((sourceId) => expected.has(sourceId));
  });
  return scored;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Run the retrieval benchmark: optionally seed memories, search once per
 * case, score each case and summarize.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Storage/search facade handed to the helpers.
 * @param {Array} [options.cases] - Cases to run; loaded from the default file when omitted.
 * @param {boolean} [options.seed=false] - Insert the cases' seed memories first.
 * @param {number} [options.limit=10] - Result limit passed to each search.
 * @returns {object} Whatever summarizeRetrievalResults() returns for the scored cases.
 */
function runMemoryRetrievalBenchmark({ brain = brainDefault, cases = loadMemoryRetrievalCases(), seed = false, limit = 10 } = {}) {
  if (seed) seedBenchmarkMemories({ brain, cases });

  const scored = Array.from(cases, (bench) => {
    // Cases without an explicit retrieval query fall back to their prompt.
    const query = bench.retrieval?.query || bench.prompt || '';
    const hits = searchRetrievalCase({ brain, query, limit });
    return scoreRetrievalCase(bench, hits);
  });

  return summarizeRetrievalResults(scored);
}
|
|
89
|
+
|
|
90
|
+
/**
 * Aggregate per-case scores into hit rates at 5 and 10.
 *
 * @param {Array<object>} results - Scored cases carrying hit_at_5 / hit_at_10.
 * @returns {{total: number, recall_at_5: number, recall_at_10: number, results: Array}}
 *   Rates are 0 for an empty result list.
 */
function summarizeRetrievalResults(results) {
  const countHits = (field) => results.reduce(
    (count, result) => (result[field] ? count + 1 : count),
    0,
  );
  // Divide by 1 for an empty list so the rates come out as 0 instead of NaN.
  const denominator = results.length || 1;
  return {
    total: results.length,
    recall_at_5: countHits('hit_at_5') / denominator,
    recall_at_10: countHits('hit_at_10') / denominator,
    results,
  };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Load the full memory rows referenced by index search hits.
 *
 * The SQL `IN (...)` lookup gives no ordering guarantee, so the loaded rows
 * are re-sorted to follow the order in which their ids first appear in
 * `rows`. This keeps the index ordering intact for the slicing and hit@k
 * scoring done by the callers (assumes `rows` arrive best-first — TODO
 * confirm against searchMemoryIndex).
 *
 * @param {object} brain - Facade whose getDb() returns a handle with prepare().all().
 * @param {Array<object>} rows - Index hits; `memory_id` names the memory to load.
 * @returns {Array<object>} Non-archived memory rows in index order; empty
 *   when there are no usable ids.
 */
function hydrateIndexRows(brain, rows) {
  if (!rows?.length) return [];
  const ids = [...new Set(rows.map((row) => row.memory_id).filter(Boolean))];
  if (!ids.length) return [];
  const placeholders = ids.map(() => '?').join(',');
  const memories = brain.getDb().prepare(`
    SELECT * FROM memories
    WHERE archived_at IS NULL AND id IN (${placeholders})
  `).all(...ids);

  // Compare ids as strings so numeric vs. text ids still line up.
  const rankById = new Map(ids.map((id, position) => [String(id), position]));
  const rankOf = (memory) => rankById.get(String(memory?.id)) ?? ids.length;
  return [...memories].sort((left, right) => rankOf(left) - rankOf(right));
}
|
|
112
|
+
|
|
113
|
+
/**
 * Concatenate result groups, keeping the first item seen for each id.
 *
 * @param {...(Iterable<object>|null|undefined)} groups - Result lists in priority order.
 * @returns {Array<object>} Items with a truthy, not-yet-seen `id`, in encounter order.
 */
function mergeById(...groups) {
  const taken = new Set();
  return groups
    .flatMap((group) => [...(group || [])])
    .filter((item) => {
      const id = item?.id;
      if (!id || taken.has(id)) return false;
      taken.add(id);
      return true;
    });
}
|
|
125
|
+
|
|
126
|
+
/**
 * Derive the source id a search result should be scored under.
 *
 * Prefers `sourceId` from the result's JSON `metadata`; otherwise uses
 * `source_id`, cut down to its first two colon-separated segments when it
 * has more than two.
 *
 * @param {object} [result] - Search result row.
 * @returns {string} Source id, or the empty string when none is available.
 */
function resultSourceId(result = {}) {
  let fromMetadata;
  try {
    fromMetadata = JSON.parse(result.metadata || '{}')?.sourceId;
  } catch {
    // Unparseable metadata: fall through to source_id.
    fromMetadata = undefined;
  }
  if (fromMetadata) return fromMetadata;

  const raw = String(result.source_id || '');
  const segments = raw.split(':');
  if (segments.length <= 2) return raw;
  return `${segments[0]}:${segments[1]}`;
}
|
|
135
|
+
|
|
136
|
+
if (require.main === module) {
|
|
137
|
+
brainDefault.initDb();
|
|
138
|
+
const summary = runMemoryRetrievalBenchmark({ seed: process.argv.includes('--seed') });
|
|
139
|
+
console.log(JSON.stringify(summary, null, 2));
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
module.exports = {
|
|
143
|
+
loadMemoryRetrievalCases,
|
|
144
|
+
resultSourceId,
|
|
145
|
+
runMemoryRetrievalBenchmark,
|
|
146
|
+
scoreRetrievalCase,
|
|
147
|
+
searchRetrievalCase,
|
|
148
|
+
seedBenchmarkMemories,
|
|
149
|
+
summarizeRetrievalResults,
|
|
150
|
+
};
|
|
@@ -13,6 +13,12 @@ const {
|
|
|
13
13
|
const DEFAULT_TRANSCRIPT_ROOTS = {
|
|
14
14
|
claude: path.join(os.homedir(), '.claude', 'projects'),
|
|
15
15
|
codex: path.join(os.homedir(), '.codex', 'sessions'),
|
|
16
|
+
walle: process.env.WALLE_SESSIONS_DIR
|
|
17
|
+
|| process.env.WALL_E_SESSIONS_DIR
|
|
18
|
+
|| process.env.WALL_E_SESSION_DIR
|
|
19
|
+
|| process.env.WALLE_SESSION_DIR
|
|
20
|
+
|| (process.env.WALLE_DEV_DIR ? path.join(process.env.WALLE_DEV_DIR, 'sessions') : '')
|
|
21
|
+
|| path.join(process.env.WALL_E_DATA_DIR || path.join(os.homedir(), '.walle'), 'sessions'),
|
|
16
22
|
};
|
|
17
23
|
|
|
18
24
|
const MIN_PROMPT_CHARS = 20;
|
|
@@ -21,9 +27,12 @@ function detectTranscriptSource(jsonlPath, events = null) {
|
|
|
21
27
|
const normalized = path.normalize(jsonlPath || '');
|
|
22
28
|
if (normalized.includes(`${path.sep}.claude${path.sep}projects${path.sep}`)) return 'claude';
|
|
23
29
|
if (normalized.includes(`${path.sep}.codex${path.sep}sessions${path.sep}`)) return 'codex';
|
|
30
|
+
if (normalized.includes(`${path.sep}.walle${path.sep}sessions${path.sep}`)) return 'walle';
|
|
24
31
|
|
|
25
32
|
const sample = events || readJsonlEvents(jsonlPath).slice(0, 50);
|
|
26
33
|
for (const evt of sample) {
|
|
34
|
+
if (evt?.walle?.schema === 'wall-e-session-v1') return 'walle';
|
|
35
|
+
if (evt?.provider === 'walle' || evt?.type === 'walle_part') return 'walle';
|
|
27
36
|
if (evt?.type === 'session_meta' && evt.payload?.originator === 'codex-tui') return 'codex';
|
|
28
37
|
if (evt?.type === 'turn_context' && evt.payload?.cwd) return 'codex';
|
|
29
38
|
if (evt?.type === 'response_item' && evt.payload) return 'codex';
|
|
@@ -57,8 +66,9 @@ function parseTranscriptJsonl(jsonlPath, { repoPath = null, minPromptChars = MIN
|
|
|
57
66
|
|
|
58
67
|
const source = detectTranscriptSource(jsonlPath, events);
|
|
59
68
|
let session = null;
|
|
60
|
-
if (source === 'claude') session = parseClaudeTranscript(jsonlPath, events, minPromptChars);
|
|
69
|
+
if (source === 'claude') session = parseClaudeTranscript(jsonlPath, events, minPromptChars, 'claude');
|
|
61
70
|
else if (source === 'codex') session = parseCodexTranscript(jsonlPath, events, minPromptChars);
|
|
71
|
+
else if (source === 'walle') session = parseWalleTranscript(jsonlPath, events, minPromptChars);
|
|
62
72
|
else session = parseUnknownTranscript(jsonlPath, events, minPromptChars);
|
|
63
73
|
|
|
64
74
|
if (!session || !session.cwd) return null;
|
|
@@ -77,8 +87,8 @@ function readJsonlEvents(jsonlPath) {
|
|
|
77
87
|
return events;
|
|
78
88
|
}
|
|
79
89
|
|
|
80
|
-
function parseClaudeTranscript(jsonlPath, events, minPromptChars) {
|
|
81
|
-
const session = baseSession(jsonlPath,
|
|
90
|
+
function parseClaudeTranscript(jsonlPath, events, minPromptChars, source = 'claude') {
|
|
91
|
+
const session = baseSession(jsonlPath, source);
|
|
82
92
|
const editedFiles = new Set();
|
|
83
93
|
|
|
84
94
|
for (const evt of events) {
|
|
@@ -152,6 +162,48 @@ function parseCodexTranscript(jsonlPath, events, minPromptChars) {
|
|
|
152
162
|
return session;
|
|
153
163
|
}
|
|
154
164
|
|
|
165
|
+
/**
 * Build a session record from the events of a Wall-E session transcript.
 *
 * Handles four event types: `session_meta` (session id), `user` (prompt
 * text, kept only when isReplayableUserText() accepts it), `assistant`
 * (reply text plus embedded tool calls) and `walle_part` events with
 * partType `tool_call`. The first cwd / gitBranch seen on any event is kept.
 * Entries that are not objects (for example a literal `null` line) are
 * skipped instead of aborting the whole transcript.
 *
 * @param {string} jsonlPath - Transcript path, passed to baseSession().
 * @param {Array} events - Parsed JSONL events in file order.
 * @param {number} minPromptChars - Threshold forwarded to isReplayableUserText().
 * @returns {object} The session object after finishSession() has run on it.
 */
function parseWalleTranscript(jsonlPath, events, minPromptChars) {
  const session = baseSession(jsonlPath, 'walle');
  const editedFiles = new Set();

  for (const evt of events) {
    if (evt === null || typeof evt !== 'object') continue;

    const ts = evt.timestamp || null;
    if (ts) setSessionTime(session, ts);
    // Applies to every event type, session_meta included.
    if (evt.cwd && !session.cwd) session.cwd = evt.cwd;
    if (evt.gitBranch && !session.gitBranch) session.gitBranch = evt.gitBranch;

    if (evt.type === 'session_meta') {
      session.sessionId = evt.sessionId || session.sessionId;
      continue;
    }

    if (evt.type === 'user') {
      const text = cleanUserText(extractMessageText(evt.message || evt));
      if (isReplayableUserText(text, minPromptChars)) session.userMessages.push(text);
      continue;
    }

    if (evt.type === 'assistant') {
      const message = evt.message || evt;
      const text = extractMessageText(message);
      if (text) session.assistantMessages.push(text);
      for (const call of harvestExtractToolCalls(message)) {
        addToolCall(session, editedFiles, call.name, call.input || {}, ts);
      }
      continue;
    }

    if (evt.type === 'walle_part' && evt.partType === 'tool_call') {
      const data = evt.data || {};
      addToolCall(session, editedFiles, data.name || data.tool || 'tool', data.input || {}, ts);
    }
  }

  finishSession(session, editedFiles, events.length);
  return session;
}
|
|
206
|
+
|
|
155
207
|
function parseUnknownTranscript(jsonlPath, events, minPromptChars) {
|
|
156
208
|
const session = baseSession(jsonlPath, 'unknown');
|
|
157
209
|
const editedFiles = new Set();
|
|
@@ -434,7 +486,8 @@ function resolveRoots(roots, source) {
|
|
|
434
486
|
if (roots) return Array.isArray(roots) ? roots : [roots];
|
|
435
487
|
if (source === 'claude') return [DEFAULT_TRANSCRIPT_ROOTS.claude];
|
|
436
488
|
if (source === 'codex') return [DEFAULT_TRANSCRIPT_ROOTS.codex];
|
|
437
|
-
return [DEFAULT_TRANSCRIPT_ROOTS.
|
|
489
|
+
if (source === 'walle') return [DEFAULT_TRANSCRIPT_ROOTS.walle];
|
|
490
|
+
return [DEFAULT_TRANSCRIPT_ROOTS.claude, DEFAULT_TRANSCRIPT_ROOTS.codex, DEFAULT_TRANSCRIPT_ROOTS.walle];
|
|
438
491
|
}
|
|
439
492
|
|
|
440
493
|
module.exports = {
|
|
@@ -15,6 +15,9 @@ const fs = require('fs');
|
|
|
15
15
|
const path = require('path');
|
|
16
16
|
const os = require('os');
|
|
17
17
|
const https = require('https');
|
|
18
|
+
const crypto = require('crypto');
|
|
19
|
+
const { resolveModelName } = require('./agent-runner');
|
|
20
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
18
21
|
|
|
19
22
|
const CACHE_DIR = path.join(os.homedir(), '.walle', 'swebench-cache');
|
|
20
23
|
const DATASET_URL =
|
|
@@ -148,6 +151,7 @@ async function runSWEBenchTask(task, options = {}) {
|
|
|
148
151
|
let agentResult = null;
|
|
149
152
|
if (runAgentLoop) {
|
|
150
153
|
try {
|
|
154
|
+
let timeoutHandle;
|
|
151
155
|
agentResult = await Promise.race([
|
|
152
156
|
runAgentLoop(mapped.prompt, {
|
|
153
157
|
brain,
|
|
@@ -155,10 +159,12 @@ async function runSWEBenchTask(task, options = {}) {
|
|
|
155
159
|
model,
|
|
156
160
|
maxTurns: 30,
|
|
157
161
|
}),
|
|
158
|
-
new Promise((_, reject) =>
|
|
159
|
-
setTimeout(() => reject(new Error('Agent timeout')), timeoutMs)
|
|
160
|
-
|
|
162
|
+
new Promise((_, reject) => {
|
|
163
|
+
timeoutHandle = setTimeout(() => reject(new Error('Agent timeout')), timeoutMs);
|
|
164
|
+
if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
|
|
165
|
+
}),
|
|
161
166
|
]);
|
|
167
|
+
if (timeoutHandle) clearTimeout(timeoutHandle);
|
|
162
168
|
} catch (err) {
|
|
163
169
|
return {
|
|
164
170
|
taskId: mapped.id,
|
|
@@ -229,10 +235,110 @@ async function loadCuratedSubset() {
|
|
|
229
235
|
return JSON.parse(fs.readFileSync(filePath, 'utf8'));
|
|
230
236
|
}
|
|
231
237
|
|
|
238
|
+
/**
 * Run the curated SWE-bench Lite subset one task at a time and aggregate the
 * results.
 *
 * Each raw task is mapped with `mapTaskToPrompt`, executed through
 * `runSWEBenchTask` and, when `brain.insertBenchmarkResult` is a function,
 * stored as a row produced by `decorateBenchmarkResult`. Storing is
 * best-effort: a failure is logged as a warning and never stops the run.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - forwarded to `runSWEBenchTask`; also used
 *   to record results when it exposes `insertBenchmarkResult`.
 * @param {Function} [options.runAgentLoop] - forwarded to `runSWEBenchTask`.
 * @param {*} [options.provider] - forwarded to `runSWEBenchTask`.
 * @param {string} [options.providerType] - provider label stored with each
 *   recorded row; `'unknown'` when absent.
 * @param {*} [options.model] - forwarded to `runSWEBenchTask`; its
 *   `resolveModelName` form is stored and reported.
 * @param {number} [options.maxTasks] - positive cap on the number of tasks.
 *   Any other value (missing, 0, negative, non-numeric) runs the full subset.
 * @param {AbortSignal} [options.signal] - checked before each task; once
 *   aborted, no further task is started.
 * @param {number} [options.timeoutMs] - forwarded to `runSWEBenchTask`.
 * @param {string} [options.runId] - run identifier; a random UUID when absent.
 * @returns {Promise<object>} `{ runId, suite, model, totalTasks, passed,
 *   passAt1, avgScore, totalCost, results }`.
 */
async function runSWEBenchSuite(options = {}) {
  const {
    brain,
    runAgentLoop,
    provider,
    providerType,
    model,
    maxTasks,
    signal,
    timeoutMs,
    runId: providedRunId,
  } = options;

  const suite = 'swebench-lite';
  const datasetVersion = 'swebench-lite:curated-30';
  const scoringMethod = 'swebench-docker-tests';
  const taskType = 'coding-agent';

  const allTasks = await loadCuratedSubset();

  // Only a positive number caps the run. Feeding a negative or non-numeric
  // value straight into slice() would silently drop tasks from the end or
  // produce an empty run.
  const limit = Number(maxTasks);
  const tasks = limit > 0 ? allTasks.slice(0, limit) : allTasks;

  const runId = providedRunId || crypto.randomUUID();
  const providerLabel = providerType || 'unknown';
  // Resolved once up front: the input is the same for every task.
  const modelName = resolveModelName(model);
  const canRecord = Boolean(brain) && typeof brain.insertBenchmarkResult === 'function';

  const results = [];
  let totalPassed = 0;

  for (const rawTask of tasks) {
    if (signal?.aborted) break;

    const mapped = mapTaskToPrompt(rawTask);
    // NOTE(review): runSWEBenchTask is handed the already-mapped task here;
    // confirm it accepts that shape (its body is outside this view).
    const result = await runSWEBenchTask(mapped, {
      brain,
      runAgentLoop,
      provider,
      model,
      timeoutMs,
    });
    results.push(result);
    if (result.success) totalPassed++;

    if (!canRecord) continue;

    try {
      const difficulty = rawTask.difficulty || 'medium';
      const trusted = !result.error;
      brain.insertBenchmarkResult(decorateBenchmarkResult({
        runId,
        suite,
        promptId: mapped.id,
        taskType,
        difficulty,
        provider: providerLabel,
        model: modelName,
        prompt: mapped.prompt,
        response: result.testOutput || '',
        traitScore: null,
        compositeScore: result.score?.composite || 0,
        latencyMs: result.elapsedMs || null,
        error: result.error || null,
        testsBefore: null,
        testsAfter: result.success ? 1 : 0,
        totalTests: result.error ? 0 : 1,
        dimensionsJson: JSON.stringify(result.score?.dimensions || {}),
        modelMetadataJson: JSON.stringify({
          repo: mapped.repo,
          baseCommit: mapped.baseCommit,
          agentTurns: result.agentTurns || 0,
          testOutput: result.testOutput || null,
        }),
        datasetVersion,
        scorerVersion: DEFAULT_SCORER_VERSION,
        scoringMethod,
        trusted,
        runConfig: { maxTasks, timeoutMs, scoringMethod },
      }, {
        suite,
        benchmark: {
          id: mapped.id,
          prompt: mapped.prompt,
          taskType,
          difficulty,
          datasetVersion,
        },
        runId,
        provider: providerLabel,
        model: modelName,
        scorerVersion: DEFAULT_SCORER_VERSION,
        scoringMethod,
        trusted,
        runConfig: { maxTasks, timeoutMs, scoringMethod },
      }));
    } catch (err) {
      // Recording stays best-effort, but a failure should be visible rather
      // than silently discarded.
      console.warn(`[swebench] failed to record result for ${mapped.id}: ${err?.message || err}`);
    }
  }

  return {
    runId,
    suite,
    model: modelName,
    totalTasks: tasks.length,
    passed: totalPassed,
    // passAt1 is relative to the planned task count, avgScore to the tasks
    // that actually ran; the two differ only when the run was aborted early.
    passAt1: tasks.length > 0 ? totalPassed / tasks.length : 0,
    avgScore: results.reduce((sum, r) => sum + (r.score?.composite || 0), 0) / Math.max(results.length, 1),
    totalCost: results.reduce((sum, r) => sum + (r.costDollars || 0), 0),
    results,
  };
}
|
|
336
|
+
|
|
232
337
|
module.exports = {
|
|
233
338
|
downloadDataset,
|
|
234
339
|
mapTaskToPrompt,
|
|
235
340
|
runSWEBenchTask,
|
|
341
|
+
runSWEBenchSuite,
|
|
236
342
|
loadCuratedSubset,
|
|
237
343
|
CACHE_DIR,
|
|
238
344
|
};
|