create-walle 0.9.11 → 0.9.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/package.json +2 -2
- package/template/bin/dev.sh +7 -1
- package/template/bin/setup.js +53 -9
- package/template/bin/sync-images.js +53 -0
- package/template/builder-journal.md +17 -0
- package/template/claude-task-manager/api-prompts.js +98 -13
- package/template/claude-task-manager/api-reviews.js +82 -5
- package/template/claude-task-manager/db.js +32 -5
- package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
- package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
- package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
- package/template/claude-task-manager/lib/session-capture.js +421 -0
- package/template/claude-task-manager/lib/session-history.js +135 -15
- package/template/claude-task-manager/lib/session-jobs.js +10 -5
- package/template/claude-task-manager/lib/session-stream.js +87 -19
- package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
- package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
- package/template/claude-task-manager/lib/walle-session-context.js +61 -0
- package/template/claude-task-manager/lib/walle-transcript.js +176 -0
- package/template/claude-task-manager/public/css/setup.css +35 -8
- package/template/claude-task-manager/public/css/walle-session.css +56 -0
- package/template/claude-task-manager/public/css/walle.css +120 -0
- package/template/claude-task-manager/public/index.html +814 -181
- package/template/claude-task-manager/public/js/message-renderer.js +148 -19
- package/template/claude-task-manager/public/js/reviews.js +120 -62
- package/template/claude-task-manager/public/js/setup.js +75 -31
- package/template/claude-task-manager/public/js/stream-view.js +115 -55
- package/template/claude-task-manager/public/js/walle-session.js +84 -2
- package/template/claude-task-manager/public/js/walle.js +308 -54
- package/template/claude-task-manager/server.js +1092 -146
- package/template/claude-task-manager/session-integrity.js +181 -54
- package/template/claude-task-manager/session-utils.js +123 -41
- package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
- package/template/package.json +1 -1
- package/template/wall-e/adapters/ctm.js +39 -18
- package/template/wall-e/agent-runners/contract.js +17 -0
- package/template/wall-e/agent-runners/index.js +22 -0
- package/template/wall-e/agent-runtime/harness.js +212 -0
- package/template/wall-e/agent-runtime/index.js +8 -0
- package/template/wall-e/agent-runtime/registry.js +67 -0
- package/template/wall-e/agent-runtime/session-store.js +179 -0
- package/template/wall-e/agent-runtime/spawn.js +208 -0
- package/template/wall-e/api-walle.js +174 -7
- package/template/wall-e/brain.js +266 -28
- package/template/wall-e/channels/policy.js +88 -0
- package/template/wall-e/channels/registry.js +15 -1
- package/template/wall-e/channels/reply-dispatcher.js +70 -0
- package/template/wall-e/channels/session-bindings.js +51 -0
- package/template/wall-e/chat/code-review-context.js +29 -0
- package/template/wall-e/chat.js +188 -42
- package/template/wall-e/coding/acp-adapter.js +188 -0
- package/template/wall-e/coding/agent-catalog.js +129 -0
- package/template/wall-e/coding/compaction-service.js +247 -0
- package/template/wall-e/coding/execution-trace.js +3 -0
- package/template/wall-e/coding/instruction-service.js +224 -0
- package/template/wall-e/coding/model-message.js +67 -0
- package/template/wall-e/coding/permission-rules-store.js +111 -0
- package/template/wall-e/coding/permission-service.js +266 -0
- package/template/wall-e/coding/prompt-bundle.js +67 -0
- package/template/wall-e/coding/prompt-runtime.js +243 -0
- package/template/wall-e/coding/provider-transform.js +188 -0
- package/template/wall-e/coding/runtime-mode.js +132 -0
- package/template/wall-e/coding/snapshot-service.js +155 -0
- package/template/wall-e/coding/stream-processor.js +268 -0
- package/template/wall-e/coding/task-tool.js +255 -0
- package/template/wall-e/coding/tool-registry.js +361 -0
- package/template/wall-e/coding/transcript-writer.js +143 -0
- package/template/wall-e/coding/workspace-replay.js +324 -0
- package/template/wall-e/coding-context.js +4 -22
- package/template/wall-e/coding-orchestrator.js +307 -18
- package/template/wall-e/coding-prompts.js +44 -3
- package/template/wall-e/context/context-builder.js +43 -1
- package/template/wall-e/context/topic-matcher.js +1 -1
- package/template/wall-e/eval/agent-runner.js +59 -13
- package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
- package/template/wall-e/eval/benchmarks.js +100 -16
- package/template/wall-e/eval/eval-orchestrator.js +218 -8
- package/template/wall-e/eval/harvester.js +62 -5
- package/template/wall-e/eval/head-to-head.js +23 -2
- package/template/wall-e/eval/humaneval-adapter.js +30 -5
- package/template/wall-e/eval/livecodebench-adapter.js +29 -5
- package/template/wall-e/eval/manifest.js +186 -0
- package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
- package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
- package/template/wall-e/eval/session-transcripts.js +57 -4
- package/template/wall-e/eval/swebench-adapter.js +109 -3
- package/template/wall-e/evaluation/agent-router.js +53 -1
- package/template/wall-e/evaluation/coding-quorum.js +48 -1
- package/template/wall-e/evaluation/router.js +4 -2
- package/template/wall-e/evaluation/tier-selector.js +11 -1
- package/template/wall-e/extraction/contradiction.js +2 -2
- package/template/wall-e/extraction/indexer.js +2 -1
- package/template/wall-e/extraction/knowledge-extractor.js +2 -2
- package/template/wall-e/hooks/cli.js +92 -0
- package/template/wall-e/hooks/discovery.js +119 -0
- package/template/wall-e/hooks/index.js +7 -0
- package/template/wall-e/hooks/manifest.js +55 -0
- package/template/wall-e/hooks/runtime.js +84 -0
- package/template/wall-e/hooks/session-memory.js +225 -0
- package/template/wall-e/http/auth.js +6 -2
- package/template/wall-e/http/chat-api.js +54 -8
- package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
- package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
- package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
- package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
- package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
- package/template/wall-e/listening/calendar.js +3 -1
- package/template/wall-e/llm/client.js +64 -10
- package/template/wall-e/llm/google.js +39 -5
- package/template/wall-e/llm/ollama.js +1 -1
- package/template/wall-e/llm/ollama.plugin.json +1 -1
- package/template/wall-e/llm/provider-availability.js +10 -0
- package/template/wall-e/llm/provider-error.js +269 -0
- package/template/wall-e/llm/tool-adapter.js +48 -12
- package/template/wall-e/loops/boot.js +2 -1
- package/template/wall-e/loops/initiative.js +2 -2
- package/template/wall-e/loops/tasks.js +8 -47
- package/template/wall-e/loops/workspace-prompts.js +20 -0
- package/template/wall-e/mcp-server.js +442 -1
- package/template/wall-e/memory/session-ingest-service.js +159 -0
- package/template/wall-e/memory/source-indexer.js +289 -0
- package/template/wall-e/plugins/discovery.js +83 -0
- package/template/wall-e/plugins/manifest-loader.js +50 -10
- package/template/wall-e/plugins/manifest-schema.js +69 -0
- package/template/wall-e/plugins/model-catalog.js +55 -0
- package/template/wall-e/prompts/coding/base.txt +2 -0
- package/template/wall-e/prompts/coding/deepseek.txt +1 -0
- package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
- package/template/wall-e/prompts/coding/plan.txt +1 -0
- package/template/wall-e/runtime/execution-trace.js +220 -0
- package/template/wall-e/security/audit.js +266 -0
- package/template/wall-e/security/ssrf.js +236 -0
- package/template/wall-e/session-files.js +303 -0
- package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
- package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
- package/template/wall-e/skills/internal-skill-registry.js +2 -2
- package/template/wall-e/skills/script-skill-runner.js +143 -0
- package/template/wall-e/skills/skill-executor.js +5 -6
- package/template/wall-e/skills/skill-fallback.js +3 -1
- package/template/wall-e/skills/skill-harness-registry.js +7 -8
- package/template/wall-e/skills/skill-planner.js +52 -4
- package/template/wall-e/skills/slack-ingest.js +11 -3
- package/template/wall-e/sources/base.js +90 -0
- package/template/wall-e/sources/builtin.js +33 -0
- package/template/wall-e/sources/claude-code-jsonl.js +78 -0
- package/template/wall-e/sources/codex-jsonl.js +125 -0
- package/template/wall-e/sources/coding-session-utils.js +117 -0
- package/template/wall-e/sources/contract-suite.js +59 -0
- package/template/wall-e/sources/gemini-jsonl.js +85 -0
- package/template/wall-e/sources/index.js +9 -0
- package/template/wall-e/sources/jsonl-utils.js +181 -0
- package/template/wall-e/sources/record-types.js +252 -0
- package/template/wall-e/sources/registry.js +92 -0
- package/template/wall-e/sources/transforms.js +100 -0
- package/template/wall-e/sources/walle-jsonl.js +108 -0
- package/template/wall-e/tools/coding-middleware.js +31 -1
- package/template/wall-e/tools/file-tracker.js +25 -1
- package/template/wall-e/tools/local-tools.js +75 -47
- package/template/wall-e/tools/session-sharing.js +68 -1
- package/template/wall-e/tools/shell-analyzer.js +1 -1
- package/template/wall-e/tools/shell-policy.js +47 -0
- package/template/wall-e/tools/snapshot.js +42 -0
- package/template/wall-e/training/harvester.js +62 -5
- package/template/wall-e/utils/repair.js +253 -1
- package/template/website/index.html +3 -3
- package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18
|
@@ -8,6 +8,7 @@ const { pLimit, getAvailableProviders } = require('./head-to-head');
|
|
|
8
8
|
const { runAgentBenchmark, runMultiTurnBenchmark } = require('./agent-runner');
|
|
9
9
|
const { createClient } = require('../llm/client');
|
|
10
10
|
const { createAnthropicFromEnv } = require('../llm/anthropic');
|
|
11
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
11
12
|
|
|
12
13
|
// ============================================================
|
|
13
14
|
// Benchmark suite loader
|
|
@@ -22,14 +23,89 @@ const SUITE_FILES = {
|
|
|
22
23
|
'chat-eval': 'chat-eval.json',
|
|
23
24
|
'reasoning': 'reasoning.json',
|
|
24
25
|
'memory-retrieval': 'memory-retrieval.json',
|
|
25
|
-
|
|
26
|
-
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
/**
 * Static descriptions of the benchmark suites that are executed by an adapter
 * module instead of being loaded from a JSON file.
 *
 * Keys are suite names. `count` is `null` where this table does not record a
 * fixed number of tasks for the suite.
 */
const ADAPTER_SUITE_METADATA = {
  'humaneval-plus': {
    name: 'humaneval-plus',
    count: null,
    taskTypes: ['coding'],
    difficulties: ['easy', 'medium', 'hard'],
    adapter: true,
    description: 'EvalPlus HumanEval+ Python function-generation tasks',
  },
  livecodebench: {
    name: 'livecodebench',
    count: null,
    taskTypes: ['coding'],
    difficulties: ['easy', 'medium', 'hard'],
    adapter: true,
    description: 'LiveCodeBench code-generation tasks with date filtering',
  },
  'swebench-lite': {
    name: 'swebench-lite',
    count: 30,
    taskTypes: ['coding-agent'],
    difficulties: ['medium', 'hard'],
    adapter: true,
    description: 'Curated SWE-bench Lite issue-fixing tasks',
  },
};

/**
 * Report whether a suite name refers to an adapter-backed suite.
 *
 * Only own keys of ADAPTER_SUITE_METADATA count. A plain property lookup would
 * also match names inherited from Object.prototype (for example 'constructor'
 * or 'toString'), which would misroute those names to the adapter path.
 *
 * @param {string} suiteName - Suite name to test.
 * @returns {boolean} True when the name is a key of ADAPTER_SUITE_METADATA.
 */
function isAdapterSuite(suiteName) {
  return Object.prototype.hasOwnProperty.call(ADAPTER_SUITE_METADATA, suiteName);
}

/**
 * List the metadata of every adapter-backed suite.
 *
 * Each entry is a fresh object and its array fields are copied as well, so a
 * caller that mutates the returned data cannot alter ADAPTER_SUITE_METADATA.
 *
 * @returns {Array<object>} One metadata object per adapter suite, in table order.
 */
function listAdapterSuites() {
  return Object.values(ADAPTER_SUITE_METADATA).map((suite) => ({
    ...suite,
    taskTypes: [...suite.taskTypes],
    difficulties: [...suite.difficulties],
  }));
}
|
|
62
|
+
|
|
63
|
+
/**
 * Built-in runner functions for adapter-backed suites, keyed by suite name.
 *
 * Every runner takes a single context object and forwards the relevant fields
 * to the suite function of its adapter module. The adapter module is required
 * inside the runner, i.e. only when that runner is actually invoked.
 */
const DEFAULT_ADAPTER_RUNNERS = {
  'humaneval-plus': async (ctx) => {
    const { providerInfo } = ctx;
    const runSuite = require('./humaneval-adapter').runHumanEvalSuite;
    const suiteOptions = {
      brain: ctx.brain,
      providerType: providerInfo.provider,
      config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
      model: ctx.model,
      runId: ctx.runId,
      maxTasks: ctx.maxTasks,
      signal: ctx.signal,
    };
    return runSuite(suiteOptions);
  },

  livecodebench: async (ctx) => {
    const { providerInfo } = ctx;
    const runSuite = require('./livecodebench-adapter').runLiveCodeBenchSuite;
    const suiteOptions = {
      brain: ctx.brain,
      providerType: providerInfo.provider,
      config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
      model: ctx.model,
      runId: ctx.runId,
      maxTasks: ctx.maxTasks,
      afterDate: ctx.afterDate,
      signal: ctx.signal,
    };
    return runSuite(suiteOptions);
  },

  'swebench-lite': async (ctx) => {
    const { providerInfo } = ctx;
    const runSuite = require('./swebench-adapter').runSWEBenchSuite;
    // This adapter receives the already-built client rather than apiKey/baseUrl.
    const suiteOptions = {
      brain: ctx.brain,
      provider: providerInfo.client,
      providerType: providerInfo.provider,
      model: ctx.model,
      runId: ctx.runId,
      maxTasks: ctx.maxTasks,
      signal: ctx.signal,
      runAgentLoop: ctx.runAgentLoop,
      timeoutMs: ctx.timeoutMs,
    };
    return runSuite(suiteOptions);
  },
};
|
|
28
104
|
|
|
29
105
|
function loadSuite(suiteName) {
|
|
106
|
+
if (isAdapterSuite(suiteName)) return [];
|
|
30
107
|
if (!(suiteName in SUITE_FILES)) throw new Error(`Unknown suite: ${suiteName}`);
|
|
31
108
|
const file = SUITE_FILES[suiteName];
|
|
32
|
-
if (!file) return []; // adapter-based suite (humaneval-plus, livecodebench) — loaded externally
|
|
33
109
|
const filePath = path.join(BENCHMARKS_DIR, file);
|
|
34
110
|
if (!fs.existsSync(filePath)) throw new Error(`Suite file not found: ${filePath}`);
|
|
35
111
|
return JSON.parse(fs.readFileSync(filePath, 'utf8'));
|
|
@@ -83,6 +159,7 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
83
159
|
this.totalSpent = 0;
|
|
84
160
|
this.running = false;
|
|
85
161
|
this.aborted = false;
|
|
162
|
+
this.adapterRunners = { ...DEFAULT_ADAPTER_RUNNERS, ...(options.adapterRunners || {}) };
|
|
86
163
|
}
|
|
87
164
|
|
|
88
165
|
/**
|
|
@@ -94,12 +171,16 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
94
171
|
* @param {Function} params.runAgentLoop - Agent loop function for coding benchmarks
|
|
95
172
|
* @returns {Promise<object>} Run summary
|
|
96
173
|
*/
|
|
97
|
-
async run({ suite, models, benchmarkIds, runAgentLoop }) {
|
|
174
|
+
async run({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
|
|
98
175
|
if (this.running) throw new Error('Orchestrator is already running');
|
|
99
176
|
this.running = true;
|
|
100
177
|
this.aborted = false;
|
|
101
178
|
|
|
102
179
|
try {
|
|
180
|
+
if (isAdapterSuite(suite)) {
|
|
181
|
+
return await this._runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate });
|
|
182
|
+
}
|
|
183
|
+
|
|
103
184
|
// 1. Load benchmarks
|
|
104
185
|
const benchmarks = suite === 'all' ? loadAllSuites() : loadSuite(suite);
|
|
105
186
|
const filtered = benchmarkIds && benchmarkIds.length > 0
|
|
@@ -330,7 +411,7 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
330
411
|
} else {
|
|
331
412
|
client = createClient(providerType, { apiKey, baseUrl });
|
|
332
413
|
}
|
|
333
|
-
map[model] = { client, provider: providerType, registryId };
|
|
414
|
+
map[model] = { client, provider: providerType, registryId, apiKey, baseUrl };
|
|
334
415
|
} catch (err) {
|
|
335
416
|
this.emit('error', { benchmarkId: null, model, error: `Failed to create client: ${err.message}` });
|
|
336
417
|
}
|
|
@@ -338,6 +419,102 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
338
419
|
return map;
|
|
339
420
|
}
|
|
340
421
|
|
|
422
|
+
async _runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
|
|
423
|
+
const runner = this.adapterRunners[suite];
|
|
424
|
+
if (!runner) throw new Error(`No adapter runner configured for suite: ${suite}`);
|
|
425
|
+
|
|
426
|
+
if (!models || models.length === 0) {
|
|
427
|
+
const summary = { runId: this.runId, status: 'error', error: 'No models specified', models: {}, totalBenchmarks: 0, totalSpent: 0 };
|
|
428
|
+
this.emit('error', { benchmarkId: null, model: null, error: 'No models specified. Provide at least one model.' });
|
|
429
|
+
this.emit('run-complete', { runId: this.runId, summary });
|
|
430
|
+
return summary;
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
const providerMap = this._resolveProviders(models);
|
|
434
|
+
const adapterMaxTasks = maxTasks || (benchmarkIds && benchmarkIds.length ? benchmarkIds.length : undefined);
|
|
435
|
+
const limit = pLimit(this.concurrency);
|
|
436
|
+
const modelResults = {};
|
|
437
|
+
|
|
438
|
+
const orchestrator = this;
|
|
439
|
+
const adapterSignal = { get aborted() { return orchestrator.aborted; } };
|
|
440
|
+
const tasks = models.map((model) => limit(async () => {
|
|
441
|
+
const providerInfo = providerMap[model];
|
|
442
|
+
if (!providerInfo) {
|
|
443
|
+
this.emit('error', { benchmarkId: suite, model, error: `No provider found for model: ${model}` });
|
|
444
|
+
return null;
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
this.emit('benchmark-start', {
|
|
448
|
+
benchmarkId: suite,
|
|
449
|
+
model,
|
|
450
|
+
startedAt: new Date().toISOString(),
|
|
451
|
+
});
|
|
452
|
+
|
|
453
|
+
let result;
|
|
454
|
+
try {
|
|
455
|
+
result = await runner({
|
|
456
|
+
brain: this.brain,
|
|
457
|
+
providerInfo,
|
|
458
|
+
model,
|
|
459
|
+
runId: this.runId,
|
|
460
|
+
maxTasks: adapterMaxTasks,
|
|
461
|
+
afterDate,
|
|
462
|
+
signal: adapterSignal,
|
|
463
|
+
runAgentLoop,
|
|
464
|
+
timeoutMs: this.timeoutMs,
|
|
465
|
+
});
|
|
466
|
+
} catch (err) {
|
|
467
|
+
result = {
|
|
468
|
+
suite,
|
|
469
|
+
model,
|
|
470
|
+
totalTasks: 0,
|
|
471
|
+
avgScore: 0,
|
|
472
|
+
totalCost: 0,
|
|
473
|
+
error: err.message,
|
|
474
|
+
results: [],
|
|
475
|
+
};
|
|
476
|
+
this.emit('error', { benchmarkId: suite, model, error: err.message });
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
const totalCost = result.totalCost || 0;
|
|
480
|
+
if (!isLocalModel(providerInfo.provider)) {
|
|
481
|
+
this.spent[model] = (this.spent[model] || 0) + totalCost;
|
|
482
|
+
this.totalSpent += totalCost;
|
|
483
|
+
}
|
|
484
|
+
modelResults[model] = result;
|
|
485
|
+
this.emit('model-complete', {
|
|
486
|
+
model,
|
|
487
|
+
avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
|
|
488
|
+
totalCost: Math.round(totalCost * 1_000_000) / 1_000_000,
|
|
489
|
+
benchmarksRun: result.totalTasks || result.results?.length || 0,
|
|
490
|
+
});
|
|
491
|
+
return result;
|
|
492
|
+
}));
|
|
493
|
+
|
|
494
|
+
await Promise.all(tasks);
|
|
495
|
+
|
|
496
|
+
const summary = {
|
|
497
|
+
runId: this.runId,
|
|
498
|
+
status: this.aborted ? 'aborted' : 'complete',
|
|
499
|
+
totalBenchmarks: Object.values(modelResults).reduce((s, r) => s + (r.totalTasks || r.results?.length || 0), 0),
|
|
500
|
+
totalSpent: Math.round(this.totalSpent * 1_000_000) / 1_000_000,
|
|
501
|
+
models: {},
|
|
502
|
+
suite,
|
|
503
|
+
adapter: true,
|
|
504
|
+
};
|
|
505
|
+
for (const [model, result] of Object.entries(modelResults)) {
|
|
506
|
+
summary.models[model] = {
|
|
507
|
+
avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
|
|
508
|
+
totalCost: Math.round((result.totalCost || 0) * 1_000_000) / 1_000_000,
|
|
509
|
+
benchmarksRun: result.totalTasks || result.results?.length || 0,
|
|
510
|
+
errors: (result.results || []).filter(r => r.error).length + (result.error ? 1 : 0),
|
|
511
|
+
};
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
this.emit('run-complete', { runId: this.runId, summary });
|
|
515
|
+
return summary;
|
|
516
|
+
}
|
|
517
|
+
|
|
341
518
|
/**
|
|
342
519
|
* Get set of completed benchmark keys for this runId (for resume).
|
|
343
520
|
*/
|
|
@@ -363,7 +540,10 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
363
540
|
if (!this.brain || typeof this.brain.insertBenchmarkResult !== 'function') return;
|
|
364
541
|
|
|
365
542
|
try {
|
|
366
|
-
|
|
543
|
+
const scoringMethod = item.benchmark.agentExpectations?.testCommand
|
|
544
|
+
? 'agent-rubric+tests'
|
|
545
|
+
: 'agent-rubric';
|
|
546
|
+
this.brain.insertBenchmarkResult(decorateBenchmarkResult({
|
|
367
547
|
runId: this.runId,
|
|
368
548
|
suite: item.benchmark._suite || 'coding-agent',
|
|
369
549
|
promptId: item.benchmark.id,
|
|
@@ -384,7 +564,31 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
384
564
|
dimensionsJson: result.dimensionsJson || null,
|
|
385
565
|
inputTokens: result.inputTokens ?? null,
|
|
386
566
|
outputTokens: result.outputTokens ?? null,
|
|
387
|
-
|
|
567
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
568
|
+
scoringMethod,
|
|
569
|
+
trusted: !result.error && result.testsPassed === true,
|
|
570
|
+
runConfig: {
|
|
571
|
+
timeoutMs: this.timeoutMs,
|
|
572
|
+
concurrency: this.concurrency,
|
|
573
|
+
budgetDollars: this.budgetDollars,
|
|
574
|
+
scoringMethod,
|
|
575
|
+
},
|
|
576
|
+
}, {
|
|
577
|
+
suite: item.benchmark._suite || 'coding-agent',
|
|
578
|
+
benchmark: item.benchmark,
|
|
579
|
+
runId: this.runId,
|
|
580
|
+
provider: item.provider.provider,
|
|
581
|
+
model: item.model,
|
|
582
|
+
scoringMethod,
|
|
583
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
584
|
+
trusted: !result.error && result.testsPassed === true,
|
|
585
|
+
runConfig: {
|
|
586
|
+
timeoutMs: this.timeoutMs,
|
|
587
|
+
concurrency: this.concurrency,
|
|
588
|
+
budgetDollars: this.budgetDollars,
|
|
589
|
+
scoringMethod,
|
|
590
|
+
},
|
|
591
|
+
}));
|
|
388
592
|
} catch { /* non-fatal */ }
|
|
389
593
|
}
|
|
390
594
|
|
|
@@ -427,4 +631,10 @@ class EvalOrchestrator extends EventEmitter {
|
|
|
427
631
|
}
|
|
428
632
|
}
|
|
429
633
|
|
|
430
|
-
module.exports = {
|
|
634
|
+
module.exports = {
|
|
635
|
+
EvalOrchestrator,
|
|
636
|
+
ADAPTER_SUITE_METADATA,
|
|
637
|
+
isAdapterSuite,
|
|
638
|
+
listAdapterSuites,
|
|
639
|
+
loadSuite,
|
|
640
|
+
};
|
|
@@ -5,6 +5,17 @@ const path = require('path');
|
|
|
5
5
|
const { createHash } = require('crypto');
|
|
6
6
|
const { execFileSync } = require('child_process');
|
|
7
7
|
|
|
8
|
+
// Cached module object for the Claude Desktop session reader; null until a load succeeds.
let claudeDesktopSessions = null;

/**
 * Return the Claude Desktop session reader module, loading it on first use.
 *
 * A failed load is not cached, so the next call attempts the require again.
 *
 * @returns {object|null} The reader module, or null when it cannot be loaded.
 */
function getClaudeDesktopSessions() {
  if (!claudeDesktopSessions) {
    try {
      claudeDesktopSessions = require('../../claude-task-manager/lib/claude-desktop-sessions');
    } catch {
      // The reader is optional: callers treat null as "nothing to harvest".
      return null;
    }
  }
  return claudeDesktopSessions;
}
|
|
18
|
+
|
|
8
19
|
// --- Task type classification ---
|
|
9
20
|
|
|
10
21
|
function classifyTaskType(content) {
|
|
@@ -86,6 +97,46 @@ async function harvestClaudeCodeSessions(since) {
|
|
|
86
97
|
return samples;
|
|
87
98
|
}
|
|
88
99
|
|
|
100
|
+
// --- Claude Desktop Session Harvesting ---
|
|
101
|
+
|
|
102
|
+
/**
 * Build training samples from Claude Desktop sessions.
 *
 * Every adjacent (user, assistant) message pair whose texts are both at least
 * 20 characters long becomes one sample. Sessions whose `updatedAt` is not
 * later than `since` are skipped. Returns an empty list when the session
 * reader module is unavailable or reports no sessions.
 *
 * Malformed data is skipped rather than thrown on: null session or message
 * entries are ignored, and a non-string `text` is treated as empty.
 *
 * @param {string} [since] - Cut-off compared directly against `session.updatedAt`.
 *   NOTE(review): assumes both use the same comparable format (e.g. ISO strings) — confirm.
 * @returns {Promise<object[]>} Harvested samples.
 */
async function harvestClaudeDesktopSessions(since) {
  const reader = getClaudeDesktopSessions();
  if (!reader) return [];

  const sessions = reader.listSessions();
  if (sessions == null) return [];

  const MIN_TEXT_LENGTH = 20;
  const asText = (value) => (typeof value === 'string' ? value : '');
  const samples = [];

  for (const session of sessions) {
    if (!session) continue;
    if (since && session.updatedAt && session.updatedAt <= since) continue;
    const messages = Array.isArray(session.messages) ? session.messages : [];
    for (let i = 0; i < messages.length - 1; i++) {
      const userMsg = messages[i];
      const assistantMsg = messages[i + 1];
      if (!userMsg || !assistantMsg) continue;
      if (userMsg.role !== 'user' || assistantMsg.role !== 'assistant') continue;
      const userContent = asText(userMsg.text);
      const assistantContent = asText(assistantMsg.text);
      if (userContent.length < MIN_TEXT_LENGTH) continue;
      if (assistantContent.length < MIN_TEXT_LENGTH) continue;

      samples.push({
        // The message index is part of the hash input, so a prompt repeated
        // within one session still gets a distinct id.
        id: contentHash('claude-desktop', `${session.uuid}:${i}:${userContent}`),
        source: 'claude-desktop',
        session_id: session.uuid,
        timestamp: userMsg.timestamp || session.updatedAt || session.createdAt || new Date().toISOString(),
        task_type: classifyTaskType(userContent),
        prompt: userContent,
        response: assistantContent,
        tool_calls: [],
        outcome: 'unknown',
        outcome_signal: { git_committed: false, git_diff: null, task_status: null, user_corrected: false },
        model: session.model || 'unknown',
        quality_label: 0.5,
      });
    }
  }
  return samples;
}
|
|
139
|
+
|
|
89
140
|
// --- Codex Session Harvesting ---
|
|
90
141
|
|
|
91
142
|
async function harvestCodexSessions(since) {
|
|
@@ -144,12 +195,13 @@ async function harvestCodexSessions(since) {
|
|
|
144
195
|
|
|
145
196
|
// --- CTM Session Harvesting ---
|
|
146
197
|
|
|
147
|
-
async function harvestCtmSessions(since) {
|
|
148
|
-
const dataDir = process.env.WALL_E_DATA_DIR || path.join(process.env.HOME, '.walle', 'data');
|
|
198
|
+
async function harvestCtmSessions(since, dataDirOverride = null) {
|
|
199
|
+
const dataDir = dataDirOverride || process.env.WALL_E_DATA_DIR || path.join(process.env.HOME, '.walle', 'data');
|
|
149
200
|
const ctmDbPath = path.join(dataDir, 'task-manager.db');
|
|
150
201
|
if (!fs.existsSync(ctmDbPath)) return [];
|
|
151
202
|
|
|
152
|
-
|
|
203
|
+
let Database;
|
|
204
|
+
try { Database = require('better-sqlite3'); } catch { return []; }
|
|
153
205
|
let ctmDb;
|
|
154
206
|
try {
|
|
155
207
|
ctmDb = new Database(ctmDbPath, { readonly: true, fileMustExist: true });
|
|
@@ -534,10 +586,11 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
|
|
|
534
586
|
|
|
535
587
|
// Harvest from each source
|
|
536
588
|
const claudeSamples = await harvestClaudeCodeSessions(getSince('claude-code'));
|
|
589
|
+
const claudeDesktopSamples = await harvestClaudeDesktopSessions(getSince('claude-desktop'));
|
|
537
590
|
const codexSamples = await harvestCodexSessions(getSince('codex'));
|
|
538
591
|
const chatSamples = await harvestWalleChat(brain, getSince('walle-chat'));
|
|
539
592
|
const taskSamples = await harvestWalleTasks(brain, getSince('walle-task'));
|
|
540
|
-
const ctmSamples = await harvestCtmSessions(getSince('ctm-sessions'));
|
|
593
|
+
const ctmSamples = await harvestCtmSessions(getSince('ctm-sessions'), dataDir);
|
|
541
594
|
|
|
542
595
|
// Harvest coding agent sessions and store in brain
|
|
543
596
|
const codingAgentSessions = await harvestCodingAgentSessions(getSince('coding-agent'));
|
|
@@ -549,7 +602,7 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
|
|
|
549
602
|
if (typeof brain.insertCodingSession === 'function') brain.insertCodingSession(session);
|
|
550
603
|
}
|
|
551
604
|
|
|
552
|
-
allSamples.push(...claudeSamples, ...codexSamples, ...chatSamples, ...taskSamples, ...ctmSamples);
|
|
605
|
+
allSamples.push(...claudeSamples, ...claudeDesktopSamples, ...codexSamples, ...chatSamples, ...taskSamples, ...ctmSamples);
|
|
553
606
|
|
|
554
607
|
// Deduplicate by content hash
|
|
555
608
|
const seen = new Set();
|
|
@@ -584,6 +637,9 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
|
|
|
584
637
|
if (claudeSamples.length > 0) {
|
|
585
638
|
brain.updateHarvestState('claude-code', { lastProcessedAt: now, totalHarvested: claudeSamples.length });
|
|
586
639
|
}
|
|
640
|
+
if (claudeDesktopSamples.length > 0) {
|
|
641
|
+
brain.updateHarvestState('claude-desktop', { lastProcessedAt: now, totalHarvested: claudeDesktopSamples.length });
|
|
642
|
+
}
|
|
587
643
|
if (codexSamples.length > 0) {
|
|
588
644
|
brain.updateHarvestState('codex', { lastProcessedAt: now, totalHarvested: codexSamples.length });
|
|
589
645
|
}
|
|
@@ -610,6 +666,7 @@ module.exports = {
|
|
|
610
666
|
classifyTaskType,
|
|
611
667
|
contentHash,
|
|
612
668
|
harvestClaudeCodeSessions,
|
|
669
|
+
harvestClaudeDesktopSessions,
|
|
613
670
|
harvestCodexSessions,
|
|
614
671
|
harvestCtmSessions,
|
|
615
672
|
harvestWalleChat,
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
const { randomUUID } = require('node:crypto');
|
|
4
4
|
const { createClient } = require('../llm/client');
|
|
5
5
|
const { heuristicScore } = require('./evaluator');
|
|
6
|
-
const {
|
|
6
|
+
const { scoreTraitsDetailed, TRAIT_MATCHERS, UNSCORABLE_TRAITS } = require('./benchmarks');
|
|
7
7
|
|
|
8
8
|
// ============================================================
|
|
9
9
|
// Concurrency limiter (inline, no external dependency)
|
|
@@ -122,7 +122,28 @@ function getAvailableProviders(brain) {
|
|
|
122
122
|
|
|
123
123
|
/**
 * Score a response against a list of expected traits.
 *
 * Traits listed in UNSCORABLE_TRAITS are ignored. Traits that have an entry in
 * TRAIT_MATCHERS are scored together by scoreTraitsDetailed. Every other trait
 * counts as matched when the lower-cased response contains the lower-cased
 * trait text.
 *
 * @param {string} response - Model response text.
 * @param {string[]} expectedTraits - Traits the response is expected to show.
 * @returns {number|null} matched / scored, 0 when nothing was scorable,
 *   or null when no traits were given.
 */
function traitScore(response, expectedTraits) {
  if (!expectedTraits || expectedTraits.length === 0) return null;

  const haystack = String(response || '').toLowerCase();
  const matcherBacked = [];
  let hits = 0;
  let total = 0;

  for (const trait of expectedTraits) {
    if (!UNSCORABLE_TRAITS.has(trait)) {
      if (TRAIT_MATCHERS[trait]) {
        // Deferred: these are scored in one batch after the loop.
        matcherBacked.push(trait);
      } else {
        total += 1;
        if (haystack.includes(String(trait).toLowerCase())) hits += 1;
      }
    }
  }

  if (matcherBacked.length !== 0) {
    const { matched, scoredCount } = scoreTraitsDetailed(response, matcherBacked);
    hits += matched.length;
    total += scoredCount;
  }

  if (total > 0) return hits / total;
  return 0;
}
|
|
127
148
|
|
|
128
149
|
// ============================================================
|
|
@@ -6,6 +6,7 @@ const crypto = require('crypto');
|
|
|
6
6
|
const { execFileSync } = require('child_process');
|
|
7
7
|
const { createClient } = require('../llm/client');
|
|
8
8
|
const { resolveModelName } = require('./agent-runner');
|
|
9
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
9
10
|
|
|
10
11
|
const SUITE_NAME = 'humaneval-plus';
|
|
11
12
|
const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
|
|
@@ -192,7 +193,8 @@ async function runHumanEvalTask(task, options = {}) {
|
|
|
192
193
|
costDollars,
|
|
193
194
|
response: response.slice(0, 2000),
|
|
194
195
|
code: code.slice(0, 2000),
|
|
195
|
-
error: error ||
|
|
196
|
+
error: error || null,
|
|
197
|
+
testError: testError || null,
|
|
196
198
|
usage,
|
|
197
199
|
};
|
|
198
200
|
}
|
|
@@ -222,13 +224,13 @@ function scoreHumanEvalQuality(code) {
|
|
|
222
224
|
* @returns {Promise<object>} Suite results
|
|
223
225
|
*/
|
|
224
226
|
async function runHumanEvalSuite(options = {}) {
|
|
225
|
-
const { brain, providerType, config, model, maxTasks, signal } = options;
|
|
227
|
+
const { brain, providerType, config, model, maxTasks, signal, runId: providedRunId } = options;
|
|
226
228
|
|
|
227
229
|
const allTasks = await loadHumanEvalDataset();
|
|
228
230
|
const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
|
|
229
231
|
|
|
230
232
|
const client = createClient(providerType || 'anthropic', config || {});
|
|
231
|
-
const runId = crypto.randomUUID();
|
|
233
|
+
const runId = providedRunId || crypto.randomUUID();
|
|
232
234
|
const results = [];
|
|
233
235
|
let totalPassed = 0;
|
|
234
236
|
|
|
@@ -247,7 +249,8 @@ async function runHumanEvalSuite(options = {}) {
|
|
|
247
249
|
// Store in brain
|
|
248
250
|
if (brain && typeof brain.insertBenchmarkResult === 'function') {
|
|
249
251
|
try {
|
|
250
|
-
|
|
252
|
+
const scoringMethod = 'executable-tests';
|
|
253
|
+
brain.insertBenchmarkResult(decorateBenchmarkResult({
|
|
251
254
|
runId,
|
|
252
255
|
suite: SUITE_NAME,
|
|
253
256
|
promptId: task.task_id,
|
|
@@ -266,7 +269,29 @@ async function runHumanEvalSuite(options = {}) {
|
|
|
266
269
|
testsAfter: result.passed ? 1 : 0,
|
|
267
270
|
totalTests: 1,
|
|
268
271
|
dimensionsJson: JSON.stringify(result.score.dimensions),
|
|
269
|
-
|
|
272
|
+
modelMetadataJson: JSON.stringify({ testError: result.testError || null }),
|
|
273
|
+
datasetVersion: 'humaneval-plus:evalplus-master',
|
|
274
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
275
|
+
scoringMethod,
|
|
276
|
+
trusted: !result.error,
|
|
277
|
+
runConfig: { maxTasks, scoringMethod },
|
|
278
|
+
}, {
|
|
279
|
+
suite: SUITE_NAME,
|
|
280
|
+
benchmark: {
|
|
281
|
+
id: task.task_id,
|
|
282
|
+
prompt: task.prompt,
|
|
283
|
+
taskType: 'coding',
|
|
284
|
+
difficulty: taskDifficulty(task.task_id),
|
|
285
|
+
datasetVersion: 'humaneval-plus:evalplus-master',
|
|
286
|
+
},
|
|
287
|
+
runId,
|
|
288
|
+
provider: providerType || 'unknown',
|
|
289
|
+
model: resolveModelName(model),
|
|
290
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
291
|
+
scoringMethod,
|
|
292
|
+
trusted: !result.error,
|
|
293
|
+
runConfig: { maxTasks, scoringMethod },
|
|
294
|
+
}));
|
|
270
295
|
} catch {}
|
|
271
296
|
}
|
|
272
297
|
}
|
|
@@ -7,6 +7,7 @@ const crypto = require('crypto');
|
|
|
7
7
|
const { execFileSync } = require('child_process');
|
|
8
8
|
const { createClient } = require('../llm/client');
|
|
9
9
|
const { resolveModelName } = require('./agent-runner');
|
|
10
|
+
const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
|
|
10
11
|
|
|
11
12
|
const SUITE_NAME = 'livecodebench';
|
|
12
13
|
const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
|
|
@@ -187,7 +188,7 @@ async function runLiveCodeBenchTask(task, options = {}) {
|
|
|
187
188
|
* Run the full LiveCodeBench suite.
|
|
188
189
|
*/
|
|
189
190
|
async function runLiveCodeBenchSuite(options = {}) {
|
|
190
|
-
const { brain, providerType, config, model, maxTasks, signal, afterDate } = options;
|
|
191
|
+
const { brain, providerType, config, model, maxTasks, signal, afterDate, runId: providedRunId } = options;
|
|
191
192
|
|
|
192
193
|
let allTasks = await loadLiveCodeBenchDataset();
|
|
193
194
|
|
|
@@ -201,7 +202,7 @@ async function runLiveCodeBenchSuite(options = {}) {
|
|
|
201
202
|
|
|
202
203
|
const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
|
|
203
204
|
const client = createClient(providerType || 'anthropic', config || {});
|
|
204
|
-
const runId = crypto.randomUUID();
|
|
205
|
+
const runId = providedRunId || crypto.randomUUID();
|
|
205
206
|
const results = [];
|
|
206
207
|
let totalPassed = 0;
|
|
207
208
|
|
|
@@ -219,7 +220,9 @@ async function runLiveCodeBenchSuite(options = {}) {
|
|
|
219
220
|
// Store in brain
|
|
220
221
|
if (brain && typeof brain.insertBenchmarkResult === 'function') {
|
|
221
222
|
try {
|
|
222
|
-
|
|
223
|
+
const scoringMethod = 'executable-tests';
|
|
224
|
+
const prompt = task.question_content || task.prompt || task.description || '';
|
|
225
|
+
brain.insertBenchmarkResult(decorateBenchmarkResult({
|
|
223
226
|
runId,
|
|
224
227
|
suite: SUITE_NAME,
|
|
225
228
|
promptId: result.taskId,
|
|
@@ -227,7 +230,7 @@ async function runLiveCodeBenchSuite(options = {}) {
|
|
|
227
230
|
difficulty: taskDifficulty(task),
|
|
228
231
|
provider: providerType || 'unknown',
|
|
229
232
|
model: resolveModelName(model),
|
|
230
|
-
prompt:
|
|
233
|
+
prompt: prompt.slice(0, 2000),
|
|
231
234
|
response: result.response || '',
|
|
232
235
|
traitScore: null,
|
|
233
236
|
compositeScore: result.score.composite,
|
|
@@ -238,7 +241,28 @@ async function runLiveCodeBenchSuite(options = {}) {
|
|
|
238
241
|
testsAfter: result.passedCases || 0,
|
|
239
242
|
totalTests: result.totalCases || 0,
|
|
240
243
|
dimensionsJson: JSON.stringify(result.score.dimensions),
|
|
241
|
-
|
|
244
|
+
datasetVersion: 'livecodebench:release_v6',
|
|
245
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
246
|
+
scoringMethod,
|
|
247
|
+
trusted: !result.error,
|
|
248
|
+
runConfig: { maxTasks, afterDate, scoringMethod },
|
|
249
|
+
}, {
|
|
250
|
+
suite: SUITE_NAME,
|
|
251
|
+
benchmark: {
|
|
252
|
+
id: result.taskId,
|
|
253
|
+
prompt,
|
|
254
|
+
taskType: 'coding',
|
|
255
|
+
difficulty: taskDifficulty(task),
|
|
256
|
+
datasetVersion: 'livecodebench:release_v6',
|
|
257
|
+
},
|
|
258
|
+
runId,
|
|
259
|
+
provider: providerType || 'unknown',
|
|
260
|
+
model: resolveModelName(model),
|
|
261
|
+
scorerVersion: DEFAULT_SCORER_VERSION,
|
|
262
|
+
scoringMethod,
|
|
263
|
+
trusted: !result.error,
|
|
264
|
+
runConfig: { maxTasks, afterDate, scoringMethod },
|
|
265
|
+
}));
|
|
242
266
|
} catch {}
|
|
243
267
|
}
|
|
244
268
|
}
|