create-walle 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. package/README.md +3 -3
  2. package/package.json +2 -2
  3. package/template/bin/dev.sh +7 -1
  4. package/template/bin/setup.js +53 -9
  5. package/template/bin/sync-images.js +53 -0
  6. package/template/builder-journal.md +17 -0
  7. package/template/claude-task-manager/api-prompts.js +98 -13
  8. package/template/claude-task-manager/api-reviews.js +82 -5
  9. package/template/claude-task-manager/db.js +32 -5
  10. package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
  11. package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
  12. package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
  13. package/template/claude-task-manager/lib/session-capture.js +421 -0
  14. package/template/claude-task-manager/lib/session-history.js +135 -15
  15. package/template/claude-task-manager/lib/session-jobs.js +10 -5
  16. package/template/claude-task-manager/lib/session-stream.js +87 -19
  17. package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
  18. package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
  19. package/template/claude-task-manager/lib/walle-session-context.js +61 -0
  20. package/template/claude-task-manager/lib/walle-transcript.js +176 -0
  21. package/template/claude-task-manager/public/css/setup.css +35 -8
  22. package/template/claude-task-manager/public/css/walle-session.css +56 -0
  23. package/template/claude-task-manager/public/css/walle.css +120 -0
  24. package/template/claude-task-manager/public/index.html +814 -181
  25. package/template/claude-task-manager/public/js/message-renderer.js +148 -19
  26. package/template/claude-task-manager/public/js/reviews.js +120 -62
  27. package/template/claude-task-manager/public/js/setup.js +75 -31
  28. package/template/claude-task-manager/public/js/stream-view.js +115 -55
  29. package/template/claude-task-manager/public/js/walle-session.js +84 -2
  30. package/template/claude-task-manager/public/js/walle.js +308 -54
  31. package/template/claude-task-manager/server.js +1092 -146
  32. package/template/claude-task-manager/session-integrity.js +181 -54
  33. package/template/claude-task-manager/session-utils.js +123 -41
  34. package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
  35. package/template/package.json +1 -1
  36. package/template/wall-e/adapters/ctm.js +39 -18
  37. package/template/wall-e/agent-runners/contract.js +17 -0
  38. package/template/wall-e/agent-runners/index.js +22 -0
  39. package/template/wall-e/agent-runtime/harness.js +212 -0
  40. package/template/wall-e/agent-runtime/index.js +8 -0
  41. package/template/wall-e/agent-runtime/registry.js +67 -0
  42. package/template/wall-e/agent-runtime/session-store.js +179 -0
  43. package/template/wall-e/agent-runtime/spawn.js +208 -0
  44. package/template/wall-e/api-walle.js +174 -7
  45. package/template/wall-e/brain.js +266 -28
  46. package/template/wall-e/channels/policy.js +88 -0
  47. package/template/wall-e/channels/registry.js +15 -1
  48. package/template/wall-e/channels/reply-dispatcher.js +70 -0
  49. package/template/wall-e/channels/session-bindings.js +51 -0
  50. package/template/wall-e/chat/code-review-context.js +29 -0
  51. package/template/wall-e/chat.js +188 -42
  52. package/template/wall-e/coding/acp-adapter.js +188 -0
  53. package/template/wall-e/coding/agent-catalog.js +129 -0
  54. package/template/wall-e/coding/compaction-service.js +247 -0
  55. package/template/wall-e/coding/execution-trace.js +3 -0
  56. package/template/wall-e/coding/instruction-service.js +224 -0
  57. package/template/wall-e/coding/model-message.js +67 -0
  58. package/template/wall-e/coding/permission-rules-store.js +111 -0
  59. package/template/wall-e/coding/permission-service.js +266 -0
  60. package/template/wall-e/coding/prompt-bundle.js +67 -0
  61. package/template/wall-e/coding/prompt-runtime.js +243 -0
  62. package/template/wall-e/coding/provider-transform.js +188 -0
  63. package/template/wall-e/coding/runtime-mode.js +132 -0
  64. package/template/wall-e/coding/snapshot-service.js +155 -0
  65. package/template/wall-e/coding/stream-processor.js +268 -0
  66. package/template/wall-e/coding/task-tool.js +255 -0
  67. package/template/wall-e/coding/tool-registry.js +361 -0
  68. package/template/wall-e/coding/transcript-writer.js +143 -0
  69. package/template/wall-e/coding/workspace-replay.js +324 -0
  70. package/template/wall-e/coding-context.js +4 -22
  71. package/template/wall-e/coding-orchestrator.js +307 -18
  72. package/template/wall-e/coding-prompts.js +44 -3
  73. package/template/wall-e/context/context-builder.js +43 -1
  74. package/template/wall-e/context/topic-matcher.js +1 -1
  75. package/template/wall-e/eval/agent-runner.js +59 -13
  76. package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
  77. package/template/wall-e/eval/benchmarks.js +100 -16
  78. package/template/wall-e/eval/eval-orchestrator.js +218 -8
  79. package/template/wall-e/eval/harvester.js +62 -5
  80. package/template/wall-e/eval/head-to-head.js +23 -2
  81. package/template/wall-e/eval/humaneval-adapter.js +30 -5
  82. package/template/wall-e/eval/livecodebench-adapter.js +29 -5
  83. package/template/wall-e/eval/manifest.js +186 -0
  84. package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
  85. package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
  86. package/template/wall-e/eval/session-transcripts.js +57 -4
  87. package/template/wall-e/eval/swebench-adapter.js +109 -3
  88. package/template/wall-e/evaluation/agent-router.js +53 -1
  89. package/template/wall-e/evaluation/coding-quorum.js +48 -1
  90. package/template/wall-e/evaluation/router.js +4 -2
  91. package/template/wall-e/evaluation/tier-selector.js +11 -1
  92. package/template/wall-e/extraction/contradiction.js +2 -2
  93. package/template/wall-e/extraction/indexer.js +2 -1
  94. package/template/wall-e/extraction/knowledge-extractor.js +2 -2
  95. package/template/wall-e/hooks/cli.js +92 -0
  96. package/template/wall-e/hooks/discovery.js +119 -0
  97. package/template/wall-e/hooks/index.js +7 -0
  98. package/template/wall-e/hooks/manifest.js +55 -0
  99. package/template/wall-e/hooks/runtime.js +84 -0
  100. package/template/wall-e/hooks/session-memory.js +225 -0
  101. package/template/wall-e/http/auth.js +6 -2
  102. package/template/wall-e/http/chat-api.js +54 -8
  103. package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
  104. package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
  105. package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
  106. package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
  107. package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
  108. package/template/wall-e/listening/calendar.js +3 -1
  109. package/template/wall-e/llm/client.js +64 -10
  110. package/template/wall-e/llm/google.js +39 -5
  111. package/template/wall-e/llm/ollama.js +1 -1
  112. package/template/wall-e/llm/ollama.plugin.json +1 -1
  113. package/template/wall-e/llm/provider-availability.js +10 -0
  114. package/template/wall-e/llm/provider-error.js +269 -0
  115. package/template/wall-e/llm/tool-adapter.js +48 -12
  116. package/template/wall-e/loops/boot.js +2 -1
  117. package/template/wall-e/loops/initiative.js +2 -2
  118. package/template/wall-e/loops/tasks.js +8 -47
  119. package/template/wall-e/loops/workspace-prompts.js +20 -0
  120. package/template/wall-e/mcp-server.js +442 -1
  121. package/template/wall-e/memory/session-ingest-service.js +159 -0
  122. package/template/wall-e/memory/source-indexer.js +289 -0
  123. package/template/wall-e/plugins/discovery.js +83 -0
  124. package/template/wall-e/plugins/manifest-loader.js +50 -10
  125. package/template/wall-e/plugins/manifest-schema.js +69 -0
  126. package/template/wall-e/plugins/model-catalog.js +55 -0
  127. package/template/wall-e/prompts/coding/base.txt +2 -0
  128. package/template/wall-e/prompts/coding/deepseek.txt +1 -0
  129. package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
  130. package/template/wall-e/prompts/coding/plan.txt +1 -0
  131. package/template/wall-e/runtime/execution-trace.js +220 -0
  132. package/template/wall-e/security/audit.js +266 -0
  133. package/template/wall-e/security/ssrf.js +236 -0
  134. package/template/wall-e/session-files.js +303 -0
  135. package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
  136. package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
  137. package/template/wall-e/skills/internal-skill-registry.js +2 -2
  138. package/template/wall-e/skills/script-skill-runner.js +143 -0
  139. package/template/wall-e/skills/skill-executor.js +5 -6
  140. package/template/wall-e/skills/skill-fallback.js +3 -1
  141. package/template/wall-e/skills/skill-harness-registry.js +7 -8
  142. package/template/wall-e/skills/skill-planner.js +52 -4
  143. package/template/wall-e/skills/slack-ingest.js +11 -3
  144. package/template/wall-e/sources/base.js +90 -0
  145. package/template/wall-e/sources/builtin.js +33 -0
  146. package/template/wall-e/sources/claude-code-jsonl.js +78 -0
  147. package/template/wall-e/sources/codex-jsonl.js +125 -0
  148. package/template/wall-e/sources/coding-session-utils.js +117 -0
  149. package/template/wall-e/sources/contract-suite.js +59 -0
  150. package/template/wall-e/sources/gemini-jsonl.js +85 -0
  151. package/template/wall-e/sources/index.js +9 -0
  152. package/template/wall-e/sources/jsonl-utils.js +181 -0
  153. package/template/wall-e/sources/record-types.js +252 -0
  154. package/template/wall-e/sources/registry.js +92 -0
  155. package/template/wall-e/sources/transforms.js +100 -0
  156. package/template/wall-e/sources/walle-jsonl.js +108 -0
  157. package/template/wall-e/tools/coding-middleware.js +31 -1
  158. package/template/wall-e/tools/file-tracker.js +25 -1
  159. package/template/wall-e/tools/local-tools.js +75 -47
  160. package/template/wall-e/tools/session-sharing.js +68 -1
  161. package/template/wall-e/tools/shell-analyzer.js +1 -1
  162. package/template/wall-e/tools/shell-policy.js +47 -0
  163. package/template/wall-e/tools/snapshot.js +42 -0
  164. package/template/wall-e/training/harvester.js +62 -5
  165. package/template/wall-e/utils/repair.js +253 -1
  166. package/template/website/index.html +3 -3
  167. package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18
@@ -0,0 +1,186 @@
1
+ 'use strict';
2
+
3
+ const crypto = require('crypto');
4
+ const path = require('path');
5
+ const { execFileSync } = require('child_process');
6
+
7
+ const DEFAULT_DATASET_VERSION = 'local-v1';
8
+ const DEFAULT_SCORER_VERSION = 'wall-e-eval-v2';
9
+ const DEFAULT_EVALUATOR_VERSION = 'wall-e-evaluator-v2';
10
+
11
+ let cachedRepoSha;
12
+
13
/**
 * Serialize a value to JSON with object keys emitted in sorted order, so that
 * two structurally equal values always produce the same string (and therefore
 * the same hash) regardless of key insertion order.
 *
 * Mirrors `JSON.stringify` for the cases that would otherwise corrupt the
 * output:
 *   - object members whose value has no JSON form (`undefined`, functions,
 *     symbols) are omitted instead of being written as the text `undefined`;
 *   - array slots with no JSON form (including holes) are written as `null`;
 *   - objects exposing `toJSON()` (e.g. `Date`) are serialized through it
 *     instead of collapsing to `{}`.
 *
 * @param {*} value - Any JSON-compatible value.
 * @returns {string|undefined} Canonical JSON text, or `undefined` when the
 *   top-level value itself has no JSON representation.
 */
function stableStringify(value) {
  const hasToJson = value !== null
    && typeof value === 'object'
    && typeof value.toJSON === 'function';
  const resolved = hasToJson ? value.toJSON() : value;

  if (resolved === null || typeof resolved !== 'object') return JSON.stringify(resolved);

  if (Array.isArray(resolved)) {
    // Array.from visits holes as undefined, unlike Array.prototype.map.
    const items = Array.from(resolved, (item) => stableStringify(item) ?? 'null');
    return `[${items.join(',')}]`;
  }

  const members = [];
  for (const key of Object.keys(resolved).sort()) {
    const encoded = stableStringify(resolved[key]);
    if (encoded !== undefined) members.push(`${JSON.stringify(key)}:${encoded}`);
  }
  return `{${members.join(',')}}`;
}
20
+
21
/**
 * Compute the hex-encoded SHA-256 digest of a value's string form.
 * `null` and `undefined` are hashed as the empty string.
 *
 * @param {*} value - Value to hash; coerced with `String()`.
 * @returns {string} 64-character lowercase hex digest.
 */
function sha256(value) {
  const text = String(value ?? '');
  const hasher = crypto.createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
}
24
+
25
/**
 * Hash a structured value: canonicalize it with `stableStringify`, then take
 * the SHA-256 of the resulting text.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} Hex digest produced by `sha256`.
 */
function hashObject(value) {
  const canonical = stableStringify(value);
  return sha256(canonical);
}
28
+
29
// Resolved SHA (or null) per working directory, so a lookup for one checkout
// is never answered with the result cached for another.
const repoShaByCwd = new Map();

/**
 * Return the current git commit (`git rev-parse HEAD`) for a directory.
 *
 * The result is memoized per directory, so git is spawned at most once for
 * each distinct `cwd`. Any failure (git unavailable, directory missing, not a
 * repository) is reported as `null` and is memoized as well.
 *
 * @param {string} [cwd] - Directory to run git in; defaults to the parent of
 *   this module's directory.
 * @returns {string|null} Commit SHA, or `null` when it cannot be determined.
 */
function getRepoSha(cwd = path.resolve(__dirname, '..')) {
  const cacheKey = typeof cwd === 'string' ? path.resolve(cwd) : String(cwd);
  if (repoShaByCwd.has(cacheKey)) return repoShaByCwd.get(cacheKey);

  let sha = null;
  try {
    const stdout = execFileSync('git', ['rev-parse', 'HEAD'], {
      cwd,
      encoding: 'utf8',
      // stderr is discarded so a non-repo directory does not spam the console.
      stdio: ['ignore', 'pipe', 'ignore'],
    });
    sha = stdout.trim() || null;
  } catch {
    // Best effort: a missing SHA must never fail an eval run.
    sha = null;
  }

  repoShaByCwd.set(cacheKey, sha);
  return sha;
}
42
+
43
/**
 * Convert a value to JSON text without ever throwing.
 *
 * @param {*} value - Value to serialize. Strings are returned unchanged.
 * @returns {string|null} JSON text, the original string, or `null` when the
 *   value is nullish, cannot be serialized (e.g. circular), or has no JSON
 *   representation (functions, symbols).
 */
function safeJson(value) {
  if (value == null) return null;
  if (typeof value === 'string') return value;
  try {
    // JSON.stringify yields undefined for functions/symbols; normalize to null
    // so callers always receive either a string or null.
    return JSON.stringify(value) ?? null;
  } catch {
    return null;
  }
}
48
+
49
/**
 * Reduce a run configuration to the allow-listed keys that are recorded with
 * an eval result. Keys whose value is `undefined` are left out; any other
 * value (including `null`, `0`, `''`) is kept.
 *
 * @param {object} [config] - Arbitrary run configuration.
 * @returns {object} New object containing only the allow-listed, defined keys,
 *   in allow-list order.
 */
function pickRunConfig(config = {}) {
  const allowedKeys = [
    'temperature', 'seed', 'maxTokens', 'timeoutMs', 'concurrency',
    'budgetDollars', 'suite', 'taskType', 'scoringMethod',
  ];
  return Object.fromEntries(
    allowedKeys
      .filter((key) => config[key] !== undefined)
      .map((key) => [key, config[key]]),
  );
}
60
+
61
/**
 * Project a benchmark definition onto the fields that identify a sample for
 * dataset hashing. Missing or falsy fields become `null` (the prompt becomes
 * an empty string) so the shape of the payload is always the same.
 *
 * @param {object} [benchmark] - Benchmark definition.
 * @returns {object} Fixed-shape payload used as hash input.
 */
function samplePayloadForHash(benchmark = {}) {
  const {
    id,
    promptId,
    benchmark_id: benchmarkId,
    prompt,
    taskType,
    difficulty,
    expectedTraits,
    expectedInReply,
    agentExpectations,
  } = benchmark;

  return {
    // First available identifier wins: id, then promptId, then benchmark_id.
    id: id || promptId || benchmarkId || null,
    prompt: prompt || '',
    taskType: taskType || null,
    difficulty: difficulty || null,
    expectedTraits: expectedTraits || null,
    expectedInReply: expectedInReply || null,
    agentExpectations: agentExpectations || null,
  };
}
72
+
73
/**
 * Build the provenance record for one benchmark sample.
 *
 * Derives the sample id, dataset version/hash and prompt hash from the
 * benchmark, filters the run configuration through `pickRunConfig`, looks up
 * the repository SHA, and returns the values both as flat fields and as a
 * serialized manifest (`evalManifestJson`).
 *
 * @param {object} [options]
 * @param {string} [options.suite] - Suite name; `'unknown'` is used for the
 *   derived dataset version/hash when absent.
 * @param {object} [options.benchmark] - Benchmark definition; may carry
 *   explicit `sampleId`, `datasetVersion`, `datasetHash`, `modelSnapshot`.
 * @param {string} [options.runId]
 * @param {string} [options.provider]
 * @param {string} [options.model]
 * @param {object} [options.runConfig] - Raw run configuration.
 * @param {string} [options.scorerVersion]
 * @param {string} [options.evaluatorVersion]
 * @param {string} [options.scoringMethod]
 * @param {string} [options.artifactPath]
 * @param {boolean} [options.trusted] - Recorded in the manifest as a boolean,
 *   or `null` when not supplied.
 * @returns {object} Flat manifest fields plus `runConfigJson` and
 *   `evalManifestJson` strings.
 */
function buildEvalManifest({
  suite,
  benchmark = {},
  runId,
  provider,
  model,
  runConfig = {},
  scorerVersion = DEFAULT_SCORER_VERSION,
  evaluatorVersion = DEFAULT_EVALUATOR_VERSION,
  scoringMethod,
  artifactPath,
  trusted,
} = {}) {
  const suiteLabel = suite || 'unknown';
  const sampleId = benchmark.sampleId
    || benchmark.id
    || benchmark.promptId
    || benchmark.benchmark_id
    || null;
  const datasetVersion = benchmark.datasetVersion || `${suiteLabel}:${DEFAULT_DATASET_VERSION}`;

  // An explicit hash on the benchmark wins; otherwise fingerprint the sample.
  let datasetHash = benchmark.datasetHash;
  if (!datasetHash) {
    datasetHash = hashObject({
      suite: suiteLabel,
      datasetVersion,
      sample: samplePayloadForHash(benchmark),
    });
  }

  const promptHash = sha256(benchmark.prompt || '');
  // suite/scoringMethod arguments override same-named keys in runConfig.
  const sanitizedConfig = pickRunConfig({ ...runConfig, suite, scoringMethod });
  const repoSha = getRepoSha();

  const scoringMethodOrNull = scoringMethod || null;
  const artifactPathOrNull = artifactPath || null;
  const modelSnapshot = benchmark.modelSnapshot || model || null;

  const manifest = {
    runId: runId || null,
    suite: suite || null,
    datasetVersion,
    datasetHash,
    sampleId,
    promptHash,
    provider: provider || null,
    model: model || null,
    modelSnapshot,
    scorerVersion,
    evaluatorVersion,
    scoringMethod: scoringMethodOrNull,
    repoSha,
    runConfig: sanitizedConfig,
    artifactPath: artifactPathOrNull,
    trusted: trusted === undefined ? null : !!trusted,
  };

  return {
    sampleId,
    datasetVersion,
    datasetHash,
    promptHash,
    repoSha,
    scorerVersion,
    evaluatorVersion,
    scoringMethod: scoringMethodOrNull,
    runConfigJson: safeJson(sanitizedConfig),
    evalManifestJson: safeJson(manifest),
    artifactPath: artifactPathOrNull,
    modelSnapshot,
    temperature: sanitizedConfig.temperature ?? null,
    seed: sanitizedConfig.seed ?? null,
  };
}
133
+
134
/**
 * Attach manifest/provenance fields to a benchmark result row.
 *
 * A manifest is built from the entry (preferred) and the surrounding context
 * (fallback). Fields already present on the entry are kept; only missing ones
 * are filled from the manifest.
 *
 * @param {object} [entry] - Result row; its own fields take precedence.
 * @param {object} [context] - Run-level defaults (`suite`, `benchmark`,
 *   `runId`, `provider`, `model`, `runConfig`, versions, `scoringMethod`,
 *   `artifactPath`, `trusted`).
 * @returns {object} Copy of `entry` extended with manifest fields.
 */
function decorateBenchmarkResult(entry = {}, context = {}) {
  const contextBenchmark = context.benchmark || {};
  const benchmark = {
    ...contextBenchmark,
    id: entry.promptId || entry.benchmark_id || contextBenchmark.id,
    prompt: entry.prompt || contextBenchmark.prompt,
    taskType: entry.taskType || contextBenchmark.taskType,
    difficulty: entry.difficulty || contextBenchmark.difficulty,
  };

  const manifest = buildEvalManifest({
    suite: entry.suite || context.suite,
    benchmark,
    runId: entry.runId || context.runId,
    provider: entry.provider || context.provider,
    model: entry.model || context.model,
    runConfig: context.runConfig || {},
    scorerVersion: entry.scorerVersion || context.scorerVersion,
    evaluatorVersion: entry.evaluatorVersion || context.evaluatorVersion,
    scoringMethod: entry.scoringMethod || context.scoringMethod,
    artifactPath: entry.artifactPath || context.artifactPath,
    trusted: entry.trusted ?? context.trusted,
  });

  // Fields where any falsy entry value defers to the manifest.
  const truthyWins = [
    'sampleId', 'datasetVersion', 'datasetHash', 'promptHash', 'repoSha',
    'scorerVersion', 'evaluatorVersion', 'scoringMethod', 'runConfigJson',
    'evalManifestJson', 'artifactPath', 'modelSnapshot',
  ];
  // Fields where only null/undefined defers (0 is a legitimate value).
  const definedWins = ['temperature', 'seed'];

  const decorated = { ...entry };
  for (const field of truthyWins) {
    decorated[field] = entry[field] || manifest[field];
  }
  for (const field of definedWins) {
    decorated[field] = entry[field] ?? manifest[field];
  }
  return decorated;
}
175
+
176
+ module.exports = {
177
+ DEFAULT_DATASET_VERSION,
178
+ DEFAULT_SCORER_VERSION,
179
+ DEFAULT_EVALUATOR_VERSION,
180
+ stableStringify,
181
+ sha256,
182
+ hashObject,
183
+ getRepoSha,
184
+ buildEvalManifest,
185
+ decorateBenchmarkResult,
186
+ };
@@ -25,9 +25,11 @@ try {
25
25
  } catch {}
26
26
 
27
27
  const path = require('path');
28
+ const crypto = require('crypto');
28
29
  process.chdir(path.join(__dirname, '..'));
29
30
 
30
- const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite } = require('./agent-runner');
31
+ const { setupSandbox, cleanupSandbox, runAgentBenchmark, runAgentBenchmarkSuite, resolveModelName } = require('./agent-runner');
32
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
31
33
  const benchmarks = require('./benchmarks/coding-agent.json');
32
34
 
33
35
  /**
@@ -227,6 +229,7 @@ async function main() {
227
229
 
228
230
  console.log(`Running ${selectedBenchmarks.length} benchmarks...\n`);
229
231
 
232
+ const runId = crypto.randomUUID();
230
233
  const results = [];
231
234
  for (const benchmark of selectedBenchmarks) {
232
235
  const startTime = Date.now();
@@ -244,7 +247,10 @@ async function main() {
244
247
  });
245
248
 
246
249
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
250
+ result.runId = runId;
251
+ result.timestamp = new Date().toISOString();
247
252
  results.push(result);
253
+ storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
248
254
 
249
255
  console.log(` Success: ${result.success}`);
250
256
  console.log(` Score: ${(result.score?.composite || 0).toFixed(3)}`);
@@ -263,7 +269,16 @@ async function main() {
263
269
  if (result.error) console.log(` Error: ${result.error}`);
264
270
  } catch (err) {
265
271
  console.error(` EXCEPTION: ${err.message}`);
266
- results.push({ benchmarkId: benchmark.id, success: false, error: err.message, score: { composite: 0 } });
272
+ const result = {
273
+ benchmarkId: benchmark.id,
274
+ success: false,
275
+ error: err.message,
276
+ score: { composite: 0 },
277
+ runId,
278
+ timestamp: new Date().toISOString(),
279
+ };
280
+ results.push(result);
281
+ storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs: 600000 });
267
282
  }
268
283
  console.log('');
269
284
  }
@@ -312,3 +327,52 @@ main().catch(err => {
312
327
  console.error('Fatal error:', err);
313
328
  process.exit(1);
314
329
  });
330
+
331
/**
 * Persist one coding-agent benchmark result through
 * `brain.insertBenchmarkResult`, decorated with manifest fields.
 *
 * Does nothing when `brain` has no `insertBenchmarkResult` function. Any
 * failure while building or storing the row is logged as a warning and
 * swallowed.
 *
 * @param {object} args
 * @param {object} args.brain - Store exposing `insertBenchmarkResult(row)`.
 * @param {string} args.runId - Identifier shared by all results of this run.
 * @param {object} args.benchmark - Benchmark definition that was executed.
 * @param {object} args.result - Outcome of the benchmark run.
 * @param {object} [args.provider] - Provider descriptor; `type` is recorded.
 * @param {string} [args.modelId] - Model id, resolved via `resolveModelName`.
 * @param {number} [args.timeoutMs] - Timeout recorded in the run config.
 */
function storeBenchmarkResult({ brain, runId, benchmark, result, provider, modelId, timeoutMs }) {
  if (typeof brain?.insertBenchmarkResult !== 'function') return;

  try {
    const hasTestCommand = Boolean(benchmark.agentExpectations?.testCommand);
    const scoringMethod = hasTestCommand ? 'agent-rubric+tests' : 'agent-rubric';
    const suite = 'coding-agent';
    const providerType = provider?.type || 'default';
    const modelName = resolveModelName(modelId);
    // Only error-free runs whose tests passed are marked trusted.
    const trusted = !result.error && result.testsPassed === true;

    const entry = {
      runId,
      suite,
      promptId: benchmark.id,
      taskType: 'coding-agent',
      difficulty: benchmark.difficulty,
      provider: providerType,
      model: modelName,
      prompt: benchmark.prompt,
      response: result.output || '',
      traitScore: null,
      matchedTraits: [],
      compositeScore: result.score?.composite || 0,
      latencyMs: result.latencyMs,
      error: result.error,
      timestamp: result.timestamp,
      costDollars: result.costDollars || null,
      testsBefore: result.testsBefore ?? null,
      testsAfter: result.testsAfter ?? null,
      totalTests: result.totalTests ?? null,
      dimensionsJson: result.dimensionsJson || null,
      inputTokens: result.inputTokens ?? null,
      outputTokens: result.outputTokens ?? null,
      scorerVersion: DEFAULT_SCORER_VERSION,
      scoringMethod,
      trusted,
      runConfig: { timeoutMs, scoringMethod },
    };

    const context = {
      suite,
      benchmark,
      runId,
      provider: providerType,
      model: modelName,
      scoringMethod,
      scorerVersion: DEFAULT_SCORER_VERSION,
      trusted,
      runConfig: { timeoutMs, scoringMethod },
    };

    brain.insertBenchmarkResult(decorateBenchmarkResult(entry, context));
  } catch (err) {
    console.warn(` [WARN] Failed to store benchmark result: ${err.message}`);
  }
}
@@ -0,0 +1,150 @@
1
+ 'use strict';
2
+
3
+ const fs = require('node:fs');
4
+ const path = require('node:path');
5
+ const brainDefault = require('../brain');
6
+ const { indexMemory } = require('../memory/source-indexer');
7
+
8
+ const DEFAULT_CASES_PATH = path.join(__dirname, 'benchmarks', 'memory-retrieval.json');
9
+
10
/**
 * Read retrieval benchmark cases from a JSON file.
 *
 * @param {string} [filePath] - Path to the cases file; defaults to
 *   `DEFAULT_CASES_PATH`.
 * @returns {Array} The parsed cases, or an empty array when the file's
 *   top-level JSON value is not an array.
 * @throws If the file cannot be read or is not valid JSON.
 */
function loadMemoryRetrievalCases(filePath = DEFAULT_CASES_PATH) {
  const raw = fs.readFileSync(filePath, 'utf8');
  const cases = JSON.parse(raw);
  if (!Array.isArray(cases)) return [];
  return cases;
}
14
+
15
/**
 * Insert the seed memories declared by benchmark cases
 * (`case.retrieval.seedMemories`) and index each inserted memory.
 *
 * Defaults are resolved once per seed and shared between the stored row and
 * its index entry, so both always carry the same source, channel, type,
 * metadata and timestamp.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Store exposing `insertMemory(row)`.
 * @param {Array} [options.cases] - Benchmark cases to seed from.
 * @returns {{inserted: number, indexed: number}} Count of rows inserted and
 *   of index calls made. Seeds for which `insertMemory` returns a falsy value
 *   are skipped and counted in neither.
 */
function seedBenchmarkMemories({ brain = brainDefault, cases = [] } = {}) {
  let inserted = 0;
  let indexed = 0;

  for (const bench of cases) {
    const memories = bench.retrieval?.seedMemories || [];
    for (const seed of memories) {
      // Shared between the row and the index entry; computing the fallback
      // timestamp once keeps the two records from drifting apart.
      const shared = {
        source: seed.source || 'codex-jsonl',
        source_id: seed.source_id,
        source_channel: seed.cwd || '',
        memory_type: seed.memory_type || 'coding_session_exchange',
        content: seed.content,
        metadata: JSON.stringify(seed.metadata || {}),
        timestamp: seed.timestamp || new Date().toISOString(),
      };

      const result = brain.insertMemory({
        source: shared.source,
        source_id: shared.source_id,
        source_channel: shared.source_channel,
        memory_type: shared.memory_type,
        direction: seed.direction || 'exchange',
        subject: seed.subject || seed.source_id,
        content: shared.content,
        content_raw: seed.content,
        metadata: shared.metadata,
        importance: seed.importance ?? 0.7,
        timestamp: shared.timestamp,
      });
      if (!result) continue;
      inserted++;

      indexMemory({ ...seed, id: result.id, ...shared }, { brain });
      indexed++;
    }
  }

  return { inserted, indexed };
}
52
+
53
/**
 * Run one retrieval query against both the direct memory search and the
 * memory index, and return the merged, de-duplicated hits.
 *
 * Each backend is asked for three times the requested limit; direct hits are
 * placed ahead of index hits before truncation. A failure in the index path
 * is ignored and the direct hits alone are used.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Store exposing `searchMemories` and
 *   `searchMemoryIndex`.
 * @param {string} [options.query] - Search text.
 * @param {number} [options.limit=10] - Maximum results; clamped to 1..50,
 *   with non-numeric or zero values treated as 10.
 * @returns {Array} At most `limit` memory rows.
 */
function searchRetrievalCase({ brain = brainDefault, query, limit = 10 } = {}) {
  const requested = Number(limit) || 10;
  const max = Math.min(Math.max(requested, 1), 50);
  const fetchLimit = max * 3;

  const direct = brain.searchMemories({ query, limit: fetchLimit });

  let indexed;
  try {
    const rows = brain.searchMemoryIndex({ query, limit: fetchLimit });
    indexed = hydrateIndexRows(brain, rows);
  } catch {
    // Index search is best effort; fall back to direct hits only.
    indexed = [];
  }

  const merged = mergeById(direct, indexed);
  return merged.slice(0, max);
}
63
+
64
/**
 * Score the results returned for one retrieval case.
 *
 * @param {object} bench - Benchmark case; `retrieval.expectedSourceIds` lists
 *   the source ids that count as a hit.
 * @param {Array} results - Retrieved memory rows, best first.
 * @param {object} [options]
 * @param {number[]} [options.ks=[5, 10]] - Cut-offs to evaluate.
 * @returns {object} `{ id, query, expected, returned }` plus one boolean
 *   `hit_at_<k>` per cut-off, true when any of the first `k` returned source
 *   ids is expected.
 */
function scoreRetrievalCase(bench, results, { ks = [5, 10] } = {}) {
  const expected = new Set(bench.retrieval?.expectedSourceIds || []);
  const sourceIds = results.map(resultSourceId);
  const out = {
    id: bench.id,
    // Same fallback chain the benchmark runner uses to pick the search text,
    // so the report shows the query that was actually executed.
    query: bench.retrieval?.query || bench.prompt || '',
    expected: [...expected],
    returned: sourceIds,
  };
  for (const k of ks) {
    out[`hit_at_${k}`] = sourceIds.slice(0, k).some((id) => expected.has(id));
  }
  return out;
}
78
+
79
/**
 * Execute the memory-retrieval benchmark: optionally seed the store, search
 * once per case, score each case and summarize.
 *
 * @param {object} [options]
 * @param {object} [options.brain] - Memory store to query.
 * @param {Array} [options.cases] - Cases to run; loaded from the default file
 *   when omitted.
 * @param {boolean} [options.seed=false] - Seed the cases' memories first.
 * @param {number} [options.limit=10] - Result limit passed to each search.
 * @returns {object} Summary produced by `summarizeRetrievalResults`.
 */
function runMemoryRetrievalBenchmark({ brain = brainDefault, cases = loadMemoryRetrievalCases(), seed = false, limit = 10 } = {}) {
  if (seed) seedBenchmarkMemories({ brain, cases });

  const scored = Array.from(cases, (bench) => {
    // Prefer the case's dedicated retrieval query, then its prompt.
    const query = bench.retrieval?.query || bench.prompt || '';
    const hits = searchRetrievalCase({ brain, query, limit });
    return scoreRetrievalCase(bench, hits);
  });

  return summarizeRetrievalResults(scored);
}
89
+
90
/**
 * Aggregate per-case scores into overall hit rates at 5 and 10.
 *
 * @param {Array<object>} results - Per-case scores carrying `hit_at_5` and
 *   `hit_at_10`.
 * @returns {{total: number, recall_at_5: number, recall_at_10: number, results: Array}}
 *   Rates are fractions of cases with a hit; both are 0 for an empty input.
 */
function summarizeRetrievalResults(results) {
  const caseCount = results.length;
  // Avoid dividing by zero when there are no cases.
  const denominator = caseCount || 1;
  const countHits = (field) => results.reduce(
    (hits, result) => (result[field] ? hits + 1 : hits),
    0,
  );

  return {
    total: caseCount,
    recall_at_5: countHits('hit_at_5') / denominator,
    recall_at_10: countHits('hit_at_10') / denominator,
    results,
  };
}
101
+
102
/**
 * Load the full, non-archived memory rows referenced by index search rows.
 *
 * The lookup query has no ORDER BY, so the database is free to return rows in
 * any order. The rows are therefore re-sorted to follow the order in which
 * their ids first appear in `rows`, preserving the index ranking for callers
 * that truncate the list.
 *
 * @param {object} brain - Store exposing `getDb()`; the returned handle must
 *   support `prepare(sql).all(...params)`.
 * @param {Array<object>|null|undefined} rows - Index rows carrying `memory_id`.
 * @returns {Array<object>} Matching memory rows in index order; empty when
 *   there are no usable ids.
 */
function hydrateIndexRows(brain, rows) {
  if (!rows?.length) return [];
  const ids = [...new Set(rows.map((row) => row.memory_id).filter(Boolean))];
  if (!ids.length) return [];

  const placeholders = ids.map(() => '?').join(',');
  const fetched = brain.getDb().prepare(`
    SELECT * FROM memories
    WHERE archived_at IS NULL AND id IN (${placeholders})
  `).all(...ids);

  // Rank by first appearance. Keys are compared as strings so an index that
  // stores ids as text still lines up with numeric primary keys.
  const rankById = new Map();
  ids.forEach((id, position) => {
    const key = String(id);
    if (!rankById.has(key)) rankById.set(key, position);
  });
  const rankOf = (row) => rankById.get(String(row.id)) ?? ids.length;

  // Copy before sorting; the sort is stable, so unranked rows keep DB order.
  return [...fetched].sort((a, b) => rankOf(a) - rankOf(b));
}
112
+
113
/**
 * Concatenate several result lists, keeping only the first item seen for each
 * `id`. Items without a truthy `id`, and nullish groups, are ignored.
 *
 * @param {...(Iterable<object>|null|undefined)} groups - Lists in priority
 *   order.
 * @returns {Array<object>} Unique items in first-seen order.
 */
function mergeById(...groups) {
  const firstById = new Map();
  const candidates = groups.flatMap((group) => [...(group || [])]);
  for (const candidate of candidates) {
    const id = candidate?.id;
    if (id && !firstById.has(id)) firstById.set(id, candidate);
  }
  return [...firstById.values()];
}
125
+
126
/**
 * Determine the source id a retrieved memory row should be scored under.
 *
 * Uses `sourceId` from the row's JSON `metadata` when present. Otherwise it
 * falls back to `source_id`, shortened to its first two `:`-separated
 * segments when it has more than two.
 *
 * @param {object} [result] - Memory row with optional `metadata` (JSON text)
 *   and `source_id`.
 * @returns {string} Source id, or an empty string when none is available.
 */
function resultSourceId(result = {}) {
  let parsedMetadata = null;
  try {
    parsedMetadata = JSON.parse(result.metadata || '{}');
  } catch {
    // Unparseable metadata: fall through to source_id.
    parsedMetadata = null;
  }
  if (parsedMetadata?.sourceId) return parsedMetadata.sourceId;

  const rawId = String(result.source_id || '');
  const segments = rawId.split(':');
  if (segments.length <= 2) return rawId;
  return `${segments[0]}:${segments[1]}`;
}
135
+
136
// CLI entry point: when this file is executed directly, initialize the
// database, run the benchmark (seeding first if `--seed` was passed) and print
// the summary as pretty-printed JSON.
if (require.main === module) {
  brainDefault.initDb();
  const shouldSeed = process.argv.includes('--seed');
  const summary = runMemoryRetrievalBenchmark({ seed: shouldSeed });
  const report = JSON.stringify(summary, null, 2);
  console.log(report);
}
141
+
142
+ module.exports = {
143
+ loadMemoryRetrievalCases,
144
+ resultSourceId,
145
+ runMemoryRetrievalBenchmark,
146
+ scoreRetrievalCase,
147
+ searchRetrievalCase,
148
+ seedBenchmarkMemories,
149
+ summarizeRetrievalResults,
150
+ };
@@ -13,6 +13,12 @@ const {
13
13
  const DEFAULT_TRANSCRIPT_ROOTS = {
14
14
  claude: path.join(os.homedir(), '.claude', 'projects'),
15
15
  codex: path.join(os.homedir(), '.codex', 'sessions'),
16
+ walle: process.env.WALLE_SESSIONS_DIR
17
+ || process.env.WALL_E_SESSIONS_DIR
18
+ || process.env.WALL_E_SESSION_DIR
19
+ || process.env.WALLE_SESSION_DIR
20
+ || (process.env.WALLE_DEV_DIR ? path.join(process.env.WALLE_DEV_DIR, 'sessions') : '')
21
+ || path.join(process.env.WALL_E_DATA_DIR || path.join(os.homedir(), '.walle'), 'sessions'),
16
22
  };
17
23
 
18
24
  const MIN_PROMPT_CHARS = 20;
@@ -21,9 +27,12 @@ function detectTranscriptSource(jsonlPath, events = null) {
21
27
  const normalized = path.normalize(jsonlPath || '');
22
28
  if (normalized.includes(`${path.sep}.claude${path.sep}projects${path.sep}`)) return 'claude';
23
29
  if (normalized.includes(`${path.sep}.codex${path.sep}sessions${path.sep}`)) return 'codex';
30
+ if (normalized.includes(`${path.sep}.walle${path.sep}sessions${path.sep}`)) return 'walle';
24
31
 
25
32
  const sample = events || readJsonlEvents(jsonlPath).slice(0, 50);
26
33
  for (const evt of sample) {
34
+ if (evt?.walle?.schema === 'wall-e-session-v1') return 'walle';
35
+ if (evt?.provider === 'walle' || evt?.type === 'walle_part') return 'walle';
27
36
  if (evt?.type === 'session_meta' && evt.payload?.originator === 'codex-tui') return 'codex';
28
37
  if (evt?.type === 'turn_context' && evt.payload?.cwd) return 'codex';
29
38
  if (evt?.type === 'response_item' && evt.payload) return 'codex';
@@ -57,8 +66,9 @@ function parseTranscriptJsonl(jsonlPath, { repoPath = null, minPromptChars = MIN
57
66
 
58
67
  const source = detectTranscriptSource(jsonlPath, events);
59
68
  let session = null;
60
- if (source === 'claude') session = parseClaudeTranscript(jsonlPath, events, minPromptChars);
69
+ if (source === 'claude') session = parseClaudeTranscript(jsonlPath, events, minPromptChars, 'claude');
61
70
  else if (source === 'codex') session = parseCodexTranscript(jsonlPath, events, minPromptChars);
71
+ else if (source === 'walle') session = parseWalleTranscript(jsonlPath, events, minPromptChars);
62
72
  else session = parseUnknownTranscript(jsonlPath, events, minPromptChars);
63
73
 
64
74
  if (!session || !session.cwd) return null;
@@ -77,8 +87,8 @@ function readJsonlEvents(jsonlPath) {
77
87
  return events;
78
88
  }
79
89
 
80
- function parseClaudeTranscript(jsonlPath, events, minPromptChars) {
81
- const session = baseSession(jsonlPath, 'claude');
90
+ function parseClaudeTranscript(jsonlPath, events, minPromptChars, source = 'claude') {
91
+ const session = baseSession(jsonlPath, source);
82
92
  const editedFiles = new Set();
83
93
 
84
94
  for (const evt of events) {
@@ -152,6 +162,48 @@ function parseCodexTranscript(jsonlPath, events, minPromptChars) {
152
162
  return session;
153
163
  }
154
164
 
165
/**
 * Build a session record from the events of a Wall-E transcript.
 *
 * For every event the timestamp is recorded and the first `cwd`/`gitBranch`
 * seen are captured. The event is then handled by type:
 *   - `session_meta`: adopts the event's `sessionId` when it has one;
 *   - `user`: the cleaned message text is kept if it is replayable;
 *   - `assistant`: non-empty text is kept and each harvested tool call is
 *     registered;
 *   - `walle_part` with `partType === 'tool_call'`: registered as a tool call
 *     named by `data.name`, else `data.tool`, else `'tool'`.
 *
 * @param {string} jsonlPath - Transcript file path.
 * @param {Array<object>} events - Parsed transcript events.
 * @param {number} minPromptChars - Threshold passed to `isReplayableUserText`.
 * @returns {object} The session created by `baseSession`, populated and
 *   finalized with `finishSession`.
 */
function parseWalleTranscript(jsonlPath, events, minPromptChars) {
  const session = baseSession(jsonlPath, 'walle');
  const editedFiles = new Set();

  for (const evt of events) {
    const ts = evt.timestamp || null;
    if (ts) setSessionTime(session, ts);

    // First value wins; this also covers the fields carried by session_meta.
    if (evt.cwd && !session.cwd) session.cwd = evt.cwd;
    if (evt.gitBranch && !session.gitBranch) session.gitBranch = evt.gitBranch;

    switch (evt.type) {
      case 'session_meta': {
        session.sessionId = evt.sessionId || session.sessionId;
        break;
      }
      case 'user': {
        const userText = cleanUserText(extractMessageText(evt.message || evt));
        if (isReplayableUserText(userText, minPromptChars)) {
          session.userMessages.push(userText);
        }
        break;
      }
      case 'assistant': {
        const payload = evt.message || evt;
        const replyText = extractMessageText(payload);
        if (replyText) session.assistantMessages.push(replyText);
        for (const call of harvestExtractToolCalls(payload)) {
          addToolCall(session, editedFiles, call.name, call.input || {}, ts);
        }
        break;
      }
      case 'walle_part': {
        if (evt.partType === 'tool_call') {
          const data = evt.data || {};
          const toolName = data.name || data.tool || 'tool';
          addToolCall(session, editedFiles, toolName, data.input || {}, ts);
        }
        break;
      }
      default:
        break;
    }
  }

  finishSession(session, editedFiles, events.length);
  return session;
}
206
+
155
207
  function parseUnknownTranscript(jsonlPath, events, minPromptChars) {
156
208
  const session = baseSession(jsonlPath, 'unknown');
157
209
  const editedFiles = new Set();
@@ -434,7 +486,8 @@ function resolveRoots(roots, source) {
434
486
  if (roots) return Array.isArray(roots) ? roots : [roots];
435
487
  if (source === 'claude') return [DEFAULT_TRANSCRIPT_ROOTS.claude];
436
488
  if (source === 'codex') return [DEFAULT_TRANSCRIPT_ROOTS.codex];
437
- return [DEFAULT_TRANSCRIPT_ROOTS.claude, DEFAULT_TRANSCRIPT_ROOTS.codex];
489
+ if (source === 'walle') return [DEFAULT_TRANSCRIPT_ROOTS.walle];
490
+ return [DEFAULT_TRANSCRIPT_ROOTS.claude, DEFAULT_TRANSCRIPT_ROOTS.codex, DEFAULT_TRANSCRIPT_ROOTS.walle];
438
491
  }
439
492
 
440
493
  module.exports = {
@@ -15,6 +15,9 @@ const fs = require('fs');
15
15
  const path = require('path');
16
16
  const os = require('os');
17
17
  const https = require('https');
18
+ const crypto = require('crypto');
19
+ const { resolveModelName } = require('./agent-runner');
20
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
18
21
 
19
22
  const CACHE_DIR = path.join(os.homedir(), '.walle', 'swebench-cache');
20
23
  const DATASET_URL =
@@ -148,6 +151,7 @@ async function runSWEBenchTask(task, options = {}) {
148
151
  let agentResult = null;
149
152
  if (runAgentLoop) {
150
153
  try {
154
+ let timeoutHandle;
151
155
  agentResult = await Promise.race([
152
156
  runAgentLoop(mapped.prompt, {
153
157
  brain,
@@ -155,10 +159,12 @@ async function runSWEBenchTask(task, options = {}) {
155
159
  model,
156
160
  maxTurns: 30,
157
161
  }),
158
- new Promise((_, reject) =>
159
- setTimeout(() => reject(new Error('Agent timeout')), timeoutMs)
160
- ),
162
+ new Promise((_, reject) => {
163
+ timeoutHandle = setTimeout(() => reject(new Error('Agent timeout')), timeoutMs);
164
+ if (typeof timeoutHandle.unref === 'function') timeoutHandle.unref();
165
+ }),
161
166
  ]);
167
+ if (timeoutHandle) clearTimeout(timeoutHandle);
162
168
  } catch (err) {
163
169
  return {
164
170
  taskId: mapped.id,
@@ -229,10 +235,110 @@ async function loadCuratedSubset() {
229
235
  return JSON.parse(fs.readFileSync(filePath, 'utf8'));
230
236
  }
231
237
 
238
+ async function runSWEBenchSuite(options = {}) {
239
+ const {
240
+ brain,
241
+ runAgentLoop,
242
+ provider,
243
+ providerType,
244
+ model,
245
+ maxTasks,
246
+ signal,
247
+ timeoutMs,
248
+ runId: providedRunId,
249
+ } = options;
250
+
251
+ const allTasks = await loadCuratedSubset();
252
+ const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
253
+ const runId = providedRunId || crypto.randomUUID();
254
+ const results = [];
255
+ let totalPassed = 0;
256
+
257
+ for (const rawTask of tasks) {
258
+ if (signal?.aborted) break;
259
+
260
+ const mapped = mapTaskToPrompt(rawTask);
261
+ const result = await runSWEBenchTask(mapped, {
262
+ brain,
263
+ runAgentLoop,
264
+ provider,
265
+ model,
266
+ timeoutMs,
267
+ });
268
+ results.push(result);
269
+ if (result.success) totalPassed++;
270
+
271
+ if (brain && typeof brain.insertBenchmarkResult === 'function') {
272
+ try {
273
+ const scoringMethod = 'swebench-docker-tests';
274
+ brain.insertBenchmarkResult(decorateBenchmarkResult({
275
+ runId,
276
+ suite: 'swebench-lite',
277
+ promptId: mapped.id,
278
+ taskType: 'coding-agent',
279
+ difficulty: rawTask.difficulty || 'medium',
280
+ provider: providerType || 'unknown',
281
+ model: resolveModelName(model),
282
+ prompt: mapped.prompt,
283
+ response: result.testOutput || '',
284
+ traitScore: null,
285
+ compositeScore: result.score?.composite || 0,
286
+ latencyMs: result.elapsedMs || null,
287
+ error: result.error || null,
288
+ testsBefore: null,
289
+ testsAfter: result.success ? 1 : 0,
290
+ totalTests: result.error ? 0 : 1,
291
+ dimensionsJson: JSON.stringify(result.score?.dimensions || {}),
292
+ modelMetadataJson: JSON.stringify({
293
+ repo: mapped.repo,
294
+ baseCommit: mapped.baseCommit,
295
+ agentTurns: result.agentTurns || 0,
296
+ testOutput: result.testOutput || null,
297
+ }),
298
+ datasetVersion: 'swebench-lite:curated-30',
299
+ scorerVersion: DEFAULT_SCORER_VERSION,
300
+ scoringMethod,
301
+ trusted: !result.error,
302
+ runConfig: { maxTasks, timeoutMs, scoringMethod },
303
+ }, {
304
+ suite: 'swebench-lite',
305
+ benchmark: {
306
+ id: mapped.id,
307
+ prompt: mapped.prompt,
308
+ taskType: 'coding-agent',
309
+ difficulty: rawTask.difficulty || 'medium',
310
+ datasetVersion: 'swebench-lite:curated-30',
311
+ },
312
+ runId,
313
+ provider: providerType || 'unknown',
314
+ model: resolveModelName(model),
315
+ scorerVersion: DEFAULT_SCORER_VERSION,
316
+ scoringMethod,
317
+ trusted: !result.error,
318
+ runConfig: { maxTasks, timeoutMs, scoringMethod },
319
+ }));
320
+ } catch {}
321
+ }
322
+ }
323
+
324
+ return {
325
+ runId,
326
+ suite: 'swebench-lite',
327
+ model: resolveModelName(model),
328
+ totalTasks: tasks.length,
329
+ passed: totalPassed,
330
+ passAt1: tasks.length > 0 ? totalPassed / tasks.length : 0,
331
+ avgScore: results.reduce((s, r) => s + (r.score?.composite || 0), 0) / Math.max(results.length, 1),
332
+ totalCost: results.reduce((s, r) => s + (r.costDollars || 0), 0),
333
+ results,
334
+ };
335
+ }
336
+
232
337
  module.exports = {
233
338
  downloadDataset,
234
339
  mapTaskToPrompt,
235
340
  runSWEBenchTask,
341
+ runSWEBenchSuite,
236
342
  loadCuratedSubset,
237
343
  CACHE_DIR,
238
344
  };