create-walle 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. package/README.md +3 -3
  2. package/package.json +2 -2
  3. package/template/bin/dev.sh +7 -1
  4. package/template/bin/setup.js +53 -9
  5. package/template/bin/sync-images.js +53 -0
  6. package/template/builder-journal.md +17 -0
  7. package/template/claude-task-manager/api-prompts.js +98 -13
  8. package/template/claude-task-manager/api-reviews.js +82 -5
  9. package/template/claude-task-manager/db.js +32 -5
  10. package/template/claude-task-manager/docs/session-capture-foundation-design.md +1273 -0
  11. package/template/claude-task-manager/lib/claude-desktop-sessions.js +696 -0
  12. package/template/claude-task-manager/lib/coding-agent-models.js +49 -1
  13. package/template/claude-task-manager/lib/session-capture.js +421 -0
  14. package/template/claude-task-manager/lib/session-history.js +135 -15
  15. package/template/claude-task-manager/lib/session-jobs.js +10 -5
  16. package/template/claude-task-manager/lib/session-stream.js +87 -19
  17. package/template/claude-task-manager/lib/setup-provider-config.js +115 -0
  18. package/template/claude-task-manager/lib/walle-ctm-history.js +72 -0
  19. package/template/claude-task-manager/lib/walle-session-context.js +61 -0
  20. package/template/claude-task-manager/lib/walle-transcript.js +176 -0
  21. package/template/claude-task-manager/public/css/setup.css +35 -8
  22. package/template/claude-task-manager/public/css/walle-session.css +56 -0
  23. package/template/claude-task-manager/public/css/walle.css +120 -0
  24. package/template/claude-task-manager/public/index.html +814 -181
  25. package/template/claude-task-manager/public/js/message-renderer.js +148 -19
  26. package/template/claude-task-manager/public/js/reviews.js +120 -62
  27. package/template/claude-task-manager/public/js/setup.js +75 -31
  28. package/template/claude-task-manager/public/js/stream-view.js +115 -55
  29. package/template/claude-task-manager/public/js/walle-session.js +84 -2
  30. package/template/claude-task-manager/public/js/walle.js +308 -54
  31. package/template/claude-task-manager/server.js +1092 -146
  32. package/template/claude-task-manager/session-integrity.js +181 -54
  33. package/template/claude-task-manager/session-utils.js +123 -41
  34. package/template/claude-task-manager/workers/state-detectors/codex.js +5 -2
  35. package/template/package.json +1 -1
  36. package/template/wall-e/adapters/ctm.js +39 -18
  37. package/template/wall-e/agent-runners/contract.js +17 -0
  38. package/template/wall-e/agent-runners/index.js +22 -0
  39. package/template/wall-e/agent-runtime/harness.js +212 -0
  40. package/template/wall-e/agent-runtime/index.js +8 -0
  41. package/template/wall-e/agent-runtime/registry.js +67 -0
  42. package/template/wall-e/agent-runtime/session-store.js +179 -0
  43. package/template/wall-e/agent-runtime/spawn.js +208 -0
  44. package/template/wall-e/api-walle.js +174 -7
  45. package/template/wall-e/brain.js +266 -28
  46. package/template/wall-e/channels/policy.js +88 -0
  47. package/template/wall-e/channels/registry.js +15 -1
  48. package/template/wall-e/channels/reply-dispatcher.js +70 -0
  49. package/template/wall-e/channels/session-bindings.js +51 -0
  50. package/template/wall-e/chat/code-review-context.js +29 -0
  51. package/template/wall-e/chat.js +188 -42
  52. package/template/wall-e/coding/acp-adapter.js +188 -0
  53. package/template/wall-e/coding/agent-catalog.js +129 -0
  54. package/template/wall-e/coding/compaction-service.js +247 -0
  55. package/template/wall-e/coding/execution-trace.js +3 -0
  56. package/template/wall-e/coding/instruction-service.js +224 -0
  57. package/template/wall-e/coding/model-message.js +67 -0
  58. package/template/wall-e/coding/permission-rules-store.js +111 -0
  59. package/template/wall-e/coding/permission-service.js +266 -0
  60. package/template/wall-e/coding/prompt-bundle.js +67 -0
  61. package/template/wall-e/coding/prompt-runtime.js +243 -0
  62. package/template/wall-e/coding/provider-transform.js +188 -0
  63. package/template/wall-e/coding/runtime-mode.js +132 -0
  64. package/template/wall-e/coding/snapshot-service.js +155 -0
  65. package/template/wall-e/coding/stream-processor.js +268 -0
  66. package/template/wall-e/coding/task-tool.js +255 -0
  67. package/template/wall-e/coding/tool-registry.js +361 -0
  68. package/template/wall-e/coding/transcript-writer.js +143 -0
  69. package/template/wall-e/coding/workspace-replay.js +324 -0
  70. package/template/wall-e/coding-context.js +4 -22
  71. package/template/wall-e/coding-orchestrator.js +307 -18
  72. package/template/wall-e/coding-prompts.js +44 -3
  73. package/template/wall-e/context/context-builder.js +43 -1
  74. package/template/wall-e/context/topic-matcher.js +1 -1
  75. package/template/wall-e/eval/agent-runner.js +59 -13
  76. package/template/wall-e/eval/benchmarks/memory-retrieval.json +155 -57
  77. package/template/wall-e/eval/benchmarks.js +100 -16
  78. package/template/wall-e/eval/eval-orchestrator.js +218 -8
  79. package/template/wall-e/eval/harvester.js +62 -5
  80. package/template/wall-e/eval/head-to-head.js +23 -2
  81. package/template/wall-e/eval/humaneval-adapter.js +30 -5
  82. package/template/wall-e/eval/livecodebench-adapter.js +29 -5
  83. package/template/wall-e/eval/manifest.js +186 -0
  84. package/template/wall-e/eval/run-agent-benchmarks.js +66 -2
  85. package/template/wall-e/eval/session-retrieval-benchmark.js +150 -0
  86. package/template/wall-e/eval/session-transcripts.js +57 -4
  87. package/template/wall-e/eval/swebench-adapter.js +109 -3
  88. package/template/wall-e/evaluation/agent-router.js +53 -1
  89. package/template/wall-e/evaluation/coding-quorum.js +48 -1
  90. package/template/wall-e/evaluation/router.js +4 -2
  91. package/template/wall-e/evaluation/tier-selector.js +11 -1
  92. package/template/wall-e/extraction/contradiction.js +2 -2
  93. package/template/wall-e/extraction/indexer.js +2 -1
  94. package/template/wall-e/extraction/knowledge-extractor.js +2 -2
  95. package/template/wall-e/hooks/cli.js +92 -0
  96. package/template/wall-e/hooks/discovery.js +119 -0
  97. package/template/wall-e/hooks/index.js +7 -0
  98. package/template/wall-e/hooks/manifest.js +55 -0
  99. package/template/wall-e/hooks/runtime.js +84 -0
  100. package/template/wall-e/hooks/session-memory.js +225 -0
  101. package/template/wall-e/http/auth.js +6 -2
  102. package/template/wall-e/http/chat-api.js +54 -8
  103. package/template/wall-e/integrations/claude-plugin/hooks/hooks.json +27 -0
  104. package/template/wall-e/integrations/claude-plugin/hooks/walle-precompact-hook.sh +5 -0
  105. package/template/wall-e/integrations/claude-plugin/hooks/walle-stop-hook.sh +5 -0
  106. package/template/wall-e/integrations/codex-plugin/hooks/walle-hook.sh +7 -0
  107. package/template/wall-e/integrations/codex-plugin/hooks.json +37 -0
  108. package/template/wall-e/listening/calendar.js +3 -1
  109. package/template/wall-e/llm/client.js +64 -10
  110. package/template/wall-e/llm/google.js +39 -5
  111. package/template/wall-e/llm/ollama.js +1 -1
  112. package/template/wall-e/llm/ollama.plugin.json +1 -1
  113. package/template/wall-e/llm/provider-availability.js +10 -0
  114. package/template/wall-e/llm/provider-error.js +269 -0
  115. package/template/wall-e/llm/tool-adapter.js +48 -12
  116. package/template/wall-e/loops/boot.js +2 -1
  117. package/template/wall-e/loops/initiative.js +2 -2
  118. package/template/wall-e/loops/tasks.js +8 -47
  119. package/template/wall-e/loops/workspace-prompts.js +20 -0
  120. package/template/wall-e/mcp-server.js +442 -1
  121. package/template/wall-e/memory/session-ingest-service.js +159 -0
  122. package/template/wall-e/memory/source-indexer.js +289 -0
  123. package/template/wall-e/plugins/discovery.js +83 -0
  124. package/template/wall-e/plugins/manifest-loader.js +50 -10
  125. package/template/wall-e/plugins/manifest-schema.js +69 -0
  126. package/template/wall-e/plugins/model-catalog.js +55 -0
  127. package/template/wall-e/prompts/coding/base.txt +2 -0
  128. package/template/wall-e/prompts/coding/deepseek.txt +1 -0
  129. package/template/wall-e/prompts/coding/memory-protocol.md +9 -0
  130. package/template/wall-e/prompts/coding/plan.txt +1 -0
  131. package/template/wall-e/runtime/execution-trace.js +220 -0
  132. package/template/wall-e/security/audit.js +266 -0
  133. package/template/wall-e/security/ssrf.js +236 -0
  134. package/template/wall-e/session-files.js +303 -0
  135. package/template/wall-e/skills/_bundled/slack-backfill/SKILL.md +3 -0
  136. package/template/wall-e/skills/_bundled/slack-sync/SKILL.md +3 -0
  137. package/template/wall-e/skills/internal-skill-registry.js +2 -2
  138. package/template/wall-e/skills/script-skill-runner.js +143 -0
  139. package/template/wall-e/skills/skill-executor.js +5 -6
  140. package/template/wall-e/skills/skill-fallback.js +3 -1
  141. package/template/wall-e/skills/skill-harness-registry.js +7 -8
  142. package/template/wall-e/skills/skill-planner.js +52 -4
  143. package/template/wall-e/skills/slack-ingest.js +11 -3
  144. package/template/wall-e/sources/base.js +90 -0
  145. package/template/wall-e/sources/builtin.js +33 -0
  146. package/template/wall-e/sources/claude-code-jsonl.js +78 -0
  147. package/template/wall-e/sources/codex-jsonl.js +125 -0
  148. package/template/wall-e/sources/coding-session-utils.js +117 -0
  149. package/template/wall-e/sources/contract-suite.js +59 -0
  150. package/template/wall-e/sources/gemini-jsonl.js +85 -0
  151. package/template/wall-e/sources/index.js +9 -0
  152. package/template/wall-e/sources/jsonl-utils.js +181 -0
  153. package/template/wall-e/sources/record-types.js +252 -0
  154. package/template/wall-e/sources/registry.js +92 -0
  155. package/template/wall-e/sources/transforms.js +100 -0
  156. package/template/wall-e/sources/walle-jsonl.js +108 -0
  157. package/template/wall-e/tools/coding-middleware.js +31 -1
  158. package/template/wall-e/tools/file-tracker.js +25 -1
  159. package/template/wall-e/tools/local-tools.js +75 -47
  160. package/template/wall-e/tools/session-sharing.js +68 -1
  161. package/template/wall-e/tools/shell-analyzer.js +1 -1
  162. package/template/wall-e/tools/shell-policy.js +47 -0
  163. package/template/wall-e/tools/snapshot.js +42 -0
  164. package/template/wall-e/training/harvester.js +62 -5
  165. package/template/wall-e/utils/repair.js +253 -1
  166. package/template/website/index.html +3 -3
  167. package/template/wall-e/skills/_bundled/slack-mentions/.watched-threads.json +0 -18
@@ -8,6 +8,7 @@ const { pLimit, getAvailableProviders } = require('./head-to-head');
8
8
  const { runAgentBenchmark, runMultiTurnBenchmark } = require('./agent-runner');
9
9
  const { createClient } = require('../llm/client');
10
10
  const { createAnthropicFromEnv } = require('../llm/anthropic');
11
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
11
12
 
12
13
  // ============================================================
13
14
  // Benchmark suite loader
@@ -22,14 +23,89 @@ const SUITE_FILES = {
22
23
  'chat-eval': 'chat-eval.json',
23
24
  'reasoning': 'reasoning.json',
24
25
  'memory-retrieval': 'memory-retrieval.json',
25
- 'humaneval-plus': null, // handled by humaneval-adapter.js (no JSON file)
26
- 'livecodebench': null, // handled by livecodebench-adapter.js (no JSON file)
26
+ };
27
+
28
+ const ADAPTER_SUITE_METADATA = {
29
+ 'humaneval-plus': {
30
+ name: 'humaneval-plus',
31
+ count: null,
32
+ taskTypes: ['coding'],
33
+ difficulties: ['easy', 'medium', 'hard'],
34
+ adapter: true,
35
+ description: 'EvalPlus HumanEval+ Python function-generation tasks',
36
+ },
37
+ livecodebench: {
38
+ name: 'livecodebench',
39
+ count: null,
40
+ taskTypes: ['coding'],
41
+ difficulties: ['easy', 'medium', 'hard'],
42
+ adapter: true,
43
+ description: 'LiveCodeBench code-generation tasks with date filtering',
44
+ },
45
+ 'swebench-lite': {
46
+ name: 'swebench-lite',
47
+ count: 30,
48
+ taskTypes: ['coding-agent'],
49
+ difficulties: ['medium', 'hard'],
50
+ adapter: true,
51
+ description: 'Curated SWE-bench Lite issue-fixing tasks',
52
+ },
53
+ };
54
+
55
+ function isAdapterSuite(suiteName) {
56
+ return !!ADAPTER_SUITE_METADATA[suiteName];
57
+ }
58
+
59
+ function listAdapterSuites() {
60
+ return Object.values(ADAPTER_SUITE_METADATA).map((s) => ({ ...s }));
61
+ }
62
+
63
+ const DEFAULT_ADAPTER_RUNNERS = {
64
+ 'humaneval-plus': async ({ brain, providerInfo, model, runId, maxTasks, signal }) => {
65
+ const { runHumanEvalSuite } = require('./humaneval-adapter');
66
+ return runHumanEvalSuite({
67
+ brain,
68
+ providerType: providerInfo.provider,
69
+ config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
70
+ model,
71
+ runId,
72
+ maxTasks,
73
+ signal,
74
+ });
75
+ },
76
+ livecodebench: async ({ brain, providerInfo, model, runId, maxTasks, afterDate, signal }) => {
77
+ const { runLiveCodeBenchSuite } = require('./livecodebench-adapter');
78
+ return runLiveCodeBenchSuite({
79
+ brain,
80
+ providerType: providerInfo.provider,
81
+ config: { apiKey: providerInfo.apiKey, baseUrl: providerInfo.baseUrl },
82
+ model,
83
+ runId,
84
+ maxTasks,
85
+ afterDate,
86
+ signal,
87
+ });
88
+ },
89
+ 'swebench-lite': async ({ brain, providerInfo, model, runId, maxTasks, signal, runAgentLoop, timeoutMs }) => {
90
+ const { runSWEBenchSuite } = require('./swebench-adapter');
91
+ return runSWEBenchSuite({
92
+ brain,
93
+ provider: providerInfo.client,
94
+ providerType: providerInfo.provider,
95
+ model,
96
+ runId,
97
+ maxTasks,
98
+ signal,
99
+ runAgentLoop,
100
+ timeoutMs,
101
+ });
102
+ },
27
103
  };
28
104
 
29
105
  function loadSuite(suiteName) {
106
+ if (isAdapterSuite(suiteName)) return [];
30
107
  if (!(suiteName in SUITE_FILES)) throw new Error(`Unknown suite: ${suiteName}`);
31
108
  const file = SUITE_FILES[suiteName];
32
- if (!file) return []; // adapter-based suite (humaneval-plus, livecodebench) — loaded externally
33
109
  const filePath = path.join(BENCHMARKS_DIR, file);
34
110
  if (!fs.existsSync(filePath)) throw new Error(`Suite file not found: ${filePath}`);
35
111
  return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -83,6 +159,7 @@ class EvalOrchestrator extends EventEmitter {
83
159
  this.totalSpent = 0;
84
160
  this.running = false;
85
161
  this.aborted = false;
162
+ this.adapterRunners = { ...DEFAULT_ADAPTER_RUNNERS, ...(options.adapterRunners || {}) };
86
163
  }
87
164
 
88
165
  /**
@@ -94,12 +171,16 @@ class EvalOrchestrator extends EventEmitter {
94
171
  * @param {Function} params.runAgentLoop - Agent loop function for coding benchmarks
95
172
  * @returns {Promise<object>} Run summary
96
173
  */
97
- async run({ suite, models, benchmarkIds, runAgentLoop }) {
174
+ async run({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
98
175
  if (this.running) throw new Error('Orchestrator is already running');
99
176
  this.running = true;
100
177
  this.aborted = false;
101
178
 
102
179
  try {
180
+ if (isAdapterSuite(suite)) {
181
+ return await this._runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate });
182
+ }
183
+
103
184
  // 1. Load benchmarks
104
185
  const benchmarks = suite === 'all' ? loadAllSuites() : loadSuite(suite);
105
186
  const filtered = benchmarkIds && benchmarkIds.length > 0
@@ -330,7 +411,7 @@ class EvalOrchestrator extends EventEmitter {
330
411
  } else {
331
412
  client = createClient(providerType, { apiKey, baseUrl });
332
413
  }
333
- map[model] = { client, provider: providerType, registryId };
414
+ map[model] = { client, provider: providerType, registryId, apiKey, baseUrl };
334
415
  } catch (err) {
335
416
  this.emit('error', { benchmarkId: null, model, error: `Failed to create client: ${err.message}` });
336
417
  }
@@ -338,6 +419,102 @@ class EvalOrchestrator extends EventEmitter {
338
419
  return map;
339
420
  }
340
421
 
422
+ async _runAdapterSuite({ suite, models, benchmarkIds, runAgentLoop, maxTasks, afterDate }) {
423
+ const runner = this.adapterRunners[suite];
424
+ if (!runner) throw new Error(`No adapter runner configured for suite: ${suite}`);
425
+
426
+ if (!models || models.length === 0) {
427
+ const summary = { runId: this.runId, status: 'error', error: 'No models specified', models: {}, totalBenchmarks: 0, totalSpent: 0 };
428
+ this.emit('error', { benchmarkId: null, model: null, error: 'No models specified. Provide at least one model.' });
429
+ this.emit('run-complete', { runId: this.runId, summary });
430
+ return summary;
431
+ }
432
+
433
+ const providerMap = this._resolveProviders(models);
434
+ const adapterMaxTasks = maxTasks || (benchmarkIds && benchmarkIds.length ? benchmarkIds.length : undefined);
435
+ const limit = pLimit(this.concurrency);
436
+ const modelResults = {};
437
+
438
+ const orchestrator = this;
439
+ const adapterSignal = { get aborted() { return orchestrator.aborted; } };
440
+ const tasks = models.map((model) => limit(async () => {
441
+ const providerInfo = providerMap[model];
442
+ if (!providerInfo) {
443
+ this.emit('error', { benchmarkId: suite, model, error: `No provider found for model: ${model}` });
444
+ return null;
445
+ }
446
+
447
+ this.emit('benchmark-start', {
448
+ benchmarkId: suite,
449
+ model,
450
+ startedAt: new Date().toISOString(),
451
+ });
452
+
453
+ let result;
454
+ try {
455
+ result = await runner({
456
+ brain: this.brain,
457
+ providerInfo,
458
+ model,
459
+ runId: this.runId,
460
+ maxTasks: adapterMaxTasks,
461
+ afterDate,
462
+ signal: adapterSignal,
463
+ runAgentLoop,
464
+ timeoutMs: this.timeoutMs,
465
+ });
466
+ } catch (err) {
467
+ result = {
468
+ suite,
469
+ model,
470
+ totalTasks: 0,
471
+ avgScore: 0,
472
+ totalCost: 0,
473
+ error: err.message,
474
+ results: [],
475
+ };
476
+ this.emit('error', { benchmarkId: suite, model, error: err.message });
477
+ }
478
+
479
+ const totalCost = result.totalCost || 0;
480
+ if (!isLocalModel(providerInfo.provider)) {
481
+ this.spent[model] = (this.spent[model] || 0) + totalCost;
482
+ this.totalSpent += totalCost;
483
+ }
484
+ modelResults[model] = result;
485
+ this.emit('model-complete', {
486
+ model,
487
+ avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
488
+ totalCost: Math.round(totalCost * 1_000_000) / 1_000_000,
489
+ benchmarksRun: result.totalTasks || result.results?.length || 0,
490
+ });
491
+ return result;
492
+ }));
493
+
494
+ await Promise.all(tasks);
495
+
496
+ const summary = {
497
+ runId: this.runId,
498
+ status: this.aborted ? 'aborted' : 'complete',
499
+ totalBenchmarks: Object.values(modelResults).reduce((s, r) => s + (r.totalTasks || r.results?.length || 0), 0),
500
+ totalSpent: Math.round(this.totalSpent * 1_000_000) / 1_000_000,
501
+ models: {},
502
+ suite,
503
+ adapter: true,
504
+ };
505
+ for (const [model, result] of Object.entries(modelResults)) {
506
+ summary.models[model] = {
507
+ avgScore: Math.round((result.avgScore || 0) * 1000) / 1000,
508
+ totalCost: Math.round((result.totalCost || 0) * 1_000_000) / 1_000_000,
509
+ benchmarksRun: result.totalTasks || result.results?.length || 0,
510
+ errors: (result.results || []).filter(r => r.error).length + (result.error ? 1 : 0),
511
+ };
512
+ }
513
+
514
+ this.emit('run-complete', { runId: this.runId, summary });
515
+ return summary;
516
+ }
517
+
341
518
  /**
342
519
  * Get set of completed benchmark keys for this runId (for resume).
343
520
  */
@@ -363,7 +540,10 @@ class EvalOrchestrator extends EventEmitter {
363
540
  if (!this.brain || typeof this.brain.insertBenchmarkResult !== 'function') return;
364
541
 
365
542
  try {
366
- this.brain.insertBenchmarkResult({
543
+ const scoringMethod = item.benchmark.agentExpectations?.testCommand
544
+ ? 'agent-rubric+tests'
545
+ : 'agent-rubric';
546
+ this.brain.insertBenchmarkResult(decorateBenchmarkResult({
367
547
  runId: this.runId,
368
548
  suite: item.benchmark._suite || 'coding-agent',
369
549
  promptId: item.benchmark.id,
@@ -384,7 +564,31 @@ class EvalOrchestrator extends EventEmitter {
384
564
  dimensionsJson: result.dimensionsJson || null,
385
565
  inputTokens: result.inputTokens ?? null,
386
566
  outputTokens: result.outputTokens ?? null,
387
- });
567
+ scorerVersion: DEFAULT_SCORER_VERSION,
568
+ scoringMethod,
569
+ trusted: !result.error && result.testsPassed === true,
570
+ runConfig: {
571
+ timeoutMs: this.timeoutMs,
572
+ concurrency: this.concurrency,
573
+ budgetDollars: this.budgetDollars,
574
+ scoringMethod,
575
+ },
576
+ }, {
577
+ suite: item.benchmark._suite || 'coding-agent',
578
+ benchmark: item.benchmark,
579
+ runId: this.runId,
580
+ provider: item.provider.provider,
581
+ model: item.model,
582
+ scoringMethod,
583
+ scorerVersion: DEFAULT_SCORER_VERSION,
584
+ trusted: !result.error && result.testsPassed === true,
585
+ runConfig: {
586
+ timeoutMs: this.timeoutMs,
587
+ concurrency: this.concurrency,
588
+ budgetDollars: this.budgetDollars,
589
+ scoringMethod,
590
+ },
591
+ }));
388
592
  } catch { /* non-fatal */ }
389
593
  }
390
594
 
@@ -427,4 +631,10 @@ class EvalOrchestrator extends EventEmitter {
427
631
  }
428
632
  }
429
633
 
430
- module.exports = { EvalOrchestrator };
634
+ module.exports = {
635
+ EvalOrchestrator,
636
+ ADAPTER_SUITE_METADATA,
637
+ isAdapterSuite,
638
+ listAdapterSuites,
639
+ loadSuite,
640
+ };
@@ -5,6 +5,17 @@ const path = require('path');
5
5
  const { createHash } = require('crypto');
6
6
  const { execFileSync } = require('child_process');
7
7
 
8
+ let claudeDesktopSessions = null;
9
+ function getClaudeDesktopSessions() {
10
+ if (claudeDesktopSessions) return claudeDesktopSessions;
11
+ try {
12
+ claudeDesktopSessions = require('../../claude-task-manager/lib/claude-desktop-sessions');
13
+ } catch {
14
+ return null;
15
+ }
16
+ return claudeDesktopSessions;
17
+ }
18
+
8
19
  // --- Task type classification ---
9
20
 
10
21
  function classifyTaskType(content) {
@@ -86,6 +97,46 @@ async function harvestClaudeCodeSessions(since) {
86
97
  return samples;
87
98
  }
88
99
 
100
+ // --- Claude Desktop Session Harvesting ---
101
+
102
+ async function harvestClaudeDesktopSessions(since) {
103
+ const reader = getClaudeDesktopSessions();
104
+ if (!reader) return [];
105
+
106
+ const sessions = reader.listSessions();
107
+ const samples = [];
108
+
109
+ for (const session of sessions) {
110
+ if (since && session.updatedAt && session.updatedAt <= since) continue;
111
+ const messages = Array.isArray(session.messages) ? session.messages : [];
112
+ for (let i = 0; i < messages.length - 1; i++) {
113
+ const userMsg = messages[i];
114
+ const assistantMsg = messages[i + 1];
115
+ if (userMsg.role !== 'user' || assistantMsg.role !== 'assistant') continue;
116
+ const userContent = userMsg.text || '';
117
+ const assistantContent = assistantMsg.text || '';
118
+ if (!userContent || userContent.length < 20) continue;
119
+ if (!assistantContent || assistantContent.length < 20) continue;
120
+
121
+ samples.push({
122
+ id: contentHash('claude-desktop', `${session.uuid}:${i}:${userContent}`),
123
+ source: 'claude-desktop',
124
+ session_id: session.uuid,
125
+ timestamp: userMsg.timestamp || session.updatedAt || session.createdAt || new Date().toISOString(),
126
+ task_type: classifyTaskType(userContent),
127
+ prompt: userContent,
128
+ response: assistantContent,
129
+ tool_calls: [],
130
+ outcome: 'unknown',
131
+ outcome_signal: { git_committed: false, git_diff: null, task_status: null, user_corrected: false },
132
+ model: session.model || 'unknown',
133
+ quality_label: 0.5,
134
+ });
135
+ }
136
+ }
137
+ return samples;
138
+ }
139
+
89
140
  // --- Codex Session Harvesting ---
90
141
 
91
142
  async function harvestCodexSessions(since) {
@@ -144,12 +195,13 @@ async function harvestCodexSessions(since) {
144
195
 
145
196
  // --- CTM Session Harvesting ---
146
197
 
147
- async function harvestCtmSessions(since) {
148
- const dataDir = process.env.WALL_E_DATA_DIR || path.join(process.env.HOME, '.walle', 'data');
198
+ async function harvestCtmSessions(since, dataDirOverride = null) {
199
+ const dataDir = dataDirOverride || process.env.WALL_E_DATA_DIR || path.join(process.env.HOME, '.walle', 'data');
149
200
  const ctmDbPath = path.join(dataDir, 'task-manager.db');
150
201
  if (!fs.existsSync(ctmDbPath)) return [];
151
202
 
152
- const Database = require('better-sqlite3');
203
+ let Database;
204
+ try { Database = require('better-sqlite3'); } catch { return []; }
153
205
  let ctmDb;
154
206
  try {
155
207
  ctmDb = new Database(ctmDbPath, { readonly: true, fileMustExist: true });
@@ -534,10 +586,11 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
534
586
 
535
587
  // Harvest from each source
536
588
  const claudeSamples = await harvestClaudeCodeSessions(getSince('claude-code'));
589
+ const claudeDesktopSamples = await harvestClaudeDesktopSessions(getSince('claude-desktop'));
537
590
  const codexSamples = await harvestCodexSessions(getSince('codex'));
538
591
  const chatSamples = await harvestWalleChat(brain, getSince('walle-chat'));
539
592
  const taskSamples = await harvestWalleTasks(brain, getSince('walle-task'));
540
- const ctmSamples = await harvestCtmSessions(getSince('ctm-sessions'));
593
+ const ctmSamples = await harvestCtmSessions(getSince('ctm-sessions'), dataDir);
541
594
 
542
595
  // Harvest coding agent sessions and store in brain
543
596
  const codingAgentSessions = await harvestCodingAgentSessions(getSince('coding-agent'));
@@ -549,7 +602,7 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
549
602
  if (typeof brain.insertCodingSession === 'function') brain.insertCodingSession(session);
550
603
  }
551
604
 
552
- allSamples.push(...claudeSamples, ...codexSamples, ...chatSamples, ...taskSamples, ...ctmSamples);
605
+ allSamples.push(...claudeSamples, ...claudeDesktopSamples, ...codexSamples, ...chatSamples, ...taskSamples, ...ctmSamples);
553
606
 
554
607
  // Deduplicate by content hash
555
608
  const seen = new Set();
@@ -584,6 +637,9 @@ async function runHarvest({ incremental = true, brain, dataDir } = {}) {
584
637
  if (claudeSamples.length > 0) {
585
638
  brain.updateHarvestState('claude-code', { lastProcessedAt: now, totalHarvested: claudeSamples.length });
586
639
  }
640
+ if (claudeDesktopSamples.length > 0) {
641
+ brain.updateHarvestState('claude-desktop', { lastProcessedAt: now, totalHarvested: claudeDesktopSamples.length });
642
+ }
587
643
  if (codexSamples.length > 0) {
588
644
  brain.updateHarvestState('codex', { lastProcessedAt: now, totalHarvested: codexSamples.length });
589
645
  }
@@ -610,6 +666,7 @@ module.exports = {
610
666
  classifyTaskType,
611
667
  contentHash,
612
668
  harvestClaudeCodeSessions,
669
+ harvestClaudeDesktopSessions,
613
670
  harvestCodexSessions,
614
671
  harvestCtmSessions,
615
672
  harvestWalleChat,
@@ -3,7 +3,7 @@
3
3
  const { randomUUID } = require('node:crypto');
4
4
  const { createClient } = require('../llm/client');
5
5
  const { heuristicScore } = require('./evaluator');
6
- const { scoreTraits } = require('./benchmarks');
6
+ const { scoreTraitsDetailed, TRAIT_MATCHERS, UNSCORABLE_TRAITS } = require('./benchmarks');
7
7
 
8
8
  // ============================================================
9
9
  // Concurrency limiter (inline, no external dependency)
@@ -122,7 +122,28 @@ function getAvailableProviders(brain) {
122
122
 
123
123
  function traitScore(response, expectedTraits) {
124
124
  if (!expectedTraits || expectedTraits.length === 0) return null;
125
- return scoreTraits(response, expectedTraits);
125
+ const text = String(response || '').toLowerCase();
126
+ let matched = 0;
127
+ let scored = 0;
128
+
129
+ const knownTraits = [];
130
+ for (const trait of expectedTraits) {
131
+ if (UNSCORABLE_TRAITS.has(trait)) continue;
132
+ if (TRAIT_MATCHERS[trait]) {
133
+ knownTraits.push(trait);
134
+ continue;
135
+ }
136
+ scored++;
137
+ if (text.includes(String(trait).toLowerCase())) matched++;
138
+ }
139
+
140
+ if (knownTraits.length > 0) {
141
+ const detail = scoreTraitsDetailed(response, knownTraits);
142
+ matched += detail.matched.length;
143
+ scored += detail.scoredCount;
144
+ }
145
+
146
+ return scored > 0 ? matched / scored : 0;
126
147
  }
127
148
 
128
149
  // ============================================================
@@ -6,6 +6,7 @@ const crypto = require('crypto');
6
6
  const { execFileSync } = require('child_process');
7
7
  const { createClient } = require('../llm/client');
8
8
  const { resolveModelName } = require('./agent-runner');
9
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
9
10
 
10
11
  const SUITE_NAME = 'humaneval-plus';
11
12
  const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
@@ -192,7 +193,8 @@ async function runHumanEvalTask(task, options = {}) {
192
193
  costDollars,
193
194
  response: response.slice(0, 2000),
194
195
  code: code.slice(0, 2000),
195
- error: error || testError || null,
196
+ error: error || null,
197
+ testError: testError || null,
196
198
  usage,
197
199
  };
198
200
  }
@@ -222,13 +224,13 @@ function scoreHumanEvalQuality(code) {
222
224
  * @returns {Promise<object>} Suite results
223
225
  */
224
226
  async function runHumanEvalSuite(options = {}) {
225
- const { brain, providerType, config, model, maxTasks, signal } = options;
227
+ const { brain, providerType, config, model, maxTasks, signal, runId: providedRunId } = options;
226
228
 
227
229
  const allTasks = await loadHumanEvalDataset();
228
230
  const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
229
231
 
230
232
  const client = createClient(providerType || 'anthropic', config || {});
231
- const runId = crypto.randomUUID();
233
+ const runId = providedRunId || crypto.randomUUID();
232
234
  const results = [];
233
235
  let totalPassed = 0;
234
236
 
@@ -247,7 +249,8 @@ async function runHumanEvalSuite(options = {}) {
247
249
  // Store in brain
248
250
  if (brain && typeof brain.insertBenchmarkResult === 'function') {
249
251
  try {
250
- brain.insertBenchmarkResult({
252
+ const scoringMethod = 'executable-tests';
253
+ brain.insertBenchmarkResult(decorateBenchmarkResult({
251
254
  runId,
252
255
  suite: SUITE_NAME,
253
256
  promptId: task.task_id,
@@ -266,7 +269,29 @@ async function runHumanEvalSuite(options = {}) {
266
269
  testsAfter: result.passed ? 1 : 0,
267
270
  totalTests: 1,
268
271
  dimensionsJson: JSON.stringify(result.score.dimensions),
269
- });
272
+ modelMetadataJson: JSON.stringify({ testError: result.testError || null }),
273
+ datasetVersion: 'humaneval-plus:evalplus-master',
274
+ scorerVersion: DEFAULT_SCORER_VERSION,
275
+ scoringMethod,
276
+ trusted: !result.error,
277
+ runConfig: { maxTasks, scoringMethod },
278
+ }, {
279
+ suite: SUITE_NAME,
280
+ benchmark: {
281
+ id: task.task_id,
282
+ prompt: task.prompt,
283
+ taskType: 'coding',
284
+ difficulty: taskDifficulty(task.task_id),
285
+ datasetVersion: 'humaneval-plus:evalplus-master',
286
+ },
287
+ runId,
288
+ provider: providerType || 'unknown',
289
+ model: resolveModelName(model),
290
+ scorerVersion: DEFAULT_SCORER_VERSION,
291
+ scoringMethod,
292
+ trusted: !result.error,
293
+ runConfig: { maxTasks, scoringMethod },
294
+ }));
270
295
  } catch {}
271
296
  }
272
297
  }
@@ -7,6 +7,7 @@ const crypto = require('crypto');
7
7
  const { execFileSync } = require('child_process');
8
8
  const { createClient } = require('../llm/client');
9
9
  const { resolveModelName } = require('./agent-runner');
10
+ const { decorateBenchmarkResult, DEFAULT_SCORER_VERSION } = require('./manifest');
10
11
 
11
12
  const SUITE_NAME = 'livecodebench';
12
13
  const CACHE_DIR = path.join(os.homedir(), '.walle', 'eval-cache');
@@ -187,7 +188,7 @@ async function runLiveCodeBenchTask(task, options = {}) {
187
188
  * Run the full LiveCodeBench suite.
188
189
  */
189
190
  async function runLiveCodeBenchSuite(options = {}) {
190
- const { brain, providerType, config, model, maxTasks, signal, afterDate } = options;
191
+ const { brain, providerType, config, model, maxTasks, signal, afterDate, runId: providedRunId } = options;
191
192
 
192
193
  let allTasks = await loadLiveCodeBenchDataset();
193
194
 
@@ -201,7 +202,7 @@ async function runLiveCodeBenchSuite(options = {}) {
201
202
 
202
203
  const tasks = maxTasks ? allTasks.slice(0, maxTasks) : allTasks;
203
204
  const client = createClient(providerType || 'anthropic', config || {});
204
- const runId = crypto.randomUUID();
205
+ const runId = providedRunId || crypto.randomUUID();
205
206
  const results = [];
206
207
  let totalPassed = 0;
207
208
 
@@ -219,7 +220,9 @@ async function runLiveCodeBenchSuite(options = {}) {
219
220
  // Store in brain
220
221
  if (brain && typeof brain.insertBenchmarkResult === 'function') {
221
222
  try {
222
- brain.insertBenchmarkResult({
223
+ const scoringMethod = 'executable-tests';
224
+ const prompt = task.question_content || task.prompt || task.description || '';
225
+ brain.insertBenchmarkResult(decorateBenchmarkResult({
223
226
  runId,
224
227
  suite: SUITE_NAME,
225
228
  promptId: result.taskId,
@@ -227,7 +230,7 @@ async function runLiveCodeBenchSuite(options = {}) {
227
230
  difficulty: taskDifficulty(task),
228
231
  provider: providerType || 'unknown',
229
232
  model: resolveModelName(model),
230
- prompt: (task.question_content || '').slice(0, 2000),
233
+ prompt: prompt.slice(0, 2000),
231
234
  response: result.response || '',
232
235
  traitScore: null,
233
236
  compositeScore: result.score.composite,
@@ -238,7 +241,28 @@ async function runLiveCodeBenchSuite(options = {}) {
238
241
  testsAfter: result.passedCases || 0,
239
242
  totalTests: result.totalCases || 0,
240
243
  dimensionsJson: JSON.stringify(result.score.dimensions),
241
- });
244
+ datasetVersion: 'livecodebench:release_v6',
245
+ scorerVersion: DEFAULT_SCORER_VERSION,
246
+ scoringMethod,
247
+ trusted: !result.error,
248
+ runConfig: { maxTasks, afterDate, scoringMethod },
249
+ }, {
250
+ suite: SUITE_NAME,
251
+ benchmark: {
252
+ id: result.taskId,
253
+ prompt,
254
+ taskType: 'coding',
255
+ difficulty: taskDifficulty(task),
256
+ datasetVersion: 'livecodebench:release_v6',
257
+ },
258
+ runId,
259
+ provider: providerType || 'unknown',
260
+ model: resolveModelName(model),
261
+ scorerVersion: DEFAULT_SCORER_VERSION,
262
+ scoringMethod,
263
+ trusted: !result.error,
264
+ runConfig: { maxTasks, afterDate, scoringMethod },
265
+ }));
242
266
  } catch {}
243
267
  }
244
268
  }