openclaw-node-harness 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/mesh-agent.js +417 -94
- package/bin/mesh-join-token.js +147 -0
- package/bin/mesh-node-remove.js +277 -0
- package/bin/mesh-task-daemon.js +723 -15
- package/bin/openclaw-node-init.js +733 -0
- package/cli.js +8 -10
- package/lib/llm-providers.js +262 -0
- package/lib/mesh-collab.js +549 -0
- package/lib/mesh-plans.js +528 -0
- package/lib/mesh-tasks.js +50 -34
- package/package.json +4 -1
package/bin/mesh-agent.js
CHANGED
|
@@ -3,15 +3,21 @@
|
|
|
3
3
|
/**
|
|
4
4
|
* mesh-agent.js — Mesh worker agent for OpenClaw.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
6
|
+
* LLM-agnostic architecture: external wrapper around any LLM CLI.
|
|
7
7
|
* The outer loop is mechanical Node.js code. The inner loop is the LLM.
|
|
8
8
|
* The LLM has no awareness of the mesh — it gets a clean task prompt.
|
|
9
9
|
*
|
|
10
|
+
* Supported LLM backends (via lib/llm-providers.js):
|
|
11
|
+
* claude — Anthropic Claude Code CLI
|
|
12
|
+
* openai — OpenAI Codex/GPT CLI
|
|
13
|
+
* shell — Raw shell execution (no LLM)
|
|
14
|
+
* (custom providers can be registered at runtime)
|
|
15
|
+
*
|
|
10
16
|
* Flow:
|
|
11
17
|
* 1. Connect to NATS
|
|
12
18
|
* 2. Claim next available task from mesh-task-daemon
|
|
13
19
|
* 3. Construct prompt from task schema
|
|
14
|
-
* 4. Run
|
|
20
|
+
* 4. Run LLM CLI (non-interactive)
|
|
15
21
|
* 5. Evaluate metric (if defined)
|
|
16
22
|
* 6. If metric fails → log attempt, retry with failure context
|
|
17
23
|
* 7. If metric passes or no metric → report completion
|
|
@@ -22,10 +28,11 @@
|
|
|
22
28
|
* The outer loop is deterministic. The LLM owns the problem-solving.
|
|
23
29
|
*
|
|
24
30
|
* Usage:
|
|
25
|
-
* node mesh-agent.js
|
|
26
|
-
* node mesh-agent.js --once
|
|
27
|
-
* node mesh-agent.js --model sonnet
|
|
28
|
-
* node mesh-agent.js --
|
|
31
|
+
* node mesh-agent.js # run worker (default provider)
|
|
32
|
+
* node mesh-agent.js --once # claim one task, execute, exit
|
|
33
|
+
* node mesh-agent.js --model sonnet # override model
|
|
34
|
+
* node mesh-agent.js --provider openai # use OpenAI backend
|
|
35
|
+
* node mesh-agent.js --dry-run # claim + build prompt, don't execute
|
|
29
36
|
*/
|
|
30
37
|
|
|
31
38
|
const { connect, StringCodec } = require('nats');
|
|
@@ -36,12 +43,12 @@ const fs = require('fs');
|
|
|
36
43
|
const { getActivityState, getSessionInfo } = require('../lib/agent-activity');
|
|
37
44
|
|
|
38
45
|
const sc = StringCodec();
|
|
39
|
-
const { NATS_URL
|
|
46
|
+
const { NATS_URL } = require('../lib/nats-resolve');
|
|
47
|
+
const { resolveProvider, resolveModel } = require('../lib/llm-providers');
|
|
40
48
|
const NODE_ID = process.env.MESH_NODE_ID || os.hostname().toLowerCase().replace(/[^a-z0-9-]/g, '-');
|
|
41
49
|
const POLL_INTERVAL = parseInt(process.env.MESH_POLL_INTERVAL || '15000'); // 15s between polls
|
|
42
50
|
const MAX_ATTEMPTS = parseInt(process.env.MESH_MAX_ATTEMPTS || '3');
|
|
43
51
|
const HEARTBEAT_INTERVAL = parseInt(process.env.MESH_HEARTBEAT_INTERVAL || '60000'); // 60s heartbeat
|
|
44
|
-
const CLAUDE_PATH = process.env.CLAUDE_PATH || '/usr/local/bin/claude';
|
|
45
52
|
const WORKSPACE = process.env.MESH_WORKSPACE || path.join(process.env.HOME, '.openclaw', 'workspace');
|
|
46
53
|
|
|
47
54
|
// ── CLI args ──────────────────────────────────────────
|
|
@@ -49,10 +56,15 @@ const WORKSPACE = process.env.MESH_WORKSPACE || path.join(process.env.HOME, '.op
|
|
|
49
56
|
const args = process.argv.slice(2);
|
|
50
57
|
const ONCE = args.includes('--once');
|
|
51
58
|
const DRY_RUN = args.includes('--dry-run');
|
|
52
|
-
const
|
|
59
|
+
const CLI_MODEL = (() => {
|
|
53
60
|
const idx = args.indexOf('--model');
|
|
54
|
-
return idx >= 0 && args[idx + 1] ? args[idx + 1] :
|
|
61
|
+
return idx >= 0 && args[idx + 1] ? args[idx + 1] : null;
|
|
62
|
+
})();
|
|
63
|
+
const CLI_PROVIDER = (() => {
|
|
64
|
+
const idx = args.indexOf('--provider');
|
|
65
|
+
return idx >= 0 && args[idx + 1] ? args[idx + 1] : null;
|
|
55
66
|
})();
|
|
67
|
+
const ENV_PROVIDER = process.env.MESH_LLM_PROVIDER || null;
|
|
56
68
|
|
|
57
69
|
let nc;
|
|
58
70
|
let running = true;
|
|
@@ -61,12 +73,12 @@ let currentTaskId = null; // tracks active task for alive-check responses
|
|
|
61
73
|
// ── Agent State File (read by mesh-health-publisher) ──
|
|
62
74
|
const AGENT_STATE_PATH = path.join(os.homedir(), '.openclaw', '.tmp', 'agent-state.json');
|
|
63
75
|
|
|
64
|
-
function writeAgentState(status, taskId) {
|
|
76
|
+
function writeAgentState(status, taskId, provider, model) {
|
|
65
77
|
try {
|
|
66
78
|
fs.writeFileSync(AGENT_STATE_PATH, JSON.stringify({
|
|
67
79
|
status, taskId: taskId || null,
|
|
68
|
-
llm: status === 'working' ? '
|
|
69
|
-
model: status === 'working' ?
|
|
80
|
+
llm: status === 'working' ? (provider || 'unknown') : null,
|
|
81
|
+
model: status === 'working' ? (model || null) : null,
|
|
70
82
|
}));
|
|
71
83
|
} catch { /* best-effort */ }
|
|
72
84
|
}
|
|
@@ -272,13 +284,9 @@ function commitAndMergeWorktree(worktreePath, taskId, summary) {
|
|
|
272
284
|
// Stage and commit all changes
|
|
273
285
|
execSync('git add -A', { cwd: worktreePath, timeout: 10000, stdio: 'pipe' });
|
|
274
286
|
const commitMsg = `mesh(${taskId}): ${(summary || 'task completed').slice(0, 72)}`;
|
|
275
|
-
|
|
276
|
-
const commitResult = spawnSync('git', ['commit', '-m', commitMsg], {
|
|
287
|
+
execSync(`git commit -m "${commitMsg.replace(/"/g, '\\"')}"`, {
|
|
277
288
|
cwd: worktreePath, timeout: 10000, stdio: 'pipe',
|
|
278
289
|
});
|
|
279
|
-
if (commitResult.status !== 0) {
|
|
280
|
-
throw new Error(`git commit failed: ${commitResult.stderr?.toString() || 'unknown error'}`);
|
|
281
|
-
}
|
|
282
290
|
|
|
283
291
|
const sha = execSync('git rev-parse --short HEAD', {
|
|
284
292
|
cwd: worktreePath, timeout: 5000, encoding: 'utf-8',
|
|
@@ -335,94 +343,52 @@ function cleanupWorktree(worktreePath, keep = false) {
|
|
|
335
343
|
}
|
|
336
344
|
}
|
|
337
345
|
|
|
338
|
-
// ──
|
|
346
|
+
// ── LLM Execution ────────────────────────────────────
|
|
339
347
|
|
|
340
348
|
/**
|
|
341
|
-
* Run
|
|
349
|
+
* Run an LLM CLI with a prompt. Returns { exitCode, stdout, stderr, provider, model }.
|
|
350
|
+
* LLM-agnostic: provider is resolved per-task from task.llm_provider, env, or CLI flag.
|
|
342
351
|
* Sends heartbeats to the daemon every HEARTBEAT_INTERVAL to prevent stall detection.
|
|
343
352
|
*
|
|
344
353
|
* @param {string} prompt
|
|
345
354
|
* @param {object} task
|
|
346
|
-
* @param {string|null} worktreePath - If set,
|
|
355
|
+
* @param {string|null} worktreePath - If set, LLM accesses this worktree instead of WORKSPACE
|
|
347
356
|
*/
|
|
348
|
-
function
|
|
357
|
+
function runLLM(prompt, task, worktreePath) {
|
|
349
358
|
return new Promise((resolve) => {
|
|
350
|
-
const
|
|
351
|
-
|
|
352
|
-
'--output-format', 'text',
|
|
353
|
-
'--model', MODEL,
|
|
354
|
-
'--permission-mode', 'bypassPermissions',
|
|
355
|
-
// SECURITY NOTE: bypassPermissions is intentional for mesh agents.
|
|
356
|
-
// Tasks run in isolated worktrees with no interactive terminal.
|
|
357
|
-
// The agent needs autonomous execution without permission prompts.
|
|
358
|
-
// Safety is enforced at the mesh level: budget limits, scope restrictions,
|
|
359
|
-
// and human review of all results before merge to main.
|
|
360
|
-
// Note: --no-session-persistence removed to enable JSONL activity tracking
|
|
361
|
-
// Claude writes session files to ~/.claude/projects/{encoded-cwd}/
|
|
362
|
-
// which agent-activity.js reads for cost, summary, and activity state
|
|
363
|
-
];
|
|
364
|
-
|
|
365
|
-
// Use worktree if available, otherwise fall back to workspace
|
|
366
|
-
const targetDir = worktreePath || WORKSPACE;
|
|
367
|
-
args.push('--add-dir', targetDir);
|
|
359
|
+
const provider = resolveProvider(task, CLI_PROVIDER, ENV_PROVIDER);
|
|
360
|
+
const model = resolveModel(task, CLI_MODEL, provider);
|
|
368
361
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
if (worktreePath) {
|
|
372
|
-
args.push('--add-dir', WORKSPACE);
|
|
373
|
-
}
|
|
374
|
-
|
|
375
|
-
// Add scope directories if specified (with path traversal validation)
|
|
376
|
-
if (task.scope.length > 0) {
|
|
377
|
-
const addedDirs = new Set([targetDir, WORKSPACE]);
|
|
378
|
-
for (const s of task.scope) {
|
|
379
|
-
// Resolve against both workspace and worktree
|
|
380
|
-
for (const base of [targetDir, WORKSPACE]) {
|
|
381
|
-
const resolved = path.resolve(base, s);
|
|
382
|
-
const resolvedDir = path.dirname(resolved);
|
|
383
|
-
if (!resolved.startsWith(base) && !resolved.startsWith('/tmp/')) continue;
|
|
384
|
-
if (addedDirs.has(resolvedDir)) continue;
|
|
385
|
-
addedDirs.add(resolvedDir);
|
|
386
|
-
args.push('--add-dir', resolvedDir);
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
}
|
|
362
|
+
const targetDir = worktreePath || WORKSPACE;
|
|
363
|
+
const llmArgs = provider.buildArgs(prompt, model, task, targetDir, WORKSPACE);
|
|
390
364
|
|
|
391
|
-
log(`Spawning:
|
|
365
|
+
log(`Spawning [${provider.name}]: ${provider.binary} ${llmArgs.slice(0, 6).join(' ')} ... (target: ${worktreePath ? 'worktree' : 'workspace'})`);
|
|
392
366
|
|
|
393
|
-
// Use a clean temp directory as cwd to avoid loading workspace
|
|
394
|
-
// (which triggers the full Daedalus boot sequence and eats the entire budget)
|
|
367
|
+
// Use a clean temp directory as cwd to avoid loading workspace config files
|
|
395
368
|
const cleanCwd = path.join(os.tmpdir(), 'mesh-agent-work');
|
|
396
369
|
if (!fs.existsSync(cleanCwd)) fs.mkdirSync(cleanCwd, { recursive: true });
|
|
397
370
|
|
|
398
|
-
|
|
399
|
-
const cleanEnv = { ...process.env };
|
|
400
|
-
delete cleanEnv.CLAUDECODE;
|
|
371
|
+
const cleanEnv = provider.cleanEnv(process.env);
|
|
401
372
|
|
|
402
|
-
const child = spawn(
|
|
373
|
+
const child = spawn(provider.binary, llmArgs, {
|
|
403
374
|
cwd: cleanCwd,
|
|
404
375
|
env: cleanEnv,
|
|
405
|
-
stdio: ['ignore', 'pipe', 'pipe'], // stdin must be 'ignore' —
|
|
376
|
+
stdio: ['ignore', 'pipe', 'pipe'], // stdin must be 'ignore' — some CLIs block on piped stdin
|
|
406
377
|
timeout: (task.budget_minutes || 30) * 60 * 1000, // kill if exceeds budget
|
|
407
378
|
});
|
|
408
379
|
|
|
409
|
-
// Heartbeat: signal daemon with
|
|
380
|
+
// Heartbeat: signal daemon with activity state
|
|
410
381
|
const heartbeatTimer = setInterval(async () => {
|
|
411
382
|
try {
|
|
412
|
-
// Read Claude's JSONL session file for real activity state (zero token cost)
|
|
413
|
-
// KNOWN LIMITATION: If Claude transitions working→ready→working within one
|
|
414
|
-
// heartbeat interval (60s), the ready state is missed. Acceptable for V1
|
|
415
|
-
// (used for visibility only, not triggering reactions). Revisit if reactions
|
|
416
|
-
// depend on seeing transient states.
|
|
417
383
|
const activity = await getActivityState(cleanCwd);
|
|
418
384
|
const payload = { task_id: task.task_id };
|
|
419
385
|
if (activity) {
|
|
420
|
-
payload.activity_state = activity.state;
|
|
386
|
+
payload.activity_state = activity.state;
|
|
421
387
|
payload.activity_timestamp = activity.timestamp?.toISOString();
|
|
422
388
|
}
|
|
423
389
|
await natsRequest('mesh.tasks.heartbeat', payload);
|
|
424
390
|
} catch {
|
|
425
|
-
// fire-and-forget
|
|
391
|
+
// fire-and-forget
|
|
426
392
|
}
|
|
427
393
|
}, HEARTBEAT_INTERVAL);
|
|
428
394
|
|
|
@@ -434,12 +400,12 @@ function runClaude(prompt, task, worktreePath) {
|
|
|
434
400
|
|
|
435
401
|
child.on('close', (code) => {
|
|
436
402
|
clearInterval(heartbeatTimer);
|
|
437
|
-
resolve({ exitCode: code, stdout, stderr });
|
|
403
|
+
resolve({ exitCode: code, stdout, stderr, provider: provider.name, model });
|
|
438
404
|
});
|
|
439
405
|
|
|
440
406
|
child.on('error', (err) => {
|
|
441
407
|
clearInterval(heartbeatTimer);
|
|
442
|
-
resolve({ exitCode: 1, stdout: '', stderr: err.message });
|
|
408
|
+
resolve({ exitCode: 1, stdout: '', stderr: err.message, provider: provider.name, model });
|
|
443
409
|
});
|
|
444
410
|
});
|
|
445
411
|
}
|
|
@@ -472,6 +438,338 @@ function evaluateMetric(metric, cwd) {
|
|
|
472
438
|
});
|
|
473
439
|
}
|
|
474
440
|
|
|
441
|
+
// ── Collab Prompt Construction ────────────────────────
|
|
442
|
+
|
|
443
|
+
/**
 * Build the prompt text for one round of a collaborative task.
 *
 * Sections, in order: title + collaboration mode + role, task description (if any),
 * shared intelligence (rounds after the first only), scope (editable or review-only),
 * success criteria (if any), general instructions, and the required JSON reflection
 * block format that parseReflection() later reads back.
 *
 * @param {object} task - Task record; reads title, description, collaboration.mode, success_criteria.
 * @param {number} roundNumber - Round number; shared intel is only included when > 1.
 * @param {string|null} sharedIntel - Text shared by other nodes in the previous round.
 * @param {Array|string|null} myScope - Scope entries for this node. Only a non-empty array whose
 *   first entry is not '*' produces a scope section. Entries starting with "[REVIEW-ONLY]"
 *   switch the section to review-only wording.
 * @param {string} myRole - Role label interpolated into the prompt.
 * @returns {string} Prompt lines joined with '\n'.
 */
function buildCollabPrompt(task, roundNumber, sharedIntel, myScope, myRole) {
  const REVIEW_TAG = '[REVIEW-ONLY]';
  // Strip the tag plus any whitespace after it, only when it is a prefix.
  const REVIEW_TAG_PREFIX = /^\[REVIEW-ONLY\]\s*/;

  const parts = [
    `# Task: ${task.title} (Collaborative Round ${roundNumber})`,
    '',
    `You are working on this task as part of a **${task.collaboration.mode}** collaboration with other nodes.`,
    `Your role: **${myRole}**`,
    '',
  ];

  if (task.description) {
    parts.push(task.description, '');
  }

  if (roundNumber > 1 && sharedIntel) {
    parts.push(
      '## Shared Intelligence from Previous Round',
      'Other nodes shared the following reflections. Use this to inform your work:',
      '',
      sharedIntel,
      '',
    );
  }

  // Normalise entries to strings so a stray non-string entry cannot throw below.
  const scopeEntries = Array.isArray(myScope) ? myScope.map((entry) => String(entry)) : [];
  // An empty scope or a leading '*' means "no restriction": emit no scope section at all
  // rather than an "Only modify these files" header with nothing listed under it.
  const hasRestrictedScope = scopeEntries.length > 0 && scopeEntries[0] !== '*';

  if (hasRestrictedScope) {
    const isReviewOnly = scopeEntries.some((entry) => entry.startsWith(REVIEW_TAG));
    if (isReviewOnly) {
      parts.push('## Your Scope (REVIEW ONLY)');
      parts.push('You are a **reviewer**. Read and analyze these files but do NOT modify them:');
      for (const entry of scopeEntries) {
        parts.push(`- ${entry.replace(REVIEW_TAG_PREFIX, '')}`);
      }
      parts.push('');
      parts.push('Your job is to review the leader\'s changes, identify issues, and report findings in your reflection.');
      parts.push('Do NOT write or edit any files. Focus on code review, correctness, and security analysis.');
      parts.push('');
    } else {
      parts.push('## Your Scope');
      parts.push('Only modify these files/paths:');
      for (const entry of scopeEntries) {
        parts.push(`- ${entry}`);
      }
      parts.push('');
    }
  }

  if (task.success_criteria && task.success_criteria.length > 0) {
    parts.push('## Success Criteria');
    for (const criterion of task.success_criteria) {
      parts.push(`- ${criterion}`);
    }
    parts.push('');
  }

  parts.push('## Instructions');
  parts.push('- Read the relevant files before making changes.');
  parts.push('- Make minimal, focused changes within your scope.');
  parts.push('- Focus on YOUR contribution — other nodes handle their parts.');
  if (roundNumber > 1) {
    parts.push('- Incorporate learnings from the shared intelligence above.');
  }
  parts.push('');

  // The reflection template below is the contract parseReflection() relies on:
  // a trailing ```json fenced block with a "reflection" object.
  parts.push(
    '## After You Finish',
    'At the very end of your response, output ONLY a JSON reflection block.',
    'This block MUST be the last thing in your output, wrapped in triple backticks with `json` language tag.',
    'Do NOT add any text after this block.',
    '',
    '```json',
    '{',
    ' "reflection": {',
    ' "summary": "1-2 sentences: what you did this round",',
    ' "learnings": "what you discovered that other nodes should know",',
    ' "confidence": 0.85,',
    ' "vote": "continue"',
    ' }',
    '}',
    '```',
    '',
    'Rules for the reflection block:',
    '- `confidence`: a number between 0.0 and 1.0',
    '- `vote`: exactly one of `"continue"`, `"converged"`, or `"blocked"`',
    '- `summary` and `learnings`: plain strings, no nested objects',
    '- The JSON must be valid. No trailing commas, no comments.',
  );

  return parts.join('\n');
}
|
|
532
|
+
|
|
533
|
+
// Votes a node may legitimately cast; anything else is reported as 'parse_error'.
const VALID_VOTES = new Set(['continue', 'converged', 'blocked']);

/**
 * Parse the reflection block an LLM appends to its round output.
 *
 * Lookup order:
 *   1. The LAST ```json fenced block in the output (either `{ reflection: {...} }` or a bare object).
 *   2. The deprecated REFLECTION_START ... REFLECTION_END text block.
 *   3. Nothing found: a parse-failure record built from the tail of the output.
 *
 * On any failure parse_failed is true; an unusable vote becomes 'parse_error' (never a silent
 * 'continue') and an unusable confidence becomes 0.5. Both formats apply the same rule:
 * confidence must be a number in [0, 1] and vote must be one of VALID_VOTES.
 *
 * @param {string} output - Raw LLM stdout. Non-string values are coerced; null/undefined become ''.
 * @returns {{summary: string, learnings: string, confidence: number, vote: string, parse_failed: boolean}}
 */
function parseReflection(output) {
  // Callers pass a string, but coerce defensively so a missing stdout cannot throw here.
  const text = typeof output === 'string' ? output : String(output ?? '');

  // Strategy: find the last ```json ... ``` block in the output
  const jsonBlocks = [...text.matchAll(/```json\s*\n([\s\S]*?)```/g)];

  if (jsonBlocks.length > 0) {
    const lastBlock = jsonBlocks[jsonBlocks.length - 1][1].trim();
    try {
      const parsed = JSON.parse(lastBlock);
      // `null` JSON makes the property reads below throw; the catch then falls through to legacy parsing.
      const r = parsed.reflection || parsed;

      const summary = typeof r.summary === 'string' ? r.summary : '';
      const learnings = typeof r.learnings === 'string' ? r.learnings : '';
      const confidence = typeof r.confidence === 'number' && r.confidence >= 0 && r.confidence <= 1
        ? r.confidence : null;
      const vote = typeof r.vote === 'string' && VALID_VOTES.has(r.vote.toLowerCase())
        ? r.vote.toLowerCase() : null;

      if (vote === null || confidence === null) {
        log(`REFLECTION PARSE: JSON found but invalid fields (vote=${r.vote}, confidence=${r.confidence})`);
        return {
          summary: summary || text.slice(-300),
          learnings,
          confidence: confidence ?? 0.5,
          vote: vote ?? 'parse_error',
          parse_failed: true,
        };
      }

      return { summary, learnings, confidence, vote, parse_failed: false };
    } catch (err) {
      log(`REFLECTION PARSE: JSON block found but invalid JSON: ${err.message}`);
    }
  }

  // Fallback: try legacy REFLECTION_START format for backwards compat
  const legacyMatch = text.match(/REFLECTION_START\n?([\s\S]*?)REFLECTION_END/);
  if (legacyMatch) {
    log(`REFLECTION PARSE: Using legacy REFLECTION_START format (deprecated)`);
    const block = legacyMatch[1];
    const summary = (block.match(/SUMMARY:\s*(.+)/)?.[1] || '').trim();
    const learnings = (block.match(/LEARNINGS:\s*(.+)/)?.[1] || '').trim();

    // Same validity rule as the JSON path: a finite number within [0, 1].
    const rawConfidence = Number.parseFloat(block.match(/CONFIDENCE:\s*([\d.]+)/)?.[1] ?? '');
    const confidenceOk = Number.isFinite(rawConfidence) && rawConfidence >= 0 && rawConfidence <= 1;

    const voteRaw = (block.match(/VOTE:\s*(\w+)/)?.[1] || '').trim().toLowerCase();
    const voteOk = VALID_VOTES.has(voteRaw);

    return {
      summary,
      learnings,
      confidence: confidenceOk ? rawConfidence : 0.5,
      vote: voteOk ? voteRaw : 'parse_error',
      parse_failed: !(voteOk && confidenceOk),
    };
  }

  // No reflection block found at all
  log(`REFLECTION PARSE FAILED: No JSON or legacy reflection block found in output`);
  return {
    summary: text.slice(-300),
    learnings: '',
    confidence: 0.5,
    vote: 'parse_error',
    parse_failed: true,
  };
}
|
|
605
|
+
|
|
606
|
+
// ── Collaborative Task Execution ──────────────────────
|
|
607
|
+
|
|
608
|
+
/**
 * Execute a collaborative task: discover and join the collab session, work through the
 * rounds announced on this node's round subject, submit a reflection after each round,
 * then commit/merge the worktree and mark the agent idle.
 *
 * Failure handling: if no session can be discovered, or the join fails or returns a falsy
 * result, the task is reported via mesh.tasks.fail and the function returns without running
 * the LLM (no silent solo fallback).
 *
 * The round loop ends as soon as mesh.collab.status reports converged/completed/aborted,
 * or after the first prompt is built in dry-run mode.
 *
 * @param {object} task - Claimed task; reads task_id, title, collaboration.mode, collab_session_id.
 * @returns {Promise<void>}
 */
async function executeCollabTask(task) {
  const collabSpec = task.collaboration;
  log(`COLLAB EXECUTING: ${task.task_id} "${task.title}" (mode: ${collabSpec.mode})`);

  // Discover session ID — three strategies in priority order:
  // 1. task.collab_session_id (set by daemon on auto-create)
  // 2. mesh.collab.find RPC (lookup by task_id)
  // 3. Brief wait + retry (race condition: task claimed before session created)
  let sessionId = task.collab_session_id || null;

  if (!sessionId) {
    log(`COLLAB: No session_id in task. Discovering via mesh.collab.find...`);
    try {
      const found = await natsRequest('mesh.collab.find', { task_id: task.task_id }, 5000);
      if (found) sessionId = found.session_id;
    } catch { /* find RPC unavailable or no session yet */ }
  }

  if (!sessionId) {
    // Brief wait — session may still be creating (race between claim and session auto-create)
    log(`COLLAB: Session not found. Waiting 3s for daemon to create it...`);
    await new Promise(r => setTimeout(r, 3000));
    try {
      const found = await natsRequest('mesh.collab.find', { task_id: task.task_id }, 5000);
      if (found) sessionId = found.session_id;
    } catch { /* still nothing */ }
  }

  if (!sessionId) {
    // EXPLICIT FAILURE — do NOT silently fall back to solo execution.
    // A collab task running solo loses the multi-node quality guarantee
    // with zero indication in the output. This must be a visible error.
    log(`COLLAB FAILED: No session found for task ${task.task_id}. Refusing silent solo fallback.`);
    await natsRequest('mesh.tasks.fail', {
      task_id: task.task_id,
      reason: `Collab session not found for task ${task.task_id}. Task requires collaborative execution (mode: ${collabSpec.mode}) but no session could be discovered. Solo fallback refused — collab tasks must run collaboratively.`,
    }).catch(() => {});
    writeAgentState('idle', null);
    return;
  }

  // Join the session using the discovered session_id
  let session;
  try {
    session = await natsRequest('mesh.collab.join', {
      session_id: sessionId,
      node_id: NODE_ID,
    }, 10000);
  } catch (err) {
    log(`COLLAB JOIN FAILED: ${err.message} (session: ${sessionId})`);
    await natsRequest('mesh.tasks.fail', {
      task_id: task.task_id,
      reason: `Failed to join collab session ${sessionId}: ${err.message}`,
    }).catch(() => {});
    writeAgentState('idle', null);
    return;
  }

  if (!session) {
    log(`COLLAB JOIN RETURNED NULL for session ${sessionId}`);
    await natsRequest('mesh.tasks.fail', {
      task_id: task.task_id,
      reason: `Collab session ${sessionId} rejected join (full, closed, or duplicate node).`,
    }).catch(() => {});
    writeAgentState('idle', null);
    return;
  }

  log(`COLLAB JOINED: ${sessionId} (${session.nodes?.length ?? '?'} nodes)`);
  writeAgentState('working', task.task_id);

  // Create worktree for isolation (a falsy result means the LLM runs without one)
  const worktreePath = createWorktree(`${task.task_id}-${NODE_ID}`);

  // Subscribe to round notifications for this session and this node
  const roundSub = nc.subscribe(`mesh.collab.${sessionId}.node.${NODE_ID}.round`);

  // Signal start
  await natsRequest('mesh.tasks.start', { task_id: task.task_id }).catch(() => {});

  try {
    for await (const roundMsg of roundSub) {
      let roundData;
      try {
        roundData = JSON.parse(sc.decode(roundMsg.data));
      } catch (err) {
        // One malformed notification must not abort the whole collaboration.
        log(`COLLAB: Ignoring malformed round message: ${err.message}`);
        continue;
      }
      const { round_number, shared_intel, my_scope, my_role, mode, current_turn } = roundData;

      // Sequential mode: skip if it's not our turn
      if (mode === 'sequential' && current_turn && current_turn !== NODE_ID) {
        log(`COLLAB R${round_number}: Not our turn (current: ${current_turn}). Waiting.`);
        continue;
      }

      log(`COLLAB R${round_number}: Starting work (role: ${my_role}, scope: ${JSON.stringify(my_scope)})`);

      // Build round-specific prompt
      const prompt = buildCollabPrompt(task, round_number, shared_intel, my_scope, my_role);

      if (DRY_RUN) {
        log(`[DRY RUN] Collab prompt:\n${prompt}`);
        break;
      }

      // Execute the LLM for this round
      const llmResult = await runLLM(prompt, task, worktreePath);
      const output = llmResult.stdout || '';

      // Parse reflection from output
      const reflection = parseReflection(output);

      // List modified files
      let artifacts = [];
      try {
        if (worktreePath) {
          const porcelain = require('child_process').execSync('git status --porcelain', {
            cwd: worktreePath, timeout: 5000, encoding: 'utf-8',
          });
          // Each porcelain line is "XY <path>". Do NOT trim the whole output: X is a space
          // for unstaged changes, and trimming would shift the first line so slice(3)
          // cut off the first character of its path.
          artifacts = porcelain.split('\n').filter(Boolean).map(line => line.slice(3));
        }
      } catch { /* best effort */ }

      // Submit reflection
      try {
        await natsRequest('mesh.collab.reflect', {
          session_id: sessionId,
          node_id: NODE_ID,
          round: round_number,
          summary: reflection.summary,
          learnings: reflection.learnings,
          artifacts,
          confidence: reflection.confidence,
          vote: reflection.vote,
          parse_failed: reflection.parse_failed,
        });
        const parseTag = reflection.parse_failed ? ' [PARSE FAILED]' : '';
        log(`COLLAB R${round_number}: Reflection submitted (vote: ${reflection.vote}, conf: ${reflection.confidence}${parseTag})`);
      } catch (err) {
        log(`COLLAB R${round_number}: Reflection submit failed: ${err.message}`);
      }

      // Check if session is done (converged/completed/aborted)
      let sessionFinished = false;
      try {
        const status = await natsRequest('mesh.collab.status', { session_id: sessionId });
        if (['converged', 'completed', 'aborted'].includes(status?.status)) {
          log(`COLLAB: Session ${sessionId} is ${status.status}. Done.`);
          sessionFinished = true;
        }
      } catch { /* continue listening */ }

      // Leave immediately: waiting for the next iteration would block on a round
      // message that is not expected once the session has finished.
      if (sessionFinished) break;
    }
  } finally {
    // Always release the subscription, even if the loop body throws.
    roundSub.unsubscribe();
  }

  // Commit and merge worktree
  const mergeResult = commitAndMergeWorktree(worktreePath, `${task.task_id}-${NODE_ID}`, `collab contribution from ${NODE_ID}`);
  cleanupWorktree(worktreePath, mergeResult && !mergeResult?.merged);

  writeAgentState('idle', null);
  log(`COLLAB DONE: ${task.task_id} (node: ${NODE_ID})`);
}
|
|
772
|
+
|
|
475
773
|
// ── Task Execution ────────────────────────────────────
|
|
476
774
|
|
|
477
775
|
/**
|
|
@@ -509,16 +807,14 @@ async function executeTask(task) {
|
|
|
509
807
|
|
|
510
808
|
if (DRY_RUN) {
|
|
511
809
|
log(`[DRY RUN] Prompt:\n${prompt}`);
|
|
512
|
-
// Release the task so it doesn't stay stuck in "running" state
|
|
513
|
-
await natsRequest('mesh.tasks.release', { task_id: task.task_id, node_id: NODE_ID }).catch(() => {});
|
|
514
810
|
return;
|
|
515
811
|
}
|
|
516
812
|
|
|
517
|
-
// Run
|
|
518
|
-
const
|
|
519
|
-
const summary =
|
|
813
|
+
// Run LLM (with worktree isolation if available)
|
|
814
|
+
const llmResult = await runLLM(prompt, task, worktreePath);
|
|
815
|
+
const summary = llmResult.stdout.slice(-500) || '(no output)';
|
|
520
816
|
|
|
521
|
-
log(
|
|
817
|
+
log(`${llmResult.provider} exited with code ${llmResult.exitCode}`);
|
|
522
818
|
|
|
523
819
|
// Extract cost + summary from JSONL session file (zero-cost observability)
|
|
524
820
|
const cleanCwd = path.join(os.tmpdir(), 'mesh-agent-work');
|
|
@@ -527,10 +823,10 @@ async function executeTask(task) {
|
|
|
527
823
|
log(`Cost: $${sessionInfo.cost.estimatedCostUsd.toFixed(4)} (${sessionInfo.cost.inputTokens} in / ${sessionInfo.cost.outputTokens} out)`);
|
|
528
824
|
}
|
|
529
825
|
|
|
530
|
-
if (
|
|
826
|
+
if (llmResult.exitCode !== 0) {
|
|
531
827
|
const attemptRecord = {
|
|
532
|
-
approach: `Attempt ${attempt}:
|
|
533
|
-
result:
|
|
828
|
+
approach: `Attempt ${attempt}: ${llmResult.provider} exited with error (code ${llmResult.exitCode})`,
|
|
829
|
+
result: llmResult.stderr.slice(-500) || 'unknown error',
|
|
534
830
|
keep: false,
|
|
535
831
|
};
|
|
536
832
|
attempts.push(attemptRecord);
|
|
@@ -538,12 +834,12 @@ async function executeTask(task) {
|
|
|
538
834
|
|
|
539
835
|
// Two-tier retry: abnormal exit → exponential backoff (agent crash, OOM, etc.)
|
|
540
836
|
const backoffMs = Math.min(1000 * Math.pow(2, attempt - 1), 30000); // 1s, 2s, 4s... max 30s
|
|
541
|
-
log(`Attempt ${attempt} failed (
|
|
837
|
+
log(`Attempt ${attempt} failed (${llmResult.provider} error, code ${llmResult.exitCode}). Backoff ${backoffMs}ms before retry.`);
|
|
542
838
|
await new Promise(r => setTimeout(r, backoffMs));
|
|
543
839
|
continue;
|
|
544
840
|
}
|
|
545
841
|
|
|
546
|
-
// If no metric, trust
|
|
842
|
+
// If no metric, trust LLM output and complete
|
|
547
843
|
if (!task.metric) {
|
|
548
844
|
const attemptRecord = {
|
|
549
845
|
approach: `Attempt ${attempt}: executed without metric`,
|
|
@@ -639,18 +935,41 @@ async function executeTask(task) {
|
|
|
639
935
|
// ── Main Loop ─────────────────────────────────────────
|
|
640
936
|
|
|
641
937
|
async function main() {
|
|
938
|
+
const defaultProvider = resolveProvider(null, CLI_PROVIDER, ENV_PROVIDER);
|
|
939
|
+
const defaultModel = resolveModel(null, CLI_MODEL, defaultProvider);
|
|
642
940
|
log(`Starting mesh agent worker`);
|
|
643
941
|
log(` Node ID: ${NODE_ID}`);
|
|
644
942
|
log(` NATS: ${NATS_URL}`);
|
|
645
|
-
log(`
|
|
943
|
+
log(` LLM: ${defaultProvider.name} (${defaultProvider.binary})`);
|
|
944
|
+
log(` Model: ${defaultModel || '(per-task)'}`);
|
|
646
945
|
log(` Workspace: ${WORKSPACE}`);
|
|
647
946
|
log(` Max attempts: ${MAX_ATTEMPTS}`);
|
|
648
947
|
log(` Poll interval: ${POLL_INTERVAL / 1000}s`);
|
|
649
948
|
log(` Mode: ${ONCE ? 'single task' : 'continuous'} ${DRY_RUN ? '(dry run)' : ''}`);
|
|
650
949
|
|
|
651
|
-
nc = await connect(
|
|
950
|
+
nc = await connect({
|
|
951
|
+
servers: NATS_URL,
|
|
952
|
+
timeout: 5000,
|
|
953
|
+
reconnect: true,
|
|
954
|
+
maxReconnectAttempts: 10,
|
|
955
|
+
reconnectTimeWait: 2000,
|
|
956
|
+
});
|
|
652
957
|
log(`Connected to NATS`);
|
|
653
958
|
|
|
959
|
+
// Exit on permanent NATS disconnect so launchd restarts us
|
|
960
|
+
(async () => {
|
|
961
|
+
for await (const s of nc.status()) {
|
|
962
|
+
log(`NATS status: ${s.type}`);
|
|
963
|
+
if (s.type === 'disconnect') {
|
|
964
|
+
log('NATS disconnected — will attempt reconnect');
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
})();
|
|
968
|
+
nc.closed().then(() => {
|
|
969
|
+
log('NATS connection permanently closed — exiting for launchd restart');
|
|
970
|
+
process.exit(1);
|
|
971
|
+
});
|
|
972
|
+
|
|
654
973
|
// Subscribe to alive-check requests from the daemon's stall detector
|
|
655
974
|
const aliveSub = nc.subscribe(`mesh.agent.${NODE_ID}.alive`);
|
|
656
975
|
(async () => {
|
|
@@ -684,9 +1003,13 @@ async function main() {
|
|
|
684
1003
|
|
|
685
1004
|
log(`CLAIMED: ${task.task_id} "${task.title}"`);
|
|
686
1005
|
|
|
687
|
-
// Execute the task
|
|
1006
|
+
// Execute the task (collab or solo)
|
|
688
1007
|
currentTaskId = task.task_id;
|
|
689
|
-
|
|
1008
|
+
if (task.collaboration) {
|
|
1009
|
+
await executeCollabTask(task);
|
|
1010
|
+
} else {
|
|
1011
|
+
await executeTask(task);
|
|
1012
|
+
}
|
|
690
1013
|
currentTaskId = null;
|
|
691
1014
|
|
|
692
1015
|
} catch (err) {
|