openclaw-node-harness 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -131,6 +131,36 @@ async function handleSubmit(msg) {
131
131
  respond(msg, task);
132
132
  }
133
133
 
134
/**
 * Abort any collab session tied to a task that is being terminated.
 * Shared by handleFail, handleRelease, handleCancel.
 *
 * NOT called from handleComplete — that path goes through evaluateRound,
 * which already calls collabStore.markCompleted() on the session.
 *
 * markAborted() is a no-op (resolves to null) when the session is already
 * completed/aborted, which makes a double abort safe
 * (e.g. stall detection → release race).
 *
 * Failures are caught and logged as a warning so the caller's
 * fail/release/cancel flow is not interrupted by collab cleanup.
 *
 * @param {object} task - Parent task; only `collab_session_id` is read.
 * @param {string} reason - Abort reason stored on the session and in the audit entry.
 * @returns {Promise<void>}
 */
async function cleanupTaskCollabSession(task, reason) {
  const sessionId = task?.collab_session_id;
  if (!sessionId || !collabStore) return;

  try {
    try {
      // markAborted resolves to null if the session doesn't exist or is already
      // completed/aborted. Non-null means this call actually transitioned the
      // session to aborted, so only then is the abort audited and announced.
      const session = await collabStore.markAborted(sessionId, reason);
      if (session) {
        await collabStore.appendAudit(sessionId, 'session_aborted', { reason });
        publishCollabEvent('aborted', session);
        log(`COLLAB ABORTED ${sessionId}: ${reason}`);
      }
    } finally {
      // Clean up the audit error rate-limit counter. Done in `finally` so the
      // entry is also dropped when the abort or the audit append throws.
      // NOTE: sessions expiring via KV TTL bypass this — residual Map entry is
      // negligible for a homelab mesh but worth noting.
      collabStore.clearAuditErrorCount(sessionId);
    }
  } catch (err) {
    log(`COLLAB CLEANUP WARN: could not abort session ${sessionId}: ${err?.message ?? err}`);
  }
}
163
+
134
164
  /**
135
165
  * mesh.tasks.claim — Agent requests the next available task.
136
166
  * Expects: { node_id }
@@ -203,6 +233,14 @@ async function handleComplete(msg) {
203
233
  log(`COMPLETE ${task_id} in ${elapsed}m: ${result?.summary || 'no summary'}`);
204
234
  publishEvent('completed', task);
205
235
 
236
+ // NOTE: no cleanupTaskCollabSession here — collab tasks complete via
237
+ // evaluateRound → markCompleted on the session, then store.markCompleted
238
+ // on the parent task. Calling cleanupTaskCollabSession would markAborted
239
+ // on an already-completed session. Clean up audit counter only.
240
+ if (task.collab_session_id && collabStore) {
241
+ collabStore.clearAuditErrorCount(task.collab_session_id);
242
+ }
243
+
206
244
  // Check if this task belongs to a plan
207
245
  await checkPlanProgress(task_id, 'completed');
208
246
 
@@ -222,6 +260,7 @@ async function handleFail(msg) {
222
260
 
223
261
  log(`FAIL ${task_id}: ${reason}`);
224
262
  publishEvent('failed', task);
263
+ await cleanupTaskCollabSession(task, `Parent task ${task_id} failed: ${reason}`);
225
264
 
226
265
  // Check if this task belongs to a plan
227
266
  await checkPlanProgress(task_id, 'failed');
@@ -302,6 +341,7 @@ async function handleRelease(msg) {
302
341
 
303
342
  log(`RELEASED ${task_id}: ${reason || 'no reason'} (needs human triage)`);
304
343
  publishEvent('released', task);
344
+ await cleanupTaskCollabSession(task, `Parent task ${task_id} released: ${reason || 'human triage'}`);
305
345
  respond(msg, task);
306
346
  }
307
347
 
@@ -323,6 +363,7 @@ async function handleCancel(msg) {
323
363
 
324
364
  log(`CANCEL ${task_id}: ${reason || 'no reason'}`);
325
365
  publishEvent('cancelled', task);
366
+ await cleanupTaskCollabSession(task, `Parent task ${task_id} cancelled: ${reason || 'no reason'}`);
326
367
  respond(msg, task);
327
368
  }
328
369
 
@@ -358,6 +399,35 @@ async function detectStalls() {
358
399
  }
359
400
  }
360
401
 
402
+ // Mark stalled node as dead in any collab sessions it belongs to.
403
+ // This unblocks isRoundComplete() which otherwise waits forever for
404
+ // a reflection from a crashed node.
405
+ // Uses findActiveSessionsByNode() — O(sessions) single pass instead of
406
+ // the previous O(sessions × nodes) list-then-find pattern.
407
+ if (task.owner && collabStore) {
408
+ try {
409
+ const sessions = await collabStore.findActiveSessionsByNode(task.owner);
410
+ for (const session of sessions) {
411
+ const node = session.nodes.find(n => n.node_id === task.owner);
412
+ if (node && node.status !== 'dead') {
413
+ await collabStore.setNodeStatus(session.session_id, task.owner, 'dead');
414
+ log(`STALL → COLLAB: marked ${task.owner} as dead in session ${session.session_id}`);
415
+ await collabStore.appendAudit(session.session_id, 'node_marked_dead', {
416
+ node_id: task.owner, reason: `Stall detected: no heartbeat for ${silentMin}m`,
417
+ });
418
+
419
+ // Re-check if the round is now complete (dead nodes excluded)
420
+ const updated = await collabStore.get(session.session_id);
421
+ if (updated && collabStore.isRoundComplete(updated)) {
422
+ await evaluateRound(session.session_id);
423
+ }
424
+ }
425
+ }
426
+ } catch (err) {
427
+ log(`STALL → COLLAB ERROR: ${err.message}`);
428
+ }
429
+ }
430
+
361
431
  const releasedTask = await store.markReleased(
362
432
  task.task_id,
363
433
  `Stall detected: no agent heartbeat for ${silentMin}m, alive check failed`,
@@ -526,6 +596,26 @@ async function handleCollabFind(msg) {
526
596
  respond(msg, session);
527
597
  }
528
598
 
599
/**
 * mesh.collab.recruiting — List all sessions currently recruiting nodes.
 * Used by agents to discover collab sessions they should join.
 * Returns: array of { session_id, task_id, mode, min_nodes, max_nodes,
 *   current_nodes, node_ids, recruiting_deadline }
 */
async function handleCollabRecruiting(msg) {
  const recruiting = await collabStore.list({ status: COLLAB_STATUS.RECRUITING });
  const summaries = recruiting.map((s) => {
    // A session record without a nodes array is reported as having no nodes
    // instead of failing the whole listing.
    const nodes = Array.isArray(s.nodes) ? s.nodes : [];
    return {
      session_id: s.session_id,
      task_id: s.task_id,
      mode: s.mode,
      min_nodes: s.min_nodes,
      max_nodes: s.max_nodes,
      current_nodes: nodes.length,
      // Fall back to `id` for node entries that carry no `node_id`
      node_ids: nodes.map((n) => n.node_id || n.id),
      recruiting_deadline: s.recruiting_deadline,
    };
  });
  respond(msg, summaries);
}
618
+
529
619
  /**
530
620
  * mesh.collab.reflect — Node submits a reflection for the current round.
531
621
  * Expects: { session_id, node_id, summary, learnings, artifacts, confidence, vote }
@@ -546,8 +636,20 @@ async function handleCollabReflect(msg) {
546
636
  });
547
637
  publishCollabEvent('reflection_received', session);
548
638
 
549
- // Check if all reflections are in → evaluate convergence
550
- if (collabStore.isRoundComplete(session)) {
639
+ // Sequential mode: advance turn, notify next node or evaluate round
640
+ // Parallel mode: check if all reflections are in → evaluate convergence
641
+ // NOTE: Node.js single-threaded event loop prevents concurrent execution of this
642
+ // handler — no mutex needed. advanceTurn() is safe without CAS here.
643
+ if (session.mode === 'sequential') {
644
+ const nextNodeId = await collabStore.advanceTurn(session_id);
645
+ if (nextNodeId) {
646
+ // Notify only the next-turn node with accumulated intra-round intel
647
+ await notifySequentialTurn(session_id, nextNodeId);
648
+ } else {
649
+ // All turns done → evaluate round
650
+ await evaluateRound(session_id);
651
+ }
652
+ } else if (collabStore.isRoundComplete(session)) {
551
653
  await evaluateRound(session_id);
552
654
  }
553
655
 
@@ -657,8 +759,14 @@ async function startCollabRound(sessionId) {
657
759
  const scopeStrategy = session.scope_strategy || 'shared';
658
760
  const nodeScopes = computeNodeScopes(session.nodes, taskScope, scopeStrategy);
659
761
 
660
- // Notify each node with their enforced scope
661
- for (const node of session.nodes) {
762
+ // Sequential mode: only notify the current_turn node.
763
+ // Other nodes get notified via notifySequentialTurn() as turns advance.
764
+ // Parallel mode: notify all nodes at once.
765
+ const nodesToNotify = session.mode === 'sequential' && session.current_turn
766
+ ? session.nodes.filter(n => n.node_id === session.current_turn)
767
+ : session.nodes;
768
+
769
+ for (const node of nodesToNotify) {
662
770
  const effectiveScope = nodeScopes[node.node_id] || node.scope;
663
771
  nc.publish(`mesh.collab.${sessionId}.node.${node.node_id}.round`, sc.encode(JSON.stringify({
664
772
  session_id: sessionId,
@@ -674,6 +782,57 @@ async function startCollabRound(sessionId) {
674
782
  }
675
783
  }
676
784
 
785
/**
 * Notify the next node in a sequential turn.
 * Includes intra-round reflections so far as additional shared intel.
 *
 * Does nothing when the session or its current round cannot be found.
 *
 * @param {string} sessionId - Collab session whose turn advanced.
 * @param {string} nextNodeId - Node that should take the next turn.
 * @returns {Promise<void>}
 */
async function notifySequentialTurn(sessionId, nextNodeId) {
  const session = await collabStore.get(sessionId);
  if (!session) return;

  const rounds = session.rounds ?? [];
  const currentRound = rounds[rounds.length - 1];
  if (!currentRound) return;

  // Compile intra-round intel from reflections already submitted this round
  const reflections = currentRound.reflections ?? [];
  const intraLines = [`=== INTRA-ROUND ${currentRound.round_number} (turns so far) ===\n`];
  for (const r of reflections) {
    intraLines.push(`## Turn: ${r.node_id}${r.parse_failed ? ' [PARSE FAILED]' : ''}`);
    if (r.summary) intraLines.push(`Summary: ${r.summary}`);
    if (r.learnings) intraLines.push(`Learnings: ${r.learnings}`);
    // A reflection without an artifacts array is treated as having none
    if (Array.isArray(r.artifacts) && r.artifacts.length > 0) {
      intraLines.push(`Artifacts: ${r.artifacts.join(', ')}`);
    }
    intraLines.push(`Confidence: ${r.confidence} | Vote: ${r.vote}`);
    intraLines.push('');
  }
  const intraRoundIntel = intraLines.join('\n');

  // Intel carried by the round itself (if any) comes first
  const combinedIntel = currentRound.shared_intel
    ? `${currentRound.shared_intel}\n\n${intraRoundIntel}`
    : intraRoundIntel;

  // Scope is computed from the parent task's scope and the session's strategy
  const parentTask = await store.get(session.task_id);
  const taskScope = parentTask?.scope || [];
  const scopeStrategy = session.scope_strategy || 'shared';
  const sessionNodes = session.nodes ?? [];
  const nodeScopes = computeNodeScopes(sessionNodes, taskScope, scopeStrategy);
  const nextNode = sessionNodes.find((n) => n.node_id === nextNodeId);

  nc.publish(`mesh.collab.${sessionId}.node.${nextNodeId}.round`, sc.encode(JSON.stringify({
    session_id: sessionId,
    task_id: session.task_id,
    round_number: currentRound.round_number,
    shared_intel: combinedIntel,
    // Computed scope wins, then the node's own scope, then unrestricted
    my_scope: nodeScopes[nextNodeId] || nextNode?.scope || ['*'],
    my_role: nextNode?.role || 'worker',
    mode: 'sequential',
    current_turn: nextNodeId,
    scope_strategy: scopeStrategy,
  })));

  log(`COLLAB SEQ ${sessionId} R${currentRound.round_number}: Turn advanced to ${nextNodeId}`);
  await collabStore.appendAudit(sessionId, 'turn_advanced', {
    round: currentRound.round_number,
    next_node: nextNodeId,
    reflections_so_far: reflections.length,
  });
}
835
+
677
836
  /**
678
837
  * Evaluate the current round: check convergence, advance or complete.
679
838
  */
@@ -702,10 +861,11 @@ async function evaluateRound(sessionId) {
702
861
  await collabStore.markConverged(sessionId);
703
862
  publishCollabEvent('converged', session);
704
863
 
705
- // Collect artifacts from all reflections
864
+ // Re-fetch after markConverged to ensure fresh state
865
+ const freshSession = await collabStore.get(sessionId);
706
866
  const allArtifacts = [];
707
867
  const contributions = {};
708
- for (const round of session.rounds) {
868
+ for (const round of freshSession.rounds) {
709
869
  for (const r of round.reflections) {
710
870
  allArtifacts.push(...r.artifacts);
711
871
  contributions[r.node_id] = r.summary;
@@ -714,20 +874,20 @@ async function evaluateRound(sessionId) {
714
874
 
715
875
  await collabStore.markCompleted(sessionId, {
716
876
  artifacts: [...new Set(allArtifacts)],
717
- summary: `Converged after ${session.current_round} rounds with ${session.nodes.length} nodes`,
877
+ summary: `Converged after ${freshSession.current_round} rounds with ${freshSession.nodes.length} nodes`,
718
878
  node_contributions: contributions,
719
879
  });
720
880
  await collabStore.appendAudit(sessionId, 'session_completed', {
721
- outcome: 'converged', rounds: session.current_round,
881
+ outcome: 'converged', rounds: freshSession.current_round,
722
882
  artifacts: [...new Set(allArtifacts)].length,
723
- node_count: session.nodes.length, recruited_count: session.recruited_count,
883
+ node_count: freshSession.nodes.length, recruited_count: freshSession.recruited_count,
724
884
  });
725
885
 
726
886
  // Complete the parent task
727
- const updatedSession = await collabStore.get(sessionId);
728
- await store.markCompleted(session.task_id, updatedSession.result);
729
- publishEvent('completed', await store.get(session.task_id));
730
- publishCollabEvent('completed', updatedSession);
887
+ const completedSession = await collabStore.get(sessionId);
888
+ await store.markCompleted(freshSession.task_id, completedSession.result);
889
+ publishEvent('completed', await store.get(freshSession.task_id));
890
+ publishCollabEvent('completed', completedSession);
731
891
 
732
892
  } else if (maxReached) {
733
893
  log(`COLLAB MAX ROUNDS ${sessionId}: ${session.current_round}/${session.max_rounds}. Completing with current artifacts.`);
@@ -955,6 +1115,19 @@ async function advancePlanWave(planId) {
955
1115
  const waveNum = ready[0].wave;
956
1116
  log(`PLAN WAVE ${planId} W${waveNum}: dispatching ${ready.length} subtasks`);
957
1117
 
1118
+ // Inherit routing fields from parent task so subtasks use the same LLM/node preferences.
1119
+ // CONSTRAINT: Subtasks cannot override routing independently — they always inherit from the
1120
+ // parent task. If per-subtask routing is needed, extend the subtask schema in mesh-plans.js
1121
+ // (e.g. subtask.llm_provider) and merge here with subtask fields taking priority.
1122
+ const parentTask = await store.get(plan.parent_task_id);
1123
+ const inheritedRouting = {};
1124
+ if (parentTask) {
1125
+ if (parentTask.llm_provider) inheritedRouting.llm_provider = parentTask.llm_provider;
1126
+ if (parentTask.llm_model) inheritedRouting.llm_model = parentTask.llm_model;
1127
+ if (parentTask.preferred_nodes) inheritedRouting.preferred_nodes = parentTask.preferred_nodes;
1128
+ if (parentTask.exclude_nodes) inheritedRouting.exclude_nodes = parentTask.exclude_nodes;
1129
+ }
1130
+
958
1131
  for (const st of ready) {
959
1132
  st.status = SUBTASK_STATUS.QUEUED;
960
1133
 
@@ -962,7 +1135,7 @@ async function advancePlanWave(planId) {
962
1135
  switch (st.delegation.mode) {
963
1136
  case 'solo_mesh':
964
1137
  case 'collab_mesh': {
965
- // Submit as mesh task
1138
+ // Submit as mesh task — inherit routing fields from parent task
966
1139
  const meshTask = createTask({
967
1140
  task_id: st.subtask_id,
968
1141
  title: st.title,
@@ -973,6 +1146,7 @@ async function advancePlanWave(planId) {
973
1146
  success_criteria: st.success_criteria,
974
1147
  tags: ['plan', planId],
975
1148
  collaboration: st.delegation.collaboration || undefined,
1149
+ ...inheritedRouting,
976
1150
  });
977
1151
  await store.put(meshTask);
978
1152
  st.mesh_task_id = meshTask.task_id;
@@ -1098,6 +1272,7 @@ async function main() {
1098
1272
  'mesh.collab.status': handleCollabStatus,
1099
1273
  'mesh.collab.find': handleCollabFind,
1100
1274
  'mesh.collab.reflect': handleCollabReflect,
1275
+ 'mesh.collab.recruiting': handleCollabRecruiting,
1101
1276
  // Plan handlers
1102
1277
  'mesh.plans.create': handlePlanCreate,
1103
1278
  'mesh.plans.get': handlePlanGet,
@@ -1116,7 +1291,7 @@ async function main() {
1116
1291
  try {
1117
1292
  await handler(msg);
1118
1293
  } catch (err) {
1119
- log(`ERROR handling ${subject}: ${err.message}`);
1294
+ log(`ERROR handling ${subject}: ${err.message}\n${err.stack}`);
1120
1295
  try { respondError(msg, err.message); } catch {}
1121
1296
  }
1122
1297
  }
package/bin/mesh.js CHANGED
@@ -397,6 +397,10 @@ async function cmdSubmit(args) {
397
397
  success_criteria: task.success_criteria || [],
398
398
  scope: task.scope || [],
399
399
  priority: task.auto_priority || 0,
400
+ llm_provider: task.provider || task.llm_provider || null,
401
+ llm_model: task.model || task.llm_model || null,
402
+ preferred_nodes: task.preferred_nodes || [],
403
+ exclude_nodes: task.exclude_nodes || [],
400
404
  });
401
405
  console.log(`Submitted: ${result.data.task_id} [${result.data.status}]`);
402
406
  // Mark as 'submitted' — NOT 'running'. The card reflects actual mesh state.
@@ -451,12 +455,18 @@ async function cmdSubmit(args) {
451
455
  scope: task.scope || [],
452
456
  priority: task.priority || 0,
453
457
  tags: task.tags || [],
458
+ llm_provider: task.provider || task.llm_provider || null,
459
+ llm_model: task.model || task.llm_model || null,
460
+ preferred_nodes: task.preferred_nodes || [],
461
+ exclude_nodes: task.exclude_nodes || [],
462
+ collaboration: task.collaboration || undefined,
454
463
  });
455
464
 
456
465
  console.log(`Submitted: ${result.data.task_id} "${result.data.title}"`);
457
466
  console.log(` Status: ${result.data.status}`);
458
467
  console.log(` Budget: ${result.data.budget_minutes}m`);
459
468
  console.log(` Metric: ${result.data.metric || 'none'}`);
469
+ if (result.data.llm_provider) console.log(` Provider: ${result.data.llm_provider}`);
460
470
  await nc.close();
461
471
  }
462
472
 
@@ -598,7 +608,11 @@ async function cmdRepair(args) {
598
608
  */
599
609
  async function cmdDeploy(args) {
600
610
  const { execSync } = require('child_process');
601
- const repoDir = process.env.OPENCLAW_REPO_DIR || path.join(os.homedir(), 'openclaw');
611
+ // Prefer openclaw-node (git repo) over openclaw (runtime)
612
+ const defaultRepo = fs.existsSync(path.join(os.homedir(), 'openclaw-node', '.git'))
613
+ ? path.join(os.homedir(), 'openclaw-node')
614
+ : path.join(os.homedir(), 'openclaw');
615
+ const repoDir = process.env.OPENCLAW_REPO_DIR || defaultRepo;
602
616
  const force = args.includes('--force');
603
617
 
604
618
  // Parse --component flags
@@ -658,7 +672,7 @@ async function cmdDeploy(args) {
658
672
  await nc.flush();
659
673
  console.log('Deploy trigger sent.\n');
660
674
 
661
- // Poll for results (15s timeout)
675
+ // Poll for results (15s timeout)
662
676
  console.log('Waiting for node responses...');
663
677
  const deadline = Date.now() + 15000;
664
678
  const seen = new Set();
@@ -668,6 +682,7 @@ async function cmdDeploy(args) {
668
682
  const resultsKv = await js.views.kv('MESH_DEPLOY_RESULTS');
669
683
 
670
684
  while (Date.now() < deadline) {
685
+ // Check all nodes
671
686
  const allAliasNodes = [...new Set(Object.values(NODE_ALIASES))];
672
687
  const checkNodes = targetNodes.length > 0 ? targetNodes : allAliasNodes;
673
688
 
@@ -731,9 +746,9 @@ function cmdHelp() {
731
746
  ' mesh repair Self-repair this node',
732
747
  ' mesh repair --all Self-repair ALL nodes',
733
748
  ' mesh deploy Deploy to all nodes',
734
- ' mesh deploy --force Force deploy (even if up to date)',
735
- ' mesh deploy --component <name> Deploy specific component',
736
- ' mesh deploy --node <name> Deploy to specific node',
749
+ ' mesh deploy --force Force deploy (skip cache)',
750
+ ' mesh deploy --node ubuntu Deploy to specific node',
751
+ ' mesh deploy --component mesh-daemons Deploy specific component',
737
752
  '',
738
753
  'NODE ALIASES:',
739
754
  ' ubuntu, linux = Ubuntu VM (calos-vmware-virtual-platform)',
package/install.sh CHANGED
@@ -230,6 +230,10 @@ if [ -z "$NODE_ROLE" ]; then
230
230
  NODE_ROLE="worker"
231
231
  fi
232
232
  fi
233
+ if [ "$NODE_ROLE" != "lead" ] && [ "$NODE_ROLE" != "worker" ]; then
234
+ error "Invalid role: $NODE_ROLE (must be 'lead' or 'worker')"
235
+ exit 1
236
+ fi
233
237
  export OPENCLAW_NODE_ROLE="$NODE_ROLE"
234
238
  info "Node role: $NODE_ROLE"
235
239
 
@@ -692,6 +696,9 @@ else
692
696
  if command -v envsubst >/dev/null 2>&1; then
693
697
  envsubst < "$TEMPLATE" > "$DEST"
694
698
  else
699
+ # NOTE: sed delimiter is |. If OPENCLAW_NATS_TOKEN ever contains |
700
+ # (unlikely — tokens are hex/base64), this substitution will break.
701
+ # Prefer envsubst (above) when available; it has no delimiter issue.
695
702
  sed \
696
703
  -e "s|\${HOME}|$HOME|g" \
697
704
  -e "s|\${NODE_BIN}|$NODE_BIN|g" \
package/lib/kanban-io.js CHANGED
@@ -53,8 +53,14 @@ function withMkdirLock(filePath, fn) {
53
53
  if (Date.now() - start > maxWait) {
54
54
  throw new Error(`kanban-io: lock timeout after ${maxWait}ms on ${filePath}`);
55
55
  }
56
- // Non-blocking 10ms pause (busy-wait would peg CPU in Node's single thread)
57
- require('child_process').spawnSync('sleep', ['0.01']);
56
+ // Sleep ~10ms. Atomics.wait is precise but throws on main thread
57
+ // in some Node.js builds; fall back to busy-spin (rare contention path)
58
+ try {
59
+ Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, 10);
60
+ } catch {
61
+ const end = Date.now() + 10;
62
+ while (Date.now() < end) { /* busy-wait fallback */ }
63
+ }
58
64
  }
59
65
  }
60
66
 
@@ -81,9 +87,7 @@ function parseTasks(content) {
81
87
  const liveIdx = content.indexOf('## Live Tasks');
82
88
  if (liveIdx === -1) return tasks;
83
89
 
84
- const afterLive = content.slice(liveIdx);
85
- const nextSectionIdx = afterLive.indexOf('\n## ', 1); // skip the current ## Live Tasks
86
- const liveSection = nextSectionIdx >= 0 ? afterLive.slice(0, nextSectionIdx) : afterLive;
90
+ const liveSection = content.slice(liveIdx);
87
91
  const lines = liveSection.split('\n');
88
92
 
89
93
  let current = null;
@@ -110,6 +114,13 @@ function parseTasks(content) {
110
114
  budget_minutes: current.budget_minutes || 30,
111
115
  scope: current.scope || [],
112
116
  updated_at: current.updated_at || '',
117
+ // Mesh routing
118
+ llm_provider: current.llm_provider || null,
119
+ llm_model: current.llm_model || null,
120
+ preferred_nodes: current.preferred_nodes || [],
121
+ exclude_nodes: current.exclude_nodes || [],
122
+ collaboration: current.collaboration || null,
123
+ collab_result: current.collab_result || null,
113
124
  });
114
125
  }
115
126
  }
@@ -119,7 +130,7 @@ function parseTasks(content) {
119
130
  const taskIdMatch = line.match(/^- task_id:\s*(.+)$/);
120
131
  if (taskIdMatch) {
121
132
  flush();
122
- current = { task_id: taskIdMatch[1].trim(), success_criteria: [], artifacts: [], scope: [] };
133
+ current = { task_id: taskIdMatch[1].trim(), success_criteria: [], artifacts: [], scope: [], preferred_nodes: [], exclude_nodes: [] };
123
134
  currentArrayKey = null;
124
135
  continue;
125
136
  }
@@ -176,6 +187,31 @@ function parseTasks(content) {
176
187
  current.scope = [];
177
188
  currentArrayKey = 'scope';
178
189
  break;
190
+ // Mesh routing fields
191
+ case 'llm_provider':
192
+ case 'provider':
193
+ current.llm_provider = value || null; currentArrayKey = null; break;
194
+ case 'llm_model':
195
+ case 'model':
196
+ current.llm_model = value || null; currentArrayKey = null; break;
197
+ case 'preferred_nodes':
198
+ current.preferred_nodes = [];
199
+ currentArrayKey = 'preferred_nodes';
200
+ break;
201
+ case 'exclude_nodes':
202
+ current.exclude_nodes = [];
203
+ currentArrayKey = 'exclude_nodes';
204
+ break;
205
+ case 'collaboration':
206
+ try { current.collaboration = value ? JSON.parse(value) : null; }
207
+ catch { current.collaboration = null; }
208
+ currentArrayKey = null;
209
+ break;
210
+ case 'collab_result':
211
+ try { current.collab_result = value ? JSON.parse(value) : null; }
212
+ catch { current.collab_result = null; }
213
+ currentArrayKey = null;
214
+ break;
179
215
  default:
180
216
  currentArrayKey = null;
181
217
  break;
@@ -236,7 +272,11 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
236
272
  const blockLines = lines.slice(blockStart, blockEnd);
237
273
 
238
274
  // Update scalar fields
239
- for (const [key, value] of Object.entries(fieldUpdates)) {
275
+ for (const [key, rawValue] of Object.entries(fieldUpdates)) {
276
+ // Serialize objects/arrays as JSON so the parser can read them back
277
+ const value = (rawValue !== null && typeof rawValue === 'object')
278
+ ? JSON.stringify(rawValue)
279
+ : rawValue;
240
280
  const fieldRegex = new RegExp(`^ ${key}:\\s*.*$`);
241
281
  let found = false;
242
282
  for (let i = 1; i < blockLines.length; i++) {
@@ -249,7 +289,7 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
249
289
  if (!found) {
250
290
  // Insert before updated_at if it exists, otherwise at end of block
251
291
  const updatedAtIdx = blockLines.findIndex(l => l.match(/^ updated_at:/));
252
- const insertIdx = updatedAtIdx >= 0 ? updatedAtIdx : blockLines.length;
292
+ const insertIdx = updatedAtIdx > 0 ? updatedAtIdx : blockLines.length;
253
293
  blockLines.splice(insertIdx, 0, ` ${key}: ${value}`);
254
294
  }
255
295
  }
@@ -262,7 +302,7 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
262
302
  if (headerIdx === -1) {
263
303
  // Insert the array before updated_at
264
304
  const updatedAtIdx = blockLines.findIndex(l => l.match(/^ updated_at:/));
265
- const insertIdx = updatedAtIdx >= 0 ? updatedAtIdx : blockLines.length;
305
+ const insertIdx = updatedAtIdx > 0 ? updatedAtIdx : blockLines.length;
266
306
  const newLines = [` ${key}:`];
267
307
  for (const item of items) {
268
308
  newLines.push(` - ${item}`);
@@ -290,7 +330,7 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
290
330
  if (headerIdx === -1) {
291
331
  // Insert the array before updated_at
292
332
  const updatedAtIdx = blockLines.findIndex(l => l.match(/^ updated_at:/));
293
- const insertIdx = updatedAtIdx >= 0 ? updatedAtIdx : blockLines.length;
333
+ const insertIdx = updatedAtIdx > 0 ? updatedAtIdx : blockLines.length;
294
334
  const newLines = [` ${key}:`];
295
335
  for (const item of items) {
296
336
  newLines.push(` - ${item}`);
@@ -105,6 +105,10 @@ function createSession(taskId, collabSpec) {
105
105
 
106
106
  // ── CollabStore (KV-backed) ─────────────────────────
107
107
 
108
+ // Rate-limit audit error logs: max 3 per session, then go silent
109
+ const _auditErrorCounts = new Map();
110
+ const AUDIT_ERROR_LOG_LIMIT = 3;
111
+
108
112
  class CollabStore {
109
113
  constructor(kv) {
110
114
  this.kv = kv;
@@ -139,7 +143,14 @@ class CollabStore {
139
143
  ...detail,
140
144
  });
141
145
  await this.put(session);
142
- } catch { /* best-effort — never block on audit */ }
146
+ } catch (err) {
147
+ // Best-effort — never block on audit, but log first N failures per session
148
+ const count = (_auditErrorCounts.get(sessionId) || 0) + 1;
149
+ _auditErrorCounts.set(sessionId, count);
150
+ if (count <= AUDIT_ERROR_LOG_LIMIT) {
151
+ console.error(`[collab] audit append failed for ${sessionId}/${event}: ${err.message}${count === AUDIT_ERROR_LOG_LIMIT ? ' (suppressing further audit errors for this session)' : ''}`);
152
+ }
153
+ }
143
154
  }
144
155
 
145
156
  /**
@@ -176,6 +187,30 @@ class CollabStore {
176
187
  return sessions[0] || null;
177
188
  }
178
189
 
190
+ /**
191
+ * Find active sessions that contain a given node.
192
+ * O(sessions) single pass — avoids the O(sessions × nodes) scan
193
+ * that detectStalls() previously used with list() + inner find().
194
+ */
195
+ async findActiveSessionsByNode(nodeId) {
196
+ const results = [];
197
+ const allKeys = [];
198
+ const keys = await this.kv.keys();
199
+ for await (const key of keys) {
200
+ allKeys.push(key);
201
+ }
202
+ for (const key of allKeys) {
203
+ const entry = await this.kv.get(key);
204
+ if (!entry || !entry.value) continue;
205
+ const session = JSON.parse(sc.decode(entry.value));
206
+ if (session.status !== COLLAB_STATUS.ACTIVE) continue;
207
+ if (session.nodes.some(n => n.node_id === nodeId)) {
208
+ results.push(session);
209
+ }
210
+ }
211
+ return results;
212
+ }
213
+
179
214
  // ── Node Management ────────────────────────────────
180
215
 
181
216
  /**
@@ -190,7 +225,8 @@ class CollabStore {
190
225
  // Check max_nodes
191
226
  if (session.max_nodes && session.nodes.length >= session.max_nodes) return null;
192
227
 
193
- // Check duplicate
228
+ // Check duplicate — single-threaded event loop prevents concurrent joins
229
+ // from interleaving between find() and push(). No mutex needed.
194
230
  if (session.nodes.find(n => n.node_id === nodeId)) return null;
195
231
 
196
232
  session.nodes.push({
@@ -320,6 +356,9 @@ class CollabStore {
320
356
  const session = await this.get(sessionId);
321
357
  if (!session) return null;
322
358
 
359
+ // Only accept reflections on active sessions
360
+ if (session.status !== COLLAB_STATUS.ACTIVE) return null;
361
+
323
362
  const currentRound = session.rounds[session.rounds.length - 1];
324
363
  if (!currentRound) return null;
325
364
 
@@ -506,11 +545,14 @@ class CollabStore {
506
545
  }
507
546
 
508
547
  /**
509
- * Mark session as aborted.
548
+ * Mark session as aborted. Returns null (no-op) if already completed or aborted.
549
+ * Callers can use truthiness to detect whether the abort actually happened.
510
550
  */
511
551
  async markAborted(sessionId, reason) {
512
552
  const session = await this.get(sessionId);
513
553
  if (!session) return null;
554
+ // Guard: don't corrupt completed/aborted sessions
555
+ if (['completed', 'aborted'].includes(session.status)) return null;
514
556
  session.status = COLLAB_STATUS.ABORTED;
515
557
  session.completed_at = new Date().toISOString();
516
558
  session.result = { success: false, summary: reason, aborted: true };
@@ -518,6 +560,14 @@ class CollabStore {
518
560
  return session;
519
561
  }
520
562
 
563
+ /**
564
+ * Clear the audit error rate-limit counter for a session.
565
+ * Call when a session is finalized (completed/aborted) to prevent Map leak.
566
+ */
567
+ clearAuditErrorCount(sessionId) {
568
+ _auditErrorCounts.delete(sessionId);
569
+ }
570
+
521
571
  /**
522
572
  * Get a summary of the session for reporting.
523
573
  */