npm - openclaw-node-harness - Versions diffs - 2.0.3 → 2.0.4 - Mend

openclaw-node-harness 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/bin/mesh-agent.js +214 -81
package/bin/mesh-bridge.js +274 -10
package/bin/mesh-deploy-listener.js +119 -97
package/bin/mesh-deploy.js +8 -0
package/bin/mesh-task-daemon.js +190 -15
package/bin/mesh.js +20 -5
package/install.sh +7 -0
package/lib/kanban-io.js +50 -10
package/lib/mesh-collab.js +53 -3
package/lib/mesh-registry.js +11 -2
package/package.json +1 -1

package/bin/mesh-task-daemon.js CHANGED Viewed

@@ -131,6 +131,36 @@ async function handleSubmit(msg) {
   respond(msg, task);
 }
+/**
+ * Abort any collab session tied to a task that is being terminated.
+ * Shared by handleFail, handleRelease, handleCancel.
+ *
+ * NOT called from handleComplete — that path goes through evaluateRound
+ * which already calls collabStore.markCompleted() on the session.
+ *
+ * markAborted() is idempotent: no-op if session is already completed/aborted.
+ * This makes double-abort safe (e.g. stall detection → release race).
+ *
+ * Best-effort: any error thrown inside is caught and logged as a
+ * "COLLAB CLEANUP WARN" line; nothing is rethrown to the caller.
+ *
+ * @param {object} task - Task record; only task.collab_session_id is read.
+ *   Returns immediately when that field is falsy or collabStore is unset.
+ * @param {string} reason - Abort reason, passed to markAborted(), recorded in
+ *   the 'session_aborted' audit entry, and included in the log line.
+ * @returns {Promise<void>}
+ */
+async function cleanupTaskCollabSession(task, reason) {
+  if (!task.collab_session_id || !collabStore) return;
+  try {
+    // markAborted returns null if session doesn't exist or is already completed/aborted.
+    // Non-null means we actually transitioned the session to aborted.
+    const session = await collabStore.markAborted(task.collab_session_id, reason);
+    if (session) {
+      await collabStore.appendAudit(task.collab_session_id, 'session_aborted', { reason });
+      publishCollabEvent('aborted', session);
+      log(`COLLAB ABORTED ${task.collab_session_id}: ${reason}`);
+    }
+    // Clear the audit-error rate-limit counter (runs even when no abort happened; skipped if a call above throws).
+    // NOTE: sessions expiring via KV TTL bypass this — residual Map entry is negligible
+    // for a homelab mesh but worth noting.
+    collabStore.clearAuditErrorCount(task.collab_session_id);
+  } catch (err) {
+    log(`COLLAB CLEANUP WARN: could not abort session ${task.collab_session_id}: ${err.message}`);
+  }
+}
 /**
  * mesh.tasks.claim — Agent requests the next available task.
  * Expects: { node_id }
@@ -203,6 +233,14 @@ async function handleComplete(msg) {
   log(`COMPLETE ${task_id} in ${elapsed}m: ${result?.summary || 'no summary'}`);
   publishEvent('completed', task);
+  // NOTE: no cleanupTaskCollabSession here — collab tasks complete via
+  // evaluateRound → markCompleted on the session, then store.markCompleted
+  // on the parent task. Calling cleanupTaskCollabSession would markAborted
+  // on an already-completed session. Clean up audit counter only.
+  if (task.collab_session_id && collabStore) {
+    collabStore.clearAuditErrorCount(task.collab_session_id);
+  }
   // Check if this task belongs to a plan
   await checkPlanProgress(task_id, 'completed');
@@ -222,6 +260,7 @@ async function handleFail(msg) {
   log(`FAIL ${task_id}: ${reason}`);
   publishEvent('failed', task);
+  await cleanupTaskCollabSession(task, `Parent task ${task_id} failed: ${reason}`);
   // Check if this task belongs to a plan
   await checkPlanProgress(task_id, 'failed');
@@ -302,6 +341,7 @@ async function handleRelease(msg) {
   log(`RELEASED ${task_id}: ${reason || 'no reason'} (needs human triage)`);
   publishEvent('released', task);
+  await cleanupTaskCollabSession(task, `Parent task ${task_id} released: ${reason || 'human triage'}`);
   respond(msg, task);
 }
@@ -323,6 +363,7 @@ async function handleCancel(msg) {
   log(`CANCEL ${task_id}: ${reason || 'no reason'}`);
   publishEvent('cancelled', task);
+  await cleanupTaskCollabSession(task, `Parent task ${task_id} cancelled: ${reason || 'no reason'}`);
   respond(msg, task);
 }
@@ -358,6 +399,35 @@ async function detectStalls() {
       }
     }
+    // Mark stalled node as dead in any collab sessions it belongs to.
+    // This unblocks isRoundComplete() which otherwise waits forever for
+    // a reflection from a crashed node.
+    // Uses findActiveSessionsByNode() to fetch only the active sessions that include
+    // this node, instead of the previous list-then-find pattern. (The helper still scans each session's node list.)
+    if (task.owner && collabStore) {
+      try {
+        const sessions = await collabStore.findActiveSessionsByNode(task.owner);
+        for (const session of sessions) {
+          const node = session.nodes.find(n => n.node_id === task.owner);
+          if (node && node.status !== 'dead') {
+            await collabStore.setNodeStatus(session.session_id, task.owner, 'dead');
+            log(`STALL → COLLAB: marked ${task.owner} as dead in session ${session.session_id}`);
+            await collabStore.appendAudit(session.session_id, 'node_marked_dead', {
+              node_id: task.owner, reason: `Stall detected: no heartbeat for ${silentMin}m`,
+            });
+            // Re-check if the round is now complete (dead nodes excluded)
+            const updated = await collabStore.get(session.session_id);
+            if (updated && collabStore.isRoundComplete(updated)) {
+              await evaluateRound(session.session_id);
+            }
+          }
+        }
+      } catch (err) {
+        log(`STALL → COLLAB ERROR: ${err.message}`);
+      }
+    }
     const releasedTask = await store.markReleased(
       task.task_id,
       `Stall detected: no agent heartbeat for ${silentMin}m, alive check failed`,
@@ -526,6 +596,26 @@ async function handleCollabFind(msg) {
   respond(msg, session);
 }
+/**
+ * mesh.collab.recruiting — List all sessions currently recruiting nodes.
+ * Used by agents to discover collab sessions they should join.
+ * Expects: no request fields (msg is only used to send the response).
+ * Returns: array of { session_id, task_id, mode, min_nodes, max_nodes, current_nodes, node_ids, recruiting_deadline }
+ *   - current_nodes is the number of entries in session.nodes
+ *   - node_ids lists each entry's node_id, falling back to its id field when node_id is falsy
+ * Assumes collabStore is initialized; there is no null check in this handler.
+ */
+async function handleCollabRecruiting(msg) {
+  const recruiting = await collabStore.list({ status: COLLAB_STATUS.RECRUITING });
+  const summaries = recruiting.map(s => ({
+    session_id: s.session_id,
+    task_id: s.task_id,
+    mode: s.mode,
+    min_nodes: s.min_nodes,
+    max_nodes: s.max_nodes,
+    current_nodes: s.nodes.length,
+    node_ids: s.nodes.map(n => n.node_id || n.id),
+    recruiting_deadline: s.recruiting_deadline,
+  }));
+  respond(msg, summaries);
+}
 /**
  * mesh.collab.reflect — Node submits a reflection for the current round.
  * Expects: { session_id, node_id, summary, learnings, artifacts, confidence, vote }
@@ -546,8 +636,20 @@ async function handleCollabReflect(msg) {
   });
   publishCollabEvent('reflection_received', session);
-  // Check if all reflections are in → evaluate convergence
-  if (collabStore.isRoundComplete(session)) {
+  // Sequential mode: advance turn, notify next node or evaluate round
+  // Parallel mode: check if all reflections are in → evaluate convergence
+  // NOTE: no mutex/CAS is used around advanceTurn(). This handler awaits, so the single-threaded event loop
+  // alone does not stop two invocations interleaving; safety relies on the dispatcher awaiting each handler in turn — TODO confirm.
+  if (session.mode === 'sequential') {
+    const nextNodeId = await collabStore.advanceTurn(session_id);
+    if (nextNodeId) {
+      // Notify only the next-turn node with accumulated intra-round intel
+      await notifySequentialTurn(session_id, nextNodeId);
+    } else {
+      // All turns done → evaluate round
+      await evaluateRound(session_id);
+    }
+  } else if (collabStore.isRoundComplete(session)) {
     await evaluateRound(session_id);
   }
@@ -657,8 +759,14 @@ async function startCollabRound(sessionId) {
   const scopeStrategy = session.scope_strategy || 'shared';
   const nodeScopes = computeNodeScopes(session.nodes, taskScope, scopeStrategy);
-  // Notify each node with their enforced scope
-  for (const node of session.nodes) {
+  // Sequential mode: only notify the current_turn node.
+  // Other nodes get notified via notifySequentialTurn() as turns advance.
+  // Parallel mode: notify all nodes at once.
+  const nodesToNotify = session.mode === 'sequential' && session.current_turn
+    ? session.nodes.filter(n => n.node_id === session.current_turn)
+    : session.nodes;
+  for (const node of nodesToNotify) {
     const effectiveScope = nodeScopes[node.node_id] || node.scope;
     nc.publish(`mesh.collab.${sessionId}.node.${node.node_id}.round`, sc.encode(JSON.stringify({
       session_id: sessionId,
@@ -674,6 +782,57 @@ async function startCollabRound(sessionId) {
   }
 }
+/**
+ * Notify the next node in a sequential turn.
+ * Includes intra-round reflections so far as additional shared intel.
+ *
+ * Publishes one message on `mesh.collab.<sessionId>.node.<nextNodeId>.round`
+ * and then appends a 'turn_advanced' audit entry. Returns without publishing
+ * when the session cannot be loaded or it has no rounds yet.
+ *
+ * shared_intel in the message is the round's existing shared_intel (if any)
+ * followed by a text digest of every reflection already stored on the
+ * current round. The digest reads r.artifacts.length, so each reflection is
+ * assumed to carry an artifacts array — TODO confirm the store guarantees it.
+ *
+ * my_scope resolves as: the computeNodeScopes() entry for the node, else the
+ * node's own scope, else ['*']. my_role defaults to 'worker'.
+ *
+ * @param {string} sessionId - Collab session to load from collabStore.
+ * @param {string} nextNodeId - Node whose turn it now is; used in the subject and payload.
+ * @returns {Promise<void>}
+ */
+async function notifySequentialTurn(sessionId, nextNodeId) {
+  const session = await collabStore.get(sessionId);
+  if (!session) return;
+  const currentRound = session.rounds[session.rounds.length - 1];
+  if (!currentRound) return;
+  // Compile intra-round intel from reflections already submitted this round
+  const intraLines = [`=== INTRA-ROUND ${currentRound.round_number} (turns so far) ===\n`];
+  for (const r of currentRound.reflections) {
+    intraLines.push(`## Turn: ${r.node_id}${r.parse_failed ? ' [PARSE FAILED]' : ''}`);
+    if (r.summary) intraLines.push(`Summary: ${r.summary}`);
+    if (r.learnings) intraLines.push(`Learnings: ${r.learnings}`);
+    if (r.artifacts.length > 0) intraLines.push(`Artifacts: ${r.artifacts.join(', ')}`);
+    intraLines.push(`Confidence: ${r.confidence} | Vote: ${r.vote}`);
+    intraLines.push('');
+  }
+  const intraRoundIntel = intraLines.join('\n');
+  const combinedIntel = currentRound.shared_intel
+    ? currentRound.shared_intel + '\n\n' + intraRoundIntel
+    : intraRoundIntel;
+  const parentTask = await store.get(session.task_id);
+  const taskScope = parentTask?.scope || [];
+  const scopeStrategy = session.scope_strategy || 'shared';
+  const nodeScopes = computeNodeScopes(session.nodes, taskScope, scopeStrategy);
+  const nextNode = session.nodes.find(n => n.node_id === nextNodeId);
+  nc.publish(`mesh.collab.${sessionId}.node.${nextNodeId}.round`, sc.encode(JSON.stringify({
+    session_id: sessionId,
+    task_id: session.task_id,
+    round_number: currentRound.round_number,
+    shared_intel: combinedIntel,
+    my_scope: nodeScopes[nextNodeId] || nextNode?.scope || ['*'],
+    my_role: nextNode?.role || 'worker',
+    mode: 'sequential',
+    current_turn: nextNodeId,
+    scope_strategy: scopeStrategy,
+  })));
+  log(`COLLAB SEQ ${sessionId} R${currentRound.round_number}: Turn advanced to ${nextNodeId}`);
+  await collabStore.appendAudit(sessionId, 'turn_advanced', {
+    round: currentRound.round_number, next_node: nextNodeId,
+    reflections_so_far: currentRound.reflections.length,
+  });
+}
 /**
  * Evaluate the current round: check convergence, advance or complete.
  */
@@ -702,10 +861,11 @@ async function evaluateRound(sessionId) {
     await collabStore.markConverged(sessionId);
     publishCollabEvent('converged', session);
-    // Collect artifacts from all reflections
+    // Re-fetch after markConverged to ensure fresh state
+    const freshSession = await collabStore.get(sessionId);
     const allArtifacts = [];
     const contributions = {};
-    for (const round of session.rounds) {
+    for (const round of freshSession.rounds) {
       for (const r of round.reflections) {
         allArtifacts.push(...r.artifacts);
         contributions[r.node_id] = r.summary;
@@ -714,20 +874,20 @@ async function evaluateRound(sessionId) {
     await collabStore.markCompleted(sessionId, {
       artifacts: [...new Set(allArtifacts)],
-      summary: `Converged after ${session.current_round} rounds with ${session.nodes.length} nodes`,
+      summary: `Converged after ${freshSession.current_round} rounds with ${freshSession.nodes.length} nodes`,
       node_contributions: contributions,
     });
     await collabStore.appendAudit(sessionId, 'session_completed', {
-      outcome: 'converged', rounds: session.current_round,
+      outcome: 'converged', rounds: freshSession.current_round,
       artifacts: [...new Set(allArtifacts)].length,
-      node_count: session.nodes.length, recruited_count: session.recruited_count,
+      node_count: freshSession.nodes.length, recruited_count: freshSession.recruited_count,
     });
     // Complete the parent task
-    const updatedSession = await collabStore.get(sessionId);
-    await store.markCompleted(session.task_id, updatedSession.result);
-    publishEvent('completed', await store.get(session.task_id));
-    publishCollabEvent('completed', updatedSession);
+    const completedSession = await collabStore.get(sessionId);
+    await store.markCompleted(freshSession.task_id, completedSession.result);
+    publishEvent('completed', await store.get(freshSession.task_id));
+    publishCollabEvent('completed', completedSession);
   } else if (maxReached) {
     log(`COLLAB MAX ROUNDS ${sessionId}: ${session.current_round}/${session.max_rounds}. Completing with current artifacts.`);
@@ -955,6 +1115,19 @@ async function advancePlanWave(planId) {
   const waveNum = ready[0].wave;
   log(`PLAN WAVE ${planId} W${waveNum}: dispatching ${ready.length} subtasks`);
+  // Inherit routing fields from parent task so subtasks use the same LLM/node preferences.
+  // CONSTRAINT: Subtasks cannot override routing independently — they always inherit from the
+  // parent task. If per-subtask routing is needed, extend the subtask schema in mesh-plans.js
+  // (e.g. subtask.llm_provider) and merge here with subtask fields taking priority.
+  const parentTask = await store.get(plan.parent_task_id);
+  const inheritedRouting = {};
+  if (parentTask) {
+    if (parentTask.llm_provider) inheritedRouting.llm_provider = parentTask.llm_provider;
+    if (parentTask.llm_model) inheritedRouting.llm_model = parentTask.llm_model;
+    if (parentTask.preferred_nodes) inheritedRouting.preferred_nodes = parentTask.preferred_nodes;
+    if (parentTask.exclude_nodes) inheritedRouting.exclude_nodes = parentTask.exclude_nodes;
+  }
   for (const st of ready) {
     st.status = SUBTASK_STATUS.QUEUED;
@@ -962,7 +1135,7 @@ async function advancePlanWave(planId) {
     switch (st.delegation.mode) {
       case 'solo_mesh':
       case 'collab_mesh': {
-        // Submit as mesh task
+        // Submit as mesh task — inherit routing fields from parent task
         const meshTask = createTask({
           task_id: st.subtask_id,
           title: st.title,
@@ -973,6 +1146,7 @@ async function advancePlanWave(planId) {
           success_criteria: st.success_criteria,
           tags: ['plan', planId],
           collaboration: st.delegation.collaboration || undefined,
+          ...inheritedRouting,
         });
         await store.put(meshTask);
         st.mesh_task_id = meshTask.task_id;
@@ -1098,6 +1272,7 @@ async function main() {
     'mesh.collab.status':   handleCollabStatus,
     'mesh.collab.find':     handleCollabFind,
     'mesh.collab.reflect':  handleCollabReflect,
+    'mesh.collab.recruiting': handleCollabRecruiting,
     // Plan handlers
     'mesh.plans.create':          handlePlanCreate,
     'mesh.plans.get':             handlePlanGet,
@@ -1116,7 +1291,7 @@ async function main() {
         try {
           await handler(msg);
         } catch (err) {
-          log(`ERROR handling ${subject}: ${err.message}`);
+          log(`ERROR handling ${subject}: ${err.message}\n${err.stack}`);
           try { respondError(msg, err.message); } catch {}
         }
       }

package/bin/mesh.js CHANGED Viewed

@@ -397,6 +397,10 @@ async function cmdSubmit(args) {
       success_criteria: task.success_criteria || [],
       scope: task.scope || [],
       priority: task.auto_priority || 0,
+      llm_provider: task.provider || task.llm_provider || null,
+      llm_model: task.model || task.llm_model || null,
+      preferred_nodes: task.preferred_nodes || [],
+      exclude_nodes: task.exclude_nodes || [],
     });
     console.log(`Submitted: ${result.data.task_id} [${result.data.status}]`);
     // Mark as 'submitted' — NOT 'running'. The card reflects actual mesh state.
@@ -451,12 +455,18 @@ async function cmdSubmit(args) {
     scope: task.scope || [],
     priority: task.priority || 0,
     tags: task.tags || [],
+    llm_provider: task.provider || task.llm_provider || null,
+    llm_model: task.model || task.llm_model || null,
+    preferred_nodes: task.preferred_nodes || [],
+    exclude_nodes: task.exclude_nodes || [],
+    collaboration: task.collaboration || undefined,
   });
   console.log(`Submitted: ${result.data.task_id} "${result.data.title}"`);
   console.log(`  Status:  ${result.data.status}`);
   console.log(`  Budget:  ${result.data.budget_minutes}m`);
   console.log(`  Metric:  ${result.data.metric || 'none'}`);
+  if (result.data.llm_provider) console.log(`  Provider: ${result.data.llm_provider}`);
   await nc.close();
 }
@@ -598,7 +608,11 @@ async function cmdRepair(args) {
  */
 async function cmdDeploy(args) {
   const { execSync } = require('child_process');
-  const repoDir = process.env.OPENCLAW_REPO_DIR || path.join(os.homedir(), 'openclaw');
+  // Prefer openclaw-node (git repo) over openclaw (runtime)
+  const defaultRepo = fs.existsSync(path.join(os.homedir(), 'openclaw-node', '.git'))
+    ? path.join(os.homedir(), 'openclaw-node')
+    : path.join(os.homedir(), 'openclaw');
+  const repoDir = process.env.OPENCLAW_REPO_DIR || defaultRepo;
   const force = args.includes('--force');
   // Parse --component flags
@@ -658,7 +672,7 @@ async function cmdDeploy(args) {
   await nc.flush();
   console.log('Deploy trigger sent.\n');
-  // Poll for results (15s timeout)
+  // Poll for results (15s timeout — the deadline below is Date.now() + 15000)
   console.log('Waiting for node responses...');
   const deadline = Date.now() + 15000;
   const seen = new Set();
@@ -668,6 +682,7 @@ async function cmdDeploy(args) {
     const resultsKv = await js.views.kv('MESH_DEPLOY_RESULTS');
     while (Date.now() < deadline) {
+      // Check all nodes
       const allAliasNodes = [...new Set(Object.values(NODE_ALIASES))];
       const checkNodes = targetNodes.length > 0 ? targetNodes : allAliasNodes;
@@ -731,9 +746,9 @@ function cmdHelp() {
     '  mesh repair                             Self-repair this node',
     '  mesh repair --all                       Self-repair ALL nodes',
     '  mesh deploy                             Deploy to all nodes',
-    '  mesh deploy --force                     Force deploy (even if up to date)',
-    '  mesh deploy --component <name>          Deploy specific component',
-    '  mesh deploy --node <name>               Deploy to specific node',
+    '  mesh deploy --force                     Force deploy (skip cache)',
+    '  mesh deploy --node ubuntu               Deploy to specific node',
+    '  mesh deploy --component mesh-daemons    Deploy specific component',
     '',
     'NODE ALIASES:',
     '  ubuntu, linux   = Ubuntu VM (calos-vmware-virtual-platform)',

package/install.sh CHANGED Viewed

@@ -230,6 +230,10 @@ if [ -z "$NODE_ROLE" ]; then
     NODE_ROLE="worker"
   fi
 fi
+if [ "$NODE_ROLE" != "lead" ] && [ "$NODE_ROLE" != "worker" ]; then
+  error "Invalid role: $NODE_ROLE (must be 'lead' or 'worker')"
+  exit 1
+fi
 export OPENCLAW_NODE_ROLE="$NODE_ROLE"
 info "Node role: $NODE_ROLE"
@@ -692,6 +696,9 @@ else
       if command -v envsubst >/dev/null 2>&1; then
         envsubst < "$TEMPLATE" > "$DEST"
       else
+        # NOTE: sed delimiter is |. If OPENCLAW_NATS_TOKEN ever contains |
+        # (unlikely — tokens are hex/base64), this substitution will break.
+        # Prefer envsubst (above) when available; it has no delimiter issue.
         sed \
           -e "s|\${HOME}|$HOME|g" \
           -e "s|\${NODE_BIN}|$NODE_BIN|g" \

package/lib/kanban-io.js CHANGED Viewed

@@ -53,8 +53,14 @@ function withMkdirLock(filePath, fn) {
       if (Date.now() - start > maxWait) {
         throw new Error(`kanban-io: lock timeout after ${maxWait}ms on ${filePath}`);
       }
-      // Non-blocking 10ms pause (busy-wait would peg CPU in Node's single thread)
-      require('child_process').spawnSync('sleep', ['0.01']);
+      // Sleep ~10ms — Atomics.wait is precise but throws on main thread
+      // in some Node.js builds; fall back to busy-spin (rare contention path)
+      try {
+        Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, 10);
+      } catch {
+        const end = Date.now() + 10;
+        while (Date.now() < end) { /* busy-wait fallback */ }
+      }
     }
   }
@@ -81,9 +87,7 @@ function parseTasks(content) {
   const liveIdx = content.indexOf('## Live Tasks');
   if (liveIdx === -1) return tasks;
-  const afterLive = content.slice(liveIdx);
-  const nextSectionIdx = afterLive.indexOf('\n## ', 1); // skip the current ## Live Tasks
-  const liveSection = nextSectionIdx >= 0 ? afterLive.slice(0, nextSectionIdx) : afterLive;
+  const liveSection = content.slice(liveIdx);
   const lines = liveSection.split('\n');
   let current = null;
@@ -110,6 +114,13 @@ function parseTasks(content) {
         budget_minutes: current.budget_minutes || 30,
         scope: current.scope || [],
         updated_at: current.updated_at || '',
+        // Mesh routing
+        llm_provider: current.llm_provider || null,
+        llm_model: current.llm_model || null,
+        preferred_nodes: current.preferred_nodes || [],
+        exclude_nodes: current.exclude_nodes || [],
+        collaboration: current.collaboration || null,
+        collab_result: current.collab_result || null,
       });
     }
   }
@@ -119,7 +130,7 @@ function parseTasks(content) {
     const taskIdMatch = line.match(/^- task_id:\s*(.+)$/);
     if (taskIdMatch) {
       flush();
-      current = { task_id: taskIdMatch[1].trim(), success_criteria: [], artifacts: [], scope: [] };
+      current = { task_id: taskIdMatch[1].trim(), success_criteria: [], artifacts: [], scope: [], preferred_nodes: [], exclude_nodes: [] };
       currentArrayKey = null;
       continue;
     }
@@ -176,6 +187,31 @@ function parseTasks(content) {
           current.scope = [];
           currentArrayKey = 'scope';
           break;
+        // Mesh routing fields
+        case 'llm_provider':
+        case 'provider':
+          current.llm_provider = value || null; currentArrayKey = null; break;
+        case 'llm_model':
+        case 'model':
+          current.llm_model = value || null; currentArrayKey = null; break;
+        case 'preferred_nodes':
+          current.preferred_nodes = [];
+          currentArrayKey = 'preferred_nodes';
+          break;
+        case 'exclude_nodes':
+          current.exclude_nodes = [];
+          currentArrayKey = 'exclude_nodes';
+          break;
+        case 'collaboration':
+          try { current.collaboration = value ? JSON.parse(value) : null; }
+          catch { current.collaboration = null; }
+          currentArrayKey = null;
+          break;
+        case 'collab_result':
+          try { current.collab_result = value ? JSON.parse(value) : null; }
+          catch { current.collab_result = null; }
+          currentArrayKey = null;
+          break;
         default:
           currentArrayKey = null;
           break;
@@ -236,7 +272,11 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
   const blockLines = lines.slice(blockStart, blockEnd);
   // Update scalar fields
-  for (const [key, value] of Object.entries(fieldUpdates)) {
+  for (const [key, rawValue] of Object.entries(fieldUpdates)) {
+    // Serialize objects/arrays as JSON so the parser can read them back
+    const value = (rawValue !== null && typeof rawValue === 'object')
+      ? JSON.stringify(rawValue)
+      : rawValue;
     const fieldRegex = new RegExp(`^  ${key}:\\s*.*$`);
     let found = false;
     for (let i = 1; i < blockLines.length; i++) {
@@ -249,7 +289,7 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
     if (!found) {
       // Insert before updated_at if it exists, otherwise at end of block
       const updatedAtIdx = blockLines.findIndex(l => l.match(/^  updated_at:/));
-      const insertIdx = updatedAtIdx >= 0 ? updatedAtIdx : blockLines.length;
+      const insertIdx = updatedAtIdx > 0 ? updatedAtIdx : blockLines.length;
       blockLines.splice(insertIdx, 0, `  ${key}: ${value}`);
     }
   }
@@ -262,7 +302,7 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
     if (headerIdx === -1) {
       // Insert the array before updated_at
       const updatedAtIdx = blockLines.findIndex(l => l.match(/^  updated_at:/));
-      const insertIdx = updatedAtIdx >= 0 ? updatedAtIdx : blockLines.length;
+      const insertIdx = updatedAtIdx > 0 ? updatedAtIdx : blockLines.length;
       const newLines = [`  ${key}:`];
       for (const item of items) {
         newLines.push(`    - ${item}`);
@@ -290,7 +330,7 @@ function _updateTaskInPlaceUnsafe(filePath, taskId, fieldUpdates = {}, arrayAppe
     if (headerIdx === -1) {
       // Insert the array before updated_at
       const updatedAtIdx = blockLines.findIndex(l => l.match(/^  updated_at:/));
-      const insertIdx = updatedAtIdx >= 0 ? updatedAtIdx : blockLines.length;
+      const insertIdx = updatedAtIdx > 0 ? updatedAtIdx : blockLines.length;
       const newLines = [`  ${key}:`];
       for (const item of items) {
         newLines.push(`    - ${item}`);

package/lib/mesh-collab.js CHANGED Viewed

@@ -105,6 +105,10 @@ function createSession(taskId, collabSpec) {
 // ── CollabStore (KV-backed) ─────────────────────────
+// Rate-limit audit error logs: max 3 per session, then go silent
+const _auditErrorCounts = new Map();
+const AUDIT_ERROR_LOG_LIMIT = 3;
 class CollabStore {
   constructor(kv) {
     this.kv = kv;
@@ -139,7 +143,14 @@ class CollabStore {
         ...detail,
       });
       await this.put(session);
-    } catch { /* best-effort — never block on audit */ }
+    } catch (err) {
+      // Best-effort — never block on audit, but log first N failures per session
+      const count = (_auditErrorCounts.get(sessionId) || 0) + 1;
+      _auditErrorCounts.set(sessionId, count);
+      if (count <= AUDIT_ERROR_LOG_LIMIT) {
+        console.error(`[collab] audit append failed for ${sessionId}/${event}: ${err.message}${count === AUDIT_ERROR_LOG_LIMIT ? ' (suppressing further audit errors for this session)' : ''}`);
+      }
+    }
   }
   /**
@@ -176,6 +187,30 @@ class CollabStore {
     return sessions[0] || null;
   }
+  /**
+   * Find active sessions that contain a given node.
+   * Introduced for detectStalls(), replacing its earlier list() + inner find().
+   *
+   * Cost: every key in the KV bucket is fetched once (one kv.get per key), and
+   * each ACTIVE session's node list is scanned with some(). The work therefore
+   * still grows with the number of nodes per session, not only with the
+   * number of sessions.
+   *
+   * Keys are buffered into an array before any entry is read. Entries with no
+   * value are skipped. A value that is not valid JSON makes JSON.parse throw,
+   * and that error propagates to the caller (no try/catch here).
+   *
+   * @param {string} nodeId - node_id to look for in session.nodes.
+   * @returns {Promise<object[]>} Parsed session objects with status ACTIVE that include nodeId.
+   */
+  async findActiveSessionsByNode(nodeId) {
+    const results = [];
+    const allKeys = [];
+    const keys = await this.kv.keys();
+    for await (const key of keys) {
+      allKeys.push(key);
+    }
+    for (const key of allKeys) {
+      const entry = await this.kv.get(key);
+      if (!entry || !entry.value) continue;
+      const session = JSON.parse(sc.decode(entry.value));
+      if (session.status !== COLLAB_STATUS.ACTIVE) continue;
+      if (session.nodes.some(n => n.node_id === nodeId)) {
+        results.push(session);
+      }
+    }
+    return results;
+  }
   // ── Node Management ────────────────────────────────
   /**
@@ -190,7 +225,8 @@ class CollabStore {
     // Check max_nodes
     if (session.max_nodes && session.nodes.length >= session.max_nodes) return null;
-    // Check duplicate
+    // Check duplicate — single-threaded event loop prevents concurrent joins
+    // from interleaving between find() and push(). No mutex needed.
     if (session.nodes.find(n => n.node_id === nodeId)) return null;
     session.nodes.push({
@@ -320,6 +356,9 @@ class CollabStore {
     const session = await this.get(sessionId);
     if (!session) return null;
+    // Only accept reflections on active sessions
+    if (session.status !== COLLAB_STATUS.ACTIVE) return null;
     const currentRound = session.rounds[session.rounds.length - 1];
     if (!currentRound) return null;
@@ -506,11 +545,14 @@ class CollabStore {
   }
   /**
-   * Mark session as aborted.
+   * Mark session as aborted. Returns null (no-op) if already completed or aborted.
+   * Callers can use truthiness to detect whether the abort actually happened.
    */
   async markAborted(sessionId, reason) {
     const session = await this.get(sessionId);
     if (!session) return null;
+    // Guard: don't corrupt completed/aborted sessions
+    if (['completed', 'aborted'].includes(session.status)) return null;
     session.status = COLLAB_STATUS.ABORTED;
     session.completed_at = new Date().toISOString();
     session.result = { success: false, summary: reason, aborted: true };
@@ -518,6 +560,14 @@ class CollabStore {
     return session;
   }
+  /**
+   * Clear the audit error rate-limit counter for a session.
+   * Call when a session is finalized (completed/aborted) to prevent Map leak.
+   *
+   * Removes sessionId from the module-level _auditErrorCounts Map. Safe to call
+   * for a session that has no counter (Map.delete is a no-op in that case).
+   *
+   * @param {string} sessionId - Session whose counter should be dropped.
+   * @returns {void}
+   */
+  clearAuditErrorCount(sessionId) {
+    _auditErrorCounts.delete(sessionId);
+  }
   /**
    * Get a summary of the session for reporting.
    */