npm - clawmatrix - Versions diffs - 0.4.2 → 0.5.0 - Mend

clawmatrix 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/README.md +17 -21
package/cli/bin/clawmatrix.mjs +300 -1
package/package.json +8 -1
package/src/acp-proxy.ts +122 -50
package/src/{web.ts → api.ts} +646 -25
package/src/audit.ts +37 -2
package/src/auth.ts +5 -10
package/src/automation.ts +625 -0
package/src/cluster-service.ts +172 -16
package/src/compat.ts +103 -0
package/src/config.ts +75 -27
package/src/connection.ts +215 -37
package/src/crypto.ts +72 -5
package/src/device-info.ts +21 -2
package/src/file-transfer.ts +3 -2
package/src/handoff.ts +90 -32
package/src/health-tracker.ts +91 -356
package/src/index.ts +421 -13
package/src/kanban.ts +507 -0
package/src/knowledge-sync.ts +158 -7
package/src/local-tools.ts +65 -2
package/src/log-replication.ts +198 -0
package/src/model-proxy.ts +152 -60
package/src/peer-approval.ts +3 -2
package/src/peer-manager.ts +230 -44
package/src/retry.ts +81 -0
package/src/router.ts +152 -104
package/src/sentinel.ts +85 -51
package/src/store.ts +578 -0
package/src/terminal.ts +17 -8
package/src/tool-proxy.ts +6 -5
package/src/tools/cluster-events.ts +6 -6
package/src/tools/cluster-kanban.ts +345 -0
package/src/tools/cluster-peers.ts +1 -1
package/src/tools/cluster-query.ts +145 -0
package/src/types.ts +95 -9

package/src/model-proxy.ts CHANGED Viewed

@@ -8,11 +8,52 @@ import type {
   ModelResponse,
   ModelStreamChunk,
 } from "./types.ts";
+import { nanoid } from "nanoid";
+import { LRUCache } from "lru-cache";
+import { Semaphore as AsyncSemaphore } from "async-mutex";
+import { getNodeCircuitBreaker, isNodeCircuitOpen, removeNodeCircuitBreaker, resetAllCircuitBreakers } from "./retry.ts";
 import { debug } from "./debug.ts";
 import { readBody } from "./http-utils.ts";
 const DEFAULT_MODEL_TIMEOUT = 120_000; // 2 minutes
+// ── Semaphore for per-node concurrency control (backed by async-mutex) ──
+class Semaphore {
+  private sem: AsyncSemaphore; // underlying async-mutex semaphore; all permits are tracked there
+  constructor(max: number) { // max is forwarded unchanged to the AsyncSemaphore constructor
+    this.sem = new AsyncSemaphore(max);
+  }
+  /** Acquire a permit. Resolves once the underlying semaphore grants a slot; rejects with Error("Semaphore timeout") if timeoutMs elapses first, or with the underlying error if the inner acquire rejects. */
+  acquire(timeoutMs: number): Promise<void> {
+    return new Promise<void>((resolve, reject) => {
+      let settled = false; // set by whichever of the timer / inner acquire finishes first, so the outer promise settles once
+      const timer = setTimeout(() => {
+        if (!settled) { settled = true; reject(new Error("Semaphore timeout")); }
+      }, timeoutMs);
+      this.sem.acquire().then(([, release]) => { // second tuple element is the releaser for this particular acquisition
+        clearTimeout(timer);
+        if (settled) { release(); return; } // timeout already fired → release slot to avoid deadlock
+        settled = true;
+        resolve(); // the releaser is not handed to the caller; callers give the permit back through release() below
+      }, (err) => { // inner acquire rejected — NOTE(review): presumably what happens to waiters after drain(); confirm against async-mutex docs
+        clearTimeout(timer);
+        if (!settled) { settled = true; reject(err); }
+      });
+    });
+  }
+  release() { // returns a permit by calling release() on the underlying semaphore directly; intended to pair with a resolved acquire()
+    this.sem.release();
+  }
+  drain() { // calls cancel() on the underlying semaphore — NOTE(review): expected to reject queued waiters, not to free held permits; confirm
+    this.sem.cancel();
+  }
+}
 /** Normalize usage from OpenAI-compatible APIs (supports both field naming conventions). */
 function parseUsage(usage: Record<string, number> | undefined): { inputTokens: number; outputTokens: number } | undefined {
   if (!usage) return undefined;
@@ -54,6 +95,8 @@ interface PendingModelReq {
   buildFrame?: (candidate: FailoverCandidate, newId: string) => ModelRequest;
   /** Stable ID for the entire stream (for setup events & final close). */
   stableStreamId?: string;
+  /** Release concurrency semaphore when request completes. */
+  release?: () => void;
 }
 export class ModelProxy {
@@ -65,14 +108,17 @@ export class ModelProxy {
   private openclawConfig: OpenClawConfig;
   private readonly modelTimeout: number;
+  /** Per-node concurrency control semaphores. */
+  private nodeSemaphores = new Map<string, Semaphore>();
+  private readonly modelConcurrency: number;
   /** Dynamically discovered proxy models from peer capabilities (auto-discovery). */
   private discoveredModels: import("./config.ts").ProxyModel[] = [];
   /** Cache of models that need a different API format than configured (detected at runtime).
    *  Entries expire after 10 minutes so upstream upgrades are eventually detected. */
-  private modelApiCache = new Map<string, { api: string; ts: number }>();
   private static readonly MODEL_API_CACHE_TTL = 600_000; // 10 minutes
-  private cacheCleanupTimer: ReturnType<typeof setInterval> | null = null;
+  private modelApiCache = new LRUCache<string, string>({ max: 200, ttl: ModelProxy.MODEL_API_CACHE_TTL });
   constructor(config: ClawMatrixConfig, peerManager: PeerManager, gatewayInfo: GatewayInfo, openclawConfig: OpenClawConfig) {
     this.config = config;
@@ -80,6 +126,16 @@ export class ModelProxy {
     this.gatewayInfo = gatewayInfo;
     this.openclawConfig = openclawConfig;
     this.modelTimeout = config.modelTimeout ?? DEFAULT_MODEL_TIMEOUT;
+    this.modelConcurrency = config.modelConcurrency ?? 5;
+  }
+  private getSemaphore(nodeId: string): Semaphore { // returns the semaphore stored for nodeId, creating it on first use
+    let sem = this.nodeSemaphores.get(nodeId);
+    if (!sem) {
+      sem = new Semaphore(this.modelConcurrency); // every node gets the same permit count (modelConcurrency)
+      this.nodeSemaphores.set(nodeId, sem); // cached so later calls for this nodeId share one semaphore
+    }
+    return sem;
   }
   /** All proxy models: static config + dynamically discovered from peers. */
@@ -309,13 +365,7 @@ export class ModelProxy {
   /** Start the local HTTP proxy server for OpenAI-compatible requests. */
   start() {
-    // Periodically prune expired model API cache entries
-    this.cacheCleanupTimer = setInterval(() => {
-      const now = Date.now();
-      for (const [id, entry] of this.modelApiCache) {
-        if (now - entry.ts > ModelProxy.MODEL_API_CACHE_TTL) this.modelApiCache.delete(id);
-      }
-    }, ModelProxy.MODEL_API_CACHE_TTL);
+    // LRU cache handles TTL-based expiration automatically
     this.httpServer = createServer(async (req, res) => {
       try {
@@ -366,10 +416,7 @@ export class ModelProxy {
   }
   stop() {
-    if (this.cacheCleanupTimer) {
-      clearInterval(this.cacheCleanupTimer);
-      this.cacheCleanupTimer = null;
-    }
+    this.modelApiCache.clear();
     if (this.httpServer) {
       // Force-close all keep-alive connections so the port is released immediately
       const server = this.httpServer as typeof this.httpServer & { closeAllConnections?: () => void };
@@ -387,12 +434,18 @@ export class ModelProxy {
     this.streamText.clear();
     this.streamSetupSent.clear();
     this.modelApiCache.clear();
+    for (const sem of this.nodeSemaphores.values()) sem.drain();
+    this.nodeSemaphores.clear();
+    resetAllCircuitBreakers();
   }
   /** Clean up all tracking state for a request (pending, streamText, streamSetupSent). */
   private cleanupRequest(id: string, stableStreamId?: string) {
     const pending = this.pending.get(id);
-    if (pending) clearTimeout(pending.timer);
+    if (pending) {
+      clearTimeout(pending.timer);
+      pending.release?.();
+    }
     this.pending.delete(id);
     this.streamText.delete(id);
     if (stableStreamId) this.streamSetupSent.delete(stableStreamId);
@@ -477,8 +530,11 @@ export class ModelProxy {
       }
     }
-    // Sort candidates by latency (lowest first) for optimal first-try and failover order
+    // Sort candidates: circuit-open nodes last, then direct before relay, then by latency
     candidates.sort((a, b) => {
+      const aOpen = isNodeCircuitOpen(a.routeNodeId) ? 1 : 0;
+      const bOpen = isNodeCircuitOpen(b.routeNodeId) ? 1 : 0;
+      if (aOpen !== bOpen) return aOpen - bOpen;
       const routeA = this.peerManager.router.getRoute(a.routeNodeId);
       const routeB = this.peerManager.router.getRoute(b.routeNodeId);
       const aDirect = routeA?.connection ? 0 : 1;
@@ -515,7 +571,7 @@ export class ModelProxy {
     debug("proxy", `messages count=${baseMessages?.length ?? 0} roles=${(baseMessages ?? []).map((m: unknown) => (m as Record<string, unknown>)?.role).join(",")}`);
     const stream = body.stream ?? false;
-    const requestId = crypto.randomUUID();
+    const requestId = nanoid();
     const buildFrame = (candidate: FailoverCandidate, id: string): ModelRequest => {
       // Clone messages so each candidate gets its own description prefix
       const messages = baseMessages.map((m: unknown) => (m && typeof m === "object" ? { ...(m as object) } : m));
@@ -569,7 +625,7 @@ export class ModelProxy {
     }
     const stream = body.stream ?? false;
-    const requestId = crypto.randomUUID();
+    const requestId = nanoid();
     debug("proxy", `responses: stream=${stream} messages=${baseItems.length} input_type=${typeof body.input}${Array.isArray(body.input) ? `[${body.input.length}]` : ""}`);
     const buildFrame = (candidate: FailoverCandidate, id: string): ModelRequest => {
       // Clone items so each candidate gets its own description prefix
@@ -653,35 +709,47 @@ export class ModelProxy {
   ) {
     const stableId = streamId ?? requestId;
-    const timer = setTimeout(() => {
-      this.cleanupRequest(requestId);
-      this.peerManager.router.markFailed(requestId);
-      this.tryStreamFailover(stableId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame, `model request to "${targetNodeId}" timed out`);
-    }, this.modelTimeout);
-    this.pending.set(requestId, {
-      resolve: () => {}, reject: () => {},
-      timer, stream: true, responseFormat, model,
-      targetNodeId,
-      controller, encoder,
-      hasContent: false,
-      failoverCandidates,
-      buildFrame,
-      stableStreamId: stableId,
-    });
+    // Acquire per-node concurrency permit (async, then send)
+    const sem = this.getSemaphore(targetNodeId);
+    sem.acquire(this.modelTimeout).then(() => {
+      if (this.pending.has(requestId)) { sem.release(); return; } // already cleaned up — release permit
-    // Emit setup events for responses API (only once per stream, keyed by stableId)
-    if (responseFormat === "responses" && !this.streamSetupSent.has(stableId)) {
-      const hasTools = Array.isArray(frame.payload.tools) && frame.payload.tools.length > 0;
-      this.enqueueResponsesStreamSetup(controller, encoder, stableId, model, hasTools);
-      this.streamSetupSent.add(stableId);
-    }
+      const release = () => sem.release();
-    const sent = this.peerManager.sendTo(targetNodeId, frame);
-    if (!sent) {
-      this.cleanupRequest(requestId);
-      this.tryStreamFailover(stableId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame, `cannot reach node "${targetNodeId}"`);
-    }
+      const timer = setTimeout(() => {
+        this.cleanupRequest(requestId);
+        this.peerManager.router.markFailed(requestId);
+        this.tryStreamFailover(stableId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame, `model request to "${targetNodeId}" timed out`);
+      }, this.modelTimeout);
+      this.pending.set(requestId, {
+        resolve: () => {}, reject: () => {},
+        timer, stream: true, responseFormat, model,
+        targetNodeId,
+        controller, encoder,
+        hasContent: false,
+        failoverCandidates,
+        buildFrame,
+        stableStreamId: stableId,
+        release,
+      });
+      // Emit setup events for responses API (only once per stream, keyed by stableId)
+      if (responseFormat === "responses" && !this.streamSetupSent.has(stableId)) {
+        const hasTools = Array.isArray(frame.payload.tools) && frame.payload.tools.length > 0;
+        this.enqueueResponsesStreamSetup(controller, encoder, stableId, model, hasTools);
+        this.streamSetupSent.add(stableId);
+      }
+      const sent = this.peerManager.sendTo(targetNodeId, frame);
+      if (!sent) {
+        this.cleanupRequest(requestId);
+        this.tryStreamFailover(stableId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame, `cannot reach node "${targetNodeId}"`);
+      }
+    }).catch(() => {
+      // Semaphore timeout — all slots busy
+      this.tryStreamFailover(stableId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame, `node "${targetNodeId}" concurrency limit reached`);
+    });
   }
   /** Track which stream requests have already sent responses API setup events. */
@@ -701,7 +769,7 @@ export class ModelProxy {
     if (candidates.length > 0 && buildFrame) {
       const next = candidates[0]!;
       const remaining = candidates.slice(1);
-      const newId = crypto.randomUUID();
+      const newId = nanoid();
       const newFrame = buildFrame(next, newId);
       debug("proxy", `failover: ${reason} → trying ${next.routeNodeId} (${remaining.length} left)`);
       this.startStreamAttempt(newId, next.routeNodeId, newFrame, responseFormat, controller, encoder, model, remaining, buildFrame, stableStreamId);
@@ -773,8 +841,20 @@ export class ModelProxy {
     const maxAttempts = failoverCandidates.length + 1;
     for (let attempt = 0; attempt < maxAttempts; attempt++) {
+      // Acquire per-node concurrency permit
+      const sem = this.getSemaphore(currentTarget);
       try {
-        const result = await this.sendNonStreamAndWait(currentId, currentTarget, currentFrame, responseFormat);
+        await sem.acquire(this.modelTimeout);
+      } catch {
+        return {
+          status: 503,
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({ error: { message: `Node "${currentTarget}" concurrency limit reached` } }),
+        };
+      }
+      try {
+        const release = () => sem.release();
+        const result = await this.sendNonStreamAndWait(currentId, currentTarget, currentFrame, responseFormat, release);
         if (!result.success) {
           // Upstream error — try failover if available
@@ -782,7 +862,7 @@ export class ModelProxy {
             const next = failoverCandidates[failoverIdx]!;
             debug("proxy", `failover: remote error "${result.error}" → trying ${next.routeNodeId} (${failoverCandidates.length - failoverIdx - 1} left)`);
             failoverIdx++;
-            currentId = crypto.randomUUID();
+            currentId = nanoid();
             currentFrame = buildFrame(next, currentId);
             currentTarget = next.routeNodeId;
             continue;
@@ -801,7 +881,7 @@ export class ModelProxy {
           const next = failoverCandidates[failoverIdx]!;
           debug("proxy", `failover: ${err instanceof Error ? err.message : String(err)} → trying ${next.routeNodeId} (${failoverCandidates.length - failoverIdx - 1} left)`);
           failoverIdx++;
-          currentId = crypto.randomUUID();
+          currentId = nanoid();
           currentFrame = buildFrame(next, currentId);
           currentTarget = next.routeNodeId;
           continue;
@@ -826,10 +906,11 @@ export class ModelProxy {
     targetNodeId: string,
     frame: ModelRequest,
     responseFormat: ResponseFormat,
+    release?: () => void,
   ): Promise<ModelResponse["payload"]> {
     return new Promise<ModelResponse["payload"]>((resolve, reject) => {
       const timer = setTimeout(() => {
-        this.pending.delete(requestId);
+        this.cleanupRequest(requestId);
         this.peerManager.router.markFailed(requestId);
         reject(new Error(`Model request to "${targetNodeId}" timed out`));
       }, this.modelTimeout);
@@ -837,13 +918,12 @@ export class ModelProxy {
       this.pending.set(requestId, {
         resolve: resolve as (v: unknown) => void,
         reject, timer, stream: false, responseFormat,
-        targetNodeId,
+        targetNodeId, release,
       });
       const sent = this.peerManager.sendTo(targetNodeId, frame);
       if (!sent) {
-        this.pending.delete(requestId);
-        clearTimeout(timer);
+        this.cleanupRequest(requestId);
         reject(new Error(`Cannot reach model node "${targetNodeId}"`));
       }
     });
@@ -952,6 +1032,16 @@ export class ModelProxy {
     const pending = this.pending.get(frame.id);
     if (!pending) return;
+    // Record circuit breaker outcome for the responding node
+    if (pending.targetNodeId) {
+      const cb = getNodeCircuitBreaker(pending.targetNodeId);
+      if (frame.payload.success) {
+        cb.onSuccess();
+      } else {
+        cb.onFailure();
+      }
+    }
     // For stream requests, handle error responses (the remote node couldn't
     // process the request and sent model_res instead of model_stream).
     if (pending.stream) {
@@ -999,10 +1089,14 @@ export class ModelProxy {
     // Reset activity timer — keeps long-running streams alive and detects
     // stalled connections within modelTimeout of the last received chunk.
     clearTimeout(pending.timer);
-    if (!frame.payload.done) {
+    if (frame.payload.done) {
+      // Stream completed successfully — record circuit breaker success
+      if (pending.targetNodeId) getNodeCircuitBreaker(pending.targetNodeId).onSuccess();
+    } else {
       pending.timer = setTimeout(() => {
         // Capture references before cleanup removes pending from the map
-        const { stableStreamId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame } = pending;
+        const { stableStreamId, responseFormat, controller, encoder, model, failoverCandidates, buildFrame, targetNodeId } = pending;
+        if (targetNodeId) getNodeCircuitBreaker(targetNodeId).onFailure();
         this.cleanupRequest(frame.id);
         this.peerManager.router.markFailed(frame.id);
         this.tryStreamFailover(
@@ -1160,9 +1254,7 @@ export class ModelProxy {
         return;
       }
       // Use payload.api override from requesting side, or cached API from previous auto-detection
-      const cached = this.modelApiCache.get(model.id);
-      const cachedApi = (cached && Date.now() - cached.ts < ModelProxy.MODEL_API_CACHE_TTL) ? cached.api : undefined;
-      if (cached && !cachedApi) this.modelApiCache.delete(model.id); // expired
+      const cachedApi = this.modelApiCache.get(model.id);
       const effectiveApi = payload.api ?? cachedApi ?? endpoint.api;
       const isResponsesApi = effectiveApi === "openai-responses" || effectiveApi === "openai-codex-responses";
       const path = isResponsesApi ? "/responses" : "/chat/completions";
@@ -1309,7 +1401,7 @@ export class ModelProxy {
             debug("model_req", `responses API stream produced no content for "${model.id}", retrying with chat completions`);
             const chatResult = await this.retryWithChatCompletions(endpoint, modelField, payload, headers);
             if (chatResult) {
-              this.modelApiCache.set(model.id, { api: "openai-completions", ts: Date.now() });
+              this.modelApiCache.set(model.id, "openai-completions");
               debug("model_req", `cached "${model.id}" as openai-completions (stream fallback)`);
               if (chatResult.content) {
                 this.sendStreamDelta(from, id, chatResult.content);
@@ -1352,7 +1444,7 @@ export class ModelProxy {
             debug("model_req", `responses API returned non-JSON for "${model.id}", retrying with chat completions`);
             chatFallbackResult = await this.retryWithChatCompletions(endpoint, modelField, payload, headers);
             if (chatFallbackResult) {
-              this.modelApiCache.set(model.id, { api: "openai-completions", ts: Date.now() });
+              this.modelApiCache.set(model.id, "openai-completions");
               debug("model_req", `cached "${model.id}" as openai-completions (non-JSON fallback)`);
             }
           }
@@ -1390,7 +1482,7 @@ export class ModelProxy {
             debug("model_req", `responses API returned empty output for "${model.id}" (output_tokens=${parsedUsage!.outputTokens}), retrying with chat completions`);
             const chatResult = await this.retryWithChatCompletions(endpoint, modelField, payload, headers);
             if (chatResult) {
-              this.modelApiCache.set(model.id, { api: "openai-completions", ts: Date.now() });
+              this.modelApiCache.set(model.id, "openai-completions");
               debug("model_req", `cached "${model.id}" as openai-completions`);
               ({ content, message, usage } = chatResult);
             } else {

package/src/peer-approval.ts CHANGED Viewed

@@ -1,6 +1,7 @@
-import { EventEmitter } from "node:events";
+import { EventEmitter } from "eventemitter3";
 import fs from "node:fs";
 import path from "node:path";
+import { nanoid } from "nanoid";
 import type { PeerApprovalConfig } from "./config.ts";
 import { debug } from "./debug.ts";
 import type {
@@ -257,7 +258,7 @@ export class PeerApprovalManager extends EventEmitter<PeerApprovalEvents> {
       return this.waitForBaseApproval(baseNodeId, nodeId, capabilities, publicKey);
     }
-    const approvalId = crypto.randomUUID();
+    const approvalId = nanoid();
     this.log(`requestApproval: nodeId=${nodeId} mode=${this.config.mode} approvalId=${approvalId}`);
     if (this.config.mode === "notify") {