npm - @loreai/gateway - Versions diffs - 0.13.3 → 0.14.0 - Mend

@loreai/gateway 0.13.3 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/dist/index.js +49694 -3155
package/package.json +14 -6
package/src/batch-queue.ts +21 -1
package/src/cache-analytics.ts +344 -0
package/src/cli/agents.ts +107 -0
package/src/cli/bin.ts +11 -0
package/src/cli/help.ts +55 -0
package/src/cli/lib/binary.ts +353 -0
package/src/cli/lib/bspatch.ts +306 -0
package/src/cli/lib/delta-upgrade.ts +790 -0
package/src/cli/lib/errors.ts +48 -0
package/src/cli/lib/ghcr.ts +389 -0
package/src/cli/lib/patch-cache.ts +342 -0
package/src/cli/lib/upgrade.ts +454 -0
package/src/cli/lib/version-check.ts +385 -0
package/src/cli/main.ts +152 -0
package/src/cli/run.ts +181 -0
package/src/cli/start.ts +82 -0
package/src/cli/upgrade.ts +311 -0
package/src/cli/version.ts +22 -0
package/src/idle.ts +0 -6
package/src/index.ts +27 -27
package/src/llm-adapter.ts +100 -28
package/src/pipeline.ts +254 -177
package/src/recall.ts +223 -91
package/src/temporal-adapter.ts +3 -0
package/src/translate/anthropic.ts +50 -6
package/src/translate/types.ts +54 -9
package/dist/index.js.map +0 -7

package/src/recall.ts CHANGED Viewed

@@ -1,15 +1,19 @@
 /**
  * Gateway recall interception — transparent memory search for any client.
  *
- * Injects a `recall` tool into upstream requests and handles the response
- * transparently. Two strategies based on whether recall is the only tool:
+ * Uses a unified "Marker and Expand" strategy:
  *
- *  - **Case 1 (recall-only)**: "Pause and Continue" — pause client stream,
- *    execute recall, send follow-up request, resume streaming in the same
- *    HTTP response.
- *  - **Case 2 (mixed tools)**: "Strip and Inject" — suppress recall blocks
- *    from the client stream, execute recall in background, inject the result
- *    into the next request from the client.
+ *  1. **On response (to client):** The recall `tool_use` block is replaced
+ *     with a human-readable marker text block
+ *     (`📚 Searching <scope> for "<query>"…`). The recall is executed
+ *     internally and the result is stored in session state.
+ *
+ *  2. **On request (from client):** Marker text blocks in the conversation
+ *     are expanded back into the original `tool_use` + `tool_result` pairs
+ *     before forwarding upstream.
+ *
+ *  For recall-only responses, a follow-up call is still made internally
+ *  so the model can continue in the same HTTP response (seamless UX).
  *
  * All recall execution delegates to `runRecall()` from `@loreai/core`.
  */
@@ -28,7 +32,7 @@ import type {
   GatewayResponse,
   GatewayToolUseBlock,
   GatewayMessage,
-  PendingRecall,
+  RecallStore,
 } from "./translate/types";
 // ---------------------------------------------------------------------------
@@ -59,15 +63,205 @@ export const RECALL_GATEWAY_TOOL: GatewayTool = {
 export const RECALL_TOOL_NAME = "recall";
 // ---------------------------------------------------------------------------
-// Pending recall state (cross-request, Case 2)
+// Marker utilities — human-readable text ↔ recall tool round-trip
 // ---------------------------------------------------------------------------
-/** TTL for pending recall results — discard after 60 seconds. */
-const PENDING_RECALL_TTL_MS = 60_000;
+/**
+ * Scope → human-readable label used inside recall marker text.
+ *
+ * `LABEL_TO_SCOPE` is built by inverting this table, so every label must
+ * stay unique for the label → scope lookup to be unambiguous.
+ */
+const SCOPE_LABELS: Record<string, string> = {
+  all: "all archives",
+  session: "session history",
+  project: "project archives",
+  knowledge: "knowledge base",
+};
+/**
+ * Reverse: label → scope enum. Derived from `SCOPE_LABELS` by swapping each
+ * entry's key and value.
+ */
+const LABEL_TO_SCOPE: Record<string, RecallScope> = Object.fromEntries(
+  Object.entries(SCOPE_LABELS).map(([k, v]) => [v, k as RecallScope]),
+);
+/**
+ * Map a recall scope to a human-readable label.
+ *
+ * @param scope - Scope key to look up; defaults to "all" when omitted.
+ * @returns The matching label, or the label for "all" when `scope` is not
+ *   a key of `SCOPE_LABELS`.
+ */
+export function scopeToLabel(scope: string = "all"): string {
+  return SCOPE_LABELS[scope] ?? SCOPE_LABELS.all;
+}
-/** Check whether a pending recall is still valid (within TTL). */
-export function isPendingRecallValid(pending: PendingRecall): boolean {
-  return Date.now() - pending.timestamp < PENDING_RECALL_TTL_MS;
+/**
+ * Map a human-readable label back to a scope enum value.
+ *
+ * @param label - Label text as it appears in a marker (e.g. "session history").
+ * @returns The scope for that label, or "all" when the label is not a key
+ *   of `LABEL_TO_SCOPE`.
+ */
+export function labelToScope(label: string): RecallScope {
+  return LABEL_TO_SCOPE[label] ?? "all";
+}
+/**
+ * Build a marker text string for a recall tool call.
+ *
+ * Format: `📚 Searching <scope-label> for "<query>"…`
+ *
+ * The query is interpolated verbatim: quotes and line breaks inside it are
+ * not escaped.
+ *
+ * @param query - Search query to embed in the marker.
+ * @param scope - Recall scope, converted to its label via `scopeToLabel`
+ *   (so an unknown scope renders with the label for "all").
+ * @returns The marker string.
+ */
+export function buildRecallMarker(query: string, scope: string = "all"): string {
+  return `📚 Searching ${scopeToLabel(scope)} for "${query}"…`;
+}
+/**
+ * Regex to parse a recall marker back into query + scope.
+ *
+ * Capture group 1 is the scope label, group 2 the query. Limitations that
+ * follow directly from the pattern:
+ *  - It is unanchored, so a marker is matched anywhere inside a longer text.
+ *  - Both groups are `.+?`: each needs at least one character and cannot
+ *    span a line break, so an empty or multi-line query does not match.
+ *  - Both groups are lazy: a query that itself contains a double quote
+ *    followed by `…` is cut off at that point.
+ */
+const MARKER_REGEX = /📚 Searching (.+?) for "(.+?)"…/;
+/**
+ * Parse a recall marker text block, returning query and scope if valid.
+ * Returns null if the text doesn't match the marker format.
+ *
+ * @param text - Text block content to inspect.
+ * @returns The parsed query and scope (an unrecognised label maps to "all"
+ *   via `labelToScope`), or null when `MARKER_REGEX` does not match.
+ */
+export function parseRecallMarker(
+  text: string,
+): { query: string; scope: RecallScope } | null {
+  const match = MARKER_REGEX.exec(text);
+  if (!match) return null;
+  return {
+    query: match[2],
+    scope: labelToScope(match[1]),
+  };
+}
+/**
+ * Derive a store key from query + scope.
+ *
+ * @param query - Recall query text.
+ * @param scope - Recall scope; defaults to "all" when omitted.
+ * @returns The key in the form `${scope}:${query}`.
+ */
+export function recallStoreKey(query: string, scope: string = "all"): string {
+  return `${scope}:${query}`;
+}
+// ---------------------------------------------------------------------------
+// Marker expansion — restore tool_use + tool_result from markers on inbound
+// ---------------------------------------------------------------------------
+/**
+ * Find recall marker text blocks in the conversation and expand them
+ * back into tool_use + tool_result pairs for the upstream API.
+ *
+ * Scans ALL assistant messages (not just the last one) since markers
+ * persist across turns until gradient evicts the message.
+ *
+ * At most one marker — the first text block that parses — is handled per
+ * assistant message. If that marker has no entry in `store`, the message
+ * is left untouched (later markers in it are not tried).
+ *
+ * When any block other than tool_use follows the marker, the message is
+ * split into assistant(tool_use) → user(tool_result) →
+ * assistant(continuation). Otherwise the tool_result is prepended to the
+ * next user message, or a synthetic user message is inserted when the next
+ * message is missing or not a user message.
+ *
+ * Mutates the request in-place. Returns true if any expansion was performed.
+ *
+ * @param req - Request whose `messages` are rewritten in place.
+ * @param store - Stored recall results, looked up by
+ *   `recallStoreKey(query, scope)` of the parsed marker.
+ * @returns True if at least one marker was expanded.
+ */
+export function expandRecallMarkers(
+  req: GatewayRequest,
+  store: RecallStore,
+): boolean {
+  let expanded = false;
+  // Iterate forward; when we splice messages the index is adjusted.
+  for (let i = 0; i < req.messages.length; i++) {
+    const msg = req.messages[i];
+    if (msg.role !== "assistant") continue;
+    // Find the first (should be only) recall marker in this message.
+    // Only that one is expanded: later markers in the same message are
+    // not revisited (a split-off continuation is skipped via `i += 2`).
+    let markerIdx = -1;
+    let parsed: { query: string; scope: RecallScope } | null = null;
+    for (let j = 0; j < msg.content.length; j++) {
+      const block = msg.content[j];
+      if (block.type !== "text") continue;
+      parsed = parseRecallMarker(block.text);
+      if (parsed) {
+        markerIdx = j;
+        break;
+      }
+    }
+    if (markerIdx < 0 || !parsed) continue;
+    const key = recallStoreKey(parsed.query, parsed.scope);
+    const stored = store.get(key);
+    if (!stored) continue; // No stored result — leave marker as-is
+    // Check if there's non-tool content AFTER the marker in this message.
+    // This happens when recall-only follow-up piped continuation content
+    // (text blocks) into the same assistant message. Tool_use blocks after
+    // the marker are from the same turn (mixed tools) and stay together.
+    const afterMarker = msg.content.slice(markerIdx + 1);
+    const hasContinuationAfter = afterMarker.length > 0 &&
+      afterMarker.some((b) => b.type !== "tool_use");
+    // Replace marker with tool_use
+    msg.content[markerIdx] = {
+      type: "tool_use",
+      id: stored.toolUseId,
+      name: RECALL_TOOL_NAME,
+      input: stored.input,
+    };
+    // Truncate assistant message at the tool_use (remove continuation)
+    if (hasContinuationAfter) {
+      msg.content.length = markerIdx + 1;
+    }
+    // Build synthetic tool_result user message
+    const toolResultMsg: GatewayMessage = {
+      role: "user",
+      content: [
+        {
+          type: "tool_result",
+          toolUseId: stored.toolUseId,
+          content: stored.result,
+        },
+      ],
+    };
+    if (hasContinuationAfter) {
+      // Split: insert tool_result user message + continuation assistant
+      // message after the current assistant message.
+      const continuationMsg: GatewayMessage = {
+        role: "assistant",
+        content: afterMarker,
+      };
+      req.messages.splice(i + 1, 0, toolResultMsg, continuationMsg);
+      // Skip past the two newly inserted messages
+      i += 2;
+    } else {
+      // No split needed — insert tool_result into the following user message.
+      // Prepend (unshift) so the recall result appears before existing
+      // tool_results — matching the tool_use order in the assistant message.
+      const nextMsg = req.messages[i + 1];
+      if (nextMsg?.role === "user") {
+        nextMsg.content.unshift({
+          type: "tool_result",
+          toolUseId: stored.toolUseId,
+          content: stored.result,
+        });
+      } else {
+        // No following user message — insert a synthetic one
+        req.messages.splice(i + 1, 0, toolResultMsg);
+        i += 1;
+      }
+    }
+    expanded = true;
+  }
+  return expanded;
+}
+/**
+ * Clean up orphaned recall store entries whose markers no longer
+ * appear in the conversation (e.g. gradient evicted the turn).
+ *
+ * Only text blocks of assistant messages that parse as a marker count as
+ * references; every other key is deleted from `store`.
+ *
+ * NOTE(review): `expandRecallMarkers` replaces marker text blocks with
+ * tool_use blocks, so running this on a request after it has been expanded
+ * would treat those entries as orphaned — confirm the call order at the
+ * call site.
+ *
+ * @param req - Request whose messages are scanned; it is not modified.
+ * @param store - Recall store; unreferenced keys are deleted in place.
+ */
+export function cleanupRecallStore(
+  req: GatewayRequest,
+  store: RecallStore,
+): void {
+  if (store.size === 0) return;
+  // Collect all marker keys still present in assistant messages
+  const activeKeys = new Set<string>();
+  for (const msg of req.messages) {
+    if (msg.role !== "assistant") continue;
+    for (const block of msg.content) {
+      if (block.type !== "text") continue;
+      const parsed = parseRecallMarker(block.text);
+      if (parsed) {
+        activeKeys.add(recallStoreKey(parsed.query, parsed.scope));
+      }
+    }
+  }
+  // Remove entries not referenced by any current marker (deleting from a Map while iterating its keys is safe)
+  for (const key of store.keys()) {
+    if (!activeKeys.has(key)) {
+      store.delete(key);
+    }
+  }
 }
 // ---------------------------------------------------------------------------
@@ -212,90 +406,28 @@ export function buildRecallFollowUp(
 }
 // ---------------------------------------------------------------------------
-// Pending recall injection (Case 2: next request enrichment)
-// ---------------------------------------------------------------------------
-/**
- * Inject a pending recall result into the current request.
- *
- * Finds the last assistant message in `req.messages`, inserts the recall
- * tool_use block at the recorded position, and inserts a tool_result block
- * into the following user message.
- *
- * Mutates the request in-place for efficiency. Returns true if injection
- * was performed, false if the conversation structure didn't match
- * (e.g., no trailing assistant→user pair).
- */
-export function injectPendingRecall(
-  req: GatewayRequest,
-  pending: PendingRecall,
-): boolean {
-  const messages = req.messages;
-  if (messages.length < 2) return false;
-  // Find the last assistant message followed by a user message.
-  // The pending recall was from the previous turn's assistant response.
-  let assistantIdx = -1;
-  for (let i = messages.length - 2; i >= 0; i--) {
-    if (
-      messages[i].role === "assistant" &&
-      messages[i + 1]?.role === "user"
-    ) {
-      assistantIdx = i;
-      break;
-    }
-  }
-  if (assistantIdx < 0) {
-    log.warn("injectPendingRecall: no assistant→user pair found");
-    return false;
-  }
-  const assistantMsg = messages[assistantIdx];
-  const userMsg = messages[assistantIdx + 1];
-  // Insert recall tool_use into assistant message at the recorded position.
-  // Clamp to content length in case the message was modified by gradient.
-  const insertPos = Math.min(pending.position, assistantMsg.content.length);
-  const recallToolUse: GatewayToolUseBlock = {
-    type: "tool_use",
-    id: pending.toolUseId,
-    name: RECALL_TOOL_NAME,
-    input: pending.input,
-  };
-  assistantMsg.content.splice(insertPos, 0, recallToolUse);
-  // Insert recall tool_result into the user message.
-  // Add it at the beginning alongside any other tool_results.
-  userMsg.content.unshift({
-    type: "tool_result",
-    toolUseId: pending.toolUseId,
-    content: pending.result,
-  });
-  // Strip recall from tools list for this request
-  req.tools = req.tools.filter((t) => t.name !== RECALL_TOOL_NAME);
-  return true;
-}
-// ---------------------------------------------------------------------------
-// Response content stripping (Case 2: remove recall from response)
+// Response content rewriting — replace recall tool_use with marker text
 // ---------------------------------------------------------------------------
 /**
- * Build a GatewayResponse with recall tool_use blocks removed.
+ * Build a GatewayResponse with recall tool_use blocks replaced by marker text.
  *
- * Used for Case 2 to produce a clean response for `postResponse` storage
- * that excludes the gateway-internal recall blocks.
+ * Used for both recall-only and mixed-tools cases to produce a response
+ * where the client sees human-readable markers instead of tool call mechanics.
+ *
+ * A missing or non-string `query` is rendered as an empty string. Such a
+ * marker does not match `MARKER_REGEX` (its query group needs at least one
+ * character), so `parseRecallMarker` will not recognise it later.
+ * `scope` is cast rather than validated; it defaults to "all" only when
+ * null/undefined, and `buildRecallMarker` renders unknown values with the
+ * label for "all".
+ *
+ * @param resp - Response to rewrite; the input object is not mutated.
+ * @returns A shallow copy of `resp` with a new `content` array in which
+ *   each recall tool_use block is replaced by a text block.
  */
-export function stripRecallFromResponse(
+export function replaceRecallWithMarker(
   resp: GatewayResponse,
 ): GatewayResponse {
   return {
     ...resp,
-    content: resp.content.filter(
-      (b) => !(b.type === "tool_use" && b.name === RECALL_TOOL_NAME),
-    ),
+    content: resp.content.map((b) => {
+      if (b.type === "tool_use" && b.name === RECALL_TOOL_NAME) {
+        const input = b.input as Record<string, unknown>;
+        const query = typeof input.query === "string" ? input.query : "";
+        const scope = (input.scope as string) ?? "all";
+        return { type: "text" as const, text: buildRecallMarker(query, scope) };
+      }
+      return b;
+    }),
   };
 }

package/src/temporal-adapter.ts CHANGED Viewed

@@ -97,6 +97,9 @@ function contentBlockToPart(
         messageID,
         type: "reasoning",
         text: block.thinking,
+        ...(block.signature != null
+          ? { signature: block.signature }
+          : undefined),
       } satisfies LoreReasoningPart;
     case "tool_use":

package/src/translate/anthropic.ts CHANGED Viewed

@@ -265,6 +265,24 @@ export type AnthropicCacheOptions = {
    */
   systemTTL?: "5m" | "1h" | false;
+  /**
+   * LTM knowledge text to inject as a separate system block after the host
+   * prompt. Keeping it in a separate block means the host prompt gets its
+   * own cache breakpoint (1h) and LTM changes don't bust the host prefix.
+   *
+   * When provided AND systemTTL is set, the system becomes a 2-block array:
+   *   system[0]: host prompt  — cache_control with systemTTL
+   *   system[1]: LTM content  — no cache_control (benefits from prefix)
+   */
+  ltmSystem?: string;
+  /**
+   * Cache the last tool definition with an explicit 1h breakpoint.
+   * Tool definitions (including our injected recall tool) are stable
+   * across turns — caching them avoids re-processing on every request.
+   */
+  cacheTools?: boolean;
   /**
    * Place an explicit `cache_control` breakpoint on the last block of the
    * last message, enabling Anthropic to cache the conversation prefix.
@@ -329,19 +347,33 @@ export function buildAnthropicRequest(
   // System — only include if non-empty
   if (req.system) {
     const systemTTL = cache?.systemTTL;
+    const ltmText = cache?.ltmSystem;
     if (systemTTL) {
-      // Send as block array with explicit cache_control breakpoint.
-      // This creates a stable cache slot for the system prompt — it changes
-      // only when LTM entries are added/removed or AGENTS.md is updated.
+      // Send as block array with explicit cache_control breakpoint on the
+      // host prompt. The host prompt is the most stable part (changes only
+      // when the host mutates AGENTS.md, memory, etc.) so it gets a 1h TTL.
       const cacheControl: Record<string, string> =
         systemTTL === "1h"
           ? { type: "ephemeral", ttl: "1h" }
           : { type: "ephemeral" };
-      body.system = [
+      const blocks: Record<string, unknown>[] = [
         { type: "text", text: req.system, cache_control: cacheControl },
       ];
+      // LTM knowledge as a separate block — no cache_control of its own,
+      // but benefits from the host prompt prefix cache. When LTM changes,
+      // only this block and everything after it is re-processed; the host
+      // prompt prefix is still a cache read.
+      if (ltmText) {
+        blocks.push({ type: "text", text: ltmText });
+      }
+      body.system = blocks;
     } else {
-      body.system = req.system;
+      // No caching — concatenate LTM into a single string.
+      body.system = ltmText ? `${req.system}\n\n${ltmText}` : req.system;
     }
   }
@@ -368,11 +400,23 @@ export function buildAnthropicRequest(
   // Tools — only include if present
   if (req.tools.length > 0) {
-    body.tools = req.tools.map((t) => ({
+    const tools = req.tools.map((t) => ({
       name: t.name,
       description: t.description,
       input_schema: t.inputSchema,
     }));
+    // Tool caching: place a 1h breakpoint on the last tool definition.
+    // Tool definitions (including our recall tool) are stable across turns.
+    if (cache?.cacheTools && tools.length > 0) {
+      const lastTool = tools[tools.length - 1]!;
+      (lastTool as Record<string, unknown>).cache_control = {
+        type: "ephemeral",
+        ttl: "1h",
+      };
+    }
+    body.tools = tools;
   }
   // Restore all metadata params (temperature, top_p, stop_sequences, etc.)

package/src/translate/types.ts CHANGED Viewed

@@ -139,27 +139,70 @@ export type GatewayResponse = {
 };
 // ---------------------------------------------------------------------------
-// Pending recall state (cross-request, gateway recall interception)
+// Recall store (cross-request, gateway recall interception)
 // ---------------------------------------------------------------------------
-/** Pending recall result stored between requests (Case 2: mixed tools). */
-export type PendingRecall = {
-  /** tool_use ID from the suppressed block. */
+/**
+ * Stored recall result for marker-based round-trip expansion.
+ *
+ * NOTE(review): `expandRecallMarkers` in recall.ts reads only `toolUseId`,
+ * `input` and `result` from these entries — confirm whether `position` is
+ * still needed.
+ */
+export type StoredRecall = {
+  /** The tool_use ID to reconstruct in the upstream request. */
   toolUseId: string;
-  /** The original recall input (for conversation history reconstruction). */
+  /** Original recall input (query + scope). */
   input: { query: string; scope?: string };
   /** Position (content block index) in the original assistant message. */
   position: number;
   /** Executed recall result (formatted markdown). */
   result: string;
-  /** Timestamp for TTL-based cleanup. */
-  timestamp: number;
 };
+/**
+ * Map from marker key (`${scope}:${query}`) → stored recall data.
+ * The key has the same shape as the string returned by `recallStoreKey`
+ * in recall.ts.
+ */
+export type RecallStore = Map<string, StoredRecall>;
 // ---------------------------------------------------------------------------
 // Session state — per-session tracking for Lore pipeline integration
 // ---------------------------------------------------------------------------
+/**
+ * Per-turn cache analysis emitted as structured log data.
+ *
+ * Combines the cache token counts reported by the API response with a
+ * byte-level prefix comparison of this turn's request body against the
+ * previous turn's.
+ */
+export type CacheTurnAnalysis = {
+  /** Turn number within this session. */
+  turn: number;
+  // --- Ground truth from API response ---
+  /** Tokens served from prompt cache (hit). */
+  cacheRead: number;
+  /** Tokens written to prompt cache (miss / new). */
+  cacheCreation: number;
+  /** Uncached input tokens. */
+  inputTokens: number;
+  /** cacheRead / total input — 0..1. NOTE(review): "total input" is presumably cacheRead + cacheCreation + inputTokens; confirm. */
+  cacheHitRate: number;
+  // --- Request body prefix comparison ---
+  /** Bytes matching from start of serialized request body vs previous turn. */
+  prefixMatchBytes: number;
+  /** prefixMatchBytes / min(prev, current) body length — a 0..1 fraction despite the "Percent" name. */
+  prefixMatchPercent: number;
+  /** Semantic location of the first divergence (e.g. "messages[3].content[1]"). */
+  divergencePoint: string;
+  /** Human-readable reason (e.g. "system prompt changed", "new message appended"). */
+  divergenceReason: string;
+};
+/**
+ * Per-session cache analytics state.
+ *
+ * Holds what is carried from one turn to the next: the previous request
+ * body (compressed) for prefix comparison, the last API cache counters,
+ * and running totals.
+ */
+export type CacheAnalytics = {
+  /** Deflate-compressed serialized request body from the last turn. NOTE(review): presumably null until a first turn is recorded; confirm. */
+  lastRequestBody: Uint8Array | null;
+  /** Uncompressed byte length of lastRequestBody (for prefix match %). */
+  lastRequestBodyLength: number;
+  /** cache_read_input_tokens from last API response. */
+  lastCacheRead: number;
+  /** cache_creation_input_tokens from last API response. */
+  lastCacheCreation: number;
+  /** Total turns observed. */
+  turnCount: number;
+  /** Confirmed busts (API returned cacheRead=0 with cacheCreation>0). */
+  bustCount: number;
+};
 /** Per-session state tracked by the gateway for Lore pipeline decisions. */
 export type SessionState = {
   sessionID: string;
@@ -172,6 +215,8 @@ export type SessionState = {
   messageCount: number;
   /** Turns since last curation run — triggers background curation. */
   turnsSinceCuration: number;
-  /** Pending recall result from previous turn (Case 2: mixed tool interception). */
-  pendingRecall?: PendingRecall;
+  /** Stored recall results for marker-based round-trip expansion. */
+  recallStore: RecallStore;
+  /** Cache analytics — request body prefix comparison + API cache fields. */
+  cacheAnalytics: CacheAnalytics;
 };