@poncho-ai/harness 0.37.1 → 0.37.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @poncho-ai/harness@0.37.1 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
2
+ > @poncho-ai/harness@0.37.2 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
3
3
  > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
4
4
 
5
5
  [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
8
8
  CLI tsup v8.5.1
9
9
  CLI Target: es2022
10
10
  ESM Build start
11
- ESM dist/index.js 389.90 KB
11
+ ESM dist/index.js 390.92 KB
12
12
  ESM dist/isolate-TCWTUVG4.js 47.34 KB
13
- ESM ⚡️ Build success in 211ms
13
+ ESM ⚡️ Build success in 247ms
14
14
  DTS Build start
15
- DTS ⚡️ Build success in 6845ms
15
+ DTS ⚡️ Build success in 7644ms
16
16
  DTS dist/index.d.ts 56.62 KB
package/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # @poncho-ai/harness
2
2
 
3
+ ## 0.37.2
4
+
5
+ ### Patch Changes
6
+
7
+ - [`2229f74`](https://github.com/cesr/poncho-ai/commit/2229f74ae4d02c5618c60787a7db925060cc1313) Thanks [@cesr](https://github.com/cesr)! - fix: stop invalidating the prompt cache across runs and preserve cache reads when tool results are in flight.
8
+
9
+ Two issues were degrading prompt-cache hit rates to ~0 across runs:
10
+ 1. The system prompt embedded `new Date().toISOString()` (millisecond precision) on every run when a reminder store was active, which changed the very first block of the prefix and prevented any cross-run cache match. The timestamp is now quantized to the hour, which keeps the system prompt stable across runs while still giving the agent a usable sense of time.
11
+ 2. When the message history contained untruncated tool results from the previous run, prompt caching was disabled entirely — no `cache_control` breakpoint was emitted, which also killed cache _reads_ of the stable prefix (system prompt + earlier turns). The breakpoint is now placed immediately before the first untruncated tool result instead, so the stable prefix is still cached and read while the soon-to-be-truncated tail stays out of the cache.
12
+
13
+ `addPromptCacheBreakpoints` now takes an optional `targetIndex` to support this.
14
+
3
15
  ## 0.37.1
4
16
 
5
17
  ### Patch Changes
package/dist/index.js CHANGED
@@ -6659,15 +6659,19 @@ function isAnthropicModel(model) {
6659
6659
  }
6660
6660
  return model.provider === "anthropic" || model.provider.includes("anthropic") || model.modelId.includes("anthropic") || model.modelId.includes("claude");
6661
6661
  }
6662
- function addPromptCacheBreakpoints(messages, model) {
6662
+ function addPromptCacheBreakpoints(messages, model, targetIndex) {
6663
6663
  if (messages.length === 0 || !isAnthropicModel(model)) {
6664
6664
  return messages;
6665
6665
  }
6666
+ const index = targetIndex ?? messages.length - 1;
6667
+ if (index < 0 || index >= messages.length) {
6668
+ return messages;
6669
+ }
6666
6670
  const cacheDirective = {
6667
6671
  anthropic: { cacheControl: { type: "ephemeral" } }
6668
6672
  };
6669
- return messages.map((message, index) => {
6670
- if (index === messages.length - 1) {
6673
+ return messages.map((message, i) => {
6674
+ if (i === index) {
6671
6675
  return {
6672
6676
  ...message,
6673
6677
  providerOptions: {
@@ -7800,6 +7804,25 @@ var hasUntruncatedToolResults = (messages) => {
7800
7804
  }
7801
7805
  return false;
7802
7806
  };
7807
+ var findLastStableCacheIndex = (messages) => {
7808
+ for (let i = 0; i < messages.length; i += 1) {
7809
+ const msg = messages[i];
7810
+ if (msg.role !== "tool") continue;
7811
+ if (!Array.isArray(msg.content)) continue;
7812
+ for (const part of msg.content) {
7813
+ if (!part || typeof part !== "object") continue;
7814
+ const p = part;
7815
+ if (p.type !== "tool-result" || !p.output) continue;
7816
+ if (p.output.type === "json") return i - 1;
7817
+ if (p.output.type === "text" && typeof p.output.value === "string") {
7818
+ if (!p.output.value.startsWith(TOOL_RESULT_TRUNCATED_PREFIX)) {
7819
+ return i - 1;
7820
+ }
7821
+ }
7822
+ }
7823
+ }
7824
+ return messages.length - 1;
7825
+ };
7803
7826
  var DEVELOPMENT_MODE_CONTEXT = `## Development Mode Context
7804
7827
 
7805
7828
  You are running locally in development mode. Treat this as an editable agent workspace.
@@ -9072,14 +9095,13 @@ var AgentHarness = class _AgentHarness {
9072
9095
  );
9073
9096
  }
9074
9097
  const hasFullToolResults = hasUntruncatedToolResults(messages);
9075
- const enablePromptCache = !hasFullToolResults;
9076
- if (!enablePromptCache) {
9098
+ if (hasFullToolResults) {
9077
9099
  console.info(
9078
- `[poncho][cost] Prompt cache write disabled for run "${runId}" (untruncated tool results present in history).`
9100
+ `[poncho][cost] Prompt cache breakpoint will be placed before untruncated tool results for run "${runId}" (stable prefix only).`
9079
9101
  );
9080
9102
  } else {
9081
9103
  console.info(
9082
- `[poncho][cost] Prompt cache write enabled for run "${runId}" (history has no untruncated tool results).`
9104
+ `[poncho][cost] Prompt cache breakpoint will be placed at history tail for run "${runId}" (no untruncated tool results).`
9083
9105
  );
9084
9106
  }
9085
9107
  const inputMessageCount = messages.length;
@@ -9174,9 +9196,14 @@ Code is wrapped in an async IIFE \u2014 use \`return\` to return a value to the
9174
9196
  const promptWithSkills = this.skillContextWindow ? `${agentPrompt}${developmentContext}
9175
9197
 
9176
9198
  ${this.skillContextWindow}${browserContext}${fsContext}${isolateContext}` : `${agentPrompt}${developmentContext}${browserContext}${fsContext}${isolateContext}`;
9199
+ const hourlyTime = (() => {
9200
+ const d = /* @__PURE__ */ new Date();
9201
+ d.setUTCMinutes(0, 0, 0);
9202
+ return d.toISOString();
9203
+ })();
9177
9204
  const timeContext = this.reminderStore ? `
9178
9205
 
9179
- Current UTC time: ${(/* @__PURE__ */ new Date()).toISOString()}` : "";
9206
+ Current UTC time (hour precision): ${hourlyTime}` : "";
9180
9207
  return `${promptWithSkills}${memoryContext}${todoContext}${timeContext}`;
9181
9208
  };
9182
9209
  let systemPrompt = buildSystemPrompt();
@@ -9615,7 +9642,12 @@ ${textContent}` };
9615
9642
  const coreMessages = cachedCoreMessages;
9616
9643
  const temperature = agent.frontmatter.model?.temperature ?? 0.2;
9617
9644
  const maxTokens = agent.frontmatter.model?.maxTokens;
9618
- const cachedMessages = enablePromptCache ? addPromptCacheBreakpoints(coreMessages, modelInstance) : coreMessages;
9645
+ const breakpointIndex = hasFullToolResults ? findLastStableCacheIndex(coreMessages) : coreMessages.length - 1;
9646
+ const cachedMessages = addPromptCacheBreakpoints(
9647
+ coreMessages,
9648
+ modelInstance,
9649
+ breakpointIndex
9650
+ );
9619
9651
  const telemetryEnabled = this.loadedConfig?.telemetry?.enabled !== false;
9620
9652
  const result = await streamText({
9621
9653
  model: modelInstance,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@poncho-ai/harness",
3
- "version": "0.37.1",
3
+ "version": "0.37.2",
4
4
  "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
5
5
  "repository": {
6
6
  "type": "git",
package/src/harness.ts CHANGED
@@ -333,6 +333,39 @@ const hasUntruncatedToolResults = (messages: Message[]): boolean => {
333
333
  return false;
334
334
  };
335
335
 
336
+ /**
337
+ * Finds the last ModelMessage index that's safe to place a prompt cache
338
+ * breakpoint at — i.e. the last index before any untruncated tool-result.
339
+ *
340
+ * Untruncated tool-results from a prior run will be truncated on the next
341
+ * run, which would invalidate any cache write covering them. Placing the
342
+ * breakpoint just before them lets us cache only the stable prefix (system
343
+ * prompt + earlier turns) while still reading it back next turn.
344
+ *
345
+ * Returns `messages.length - 1` when there are no untruncated tool-results
346
+ * (normal tail-of-history caching).
347
+ */
348
+ const findLastStableCacheIndex = (messages: ModelMessage[]): number => {
349
+ for (let i = 0; i < messages.length; i += 1) {
350
+ const msg = messages[i]!;
351
+ if (msg.role !== "tool") continue;
352
+ if (!Array.isArray(msg.content)) continue;
353
+ for (const part of msg.content) {
354
+ if (!part || typeof part !== "object") continue;
355
+ const p = part as { type?: string; output?: { type?: string; value?: unknown } };
356
+ if (p.type !== "tool-result" || !p.output) continue;
357
+ // JSON outputs bypass truncation (only text content is truncated).
358
+ if (p.output.type === "json") return i - 1;
359
+ if (p.output.type === "text" && typeof p.output.value === "string") {
360
+ if (!p.output.value.startsWith(TOOL_RESULT_TRUNCATED_PREFIX)) {
361
+ return i - 1;
362
+ }
363
+ }
364
+ }
365
+ }
366
+ return messages.length - 1;
367
+ };
368
+
336
369
  const DEVELOPMENT_MODE_CONTEXT = `## Development Mode Context
337
370
 
338
371
  You are running locally in development mode. Treat this as an editable agent workspace.
@@ -1799,16 +1832,15 @@ export class AgentHarness {
1799
1832
  );
1800
1833
  }
1801
1834
  const hasFullToolResults = hasUntruncatedToolResults(messages);
1802
- const enablePromptCache = !hasFullToolResults;
1803
- if (!enablePromptCache) {
1835
+ if (hasFullToolResults) {
1804
1836
  console.info(
1805
- `[poncho][cost] Prompt cache write disabled for run "${runId}" ` +
1806
- `(untruncated tool results present in history).`,
1837
+ `[poncho][cost] Prompt cache breakpoint will be placed before untruncated ` +
1838
+ `tool results for run "${runId}" (stable prefix only).`,
1807
1839
  );
1808
1840
  } else {
1809
1841
  console.info(
1810
- `[poncho][cost] Prompt cache write enabled for run "${runId}" ` +
1811
- `(history has no untruncated tool results).`,
1842
+ `[poncho][cost] Prompt cache breakpoint will be placed at history tail ` +
1843
+ `for run "${runId}" (no untruncated tool results).`,
1812
1844
  );
1813
1845
  }
1814
1846
  const inputMessageCount = messages.length;
@@ -1917,8 +1949,17 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
1917
1949
  const promptWithSkills = this.skillContextWindow
1918
1950
  ? `${agentPrompt}${developmentContext}\n\n${this.skillContextWindow}${browserContext}${fsContext}${isolateContext}`
1919
1951
  : `${agentPrompt}${developmentContext}${browserContext}${fsContext}${isolateContext}`;
1952
+ // Quantize to the hour so the system prompt is stable across runs
1953
+ // within the same hour. Including a per-millisecond timestamp would
1954
+ // invalidate the prompt cache on every run, since the system prompt
1955
+ // is the first block the cache tries to match.
1956
+ const hourlyTime = (() => {
1957
+ const d = new Date();
1958
+ d.setUTCMinutes(0, 0, 0);
1959
+ return d.toISOString();
1960
+ })();
1920
1961
  const timeContext = this.reminderStore
1921
- ? `\n\nCurrent UTC time: ${new Date().toISOString()}`
1962
+ ? `\n\nCurrent UTC time (hour precision): ${hourlyTime}`
1922
1963
  : "";
1923
1964
  return `${promptWithSkills}${memoryContext}${todoContext}${timeContext}`;
1924
1965
  };
@@ -2452,9 +2493,17 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
2452
2493
 
2453
2494
  const temperature = agent.frontmatter.model?.temperature ?? 0.2;
2454
2495
  const maxTokens = agent.frontmatter.model?.maxTokens;
2455
- const cachedMessages = enablePromptCache
2456
- ? addPromptCacheBreakpoints(coreMessages, modelInstance)
2457
- : coreMessages;
2496
+ // Place the breakpoint before any untruncated tool-result so we
2497
+ // cache only the stable prefix when prior-run tool results are
2498
+ // still full-fidelity. Otherwise cache at the history tail.
2499
+ const breakpointIndex = hasFullToolResults
2500
+ ? findLastStableCacheIndex(coreMessages)
2501
+ : coreMessages.length - 1;
2502
+ const cachedMessages = addPromptCacheBreakpoints(
2503
+ coreMessages,
2504
+ modelInstance,
2505
+ breakpointIndex,
2506
+ );
2458
2507
 
2459
2508
  const telemetryEnabled = this.loadedConfig?.telemetry?.enabled !== false;
2460
2509
 
@@ -17,23 +17,32 @@ function isAnthropicModel(model: LanguageModel): boolean {
17
17
  * explicit opt-in (Anthropic). For providers with automatic caching
18
18
  * (OpenAI), messages are returned unchanged.
19
19
  *
20
- * For Anthropic, marks the last message with ephemeral cache control so the
21
- * conversation prefix is incrementally cached across steps.
20
+ * For Anthropic, marks the target message with ephemeral cache control so
21
+ * the conversation prefix is incrementally cached across steps. When
22
+ * `targetIndex` is omitted, the last message is used (default behavior).
23
+ * Callers that want to cache only a stable prefix (e.g. skipping tool
24
+ * results that will be truncated next turn) can pass an earlier index.
22
25
  */
23
26
  export function addPromptCacheBreakpoints(
24
27
  messages: ModelMessage[],
25
28
  model: LanguageModel,
29
+ targetIndex?: number,
26
30
  ): ModelMessage[] {
27
31
  if (messages.length === 0 || !isAnthropicModel(model)) {
28
32
  return messages;
29
33
  }
30
34
 
35
+ const index = targetIndex ?? messages.length - 1;
36
+ if (index < 0 || index >= messages.length) {
37
+ return messages;
38
+ }
39
+
31
40
  const cacheDirective = {
32
41
  anthropic: { cacheControl: { type: "ephemeral" as const } },
33
42
  };
34
43
 
35
- return messages.map((message, index) => {
36
- if (index === messages.length - 1) {
44
+ return messages.map((message, i) => {
45
+ if (i === index) {
37
46
  return {
38
47
  ...message,
39
48
  providerOptions: {
@@ -617,7 +617,7 @@ description: Safe skill
617
617
  script: "../outside.ts",
618
618
  }, stubContext);
619
619
  expect(result).toMatchObject({
620
- error: expect.stringContaining("must be relative and within the allowed directory"),
620
+ error: expect.stringContaining("Expected a relative path"),
621
621
  });
622
622
  });
623
623