npm - @poncho-ai/harness - Versions diffs - 0.59.6 → 0.59.7 - Mend

@poncho-ai/harness 0.59.6 → 0.59.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/.turbo/turbo-build.log CHANGED Viewed

@@ -1,5 +1,5 @@
-> @poncho-ai/harness@0.59.6 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
+> @poncho-ai/harness@0.59.7 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
 > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
 [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -9,8 +9,8 @@
 CLI Target: es2022
 ESM Build start
 ESM dist/isolate-F2PPSUL6.js 53.82 KB
-ESM dist/index.js            557.73 KB
-ESM ⚡️ Build success in 256ms
+ESM dist/index.js            558.06 KB
+ESM ⚡️ Build success in 257ms
 DTS Build start
-DTS ⚡️ Build success in 8276ms
+DTS ⚡️ Build success in 7680ms
 DTS dist/index.d.ts 101.66 KB

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,18 @@
 # @poncho-ai/harness
+## 0.59.7
+### Patch Changes
+- [`c73cb19`](https://github.com/cesr/poncho-ai/commit/c73cb19ec8bf61fe0598262ae4d050fb84c939b5) Thanks [@cesr](https://github.com/cesr)! - Auto-compaction never fired on cached conversations: the per-step context
+  measure (`latestContextTokens`) used `usage.inputTokens`, which with
+  Anthropic prompt caching is only the NON-cached slice — a real 190k+
+  conversation reported ~12k of "context", so the trigger comparison never
+  tripped and transcripts grew past the model's window. Context now counts
+  input + cache-read + cache-write tokens (everything the model read). Also
+  pins claude-fable-5 / opus-4-8 / opus-4-7 in the context-window registry
+  (previously relying on the silent 200k default).
 ## 0.59.6
 ### Patch Changes

package/dist/index.js CHANGED Viewed

@@ -7490,6 +7490,12 @@ var completeOpenAICodexDeviceAuth = async (request) => {
 // src/model-factory.ts
 var MODEL_CONTEXT_WINDOWS = {
+  // Pinned conservatively at 200k. The API has accepted >204k for fable-5
+  // (its real window is larger), but compacting at trigger×200k keeps
+  // long-conversation cost bounded; raise deliberately, not by omission.
+  "claude-fable-5": 2e5,
+  "claude-opus-4-8": 2e5,
+  "claude-opus-4-7": 2e5,
   "claude-opus-4-6": 2e5,
   "claude-sonnet-4-6": 2e5,
   "claude-opus-4-5": 2e5,
@@ -11362,7 +11368,7 @@ ${textContent}` };
           totalOutputTokens += usage.outputTokens ?? 0;
           totalCachedTokens += stepCachedTokens;
           totalCacheWriteTokens += stepCacheWriteTokens;
-          latestContextTokens = stepInputTokens;
+          latestContextTokens = stepInputTokens + stepCachedTokens + stepCacheWriteTokens;
           toolOutputEstimateSinceModel = 0;
           yield pushEvent({
             type: "model:response",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@poncho-ai/harness",
-  "version": "0.59.6",
+  "version": "0.59.7",
   "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
   "repository": {
     "type": "git",

package/src/harness.ts CHANGED Viewed

@@ -3208,7 +3208,14 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
       totalOutputTokens += usage.outputTokens ?? 0;
       totalCachedTokens += stepCachedTokens;
       totalCacheWriteTokens += stepCacheWriteTokens;
-      latestContextTokens = stepInputTokens;
+      // Context size = EVERYTHING the model read this step. With prompt
+      // caching, Anthropic's `usage.input_tokens` is only the non-cached
+      // slice — the bulk of a long conversation arrives as cache reads.
+      // Counting input alone made the auto-compaction check see ~12k of
+      // "context" on a real 190k+ conversation, so compaction never fired
+      // and the transcript grew unbounded (observed 2026-06-12: 205k real
+      // context, trigger at 190k, no compaction).
+      latestContextTokens = stepInputTokens + stepCachedTokens + stepCacheWriteTokens;
       toolOutputEstimateSinceModel = 0;
       yield pushEvent({

package/src/model-factory.ts CHANGED Viewed

@@ -9,6 +9,12 @@ import {
 export type ModelProviderFactory = (modelName: string) => LanguageModel;
 const MODEL_CONTEXT_WINDOWS: Record<string, number> = {
+  // Pinned conservatively at 200k. The API has accepted >204k for fable-5
+  // (its real window is larger), but compacting at trigger×200k keeps
+  // long-conversation cost bounded; raise deliberately, not by omission.
+  "claude-fable-5": 200_000,
+  "claude-opus-4-8": 200_000,
+  "claude-opus-4-7": 200_000,
   "claude-opus-4-6": 200_000,
   "claude-sonnet-4-6": 200_000,
   "claude-opus-4-5": 200_000,