@loreai/core 0.19.0 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/dist/bun/agents-file.d.ts.map +1 -1
  2. package/dist/bun/config.d.ts +1 -1
  3. package/dist/bun/config.d.ts.map +1 -1
  4. package/dist/bun/db.d.ts +13 -1
  5. package/dist/bun/db.d.ts.map +1 -1
  6. package/dist/bun/embedding.d.ts.map +1 -1
  7. package/dist/bun/git.d.ts.map +1 -1
  8. package/dist/bun/gradient.d.ts +39 -13
  9. package/dist/bun/gradient.d.ts.map +1 -1
  10. package/dist/bun/hosted.d.ts +36 -0
  11. package/dist/bun/hosted.d.ts.map +1 -0
  12. package/dist/bun/index.d.ts +3 -2
  13. package/dist/bun/index.d.ts.map +1 -1
  14. package/dist/bun/index.js +295 -235
  15. package/dist/bun/index.js.map +4 -4
  16. package/dist/bun/lat-reader.d.ts.map +1 -1
  17. package/dist/node/agents-file.d.ts.map +1 -1
  18. package/dist/node/config.d.ts +1 -1
  19. package/dist/node/config.d.ts.map +1 -1
  20. package/dist/node/db.d.ts +13 -1
  21. package/dist/node/db.d.ts.map +1 -1
  22. package/dist/node/embedding.d.ts.map +1 -1
  23. package/dist/node/git.d.ts.map +1 -1
  24. package/dist/node/gradient.d.ts +39 -13
  25. package/dist/node/gradient.d.ts.map +1 -1
  26. package/dist/node/hosted.d.ts +36 -0
  27. package/dist/node/hosted.d.ts.map +1 -0
  28. package/dist/node/index.d.ts +3 -2
  29. package/dist/node/index.d.ts.map +1 -1
  30. package/dist/node/index.js +295 -235
  31. package/dist/node/index.js.map +4 -4
  32. package/dist/node/lat-reader.d.ts.map +1 -1
  33. package/dist/types/agents-file.d.ts.map +1 -1
  34. package/dist/types/config.d.ts +1 -1
  35. package/dist/types/config.d.ts.map +1 -1
  36. package/dist/types/db.d.ts +13 -1
  37. package/dist/types/db.d.ts.map +1 -1
  38. package/dist/types/embedding.d.ts.map +1 -1
  39. package/dist/types/git.d.ts.map +1 -1
  40. package/dist/types/gradient.d.ts +39 -13
  41. package/dist/types/gradient.d.ts.map +1 -1
  42. package/dist/types/hosted.d.ts +36 -0
  43. package/dist/types/hosted.d.ts.map +1 -0
  44. package/dist/types/index.d.ts +3 -2
  45. package/dist/types/index.d.ts.map +1 -1
  46. package/dist/types/lat-reader.d.ts.map +1 -1
  47. package/package.json +2 -1
  48. package/src/agents-file.ts +12 -0
  49. package/src/config.ts +14 -17
  50. package/src/db.ts +39 -6
  51. package/src/embedding.ts +43 -5
  52. package/src/git.ts +4 -0
  53. package/src/gradient.ts +167 -145
  54. package/src/hosted.ts +46 -0
  55. package/src/index.ts +9 -4
  56. package/src/lat-reader.ts +4 -0
package/src/embedding.ts CHANGED
@@ -28,6 +28,27 @@ import type {
28
28
  * embedding calls but bounded enough to avoid minutes-long hangs. */
29
29
  const EMBED_TIMEOUT_MS = 10_000;
30
30
 
31
+ /**
32
+ * Safe per-text character limit for local ONNX inference. The Nomic v1.5 model
33
+ * supports up to 8192 tokens, but ONNX runtime OOMs on inputs near that ceiling
34
+ * (error codes 284432024, 287180544, 144786472). Pre-truncating to ~4096 tokens
35
+ * worth of characters keeps the tensor well within safe allocation bounds.
36
+ * The worker's `truncation: true` remains as a safety net.
37
+ */
38
+ const LOCAL_MAX_CHARS = 4096 * 4; // ~4096 tokens × ~4 chars/token
39
+
40
+ /**
41
+ * Truncate a string to LOCAL_MAX_CHARS without splitting a UTF-16 surrogate pair.
42
+ * If the cut falls on a high surrogate (0xD800-0xDBFF), backs up one char.
43
+ */
44
+ function safeLocalTruncate(text: string): string {
45
+ if (text.length <= LOCAL_MAX_CHARS) return text;
46
+ let end = LOCAL_MAX_CHARS;
47
+ const code = text.charCodeAt(end - 1);
48
+ if (code >= 0xD800 && code <= 0xDBFF) end--; // don't split surrogate pair
49
+ return text.slice(0, end);
50
+ }
51
+
31
52
  // ---------------------------------------------------------------------------
32
53
  // Provider interface
33
54
  // ---------------------------------------------------------------------------
@@ -332,9 +353,10 @@ class LocalProvider implements EmbeddingProvider {
332
353
  localProviderKnownBroken = true;
333
354
  if (!localProviderErrorLogged) {
334
355
  localProviderErrorLogged = true;
335
- log.info(
356
+ log.error(
336
357
  `local embedding provider failed to init: ${msg.error}. ` +
337
358
  `Set VOYAGE_API_KEY/OPENAI_API_KEY for automatic remote fallback.`,
359
+ new Error(`embedding worker init failed: ${msg.error}`),
338
360
  );
339
361
  }
340
362
  for (const [, p] of this.pendingRequests) {
@@ -351,6 +373,7 @@ class LocalProvider implements EmbeddingProvider {
351
373
  this.worker.on("error", (err: Error) => {
352
374
  this.workerInitError = err.message;
353
375
  this.workerReady = false;
376
+ log.error("embedding worker crashed:", err);
354
377
  for (const [, p] of this.pendingRequests) {
355
378
  p.reject(new LocalProviderUnavailableError(err));
356
379
  }
@@ -361,6 +384,10 @@ class LocalProvider implements EmbeddingProvider {
361
384
  this.worker.on("exit", (code) => {
362
385
  if (code !== 0 && !this.workerInitError) {
363
386
  this.workerInitError = `embedding worker exited with code ${code}`;
387
+ log.error(
388
+ this.workerInitError,
389
+ new Error(this.workerInitError),
390
+ );
364
391
  }
365
392
  this.workerReady = false;
366
393
  for (const [, p] of this.pendingRequests) {
@@ -396,9 +423,13 @@ class LocalProvider implements EmbeddingProvider {
396
423
  async embed(texts: string[], inputType: "document" | "query"): Promise<Float32Array[]> {
397
424
  await this.ensureWorker();
398
425
 
426
+ // Pre-truncate texts that exceed the safe ONNX inference limit.
427
+ // This prevents OOM on single inputs near the model's 8192-token max.
428
+ const truncated = texts.map(safeLocalTruncate);
429
+
399
430
  // Prepend Nomic task instruction prefix.
400
431
  const prefix = inputType === "document" ? "search_document: " : "search_query: ";
401
- const prefixed = texts.map((t) => prefix + t);
432
+ const prefixed = truncated.map((t) => prefix + t);
402
433
 
403
434
  const id = this.nextRequestId++;
404
435
  // Recall queries (single query-type texts) get high priority so they
@@ -842,6 +873,7 @@ export function embedKnowledgeEntry(
842
873
  title: string,
843
874
  content: string,
844
875
  ): void {
876
+ if (!isAvailable()) return;
845
877
  const text = `${title}\n${content}`;
846
878
  embed([text], "document")
847
879
  .then(([vec]) => {
@@ -850,7 +882,7 @@ export function embedKnowledgeEntry(
850
882
  .run(toBlob(vec), id);
851
883
  })
852
884
  .catch((err) => {
853
- log.info("embedding failed for knowledge entry", id, ":", err);
885
+ log.error("embedding failed for knowledge entry", id, ":", err);
854
886
  });
855
887
  }
856
888
 
@@ -863,6 +895,7 @@ export function embedDistillation(
863
895
  id: string,
864
896
  observations: string,
865
897
  ): void {
898
+ if (!isAvailable()) return;
866
899
  embed([observations], "document")
867
900
  .then(([vec]) => {
868
901
  db()
@@ -870,7 +903,7 @@ export function embedDistillation(
870
903
  .run(toBlob(vec), id);
871
904
  })
872
905
  .catch((err) => {
873
- log.info("embedding failed for distillation", id, ":", err);
906
+ log.error("embedding failed for distillation", id, ":", err);
874
907
  });
875
908
  }
876
909
 
@@ -884,6 +917,7 @@ export function embedTemporalMessage(
884
917
  id: string,
885
918
  content: string,
886
919
  ): void {
920
+ if (!isAvailable()) return;
887
921
  // Skip very short messages — they don't carry enough semantic signal
888
922
  // to be useful in vector search and would waste embedding capacity.
889
923
  if (content.length < 50) return;
@@ -895,7 +929,7 @@ export function embedTemporalMessage(
895
929
  .run(toBlob(vec), id);
896
930
  })
897
931
  .catch((err) => {
898
- log.info("embedding failed for temporal message", id, ":", err);
932
+ log.error("embedding failed for temporal message", id, ":", err);
899
933
  });
900
934
  }
901
935
 
@@ -1199,6 +1233,8 @@ export async function backfillEmbeddings(): Promise<number> {
1199
1233
  } catch (err) {
1200
1234
  // log.error sends to Sentry via captureException
1201
1235
  log.error(`embedding backfill batch failed (${batch.length} items):`, err);
1236
+ // Provider is dead — no point retrying remaining batches.
1237
+ if (err instanceof LocalProviderUnavailableError) break;
1202
1238
  }
1203
1239
  // No yieldToEventLoop() needed — embed() is truly async (worker thread).
1204
1240
  }
@@ -1259,6 +1295,8 @@ export async function backfillDistillationEmbeddings(): Promise<number> {
1259
1295
  } catch (err) {
1260
1296
  // log.error sends to Sentry via captureException
1261
1297
  log.error(`distillation embedding backfill batch failed (${batch.length} items):`, err);
1298
+ // Provider is dead — no point retrying remaining batches.
1299
+ if (err instanceof LocalProviderUnavailableError) break;
1262
1300
  }
1263
1301
 
1264
1302
  if (embedded >= nextProgressAt) {
package/src/git.ts CHANGED
@@ -13,6 +13,7 @@
13
13
  */
14
14
 
15
15
  import { execSync } from "child_process";
16
+ import { isHostedMode } from "./hosted";
16
17
 
17
18
  // ---------------------------------------------------------------------------
18
19
  // URL normalization
@@ -95,6 +96,9 @@ export function clearGitRemoteCache(): void {
95
96
  * subprocess calls — `git remote -v` only runs once per unique path.
96
97
  */
97
98
  export function getGitRemote(path: string): string | null {
99
+ // In hosted mode, never run git subprocesses with client-controlled cwd.
100
+ if (isHostedMode()) return null;
101
+
98
102
  const cached = gitRemoteCache.get(path);
99
103
  if (cached !== undefined) return cached;
100
104
 
package/src/gradient.ts CHANGED
@@ -37,134 +37,129 @@ function estimateMessage(msg: MessageWithParts): number {
37
37
  let contextLimit = 200_000; // sensible default
38
38
  let outputReserved = 32_000;
39
39
 
40
- // Cost-aware layer-0 token cap. When > 0, the layer-0 passthrough gate uses
41
- // min(maxInput, maxLayer0Tokens) instead of maxInput alone. Derived from the
42
- // model's cache-read cost: cap = targetCostPerTurn / costPerToken. This prevents
43
- // expensive models from sending huge contexts at layer 0, where cache-read costs
44
- // compound linearly across turns. Set to 0 to disable (use full context).
45
- let maxLayer0Tokens = 0;
46
-
47
- const MIN_LAYER0_FLOOR = 40_000;
48
-
49
40
  // ---------------------------------------------------------------------------
50
- // Cost-aware context token cap (layer 1+)
41
+ // Tier-based context management
42
+ //
43
+ // Three quality tiers based on empirical model effectiveness:
44
+ // Tier 1: 0 – 200K tokens (best quality, preferred operating range)
45
+ // Tier 2: 200K – 500K tokens (acceptable quality)
46
+ // Tier 3: 500K – model context limit (degraded, compress when economical)
51
47
  //
52
- // Limits total tokens (distilled + raw) to keep per-bust cache write cost
53
- // bounded. For opus-4-6 at $6.25/M write, a $1.00 target yields a 160K cap.
54
- // For sonnet-4 at $3.75/M write, the cap is 267K (effectively uncapped).
48
+ // At each tier boundary, a per-turn economic comparison decides whether to
49
+ // compress (bust the cache) or continue growing:
50
+ // bustCost = compressedSize × cacheWriteCostPerToken
51
+ // continueCost = currentSize × cacheReadCostPerToken
52
+ // If bustCost ≥ threshold × continueCost, don't compress — reads are cheap.
55
53
  //
56
- // The cap is further adjusted dynamically per session via bust rate EMA and
57
- // inter-bust interval tracking: tighten when busts are frequent, relax when
58
- // the cache is working well. Asymmetric rates: tighten fast, relax slowly.
54
+ // Rolling bust detection: if 5+ consecutive turns bust the cache, stop trying
55
+ * to compress — something structural is causing busts, and compression just
56
+ // adds cost on top.
59
57
  // ---------------------------------------------------------------------------
60
58
 
61
- /** Static ceiling for total context tokens, derived from model pricing.
62
- * 0 = disabled (no cap). Set via setMaxContextTokens(). */
63
- let maxContextTokensCeiling = 0;
59
+ /** Tier boundary tokens. Configurable for testing. */
60
+ const TIER_BOUNDARIES = [200_000, 500_000] as const;
64
61
 
65
- const MIN_CONTEXT_FLOOR = 130_000;
62
+ /** Cache pricing per token (USD). Set by host adapter via setCachePricing(). */
63
+ let cacheWriteCostPerToken = 0;
64
+ let cacheReadCostPerToken = 0;
66
65
 
67
- /** Compute the context ceiling from a per-bust cost target and cache-write price per token. */
68
- export function computeContextCap(
69
- targetBustCost: number,
70
- cacheWriteCostPerToken: number,
71
- ): number {
72
- if (targetBustCost <= 0 || cacheWriteCostPerToken <= 0) return 0;
73
- return Math.max(MIN_CONTEXT_FLOOR, Math.floor(targetBustCost / cacheWriteCostPerToken));
66
+ /**
67
+ * Set cache pricing for the current model. Called by the host adapter after
68
+ * looking up model cost data. Required for tier-based bust-vs-continue
69
+ * decisions. When not set (both 0), tier decisions fall back to conservative
70
+ * defaults: always compress at tier boundaries.
71
+ */
72
+ export function setCachePricing(writeCost: number, readCost: number) {
73
+ cacheWriteCostPerToken = Math.max(0, writeCost);
74
+ cacheReadCostPerToken = Math.max(0, readCost);
74
75
  }
75
76
 
76
- /** Set the static context ceiling. Called by the host adapter after computing
77
- * from model pricing. The effective per-session cap may be lower due to
78
- * dynamic adaptation (bust rate EMA). */
79
- export function setMaxContextTokens(tokens: number) {
80
- maxContextTokensCeiling = Math.max(0, Math.floor(tokens));
77
+ /** Returns current pricing (for tests). */
78
+ export function getCachePricing(): { write: number; read: number } {
79
+ return { write: cacheWriteCostPerToken, read: cacheReadCostPerToken };
80
+ }
81
+
82
+ // Cost-aware layer-0 token cap. When > 0, the layer-0 passthrough gate uses
83
+ // min(maxInput, maxLayer0Tokens) instead of maxInput alone. Derived from the
84
+ // model's cache-read cost: cap = targetCostPerTurn / costPerToken. This prevents
85
+ // expensive models from sending huge contexts at layer 0, where cache-read costs
86
+ // compound linearly across turns. Set to 0 to disable (use full context).
87
+ let maxLayer0Tokens = 0;
88
+
89
+ const MIN_LAYER0_FLOOR = 40_000;
90
+
91
+ /**
92
+ * Decide whether compression is economical at a tier boundary.
93
+ *
94
+ * @param currentTokens - expected input tokens if we stay at the current layer
95
+ * @param compressedTokens - expected tokens after compression
96
+ * @param consecutiveBusts - how many turns in a row we've busted the cache
97
+ * @param threshold - bust cost must be < threshold × continue cost to compress (default 0.85)
98
+ * @returns true if compression is worth it
99
+ */
100
+ export function shouldCompress(
101
+ currentTokens: number,
102
+ compressedTokens: number,
103
+ consecutiveBusts: number,
104
+ threshold = 0.85,
105
+ ): boolean {
106
+ // Rolling bust detection: if we've been busting 5+ turns in a row,
107
+ // stop trying to compress — it's clearly not helping.
108
+ if (consecutiveBusts >= 5) return false;
109
+
110
+ // If no pricing data, fall back to conservative: do NOT compress.
111
+ // Compression busts the cache, which is expensive. Without pricing data
112
+ // we can't prove it's worthwhile, so err on the side of keeping the cache.
113
+ if (cacheWriteCostPerToken <= 0 || cacheReadCostPerToken <= 0) return false;
114
+
115
+ const bustCost = compressedTokens * cacheWriteCostPerToken;
116
+ const continueCost = currentTokens * cacheReadCostPerToken;
117
+
118
+ // Compress only if the bust cost is meaningfully less than continuing
119
+ return bustCost < threshold * continueCost;
81
120
  }
82
121
 
83
- /** Returns the current static ceiling (for external callers / tests). */
84
- export function getMaxContextTokens(): number {
85
- return maxContextTokensCeiling;
122
+ /**
123
+ * Determine which tier the given token count falls into.
124
+ * Returns 0, 1, or 2 corresponding to the tier index.
125
+ */
126
+ export function getTier(tokens: number): number {
127
+ if (tokens <= TIER_BOUNDARIES[0]) return 0;
128
+ if (tokens <= TIER_BOUNDARIES[1]) return 1;
129
+ return 2;
86
130
  }
87
131
 
88
132
  /**
89
- * Feed cache usage data after each API response. Updates the per-session
90
- * bust rate EMA and inter-bust interval, which adjust the effective context
91
- * cap dynamically.
133
+ * Record cache usage from an API response. Tracks consecutive busts for
134
+ * the rolling bust detection used by shouldCompress().
135
+ *
136
+ * A "bust" is when cache_write > 50% of total input tokens.
92
137
  *
93
138
  * @param cacheWrite - cache_creation_input_tokens from the API response
94
139
  * @param cacheRead - cache_read_input_tokens from the API response
140
+ * @param inputTokens - total input_tokens from the API response (includes uncached)
95
141
  * @param sessionID - session that produced this response
96
142
  */
97
- export function updateBustRate(
143
+ export function recordCacheUsage(
98
144
  cacheWrite: number,
99
145
  cacheRead: number,
146
+ inputTokens: number,
100
147
  sessionID?: string,
101
148
  ): void {
102
149
  if (!sessionID) return;
103
150
  const state = getSessionState(sessionID);
104
- const total = cacheWrite + cacheRead;
105
- if (total === 0) return;
106
-
107
- // Bust ratio: fraction of total input that was cache-written (0 = all reads, 1 = all writes)
108
- const bustRatio = cacheWrite / total;
109
-
110
- // EMA update (α = 0.3 for smoothing — responsive but not twitchy)
111
- state.bustRateEMA =
112
- state.bustRateEMA < 0
113
- ? bustRatio // first observation
114
- : state.bustRateEMA * 0.7 + bustRatio * 0.3;
115
-
116
- // Inter-bust interval tracking: a "bust" is when >50% of input is writes
117
- const now = Date.now();
118
- if (bustRatio > 0.5) {
119
- if (state.lastBustAt > 0) {
120
- const interval = now - state.lastBustAt;
121
- state.interBustIntervalEMA =
122
- state.interBustIntervalEMA < 0
123
- ? interval
124
- : state.interBustIntervalEMA * 0.7 + interval * 0.3;
125
- }
126
- state.lastBustAt = now;
127
- }
128
-
129
- // Adapt per-session cap based on bust rate and interval
130
- adaptContextCap(state);
131
- }
132
-
133
- /** Adapt the per-session context cap based on bust rate and break frequency. */
134
- function adaptContextCap(state: SessionState): void {
135
- if (maxContextTokensCeiling <= 0) return; // disabled
136
-
137
- const cap = state.dynamicContextCap > 0
138
- ? state.dynamicContextCap
139
- : maxContextTokensCeiling;
140
-
141
- let newCap = cap;
142
-
143
- // Primary signal: bust rate EMA
144
- if (state.bustRateEMA > 0.8) {
145
- // Mostly writes — tighten by 10%
146
- newCap = Math.floor(cap * 0.90);
147
- } else if (state.bustRateEMA < 0.3) {
148
- // Mostly reads — relax by 5% (slower than tightening)
149
- newCap = Math.floor(cap * 1.05);
150
- }
151
151
 
152
- // Secondary signal: inter-bust interval
153
- if (state.interBustIntervalEMA > 0) {
154
- if (state.interBustIntervalEMA < 2 * 60_000) {
155
- // Busts less than 2 min apart proactively tighten by extra 5%
156
- newCap = Math.floor(newCap * 0.95);
157
- } else if (state.interBustIntervalEMA > 10 * 60_000) {
158
- // Busts more than 10 min apart — allow extra relaxation
159
- newCap = Math.floor(newCap * 1.03);
152
+ // Use total input tokens as denominator (includes uncached input),
153
+ // not just cacheWrite + cacheRead, to avoid inflated bust ratios
154
+ // when a large fraction of tokens is uncached.
155
+ const total = inputTokens > 0 ? inputTokens : cacheWrite + cacheRead;
156
+ if (total > 0) {
157
+ if (cacheWrite / total > 0.5) {
158
+ state.consecutiveBusts++;
159
+ } else {
160
+ state.consecutiveBusts = 0;
160
161
  }
161
162
  }
162
-
163
- // Clamp to [floor, ceiling]
164
- state.dynamicContextCap = Math.max(
165
- MIN_CONTEXT_FLOOR,
166
- Math.min(maxContextTokensCeiling, newCap),
167
- );
168
163
  }
169
164
 
170
165
  // Conservative overhead reserve for first-turn (before calibration):
@@ -253,18 +248,10 @@ type SessionState = {
253
248
  postIdleCompact: boolean;
254
249
  /** Consecutive turns at layer >= 2. When >= 3, log a compaction hint. */
255
250
  consecutiveHighLayer: number;
256
-
257
- // --- Cost-aware context cap dynamic state ---
258
-
259
- /** EMA of bust ratio (cacheWrite / total). -1 = uninitialized. */
260
- bustRateEMA: number;
261
- /** EMA of time between full busts (ms). -1 = uninitialized. */
262
- interBustIntervalEMA: number;
263
- /** Epoch ms of the last full bust (cacheWrite > 50% of total). 0 = never. */
264
- lastBustAt: number;
265
- /** Per-session dynamic context cap (tokens). Adjusted by adaptContextCap().
266
- * 0 = use the static ceiling (maxContextTokensCeiling). */
267
- dynamicContextCap: number;
251
+ /** Consecutive turns where the cache was busted (>50% writes).
252
+ * Used for rolling bust detection: after 5+ consecutive busts, stop
253
+ * trying to compress and warn that the conversation is unsustainable. */
254
+ consecutiveBusts: number;
268
255
 
269
256
  /**
270
257
  * Distillation row snapshot — cached to avoid hitting the DB on every
@@ -298,11 +285,7 @@ function makeSessionState(): SessionState {
298
285
  cameOutOfIdle: false,
299
286
  postIdleCompact: false,
300
287
  consecutiveHighLayer: 0,
301
-
302
- bustRateEMA: -1,
303
- interBustIntervalEMA: -1,
304
- lastBustAt: 0,
305
- dynamicContextCap: 0,
288
+ consecutiveBusts: 0,
306
289
 
307
290
  distillationSnapshot: null,
308
291
  };
@@ -321,9 +304,8 @@ function getSessionState(sessionID: string): SessionState {
321
304
  state.forceMinLayer = loadForceMinLayer(sessionID) as SafetyLayer;
322
305
 
323
306
  // Restore gradient calibration state from DB (v24) — avoids uncalibrated
324
- // first turns after restart. Without this, dynamicContextCap reverts to
325
- // the static ceiling, bustRateEMA is uninitialized, and lastTurnAt=0
326
- // prevents onIdleResume() from detecting idle gaps.
307
+ // first turns after restart. Without this, lastTurnAt=0 prevents
308
+ // onIdleResume() from detecting idle gaps.
327
309
  //
328
310
  // Atomic restore: lastTurnAt > 0 is the proxy for "gradient state was
329
311
  // ever flushed to DB". Restore all fields together or none — avoids
@@ -331,13 +313,12 @@ function getSessionState(sessionID: string): SessionState {
331
313
  // could be mistaken for "never persisted".
332
314
  const persisted = loadSessionTracking(sessionID);
333
315
  if (persisted && persisted.lastTurnAt > 0) {
334
- state.dynamicContextCap = persisted.dynamicContextCap;
335
- state.bustRateEMA = persisted.bustRateEMA;
336
- state.interBustIntervalEMA = persisted.interBustIntervalEMA;
337
316
  state.lastLayer = persisted.lastLayer as SafetyLayer;
338
317
  state.lastKnownInput = persisted.lastKnownInput;
339
318
  state.lastTurnAt = persisted.lastTurnAt;
340
- state.lastBustAt = persisted.lastBustAt;
319
+ // consecutiveBusts is persisted in the dynamicContextCap column
320
+ // (repurposed, see saveGradientState).
321
+ state.consecutiveBusts = persisted.dynamicContextCap;
341
322
  }
342
323
 
343
324
  sessionStates.set(sessionID, state);
@@ -581,6 +562,8 @@ export function setForceMinLayer(layer: SafetyLayer, sessionID?: string) {
581
562
  // For testing only — reset all calibration and force-escalation state
582
563
  export function resetCalibration(sessionID?: string) {
583
564
  calibratedOverhead = null;
565
+ cacheWriteCostPerToken = 0;
566
+ cacheReadCostPerToken = 0;
584
567
  if (sessionID) {
585
568
  saveForceMinLayer(sessionID, 0); // clear persisted state
586
569
  sessionStates.delete(sessionID);
@@ -605,6 +588,7 @@ export function inspectSessionState(sessionID: string): {
605
588
  postIdleCompact: boolean;
606
589
  lastTurnAt: number;
607
590
  distillationSnapshot: DistillationSnapshot | null;
591
+ consecutiveBusts: number;
608
592
  } | null {
609
593
  const state = sessionStates.get(sessionID);
610
594
  if (!state) return null;
@@ -615,6 +599,7 @@ export function inspectSessionState(sessionID: string): {
615
599
  postIdleCompact: state.postIdleCompact,
616
600
  lastTurnAt: state.lastTurnAt,
617
601
  distillationSnapshot: state.distillationSnapshot,
602
+ consecutiveBusts: state.consecutiveBusts,
618
603
  };
619
604
  }
620
605
 
@@ -639,13 +624,12 @@ export function saveGradientState(sessionID: string): void {
639
624
  if (!state) return;
640
625
 
641
626
  saveSessionTracking(sessionID, {
642
- dynamicContextCap: state.dynamicContextCap,
643
- bustRateEMA: state.bustRateEMA,
644
- interBustIntervalEMA: state.interBustIntervalEMA,
645
627
  lastLayer: state.lastLayer,
646
628
  lastKnownInput: state.lastKnownInput,
647
629
  lastTurnAt: state.lastTurnAt,
648
- lastBustAt: state.lastBustAt,
630
+ // Repurpose the dead dynamicContextCap column (v24, always 0 now)
631
+ // to persist consecutiveBusts — avoids a new DB migration.
632
+ dynamicContextCap: state.consecutiveBusts,
649
633
  });
650
634
  }
651
635
 
@@ -1547,6 +1531,10 @@ export type TransformResult = {
1547
1531
  // relevance scoring. Set on Layer 4 (emergency) where the context is
1548
1532
  // fully reset and mid-session knowledge may have changed relevance.
1549
1533
  refreshLtm: boolean;
1534
+ /** When set, the conversation is growing unsustainably — 5+ consecutive
1535
+ * cache busts detected. The pipeline should inject a warning message
1536
+ * advising the user to compact or start a new conversation. */
1537
+ unsustainable?: boolean;
1550
1538
  };
1551
1539
 
1552
1540
  // Per-session urgent distillation tracking.
@@ -1580,17 +1568,11 @@ function transformInner(input: {
1580
1568
  contextLimit - outputReserved - overhead - sessLtmTokens,
1581
1569
  );
1582
1570
 
1583
- // Cost-aware context cap: limit total distilled + raw tokens to keep
1584
- // per-bust cache write cost bounded. On opus-4-6 at $6.25/M, a $1.00
1585
- // target yields a 160K ceiling; on sonnet-4 at $3.75/M, 267K (effectively
1586
- // uncapped at 200K context). Per-session dynamic adaptation may reduce
1587
- // this further based on observed bust rate and break frequency.
1588
- const effectiveCap = sid && sessState.dynamicContextCap > 0
1589
- ? sessState.dynamicContextCap
1590
- : maxContextTokensCeiling;
1591
- const usable = effectiveCap > 0 && usableRaw > effectiveCap
1592
- ? effectiveCap
1593
- : usableRaw;
1571
+ // No EMA-driven adaptive cap use the full available context budget.
1572
+ // The layer-0 cap (maxLayer0Tokens) still applies for per-turn read cost,
1573
+ // and tier-based bust-vs-continue decisions control whether to compress
1574
+ // at quality boundaries.
1575
+ const usable = usableRaw;
1594
1576
 
1595
1577
  const distilledBudget = Math.floor(usable * cfg.budget.distilled);
1596
1578
  // Base raw budget. May be overridden below for post-idle compact mode.
@@ -1661,11 +1643,8 @@ function transformInner(input: {
1661
1643
  sessState.postIdleCompact = false;
1662
1644
  // Skip layer 0 — don't pass through all raw messages on a cold cache.
1663
1645
  effectiveMinLayer = Math.max(effectiveMinLayer, 1) as SafetyLayer;
1664
- // Use a tighter raw budget. When the cost-aware context cap is active,
1665
- // total write size is already bounded — use a moderate 30%. Without
1666
- // the cap, use a tighter 20% to limit cold-write cost directly.
1667
- const postIdleRawFraction = effectiveCap > 0 ? 0.30 : 0.20;
1668
- rawBudget = Math.floor(usable * postIdleRawFraction);
1646
+ // Use a tighter raw budget on cold cache to limit write cost.
1647
+ rawBudget = Math.floor(usable * 0.20);
1669
1648
  log.info(
1670
1649
  `post-idle compact: session=${sid} rawBudget=${rawBudget}` +
1671
1650
  ` (${Math.floor(usable * cfg.budget.raw)}→${rawBudget})`,
@@ -1727,6 +1706,46 @@ function transformInner(input: {
1727
1706
  };
1728
1707
  }
1729
1708
 
1709
+ // --- Tier-based bust-vs-continue gate ---
1710
+ // When expectedInput exceeds the layer-0 cap but still fits in the model's
1711
+ // context window, check whether compression is economically justified.
1712
+ // If not (bust cost ≥ 85% of continue cost), skip compression and pass
1713
+ // through at layer 0 — the cache reads are cheap enough to justify the
1714
+ // larger context, and raw messages are better quality than distilled.
1715
+ if (
1716
+ effectiveMinLayer === 0 &&
1717
+ layer0Input > layer0Ceiling &&
1718
+ layer0Input <= maxInput &&
1719
+ sid
1720
+ ) {
1721
+ const busts = getSessionState(sid).consecutiveBusts;
1722
+ // For compression, estimate the compressed size as the layer-1 budget
1723
+ // (distilled + raw fractions). This is a rough upper bound — actual
1724
+ // compressed output may be smaller.
1725
+ const compressedEstimate = distilledBudget + rawBudget;
1726
+ if (!shouldCompress(Math.round(layer0Input), compressedEstimate, busts)) {
1727
+ const messageTokens = calibrated
1728
+ ? expectedInput - (sessLtmTokens - sessState.lastKnownLtm)
1729
+ : expectedInput - overhead - sessLtmTokens;
1730
+ log.info(
1731
+ `tier gate: session=${sid} skipping compression — bustCost not justified` +
1732
+ ` (input=${Math.round(layer0Input)} compressed=${compressedEstimate} busts=${busts})`,
1733
+ );
1734
+ return {
1735
+ messages: input.messages,
1736
+ layer: 0,
1737
+ distilledTokens: 0,
1738
+ rawTokens: Math.max(0, messageTokens),
1739
+ totalTokens: Math.max(0, messageTokens),
1740
+ usable,
1741
+ distilledBudget,
1742
+ rawBudget,
1743
+ refreshLtm: false,
1744
+ unsustainable: busts >= 5,
1745
+ };
1746
+ }
1747
+ }
1748
+
1730
1749
  // --- Gradient mode: context exhausted (or force-escalated), compress older messages ---
1731
1750
 
1732
1751
  // Pre-pass: deduplicate repeated tool outputs before layer selection.
@@ -1872,6 +1891,8 @@ function transformInner(input: {
1872
1891
  const nuclearRaw = [...olderMessages, ...currentTurn];
1873
1892
  const nuclearRawTokens = olderTokens + currentTurnTokens;
1874
1893
 
1894
+ const unsustainable = sid ? getSessionState(sid).consecutiveBusts >= 5 : false;
1895
+
1875
1896
  return {
1876
1897
  messages: [...nuclearPrefix, ...nuclearRaw],
1877
1898
  layer: 4,
@@ -1882,6 +1903,7 @@ function transformInner(input: {
1882
1903
  distilledBudget,
1883
1904
  rawBudget,
1884
1905
  refreshLtm: true,
1906
+ unsustainable,
1885
1907
  };
1886
1908
  }
1887
1909
 
@@ -1933,7 +1955,7 @@ export function transform(input: {
1933
1955
  log.info(
1934
1956
  `gradient: session=${sid} layer=${result.layer} tokens=${result.totalTokens}` +
1935
1957
  ` (distilled=${result.distilledTokens} raw=${result.rawTokens})` +
1936
- ` usable=${result.usable} cap=${maxLayer0Tokens || "off"}`,
1958
+ ` usable=${result.usable} tier=${getTier(result.totalTokens)} l0cap=${maxLayer0Tokens || "off"}`,
1937
1959
  );
1938
1960
  }
1939
1961
  return result;
package/src/hosted.ts ADDED
@@ -0,0 +1,46 @@
1
+ /**
2
+ * hosted.ts — Hosted/remote mode flag for @loreai/core.
3
+ *
4
+ * When the gateway runs remotely (different machine/container from the
5
+ * developer's workspace), filesystem operations that use client-controlled
6
+ * paths are unsafe:
7
+ *
8
+ * - `git remote -v` subprocess with attacker-controlled cwd
9
+ * - `.lore.json` config read from attacker-controlled path
10
+ * - `.lore.md` / AGENTS.md read/write at attacker-controlled path
11
+ * - `lat.md/` recursive directory scan at attacker-controlled path
12
+ * - `fs.watch()` on attacker-controlled paths
13
+ *
14
+ * Setting hosted mode causes all these operations to become safe no-ops.
15
+ * The gateway sets this flag during startup when `LORE_HOSTED_MODE=1`.
16
+ *
17
+ * This is a process-wide flag — once set, it cannot be unset (the only
18
+ * consumer is the gateway process, and hosted mode is a startup decision).
19
+ */
20
+
21
+ let _hostedMode = false;
22
+
23
+ /**
24
+ * Enable hosted mode. Once enabled, cannot be disabled.
25
+ * All filesystem operations using client-controlled paths become no-ops.
26
+ */
27
+ export function enableHostedMode(): void {
28
+ _hostedMode = true;
29
+ }
30
+
31
+ /**
32
+ * Returns true if hosted mode is active — filesystem operations using
33
+ * client-controlled paths should be skipped.
34
+ */
35
+ export function isHostedMode(): boolean {
36
+ return _hostedMode;
37
+ }
38
+
39
+ /**
40
+ * Reset hosted mode flag. **Test-only** — production code should never
41
+ * call this. Exported so tests can toggle hosted mode without process
42
+ * restarts.
43
+ */
44
+ export function _resetHostedModeForTest(): void {
45
+ _hostedMode = false;
46
+ }