npm - jeo-code - Versions diffs - 0.6.28 → 0.6.30 - Mend

jeo-code 0.6.28 → 0.6.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/CHANGELOG.md +23 -0
package/README.ja.md +2 -2
package/README.ko.md +2 -2
package/README.md +2 -2
package/README.zh.md +2 -2
package/package.json +1 -1
package/src/agent/AGENTS.md +1 -1
package/src/agent/engine.ts +14 -33
package/src/agent/loop-guards.ts +135 -0
package/src/ai/model-catalog.ts +12 -5
package/src/ai/providers/anthropic.ts +9 -2
package/src/agent/tool-registry.ts +0 -54

package/CHANGELOG.md CHANGED Viewed

@@ -6,6 +6,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 The README mirrors the latest 5 entries — regenerate with `bun run changelog:sync`.
+## [0.6.30] - 2026-06-19
+_gjc-style intermediate-judgment guard classification extracted from the engine loop, plus a re-verification that `jeo --tmux` does not leak bun memory or slow down._
+### Changed
+- **Loop intermediate-judgment guards extracted into a classified module (`src/agent/loop-guards.ts`).** The mid-run "continue / self-correct / stop" decisions that were inlined across `engine.ts`'s `while` loop as scattered booleans and message strings are now a named `GuardState` string-literal-union taxonomy — jeo's descendant of gjc's `ultragoal-guard` `UltragoalGuardState` pattern. A single frozen `GUARD_LIMITS` object is the source of truth for every threshold (`MAX_REPEAT`, `MAX_FAILURES`, `MAX_REFUSAL_RETRIES`, `MAX_INVALID_CALLS`, `MAX_PARSE_BOUNCES`, `CYCLE_WINDOW`), and pure classifiers (`isVerificationSignal`, `repeatHint`, `classifyDoneGate`) are now independently testable. `engine.ts` still owns all control flow (history mutation, `step++`, `continue`, `return finish(...)`) — only the JUDGMENT moved, so behavior is unchanged (net −19 lines in `engine.ts`). Removed the now-unused `src/agent/tool-registry.ts` (including its `nearestToolName` helper).
+### Verified
+- **`jeo --tmux` has no bun memory leak and does not slow down.** An in-process probe streaming 5,000,000 SGR mouse-report escapes through `queuePromptInputChunk` (10 × 500k, `Bun.gc(true)` between batches) holds RSS flat (133.9 → 135.2 MB, slope ≈0.13 MB/round) with zero prompt-queue accumulation; a real `jeo --tmux` session flooded with 60k live mouse reports via `tmux send-keys` plateaus in RSS (129,456 → 129,472 KB). `jeo --tmux -p` end-to-end creates the profiled session, runs the turn, and tears down cleanly.
+- **Full suite green:** `bun run typecheck` clean and `bun test` 1687 pass / 0 fail across 210 files (includes the new `test/loop-guards.test.ts`, 9 tests, and the signature-only Anthropic replay test).
+## [0.6.29] - 2026-06-19
+_Signature-only thinking-block replay (Anthropic opus-4-7/4-8), plus a tmux mouse-flood memory guard confirming `jeo --tmux` does not leak._
+### Fixed
+- **Anthropic thinking-block replay now covers signature-only artifacts.** Newer Opus models (opus-4-7/opus-4-8) think internally — tokens billed, a valid `signature` present — but return empty thinking text. The cross-turn replay required both `signature` AND `text`, so those models' reasoning was dropped between steps. Replay now sends a signed `thinking` block whenever a `signature` (or `redacted`) is present (text defaults to `""`), restoring multi-step reasoning continuity for signature-only models. API-key requests also send the `interleaved-thinking` + `prompt-caching-scope` betas so thinking+tools and scoped caching work outside OAuth.
+### Added
+- **`claude-opus-4-7` catalogued** (FULL thinking, 200k ctx) and a dynamic context-window fallback for uncatalogued ids (claude 200k / gpt-5 400k / gemini-3 1M).
+- **tmux mouse-report-flood memory guard** (`test/mouse-report-filter.test.ts`): 100k SGR mouse-move reports through `queuePromptInputChunk` leave the prompt queue at zero accumulation — the regression guard for the "`jeo --tmux` slows down over time" concern.
+### Verified
+- **`jeo --tmux` has no bun memory leak.** The in-process lifecycle probe (`scripts/mem-probe.ts`, 3000 turns) reports a per-turn heap slope of ≈0 (returns to baseline, exit-listeners flat); a real `jeo --tmux` process plateaus in RSS under sustained mouse/resize/keystroke churn instead of climbing; and mouse reports are filtered (not buffered) with `activityLog` bounded to a 200-entry per-turn ring.
 ## [0.6.28] - 2026-06-19
 _Signed thinking-block replay: native reasoning is now sent BACK to providers across steps/turns, restoring multi-step reasoning continuity (gajae parity)._

package/README.ja.md CHANGED Viewed

@@ -200,11 +200,11 @@ CI は `.github/workflows/npm-publish.yml` で公開します — GitHub リリ
 ## 変更履歴 (Changelog)
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.6.30]** (2026-06-19) — gjc-style intermediate-judgment guard classification extracted from the engine loop, plus a re-verification that `jeo --tmux` does not leak bun memory or slow down.
+- **[0.6.29]** (2026-06-19) — Signature-only thinking-block replay (Anthropic opus-4-7/4-8), plus a tmux mouse-flood memory guard confirming `jeo --tmux` does not leak.
 - **[0.6.28]** (2026-06-19) — Signed thinking-block replay: native reasoning is now sent BACK to providers across steps/turns, restoring multi-step reasoning continuity (gajae parity).
 - **[0.6.27]** (2026-06-19) — Ponytail pass on the reasoning-tier mapper, plus a real-tmux verification of `jeo --tmux`.
 - **[0.6.26]** (2026-06-19) — The forge emblem is redrawn again as the mascot crayfish, foregrounding its signature pincer claws (집게).
-- **[0.6.25]** (2026-06-19) — Reasoning works at every thinking level (gajae parity), and the forge emblem is redrawn as the neon-lens coding wizard.
-- **[0.6.24]** (2026-06-19) — `/provider` opens an interactive onboarding selector (OAuth vs API-compatible), and OpenAI-compatible backends gain per-vendor native-reasoning formats.
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/README.ko.md CHANGED Viewed

@@ -200,11 +200,11 @@ CI는 `.github/workflows/npm-publish.yml`로 배포합니다 — GitHub 릴리
 ## 변경 이력 (Changelog)
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.6.30]** (2026-06-19) — gjc-style intermediate-judgment guard classification extracted from the engine loop, plus a re-verification that `jeo --tmux` does not leak bun memory or slow down.
+- **[0.6.29]** (2026-06-19) — Signature-only thinking-block replay (Anthropic opus-4-7/4-8), plus a tmux mouse-flood memory guard confirming `jeo --tmux` does not leak.
 - **[0.6.28]** (2026-06-19) — Signed thinking-block replay: native reasoning is now sent BACK to providers across steps/turns, restoring multi-step reasoning continuity (gajae parity).
 - **[0.6.27]** (2026-06-19) — Ponytail pass on the reasoning-tier mapper, plus a real-tmux verification of `jeo --tmux`.
 - **[0.6.26]** (2026-06-19) — The forge emblem is redrawn again as the mascot crayfish, foregrounding its signature pincer claws (집게).
-- **[0.6.25]** (2026-06-19) — Reasoning works at every thinking level (gajae parity), and the forge emblem is redrawn as the neon-lens coding wizard.
-- **[0.6.24]** (2026-06-19) — `/provider` opens an interactive onboarding selector (OAuth vs API-compatible), and OpenAI-compatible backends gain per-vendor native-reasoning formats.
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/README.md CHANGED Viewed

@@ -200,11 +200,11 @@ Required npm token permissions (repository secret `NPM_TOKEN`):
 ## Changelog
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.6.30]** (2026-06-19) — gjc-style intermediate-judgment guard classification extracted from the engine loop, plus a re-verification that `jeo --tmux` does not leak bun memory or slow down.
+- **[0.6.29]** (2026-06-19) — Signature-only thinking-block replay (Anthropic opus-4-7/4-8), plus a tmux mouse-flood memory guard confirming `jeo --tmux` does not leak.
 - **[0.6.28]** (2026-06-19) — Signed thinking-block replay: native reasoning is now sent BACK to providers across steps/turns, restoring multi-step reasoning continuity (gajae parity).
 - **[0.6.27]** (2026-06-19) — Ponytail pass on the reasoning-tier mapper, plus a real-tmux verification of `jeo --tmux`.
 - **[0.6.26]** (2026-06-19) — The forge emblem is redrawn again as the mascot crayfish, foregrounding its signature pincer claws (집게).
-- **[0.6.25]** (2026-06-19) — Reasoning works at every thinking level (gajae parity), and the forge emblem is redrawn as the neon-lens coding wizard.
-- **[0.6.24]** (2026-06-19) — `/provider` opens an interactive onboarding selector (OAuth vs API-compatible), and OpenAI-compatible backends gain per-vendor native-reasoning formats.
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/README.zh.md CHANGED Viewed

@@ -200,11 +200,11 @@ CI 通过 `.github/workflows/npm-publish.yml` 发布 — GitHub 发布 release
 ## 更新日志 (Changelog)
 <!-- CHANGELOG:START (auto-generated from CHANGELOG.md — run `bun run changelog:sync`) -->
+- **[0.6.30]** (2026-06-19) — gjc-style intermediate-judgment guard classification extracted from the engine loop, plus a re-verification that `jeo --tmux` does not leak bun memory or slow down.
+- **[0.6.29]** (2026-06-19) — Signature-only thinking-block replay (Anthropic opus-4-7/4-8), plus a tmux mouse-flood memory guard confirming `jeo --tmux` does not leak.
 - **[0.6.28]** (2026-06-19) — Signed thinking-block replay: native reasoning is now sent BACK to providers across steps/turns, restoring multi-step reasoning continuity (gajae parity).
 - **[0.6.27]** (2026-06-19) — Ponytail pass on the reasoning-tier mapper, plus a real-tmux verification of `jeo --tmux`.
 - **[0.6.26]** (2026-06-19) — The forge emblem is redrawn again as the mascot crayfish, foregrounding its signature pincer claws (집게).
-- **[0.6.25]** (2026-06-19) — Reasoning works at every thinking level (gajae parity), and the forge emblem is redrawn as the neon-lens coding wizard.
-- **[0.6.24]** (2026-06-19) — `/provider` opens an interactive onboarding selector (OAuth vs API-compatible), and OpenAI-compatible backends gain per-vendor native-reasoning formats.
 See [CHANGELOG.md](CHANGELOG.md) for the full history.
 <!-- CHANGELOG:END -->

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "jeo-code",
-  "version": "0.6.28",
+  "version": "0.6.30",
   "description": "Clean, highly optimized AI coding agent using spec-first loop",
   "type": "module",
   "main": "src/cli.ts",

package/src/agent/AGENTS.md CHANGED Viewed

@@ -17,6 +17,7 @@ The core runtime loop, tool registry, session management, and state persistence
 | `hooks.ts` | Brief description of purpose |
 | `json.ts` | Brief description of purpose |
 | `loop.ts` | The primary execution loop orchestrating model calls and tool execution |
+| `loop-guards.ts` | Intermediate-judgment classification (gjc ultragoal-guard parity): named `GuardState` taxonomy, `GUARD_LIMITS` thresholds, and pure classifiers (`isVerificationSignal`, `repeatHint`, `classifyDoneGate`) consumed by `engine.ts` |
 | `memory.ts` | OKF concept-bundle memory: session distill, query-aware budget injection, legacy MEMORY.md migration (`migrateLegacyMemory`) + `JEO_MEMORY_LEGACY` rollback toggle |
 | `memory-okf.ts` | OKF v0.1 format layer: frontmatter parse/serialize, concept IDs, conformance validation |
 | `memory-graph.ts` | Concept cross-link graph: build/expand (1-hop search), broken-link-tolerant lint, optional graphify detection |
@@ -35,7 +36,6 @@ The core runtime loop, tool registry, session management, and state persistence
 | `todo-tool.ts` | Brief description of purpose |
 | `tokenizer.ts` | Brief description of purpose |
 | `tool-output.ts` | Brief description of purpose |
-| `tool-registry.ts` | Brief description of purpose |
 | `tools.ts` | Built-in tool definitions (bash, read, write, edit, etc.) |
 | `web-search.ts` | Brief description of purpose |

package/src/agent/engine.ts CHANGED Viewed

@@ -22,6 +22,7 @@ export { TOOL_OUTPUT_MAX, READ_OUTPUT_MAX, TOOL_SPILL_THRESHOLD, MAX_TOOL_ARTIFA
 import { StepBudget, dynamicStepBudgetConfig, resolveStepBudgetConfig, hashSignature, type StepBudgetConfig } from "./step-budget";
 import { historyTokens, trimToolResultsInPlace } from "./compaction";
 import { jeoEnv } from "../util/env";
+import { GUARD_LIMITS, isVerificationSignal, repeatHint, classifyDoneGate } from "./loop-guards";
 async function invokeCallLlm(history: Message[], options: {
@@ -378,29 +379,15 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
     } catch { /* best-effort; fall through to the plain stop message */ }
     return finish({ done: false, steps: step, doneReason: `Stopped: ${stopReason}` });
   };
-  // Result-aware repeat nudge (A): tell the model WHY repeating won't help and what to
-  // try instead, tailored to the repeated tool and its last actual result.
-  const repeatHint = (tool: string, prev?: { success: boolean; output: string }): string => {
-    const out = prev?.output ?? "";
-    const empty = !prev || !prev.success || out.trim() === "" || /no match|0 match|no result|not found|no file/i.test(out);
-    if (tool === "search" || tool === "find" || tool === "ls") {
-      return empty
-        ? `That '${tool}' returned nothing useful and will again — BROADEN it (a looser pattern, a parent directory, or a different tool such as ${tool === "search" ? "find" : "search"}), or call done if this lookup isn't needed.`
-        : `That '${tool}' already returned results — open one of the hits with read, or move on; re-running it changes nothing.`;
-    }
-    if (tool === "read") return `You already read that and its content is unchanged — use what you read, or read a DIFFERENT file.`;
-    if (tool === "bash") return `That command already ran with the same output — change the command, or call done.`;
-    return `That call's result is unchanged — take a different action, or call done.`;
-  };
   // No-progress guard: weak/local models often repeat the same tool call without
   // ever emitting `done`. Two escalating corrections (B), then a consolidated stop.
-  const MAX_REPEAT = 4;
+  const MAX_REPEAT = GUARD_LIMITS.MAX_REPEAT;
   // Last executed step's per-call results — fed to repeatHint so a corrective bounce
   // can cite the repeated call's ACTUAL last outcome (A).
   let lastResults: { success: boolean; output: string; executed: boolean }[] = [];
   // Consecutive-failure guard: a model that keeps emitting *different* but failing
   // calls (bad edits, failing commands) would otherwise burn the whole step budget.
-  const MAX_FAILURES = 5;
+  const MAX_FAILURES = GUARD_LIMITS.MAX_FAILURES;
   let consecutiveFailures = 0;
   // done-verification guard (plan/gjc-inheritance.md B4, a lightweight descendant of gjc's ultragoal-guard):
   // a turn that MUTATED files but shows no verification signal gets ONE pushback on
@@ -424,16 +411,15 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
   // as-is, then once more with an explicit re-grounding note; only a third
   // refusal in the turn surfaces the (friendly) error. Bounded per turn so a
   // genuinely refused request can never burn billed calls in a loop.
-  const MAX_REFUSAL_RETRIES = 3;
+  const MAX_REFUSAL_RETRIES = GUARD_LIMITS.MAX_REFUSAL_RETRIES;
   let refusalRetries = 0;
-  const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
   let lastSig = "";
   let repeatCount = 0;
   // Cycle guard (the A↔B ping-pong the exact-repeat guard cannot see): the recent
   // executed step signatures, as fixed-size digests. When a full window cycles
   // through ≤2 distinct calls, bounce ONCE with an explicit correction; a spin that
   // persists through the correction stops the turn.
-  const CYCLE_WINDOW = 6;
+  const CYCLE_WINDOW = GUARD_LIMITS.CYCLE_WINDOW;
   const recentStepSigs: string[] = [];
   let cycleBounceUsed = false;
   // Invalid-tool-call guard: a model that returns JSON without a usable `tool`
@@ -441,10 +427,10 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
   let invalidToolCalls = 0;
   // A JSON reply with no usable `tool` field can't drive the loop — stop sooner than the
   // repeat-spin guard (no escalating correction helps a model that isn't producing a call).
-  const MAX_INVALID_CALLS = 3;
+  const MAX_INVALID_CALLS = GUARD_LIMITS.MAX_INVALID_CALLS;
   // Prose-bounce guard: after this many invalid-JSON corrections, salvage the
   // model's text as the final answer instead of burning the whole step budget.
-  const MAX_PARSE_BOUNCES = 2;
+  const MAX_PARSE_BOUNCES = GUARD_LIMITS.MAX_PARSE_BOUNCES;
   let parseFailures = 0;
   while (true) {
     if (turnBudgetMs > 0 && Date.now() - turnStartedAt > turnBudgetMs) {
@@ -703,19 +689,14 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
     ev.onAssistant?.(responseText, toolCalls[0]);
     if (toolCalls.length === 1 && toolCalls[0].tool === "done") {
-      if (sawMutation && (!sawVerification || pendingHookFailure !== null) && !donePushbackUsed) {
+      // done-verification gate — jeo's descendant of gjc's ultragoal-guard completion
+      // state machine (plan/gjc-inheritance.md B4). The classifier owns the JUDGMENT
+      // (which named state, which message); the loop owns the once-pushback latch.
+      const doneGate = classifyDoneGate({ sawMutation, sawVerification, pendingHookFailure });
+      if (doneGate.block && !donePushbackUsed) {
         donePushbackUsed = true; // second done always passes — escape hatch
         pushAssistantTurn(history, responseText, reasonBuf, artifactBuf);
-        history.push({
-          role: "user",
-          content: pendingHookFailure !== null
-            ? `Your latest mutation left the post-turn hook "${pendingHookFailure}" FAILING (non-zero exit) — its diagnostics were shown in the tool result above. ` +
-              "Fix the reported problems (the hook re-runs on your next mutation), then call done. " +
-              "If the hook failure is a false positive, call done again and say why in the reason."
-            : "You modified files this turn but ran NO verification (no test/build/typecheck command succeeded). " +
-              "Run the narrowest command that proves your change works, then call done. " +
-              "If verification is genuinely not applicable (docs/config-only change), call done again and say why in the reason.",
-        });
+        history.push({ role: "user", content: doneGate.message });
         step++;
         continue;
       }
@@ -1039,7 +1020,7 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
       if (t === "write" || t === "edit") sawMutation = true;
       else if (t === "bash") {
         const cmd = String(toolCalls[i].arguments?.command ?? "");
-        if (VERIFY_SIGNAL_RE.test(cmd) || VERIFY_SIGNAL_RE.test(results[i].output.slice(0, 2000))) sawVerification = true;
+        if (isVerificationSignal(cmd, results[i].output)) sawVerification = true;
       }
     }
     // F6 (round 4 architect, Low): judge the step by its NON-TRIVIAL calls — a

package/src/agent/loop-guards.ts ADDED Viewed

@@ -0,0 +1,135 @@
+/**
+ * Intermediate-judgment guards for the agent loop — the mid-run "should this turn
+ * continue, correct itself, or stop" decisions that run between model calls.
+ *
+ * gjc keeps this concern in its own layer: `gjc-runtime/ultragoal-guard.ts` computes a
+ * named `UltragoalGuardState` discriminated union PURELY, and the runtime merely acts on
+ * the verdict. jeo previously inlined the same logic inside `engine.ts`'s `while` loop as
+ * scattered booleans and message strings. This module gives jeo the same classification:
+ * a named `GuardState` taxonomy plus pure, independently-testable classifier functions.
+ * `engine.ts` still owns the control flow (history mutation, `step++`, `continue`,
+ * `return finish(...)`) — only the JUDGMENT moves here, so behavior is unchanged.
+ */
+/**
+ * Named taxonomy of the loop's intermediate judgments — jeo's descendant of gjc's
+ * `UltragoalGuardState`. Each member names one decision the loop can reach mid-turn.
+ */
+export type GuardState =
+  | "ok" // proceed: emit / execute the tool call as-is
+  | "repeat_correct" // exact-repeat detected → ONE corrective bounce (skip execution)
+  | "repeat_stop" // exact-repeat survived the correction → consolidate-stop
+  | "cycle_correct" // A↔B alternation detected → ONE corrective bounce
+  | "cycle_stop" // cycle survived the correction → consolidate-stop
+  | "consecutive_failure_stop" // MAX_FAILURES different-but-failing steps → stop
+  | "invalid_tool_stop" // MAX_INVALID_CALLS replies with no usable tool field → stop
+  | "parse_salvage" // repeated non-JSON prose → salvage the text as the final answer
+  | "context_overflow_retry" // provider reported context overflow → ONE trim + retry
+  | "refusal_retry" // transient safety refusal → bounded resend ladder
+  | "done_unverified" // mutated files, no verification signal → pushback on done
+  | "done_hook_failing" // post-turn hook still failing → pushback on done
+  | "done_ok"; // done accepted — the turn is finished
+/**
+ * Bounded thresholds for every loop guard — the single, named source of truth.
+ * Kept in one frozen object so the limits are discoverable and testable instead of
+ * sprinkled as bare literals through the loop body.
+ */
+export const GUARD_LIMITS = Object.freeze({
+  /** Identical step repeats tolerated before a consolidated stop (with corrections en route). */
+  MAX_REPEAT: 4,
+  /** Consecutive different-but-failing steps before the turn stops. */
+  MAX_FAILURES: 5,
+  /** Safety-refusal resends per turn before surfacing the friendly error. */
+  MAX_REFUSAL_RETRIES: 3,
+  /** Replies with no usable `tool`/`tools` field before the turn stops. */
+  MAX_INVALID_CALLS: 3,
+  /** Consecutive non-JSON parse failures before the prose is salvaged as the answer. */
+  MAX_PARSE_BOUNCES: 2,
+  /** Recent-signature window scanned for an A↔B (≤2 distinct calls) cycle. */
+  CYCLE_WINDOW: 6,
+});
+/**
+ * Commands (or their output) that count as a verification signal: a test, build,
+ * typecheck, or lint invocation. The done-verification guard treats a turn that mutated
+ * files without any such signal as "unverified".
+ */
+export const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
+/**
+ * True when a bash command (or the head of its output) matches `VERIFY_SIGNAL_RE` — a
+ * keyword heuristic suggesting the work was verified, not proof that a check passed.
+ * Output is examined only up to the first 2000 chars — enough to catch a tool runner's
+ * banner without rescanning a megabyte of logs.
+ */
+export function isVerificationSignal(cmd: string, output = ""): boolean {
+  return VERIFY_SIGNAL_RE.test(cmd) || VERIFY_SIGNAL_RE.test(output.slice(0, 2000));
+}
+/**
+ * Result-aware repeat nudge: tells the model WHY repeating the call won't help and what
+ * to try instead, tailored to the repeated tool and its last actual result.
+ */
+export function repeatHint(tool: string, prev?: { success: boolean; output: string }): string {
+  const out = prev?.output ?? "";
+  const empty = !prev || !prev.success || out.trim() === "" || /no match|0 match|no result|not found|no file/i.test(out);
+  if (tool === "search" || tool === "find" || tool === "ls") {
+    return empty
+      ? `That '${tool}' returned nothing useful and will again — BROADEN it (a looser pattern, a parent directory, or a different tool such as ${tool === "search" ? "find" : "search"}), or call done if this lookup isn't needed.`
+      : `That '${tool}' already returned results — open one of the hits with read, or move on; re-running it changes nothing.`;
+  }
+  if (tool === "read") return `You already read that and its content is unchanged — use what you read, or read a DIFFERENT file.`;
+  if (tool === "bash") return `That command already ran with the same output — change the command, or call done.`;
+  return `That call's result is unchanged — take a different action, or call done.`;
+}
+/** Inputs for the done-verification gate (jeo's descendant of gjc's ultragoal-guard). */
+export interface DoneGateInput {
+  /** A write/edit succeeded this turn. */
+  sawMutation: boolean;
+  /** A test/build/typecheck/lint command succeeded this turn. */
+  sawVerification: boolean;
+  /** The run-command of the most recent still-failing post-turn hook, or null. */
+  pendingHookFailure: string | null;
+}
+/** Verdict from {@link classifyDoneGate}: whether to bounce `done`, and the message. */
+export interface DoneGateVerdict {
+  state: Extract<GuardState, "done_ok" | "done_unverified" | "done_hook_failing">;
+  /** When true, `done` should be bounced ONCE with `message` (the caller owns the once-gate). */
+  block: boolean;
+  /** Corrective message to push back on `done`; empty when `state === "done_ok"`. */
+  message: string;
+}
+/**
+ * Classify whether a `done` should be accepted or bounced — the direct descendant of
+ * gjc's `ultragoal-guard` completion gate (plan/gjc-inheritance.md B4).
+ *
+ * A turn that MUTATED files but has either NO verification signal or a still-failing
+ * post-turn hook is blocked ONCE. The caller owns the single-pushback latch; a second
+ * `done` always passes (the escape hatch for genuinely-unverifiable docs/config changes).
+ */
+export function classifyDoneGate(input: DoneGateInput): DoneGateVerdict {
+  const hookFailing = input.pendingHookFailure !== null;
+  const block = input.sawMutation && (!input.sawVerification || hookFailing);
+  if (!block) return { state: "done_ok", block: false, message: "" };
+  if (hookFailing) {
+    return {
+      state: "done_hook_failing",
+      block: true,
+      message:
+        `Your latest mutation left the post-turn hook "${input.pendingHookFailure}" FAILING (non-zero exit) — its diagnostics were shown in the tool result above. ` +
+        "Fix the reported problems (the hook re-runs on your next mutation), then call done. " +
+        "If the hook failure is a false positive, call done again and say why in the reason.",
+    };
+  }
+  return {
+    state: "done_unverified",
+    block: true,
+    message:
+      "You modified files this turn but ran NO verification (no test/build/typecheck command succeeded). " +
+      "Run the narrowest command that proves your change works, then call done. " +
+      "If verification is genuinely not applicable (docs/config-only change), call done again and say why in the reason.",
+  };
+}

package/src/ai/model-catalog.ts CHANGED Viewed

@@ -37,6 +37,8 @@ const STD: ThinkLevel[] = ["minimal", "low", "medium", "high"];
 export const ANTIGRAVITY_MODELS = [
   "claude-opus-4-5-thinking",
   "claude-opus-4-6-thinking",
+  "claude-opus-4-7",
+  "claude-opus-4-7-thinking",
   "claude-opus-4-8",
   "claude-opus-4-8-thinking",
   "claude-sonnet-4-5",
@@ -52,6 +54,7 @@ export const ANTIGRAVITY_MODELS = [
   "gemini-3.1-pro-high",
   "gemini-3.1-pro-low",
   "gpt-oss-120b-medium",
+  "gpt-5.5",
 ] as const;
 /** A curated set of common public models with their documented capabilities. */
@@ -62,9 +65,13 @@ export const MODEL_CATALOG: readonly CatalogModel[] = [
   { canonical: "claude-sonnet-4-5", provider: "anthropic", providerModel: "claude-sonnet-4-5-20250929", contextTokens: 200_000, maxOutputTokens: 64_000, thinking: FULL, images: true },
   { canonical: "claude-opus-4-1", provider: "anthropic", providerModel: "claude-opus-4-1-20250805", contextTokens: 200_000, maxOutputTokens: 32_000, thinking: FULL, images: true },
   { canonical: "claude-opus-4-5", provider: "anthropic", providerModel: "claude-opus-4-5-20251101", contextTokens: 200_000, maxOutputTokens: 64_000, thinking: FULL, images: true },
-  // NOTE: confirm exact dated provider ids when these ship publicly; the family
-  // heuristic in `catalogMetadata` keeps reasoning working even before that.
+  // NOTE: opus-4-7 accepts extended thinking but currently returns 0 thinking tokens
+  // (model-internal, no visible thought). opus-4-8 thinks internally (tokens billed,
+  // signature present) but returns empty thinking text. Both are FULL-capable in the
+  // catalog so the budget is always sent — the nativizable path handles signature-only
+  // artifacts for cross-turn continuity.
   { canonical: "claude-opus-4-6", provider: "anthropic", providerModel: "claude-opus-4-6", contextTokens: 200_000, maxOutputTokens: 64_000, thinking: FULL, images: true },
+  { canonical: "claude-opus-4-7", provider: "anthropic", providerModel: "claude-opus-4-7", contextTokens: 200_000, maxOutputTokens: 64_000, thinking: FULL, images: true },
   { canonical: "claude-opus-4-8", provider: "anthropic", providerModel: "claude-opus-4-8", contextTokens: 200_000, maxOutputTokens: 64_000, thinking: FULL, images: true },
   // OpenAI
   { canonical: "gpt-4o", provider: "openai", providerModel: "gpt-4o", contextTokens: 128_000, maxOutputTokens: 16_384, thinking: [], images: true },
@@ -96,9 +103,9 @@ export const MODEL_CATALOG: readonly CatalogModel[] = [
     canonical: `antigravity/${id}`,
     provider: "antigravity",
     providerModel: id,
-    contextTokens: id.includes("claude") ? 200_000 : id.includes("gemini-3") ? 1_000_000 : 1_000_000,
-    maxOutputTokens: id.includes("claude") ? 64_000 : 65_536,
-    thinking: id.includes("thinking") || id.includes("-high") || id.includes("-low") || id.includes("gemini-3") ? FULL : STD,
+    contextTokens: id.includes("claude") ? 200_000 : id.startsWith("gpt-5") ? 400_000 : id.includes("gemini-3") ? 1_000_000 : 1_000_000,
+    maxOutputTokens: id.includes("claude") ? 64_000 : id.startsWith("gpt-5") ? 128_000 : 65_536,
+    thinking: id.includes("thinking") || id.includes("-high") || id.includes("-low") || id.includes("gemini-3") || id.startsWith("gpt-5") ? FULL : STD,
     images: !id.includes("gpt-oss"),
     company: id.includes("claude") ? "Anthropic via Antigravity" : id.includes("gpt") ? "OpenAI via Antigravity" : "Google Antigravity",
   })),

package/src/ai/providers/anthropic.ts CHANGED Viewed

@@ -11,6 +11,12 @@ const DEPRECATED_TEMPERATURE = "`temperature` is deprecated for this model.";
 const CLAUDE_CODE_VERSION = "2.1.63";
 const CLAUDE_CODE_SYSTEM_INSTRUCTION = "You are a Claude agent, built on Anthropic's Claude Agent SDK.";
 const CLAUDE_BILLING_HEADER_PREFIX = "x-anthropic-billing-header:";
+/** Betas needed for API-key requests: interleaved-thinking enables thinking+tools,
+ *  prompt-caching-scope gives scoped cache breakpoints. */
+const ANTHROPIC_API_KEY_BETA = [
+  "interleaved-thinking-2025-05-14",
+  "prompt-caching-scope-2026-01-05",
+].join(",");
 const ANTHROPIC_OAUTH_BETA = [
   "claude-code-20250219",
   "oauth-2025-04-20",
@@ -99,7 +105,7 @@ type AnthropicMessage = { role: string; content: string | AnthropicContentBlock[
 export function anthropicNativizable(m: Message, model: string, thinkingEnabled: boolean): boolean {
   return thinkingEnabled
     && !!m.toolUse?.length
-    && !!m.reasoningArtifacts?.some(a => a.provider === "anthropic" && a.model === model && ((!!a.signature && !!a.text) || !!a.redacted));
+    && !!m.reasoningArtifacts?.some(a => a.provider === "anthropic" && a.model === model && (!!a.signature || !!a.redacted));
 }
 /** Build Anthropic wire messages, reconstructing native tool_use / tool_result / thinking
@@ -121,7 +127,7 @@ export function buildAnthropicMessages(messages: Message[], model: string, think
       const blocks: AnthropicContentBlock[] = [];
       for (const a of m.reasoningArtifacts!) {
         if (a.provider !== "anthropic" || a.model !== model) continue;
-        if (a.signature && a.text) blocks.push({ type: "thinking", thinking: a.text, signature: a.signature });
+        if (a.signature) blocks.push({ type: "thinking", thinking: a.text ?? "", signature: a.signature });
         else if (a.redacted) blocks.push({ type: "redacted_thinking", data: a.redacted });
       }
       for (const tu of m.toolUse!) blocks.push({ type: "tool_use", id: tu.id, name: tu.tool, input: tu.arguments });
@@ -454,6 +460,7 @@ function headersFor(credential: Credential, stream: boolean): Record<string, str
       "content-type": "application/json",
       "x-api-key": credential.token,
       "anthropic-version": "2023-06-01",
+      "anthropic-beta": ANTHROPIC_API_KEY_BETA,
     };
   }
   throw new Error("anthropic adapter requires a credential");

package/src/agent/tool-registry.ts DELETED Viewed

@@ -1,54 +0,0 @@
-import { readTool, writeTool, editTool, bashTool, findTool, searchTool, lsTool, type ToolResult } from "./tools";
-export type ToolHandler = (args: Record<string, any>, cwd: string) => Promise<ToolResult>;
-export const DEFAULT_TOOLS: Record<string, ToolHandler> = {
-  read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange ?? a.range, cwd, !!a.raw),
-  write: (a, cwd) => writeTool(a.filePath ?? a.path, a.content ?? "", cwd),
-  edit: (a, cwd) => editTool(a.filePath ?? a.path, a.editBlock ?? a.edit ?? "", cwd),
-  bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined, typeof a.cwd === "string" ? a.cwd : (typeof a.subdir === "string" ? a.subdir : undefined), a.env && typeof a.env === "object" ? a.env : undefined),
-  find: (a, cwd) => findTool(a.globPattern ?? a.pattern, cwd),
-  search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd, !!(a.ignoreCase ?? a.i), { before: a.before, after: a.after, context: a.context, maxMatches: a.maxMatches }),
-  ls: (a, cwd) => lsTool(a.dirPath ?? a.path ?? a.dir ?? ".", cwd),
-};
-export const TOOL_PROTOCOL = [
-  "You have these tools (call exactly ONE per step):",
-  "1. read   {filePath, lineRange?, raw?} — read a file",
-  "2. write  {filePath, content}         — create/overwrite a file",
-  "3. edit   {filePath, editBlock}       — replace/insert lines",
-  "4. bash   {command, timeoutMs?, cwd?, env?} — run a shell command",
-  "5. find   {globPattern}               — find files by name",
-  "6. search {pattern, globPattern?, ignoreCase?, context?, maxMatches?} — grep",
-  "7. ls     {dirPath}                   — list a directory",
-  "8. done   {reason?}                   — call when done",
-  "",
-  "Reply with STRICT JSON only:",
-  '{ "tool": "<name>", "arguments": { ... } }',
-].join("\n");
-export const READONLY_TOOL_PROTOCOL = [
-  "You have these READ-ONLY tools:",
-  "1. read   {filePath, lineRange?}      — read a file",
-  "2. find   {globPattern}               — find files by name",
-  "3. search {pattern, globPattern?, ignoreCase?} — grep",
-  "4. ls     {dirPath}                   — list a directory",
-  "5. done   {reason?}                   — call when complete",
-  "",
-  "Reply with STRICT JSON only:",
-  '{ "tool": "<name>", "arguments": { ... } }',
-].join("\n");
-export function nearestToolName(name: string, known: string[]): string | undefined {
-  const want = name.trim().toLowerCase();
-  if (!want) return undefined;
-  let best: string | undefined;
-  let bestD = Infinity;
-  for (const k of known) {
-    const kl = k.toLowerCase();
-    if (kl === want) return k;
-    const d = kl.startsWith(want) || want.startsWith(kl) ? 1 : 10;
-    if (d < bestD) { bestD = d; best = k; }
-  }
-  return bestD <= 2 ? best : undefined;
-}