@oh-my-pi/pi-coding-agent 15.13.3 → 16.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +40 -0
  2. package/dist/cli.js +506 -443
  3. package/dist/types/advisor/__tests__/advisor.test.d.ts +1 -0
  4. package/dist/types/advisor/advise-tool.d.ts +58 -0
  5. package/dist/types/advisor/index.d.ts +3 -0
  6. package/dist/types/advisor/runtime.d.ts +52 -0
  7. package/dist/types/advisor/watchdog.d.ts +5 -0
  8. package/dist/types/config/model-roles.d.ts +1 -1
  9. package/dist/types/config/settings-schema.d.ts +44 -5
  10. package/dist/types/modes/components/advisor-message.d.ts +9 -0
  11. package/dist/types/modes/components/assistant-message.d.ts +1 -0
  12. package/dist/types/modes/controllers/command-controller.d.ts +3 -1
  13. package/dist/types/modes/interactive-mode.d.ts +3 -1
  14. package/dist/types/modes/types.d.ts +3 -1
  15. package/dist/types/sdk.d.ts +3 -3
  16. package/dist/types/session/agent-session.d.ts +71 -2
  17. package/dist/types/session/session-history-format.d.ts +4 -0
  18. package/dist/types/session/yield-queue.d.ts +2 -0
  19. package/dist/types/tools/path-utils.d.ts +1 -0
  20. package/dist/types/tools/report-tool-issue.d.ts +0 -1
  21. package/package.json +13 -13
  22. package/src/advisor/__tests__/advisor.test.ts +586 -0
  23. package/src/advisor/advise-tool.ts +87 -0
  24. package/src/advisor/index.ts +3 -0
  25. package/src/advisor/runtime.ts +248 -0
  26. package/src/advisor/watchdog.ts +83 -0
  27. package/src/config/model-roles.ts +13 -1
  28. package/src/config/settings-schema.ts +42 -5
  29. package/src/internal-urls/docs-index.generated.ts +6 -5
  30. package/src/main.ts +4 -0
  31. package/src/modes/components/advisor-message.ts +99 -0
  32. package/src/modes/components/agent-hub.ts +7 -0
  33. package/src/modes/components/assistant-message.ts +86 -0
  34. package/src/modes/components/status-line/segments.ts +20 -7
  35. package/src/modes/controllers/command-controller.ts +69 -2
  36. package/src/modes/interactive-mode.ts +12 -2
  37. package/src/modes/types.ts +3 -1
  38. package/src/modes/utils/ui-helpers.ts +9 -0
  39. package/src/prompts/advisor/advise-tool.md +1 -0
  40. package/src/prompts/advisor/system.md +31 -0
  41. package/src/sdk.ts +52 -13
  42. package/src/session/agent-session.ts +560 -13
  43. package/src/session/session-dump-format.ts +15 -131
  44. package/src/session/session-history-format.ts +30 -11
  45. package/src/session/yield-queue.ts +5 -1
  46. package/src/slash-commands/builtin-registry.ts +102 -4
  47. package/src/system-prompt.ts +1 -1
  48. package/src/tools/path-utils.ts +33 -2
  49. package/src/tools/report-tool-issue.ts +2 -7
  50. package/src/web/scrapers/docs-rs.ts +2 -3
@@ -1,10 +1,11 @@
1
1
  // Auto-generated by scripts/generate-docs-index.ts - DO NOT EDIT
2
2
 
3
- export const EMBEDDED_DOC_FILENAMES: readonly string[] = ["ERRATA-GPT5-HARMONY.md","adding-a-provider.md","ai-schema-normalize.md","approval-mode.md","auth-broker-gateway.md","bash-tool-runtime.md","blob-artifact-architecture.md","collab.md","compaction.md","config-usage.md","context-files.md","custom-tools.md","environment-variables.md","extension-loading.md","extensions.md","fs-scan-cache-architecture.md","gemini-manifest-extensions.md","handoff-generation-pipeline.md","hooks.md","install-id.md","keybindings.md","local-models.md","lsp-config.md","macos-signing-notarization.md","marketplace.md","mcp-config.md","mcp-protocol-transports.md","mcp-runtime-lifecycle.md","mcp-server-tool-authoring.md","memory.md","mnemosyne-memory-backend.md","models.md","natives-addon-loader-runtime.md","natives-architecture.md","natives-binding-contract.md","natives-build-release-debugging.md","natives-media-system-utils.md","natives-rust-task-cancellation.md","natives-shell-pty-process.md","natives-text-search-pipeline.md","non-compaction-retry-policy.md","notebook-tool-runtime.md","plugin-manager-installer-plumbing.md","porting-from-pi-mono.md","porting-to-natives.md","provider-streaming-internals.md","providers.md","python-repl.md","render-mermaid.md","resolve-tool-runtime.md","rpc.md","rulebook-matching-pipeline.md","sdk.md","secrets.md","session-operations-export-share-fork-resume.md","session-switching-and-recent-listing.md","session-tree-plan.md","session.md","settings.md","skills.md","skills/authoring-extensions.md","skills/authoring-hooks.md","skills/authoring-marketplaces.md","skills/examples/hello-extension/README.md","skills/examples/mini-marketplace/README.md","skills/examples/safety-hook/README.md","slash-command-internals.md","system-prompt-customization.md","task-agent-discovery.md","theme.md","toolconv/anthropic.md","toolconv/deepseek.md","toolconv/gemini.md","toolconv/gemma.md","toolconv/glm-4.5.md","toolconv/harmony.md","toolconv/kimi-k2.md","toolconv/pi-native.md","toolconv/qwen3.md","tools/ask.md","tools/ast-edit.md","tools/ast-grep.md","tools/bash.md","tools/browser.md","tools/checkpoint.md","tools/debug.md","tools/edit.md","tools/eval.md","tools/find.md","tools/github.md","tools/inspect_image.md","tools/irc.md","tools/job.md","tools/lsp.md","tools/read.md","tools/recall.md","tools/reflect.md","tools/render_mermaid.md","tools/resolve.md","tools/retain.md","tools/rewind.md","tools/search.md","tools/search_tool_bm25.md","tools/ssh.md","tools/task.md","tools/todo.md","tools/web_search.md","tools/write.md","tree.md","ttsr-injection-lifecycle.md","tui-core-renderer.md","tui-runtime-internals.md","tui.md"];
3
+ export const EMBEDDED_DOC_FILENAMES: readonly string[] = ["ERRATA-GPT5-HARMONY.md","adding-a-provider.md","advisor-watchdog.md","ai-schema-normalize.md","approval-mode.md","auth-broker-gateway.md","bash-tool-runtime.md","blob-artifact-architecture.md","collab.md","compaction.md","config-usage.md","context-files.md","custom-tools.md","environment-variables.md","extension-loading.md","extensions.md","fs-scan-cache-architecture.md","gemini-manifest-extensions.md","handoff-generation-pipeline.md","hooks.md","install-id.md","keybindings.md","local-models.md","lsp-config.md","macos-signing-notarization.md","marketplace.md","mcp-config.md","mcp-protocol-transports.md","mcp-runtime-lifecycle.md","mcp-server-tool-authoring.md","memory.md","mnemosyne-memory-backend.md","models.md","natives-addon-loader-runtime.md","natives-architecture.md","natives-binding-contract.md","natives-build-release-debugging.md","natives-media-system-utils.md","natives-rust-task-cancellation.md","natives-shell-pty-process.md","natives-text-search-pipeline.md","non-compaction-retry-policy.md","notebook-tool-runtime.md","plugin-manager-installer-plumbing.md","porting-from-pi-mono.md","porting-to-natives.md","provider-streaming-internals.md","providers.md","python-repl.md","render-mermaid.md","resolve-tool-runtime.md","rpc.md","rulebook-matching-pipeline.md","sdk.md","secrets.md","session-operations-export-share-fork-resume.md","session-switching-and-recent-listing.md","session-tree-plan.md","session.md","settings.md","skills.md","skills/authoring-extensions.md","skills/authoring-hooks.md","skills/authoring-marketplaces.md","skills/examples/hello-extension/README.md","skills/examples/mini-marketplace/README.md","skills/examples/safety-hook/README.md","slash-command-internals.md","system-prompt-customization.md","task-agent-discovery.md","theme.md","toolconv/anthropic.md","toolconv/deepseek.md","toolconv/gemini.md","toolconv/gemma.md","toolconv/glm-4.5.md","toolconv/harmony.md","toolconv/kimi-k2.md","toolconv/pi-native.md","toolconv/qwen3.md","tools/ask.md","tools/ast-edit.md","tools/ast-grep.md","tools/bash.md","tools/browser.md","tools/checkpoint.md","tools/debug.md","tools/edit.md","tools/eval.md","tools/find.md","tools/github.md","tools/inspect_image.md","tools/irc.md","tools/job.md","tools/lsp.md","tools/read.md","tools/recall.md","tools/reflect.md","tools/render_mermaid.md","tools/resolve.md","tools/retain.md","tools/rewind.md","tools/search.md","tools/search_tool_bm25.md","tools/ssh.md","tools/task.md","tools/todo.md","tools/web_search.md","tools/write.md","tree.md","ttsr-injection-lifecycle.md","tui-core-renderer.md","tui-runtime-internals.md","tui.md"];
4
4
 
5
5
  export const EMBEDDED_DOCS: Readonly<Record<string, string>> = {
6
6
  "ERRATA-GPT5-HARMONY.md": "# ERRATA — GPT-5 Harmony-Header Leakage\n\nHistorical research note, not a current runtime contract. The statistics below\ncome from the named local stats database snapshot, not from checked-in tests or\nruntime code.\n\n## 1. The problem\n\nOpenAI frames tool calls in the Harmony chat protocol:\n\n```\n<|start|>assistant<|channel|>commentary to=functions.<NAME><|message|>{ARGS}<|call|>\n```\n\n`<|channel|>commentary to=functions.NAME` is the **routing header** —\ncontrol tokens consumed by the runtime to dispatch the call. These\ntokens never appear as content under normal operation; the runtime\nstrips them.\n\nThe defect: gpt-5 models occasionally emit, **as ordinary content\ninside `{ARGS}`**, the **plain-text shadow** of these routing tokens —\nthe same characters without the `<|…|>` brackets — and continue\nproducing more pseudo-routing structure (channel name, body marker,\nmultilingual spam, fake tool-result framing). The contamination lives\ninside the visible tool argument and is dispatched to the tool as if it\nwere intended content.\n\n**Critical detail.** The actual `<|start|>` / `<|channel|>` /\n`<|message|>` / `<|call|>` special tokens almost never appear in tool\nargs. What leaks is the bracket-less spelling — `analysis to=functions.X\ncode …` — because OpenAI applies a logit mask suppressing the\ncontrol-token IDs inside the args region. The mass that would have gone\nto those special tokens redistributes onto the un-bracketed plain-text\nrepresentation the model also learned. This makes the leak structurally\ninvisible to the routing parser and lands it in the tool input verbatim.\n\nManifestation in tool args (real corpus example):\n\n```\n~ add_function(iso, ctx, ns, \"installSystemChangeObserver\",\n os_install_system_change_observer);】【\"】【analysis to=functions.edit\n code above เงินไทยฟรีuser to=functions.edit code …\n```\n\nThe leading code is real and intended. Everything after the first\nnon-Latin token through the next clean structural boundary is corruption.\n\n---\n\n## 2. Observed statistics & failure modes\n\nSource: `~/.omp/stats.db` (`ss_tool_calls`, `ss_assistant_msgs`), through\n2026-05-10. 1.05M tool calls scanned.\n\n### 2.1 Rate\n\n| Model | Leaks in tool args | Calls | per million |\n| ------------- | -----------------: | ------: | ----------: |\n| gpt-5.4 | 37 | 226,957 | 163 |\n| gpt-5.3-codex | 17 | 112,243 | 151 |\n| gpt-5.5 | 2 | 80,750 | 25 |\n| gpt-5.2-codex | 0 | — | — |\n\nPlus 15 hits in assistant visible text / thinking blobs.\n\n### 2.2 Tool distribution\n\n| Tool | Hits |\n| ------------------------------ | -----: |\n| `edit` | 38 |\n| `eval` | 11 |\n| `report_tool_issue` | 3 |\n| `grep`/`read`/`search`/`yield` | 1 each |\n\nConcentrated in tools with free-form (non-JSON-schema) argument formats.\n\n### 2.3 Leak shape (deterministic)\n\n```\nLEAK ::= JUNK_PREFIX MARKER CHANNEL_BODY (LEAK)?\nMARKER ::= \"to=functions.\" TOOL_NAME\nCHANNEL_BODY ::= \" code \" (SPAM | reasoning_prose | fake_tool_output)*\nJUNK_PREFIX ::= (GLITCH_TOKEN | CHANNEL_WORD | NON_LATIN_RUN | \"}\" | \"】【\")+\n```\n\n**Cascading is common.** Of 96 marker occurrences across 71 contaminated\nrecords, 39 contain ≥2 markers and 7 contain ≥3 — the model emits\nmultiple fake `to=functions.X code …` blocks back-to-back, often with\nfake `code_output\\nCell N:\\n…` framing between them. Once the\nplain-text scaffolding is in the residual stream, the prefix now _looks\nlike_ a fresh tool envelope start, so the macro prior over continuations\nkeeps voting for more scaffolding. Self-amplifying.\n\n### 2.4 Glitch tokens\n\nSingle-token identifiers in `o200k_base` whose embeddings appear to be\nnear-init from underrepresentation in post-training. ASCII residue\nimmediately before the marker in the natural corpus:\n\n| Surface string | Single-token | Token ID | Hits in corpus |\n| ----------------- | :----------: | -------: | ------------------------------: |\n| `Japgolly` | ✅ | 199,745 | 1 |\n| `Jsii` | ✅ | 114,318 | (subtoken of `Jsii_commentary`) |\n| `Jsii_commentary` | — (3 toks) | — | 2 |\n| `changedFiles` | — (2 toks) | — | 8 |\n| `RTLU` | — (2 toks) | — | 3 |\n\n`Japgolly` is in the last 0.13% of the vocabulary — the same family of\nGitHub-corpus residue that produced `SolidGoldMagikarp` in the 2023\nGPT-2 vocabulary (Rumbelow & Watkins). `SolidGoldMagikarp` itself\ntokenizes to 5 tokens in `o200k_base` — that specific token was retired,\nbut the class wasn't.\n\nFor the multi-token entries, the corpus-level signature is the surface\nstring; the underlying glitch trigger is a sub-token (e.g. `Jsii` inside\n`Jsii_commentary`). The detector list (`G` signal) keys on the surface\nstrings.\n\nStable across unrelated sessions. Treated as a high-precision detector\nsignal.\n\n### 2.5 Channel-word leakage\n\n`analysis` (5), `assistant` (5), `commentary` (3), `user` (1) appear\ndirectly preceding `to=`. Always bare words; never `<|channel|>analysis`\nor any other bracketed form. Consistent with §1 — the brackets are\nmasked, the words are not.\n\n### 2.6 Non-Latin spam residue\n\n96 marker hits, by script: CJK 40, Cyrillic 12, Telugu/Kannada/Malayalam\n18, Thai 8, Georgian 7, Armenian 7, Arabic 1. Recurring fragments are\nChinese gambling SEO (`大发时时彩`, `天天中彩票`), Georgian/Abkhaz junk,\nand Thai casino spam — well-known low-quality crawl residue.\n\nThis is the same script distribution observed in the controlled\nreproduction (§7.3), independent of the prompt's natural language.\n\n### 2.7 Failure-mode breakdown for the `edit` tool\n\nThe `edit` tool exists in two variants in the corpus:\n\n| Variant | Calls | Recovery |\n| ------------------------------------ | ----: | ---------------------------------------------------------------------------------------------------------------------------------------------------- |\n| Patch-DSL (`§PATH`/anchor/`«»≔` ops) | 27 | **Recoverable** by op-truncation (§3.3) |\n| JSON-schema (`{path,edits:[…]}`) | 11 | **Not recoverable** — contamination is escaped _inside_ JSON strings, parser accepts it cleanly, content would be written verbatim into source files |\n\nFor Patch-DSL leaks specifically:\n\n- 20/27 cases: contamination on the last input line; nothing follows.\n- 7/27 cases: contamination mid-input; what follows is one of: a\n duplicate replay of an earlier file/anchor, intended content for a\n _different_ tool call (the model started its next call inline), or\n pure hallucination. Post-contamination content is never trustworthy.\n\n### 2.8 Mechanism (confirmed)\n\n**Prior collapse from null-embedding glitch tokens, into a\ncontrol-token-masked basin whose mass redistributes onto the\nplain-text shadow of the Harmony protocol.**\n\nStep by step:\n\n1. The model is mid-`{ARGS}` of a Harmony tool call. The runtime applies\n a logit mask suppressing structural control tokens (`<|channel|>`,\n `<|message|>`, `<|call|>`, `<|start|>`, `<|end|>`) inside the args\n region. Without this mask, normal generation would constantly\n hallucinate envelope-closes; with it, those token IDs have logit\n `-∞` in args.\n2. A glitch token `g` is sampled. By construction `g` was in the BPE\n merge corpus but barely in LM/RL training, so its **input embedding\n `e_g` ≈ near-init noise of small norm**.\n3. At position t+1, the residual update `h_{t+1} ≈ LN(h_t + e_g + Attn +\nMLP)` is dominated by the prefix-derived terms; the just-emitted-token\n signal is effectively absent. Generation diversity normally comes\n from `e_x` steering the residual into different sub-regions —\n stripped here.\n4. The next-token distribution therefore collapses onto the **conditional\n prior over continuations of the prefix, with local conditioning\n removed**. In a tool-calling rollout context, that prior is sharply\n peaked on Harmony scaffolding (control tokens + routing tokens) —\n that's what RL trained.\n5. The mask zeros the control-token IDs. Mass redistributes onto the\n **next-best continuation**: the un-bracketed surface-form spelling of\n the same protocol (`analysis`, `commentary`, ` to=functions.X`,\n `code`). This spelling is unmasked because those characters are\n ordinary tokens.\n6. Once a few tokens of plain-text scaffolding land in the residual\n stream, the prefix now resembles a fresh envelope start. The macro\n prior keeps voting for more scaffolding. Cascading (§2.3) follows.\n7. Multilingual spam after the marker is the same prior-collapse\n continuation, drawn from the training neighborhood of the glitch\n token (often ESL/auto-generated multilingual web junk — exactly the\n crawl residue in §2.6).\n\n**Two corollaries the corpus data demanded but only the experiment\nexplained:**\n\n- **The brackets never appear** (§1, §2.5). The mask is what makes the\n leak land in plain text instead of as a real envelope-close.\n- **Counterintuitive grammar dependency** (§7.4). The leak is _worse_ in\n formats closest to OpenAI's training distribution. Off-distribution\n custom grammars dampen the macro-prior basin; the official\n `*** Begin Patch` format is the strongest collapse target.\n\nThe 2023 SolidGoldMagikarp paper documented mechanism (1)+(2)+(4). The\nnew piece is (5): when constrained decoding masks the natural collapse\ntarget, the mass laundered through the un-masked plain-text shadow\nbecomes a structurally-invisible exfiltration channel.\n",
7
7
  "adding-a-provider.md": "# Adding a provider\n\nA provider is described in two halves:\n\n- **Catalog half** (`packages/catalog`): one entry in the `CATALOG_PROVIDERS`\n table (`packages/catalog/src/provider-models/descriptors.ts`) carrying the\n `id`, `defaultModel`, runtime model-discovery factory, and catalog-generation\n wiring. `KnownProvider`, `PROVIDER_DESCRIPTORS`, and\n `DEFAULT_MODEL_PER_PROVIDER` are derived from this table.\n- **Auth half** (`packages/ai`): one declarative `ProviderDefinition` in the\n registry carrying env-key fallbacks and login/refresh flows. The\n `OAuthProvider` union, the env-key map, the `/login` provider list, the\n `refreshOAuthToken` / `AuthStorage.login` dispatch, and the coding-agent\n callback maps are derived from the registry.\n\n**Scope.** This is for a provider that reuses an existing wire API\n(`openai-completions`, `anthropic-messages`, `google-generative-ai`, …) — the\ncommon case for gateways and API-key providers, since stream dispatch keys on\n`model.api`, not `model.provider`. Adding a *new wire protocol* (a new\n`KnownApi`) is a separate task that also touches `stream.ts` dispatch,\n`api-registry.ts`, and the catalog `types.ts`.\n\n## Shape\n\nFor the common case, a provider is **one catalog entry + one def file + one registry line**:\n\n1. **Add an entry to `CATALOG_PROVIDERS`** in\n `packages/catalog/src/provider-models/descriptors.ts` with the `id`,\n `defaultModel`, the plain API-key env var(s) as `envVars`, and (usually) a\n `createModelManagerOptions` factory. For a\n simple OpenAI-compatible gateway, build the factory in\n `packages/catalog/src/provider-models/openai-compat.ts` or inline with the\n exported `createSimpleOpenAICompletionsOptions(providerId, baseUrl, config)`.\n2. **Create `packages/ai/src/registry/<id>.ts`** exporting one\n `export const <camelId>Provider = { … } as const satisfies ProviderDefinition;`\n with the auth fields (`login`, …). Plain env-var names live in the catalog\n entry's `envVars`; set `envKeys` only for computed resolvers (Foundry/ADC/\n Bedrock-style probes).\n3. **Add it to the `ALL` array** in `packages/ai/src/registry/registry.ts`\n (one import + one array entry). `ALL` order is the `/login` list order for\n loginable providers.\n\nThat is the full change for:\n- env-key-only providers,\n- providers with a simple inline API-key login flow,\n- most OpenAI-compatible gateways.\n\nFor a **non-trivial provider-local OAuth flow**, put the implementation in\n`packages/ai/src/registry/oauth/<vendor>.ts` and lazy-import it from the def\nfile. The shared OAuth flow infrastructure it builds on lives in the same\n`registry/oauth/` directory.\n\nDescriptors, the default-model map, env-key map, login list, and refresh\ndispatch all update automatically; the `KnownProvider` union gains the new id\nfrom the catalog table and `OAuthProvider` from the registry.\n\n## Field reference\n\n**Catalog table entry** (`ProviderCatalogEntry`, see\n`packages/catalog/src/provider-models/descriptor-types.ts` for JSDoc):\n\n| Field | Effect |\n|---|---|\n| `id` | Required. Member of `KnownProvider`. |\n| `defaultModel` | Required. Preferred model when no explicit selection is made. |\n| `envVars` | Env var name(s), in order, for the runtime API-key fallback (`getEnvApiKey`). |\n| `createModelManagerOptions` | Runtime model-discovery factory. Present (and not `specialModelManager`) ⇒ appears in `PROVIDER_DESCRIPTORS`. |\n| `allowUnauthenticated` | Runtime creates a model manager even without a key. |\n| `dynamicModelsAuthoritative` | Successful discovery replaces bundled models. |\n| `catalogDiscovery` | `{ label, envVars?, oauthProvider?, allowUnauthenticated? }` for offline catalog generation (`generate-models.ts`). `envVars` here overrides the entry-level list when generation uses different credentials (e.g. `cursor`). |\n| `specialModelManager` | Bespoke runtime factory (`google-antigravity` / `google-gemini-cli` / `openai-codex`); excluded from `PROVIDER_DESCRIPTORS`. |\n\n**Registry definition** (`ProviderDefinition`, see\n`packages/ai/src/registry/types.ts`):\n\n| Field | Effect |\n|---|---|\n| `id`, `name` | Required. `name` shows in the `/login` list. |\n| `envKeys` | Computed env fallback for `getEnvApiKey`, overriding the catalog entry's `envVars`: a var name string or a `() => string \\| undefined` resolver. Omit when `envVars` covers it. |\n| `login` | Interactive login. Present ⇒ member of `OAuthProvider`, shown in `/login`, dispatchable via `AuthStorage.login`. Returns an api-key `string` or `OAuthCredentials`. |\n| `refreshToken` | OAuth refresher; omit for static-token providers (the dispatch returns credentials unchanged). |\n| `storeCredentialsAs` | Store credentials under a different provider id (e.g. `openai-codex-device` ⇒ `openai-codex`). |\n| `callbackPort` | Present ⇒ entry in the auth-broker `CALLBACK_PORTS` map. |\n| `pasteCodeFlow` | OAuth flow needs a pasted code/redirect URL ⇒ member of `PASTE_CODE_LOGIN_PROVIDERS`. |\n\n## Conventions\n\n- Use `... as const satisfies ProviderDefinition` so the literal `id` is preserved\n for the union derivation.\n- `login` / `refreshToken` for simple API-key or validation-based flows can live\n directly in the provider def file (export the named login function there so\n tests can import it directly).\n- `login` / `refreshToken` for heavy provider-local OAuth flows MUST reach the\n adjacent `registry/oauth/*` module via a dynamic-import\n thunk (`const { loginX } = await import(\"./oauth/x\"); return loginX(cb);`),\n keeping those flows out of the eager startup graph.\n- All OAuth code lives under `registry/oauth/`: the shared flow infra\n (`callback-server`, `pkce`, `google-oauth-shared`, `types`, the runtime API\n `index`) plus every provider flow, including the `github-copilot` / `kimi` /\n `openai-codex` helpers reused by the streaming and usage layers. The non-OAuth\n API-key helpers (`api-key-login`, `api-key-validation`) sit beside the def\n files in `registry/`, since they back simple paste-an-API-key logins.\n- For a simple OpenAI-compatible gateway, build the manager inline with the\n exported `createSimpleOpenAICompletionsOptions(providerId, baseUrl, config)` —\n no edits to `openai-compat.ts` required.\n- A `ProviderDefinition` may also be registered at runtime by an extension via\n `registerOAuthProvider` (the `AuthStorage.login` dispatcher handles built-ins\n and extensions through the same path).\n",
8
+ "advisor-watchdog.md": "# Advisor and WATCHDOG.md\n\nThe advisor is an optional second model attached to a session. It reviews the primary agent's transcript after each turn, can inspect the workspace with read-only tools, and injects concise advice back into the primary session.\n\nThe advisor is not a second executor. It cannot edit files, run commands, approve actions, or change session state directly.\n\n## Implementation files\n\n- [`src/advisor/runtime.ts`](../packages/coding-agent/src/advisor/runtime.ts)\n- [`src/advisor/advise-tool.ts`](../packages/coding-agent/src/advisor/advise-tool.ts)\n- [`src/advisor/watchdog.ts`](../packages/coding-agent/src/advisor/watchdog.ts)\n- [`src/prompts/advisor/system.md`](../packages/coding-agent/src/prompts/advisor/system.md)\n- [`src/prompts/advisor/advise-tool.md`](../packages/coding-agent/src/prompts/advisor/advise-tool.md)\n- [`src/session/agent-session.ts`](../packages/coding-agent/src/session/agent-session.ts)\n- [`src/slash-commands/builtin-registry.ts`](../packages/coding-agent/src/slash-commands/builtin-registry.ts)\n- [`src/config/settings-schema.ts`](../packages/coding-agent/src/config/settings-schema.ts)\n\n---\n\n## Enabling the advisor\n\nThe advisor requires both:\n\n1. `advisor.enabled: true`\n2. a model assigned to the `advisor` model role\n\nExample:\n\n```yaml\nmodelRoles:\n advisor: anthropic/claude-sonnet-4-5:medium\n\nadvisor:\n enabled: true\n```\n\nThe advisor role uses normal model-role resolution, including provider-prefixed ids, canonical ids, and optional thinking suffixes.\n\nSlash commands:\n\n| Command | Effect |\n|---|---|\n| `/advisor` | Toggle the persisted `advisor.enabled` setting. |\n| `/advisor on` | Enable the setting and start the runtime when an advisor model is assigned. |\n| `/advisor off` | Disable the setting and stop the runtime. |\n| `/advisor status` | Show active model, context usage, token usage, and cost. |\n| `/advisor dump` | Print the advisor's compact transcript. |\n| `/advisor dump raw` | Print the advisor's full dump with system prompt, tools, thinking, and calls. |\n\nIf `advisor.enabled` is true but no `modelRoles.advisor` value resolves to an available model, status reports that the setting is enabled but no advisor model is assigned.\n\n## What the advisor sees\n\nAt each primary turn end, `AdvisorRuntime` receives only the new transcript delta since the last advisor update. Deltas are rendered with `formatSessionHistoryMarkdown(..., { includeThinking: true })`, so the advisor can review assistant reasoning as well as user-visible text, tool calls, and tool results.\n\nAdvisor messages already injected into the primary transcript are filtered out before the next delta is rendered. This prevents the advisor from recursively reviewing its own advice.\n\nWhen the primary transcript is rewritten, the advisor runtime is reset:\n\n- compaction\n- session switch/resume\n- branch/fork style history replacement\n- context-maintenance re-prime when the advisor's own context cannot fit\n\nReset clears the advisor's private in-memory transcript and rewinds its cursor. The next advisor update replays the current bounded primary transcript instead of continuing from stale pre-rewrite context.\n\nWhen the advisor is enabled mid-session, the cursor seeds to the current primary transcript length. That avoids replaying the whole old conversation on the first enabled turn.\n\n## Tools and isolation\n\nThe advisor receives a hard-isolated read-only tool set:\n\n- `read`\n- `search`\n- `find`\n- `advise`\n\nThe read/search/find tools are built against a distinct `ToolSession` whose session id is suffixed with `-advisor`. The advisor therefore does not share the primary agent's file snapshots, seen-lines tracking, conflict state, summary cache, or edit/yield capabilities.\n\nThe `advise` tool accepts one note and an optional severity:\n\n| Severity | Delivery | Intended use |\n|---|---|---|\n| omitted / `nit` | Non-interrupting aside, batched into the primary transcript at the next step boundary. | Cleanup, simplification, low-risk edge cases. |\n| `concern` | Interrupting steering message. | Material risk, likely wrong direction, missing constraint, hallucinated API. |\n| `blocker` | Interrupting steering message. | Continuing would clearly waste work or produce broken output. |\n\nInterrupting advice is sent through the steering channel and can abort in-flight tools at the next steering boundary. Non-interrupting notes are batched into one custom `advisor` transcript card with this prefix:\n\n```text\nAdvisor (a senior reviewer watching your work — weigh it, don't blindly obey):\n```\n\n## Bounded catch-up with `advisor.syncBacklog`\n\n`advisor.syncBacklog` is not lockstep turn execution. It is a bounded catch-up delay for the primary agent when the advisor falls behind.\n\nAllowed values:\n\n- `off` — never wait for advisor catch-up\n- `1`\n- `3`\n- `5`\n\nOn primary turn end:\n\n1. the primary turn delta is queued for the advisor\n2. the advisor drain loop starts or continues in the background\n3. if `advisor.syncBacklog` is not `off`, the primary agent waits only while advisor backlog is at or above the configured threshold\n4. the wait is capped at 30 seconds\n5. if the advisor catches up below the threshold, the primary continues immediately\n6. if the cap expires, the primary continues anyway\n\nPractical interpretation:\n\n- `off` favors maximum primary throughput.\n- `1` is the closest mode to synchronous review: after each queued advisor delta, the primary waits up to 30 seconds for backlog to return to zero.\n- `3` and `5` allow more advisor lag before the primary pauses.\n\nAdvisor failures do not permanently stall the primary. A failed advisor prompt is retried; after three consecutive advisor failures, the runtime logs a warning, drops the backlog, and lets the session continue.\n\n## WATCHDOG.md\n\n`WATCHDOG.md` is advisor-only guidance. It is appended to the advisor system prompt; it is not injected into the primary agent's normal context and does not behave like `AGENTS.md`, `RULES.md`, or other context files.\n\nUse it for review priorities: risks the advisor should watch for, project-specific traps, dangerous APIs, architectural boundaries, and quality bars that are useful to a reviewer but too noisy for the main executor.\n\nExample:\n\n```markdown\n# Watchdog notes\n\nEspecially watch for:\n\n- Changes that bypass the durable queue in `src/jobs/`.\n- UI renderer paths that display unsanitized tool output.\n- New worker spawns that do not re-enter the CLI host.\n```\n\n### Discovery locations\n\n`discoverWatchdogFiles(cwd, agentDir)` loads every readable candidate from these locations:\n\n1. user level: `<active agent dir>/WATCHDOG.md` (`~/.omp/agent/WATCHDOG.md` by default; relocated by `PI_CODING_AGENT_DIR`)\n2. project levels while walking from `cwd` upward to the git repository root, or to the home directory when no repo root is found:\n - `<dir>/WATCHDOG.md`\n - `<dir>/.omp/WATCHDOG.md`\n\nUnlike native context files, watchdog discovery does not stop at the nearest project file. Multiple project watchdog files can load together.\n\nCandidates in hidden owner directories are ignored unless the file is inside an `.omp` directory. This keeps unrelated dot-directory conventions from being picked up accidentally while still allowing `.omp/WATCHDOG.md`.\n\n### `@` imports\n\n`WATCHDOG.md` content is expanded with the same `@` import helper used by context files:\n\n- relative imports resolve from the importing file's directory\n- `~/` resolves from the user's home directory\n- imports inside fenced code blocks and inline code spans stay literal\n- cycles are skipped\n- missing or unreadable imports leave the original `@path` text in place\n\n### Prompt order\n\nLoaded watchdog blocks are sorted as:\n\n1. user-level `WATCHDOG.md`\n2. project-level files from farther ancestors down toward `cwd`\n\nEach file is appended to the advisor system prompt as:\n\n```xml\nEspecially pay attention to:\n<attention>\n...expanded watchdog content...\n</attention>\n```\n\nLater project files sit closer to the end of the advisor prompt, so narrower directory guidance is more prominent than broad ancestor guidance.\n\n## Subagents\n\n`advisor.subagents` controls whether spawned task/eval subagents also get an advisor runtime.\n\n- `false` (default): only the main session can run an advisor.\n- `true`: eligible subagent sessions build their own advisor with the same settings/model-role resolution, then rerun `WATCHDOG.md` discovery for that subagent session's `cwd` and agent directory.\n\nSubagent advisors remain isolated from the subagent's primary tool session in the same way the main advisor is isolated from the main agent.\n\n## Cost and context behavior\n\nAdvisor usage is separate model usage. `/advisor status` reports advisor token counts and cost from the advisor agent's own transcript.\n\nThe advisor has its own append-only context. Before each advisor prompt, `AgentSession` estimates incoming tokens and may maintain advisor context:\n\n1. try model-level context promotion when enabled and a larger compatible model is available\n2. if promotion cannot fit enough context, compact the advisor's own message history\n3. if compaction has no candidates or still cannot fit, re-prime from the current bounded primary transcript\n\nThe advisor transcript is in-memory for the session. It is retained while the session runs so `/advisor dump` can inspect it, but advisor state is not a replacement for the primary persisted transcript.\n",
8
9
  "ai-schema-normalize.md": "# AI tool-schema normalization\n\n`@oh-my-pi/pi-ai` exposes one unified schema normalizer that providers consume\nbefore tools are sent on the wire. All walkers live in\n`packages/ai/src/utils/schema/normalize.ts`; the operational contract is\n`packages/ai/src/utils/schema/CONSTRAINTS.md`.\n\nThere is no separate `strict-mode.ts` module any more — OpenAI strict-mode\nsanitization, OpenAI Responses `oneOf` rewriting, Google/Vertex/Gemini-CLI\nsanitization, Cloud Code Assist Claude sanitization, and MCP sanitization all\nshare the same option-driven walk.\n\n## Entry points\n\nAll exports live under `@oh-my-pi/pi-ai/utils/schema`:\n\n- `normalizeSchema(value, options)` — generic option-driven walker.\n- `normalizeSchemaForGoogle(value)` — Gemini / Vertex / Gemini CLI.\n- `normalizeSchemaForCCA(value)` — Cloud Code Assist Claude (Antigravity + GCA).\n- `normalizeSchemaForMCP(value)` — MCP inputSchemas before they enter the\n custom-tool registry. `tool-bridge.ts` runs every MCP `inputSchema` through\n this dispatcher.\n- `normalizeSchemaForOpenAIResponses(schema)` (alias\n `sanitizeSchemaForOpenAIResponses`) — rewrites `oneOf` → `anyOf` for the\n Responses family.\n- `sanitizeSchemaForStrictMode(schema)` and\n `enforceStrictSchema(schema)` / `tryEnforceStrictSchema(schema)` — the\n OpenAI strict-mode pipeline (sanitize → enforce). All three are exported\n from `normalize.ts`.\n- `adaptSchemaForStrict(schema, strict)` from `./adapt` — thin composer that\n upgrades draft-07 inputs to 2020-12 and wraps `tryEnforceStrictSchema` for\n provider call sites. `./adapt` also exports the `NO_STRICT` global-bypass\n flag (env `PI_NO_STRICT`) honored by every provider that emits `strict: true`.\n\nRemoved in the unified-flow refactor:\n\n- `strict-mode.ts` (merged into `normalize.ts`).\n- `sanitize-google.ts` and `normalize-cca.ts` (replaced by\n `normalizeSchemaFor*` dispatchers).\n- `StringEnum` helper — use `z.enum([...])` directly; Zod's emitted JSON\n Schema is already wire-compatible with Google and other providers.\n- `sanitizeSchemaFor{Google,CCA,MCP}` / `prepareSchemaForCCA` — renamed to\n `normalizeSchemaFor{Google,CCA,MCP}`.\n\n## Dispatcher mapping\n\n| Provider transport(s) | Dispatcher |\n| ------------------------------------------------------------------ | ------------------------------------------- |\n| `openai-completions`, `openai-responses`, `openai-codex-responses` | `adaptSchemaForStrict` (sanitize + enforce) |\n| `openai-responses` family (`oneOf` → `anyOf` only) | `normalizeSchemaForOpenAIResponses` |\n| `google-generative-ai`, `google-vertex`, Gemini CLI | `normalizeSchemaForGoogle` |\n| Cloud Code Assist Claude (Antigravity + GCA, `claude-*` model ids) | `normalizeSchemaForCCA` |\n| MCP `inputSchema` ingestion | `normalizeSchemaForMCP` |\n| `anthropic-messages` (native, not CCA) | per-provider whitelist in `anthropic.ts` |\n\nGemini CLI / Antigravity CCA MUST run the full `normalizeSchemaForCCA`\npipeline (not just the first keyword-stripping pass) to keep parity with the\nshared Google Claude path.\n\n## Walk semantics\n\n`normalizeSchema` first detoxifies serialized Zod-instance-shaped inputs, upgrades them to\nJSON Schema 2020-12, dereferences the tree, then walks it with the option set\npinned by the dispatcher. Each node:\n\n1. Renames `snake_case` combinator/property keys to camelCase\n (`any_of` → `anyOf`, etc.; collisions follow python-genai\n `pop(from)`/`set(to)` semantics — snake_case wins).\n2. Applies the `handle_null_fields` collapse for nullable unions before\n recursing into children.\n3. Strips keys the target provider does not support, optionally lifting\n human-meaningful keys (`pattern`, `format`, min/max, `default`,\n `examples`, ...) into the sibling `description` via the spill formatter\n (`spill.ts`). Structural/meta keys (`$ref`, `$defs`,\n `additionalProperties`) are not spilled.\n4. Normalizes type unions (`type: [\"T\", \"null\"]` → `type: \"T\"` + nullable\n marker on Google, plain `type: \"T\"` on CCA).\n5. Collapses object-only / same-type combiners, optionally lossy-collapses\n mixed-type combiners (CCA only), and runs the residual-combiner fixpoint.\n6. Validates against AJV 2020 when `validateAndFallback` is set (CCA path)\n and emits the per-tool fallback `{ \"type\": \"object\", \"properties\": {} }`\n on residual incompatibility — `type` array, `type: \"null\"`, `nullable`\n key, or any remaining `anyOf`/`oneOf`/`allOf`.\n\n## OpenAI strict-mode pipeline\n\n`adaptSchemaForStrict(schema, strict)` runs `tryEnforceStrictSchema`,\nwhich composes:\n\n1. **Sanitize** (`sanitizeSchemaForStrictMode`): strips non-structural\n keywords (`format`, `pattern`, min/max, `examples`, `default`,\n `if`/`then`/`else`, `not`, `unevaluated*`, `patternProperties`,\n `dependent*`, `content*`, `min/maxProperties`, `$dynamicRef`, etc.). The\n `default` value is inlined into the sibling `description` as\n ` (default: X)` before being dropped, unless `description` already\n contains `(default:` or no `description` exists.\n2. **Enforce** (`enforceStrictSchema`): every object node gets\n `additionalProperties: false`, every property goes into `required`, and\n optional properties become nullable unions\n (`anyOf: [<original>, { \"type\": \"null\" }]`). Tuple `prefixItems` are\n strictified recursively.\n\nThe two passes use cache/cycle guards, so refs, `allOf`, and nullable wrapping\nstay deterministic without recursing forever. `tryEnforceStrictSchema` is\nfail-open: if anything throws, it returns `{ strict: false, schema: upgraded }`\nso callers MUST emit `strict: true` only when enforcement actually succeeded.\n\n### Edge cases the strict-mode normalizer handles\n\n- **Local `$ref` inlining.** OpenAI strict mode rejects\n `{ \"$ref\": \"...\", \"description\": \"...\" }` with sibling keys. The\n sanitizer pre-resolves local `#/...` refs against the root and merges\n with **sibling keys winning** over the resolved def — same precedence\n as `openai-python`'s `_ensure_strict_json_schema`. Recursive refs are\n guarded by the per-walk epoch.\n- **Single-item `allOf`.** A `{ \"allOf\": [X], ...siblings }` collapses to\n `{ ...X, ...siblings }` with the inlined entry's keys winning over the\n original siblings (matches `openai-python`'s `_pydantic.py:79-83`). Multi-\n item `allOf` is left intact for the downstream validator to reject if\n needed.\n- **Type-array branches and nullable unions.** When a node has\n `type: [\"T\", \"U\"]`, the sanitizer emits one variant schema per type,\n pruning type-specific keywords (e.g. `properties`/`required` only stay on\n the `object` variant, `items` only on the `array` variant). The shared\n `description` is **hoisted onto the `anyOf` wrapper** instead of being\n duplicated on every branch — so a strict nullable union becomes\n `{ anyOf: [T, { type: \"null\" }], description: \"...\" }`, not\n `anyOf: [{ ..., description }, { ..., description }]`.\n- **Enum/const without a `type`.** Both sanitize and enforce paths call\n `inferStrictPrimitiveTypeFromEnumOrConst` to infer the primitive `type`\n from `enum` / `const` values. Mixed-primitive enums (`[1, \"two\", null]`),\n enums containing objects/arrays, and non-primitive `const` values\n (`{a:1}`, `[1,2,3]`) cannot be described by a single `type` keyword and\n trigger the strict-mode fail-open path — emitting a typeless schema\n would just be rejected on the wire by OpenAI.\n\n## Performance: static fingerprint cache\n\n`resolveProviderModels` in `packages/catalog/src/model-manager.ts` and\n`readModelCache`/`writeModelCache` in `packages/catalog/src/model-cache.ts`\ncooperate via a `static_fingerprint` column on the `model_cache` SQLite\ntable (current cache schema version 5).\n\n- `fingerprintStatic(staticModels)` hashes the static catalog slice\n (`Bun.hash(JSON.stringify(models))` in base36) and memoizes the result\n by tagging the array with a symbol property. Multiple cold-start arms\n calling `resolveProviderModels` with the same `staticModels` array pay\n the JSON+hash cost once.\n- On cache read, if the network fetch is being skipped, the cached row is\n fresh + authoritative, and the cached `static_fingerprint` matches the\n current one, `resolveProviderModels` returns the cached models verbatim\n — the cache already incorporates the same static state, so re-running\n `mergeDynamicModels(static, cache)` would just rebuild the same objects.\n- `mergeModelSources` and `mergeDynamicModels` short-circuit on\n empty-source inputs (the common shape after `(static, [])` or for\n providers without a static catalog), avoiding Map churn entirely.\n\nCache rows written before the current schema version are dropped by the\ncache-version check; the column defaults to `''` for any row that survives\na version upgrade so the fingerprint-equality check naturally fails closed\nand the full merge re-runs.\n\n## Related\n\n- `docs/models.md` — registry, equivalence, compat flags\n (`supportsStrictMode`, `toolStrictMode`, `disableStrictTools`).\n- `docs/provider-streaming-internals.md` — how the normalized schemas are\n used downstream during the provider stream loop.\n- `docs/mcp-server-tool-authoring.md` — MCP `inputSchema` ingestion via\n `normalizeSchemaForMCP`.\n- `packages/ai/src/utils/schema/CONSTRAINTS.md` — operational contract for\n every normalization rule.\n",
9
10
  "approval-mode.md": "# Tool approval mode\n\nTool approval has two independent inputs:\n\n1. **Tool declaration** — every tool may declare an `approval` tier:\n - `read`: reads data or updates UI-only session metadata.\n - `write`: mutates workspace/session state but does not execute arbitrary code.\n - `exec`: executes code, shells out, drives a browser, spawns agents, or performs similarly broad actions.\n2. **User policy** — `tools.approval.<toolName>: allow | deny | prompt` overrides the mode for that tool unless a non-yolo safety override forces a prompt.\n\nTools without an `approval` declaration are treated as `exec`. This is the safe default for MCP and unknown custom tools.\n\n## Modes\n\nConfigure with `tools.approvalMode`:\n\n| Mode | Auto-approves | Prompts for |\n| ---------------- | ----------------------- | --------------- |\n| `always-ask` | `read` | `write`, `exec` |\n| `write` | `read`, `write` | `exec` |\n| `yolo` (default) | `read`, `write`, `exec` | none |\n\n`--auto-approve` and `--yolo` force `tools.approvalMode: yolo` for the session.\n\n## User overrides\n\n`tools.approval` is honored in every mode:\n\n```yaml\ntools:\n approvalMode: write\n approval:\n bash: prompt\n read: allow\n mcp__filesystem__delete: deny\n```\n\nResolution per tool call:\n\n1. Compute the tool's approval decision from `tool.approval(args)`; omitted means `exec`.\n2. Normalize `tools.approval.<tool>` if present; invalid values are ignored.\n3. In `yolo` mode, the user policy is used when present; otherwise the call is allowed. Safety `override` reasons do not force a prompt in `yolo`.\n4. In non-yolo modes, if the tool sets `override: true`, `deny` is blocked and all other cases prompt, even if user policy says `allow`.\n5. Otherwise, a valid user policy wins.\n6. Otherwise, the active mode auto-approves or prompts by tier.\n\n## Safety overrides\n\nA tool can force a prompt with object-form approval:\n\n```ts\napproval: { tier: \"exec\", override: true, reason: \"Critical pattern detected\" }\n```\n\n`bash` uses this for critical destructive patterns such as `rm -rf /`, fork bombs, remote-fetch-then-execute, writes to `/etc/passwd`, and host shutdown commands. These surface as `reason` in the approval prompt, but in `yolo` mode they are auto-approved unless a user policy for the tool is set to `prompt` or `deny`.\n\n## Per-tool prompt details\n\nTools can add approval-prompt body lines with `formatApprovalDetails(args)`. The standard prompt includes:\n\n- `Allow tool: <name>`\n- `Origin: MCP server tool` for unannotated `mcp__...` tools\n- `Reason: <reason>` when the tool decision supplies one\n- tool-specific details such as command, path, code, browser action, or subagent assignment\n\n## Defining approval on tools\n\nBuilt-in and custom tools share the same shape:\n\n```ts\nexport type ToolTier = \"read\" | \"write\" | \"exec\";\nexport type ToolApprovalDecision = ToolTier | { tier: ToolTier; reason?: string; override?: boolean };\nexport type ToolApproval = ToolApprovalDecision | ((args: unknown) => ToolApprovalDecision);\n\napproval?: ToolApproval;\nformatApprovalDetails?: (args: unknown) => string | string[] | undefined;\n```\n\nExamples:\n\n```ts\napproval: \"read\";\n\napproval: (args) => (LSP_READONLY_ACTIONS.has(args.action) ? \"read\" : \"write\");\n\napproval: (args) =>\n isCritical(args.command)\n ? { tier: \"exec\", override: true, reason: \"Critical pattern detected\" }\n : \"exec\";\n```\n\n## Subagents\n\nSubagents run headless with `tools.approvalMode: yolo` so they do not stall waiting for UI. The parent `task` approval is the authorization boundary. User `tools.approval.<tool>` settings continue to control whether a tool is allowed, prompted, or blocked.\n",
10
11
  "auth-broker-gateway.md": "# Auth Broker and Auth Gateway\n\nThe auth broker and auth gateway are two cooperating HTTP services that move OAuth refresh tokens and provider access tokens off developer laptops and into a single broker host.\n\n- **`omp auth-broker serve`** holds the canonical SQLite credential vault, performs OAuth refreshes, and exposes a small REST API (`/v1/snapshot`, `/v1/snapshot/stream`, `/v1/credential/:id/refresh`, `/v1/credential/:id/disable`, `/v1/credential`, `/v1/usage`, `/v1/healthz`).\n- **`omp auth-gateway serve`** is a forward-proxy. It accepts OpenAI Chat Completions, Anthropic Messages, OpenAI Responses, and pi-native stream requests, resolves the broker-backed credential, and dispatches through `pi-ai` provider logic. Clients (containerised omp, llm-git, the macOS usage widget, …) never see the access token.\n\nTransport security between operator, broker, and gateway is delegated to the operator (Tailscale / Wireguard / reverse proxy + TLS). Every endpoint except `/v1/healthz` (broker) and `/healthz` (gateway) requires a bearer token.\n\nSource: `packages/ai/src/auth-broker/`, `packages/ai/src/auth-gateway/`, `packages/coding-agent/src/cli/auth-broker-cli.ts`, `packages/coding-agent/src/cli/auth-gateway-cli.ts`, `packages/coding-agent/src/session/auth-broker-config.ts`.\n\n## Data flow\n\n```\n ┌────────────────────────────────────────────────────────────┐\n │ broker host │\n │ │\n developer ──▶ │ ┌──────────────────────────┐ ┌────────────────────┐ │\n laptop / │ │ omp auth-broker serve │◀──▶│ SQLite agent.db │ │\n CI / robomp │ │ - holds refresh tokens │ │ (canonical writer)│ │\n │ │ - background refresher │ └────────────────────┘ │\n │ │ /v1/{snapshot,refresh,…}│ │\n │ └─────────┬────────────────┘ │\n │ │ bearer ($CONFIG_DIR/auth-broker.token) │\n │ ▼ │\n │ ┌──────────────────────────┐ │\n │ │ omp auth-gateway serve │ RemoteAuthCredentialStore │\n │ │ /v1/{chat,messages,…} │ receives snapshot stream, │\n │ │ /v1/usage,/v1/models │ refreshes credentials by id │\n │ │ /v1/credentials/check │ via the broker on expiry │\n │ └─────────┬────────────────┘ │\n └────────────┼───────────────────────────────────────────────┘\n │ bearer ($CONFIG_DIR/auth-gateway.token)\n ▼\n gateway clients\n (llm-git, macOS widget, robomp containers, IDE plugins, …)\n │\n ▼ provider request with broker-resolved credential\n api.anthropic.com / api.openai.com / …\n```\n\nThe broker is the only writer of OAuth refresh tokens. Clients (including the gateway itself) load a redacted snapshot in which every `refresh` field has been replaced with `REMOTE_REFRESH_SENTINEL`; when an access token expires the client calls `POST /v1/credential/:id/refresh` and the broker performs the refresh server-side. `RemoteAuthCredentialStore` rejects local replace/upsert/delete-by-provider mutations, with errors pointing at `omp auth-broker login` / `omp auth-broker logout`.\n\n## auth-broker\n\n### CLI\n\n```\nomp auth-broker serve [--bind=host:port] # boot the broker\nomp auth-broker token [--regenerate] [--json] # print or rotate the bearer token\nomp auth-broker login [<provider>] [--via=user@host] [--dry-run]\nomp auth-broker logout [<provider>]\nomp auth-broker list [--json]\nomp auth-broker import <file|dir> [--provider=<id>] [--include-disabled] [--dry-run] [--json]\nomp auth-broker migrate --from-local [--include-oauth] [--include-env] [--dry-run] [--json]\nomp auth-broker status [--json]\n```\n\n- `serve` opens the local SQLite store at `getAgentDbPath()` and binds an HTTP listener (default `127.0.0.1:8765`). On startup a token is ensured at `<config-dir>/auth-broker.token` (mode `0600`, `0700` parent dir). The background refresher refreshes any OAuth credential whose `expires - Date.now() < refreshSkewMs` (default 5 min) every `refreshIntervalMs` (default 60 s).\n- `token` prints the cached bearer or generates a new one. `--regenerate` rotates it.\n- `login [<provider>]` runs the per-provider OAuth flow locally — when no provider is supplied, it falls back to an interactive numbered picker. With `--via=user@host` it shells out `ssh -L <callback-port>:127.0.0.1:<callback-port> user@host omp auth-broker login <provider>` so the OAuth callback hits the local browser but the credential is written on the broker host (`--via` requires `<provider>`). Built-in callback ports: `anthropic:54545`, `openai-codex:1455`, `google-gemini-cli:8085`, `google-antigravity:51121`, `gitlab-duo:8080`. The OAuth dance is driven in-process via `AuthStorage.login()` — there is no longer a `pi-ai` bin to spawn.\n- `logout [<provider>]` deletes every credential row for `<provider>`. With no argument it shows an interactive numbered picker of currently-stored providers.\n- `list` enumerates every registered OAuth provider id/name (the union of built-ins + `registerOAuthProvider` custom providers). `--json` emits a machine-readable array.\n- `import <file|dir>` imports CLIProxyAPI-style JSON credentials into the local SQLite store. Maps `type` field → omp provider (`claude → anthropic`, `codex → openai-codex`, `gemini → google-gemini-cli`, `antigravity → google-antigravity`, `gemini-cli → google-gemini-cli`).\n- `migrate --from-local` uploads local SQLite credentials to the configured broker (`POST /v1/credential`). Local API keys are included by default; local OAuth rows are skipped unless `--include-oauth` is set; environment-derived API keys are skipped unless `--include-env` is set. Re-runs are idempotent against the broker snapshot.\n- `status` health-pings the configured remote broker.\n\n### Endpoints\n\n| Method | Path | Auth | Purpose |\n| ------ | ---------------------------- | ------ | ------------------------------------------------------- |\n| `GET` | `/v1/healthz` | none | Liveness + version |\n| `GET` | `/v1/snapshot` | bearer | Redacted snapshot (refresh tokens replaced by sentinel) |\n| `GET` | `/v1/snapshot/stream` | bearer | SSE snapshot stream with delta events and keepalives |\n| `POST` | `/v1/credential` | bearer | Upsert one OAuth or API-key credential |\n| `POST` | `/v1/credential/:id/refresh` | bearer | Force-refresh one OAuth credential |\n| `POST` | `/v1/credential/:id/disable` | bearer | Disable one credential with a recorded cause |\n| `GET` | `/v1/usage` | bearer | Aggregate `UsageReport[]` across credentials |\n\nRequests use `Authorization: Bearer <token>`. The server compares against an in-memory token allow-list; the gateway’s implementation uses a timing-safe comparison.\n\n### Background refresher\n\n`AuthBrokerRefresher` iterates active OAuth credentials at `refreshIntervalMs` cadence and refreshes any within `refreshSkewMs` of expiry. Refreshes are single-flighted per credential id so a slow refresh cannot be retriggered. The refresher distinguishes:\n\n- **definitive failures** (`invalid_grant`, `invalid_token`, `revoked`, unauthorized refresh-token, 401/403 not from a network blip) — credentials are passed to `AuthStorage.disableCredentialById(id, cause)` so the next snapshot pull surfaces a clean delete on the client;\n- **transient failures** (timeout / ECONNREFUSED / fetch failed) — left in place for the next sweep.\n\n## auth-gateway\n\n### CLI\n\n```\nomp auth-gateway serve [--bind=host:port] [--no-auth]\nomp auth-gateway token [--regenerate] [--json]\nomp auth-gateway status [--json]\nomp auth-gateway check [--strict] [--json]\n```\n\n- `serve` requires `OMP_AUTH_BROKER_URL` (or `auth.broker.url` in `config.yml`) — the gateway is itself a broker client. It calls `AuthBrokerClient.fetchSnapshot()`, wraps it in `RemoteAuthCredentialStore`, and constructs an `AuthStorage` that resolves access tokens through the broker. Default bind is `127.0.0.1:4000`. The gateway token is stored at `<config-dir>/auth-gateway.token` (`0600`); `--no-auth` disables the bearer check entirely (loopback-only use).\n- `token` / `status` manage and inspect the gateway bearer token and upstream broker readiness.\n- `check` probes broker-backed credentials through the gateway store. Without `--strict` it uses provider usage probes; `--strict` also exercises each credential against its chat-completion endpoint and can consume a small amount of quota.\n\n### Endpoints\n\n| Method | Path | Auth | Purpose |\n| ------ | ----------------------- | ------ | ------------------------------------------------------------ |\n| `GET` | `/healthz` | none | Liveness + version |\n| `GET` | `/v1/usage` | bearer | Aggregate `UsageReport[]` (proxied through `AuthStorage`) |\n| `GET` | `/v1/models` | bearer | Bundled-model catalog filtered to providers with credentials |\n| `GET` | `/v1/credentials/check` | bearer | Per-credential auth health probe |\n| `POST` | `/v1/chat/completions` | bearer | OpenAI Chat Completions wire format |\n| `POST` | `/v1/messages` | bearer | Anthropic Messages wire format |\n| `POST` | `/v1/responses` | bearer | OpenAI Responses wire format |\n| `POST` | `/v1/pi/stream` | bearer | Native `pi-ai` stream wire format |\n\nThe model id is read from the top-level `model` field for foreign wire formats and from the pi-native request body for `/v1/pi/stream`. The gateway picks the first bundled `Model<Api>` matching that id, parses the inbound wire format into an omp `Context`, resolves the provider credential from broker-backed `AuthStorage`, dispatches through `streamSimple()`, and re-encodes the result to the inbound format (SSE for streamed responses).\n\nThere is no raw provider passthrough path. All supported routes go through `pi-ai` provider logic so credential-specific request shaping, OAuth refresh-on-auth-error, and provider quirks stay centralized.\n\n`idleTimeout` on the underlying `Bun.serve` is set to `255 s` so long thinking-budget calls do not get killed by Bun’s default idle timeout.\n\n## Usage cache: server-side 5-min jitter + client-side 15 s single-flight\n\nTwo layers cache the aggregate provider-usage report. Both are intentional and stacked.\n\n### Server-side cache (broker `AuthStorage`)\n\n`AuthStorage` caches each credential’s `UsageReport` in the broker’s SQLite store at a **5-minute per-credential TTL with ±25 % jitter**. Anthropic and OpenAI rate-limit `/usage` aggressively per source IP, and a synchronized 5-credential fan-out trips 429s every cycle; the jitter decorrelates refresh times within a few cycles. On fetch failure the store keeps the **last-good** report for up to 24 h with a short jittered re-poll window — so a transient upstream blip never blanks out the widget.\n\nConstants: `USAGE_REPORT_TTL_MS = 5 * 60_000`, `USAGE_LAST_GOOD_RETENTION_MS = 24 * 60 * 60_000` (`packages/ai/src/auth-storage.ts`).\n\n### Client-side single-flight (`RemoteAuthCredentialStore`)\n\nWhen the gateway (or any other broker client) calls `fetchUsageReports()` / `getUsageReport(provider, credential)`, `RemoteAuthCredentialStore` coalesces concurrent calls into a single `GET /v1/usage` round-trip and caches the result for **15 s** in memory.\n\n- `USAGE_CACHE_TTL_MS = 15_000` (`packages/ai/src/auth-broker/remote-store.ts`).\n- A single `#usageInflight` promise is shared across all callers; a per-caller `AbortSignal` is **raced** against the shared promise, not threaded into it, so one caller’s abort never cascades into a peer’s in-flight request.\n- On fetch failure the rejected promise is logged and the awaited value is `null` — callers (`AuthStorage.fetchUsageReports`, `#getUsageReport`) treat a `null` report as \"no usage signal for this cycle\" and proceed without it. **This is the 15 s TTL fallback**: the client absorbs transient broker outages by suppressing the error, returning `null` to ranking, and re-attempting after the 15 s window.\n\nThe 15 s client window deliberately sits below the broker’s 5 min server cache, so almost every client poll is served from the broker’s already-cached value; the client cache exists to absorb the parallel fan-out generated by `AuthStorage.#rankOAuthSelections` into a single broker round-trip.\n\n## Client snapshot cache\n\n`discoverAuthStorage()` persists the broker snapshot to `~/.omp/cache/auth-broker-snapshot.enc` after the initial `/v1/snapshot` fetch and after later broker-sourced full snapshots. The file is AES-256-GCM encrypted with `SHA-256(OMP_AUTH_BROKER_TOKEN)` and authenticated with the broker URL as additional data, so changing either the token or URL makes the cache unreadable. The file is written atomically with mode `0600`.\n\nFreshness is anchored to the broker-stamped `snapshot.generatedAt`, not local write time. Default TTL is 1 h (`OMP_AUTH_BROKER_SNAPSHOT_TTL_MS`); `0` disables the cache and restores the old always-fetch boot path. When the cached snapshot is still fresh, `omp` boots from it and skips the blocking `/v1/snapshot` query. `RemoteAuthCredentialStore` still starts its normal SSE / long-poll background sync immediately, so deleted or rotated credentials reconcile after startup, and expired OAuth access tokens still refresh through `POST /v1/credential/:id/refresh`.\n\nIf the broker is down at boot and a fresh cache exists, startup now succeeds from the cached snapshot. If the cache is missing, expired, corrupt, written for a different URL, or encrypted with a different token, startup falls back to the live fetch and fails the same way it did before if the broker is unreachable.\n\n## Operator opt-in\n\nThe broker is **off** unless `OMP_AUTH_BROKER_URL` (or `auth.broker.url` in `config.yml`) is set. When set, `discoverAuthStorage` in `packages/coding-agent/src/sdk.ts` swaps the local SQLite credential store for `RemoteAuthCredentialStore` and every API call resolves credentials through the broker.\n\n### Environment variables\n\n| Variable | Purpose | Required when |\n| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |\n| `OMP_AUTH_BROKER_URL` | Base URL of the remote auth-broker (e.g. `https://broker.tailnet:8765`). Selecting this puts the client in broker mode — local SQLite is bypassed. | Any time the omp client should resolve credentials through a broker (and required by `omp auth-gateway serve`). |\n| `OMP_AUTH_BROKER_TOKEN` | Bearer token used for every broker endpoint except `/v1/healthz`. | When `OMP_AUTH_BROKER_URL` is set and no token is available from `auth.broker.token` or `<config-dir>/auth-broker.token`. |\n| `OMP_AUTH_BROKER_SNAPSHOT_TTL_MS` | Freshness window for the encrypted local snapshot cache. Default `3600000` (1 h); `0` disables cache reads and writes. | Optional in broker mode. |\n| `OMP_AUTH_BROKER_SNAPSHOT_CACHE` | Path override for the encrypted local snapshot cache. Default `~/.omp/cache/auth-broker-snapshot.enc` (or XDG cache equivalent). | Optional in broker mode. |\n\nResolution order in `resolveAuthBrokerConfig()`:\n\n1. `OMP_AUTH_BROKER_URL` env (else `auth.broker.url` from `config.yml`, resolved through `resolveConfigValue`);\n2. `OMP_AUTH_BROKER_TOKEN` env (else `auth.broker.token` from `config.yml`, else `<config-dir>/auth-broker.token`);\n3. URL set but no token resolvable → hard error pointing at the token file path.\n\nThe gateway has no dedicated env vars — it inherits `OMP_AUTH_BROKER_*` because it is itself a broker client.\n\n### `config.yml` keys\n\n| Key | Default | Purpose |\n| ------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| `auth.broker.url` | unset | Same as `OMP_AUTH_BROKER_URL`; env wins. Hidden from the settings UI. Values are resolved as a literal, an environment variable name, or `!<shell command>` to use trimmed stdout. |\n| `auth.broker.token` | unset | Same as `OMP_AUTH_BROKER_TOKEN`; env wins. Values are resolved the same way. |\n\n### Token files\n\n| Path | Owner | Mode |\n| --------------------------------- | ---------------------------------------------------- | ----------------------------- |\n| `<config-dir>/auth-broker.token` | `omp auth-broker serve` (created at first start) | `0600` in a `0700` parent dir |\n| `<config-dir>/auth-gateway.token` | `omp auth-gateway serve` (skipped under `--no-auth`) | `0600` in a `0700` parent dir |\n\n`<config-dir>` resolves to `~/.omp/` (respecting `PI_CONFIG_DIR`).\n\n## Interaction with the local API-key resolution order\n\nThe broker only owns OAuth credentials and provider-API-key credentials that were uploaded to it. The standard credential ladder in `models.md` (`Auth and API key resolution order`) is preserved, with one addition committed alongside the gateway:\n\n- `AuthStorage.setConfigApiKey / removeConfigApiKey / clearConfigApiKeys` let a `models.yml` `apiKey` beat a stored OAuth token **without** overriding an explicit `--api-key`. This is what allows a broker-resolved OAuth credential to be reliably shadowed by a per-environment `models.yml` config key when both are present.\n\n## See also\n\n- [`secrets.md`](./secrets.md) — secret obfuscation around tokens that _do_ leak through (e.g. `OMP_AUTH_BROKER_TOKEN` in shell output).\n- [`models.md`](./models.md) — provider auth resolution order; the broker plugs in at layers 2–3 (stored credentials).\n- [`environment-variables.md`](./environment-variables.md) — full env reference including `OMP_AUTH_BROKER_URL` / `OMP_AUTH_BROKER_TOKEN`.\n",
@@ -34,7 +35,7 @@ export const EMBEDDED_DOCS: Readonly<Record<string, string>> = {
34
35
  "mcp-server-tool-authoring.md": "# MCP server and tool authoring\n\nThis document explains how MCP server definitions become callable `mcp__*` tools in coding-agent, and what operators should expect when configs are invalid, duplicated, disabled, or auth-gated.\n\n## Architecture at a glance\n\n```text\nConfig sources (.omp/.claude/.cursor/.vscode/mcp.json, mcp.json, etc.)\n -> discovery providers normalize to canonical MCPServer\n -> capability loader dedupes by server name (higher provider priority wins)\n -> loadAllMCPConfigs converts to MCPServerConfig + skips enabled:false\n -> MCPManager connects/listTools (with auth/header/env resolution)\n -> manager best-effort loads resources/prompts and subscribes to resource updates when enabled\n -> MCPTool/DeferredMCPTool bridge exposes tools as mcp__<server>_<tool>\n -> AgentSession.refreshMCPTools replaces live MCP tools immediately\n```\n\n## 1) Server config model and validation\n\n`src/mcp/types.ts` defines the authoring shape used by MCP config writers and runtime:\n\n- `stdio` (default when `type` missing): requires `command`, optional `args`, `env`, `cwd`\n- `http`: requires `url`, optional `headers`\n- `sse`: requires `url`, optional `headers` (kept for compatibility)\n- shared fields: `enabled`, `timeout`, `auth`, `oauth`\n\n`validateServerConfig()` (`src/mcp/config.ts`) enforces transport basics:\n\n- rejects configs that set both `command` and `url`\n- requires `command` for stdio\n- requires `url` for http/sse\n- rejects unknown `type`\n\n`config-writer.ts` applies this validation for add/update operations and also validates server names:\n\n- non-empty\n- max 100 chars\n- only `[a-zA-Z0-9_.-]`\n\n### Transport pitfalls\n\n- `type` omitted means stdio. If you intended HTTP/SSE but omitted `type`, `command` becomes mandatory.\n- `sse` is still accepted but treated as HTTP transport internally (`createHttpTransport`).\n- Validation is structural, not reachability: a syntactically valid URL can still fail at connect time.\n\n## 2) Discovery, normalization, and precedence\n\n### Capability-based discovery\n\n`loadAllMCPConfigs()` (`src/mcp/config.ts`) loads canonical `MCPServer` items via `loadCapability(mcpCapability.id)`.\n\nThe capability layer (`src/capability/index.ts`) then:\n\n1. loads providers in priority order\n2. dedupes by `server.name` (first win = highest priority)\n3. validates deduped items\n\nResult: duplicate server names across sources are not merged. One definition wins; lower-priority duplicates are shadowed.\n\n### `.mcp.json` and related files\n\nThe dedicated fallback provider in `src/discovery/mcp-json.ts` reads project-root `mcp.json` and `.mcp.json` (low priority).\n\nIn practice MCP servers also come from higher-priority providers (for example native `.omp/...` and tool-specific config dirs). Authoring guidance:\n\n- Prefer `.omp/mcp.json` (project) or `~/.omp/agent/mcp.json` (user) for explicit control.\n- Use root `mcp.json` / `.mcp.json` when you need fallback compatibility.\n- Reusing the same server name in multiple sources causes precedence shadowing, not merge.\n\n### Normalization behavior\n\n`convertToLegacyConfig()` (`src/mcp/config.ts`) maps canonical `MCPServer` to runtime `MCPServerConfig`.\n\nKey behavior:\n\n- transport inferred as `server.transport ?? (command ? \"stdio\" : url ? \"http\" : \"stdio\")`\n- disabled servers (`enabled === false`) and names in the user `disabledServers` list are dropped before connection\n- optional fields are preserved when present\n\n### Environment expansion during discovery\n\nOMP-native MCP config (`.omp/mcp.json`, `~/.omp/agent/mcp.json`, plus their `.mcp.json` variants) expands `${VAR}` and `${VAR:-default}` placeholders recursively before converting to runtime config. It also accepts boolean/string forms for `enabled` (`true`, `false`, `1`, `0`) and numeric strings for `timeout`.\n\nThe standalone fallback provider in `src/discovery/mcp-json.ts` reads project-root `mcp.json` and `.mcp.json`, expands the same `${...}` placeholders, and type-checks `enabled`/`timeout` without coercing string values.\n\nInvalid `enabled`/`timeout` values are ignored with warnings rather than failing the whole file.\n\n## 3) Auth and runtime value resolution\n\n`MCPManager.prepareConfig()`/`#resolveAuthConfig()` (`src/mcp/manager.ts`) is the final pre-connect pass.\n\n### OAuth credential injection\n\nIf config has:\n\n```ts\nauth: { type: \"oauth\", credentialId: \"...\" }\n```\n\nand credential exists in auth storage:\n\n- `http`/`sse`: injects `Authorization: Bearer <access_token>` header\n- `stdio`: injects `OAUTH_ACCESS_TOKEN` env var\n\nIf credential lookup fails, manager logs a warning and continues with unresolved auth.\n\n### Header/env value resolution\n\nBefore connect, manager resolves stdio `env` values and HTTP/SSE `headers` values via `resolveConfigValue()` (`src/config/resolve-config-value.ts`):\n\n- value starting with `!` => execute shell command, use trimmed stdout (cached)\n- failed, timed-out, or whitespace-only commands produce `undefined`, so that entry is omitted\n- otherwise, treat value as environment variable name first (`process.env[name]`), fallback to literal value\n\nOperational caveat: a mistyped `!` secret command can silently remove that header/env entry, producing downstream 401/403 or server startup failures. A mistyped environment variable name is sent literally unless that literal happens to be meaningful to the server.\n\n## 4) Tool bridge: MCP -> agent-callable tools\n\n`src/mcp/tool-bridge.ts` converts MCP tool definitions into `CustomTool`s.\n\n### Naming and collision domain\n\nTool names are generated as:\n\n```text\nmcp__<sanitized_server_name>_<sanitized_tool_name>\n```\n\nRules:\n\n- lowercases\n- non-`[a-z_]` chars become `_`\n- repeated underscores collapse\n- redundant `<server>_` prefix in tool name is stripped once\n\nThis avoids many collisions, but not all. Different raw names can still sanitize to the same identifier (for example `my-server` and `my.server` both sanitize similarly), and registry insertion is last-write-wins.\n\n### Schema mapping\n\n`tool-bridge.ts` passes each MCP `inputSchema` through `normalizeSchemaForMCP()` before registering it as a `CustomTool` schema.\n\n### Execution mapping\n\n`MCPTool.execute()` / `DeferredMCPTool.execute()`:\n\n- calls MCP `tools/call`\n- flattens MCP content into displayable text\n- returns structured details (`serverName`, `mcpToolName`, provider metadata)\n- maps server-reported `isError` to `Error: ...` text result\n- attempts reconnect + one retry for retriable connection errors\n- maps remaining thrown transport/runtime failures to `MCP error: ...`\n- preserves abort semantics by translating AbortError into `ToolAbortError`\n\n## 5) Operator lifecycle: add/edit/remove and live updates\n\nInteractive mode exposes `/mcp` in `src/modes/controllers/mcp-command-controller.ts`.\n\nSupported operations:\n\n- `add` (wizard or quick-add)\n- `remove` / `rm`\n- `enable` / `disable`\n- `test`\n- `reauth` / `unauth`\n- `reconnect`\n- `reload`\n- `resources`, `prompts`, `notifications`\n- Smithery search/login/logout flows\n\nConfig writes are atomic (`writeMCPConfigFile`: temp file + rename).\n\nAfter changes, controller calls `#reloadMCP()`:\n\n1. `mcpManager.disconnectAll()`\n2. `mcpManager.discoverAndConnect()`\n3. `session.refreshMCPTools(mcpManager.getTools())`\n\n`refreshMCPTools()` replaces all `mcp__` registry entries and immediately re-activates the latest MCP tool set, so changes take effect without restarting the session.\n\n### Mode differences\n\n- **Interactive/TUI mode**: `/mcp` gives in-app UX (wizard, OAuth flow, connection status text, immediate runtime rebinding).\n- **SDK/headless integration**: `discoverAndLoadMCPTools()` (`src/mcp/loader.ts`) returns loaded tools + per-server errors; no `/mcp` command UX.\n\n## 6) User-visible error surfaces\n\nCommon error strings users/operators see:\n\n- add/update validation failures:\n - `Invalid server config: ...`\n - `Server \"<name>\" already exists in <path>`\n- quick-add argument issues:\n - `Use either --url or -- <command...>, not both.`\n - `--token requires --url (HTTP/SSE transport).`\n- connect/test failures:\n - `Failed to connect to \"<name>\": <message>`\n - timeout help text suggests increasing timeout\n - auth help text for `401/403`\n- auth/OAuth flows:\n - `Authentication required ... OAuth endpoints could not be discovered`\n - `OAuth flow timed out. Please try again.`\n - `OAuth authentication failed: ...`\n- disabled server usage:\n - `Server \"<name>\" is disabled. Run /mcp enable <name> first.`\n\nBad source JSON in discovery is generally handled as warnings/logs; config-writer paths throw explicit errors.\n\n## 7) Practical authoring guidance\n\nFor robust MCP authoring in this codebase:\n\n1. Keep server names globally unique across all MCP-capable config sources.\n2. Prefer names that remain distinct after MCP tool-name sanitization to avoid generated `mcp__` collisions.\n3. Use explicit `type` to avoid accidental stdio defaults.\n4. Treat `enabled: false` as hard-off: server is omitted from runtime connect set.\n5. For OAuth configs, store a valid `credentialId`; otherwise auth injection is skipped.\n6. If using command-based secret resolution (`!cmd`), verify command output is stable and non-empty.\n\n## Implementation files\n\n- [`src/mcp/types.ts`](../packages/coding-agent/src/mcp/types.ts)\n- [`src/mcp/config.ts`](../packages/coding-agent/src/mcp/config.ts)\n- [`src/mcp/config-writer.ts`](../packages/coding-agent/src/mcp/config-writer.ts)\n- [`src/mcp/tool-bridge.ts`](../packages/coding-agent/src/mcp/tool-bridge.ts)\n- [`src/discovery/mcp-json.ts`](../packages/coding-agent/src/discovery/mcp-json.ts)\n- [`src/modes/controllers/mcp-command-controller.ts`](../packages/coding-agent/src/modes/controllers/mcp-command-controller.ts)\n- [`src/mcp/manager.ts`](../packages/coding-agent/src/mcp/manager.ts)\n- [`src/capability/index.ts`](../packages/coding-agent/src/capability/index.ts)\n- [`src/config/resolve-config-value.ts`](../packages/coding-agent/src/config/resolve-config-value.ts)\n- [`src/mcp/loader.ts`](../packages/coding-agent/src/mcp/loader.ts)\n",
35
36
  "memory.md": "# Autonomous Memory\n\nWhen the local memory backend is enabled, the agent automatically extracts durable knowledge from past sessions and injects a compact summary into future sessions for the same project. Over time it builds a project-scoped memory store — technical decisions, recurring workflows, pitfalls — that carries forward without manual effort.\n\nDisabled by default. Enable the local summary pipeline via `/settings` or `config.yml`:\n\n```yaml\nmemory:\n backend: local\n```\n\n## Usage\n\n### What gets injected\n\nAt session start, if a memory summary exists for the current project, it is injected into the system prompt as a **Memory Guidance** block. The agent is instructed to:\n\n- Treat memory as heuristic context — useful for process and prior decisions, not authoritative on current repo state.\n- Cite the memory artifact path when memory changes the plan, and pair it with current-repo evidence before acting.\n- Prefer repo state and user instruction when they conflict with memory; treat conflicting memory as stale.\n\n### Reading memory artifacts\n\nThe agent can read memory files directly using `memory://` URLs with the `read` tool:\n\n| URL | Content |\n| -------------------------------------- | ----------------------------------- |\n| `memory://root` | Compact summary injected at startup |\n| `memory://root/MEMORY.md` | Full long-term memory document |\n| `memory://root/skills/<name>/SKILL.md` | A generated skill playbook |\n\n### `/memory` slash command\n\n| Subcommand | Effect |\n| --------------------- | --------------------------------------------------------- |\n| `view` | Show the current backend injection payload |\n| `stats` | Show backend-specific memory statistics, when supported |\n| `diagnose` | Show backend-specific diagnostics, when supported |\n| `clear` / `reset` | Delete active backend memory data/artifacts |\n| `enqueue` / `rebuild` | Force consolidation/retention work for the active backend |\n\n## How it works\n\nLocal summary memories are built by a background pipeline that runs at startup; `/memory enqueue` marks consolidation work that the next startup picks up. The pipeline is skipped for subagents and for sessions that are not persisted to a session file.\n\n**Phase 1 — per-session extraction:** For each past session that has changed since it was last processed, a model reads the session history and extracts durable signal: technical decisions, constraints, resolved failures, recurring workflows. Sessions that are too recent, too old, currently active, or beyond the configured scan/age limits are skipped. Each extraction produces a raw memory block and a short synopsis for that session.\n\n**Phase 2 — consolidation:** After extraction, a second model pass reads all per-session extractions and produces three outputs written to disk:\n\n- `MEMORY.md` — a curated long-term memory document\n- `memory_summary.md` — the compact text injected at session start\n- `skills/` — reusable procedural playbooks, each in its own subdirectory\n\nPhase 2 uses a lease and heartbeat to prevent double-running when multiple processes start simultaneously. Stale skill directories from prior runs are pruned automatically.\n\nConsolidated output is redacted for common secret/token patterns before `MEMORY.md`, `memory_summary.md`, or generated skills are written to disk.\n\n### Extraction behavior\n\nMemory extraction and consolidation behavior is driven by static prompt files in `packages/coding-agent/src/prompts/memories/`.\n\n| File | Purpose | Variables |\n| ------------------------ | -------------------------------------------- | ------------------------------------------- |\n| `stage_one_system.md` | System prompt for per-session extraction | — |\n| `stage_one_input.md` | User-turn template wrapping session content | `{{thread_id}}`, `{{response_items_json}}` |\n| `consolidation_system.md`| System prompt for cross-session consolidation | — |\n| `consolidation.md` | User-turn prompt for cross-session consolidation | `{{raw_memories}}`, `{{rollout_summaries}}` |\n| `read-path.md` | Memory guidance injected into live sessions | `{{memory_summary}}` |\n\n### Model selection\n\nMemory piggybacks on the model role system.\n\n| Phase | Role | Purpose |\n| ----------------------- | ------------------------------------------------------------------- | -------------------------------- |\n| Phase 1 (extraction) | `default` | Per-session knowledge extraction |\n| Phase 2 (consolidation) | `smol` (falls back to `default`, then current/first registry model) | Cross-session synthesis |\n\nIf the requested memory role is not configured, memory model resolution falls back to the `default` role, then the active session model, then the first model in the registry.\n\n## Configuration\n\n| Setting | Default | Description |\n| ------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------- |\n| `memory.backend` | `off` | Select `local` for this pipeline; legacy `memories.enabled: true` is migrated to `memory.backend: local` when no explicit backend is set |\n| `memories.maxRolloutAgeDays` | `30` | Sessions older than this are not processed |\n| `memories.minRolloutIdleHours` | `12` | Sessions active more recently than this are skipped |\n| `memories.maxRolloutsPerStartup` | `64` | Cap on sessions processed in a single startup |\n| `memories.summaryInjectionTokenLimit` | `5000` | Max tokens of the summary injected into the system prompt |\n\nAdditional tuning knobs (concurrency, lease durations, token budgets) are available in config for advanced use.\n\n## Key files\n\n- `packages/coding-agent/src/memories/index.ts` — pipeline orchestration, injection, clear/enqueue entry points (the `/memory` command routes here via `packages/coding-agent/src/memory-backend/local-backend.ts`)\n- `packages/coding-agent/src/memories/storage.ts` — SQLite-backed job queue and thread registry\n- `packages/coding-agent/src/prompts/memories/` — memory prompt templates\n- `packages/coding-agent/src/internal-urls/memory-protocol.ts` — `memory://` URL handler\n",
36
37
  "mnemosyne-memory-backend.md": "# Mnemopi memory backend\n\nOh My Pi can use `@oh-my-pi/pi-mnemopi` as a local long-term memory backend.\n\nSet:\n\n```yaml\nmemory:\n backend: mnemopi\n```\n\nExample:\n\n```yaml\nmemory:\n backend: mnemopi\nmnemopi:\n scoping: per-project-tagged\n```\n\nWith this backend enabled, the coding agent:\n\n1. Opens one or more local Mnemopi SQLite databases according to the configured bank scoping.\n2. Recalls relevant memories into a `<memories>` block for the first model turn of a session and refreshes the base prompt if recall happens from the `agent_start` listener.\n3. Retains completed conversation turns into the retain bank after agent turns, no more often than `mnemopi.retainEveryNTurns`.\n4. Adds recalled memory as extra compaction context when compaction asks the memory backend for `preCompactionContext`.\n5. Uses the normal `/memory view`, `/memory stats`, `/memory diagnose`, `/memory clear`, and `/memory enqueue` commands through the shared memory backend interface.\n\nRecalled memory is background context, not instructions. Current user messages and tool output take precedence when they conflict.\n\n## Settings\n\n| Setting | Default | Description |\n| ------------------------------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |\n| `memory.backend` | `off` | Set to `mnemopi` to enable this backend. |\n| `mnemopi.dbPath` | agent memories dir | Optional SQLite database path. |\n| `mnemopi.bank` | unset | Optional shared bank base name passed to `Mnemopi`; the coding-agent wrapper scopes from this base according to `mnemopi.scoping`. Unset → shared bank `default`; per-project modes derive a project bank from the project root name plus a stable hash. |\n| `mnemopi.scoping` | `per-project` | Memory visibility mode: `global` = one shared bank, `per-project` = isolated project memory, `per-project-tagged` = project-local writes plus global recall visibility. |\n| `mnemopi.autoRecall` | `true` | Recall memory on the first turn of a session. |\n| `mnemopi.autoRetain` | `true` | Retain completed turns automatically. |\n| `mnemopi.polyphonicRecall` | `false` | Enable 4-voice polyphonic recall (vector, graph, fact, temporal) with reciprocal rank fusion; `MNEMOPI_POLYPHONIC_RECALL` overrides when set. |\n| `mnemopi.enhancedRecall` | `false` | Enable the tiered query result cache for repeated/similar recall queries; `MNEMOPI_ENHANCED_RECALL` overrides when set. |\n| `mnemopi.retainEveryNTurns` | `4` | Minimum user turns between automatic retain writes. |\n| `mnemopi.recallLimit` | `8` | Maximum recalled memories in the prompt block. |\n| `mnemopi.recallContextTurns` | `3` | Prior user-bounded turns included in recall queries. |\n| `mnemopi.recallMaxQueryChars` | `4000` | Maximum composed recall query length. |\n| `mnemopi.injectionTokenLimit` | `5000` | Approximate token budget for memory prompt injection. |\n| `mnemopi.debug` | `false` | Enable debug logging for backend failures. |\n| `mnemopi.noEmbeddings` | `false` | Pass `noEmbeddings` to `Mnemopi` and force FTS-only recall. |\n| `mnemopi.embeddingModel` | env/default | Embedding model passed to `Mnemopi`. |\n| `mnemopi.embeddingApiUrl` | env/default | OpenAI-compatible embedding endpoint passed to `Mnemopi`. |\n| `mnemopi.embeddingApiKey` | env/default | Embedding API key passed to `Mnemopi`. |\n| `mnemopi.llmMode` | `smol` | `smol` uses the configured pi-ai smol model, `remote` uses the settings below, and `none` disables LLM calls. |\n| `mnemopi.llmBaseUrl` | env/default | OpenAI-compatible LLM endpoint for `llmMode: remote`. |\n| `mnemopi.llmApiKey` | env/default | LLM API key for `llmMode: remote`. |\n| `mnemopi.llmModel` | env/default | LLM model id for `llmMode: remote`. |\n\n## Scoping\n\nThe coding-agent wrapper applies scoping on top of the underlying `Mnemopi` package:\n\n- `global` uses one shared bank for recall and writes.\n- `per-project` writes to and recalls from a bank derived from the current git repository root (or cwd) plus a stable hash.\n- `per-project-tagged` writes to the project-local bank and recalls from both the project-local bank and the shared global bank, with duplicate recall results merged.\n\nThe combined project-plus-global behavior lives in the wrapper. The `@oh-my-pi/pi-mnemopi` package itself still exposes banks and constructor options directly, including `bank` for selecting a bank name. Project-local banks other than the shared bank are stored as sibling bank databases managed by Mnemopi's `BankManager`.\n\n## LLM and embeddings\n\nThe backend passes these settings to the `Mnemopi` constructor; if a setting is omitted, Mnemopi falls back to its `MNEMOPI_*` environment defaults. The backend does not download or run a local GGUF LLM. LLM-dependent paths use a configured pi-ai model, an opt-in local on-device memory model (`providers.memoryModel`, ONNX — overrides `smol`/`remote` when set to a local model), a dynamic completion function, a remote OpenAI-compatible endpoint, or deterministic no-LLM fallbacks.\n\nFTS-only:\n\n```yaml\nmemory:\n backend: mnemopi\nmnemopi:\n noEmbeddings: true\n```\n\nEquivalent constructor shape:\n\n```ts\nnew Mnemopi({ noEmbeddings: true });\n```\n\nRemote embeddings:\n\n```yaml\nmnemopi:\n embeddingModel: text-embedding-3-small\n embeddingApiUrl: https://api.openai.com/v1\n embeddingApiKey: ${OPENAI_API_KEY}\n```\n\nEquivalent constructor shape:\n\n```ts\nnew Mnemopi({\n embeddingModel: \"text-embedding-3-small\",\n embeddingApiUrl: \"https://api.openai.com/v1\",\n embeddingApiKey,\n});\n```\n\nRemote LLM:\n\n```yaml\nmnemopi:\n llmMode: remote\n llmBaseUrl: https://api.openai.com/v1\n llmApiKey: ${OPENAI_API_KEY}\n llmModel: gpt-4.1-mini\n```\n\nEquivalent constructor shapes:\n\n```ts\nnew Mnemopi({ llm: { baseUrl, apiKey, model } });\nnew Mnemopi({ llmBaseUrl: baseUrl, llmApiKey: apiKey, llmModel: model });\n```\n\nDynamic function LLM for rotating OAuth tokens:\n\n```ts\nnew Mnemopi({\n llm: async (prompt, opts) => {\n const token = await getFreshOauthToken();\n return await completeWithPiAi(prompt, {\n token,\n maxTokens: opts?.maxTokens,\n temperature: opts?.temperature,\n });\n },\n});\n```\n\npi-ai smol model LLM:\n\n```yaml\nmnemopi:\n llmMode: smol\n```\n\nThe coding agent resolves its configured smol role and passes a dynamic completion function so every Mnemopi LLM call can fetch the current provider credentials at call time:\n\n```ts\nnew Mnemopi({\n llm: async (prompt, opts) => completeSmolWithCurrentAuth(prompt, opts),\n});\n```\n\n## Operational notes\n\n- The default shared database lives under the agent memories directory in `mnemopi/mnemopi.db`; project-scoped banks use sibling database paths under that Mnemopi directory.\n- `/memory clear` removes every scoped Mnemopi SQLite database and sidecar WAL/SHM files for the active configuration.\n- `/memory enqueue` forces retention of the current session, flushes pending fact extractions, and runs Mnemopi sleep/consolidation.\n- `/memory stats` and `/memory diagnose` render backend-specific bank statistics/diagnostics when the Mnemopi backend is active.\n- Subagents do not own separate Mnemopi retain loops; they alias the parent state when a parent Mnemopi state exists, and otherwise remain inert.\n",
37
- "models.md": "# Model and Provider Configuration (`models.yml`)\n\nThis document describes how the coding-agent currently loads models, applies overrides, resolves credentials, and chooses models at runtime.\n\n## What controls model behavior\n\nPrimary implementation files:\n\n- `src/config/model-registry.ts` — loads built-in + custom models, provider overrides, runtime discovery, auth integration\n- `src/config/model-resolver.ts` — parses model patterns and selects initial/smol/slow models\n- `src/config/settings-schema.ts` — model-related settings (`modelRoles`, provider transport preferences)\n- `src/session/auth-storage.ts` — API key + OAuth resolution order\n- `packages/catalog/src/models.ts` and `packages/catalog/src/types.ts` — built-in providers/models (`getBundledModels` / `getBundledProviders`) and `Model`/`compat` types\n\n## Config file location and legacy behavior\n\nDefault config path:\n\n- `~/.omp/agent/models.yml`\n\nLegacy behavior still present:\n\n- If `models.yml` is missing and `models.json` exists at the same location, it is migrated to `models.yml`.\n- Explicit `.json` / `.jsonc` config paths are still supported when passed programmatically to `ModelRegistry`.\n\n## `models.yml` shape\n\n```yaml\nproviders:\n <provider-id>:\n # provider-level config\nequivalence:\n overrides:\n <provider-id>/<model-id>: <canonical-model-id>\n exclude:\n - <provider-id>/<model-id>\n```\n\n`provider-id` is the canonical provider key used across selection and auth lookup.\n\n`equivalence` is optional and configures canonical model grouping on top of concrete provider models:\n\n- `overrides` maps an exact concrete selector (`provider/modelId`) to an official upstream canonical id\n- `exclude` opts a concrete selector out of canonical grouping\n\n## Provider-level fields\n\n```yaml\nproviders:\n my-provider:\n baseUrl: https://api.example.com/v1\n apiKey: MY_PROVIDER_API_KEY\n api: openai-completions\n headers:\n X-Team: platform\n authHeader: true\n auth: apiKey\n disableStrictTools: false # set true for Anthropic-compatible endpoints that reject the strict field\n discovery:\n type: ollama\n modelOverrides:\n some-model-id:\n name: Renamed model\n models:\n - id: some-model-id\n name: Some Model\n api: openai-completions\n reasoning: false\n input: [text]\n cost:\n input: 0\n output: 0\n cacheRead: 0\n cacheWrite: 0\n contextWindow: 128000\n maxTokens: 16384\n headers:\n X-Model: value\n compat:\n supportsStore: true\n supportsDeveloperRole: true\n supportsReasoningEffort: true\n maxTokensField: max_completion_tokens\n openRouterRouting:\n only: [anthropic]\n vercelGatewayRouting:\n order: [anthropic, openai]\n extraBody:\n gateway: m1-01\n controller: mlx\n```\n\n### Allowed provider/model `api` values\n\n- `openai-completions`\n- `openai-responses`\n- `openai-codex-responses`\n- `azure-openai-responses`\n- `anthropic-messages`\n- `google-generative-ai`\n- `google-vertex`\n\n### Allowed auth/discovery values\n\n- `auth`: `apiKey` (default), `none`, or `oauth`; for `models.yml` custom models, `oauth` is accepted by schema but does not waive the `apiKey` requirement\n- `discovery.type`: `ollama`, `llama.cpp`, `lm-studio`, `openai-models-list`, or `proxy`\n- `transport`: `pi-native` only. When set, every model under that provider is sent to an `omp auth-gateway` compatible `baseUrl` via `POST /v1/pi/stream`; `apiKey` is the gateway bearer.\n\n## Validation rules (current)\n\n### Full custom provider (`models` is non-empty)\n\nRequired:\n\n- `baseUrl`\n- `apiKey` unless `auth: none`\n- `api` at provider level or each model\n\n### Override-only provider (`models` missing or empty)\n\nMust define at least one of:\n\n- `baseUrl`\n- `apiKey`\n- `headers`\n- `compat`\n- `disableStrictTools`\n- `modelOverrides`\n- `discovery`\n\n### Discovery\n\n- `discovery` requires provider-level `api`, except `discovery.type: proxy` (per-model wire auto-detected).\n\n### Model value checks\n\n- `id` required\n- `contextWindow` and `maxTokens` must be positive if provided\n\n### Command-resolved secrets\n\nProvider `apiKey` values and provider/model `headers` values may start with `!` to read a secret from command stdout. The command is run with a 10 s timeout, stdout is trimmed, and empty/failing commands are omitted:\n\n```yaml\nproviders:\n openai:\n apiKey: \"!op read op://dev/openai/api-key\"\n headers:\n X-Team-Key: \"!bw get password omp-team-key\"\n```\n\nSuccessful command outputs are cached for the process lifetime so the command is not re-run for every model.\n\n## Merge and override order\n\nModelRegistry pipeline (on refresh):\n\n1. Load built-in providers/models from `@oh-my-pi/pi-ai`.\n2. Load `models.yml` custom config.\n3. Apply provider overrides (`baseUrl`, `headers`, `disableStrictTools`) to built-in models.\n4. Apply `modelOverrides` (per provider + model id).\n5. Merge custom `models`:\n - same `provider + id` replaces existing\n - otherwise append\n6. Load cached/runtime-discovered models (Ollama, llama.cpp, LM Studio, plus built-in provider managers), then re-apply model overrides.\n\n### Provider-model cache and static fingerprint\n\nCached per-provider model lists are persisted in the model-cache SQLite\ndatabase (current schema version 5) with a `static_fingerprint` column that\nhashes the static catalog slice merged into the row. When `resolveProviderModels`\nskips the network fetch and the fingerprint of the in-memory static\ncatalog matches the cached one, the cached rows are returned verbatim —\nthe static + dynamic merge is bypassed entirely. The fingerprint is\nmemoized per process by tagging the static-models array with a symbol\nproperty, so repeated cold-start calls do not re-hash.\n\n## Canonical model equivalence and coalescing\n\nThe registry keeps every concrete provider model and then builds a canonical layer above them.\n\nCanonical ids are official upstream ids only, for example:\n\n- `claude-opus-4-6`\n- `claude-haiku-4-5`\n- `gpt-5.3-codex`\n\n### `models.yml` equivalence config\n\nExample:\n\n```yaml\nproviders:\n zenmux:\n baseUrl: https://api.zenmux.example/v1\n apiKey: ZENMUX_API_KEY\n api: openai-codex-responses\n models:\n - id: codex\n name: Zenmux Codex\n reasoning: true\n input: [text]\n cost:\n input: 0\n output: 0\n cacheRead: 0\n cacheWrite: 0\n contextWindow: 200000\n maxTokens: 32768\n\nequivalence:\n overrides:\n zenmux/codex: gpt-5.3-codex\n p-codex/codex: gpt-5.3-codex\n exclude:\n - demo/codex-preview\n```\n\nBuild order for canonical grouping:\n\n1. exact user override from `equivalence.overrides`\n2. bundled official-id matches from built-in model metadata\n3. conservative heuristic normalization for gateway/provider variants\n4. fallback to the concrete model's own id\n\nCurrent heuristics are intentionally narrow:\n\n- embedded upstream prefixes can be stripped when present, for example `anthropic/...` or `openai/...`\n- dotted and dashed version variants can normalize only when they map to an existing official id, for example `4.6 -> 4-6`\n- ambiguous families or versions are not merged without a bundled match or explicit override\n\n### Canonical resolution behavior\n\nWhen multiple concrete variants share a canonical id, resolution uses:\n\n1. availability and auth\n2. `config.yml` `modelProviderOrder`\n3. existing registry/provider order if `modelProviderOrder` is unset\n\nDisabled or unauthenticated providers are skipped.\n\nSession state and transcripts continue to record the concrete provider/model that actually executed the turn.\n\nProvider defaults vs per-model overrides:\n\n- Provider `headers` are baseline.\n- Model `headers` override provider header keys.\n- `modelOverrides` can override model metadata (`name`, `reasoning`, `thinking`, `input`, `cost`, `premiumMultiplier`, `contextWindow`, `maxTokens`, `headers`, `compat`, `contextPromotionTarget`).\n- `compat` is deep-merged for nested routing blocks (`openRouterRouting`, `vercelGatewayRouting`, `extraBody`).\n\n## Runtime discovery integration\n\n### Implicit Ollama discovery\n\nIf `ollama` is not explicitly configured, registry adds an implicit discoverable provider:\n\n- provider: `ollama`\n- api: `openai-responses`\n- base URL: `OLLAMA_BASE_URL`, or `OLLAMA_HOST`, or `http://127.0.0.1:11434`\n- context window: `OLLAMA_CONTEXT_LENGTH` if set, otherwise Ollama `/api/show` metadata, otherwise `128000`\n- auth mode: keyless (`auth: none` behavior)\n\nRuntime discovery calls Ollama endpoints and normalizes discovered OpenAI-compatible models to `openai-responses`.\n\n`OLLAMA_CONTEXT_LENGTH` does not configure Ollama's runtime `num_ctx`; set that in Ollama/model configuration separately.\n\n### Implicit llama.cpp discovery\n\nIf `llama.cpp` is not explicitly configured, registry adds an implicit discoverable provider:\n\n- provider: `llama.cpp`\n- api: `openai-responses`\n- base URL: `LLAMA_CPP_BASE_URL` or `http://127.0.0.1:8080`\n- auth mode: keyless (`auth: none` behavior)\n\nRuntime discovery calls llama.cpp model endpoints and synthesizes model entries with local defaults.\n\n### Implicit LM Studio discovery\n\nIf `lm-studio` is not explicitly configured, registry adds an implicit discoverable provider:\n\n- provider: `lm-studio`\n- api: `openai-completions`\n- base URL: `LM_STUDIO_BASE_URL` or `http://127.0.0.1:1234/v1`\n- auth mode: keyless (`auth: none` behavior)\n\nRuntime discovery fetches models (`GET /models`) and synthesizes model entries with local defaults.\n\nThis path also works for local OpenAI-compatible servers that are not LM Studio. For example, if oMLX is bound to Ollama's usual port, set `LM_STUDIO_BASE_URL=http://127.0.0.1:11434/v1` to discover it through the existing `/v1/models` flow. Running oMLX and Ollama side by side requires assigning a different port to one of them. Do not configure oMLX as `ollama`: Ollama discovery uses native `/api/tags` and `/api/show` endpoints, not OpenAI `/v1/models`.\n\n### Explicit provider discovery\n\nYou can configure discovery yourself:\n\n```yaml\nproviders:\n ollama:\n baseUrl: http://127.0.0.1:11434\n api: openai-responses\n auth: none\n discovery:\n type: ollama\n\n llama.cpp:\n baseUrl: http://127.0.0.1:8080\n api: openai-responses\n auth: none\n discovery:\n type: llama.cpp\n```\n\n### Proxy discovery (`discovery.type: proxy`)\n\nFor Anthropic+OpenAI-compatible proxies (new-api / one-api / similar)\nthat expose both `/v1/messages` and `/v1/chat/completions` behind the same\nhost. Discovery hits `GET /v1/models` (10s timeout, OpenAI-style payload) and\nderives each model's `api` from the entry's `supported_endpoint_types`:\n\n- contains `\"anthropic\"` -> `api: anthropic-messages` (routes via `/v1/messages`)\n- contains `\"openai\"` -> `api: openai-completions` (routes via `/v1/chat/completions`)\n- otherwise -> falls back to provider-level `api` if set, else dropped\n\nProvider-level `api` is **optional** with `discovery.type: proxy` because the\nper-model wire is auto-detected. The Anthropic SDK strips a trailing `/v1`\nfrom `baseUrl` before appending `/v1/messages`, so a single discovery `baseUrl`\n(ending in `/v1`) round-trips correctly to both wires.\n\n```yaml\nproviders:\n newapi-reseller:\n baseUrl: https://api.example.com/v1\n apiKey: xxxx\n authHeader: true # injects Authorization: Bearer for openai models\n disableStrictTools: true # most anthropic-fronted proxies reject `strict`\n discovery:\n type: proxy\n```\n\n### Extension provider registration\n\nExtensions can register providers at runtime (`pi.registerProvider(...)`), including:\n\n- model replacement/append for a provider\n- custom stream handler registration for new API IDs\n- custom OAuth provider registration\n\n## Auth and API key resolution order\n\nWhen requesting a key for a provider, effective order is:\n\n1. Runtime override (CLI `--api-key`)\n2. Stored API key credential in `agent.db`\n3. Stored OAuth credential in `agent.db` (with refresh)\n4. Environment variable mapping (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc.)\n5. ModelRegistry fallback resolver (provider `apiKey` from `models.yml`, env-name-or-literal semantics)\n\n`models.yml` `apiKey` behavior:\n\n- Value is first treated as an environment variable name.\n- If no env var exists, the literal string is used as the token.\n\nIf `authHeader: true` and provider `apiKey` is set, models get:\n\n- `Authorization: Bearer <resolved-key>` header injected.\n\nKeyless providers:\n\n- Providers marked `auth: none` are treated as available without credentials.\n- `getApiKey*` returns `kNoAuth` for them.\n\n### Broker mode\n\nWhen `OMP_AUTH_BROKER_URL` (or `auth.broker.url`) is set, the local SQLite credential store is replaced by `RemoteAuthCredentialStore`. Layers 2 and 3 above (stored API key / OAuth in `agent.db`) are served from a broker-supplied snapshot whose `refresh` tokens are redacted; expiry triggers `POST /v1/credential/:id/refresh` on the broker rather than a local refresh.\n\n`AuthStorage.setConfigApiKey` lets a `models.yml` `apiKey` win over a broker-resolved OAuth token without overriding a runtime `--api-key`. See [`auth-broker-gateway.md`](./auth-broker-gateway.md) for the full broker / gateway design and env surface (`OMP_AUTH_BROKER_URL`, `OMP_AUTH_BROKER_TOKEN`, `auth.broker.url`, `auth.broker.token`).\n\n## Model availability vs all models\n\n- `getAll()` returns the loaded model registry (built-in + merged custom + discovered).\n- `getAvailable()` filters to models that are keyless or have resolvable auth.\n\nSo a model can exist in registry but not be selectable until auth is available.\n\n## Runtime model resolution\n\n### CLI and pattern parsing\n\n`model-resolver.ts` supports:\n\n- exact `provider/modelId`\n- exact canonical model id\n- exact model id (provider inferred)\n- fuzzy/substring matching\n- glob scope patterns in `--models` (e.g. `openai/*`, `*sonnet*`)\n- optional `:thinkingLevel` suffix (`off|minimal|low|medium|high|xhigh`)\n\n`--provider` is legacy; `--model` is preferred.\n\nResolution precedence for exact selectors:\n\n1. exact `provider/modelId` bypasses coalescing\n2. exact canonical id resolves through the canonical index\n3. exact bare concrete id still works\n4. fuzzy and glob matching run after the exact paths\n\n### Initial model selection priority\n\n`findInitialModel(...)` uses this order:\n\n1. explicit CLI provider+model\n2. first scoped model (if not resuming)\n3. saved default provider/model\n4. known provider defaults (e.g. OpenAI/Anthropic/etc.) among available models\n5. first available model\n\n### Role aliases and settings\n\nSupported model roles:\n\n- `default`, `smol`, `slow`, `vision`, `plan`, `designer`, `commit`, `title`, `task`\n\nRole aliases like `pi/smol` expand through `settings.modelRoles`. Each role value can also append a thinking selector such as `:minimal`, `:low`, `:medium`, or `:high`.\n\nIf a role points at another role, the target model still inherits normally and any explicit suffix on the referring role wins for that role-specific use.\n\nRelated settings:\n\n- `modelRoles` (record)\n- `enabledModels` (scoped pattern list)\n- `modelProviderOrder` (global canonical-provider precedence)\n- `providers.kimiApiFormat` (`openai` or `anthropic` request format)\n- `providers.openaiWebsockets` (`auto|off|on` websocket preference for OpenAI Codex transport)\n\n`modelRoles` may store either:\n\n- `provider/modelId` to pin a concrete provider variant\n- a canonical id such as `gpt-5.3-codex` to allow provider coalescing\n\nFor `enabledModels` and CLI `--models`:\n\n- exact canonical ids expand to all concrete variants in that canonical group\n- explicit `provider/modelId` entries stay exact\n- globs and fuzzy matches still operate on concrete models\n\nGlobal `enabledModels` and `disabledProviders` entries may also be scoped to a path prefix:\n\n```yaml\nenabledModels:\n - claude-sonnet-4-5\n - path: ~/work\n models:\n - anthropic/claude-opus-4-5\ndisabledProviders:\n - ollama\n - path: ~/private\n providers:\n - anthropic\n```\n\nString entries apply everywhere. Scoped entries apply when the current working directory is the configured path or one of its subdirectories. Use `path`, `paths`, `pathPrefix`, or `pathPrefixes`; use `models` for `enabledModels`, `providers` for `disabledProviders`, or `values` for either.\n\n## `/model` and `omp models`\n\nBoth surfaces keep provider-prefixed models visible and selectable.\n\nThey now also expose canonical/coalesced models:\n\n- `/model` includes a canonical view alongside provider tabs\n- `omp models` prints provider-grouped tables of every concrete model, and `omp models canonical` prints the coalesced canonical view\n\nSelecting a canonical entry stores the canonical selector. Selecting a provider row stores the explicit `provider/modelId`.\n\n## Context promotion (model-level fallback chains)\n\nContext promotion is an overflow recovery mechanism for small-context variants (for example `*-spark`) that automatically promotes to a larger-context sibling when the API rejects a request with a context length error.\n\n### Trigger and order\n\nWhen a turn fails with a context overflow error (e.g. `context_length_exceeded`), `AgentSession` attempts promotion **before** falling back to compaction:\n\n1. If `contextPromotion.enabled` is true, resolve a promotion target (see below).\n2. If a target is found, switch to it and retry the request — no compaction needed.\n3. If no target is available, fall through to auto-compaction on the current model.\n\n### Target selection\n\nSelection is model-driven, not role-driven:\n\n1. `currentModel.contextPromotionTarget` (if configured)\n2. smallest larger-context model on the same provider + API\n\nCandidates are ignored unless credentials resolve (`ModelRegistry.getApiKey(...)`).\n\n### OpenAI Codex websocket handoff\n\nIf switching from/to `openai-codex-responses`, session provider state key `openai-codex-responses` is closed before model switch. This drops websocket transport state so the next turn starts clean on the promoted model.\n\n### Persistence behavior\n\nPromotion uses temporary switching (`setModelTemporary`):\n\n- recorded as a temporary `model_change` in session history\n- does not rewrite saved role mapping\n\n### Configuring explicit fallback chains\n\nConfigure fallback directly in model metadata via `contextPromotionTarget`.\n\n`contextPromotionTarget` accepts either:\n\n- `provider/model-id` (explicit)\n- `model-id` (resolved within current provider)\n\nExample (`models.yml`) for an explicit OpenAI fallback:\n\n```yaml\nproviders:\n openai-codex:\n modelOverrides:\n gpt-5.5:\n contextPromotionTarget: openai-codex/gpt-5.4\n```\n\nThe built-in model policy currently links OpenAI `codex-spark` variants to `gpt-5.5`, and `gpt-5.5` to `gpt-5.4`, when that target exists on the same provider/API.\n\n## Compatibility and routing fields\n\nThe `compat` block on a provider or model overrides the URL-based auto-detection in `packages/catalog/src/compat/openai.ts` (`buildOpenAICompat`). It is validated by `OpenAICompatSchema` in `packages/coding-agent/src/config/models-config-schema.ts` and consumed by every `openai-completions` transport (`packages/ai/src/providers/openai-completions.ts`). The canonical type is `OpenAICompat` in `packages/catalog/src/types.ts`.\n\n`models.yml` accepts the following keys (all optional; unset falls back to URL detection):\n\nRequest shaping:\n\n- `supportsStore` — emit `store: false` on requests. Default: auto (off for non-standard endpoints).\n- `supportsDeveloperRole` — use the `developer` system role for reasoning models instead of `system`. Default: auto.\n- `supportsMultipleSystemMessages` — preserve separate leading system/developer messages instead of coalescing them. Default: auto (known OpenAI-compatible hosted APIs preserve; strict-template/local hosts coalesce).\n- `supportsUsageInStreaming` — send `stream_options: { include_usage: true }` to receive token usage on streaming responses. Default: `true`.\n- `maxTokensField` — `\"max_completion_tokens\"` or `\"max_tokens\"`. Default: auto.\n- `supportsToolChoice` — emit the `tool_choice` parameter when the caller forces a specific tool. Default: `true`. Set `false` for endpoints that 400 on `tool_choice` (e.g. DeepSeek when reasoning is on).\n- `disableReasoningOnForcedToolChoice` — drop `reasoning_effort` / OpenRouter `reasoning` whenever `tool_choice` forces a call. Default: auto (Kimi/Anthropic-fronted endpoints).\n- `disableReasoningOnToolChoice` — drop reasoning fields whenever any `tool_choice` is sent. Default: auto (DeepSeek reasoning models).\n- `alwaysSendMaxTokens` — always send a max-token field when the caller did not provide one. Default: auto (Kimi-family models derive TPM limits from `max_tokens`).\n- `strictResponsesPairing` — Responses-API tool-call/result history must be strictly paired. Default: auto (Azure OpenAI, GitHub Copilot).\n- `streamIdleTimeoutMs` — stream-watchdog idle-timeout floor in ms for slow reasoning hosts. Default: auto (GLM coding-plan hosts, direct DeepSeek reasoning).\n- `cacheControlFormat` — `\"anthropic\"` to include Anthropic-style prompt-cache markers in chat-completions payloads. Default: auto (OpenRouter `anthropic/*` models).\n- `supportsLongPromptCacheRetention` — host honors `prompt_cache_retention: \"24h\"` on the Responses API. Default: auto (api.openai.com).\n- `extraBody` — extra top-level fields merged into every request body (gateway hints, controller selectors, etc.).\n\nReasoning / thinking:\n\n- `supportsReasoningEffort` — accept `reasoning_effort`. Default: auto (off for Grok, Z.ai/Zhipu, and Xiaomi MiMo).\n- `supportsReasoningParams` — whether request shaping may send reasoning params at all. Default: auto (off for GitHub Copilot chat-completions).\n- `reasoningEffortMap` — partial map from internal effort levels (`minimal|low|medium|high|xhigh`) to provider-specific strings (e.g. DeepSeek maps `xhigh -> \"max\"`).\n- `thinkingFormat` — request shape for thinking: `\"openai\"` (`reasoning_effort`), `\"openrouter\"` (`reasoning: { effort }`), `\"zai\"` (`thinking: { type: \"enabled\" }`), `\"qwen\"` (top-level `enable_thinking`), or `\"qwen-chat-template\"` (`chat_template_kwargs.enable_thinking`). Default: `\"openai\"`.\n- `reasoningContentField` — assistant field carrying chain-of-thought: `\"reasoning_content\"`, `\"reasoning\"`, or `\"reasoning_text\"`. Default: auto.\n- `requiresReasoningContentForToolCalls` — assistant tool-call turns must round-trip the reasoning field (DeepSeek-R1, Kimi, OpenRouter when reasoning is on). Default: `false`.\n- `allowsSyntheticReasoningContentForToolCalls` — allow a placeholder reasoning field when a prior assistant tool-call turn lacks provider reasoning content. Default: `true`; set `false` for providers that validate the exact reasoning value.\n- `requiresAssistantContentForToolCalls` — assistant tool-call turns must include non-empty text content (Kimi). Default: `false`.\n- `whenThinking` — partial compat overrides applied only when a request actually engages thinking mode (deep-merged over the baseline compat).\n\nTool / message normalization:\n\n- `requiresToolResultName` — tool-result messages need a `name` field (Mistral). Default: auto.\n- `requiresAssistantAfterToolResult` — a user message after a tool result needs an assistant turn in between. Default: auto.\n- `requiresThinkingAsText` — convert thinking blocks to text wrapped in `<thinking>` delimiters (Mistral). Default: auto.\n- `requiresMistralToolIds` — normalize tool-call ids to exactly 9 alphanumeric chars. Default: auto.\n- `supportsStrictMode` — accept the per-tool `strict` field on tool schemas. Default: conservative auto-detect per provider/baseUrl.\n- `toolStrictMode` — `\"all_strict\"` forces strict on every tool, `\"none\"` forces it off; unset keeps the existing per-tool mixed behavior.\n\nGateway routing (only applied when `baseUrl` matches the gateway):\n\n- `openRouterRouting.only` / `openRouterRouting.order` — provider routing on `openrouter.ai` (see <https://openrouter.ai/docs/provider-routing>).\n- `vercelGatewayRouting.only` / `vercelGatewayRouting.order` — provider routing on `ai-gateway.vercel.sh` (see <https://vercel.com/docs/ai-gateway/models-and-providers/provider-options>).\n\nProvider-level `compat` is the baseline; per-model `compat` is deep-merged on top, with `openRouterRouting`, `vercelGatewayRouting`, and `extraBody` merged as nested objects.\n\n### Anthropic compatibility (`anthropic-messages`)\n\nFor `anthropic-messages` models the runtime uses a separate `AnthropicCompat` shape (`packages/catalog/src/types.ts`). The `models.yml` schema exposes the strict-tools opt-out as a top-level provider field (see below) plus two Anthropic-side flags in the same `compat` slot — `requiresToolResultId` (non-standard `id` alias on `tool_result` blocks for Z.AI-style proxies) and `replayUnsignedThinking` (replay unsigned thinking blocks as native thinking instead of demoting them to text); the remaining Anthropic-side knobs (`disableAdaptiveThinking`, `supportsEagerToolInputStreaming`, `supportsLongCacheRetention`, `supportsMidConversationSystem`, `supportsForcedToolChoice`, `supportsSamplingParams`) are set by built-in catalog metadata and are not user-configurable from `models.yml`.\n\n### Strict tool schemas (`disableStrictTools`)\n\nAnthropic's API supports a `strict` field on tool definitions that forces the model to always follow the provided schema exactly. OMP enables it by default for a small allowlist of high-frequency built-in `anthropic-messages` tools (`bash`, `python`, `edit`, and `find`) whose schemas fit Anthropic's strict grammar limits; other tools still send normalized schemas but omit `strict`.\n\nThird-party providers that front the Anthropic API (AWS Bedrock, Azure, self-hosted proxies) do not always implement this field and will reject requests that include it. Set `disableStrictTools: true` at the provider level to opt out of strict mode for the allowlisted tools:\n\n```yaml\nproviders:\n bedrock-anthropic:\n baseUrl: https://bedrock-runtime.us-east-1.amazonaws.com/anthropic\n apiKey: AWS_BEARER_TOKEN\n api: anthropic-messages\n disableStrictTools: true\n models:\n - id: claude-sonnet-4-20250514\n name: Claude Sonnet 4 (Bedrock)\n input: [text, image]\n contextWindow: 200000\n maxTokens: 16384\n cost:\n input: 3.00\n output: 15.00\n cacheRead: 0.30\n cacheWrite: 3.75\n```\n\n`disableStrictTools` is a provider-level flag that applies to all models in the provider. It disables the Anthropic `strict` marker only for tools that OMP would otherwise mark strict; it does not change runtime tool argument validation. OMP can automatically retry without strict tools after Anthropic reports a strict-grammar-too-large error before the first streamed token, but proxies that reject the `strict` field for other reasons should set this flag explicitly.\n\nTool schemas going on the wire are normalized by the unified flow in\n`packages/ai/src/utils/schema/normalize.ts` (Google/CCA/MCP dispatchers\nplus the OpenAI strict-mode sanitize+enforce pipeline). See\n[`ai-schema-normalize.md`](./ai-schema-normalize.md) for the strict-mode\nedge cases (local `$ref` inlining, single-item `allOf` collapse,\n`anyOf`-wrapper description hoist, enum/const primitive-type inference)\nand the per-provider dispatcher mapping.\n\n## Practical examples\n\n### Local OpenAI-compatible endpoint (no auth)\n\n```yaml\nproviders:\n local-openai:\n baseUrl: http://127.0.0.1:8000/v1\n auth: none\n api: openai-completions\n models:\n - id: Qwen/Qwen2.5-Coder-32B-Instruct\n name: Qwen 2.5 Coder 32B (local)\n```\n\nFor oMLX or another local OpenAI-compatible server with a discoverable `/v1/models` endpoint, prefer discovery instead of listing models by hand. Set `api` to the endpoint family your server actually exposes: `openai-completions` uses `/v1/chat/completions`; servers that expose `/v1/responses` need `openai-responses` instead.\n\n```yaml\nproviders:\n omlx:\n baseUrl: http://127.0.0.1:11434/v1\n auth: none\n api: openai-completions\n discovery:\n type: openai-models-list\n```\n\nThe built-in vLLM provider can be pointed at a non-default endpoint without declaring a custom discovery type. OMP uses vLLM's `/v1/models` metadata and preserves vLLM's `max_model_len` field as the discovered context window.\n\n```yaml\nproviders:\n vllm:\n baseUrl: http://192.168.5.3:8085/v1\n auth: none\n```\n\nFor multiple vLLM endpoints, use arbitrary provider IDs with the generic OpenAI-compatible discovery path. Set `auth: none` for local no-auth servers or `apiKey` for authenticated ones. Generic discovery reads `max_model_len` first and then `context_length` as a generic OpenAI-compatible fallback.\n\n```yaml\nproviders:\n vllm-fast:\n baseUrl: http://host-a:8000/v1\n auth: none\n api: openai-completions\n discovery:\n type: openai-models-list\n vllm-long:\n baseUrl: http://host-b:8000/v1\n auth: none\n api: openai-completions\n discovery:\n type: openai-models-list\n```\n\n### Hosted proxy with env-based key\n\n```yaml\nproviders:\n anthropic-proxy:\n baseUrl: https://proxy.example.com/anthropic\n apiKey: ANTHROPIC_PROXY_API_KEY\n api: anthropic-messages\n authHeader: true\n disableStrictTools: true # if the proxy doesn't support strict tool schemas\n models:\n - id: claude-sonnet-4-20250514\n name: Claude Sonnet 4 (Proxy)\n reasoning: true\n input: [text, image]\n```\n\n### Override built-in provider route + model metadata\n\n```yaml\nproviders:\n openrouter:\n baseUrl: https://my-proxy.example.com/v1\n headers:\n X-Team: platform\n modelOverrides:\n anthropic/claude-sonnet-4:\n name: Sonnet 4 (Corp)\n compat:\n openRouterRouting:\n only: [anthropic]\n```\n\n## Legacy consumer caveat\n\nMost model configuration now flows through `models.yml` via `ModelRegistry`. Explicit `.json` / `.jsonc` paths remain supported only when passed programmatically to `ModelRegistry`; the default user config is `~/.omp/agent/models.yml`.\n\n## Failure mode\n\nIf `models.yml` fails schema or validation checks:\n\n- registry keeps operating with built-in models\n- error is exposed via `ModelRegistry.getError()` and surfaced in UI/notifications\n",
38
+ "models.md": "# Model and Provider Configuration (`models.yml`)\n\nThis document describes how the coding-agent currently loads models, applies overrides, resolves credentials, and chooses models at runtime.\n\n## What controls model behavior\n\nPrimary implementation files:\n\n- `src/config/model-registry.ts` — loads built-in + custom models, provider overrides, runtime discovery, auth integration\n- `src/config/model-resolver.ts` — parses model patterns and selects initial/smol/slow models\n- `src/config/settings-schema.ts` — model-related settings (`modelRoles`, provider transport preferences)\n- `src/session/auth-storage.ts` — API key + OAuth resolution order\n- `packages/catalog/src/models.ts` and `packages/catalog/src/types.ts` — built-in providers/models (`getBundledModels` / `getBundledProviders`) and `Model`/`compat` types\n\n## Config file location and legacy behavior\n\nDefault config path:\n\n- `~/.omp/agent/models.yml`\n\nLegacy behavior still present:\n\n- If `models.yml` is missing and `models.json` exists at the same location, it is migrated to `models.yml`.\n- Explicit `.json` / `.jsonc` config paths are still supported when passed programmatically to `ModelRegistry`.\n\n## `models.yml` shape\n\n```yaml\nproviders:\n <provider-id>:\n # provider-level config\nequivalence:\n overrides:\n <provider-id>/<model-id>: <canonical-model-id>\n exclude:\n - <provider-id>/<model-id>\n```\n\n`provider-id` is the canonical provider key used across selection and auth lookup.\n\n`equivalence` is optional and configures canonical model grouping on top of concrete provider models:\n\n- `overrides` maps an exact concrete selector (`provider/modelId`) to an official upstream canonical id\n- `exclude` opts a concrete selector out of canonical grouping\n\n## Provider-level fields\n\n```yaml\nproviders:\n my-provider:\n baseUrl: https://api.example.com/v1\n apiKey: MY_PROVIDER_API_KEY\n api: openai-completions\n headers:\n X-Team: platform\n authHeader: true\n auth: apiKey\n disableStrictTools: false # set true for Anthropic-compatible endpoints that reject the strict field\n discovery:\n type: ollama\n modelOverrides:\n some-model-id:\n name: Renamed model\n models:\n - id: some-model-id\n name: Some Model\n api: openai-completions\n reasoning: false\n input: [text]\n cost:\n input: 0\n output: 0\n cacheRead: 0\n cacheWrite: 0\n contextWindow: 128000\n maxTokens: 16384\n headers:\n X-Model: value\n compat:\n supportsStore: true\n supportsDeveloperRole: true\n supportsReasoningEffort: true\n maxTokensField: max_completion_tokens\n openRouterRouting:\n only: [anthropic]\n vercelGatewayRouting:\n order: [anthropic, openai]\n extraBody:\n gateway: m1-01\n controller: mlx\n```\n\n### Allowed provider/model `api` values\n\n- `openai-completions`\n- `openai-responses`\n- `openai-codex-responses`\n- `azure-openai-responses`\n- `anthropic-messages`\n- `google-generative-ai`\n- `google-vertex`\n\n### Allowed auth/discovery values\n\n- `auth`: `apiKey` (default), `none`, or `oauth`; for `models.yml` custom models, `oauth` is accepted by schema but does not waive the `apiKey` requirement\n- `discovery.type`: `ollama`, `llama.cpp`, `lm-studio`, `openai-models-list`, or `proxy`\n- `transport`: `pi-native` only. When set, every model under that provider is sent to an `omp auth-gateway` compatible `baseUrl` via `POST /v1/pi/stream`; `apiKey` is the gateway bearer.\n\n## Validation rules (current)\n\n### Full custom provider (`models` is non-empty)\n\nRequired:\n\n- `baseUrl`\n- `apiKey` unless `auth: none`\n- `api` at provider level or each model\n\n### Override-only provider (`models` missing or empty)\n\nMust define at least one of:\n\n- `baseUrl`\n- `apiKey`\n- `headers`\n- `compat`\n- `disableStrictTools`\n- `modelOverrides`\n- `discovery`\n\n### Discovery\n\n- `discovery` requires provider-level `api`, except `discovery.type: proxy` (per-model wire auto-detected).\n\n### Model value checks\n\n- `id` required\n- `contextWindow` and `maxTokens` must be positive if provided\n\n### Command-resolved secrets\n\nProvider `apiKey` values and provider/model `headers` values may start with `!` to read a secret from command stdout. The command is run with a 10 s timeout, stdout is trimmed, and empty/failing commands are omitted:\n\n```yaml\nproviders:\n openai:\n apiKey: \"!op read op://dev/openai/api-key\"\n headers:\n X-Team-Key: \"!bw get password omp-team-key\"\n```\n\nSuccessful command outputs are cached for the process lifetime so the command is not re-run for every model.\n\n## Merge and override order\n\nModelRegistry pipeline (on refresh):\n\n1. Load built-in providers/models from `@oh-my-pi/pi-ai`.\n2. Load `models.yml` custom config.\n3. Apply provider overrides (`baseUrl`, `headers`, `disableStrictTools`) to built-in models.\n4. Apply `modelOverrides` (per provider + model id).\n5. Merge custom `models`:\n - same `provider + id` replaces existing\n - otherwise append\n6. Load cached/runtime-discovered models (Ollama, llama.cpp, LM Studio, plus built-in provider managers), then re-apply model overrides.\n\n### Provider-model cache and static fingerprint\n\nCached per-provider model lists are persisted in the model-cache SQLite\ndatabase (current schema version 5) with a `static_fingerprint` column that\nhashes the static catalog slice merged into the row. When `resolveProviderModels`\nskips the network fetch and the fingerprint of the in-memory static\ncatalog matches the cached one, the cached rows are returned verbatim —\nthe static + dynamic merge is bypassed entirely. The fingerprint is\nmemoized per process by tagging the static-models array with a symbol\nproperty, so repeated cold-start calls do not re-hash.\n\n## Canonical model equivalence and coalescing\n\nThe registry keeps every concrete provider model and then builds a canonical layer above them.\n\nCanonical ids are official upstream ids only, for example:\n\n- `claude-opus-4-6`\n- `claude-haiku-4-5`\n- `gpt-5.3-codex`\n\n### `models.yml` equivalence config\n\nExample:\n\n```yaml\nproviders:\n zenmux:\n baseUrl: https://api.zenmux.example/v1\n apiKey: ZENMUX_API_KEY\n api: openai-codex-responses\n models:\n - id: codex\n name: Zenmux Codex\n reasoning: true\n input: [text]\n cost:\n input: 0\n output: 0\n cacheRead: 0\n cacheWrite: 0\n contextWindow: 200000\n maxTokens: 32768\n\nequivalence:\n overrides:\n zenmux/codex: gpt-5.3-codex\n p-codex/codex: gpt-5.3-codex\n exclude:\n - demo/codex-preview\n```\n\nBuild order for canonical grouping:\n\n1. exact user override from `equivalence.overrides`\n2. bundled official-id matches from built-in model metadata\n3. conservative heuristic normalization for gateway/provider variants\n4. fallback to the concrete model's own id\n\nCurrent heuristics are intentionally narrow:\n\n- embedded upstream prefixes can be stripped when present, for example `anthropic/...` or `openai/...`\n- dotted and dashed version variants can normalize only when they map to an existing official id, for example `4.6 -> 4-6`\n- ambiguous families or versions are not merged without a bundled match or explicit override\n\n### Canonical resolution behavior\n\nWhen multiple concrete variants share a canonical id, resolution uses:\n\n1. availability and auth\n2. `config.yml` `modelProviderOrder`\n3. existing registry/provider order if `modelProviderOrder` is unset\n\nDisabled or unauthenticated providers are skipped.\n\nSession state and transcripts continue to record the concrete provider/model that actually executed the turn.\n\nProvider defaults vs per-model overrides:\n\n- Provider `headers` are baseline.\n- Model `headers` override provider header keys.\n- `modelOverrides` can override model metadata (`name`, `reasoning`, `thinking`, `input`, `cost`, `premiumMultiplier`, `contextWindow`, `maxTokens`, `headers`, `compat`, `contextPromotionTarget`).\n- `compat` is deep-merged for nested routing blocks (`openRouterRouting`, `vercelGatewayRouting`, `extraBody`).\n\n## Runtime discovery integration\n\n### Implicit Ollama discovery\n\nIf `ollama` is not explicitly configured, registry adds an implicit discoverable provider:\n\n- provider: `ollama`\n- api: `openai-responses`\n- base URL: `OLLAMA_BASE_URL`, or `OLLAMA_HOST`, or `http://127.0.0.1:11434`\n- context window: `OLLAMA_CONTEXT_LENGTH` if set, otherwise Ollama `/api/show` metadata, otherwise `128000`\n- auth mode: keyless (`auth: none` behavior)\n\nRuntime discovery calls Ollama endpoints and normalizes discovered OpenAI-compatible models to `openai-responses`.\n\n`OLLAMA_CONTEXT_LENGTH` does not configure Ollama's runtime `num_ctx`; set that in Ollama/model configuration separately.\n\n### Implicit llama.cpp discovery\n\nIf `llama.cpp` is not explicitly configured, registry adds an implicit discoverable provider:\n\n- provider: `llama.cpp`\n- api: `openai-responses`\n- base URL: `LLAMA_CPP_BASE_URL` or `http://127.0.0.1:8080`\n- auth mode: keyless (`auth: none` behavior)\n\nRuntime discovery calls llama.cpp model endpoints and synthesizes model entries with local defaults.\n\n### Implicit LM Studio discovery\n\nIf `lm-studio` is not explicitly configured, registry adds an implicit discoverable provider:\n\n- provider: `lm-studio`\n- api: `openai-completions`\n- base URL: `LM_STUDIO_BASE_URL` or `http://127.0.0.1:1234/v1`\n- auth mode: keyless (`auth: none` behavior)\n\nRuntime discovery fetches models (`GET /models`) and synthesizes model entries with local defaults.\n\nThis path also works for local OpenAI-compatible servers that are not LM Studio. For example, if oMLX is bound to Ollama's usual port, set `LM_STUDIO_BASE_URL=http://127.0.0.1:11434/v1` to discover it through the existing `/v1/models` flow. Running oMLX and Ollama side by side requires assigning a different port to one of them. Do not configure oMLX as `ollama`: Ollama discovery uses native `/api/tags` and `/api/show` endpoints, not OpenAI `/v1/models`.\n\n### Explicit provider discovery\n\nYou can configure discovery yourself:\n\n```yaml\nproviders:\n ollama:\n baseUrl: http://127.0.0.1:11434\n api: openai-responses\n auth: none\n discovery:\n type: ollama\n\n llama.cpp:\n baseUrl: http://127.0.0.1:8080\n api: openai-responses\n auth: none\n discovery:\n type: llama.cpp\n```\n\n### Proxy discovery (`discovery.type: proxy`)\n\nFor Anthropic+OpenAI-compatible proxies (new-api / one-api / similar)\nthat expose both `/v1/messages` and `/v1/chat/completions` behind the same\nhost. Discovery hits `GET /v1/models` (10s timeout, OpenAI-style payload) and\nderives each model's `api` from the entry's `supported_endpoint_types`:\n\n- contains `\"anthropic\"` -> `api: anthropic-messages` (routes via `/v1/messages`)\n- contains `\"openai\"` -> `api: openai-completions` (routes via `/v1/chat/completions`)\n- otherwise -> falls back to provider-level `api` if set, else dropped\n\nProvider-level `api` is **optional** with `discovery.type: proxy` because the\nper-model wire is auto-detected. The Anthropic SDK strips a trailing `/v1`\nfrom `baseUrl` before appending `/v1/messages`, so a single discovery `baseUrl`\n(ending in `/v1`) round-trips correctly to both wires.\n\n```yaml\nproviders:\n newapi-reseller:\n baseUrl: https://api.example.com/v1\n apiKey: xxxx\n authHeader: true # injects Authorization: Bearer for openai models\n disableStrictTools: true # most anthropic-fronted proxies reject `strict`\n discovery:\n type: proxy\n```\n\n### Extension provider registration\n\nExtensions can register providers at runtime (`pi.registerProvider(...)`), including:\n\n- model replacement/append for a provider\n- custom stream handler registration for new API IDs\n- custom OAuth provider registration\n\n## Auth and API key resolution order\n\nWhen requesting a key for a provider, effective order is:\n\n1. Runtime override (CLI `--api-key`)\n2. Stored API key credential in `agent.db`\n3. Stored OAuth credential in `agent.db` (with refresh)\n4. Environment variable mapping (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc.)\n5. ModelRegistry fallback resolver (provider `apiKey` from `models.yml`, env-name-or-literal semantics)\n\n`models.yml` `apiKey` behavior:\n\n- Value is first treated as an environment variable name.\n- If no env var exists, the literal string is used as the token.\n\nIf `authHeader: true` and provider `apiKey` is set, models get:\n\n- `Authorization: Bearer <resolved-key>` header injected.\n\nKeyless providers:\n\n- Providers marked `auth: none` are treated as available without credentials.\n- `getApiKey*` returns `kNoAuth` for them.\n\n### Broker mode\n\nWhen `OMP_AUTH_BROKER_URL` (or `auth.broker.url`) is set, the local SQLite credential store is replaced by `RemoteAuthCredentialStore`. Layers 2 and 3 above (stored API key / OAuth in `agent.db`) are served from a broker-supplied snapshot whose `refresh` tokens are redacted; expiry triggers `POST /v1/credential/:id/refresh` on the broker rather than a local refresh.\n\n`AuthStorage.setConfigApiKey` lets a `models.yml` `apiKey` win over a broker-resolved OAuth token without overriding a runtime `--api-key`. See [`auth-broker-gateway.md`](./auth-broker-gateway.md) for the full broker / gateway design and env surface (`OMP_AUTH_BROKER_URL`, `OMP_AUTH_BROKER_TOKEN`, `auth.broker.url`, `auth.broker.token`).\n\n## Model availability vs all models\n\n- `getAll()` returns the loaded model registry (built-in + merged custom + discovered).\n- `getAvailable()` filters to models that are keyless or have resolvable auth.\n\nSo a model can exist in registry but not be selectable until auth is available.\n\n## Runtime model resolution\n\n### CLI and pattern parsing\n\n`model-resolver.ts` supports:\n\n- exact `provider/modelId`\n- exact canonical model id\n- exact model id (provider inferred)\n- fuzzy/substring matching\n- glob scope patterns in `--models` (e.g. `openai/*`, `*sonnet*`)\n- optional `:thinkingLevel` suffix (`off|minimal|low|medium|high|xhigh`)\n\n`--provider` is legacy; `--model` is preferred.\n\nResolution precedence for exact selectors:\n\n1. exact `provider/modelId` bypasses coalescing\n2. exact canonical id resolves through the canonical index\n3. exact bare concrete id still works\n4. fuzzy and glob matching run after the exact paths\n\n### Initial model selection priority\n\n`findInitialModel(...)` uses this order:\n\n1. explicit CLI provider+model\n2. first scoped model (if not resuming)\n3. saved default provider/model\n4. known provider defaults (e.g. OpenAI/Anthropic/etc.) among available models\n5. first available model\n\n### Role aliases and settings\n\nSupported model roles:\n\n- `default`, `smol`, `slow`, `vision`, `plan`, `designer`, `commit`, `title`, `task`, `advisor`\n\nRole aliases like `pi/smol` expand through `settings.modelRoles`. Each role value can also append a thinking selector such as `:minimal`, `:low`, `:medium`, or `:high`.\n\nIf a role points at another role, the target model still inherits normally and any explicit suffix on the referring role wins for that role-specific use.\n\nRelated settings:\n\n- `modelRoles` (record)\n- `enabledModels` (scoped pattern list)\n- `modelProviderOrder` (global canonical-provider precedence)\n- `providers.kimiApiFormat` (`openai` or `anthropic` request format)\n- `providers.openaiWebsockets` (`auto|off|on` websocket preference for OpenAI Codex transport)\n\n`modelRoles` may store either:\n\n- `provider/modelId` to pin a concrete provider variant\n- a canonical id such as `gpt-5.3-codex` to allow provider coalescing\n\nFor `enabledModels` and CLI `--models`:\n\n- exact canonical ids expand to all concrete variants in that canonical group\n- explicit `provider/modelId` entries stay exact\n- globs and fuzzy matches still operate on concrete models\n\nGlobal `enabledModels` and `disabledProviders` entries may also be scoped to a path prefix:\n\n```yaml\nenabledModels:\n - claude-sonnet-4-5\n - path: ~/work\n models:\n - anthropic/claude-opus-4-5\ndisabledProviders:\n - ollama\n - path: ~/private\n providers:\n - anthropic\n```\n\nString entries apply everywhere. Scoped entries apply when the current working directory is the configured path or one of its subdirectories. Use `path`, `paths`, `pathPrefix`, or `pathPrefixes`; use `models` for `enabledModels`, `providers` for `disabledProviders`, or `values` for either.\n\n## `/model` and `omp models`\n\nBoth surfaces keep provider-prefixed models visible and selectable.\n\nThey now also expose canonical/coalesced models:\n\n- `/model` includes a canonical view alongside provider tabs\n- `omp models` prints provider-grouped tables of every concrete model, and `omp models canonical` prints the coalesced canonical view\n\nSelecting a canonical entry stores the canonical selector. Selecting a provider row stores the explicit `provider/modelId`.\n\n## Context promotion (model-level fallback chains)\n\nContext promotion is an overflow recovery mechanism for small-context variants (for example `*-spark`) that automatically promotes to a larger-context sibling when the API rejects a request with a context length error.\n\n### Trigger and order\n\nWhen a turn fails with a context overflow error (e.g. `context_length_exceeded`), `AgentSession` attempts promotion **before** falling back to compaction:\n\n1. If `contextPromotion.enabled` is true, resolve a promotion target (see below).\n2. If a target is found, switch to it and retry the request — no compaction needed.\n3. If no target is available, fall through to auto-compaction on the current model.\n\n### Target selection\n\nSelection is model-driven, not role-driven:\n\n1. `currentModel.contextPromotionTarget` (if configured)\n2. smallest larger-context model on the same provider + API\n\nCandidates are ignored unless credentials resolve (`ModelRegistry.getApiKey(...)`).\n\n### OpenAI Codex websocket handoff\n\nIf switching from/to `openai-codex-responses`, session provider state key `openai-codex-responses` is closed before model switch. This drops websocket transport state so the next turn starts clean on the promoted model.\n\n### Persistence behavior\n\nPromotion uses temporary switching (`setModelTemporary`):\n\n- recorded as a temporary `model_change` in session history\n- does not rewrite saved role mapping\n\n### Configuring explicit fallback chains\n\nConfigure fallback directly in model metadata via `contextPromotionTarget`.\n\n`contextPromotionTarget` accepts either:\n\n- `provider/model-id` (explicit)\n- `model-id` (resolved within current provider)\n\nExample (`models.yml`) for an explicit OpenAI fallback:\n\n```yaml\nproviders:\n openai-codex:\n modelOverrides:\n gpt-5.5:\n contextPromotionTarget: openai-codex/gpt-5.4\n```\n\nThe built-in model policy currently links OpenAI `codex-spark` variants to `gpt-5.5`, and `gpt-5.5` to `gpt-5.4`, when that target exists on the same provider/API.\n\n## Compatibility and routing fields\n\nThe `compat` block on a provider or model overrides the URL-based auto-detection in `packages/catalog/src/compat/openai.ts` (`buildOpenAICompat`). It is validated by `OpenAICompatSchema` in `packages/coding-agent/src/config/models-config-schema.ts` and consumed by every `openai-completions` transport (`packages/ai/src/providers/openai-completions.ts`). The canonical type is `OpenAICompat` in `packages/catalog/src/types.ts`.\n\n`models.yml` accepts the following keys (all optional; unset falls back to URL detection):\n\nRequest shaping:\n\n- `supportsStore` — emit `store: false` on requests. Default: auto (off for non-standard endpoints).\n- `supportsDeveloperRole` — use the `developer` system role for reasoning models instead of `system`. Default: auto.\n- `supportsMultipleSystemMessages` — preserve separate leading system/developer messages instead of coalescing them. Default: auto (known OpenAI-compatible hosted APIs preserve; strict-template/local hosts coalesce).\n- `supportsUsageInStreaming` — send `stream_options: { include_usage: true }` to receive token usage on streaming responses. Default: `true`.\n- `maxTokensField` — `\"max_completion_tokens\"` or `\"max_tokens\"`. Default: auto.\n- `supportsToolChoice` — emit the `tool_choice` parameter when the caller forces a specific tool. Default: `true`. Set `false` for endpoints that 400 on `tool_choice` (e.g. DeepSeek when reasoning is on).\n- `disableReasoningOnForcedToolChoice` — drop `reasoning_effort` / OpenRouter `reasoning` whenever `tool_choice` forces a call. Default: auto (Kimi/Anthropic-fronted endpoints).\n- `disableReasoningOnToolChoice` — drop reasoning fields whenever any `tool_choice` is sent. Default: auto (DeepSeek reasoning models).\n- `alwaysSendMaxTokens` — always send a max-token field when the caller did not provide one. Default: auto (Kimi-family models derive TPM limits from `max_tokens`).\n- `strictResponsesPairing` — Responses-API tool-call/result history must be strictly paired. Default: auto (Azure OpenAI, GitHub Copilot).\n- `streamIdleTimeoutMs` — stream-watchdog idle-timeout floor in ms for slow reasoning hosts. Default: auto (GLM coding-plan hosts, direct DeepSeek reasoning).\n- `cacheControlFormat` — `\"anthropic\"` to include Anthropic-style prompt-cache markers in chat-completions payloads. Default: auto (OpenRouter `anthropic/*` models).\n- `supportsLongPromptCacheRetention` — host honors `prompt_cache_retention: \"24h\"` on the Responses API. Default: auto (api.openai.com).\n- `extraBody` — extra top-level fields merged into every request body (gateway hints, controller selectors, etc.).\n\nReasoning / thinking:\n\n- `supportsReasoningEffort` — accept `reasoning_effort`. Default: auto (off for Grok, Z.ai/Zhipu, and Xiaomi MiMo).\n- `supportsReasoningParams` — whether request shaping may send reasoning params at all. Default: auto (off for GitHub Copilot chat-completions).\n- `reasoningEffortMap` — partial map from internal effort levels (`minimal|low|medium|high|xhigh`) to provider-specific strings (e.g. DeepSeek maps `xhigh -> \"max\"`).\n- `thinkingFormat` — request shape for thinking: `\"openai\"` (`reasoning_effort`), `\"openrouter\"` (`reasoning: { effort }`), `\"zai\"` (`thinking: { type: \"enabled\" }`), `\"qwen\"` (top-level `enable_thinking`), or `\"qwen-chat-template\"` (`chat_template_kwargs.enable_thinking`). Default: `\"openai\"`.\n- `reasoningContentField` — assistant field carrying chain-of-thought: `\"reasoning_content\"`, `\"reasoning\"`, or `\"reasoning_text\"`. Default: auto.\n- `requiresReasoningContentForToolCalls` — assistant tool-call turns must round-trip the reasoning field (DeepSeek-R1, Kimi, OpenRouter when reasoning is on). Default: `false`.\n- `allowsSyntheticReasoningContentForToolCalls` — allow a placeholder reasoning field when a prior assistant tool-call turn lacks provider reasoning content. Default: `true`; set `false` for providers that validate the exact reasoning value.\n- `requiresAssistantContentForToolCalls` — assistant tool-call turns must include non-empty text content (Kimi). Default: `false`.\n- `whenThinking` — partial compat overrides applied only when a request actually engages thinking mode (deep-merged over the baseline compat).\n\nTool / message normalization:\n\n- `requiresToolResultName` — tool-result messages need a `name` field (Mistral). Default: auto.\n- `requiresAssistantAfterToolResult` — a user message after a tool result needs an assistant turn in between. Default: auto.\n- `requiresThinkingAsText` — convert thinking blocks to text wrapped in `<thinking>` delimiters (Mistral). Default: auto.\n- `requiresMistralToolIds` — normalize tool-call ids to exactly 9 alphanumeric chars. Default: auto.\n- `supportsStrictMode` — accept the per-tool `strict` field on tool schemas. Default: conservative auto-detect per provider/baseUrl.\n- `toolStrictMode` — `\"all_strict\"` forces strict on every tool, `\"none\"` forces it off; unset keeps the existing per-tool mixed behavior.\n\nGateway routing (only applied when `baseUrl` matches the gateway):\n\n- `openRouterRouting.only` / `openRouterRouting.order` — provider routing on `openrouter.ai` (see <https://openrouter.ai/docs/provider-routing>).\n- `vercelGatewayRouting.only` / `vercelGatewayRouting.order` — provider routing on `ai-gateway.vercel.sh` (see <https://vercel.com/docs/ai-gateway/models-and-providers/provider-options>).\n\nProvider-level `compat` is the baseline; per-model `compat` is deep-merged on top, with `openRouterRouting`, `vercelGatewayRouting`, and `extraBody` merged as nested objects.\n\n### Anthropic compatibility (`anthropic-messages`)\n\nFor `anthropic-messages` models the runtime uses a separate `AnthropicCompat` shape (`packages/catalog/src/types.ts`). The `models.yml` schema exposes the strict-tools opt-out as a top-level provider field (see below) plus two Anthropic-side flags in the same `compat` slot — `requiresToolResultId` (non-standard `id` alias on `tool_result` blocks for Z.AI-style proxies) and `replayUnsignedThinking` (replay unsigned thinking blocks as native thinking instead of demoting them to text); the remaining Anthropic-side knobs (`disableAdaptiveThinking`, `supportsEagerToolInputStreaming`, `supportsLongCacheRetention`, `supportsMidConversationSystem`, `supportsForcedToolChoice`, `supportsSamplingParams`) are set by built-in catalog metadata and are not user-configurable from `models.yml`.\n\n### Strict tool schemas (`disableStrictTools`)\n\nAnthropic's API supports a `strict` field on tool definitions that forces the model to always follow the provided schema exactly. OMP enables it by default for a small allowlist of high-frequency built-in `anthropic-messages` tools (`bash`, `python`, `edit`, and `find`) whose schemas fit Anthropic's strict grammar limits; other tools still send normalized schemas but omit `strict`.\n\nThird-party providers that front the Anthropic API (AWS Bedrock, Azure, self-hosted proxies) do not always implement this field and will reject requests that include it. Set `disableStrictTools: true` at the provider level to opt out of strict mode for the allowlisted tools:\n\n```yaml\nproviders:\n bedrock-anthropic:\n baseUrl: https://bedrock-runtime.us-east-1.amazonaws.com/anthropic\n apiKey: AWS_BEARER_TOKEN\n api: anthropic-messages\n disableStrictTools: true\n models:\n - id: claude-sonnet-4-20250514\n name: Claude Sonnet 4 (Bedrock)\n input: [text, image]\n contextWindow: 200000\n maxTokens: 16384\n cost:\n input: 3.00\n output: 15.00\n cacheRead: 0.30\n cacheWrite: 3.75\n```\n\n`disableStrictTools` is a provider-level flag that applies to all models in the provider. It disables the Anthropic `strict` marker only for tools that OMP would otherwise mark strict; it does not change runtime tool argument validation. OMP can automatically retry without strict tools after Anthropic reports a strict-grammar-too-large error before the first streamed token, but proxies that reject the `strict` field for other reasons should set this flag explicitly.\n\nTool schemas going on the wire are normalized by the unified flow in\n`packages/ai/src/utils/schema/normalize.ts` (Google/CCA/MCP dispatchers\nplus the OpenAI strict-mode sanitize+enforce pipeline). See\n[`ai-schema-normalize.md`](./ai-schema-normalize.md) for the strict-mode\nedge cases (local `$ref` inlining, single-item `allOf` collapse,\n`anyOf`-wrapper description hoist, enum/const primitive-type inference)\nand the per-provider dispatcher mapping.\n\n## Practical examples\n\n### Local OpenAI-compatible endpoint (no auth)\n\n```yaml\nproviders:\n local-openai:\n baseUrl: http://127.0.0.1:8000/v1\n auth: none\n api: openai-completions\n models:\n - id: Qwen/Qwen2.5-Coder-32B-Instruct\n name: Qwen 2.5 Coder 32B (local)\n```\n\nFor oMLX or another local OpenAI-compatible server with a discoverable `/v1/models` endpoint, prefer discovery instead of listing models by hand. Set `api` to the endpoint family your server actually exposes: `openai-completions` uses `/v1/chat/completions`; servers that expose `/v1/responses` need `openai-responses` instead.\n\n```yaml\nproviders:\n omlx:\n baseUrl: http://127.0.0.1:11434/v1\n auth: none\n api: openai-completions\n discovery:\n type: openai-models-list\n```\n\nThe built-in vLLM provider can be pointed at a non-default endpoint without declaring a custom discovery type. OMP uses vLLM's `/v1/models` metadata and preserves vLLM's `max_model_len` field as the discovered context window.\n\n```yaml\nproviders:\n vllm:\n baseUrl: http://192.168.5.3:8085/v1\n auth: none\n```\n\nFor multiple vLLM endpoints, use arbitrary provider IDs with the generic OpenAI-compatible discovery path. Set `auth: none` for local no-auth servers or `apiKey` for authenticated ones. Generic discovery reads `max_model_len` first and then `context_length` as a generic OpenAI-compatible fallback.\n\n```yaml\nproviders:\n vllm-fast:\n baseUrl: http://host-a:8000/v1\n auth: none\n api: openai-completions\n discovery:\n type: openai-models-list\n vllm-long:\n baseUrl: http://host-b:8000/v1\n auth: none\n api: openai-completions\n discovery:\n type: openai-models-list\n```\n\n### Hosted proxy with env-based key\n\n```yaml\nproviders:\n anthropic-proxy:\n baseUrl: https://proxy.example.com/anthropic\n apiKey: ANTHROPIC_PROXY_API_KEY\n api: anthropic-messages\n authHeader: true\n disableStrictTools: true # if the proxy doesn't support strict tool schemas\n models:\n - id: claude-sonnet-4-20250514\n name: Claude Sonnet 4 (Proxy)\n reasoning: true\n input: [text, image]\n```\n\n### Override built-in provider route + model metadata\n\n```yaml\nproviders:\n openrouter:\n baseUrl: https://my-proxy.example.com/v1\n headers:\n X-Team: platform\n modelOverrides:\n anthropic/claude-sonnet-4:\n name: Sonnet 4 (Corp)\n compat:\n openRouterRouting:\n only: [anthropic]\n```\n\n## Legacy consumer caveat\n\nMost model configuration now flows through `models.yml` via `ModelRegistry`. Explicit `.json` / `.jsonc` paths remain supported only when passed programmatically to `ModelRegistry`; the default user config is `~/.omp/agent/models.yml`.\n\n## Failure mode\n\nIf `models.yml` fails schema or validation checks:\n\n- registry keeps operating with built-in models\n- error is exposed via `ModelRegistry.getError()` and surfaced in UI/notifications\n",
38
39
  "natives-addon-loader-runtime.md": "# Natives Addon Loader Runtime\n\nThis document covers the runtime loader shipped by `@oh-my-pi/pi-natives`: how `native/index.js` decides which `.node` file to require, how compiled-binary embedded payloads are extracted, and what startup failures report.\n\n## Implementation files\n\n- `packages/natives/native/index.js`\n- `packages/natives/native/loader-state.js`\n- `packages/natives/native/embedded-addon.js`\n- `packages/natives/scripts/embed-native.ts`\n- `packages/natives/package.json`\n\n## Scope and responsibility\n\nThe loader is intentionally narrow:\n\n- Build a platform/CPU-aware candidate list for addon filenames and directories.\n- Treat an embedded-addon manifest as a compiled-binary signal when present.\n- Optionally materialize embedded addon archive contents into a versioned per-user cache directory.\n- On Windows `node_modules` installs, stage addon files into the versioned cache to avoid locked-DLL update failures.\n- Attempt candidates in deterministic order and return the first addon that `require(...)` loads and validates.\n\nFor install and compiled-binary paths, the loader verifies a release sentinel export named from `package.json#version` (for example `__piNativesV15_7_2`). Workspace-dev loads skip this validation so a local checkout can rebuild after a pull. The loader does not validate the full export surface; stale same-version or incomplete binaries still surface as missing members or native errors at use sites.\n\n## Runtime inputs and derived state\n\nAt module initialization, `native/index.js` computes:\n\n- **Platform tag**: `${process.platform}-${process.arch}` (for example `darwin-arm64`).\n- **Package version**: from `packages/natives/package.json`.\n- **Core directories**:\n - `leafPackageDir`: directory of the platform leaf package, resolved via `require.resolve(\"@oh-my-pi/pi-natives-<tag>/package.json\")`; `null` when no leaf is installed (e.g. local dev) and forced to `null` in compiled-binary mode.\n - `nativeDir`: package-local `packages/natives/native`.\n - `execDir`: directory containing `process.execPath`.\n - `versionedDir`: `<getNativesDir()>/<packageVersion>`.\n - `userDataDir` fallback:\n - Windows: `%LOCALAPPDATA%/omp` or `%USERPROFILE%/AppData/Local/omp`.\n - Non-Windows: `~/.local/bin`.\n- **Natives cache root** (`getNativesDir()`):\n - if `$XDG_DATA_HOME/omp` exists, `$XDG_DATA_HOME/omp/natives`;\n - otherwise `~/.omp/natives`.\n- **Compiled-binary mode** (`detectCompiledBinary`): true if any of:\n - embedded-addon manifest is non-null,\n - `PI_COMPILED` env var is set,\n - `import.meta.url` contains Bun embedded markers (`$bunfs`, `~BUN`, `%7EBUN`).\n- **Windows staging mode** (`shouldStageNodeModulesAddon`): true only on Windows, in non-compiled mode, when `nativeDir` is inside `node_modules`.\n- **Variant override**: `PI_NATIVE_VARIANT` (`modern`/`baseline` only; invalid values ignored).\n- **Selected variant**: explicit override, otherwise runtime AVX2 detection on x64 (`modern` if AVX2, else `baseline`).\n\n## Platform support and tag resolution\n\n`SUPPORTED_PLATFORMS` is fixed to:\n\n- `linux-x64`\n- `linux-arm64`\n- `darwin-x64`\n- `darwin-arm64`\n- `win32-x64`\n\nUnsupported platforms are not rejected before probing. The loader first tries the computed candidate paths. If all fail and `platformTag` is unsupported, it throws an unsupported-platform error listing supported tags.\n\n## Variant selection (`modern` / `baseline` / default)\n\n### x64 behavior\n\n1. `PI_NATIVE_VARIANT=modern|baseline` wins when valid.\n2. Otherwise AVX2 support is detected:\n - Linux: scan `/proc/cpuinfo` for `avx2`.\n - macOS: `sysctl -n machdep.cpu.leaf7_features`, then `machdep.cpu.features`.\n - Windows: PowerShell `[System.Runtime.Intrinsics.X86.Avx2]::IsSupported`.\n3. AVX2 selects `modern`; unavailable or undetectable AVX2 selects `baseline`.\n\n### Non-x64 behavior\n\nNo variant suffix is used; the filename is `pi_natives.<platform>-<arch>.node`.\n\n### Filename construction\n\n`loader-state.js#getAddonFilenames` returns:\n\n- Non-x64 or no variant: `pi_natives.<tag>.node`\n- x64 + `modern`:\n 1. `pi_natives.<tag>-modern.node`\n 2. `pi_natives.<tag>-baseline.node`\n 3. `pi_natives.<tag>.node`\n- x64 + `baseline`:\n 1. `pi_natives.<tag>-baseline.node`\n 2. `pi_natives.<tag>.node`\n\nThe default unsuffixed fallback remains part of the x64 candidate list.\n\n## Candidate path construction and fallback ordering\n\n`resolveLoaderCandidates(...)` expands every filename across directories, then de-duplicates while preserving first occurrence order.\n\n### Non-compiled runtime\n\nCandidates are grouped by directory class, in order:\n\n1. `<leafPackageDir>/<filename>` for every filename (omitted when `leafPackageDir` is `null`)\n2. `<nativeDir>/<filename>` then `<execDir>/<filename>`, per filename\n\nThe leaf package dir comes first so the optional-dependency binary published with the release is preferred over any `.node` left in the core package's `native/` (e.g. a stale local-dev build).\n\nOn Windows installs where `nativeDir` is inside a `node_modules` segment (`shouldStageNodeModulesAddon`), `<versionedDir>/<filename>` staging candidates are prepended ahead of the leaf candidates so a locked `node_modules` binary can be sidestepped during `bun install -g` updates. The staged file is copied from `leafPackageDir ?? nativeDir` before probing.\n\n### Compiled runtime\n\nCandidates are grouped, in order:\n\n1. `<versionedDir>/<filename>` then `<userDataDir>/<filename>`, per filename\n2. `<nativeDir>/<filename>` then `<execDir>/<filename>`, per filename\n\nAt load time, an extracted embedded candidate, or a staged Windows candidate when no embedded candidate exists, is prepended ahead of these de-duplicated candidates.\n\n## Embedded addon extraction lifecycle\n\n`embedded-addon.js` is generated by `scripts/embed-native.ts`. The reset stub exports `embeddedAddon = null`. A populated manifest has:\n\n- `platformTag`\n- `version`\n- `archive`: `{ format: \"tar.gz\", filename, filePath }`\n- `files[]` entries with `variant`, `filename`, and `size`\n\nExtraction (`maybeExtractEmbeddedAddon`) runs only when:\n\n1. compiled-binary mode is true,\n2. `embeddedAddon` is non-null,\n3. manifest `platformTag` equals the runtime platform tag,\n4. manifest `version` equals the package version,\n5. a variant-appropriate embedded file exists.\n\nVariant file selection:\n\n- Non-x64: prefer `default`, then first available file.\n- x64 + `modern`: prefer `modern`, fallback to `baseline`.\n- x64 + `baseline`: require `baseline`.\n\nMaterialization:\n\n1. Ensure `<versionedDir>` exists.\n2. Select `<versionedDir>/<selected filename>`.\n3. If the current cached file exists and its size matches manifest metadata, reuse it.\n4. Otherwise extract `embeddedAddon.archive.filePath` into `<versionedDir>` using the manifest `files[]` allowlist.\n5. Verify the selected target by size and return it as the first candidate.\n\nArchive, directory, or write failures are appended to the loader error list; probing continues through normal candidates.\n\n## Lifecycle and state transitions\n\n```text\nInit\n -> Load package metadata and embedded-addon manifest\n -> Compute platform/version/variant/filenames/candidate paths\n -> (compiled + embedded manifest matches?)\n yes -> extract archive to versionedDir when needed (record errors, continue)\n no -> skip extraction\n -> (Windows non-compiled node_modules install and no embedded candidate?)\n yes -> stage leaf/core addon to versionedDir (record errors, continue)\n no -> skip staging\n -> For each runtime candidate in order:\n require(candidate)\n -> sentinel validation passes or is workspace-dev: return addon exports (READY)\n -> failure: record error, continue\n -> none loaded:\n if unsupported platform tag -> throw Unsupported platform\n else -> throw Failed to load (tried-path diagnostics + hints)\n```\n\n## Failure behavior and diagnostics\n\n### Unsupported platform\n\nIf all candidates fail and `platformTag` is not supported, the loader throws:\n\n- `Unsupported platform: <tag>`\n- supported platform list\n- issue-reporting guidance\n\n### No loadable candidate\n\nIf the platform is supported but no candidate can be loaded, the final error includes:\n\n- `Failed to load pi_natives native addon for <platformTag>` or `<platformTag> (<variant>)`\n- every attempted path with the corresponding `require(...)` or sentinel-validation error\n- mode-specific remediation hints\n\n### Compiled-binary startup failures\n\nCompiled mode diagnostics include:\n\n- expected versioned cache target paths (`<versionedDir>/<filename>`),\n- remediation to delete the versioned cache and rerun,\n- direct release download `curl` commands for each expected filename.\n- release sentinel mismatch details when a loadable `.node` belongs to another `@oh-my-pi/pi-natives` version.\n\n### Non-compiled startup failures\n\nNormal package/runtime diagnostics include:\n\n- reinstall hint (`bun install @oh-my-pi/pi-natives`),\n- local rebuild command (`bun --cwd=packages/natives run build`),\n- optional x64 variant build hint (`TARGET_VARIANT=baseline|modern bun --cwd=packages/natives run build`).\n",
39
40
  "natives-architecture.md": "# Natives Architecture\n\n`@oh-my-pi/pi-natives` is a two-layer package around an ESM loader:\n\n1. **ESM loader/package entrypoint** resolves and loads the correct `.node` addon with `createRequire`, validates the release sentinel outside workspace-dev loads, and re-exports generated classes/functions plus enum runtime objects as explicit named ESM exports.\n2. **Rust N-API module layer** implements the exported functions/classes and emits the generated TypeScript declarations.\n\nThis document is the foundation for deeper module-level docs.\n\n## Implementation files\n\n- `packages/natives/native/index.js`\n- `packages/natives/native/index.d.ts`\n- `packages/natives/native/loader-state.js`\n- `packages/natives/native/embedded-addon.js`\n- `packages/natives/scripts/build-native.ts`\n- `packages/natives/scripts/embed-native.ts`\n- `packages/natives/scripts/gen-enums.ts`\n- `packages/natives/package.json`\n- `crates/pi-natives/src/lib.rs`\n\n## Package entrypoint and public surface\n\n`packages/natives/package.json` points at generated native artifacts:\n\n- `main`: `./native/index.js`\n- `types`: `./native/index.d.ts`\n- `exports[\".\"].types`: `./native/index.d.ts`\n- `exports[\".\"].import`: `./native/index.js`\n\nThere is no current `packages/natives/src` TypeScript wrapper layer. Consumers import functions/classes/enums directly from `@oh-my-pi/pi-natives`; the type contract is the generated `native/index.d.ts` plus the explicit named exports generated into `native/index.js` by `scripts/gen-enums.ts`.\n\nCurrent capability groups in the generated API include:\n\n- **Search/text/code primitives**: `grep`, `search`, `hasMatch`, `fuzzyFind`, `glob`, `astGrep`, `astEdit`, `blockRangeAt`, `summarizeCode`, text width/slicing/wrapping/sanitization, syntax highlighting, token counting.\n- **Execution/process/terminal primitives**: `executeShell`, `Shell`, `PtySession`, `Process`, key parsing, bash fixups.\n- **System/media/isolation/conversion primitives**: clipboard, SIXEL encoding, HTML-to-Markdown, macOS appearance/power helpers, work profiling, workspace scanning, isolation backend helpers (`iso*`).\n\n## Loader layer\n\n`packages/natives/native/index.js` owns runtime addon selection and optional embedded extraction.\n\n### Candidate resolution model\n\n- Platform tag is `${process.platform}-${process.arch}`.\n- Supported tags are currently:\n - `linux-x64`\n - `linux-arm64`\n - `darwin-x64`\n - `darwin-arm64`\n - `win32-x64`\n- x64 can use CPU variants:\n - `modern` (AVX2-capable)\n - `baseline` (fallback)\n- Non-x64 uses the default filename without a variant suffix.\n\nFilename strategy:\n\n- Default: `pi_natives.<platform>-<arch>.node`\n- x64 variant: `pi_natives.<platform>-<arch>-modern.node` or `...-baseline.node`\n- x64 runtime fallback includes the unsuffixed default filename after variant candidates.\n\n### Platform-specific variant detection\n\nFor x64, variant selection uses:\n\n- Linux: `/proc/cpuinfo`\n- macOS: `sysctl -n machdep.cpu.leaf7_features`, then `machdep.cpu.features`\n- Windows: PowerShell check for `System.Runtime.Intrinsics.X86.Avx2`\n\n`PI_NATIVE_VARIANT` can force `modern` or `baseline`; invalid values are ignored.\n\n### Binary distribution and extraction model\n\nThe published `@oh-my-pi/pi-natives` package ships **only** the loader layer in `native/`: the ESM loader (`index.js`), generated declarations (`index.d.ts`), the `loader-state.js`/`.d.ts` helpers, and the embedded-addon manifest stub (`embedded-addon.js`). It carries no `.node` binaries.\n\nEach platform's prebuilt `.node` is published as a separate optional-dependency leaf package — `@oh-my-pi/pi-natives-<platform>-<arch>`, one per supported tag — which the core lists in `optionalDependencies` at the lockstep version during publish. npm/bun install only the leaf whose `os`/`cpu` match the host. The working-tree package keeps built `.node` files under `native/` for local dev; the release-publish rewrite (`prepareNativeCorePackage` in `scripts/ci-release-publish.ts`) strips them from the core tarball, and the leaves are generated by `packages/natives/scripts/gen-npm-packages.ts` (`LEAF_TARGETS`). Adding a build target therefore requires a matching `LEAF_TARGETS` entry, or the binary never reaches npm users.\n\nFor compiled binaries, loader behavior is:\n\n1. Check versioned user cache path: `<getNativesDir()>/<packageVersion>/...`.\n2. Check legacy compiled-binary location:\n - Windows: `%LOCALAPPDATA%/omp` (fallback `%USERPROFILE%/AppData/Local/omp`)\n - non-Windows: `~/.local/bin`\n3. Fall back to packaged `native/` and executable directory candidates.\n\n`getNativesDir()` uses `$XDG_DATA_HOME/omp/natives` when `$XDG_DATA_HOME/omp` exists; otherwise it uses `~/.omp/natives`.\n\nIf a populated embedded addon manifest is present, it is also treated as a compiled-binary signal. Current embedded manifests point at a gzip-compressed tar archive (`embedded-addons.<tag>.tar.gz`) that contains one or more matching `.node` files. The loader extracts the archive into the versioned cache directory, validates the selected file by size, and prepends that cache path before normal candidate probing.\n\nFor npm/bun installs (non-compiled), `loader-state.js` resolves the platform leaf directory via `require.resolve(\"@oh-my-pi/pi-natives-<tag>/package.json\")` and probes its `.node` **before** the core package's `native/` directory and the executable directory. The optional-dependency binary is therefore preferred over any `.node` left in the core (e.g. a stale local-dev build). On Windows `node_modules` installs, the loader first stages the selected leaf/core addon into `<getNativesDir()>/<packageVersion>/...` and prepends that staged path so running processes do not lock the `node_modules` copy during global updates.\n\n### Failure modes\n\nLoader failures are explicit:\n\n- **Unsupported platform tag**: after failed probing, throws with supported platform list.\n- **No loadable candidate**: throws with all attempted paths and remediation hints.\n- **Embedded/staging errors**: directory/write/archive/staging failures are recorded and included in final load diagnostics if no candidate loads.\n- **Release mismatch**: outside workspace-dev loads, a candidate that loads but lacks the version sentinel export for `package.json#version` is rejected with a reinstall hint.\n\n## Rust N-API module layer\n\n`crates/pi-natives/src/lib.rs` declares exported module ownership:\n\n- `appearance`\n- `ast`\n- `block`\n- `clipboard`\n- `crash_handler`\n- `fd`\n- `fs_cache`\n- `glob`\n- `glob_util`\n- `grep`\n- `highlight`\n- `html`\n- `iso`\n- `keys`\n- `language` (re-exported from `pi_ast`)\n- `power`\n- `prof`\n- `ps`\n- `pty`\n- `shell`\n- `sixel`\n- `snapcompact`\n- `summary`\n- `task`\n- `text`\n- `tokens`\n- `utils` (crate-private helpers)\n- `workspace`\n\nN-API exports are generated from Rust `#[napi]` functions/classes/objects/enums. Snake_case Rust names are exposed as camelCase JavaScript names unless explicitly configured by napi-rs.\n\n## Ownership boundaries\n\n- **Loader/package ownership (`packages/natives/native`, `packages/natives/scripts`)**\n - runtime binary selection\n - CPU variant selection and override handling\n - compiled-binary embedded archive extraction\n - Windows `node_modules` addon staging\n - generated TypeScript declarations and explicit ESM export/enum patching\n- **Rust ownership (`crates/pi-natives/src`)**\n - algorithmic and system-level implementation\n - platform-native behavior and performance-sensitive logic\n - N-API symbol implementation consumed directly by package callers\n- **Consumer ownership (`packages/coding-agent`, `packages/tui`)**\n - user-facing policy and fallbacks that are not built into the native API\n - higher-level rendering, artifact, shell-session, and command behavior\n\n## Runtime flow (high level)\n\n1. Consumer imports from `@oh-my-pi/pi-natives`.\n2. `native/index.js` computes platform/arch/variant and candidate paths.\n3. Optional embedded archive extraction or Windows `node_modules` staging can prepend a versioned-cache candidate.\n4. Each candidate is `require(...)`d; install/compiled loads must expose the package-version sentinel.\n5. The loaded addon object is bound to explicit named ESM exports, including generated enum objects.\n6. Caller invokes generated N-API functions/classes directly.\n\n## Glossary\n\n- **Native addon**: A `.node` binary loaded via Node-API (N-API).\n- **Platform tag**: Runtime tuple `platform-arch` (for example `darwin-arm64`).\n- **Platform leaf package**: Per-platform npm package `@oh-my-pi/pi-natives-<tag>` that carries one platform's prebuilt `.node`. The core depends on every leaf via `optionalDependencies`; the package manager installs only the host-matching one (`os`/`cpu`).\n- **Variant**: x64 CPU-specific build flavor (`modern` AVX2, `baseline` fallback).\n- **Generated binding declaration**: `native/index.d.ts` emitted by napi-rs during `build-native.ts`.\n- **Version sentinel**: Rust export named from the package version (for example `__piNativesV15_7_2`) that lets the loader reject a `.node` from a different release.\n- **Compiled binary mode**: Runtime mode where the CLI is bundled and native addons are resolved from embedded/cache paths before package-local paths.\n- **Embedded addon**: Build artifact metadata and archive reference generated into `native/embedded-addon.js` so compiled binaries can extract matching `.node` payloads.\n",
40
41
  "natives-binding-contract.md": "# Natives Binding Contract (JavaScript/TypeScript Side)\n\nThis document defines the JS/TS contract between `@oh-my-pi/pi-natives` callers and the loaded N-API addon.\n\nCurrent package shape is direct-to-native: there is no `packages/natives/src/<module>` TypeScript wrapper layer. The public API is the generated `packages/natives/native/index.d.ts` declaration file, the ESM loader/export wrapper in `packages/natives/native/index.js`, and the Rust `#[napi]` exports in `crates/pi-natives/src`.\n\n## Implementation files\n\n- `packages/natives/native/index.js`\n- `packages/natives/native/index.d.ts`\n- `packages/natives/native/loader-state.js`\n- `packages/natives/scripts/build-native.ts`\n- `packages/natives/scripts/gen-enums.ts`\n- `packages/natives/package.json`\n- `crates/pi-natives/src/lib.rs`\n- Rust modules under `crates/pi-natives/src/*.rs`\n\n## Contract model\n\nThe contract has three parts:\n\n1. **ESM runtime loader/export wrapper** (`native/index.js`)\n - calls `loadNative()` from `loader-state.js`, which `require(...)`s the `.node` addon;\n - binds generated classes/functions as explicit named ESM exports;\n - emits enum runtime objects generated by `scripts/gen-enums.ts`.\n2. **Generated TypeScript declarations** (`native/index.d.ts`)\n - generated by napi-rs during `scripts/build-native.ts`;\n - declares exported functions, classes, object interfaces, and native enums;\n - is the package `types` entry.\n3. **Rust N-API exports** (`crates/pi-natives/src`)\n - `#[napi]` functions/classes/objects/enums are the source of generated declarations and runtime symbols;\n - snake_case Rust names become camelCase JavaScript names by napi-rs convention.\n\nThere is no current `NativeBindings` declaration-merging lifecycle and no full required-export list in the loader. Install/compiled loads do validate the package-version sentinel export; workspace-dev loads skip that check.\n\n## Public export surface organization\n\n`packages/natives/package.json` exposes the package root only:\n\n```json\n{\n \"main\": \"./native/index.js\",\n \"types\": \"./native/index.d.ts\",\n \"exports\": {\n \".\": {\n \"types\": \"./native/index.d.ts\",\n \"import\": \"./native/index.js\"\n }\n }\n}\n```\n\nConsumers in `packages/coding-agent` and `packages/tui` import directly from `@oh-my-pi/pi-natives`.\n\n## JS API ↔ native export mapping (representative)\n\n| Category | Public JS API | Rust source | Return style |\n| ----------------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------ | -------------------------- |\n| Grep | `grep(options, onMatch?)` | `grep.rs` | `Promise<GrepResult>` |\n| Grep | `search(content, options)` | `grep.rs` | `SearchResult` |\n| Grep | `hasMatch(content, pattern, ignoreCase?, multiline?)` | `grep.rs` | `boolean` |\n| Fuzzy path search | `fuzzyFind(options)` | `fd.rs` | `Promise<FuzzyFindResult>` |\n| Glob/workspace | `glob(options, onMatch?)`, `listWorkspace(options)` | `glob.rs`, `workspace.rs` | `Promise<...>` |\n| Glob cache | `invalidateFsScanCache(path?)` | `fs_cache.rs` | `void` |\n| AST/block/summary | `astGrep(options)`, `astEdit(options)`, `blockRangeAt(options)`, `summarizeCode(options)` | `ast.rs`, `block.rs`, `summary.rs` | mixed |\n| Shell | `executeShell(options, onChunk?)` | `shell.rs` | `Promise<ShellRunResult>` |\n| Shell | `new Shell(options?)`, `shell.run(...)`, `shell.abort()` | `shell.rs` | class / promises |\n| PTY | `new PtySession()`, `start/write/resize/kill` | `pty.rs` | class / promises |\n| Process | `Process.fromPid/fromPath`, `status/children/killTree/terminate/waitForExit` | `ps.rs` | class / mixed |\n| Keys | `parseKey`, `matchesKey`, Kitty/legacy helpers | `keys.rs` | sync |\n| Text | `wrapTextWithAnsi`, `truncateToWidth`, `sliceWithWidth`, `extractSegments`, `visibleWidth` | `text.rs` | sync |\n| Highlight | `highlightCode`, `supportsLanguage`, `getSupportedLanguages` | `highlight.rs` | sync |\n| HTML | `htmlToMarkdown(html, options?)` | `html.rs` | `Promise<string>` |\n| SIXEL | `encodeSixel` | `sixel.rs` | sync |\n| Clipboard | `copyToClipboard`, `readImageFromClipboard` | `clipboard.rs` | sync / promise |\n| Tokens | `countTokens(input, encoding?)` | `tokens.rs` | sync |\n| System/isolation | `detectMacOSAppearance`, `MacAppearanceObserver`, `MacOSPowerAssertion`, `getWorkProfile`, `iso*` helpers | `appearance.rs`, `power.rs`, `prof.rs`, `iso.rs` | mixed |\n\n## Sync vs async contract differences\n\nThe contract preserves Rust/N-API call style:\n\n- **Promise-returning exports** for worker-thread or async runtime work (`grep`, `glob`, `fuzzyFind`, `astGrep`, `astEdit`, `htmlToMarkdown`, shell/PTY runs, `isoStart`/`isoStop`/`isoDiff`, clipboard image read, workspace scan).\n- **Synchronous exports** for deterministic in-memory transforms/parsers or direct system calls (`search`, `hasMatch`, highlighting, text utilities, token counting, process construction/status, `copyToClipboard`, `encodeSixel`, isolation probe/resolve helpers).\n- **Constructor exports** for stateful runtime objects (`Shell`, `PtySession`, `Process`, macOS observer/power handles).\n\nChanging sync ↔ async for an existing export is a breaking public API change because consumers call these exports directly.\n\n## Object and enum typing patterns\n\n### Object patterns\n\n`#[napi(object)]` Rust structs become TS interfaces, for example:\n\n- `GrepResult`, `SearchResult`, `GlobResult`, `FuzzyFindResult`\n- `ShellRunResult`, `PtyRunResult`, `MinimizerResult`\n- `AstFindResult`, `AstReplaceResult`, `BlockRange`, `SummaryResult`\n- `System`/media/isolation payloads such as `ClipboardImage`, `WorkProfile`, `ParsedKittyResult`, `IsoResolveResult`\n\nRuntime shape correctness is owned by napi-rs and the Rust implementation.\n\n### Enum patterns\n\nNative enums are represented in generated declarations and also emitted as runtime objects by `scripts/gen-enums.ts`, because napi-rs string enums are TS-only without explicit JS exports. Current enum objects include:\n\n- `AstMatchStrictness`\n- `Ellipsis`\n- `Encoding`\n- `FileType`\n- `GrepOutputMode`\n- `IsoBackendKind`\n- `IsoChangeKind`\n- `KeyEventType`\n- `MacOSAppearance`\n- `ProcessStatus`\n\n## Error behavior and caveats\n\n- Addon load failure or unsupported platform throws during package import from `native/index.js`.\n- The loader rejects install/compiled candidates that lack the package-version sentinel export. It does not verify the full export set after `require(...)`; stale same-version or incomplete binaries surface as native load errors or missing members at use sites.\n- N-API conversion validates basic argument conversion, but TS optional fields do not guarantee semantic validity for untyped callers.\n- Numeric enum declarations do not prevent out-of-range numeric values from untyped callers unless the Rust function rejects them during conversion.\n- Callback exports use napi-rs `ThreadsafeFunction` shape: `(error: Error | null, value) => void`. Native code generally emits successful values; hard failures reject/throw through the owning call.\n\n## Maintainer checklist for binding changes\n\nWhen adding/changing an export, update all of:\n\n1. Rust `#[napi]` implementation in the owning `crates/pi-natives/src/<module>.rs`.\n2. `crates/pi-natives/src/lib.rs` if a new module is added.\n3. Any consumer imports/callsites in `packages/coding-agent` or `packages/tui`.\n4. Build output by running the natives build so `native/index.d.ts` and `native/index.js` stay in sync.\n5. `scripts/gen-enums.ts` if enum runtime export patching needs to change.\n\nDo not add a parallel TS wrapper convention unless the package design intentionally moves back to wrappers; current consumers depend on the direct generated API.\n",
@@ -61,7 +62,7 @@ export const EMBEDDED_DOCS: Readonly<Record<string, string>> = {
61
62
  "session-switching-and-recent-listing.md": "# Session switching and recent session listing\n\nThis document describes how coding-agent discovers recent sessions, resolves `--resume` targets, presents session pickers, and switches the active runtime session.\n\nIt focuses on current implementation behavior, including fallback paths and caveats.\n\n## Implementation files\n\n- [`../src/session/session-manager.ts`](../packages/coding-agent/src/session/session-manager.ts)\n- [`../src/session/agent-session.ts`](../packages/coding-agent/src/session/agent-session.ts)\n- [`../src/cli/session-picker.ts`](../packages/coding-agent/src/cli/session-picker.ts)\n- [`../src/modes/components/session-selector.ts`](../packages/coding-agent/src/modes/components/session-selector.ts)\n- [`../src/modes/controllers/selector-controller.ts`](../packages/coding-agent/src/modes/controllers/selector-controller.ts)\n- [`../src/main.ts`](../packages/coding-agent/src/main.ts)\n- [`../src/sdk.ts`](../packages/coding-agent/src/sdk.ts)\n- [`../src/modes/interactive-mode.ts`](../packages/coding-agent/src/modes/interactive-mode.ts)\n- [`../src/modes/utils/ui-helpers.ts`](../packages/coding-agent/src/modes/utils/ui-helpers.ts)\n\n## Recent-session discovery\n\n### Directory scope\n\n`SessionManager` stores sessions under a cwd-scoped directory by default:\n\n- `~/.omp/agent/sessions/<dir-encoded>/*.jsonl` (home-relative `-<rel>` names, `-tmp-<rel>` for temp paths, legacy `--<abs>--` otherwise)\n\n`SessionManager.list(cwd, sessionDir?)` reads only that directory unless an explicit `sessionDir` is provided.\n\n### Two listing paths with different payloads\n\nThere are two different listing pipelines:\n\n1. `getRecentSessions(sessionDir, limit)` (welcome/summary view)\n - Reads only a 4KB prefix (`readTextSlices(..., 4096, 0)[0]`) from each file.\n - Parses header + earliest user text preview.\n - Returns lightweight `RecentSessionInfo` with lazy `name` and `timeAgo` getters.\n - Sorts by file `mtime` descending.\n\n2. `SessionManager.list(...)` / `SessionManager.listAll()` (resume pickers and ID matching)\n - Reads a 4KB prefix plus a bounded 32 KiB tail in one `readTextSlices(...)` call per file, not the full JSONL file.\n - Builds `SessionInfo` objects (`id`, `cwd`, `title`, `messageCount`, `firstMessage`, `allMessagesText`, timestamps, lifecycle status).\n - Uses prefix parsing plus marker counting for list text, and tail parsing for the final-message lifecycle status; later messages beyond the prefix may not be present in `allMessagesText`.\n - Sorts by `modified` descending.\n\n### Metadata fallback behavior\n\nFor recent summaries (`RecentSessionInfo`):\n\n- display name preference: `header.title` -> first user prompt -> `header.id` -> filename\n- name is truncated to 40 chars for compact displays\n- control characters/newlines are stripped/sanitized from title-derived names\n\nFor `SessionInfo` list entries:\n\n- `title` is `header.title` or the last compaction `shortSummary` seen in the 4KB prefix\n- `firstMessage` is first user message text discoverable from the prefix or `\"(no messages)\"`\n\n## `--continue` resolution and terminal breadcrumb preference\n\n`SessionManager.continueRecent(cwd, sessionDir?)` resolves the target in this order:\n\n1. Read terminal-scoped breadcrumb (`~/.omp/agent/terminal-sessions/<terminal-id>`)\n2. Validate breadcrumb:\n - current terminal can be identified\n - referenced file still exists\n3. If the breadcrumb's cwd differs from the current cwd, that cwd no longer exists (moved/renamed dir), and the current directory has no sessions of its own, the breadcrumb session is re-rooted into the current directory (`SessionManager.open` + `moveTo`) instead of starting fresh\n4. Otherwise, if the breadcrumb cwd matches the current cwd (resolved path compare), use the breadcrumb session; else fall back to newest file by mtime in the session dir (`findMostRecentSession`)\n5. If none found, create a new session\n\nTerminal ID derivation prefers TTY path and falls back to env-based identifiers (`TMUX_PANE`, `CMUX_SURFACE_ID`, `KITTY_WINDOW_ID`, `TERM_SESSION_ID`, `WT_SESSION`).\n\nBreadcrumb writes are best-effort and non-fatal.\n\n## Startup-time resume target resolution (`main.ts`)\n\n### `--resume <value>`\n\n`createSessionManager(...)` handles string-valued `--resume` in two modes:\n\n1. Path-like value (contains `/`, `\\\\`, or ends with `.jsonl`)\n - direct `SessionManager.open(sessionArg, parsed.sessionDir)`\n\n2. Resume key value\n - `resolveResumableSession(...)` searches local sessions first, then all sessions when `sessionDir` is not forced\n - matching is case-insensitive and accepts `id` prefix, full JSONL filename prefix, or the session-id suffix after the timestamp\n - first match in modified-descending order is used (no ambiguity prompt)\n\nCross-project match behavior:\n\n- if the matched session's recorded cwd no longer exists (moved/renamed dir), CLI prompts `Move (re-root) it into the current directory? [Y/n]`; yes opens the session and `moveTo(cwd)` re-roots it (this also applies to local-scope matches whose recorded cwd is gone)\n- otherwise, if a global match's cwd differs from the current cwd, CLI prompts `Fork into current directory? [y/N]`\n- fork accepted -> `SessionManager.forkFrom(...)`\n- either prompt declined -> command cancels (`Resume cancelled: session is in another project.`)\n- non-TTY -> throws `SessionResolutionError` instead of prompting\n\nNo match -> throws error (`Session \"...\" not found.`).\n\n### `--resume` (no value)\n\nHandled after initial session-manager construction:\n\n1. list local sessions with `SessionManager.list(cwd, parsed.sessionDir)`\n2. if empty: preload `SessionManager.listAll()` and open the picker in all-projects scope; print `No sessions found` and exit early only when the global list is also empty\n3. open TUI picker (`selectSession`, with optional preloaded `allSessions`/`startInAllScope`)\n4. if canceled: print `No session selected` and exit early\n5. if selected: when the session belongs to another project, switch the process into that project's directory (`setProjectDir`, cache resets, settings reload) first; then `SessionManager.open(selected.path)`\n\n### `--continue`\n\nUses `SessionManager.continueRecent(...)` directly (breadcrumb-first behavior above).\n\n## Picker-based selection internals\n\n## CLI picker (`src/cli/session-picker.ts`)\n\n`selectSession(sessions, { allSessions?, startInAllScope? })` creates a standalone TUI with `SessionSelectorComponent` and resolves exactly once:\n\n- selection -> resolves selected `SessionInfo` (caller uses `.path` / `.cwd`)\n- cancel (Esc) -> resolves `null`\n- hard exit (Ctrl+C path) -> stops TUI and `process.exit(0)`\n- Tab toggles current-folder / all-projects scope; the all-projects list is loaded lazily via `SessionManager.listAll` (or preloaded via `allSessions`)\n- search ranking is augmented with prompt-history matches from `history.db` (`HistoryStorage.matchingSessionIds`) when available\n\n## Interactive in-session picker (`SelectorController.showSessionSelector`)\n\nFlow:\n\n1. fetch sessions from current session dir via `SessionManager.list(currentCwd, currentSessionDir)`; if empty, preload `SessionManager.listAll()` and open in all-projects scope\n2. mount `SessionSelectorComponent` in editor area using `showSelector(...)`, wired with `loadAllSessions: () => SessionManager.listAll()` and a `history.db` prompt matcher\n3. callbacks:\n - select -> close selector and call `handleResumeSession(sessionPath)`\n - cancel -> restore editor and rerender\n - exit -> `ctx.shutdown()`\n\n## Session selector component behavior\n\n`SessionList` supports:\n\n- arrow/page navigation\n- Enter to select\n- Delete to delete after confirmation\n- Esc to cancel\n- Ctrl+C to exit\n- Tab to toggle current-folder / all-projects scope\n- ranked fuzzy search across session id/title/cwd/first message/all messages/path, merged with prompt-history matches from `history.db`\n\nEmpty-list render behavior:\n\n- current-folder scope renders `No sessions in current folder. Press Tab to view all.`; all-projects scope renders `No sessions found`\n- Enter/Delete on empty do nothing (no callback)\n- Esc/Ctrl+C still work\n\n## Runtime switch execution (`AgentSession.switchSession`)\n\n`switchSession(sessionPath)` is the core in-process switch path.\n\nLifecycle/state transition:\n\n1. capture `previousSessionFile`\n2. emit `session_before_switch` hook event (`reason: \"resume\"`, cancellable)\n3. if canceled -> return `false` with no switch\n4. disconnect from current agent event stream\n5. abort active generation/tool flow\n6. flush session writer (`sessionManager.flush()`) to persist pending writes, then capture rollback state\n7. clear queued steering/follow-up/next-turn message buffers\n8. `sessionManager.setSessionFile(sessionPath)`\n - updates session file pointer\n - writes terminal breadcrumb\n - loads entries / migrates / blob-resolves / reindexes\n - if missing/invalid file data: initializes a new session at that path and rewrites header\n9. update `agent.sessionId`\n10. rebuild display context via `buildDisplaySessionContext()`\n11. restore persisted/discovered MCP tool selections and rebuild active tools/system prompt when discovery is enabled\n12. emit `session_switch` hook event (`reason: \"resume\"`, `previousSessionFile`)\n13. replace agent messages with rebuilt context and sync todos\n14. close provider sessions when switching to a different session or when same-session reload changed replay messages\n15. restore model via `getRestorableSessionModels(sessionContext.models, lastModelChangeRole)` — tries the recorded models in fallback order and uses the first one present in the model registry\n16. restore thinking level and service tier:\n - thinking uses persisted `thinking_level_change`, otherwise the configured default clamped to model capability\n - service tier uses persisted `service_tier_change`, otherwise the configured `serviceTier` setting (`\"none\"` becomes unset)\n17. reconnect agent listeners, run the registered session-switch reconciler if any (interactive mode re-enters persisted modes; errors logged, not fatal), and return `true`\n\n## UI state rebuild after interactive switch\n\n`SelectorController.handleResumeSession` performs UI reset around `switchSession`:\n\n- stop loading animation\n- clear status container\n- clear pending-message UI and pending tool map\n- reset streaming component/message references\n- call `session.switchSession(...)`\n- if the resumed session's cwd differs from the previous one, re-point the process and cwd-derived caches at it (`applyCwdChange`)\n- clear chat container and rerender from session context (`renderInitialMessages`)\n- reload todos from new session artifacts\n- show `Resumed session` (or `Resumed session in <dir>` for a cross-project resume)\n\nSo visible conversation/todo state is rebuilt from the new session file.\n\n## Startup resume vs in-session switch\n\n### Startup resume (`--continue`, `--resume`, direct open)\n\n- Session file is chosen before `createAgentSession(...)`.\n- `sdk.ts` builds `existingSession = sessionManager.buildSessionContext()`.\n- Agent messages are restored once during session creation.\n- Model/thinking are selected during creation (including restore/fallback logic).\n- Interactive mode then runs `#reconcileModeFromSession()` to re-enter persisted mode state (e.g. plan mode).\n\n### In-session switch (`/resume`-style selector path)\n\n- Uses `AgentSession.switchSession(...)` on an already-running `AgentSession`.\n- Messages/model/thinking are rebuilt immediately in place.\n- Hook `session_before_switch`/`session_switch` events are emitted.\n- UI chat/todos are refreshed.\n- Mode re-entry is symmetric with startup: interactive mode registers `#reconcileModeFromSession()` as the session-switch reconciler (`setSessionSwitchReconciler`), and `switchSession()` invokes it after reconnecting.\n\n## Failure and edge-case behavior\n\n### Cancellation paths\n\n- CLI picker cancel -> returns `null`, caller prints `No session selected`, process exits early.\n- Interactive picker cancel -> editor restored, no session change.\n- Hook cancellation (`session_before_switch`) -> `switchSession()` returns `false`.\n\n### Empty list paths\n\n- CLI `--resume` (no value): empty list prints `No sessions found` and exits.\n- Interactive selector: empty list renders message and remains cancellable.\n\n### Missing/invalid target session file\n\nWhen opening/switching to a specific path (`setSessionFile`):\n\n- ENOENT -> treated as empty -> new session initialized at that exact path and persisted.\n- malformed/invalid header (or effectively unreadable parsed entries) -> treated as empty -> new session initialized and persisted.\n\nThis is recovery behavior, not hard failure.\n\n### Hard failures\n\nSwitch/open can still throw on true I/O failures (permission errors, rewrite failures, etc.), which propagate to callers.\n\n### ID prefix matching caveats\n\n- Matching uses `startsWith` on the lowercased session id, lowercased JSONL filename, and lowercased id suffix after the filename timestamp.\n- First match in modified-descending order wins; there is no ambiguity UI if multiple sessions share a prefix.\n- Prefix-listing metadata is intentionally lightweight, so search text may not include messages outside the first 4KB of the session file.\n",
62
63
  "session-tree-plan.md": "# Session tree architecture (current)\n\nReference: [session.md](../docs/session.md)\n\nThis document describes how session tree navigation works today: in-memory tree model, leaf movement rules, branching behavior, and extension/event integration.\n\n## What this subsystem is\n\nThe session is stored as an append-only entry log, but runtime behavior is tree-based:\n\n- Every non-header entry has `id` and `parentId`.\n- The active position is `leafId` in `SessionManager`.\n- Appending an entry always creates a child of the current leaf.\n- Branching does **not** rewrite history; it only changes where the leaf points before the next append.\n\nKey files:\n\n- `src/session/session-manager.ts` — tree data model, traversal, leaf movement, branch/session extraction\n- `src/session/agent-session.ts` — `/tree` navigation flow, summarization, hook/event emission\n- `src/modes/components/tree-selector.ts` — interactive tree UI behavior and filtering\n- `src/modes/controllers/selector-controller.ts` — selector orchestration for `/tree` and `/branch`\n- `src/slash-commands/builtin-registry.ts` — command routing (`/tree`, `/branch`)\n- `src/modes/controllers/input-controller.ts` — double-escape behavior and `app.session.tree`/`app.session.fork` keybinding wiring\n- `src/session/messages.ts` — conversion of `branch_summary`, `compaction`, and `custom_message` entries into LLM context messages\n\n## Tree data model in `SessionManager`\n\nRuntime indices:\n\n- `#byId: Map<string, SessionEntry>` — fast lookup for any entry\n- `#leafId: string | null` — current position in the tree\n- `#labelsById: Map<string, string>` — resolved labels by target entry id\n\nTree APIs:\n\n- `getBranch(fromId?)` walks parent links to root and returns root→node path\n- `getTree()` returns `SessionTreeNode[]` (`entry`, `children`, `label`)\n - parent links become children arrays\n - entries with missing parents are treated as roots\n - children are sorted oldest→newest by timestamp\n- `getChildren(parentId)` returns direct children\n- `getLabel(id)` resolves current label from `labelsById`\n\n`getTree()` is a runtime projection; persistence remains append-only JSONL entries.\n\n## Leaf movement semantics\n\nThere are three leaf movement primitives:\n\n1. `branch(entryId)`\n - Validates entry exists\n - Sets `leafId = entryId`\n - No new entry is written\n\n2. `resetLeaf()`\n - Sets `leafId = null`\n - Next append creates a new root entry (`parentId = null`)\n\n3. `branchWithSummary(branchFromId, summary, details?, fromExtension?)`\n - Accepts `branchFromId: string | null`\n - Sets `leafId = branchFromId`\n - Appends a `branch_summary` entry as child of that leaf\n - When `branchFromId` is `null`, `fromId` is persisted as `\"root\"`\n\n## `/tree` navigation behavior (same session file)\n\n`AgentSession.navigateTree()` is navigation, not file forking.\n\nFlow:\n\n1. Validate target and compute abandoned path (`collectEntriesForBranchSummary`)\n2. Emit `session_before_tree` with `TreePreparation`\n3. Optionally summarize abandoned entries (hook-provided summary or built-in summarizer)\n4. Compute new leaf target:\n - selecting a **user** message: leaf moves to its parent, and message text is returned for editor prefill\n - selecting a **custom_message**: same rule as user message (leaf = parent, text prefills editor)\n - selecting any other entry: leaf = selected entry id\n5. Apply leaf move:\n - with summary: `branchWithSummary(newLeafId, ...)`\n - without summary and `newLeafId === null`: `resetLeaf()`\n - otherwise: `branch(newLeafId)`\n6. Rebuild agent context from new leaf and emit `session_tree`\n\nImportant: summary entries are attached at the **new navigation position**, not on the abandoned branch tail.\n\n## `/branch` behavior (new session file)\n\n`/branch` and `/tree` are intentionally different:\n\n- `/tree` navigates within the current session file.\n- `/branch` creates a new session branch file (or in-memory replacement for non-persistent mode).\n\nUser-facing `/branch` flow (`SelectorController.showUserMessageSelector` → `AgentSession.branch`):\n\n- Branch source must be a **user message**.\n- Selected user text is extracted for editor prefill.\n- If selected user message is root (`parentId === null`): start a new session via `newSession({ parentSession: previousSessionFile })`.\n- Otherwise: `createBranchedSession(selectedEntry.parentId)` to fork history up to the selected prompt boundary.\n\n`SessionManager.createBranchedSession(leafId)` specifics:\n\n- Builds root→leaf path via `getBranch(leafId)`; throws if missing.\n- Excludes existing `label` entries from copied path.\n- Rebuilds fresh label entries from resolved `labelsById` for entries that remain in path.\n- Persistent mode: writes new JSONL file and switches manager to it; returns new file path.\n- In-memory mode: replaces in-memory entries; returns `undefined`.\n\n## Context reconstruction and summary/custom integration\n\n`buildSessionContext()` (in `session-manager.ts`) resolves the active root→leaf path and builds effective LLM context state:\n\n- Tracks latest thinking/model/service-tier/mode/TTSR/MCP-selection state on path.\n- Handles latest compaction on path:\n - emits compaction summary first\n - replays kept messages from `firstKeptEntryId` to compaction point\n - then replays post-compaction messages\n- Includes `branch_summary` and `custom_message` entries as `AgentMessage` objects.\n\n`session/messages.ts` then maps these message types for model input:\n\n- `branchSummary` and `compactionSummary` become user-role templated context messages\n- `custom`/`hookMessage` become developer-role content messages (via agent-core's `convertMessageToLlm`)\n\nSo tree movement changes context by changing the active leaf path, not by mutating old entries.\n\n## Labels and tree UI behavior\n\nLabel persistence:\n\n- `appendLabelChange(targetId, label?)` writes `label` entries on the current leaf chain.\n- `labelsById` is updated immediately (set or delete).\n- `getTree()` resolves current label onto each returned node.\n\nTree selector behavior (`tree-selector.ts`):\n\n- Flattens tree for navigation, keeps active-path highlighting, and prioritizes displaying the active branch first.\n- Supports filter modes: `default`, `no-tools`, `user-only`, `labeled-only`, `all`.\n - `default` suppresses `label`, `custom`, `model_change`, and `thinking_level_change`; it is not a complete \"hide all internal entries\" filter.\n- Supports free-text search over rendered semantic content.\n- `Shift+L` opens inline label editing and writes via `appendLabelChange`.\n\nCommand routing:\n\n- `/tree` always opens tree selector.\n- `/branch` opens user-message selector unless `doubleEscapeAction=tree`, in which case it also uses tree selector UX.\n\n## Extension and hook touchpoints for tree operations\n\nCommand-time extension API (`ExtensionCommandContext`):\n\n- `branch(entryId)` — create branched session file\n- `navigateTree(targetId, { summarize? })` — move within current tree/file\n\nEvents around tree navigation:\n\n- `session_before_tree`\n - receives `TreePreparation`:\n - `targetId`\n - `oldLeafId`\n - `commonAncestorId`\n - `entriesToSummarize`\n - `userWantsSummary`\n - may cancel navigation\n - may provide summary payload used instead of built-in summarizer\n - receives abort `signal` (Escape cancellation path)\n- `session_tree`\n - emits `newLeafId`, `oldLeafId`\n - includes `summaryEntry` when a summary was created\n - `fromExtension` indicates summary origin\n\nAdjacent but related lifecycle hooks:\n\n- `session_before_branch` / `session_branch` for `/branch` flow\n- `session_before_compact`, `session.compacting`, `session_compact` for compaction entries that later affect tree-context reconstruction\n\n## Real constraints and edge conditions\n\n- `branch()` cannot target `null`; use `resetLeaf()` for root-before-first-entry state.\n- `branchWithSummary()` supports `null` target and records `fromId: \"root\"`.\n- Selecting current leaf in tree selector is a no-op.\n- Summarization requires an active model; if absent, summarize navigation fails fast.\n- If summarization is aborted, navigation is cancelled and leaf is unchanged.\n- In-memory sessions never return a branch file path from `createBranchedSession`.\n- Tree context reconstruction includes service-tier and MCP tool-selection state, but those entries do not become LLM messages.\n\n## Plan approval session naming\n\nWhen a user approves a plan from plan mode (`InteractiveMode.#approvePlan`), the approval handler seeds the session name from the plan's title so the resulting (fresh or compacted) session does not stay unnamed.\n\nTrigger:\n\n- Plan approval reaches `#approvePlan(...)` with `options.title` populated from the plan-approval details.\n- This runs for every approval choice (`Approve and execute`, `Approve and compact context`, plain `Approve`); the synthetic `plan-approved` prompt is what otherwise bypasses the input-controller's title-generation path.\n\nNaming source:\n\n- The normalized plan title is humanized via `humanizePlanTitle(title)` (`packages/coding-agent/src/plan-mode/approved-plan.ts`):\n - replaces runs of `-`/`_` with a single space\n - trims whitespace\n - capitalizes the first character\n - returns `\"\"` for whitespace-only / separator-only input\n- The humanized name is applied only when the current session has no name (`!sessionManager.getSessionName()`). It then calls `sessionManager.setSessionName(name, \"auto\")`, which also refuses to overwrite user-named sessions.\n- On successful apply, the terminal title (`setSessionTerminalTitle`) and the editor border color are refreshed to reflect the new name.\n\nExamples (from `humanizePlanTitle`):\n\n- `migrate-mcp-loader` → `Migrate mcp loader`\n- `fix_session_naming` → `Fix session naming`\n- `foo--bar__baz` → `Foo bar baz`\n- `RefactorRouter` → `RefactorRouter` (no separators to expand)\n- `\"\"` / `\"---\"` → `\"\"` (no name applied)\n\n## Legacy compatibility still present\n\nSession migrations still run on load:\n\n- v1→v2 adds `id`/`parentId` and converts compaction index anchor to id anchor\n- v2→v3 migrates legacy `hookMessage` role to `custom`\n\nCurrent runtime behavior is version-3 tree semantics after migration.\n",
63
64
  "session.md": "# Session Storage and Entry Model\n\nThis document is the source of truth for how coding-agent sessions are represented, persisted, migrated, and reconstructed at runtime.\n\n## Scope\n\nCovers:\n\n- Session JSONL format and versioning\n- Entry taxonomy and tree semantics (`id`/`parentId` + leaf pointer)\n- Migration/compatibility behavior when loading old or malformed files\n- Context reconstruction (`buildSessionContext`)\n- Persistence guarantees, failure behavior, truncation/blob externalization\n- Storage abstractions (`FileSessionStorage`, `MemorySessionStorage`) and related utilities\n\nDoes not cover `/tree` UI rendering behavior beyond semantics that affect session data.\n\n## Implementation Files\n\n- [`src/session/session-manager.ts`](../packages/coding-agent/src/session/session-manager.ts)\n- [`src/session/messages.ts`](../packages/coding-agent/src/session/messages.ts)\n- [`src/session/session-storage.ts`](../packages/coding-agent/src/session/session-storage.ts)\n- [`src/session/history-storage.ts`](../packages/coding-agent/src/session/history-storage.ts)\n- [`src/session/blob-store.ts`](../packages/coding-agent/src/session/blob-store.ts)\n\n## On-Disk Layout\n\nDefault session file location:\n\n```text\n~/.omp/agent/sessions/<dir-encoded>/<timestamp>_<sessionId>.jsonl\n```\n\n`<dir-encoded>` depends on where the canonicalized cwd lives:\n\n- inside the home directory: `-<relative-path>` with `/`, `\\\\`, and `:` replaced by `-` (bare `-` for home itself)\n- inside the OS temp root: `-tmp-<relative-path>` with the same replacement\n- anywhere else: legacy absolute form `--<cwd-without-leading-slash-with-same-replacement>--`\n\nOld `--<home-encoded>-*--` directories are migrated to the new home-relative names once per sessions root on first access (best-effort).\n\nBlob store location:\n\n```text\n~/.omp/agent/blobs/<sha256>\n```\n\nTerminal breadcrumb files are written under:\n\n```text\n~/.omp/agent/terminal-sessions/<terminal-id>\n```\n\nBreadcrumb content is two lines: original cwd, then session file path. `continueRecent()` prefers this terminal-scoped pointer before scanning most-recent mtime.\n\n## File Format\n\nSession files are JSONL: one JSON object per line.\n\n- Line 1 is always the session header (`type: \"session\"`).\n- Remaining lines are `SessionEntry` values.\n- Entries are append-only at runtime; branch navigation moves a pointer (`leafId`) rather than mutating existing entries.\n\n### Header (`SessionHeader`)\n\n```json\n{\n \"type\": \"session\",\n \"version\": 3,\n \"id\": \"1f9d2a6b9c0d1234\",\n \"timestamp\": \"2026-02-16T10:20:30.000Z\",\n \"cwd\": \"/work/pi\",\n \"title\": \"optional session title\",\n \"titleSource\": \"auto\",\n \"parentSession\": \"optional lineage marker\"\n}\n```\n\nNotes:\n\n- `version` is optional in v1 files; absence means v1.\n- `parentSession` is an opaque lineage string. Current code writes either a session id or a session path depending on flow (`fork`, `forkFrom`, `createBranchedSession`, or explicit `newSession({ parentSession })`). Treat as metadata, not a typed foreign key.\n\n### Entry Base (`SessionEntryBase`)\n\nAll non-header entries include:\n\n```json\n{\n \"type\": \"...\",\n \"id\": \"8-char-id\",\n \"parentId\": \"previous-or-branch-parent\",\n \"timestamp\": \"2026-02-16T10:20:30.000Z\"\n}\n```\n\n`parentId` can be `null` for a root entry (first append, or after `resetLeaf()`).\n\n## Entry Taxonomy\n\n`SessionEntry` is the union of:\n\n- `message`\n- `thinking_level_change`\n- `model_change`\n- `service_tier_change`\n- `compaction`\n- `branch_summary`\n- `custom`\n- `custom_message`\n- `label`\n- `ttsr_injection`\n- `session_init`\n- `mode_change`\n- `mcp_tool_selection`\n\n### `message`\n\nStores an `AgentMessage` directly.\n\n```json\n{\n \"type\": \"message\",\n \"id\": \"a1b2c3d4\",\n \"parentId\": null,\n \"timestamp\": \"2026-02-16T10:21:00.000Z\",\n \"message\": {\n \"role\": \"assistant\",\n \"provider\": \"anthropic\",\n \"model\": \"claude-sonnet-4-5\",\n \"content\": [{ \"type\": \"text\", \"text\": \"Done.\" }],\n \"usage\": {\n \"input\": 100,\n \"output\": 20,\n \"cacheRead\": 0,\n \"cacheWrite\": 0,\n \"cost\": {\n \"input\": 0,\n \"output\": 0,\n \"cacheRead\": 0,\n \"cacheWrite\": 0,\n \"total\": 0\n }\n },\n \"timestamp\": 1760000000000\n }\n}\n```\n\n### `model_change`\n\n```json\n{\n \"type\": \"model_change\",\n \"id\": \"b1c2d3e4\",\n \"parentId\": \"a1b2c3d4\",\n \"timestamp\": \"2026-02-16T10:21:30.000Z\",\n \"model\": \"openai/gpt-4o\",\n \"role\": \"default\"\n}\n```\n\n`role` is optional; missing is treated as `default` in context reconstruction.\n\n### `service_tier_change`\n\n```json\n{\n \"type\": \"service_tier_change\",\n \"id\": \"c1d2e3f4\",\n \"parentId\": \"b1c2d3e4\",\n \"timestamp\": \"2026-02-16T10:21:45.000Z\",\n \"serviceTier\": \"flex\"\n}\n```\n\n`serviceTier` can also be `null`.\n\n### `thinking_level_change`\n\n```json\n{\n \"type\": \"thinking_level_change\",\n \"id\": \"c1d2e3f4\",\n \"parentId\": \"b1c2d3e4\",\n \"timestamp\": \"2026-02-16T10:22:00.000Z\",\n \"thinkingLevel\": \"high\"\n}\n```\n\n### `compaction`\n\n```json\n{\n \"type\": \"compaction\",\n \"id\": \"d1e2f3a4\",\n \"parentId\": \"c1d2e3f4\",\n \"timestamp\": \"2026-02-16T10:23:00.000Z\",\n \"summary\": \"Conversation summary\",\n \"shortSummary\": \"Short recap\",\n \"firstKeptEntryId\": \"a1b2c3d4\",\n \"tokensBefore\": 42000,\n \"details\": { \"readFiles\": [\"src/a.ts\"] },\n \"preserveData\": { \"hookState\": true },\n \"fromExtension\": false\n}\n```\n\n### `branch_summary`\n\n```json\n{\n \"type\": \"branch_summary\",\n \"id\": \"e1f2a3b4\",\n \"parentId\": \"a1b2c3d4\",\n \"timestamp\": \"2026-02-16T10:24:00.000Z\",\n \"fromId\": \"a1b2c3d4\",\n \"summary\": \"Summary of abandoned path\",\n \"details\": { \"note\": \"optional\" },\n \"fromExtension\": true\n}\n```\n\nIf branching from root (`branchFromId === null`), `fromId` is the literal string `\"root\"`.\n\n### `custom`\n\nExtension state persistence; ignored by `buildSessionContext`.\n\n```json\n{\n \"type\": \"custom\",\n \"id\": \"f1a2b3c4\",\n \"parentId\": \"e1f2a3b4\",\n \"timestamp\": \"2026-02-16T10:25:00.000Z\",\n \"customType\": \"my-extension\",\n \"data\": { \"state\": 1 }\n}\n```\n\n### `custom_message`\n\nExtension-provided message that does participate in LLM context. `content` can be a string or text/image content blocks, and `attribution` records whether the user or agent initiated it.\n\n```json\n{\n \"type\": \"custom_message\",\n \"id\": \"a2b3c4d5\",\n \"parentId\": \"f1a2b3c4\",\n \"timestamp\": \"2026-02-16T10:26:00.000Z\",\n \"customType\": \"my-extension\",\n \"content\": \"Injected context\",\n \"display\": true,\n \"details\": { \"debug\": false },\n \"attribution\": \"agent\"\n}\n```\n\n### `label`\n\n```json\n{\n \"type\": \"label\",\n \"id\": \"b2c3d4e5\",\n \"parentId\": \"a2b3c4d5\",\n \"timestamp\": \"2026-02-16T10:27:00.000Z\",\n \"targetId\": \"a1b2c3d4\",\n \"label\": \"checkpoint\"\n}\n```\n\n`label: undefined` clears a label for `targetId`.\n\n### `ttsr_injection`\n\n```json\n{\n \"type\": \"ttsr_injection\",\n \"id\": \"c2d3e4f5\",\n \"parentId\": \"b2c3d4e5\",\n \"timestamp\": \"2026-02-16T10:28:00.000Z\",\n \"injectedRules\": [\"ruleA\", \"ruleB\"]\n}\n```\n\n### `mcp_tool_selection`\n\n```json\n{\n \"type\": \"mcp_tool_selection\",\n \"id\": \"d2e3f4a5\",\n \"parentId\": \"c2d3e4f5\",\n \"timestamp\": \"2026-02-16T10:28:30.000Z\",\n \"selectedToolNames\": [\"server.tool\"]\n}\n```\n\n### `session_init`\n\n```json\n{\n \"type\": \"session_init\",\n \"id\": \"d2e3f4a5\",\n \"parentId\": \"c2d3e4f5\",\n \"timestamp\": \"2026-02-16T10:29:00.000Z\",\n \"systemPrompt\": \"...\",\n \"task\": \"...\",\n \"tools\": [\"read\", \"edit\"],\n \"outputSchema\": { \"type\": \"object\" }\n}\n```\n\n### `mode_change`\n\n```json\n{\n \"type\": \"mode_change\",\n \"id\": \"e2f3a4b5\",\n \"parentId\": \"d2e3f4a5\",\n \"timestamp\": \"2026-02-16T10:30:00.000Z\",\n \"mode\": \"plan\",\n \"data\": { \"planFile\": \"/tmp/plan.md\" }\n}\n```\n\n## Versioning and Migration\n\nCurrent session version: `3`.\n\n### v1 -> v2\n\nApplied when header `version` is missing or `< 2`:\n\n- Adds `id` and `parentId` to each non-header entry.\n- Reconstructs a linear parent chain using file order.\n- Migrates compaction field `firstKeptEntryIndex` -> `firstKeptEntryId` when present.\n- Sets header `version = 2`.\n\n### v2 -> v3\n\nApplied when header `version < 3`:\n\n- For `message` entries: rewrites legacy `message.role === \"hookMessage\"` to `\"custom\"`.\n- Sets header `version = 3`.\n\n### Migration Trigger and Persistence\n\n- Migrations run during session load (`setSessionFile`).\n- If any migration ran, the entire file is rewritten to disk immediately.\n- Migration mutates in-memory entries first, then persists rewritten JSONL.\n\n## Load and Compatibility Behavior\n\n`loadEntriesFromFile(path)` behavior:\n\n- Missing file (`ENOENT`) -> returns `[]`.\n- Non-parseable lines are handled by lenient JSONL parser (`parseJsonlLenient`).\n- If first parsed entry is not a valid session header (`type !== \"session\"` or missing string `id`) -> returns `[]`.\n\n`SessionManager.setSessionFile()` behavior:\n\n- `[]` from loader is treated as empty/nonexistent session and replaced with a new initialized session file at that path.\n- Valid files are loaded, migrated if needed, blob refs resolved, then indexed.\n\n## Tree and Leaf Semantics\n\nThe underlying model is append-only tree + mutable leaf pointer:\n\n- Every append method creates exactly one new entry whose `parentId` is current `leafId`.\n- The new entry becomes the new `leafId`.\n- `branch(entryId)` moves only `leafId`; existing entries remain unchanged.\n- `resetLeaf()` sets `leafId = null`; next append creates a new root entry (`parentId: null`).\n- `branchWithSummary()` sets leaf to branch target and appends a `branch_summary` entry.\n\n`getEntries()` returns all non-header entries in insertion order. Existing entries are not deleted in normal operation; rewrites preserve logical history while updating representation (migrations, move, targeted rewrite helpers).\n\n## Context Reconstruction (`buildSessionContext`)\n\n`buildSessionContext(entries, leafId?, byId?, options?)` resolves what is sent to the model. Passing `options.transcript: true` instead builds the full-history display transcript (compactions emitted inline at the position they fired) — display-only, never sent to a provider.\n\nAlgorithm:\n\n1. Determine leaf:\n - `leafId === null` -> return empty context.\n - explicit `leafId` -> use that entry if found.\n - otherwise fallback to last entry.\n2. Walk `parentId` chain from leaf to root and reverse to root->leaf path.\n3. Derive runtime state across path:\n - `thinkingLevel` from latest `thinking_level_change` (default `\"off\"`)\n - `serviceTier` from latest `service_tier_change`\n - model map from `model_change` entries (`role ?? \"default\"`)\n - fallback `models.default` from assistant message provider/model if no explicit model change\n - deduplicated `injectedTtsrRules` from all `ttsr_injection` entries\n - selected MCP discovery tools from latest `mcp_tool_selection`\n - mode/modeData from latest `mode_change` (default mode `\"none\"`)\n4. Build message list:\n - `message` entries pass through\n - `custom_message` entries become `custom` AgentMessages via `createCustomMessage`\n - `branch_summary` entries become `branchSummary` AgentMessages via `createBranchSummaryMessage`\n - if a `compaction` exists on path:\n - emit compaction summary first (`createCompactionSummaryMessage`)\n - emit path entries starting at `firstKeptEntryId` up to the compaction boundary\n - emit entries after the compaction boundary\n\n`custom`, `session_init`, `service_tier_change`, `mcp_tool_selection`, and `ttsr_injection` entries do not inject model context directly.\n\n## Persistence Guarantees and Failure Model\n\n### Persist vs in-memory\n\n- `SessionManager.create/open/continueRecent/forkFrom` -> persistent mode (`persist = true`).\n- `SessionManager.inMemory` -> non-persistent mode (`persist = false`) with `MemorySessionStorage`.\n\n### Write pipeline\n\nWrites are serialized through an internal promise chain (`#persistChain`) and `NdjsonFileWriter`.\n\n- `append*` updates in-memory state immediately.\n- Persistence is deferred until at least one assistant message exists.\n - Before first assistant: entries are retained in memory; no file append occurs.\n - When first assistant exists: full in-memory session is flushed to file.\n - Afterwards: new entries append incrementally.\n\nRationale in code: avoid persisting sessions that never produced an assistant response.\n\n### Durability operations\n\n- `flush()` flushes writer and calls `fsync()`.\n- Atomic full rewrites (`#rewriteFile`) write to temp file, flush+fsync, close, then rename over target.\n- Used for migrations, `setSessionName`, `rewriteEntries` (tool-output pruning/supersede passes), and move/fork operations.\n\n### Error behavior\n\n- Persistence errors are latched (`#persistError`) and rethrown on subsequent operations.\n- First error is logged once with session file context.\n- Writer close is best-effort but propagates the first meaningful error.\n\n## Data Size Controls and Blob Externalization\n\nBefore persisting entries:\n\n- Large strings are truncated to `MAX_PERSIST_CHARS` (500,000 chars) with notice:\n - `\"[Session persistence truncated large content]\"`\n- Transient fields `partialJson` and `jsonlEvents` are removed.\n- If object has both `content` and `lineCount`, line count is recomputed after truncation.\n- Image blocks in `content` arrays with base64 length >= 1024 are externalized to blob refs:\n - stored as `blob:sha256:<hash>`\n - raw bytes written to blob store (`BlobStore.put`)\n\nOn load, blob refs are resolved back to base64 for message/custom_message image blocks.\n\n## Storage Abstractions\n\n`SessionStorage` interface provides all filesystem operations used by `SessionManager`:\n\n- sync: `ensureDirSync`, `existsSync`, `writeTextSync`, `statSync`, `listFilesSync`\n- async: `exists`, `readText`, `readTextSlices`, `writeText`, `rename`, `unlink`, `deleteSessionWithArtifacts`, `openWriter`\n\nImplementations:\n\n- `FileSessionStorage`: real filesystem (Bun + node fs)\n- `MemorySessionStorage`: map-backed in-memory implementation for tests/non-persistent sessions\n\n`SessionStorageWriter` exposes `writeLine`, `writeLineSync`, `flush`, `fsync`, `close`, `getError`.\n\n## Session Discovery Utilities\n\nDefined in `session-manager.ts`:\n\n- `getRecentSessions(sessionDir, limit)` -> lightweight metadata for UI/session picker, capped by `limit`\n- `findMostRecentSession(sessionDir)` -> newest by mtime\n- `list(cwd, sessionDir?)` -> sessions in one project scope\n- `listAll()` -> sessions across all project scopes under `~/.omp/agent/sessions`\n- `resolveResumableSession(sessionArg, cwd, sessionDir?)` -> local then global resume/fork target lookup\n\nMetadata extraction for `getRecentSessions` reads a prefix via `readTextSlices(..., 4096, 0)`. `list`/`listAll` read a 4KB prefix plus a bounded 32 KiB tail through one `readTextSlices(...)` call per file, using the prefix for metadata and the tail for lifecycle status. Resume matching is case-insensitive and accepts session id prefixes, full filename prefixes, or the id suffix after the timestamp in `<timestamp>_<sessionId>.jsonl`.\n\n## Related but Distinct: Prompt History Storage\n\n`HistoryStorage` (`history-storage.ts`) is a separate SQLite subsystem for prompt recall/search, not session replay.\n\n- DB: `~/.omp/agent/history.db`\n- Table: `history(id, prompt, created_at, cwd, session_id)`\n- FTS5 index: `history_fts` with trigger-maintained sync\n- Deduplicates consecutive identical prompts using in-memory last-prompt cache\n- Inserts are batched through an async drain queue (~100 ms delay) so prompt capture does not block turn execution\n\nUse session files for conversation graph/state replay; use `HistoryStorage` for prompt history UX.\n",
64
- "settings.md": "# Settings\n\n`omp` resolves settings from built-in defaults, a persistent global config file, optional project-local config, one-shot CLI overlays, and in-memory runtime overrides. Reach for project settings when one repository needs a different provider set, model role, tool policy, memory backend, or UI behavior than your global defaults — without touching your machine-wide configuration.\n\nSettings are stored as plain YAML mappings. Every key, its type, default, and enum values come from the settings schema, and you can inspect or change any of them with `omp config` or the interactive `/settings` panel.\n\n- For model/provider credentials, `.env` files, and the env-var table that resolves API keys, see [Providers](./providers.md).\n- For custom model definitions in `models.yml`, see [Models](./models.md).\n- For instruction files discovered into the agent context (`AGENTS.md`, `.omp/`, etc.), see [Context files](./context-files.md).\n- For the full catalog of environment variables, see [Environment variables](./environment-variables.md).\n\n## Where settings live\n\n| Scope | Path | Read behavior | Write behavior |\n|---|---|---|---|\n| Global | `~/.omp/agent/config.yml` | The main persistent settings file. Always loaded. | `/settings`, `omp config set`, and `omp config reset` write here. |\n| Global legacy | `~/.omp/agent/settings.json` | Migrated into `config.yml` once, only when `config.yml` does not yet exist. | Not written after migration; the original is renamed to `settings.json.bak`. |\n| Project | `<cwd>/.omp/config.yml` (plus `.omp/settings.json`) | Loaded when the process working directory has a non-empty `.omp/`. | Read-only from settings commands; edit the file by hand. |\n| Project legacy | `<cwd>/.omp/settings.json` | Still read; project `config.yml` is merged on top of it. | Not written by settings commands. |\n| CLI overlay | Any file passed with `--config <file>` | Loaded after global and project settings, for that one process. Repeatable. | Never persisted. |\n| Runtime overrides | In-memory only | Set by dedicated CLI flags (`--model`, `--approval-mode`, …) and feature env vars. | Never persisted. |\n\n`PI_CODING_AGENT_DIR` relocates the `~/.omp/agent` base directory. When it is set, the global `config.yml`, the auth store (`agent.db`), and everything else under the agent directory move with it. Use `omp config path` to print the active agent directory.\n\nNative project settings are intentionally scoped to the process working directory's `.omp/` folder — settings discovery does **not** walk ancestor directories looking for the nearest `.omp/`. Other discovery providers (Claude, Codex, Gemini, Cursor, OpenCode) can also contribute project-level settings from their own files; those are read-only from `omp` settings commands and can be turned off by provider id (see [Provider and source disabling](#provider-and-source-disabling)).\n\n## Config file formats\n\nThe global `config.yml` is always YAML. The generic config loader used for other files (for example `models.yml`) accepts `.yml`, `.yaml`, `.json`, and `.jsonc`:\n\n- When a `.yml`/`.yaml` path is requested and only a sibling `.json` exists, it is migrated to YAML automatically (idempotent, once per process).\n- `.json` and `.jsonc` configs are read as-is, with no migration.\n- A file whose top level is not a mapping (a bare array or scalar) is treated as empty for persistent settings, and is a hard error for `--config` overlays.\n\n## Reading and writing settings\n\nUse the interactive `/settings` panel inside a session, or the `omp config` command from a shell. Both operate on the merged effective settings, but every persistent write lands in the **global** file only.\n\n```bash\nomp config list # all settings with current effective values\nomp config list --json # same, machine-readable\nomp config get theme.dark # one value\nomp config get theme.dark --json\nomp config set compaction.enabled false\nomp config set defaultThinkingLevel medium\nomp config reset steeringMode # restore a key to its schema default\nomp config path # print the active agent directory\n```\n\n### Subcommands\n\n| Command | Effect |\n|---|---|\n| `omp config list` | Print every setting grouped by tab, with its current value and type. `--json` emits an object keyed by setting path with `{ value, type, description }`. |\n| `omp config get <key>` | Print the effective value of one key. Unknown keys exit non-zero. `--json` emits `{ key, value, type, description }`. |\n| `omp config set <key> <value>` | Parse `<value>` against the key's schema type and write it to the global `config.yml`. |\n| `omp config reset <key>` | Write the key's schema **default** back to the global config (this persists the default, it does not delete the key). |\n| `omp config path` | Print the active agent directory (honors `PI_CODING_AGENT_DIR`). |\n\n`omp config` with no subcommand, or `--help`, prints the help and lists settings. The `--json` flag is accepted by `list`, `get`, `set`, and `reset`.\n\n### Value parsing\n\n`omp config set` parses the value string according to the target key's schema type. The string is trimmed first.\n\n| Type | Accepted input | Notes |\n|---|---|---|\n| boolean | `true`, `false`, `yes`, `no`, `on`, `off`, `1`, `0` | Case-insensitive. Anything else is rejected. |\n| number | Any finite JavaScript number | `Infinity`/`NaN` are rejected. |\n| enum | One of the key's allowed values | Must match exactly; the error lists the valid values. |\n| array | A JSON array | e.g. `'[\"anthropic\",\"openai\"]'`. Must parse and be an array. |\n| record | A JSON object | e.g. `'{\"bash\":\"prompt\"}'`. Must parse and be a non-array object. |\n| string | Stored as given (trimmed) | Multi-word values are joined with spaces. |\n\nKeys must match a real schema path exactly. There is no shorthand — set `theme.dark`, not `theme`.\n\n### Where writes go\n\n`omp config set`, `omp config reset`, `/settings`, and any runtime settings change all write to the global `config.yml` under the active agent directory. They never write to `<cwd>/.omp/config.yml`. To create a project-local override, edit that file directly (see [Project-local config](#project-local-config)). Saves are debounced and re-read the file under a lock, so external edits made while a session is open are preserved.\n\n## Precedence\n\nFrom lowest to highest priority, the effective value of a setting is built as:\n\n```text\nbuilt-in defaults <- global config <- project config <- CLI overlays <- runtime overrides\n```\n\nFrom highest to lowest:\n\n1. **Runtime overrides** — dedicated CLI flags and feature env vars applied in memory for the current process: `--model`, `--smol`, `--slow`, `--plan`, `--approval-mode`, `--auto-approve`/`--yolo`, `--hide-thinking`, `--no-pty`, `--api-key`, and protocol-mode defaults. Never persisted.\n2. **CLI config overlays** — each `--config <file>`; later overlay files override earlier ones.\n3. **Project settings** — `<cwd>/.omp/settings.json` then `<cwd>/.omp/config.yml` (and contributions from other discovery providers at project level).\n4. **Global settings** — `~/.omp/agent/config.yml`.\n5. **Built-in defaults** — from the settings schema.\n\nA key that is unset at every layer resolves to its schema default at read time.\n\n### Environment overrides\n\nEnvironment variables are **not** a single settings layer. Each is read by the feature that owns the value, usually as a per-machine override or fallback, and is never written back to `config.yml`. The ones that map directly onto a setting:\n\n| Env var | Overrides setting | Notes |\n|---|---|---|\n| `PI_SMOL_MODEL` | `modelRoles.smol` | Also exposed as `--smol`. |\n| `PI_SLOW_MODEL` | `modelRoles.slow` | Also exposed as `--slow`. |\n| `PI_PLAN_MODEL` | `modelRoles.plan` | Also exposed as `--plan`. |\n| `PI_NO_PTY=1` | (disables PTY bash) | Equivalent to `--no-pty` for the process. |\n| `PI_PY` | `eval.py` | `PI_PY=0` disables the Python eval backend. |\n| `PI_JS` | `eval.js` | `PI_JS=0` disables the JavaScript eval backend. |\n| `PI_TINY_DEVICE` | `providers.tinyModelDevice` | ONNX execution provider for local tiny models. |\n| `PI_TINY_DTYPE` | `providers.tinyModelDtype` | ONNX precision for local tiny models. |\n| `OMP_AUTH_BROKER_URL` | `auth.broker.url` | Env value takes precedence over config. |\n| `OMP_AUTH_BROKER_TOKEN` | `auth.broker.token` | Env value takes precedence over config. |\n| `PI_CODING_AGENT_DIR` | (relocates agent dir) | Moves `config.yml`, `agent.db`, and the whole agent base. |\n\nProvider API keys are resolved separately (stored auth, OAuth, `models.yml`, environment, and `.env` files); see [Providers](./providers.md) and the full [Environment variables](./environment-variables.md) reference.\n\n## Merge rules\n\nLayers are combined with a deep merge:\n\n- **Objects are deep-merged** — keys present only in a lower layer are kept; keys present in a higher layer override.\n- **Scalars and arrays are replaced wholesale** by the higher-precedence layer. A higher layer's array does not append to a lower layer's array.\n\nUse nested YAML mappings for dotted setting paths:\n\n```yaml\ntheme:\n dark: titanium\n light: light\n\ntools:\n approvalMode: write\n approval:\n bash: prompt\n read: allow\n```\n\n### Worked example: global vs. project\n\n```yaml\n# ~/.omp/agent/config.yml\ntools:\n approvalMode: write\n approval:\n bash: prompt\n read: allow\ndisabledProviders:\n - anthropic\n - openai\n - gemini\n\n# <repo>/.omp/config.yml\ntools:\n approval:\n bash: allow\ndisabledProviders:\n - groq\n```\n\nEffective settings inside `<repo>`:\n\n```yaml\ntools:\n approvalMode: write # kept from global (object deep-merge)\n approval:\n bash: allow # overridden by project\n read: allow # kept from global\ndisabledProviders:\n - groq # project array REPLACES the global array\n```\n\nArray replacement is the most common surprise: the project's `disabledProviders` does not extend the global list — it becomes the entire list for that project. The same applies to `enabledModels`, `cycleOrder`, `extensions`, and every other array-typed setting.\n\n## Project-local config\n\nCreate `<repo>/.omp/config.yml` when a repository needs its own settings:\n\n```yaml\n# <repo>/.omp/config.yml\nmodelRoles:\n default: anthropic/claude-sonnet-4-5\n smol: openai/gpt-4.1-mini\n slow: anthropic/claude-opus-4-5:high\n\ntools:\n approvalMode: write\n approval:\n bash: prompt\n\ncompaction:\n strategy: context-full\n thresholdPercent: 80\n\ntheme:\n dark: titanium\n```\n\nKeep secrets out of committed project config unless your repository policy allows it. Prefer environment variables, stored auth, an auth broker, or an untracked `--config` overlay for credentials.\n\n### One-shot overlays\n\nUse `--config` for a temporary layer that should not persist:\n\n```bash\nomp --config ./local/ci-settings.yml \"check this failure\"\nomp --config ./base.yml --config ./experiment.yml \"try this model\"\n```\n\nOverlay paths are resolved relative to the process working directory (and `~` is expanded). Each overlay must parse as a YAML mapping; a missing file, invalid YAML, or a top-level array/scalar is a hard error — it does **not** silently fall back to lower-precedence settings.\n\n## Path-scoped arrays\n\nTwo array settings — `enabledModels` and `disabledProviders` — accept path-scoped entries in addition to bare strings, so a single global config can behave differently per directory:\n\n```yaml\nenabledModels:\n - claude-sonnet-4-5 # applies everywhere\n - path: ~/work/high-context\n models:\n - anthropic/claude-opus-4-5\n\ndisabledProviders:\n - ollama # applies everywhere\n - paths:\n - ~/projects/sensitive\n - ~/clients/acme\n providers:\n - anthropic\n - openai\n```\n\nBare string entries apply everywhere. A scoped entry applies when the current working directory **is** the configured path or is **under** it. `~` expands to your home directory and relative paths are resolved before matching.\n\nAccepted **path** keys (any of them, combined): `path`, `paths`, `pathPrefix`, `pathPrefixes`.\n\nAccepted **value** keys:\n\n- `models` (for `enabledModels`) or `providers` (for `disabledProviders`)\n- `values` or `items` (for either setting)\n\nOnly string values are kept; malformed scoped entries are ignored. Path scoping is resolved **after** the layer merge, so it reads the final effective array.\n\n## Provider and source disabling\n\n`disabledProviders` is a single shared id namespace that gates two different subsystems, before any credential check:\n\n| Entry kind | Example ids | Effect |\n|---|---|---|\n| Model providers | `anthropic`, `openai`, `gemini`, `groq`, `ollama`, `openrouter` | Removes those backends from model selection, even when credentials are available. See [Providers](./providers.md). |\n| Discovery sources | `native`, `claude`, `codex`, `gemini`, `github`, `opencode`, `cursor`, `agents-md` | Stops that source from contributing context files, MCP servers, commands, skills, hooks, tools, prompts, or settings. See [Context files](./context-files.md). |\n\nMost provider-control use cases list model provider ids. Disabling the `claude` discovery source is different from disabling the `anthropic` model provider — one stops Claude-format config discovery, the other stops the Anthropic model backend.\n\nBecause arrays replace rather than append, a project that sets `disabledProviders` must list the complete desired set:\n\n```yaml\n# ~/.omp/agent/config.yml\ndisabledProviders:\n - anthropic\n - openai\n\n# <repo>/.omp/config.yml — inside this repo ONLY groq is disabled\ndisabledProviders:\n - groq\n```\n\nThe default is an empty array (nothing disabled). For the two subsystems' provider ids and ordering, see [Providers](./providers.md) and [Context files](./context-files.md).\n\n## Settings catalog\n\nEvery key below is defined in the settings schema; `omp config list` shows the full set with current values. Defaults and enum values are taken from the schema. Settings that accept an env or flag override are noted; those overrides are process-local and not persisted.\n\n### Models\n\n`modelRoles`, `modelTags`, and `cycleOrder` work together to define the models you can switch between. Role values may carry a thinking suffix (`:minimal`, `:low`, `:medium`, `:high`, `:xhigh`).\n\n```yaml\nmodelRoles:\n default: anthropic/claude-sonnet-4-5\n smol: openai/gpt-4.1-mini\n slow: anthropic/claude-opus-4-5:high\n vision: gemini/gemini-3-pro-preview\n plan: anthropic/claude-opus-4-5\n\ncycleOrder:\n - smol\n - default\n - slow\n\nmodelProviderOrder:\n - anthropic\n - openai\n\nenabledModels:\n - claude-sonnet-4-5\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `modelRoles` | record | `{}` | Map of role name -> model id. Built-in roles: `default`, `smol`, `slow`, `vision`, `plan`, `designer`, `commit`, `title`, `task`. Per-role env/flags: `--model`/`--smol`/`--slow`/`--plan`. |\n| `modelTags` | record | `{}` | Custom role/tag metadata; can introduce additional roles. |\n| `modelProviderOrder` | array | `[]` | Preferred provider order when a model id is ambiguous. |\n| `cycleOrder` | array | `[\"smol\",\"default\",\"slow\"]` | Roles cycled by the model switcher. |\n| `enabledModels` | array | `[]` | Allow-list of models; supports [path-scoped entries](#path-scoped-arrays). Empty means all available models. |\n| `disabledProviders` | array | `[]` | Disabled model/discovery providers; supports path-scoped entries. See [above](#provider-and-source-disabling). |\n| `includeModelInPrompt` | boolean | `true` | Include the active model name in the system prompt. |\n\nSee [Models](./models.md) for the `models.yml` schema and custom-provider definitions.\n\n### Thinking\n\n```yaml\ndefaultThinkingLevel: high\nhideThinkingBlock: false\nthinkingBudgets:\n minimal: 1024\n low: 2048\n medium: 8192\n high: 16384\n xhigh: 32768\n```\n\n| Key | Type | Default | Values |\n|---|---|---|---|\n| `defaultThinkingLevel` | enum | `high` | `minimal`, `low`, `medium`, `high`, `xhigh`, `auto`. Override per run with `--thinking`. |\n| `hideThinkingBlock` | boolean | `false` | Hide thinking blocks in output. `--hide-thinking` sets it for the run (display only). |\n| `thinkingBudgets.minimal` | number | `1024` | Token budget for the `minimal` level. |\n| `thinkingBudgets.low` | number | `2048` | Token budget for `low`. |\n| `thinkingBudgets.medium` | number | `8192` | Token budget for `medium`. |\n| `thinkingBudgets.high` | number | `16384` | Token budget for `high`. |\n| `thinkingBudgets.xhigh` | number | `32768` | Token budget for `xhigh`. |\n\n### Sampling\n\nA value of `-1` means \"use the provider/model default\" — `omp` does not send that parameter.\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `temperature` | number | `-1` | Sampling temperature. |\n| `topP` | number | `-1` | Nucleus sampling. |\n| `topK` | number | `-1` | Top-K sampling. |\n| `minP` | number | `-1` | Minimum-probability cutoff. |\n| `presencePenalty` | number | `-1` | Presence penalty. |\n| `repetitionPenalty` | number | `-1` | Repetition penalty. |\n| `serviceTier` | enum | `none` | `none`, `auto`, `default`, `flex`, `scale`, `priority`, `openai-only`, `claude-only`. |\n| `personality` | enum | `default` | `default`, `friendly`, `pragmatic`, `none`. |\n\n### Retry and fallback\n\n```yaml\nretry:\n enabled: true\n maxRetries: 10\n baseDelayMs: 500\n maxDelayMs: 300000\n modelFallback: true\n fallbackRevertPolicy: cooldown-expiry\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `retry.enabled` | boolean | `true` | Retry transient provider errors. |\n| `retry.maxRetries` | number | `10` | Max retries per request. |\n| `retry.baseDelayMs` | number | `500` | Initial backoff. |\n| `retry.maxDelayMs` | number | `300000` | Backoff ceiling (5 min). |\n| `retry.modelFallback` | boolean | `true` | Fall back to another model when one is unavailable. |\n| `retry.fallbackChains` | record | `{}` | Per-model fallback chains. |\n| `retry.fallbackRevertPolicy` | enum | `cooldown-expiry` | `cooldown-expiry`, `never`. |\n\n### Tools and approvals\n\n```yaml\ntools:\n approvalMode: yolo # default\n approval:\n bash: prompt\n edit: allow\n discoveryMode: auto\n maxTimeout: 0\n intentTracing: true\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `tools.approvalMode` | enum | `yolo` | `always-ask` (auto-approve read-only), `write` (auto-approve read + workspace-write), `yolo` (auto-approve all tiers). `--approval-mode` and `--auto-approve`/`--yolo` override per run. |\n| `tools.approval` | record | `{}` | Per-tool policy keyed by tool name; each value is `allow`, `deny`, or `prompt`. e.g. `omp config set tools.approval '{\"bash\":\"prompt\"}'`. |\n| `tools.discoveryMode` | enum | `auto` | `auto`, `off`, `mcp-only`, `all`. Controls dynamic tool discovery. |\n| `tools.essentialOverride` | array | `[]` | Tool names kept available even when tools are narrowed. |\n| `tools.maxTimeout` | number | `0` | Max tool runtime in seconds; `0` = no cap. |\n| `tools.intentTracing` | boolean | `true` | Record per-call intent strings. |\n| `tools.outputMaxColumns` | number | `768` | Per-line byte cap for streaming output; `0` disables. |\n| `tools.artifactSpillThreshold` | number | `50` | KB of tool output above which output spills to an artifact. |\n| `tools.artifactHeadBytes` | number | `20` | KB of head kept inline on spill; `0` = tail-only. |\n| `tools.artifactTailBytes` | number | `20` | KB of tail kept inline on spill. |\n| `tools.artifactTailLines` | number | `500` | Max tail lines kept inline on spill. |\n\nIndividual built-in tools are toggled by their own keys, e.g. `bash.enabled`, `eval.py`, `eval.js`, `find.enabled`, `search.enabled`, `fetch.enabled`, `browser.enabled`, `astEdit.enabled`, `astGrep.enabled`, `web_search.enabled`, `inspect_image.enabled`, `renderMermaid.enabled`.\n\n### Shell, eval, and LSP\n\n```yaml\nbash:\n enabled: true\n stripTrailingHeadTail: true\n autoBackground:\n enabled: false\n thresholdMs: 60000\n\neval:\n py: true\n js: true\n\npython:\n kernelMode: session # session, per-call\n interpreter: \"\"\n\nlsp:\n enabled: true\n lazy: true\n diagnosticsOnWrite: true\n diagnosticsOnEdit: false\n formatOnWrite: false\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `bash.enabled` | boolean | `true` | Enable the bash tool. |\n| `bash.stripTrailingHeadTail` | boolean | `true` | Strip trailing head/tail noise from output. |\n| `bash.autoBackground.enabled` | boolean | `false` | Auto-background long-running commands. |\n| `bash.autoBackground.thresholdMs` | number | `60000` | Threshold before auto-backgrounding. |\n| `eval.py` | boolean | `true` | Python eval backend. `PI_PY=0` disables for the process. |\n| `eval.js` | boolean | `true` | JavaScript eval backend. `PI_JS=0` disables for the process. |\n| `python.kernelMode` | enum | `session` | `session` (persistent kernel) or `per-call`. |\n| `python.interpreter` | string | `\"\"` | Path to a Python interpreter; empty = auto-detect. |\n| `lsp.enabled` | boolean | `true` | Language-server integration. `--no-lsp` disables for the run. |\n| `lsp.lazy` | boolean | `true` | Start servers on demand. |\n| `lsp.diagnosticsOnWrite` | boolean | `true` | Run diagnostics after a write. |\n| `lsp.diagnosticsOnEdit` | boolean | `false` | Run diagnostics after an edit. |\n| `lsp.formatOnWrite` | boolean | `false` | Format files on write. |\n| `lsp.diagnosticsDeduplicate` | boolean | `true` | Collapse duplicate diagnostics. |\n| `shellPath` | string | _(unset)_ | Override the shell binary used by bash. |\n\n### Files: editing and reading\n\n```yaml\nedit:\n mode: hashline # apply_patch, hashline, patch, replace\n fuzzyMatch: true\n fuzzyThreshold: 0.95\n blockAutoGenerated: true\n\nread:\n defaultLimit: 300\n toolResultPreview: false\n summarize:\n enabled: true\n prose: false\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `edit.mode` | enum | `hashline` | `apply_patch`, `hashline`, `patch`, `replace`. |\n| `edit.fuzzyMatch` | boolean | `true` | Allow fuzzy anchor matching. |\n| `edit.fuzzyThreshold` | number | `0.95` | Similarity threshold for fuzzy matching. |\n| `edit.blockAutoGenerated` | boolean | `true` | Refuse to edit generated/lockfile-like files. |\n| `edit.streamingAbort` | boolean | `false` | Abort on streaming edit mismatch. |\n| `read.defaultLimit` | number | `300` | Default line count for `read` without a selector. |\n| `read.summarize.enabled` | boolean | `true` | Structural summaries for code reads. |\n| `read.summarize.prose` | boolean | `false` | Summarize prose files too. |\n| `read.toolResultPreview` | boolean | `false` | Inline preview of tool results. |\n| `readHashLines` | boolean | `true` | Show hashline tags in read output. |\n| `readLineNumbers` | boolean | `false` | Show plain line numbers. |\n\n### Context, compaction, and memory\n\n```yaml\ncontextPromotion:\n enabled: true\n\ncompaction:\n enabled: true\n strategy: context-full # context-full, handoff, shake, snapcompact, off\n thresholdPercent: -1 # -1 = default reserve-based behavior\n thresholdTokens: -1 # fixed token limit when > 0\n remoteEnabled: true\n\nmemory:\n backend: off # off, local, hindsight, mnemopi\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `contextPromotion.enabled` | boolean | `true` | Promote relevant earlier context. |\n| `compaction.enabled` | boolean | `true` | Automatic conversation compaction. |\n| `compaction.strategy` | enum | `context-full` | `context-full`, `handoff`, `shake`, `snapcompact`, `off`. |\n| `compaction.thresholdPercent` | number | `-1` | Percent-of-context trigger; `-1` = reserve-based default. |\n| `compaction.thresholdTokens` | number | `-1` | Fixed token trigger when `> 0`. |\n| `compaction.reserveTokens` | number | `16384` | Tokens reserved for the next turn. |\n| `compaction.keepRecentTokens` | number | `20000` | Recent tokens always preserved. |\n| `compaction.remoteEnabled` | boolean | `true` | Allow remote compaction service. |\n| `compaction.autoContinue` | boolean | `true` | Continue automatically after compaction. |\n| `memory.backend` | enum | `off` | `off`, `local`, `hindsight`, `mnemopi`. Each backend has its own `hindsight.*` / `mnemopi.*` / `memories.*` tuning keys. |\n| `autolearn.enabled` | boolean | `false` | Experimental: after the agent stops, nudge it to capture lessons to memory and create/enhance isolated managed skills under `~/.omp/agent/managed-skills`. Enables the `manage_skill` tool (and `learn` when a memory backend is active). |\n| `autolearn.autoContinue` | boolean | `false` | When `autolearn.enabled`, auto-run one capture turn at stop (uses extra tokens). Off = a passive reminder rides your next turn. |\n| `autolearn.minToolCalls` | number | `5` | Only nudge after a turn that used at least this many tools. |\n\n`compaction` has additional tuning keys (idle compaction, supersede/drop heuristics) visible in `omp config list`. See [Compaction](./compaction.md) for the full strategy reference.\n\n### Appearance and terminal\n\n```yaml\ntheme:\n dark: titanium\n light: light\nsymbolPreset: unicode # unicode, nerd, ascii\ncolorBlindMode: false\n\nstatusLine:\n preset: default # default, minimal, compact, full, nerd, ascii, custom\n separator: powerline-thin\n transparent: false\n showHookStatus: true\n\nterminal:\n showImages: true\nimages:\n autoResize: true\n blockImages: false\ntui:\n hyperlinks: auto # off, auto, always\n```\n\n| Key | Type | Default | Values |\n|---|---|---|---|\n| `theme.dark` | string | `titanium` | Theme used on a dark terminal background. |\n| `theme.light` | string | `light` | Theme used on a light terminal background. |\n| `symbolPreset` | enum | `unicode` | `unicode`, `nerd`, `ascii`. |\n| `colorBlindMode` | boolean | `false` | Use blue instead of green for diff additions. |\n| `showHardwareCursor` | boolean | `true` | Show the terminal hardware cursor. |\n| `statusLine.preset` | enum | `default` | `default`, `minimal`, `compact`, `full`, `nerd`, `ascii`, `custom`. |\n| `statusLine.separator` | enum | `powerline-thin` | `powerline`, `powerline-thin`, `slash`, `pipe`, `block`, `none`, `ascii`. |\n| `statusLine.sessionAccent` | boolean | `true` | Tint the editor border with the session color. |\n| `statusLine.transparent` | boolean | `false` | Use the terminal background for the status line. |\n| `statusLine.showHookStatus` | boolean | `true` | Show hook status messages. |\n| `terminal.showImages` | boolean | `true` | Render images inline (when the terminal supports it). |\n| `images.autoResize` | boolean | `true` | Resize large images for model compatibility. |\n| `images.blockImages` | boolean | `false` | Never send images to providers. |\n| `tui.hyperlinks` | enum | `auto` | `off`, `auto`, `always`. |\n\nFor a custom status line, set `statusLine.preset: custom` and configure `statusLine.leftSegments`, `statusLine.rightSegments`, and `statusLine.segmentOptions`.\n\n### Interaction\n\n| Key | Type | Default | Values |\n|---|---|---|---|\n| `steeringMode` | enum | `one-at-a-time` | `all`, `one-at-a-time`. How queued steering messages are delivered. |\n| `followUpMode` | enum | `one-at-a-time` | `all`, `one-at-a-time`. |\n| `interruptMode` | enum | `immediate` | `immediate`, `wait`. |\n| `doubleEscapeAction` | enum | `tree` | `branch`, `tree`, `none`. |\n| `autoResume` | boolean | `false` | Auto-resume the most recent session in the cwd. |\n| `ask.timeout` | number | `0` | Seconds before an `ask` prompt times out; `0` = no timeout. (Legacy ms values are migrated to seconds.) |\n| `ask.notify` | enum | `on` | `on`, `off`. |\n\n### Providers and services\n\n```yaml\nproviders:\n webSearch: auto\n image: auto\n fetch: auto\n tinyModel: online\n tinyModelDevice: default\n tinyModelDtype: default\n openaiWebsockets: auto\n openrouterVariant: default\n kimiApiFormat: anthropic\n\nprovider:\n appendOnlyContext: auto # auto, on, off\n\nexa:\n enabled: true\n enableSearch: true\n enableResearcher: false\n enableWebsets: false\n\nsearxng:\n endpoint: https://search.example.com\n token: SEARXNG_TOKEN\n```\n\n| Key | Type | Default | Values / notes |\n|---|---|---|---|\n| `providers.webSearch` | enum | `auto` | `auto` plus the configured search providers (`tavily`, `perplexity`, `brave`, `jina`, `kimi`, `anthropic`, `gemini`, `codex`, `zai`, `exa`, `parallel`, `kagi`, `synthetic`, `searxng`). |\n| `providers.image` | enum | `auto` | `auto`, `openai`, `antigravity`, `xai`, `gemini`, `openrouter`. |\n| `providers.fetch` | enum | `auto` | `auto`, `native`, `trafilatura`, `lynx`, `parallel`, `jina`. |\n| `providers.tinyModel` | enum | `online` | `online` or a local model (`lfm2-350m`, `qwen3-0.6b`, `gemma-270m`, `qwen2.5-0.5b`, `lfm2-700m`). |\n| `providers.tinyModelDevice` | enum | `default` | ONNX execution provider for local tiny models. Overridden by `PI_TINY_DEVICE`. |\n| `providers.tinyModelDtype` | enum | `default` | ONNX precision for local tiny models. Overridden by `PI_TINY_DTYPE`. |\n| `providers.openaiWebsockets` | enum | `auto` | `auto`, `off`, `on`. |\n| `providers.openrouterVariant` | enum | `default` | `default`, `nitro`, `floor`, `online`, `exacto`. |\n| `providers.kimiApiFormat` | enum | `anthropic` | `openai`, `anthropic`. |\n| `provider.appendOnlyContext` | enum | `auto` | `auto`, `on`, `off`. |\n| `exa.enabled` | boolean | `true` | Enable Exa integration. |\n| `exa.enableSearch` | boolean | `true` | Exa search. |\n| `exa.enableResearcher` | boolean | `false` | Exa researcher. |\n| `exa.enableWebsets` | boolean | `false` | Exa websets. |\n| `searxng.endpoint` | string | _(unset)_ | SearXNG instance URL. |\n| `searxng.token` | string | _(unset)_ | SearXNG token; also `searxng.basicUsername`/`searxng.basicPassword`/`searxng.categories`/`searxng.language`. |\n| `auth.broker.url` | string | _(unset)_ | Auth-broker URL. Overridden by `OMP_AUTH_BROKER_URL`. |\n| `auth.broker.token` | string | _(unset)_ | Auth-broker token. Overridden by `OMP_AUTH_BROKER_TOKEN`. |\n\nProvider credentials and custom model definitions are configured separately — see [Providers](./providers.md) and [Models](./models.md).\n\n### Other groups\n\n`omp config list` exposes many more grouped settings, including: `task.*` (subagent concurrency, isolation, model overrides), `skills.*` and `commands.*` (discovery toggles), `mcp.*`, `github.*`, `async.*`, `goal.*`, `loop.*`, `todo.*`, `magicKeywords.*`, `ttsr.*` (sticky rules), `display.*`, `startup.*`, `share.*`, `collab.*`, `stt.*`/`tts.*`, `memories.*`/`hindsight.*`/`mnemopi.*` (memory backends), and `bashInterceptor.*`. Each follows the same type/default rules shown above.\n\n## Legacy migration\n\n`omp` migrates older config shapes automatically. None of these require action; they are listed so you know what changes you may see in `config.yml`.\n\n### Startup migration to `config.yml`\n\nWhen `~/.omp/agent/config.yml` does not exist, startup builds it once from legacy sources, then writes the result:\n\n1. `~/.omp/agent/settings.json` (renamed to `settings.json.bak` after a successful migration).\n2. Settings persisted in `agent.db`.\n\nAfter `config.yml` exists, these legacy sources are no longer consulted. The generic config loader also performs `.json` -> `.yml` migration for other config files when only the `.json` form is present.\n\n### Field-level migrations\n\nApplied whenever raw settings are loaded (global, project, overlays, and runtime overrides):\n\n| Old | New |\n|---|---|\n| `queueMode` | `steeringMode` |\n| `ask.timeout` in milliseconds (value `> 1000`) | seconds (divided by 1000) |\n| flat `theme: \"<name>\"` string | `theme.dark` / `theme.light` (slot chosen by luminance; built-in `light`/`dark` are dropped to use defaults) |\n| `task.isolation.enabled: true/false` | `task.isolation.mode: auto/none` |\n| `task.simple` | removed |\n| legacy `task.isolation.mode` (`worktree`, `fuse-overlay`, `fuse-projfs`) | `rcopy`, `overlayfs`, `projfs` |\n| `lastChangelogVersion` | moved to a marker file and stripped from `config.yml` |\n\n## Troubleshooting\n\n### A project setting is not taking effect\n\n- Start `omp` from the directory that contains `.omp/config.yml`. Settings discovery only checks the current working directory's `.omp/`, not ancestor directories.\n- Ensure `.omp/` is non-empty; empty config directories are ignored.\n- Confirm the file is valid YAML and its top level is a mapping.\n- Run `omp config get <key>` from that directory to see the effective value.\n- Remember that `--config` overlays and runtime flags override project config.\n\n### A global array disappeared in a project\n\nArrays replace; they do not append. If a project sets `disabledProviders`, `enabledModels`, `cycleOrder`, `extensions`, or any other array, include the **complete** desired value in the project layer — the global array is fully replaced.\n\n### A provider is still available after editing config\n\n- Check whether you disabled the model provider id (e.g. `anthropic`) or a discovery source id (e.g. `claude`) — they are different namespaces with different effects.\n- Check for a project (or overlay) `disabledProviders` array replacing your global one.\n- Credentials can still come from environment variables, `.env`, OAuth, stored auth, or `models.yml`; disabling a provider blocks selection regardless, but verify you edited the right layer. See [Providers](./providers.md).\n- Restart the session if the model list was already initialized.\n\n### `omp config set` changed the wrong file\n\n`omp config set` and `omp config reset` always write the global `config.yml` under the active agent directory. Run `omp config path` to print it. For project-local settings, edit `<repo>/.omp/config.yml` directly.\n\n### `omp config reset` did not remove my key\n\n`reset` writes the schema **default** value into the global config — it persists the default rather than deleting the key. To stop overriding a project value from global config, delete the key from `~/.omp/agent/config.yml` by hand.\n\n### A `--config` overlay fails at startup\n\n`--config` files are process-local YAML mappings. A missing file, invalid YAML, or a top-level array/scalar is a hard error — it does not silently fall back to lower-precedence settings. Fix the path or contents.\n\n### An environment variable beats my config\n\nSome settings (model roles, eval backends, tiny-model device/precision, auth broker, PTY) are overridable by env vars or CLI flags for per-machine convenience, and those take precedence over `config.yml`. Unset the variable or drop the flag to let the persisted value win. See [Environment overrides](#environment-overrides) and [Environment variables](./environment-variables.md).\n\n### `omp config set <key>` says \"Unknown setting\"\n\nKeys must match a schema path exactly, with no shorthand. Use `theme.dark`, not `theme`. Run `omp config list` to see every valid key.\n",
65
+ "settings.md": "# Settings\n\n`omp` resolves settings from built-in defaults, a persistent global config file, optional project-local config, one-shot CLI overlays, and in-memory runtime overrides. Reach for project settings when one repository needs a different provider set, model role, tool policy, memory backend, or UI behavior than your global defaults — without touching your machine-wide configuration.\n\nSettings are stored as plain YAML mappings. Every key, its type, default, and enum values come from the settings schema, and you can inspect or change any of them with `omp config` or the interactive `/settings` panel.\n\n- For model/provider credentials, `.env` files, and the env-var table that resolves API keys, see [Providers](./providers.md).\n- For custom model definitions in `models.yml`, see [Models](./models.md).\n- For instruction files discovered into the agent context (`AGENTS.md`, `.omp/`, etc.), see [Context files](./context-files.md).\n- For the full catalog of environment variables, see [Environment variables](./environment-variables.md).\n\n## Where settings live\n\n| Scope | Path | Read behavior | Write behavior |\n|---|---|---|---|\n| Global | `~/.omp/agent/config.yml` | The main persistent settings file. Always loaded. | `/settings`, `omp config set`, and `omp config reset` write here. |\n| Global legacy | `~/.omp/agent/settings.json` | Migrated into `config.yml` once, only when `config.yml` does not yet exist. | Not written after migration; the original is renamed to `settings.json.bak`. |\n| Project | `<cwd>/.omp/config.yml` (plus `.omp/settings.json`) | Loaded when the process working directory has a non-empty `.omp/`. | Read-only from settings commands; edit the file by hand. |\n| Project legacy | `<cwd>/.omp/settings.json` | Still read; project `config.yml` is merged on top of it. | Not written by settings commands. |\n| CLI overlay | Any file passed with `--config <file>` | Loaded after global and project settings, for that one process. Repeatable. | Never persisted. |\n| Runtime overrides | In-memory only | Set by dedicated CLI flags (`--model`, `--approval-mode`, …) and feature env vars. | Never persisted. |\n\n`PI_CODING_AGENT_DIR` relocates the `~/.omp/agent` base directory. When it is set, the global `config.yml`, the auth store (`agent.db`), and everything else under the agent directory move with it. Use `omp config path` to print the active agent directory.\n\nNative project settings are intentionally scoped to the process working directory's `.omp/` folder — settings discovery does **not** walk ancestor directories looking for the nearest `.omp/`. Other discovery providers (Claude, Codex, Gemini, Cursor, OpenCode) can also contribute project-level settings from their own files; those are read-only from `omp` settings commands and can be turned off by provider id (see [Provider and source disabling](#provider-and-source-disabling)).\n\n## Config file formats\n\nThe global `config.yml` is always YAML. The generic config loader used for other files (for example `models.yml`) accepts `.yml`, `.yaml`, `.json`, and `.jsonc`:\n\n- When a `.yml`/`.yaml` path is requested and only a sibling `.json` exists, it is migrated to YAML automatically (idempotent, once per process).\n- `.json` and `.jsonc` configs are read as-is, with no migration.\n- A file whose top level is not a mapping (a bare array or scalar) is treated as empty for persistent settings, and is a hard error for `--config` overlays.\n\n## Reading and writing settings\n\nUse the interactive `/settings` panel inside a session, or the `omp config` command from a shell. Both operate on the merged effective settings, but every persistent write lands in the **global** file only.\n\n```bash\nomp config list # all settings with current effective values\nomp config list --json # same, machine-readable\nomp config get theme.dark # one value\nomp config get theme.dark --json\nomp config set compaction.enabled false\nomp config set defaultThinkingLevel medium\nomp config reset steeringMode # restore a key to its schema default\nomp config path # print the active agent directory\n```\n\n### Subcommands\n\n| Command | Effect |\n|---|---|\n| `omp config list` | Print every setting grouped by tab, with its current value and type. `--json` emits an object keyed by setting path with `{ value, type, description }`. |\n| `omp config get <key>` | Print the effective value of one key. Unknown keys exit non-zero. `--json` emits `{ key, value, type, description }`. |\n| `omp config set <key> <value>` | Parse `<value>` against the key's schema type and write it to the global `config.yml`. |\n| `omp config reset <key>` | Write the key's schema **default** back to the global config (this persists the default, it does not delete the key). |\n| `omp config path` | Print the active agent directory (honors `PI_CODING_AGENT_DIR`). |\n\n`omp config` with no subcommand, or `--help`, prints the help and lists settings. The `--json` flag is accepted by `list`, `get`, `set`, and `reset`.\n\n### Value parsing\n\n`omp config set` parses the value string according to the target key's schema type. The string is trimmed first.\n\n| Type | Accepted input | Notes |\n|---|---|---|\n| boolean | `true`, `false`, `yes`, `no`, `on`, `off`, `1`, `0` | Case-insensitive. Anything else is rejected. |\n| number | Any finite JavaScript number | `Infinity`/`NaN` are rejected. |\n| enum | One of the key's allowed values | Must match exactly; the error lists the valid values. |\n| array | A JSON array | e.g. `'[\"anthropic\",\"openai\"]'`. Must parse and be an array. |\n| record | A JSON object | e.g. `'{\"bash\":\"prompt\"}'`. Must parse and be a non-array object. |\n| string | Stored as given (trimmed) | Multi-word values are joined with spaces. |\n\nKeys must match a real schema path exactly. There is no shorthand — set `theme.dark`, not `theme`.\n\n### Where writes go\n\n`omp config set`, `omp config reset`, `/settings`, and any runtime settings change all write to the global `config.yml` under the active agent directory. They never write to `<cwd>/.omp/config.yml`. To create a project-local override, edit that file directly (see [Project-local config](#project-local-config)). Saves are debounced and re-read the file under a lock, so external edits made while a session is open are preserved.\n\n## Precedence\n\nFrom lowest to highest priority, the effective value of a setting is built as:\n\n```text\nbuilt-in defaults <- global config <- project config <- CLI overlays <- runtime overrides\n```\n\nFrom highest to lowest:\n\n1. **Runtime overrides** — dedicated CLI flags and feature env vars applied in memory for the current process: `--model`, `--smol`, `--slow`, `--plan`, `--approval-mode`, `--auto-approve`/`--yolo`, `--hide-thinking`, `--no-pty`, `--api-key`, and protocol-mode defaults. Never persisted.\n2. **CLI config overlays** — each `--config <file>`; later overlay files override earlier ones.\n3. **Project settings** — `<cwd>/.omp/settings.json` then `<cwd>/.omp/config.yml` (and contributions from other discovery providers at project level).\n4. **Global settings** — `~/.omp/agent/config.yml`.\n5. **Built-in defaults** — from the settings schema.\n\nA key that is unset at every layer resolves to its schema default at read time.\n\n### Environment overrides\n\nEnvironment variables are **not** a single settings layer. Each is read by the feature that owns the value, usually as a per-machine override or fallback, and is never written back to `config.yml`. The ones that map directly onto a setting:\n\n| Env var | Overrides setting | Notes |\n|---|---|---|\n| `PI_SMOL_MODEL` | `modelRoles.smol` | Also exposed as `--smol`. |\n| `PI_SLOW_MODEL` | `modelRoles.slow` | Also exposed as `--slow`. |\n| `PI_PLAN_MODEL` | `modelRoles.plan` | Also exposed as `--plan`. |\n| `PI_NO_PTY=1` | (disables PTY bash) | Equivalent to `--no-pty` for the process. |\n| `PI_PY` | `eval.py` | `PI_PY=0` disables the Python eval backend. |\n| `PI_JS` | `eval.js` | `PI_JS=0` disables the JavaScript eval backend. |\n| `PI_TINY_DEVICE` | `providers.tinyModelDevice` | ONNX execution provider for local tiny models. |\n| `PI_TINY_DTYPE` | `providers.tinyModelDtype` | ONNX precision for local tiny models. |\n| `OMP_AUTH_BROKER_URL` | `auth.broker.url` | Env value takes precedence over config. |\n| `OMP_AUTH_BROKER_TOKEN` | `auth.broker.token` | Env value takes precedence over config. |\n| `PI_CODING_AGENT_DIR` | (relocates agent dir) | Moves `config.yml`, `agent.db`, and the whole agent base. |\n\nProvider API keys are resolved separately (stored auth, OAuth, `models.yml`, environment, and `.env` files); see [Providers](./providers.md) and the full [Environment variables](./environment-variables.md) reference.\n\n## Merge rules\n\nLayers are combined with a deep merge:\n\n- **Objects are deep-merged** — keys present only in a lower layer are kept; keys present in a higher layer override.\n- **Scalars and arrays are replaced wholesale** by the higher-precedence layer. A higher layer's array does not append to a lower layer's array.\n\nUse nested YAML mappings for dotted setting paths:\n\n```yaml\ntheme:\n dark: titanium\n light: light\n\ntools:\n approvalMode: write\n approval:\n bash: prompt\n read: allow\n```\n\n### Worked example: global vs. project\n\n```yaml\n# ~/.omp/agent/config.yml\ntools:\n approvalMode: write\n approval:\n bash: prompt\n read: allow\ndisabledProviders:\n - anthropic\n - openai\n - gemini\n\n# <repo>/.omp/config.yml\ntools:\n approval:\n bash: allow\ndisabledProviders:\n - groq\n```\n\nEffective settings inside `<repo>`:\n\n```yaml\ntools:\n approvalMode: write # kept from global (object deep-merge)\n approval:\n bash: allow # overridden by project\n read: allow # kept from global\ndisabledProviders:\n - groq # project array REPLACES the global array\n```\n\nArray replacement is the most common surprise: the project's `disabledProviders` does not extend the global list — it becomes the entire list for that project. The same applies to `enabledModels`, `cycleOrder`, `extensions`, and every other array-typed setting.\n\n## Project-local config\n\nCreate `<repo>/.omp/config.yml` when a repository needs its own settings:\n\n```yaml\n# <repo>/.omp/config.yml\nmodelRoles:\n default: anthropic/claude-sonnet-4-5\n smol: openai/gpt-4.1-mini\n slow: anthropic/claude-opus-4-5:high\n\ntools:\n approvalMode: write\n approval:\n bash: prompt\n\ncompaction:\n strategy: context-full\n thresholdPercent: 80\n\ntheme:\n dark: titanium\n```\n\nKeep secrets out of committed project config unless your repository policy allows it. Prefer environment variables, stored auth, an auth broker, or an untracked `--config` overlay for credentials.\n\n### One-shot overlays\n\nUse `--config` for a temporary layer that should not persist:\n\n```bash\nomp --config ./local/ci-settings.yml \"check this failure\"\nomp --config ./base.yml --config ./experiment.yml \"try this model\"\n```\n\nOverlay paths are resolved relative to the process working directory (and `~` is expanded). Each overlay must parse as a YAML mapping; a missing file, invalid YAML, or a top-level array/scalar is a hard error — it does **not** silently fall back to lower-precedence settings.\n\n## Path-scoped arrays\n\nTwo array settings — `enabledModels` and `disabledProviders` — accept path-scoped entries in addition to bare strings, so a single global config can behave differently per directory:\n\n```yaml\nenabledModels:\n - claude-sonnet-4-5 # applies everywhere\n - path: ~/work/high-context\n models:\n - anthropic/claude-opus-4-5\n\ndisabledProviders:\n - ollama # applies everywhere\n - paths:\n - ~/projects/sensitive\n - ~/clients/acme\n providers:\n - anthropic\n - openai\n```\n\nBare string entries apply everywhere. A scoped entry applies when the current working directory **is** the configured path or is **under** it. `~` expands to your home directory and relative paths are resolved before matching.\n\nAccepted **path** keys (any of them, combined): `path`, `paths`, `pathPrefix`, `pathPrefixes`.\n\nAccepted **value** keys:\n\n- `models` (for `enabledModels`) or `providers` (for `disabledProviders`)\n- `values` or `items` (for either setting)\n\nOnly string values are kept; malformed scoped entries are ignored. Path scoping is resolved **after** the layer merge, so it reads the final effective array.\n\n## Provider and source disabling\n\n`disabledProviders` is a single shared id namespace that gates two different subsystems, before any credential check:\n\n| Entry kind | Example ids | Effect |\n|---|---|---|\n| Model providers | `anthropic`, `openai`, `gemini`, `groq`, `ollama`, `openrouter` | Removes those backends from model selection, even when credentials are available. See [Providers](./providers.md). |\n| Discovery sources | `native`, `claude`, `codex`, `gemini`, `github`, `opencode`, `cursor`, `agents-md` | Stops that source from contributing context files, MCP servers, commands, skills, hooks, tools, prompts, or settings. See [Context files](./context-files.md). |\n\nMost provider-control use cases list model provider ids. Disabling the `claude` discovery source is different from disabling the `anthropic` model provider — one stops Claude-format config discovery, the other stops the Anthropic model backend.\n\nBecause arrays replace rather than append, a project that sets `disabledProviders` must list the complete desired set:\n\n```yaml\n# ~/.omp/agent/config.yml\ndisabledProviders:\n - anthropic\n - openai\n\n# <repo>/.omp/config.yml — inside this repo ONLY groq is disabled\ndisabledProviders:\n - groq\n```\n\nThe default is an empty array (nothing disabled). For the two subsystems' provider ids and ordering, see [Providers](./providers.md) and [Context files](./context-files.md).\n\n## Settings catalog\n\nEvery key below is defined in the settings schema; `omp config list` shows the full set with current values. Defaults and enum values are taken from the schema. Settings that accept an env or flag override are noted; those overrides are process-local and not persisted.\n\n### Models\n\n`modelRoles`, `modelTags`, and `cycleOrder` work together to define the models you can switch between. Role values may carry a thinking suffix (`:minimal`, `:low`, `:medium`, `:high`, `:xhigh`).\n\n```yaml\nmodelRoles:\n default: anthropic/claude-sonnet-4-5\n smol: openai/gpt-4.1-mini\n slow: anthropic/claude-opus-4-5:high\n vision: gemini/gemini-3-pro-preview\n plan: anthropic/claude-opus-4-5\n advisor: anthropic/claude-sonnet-4-5:medium\n\ncycleOrder:\n - smol\n - default\n - slow\n\nmodelProviderOrder:\n - anthropic\n - openai\n\nenabledModels:\n - claude-sonnet-4-5\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `modelRoles` | record | `{}` | Map of role name -> model id. Built-in roles: `default`, `smol`, `slow`, `vision`, `plan`, `designer`, `commit`, `title`, `task`, `advisor`. Per-role env/flags exist only for `--model`/`--smol`/`--slow`/`--plan`; configure the advisor with `modelRoles.advisor`. |\n| `modelTags` | record | `{}` | Custom role/tag metadata; can introduce additional roles. |\n| `modelProviderOrder` | array | `[]` | Preferred provider order when a model id is ambiguous. |\n| `cycleOrder` | array | `[\"smol\",\"default\",\"slow\"]` | Roles cycled by the model switcher. |\n| `enabledModels` | array | `[]` | Allow-list of models; supports [path-scoped entries](#path-scoped-arrays). Empty means all available models. |\n| `disabledProviders` | array | `[]` | Disabled model/discovery providers; supports path-scoped entries. See [above](#provider-and-source-disabling). |\n| `includeModelInPrompt` | boolean | `true` | Include the active model name in the system prompt. |\n\nSee [Models](./models.md) for the `models.yml` schema and custom-provider definitions.\n\n### Advisor\n\nThe advisor is a second model that reviews each completed turn and can inject advice into the primary session. Assign a model with `modelRoles.advisor`, then enable it with `advisor.enabled` or `/advisor on`. There is no `--advisor` flag.\n\nSee [Advisor and WATCHDOG.md](./advisor-watchdog.md) for runtime behavior, `WATCHDOG.md` discovery, and bounded catch-up semantics.\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `advisor.enabled` | boolean | `false` | Enable the advisor runtime when `modelRoles.advisor` resolves to an available model. |\n| `advisor.subagents` | boolean | `false` | Also enable advisor runtimes for spawned task/eval subagents. |\n| `advisor.syncBacklog` | enum | `off` | Bounded advisor catch-up delay: `off`, `1`, `3`, or `5`. The primary waits up to 30 seconds only while advisor backlog is at or above the threshold. |\n\n### Thinking\n\n```yaml\ndefaultThinkingLevel: high\nhideThinkingBlock: false\nthinkingBudgets:\n minimal: 1024\n low: 2048\n medium: 8192\n high: 16384\n xhigh: 32768\n```\n\n| Key | Type | Default | Values |\n|---|---|---|---|\n| `defaultThinkingLevel` | enum | `high` | `minimal`, `low`, `medium`, `high`, `xhigh`, `auto`. Override per run with `--thinking`. |\n| `hideThinkingBlock` | boolean | `false` | Hide thinking blocks in output. `--hide-thinking` sets it for the run (display only). |\n| `thinkingBudgets.minimal` | number | `1024` | Token budget for the `minimal` level. |\n| `thinkingBudgets.low` | number | `2048` | Token budget for `low`. |\n| `thinkingBudgets.medium` | number | `8192` | Token budget for `medium`. |\n| `thinkingBudgets.high` | number | `16384` | Token budget for `high`. |\n| `thinkingBudgets.xhigh` | number | `32768` | Token budget for `xhigh`. |\n\n### Sampling\n\nA value of `-1` means \"use the provider/model default\" — `omp` does not send that parameter.\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `temperature` | number | `-1` | Sampling temperature. |\n| `topP` | number | `-1` | Nucleus sampling. |\n| `topK` | number | `-1` | Top-K sampling. |\n| `minP` | number | `-1` | Minimum-probability cutoff. |\n| `presencePenalty` | number | `-1` | Presence penalty. |\n| `repetitionPenalty` | number | `-1` | Repetition penalty. |\n| `serviceTier` | enum | `none` | `none`, `auto`, `default`, `flex`, `scale`, `priority`, `openai-only`, `claude-only`. |\n| `personality` | enum | `default` | `default`, `friendly`, `pragmatic`, `none`. |\n\n### Retry and fallback\n\n```yaml\nretry:\n enabled: true\n maxRetries: 10\n baseDelayMs: 500\n maxDelayMs: 300000\n modelFallback: true\n fallbackRevertPolicy: cooldown-expiry\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `retry.enabled` | boolean | `true` | Retry transient provider errors. |\n| `retry.maxRetries` | number | `10` | Max retries per request. |\n| `retry.baseDelayMs` | number | `500` | Initial backoff. |\n| `retry.maxDelayMs` | number | `300000` | Backoff ceiling (5 min). |\n| `retry.modelFallback` | boolean | `true` | Fall back to another model when one is unavailable. |\n| `retry.fallbackChains` | record | `{}` | Per-model fallback chains. |\n| `retry.fallbackRevertPolicy` | enum | `cooldown-expiry` | `cooldown-expiry`, `never`. |\n\n### Tools and approvals\n\n```yaml\ntools:\n approvalMode: yolo # default\n approval:\n bash: prompt\n edit: allow\n discoveryMode: auto\n maxTimeout: 0\n intentTracing: true\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `tools.approvalMode` | enum | `yolo` | `always-ask` (auto-approve read-only), `write` (auto-approve read + workspace-write), `yolo` (auto-approve all tiers). `--approval-mode` and `--auto-approve`/`--yolo` override per run. |\n| `tools.approval` | record | `{}` | Per-tool policy keyed by tool name; each value is `allow`, `deny`, or `prompt`. e.g. `omp config set tools.approval '{\"bash\":\"prompt\"}'`. |\n| `tools.discoveryMode` | enum | `auto` | `auto`, `off`, `mcp-only`, `all`. Controls dynamic tool discovery. |\n| `tools.essentialOverride` | array | `[]` | Tool names kept available even when tools are narrowed. |\n| `tools.maxTimeout` | number | `0` | Max tool runtime in seconds; `0` = no cap. |\n| `tools.intentTracing` | boolean | `true` | Record per-call intent strings. |\n| `tools.outputMaxColumns` | number | `768` | Per-line byte cap for streaming output; `0` disables. |\n| `tools.artifactSpillThreshold` | number | `50` | KB of tool output above which output spills to an artifact. |\n| `tools.artifactHeadBytes` | number | `20` | KB of head kept inline on spill; `0` = tail-only. |\n| `tools.artifactTailBytes` | number | `20` | KB of tail kept inline on spill. |\n| `tools.artifactTailLines` | number | `500` | Max tail lines kept inline on spill. |\n\nIndividual built-in tools are toggled by their own keys, e.g. `bash.enabled`, `eval.py`, `eval.js`, `find.enabled`, `search.enabled`, `fetch.enabled`, `browser.enabled`, `astEdit.enabled`, `astGrep.enabled`, `web_search.enabled`, `inspect_image.enabled`, `renderMermaid.enabled`.\n\n### Shell, eval, and LSP\n\n```yaml\nbash:\n enabled: true\n stripTrailingHeadTail: true\n autoBackground:\n enabled: false\n thresholdMs: 60000\n\neval:\n py: true\n js: true\n\npython:\n kernelMode: session # session, per-call\n interpreter: \"\"\n\nlsp:\n enabled: true\n lazy: true\n diagnosticsOnWrite: true\n diagnosticsOnEdit: false\n formatOnWrite: false\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `bash.enabled` | boolean | `true` | Enable the bash tool. |\n| `bash.stripTrailingHeadTail` | boolean | `true` | Strip trailing head/tail noise from output. |\n| `bash.autoBackground.enabled` | boolean | `false` | Auto-background long-running commands. |\n| `bash.autoBackground.thresholdMs` | number | `60000` | Threshold before auto-backgrounding. |\n| `eval.py` | boolean | `true` | Python eval backend. `PI_PY=0` disables for the process. |\n| `eval.js` | boolean | `true` | JavaScript eval backend. `PI_JS=0` disables for the process. |\n| `python.kernelMode` | enum | `session` | `session` (persistent kernel) or `per-call`. |\n| `python.interpreter` | string | `\"\"` | Path to a Python interpreter; empty = auto-detect. |\n| `lsp.enabled` | boolean | `true` | Language-server integration. `--no-lsp` disables for the run. |\n| `lsp.lazy` | boolean | `true` | Start servers on demand. |\n| `lsp.diagnosticsOnWrite` | boolean | `true` | Run diagnostics after a write. |\n| `lsp.diagnosticsOnEdit` | boolean | `false` | Run diagnostics after an edit. |\n| `lsp.formatOnWrite` | boolean | `false` | Format files on write. |\n| `lsp.diagnosticsDeduplicate` | boolean | `true` | Collapse duplicate diagnostics. |\n| `shellPath` | string | _(unset)_ | Override the shell binary used by bash. |\n\n### Files: editing and reading\n\n```yaml\nedit:\n mode: hashline # apply_patch, hashline, patch, replace\n fuzzyMatch: true\n fuzzyThreshold: 0.95\n blockAutoGenerated: true\n\nread:\n defaultLimit: 300\n toolResultPreview: false\n summarize:\n enabled: true\n prose: false\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `edit.mode` | enum | `hashline` | `apply_patch`, `hashline`, `patch`, `replace`. |\n| `edit.fuzzyMatch` | boolean | `true` | Allow fuzzy anchor matching. |\n| `edit.fuzzyThreshold` | number | `0.95` | Similarity threshold for fuzzy matching. |\n| `edit.blockAutoGenerated` | boolean | `true` | Refuse to edit generated/lockfile-like files. |\n| `edit.streamingAbort` | boolean | `false` | Abort on streaming edit mismatch. |\n| `read.defaultLimit` | number | `300` | Default line count for `read` without a selector. |\n| `read.summarize.enabled` | boolean | `true` | Structural summaries for code reads. |\n| `read.summarize.prose` | boolean | `false` | Summarize prose files too. |\n| `read.toolResultPreview` | boolean | `false` | Inline preview of tool results. |\n| `readHashLines` | boolean | `true` | Show hashline tags in read output. |\n| `readLineNumbers` | boolean | `false` | Show plain line numbers. |\n\n### Context, compaction, and memory\n\n```yaml\ncontextPromotion:\n enabled: true\n\ncompaction:\n enabled: true\n strategy: context-full # context-full, handoff, shake, snapcompact, off\n thresholdPercent: -1 # -1 = default reserve-based behavior\n thresholdTokens: -1 # fixed token limit when > 0\n remoteEnabled: true\n\nmemory:\n backend: off # off, local, hindsight, mnemopi\n```\n\n| Key | Type | Default | Notes |\n|---|---|---|---|\n| `contextPromotion.enabled` | boolean | `true` | Promote relevant earlier context. |\n| `compaction.enabled` | boolean | `true` | Automatic conversation compaction. |\n| `compaction.strategy` | enum | `context-full` | `context-full`, `handoff`, `shake`, `snapcompact`, `off`. |\n| `compaction.thresholdPercent` | number | `-1` | Percent-of-context trigger; `-1` = reserve-based default. |\n| `compaction.thresholdTokens` | number | `-1` | Fixed token trigger when `> 0`. |\n| `compaction.reserveTokens` | number | `16384` | Tokens reserved for the next turn. |\n| `compaction.keepRecentTokens` | number | `20000` | Recent tokens always preserved. |\n| `compaction.remoteEnabled` | boolean | `true` | Allow remote compaction service. |\n| `compaction.autoContinue` | boolean | `true` | Continue automatically after compaction. |\n| `memory.backend` | enum | `off` | `off`, `local`, `hindsight`, `mnemopi`. Each backend has its own `hindsight.*` / `mnemopi.*` / `memories.*` tuning keys. |\n| `autolearn.enabled` | boolean | `false` | Experimental: after the agent stops, nudge it to capture lessons to memory and create/enhance isolated managed skills under `~/.omp/agent/managed-skills`. Enables the `manage_skill` tool (and `learn` when a memory backend is active). |\n| `autolearn.autoContinue` | boolean | `false` | When `autolearn.enabled`, auto-run one capture turn at stop (uses extra tokens). Off = a passive reminder rides your next turn. |\n| `autolearn.minToolCalls` | number | `5` | Only nudge after a turn that used at least this many tools. |\n\n`compaction` has additional tuning keys (idle compaction, supersede/drop heuristics) visible in `omp config list`. See [Compaction](./compaction.md) for the full strategy reference.\n\n### Appearance and terminal\n\n```yaml\ntheme:\n dark: titanium\n light: light\nsymbolPreset: unicode # unicode, nerd, ascii\ncolorBlindMode: false\n\nstatusLine:\n preset: default # default, minimal, compact, full, nerd, ascii, custom\n separator: powerline-thin\n transparent: false\n showHookStatus: true\n\nterminal:\n showImages: true\nimages:\n autoResize: true\n blockImages: false\ntui:\n hyperlinks: auto # off, auto, always\n```\n\n| Key | Type | Default | Values |\n|---|---|---|---|\n| `theme.dark` | string | `titanium` | Theme used on a dark terminal background. |\n| `theme.light` | string | `light` | Theme used on a light terminal background. |\n| `symbolPreset` | enum | `unicode` | `unicode`, `nerd`, `ascii`. |\n| `colorBlindMode` | boolean | `false` | Use blue instead of green for diff additions. |\n| `showHardwareCursor` | boolean | `true` | Show the terminal hardware cursor. |\n| `statusLine.preset` | enum | `default` | `default`, `minimal`, `compact`, `full`, `nerd`, `ascii`, `custom`. |\n| `statusLine.separator` | enum | `powerline-thin` | `powerline`, `powerline-thin`, `slash`, `pipe`, `block`, `none`, `ascii`. |\n| `statusLine.sessionAccent` | boolean | `true` | Tint the editor border with the session color. |\n| `statusLine.transparent` | boolean | `false` | Use the terminal background for the status line. |\n| `statusLine.showHookStatus` | boolean | `true` | Show hook status messages. |\n| `terminal.showImages` | boolean | `true` | Render images inline (when the terminal supports it). |\n| `images.autoResize` | boolean | `true` | Resize large images for model compatibility. |\n| `images.blockImages` | boolean | `false` | Never send images to providers. |\n| `tui.hyperlinks` | enum | `auto` | `off`, `auto`, `always`. |\n\nFor a custom status line, set `statusLine.preset: custom` and configure `statusLine.leftSegments`, `statusLine.rightSegments`, and `statusLine.segmentOptions`.\n\n### Interaction\n\n| Key | Type | Default | Values |\n|---|---|---|---|\n| `steeringMode` | enum | `one-at-a-time` | `all`, `one-at-a-time`. How queued steering messages are delivered. |\n| `followUpMode` | enum | `one-at-a-time` | `all`, `one-at-a-time`. |\n| `interruptMode` | enum | `immediate` | `immediate`, `wait`. |\n| `doubleEscapeAction` | enum | `tree` | `branch`, `tree`, `none`. |\n| `autoResume` | boolean | `false` | Auto-resume the most recent session in the cwd. |\n| `ask.timeout` | number | `0` | Seconds before an `ask` prompt times out; `0` = no timeout. (Legacy ms values are migrated to seconds.) |\n| `ask.notify` | enum | `on` | `on`, `off`. |\n\n### Providers and services\n\n```yaml\nproviders:\n webSearch: auto\n image: auto\n fetch: auto\n tinyModel: online\n tinyModelDevice: default\n tinyModelDtype: default\n openaiWebsockets: auto\n openrouterVariant: default\n kimiApiFormat: anthropic\n\nprovider:\n appendOnlyContext: auto # auto, on, off\n\nexa:\n enabled: true\n enableSearch: true\n enableResearcher: false\n enableWebsets: false\n\nsearxng:\n endpoint: https://search.example.com\n token: SEARXNG_TOKEN\n```\n\n| Key | Type | Default | Values / notes |\n|---|---|---|---|\n| `providers.webSearch` | enum | `auto` | `auto` plus the configured search providers (`tavily`, `perplexity`, `brave`, `jina`, `kimi`, `anthropic`, `gemini`, `codex`, `zai`, `exa`, `parallel`, `kagi`, `synthetic`, `searxng`). |\n| `providers.image` | enum | `auto` | `auto`, `openai`, `antigravity`, `xai`, `gemini`, `openrouter`. |\n| `providers.fetch` | enum | `auto` | `auto`, `native`, `trafilatura`, `lynx`, `parallel`, `jina`. |\n| `providers.tinyModel` | enum | `online` | `online` or a local model (`lfm2-350m`, `qwen3-0.6b`, `gemma-270m`, `qwen2.5-0.5b`, `lfm2-700m`). |\n| `providers.tinyModelDevice` | enum | `default` | ONNX execution provider for local tiny models. Overridden by `PI_TINY_DEVICE`. |\n| `providers.tinyModelDtype` | enum | `default` | ONNX precision for local tiny models. Overridden by `PI_TINY_DTYPE`. |\n| `providers.openaiWebsockets` | enum | `auto` | `auto`, `off`, `on`. |\n| `providers.openrouterVariant` | enum | `default` | `default`, `nitro`, `floor`, `online`, `exacto`. |\n| `providers.kimiApiFormat` | enum | `anthropic` | `openai`, `anthropic`. |\n| `provider.appendOnlyContext` | enum | `auto` | `auto`, `on`, `off`. |\n| `exa.enabled` | boolean | `true` | Enable Exa integration. |\n| `exa.enableSearch` | boolean | `true` | Exa search. |\n| `exa.enableResearcher` | boolean | `false` | Exa researcher. |\n| `exa.enableWebsets` | boolean | `false` | Exa websets. |\n| `searxng.endpoint` | string | _(unset)_ | SearXNG instance URL. |\n| `searxng.token` | string | _(unset)_ | SearXNG token; also `searxng.basicUsername`/`searxng.basicPassword`/`searxng.categories`/`searxng.language`. |\n| `auth.broker.url` | string | _(unset)_ | Auth-broker URL. Overridden by `OMP_AUTH_BROKER_URL`. |\n| `auth.broker.token` | string | _(unset)_ | Auth-broker token. Overridden by `OMP_AUTH_BROKER_TOKEN`. |\n\nProvider credentials and custom model definitions are configured separately — see [Providers](./providers.md) and [Models](./models.md).\n\n### Other groups\n\n`omp config list` exposes many more grouped settings, including: `task.*` (subagent concurrency, isolation, model overrides), `skills.*` and `commands.*` (discovery toggles), `mcp.*`, `github.*`, `async.*`, `goal.*`, `loop.*`, `todo.*`, `magicKeywords.*`, `ttsr.*` (sticky rules), `display.*`, `startup.*`, `share.*`, `collab.*`, `stt.*`/`tts.*`, `memories.*`/`hindsight.*`/`mnemopi.*` (memory backends), and `bashInterceptor.*`. Each follows the same type/default rules shown above.\n\n## Legacy migration\n\n`omp` migrates older config shapes automatically. None of these require action; they are listed so you know what changes you may see in `config.yml`.\n\n### Startup migration to `config.yml`\n\nWhen `~/.omp/agent/config.yml` does not exist, startup builds it once from legacy sources, then writes the result:\n\n1. `~/.omp/agent/settings.json` (renamed to `settings.json.bak` after a successful migration).\n2. Settings persisted in `agent.db`.\n\nAfter `config.yml` exists, these legacy sources are no longer consulted. The generic config loader also performs `.json` -> `.yml` migration for other config files when only the `.json` form is present.\n\n### Field-level migrations\n\nApplied whenever raw settings are loaded (global, project, overlays, and runtime overrides):\n\n| Old | New |\n|---|---|\n| `queueMode` | `steeringMode` |\n| `ask.timeout` in milliseconds (value `> 1000`) | seconds (divided by 1000) |\n| flat `theme: \"<name>\"` string | `theme.dark` / `theme.light` (slot chosen by luminance; built-in `light`/`dark` are dropped to use defaults) |\n| `task.isolation.enabled: true/false` | `task.isolation.mode: auto/none` |\n| `task.simple` | removed |\n| legacy `task.isolation.mode` (`worktree`, `fuse-overlay`, `fuse-projfs`) | `rcopy`, `overlayfs`, `projfs` |\n| `lastChangelogVersion` | moved to a marker file and stripped from `config.yml` |\n\n## Troubleshooting\n\n### A project setting is not taking effect\n\n- Start `omp` from the directory that contains `.omp/config.yml`. Settings discovery only checks the current working directory's `.omp/`, not ancestor directories.\n- Ensure `.omp/` is non-empty; empty config directories are ignored.\n- Confirm the file is valid YAML and its top level is a mapping.\n- Run `omp config get <key>` from that directory to see the effective value.\n- Remember that `--config` overlays and runtime flags override project config.\n\n### A global array disappeared in a project\n\nArrays replace; they do not append. If a project sets `disabledProviders`, `enabledModels`, `cycleOrder`, `extensions`, or any other array, include the **complete** desired value in the project layer — the global array is fully replaced.\n\n### A provider is still available after editing config\n\n- Check whether you disabled the model provider id (e.g. `anthropic`) or a discovery source id (e.g. `claude`) — they are different namespaces with different effects.\n- Check for a project (or overlay) `disabledProviders` array replacing your global one.\n- Credentials can still come from environment variables, `.env`, OAuth, stored auth, or `models.yml`; disabling a provider blocks selection regardless, but verify you edited the right layer. See [Providers](./providers.md).\n- Restart the session if the model list was already initialized.\n\n### `omp config set` changed the wrong file\n\n`omp config set` and `omp config reset` always write the global `config.yml` under the active agent directory. Run `omp config path` to print it. For project-local settings, edit `<repo>/.omp/config.yml` directly.\n\n### `omp config reset` did not remove my key\n\n`reset` writes the schema **default** value into the global config — it persists the default rather than deleting the key. To stop overriding a project value from global config, delete the key from `~/.omp/agent/config.yml` by hand.\n\n### A `--config` overlay fails at startup\n\n`--config` files are process-local YAML mappings. A missing file, invalid YAML, or a top-level array/scalar is a hard error — it does not silently fall back to lower-precedence settings. Fix the path or contents.\n\n### An environment variable beats my config\n\nSome settings (model roles, eval backends, tiny-model device/precision, auth broker, PTY) are overridable by env vars or CLI flags for per-machine convenience, and those take precedence over `config.yml`. Unset the variable or drop the flag to let the persisted value win. See [Environment overrides](#environment-overrides) and [Environment variables](./environment-variables.md).\n\n### `omp config set <key>` says \"Unknown setting\"\n\nKeys must match a schema path exactly, with no shorthand. Use `theme.dark`, not `theme`. Run `omp config list` to see every valid key.\n",
65
66
  "skills.md": "# Skills\n\nSkills are file-backed capability packs discovered at startup and exposed to the model as:\n\n- lightweight metadata in the system prompt (name + description)\n- on-demand content via the `read` tool against `skill://...`\n- optional interactive `/skill:<name>` commands\n\nThis document covers current runtime behavior in `src/extensibility/skills.ts`, `src/discovery/builtin.ts`, `src/internal-urls/skill-protocol.ts`, and `src/discovery/agents-md.ts`.\n\n## What a skill is in this codebase\n\nA discovered skill is represented as:\n\n- `name`\n- `description`\n- `filePath` (the `SKILL.md` path)\n- `baseDir` (skill directory)\n- source metadata (`provider`, `level`, path)\n\nThe runtime only requires `name` and `path` for validity. In practice, matching quality depends on `description` being meaningful.\n\n## Required layout and SKILL.md expectations\n\n### Directory layout\n\nFor provider-based discovery (native/Claude/Codex/Agents/plugin providers), skills are discovered as **one level under `skills/`**:\n\n- `<skills-root>/<skill-name>/SKILL.md`\n\nNested patterns like `<skills-root>/group/<skill>/SKILL.md` are not discovered by provider loaders.\n\nFor `skills.customDirectories`, scanning uses the same non-recursive layout (`*/SKILL.md`).\n\n```text\nProvider-discovered layout (non-recursive under skills/):\n\n<root>/skills/\n ├─ postgres/\n │ └─ SKILL.md ✅ discovered\n ├─ pdf/\n │ └─ SKILL.md ✅ discovered\n └─ team/\n └─ internal/\n └─ SKILL.md ❌ not discovered by provider loaders\n\nCustom-directory scanning is also non-recursive, so nested paths are ignored unless you point `customDirectories` at that nested parent.\n```\n\n### `SKILL.md` frontmatter\n\nSupported frontmatter fields on the skill type:\n\n- `name?: string`\n- `description?: string`\n- `globs?: string[]`\n- `alwaysApply?: boolean`\n- `hide?: boolean`\n- `disableModelInvocation?: boolean` (Agent Skills equivalent of `hide`; normalized from kebab-case `disable-model-invocation`)\n- additional keys are preserved as unknown metadata\n\nCurrent runtime behavior:\n\n- `name` defaults to the skill directory name\n- `description` is required for:\n - native `.omp` provider skill discovery (`requireDescription: true`)\n - `omp-plugins` extension-package skills and the `github` provider (`.github/skills/`), which also pass `requireDescription: true`\n - `skills.customDirectories` scans via `scanSkillsFromDir` in `src/discovery/helpers.ts` (non-recursive)\n- the claude/codex/agents/opencode/claude-plugins providers can load skills without description\n\n## Discovery pipeline\n\n`loadSkills()` in `src/extensibility/skills.ts` does two passes:\n\n1. **Capability providers** via `loadCapability(\"skills\")`\n2. **Custom directories** via `scanSkillsFromDir(..., { requireDescription: true })` (one-level directory enumeration)\n\nIf `skills.enabled` is `false`, discovery returns no skills.\n\n### Built-in skill providers and precedence\n\nProvider ordering is priority-first (higher wins), then registration order for ties.\n\nCurrent registered skill providers:\n\n1. `native` (priority 100) — `.omp` user/project skills via `src/discovery/builtin.ts`\n2. `omp-plugins` (priority 90) — `skills/` bundled next to extension packages loaded through `extensions:`, `--extension`/`-e`, or installed plugins under `~/.omp/plugins/node_modules`\n3. `claude` (priority 80)\n4. priority 70 group (in registration order):\n - `claude-plugins`\n - `agents`\n - `codex`\n5. `opencode` (priority 55)\n6. `github` (priority 30) — `.github/skills/<name>/SKILL.md` (GitHub Agent Skills layout, project-only)\n\nDedup key is skill name. First item with a given name wins.\n\n### Source toggles and filtering\n\n`loadSkills()` applies these controls:\n\n- source toggles: `enableCodexUser`, `enableClaudeUser`, `enableClaudeProject`, `enablePiUser`, `enablePiProject`, `enableAgentsUser`, `enableAgentsProject`\n- `disabledExtensions` entries with `skill:<name>`\n- `ignoredSkills` (exclude; glob patterns)\n- `includeSkills` (include allowlist; glob patterns; empty means include all)\n\nFilter order is:\n\n1. not disabled by `disabledExtensions`\n2. source enabled\n3. not ignored\n4. included (if include list present)\n\nThe `agents` provider (`.agent[s]/skills`) is the canonical OMP-native location and has its own `enableAgentsUser`/`enableAgentsProject` toggles — disabling Claude/Codex/Pi does **not** turn it off. For providers without a dedicated toggle (`claude-plugins`, `opencode`, `gemini`, `github`, …), enablement falls back to: enabled if **any** named source toggle is enabled.\n\n### Collision and duplicate handling\n\n- Capability dedup already keeps first skill per name (highest-precedence provider)\n- `extensibility/skills.ts` additionally:\n - de-duplicates identical files by `realpath` (symlink-safe)\n - emits collision warnings when a later skill name conflicts\n - keeps the convenience `loadSkillsFromDir({ dir, source })` API as a thin adapter over `scanSkillsFromDir`\n- Custom-directory skills are merged after provider skills and follow the same collision behavior\n\n## Runtime usage behavior\n\n### System prompt exposure\n\nSystem prompt construction (`src/system-prompt.ts`) uses discovered skills as follows:\n\n- if `read` tool is available:\n - include discovered skills list in prompt, excluding skills with `hide: true`\n- otherwise:\n - omit discovered list\n\n`hide: true` does not disable the skill. Hidden skills are still loaded and remain reachable through `skill://<name>` and `/skill:<name>` when skill commands are enabled.\n\nTask tool subagents receive the session's discovered/provided skills list via normal session creation; there is no per-task skill pinning override.\n\n### Interactive `/skill:<name>` commands\n\nIf `skills.enableSkillCommands` is true, interactive mode registers one slash command per discovered skill.\n\n`/skill:<name> [args]` behavior:\n\n- reads the skill file directly from `filePath`\n- strips frontmatter\n- injects skill body as a custom message\n- delivery mode follows the **submission keybinding**:\n - **Enter** → invokes the skill on the `steer` queue while streaming (matches free-text Enter, which also steers), or as a normal idle prompt when the agent is not streaming\n - **Ctrl+Enter** (`app.message.followUp`) → invokes the skill on the `followUp` queue while streaming, or as a normal idle prompt when the agent is not streaming\n- appends metadata (`Skill: <path>`, optional `User: <args>`)\n\nThere is no flag, mode-selector, or frontmatter knob to override this — the keybinding _is_ the choice, identical to how free text is routed during streaming (`input-controller.ts:436-442` for Enter, `input-controller.ts:770-775` for Ctrl+Enter; both dispatch through `#invokeSkillCommand`).\n\n## `skill://` URL behavior\n\n`src/internal-urls/skill-protocol.ts` supports:\n\n- `skill://<name>` → resolves to that skill's `SKILL.md`\n- `skill://<name>/<relative-path>` → resolves inside that skill directory\n\n```text\nskill:// URL resolution\n\nskill://pdf\n -> <pdf-base>/SKILL.md\n\nskill://pdf/references/tables.md\n -> <pdf-base>/references/tables.md\n\nGuards:\n- reject absolute paths\n- reject `..` traversal\n- reject any resolved path escaping <pdf-base>\n```\n\nResolution details:\n\n- skill name must match exactly\n- relative paths are URL-decoded\n- absolute paths are rejected\n- path traversal (`..`) is rejected\n- resolved path must remain within `baseDir`\n- missing files return an explicit `File not found` error\n\nContent type:\n\n- `.md` => `text/markdown`\n- everything else => `text/plain`\n\nNo fallback search is performed for missing assets.\n\n## Skills vs AGENTS.md, commands, tools, hooks\n\n### Skills vs AGENTS.md\n\n- **Skills**: named, optional capability packs selected by task context or explicitly requested\n- **AGENTS.md/context files**: persistent instruction files loaded as context-file capability and merged by level/depth rules\n\n`src/discovery/agents-md.ts` specifically walks ancestor directories from `cwd` to discover standalone `AGENTS.md` files (stopping at the repo root, or home when no repo root is known), skipping files whose containing directory name starts with a dot.\n\n### Skills vs slash commands\n\n- **Skills**: model-readable knowledge/workflow content\n- **Slash commands**: user-invoked command entry points\n- `/skill:<name>` is a convenience wrapper that injects skill text; it does not change skill discovery semantics\n\n### Skills vs custom tools\n\n- **Skills**: documentation/workflow content loaded through prompt context and `read`\n- **Custom tools**: executable tool APIs callable by the model with schemas and runtime side effects\n\n### Skills vs hooks\n\n- **Skills**: passive content\n- **Hooks**: event-driven runtime interceptors that can block/modify behavior during execution\n\n## Practical authoring guidance tied to discovery logic\n\n- Put each skill in its own directory: `<skills-root>/<skill-name>/SKILL.md`\n- Always include explicit `name` and `description` frontmatter\n- Keep referenced assets under the same skill directory and access with `skill://<name>/...`\n- For nested taxonomy (`team/domain/skill`), point `skills.customDirectories` to the nested parent directory; scanning itself remains non-recursive\n- Avoid duplicate skill names across sources; first match wins by provider precedence\n",
66
67
  "skills/authoring-extensions.md": "---\nname: authoring-extensions\ndescription: Use when creating a new omp extension. Covers ExtensionAPI, factory signature, tool/command/event registration, and local-dev testing.\n---\n\n# Authoring Extensions\n\nExtensions are the primary way to add capabilities to `oh-my-pi`. A single extension module can register tools the LLM can call, slash commands users can invoke, and event handlers that run throughout the session lifecycle — all from one TypeScript file.\n\n## Minimum viable extension\n\n```ts\nimport type { ExtensionAPI } from \"@oh-my-pi/pi-coding-agent\";\n\nexport default function (pi: ExtensionAPI) {\n pi.on(\"session_start\", async (_event, ctx) => {\n ctx.ui.notify(\"My extension loaded!\", \"info\");\n });\n}\n```\n\nThat is a working extension. Drop it into `~/.omp/agent/extensions/hello.ts` and restart omp to see the notification.\n\n## Full example\n\nThe following extension registers a slash command, a tool, and a session-start hook:\n\n```ts\nimport type { ExtensionAPI } from \"@oh-my-pi/pi-coding-agent\";\n\nexport default function myExtension(pi: ExtensionAPI) {\n const z = pi.zod;\n\n // Runs once when the session loads\n pi.on(\"session_start\", async (_event, ctx) => {\n ctx.ui.notify(`Session ready in ${ctx.cwd}`, \"info\");\n });\n\n // Slash command: /greet\n pi.registerCommand(\"greet\", {\n description: \"Send a greeting into the conversation\",\n handler: async (args, ctx) => {\n const name = args.trim() || \"world\";\n pi.sendMessage(\n {\n customType: \"greeting\",\n content: `Hello, ${name}!`,\n display: true,\n attribution: \"user\",\n },\n { triggerTurn: false }\n );\n ctx.ui.notify(`Greeted ${name}`, \"info\");\n },\n });\n\n // LLM-callable tool\n pi.registerTool({\n name: \"word_count\",\n label: \"Word Count\",\n description: \"Count the words in a string\",\n parameters: z.object({\n text: z.string().describe(\"Text to count\"),\n }),\n async execute(_id, params, _signal, _onUpdate, _ctx) {\n const count = params.text.split(/\\s+/).filter(Boolean).length;\n return {\n content: [{ type: \"text\", text: String(count) }],\n details: { count },\n };\n },\n });\n}\n```\n\n## Discovery paths\n\nomp loads extension modules from these sources:\n\n1. Native `.omp` locations discovered through the capability system:\n - `<cwd>/.omp/extensions/`\n - `~/.omp/agent/extensions/`\n - legacy extension paths listed in `.omp/settings.json#extensions` or `~/.omp/agent/settings.json#extensions`\n2. Installed plugins under `~/.omp/plugins/node_modules` (`omp plugin install` npm/git specs, or `omp plugin link`) via their `omp.extensions`/`pi.extensions` manifests. Marketplace cache installs do not feed extension modules — they surface skills/commands/hooks/tools/MCP only.\n3. Explicit configured paths passed by the CLI (`omp --extension ./my-ext.ts`, also `-e`; `--hook` is treated as an alias) and by the `extensions:` setting in config.\n\nThe runtime de-duplicates by resolved absolute path — first seen wins.\n\nWhen a path points to a directory, omp resolves the entry point in this order:\n\n1. `package.json` with `omp.extensions` (or legacy `pi.extensions`) field\n2. `index.ts`\n3. `index.js`\n\nWhen scanning an `extensions/` directory, omp also loads direct `*.ts`/`*.js` files and one-level subdirectories that have `index.ts`, `index.js`, or a manifest.\n\nExtension packages can also bundle sibling capability directories. When a package is loaded through `extensions:` or `--extension`/`-e`, the `omp-plugins` provider discovers its `skills/`, `hooks/pre|post/`, `tools/`, `commands/`, `rules/`, `prompts/`, and `.mcp.json`.\n\n## package.json manifest\n\nTo package an extension as an installable plugin, add an `omp` field to `package.json`:\n\n```json\n{\n \"name\": \"my-omp-extension\",\n \"omp\": {\n \"extensions\": [\"./src/main.ts\"]\n }\n}\n```\n\nThe legacy `pi` key is also accepted for backwards compatibility:\n\n```json\n{\n \"pi\": {\n \"extensions\": [\"./index.ts\"]\n }\n}\n```\n\nMultiple entry points are supported:\n\n```json\n{\n \"omp\": {\n \"extensions\": [\"./src/safety.ts\", \"./src/tools.ts\"]\n }\n}\n```\n\n## Registering commands\n\n```ts\npi.registerCommand(\"my-cmd\", {\n description: \"What the command does\",\n handler: async (args, ctx) => {\n // args: everything the user typed after /my-cmd\n // ctx: ExtensionCommandContext — includes ctx.ui, ctx.cwd, session controls\n ctx.ui.notify(\"Running!\", \"info\");\n await ctx.waitForIdle();\n await ctx.newSession();\n },\n});\n```\n\n`ExtensionCommandContext` session-control methods (safe to call from commands only):\n\n| Method | Effect |\n|---|---|\n| `waitForIdle()` | Wait for the agent to finish streaming |\n| `newSession(opts?)` | Open a fresh session |\n| `switchSession(path)` | Switch to an existing session file |\n| `branch(entryId)` | Fork from a specific history entry |\n| `navigateTree(id, opts?)` | Jump to a different point in the session tree |\n| `reload()` | Reload the session runtime |\n| `compact(opts?)` | Compact the current context |\n\n## Registering tools\n\nTools are called by the LLM. Parameters use [Zod](https://zod.dev) schemas, available at `pi.zod`:\n\n```ts\nconst z = pi.zod;\n\npi.registerTool({\n name: \"search_notes\", // snake_case, unique\n label: \"Search Notes\", // human-readable label for TUI\n description: \"Full-text search through project notes\",\n parameters: z.object({\n query: z.string().describe(\"Search query\"),\n limit: z.number().default(10).describe(\"Max results\").optional(),\n }),\n async execute(toolCallId, params, signal, onUpdate, ctx) {\n if (signal?.aborted) {\n return { content: [{ type: \"text\", text: \"Cancelled\" }] };\n }\n onUpdate?.({ content: [{ type: \"text\", text: \"Searching...\" }] });\n // ... do work ...\n return {\n content: [{ type: \"text\", text: `Found N results for \"${params.query}\"` }],\n details: { query: params.query, count: 0 },\n };\n },\n});\n```\n\n## Subscribing to events\n\n```ts\npi.on(\"tool_call\", async (event, ctx) => {\n // event.toolName, event.input, event.toolCallId\n if (event.toolName !== \"bash\") return;\n\n const command = String((event.input as { command?: unknown }).command ?? \"\");\n if (command.includes(\"rm -rf /\")) {\n return { block: true, reason: \"Blocked by safety policy\" };\n }\n});\n\npi.on(\"turn_end\", async (_event, ctx) => {\n ctx.ui.setStatus(\"tokens\", `~${ctx.getContextUsage()?.tokens ?? \"?\"} tokens`);\n});\n```\n\nFull event catalog: see [hooks authoring guide](./authoring-hooks.md).\n\n## Extension vs hook — when to use which\n\n| Need | Use |\n|---|---|\n| Tools + commands + events in one module | **Extension** (`ExtensionAPI`) |\n| Pure event interception (policy, redaction) | **Extension** or **Hook** (both work; extension is preferred) |\n| Legacy hook module already exists | **Hook** (`HookAPI` from `@oh-my-pi/pi-coding-agent/extensibility/hooks`) |\n| Registering provider / custom message renderer | **Extension only** |\n| Shipping as a marketplace plugin | **Extension** (use `package.json` manifest) |\n\nExtensions are a strict superset of hooks. New authoring should use `ExtensionAPI`.\n\n## Debugging\n\nStart omp with `--log-level debug` to see extension load diagnostics:\n\n```\nomp --log-level debug\n```\n\nFailed extension loads are logged with their path and error. Loaded extensions may also emit their own debug logs via `pi.logger`.\n\nTo temporarily disable a specific extension module by name without removing the file:\n\n```yaml\n# ~/.omp/agent/config.yml\ndisabledExtensions:\n - extension-module:my-ext\n```\n\nThe derived name is the filename stem (or directory name for `index.ts`-style entries): `/path/to/my-ext.ts` → `my-ext`.\n\n## Important constraints\n\n- **Do not call runtime actions during load.** Methods like `pi.sendMessage()` throw `ExtensionRuntimeNotInitializedError` if called synchronously during module evaluation (before a session is active). Register handlers/tools/commands during load; perform runtime actions only from event handlers, tools, or commands.\n- **`tool_call` errors are fail-closed.** If a `tool_call` handler throws, the tool is blocked.\n- **Command names must not clash with built-ins.** Conflicts are skipped with a diagnostic log.\n- **Reserved shortcuts are ignored** (`ctrl+c`, `ctrl+d`, `ctrl+z`, `ctrl+k`, `ctrl+p`, `ctrl+l`, `ctrl+o`, `ctrl+t`, `ctrl+g`, `ctrl+q`, `alt+m`, `shift+tab`, `shift+ctrl+p`, `alt+enter`, `escape`, `enter`).\n\n## Further reading\n\n- `docs/extensions.md` — runtime internals and full API surface reference\n- `docs/extension-loading.md` — detailed path resolution rules\n- `docs/hooks.md` — hook subsystem internals\n- `docs/skills/examples/hello-extension/` — complete working example\n",
67
68
  "skills/authoring-hooks.md": "---\nname: authoring-hooks\ndescription: Use when creating a new omp hook. Covers HookAPI, event catalog, blocking/overriding tool calls, and context modification.\n---\n\n# Authoring Hooks\n\nHooks are event-driven interceptors that run alongside the agent loop. They are best used for cross-cutting concerns: safety policy, secret redaction, context pruning, audit logging. A hook module registers handlers via `pi.on(event, handler)` and can block tool execution, override tool output, or rewrite the message context before each LLM call.\n\n> **Relationship to extensions:** The hook subsystem (`HookAPI`) is the legacy API. The extension runner now handles everything hooks can do plus more. `ExtensionAPI` supports the hook event model plus extension-only events. Use `ExtensionAPI` for new work; use `HookAPI` only if you are maintaining an existing hook module.\n\n## Factory signature\n\n```ts\nimport type { HookAPI } from \"@oh-my-pi/pi-coding-agent/extensibility/hooks\";\n\nexport default function myHook(omp: HookAPI): void {\n omp.on(\"tool_call\", async (event, ctx) => {\n // intercept every tool call\n });\n}\n```\n\nThe default export must be a plain function (not async, not a class). It receives a `HookAPI` instance and must register all handlers synchronously during execution.\n\nAlternatively, using `ExtensionAPI` (preferred):\n\n```ts\nimport type { ExtensionAPI } from \"@oh-my-pi/pi-coding-agent\";\n\nexport default function myExtension(pi: ExtensionAPI): void {\n pi.on(\"tool_call\", async (event, ctx) => { /* ... */ });\n}\n```\n\n## Event catalog\n\n### Tool lifecycle\n\n| Event | Fires | Can return |\n|---|---|---|\n| `tool_call` | Before every tool execution | `{ block?: boolean; reason?: string }` |\n| `tool_result` | After every tool execution | `{ content?; details?; isError?: boolean }` |\n\n### Session lifecycle\n\n| Event | Fires | Can return |\n|---|---|---|\n| `session_start` | On initial session load | — |\n| `session_before_switch` | Before session switch | `{ cancel?: boolean }` |\n| `session_switch` | After session switch | — |\n| `session_before_branch` | Before session branch | `{ cancel?: boolean; skipConversationRestore?: boolean }` |\n| `session_branch` | After session branch | — |\n| `session_before_compact` | Before compaction | `{ cancel?: boolean; compaction?: CompactionResult }` |\n| `session.compacting` | During compaction (inject context) | `{ context?: string[]; prompt?: string; preserveData?: Record<string, unknown> }` |\n| `session_compact` | After compaction | — |\n| `session_before_tree` | Before tree navigation | `{ cancel?: boolean; summary?: { summary: string; details?: unknown } }` |\n| `session_tree` | After tree navigation | — |\n| `session_shutdown` | On session shutdown | — |\n\n### Agent/turn lifecycle\n\n| Event | Fires | Can return |\n|---|---|---|\n| `before_agent_start` | Before agent starts a turn | `{ message?: { customType; content; display; details; attribution? } }` |\n| `agent_start` | Agent streaming starts | — |\n| `agent_end` | Agent streaming ends | — |\n| `turn_start` | Start of a user→agent turn | — |\n| `turn_end` | End of a user→agent turn | — |\n| `context` | Before each LLM API call | `{ messages?: Message[] }` |\n| `auto_compaction_start` | Auto-compaction begins | — |\n| `auto_compaction_end` | Auto-compaction ends | — |\n| `auto_retry_start` | Auto-retry begins | — |\n| `auto_retry_end` | Auto-retry ends | — |\n| `ttsr_triggered` | TTSR (too-short response) triggered | — |\n| `todo_reminder` | Todo reminder fires | — |\n\nExtension-only events such as `tool_execution_start`, `tool_execution_update`, `tool_execution_end`, `input`, `user_bash`, and `user_python` require `ExtensionAPI`.\n\n## Pre-tool blocking contract\n\nReturn `{ block: true, reason: \"...\" }` from a `tool_call` handler to prevent execution:\n\n```ts\nomp.on(\"tool_call\", async (event, ctx) => {\n if (event.toolName === \"bash\") {\n const cmd = String(event.input.command ?? \"\");\n if (/\\brm\\s+-rf\\s+\\//.test(cmd)) {\n return { block: true, reason: \"Refusing to delete root filesystem\" };\n }\n }\n});\n```\n\nContract:\n\n- If **any** handler returns `{ block: true }`, execution stops immediately.\n- `reason` is returned to the LLM as the tool error text.\n- If a handler **throws**, the tool is also blocked (fail-closed).\n- Last non-blocking return wins for non-blocking results; first `block: true` short-circuits.\n\n## Post-tool override contract\n\nReturn `{ content, details, isError }` from a `tool_result` handler to patch what the LLM sees:\n\n```ts\nomp.on(\"tool_result\", async (event, ctx) => {\n if (event.toolName === \"read\" && !event.isError) {\n const redacted = event.content.map(chunk => {\n if (chunk.type !== \"text\") return chunk;\n return {\n ...chunk,\n text: chunk.text.replace(/(?:sk|pk)-[a-zA-Z0-9]{20,}/g, \"[REDACTED_API_KEY]\"),\n };\n });\n return { content: redacted };\n }\n});\n```\n\nContract:\n\n- Handlers run in registration order. For `HookAPI`, each handler receives the original tool result event, and the last returned override wins.\n- `content` replaces the full content array for the LLM.\n- `details` replaces the structured details object.\n- `isError` exists on the shared result type, but `HookToolWrapper` does not propagate it into a successful tool result; on a tool failure, the original error is rethrown after handlers complete.\n- On a tool failure, `tool_result` is still emitted with `isError: true`.\n\n## Context modification contract\n\nReturn `{ messages: [...] }` from a `context` handler to rewrite the message list before each LLM API call:\n\n```ts\nomp.on(\"context\", async (event, ctx) => {\n // Remove debug-only custom messages from LLM context\n const filtered = event.messages.filter(\n msg => !(msg.role === \"custom\" && msg.customType === \"debug-only\")\n );\n return { messages: filtered };\n});\n```\n\nContract:\n\n- `event.messages` is the current accumulated list.\n- Handlers run in order; each receives the output of the previous handler.\n- Return `undefined` (or nothing) to pass messages through unmodified.\n\n## Three complete examples\n\n### 1. rm-rf blocker\n\n```ts\nimport type { HookAPI } from \"@oh-my-pi/pi-coding-agent/extensibility/hooks\";\n\nexport default function rmRfBlocker(omp: HookAPI): void {\n omp.on(\"tool_call\", async (event, ctx) => {\n if (event.toolName !== \"bash\") return;\n\n const cmd = String(event.input.command ?? \"\");\n if (!/\\brm\\s+-rf\\s+\\//.test(cmd)) return;\n\n // Allow if user explicitly confirms (interactive mode only)\n if (ctx.hasUI) {\n const allow = await ctx.ui.confirm(\n \"Dangerous command\",\n `This command deletes from root:\\n${cmd}\\n\\nProceed?`\n );\n if (allow) return;\n }\n\n return { block: true, reason: \"rm -rf / blocked by safety policy\" };\n });\n}\n```\n\n### 2. API-key redactor\n\n```ts\nimport type { HookAPI } from \"@oh-my-pi/pi-coding-agent/extensibility/hooks\";\n\n// Common API-key shapes. Not exhaustive — providers using bespoke formats\n// (Anthropic `sk-ant-…`, JWT-style bearers, gateway-specific prefixes, etc.)\n// need their own entries.\nconst SECRET_PATTERNS = [\n /\\b(sk|pk)-[a-zA-Z0-9]{20,}\\b/g,\n /\\bAKIA[A-Z0-9]{16}\\b/g,\n /\\bghp_[a-zA-Z0-9]{36}\\b/g,\n // Zhipu / GLM Coding Plan: `<id>.<secret>` (no `sk-` prefix).\n /\\b[a-zA-Z0-9]{16,}\\.[a-zA-Z0-9]{16,}\\b/g,\n /\\b[a-zA-Z0-9_-]{20,}\\s*=\\s*[\"']?[a-zA-Z0-9._/+=-]{20,}[\"']?/g,\n];\n\nexport default function apiKeyRedactor(omp: HookAPI): void {\n omp.on(\"tool_result\", async (event) => {\n if (event.isError) return;\n\n let changed = false;\n const redacted = event.content.map(chunk => {\n if (chunk.type !== \"text\") return chunk;\n let text = chunk.text;\n for (const pattern of SECRET_PATTERNS) {\n const next = text.replace(pattern, \"[REDACTED]\");\n if (next !== text) { changed = true; text = next; }\n }\n return { ...chunk, text };\n });\n\n if (changed) return { content: redacted };\n });\n}\n```\n\n### 3. Context filter\n\n```ts\nimport type { HookAPI } from \"@oh-my-pi/pi-coding-agent/extensibility/hooks\";\n\nexport default function contextFilter(omp: HookAPI): void {\n omp.on(\"context\", async (event) => {\n const MAX_TOOL_OUTPUT_CHARS = 8_000;\n\n const trimmed = event.messages.map(msg => {\n // Truncate very large tool results to keep context manageable\n if (msg.role !== \"tool\") return msg;\n const content = msg.content.map(chunk => {\n if (chunk.type !== \"text\" || chunk.text.length <= MAX_TOOL_OUTPUT_CHARS) return chunk;\n return {\n ...chunk,\n text: chunk.text.slice(0, MAX_TOOL_OUTPUT_CHARS) + \"\\n[... truncated by context-filter hook]\",\n };\n });\n return { ...msg, content };\n });\n\n return { messages: trimmed };\n });\n}\n```\n\n## UI methods in hook context\n\n`ctx.ui` is a `HookUIContext`. Available methods:\n\n| Method | Description |\n|---|---|\n| `notify(message, type?)` | Show an in-app notification |\n| `setStatus(key, text)` | Set footer status text (keyed, sorted by key) |\n| `select(title, options)` | Show a selection dialog |\n| `confirm(title, message)` | Show a yes/no dialog |\n| `input(title, placeholder?)` | Show a text input dialog |\n| `editor(title, prefill?, { signal }?, { promptStyle }?)` | Show a multi-line editor |\n| `setEditorText(text)` | Set the input editor content |\n| `getEditorText()` | Get current input editor content |\n| `custom(factory)` | Render a custom TUI component |\n| `theme` | Current theme object |\n\nPass `{ promptStyle: true }` as the fourth argument when Enter should submit and Shift+Enter should insert a newline. The default hook editor behavior keeps Enter as newline and Ctrl+Enter as submit.\n\n`ctx.hasUI` is `false` in headless/print/subagent mode — always guard interactive calls.\n\n## Further reading\n\n- `docs/hooks.md` — hook subsystem internals, ordering rules, error propagation\n- `docs/extensions.md` — `ExtensionAPI` (superset of `HookAPI`)\n- `docs/skills/examples/safety-hook/` — complete working example\n",
@@ -75,8 +76,8 @@ export const EMBEDDED_DOCS: Readonly<Record<string, string>> = {
75
76
  "theme.md": "# Theming Reference\n\nThis document describes how theming works in the coding-agent today: schema, loading, runtime behavior, and failure modes.\n\n## What the theme system controls\n\nThe theme system drives:\n\n- foreground/background color tokens used across the TUI\n- markdown styling adapters (`getMarkdownTheme()`)\n- selector/editor/settings list adapters (`getSelectListTheme()`, `getEditorTheme()`, `getSettingsListTheme()`)\n- symbol preset + symbol overrides (`unicode`, `nerd`, `ascii`)\n- syntax highlighting colors used by native highlighter (`@oh-my-pi/pi-natives`)\n- status line segment colors\n\nPrimary implementation: `src/modes/theme/theme.ts`.\n\n## Theme JSON shape\n\nTheme files are JSON objects validated against the runtime schema in `theme.ts` (`themeJsonSchema`) and mirrored by `src/modes/theme/theme-schema.json`.\n\nTop-level fields:\n\n- `name` (required)\n- `colors` (required; all color tokens required)\n- `vars` (optional; reusable color variables)\n- `export` (optional; HTML export colors)\n- `symbols` (optional)\n - `preset` (optional: `unicode | nerd | ascii`)\n - `overrides` (optional: key/value overrides for `SymbolKey`)\n\nColor values accept:\n\n- hex string (`\"#RRGGBB\"`)\n- 256-color index (`0..255`)\n- variable reference string (resolved through `vars`)\n- empty string (`\"\"`) meaning terminal default (`\\x1b[39m` fg, `\\x1b[49m` bg)\n\n## Required color tokens (current)\n\nAll tokens below are required in `colors`.\n\n### Core text and borders (11)\n\n`accent`, `border`, `borderAccent`, `borderMuted`, `success`, `error`, `warning`, `muted`, `dim`, `text`, `thinkingText`\n\n### Background blocks (7)\n\n`selectedBg`, `userMessageBg`, `customMessageBg`, `toolPendingBg`, `toolSuccessBg`, `toolErrorBg`, `statusLineBg`\n\n### Message/tool text (5)\n\n`userMessageText`, `customMessageText`, `customMessageLabel`, `toolTitle`, `toolOutput`\n\n### Markdown (10)\n\n`mdHeading`, `mdLink`, `mdLinkUrl`, `mdCode`, `mdCodeBlock`, `mdCodeBlockBorder`, `mdQuote`, `mdQuoteBorder`, `mdHr`, `mdListBullet`\n\n### Tool diff + syntax highlighting (12)\n\n`toolDiffAdded`, `toolDiffRemoved`, `toolDiffContext`,\n`syntaxComment`, `syntaxKeyword`, `syntaxFunction`, `syntaxVariable`, `syntaxString`, `syntaxNumber`, `syntaxType`, `syntaxOperator`, `syntaxPunctuation`\n\n### Mode/thinking borders (8)\n\n`thinkingOff`, `thinkingMinimal`, `thinkingLow`, `thinkingMedium`, `thinkingHigh`, `thinkingXhigh`, `bashMode`, `pythonMode`\n\n### Status line segment colors (13)\n\n`statusLineSep`, `statusLineModel`, `statusLinePath`, `statusLineGitClean`, `statusLineGitDirty`, `statusLineContext`, `statusLineSpend`, `statusLineStaged`, `statusLineDirty`, `statusLineUntracked`, `statusLineOutput`, `statusLineCost`, `statusLineSubagents`\n\n## Optional tokens\n\n### `export` section (optional)\n\nUsed for HTML export theming helpers:\n\n- `export.pageBg`\n- `export.cardBg`\n- `export.infoBg`\n\nIf omitted, export code derives defaults from resolved theme colors.\n\n### `symbols` section (optional)\n\n- `symbols.preset` sets a theme-level default symbol set.\n- `symbols.overrides` can override individual `SymbolKey` values.\n- `symbols.spinnerFrames` overrides the loading spinner frames. Accepts either a flat `string[]` (applied to both spinner types) or an object `{ \"status\"?: string[], \"activity\"?: string[] }` to override each type independently. Any type not specified falls back to the symbol preset's default frames. `status` drives the ~12.5fps spinner used by loaders and tool-execution indicators; `activity` drives the ~30fps spinner used by markdown progress bars and similar high-frequency UI.\n\nRuntime precedence:\n\n1. settings `symbolPreset` override (if set)\n2. theme JSON `symbols.preset`\n3. fallback `\"unicode\"`\n\nInvalid override keys are ignored and logged (`logger.debug`).\n\n## Built-in vs custom theme sources\n\nTheme lookup order (`loadThemeJson`):\n\n1. built-in embedded themes (`dark.json`, `light.json`, and all `defaults/*.json` compiled into `defaultThemes`)\n2. custom theme file: `<customThemesDir>/<name>.json`\n\nCustom themes directory comes from `getCustomThemesDir()`:\n\n- default: `~/.omp/agent/themes`\n- overridden by `PI_CODING_AGENT_DIR` (`$PI_CODING_AGENT_DIR/themes`)\n\n`getAvailableThemes()` returns merged built-in + custom names, sorted, with built-ins taking precedence on name collision.\n\n## Loading, validation, and resolution\n\nFor custom theme files:\n\n1. read JSON\n2. parse JSON\n3. validate against `ThemeJsonSchema`\n4. resolve `vars` references recursively\n5. convert resolved values to ANSI by terminal capability mode\n\nValidation behavior:\n\n- missing required color tokens: explicit grouped error message\n- bad token types/values: validation errors with JSON path\n- unknown theme file: `Theme not found: <name>`\n\nVar reference behavior:\n\n- supports nested references\n- throws on missing variable reference\n- throws on circular references\n\n## Terminal color mode behavior\n\nColor mode detection (`detectColorMode`):\n\n- `COLORTERM=truecolor|24bit` => truecolor\n- `WT_SESSION` => truecolor\n- `TERM` in `dumb`, `linux`, or empty => 256color\n- otherwise => truecolor\n\nConversion behavior:\n\n- hex -> `Bun.color(..., \"ansi-16m\" | \"ansi-256\")`\n- numeric -> `38;5` / `48;5` ANSI\n- `\"\"` -> default fg/bg reset\n\n## Runtime switching behavior\n\n### Initial theme (`initTheme`)\n\n`main.ts` initializes theme with settings:\n\n- `symbolPreset`\n- `colorBlindMode`\n- `theme.dark`\n- `theme.light`\n\nAuto theme slot selection uses terminal appearance in this order:\n\n1. terminal-reported OSC 11 background luminance, unless the macOS/Zellij fallback path is active\n2. `COLORFGBG` background index (`< 8` => dark, `>= 8` => light)\n3. macOS appearance fallback only for the known-broken macOS/Zellij OSC 11 path\n4. dark slot fallback\n\nCurrent defaults from settings schema:\n\n- `theme.dark = \"titanium\"`\n- `theme.light = \"light\"`\n- `symbolPreset = \"unicode\"`\n- `colorBlindMode = false`\n\n### Explicit switching (`setTheme`)\n\n- loads selected theme\n- updates global `theme` singleton\n- optionally starts watcher\n- triggers `onThemeChange` callback\n\nOn failure:\n\n- falls back to built-in `dark`\n- returns `{ success: false, error }`\n\n### Preview switching (`previewTheme`)\n\n- applies temporary preview theme to global `theme`\n- does **not** change persisted settings by itself\n- returns success/error without fallback replacement\n\nSettings UI uses this for live preview and restores prior theme on cancel.\n\n## Watchers and live reload\n\nWhen watcher is enabled (`setTheme(..., true)` / interactive init):\n\n- watches `<customThemesDir>/<currentTheme>.json` only when that file exists\n- built-ins are effectively not watched; built-in theme lookup also takes precedence over same-name custom files\n- matching file changes schedule a debounced reload; reload errors or temporary file absence keep the last successfully loaded theme\n- the watcher does not perform a delete/rename fallback; it waits for a future successful reload or explicit theme switch\n\nAuto mode also reevaluates dark/light slot mapping from terminal appearance changes, `SIGWINCH`, and the macOS fallback observer when active.\n\n## Color-blind mode behavior\n\n`colorBlindMode` changes only one token at runtime:\n\n- `toolDiffAdded` is HSV-adjusted (green shifted toward blue)\n- adjustment is applied only when resolved value is a hex string\n\nOther tokens are unchanged.\n\n## Where theme settings are persisted\n\nTheme-related settings are persisted by `Settings` to global config YAML:\n\n- path: `<agentDir>/config.yml`\n- default agent dir: `~/.omp/agent`\n- effective default file: `~/.omp/agent/config.yml`\n\nPersisted keys:\n\n- `theme.dark`\n- `theme.light`\n- `symbolPreset`\n- `colorBlindMode`\n\nLegacy migration exists: old flat `theme: \"name\"` is migrated to nested `theme.dark` or `theme.light` based on luminance detection.\n\n## Creating a custom theme (practical)\n\n1. Create file in custom themes dir, e.g. `~/.omp/agent/themes/my-theme.json`.\n2. Include `name`, optional `vars`, and **all required** `colors` tokens.\n3. Optionally include `symbols` and `export`.\n4. Select the theme in Settings (`Appearance -> Dark Theme` or `Appearance -> Light Theme`) depending on which auto slot you want.\n\nMinimal skeleton:\n\n```json\n{\n \"name\": \"my-theme\",\n \"vars\": {\n \"accent\": \"#7aa2f7\",\n \"muted\": 244\n },\n \"colors\": {\n \"accent\": \"accent\",\n \"border\": \"#4c566a\",\n \"borderAccent\": \"accent\",\n \"borderMuted\": \"muted\",\n \"success\": \"#9ece6a\",\n \"error\": \"#f7768e\",\n \"warning\": \"#e0af68\",\n \"muted\": \"muted\",\n \"dim\": 240,\n \"text\": \"\",\n \"thinkingText\": \"muted\",\n\n \"selectedBg\": \"#2a2f45\",\n \"userMessageBg\": \"#1f2335\",\n \"userMessageText\": \"\",\n \"customMessageBg\": \"#24283b\",\n \"customMessageText\": \"\",\n \"customMessageLabel\": \"accent\",\n \"toolPendingBg\": \"#1f2335\",\n \"toolSuccessBg\": \"#1f2d2a\",\n \"toolErrorBg\": \"#2d1f2a\",\n \"toolTitle\": \"\",\n \"toolOutput\": \"muted\",\n\n \"mdHeading\": \"accent\",\n \"mdLink\": \"accent\",\n \"mdLinkUrl\": \"muted\",\n \"mdCode\": \"#c0caf5\",\n \"mdCodeBlock\": \"#c0caf5\",\n \"mdCodeBlockBorder\": \"muted\",\n \"mdQuote\": \"muted\",\n \"mdQuoteBorder\": \"muted\",\n \"mdHr\": \"muted\",\n \"mdListBullet\": \"accent\",\n\n \"toolDiffAdded\": \"#9ece6a\",\n \"toolDiffRemoved\": \"#f7768e\",\n \"toolDiffContext\": \"muted\",\n\n \"syntaxComment\": \"#565f89\",\n \"syntaxKeyword\": \"#bb9af7\",\n \"syntaxFunction\": \"#7aa2f7\",\n \"syntaxVariable\": \"#c0caf5\",\n \"syntaxString\": \"#9ece6a\",\n \"syntaxNumber\": \"#ff9e64\",\n \"syntaxType\": \"#2ac3de\",\n \"syntaxOperator\": \"#89ddff\",\n \"syntaxPunctuation\": \"#9aa5ce\",\n\n \"thinkingOff\": 240,\n \"thinkingMinimal\": 244,\n \"thinkingLow\": \"#7aa2f7\",\n \"thinkingMedium\": \"#2ac3de\",\n \"thinkingHigh\": \"#bb9af7\",\n \"thinkingXhigh\": \"#f7768e\",\n\n \"bashMode\": \"#2ac3de\",\n \"pythonMode\": \"#bb9af7\",\n\n \"statusLineBg\": \"#16161e\",\n \"statusLineSep\": 240,\n \"statusLineModel\": \"#bb9af7\",\n \"statusLinePath\": \"#7aa2f7\",\n \"statusLineGitClean\": \"#9ece6a\",\n \"statusLineGitDirty\": \"#e0af68\",\n \"statusLineContext\": \"#2ac3de\",\n \"statusLineSpend\": \"#7dcfff\",\n \"statusLineStaged\": \"#9ece6a\",\n \"statusLineDirty\": \"#e0af68\",\n \"statusLineUntracked\": \"#f7768e\",\n \"statusLineOutput\": \"#c0caf5\",\n \"statusLineCost\": \"#ff9e64\",\n \"statusLineSubagents\": \"#bb9af7\"\n }\n}\n```\n\n## Testing custom themes\n\nUse this workflow:\n\n1. Start interactive mode (watcher enabled from startup).\n2. Open settings and preview theme values (live `previewTheme`).\n3. For custom theme files, edit the JSON while running and confirm auto-reload on save.\n4. Exercise critical surfaces:\n - markdown rendering\n - tool blocks (pending/success/error)\n - diff rendering (added/removed/context)\n - status line readability\n - thinking level border changes\n - bash/python mode border colors\n5. Validate both symbol presets if your theme depends on glyph width/appearance.\n\n## Real constraints and caveats\n\n- All `colors` tokens are required for custom themes.\n- `export` and `symbols` are optional.\n- `$schema` in theme JSON is informational; runtime validation is enforced by a Zod schema in code.\n- `setTheme` failure falls back to `dark`; `previewTheme` failure does not replace current theme.\n- File watcher reload errors or temporary missing files keep the current loaded theme until a successful reload or explicit theme switch.\n",
76
77
  "toolconv/anthropic.md": "# Anthropic Claude tool use (Messages API content blocks)\n\nAnthropic's Claude is a closed, hosted model family; there are no released weights and therefore no `--tool-call-parser` flag to set. The canonical tool-calling convention is the **Messages API** (`POST /v1/messages`, header `anthropic-version: 2023-06-01`): tools are advertised in a top-level `tools` array, the model returns structured `tool_use` **content blocks** with `stop_reason: \"tool_use\"`, and you feed results back as `tool_result` content blocks inside a `user` message. Tool use is \"enabled\" simply by including the `tools` parameter (optionally with `tool_choice`); the API then injects a tool-use system prompt and parses the model's output back into JSON blocks for you. This applies to all current models (Claude Opus / Sonnet / Haiku 3.x, 4, 4.x) and is mirrored by gateways such as LiteLLM and by third-party Claude-compatible servers.\n\nUnder the hood the model is trained to emit an **XML** function-call syntax (`<function_calls>` / `<invoke>` / `<parameter>`); the API serializes your JSON-Schema tools into a system prompt and converts the model's XML output into JSON `tool_use` blocks. That underlying format is documented as the *secondary* convention below, together with the older, now-retired prompt-based **legacy XML** format (`<tool_name>` / `<parameters>` / `<function_results>`) that pre-dates the Messages API and still surfaces when you do tool use purely through prompting.\n\nThe primary, authoritative shape for any parser/renderer is the JSON content-block format. The XML is informational (and the only thing visible if you reconstruct prompts at the token level).\n\n---\n\n## Content-block types & stop reasons\n\nAnthropic has no token-level tool delimiters in the public API. The unit is the **content block**: every `message.content` is an array of typed blocks. Tool calling adds two block types and one stop reason; streaming adds a delta type.\n\n| Item | Where | Shape / meaning |\n| --- | --- | --- |\n| `text` block | assistant & user | `{\"type\":\"text\",\"text\":\"...\"}`. Plain prose. Assistant may emit text *before* its tool calls. |\n| `tool_use` block | assistant | `{\"type\":\"tool_use\",\"id\":\"toolu_...\",\"name\":\"<tool>\",\"input\":{...}}`. The function call. `input` is a **nested JSON object** (already parsed), conforming to the tool's `input_schema`. |\n| `tool_result` block | user | `{\"type\":\"tool_result\",\"tool_use_id\":\"toolu_...\",\"content\":<string \\| block[]>,\"is_error\":<bool?>}`. The executed result, sent back in a `user` message. |\n| `server_tool_use` block | assistant | `{\"type\":\"server_tool_use\",\"id\":\"srvtoolu_...\",\"name\":\"web_search\",\"input\":{...}}`. Emitted for Anthropic-executed server tools; you do **not** return a `tool_result` for these. |\n| `web_search_tool_result` (and similar) | assistant | Server-tool output, injected by Anthropic inline in the assistant turn. |\n| `thinking` / `redacted_thinking` block | assistant | Extended-thinking reasoning blocks; carry a `signature`. Must be preserved verbatim across turns when thinking + tools are combined. |\n| `stop_reason: \"tool_use\"` | response top level | The model invoked one or more tools and is waiting for results. Drives the agentic loop. |\n| `stop_reason: \"end_turn\"` | response top level | Natural completion (no tool call); the loop exits. |\n| Other `stop_reason` | response top level | `\"max_tokens\"`, `\"stop_sequence\"`, `\"pause_turn\"` (long server-tool turn, resend as-is to continue), `\"refusal\"`. |\n| `id` prefixes | — | Messages `msg_…`; client tool calls `toolu_…`; server tool calls `srvtoolu_…`. |\n\nStreaming adds these SSE events / delta types (full list under [Roles / channels](#roles--channels--turn-structure) and [Tool-call format](#tool-call-format)):\n\n| Streaming item | Shape / meaning |\n| --- | --- |\n| `message_start` | Carries a `Message` skeleton with empty `content`, `stop_reason: null`. |\n| `content_block_start` | Opens a block at `index`. For a tool call: `content_block.{type:\"tool_use\",id,name,input:{}}` — `input` starts as an **empty object**. |\n| `content_block_delta` / `input_json_delta` | `{\"type\":\"input_json_delta\",\"partial_json\":\"<chunk>\"}` — a **partial JSON string** fragment of `tool_use.input`. |\n| `content_block_delta` / `text_delta` | `{\"type\":\"text_delta\",\"text\":\"...\"}`. |\n| `content_block_delta` / `thinking_delta`, `signature_delta` | Extended-thinking content / signature. |\n| `content_block_stop` | Closes the block at `index`; this is when accumulated `partial_json` is complete and safe to `JSON.parse`. |\n| `message_delta` | Top-level updates; carries the final `delta.stop_reason` (e.g. `\"tool_use\"`) and **cumulative** `usage`. |\n| `message_stop` | End of stream. |\n| `ping` / `error` | Keep-alive; `error` (e.g. `overloaded_error`) may appear mid-stream. |\n\n### Legacy XML tags (prompt-based, pre-Messages-API)\n\nThe retired prompt-based format used these tags. They are nested-element tags (no attributes), distinct from the modern attribute form (`<invoke name=\"…\">`). Verified against Anthropic's archived \"Legacy tool use\" doc (see [Sources](#sources)).\n\n| Tag | Role | Notes |\n| --- | --- | --- |\n| `<tools>` … `</tools>` | tool advertising | Container in the system prompt wrapping all `<tool_description>` entries. |\n| `<tool_description>` | tool advertising | One per tool: holds `<tool_name>`, `<description>`, `<parameters>`. |\n| `<tool_name>` | both | Function name (used in definitions, calls, and results). |\n| `<parameters>` / `<parameter>` | definition | `<parameters>` wraps `<parameter>` entries, each with `<name>`, `<type>`, `<description>`. |\n| `<function_calls>` | model output | Wraps one or more `<invoke>` blocks. |\n| `<invoke>` | model output | One function call; contains `<tool_name>` + a `<parameters>` block of `<paramName>value</paramName>` child tags. |\n| `<function_results>` | tool result (fed back) | Wraps `<result>` (success) or `<error>` (failure). |\n| `<result>` / `<stdout>` | tool result | `<result>` holds `<tool_name>` + `<stdout>`; the output text goes in `<stdout>`. |\n| `<error>` | tool result | Replaces `<result>` when the function raised. |\n| `</function_calls>` | stop sequence | Passed as `stop_sequence` so generation halts after a call. |\n| `<scratchpad>` / `<answer>` | model output | Conventionally used for chain-of-thought and final answer in legacy prompts. |\n\n---\n\n## Roles / channels / turn structure\n\nThe Messages API uses only two conversational roles, `user` and `assistant`, alternating. There is **no** dedicated `tool`/`function` role and **no** top-level `system` role — the system prompt is a separate top-level `system` parameter (string or text-block array). Tool data rides inside the normal roles:\n\n- `assistant` messages contain AI-generated `text`, `thinking`, and `tool_use` (and `server_tool_use`) blocks.\n- `user` messages contain your `text`/`image`/`document` content and `tool_result` blocks.\n\nThere are no named \"channels\". The closest analogue to a reasoning channel is the extended-thinking `thinking` content block (a first-class block with a cryptographic `signature`), kept separate from the user-visible `text` block. When thinking is enabled alongside tools, the `thinking` block(s) from a tool-calling turn must be passed back unmodified in the follow-up request.\n\nThe agentic loop is keyed on `stop_reason`:\n\n1. Send `tools` + the user message.\n2. Claude responds with `stop_reason: \"tool_use\"` and one or more `tool_use` blocks (optionally preceded by a `text` block).\n3. Execute each tool; build a `tool_result` block per call.\n4. Append the assistant message **and** a `user` message carrying all `tool_result` blocks; resend.\n5. Repeat while `stop_reason == \"tool_use\"`; exit on `end_turn` (or another terminal reason).\n\nStrict ordering rules (a 400 otherwise):\n- `tool_result` blocks must come **first** in the `user` message's `content` array (any text after them).\n- The `tool_result` `user` message must **immediately follow** the assistant `tool_use` message — nothing in between.\n- Every `tool_use.id` must be answered by a `tool_result.tool_use_id` in that next message.\n\n---\n\n## Tool definitions\n\nTools are passed in the top-level `tools` array. Each user-defined (client) tool is a **flat** object — no `{\"type\":\"function\", \"function\":{…}}` wrapper (that wrapper is OpenAI's). Fields:\n\n- `name` — matches `^[a-zA-Z0-9_-]{1,64}$`.\n- `description` — detailed plaintext (the single biggest driver of tool-call quality).\n- `input_schema` — a JSON Schema object (**not** `parameters`) describing the input the model must produce.\n- Optional: `input_examples`, `cache_control`, `strict`, `defer_loading`, `allowed_callers`.\n\n```json\n{\n \"name\": \"get_weather\",\n \"description\": \"Get the current weather in a given location\",\n \"input_schema\": {\n \"type\": \"object\",\n \"properties\": {\n \"location\": {\n \"type\": \"string\",\n \"description\": \"The city and state, e.g. San Francisco, CA\"\n },\n \"unit\": {\n \"type\": \"string\",\n \"enum\": [\"celsius\", \"fahrenheit\"],\n \"description\": \"The unit of temperature, either 'celsius' or 'fahrenheit'\"\n }\n },\n \"required\": [\"location\"]\n }\n}\n```\n\nAnthropic-schema client tools (`bash`, `text_editor`, `computer`, `memory`) and server tools (`web_search`, `web_fetch`, `code_execution`, `tool_search`) instead carry a versioned `type`, e.g. `{\"type\": \"web_search_20250305\", \"name\": \"web_search\"}`.\n\n`tool_choice` controls invocation (four options):\n- `{\"type\":\"auto\"}` — model decides (default when `tools` present).\n- `{\"type\":\"any\"}` — must call some tool.\n- `{\"type\":\"tool\",\"name\":\"get_weather\"}` — must call that specific tool.\n- `{\"type\":\"none\"}` — no tools (default when no `tools`).\n\nWith `any` or `tool` the API prefills the assistant turn, so no leading natural-language text precedes the `tool_use` block. Add `\"disable_parallel_tool_use\": true` inside `tool_choice` to cap at one tool per turn. (Extended thinking only supports `auto`/`none`.)\n\n### How the API turns this into a prompt (the bridge to XML)\n\nWhen `tools` is present, the API constructs a tool-use system prompt with this skeleton (verified from \"Define tools\"):\n\n```text\nIn this environment you have access to a set of tools you can use to answer the user's question.\n{{ FORMATTING INSTRUCTIONS }}\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n{{ TOOL DEFINITIONS IN JSON SCHEMA }}\n{{ USER SYSTEM PROMPT }}\n{{ TOOL CONFIGURATION }}\n```\n\n`{{ TOOL DEFINITIONS IN JSON SCHEMA }}` is your `tools` array serialized to JSON Schema. `{{ FORMATTING INSTRUCTIONS }}` is the (unpublished) block teaching the model the XML syntax with `antml:` namespace prefixes (shown under [Tool-call format → underlying XML](#underlying-xml-modern-attribute-form-with-antml-namespace)). The note \"parsed with regular expressions\" is why output need not be well-formed XML.\n\n---\n\n## Tool-call format\n\nThe wire format your application consumes is JSON. A single call is one `tool_use` content block in the assistant message, with `stop_reason: \"tool_use\"` at the top level:\n\n```json\n{\n \"id\": \"msg_01Aq9w938a90dw8q\",\n \"type\": \"message\",\n \"role\": \"assistant\",\n \"model\": \"claude-opus-4-8\",\n \"content\": [\n {\n \"type\": \"text\",\n \"text\": \"I'll check the current weather in San Francisco for you.\"\n },\n {\n \"type\": \"tool_use\",\n \"id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"name\": \"get_weather\",\n \"input\": { \"location\": \"San Francisco, CA\", \"unit\": \"celsius\" }\n }\n ],\n \"stop_reason\": \"tool_use\",\n \"stop_sequence\": null,\n \"usage\": { \"input_tokens\": 472, \"output_tokens\": 65 }\n}\n```\n\nKey facts for a parser:\n- `tool_use.input` is an already-parsed **object**, never a JSON string.\n- A leading `text` block is optional and informational; do not rely on its wording.\n- Match calls to results by `id` → `tool_use_id`.\n\n### Underlying XML (modern attribute form with antml: namespace)\n\nBefore the API converts it, the model literally emits an XML block. The current (Claude 3+) form is attribute-based:\n\n```text\n<function_calls>\n<invoke name=\"get_weather\">\n<parameter name=\"location\">San Francisco, CA</parameter>\n<parameter name=\"unit\">celsius</parameter>\n</invoke>\n</function_calls>\n```\n\nCurrent Claude models prefix these tags with an `antml:` XML namespace prefix (e.g. `antml:function_calls`, `antml:invoke name=\"…\"`, `antml:parameter name=\"…\"`). The API strips all of this and exposes only the JSON `tool_use` block; integrators should target the JSON, not the XML.\n\n---\n\n## Multiple / parallel tool calls\n\nParallel calls are the default. Claude emits **multiple `tool_use` blocks in a single assistant message**:\n\n```json\n{\n \"role\": \"assistant\",\n \"content\": [\n { \"type\": \"text\", \"text\": \"Let me check both cities.\" },\n {\n \"type\": \"tool_use\",\n \"id\": \"toolu_01weather_sf\",\n \"name\": \"get_weather\",\n \"input\": { \"location\": \"San Francisco, CA\" }\n },\n {\n \"type\": \"tool_use\",\n \"id\": \"toolu_02weather_nyc\",\n \"name\": \"get_weather\",\n \"input\": { \"location\": \"New York, NY\" }\n }\n ]\n}\n```\n\nYou return **all** results in **one** `user` message, one `tool_result` per call, results first:\n\n```json\n{\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"tool_result\",\n \"tool_use_id\": \"toolu_01weather_sf\",\n \"content\": \"San Francisco: 68F, partly cloudy\"\n },\n {\n \"type\": \"tool_result\",\n \"tool_use_id\": \"toolu_02weather_nyc\",\n \"content\": \"New York: 45F, clear skies\"\n }\n ]\n}\n```\n\nCalls in one turn are **unordered** and may be run concurrently. If two batched calls turn out to depend on each other, return the natural error in a `tool_result` with `\"is_error\": true`; Claude reissues the dependent call on a later turn. (In the legacy XML format, parallelism is multiple `<invoke>` blocks inside one `<function_calls>`.)\n\n---\n\n## Tool-result format\n\nA result is a `tool_result` block inside a `user` message:\n\n- `tool_use_id` (required) — the `id` of the `tool_use` it answers.\n- `content` (optional) — a string, **or** an array of `text`/`image`/`document` blocks. Omit for an empty result.\n- `is_error` (optional) — `true` for execution failures; put a useful message in `content`.\n\n```json\n{\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"tool_result\",\n \"tool_use_id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"content\": \"15 degrees\"\n }\n ]\n}\n```\n\nError result:\n\n```json\n{\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"tool_result\",\n \"tool_use_id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"content\": \"ConnectionError: the weather service API is not available (HTTP 500)\",\n \"is_error\": true\n }\n ]\n}\n```\n\nRich result (text + image blocks):\n\n```json\n{\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"tool_result\",\n \"tool_use_id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"content\": [\n { \"type\": \"text\", \"text\": \"15 degrees\" },\n {\n \"type\": \"image\",\n \"source\": { \"type\": \"base64\", \"media_type\": \"image/jpeg\", \"data\": \"/9j/4AAQSkZJRg...\" }\n }\n ]\n }\n ]\n}\n```\n\nServer tools require **no** `tool_result` from you — Anthropic executes them and injects the result inline in the assistant turn. (Legacy XML feeds results back as `<function_results><result><tool_name>…</tool_name><stdout>…</stdout></result></function_results>`, or `<error>…</error>` on failure.)\n\n---\n\n## End-to-end example\n\nA complete multi-turn weather exchange. All JSON is valid.\n\n**Request 1 — system + tools + user question:**\n\n```json\n{\n \"model\": \"claude-opus-4-8\",\n \"max_tokens\": 1024,\n \"system\": \"You are a helpful weather assistant. Use the provided tools to answer.\",\n \"tools\": [\n {\n \"name\": \"get_weather\",\n \"description\": \"Get the current weather in a given location\",\n \"input_schema\": {\n \"type\": \"object\",\n \"properties\": {\n \"location\": { \"type\": \"string\", \"description\": \"The city and state, e.g. San Francisco, CA\" },\n \"unit\": { \"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"], \"description\": \"Unit for the temperature\" }\n },\n \"required\": [\"location\"]\n }\n }\n ],\n \"messages\": [\n { \"role\": \"user\", \"content\": \"What's the weather in San Francisco?\" }\n ]\n}\n```\n\n**Response 1 — assistant requests the tool (`stop_reason: \"tool_use\"`):**\n\n```json\n{\n \"id\": \"msg_01Aq9w938a90dw8q\",\n \"type\": \"message\",\n \"role\": \"assistant\",\n \"model\": \"claude-opus-4-8\",\n \"content\": [\n { \"type\": \"text\", \"text\": \"I'll check the current weather in San Francisco for you.\" },\n {\n \"type\": \"tool_use\",\n \"id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"name\": \"get_weather\",\n \"input\": { \"location\": \"San Francisco, CA\", \"unit\": \"celsius\" }\n }\n ],\n \"stop_reason\": \"tool_use\",\n \"stop_sequence\": null,\n \"usage\": { \"input_tokens\": 472, \"output_tokens\": 65 }\n}\n```\n\n**Request 2 — replay history, append the assistant turn and the `tool_result`:**\n\n```json\n{\n \"model\": \"claude-opus-4-8\",\n \"max_tokens\": 1024,\n \"system\": \"You are a helpful weather assistant. Use the provided tools to answer.\",\n \"tools\": [\n {\n \"name\": \"get_weather\",\n \"description\": \"Get the current weather in a given location\",\n \"input_schema\": {\n \"type\": \"object\",\n \"properties\": {\n \"location\": { \"type\": \"string\", \"description\": \"The city and state, e.g. San Francisco, CA\" },\n \"unit\": { \"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"], \"description\": \"Unit for the temperature\" }\n },\n \"required\": [\"location\"]\n }\n }\n ],\n \"messages\": [\n { \"role\": \"user\", \"content\": \"What's the weather in San Francisco?\" },\n {\n \"role\": \"assistant\",\n \"content\": [\n { \"type\": \"text\", \"text\": \"I'll check the current weather in San Francisco for you.\" },\n {\n \"type\": \"tool_use\",\n \"id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"name\": \"get_weather\",\n \"input\": { \"location\": \"San Francisco, CA\", \"unit\": \"celsius\" }\n }\n ]\n },\n {\n \"role\": \"user\",\n \"content\": [\n {\n \"type\": \"tool_result\",\n \"tool_use_id\": \"toolu_01A09q90qw90lq917835lq9\",\n \"content\": \"15 degrees Celsius, partly cloudy\"\n }\n ]\n }\n ]\n}\n```\n\n**Response 2 — assistant's final answer (`stop_reason: \"end_turn\"`):**\n\n```json\n{\n \"id\": \"msg_01EeFG3hijk2lmno4PqrSt\",\n \"type\": \"message\",\n \"role\": \"assistant\",\n \"model\": \"claude-opus-4-8\",\n \"content\": [\n { \"type\": \"text\", \"text\": \"It's currently 15 degrees Celsius and partly cloudy in San Francisco.\" }\n ],\n \"stop_reason\": \"end_turn\",\n \"stop_sequence\": null,\n \"usage\": { \"input_tokens\": 530, \"output_tokens\": 18 }\n}\n```\n\n### Streaming (SSE) shape of the tool call\n\nThe same tool call, streamed. Note `tool_use` opens with an empty `input`, the arguments arrive as `input_json_delta.partial_json` fragments, and the final `stop_reason` lands in `message_delta`. This block is reproduced verbatim from Anthropic's streaming docs:\n\n```text\nevent: message_start\ndata: {\"type\":\"message_start\",\"message\":{\"id\":\"msg_014p7gG3wDgGV9EUtLvnow3U\",\"type\":\"message\",\"role\":\"assistant\",\"model\":\"claude-opus-4-8\",\"stop_sequence\":null,\"usage\":{\"input_tokens\":472,\"output_tokens\":2},\"content\":[],\"stop_reason\":null}}\n\nevent: content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"text\",\"text\":\"\"}}\n\nevent: ping\ndata: {\"type\": \"ping\"}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Okay\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" let\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"'s\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" check\"}}\n\nevent: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0}\n\nevent: content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":1,\"content_block\":{\"type\":\"tool_use\",\"id\":\"toolu_01T1x1fJ34qAmk2tNTrN7Up6\",\"name\":\"get_weather\",\"input\":{}}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\"\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\"{\\\"location\\\":\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\" \\\"San\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\" Francisc\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\"o,\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":1,\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\" CA\\\"}\"}}\n\nevent: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":1}\n\nevent: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"tool_use\",\"stop_sequence\":null},\"usage\":{\"output_tokens\":89}}\n\nevent: message_stop\ndata: {\"type\":\"message_stop\"}\n```\n\nReassembly: concatenate every `partial_json` for a given `index` (`\"\" + \"{\\\"location\\\":\" + \" \\\"San\" + \" Francisc\" + \"o,\" + \" CA\\\"}\"` → `{\"location\": \"San Francisco, CA\"}`), then `JSON.parse` at that block's `content_block_stop`. Tool use also supports fine-grained streaming (`eager_input_streaming` per tool) for finer `partial_json` chunking.\n\n---\n\n## OpenAI-compatible API mapping\n\nAnthropic integrates tools into the `user`/`assistant` message structure rather than using OpenAI's separate `tool` role and `function` wrapper. Field-by-field:\n\n| Concept | Anthropic Messages API | OpenAI Chat Completions |\n| --- | --- | --- |\n| Tool definition wrapper | flat `{\"name\",\"description\",\"input_schema\"}` in `tools[]` | `{\"type\":\"function\",\"function\":{\"name\",\"description\",\"parameters\"}}` in `tools[]` |\n| Tool schema key | `input_schema` (JSON Schema) | `parameters` (JSON Schema) |\n| \"Must call a tool\" | `tool_choice:{\"type\":\"any\"}` / `{\"type\":\"tool\",\"name\":…}` | `tool_choice:\"required\"` / `{\"type\":\"function\",\"function\":{\"name\":…}}` |\n| Disable parallel calls | `tool_choice:{…,\"disable_parallel_tool_use\":true}` | `parallel_tool_calls:false` (top level) |\n| Assistant call container | `tool_use` **content block** in `content[]` | `tool_calls[]` on the assistant `message` |\n| Call id | `tool_use.id` = `toolu_…` | `tool_calls[].id` = `call_…` |\n| Function name | `tool_use.name` | `tool_calls[].function.name` |\n| Function arguments | `tool_use.input` = **nested JSON object** (parsed) | `tool_calls[].function.arguments` = **JSON string** (must `JSON.parse`) |\n| \"Tools were called\" signal | `stop_reason:\"tool_use\"` | `finish_reason:\"tool_calls\"` |\n| Result message role | `user` message containing `tool_result` block(s) | dedicated `{\"role\":\"tool\",…}` message(s) |\n| Result ↔ call linkage | `tool_result.tool_use_id` | `tool` message `tool_call_id` |\n| Result payload | `tool_result.content` = string **or** block array (text/image/document) | `tool` message `content` = string |\n| Error result | `tool_result` with `is_error:true` | no dedicated flag; encode in `content` |\n| System prompt | top-level `system` param (no `system` role) | `{\"role\":\"system\",…}` message |\n| Streamed args | `input_json_delta.partial_json` fragments | `tool_calls[].function.arguments` string deltas |\n\nConversion gotchas:\n- **Object vs string:** to emit OpenAI shape, `JSON.stringify(tool_use.input)`; to consume OpenAI shape into Anthropic, `JSON.parse(arguments)`.\n- **Role reshaping:** collapse N OpenAI `tool` messages into one Anthropic `user` message of N `tool_result` blocks (order them before any text), and vice-versa.\n- **No `type:\"function\"`** wrapper on Anthropic custom tools; add/remove it when translating.\n- Id prefixes differ (`toolu_` vs `call_`); never assume one format's id is valid in the other.\n\n---\n\n## Parsing notes & gotchas\n\n- **`input` is an object, not a string.** Unlike OpenAI's `arguments`, do not `JSON.parse` `tool_use.input` from a non-streamed response — it is already an object. Only the *streaming* `partial_json` fragments are strings.\n- **Streaming tool args need reassembly.** `content_block_start` for a `tool_use` always has `input: {}`. Buffer `partial_json` per `index` and parse only at `content_block_stop`; mid-stream fragments are not valid JSON on their own (e.g. `{\"location\":`). Current models emit one complete key/value at a time, so expect bursts and gaps.\n- **`stop_reason` placement.** In streaming, `stop_reason` is `null` in `message_start` and final value (`\"tool_use\"`/`\"end_turn\"`) arrives in `message_delta`, not `message_stop`. `usage` in `message_delta` is **cumulative**.\n- **Ordering is enforced.** `tool_result` blocks must be first in their `user` message and must immediately follow the assistant `tool_use` message; every `tool_use.id` needs a matching `tool_result.tool_use_id`, or you get HTTP 400 (\"tool_use ids were found without tool_result blocks immediately after\").\n- **`tool_choice:any`/`tool` suppress preamble.** The API prefills the assistant turn, so no leading `text` block appears before `tool_use` — don't write a parser that expects explanatory text.\n- **Parallel results in one message.** Splitting parallel `tool_result`s across multiple `user` messages breaks the contract; send them together.\n- **Treat result content as untrusted.** Tool results can carry indirect prompt injection; keep them inside `tool_result` blocks, never promote to `system`/`user` text.\n- **Server tools differ.** `server_tool_use` / `web_search_tool_result` blocks are produced and consumed by Anthropic; never synthesize `tool_result` for them. `stop_reason:\"pause_turn\"` means resend the response as-is to let a long server-tool turn continue.\n- **Extended thinking + tools.** Preserve `thinking`/`redacted_thinking` blocks (with their `signature`) verbatim across turns; forced `tool_choice` (`any`/`tool`) is rejected when thinking is on.\n- **Output is not valid XML.** The underlying model output is parsed by Anthropic with regular expressions, not an XML parser (\"The output is not expected to be valid XML\"). If you reconstruct prompts at token level, do not assume well-formedness; rely on the JSON the API returns.\n- **Legacy vs modern XML are different tag sets.** Legacy: `<invoke>` + child `<tool_name>` + `<parameters>` with per-name child tags; results in `<function_results>/<result>/<stdout>`. Modern: `<invoke name=\"…\">` + `<parameter name=\"…\">`. Mixing them up will misparse. The legacy format also required passing `</function_calls>` as a `stop_sequence` and is not optimized for Claude 3+.\n\n### Legacy XML format (secondary, prompt-based — fully verified, now retired)\n\nBefore the Messages API, tools were defined and called entirely in the prompt. Anthropic's archived \"Legacy tool use\" doc specifies it verbatim.\n\nTool definition (inside a `<tools>` block in the system prompt):\n\n```text\n<tool_description>\n<tool_name>get_weather</tool_name>\n<description>\nRetrieves the current weather for a specified location.\nReturns a dictionary with two fields:\n- temperature: float, the current temperature in Fahrenheit\n- conditions: string, a brief description of the current weather conditions\nRaises ValueError if the provided location cannot be found.\n</description>\n<parameters>\n<parameter>\n<name>location</name>\n<type>string</type>\n<description>The city and state, e.g. San Francisco, CA</description>\n</parameter>\n</parameters>\n</tool_description>\n```\n\nModel-emitted call (multiple `<invoke>` for parallel calls; pass `</function_calls>` as a `stop_sequence`):\n\n```text\n<function_calls>\n<invoke>\n<tool_name>get_weather</tool_name>\n<parameters>\n<location>San Francisco, CA</location>\n</parameters>\n</invoke>\n</function_calls>\n```\n\nResult fed back into the next user turn:\n\n```text\n<function_results>\n<result>\n<tool_name>get_weather</tool_name>\n<stdout>\n59 degrees Fahrenheit, partly cloudy\n</stdout>\n</result>\n</function_results>\n```\n\nError result:\n\n```text\n<function_results>\n<error>\nerror message goes here\n</error>\n</function_results>\n```\n\nThe legacy system-prompt preamble (verbatim from the archived doc) was:\n\n```text\nIn this environment you have access to a set of tools you can use to answer the user's question.\nYou may call them like this:\n<function_calls>\n<invoke>\n<tool_name>$TOOL_NAME</tool_name>\n<parameters>\n<$PARAMETER_NAME>$PARAMETER_VALUE</$PARAMETER_NAME>\n...\n</parameters>\n</invoke>\n</function_calls>\n\nHere are the tools available:\n<tools>\n...one <tool_description> per tool...\n</tools>\n```\n\nLegacy notes: no built-in tools (everything is prompt-defined); Anthropic recommended ≤3–5 tools; the model conventionally wrapped reasoning in `<scratchpad>` and final output in `<answer>`. This format is \"out of date\" and \"not optimized for Claude 3\" — use the JSON Messages API for anything current.\n\n---\n\n## Sources\n\n- Tool use overview — https://docs.claude.com/en/docs/agents-and-tools/tool-use/overview\n- How tool use works — https://docs.claude.com/en/docs/agents-and-tools/tool-use/how-tool-use-works\n- Define tools (tool schema, `input_schema`, `tool_choice`, constructed system prompt) — https://docs.claude.com/en/docs/agents-and-tools/tool-use/define-tools\n- Handle tool calls (`tool_use`/`tool_result`, `is_error`, ordering rules) — https://docs.claude.com/en/docs/agents-and-tools/tool-use/handle-tool-calls\n- Parallel tool use — https://docs.claude.com/en/docs/agents-and-tools/tool-use/parallel-tool-use\n- Streaming messages (SSE events, `input_json_delta`, verbatim tool-use stream) — https://docs.claude.com/en/docs/build-with-claude/streaming\n- Messages API reference (`stop_reason` enum, response shape, `tools`) — https://docs.claude.com/en/api/messages\n- Legacy tool use (archived; verbatim XML tags and prompt) — https://web.archive.org/web/20240528231249/https://docs.anthropic.com/en/docs/legacy-tool-use ; also live localized copies, e.g. https://docs.anthropic.com/de/docs/legacy-tool-use (English path now redirects to the tool-use overview)\n",
77
78
  "toolconv/deepseek.md": "# DeepSeek tool-calling wire format\n\nDeepSeek's chat models (DeepSeek-V3, V3-0324, R1, R1-0528, and DeepSeek-V3.1) share a\nsingle tokenizer family and a distinctive envelope built from **fullwidth-pipe** special\ntokens such as `<|begin▁of▁sentence|>` and `<|User|>`. Tool calling is emitted as a run\nof dedicated special tokens (`<|tool▁calls▁begin|>` … `<|tool▁calls▁end|>`) rather than\nJSON-in-text or XML. This document centers on **DeepSeek-V3.1** (the current hybrid\nthinking/non-thinking model) and documents the older **DeepSeek-V3-0324** and\n**DeepSeek-R1-0528** format as an explicit version difference, because their on-the-wire\ntool syntax is *not* the same as V3.1's.\n\nAn inference server enables it with a chat template plus a tool-call parser:\n\n- vLLM V3.1: `--enable-auto-tool-choice --tool-call-parser deepseek_v31 --chat-template examples/tool_chat_template_deepseekv31.jinja` (optionally `--reasoning-parser deepseek_r1`).\n- vLLM V3-0324 / R1-0528: `--enable-auto-tool-choice --tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja` (V3-0324) or `tool_chat_template_deepseekr1.jinja` (R1-0528).\n- The model's own `tokenizer_config.json` `chat_template` (and the identical `assets/chat_template.jinja`) renders the V3.1 envelope, tool calls, and tool outputs; it does **not** synthesize the `## Tools` advertisement block, so vLLM ships a template that does (see below).\n\n> Verified against: the DeepSeek-V3.1 model card \"Chat Template\" / \"ToolCall\" sections, the\n> byte-identical `chat_template` in `tokenizer_config.json` and `assets/chat_template.jinja`,\n> the `added_tokens` in `tokenizer.json` (token IDs), `config.json` (bos/eos IDs), the\n> DeepSeek-V3-0324 and DeepSeek-R1-0528 `tokenizer_config.json` chat templates, the vLLM\n> `tool_chat_template_deepseekv31.jinja`, and the vLLM tool-calling / reasoning-outputs docs.\n\n## A note on the unusual Unicode (do not substitute ASCII)\n\nDeepSeek's markers do **not** use the ASCII vertical bar `|` (U+007C) or ASCII underscore\n`_`. They use:\n\n- `|` — **U+FF5C FULLWIDTH VERTICAL LINE**, as the delimiter just inside the angle brackets.\n- `▁` — **U+2581 LOWER ONE EIGHTH BLOCK** (the SentencePiece word-boundary glyph), as the\n separator *between words* inside a token, e.g. `begin▁of▁sentence`, `tool▁calls▁begin`.\n\nSo `<|tool▁calls▁begin|>` is `<` + `|`(FF5C) + `tool` + `▁`(2581) + `calls` + `▁`(2581) +\n`begin` + `|`(FF5C) + `>`. Copying these tokens as `<|tool_calls_begin|>` (ASCII pipe +\nunderscore) produces tokens the model never trained on and will silently break parsing and\ngeneration. The only DeepSeek markers that use ASCII brackets are the thinking tags\n`<think>` / `</think>` (plain `<`, `/`, `>`) and the rarely used `<|EOT|>` (ASCII pipes).\n\n## Special tokens\n\nToken IDs are from DeepSeek-V3.1 `tokenizer.json` (`added_tokens`); `vocab_size` is 129280.\nThe `special` column reflects the tokenizer's `\"special\"` flag (it governs\n`skip_special_tokens`); note that the role/think/tool markers are `special: false`.\n\n| Token (verbatim) | ID | `special` | Purpose |\n| --- | --- | --- | --- |\n| `<|begin▁of▁sentence|>` | 0 | true | BOS; prepended once at the very start of the prompt. |\n| `<|end▁of▁sentence|>` | 1 | true | EOS; ends every assistant/tool turn and is the stop token. |\n| `<|▁pad▁|>` | 2 | true | Padding (`pad_token`; the model card/config also reuse EOS as pad). |\n| `<|search▁begin|>` | 128796 | false | Search-agent query open (thinking-mode search tool). |\n| `<|search▁end|>` | 128797 | false | Search-agent query close. |\n| `<think>` | 128798 | false | Opens the reasoning/thinking span. ASCII brackets. |\n| `</think>` | 128799 | false | Closes the reasoning span; **also emitted in non-thinking mode** (see below). |\n| `<|fim▁hole|>` / `<|fim▁begin|>` / `<|fim▁end|>` | 128800–128802 | false | Fill-in-the-middle (not chat). |\n| `<|User|>` | 128803 | false | User role marker. |\n| `<|Assistant|>` | 128804 | false | Assistant role marker. |\n| `<\\|EOT\\|>` | 128805 | true | End-of-turn (legacy; ASCII pipes, rarely used in chat). |\n| `<|tool▁calls▁begin|>` | 128806 | false | Opens the assistant's batch of tool calls. |\n| `<|tool▁calls▁end|>` | 128807 | false | Closes the batch of tool calls. |\n| `<|tool▁call▁begin|>` | 128808 | false | Opens a single tool call inside the batch. |\n| `<|tool▁call▁end|>` | 128809 | false | Closes a single tool call. |\n| `<|tool▁outputs▁begin|>` | 128810 | false | Opens a batch of tool results (**R1-0528 / V3-0324 only**). |\n| `<|tool▁outputs▁end|>` | 128811 | false | Closes a batch of tool results (**R1-0528 / V3-0324 only**). |\n| `<|tool▁output▁begin|>` | 128812 | false | Opens a single tool result. |\n| `<|tool▁output▁end|>` | 128813 | false | Closes a single tool result. |\n| `<|tool▁sep|>` | 128814 | false | Separator inside a tool call (between name and arguments). |\n\n`config.json` confirms `bos_token_id: 0`, `eos_token_id: 1`.\n\n## Roles / channels / turn structure\n\nThere is no OpenAI-style `system`/`developer` channel token. Roles are inline markers and\nthe prompt is one flat string:\n\n```text\n<|begin▁of▁sentence|>{system_prompt}<|User|>{query}<|Assistant|>{response}<|end▁of▁sentence|>\n```\n\n- **System prompt** has no marker. All `system` messages are concatenated (joined with\n `\\n\\n` when there are several) and emitted immediately after `<|begin▁of▁sentence|>`,\n before the first `<|User|>`. When tools are present the `## Tools` block is appended to\n this system text (separated by `\\n\\n`).\n- **User turn**: `<|User|>` + content. (No EOS after the user text in V3.1; the assistant\n marker follows directly.)\n- **Assistant turn**: opens with `<|Assistant|>`, then a thinking tag, then content, then\n `<|end▁of▁sentence|>`.\n- **Thinking vs non-thinking (V3.1 hybrid)** — selected by the template, not by the model:\n - Non-thinking generation prefix: `…<|Assistant|></think>` — the model starts *after* a\n `</think>` it never had to open. Unlike DeepSeek-V3, V3.1 always injects this `</think>`.\n - Thinking generation prefix: `…<|Assistant|><think>` — the model emits its chain of\n thought, closes with `</think>`, then the answer.\n - In multi-turn context, **every** stored assistant turn keeps a `</think>`; only the last\n turn's leading thinking tag reflects the requested mode. When rendering a stored\n assistant message, any text up to and including `</think>` is stripped from `content`\n before re-emitting (the template does `content.split('</think>', 1)[1]`).\n- **Tool calling runs in non-thinking mode.** The model card states \"Toolcall is supported\n in non-thinking mode,\" and the V3.1 tool template opens the tool-call turn with\n `<|Assistant|></think>`. With vLLM, V3.1 reasoning is disabled by default; enable it via\n `chat_template_kwargs={\"thinking\": true}`.\n- **Search-agent channel**: a separate thinking-mode protocol using `<|search▁begin|>` /\n `<|search▁end|>` (see the model card's `assets/search_tool_trajectory.html`); out of\n scope for ordinary function calling.\n\n## Tool definitions\n\nTools are advertised as a **Markdown block injected into the system area** (after the system\nprompt, before the first `<|User|>`). The chat template in `tokenizer_config.json` does not\nbuild this block from a `tools=[…]` argument; the caller (or vLLM's\n`tool_chat_template_deepseekv31.jinja`) constructs it. Reproduced verbatim from the\nDeepSeek-V3.1 model card, the full layout is\n`<|begin▁of▁sentence|>{system prompt}\\n\\n{tool_description}<|User|>{query}<|Assistant|></think>`\nwhere `{tool_description}` is:\n\n```text\n## Tools\nYou have access to the following tools:\n\n### {tool_name1}\nDescription: {description}\n\nParameters: {json.dumps(parameters)}\n\nIMPORTANT: ALWAYS adhere to this exact format for tool use:\n<|tool▁calls▁begin|><|tool▁call▁begin|>tool_call_name<|tool▁sep|>tool_call_arguments<|tool▁call▁end|>{additional_tool_calls}<|tool▁calls▁end|>\n\nWhere:\n- `tool_call_name` must be an exact match to one of the available tools\n- `tool_call_arguments` must be valid JSON that strictly follows the tool's Parameters Schema\n- For multiple tool calls, chain them directly without separators or spaces\n```\n\nEach tool contributes one `### {name}` section with a `Description:` line and a\n`Parameters: {…}` line whose value is the compact JSON of the JSON-Schema parameters object\n(`json.dumps(parameters)` in the card, `parameters | tojson` in vLLM's template). The\n`IMPORTANT:` instruction block is appended once, after the last tool.\n\n## Tool-call format\n\nThe model emits one batch wrapper containing one or more calls. Each call is\n`name <|tool▁sep|> arguments`, where **arguments is a raw JSON object string** (no code\nfence). Minimal single call (what the model generates after the `<|Assistant|></think>`\nprefix):\n\n```text\n<|tool▁calls▁begin|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"location\": \"San Francisco, CA\"}<|tool▁call▁end|><|tool▁calls▁end|>\n```\n\nGrammar (V3.1):\n\n```text\n<|tool▁calls▁begin|><|tool▁call▁begin|>{name}<|tool▁sep|>{json_args}<|tool▁call▁end|>{…more calls…}<|tool▁calls▁end|>\n```\n\n- `{name}` must exactly match an advertised tool name. It comes **first**, immediately after\n `<|tool▁call▁begin|>`.\n- `{json_args}` is valid JSON conforming to the tool's parameter schema, inlined directly.\n- The whole assistant turn is then closed by the template/server with\n `<|end▁of▁sentence|>`.\n\n(V3.1 has **no** `type` field and **no** ` ```json ` fence around arguments — that is the\nolder R1/V3-0324 convention; see Version differences.)\n\n## Multiple / parallel tool calls\n\nAll calls live inside one `<|tool▁calls▁begin|>…<|tool▁calls▁end|>` wrapper. After the\nfirst `<|tool▁call▁begin|>…<|tool▁call▁end|>`, each additional call is **another\n`<|tool▁call▁begin|>…<|tool▁call▁end|>` chained directly, with no separator, newline, or\nspace between calls** (the card: \"chain them directly without separators or spaces\"):\n\n```text\n<|tool▁calls▁begin|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"location\": \"San Francisco, CA\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"location\": \"Seattle, WA\"}<|tool▁call▁end|><|tool▁calls▁end|>\n```\n\nNote that `<|tool▁calls▁begin|>` (plural, id 128806) appears exactly once; each call uses\nthe singular `<|tool▁call▁begin|>` (id 128808) / `<|tool▁call▁end|>` (id 128809).\n\n## Tool-result format\n\nExecuted results are fed back as `tool`-role messages. In **V3.1** each result is wrapped in\nthe singular output tokens, with **no** plural `<|tool▁outputs▁…|>` wrapper, emitted right\nafter the assistant tool-call turn's `<|end▁of▁sentence|>`:\n\n```text\n<|tool▁output▁begin|>{result_text}<|tool▁output▁end|>\n```\n\n`{result_text}` is the raw tool output (typically a JSON string, but any text). For multiple\nresults, the V3.1 template emits one `<|tool▁output▁begin|>…<|tool▁output▁end|>` per `tool`\nmessage, concatenated directly. There is **no tool-call ID in the wire format** — results are\nmatched to calls **positionally** (order of outputs ↔ order of calls).\n\nThe model then produces its final answer **directly after `<|tool▁output▁end|>`** with no\n`<|Assistant|>` marker and no `</think>` (see Parsing notes — the V3.1 reference template\ndeliberately renders post-tool assistant content as just `content<|end▁of▁sentence|>`).\n\n> R1-0528 / V3-0324 differ: results are enclosed in a `<|tool▁outputs▁begin|>` …\n> `<|tool▁outputs▁end|>` batch wrapper, with each result as\n> `<|tool▁output▁begin|>…<|tool▁output▁end|>` and multiple results newline-separated.\n\n## End-to-end example\n\nA complete DeepSeek-V3.1 **non-thinking** multi-turn exchange. Everything is one flat string;\ninline `←` comments mark where the model's generation begins (they are not part of the\nstream). Whitespace inside the `## Tools` block is literal newlines.\n\n```text\n<|begin▁of▁sentence|>You are a helpful assistant.\n\n## Tools\nYou have access to the following tools:\n\n### get_weather\nDescription: Get the current weather for a location\n\nParameters: {\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\", \"description\": \"City and state, e.g. San Francisco, CA\"}, \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}\n\nIMPORTANT: ALWAYS adhere to this exact format for tool use:\n<|tool▁calls▁begin|><|tool▁call▁begin|>tool_call_name<|tool▁sep|>tool_call_arguments<|tool▁call▁end|>{additional_tool_calls}<|tool▁calls▁end|>\n\nWhere:\n- `tool_call_name` must be an exact match to one of the available tools\n- `tool_call_arguments` must be valid JSON that strictly follows the tool's Parameters Schema\n- For multiple tool calls, chain them directly without separators or spaces\n<|User|>What's the weather in San Francisco?<|Assistant|></think><|tool▁calls▁begin|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"location\": \"San Francisco, CA\", \"unit\": \"celsius\"}<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|tool▁output▁begin|>{\"temperature\": 18, \"unit\": \"celsius\", \"condition\": \"Foggy\"}<|tool▁output▁end|>It's currently 18°C and foggy in San Francisco.<|end▁of▁sentence|>\n```\n\nReading the spans:\n\n1. `<|begin▁of▁sentence|>` + system text + `\\n\\n` + `## Tools…` block — prompt prefix.\n2. `<|User|>What's the weather in San Francisco?` — user turn.\n3. `<|Assistant|></think>` — non-thinking generation prefix (prompt). **Model generates from here.**\n4. `<|tool▁calls▁begin|>…<|tool▁calls▁end|>` — the model's tool call; server appends `<|end▁of▁sentence|>` and stops with `finish_reason: \"tool_calls\"`.\n5. `<|tool▁output▁begin|>…<|tool▁output▁end|>` — your executed result, appended to the prompt.\n6. `It's currently 18°C and foggy in San Francisco.<|end▁of▁sentence|>` — **the model generates the final answer directly after the tool output** (no new `<|Assistant|>` marker), ending with EOS.\n\n## OpenAI-compatible API mapping\n\nWhen fronted by an OpenAI-compatible server (e.g. vLLM with `--tool-call-parser\ndeepseek_v31`):\n\n- **`finish_reason`**: `\"tool_calls\"` when the model emitted a `<|tool▁calls▁begin|>…`\n batch; otherwise `\"stop\"`.\n- **`message.tool_calls[]`**: one element per `<|tool▁call▁begin|>…<|tool▁call▁end|>`.\n - `.type` = `\"function\"`.\n - `.function.name` = the text between `<|tool▁call▁begin|>` and `<|tool▁sep|>`.\n - `.function.arguments` = the text between `<|tool▁sep|>` and `<|tool▁call▁end|>`, returned\n as a **JSON string** (per the OpenAI spec), not a nested object. The model already emits\n raw JSON there, so it is passed through.\n - `.id` = **synthesized by the server** (e.g. `chatcmpl-tool-…`). DeepSeek's wire format\n carries no call ID.\n- **Tool result messages**: `{\"role\": \"tool\", \"tool_call_id\": \"<id>\", \"content\": \"<result>\"}`.\n The server renders `content` into `<|tool▁output▁begin|>…<|tool▁output▁end|>`. Because the\n prompt has no IDs, `tool_call_id` is used only for client-side bookkeeping; **the model\n relies on ordering**, so preserve the order of results relative to the calls.\n- **Assistant replay**: when you send a prior assistant turn back with `tool_calls`, the\n template inlines `function.arguments`. The HF reference template inlines it **verbatim**\n (assumes it is already a JSON string); vLLM's `tool_chat_template_deepseekv31.jinja` pipes\n it through `| tojson`. Send `arguments` as a JSON **string** per the OpenAI spec (see the\n gotcha below about double-encoding).\n\n## Parsing notes & gotchas\n\n- **Unicode is load-bearing.** Match `|` = U+FF5C and `▁` = U+2581 exactly. ASCII\n `<|tool_calls_begin|>` will not tokenize to the special tokens. `<think>`/`</think>` use\n ASCII brackets; the rare `<|EOT|>` uses ASCII pipes.\n- **Tool/role markers are `special: false`.** Only `<|begin▁of▁sentence|>`,\n `<|end▁of▁sentence|>`, `<|▁pad▁|>`, and `<|EOT|>` are flagged `special: true`. So\n decoding with `skip_special_tokens=True` will **not** strip `<|tool▁calls▁begin|>`,\n `<|tool▁sep|>`, `<|Assistant|>`, `</think>`, etc. — they remain in the decoded string for\n the parser to find. (Conversely, do not assume special-token filtering removes them.)\n- **No code fence / no `type` field in V3.1.** A parser written for R1/V3-0324\n (`function<|tool▁sep|>name` + ` ```json ` block) will not parse V3.1, and vice-versa.\n V3.1 is `name<|tool▁sep|>raw_json`.\n- **Chaining has no delimiter in V3.1.** Calls abut directly:\n `…<|tool▁call▁end|><|tool▁call▁begin|>…`. Do not split on newlines/whitespace; split on\n the `<|tool▁call▁begin|>` / `<|tool▁call▁end|>` boundaries. (R1/V3-0324 put a `\\n` before\n each subsequent call.)\n- **No tool-call IDs on the wire.** Match results to calls by position. A server must\n generate synthetic `tool_call_id`s for the OpenAI shape.\n- **`</think>` appears even in non-thinking mode.** Strip the leading `</think>` (and any\n preceding reasoning) before treating the remainder as the visible answer; the template does\n `content.split('</think>', 1)[1]` when replaying stored turns.\n- **Post-tool generation prompt quirk.** The reference V3.1 chat template only appends the\n `<|Assistant|></think>` generation prefix when the **last message is `user`**. After a\n `tool` message it appends nothing and the model continues straight after\n `<|tool▁output▁end|>`. Agent loops that re-template a conversation ending in a tool result\n must not expect (or double-insert) an assistant marker there.\n- **`arguments` double-encoding risk.** On replay, vLLM's example template applies\n `arguments | tojson`. If `arguments` is already a JSON string (the OpenAI convention), that\n pipe will JSON-encode the string again (wrapping it in quotes and escaping it). Pass an\n object where the template expects `| tojson`, or a string where the template inlines\n verbatim — match the template you actually run.\n- **Streaming.** Tool calls arrive token-by-token; the name is complete only at\n `<|tool▁sep|>`, and arguments are partial JSON until `<|tool▁call▁end|>`. Buffer per call\n boundary; do not attempt to `json.loads` arguments before the closing tool-call token.\n- **Malformed output.** With `tool_choice=\"auto\"` and no structural-tag constraint\n (`VLLM_ENFORCE_STRICT_TOOL_CALLING=false`), the model can emit invalid JSON in\n `tool_call_arguments` or a `tool_call_name` that does not match any tool; the parser\n extracts best-effort. Named/`required` tool choice uses the structured-outputs backend and\n guarantees schema-valid arguments.\n\n## Version differences: V3.1 vs V3-0324 / R1-0528\n\nThe pre-V3.1 models (DeepSeek-V3-0324 and DeepSeek-R1-0528) share an older tool-call\nencoding, served in vLLM with `--tool-call-parser deepseek_v3`. The per-call body is:\n\n````text\n<|tool▁call▁begin|>function<|tool▁sep|>{name}\n```json\n{json_args}\n```<|tool▁call▁end|>\n````\n\nDifferences from V3.1:\n\n| Aspect | V3.1 (`deepseek_v31`) | V3-0324 / R1-0528 (`deepseek_v3`) |\n| --- | --- | --- |\n| Field order in a call | `{name}<|tool▁sep|>{args}` | `function<|tool▁sep|>{name}` (the literal `type`, then name) |\n| Arguments wrapping | raw JSON, inline | fenced ` ```json … ``` ` block (name and args separated by `\\n`) |\n| Chaining of calls | abut directly, **no separator** | each subsequent call prefixed with `\\n` |\n| Tool results | `<|tool▁output▁begin|>…<|tool▁output▁end|>` per message, no batch wrapper | wrapped in `<|tool▁outputs▁begin|>…<|tool▁outputs▁end|>`, results newline-separated |\n| User→assistant boundary | user turn = `<|User|>{q}`; `<|Assistant|></think>` added at generation | user turn = `<|User|>{q}<|Assistant|>` (assistant marker appended in the user branch) |\n| Thinking | hybrid; `thinking` kwarg toggles `<think>` vs `</think>` prefix | R1-0528 always reasoning (bare `<|Assistant|>` generation prefix, model opens `<think>` itself); V3-0324 non-reasoning |\n| vLLM parser | `--tool-call-parser deepseek_v31` | `--tool-call-parser deepseek_v3` |\n\nExample R1-0528 / V3-0324 parallel call with its result batch:\n\n````text\n<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather\n```json\n{\"location\": \"San Francisco, CA\"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_weather\n```json\n{\"location\": \"Seattle, WA\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|tool▁outputs▁begin|><|tool▁output▁begin|>{\"temperature\": 18}<|tool▁output▁end|>\n<|tool▁output▁begin|>{\"temperature\": 14}<|tool▁output▁end|><|tool▁outputs▁end|>\n````\n\nThe `deepseek_r1` **reasoning** parser (`--reasoning-parser deepseek_r1`) applies to the R1\nseries **and** to DeepSeek-V3.1; it extracts the `<think>…</think>` span into the response's\n`reasoning` field. It is independent of the tool-call parser.\n\n## Sources\n\n- DeepSeek-V3.1 model card (Chat Template / ToolCall sections): <https://huggingface.co/deepseek-ai/DeepSeek-V3.1>\n- DeepSeek-V3.1 `assets/chat_template.jinja`: <https://huggingface.co/deepseek-ai/DeepSeek-V3.1/resolve/main/assets/chat_template.jinja>\n- DeepSeek-V3.1 `tokenizer_config.json` (`chat_template`, byte-identical to the jinja): <https://huggingface.co/deepseek-ai/DeepSeek-V3.1/resolve/main/tokenizer_config.json>\n- DeepSeek-V3.1 `tokenizer.json` (`added_tokens` → token IDs and `special` flags): <https://huggingface.co/deepseek-ai/DeepSeek-V3.1/resolve/main/tokenizer.json>\n- DeepSeek-V3.1 `config.json` (`bos_token_id`, `eos_token_id`, `vocab_size`): <https://huggingface.co/deepseek-ai/DeepSeek-V3.1/resolve/main/config.json>\n- DeepSeek-R1-0528 model card and `tokenizer_config.json` (older tool format): <https://huggingface.co/deepseek-ai/DeepSeek-R1-0528> · <https://huggingface.co/deepseek-ai/DeepSeek-R1-0528/resolve/main/tokenizer_config.json>\n- DeepSeek-R1 model card: <https://huggingface.co/deepseek-ai/DeepSeek-R1>\n- DeepSeek-V3-0324 `tokenizer_config.json` (older tool format): <https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/resolve/main/tokenizer_config.json>\n- vLLM tool-call template for V3.1 (`## Tools` injection + `| tojson`): <https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_deepseekv31.jinja>\n- vLLM Tool Calling docs (`deepseek_v3`, `deepseek_v31` parser flags): <https://docs.vllm.ai/en/latest/features/tool_calling/>\n- vLLM Reasoning Outputs docs (`deepseek_r1` reasoning parser; V3.1 thinking default): <https://docs.vllm.ai/en/latest/features/reasoning_outputs/>\n",
78
- "toolconv/gemini.md": "# Gemini Pythonic tool-calling format (`tool_code` / `default_api`)\n\nTool-calling convention of Google's hosted **Gemini** models (current generation, incl. `gemini-3.5-flash` / `*-pro` / `*-preview`) and the **Gemma 3** open-weights family. Both drive tool use **entirely through prompt engineering** — there are **no dedicated special tokens**. The model emits each invocation as **Python source**: a call `default_api.<function_name>(<kwargs>)`, conventionally wrapped in `print(...)` and placed inside a fenced ```` ```tool_code ```` block; it reads results back from a ```` ```tool_outputs ```` block. Because the mechanism is plain text the model was post-trained to produce, the exact same syntax periodically leaks into ordinary output (surfaced by Vertex/AI-Studio as `finish_reason = MALFORMED_FUNCTION_CALL`) — that leak is the clearest public evidence of the format.\n\nVerified against: the official Gemma 3 function-calling guide (`ai.google.dev/gemma/docs/capabilities/function-calling` — the two recommended prompts, one Pythonic and one JSON), Simon Willison's transcription of those two prompts, Philipp Schmid's Gemma 3 walkthrough (`philschmid.de/gemma-function-calling`), and the reverse-engineered hosted-Gemini form recovered from `MALFORMED_FUNCTION_CALL` reports: `google/adk-go#492` (`Malformed function call: print(default_api.`), `google-gemini/cookbook#929` (`executableCode` part = `print(default_api.get_complaint_number_tool(consumer_number_or_mobile_number='2001234567'))`), `firebase/genkit#2628` (the ```` ```tool_code ```` markdown wrapper), and the Google AI dev-forum thread \"Gemini 2 flash returns raw markdown instead of function call\" (71964).\n\n## \"Special\" tokens\n\n**None.** Nothing here is a control token in the tokenizer's special-token table — every marker below BPE-splits into ordinary text and survives a `skip_special_tokens=True` decode. This is the defining property of the convention and the reason it both (a) works across hosted Gemini and open Gemma without tokenizer support and (b) leaks. The functional markers are:\n\n| Marker (verbatim) | Role |\n|---|---|\n| ` ```tool_code ` | Opens a fenced block whose body is Python the app must execute. Closed by a bare ` ``` `. |\n| ` ```tool_outputs ` | Opens a fenced block carrying the executed results back to the model. Closed by a bare ` ``` `. |\n| `default_api` | Synthetic module namespace the hosted stack bundles un-namespaced tools into. Calls read `default_api.<name>(...)`. |\n| `print(...)` | Conventional wrapper around the call in the hosted-Gemini form (the model is trained to \"print\" the call). Semantically irrelevant — the runtime parses the call, it does not execute Python. |\n\nThere is **no** per-call id on the wire and **no** in-band reasoning marker — Gemini reasoning travels out of band as API \"thought signatures\", never as `<think>`-style text.\n\n## Roles / turn structure\n\nThe Pythonic payload is independent of the envelope, and the envelope differs by deployment:\n\n- **Hosted Gemini** uses the normal `contents[]` turn structure (`role: \"user\" | \"model\"`); the `tool_code` block appears inside a `model` turn's text, and `tool_outputs` is supplied as the next turn.\n- **Gemma 3** (open weights) uses the Gemma chat template (`<start_of_turn>user … <end_of_turn>` / `<start_of_turn>model`); the tool prompt is prepended to the first user turn and the blocks live inside model/user turns.\n\nThis document specifies the **payload** (the two fenced blocks + the Python call form); the surrounding turn tokens belong to whichever template hosts it.\n\n## Tool definitions\n\nTools are advertised in the prompt as a JSON-Schema catalog. Gemma 3's official guide ships **two** interchangeable system-prompt templates that differ only in how the model is told to answer:\n\n1. **Pythonic** (the one this spec targets):\n > You have access to functions. If you decide to invoke any of the function(s), you MUST put it in the format of `[func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]`\n > You SHOULD NOT include any other text in the response if you call a function\n\n2. **JSON** (the sibling convention — see `qwen3.md` for the closely related Hermes shape):\n > … you MUST put it in the format of `{\"name\": function name, \"parameters\": dictionary of argument name and its value}`\n\nHosted Gemini wraps the same idea in markdown fences and the `default_api` namespace. The function signatures themselves are passed as OpenAI-style tool JSON (`{\"type\":\"function\",\"function\":{name,description,parameters}}`).\n\n## Tool-call format\n\nOne call is a Python call expression. The hosted-Gemini canonical form is a `print()` of a `default_api` method, fenced:\n\n````text\n```tool_code\nprint(default_api.get_current_temperature(location=\"London\", unit=\"celsius\"))\n```\n````\n\nAll of the following are accepted equivalents seen in the wild and across Gemma/Gemini variants; a robust parser normalizes them to `{name, arguments}`:\n\n- `print(default_api.NAME(KWARGS))` — hosted Gemini canonical.\n- `default_api.NAME(KWARGS)` — `print`/namespace are optional sugar.\n- `NAME(KWARGS)` — bare call (Gemma 3 Pythonic prompt).\n- `result = NAME(KWARGS)` — assignment form (Gemma 3 docs use `result = convert(...)`).\n\nArgument values are **Python literals**, not JSON:\n\n| Python literal | Example | Decoded |\n|---|---|---|\n| string | `'London'` or `\"London\"` | `\"London\"` |\n| int / float | `42`, `3.14` | `42`, `3.14` |\n| bool | `True` / `False` | `true` / `false` |\n| null | `None` | `null` |\n| list | `[\"a\", \"b\"]` | `[\"a\",\"b\"]` |\n| dict | `{\"k\": 1}` | `{\"k\":1}` |\n\nStrings use Python escaping (`\\n`, `\\t`, `\\\\`, `\\'`, `\\\"`); hosted Gemini emits single quotes (`location='London'`), Gemma examples use double quotes — both are valid. Arguments are keyword form (`name=value`); positional arguments are not used because the runtime maps to a named schema.\n\n## Multiple / parallel tool calls\n\nTwo encodings exist, both inside a single `tool_code` block:\n\n- **Gemma 3 Pythonic prompt** — a Python **list** of call expressions:\n ````text\n ```tool_code\n [get_current_temperature(location=\"London\"), get_temperature_date(location=\"London\", date=\"2024-10-01\")]\n ```\n ````\n- **Hosted Gemini** — one `print(default_api...)` **statement per line**:\n ````text\n ```tool_code\n print(default_api.get_current_temperature(location=\"London\"))\n print(default_api.get_temperature_date(location=\"London\", date=\"2024-10-01\"))\n ```\n ````\n\nEither way the calls are returned in source order; the application executes them and returns one result per call in the same order.\n\n## Tool-result format\n\nExecuted results are returned to the model in a ```` ```tool_outputs ```` block. Gemma 3 docs use assignment-style values (`result = 92.3`); for opaque tool output the block simply carries the returned text/JSON:\n\n````text\n```tool_outputs\n{\"temperature\": 26.1, \"location\": \"London\", \"unit\": \"celsius\"}\n```\n````\n\nThe model then continues with either a natural-language answer or another `tool_code` block.\n\n## End-to-end example\n\n````text\n<user>\nWhat's the temperature in London?\n\n<model>\n```tool_code\nprint(default_api.get_current_temperature(location=\"London\", unit=\"celsius\"))\n```\n\n<user>\n```tool_outputs\n{\"temperature\": 11.4, \"location\": \"London\", \"unit\": \"celsius\"}\n```\n\n<model>\nIt's currently 11.4°C in London.\n````\n\n## OpenAI-compatible / native API mapping\n\n- Hosted Gemini's native API normally returns a structured `functionCall` part (`{name, args}`); for Gemini 3 each carries an `id` that must be echoed in the matching `functionResponse`, plus a `thoughtSignature` that must be preserved. The Pythonic text form is what you get when the structured path *fails* (`finish_reason = MALFORMED_FUNCTION_CALL`) or when tool use is driven purely by prompt (Gemma, or Gemini via the code-execution `executableCode` part).\n- When parsed out of an OpenAI-compatible shim, each recovered call becomes `tool_calls[i] = {id (server-minted), type:\"function\", function:{name, arguments:<JSON string>}}` — the Python kwargs are re-serialized to a JSON string at that boundary.\n- Feed results back as the deployment's tool/`functionResponse` turn (hosted) or a `tool_outputs` block in the next user turn (prompt-driven).\n\n## Parsing notes & gotchas\n\n- **Python, not JSON.** `True`/`False`/`None` (not `true`/`false`/`null`), single-quoted strings, and trailing commas are all legal. A JSON parser will reject valid calls; decode Python literals.\n- **Strip the wrapper.** Normalize away `print(...)`, a `default_api.` (or any `module.`) prefix, and an `LHS =` assignment before reading the call name. `print` is never a tool name.\n- **Skip string contents when scanning.** A call like `search(pattern=\"foo(\")` contains a `(` inside a string; a naive `\\w+\\(` scan mis-detects `foo` as a callee. Track string state and only treat top-level `(` as a call opener.\n- **Fence ambiguity.** The body terminates at the first bare ` ``` `; a string argument literally containing ` ``` ` will truncate the block early (rare, accepted limitation).\n- **It leaks.** Because nothing is a special token, the format appears verbatim in normal responses when the model \"decides\" to call a tool but the structured decoder misfires. Production code reading raw text should detect ` ```tool_code ` and parse it; production code on the structured API should retry on `MALFORMED_FUNCTION_CALL`.\n- **Variant divergence.** Gemma **4** abandoned this Pythonic form for a token-delimited brace syntax (`<|tool_call>call:NAME{…}<tool_call|>`) — a different convention documented in `gemma.md`. This spec covers hosted Gemini and Gemma 3.\n\n## Sources\n\n- Gemma 3 function calling (two recommended prompts): https://ai.google.dev/gemma/docs/capabilities/function-calling\n- Simon Willison, \"Function calling with Gemma\": https://simonwillison.net/2025/Mar/26/function-calling-with-gemma/\n- Philipp Schmid, \"Google Gemma 3 Function Calling Example\": https://www.philschmid.de/gemma-function-calling\n- Gemini 3 thought signatures + functionCall ids: https://ai.google.dev/gemini-api/docs/gemini-3\n- `default_api` / `tool_code` leak evidence: https://github.com/google/adk-go/issues/492 · https://github.com/google-gemini/cookbook/issues/929 · https://github.com/firebase/genkit/issues/2628 · https://discuss.ai.google.dev/t/gemini-2-flash-api-returns-raw-markdown-instead-of-function-call/71964\n",
79
- "toolconv/gemma.md": "# Gemma 4 tool-calling format (token-delimited `call:NAME{…}`)\n\nTool-calling convention of Google's **Gemma 4** open-weights family (`google/gemma-4-*-it`). It is a clean break from the prompt-engineered Pythonic `tool_code` form used by Gemma 3 and hosted Gemini (see `gemini.md`): Gemma 4 introduces **dedicated special tokens** and a compact **token-delimited brace syntax**. Tool declarations, calls, and responses each get their own paired markers, and every string value is wrapped in a `<|\"|>` token rather than ASCII quotes. The model emits one call as `<|tool_call>call:NAME{key:value,…}<tool_call|>`; the developer parses it, runs the tool, and appends `<|tool_response>response:NAME{…}<tool_response|>`.\n\nVerified against: the official \"Function calling with Gemma 4\" guide (`ai.google.dev/gemma/docs/capabilities/text/function-calling-gemma4`), including the byte-exact `processor.apply_chat_template(...)` renderings and the reference `extract_tool_calls` regex it ships. All example streams below are copied from that page (model `google/gemma-4-E2B-it`).\n\n## Special tokens\n\nGemma 4 wraps each structural element in a paired token. Note the **asymmetric pipe placement** — an opener carries the pipe on the left (`<|x>`) and its closer carries it on the right (`<x|>`):\n\n| Open | Close | Purpose |\n|---|---|---|\n| `<bos>` | — | Beginning of sequence |\n| `<|turn>` | `<turn|>` | One conversation turn; the role name is the first line of the body |\n| `<|tool>` | `<tool|>` | A tool **declaration** block (in the system turn) |\n| `<|tool_call>` | `<tool_call|>` | One tool **call** emitted by the model |\n| `<|tool_response>` | `<tool_response|>` | One tool **result** fed back to the model |\n| `<|\"|>` | `<|\"|>` | String-literal delimiter (same token on both ends) |\n| `<eos>` | — | End of sequence |\n\nBecause the string delimiter is a token (`<|\"|>`), values may contain raw ASCII quotes and commas without escaping — only a literal `<|\"|>` token sequence cannot appear inside a string.\n\n## Roles / turn structure\n\nEach turn is `<|turn>{role}\\n{body}<turn|>`. Roles are `system`, `user`, `model`. With a generation prompt the stream ends at `<|turn>model\\n` and the model continues. Tool declarations are merged into the `system` turn; tool calls and the following tool responses are emitted inside the `model` turn (the response block immediately follows the call block in the re-rendered history).\n\n## Tool definitions\n\nEach tool is declared in the system turn as `<|tool>declaration:NAME{…}<tool|>`, where the body is the schema serialized in the same brace syntax used by calls. Types are upper-cased strings (`STRING`, `OBJECT`, …). Byte-exact, from the guide:\n\n```text\n<|tool>declaration:get_current_temperature{description:<|\"|>Gets the current temperature for a given location.<|\"|>,parameters:{properties:{location:{description:<|\"|>The city name, e.g. San Francisco<|\"|>,type:<|\"|>STRING<|\"|>} },required:[<|\"|>location<|\"|>],type:<|\"|>OBJECT<|\"|>} }<tool|>\n```\n\n## Tool-call format\n\nThe model emits one call per `<|tool_call>…<tool_call|>` block. The body is `call:NAME{ARGS}`, where `ARGS` is a comma-separated list of `key:value` pairs:\n\n```text\n<|tool_call>call:get_current_temperature{location:<|\"|>London<|\"|>}<tool_call|>\n```\n\nValue grammar inside `{…}`:\n\n| Value kind | Encoding | Example |\n|---|---|---|\n| string | `<|\"|>text<|\"|>` | `location:<|\"|>London<|\"|>` |\n| int / float | bare | `count:42` |\n| bool | bare | `flag:true` |\n| null | bare | `unit:null` |\n| list | `[v,v,…]` | `tags:[<|\"|>a<|\"|>,<|\"|>b<|\"|>]` |\n| nested object | `{k:v,…}` | `config:{theme:<|\"|>dark<|\"|>}` |\n\nThe reference parser shipped in the guide:\n\n```python\n[{\n \"name\": name,\n \"arguments\": {\n k: cast((v1 or v2).strip())\n for k, v1, v2 in re.findall(r'(\\w+):(?:<\\|\"\\|>(.*?)<\\|\"\\|>|([^,}]*))', args)\n }\n} for name, args in re.findall(r\"<\\|tool_call>call:(\\w+)\\{(.*?)\\}<tool_call\\|>\", text, re.DOTALL)]\n```\n\ni.e. each argument value is either a `<|\"|>…<|\"|>` string or a bare run of non-`,}` characters (cast to int/float/bool, else kept as a string).\n\n## Multiple / parallel tool calls\n\nParallel calls are consecutive `<|tool_call>…<tool_call|>` blocks (one call each), returned in order. The application returns one `<|tool_response>` per call in the same order.\n\n## Tool-result format\n\nEach result is `<|tool_response>response:NAME{…}<tool_response|>`, the response object serialized in the same brace syntax. Byte-exact, from the guide's re-rendered history:\n\n```text\n<|tool_response>response:get_current_weather{temperature:15,weather:<|\"|>sunny<|\"|>}<tool_response|>\n```\n\n## End-to-end example\n\nByte-exact `apply_chat_template` output from the guide (system + tool, user, model call, tool response, final answer — note the response block sits in the same model turn, right after the call):\n\n```text\n<bos><|turn>system\nYou are a helpful assistant.<|tool>declaration:get_current_weather{description:<|\"|>Gets the current weather in a given location.<|\"|>,parameters:{properties:{location:{description:<|\"|>The city and state, e.g. \"San Francisco, CA\" or \"Tokyo, JP\"<|\"|>,type:<|\"|>STRING<|\"|>},unit:{description:<|\"|>The unit to return the temperature in.<|\"|>,enum:[<|\"|>celsius<|\"|>,<|\"|>fahrenheit<|\"|>],type:<|\"|>STRING<|\"|>} },required:[<|\"|>location<|\"|>],type:<|\"|>OBJECT<|\"|>} }<tool|><turn|>\n<|turn>user\nHey, what's the weather in Tokyo right now?<turn|>\n<|turn>model\n<|tool_call>call:get_current_weather{location:<|\"|>Tokyo, JP<|\"|>}<tool_call|><|tool_response>response:get_current_weather{temperature:15,weather:<|\"|>sunny<|\"|>}<tool_response|>The current weather in Tokyo is 15 degrees Celsius and sunny.<turn|>\n```\n\n## Parsing notes & gotchas\n\n- **String delimiter is a token, not a quote.** Inside `<|\"|>…<|\"|>` the bytes `\"` and `,` are literal data — the example `<|\"|>The city and state, e.g. \"San Francisco, CA\"…<|\"|>` contains both. Split arguments on `,`/`}` only **outside** a `<|\"|>…<|\"|>` span.\n- **Asymmetric pipes.** The closer is `<tool_call|>`, not `</tool_call>` or `<|tool_call>`. Matching the wrong pipe side will never close the block.\n- **One call per block.** Unlike a JSON `tool_calls[]` array, parallelism is \"more blocks\", not \"more entries in one block\".\n- **Bare scalars.** A value not wrapped in `<|\"|>` is `true`/`false` → bool, `null`/`none` → null, numeric → number, otherwise a bare string (e.g. an unquoted enum or type name like `STRING`).\n- **Not Gemma 3 / hosted Gemini.** Those use the Pythonic `tool_code` / `default_api` form in `gemini.md`. Gemma 4 replaced it with this token syntax; the two are not interchangeable.\n\n## Sources\n\n- Function calling with Gemma 4 (byte-exact chat-template renderings + reference parser): https://ai.google.dev/gemma/docs/capabilities/text/function-calling-gemma4\n- Gemma 4 prompt formatting: https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4\n",
79
+ "toolconv/gemini.md": "# Gemini Pythonic tool-calling format (`tool_code` / `default_api`)\n\nTool-calling convention of Google's hosted **Gemini** models (current generation, incl. `gemini-3.5-flash` / `*-pro` / `*-preview`) and the **Gemma 3** open-weights family. Both drive tool use **entirely through prompt engineering** — there are **no dedicated special tokens**. The model emits each invocation as **Python source**: a call `default_api.<function_name>(<kwargs>)`, conventionally wrapped in `print(...)` and placed inside a fenced ```` ```tool_code ```` block; it reads results back from a ```` ```tool_outputs ```` block. Because the mechanism is plain text the model was post-trained to produce, the exact same syntax periodically leaks into ordinary output (surfaced by Vertex/AI-Studio as `finish_reason = MALFORMED_FUNCTION_CALL`) — that leak is the clearest public evidence of the format.\n\nVerified against: the official Gemma 3 function-calling guide (`ai.google.dev/gemma/docs/capabilities/function-calling` — the two recommended prompts, one Pythonic and one JSON), Simon Willison's transcription of those two prompts, Philipp Schmid's Gemma 3 walkthrough (`philschmid.de/gemma-function-calling`), and the reverse-engineered hosted-Gemini form recovered from `MALFORMED_FUNCTION_CALL` reports: `google/adk-go#492` (`Malformed function call: print(default_api.`), `google-gemini/cookbook#929` (`executableCode` part = `print(default_api.get_complaint_number_tool(consumer_number_or_mobile_number='2001234567'))`), `firebase/genkit#2628` (the ```` ```tool_code ```` markdown wrapper), and the Google AI dev-forum thread \"Gemini 2 flash returns raw markdown instead of function call\" (71964).\n\n## \"Special\" tokens\n\n**None.** Nothing here is a control token in the tokenizer's special-token table — every marker below BPE-splits into ordinary text and survives a `skip_special_tokens=True` decode. This is the defining property of the convention and the reason it both (a) works across hosted Gemini and open Gemma without tokenizer support and (b) leaks. The functional markers are:\n\n| Marker (verbatim) | Role |\n|---|---|\n| ` ```tool_code ` | Opens a fenced block whose body is Python the app must execute. Closed by a bare ` ``` `. |\n| ` ```tool_outputs ` | Opens a fenced block carrying the executed results back to the model. Closed by a bare ` ``` `. |\n| `default_api` | Synthetic module namespace the hosted stack bundles un-namespaced tools into. Calls read `default_api.<name>(...)`. |\n| `print(...)` | Conventional wrapper around the call in the hosted-Gemini form (the model is trained to \"print\" the call). Semantically irrelevant — the runtime parses the call, it does not execute Python. |\n\nThere is **no** per-call id on the wire and **no** in-band reasoning marker — Gemini reasoning travels out of band as API \"thought signatures\", never as `<think>`-style text.\n\n> **OMP dialect note:** because this convention carries no native in-band reasoning marker, the OMP `gemini` dialect layers a sibling fenced ` ```thinking ` block (closed by a bare ` ``` `, exactly like ` ```tool_code `) so prompt-driven Gemini / Gemma-3 deployments can express reasoning in-band. This is an OMP convention, not part of Google's format.\n\n## Roles / turn structure\n\nThe Pythonic payload is independent of the envelope, and the envelope differs by deployment:\n\n- **Hosted Gemini** uses the normal `contents[]` turn structure (`role: \"user\" | \"model\"`); the `tool_code` block appears inside a `model` turn's text, and `tool_outputs` is supplied as the next turn.\n- **Gemma 3** (open weights) uses the Gemma chat template (`<start_of_turn>user … <end_of_turn>` / `<start_of_turn>model`); the tool prompt is prepended to the first user turn and the blocks live inside model/user turns.\n\nThis document specifies the **payload** (the two fenced blocks + the Python call form); the surrounding turn tokens belong to whichever template hosts it.\n\n## Tool definitions\n\nTools are advertised in the prompt as a JSON-Schema catalog. Gemma 3's official guide ships **two** interchangeable system-prompt templates that differ only in how the model is told to answer:\n\n1. **Pythonic** (the one this spec targets):\n > You have access to functions. If you decide to invoke any of the function(s), you MUST put it in the format of `[func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]`\n > You SHOULD NOT include any other text in the response if you call a function\n\n2. **JSON** (the sibling convention — see `qwen3.md` for the closely related Hermes shape):\n > … you MUST put it in the format of `{\"name\": function name, \"parameters\": dictionary of argument name and its value}`\n\nHosted Gemini wraps the same idea in markdown fences and the `default_api` namespace. The function signatures themselves are passed as OpenAI-style tool JSON (`{\"type\":\"function\",\"function\":{name,description,parameters}}`).\n\n## Tool-call format\n\nOne call is a Python call expression. The hosted-Gemini canonical form is a `print()` of a `default_api` method, fenced:\n\n````text\n```tool_code\nprint(default_api.get_current_temperature(location=\"London\", unit=\"celsius\"))\n```\n````\n\nAll of the following are accepted equivalents seen in the wild and across Gemma/Gemini variants; a robust parser normalizes them to `{name, arguments}`:\n\n- `print(default_api.NAME(KWARGS))` — hosted Gemini canonical.\n- `default_api.NAME(KWARGS)` — `print`/namespace are optional sugar.\n- `NAME(KWARGS)` — bare call (Gemma 3 Pythonic prompt).\n- `result = NAME(KWARGS)` — assignment form (Gemma 3 docs use `result = convert(...)`).\n\nArgument values are **Python literals**, not JSON:\n\n| Python literal | Example | Decoded |\n|---|---|---|\n| string | `'London'` or `\"London\"` | `\"London\"` |\n| int / float | `42`, `3.14` | `42`, `3.14` |\n| bool | `True` / `False` | `true` / `false` |\n| null | `None` | `null` |\n| list | `[\"a\", \"b\"]` | `[\"a\",\"b\"]` |\n| dict | `{\"k\": 1}` | `{\"k\":1}` |\n\nStrings use Python escaping (`\\n`, `\\t`, `\\\\`, `\\'`, `\\\"`); hosted Gemini emits single quotes (`location='London'`), Gemma examples use double quotes — both are valid. Arguments are keyword form (`name=value`); positional arguments are not used because the runtime maps to a named schema.\n\n## Multiple / parallel tool calls\n\nTwo encodings exist, both inside a single `tool_code` block:\n\n- **Gemma 3 Pythonic prompt** — a Python **list** of call expressions:\n ````text\n ```tool_code\n [get_current_temperature(location=\"London\"), get_temperature_date(location=\"London\", date=\"2024-10-01\")]\n ```\n ````\n- **Hosted Gemini** — one `print(default_api...)` **statement per line**:\n ````text\n ```tool_code\n print(default_api.get_current_temperature(location=\"London\"))\n print(default_api.get_temperature_date(location=\"London\", date=\"2024-10-01\"))\n ```\n ````\n\nEither way the calls are returned in source order; the application executes them and returns one result per call in the same order.\n\n## Tool-result format\n\nExecuted results are returned to the model in a ```` ```tool_outputs ```` block. Gemma 3 docs use assignment-style values (`result = 92.3`); for opaque tool output the block simply carries the returned text/JSON:\n\n````text\n```tool_outputs\n{\"temperature\": 26.1, \"location\": \"London\", \"unit\": \"celsius\"}\n```\n````\n\nThe model then continues with either a natural-language answer or another `tool_code` block.\n\n## End-to-end example\n\n````text\n<user>\nWhat's the temperature in London?\n\n<model>\n```tool_code\nprint(default_api.get_current_temperature(location=\"London\", unit=\"celsius\"))\n```\n\n<user>\n```tool_outputs\n{\"temperature\": 11.4, \"location\": \"London\", \"unit\": \"celsius\"}\n```\n\n<model>\nIt's currently 11.4°C in London.\n````\n\n## OpenAI-compatible / native API mapping\n\n- Hosted Gemini's native API normally returns a structured `functionCall` part (`{name, args}`); for Gemini 3 each carries an `id` that must be echoed in the matching `functionResponse`, plus a `thoughtSignature` that must be preserved. The Pythonic text form is what you get when the structured path *fails* (`finish_reason = MALFORMED_FUNCTION_CALL`) or when tool use is driven purely by prompt (Gemma, or Gemini via the code-execution `executableCode` part).\n- When parsed out of an OpenAI-compatible shim, each recovered call becomes `tool_calls[i] = {id (server-minted), type:\"function\", function:{name, arguments:<JSON string>}}` — the Python kwargs are re-serialized to a JSON string at that boundary.\n- Feed results back as the deployment's tool/`functionResponse` turn (hosted) or a `tool_outputs` block in the next user turn (prompt-driven).\n\n## Parsing notes & gotchas\n\n- **Python, not JSON.** `True`/`False`/`None` (not `true`/`false`/`null`), single-quoted strings, and trailing commas are all legal. A JSON parser will reject valid calls; decode Python literals.\n- **Strip the wrapper.** Normalize away `print(...)`, a `default_api.` (or any `module.`) prefix, and an `LHS =` assignment before reading the call name. `print` is never a tool name.\n- **Skip string contents when scanning.** A call like `search(pattern=\"foo(\")` contains a `(` inside a string; a naive `\\w+\\(` scan mis-detects `foo` as a callee. Track string state and only treat top-level `(` as a call opener.\n- **Fence ambiguity.** The body terminates at the first bare ` ``` `; a string argument literally containing ` ``` ` will truncate the block early (rare, accepted limitation).\n- **It leaks.** Because nothing is a special token, the format appears verbatim in normal responses when the model \"decides\" to call a tool but the structured decoder misfires. Production code reading raw text should detect ` ```tool_code ` and parse it; production code on the structured API should retry on `MALFORMED_FUNCTION_CALL`.\n- **Variant divergence.** Gemma **4** abandoned this Pythonic form for a token-delimited brace syntax (`<|tool_call>call:NAME{…}<tool_call|>`) — a different convention documented in `gemma.md`. This spec covers hosted Gemini and Gemma 3.\n\n## Sources\n\n- Gemma 3 function calling (two recommended prompts): https://ai.google.dev/gemma/docs/capabilities/function-calling\n- Simon Willison, \"Function calling with Gemma\": https://simonwillison.net/2025/Mar/26/function-calling-with-gemma/\n- Philipp Schmid, \"Google Gemma 3 Function Calling Example\": https://www.philschmid.de/gemma-function-calling\n- Gemini 3 thought signatures + functionCall ids: https://ai.google.dev/gemini-api/docs/gemini-3\n- `default_api` / `tool_code` leak evidence: https://github.com/google/adk-go/issues/492 · https://github.com/google-gemini/cookbook/issues/929 · https://github.com/firebase/genkit/issues/2628 · https://discuss.ai.google.dev/t/gemini-2-flash-api-returns-raw-markdown-instead-of-function-call/71964\n",
80
+ "toolconv/gemma.md": "# Gemma 4 tool-calling format (token-delimited `call:NAME{…}`)\n\nTool-calling convention of Google's **Gemma 4** open-weights family (`google/gemma-4-*-it`). It is a clean break from the prompt-engineered Pythonic `tool_code` form used by Gemma 3 and hosted Gemini (see `gemini.md`): Gemma 4 introduces **dedicated special tokens** and a compact **token-delimited brace syntax**. Tool declarations, calls, and responses each get their own paired markers, and every string value is wrapped in a `<|\"|>` token rather than ASCII quotes. The model emits one call as `<|tool_call>call:NAME{key:value,…}<tool_call|>`; the developer parses it, runs the tool, and appends `<|tool_response>response:NAME{…}<tool_response|>`.\n\nVerified against: the official \"Function calling with Gemma 4\" guide (`ai.google.dev/gemma/docs/capabilities/text/function-calling-gemma4`), including the byte-exact `processor.apply_chat_template(...)` renderings and the reference `extract_tool_calls` regex it ships. All example streams below are copied from that page (model `google/gemma-4-E2B-it`).\n\n## Special tokens\n\nGemma 4 wraps each structural element in a paired token. Note the **asymmetric pipe placement** — an opener carries the pipe on the left (`<|x>`) and its closer carries it on the right (`<x|>`):\n\n| Open | Close | Purpose |\n|---|---|---|\n| `<bos>` | — | Beginning of sequence |\n| `<|turn>` | `<turn|>` | One conversation turn; the role name is the first line of the body |\n| `<|tool>` | `<tool|>` | A tool **declaration** block (in the system turn) |\n| `<|tool_call>` | `<tool_call|>` | One tool **call** emitted by the model |\n| `<|tool_response>` | `<tool_response|>` | One tool **result** fed back to the model |\n| `<|think|>` | — | Thinking-mode enable flag, injected in the system turn |\n| `<|channel>` | `<channel|>` | Reasoning channel; `<|channel>thought` opens the model's chain-of-thought (closed by `<channel|>`) before the visible reply |\n| `<|\"|>` | `<|\"|>` | String-literal delimiter (same token on both ends) |\n| `<eos>` | — | End of sequence |\n\nBecause the string delimiter is a token (`<|\"|>`), values may contain raw ASCII quotes and commas without escaping — only a literal `<|\"|>` token sequence cannot appear inside a string.\n\nThinking variants emit reasoning in a dedicated channel — `<|channel>thought\\n…\\n<channel|>` at the start of the model turn, before the reply (`enable_thinking` injects a `<|think|>` flag into the system turn). Prior-turn thoughts are stripped from history except on a tool-calling turn, where they are preserved between the call and its response.\n\n## Roles / turn structure\n\nEach turn is `<|turn>{role}\\n{body}<turn|>`. Roles are `system`, `user`, `model`. With a generation prompt the stream ends at `<|turn>model\\n` and the model continues. Tool declarations are merged into the `system` turn; tool calls and the following tool responses are emitted inside the `model` turn (the response block immediately follows the call block in the re-rendered history).\n\n## Tool definitions\n\nEach tool is declared in the system turn as `<|tool>declaration:NAME{…}<tool|>`, where the body is the schema serialized in the same brace syntax used by calls. Types are upper-cased strings (`STRING`, `OBJECT`, …). Byte-exact, from the guide:\n\n```text\n<|tool>declaration:get_current_temperature{description:<|\"|>Gets the current temperature for a given location.<|\"|>,parameters:{properties:{location:{description:<|\"|>The city name, e.g. San Francisco<|\"|>,type:<|\"|>STRING<|\"|>} },required:[<|\"|>location<|\"|>],type:<|\"|>OBJECT<|\"|>} }<tool|>\n```\n\n## Tool-call format\n\nThe model emits one call per `<|tool_call>…<tool_call|>` block. The body is `call:NAME{ARGS}`, where `ARGS` is a comma-separated list of `key:value` pairs:\n\n```text\n<|tool_call>call:get_current_temperature{location:<|\"|>London<|\"|>}<tool_call|>\n```\n\nValue grammar inside `{…}`:\n\n| Value kind | Encoding | Example |\n|---|---|---|\n| string | `<|\"|>text<|\"|>` | `location:<|\"|>London<|\"|>` |\n| int / float | bare | `count:42` |\n| bool | bare | `flag:true` |\n| null | bare | `unit:null` |\n| list | `[v,v,…]` | `tags:[<|\"|>a<|\"|>,<|\"|>b<|\"|>]` |\n| nested object | `{k:v,…}` | `config:{theme:<|\"|>dark<|\"|>}` |\n\nThe reference parser shipped in the guide:\n\n```python\n[{\n \"name\": name,\n \"arguments\": {\n k: cast((v1 or v2).strip())\n for k, v1, v2 in re.findall(r'(\\w+):(?:<\\|\"\\|>(.*?)<\\|\"\\|>|([^,}]*))', args)\n }\n} for name, args in re.findall(r\"<\\|tool_call>call:(\\w+)\\{(.*?)\\}<tool_call\\|>\", text, re.DOTALL)]\n```\n\ni.e. each argument value is either a `<|\"|>…<|\"|>` string or a bare run of non-`,}` characters (cast to int/float/bool, else kept as a string).\n\n## Multiple / parallel tool calls\n\nParallel calls are consecutive `<|tool_call>…<tool_call|>` blocks (one call each), returned in order. The application returns one `<|tool_response>` per call in the same order.\n\n## Tool-result format\n\nEach result is `<|tool_response>response:NAME{…}<tool_response|>`, the response object serialized in the same brace syntax. Byte-exact, from the guide's re-rendered history:\n\n```text\n<|tool_response>response:get_current_weather{temperature:15,weather:<|\"|>sunny<|\"|>}<tool_response|>\n```\n\n## End-to-end example\n\nByte-exact `apply_chat_template` output from the guide (system + tool, user, model call, tool response, final answer — note the response block sits in the same model turn, right after the call):\n\n```text\n<bos><|turn>system\nYou are a helpful assistant.<|tool>declaration:get_current_weather{description:<|\"|>Gets the current weather in a given location.<|\"|>,parameters:{properties:{location:{description:<|\"|>The city and state, e.g. \"San Francisco, CA\" or \"Tokyo, JP\"<|\"|>,type:<|\"|>STRING<|\"|>},unit:{description:<|\"|>The unit to return the temperature in.<|\"|>,enum:[<|\"|>celsius<|\"|>,<|\"|>fahrenheit<|\"|>],type:<|\"|>STRING<|\"|>} },required:[<|\"|>location<|\"|>],type:<|\"|>OBJECT<|\"|>} }<tool|><turn|>\n<|turn>user\nHey, what's the weather in Tokyo right now?<turn|>\n<|turn>model\n<|tool_call>call:get_current_weather{location:<|\"|>Tokyo, JP<|\"|>}<tool_call|><|tool_response>response:get_current_weather{temperature:15,weather:<|\"|>sunny<|\"|>}<tool_response|>The current weather in Tokyo is 15 degrees Celsius and sunny.<turn|>\n```\n\n## Parsing notes & gotchas\n\n- **String delimiter is a token, not a quote.** Inside `<|\"|>…<|\"|>` the bytes `\"` and `,` are literal data — the example `<|\"|>The city and state, e.g. \"San Francisco, CA\"…<|\"|>` contains both. Split arguments on `,`/`}` only **outside** a `<|\"|>…<|\"|>` span.\n- **Asymmetric pipes.** The closer is `<tool_call|>`, not `</tool_call>` or `<|tool_call>`. Matching the wrong pipe side will never close the block.\n- **One call per block.** Unlike a JSON `tool_calls[]` array, parallelism is \"more blocks\", not \"more entries in one block\".\n- **Bare scalars.** A value not wrapped in `<|\"|>` is `true`/`false` → bool, `null`/`none` → null, numeric → number, otherwise a bare string (e.g. an unquoted enum or type name like `STRING`).\n- **Not Gemma 3 / hosted Gemini.** Those use the Pythonic `tool_code` / `default_api` form in `gemini.md`. Gemma 4 replaced it with this token syntax; the two are not interchangeable.\n\n## Sources\n\n- Function calling with Gemma 4 (byte-exact chat-template renderings + reference parser): https://ai.google.dev/gemma/docs/capabilities/text/function-calling-gemma4\n- Gemma 4 prompt formatting: https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4\n",
80
81
  "toolconv/glm-4.5.md": "# GLM-4.5 / GLM-4.6 tool-calling format\n\nNative tool-calling convention of Zhipu AI / Z.ai's **GLM-4.5** family (`zai-org/GLM-4.5` 355B-A32B and `zai-org/GLM-4.5-Air` 106B-A12B, `model_type: \"glm4_moe\"`), shared byte-for-byte by **GLM-4.6**. Unlike the JSON-in-a-tag conventions used by most families, GLM emits each tool call as an **XML-like** block: `<tool_call>{name}` followed by alternating `<arg_key>`/`<arg_value>` element pairs, closed by `</tool_call>`. The prompt is a GLM-style sequence opened by `[gMASK]<sop>` with turn markers `<|system|>`, `<|user|>`, `<|assistant|>`, `<|observation|>`. An inference server turns the raw stream into OpenAI-style `tool_calls` with a parser plus a reasoning parser: both vLLM and SGLang expose `--tool-call-parser glm45 --reasoning-parser glm45` (vLLM additionally needs `--enable-auto-tool-choice`). Tool calling and reasoning are driven entirely by the bundled `chat_template.jinja`; thinking mode is on by default and is disabled per-request with `chat_template_kwargs={\"enable_thinking\": false}`.\n\nThis document was verified against the authoritative `chat_template.jinja` from the HF repo (fetched raw and **rendered locally with Jinja2** — `trim_blocks=True, lstrip_blocks=True`, transformers' `tojson` filter — to produce the byte-exact streams below), `tokenizer_config.json` and `generation_config.json` for the exact token IDs and stop tokens, the model card, and the vLLM (`Glm4MoeModelToolParser`) and SGLang (`Glm4MoeDetector`) parser sources. The HF `resolve`/`blob` web paths redirect to the model-card API; the byte-exact source was obtained via the `resolve/main/...:raw` cache (template commit `cbb2c7cfb52fa128a9660cb1a7a78e017899e115`). The GLM-4.5 and GLM-4.6 `chat_template.jinja` files are identical (same content hash `41478957…`).\n\n## Special tokens\n\nToken IDs are from `tokenizer_config.json` (`added_tokens_decoder`). Note the split: the turn/role markers are registered as **special** tokens, whereas the structural tool-call and thinking tags are each a single dedicated vocabulary token but flagged **`special: false`** (they are emitted/printed as ordinary text, not stripped as control tokens).\n\n| Token (verbatim) | ID | `special` | Purpose |\n|---|---|---|---|\n| `[gMASK]` | 151331 | true | GLM prefix / blank-infilling sentinel; first token of every prompt |\n| `<sop>` | 151333 | true | \"Start of piece\" — immediately follows `[gMASK]` to open the sequence |\n| `<eop>` | 151334 | true | \"End of piece\" (not emitted by the chat template) |\n| `<\\|system\\|>` | 151335 | true | Opens a system turn (and the injected tools turn) |\n| `<\\|user\\|>` | 151336 | true | Opens a user turn (also an EOS id — see below) |\n| `<\\|assistant\\|>` | 151337 | true | Opens an assistant turn / generation prompt |\n| `<\\|observation\\|>` | 151338 | true | Opens a tool-result (observation) turn (also an EOS id) |\n| `<\\|endoftext\\|>` | 151329 | true | End-of-text; `eos_token` and `pad_token` |\n| `<think>` | 151350 | false | Opens the reasoning span inside an assistant turn |\n| `</think>` | 151351 | false | Closes the reasoning span |\n| `<tool_call>` | 151352 | false | Opens one tool call; function name follows on the same line |\n| `</tool_call>` | 151353 | false | Closes one tool call |\n| `<arg_key>` | 151356 | false | Opens an argument-name element |\n| `</arg_key>` | 151357 | false | Closes an argument-name element |\n| `<arg_value>` | 151358 | false | Opens an argument-value element |\n| `</arg_value>` | 151359 | false | Closes an argument-value element |\n| `<tool_response>` | 151354 | false | Wraps one tool result inside an observation turn |\n| `</tool_response>` | 151355 | false | Closes a tool result |\n| `/nothink` | 151360 | true | Soft switch appended to user text to suppress thinking |\n\nNotes on exactness:\n- All pipes are ASCII `|` (U+007C); GLM uses no fullwidth `|` (U+FF5C) or `▁` (U+2581) variants (unlike DeepSeek). Reproduce `<|system|>`, `<|user|>`, `<|assistant|>`, `<|observation|>` exactly, and `[gMASK]` with literal square brackets.\n- Because `<tool_call>`, `<arg_key>`, `<arg_value>`, `<tool_response>`, `<think>` (and their closers) each map to exactly **one** token ID, they cost one token apiece in the stream — but being `special: false` they round-trip through detokenization as plain text. Parsers therefore match them as literal substrings in the decoded text, not as control-token ids.\n- `eos_token_id` is a **list**: `[151329, 151336, 151338]` = `<|endoftext|>`, `<|user|>`, `<|observation|>` (from `generation_config.json`). This is how a tool-call turn ends: after `</tool_call>` the model emits `<|observation|>`, which is an EOS id, so generation halts and the server reports a tool call (see Turn structure).\n\n## Roles / channels / turn structure\n\nEvery prompt begins with the literal two-token prefix `[gMASK]<sop>` (no following newline). Turns are then concatenated, each introduced by its role marker; there is no per-turn terminator token in rendered history (the next marker, or an EOS id during generation, ends a turn).\n\n- **System** (`<|system|>`): role marker, newline, then the message text. When `tools` are supplied, a synthetic tools system turn is rendered **first**, before any user-supplied system turn (the two are separate `<|system|>` blocks — see Tool definitions).\n- **User** (`<|user|>`): role marker, newline, then text. If `enable_thinking` is false, the literal `/nothink` is appended to the user text (unless it already ends with `/nothink`).\n- **Assistant** (`<|assistant|>`): role marker, then a reasoning span and/or visible content and/or tool calls. The reasoning span is `\\n<think>{reasoning}</think>`; visible content follows on its own line; tool calls follow as `<tool_call>…</tool_call>` blocks.\n- **Tool result** (`<|observation|>`): role marker introducing one or more `<tool_response>…</tool_response>` blocks (see Tool-result format).\n\nThinking / reasoning channel:\n- Reasoning lives in `<think>…</think>` inside the assistant turn. The `--reasoning-parser glm45` extracts it into a separate `reasoning_content` field; the visible answer is whatever follows `</think>`.\n- **Only the reasoning of assistant turns after the last user message is kept.** The template renders every earlier assistant turn with an empty `<think></think>` and drops its `reasoning_content` (or any inline `<think>…</think>` embedded in `content`). This keeps stale chains of thought out of the context on later turns.\n- An assistant turn with neither preserved reasoning nor an explicit chain renders `\\n<think></think>` (empty), then content/tool calls.\n\nGeneration prompt (`add_generation_prompt=True`):\n- **Thinking mode (default):** the prompt ends with a bare `<|assistant|>`; the model continues with `\\n<think>…</think>` then its answer or tool calls.\n- **Non-thinking mode** (`enable_thinking=false`): the prompt ends with `<|assistant|>\\n<think></think>`, pre-filling an empty reasoning span so the model goes straight to the answer.\n\nHow a tool-call turn terminates: there is no dedicated \"stop after tool call\" token. The model emits `</tool_call>` and then `<|observation|>` (token 151338), which is one of the three EOS ids, so decoding stops. The server inspects the text, finds `<tool_call>`, and returns `finish_reason: \"tool_calls\"`.\n\n## Tool definitions\n\nWhen the request carries `tools`, the template prepends one `<|system|>` turn containing a fixed preamble, the tool list wrapped in `<tools>…</tools>`, and a literal description of the output format. Each tool is serialized with `tool | tojson(ensure_ascii=False)` — i.e. the **entire OpenAI tool object verbatim**, including the `{\"type\": \"function\", \"function\": {…}}` wrapper, with default JSON spacing (`\", \"` / `\": \"`). One tool per line.\n\n```text\n<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get current weather for a city\", \"parameters\": {\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\", \"description\": \"City name\"}, \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}\n<arg_key>{arg-key-1}</arg_key>\n<arg_value>{arg-value-1}</arg_value>\n<arg_key>{arg-key-2}</arg_key>\n<arg_value>{arg-value-2}</arg_value>\n...\n</tool_call>\n```\n\nThe `<tool_call>{function-name}` / `<arg_key>` / `<arg_value>` lines above are part of the **prompt text** (the format spec the model is told to follow), not an example call. This tools turn is emitted only when `tools` is non-empty, and it is closed implicitly by the next role marker (e.g. a user-supplied `<|system|>` or the first `<|user|>`), with no blank line between them.\n\n## Tool-call format\n\nThe model emits a call as an `<tool_call>` block: the function **name on the same line** as the opening tag, a newline, then one `<arg_key>…</arg_key>` + `<arg_value>…</arg_value>` pair per argument, closed by `</tool_call>`. Minimal single call (assistant generation in thinking mode; reasoning shown for realism):\n\n```text\n<think>The user wants the weather in Beijing. I'll call get_weather.</think>\n<tool_call>get_weather\n<arg_key>location</arg_key>\n<arg_value>Beijing</arg_value>\n<arg_key>unit</arg_key>\n<arg_value>celsius</arg_value>\n</tool_call>\n```\n\nAnatomy and value encoding (this is the single most error-prone part):\n\n- The function name is the text between `<tool_call>` and the first newline — there is **no** wrapping tag around it and **no** space after `<tool_call>`.\n- Each argument is two adjacent elements: `<arg_key>name</arg_key>` then `<arg_value>value</arg_value>`, conventionally one pair per line.\n- **Argument values are NOT uniformly JSON.** The template renders each value as `value | tojson(ensure_ascii=False) if value is not string else value`:\n - **string** values are emitted **raw, without surrounding quotes** → `<arg_value>Beijing</arg_value>` (not `\"Beijing\"`).\n - **non-string** values (number, boolean, null, object, array) are JSON-encoded → `<arg_value>3</arg_value>`, `<arg_value>true</arg_value>`, `<arg_value>{\"k\": 1}</arg_value>`.\n- A **zero-argument** call has no pairs: the name is followed by a newline and the closer — `<tool_call>get_time\\n</tool_call>`.\n\nBecause string values lose their quotes, a parser must decide per argument whether to JSON-decode or treat the value as a literal string. Both reference parsers do this by consulting the tool's JSON Schema: if the parameter's type is `string`, the raw text is taken as-is; otherwise the value is JSON-decoded (with `ast.literal_eval` and raw-string fallbacks). The model is trained to follow the schema, so it emits a bare string exactly when the parameter is string-typed.\n\n## Multiple / parallel tool calls\n\nTwo or more calls in one turn are emitted as consecutive `<tool_call>…</tool_call>` blocks separated by a single newline (no wrapper element around the set). Raw assistant emission for two parallel calls with mixed argument types:\n\n```text\n<think>Two cities. Call get_weather twice in parallel.</think>\n<tool_call>get_weather\n<arg_key>location</arg_key>\n<arg_value>Beijing</arg_value>\n<arg_key>unit</arg_key>\n<arg_value>celsius</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>location</arg_key>\n<arg_value>Shanghai</arg_value>\n<arg_key>days</arg_key>\n<arg_value>3</arg_value>\n<arg_key>verbose</arg_key>\n<arg_value>true</arg_value>\n</tool_call>\n```\n\nNote `Beijing`/`Shanghai`/`celsius` (string) are bare, while `3` (number) and `true` (boolean) are JSON literals. Parsers split on the non-greedy `<tool_call>.*?</tool_call>` regex, so any number of calls is supported; each becomes a separate entry in `tool_calls[]`.\n\n## Tool-result format\n\nResults are returned in an **observation** turn. For a single result: the `<|observation|>` marker, a newline, then the result wrapped in `<tool_response>` / `</tool_response>`:\n\n```text\n<|observation|>\n<tool_response>\n{\"temperature\": 26, \"unit\": \"celsius\", \"condition\": \"Sunny\"}\n</tool_response>\n```\n\nThe content between the tags is inserted **verbatim** (callers typically pass a JSON string, but any text is allowed). For **multiple** results from a set of parallel calls, the `<|observation|>` marker appears **once** and each result gets its own `<tool_response>` block (consecutive `tool`-role messages are merged under a single observation turn):\n\n```text\n<|observation|>\n<tool_response>\n{\"temperature\": 26, \"condition\": \"Sunny\"}\n</tool_response>\n<tool_response>\n{\"temperature\": 30, \"condition\": \"Cloudy\"}\n</tool_response>\n```\n\nThe chat template reads **only** the tool message's `content` — it does not consult any `tool_call_id`. Results are therefore correlated to calls **positionally / by order**, not by an embedded id (GLM's wire format carries no per-call id; see API mapping).\n\n## End-to-end example\n\nA complete multi-turn weather exchange. These are the exact locally rendered streams; newlines inside a turn are literal and turns are otherwise contiguous (no separators between markers).\n\n**Stage 1 — prompt fed to the model** (`tools` set, one prior system message, `add_generation_prompt=True`, thinking mode):\n\n```text\n[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get current weather for a city\", \"parameters\": {\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\", \"description\": \"City name\"}, \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}\n<arg_key>{arg-key-1}</arg_key>\n<arg_value>{arg-value-1}</arg_value>\n<arg_key>{arg-key-2}</arg_key>\n<arg_value>{arg-value-2}</arg_value>\n...\n</tool_call><|system|>\nYou are a helpful assistant.<|user|>\nWhat's the weather in Beijing?<|assistant|>\n```\n\n**Assistant generation** (model output; it ends by emitting `<|observation|>`, an EOS id, so decoding stops there; server returns `finish_reason: \"tool_calls\"`):\n\n```text\n<think>The user wants the weather in Beijing. I'll call get_weather.</think>\n<tool_call>get_weather\n<arg_key>location</arg_key>\n<arg_value>Beijing</arg_value>\n<arg_key>unit</arg_key>\n<arg_value>celsius</arg_value>\n</tool_call>\n```\n\n**Stage 2 — prompt for the next turn**, after appending the assistant tool-call turn and the tool result, then `add_generation_prompt=True`:\n\n```text\n[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get current weather for a city\", \"parameters\": {\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"string\", \"description\": \"City name\"}, \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]}}, \"required\": [\"location\"]}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}\n<arg_key>{arg-key-1}</arg_key>\n<arg_value>{arg-value-1}</arg_value>\n<arg_key>{arg-key-2}</arg_key>\n<arg_value>{arg-value-2}</arg_value>\n...\n</tool_call><|system|>\nYou are a helpful assistant.<|user|>\nWhat's the weather in Beijing?<|assistant|>\n<think>The user wants the weather in Beijing. I'll call get_weather.</think>\n<tool_call>get_weather\n<arg_key>location</arg_key>\n<arg_value>Beijing</arg_value>\n<arg_key>unit</arg_key>\n<arg_value>celsius</arg_value>\n</tool_call><|observation|>\n<tool_response>\n{\"temperature\": 26, \"unit\": \"celsius\", \"condition\": \"Sunny\"}\n</tool_response><|assistant|>\n```\n\n**Final assistant generation** (natural-language answer, terminated by `<|endoftext|>`; `finish_reason: \"stop\"`):\n\n```text\n<think>Got it, 26C and sunny.</think>\nIt's 26°C and sunny in Beijing right now.\n```\n\nTwo subtleties visible above: (1) the reasoning of the assistant tool-call turn is **preserved** in Stage 2 only because it is the segment after the last user message; with another user turn after it, that `<think>…</think>` would be re-rendered empty. (2) The tool-call turn and the observation turn abut directly (`</tool_call><|observation|>`), and the observation abuts the next assistant marker (`</tool_response><|assistant|>`).\n\nFor **non-thinking** mode the user text carries the soft switch and the generation prompt pre-fills an empty think span:\n\n```text\n<|user|>\nHi there/nothink<|assistant|>\n<think></think>\n```\n\n## OpenAI-compatible API mapping\n\nWith a server parser active (`--tool-call-parser glm45 --reasoning-parser glm45`), the raw stream maps onto Chat Completions as follows:\n\n- `choices[].finish_reason` = `\"tool_calls\"` when the output contained at least one `<tool_call>` (otherwise `\"stop\"`).\n- `choices[].message.content` = the text **before** the first `<tool_call>` (normalized to `null` if empty/whitespace). The `<think>…</think>` reasoning is removed by the reasoning parser and surfaced separately as `message.reasoning_content`.\n- `choices[].message.tool_calls[]` — one entry per `<tool_call>…</tool_call>` block:\n - `.id` = a **server-generated** id (e.g. vLLM's `make_tool_call_id()`), **not** present in the model output. GLM emits no per-call id in the stream.\n - `.type` = `\"function\"`.\n - `.function.name` = the text after `<tool_call>` up to the first newline.\n - `.function.arguments` = a **JSON string** (an object), reconstructed from the `<arg_key>`/`<arg_value>` pairs with per-argument typing from the tool schema. vLLM returns `json.dumps(arg_dct, ensure_ascii=False)`, e.g. `\"{\\\"location\\\": \\\"Beijing\\\", \\\"unit\\\": \\\"celsius\\\"}\"`. Clients `json.loads()` it before use.\n- **Request side — tool results** are sent back as `role: \"tool\"` messages, e.g.:\n\n ```json\n {\"role\": \"tool\", \"tool_call_id\": \"call_abc123\", \"content\": \"{\\\"temperature\\\": 26, \\\"unit\\\": \\\"celsius\\\", \\\"condition\\\": \\\"Sunny\\\"}\"}\n ```\n\n The chat template renders only `content` (inside `<tool_response>`); `tool_call_id` is **ignored by the template** and matters only for the client's own bookkeeping. Order results to match the calls.\n- **Request side — assistant tool-call history**: the OpenAI shape carries `function.arguments` as a JSON **string**, but the chat template iterates `arguments.items()` and therefore needs an **object**. vLLM/SGLang parse the string back into a dict before rendering; if you call `tokenizer.apply_chat_template` directly, pass `arguments` as a dict (and optionally `reasoning_content` as a string) or the template will raise.\n- Disable thinking via `extra_body={\"chat_template_kwargs\": {\"enable_thinking\": false}}` (OpenAI Python client) — this flips the template to the `/nothink` + pre-filled `<think></think>` path.\n\n## Parsing notes & gotchas\n\n- **String values are unquoted; typing needs the schema.** The decisive rule: a `<arg_value>` is a literal string iff the parameter is string-typed in the tool's JSON Schema; otherwise it is JSON. vLLM's `_is_string_type` and SGLang's `get_argument_type` both walk `properties[arg].type` (handling `anyOf`/`oneOf`/`enum`/`allOf`/type-arrays). If the schema is missing/loose, they fall back to \"try `json.loads`, then `ast.literal_eval`, then treat as string\" — so a bare word like `celsius` survives as a string, while `26` becomes a number. A string value that *looks* like JSON (e.g. a parameter typed `string` whose value is `{\"a\":1}`) is correctly kept as the literal string only because the schema says `string`.\n- **Extraction regexes (GLM-4.5/4.6).** vLLM: calls via `<tool_call>.*?</tool_call>` (DOTALL); name/body via `<tool_call>([^\\n]*)\\n(.*)</tool_call>`; pairs via `<arg_key>(.*?)</arg_key>\\s*<arg_value>(.*?)</arg_value>`. The name regex **requires a newline** after the name — matching the 4.5/4.6 template. SGLang uses an equivalent `(?:\\\\n|\\n)` form so it also tolerates literal escaped `\\n`.\n- **`</arg_value>` in a value breaks parsing.** Values are captured non-greedily up to the next `</arg_value>`; a value whose text contains `</arg_value>` (or `</tool_call>`) truncates early. There is no escaping mechanism in the wire format.\n- **Tool calls are parsed from `content` only, not from reasoning.** A `<tool_call>` emitted inside `<think>…</think>` is ignored by the tool parser (vLLM's reasoning/tool parsers cooperate so only post-`</think>` content is scanned). Don't expect calls made \"while thinking\" to fire.\n- **Guided decoding is suppressed for GLM.** For `tool_choice: \"required\"` or a named tool, vLLM deliberately does **not** apply JSON structured-outputs/guided decoding, because that would force JSON output and conflict with GLM's XML syntax; the parser extracts from free-form XML instead.\n- **`skip_special_tokens` must be off.** Although the tool/think tags are `special: false`, vLLM forces `skip_special_tokens = False` when tools are enabled (defensive against transformers 5.x detokenization changes) so the literal `<tool_call>`/`</tool_call>` text survives for the regex.\n- **Streaming.** Long string arguments used to be buffered until the closing tag (vLLM issue #32829); the current parser re-parses the accumulated text each delta and emits only the diff, streaming incremental string content with an open-quote-then-fill strategy and holding back any partial trailing tag (`partial_tag_overlap`). The streamed tool name is the text before the first `\\n` or `<arg_key>`. SGLang implements the same as an explicit XML→JSON state machine (`INIT → IN_KEY → WAITING_VALUE → IN_VALUE`). Malformed tails (a missing `</arg_value>` before `</tool_call>`) are closed off heuristically.\n- **Lineage — GLM-4.5 vs GLM-4.6:** identical wire format and identical `chat_template.jinja` (same content hash); the same `glm45` parser serves both.\n- **Lineage — GLM-4.7 / GLM-5 changed the format.** Newer models drop the structural newlines: the function name may sit **directly** before the first `<arg_key>` (no newline), zero-argument calls may be `<tool_call>func</tool_call>`, and parallel calls may be emitted **back-to-back with no separator** (`…</tool_call><tool_call>…`). These require the distinct `Glm47MoeModelToolParser` (vLLM, `structural_tag_model=\"glm_4_7\"`) / `Glm47MoeDetector` (SGLang), whose `func_detail_regex` makes the newline and the argument section optional (`<tool_call>\\s*(\\S+?)\\s*(<arg_key>.*)?</tool_call>`). Do **not** use a GLM-4.7 stream to validate a GLM-4.5 parser or vice versa.\n\n## Sources\n\n- Chat template (authoritative; rendered locally for the byte-exact streams), GLM-4.5 commit `cbb2c7c…`: https://huggingface.co/zai-org/GLM-4.5/resolve/main/chat_template.jinja — the `blob`/web path redirects to the model-card API; verified via the raw `resolve/main` cache.\n- Identical GLM-4.6 template (same content hash, confirming shared format): https://huggingface.co/zai-org/GLM-4.6/resolve/main/chat_template.jinja\n- Special-token IDs and `special` flags (`added_tokens_decoder`, `additional_special_tokens`): https://huggingface.co/zai-org/GLM-4.5/resolve/main/tokenizer_config.json\n- Stop tokens (`eos_token_id = [151329, 151336, 151338]`): https://huggingface.co/zai-org/GLM-4.5/resolve/main/generation_config.json\n- Model card (server flags `--tool-call-parser glm45 --reasoning-parser glm45`, `enable_thinking` switch, parser links): https://huggingface.co/zai-org/GLM-4.5\n- vLLM GLM-4.5/4.6 tool parser (`Glm4MoeModelToolParser`: regexes, schema-driven string typing, JSON-string `arguments`, streaming, `skip_special_tokens`): https://github.com/vllm-project/vllm/blob/main/vllm/tool_parsers/glm4_moe_tool_parser.py\n- vLLM GLM-4.7 tool parser (`Glm47MoeModelToolParser`: same-line name, optional/zero args): https://github.com/vllm-project/vllm/blob/main/vllm/tool_parsers/glm47_moe_tool_parser.py\n- SGLang GLM-4.5/4.6 detector (`Glm4MoeDetector`: format docstring, XML→JSON state machine, argument typing): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/function_call/glm4_moe_detector.py\n- SGLang GLM-4.7 detector (`Glm47MoeDetector`: newline-less / back-to-back calls): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/function_call/glm47_moe_detector.py\n- vLLM tool-calling docs: https://docs.vllm.ai/en/latest/features/tool_calling/\n",
81
82
  "toolconv/harmony.md": "# OpenAI Harmony response format\n\nHarmony is the response format OpenAI trained its open-weight `gpt-oss` models on (`gpt-oss-20b`, `gpt-oss-120b`, released August 2025). It defines the conversation envelope, the multi-channel reasoning/answer separation, and the function-calling wire syntax. The models will not work correctly if prompted without it. The format deliberately mirrors the OpenAI *Responses* API (roles, channels, recipients) rather than the older Chat Completions shape.\n\nTokens are produced with the `o200k_harmony` encoding (the `o200k_base` BPE vocab plus a block of Harmony special tokens; see the table below). The reference renderer/parser is the Rust crate `openai-harmony` (Python bindings: `pip install openai-harmony`; encoding name `HarmonyEncodingName.HARMONY_GPT_OSS`).\n\nYou only deal with raw Harmony if you build your own inference loop. Served through an OpenAI-compatible endpoint the server handles it for you:\n\n- **Ollama / LM Studio / HuggingFace**: Harmony is applied internally; you send normal OpenAI-style JSON.\n- **vLLM**: `vllm serve openai/gpt-oss-120b --enable-auto-tool-choice --tool-call-parser openai --reasoning-parser openai_gptoss`. Note the tool-call parser flag is `openai` (not `harmony`). vLLM also exposes a Harmony-native path through the `/v1/responses` endpoint.\n- **SGLang**: `python3 -m sglang.launch_server --model-path openai/gpt-oss-20b --reasoning-parser gpt-oss --tool-call-parser gpt-oss` (in NVIDIA Dynamo disaggregated mode: `--dyn-tool-call-parser harmony --dyn-reasoning-parser gpt_oss`).\n\nThe chat template shipped with the gpt-oss weights renders these same token sequences from the standard `messages`/`tools` arrays.\n\n## Special tokens\n\nAll Harmony control tokens have the literal form `<|type|>` (ASCII pipes `|`, U+007C — no unicode variants). They are real single tokens in `o200k_harmony`, not text that is BPE-split. The structurally meaningful ones:\n\n| Token (verbatim) | Token ID | Purpose |\n| :--------------- | :------- | :------ |\n| `<\\|start\\|>` | `200006` | Begins a message; immediately followed by the header (role, optional recipient/channel/content-type). |\n| `<\\|end\\|>` | `200007` | Ends a fully-formed message. |\n| `<\\|message\\|>` | `200008` | Header → content transition. Everything after it (until a stop/end token) is the message body. |\n| `<\\|channel\\|>` | `200005` | Introduces the channel field of the header (`analysis` / `commentary` / `final`). |\n| `<\\|constrain\\|>` | `200003` | Marks the content-type / constrained-decoding format in a tool-call header (e.g. `<\\|constrain\\|>json`). |\n| `<\\|return\\|>` | `200002` | Stop token: the model finished its final answer. Decode-time only (see normalization note). |\n| `<\\|call\\|>` | `200012` | Stop token: the model is emitting a tool call and wants it executed. |\n\n`<|return|>` and `<|call|>` are the two valid generation stop tokens — halt inference on either.\n\nThe encoding also defines (same `o200k_harmony` block, IDs `199998`–`200013`) `<|startoftext|>` (199998), `<|endoftext|>` (199999), and reserved slots `<|reserved_200000|>`, `<|reserved_200001|>`, `<|reserved_200004|>`, `<|reserved_200009|>`–`<|reserved_200011|>`, `<|reserved_200013|>`, plus a bulk reserved range `<|reserved_200014|>`…`<|reserved_201088|>`. The renderer additionally knows the names `<|refusal|>`, `<|untrusted|>`, `<|end_untrusted|>`, `<|meta_end|>` but they are not part of the committed gpt-oss vocabulary and do not appear in normal traffic.\n\n## Roles / channels / turn structure\n\n**Message envelope.** Every message is:\n\n```text\n<|start|>{header}<|message|>{content}<|end|>\n```\n\n`{header}` always begins with the role and may carry an optional recipient (`to=...`), channel, and content-type. A completed message ends with `<|end|>`; an assistant message being generated ends instead with a stop token (`<|return|>` or `<|call|>`).\n\n**Roles** (five). The instruction hierarchy used to resolve conflicts is `system` > `developer` > `user` > `assistant` > `tool`.\n\n| Role | Purpose |\n| :--- | :------ |\n| `system` | Identity, knowledge cutoff / current date, reasoning effort, valid-channels declaration, built-in tools. NOT the user-facing \"system prompt\". |\n| `developer` | The conventional \"system prompt\": instructions + the `# Tools` function declarations + (optional) structured-output schema. |\n| `user` | End-user input. |\n| `assistant` | Model output. Carries a channel and, for tool calls, a recipient. |\n| `tool` | Output of an executed tool. The message's *author/role is the tool's own name* (e.g. `functions.get_current_weather`), not the literal word `tool`. |\n\n**Channels** (assistant output only; the channel is mandatory on every assistant message):\n\n| Channel | Purpose |\n| :------ | :------ |\n| `analysis` | Raw chain-of-thought (reasoning). Not held to the same safety bar as `final`; do not show to end users. Built-in `python`/`browser` calls usually go here. |\n| `commentary` | Function tool calls, and user-visible \"preambles\" (action plans) before calling multiple tools. |\n| `final` | The user-facing answer. |\n\n**Reasoning effort** is set in the system message as `Reasoning: high` (or `medium` / `low`; default is medium). The model emits CoT into `analysis` and the answer into `final`.\n\n**CoT carry-over rule.** On the next turn, drop prior `analysis` messages *if* the last assistant turn ended in a `final` message. The exception is an in-progress tool-calling turn: the `analysis` that preceded a tool call MUST be fed back in alongside the tool result so the model can continue its reasoning (the `openai-harmony` renderer does this via `RenderConversationConfig { auto_drop_analysis: true }`).\n\n## Tool definitions\n\nFunction tools are advertised in the **developer** message under a `# Tools` section, inside a TypeScript-style `namespace functions { ... }`. (Built-in `browser`/`python` tools are instead declared in the **system** message under their own `# Tools` / `## browser` / `## python` headings.) The renderer converts each JSON Schema into a TS type with these rules:\n\n- No-arg function → `type name = () => any;`\n- With args → the single parameter is named `_` and its object type is inlined: `type name = (_: { ... }) => any;`\n- Return type is always `any`.\n- A property `description` becomes a `//` comment on the line *above* the field; a JSON Schema `title` renders as `// TITLE` followed by a `//` blank-comment line; `examples` render as `// Examples:` then `// - \"value\"` lines.\n- Optional (non-`required`) fields get a trailing `?`. A `default` renders as a trailing `// default: <value>` comment; an `enum` becomes a `\"a\" | \"b\"` union; `oneOf` becomes a multi-line `|` union; JSON `integer` maps to TS `number`.\n- One blank line separates function definitions; the block closes with `} // namespace functions`.\n\nIf the developer message has no instruction text, the `# Instructions` heading is omitted and the message is just the `# Tools` block. When any function is defined, the system message gains the routing line `Calls to these tools must go to the commentary channel: 'functions'.`\n\nVerbatim developer-message example (instructions + two functions), exactly as the renderer emits it:\n\n```text\n<|start|>developer<|message|># Instructions\n\nUse a friendly tone.\n\n# Tools\n\n## functions\n\nnamespace functions {\n\n// Gets the location of the user.\ntype get_location = () => any;\n\n// Gets the current weather in the provided location.\ntype get_current_weather = (_: {\n// The city and state, e.g. San Francisco, CA\nlocation: string,\nformat?: \"celsius\" | \"fahrenheit\", // default: celsius\n}) => any;\n\n// Gets the current weather in the provided list of locations.\ntype get_multiple_weathers = (_: {\n// List of city and state, e.g. [\"San Francisco, CA\", \"New York, NY\"]\nlocations: string[],\nformat?: \"celsius\" | \"fahrenheit\", // default: celsius\n}) => any;\n\n} // namespace functions<|end|>\n```\n\n## Tool-call format\n\nA function call is an **assistant** message on the **commentary** channel, addressed to the tool via recipient `to=functions.<name>`, with the JSON arguments as the body, terminated by the `<|call|>` stop token.\n\nThe recipient may appear in the *role section* or the *channel section* of the header — both are valid Harmony and the parser accepts either. The model commonly emits it in the channel section. The pi renderer omits the optional content-type marker:\n\n```text\n<|start|>assistant<|channel|>commentary to=functions.get_current_weather<|message|>{\"location\":\"San Francisco, CA\"}<|call|>\n```\n\nSome Harmony serializers include an explicit JSON content type and place the recipient in the role section instead:\n\n```text\n<|start|>assistant to=functions.get_current_weather<|channel|>commentary <|constrain|>json<|message|>{\"location\":\"San Francisco, CA\"}<|call|>\n```\n\nThe arguments body is a raw JSON object. The optional `<|constrain|>json` content-type signals JSON (and is the hook for constrained/grammar-based decoding); the content-type may also be a bare word such as `code` (seen with built-in tools). Built-in tools differ only in channel and recipient: they typically render on `analysis`, with recipient `browser.search` / `browser.open` / `browser.find` or always `python`.\n\n## Multiple / parallel tool calls\n\nHarmony has no special \"parallel\" wrapper. Multiple calls are just multiple consecutive messages. The model may first emit an optional **preamble** — a *user-visible* assistant message on the `commentary` channel (unlike `analysis`, this is meant to be shown) — then one tool-call message per function. Each individual call still ends with its own `<|call|>` stop token, so a host that stops on `<|call|>` collects calls one at a time, executes, feeds the result back, and resumes:\n\n```text\n<|channel|>analysis<|message|>{reasoning}<|end|><|start|>assistant<|channel|>commentary<|message|>**Action plan**:\n1. Generate an HTML file\n2. Generate a JavaScript for the Node.js server\n3. Start the server\n---\nWill start executing the plan step by step<|end|><|start|>assistant<|channel|>commentary to=functions.generate_file<|message|>{\"template\": \"basic_html\", \"path\": \"index.html\"}<|call|>\n```\n\n## Tool-result format\n\nThe executed tool's output is fed back as a message whose **author/role is the tool's name**, addressed back to the assistant (`to=assistant`), on the **commentary** channel, ending with `<|end|>`. This is the canonical (recommended) form:\n\n```text\n<|start|>functions.get_current_weather to=assistant<|channel|>commentary<|message|>{\"sunny\": true, \"temperature\": 20}<|end|>\n```\n\nThe header ordering is `{toolname} to=assistant<|channel|>commentary`. Built-in tool results follow the same shape (e.g. `<|start|>browser.search to=assistant<|channel|>commentary<|message|>{\"result\": \"https://openai.com/\"}<|end|>`). The minimal form the renderer accepts when channel/recipient are not set on the message is just `<|start|>{toolname}<|message|>{output}<|end|>`, but emitting the full `to=assistant<|channel|>commentary` header is what the reference parser round-trips and is recommended. After appending the result, restart generation by emitting the next `<|start|>assistant`.\n\n## End-to-end example\n\nComplete multi-turn weather exchange: system + developer prompt → user question → assistant analysis CoT → assistant commentary tool call → tool result → assistant final answer. This is a single contiguous token stream (newlines inside headers are only between top-level messages for readability; in practice messages are concatenated with no separator).\n\n```text\n<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-06-28\n\nReasoning: high\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.\nCalls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>developer<|message|># Instructions\n\nUse a friendly tone.\n\n# Tools\n\n## functions\n\nnamespace functions {\n\n// Gets the current weather in the provided location.\ntype get_current_weather = (_: {\n// The city and state, e.g. San Francisco, CA\nlocation: string,\nformat?: \"celsius\" | \"fahrenheit\", // default: celsius\n}) => any;\n\n} // namespace functions<|end|><|start|>user<|message|>What is the weather like in SF?<|end|><|start|>assistant<|channel|>analysis<|message|>User wants the weather in San Francisco. Use get_current_weather.<|end|><|start|>assistant<|channel|>commentary to=functions.get_current_weather<|message|>{\"location\":\"San Francisco, CA\"}<|call|><|start|>functions.get_current_weather to=assistant<|channel|>commentary<|message|>{\"sunny\": true, \"temperature\": 20}<|end|><|start|>assistant<|channel|>final<|message|>It's sunny and about 20°C in San Francisco right now.<|return|>\n```\n\nTurn boundaries:\n\n- The host stops generation at `<|call|>`, parses the `commentary` call, runs `get_current_weather`, and appends the `functions.get_current_weather to=assistant` result message.\n- It then appends `<|start|>assistant` and resumes. The preceding `analysis` message is kept (the turn ended in a tool call, not a `final`), so the model can continue its reasoning.\n- Generation stops at `<|return|>`. When this turn is persisted into history for a *later* turn, normalize the trailing `<|return|>` to `<|end|>` (see next note).\n\n**`<|return|>` normalization.** `<|return|>` is a decode-time stop token only. When you store the assistant's reply into history for the next turn, replace the trailing `<|return|>` with `<|end|>` so every stored message is a well-formed `<|start|>{header}<|message|>{content}<|end|>`. (For supervised training targets, ending the example with `<|return|>` is correct.)\n\n## OpenAI-compatible API mapping\n\nWhen a server (vLLM/SGLang/Ollama) bridges Harmony to Chat Completions JSON:\n\n- **`finish_reason`**: `tool_calls` when generation stopped on `<|call|>`; `stop` when it stopped on `<|return|>`.\n- **`message.tool_calls[]`**: one entry per `commentary` `to=functions.*` call. `function.name` is the recipient with the `functions.` namespace stripped (`get_current_weather`). `function.arguments` is a **JSON string** (the verbatim `<|message|>` body), matching OpenAI semantics — not a parsed object.\n- **`tool_call_id`**: Harmony has no native call ID. The server synthesizes one (e.g. `call_abc123`) and is responsible for correlating the follow-up `role:\"tool\"` message back to the Harmony tool-result envelope (recipient `to=functions.<name>` / call order).\n- **Tool result messages** (`{\"role\":\"tool\",\"tool_call_id\":...,\"content\":...}`) are rendered into `<|start|>{toolname} to=assistant<|channel|>commentary<|message|>{content}<|end|>`. The server maps `tool_call_id` → the original function name to build the `{toolname}` author.\n- **Reasoning**: `analysis`-channel text is surfaced as `reasoning_content` (vLLM/SGLang) or as a `reasoning`/`thinking` field, and is generally not echoed back on subsequent requests. `final`-channel text is the normal `message.content`. `commentary` preambles, if surfaced, also map to assistant content.\n- **`tools` / `tool_choice`** request fields are compiled by the chat template into the developer-message `namespace functions { ... }` block; the system message gains the commentary-routing line.\n\n## Parsing notes & gotchas\n\n- **Two stop tokens.** Always stop on both `<|return|>` and `<|call|>`. Stopping only on `<|return|>` will run past tool calls; stopping only on `<|end|>` is wrong for assistant generation.\n- **Recipient position varies.** `to=functions.<name>` may be in the role section (`<|start|>assistant to=...<|channel|>commentary`) or the channel section (`<|channel|>commentary to=...`). A parser must accept both.\n- **Channel is mandatory** on assistant messages; the system message even reminds the model (\"Channel must be included for every message.\"). Missing-channel output is malformed.\n- **Tool author, not `tool`.** The tool-result message's role is the tool's *name* (`functions.get_current_weather`), not the literal string `tool`. Splitting `functions.x` into namespace + function is the parser's job.\n- **CoT dropping is conditional.** Drop `analysis` only when the previous assistant turn ended on `final`. Dropping the `analysis` that immediately precedes a `<|call|>` breaks multi-step tool reasoning.\n- **`arguments` is a string.** Do not double-encode. The body after `<|message|>` is already serialized JSON; pass it through as the `arguments` string.\n- **Content-type variants.** `<|constrain|>json` is optional. If present, it is metadata, not a guarantee of valid JSON. Enforce JSON validity with constrained decoding / your own grammar — the prompt format alone does not guarantee schema adherence (same caveat applies to structured-output `# Response Formats`).\n- **Streaming.** Use a stateful parser (the library ships `StreamableParser`) so partial UTF-8 and the header/channel/recipient/content-type fields are reconstructed incrementally; a naive substring scan mishandles multi-byte splits and the optional header fields. `parse_messages_from_completion_tokens` takes `strict=True|False` — `strict=False` tolerates some malformed headers. Do not pass the trailing stop token into the parser.\n- **Encoding.** Use `o200k_harmony` (the `o200k_base` ranks plus the Harmony specials above). Treat the `<|...|>` tokens as atomic special tokens during both encode and decode; encoding them as ordinary text yields different ranks and corrupts the stream.\n\n## Sources\n\n- OpenAI Cookbook — OpenAI harmony response format: https://cookbook.openai.com/articles/openai-harmony\n- openai/harmony renderer (README): https://github.com/openai/harmony\n- openai/harmony canonical format guide: https://raw.githubusercontent.com/openai/harmony/main/docs/format.md\n- openai/harmony special-token registry (`o200k_harmony` IDs): https://raw.githubusercontent.com/openai/harmony/main/src/tiktoken_ext/public_encodings.rs\n- openai/harmony renderer/parser tests and schema→TS logic: https://raw.githubusercontent.com/openai/harmony/main/src/tests.rs , https://raw.githubusercontent.com/openai/harmony/main/src/encoding.rs\n- openai/harmony test fixtures (verbatim rendered streams): `test-data/test_render_functions_with_parameters.txt`, `test-data/test_does_not_drop_if_ongoing_analysis.txt`, `test-data/test_tool_response_parsing.txt`, `test-data/test_streamable_parser.txt`, `test-data/test_browser_and_function_tool.txt` (https://github.com/openai/harmony/tree/main/test-data)\n- vLLM tool calling / gpt-oss parser flags: https://docs.vllm.ai/en/latest/features/tool_calling/\n- SGLang gpt-oss usage (`--tool-call-parser gpt-oss`): https://docs.sglang.io/basic_usage/gpt_oss.html\n",
82
83
  "toolconv/kimi-k2.md": "# Kimi K2 tool-calling format\n\nNative tool-calling convention of Moonshot AI's **Kimi K2** family (`moonshotai/Kimi-K2-Instruct` and `-Base`, `model_type: \"kimi_k2\"`, 1T-param MoE). It is a ChatML-like envelope built on a TikToken tokenizer (160K vocab): every turn is `<|im_{class}|>{name}<|im_middle|>{body}<|im_end|>`, and tool calls are emitted inside the assistant turn wrapped by a dedicated `<|tool_calls_section_begin|>…<|tool_calls_section_end|>` block. All control tokens are plain ASCII `<|…|>` forms (no fullwidth/unicode variants, unlike DeepSeek). An inference server turns the raw stream into OpenAI-style `tool_calls` with a parser: vLLM and SGLang both expose `--tool-call-parser kimi_k2` (vLLM additionally requires `--enable-auto-tool-choice`). The chat template (a standalone `chat_template.jinja` since the 2025.8.11 update) injects the tool schemas and renders the per-turn markers.\n\nThis document was verified against the model card, the official `docs/tool_call_guidance.md` and `docs/deploy_guidance.md` (GitHub `MoonshotAI/Kimi-K2`), the raw `chat_template.jinja` and `tokenizer_config.json` from the HF repo (rendered locally for the byte-exact streams below), and the vLLM `kimi_k2` tool parser source.\n\n## Special tokens\n\nThe five tool-call markers required for manual parsing, plus the ChatML envelope markers. Token IDs are from `tokenizer_config.json` (`added_tokens_decoder`).\n\n| Token (verbatim) | ID | Purpose |\n|---|---|---|\n| `<\\|tool_calls_section_begin\\|>` | 163595 | Opens the tool-call section inside an assistant turn |\n| `<\\|tool_call_begin\\|>` | 163597 | Opens one individual tool call |\n| `<\\|tool_call_argument_begin\\|>` | 163598 | Separates the tool-call ID from its JSON arguments |\n| `<\\|tool_call_end\\|>` | 163599 | Closes one individual tool call |\n| `<\\|tool_calls_section_end\\|>` | 163596 | Closes the tool-call section |\n| `<\\|im_system\\|>` | 163594 | Start marker for system-class turns (`system`, `tool`, `tool_declare`) |\n| `<\\|im_user\\|>` | 163587 | Start marker for a user turn |\n| `<\\|im_assistant\\|>` | 163588 | Start marker for an assistant turn |\n| `<\\|im_middle\\|>` | 163601 | Separates the role/name header from the message body |\n| `<\\|im_end\\|>` | 163586 | Ends any turn |\n| `[BOS]` | 163584 | Sequence-begin token (see notes; not emitted by the chat template) |\n| `[EOS]` | 163585 | Sequence-end token |\n\nNotes on exactness:\n- The five tool tokens use ASCII pipe `|` (U+007C) and underscores; reproduce them exactly. There are no fullwidth pipe (`|`) or `▁` variants in Kimi K2.\n- `<|im_middle|>` is the only envelope token whose ID (163601) is out of sequence with the others (163586–163599); a `163600` slot is unused.\n- Image inputs render via a content macro as the literal sequence `<|media_start|>image<|media_content|><|media_pad|><|media_end|>`. These media markers appear in the template but are **not** registered in `added_tokens_decoder`, so they tokenize as ordinary text rather than single special tokens. They are irrelevant to text tool calling and are listed here only for completeness.\n\n## Roles / channels / turn structure\n\nKimi K2 uses a ChatML-style envelope. Every message is rendered as:\n\n```text\n<|im_{class}|>{name}<|im_middle|>{body}<|im_end|>\n```\n\n- There are exactly **three** start-marker tokens, chosen by `role`:\n - `user` → `<|im_user|>`\n - `assistant` → `<|im_assistant|>`\n - everything else (`system`, `tool`, and the synthetic `tool_declare`) → `<|im_system|>`\n- The `{name}` segment between the marker and `<|im_middle|>` is `message.name or message.role`. This is the only \"channel\"/sub-role label Kimi K2 has. For ordinary turns it is literally `system`, `user`, or `assistant`; for a tool-result turn it is the tool's `name` (the function name) when supplied, otherwise `tool`; for the tool-schema turn it is the literal `tool_declare`.\n- `<|im_end|>` terminates every turn. The chat template does **not** emit `[BOS]`/`[EOS]`; turn boundaries are purely `<|im_*|>` markers (the tokenizer is TikToken-based with `add_bos_token`/`add_eos_token` unset, and the manual-parse flow feeds the rendered template straight to `/completions`).\n- **Default system prompt:** if the first message is not a `system` message, the template injects `<|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>` before the first turn.\n- **Generation prompt:** with `add_generation_prompt=True` the template ends with `<|im_assistant|>assistant<|im_middle|>`, and the model generates from there.\n- **Thinking/reasoning:** `Kimi-K2-Instruct` is a \"reflex-grade\" model with no long thinking, so there is no reasoning channel in this format. (Thinking variants are handled separately — vLLM ships a distinct `kimi_k2` reasoning parser keyed on a `</think>` token — but that is out of scope for the Instruct tool-call format documented here.)\n\n## Tool definitions\n\nAvailable tools are advertised in a single dedicated turn placed at the very top of the prompt (before any system/user turn), using the synthetic `tool_declare` sub-role under the `<|im_system|>` marker:\n\n```text\n<|im_system|>tool_declare<|im_middle|>{TOOLS_JSON}<|im_end|>\n```\n\n`{TOOLS_JSON}` is the standard OpenAI-style `tools` array serialized to JSON with **compact separators** `(',', ':')` (no spaces). The array elements are passed through verbatim, i.e. each is `{\"type\":\"function\",\"function\":{\"name\":…,\"description\":…,\"parameters\":{…}}}` with a JSON-Schema `parameters` object. Example (single tool, exactly as emitted):\n\n```text\n<|im_system|>tool_declare<|im_middle|>[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information. Call this tool when the user needs to get weather information\",\"parameters\":{\"type\":\"object\",\"required\":[\"city\"],\"properties\":{\"city\":{\"type\":\"string\",\"description\":\"City name\"}}}}}]<|im_end|>\n```\n\nThe `tool_declare` turn is rendered only when `tools` is non-empty.\n\n## Tool-call format\n\nWhen the model decides to call a function, it emits — inside the assistant turn, after any natural-language content — a tool-calls section. Minimal single call (this is the assistant generation that follows `<|im_assistant|>assistant<|im_middle|>`):\n\n```text\n<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{\"city\": \"Beijing\"}<|tool_call_end|><|tool_calls_section_end|>\n```\n\nAnatomy of one call:\n\n```text\n<|tool_call_begin|> functions.{func_name}:{idx} <|tool_call_argument_begin|> {JSON arguments} <|tool_call_end|>\n```\n\n- The token between `<|tool_call_begin|>` and `<|tool_call_argument_begin|>` is the **tool-call ID**, with the fixed form `functions.{func_name}:{idx}`.\n - `functions.` is a literal prefix (it is not derived from the tool schema).\n - `{func_name}` is the called function's name; the function name is recovered by parsing it back out of this ID, not from a separate field.\n - `{idx}` is the **0-based call index** within the current assistant turn (`0` for the first call, `1` for the second, …).\n- After `<|tool_call_argument_begin|>` comes the raw JSON arguments object (e.g. `{\"city\": \"Beijing\"}`), terminated by `<|tool_call_end|>`.\n- All calls of the turn live between one `<|tool_calls_section_begin|>` / `<|tool_calls_section_end|>` pair. Any assistant text content precedes `<|tool_calls_section_begin|>`.\n- The whole assistant turn is still closed by `<|im_end|>` and the completion's `finish_reason` becomes `tool_calls`.\n\n## Multiple / parallel tool calls\n\nTwo or more calls in one turn are emitted as consecutive `<|tool_call_begin|>…<|tool_call_end|>` blocks inside a single section, with the index incrementing per call. Raw assistant emission for two parallel calls:\n\n```text\n<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{\"city\": \"Beijing\"}<|tool_call_end|><|tool_call_begin|>functions.get_weather:1<|tool_call_argument_begin|>{\"city\": \"Shanghai\"}<|tool_call_end|><|tool_calls_section_end|>\n```\n\nNote the IDs `functions.get_weather:0` and `functions.get_weather:1` — same function, distinct trailing index. The index is per-turn (it resets to `0` in the next assistant turn).\n\n## Tool-result format\n\nTool execution results are fed back as a turn with `role: \"tool\"`. Because `tool` is not `user`/`assistant`, it renders under the `<|im_system|>` marker; the sub-role label is the message's `name` (the function name) when present, else `tool`. The body is a literal `## Return of {tool_call_id}` header line followed by the result content:\n\n```text\n<|im_system|>get_weather<|im_middle|>## Return of functions.get_weather:0\n{\"weather\": \"Sunny\"}<|im_end|>\n```\n\n- `{tool_call_id}` echoes the exact ID from the originating call (`functions.get_weather:0`), which is how the model correlates a result with the call that produced it.\n- The result `content` is inserted verbatim on the line after the header; callers typically pass a JSON string (e.g. `json.dumps(tool_result)`).\n- If the `tool` message omits `name`, the envelope becomes `<|im_system|>tool<|im_middle|>## Return of …`.\n\n## End-to-end example\n\nA complete multi-turn weather exchange. These are the exact rendered streams (system + user supplied explicitly; line breaks inside a turn are literal, turns are otherwise contiguous).\n\n**Stage 1 — prompt fed to the model** (`tools` set, `add_generation_prompt=True`):\n\n```text\n<|im_system|>tool_declare<|im_middle|>[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information. Call this tool when the user needs to get weather information\",\"parameters\":{\"type\":\"object\",\"required\":[\"city\"],\"properties\":{\"city\":{\"type\":\"string\",\"description\":\"City name\"}}}}}]<|im_end|><|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|><|im_user|>user<|im_middle|>What's the weather like in Beijing today? Use the tool to check.<|im_end|><|im_assistant|>assistant<|im_middle|>\n```\n\n**Assistant generation** (model output; server reports `finish_reason: \"tool_calls\"`):\n\n```text\n<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{\"city\": \"Beijing\"}<|tool_call_end|><|tool_calls_section_end|><|im_end|>\n```\n\n**Stage 2 — prompt for the next turn**, after appending the assistant tool-call turn and the tool result turn (`add_generation_prompt=True`):\n\n```text\n<|im_system|>tool_declare<|im_middle|>[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information. Call this tool when the user needs to get weather information\",\"parameters\":{\"type\":\"object\",\"required\":[\"city\"],\"properties\":{\"city\":{\"type\":\"string\",\"description\":\"City name\"}}}}}]<|im_end|><|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|><|im_user|>user<|im_middle|>What's the weather like in Beijing today? Use the tool to check.<|im_end|><|im_assistant|>assistant<|im_middle|><|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{\"city\": \"Beijing\"}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>get_weather<|im_middle|>## Return of functions.get_weather:0\n{\"weather\": \"Sunny\"}<|im_end|><|im_assistant|>assistant<|im_middle|>\n```\n\n**Final assistant generation** (model produces natural-language answer terminated by `<|im_end|>`; `finish_reason: \"stop\"`):\n\n```text\nIt's sunny in Beijing today.<|im_end|>\n```\n\n## OpenAI-compatible API mapping\n\nWith a server parser active (`--tool-call-parser kimi_k2`), the raw stream maps onto the Chat Completions shape as follows:\n\n- `choices[].finish_reason` = `\"tool_calls\"` when the turn contained a tool-calls section (otherwise `\"stop\"`).\n- `choices[].message.tool_calls[]` — one entry per `<|tool_call_begin|>…<|tool_call_end|>` block:\n - `.id` = the raw call ID verbatim, e.g. `\"functions.get_weather:0\"`.\n - `.type` = `\"function\"`.\n - `.function.name` = the function name parsed out of the ID. vLLM computes `id.split(\":\")[0].split(\".\")[-1]` → `\"get_weather\"`.\n - `.function.arguments` = a **JSON string** (the raw text captured between `<|tool_call_argument_begin|>` and `<|tool_call_end|>`), e.g. `\"{\\\"city\\\": \\\"Beijing\\\"}\"`. Clients `json.loads()` it before use.\n- Tool results are sent back as messages of the form:\n\n ```json\n {\"role\": \"tool\", \"tool_call_id\": \"functions.get_weather:0\", \"name\": \"get_weather\", \"content\": \"{\\\"weather\\\": \\\"Sunny\\\"}\"}\n ```\n\n `tool_call_id` must equal the `id` returned for the call; `name` becomes the `<|im_system|>{name}<|im_middle|>` sub-role; `content` becomes the body after `## Return of …`.\n- Streaming: deltas arrive as `choices[].delta.tool_calls[]` with an `index`; the function `name`/`id` stream once the call header is complete, then `function.arguments` streams as incremental string fragments to be concatenated (standard OpenAI tool-call streaming assembly).\n\nMoonshot's hosted API (`platform.moonshot.ai`) exposes both OpenAI- and Anthropic-compatible endpoints; the Anthropic-compatible one scales temperature as `real_temperature = request_temperature * 0.6`. Recommended sampling temperature for `Kimi-K2-Instruct` is `0.6`.\n\n## Parsing notes & gotchas\n\n- **ID → name parsing differs between references.** The official `tool_call_guidance.md` extracts the name with `function_id.split('.')[1].split(':')[0]`, which assumes the ID is exactly `functions.{name}` with no extra dots. vLLM uses the more robust `function_id.split(\":\")[0].split(\".\")[-1]` (takes the last dot-segment before `:{idx}`). Prefer the vLLM form so function names containing `.` are handled.\n- **Extraction regexes differ too.** Guidance: `<\\|tool_call_begin\\|>\\s*(?P<tool_call_id>[\\w\\.]+:\\d+)\\s*<\\|tool_call_argument_begin\\|>\\s*(?P<function_arguments>.*?)\\s*<\\|tool_call_end\\|>`. vLLM: ID class is `[^<]+:\\d+` and the argument body uses a negative lookahead `(?:(?!<\\|tool_call_begin\\|>).)*?` so adjacent calls aren't merged. Both run with `DOTALL`.\n- **`skip_special_tokens` must be False.** The parser depends on the literal marker text surviving detokenization; vLLM forces `skip_special_tokens = False` when tools are enabled and `tool_choice != \"none\"`. If markers are stripped, no tool call is detected.\n- **Arguments are unvalidated raw text.** Whatever the model emits between the argument marker and `<|tool_call_end|>` is passed straight through as the `arguments` string; it must be valid JSON for downstream `json.loads`, and the model can emit malformed/truncated JSON. Validate before executing.\n- **Index semantics.** `{idx}` is the per-turn call counter starting at `0`; it is not a global counter and resets each assistant turn. Do not assume IDs are unique across turns — disambiguate by turn when persisting history.\n- **Streaming marker splits.** Section and call markers can be split across token boundaries. vLLM holds back any trailing suffix that partially matches a marker (`partial_tag_overlap`) to avoid leaking marker bytes into streamed content, and only streams a call's name once its header is fully received.\n- **`finish_reason` varies by engine.** The official guide explicitly warns the terminal `finish_reason` for tool calls \"may vary across different engines\"; loop on `finish_reason == \"tool_calls\"` but be defensive.\n- **Engine fallback.** Kimi K2 reuses the DeepSeek-V3 architecture; `config.json` sets `model_type: \"kimi_k2\"` so engines apply the right parser. If you force `model_type: \"deepseek_v3\"` as a compatibility workaround, no native Kimi tool parser is available and you must parse the `<|tool_calls_section_*|>` markers manually.\n- **Parser availability.** vLLM ships both a Python (`KimiK2ToolParser`) and a newer Rust tool parser; SGLang implements its own `kimi_k2` parser. All key off the same five markers and the `functions.{name}:{idx}` ID convention documented here.\n- **Whitespace artifact.** When no `system` message is supplied, the template injects the default system prompt and a small `\\n ` (newline + two spaces) can appear before the first `<|im_user|>` marker. It is harmless (tokenizes around the markers), but supplying an explicit system message yields the clean streams shown above.\n\n## Sources\n\n- Model card (Tool Calling section, OpenAI-style example, deployment/API notes): https://huggingface.co/moonshotai/Kimi-K2-Instruct\n- Official tool-call guidance (markers, ID convention, manual parser, `extract_tool_call_info`): https://raw.githubusercontent.com/MoonshotAI/Kimi-K2/main/docs/tool_call_guidance.md (the HF `resolve`/`blob` paths redirected to the model card; verified against this GitHub raw file)\n- Deployment guide (`--tool-call-parser kimi_k2`, `--enable-auto-tool-choice`, SGLang flag, `model_type` fallback): https://raw.githubusercontent.com/MoonshotAI/Kimi-K2/main/docs/deploy_guidance.md\n- Chat template (`chat_template.jinja`, rendered locally for byte-exact streams): https://huggingface.co/moonshotai/Kimi-K2-Instruct/resolve/main/chat_template.jinja\n- Tokenizer config (special-token IDs in `added_tokens_decoder`): https://huggingface.co/moonshotai/Kimi-K2-Instruct/resolve/main/tokenizer_config.json\n- vLLM `kimi_k2` tool parser (markers, regex, name-parsing, `skip_special_tokens`, streaming): https://github.com/vllm-project/vllm/blob/main/vllm/tool_parsers/kimi_k2_tool_parser.py\n- vLLM PR adding the parser: https://github.com/vllm-project/vllm/pull/20789\n- vLLM tool-calling docs: https://docs.vllm.ai/en/latest/features/tool_calling/\n",