npm - @strav/brain - Versions diffs - 1.0.0-alpha.16 → 1.0.0-alpha.18 - Mend

@strav/brain 1.0.0-alpha.16 → 1.0.0-alpha.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/package.json +4 -2
package/src/agent.ts +34 -5
package/src/agent_generate_result.ts +2 -0
package/src/agent_result.ts +7 -0
package/src/agent_runner.ts +134 -15
package/src/agent_stream_event.ts +100 -0
package/src/brain_config.ts +91 -1
package/src/brain_manager.ts +287 -6
package/src/brain_provider.ts +25 -1
package/src/index.ts +37 -2
package/src/mcp/client.ts +99 -13
package/src/mcp/index.ts +7 -0
package/src/mcp/oauth.ts +227 -0
package/src/mcp/pool.ts +106 -0
package/src/mcp/resolve_mcp_tools.ts +31 -9
package/src/mcp_server.ts +16 -0
package/src/persistence/brain_message.ts +34 -0
package/src/persistence/brain_message_repository.ts +106 -0
package/src/persistence/brain_store.ts +166 -0
package/src/persistence/brain_suspended_run.ts +30 -0
package/src/persistence/brain_suspended_run_repository.ts +68 -0
package/src/persistence/brain_thread.ts +30 -0
package/src/persistence/brain_thread_repository.ts +65 -0
package/src/persistence/database_brain_store.ts +190 -0
package/src/persistence/index.ts +48 -0
package/src/persistence/schema/brain_message_schema.ts +61 -0
package/src/persistence/schema/brain_suspended_run_schema.ts +58 -0
package/src/persistence/schema/brain_thread_schema.ts +50 -0
package/src/persistence/schema/index.ts +3 -0
package/src/provider.ts +145 -1
package/src/providers/anthropic_provider.ts +723 -38
package/src/providers/deepseek_provider.ts +117 -0
package/src/providers/gemini_provider.ts +625 -33
package/src/providers/ollama_provider.ts +86 -0
package/src/providers/openai_compat_provider.ts +616 -0
package/src/providers/openai_provider.ts +801 -43
package/src/providers/openai_responses_provider.ts +1015 -0
package/src/suspended_run.ts +153 -0
package/src/thread.ts +40 -1
package/src/tool.ts +7 -0
package/src/tool_runner.ts +81 -0
package/src/types.ts +343 -0

package/src/types.ts CHANGED Viewed

@@ -105,12 +105,113 @@ export interface MCPToolResultBlock {
   isError?: boolean
 }
+/**
+ * Image input — attaches a picture to a user message so vision-
+ * capable models can see it alongside the text. This block covers
+ * still images only; PDFs and audio clips use the separate
+ * `DocumentBlock` / `AudioBlock` types, and video is deferred.
+ *
+ * `type` is the literal tag that identifies this member of the
+ * `ContentBlock` union.
+ *
+ * `source` is a discriminated union:
+ *   - `{ type: 'base64', mediaType, data }` — inline bytes for
+ *     uploads, screenshots, attachments your app already holds in
+ *     memory. `mediaType` is the IANA MIME (`image/png`,
+ *     `image/jpeg`, `image/webp`, `image/gif`); it is typed as a
+ *     plain `string`, so the compiler does not check it against
+ *     that list. `data` is the base64-encoded image (no `data:`
+ *     prefix — the provider translation adds it where needed).
+ *   - `{ type: 'url', url }` — remote image URL. Anthropic, OpenAI,
+ *     and Gemini all accept HTTPS URLs; check the provider's
+ *     domain allowlist if calls 404 (Anthropic was historically
+ *     stricter). For Gemini, GCS URIs (`gs://...`) also work.
+ *
+ * Vision support is provider- AND model-dependent. Cloud picks:
+ * Anthropic Claude 4 family, OpenAI gpt-4o / gpt-5 family, Gemini
+ * 2.x. Local: `llama3.2-vision`, `llava`, `qwen2.5-vl` on Ollama.
+ * Models without vision either reject the call or ignore the image.
+ */
+export interface ImageBlock {
+  type: 'image'
+  source:
+    | { type: 'base64'; mediaType: string; data: string }
+    | { type: 'url'; url: string }
+}
+/**
+ * Document input — attaches a document to a user message. V1 is
+ * PDF-only: every provider that supports documents currently gates
+ * on `application/pdf`, so that is the `mediaType` to send for a
+ * base64 source (the field is typed as a plain `string` and is not
+ * checked at compile time).
+ *
+ * Provider handling:
+ *   - Anthropic surfaces it as a native `document` block.
+ *   - Gemini accepts it via `inlineData` / `fileData` with the
+ *     `application/pdf` mime.
+ *   - OpenAI / Ollama / DeepSeek don't support PDF blocks at all;
+ *     apps split the PDF to images and use `ImageBlock`s for
+ *     those vendors.
+ *
+ * The optional `title` is shown to the model on Anthropic (helpful
+ * for multi-document calls — "the contract", "the invoice"); other
+ * providers ignore it.
+ */
+export interface DocumentBlock {
+  type: 'document'
+  source:
+    | { type: 'base64'; mediaType: string; data: string }
+    | { type: 'url'; url: string }
+  /** Optional title shown to the model. Anthropic uses it; other providers ignore it. */
+  title?: string
+}
+/**
+ * Audio input — attaches a sound clip to a user message.
+ *
+ * `source` has the same shape as the standalone `AudioSource`
+ * type: either inline base64 bytes with a `mediaType`, or a URL.
+ *
+ * V1 coverage:
+ *   - Gemini supports audio natively via `inlineData` with audio
+ *     MIMEs (`audio/mp3`, `audio/wav`, `audio/ogg`, `audio/flac`,
+ *     `audio/webm`, `audio/aac`).
+ *   - Anthropic + OpenAI + Ollama don't accept audio in their chat
+ *     APIs. OpenAI apps preprocess via Whisper; Anthropic apps
+ *     wait for the audio block to land in the SDK; Ollama apps
+ *     that need audio look at server-side transcription models.
+ */
+export interface AudioBlock {
+  type: 'audio'
+  source:
+    | { type: 'base64'; mediaType: string; data: string }
+    | { type: 'url'; url: string }
+}
+/**
+ * Server-side compaction block. Anthropic's `compact-2026-01-12`
+ * beta returns a `compaction` block when an auto-compaction trigger
+ * fires during a request. The framework surfaces it on
+ * `result.content` and Thread persists it on the assistant turn so
+ * subsequent requests echo it back verbatim — the model only sees
+ * the summary + opaque blob from then on, and the older raw turns
+ * stay out of context.
+ *
+ * V1 produces these on Anthropic only. Other providers silently
+ * ignore the `compact` option and never emit a `CompactionBlock`.
+ *
+ * Round-trip invariant: pass the block back unchanged. The
+ * `encryptedContent` blob is opaque metadata the server uses to
+ * stitch the compaction history together; the framework never
+ * mutates it.
+ *
+ * `content === null` means a compaction attempt failed (e.g.,
+ * malformed model output). The server treats these as no-ops on
+ * the next request, so apps don't need to special-case them.
+ *
+ * NOTE(review): `encryptedContent` is also typed as nullable, but
+ * when it is null is not documented here — presumably alongside a
+ * failed compaction; confirm against the provider translation.
+ */
+export interface CompactionBlock {
+  type: 'compaction'
+  /** Summary of the compacted content. Null when the compaction attempt failed. */
+  content: string | null
+  /** Opaque server metadata; round-trip it verbatim on subsequent requests. */
+  encryptedContent: string | null
+}
 export type ContentBlock =
   | TextBlock
+  | ImageBlock // image input for vision-capable models
+  | DocumentBlock // document input (PDF-only in V1)
+  | AudioBlock // audio-clip input
   | ToolUseBlock
   | ToolResultBlock
   | MCPToolUseBlock
   | MCPToolResultBlock
+  | CompactionBlock // server-side compaction summary, round-tripped verbatim
 /** A single conversation turn. `content` can be a bare string or a typed block list. */
 export interface Message {
@@ -134,6 +235,85 @@ export type SystemPrompt =
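  * escape hatch in `ChatResult` is what they reach for when they need
  * provider-specific fields.
  *
  * NOTE(review): this comment appears to document `ChatOptions`, but the
  * `ServerTool` and `CompactConfig` declarations that follow now sit
  * between it and the interface, leaving it detached — consider moving
  * those declarations above this comment.
  */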
+/**
+ * Server-side tool — work the provider's backend runs on behalf
+ * of the model. Unlike framework-local tools (`Tool` /
+ * `defineTool`), the model's call doesn't round-trip through
+ * the app's process; the provider executes the tool and inlines
+ * the result in the response.
+ *
+ * The union is discriminated on `type`; each member carries only
+ * the options that tool accepts.
+ *
+ * V1 coverage:
+ *   - **Anthropic**: `web_search`, `code_execution`, `web_fetch`.
+ *   - **Gemini**: `web_search` (Google Search), `code_execution`,
+ *     `url_context`.
+ *   - **OpenAI / DeepSeek / Ollama**: throw — OpenAI's server tools
+ *     live on the Responses API (separate slice); the compat
+ *     providers don't expose them.
+ *
+ * Cross-provider portability:
+ *   - `web_search` + `code_execution` work on both Anthropic and
+ *     Gemini.
+ *   - `web_fetch` is Anthropic-only.
+ *   - `url_context` is Gemini-only.
+ *
+ * Server tools combine freely with framework-local `Tool[]` and
+ * MCP servers — the model sees all three sets in one tool list.
+ *
+ * NOTE(review): `allowedDomains` and `blockedDomains` are documented
+ * as mutually exclusive on `web_search`, but both are plain optional
+ * fields, so the type does not reject an object that sets both.
+ */
+export type ServerTool =
+  | {
+      type: 'web_search'
+      /** Max times the model can call this tool per turn (Anthropic; Gemini ignores). */
+      maxUses?: number
+      /** Domain allowlist (Anthropic; Gemini ignores). Mutually exclusive with `blockedDomains`. */
+      allowedDomains?: readonly string[]
+      /** Domain blocklist (Anthropic; Gemini ignores). Mutually exclusive with `allowedDomains`. */
+      blockedDomains?: readonly string[]
+    }
+  | { type: 'code_execution' }
+  | {
+      type: 'web_fetch'
+      /** Max URL fetches per turn (Anthropic). */
+      maxUses?: number
+      /** Domain allowlist (Anthropic — `web_fetch` is Anthropic-only). */
+      allowedDomains?: readonly string[]
+      /** Domain blocklist (Anthropic — `web_fetch` is Anthropic-only). */
+      blockedDomains?: readonly string[]
+    }
+  | {
+      type: 'url_context'
+      /** Gemini-only: Gemini fetches the URL and surfaces grounded answers from it. */
+    }
+/**
+ * Per-call compaction configuration. Maps to an `edits[]` entry of
+ * Anthropic's `compact-2026-01-12` beta. All fields are optional —
+ * omitting one falls back to the server's default (trigger:
+ * 150,000 input tokens; no extra instructions; no pause).
+ */
+export interface CompactConfig {
+  /**
+   * Trigger threshold in input tokens. Compaction fires once the
+   * conversation crosses this token count. When omitted, the
+   * server-side default of 150,000 applies.
+   */
+  trigger?: number
+  /**
+   * Extra hint to the summarization model. Useful for biasing the
+   * compaction toward what your app actually cares to preserve
+   * ("keep all customer ids referenced", "preserve every diff
+   * hunk", ...). When omitted, no extra instructions are sent.
+   */
+  instructions?: string
+  /**
+   * When `true`, the server returns the compaction block in-line
+   * but does NOT continue generation — the next assistant turn
+   * waits for an explicit re-prompt. Apps that want to inspect or
+   * gate compaction set this. Defaults to `false`, in which case
+   * compaction is transparent.
+   */
+  pauseAfterCompaction?: boolean
+}
 export interface ChatOptions {
   /** Override the configured default model. Wins over `tier`. */
   model?: string
@@ -168,6 +348,56 @@ export interface ChatOptions {
    * provider by config; this is the override for that.
    */
   provider?: string
+  /**
+   * Cancel the in-flight operation. Aborting between iterations of
+   * a tool loop bails before the next model call; aborting mid-call
+   * propagates the SDK's native abort error (typically a `DOMException`
+   * with `name: 'AbortError'`). Streaming iterators reject on the
+   * next `for await` step.
+   */
+  signal?: AbortSignal
+  /**
+   * Server-side tools — work the provider's backend runs (web
+   * search, code execution, URL fetching). The model's calls
+   * don't round-trip through the framework's tool loop; results
+   * land inline in the response. Combines freely with
+   * framework-local `Tool[]` and MCP servers.
+   *
+   * V1 supports Anthropic + Gemini; OpenAI / DeepSeek / Ollama
+   * throw `BrainError` (use the Responses API for OpenAI, or
+   * route to Anthropic / Gemini).
+   */
+  serverTools?: readonly ServerTool[]
+  /**
+   * Server-side conversation compaction. When set, the provider
+   * auto-summarizes the older part of the message history once the
+   * `trigger` token threshold is reached; the summary lives on the
+   * response as a `CompactionBlock` that apps round-trip on
+   * subsequent requests (Thread does this automatically). Saves
+   * tokens on long threads without lossy client-side pruning.
+   *
+   * Only honored by `AnthropicProvider` (driver `'anthropic'`),
+   * via the `compact-2026-01-12` beta. Silently ignored by every
+   * other provider so apps targeting multiple providers with the
+   * same options object don't have to special-case.
+   */
+  compact?: CompactConfig
+  /**
+   * Stateful conversation pointer — OpenAI Responses API. When set,
+   * the provider sends only the new turn(s); the server picks up
+   * from the prior `Response` identified by this id and replays
+   * the conversation server-side. Saves tokens on long threads.
+   *
+   * Only honored by `OpenAIResponsesProvider` (driver
+   * `'openai-responses'`); silently ignored by every other provider
+   * — apps that target multiple providers with the same options
+   * object don't have to special-case.
+   *
+   * Pair with `ChatResult.responseId` (set on every result from a
+   * provider that supports stateful conversations; undefined
+   * otherwise) to thread the conversation forward. `Thread` does
+   * this automatically when its underlying provider supports it.
+   */
+  previousResponseId?: string
 }
 /** Token usage for a single call. Cache-hit fields are populated when caching is in play. */
@@ -190,6 +420,24 @@ export interface ChatResult<Raw = unknown> {
   stopReason: string | null
   usage: ChatUsage
   raw: Raw
+  /**
+   * Structured assistant content blocks — populated when the model
+   * emitted more than plain text on this turn (compaction blocks
+   * today; reasoning blocks once those surface). Apps that
+   * persist the conversation (`Thread`, custom stores) push this
+   * onto the message history when present so round-trippable
+   * blocks survive subsequent requests. Undefined when the turn
+   * was plain text only.
+   */
+  content?: ContentBlock[]
+  /**
+   * Provider response id when the provider exposes stateful
+   * conversations (currently OpenAI Responses API). Apps thread
+   * this forward via `ChatOptions.previousResponseId` so the
+   * server replays prior turns without re-sending them.
+   * Undefined for providers that don't support the pattern.
+   */
+  responseId?: string
 }
 /**
@@ -201,6 +449,99 @@ export type StreamEvent =
   | { type: 'text'; delta: string }
   | { type: 'stop'; stopReason: string | null; usage: ChatUsage }
+/**
+ * Per-call options for `brain.embed(...)`. This is a deliberately
+ * small option set: chat-specific knobs (system prompt, thinking,
+ * cache, tools) don't apply to embedding calls and are not
+ * accepted here.
+ */
+export interface EmbedOptions {
+  /** Override the configured default embedding model. */
+  model?: string
+  /**
+   * Override the default provider. Must name a provider that
+   * implements `embed` (V1: OpenAI, Gemini, Ollama; Anthropic +
+   * DeepSeek throw with a clear "route to a different provider"
+   * message).
+   */
+  provider?: string
+  /**
+   * Optional dimensionality hint for the returned vectors. OpenAI
+   * receives it as `dimensions`; Gemini as `outputDimensionality`.
+   * Providers without an equivalent silently drop the field.
+   */
+  dimensions?: number
+  /** Cancellation signal — same shape as `ChatOptions.signal`. */
+  signal?: AbortSignal
+}
+/**
+ * Per-call options for `brain.transcribe(...)`. Every field is
+ * optional.
+ */
+export interface TranscribeOptions {
+  /** Override the configured default transcription model. */
+  model?: string
+  /**
+   * Override the default provider. Must name a provider that
+   * implements `transcribe` (V1: OpenAI / Gemini / Ollama;
+   * Anthropic + DeepSeek throw).
+   */
+  provider?: string
+  /**
+   * Optional BCP-47 language hint (`en`, `fr`, `ja`). Improves
+   * accuracy when the language is known; models without hint
+   * support ignore it.
+   */
+  language?: string
+  /**
+   * Optional bias prompt to steer vocabulary / style / formatting.
+   * OpenAI calls this `prompt`; the Gemini-via-chat path threads
+   * it into the system message; other providers ignore it.
+   */
+  prompt?: string
+  /** Cancellation signal — same shape as `ChatOptions.signal`. */
+  signal?: AbortSignal
+}
+/**
+ * Audio source — the same discriminated union as
+ * `AudioBlock.source`, named separately because `transcribe(...)`
+ * takes it directly (no wrapping `AudioBlock` shell).
+ *
+ *   - `{ type: 'base64', mediaType, data }` — inline audio bytes
+ *     plus their MIME type.
+ *   - `{ type: 'url', url }` — audio referenced by URL.
+ */
+export type AudioSource =
+  | { type: 'base64'; mediaType: string; data: string }
+  | { type: 'url'; url: string }
+/**
+ * Result of one `transcribe` call.
+ *
+ *   - `text` is the transcript of the audio.
+ *   - `model` — presumably the model the provider used, as on
+ *     `EmbedResult`; confirm against the provider implementations.
+ *   - `language` / `duration` are surfaced only when the provider
+ *     returns them (OpenAI does on the `verbose_json` response
+ *     format; Gemini's chat-wrap path doesn't).
+ *   - `raw` is the provider's full native response, for fields
+ *     the framework doesn't surface.
+ */
+export interface TranscribeResult<Raw = unknown> {
+  text: string
+  model: string
+  /** BCP-47 tag of the detected (or echoed) language. Absent when the provider doesn't return it. */
+  language?: string
+  /** Audio duration in seconds. Absent when the provider doesn't return it. */
+  duration?: number
+  raw: Raw
+}
+/**
+ * Result of one `embed` call.
+ *
+ *   - `embeddings[i]` is the vector for the i-th input text.
+ *   - `model` is the model the provider used (echoed back for
+ *     logging).
+ *   - `usage.inputTokens` is the total number of tokens consumed
+ *     across all inputs.
+ */
+export interface EmbedResult<Raw = unknown> {
+  embeddings: number[][]
+  model: string
+  usage: { inputTokens: number }
+  /** Provider's full native response — an escape hatch for fields the framework doesn't surface. */
+  raw: Raw
+}
 /**
  * Result of a structured-output call. `value` is the parsed JSON
  * shaped to the `OutputSchema<T>` passed in. `text` is the raw JSON
@@ -214,4 +555,6 @@ export interface GenerateResult<T = unknown, Raw = unknown> {
   stopReason: string | null
   usage: ChatUsage
   raw: Raw
+  /** See `ChatResult.responseId`. */
+  responseId?: string
 }