npm - @strav/brain - Versions diffs - 1.0.0-alpha.16 → 1.0.0-alpha.17 - Mend

@strav/brain 1.0.0-alpha.16 → 1.0.0-alpha.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/package.json +2 -2
package/src/agent.ts +34 -5
package/src/agent_runner.ts +54 -11
package/src/agent_stream_event.ts +100 -0
package/src/brain_config.ts +91 -1
package/src/brain_manager.ts +168 -4
package/src/brain_provider.ts +25 -1
package/src/index.ts +17 -0
package/src/mcp/client.ts +82 -13
package/src/mcp/index.ts +6 -0
package/src/mcp/oauth.ts +227 -0
package/src/mcp/resolve_mcp_tools.ts +6 -2
package/src/mcp_server.ts +16 -0
package/src/provider.ts +109 -0
package/src/providers/anthropic_provider.ts +596 -28
package/src/providers/deepseek_provider.ts +117 -0
package/src/providers/gemini_provider.ts +590 -21
package/src/providers/ollama_provider.ts +86 -0
package/src/providers/openai_compat_provider.ts +187 -0
package/src/providers/openai_provider.ts +735 -32
package/src/providers/openai_responses_provider.ts +700 -0
package/src/tool.ts +7 -0
package/src/tool_runner.ts +81 -0
package/src/types.ts +233 -0

package/src/tool.ts CHANGED Viewed

@@ -23,6 +23,13 @@ export interface ToolContext {
   readonly callId: string
   /** Per-run free-form context bag passed by the caller. Optional. */
   readonly context: Readonly<Record<string, unknown>>
+  /**
+   * Cancellation signal forwarded from the run's `options.signal`.
+   * Tools that wrap network calls (HTTP fetches, MCP servers, child
+   * processes) should pass this through so cancellation actually
+   * unwinds in-flight work.
+   */
+  readonly signal?: AbortSignal
 }
 export interface Tool<TInput = unknown, TOutput = unknown> {

package/src/tool_runner.ts ADDED Viewed

@@ -0,0 +1,81 @@
+/**
+ * `runToolWithRecovery` — shared helper used by every provider's
+ * agentic loop to execute one tool call.
+ *
+ * Encapsulates two error paths and the optional `onToolError`
+ * recovery callback:
+ *
+ *   1. **Tool not registered** — the model called a name that
+ *      isn't in `toolMap`. Without recovery, throw
+ *      `ToolExecutionError`. With recovery, the callback's return
+ *      string becomes the `tool_result.content` (with `isError:
+ *      true`) and the loop continues — the model sees "unknown
+ *      tool" and adapts.
+ *
+ *   2. **`execute()` throws** — the tool's body raised. Same
+ *      pattern: either rethrow as `ToolExecutionError` or feed
+ *      back as an error result.
+ *
+ * The returned shape is the framework-agnostic `{ content, isError }`
+ * pair each provider then wraps into its own `tool_result` block
+ * shape (Anthropic `tool_result` with `is_error`; OpenAI tool-role
+ * message content; Gemini `functionResponse` with `{ error }`).
+ */
+import type { RunWithToolsOptions } from './provider.ts'
+import type { Tool, ToolContext } from './tool.ts'
+import { ToolExecutionError } from './tool_execution_error.ts'
+/**
+ * Provider-agnostic outcome of one tool call. Each provider wraps
+ * this pair into its own `tool_result` block shape.
+ */
+export interface ToolRunResult {
+  /** Text handed back to the model: the tool's output or the recovery message. */
+  content: string
+  /** `true` when `content` came from the `onToolError` recovery callback. */
+  isError: boolean
+}
+/**
+ * Turn a tool's return value into the string sent back to the model.
+ * Strings pass through untouched; everything else is JSON-encoded.
+ *
+ * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
+ * functions and symbols, so those fall back to the JSON literal `null`
+ * to keep `ToolRunResult.content` a real string.
+ *
+ * @throws TypeError when the value cannot be serialized (circular
+ *   structure, BigInt); the caller converts that into a
+ *   `ToolExecutionError`.
+ */
+function serializeToolOutput(output: unknown): string {
+  if (typeof output === 'string') return output
+  const json: string | undefined = JSON.stringify(output)
+  return json ?? 'null'
+}
+/**
+ * Execute one tool call and normalize the outcome.
+ *
+ * @param tool - The resolved tool, or `undefined` when the model
+ *   called a name that is not registered.
+ * @param toolName - Name the model used; reported in errors.
+ * @param callId - Provider-assigned id of this call; forwarded to the
+ *   tool via `ToolContext.callId` and reported in errors.
+ * @param input - Arguments passed to `tool.execute` as-is.
+ * @param options - Run options; `context`, `signal` and `onToolError`
+ *   are the fields read here.
+ * @returns `{ content, isError: false }` on success, or the recovered
+ *   `{ content, isError: true }` when `onToolError` returns a string.
+ * @throws ToolExecutionError when the tool is missing, `execute()`
+ *   throws, or its output cannot be serialized — and `onToolError`
+ *   is unset or does not return a string.
+ */
+export async function runToolWithRecovery(
+  tool: Tool | undefined,
+  toolName: string,
+  callId: string,
+  input: unknown,
+  options: RunWithToolsOptions,
+): Promise<ToolRunResult> {
+  if (!tool) {
+    return recoverOrThrow(
+      new ToolExecutionError(
+        toolName,
+        callId,
+        new Error(`Tool "${toolName}" is not registered.`),
+      ),
+      options,
+    )
+  }
+  const ctx: ToolContext = {
+    callId,
+    context: options.context ?? {},
+    // Spread conditionally so `signal` is absent (not `undefined`) when the caller gave none.
+    ...(options.signal !== undefined ? { signal: options.signal } : {}),
+  }
+  try {
+    const output = await tool.execute(input, ctx)
+    // Serialize inside the `try`: an unserializable result is a tool
+    // failure and must take the same recovery path as a throwing tool.
+    return { content: serializeToolOutput(output), isError: false }
+  } catch (cause) {
+    return recoverOrThrow(new ToolExecutionError(toolName, callId, cause), options)
+  }
+}
+/**
+ * Give the caller's `onToolError` callback a chance to turn a
+ * `ToolExecutionError` into an error result the model can read.
+ * When the callback is unset, or returns anything other than a
+ * string, the original error is thrown instead. Providers also call
+ * this directly for failures that happen outside `tool.execute` —
+ * e.g., OpenAI's JSON-parse-arguments path.
+ *
+ * @param error - The failure to recover from or rethrow.
+ * @param options - Run options; only `onToolError` is read.
+ * @returns `{ content, isError: true }` carrying the callback's string.
+ * @throws ToolExecutionError the same `error` instance, when no
+ *   string was produced.
+ */
+export function recoverOrThrow(
+  error: ToolExecutionError,
+  options: RunWithToolsOptions,
+): ToolRunResult {
+  const fallback = options.onToolError?.(error)
+  if (typeof fallback === 'string') {
+    return { content: fallback, isError: true }
+  }
+  throw error
+}

package/src/types.ts CHANGED Viewed

@@ -105,8 +105,79 @@ export interface MCPToolResultBlock {
   isError?: boolean
 }
+/**
+ * Image input — attaches a picture to a user message so vision-
+ * capable models can see it alongside the text. This block carries
+ * images only; PDFs and audio clips have their own `DocumentBlock`
+ * and `AudioBlock` types.
+ *
+ * `source` is a discriminated union on `type`:
+ *   - `{ type: 'base64', mediaType, data }` — inline bytes for
+ *     uploads, screenshots, attachments your app already holds in
+ *     memory. `mediaType` is the IANA MIME (`image/png`,
+ *     `image/jpeg`, `image/webp`, `image/gif`); `data` is the
+ *     base64-encoded image (no `data:` prefix — the provider
+ *     translation adds it where needed).
+ *   - `{ type: 'url', url }` — remote image URL. Anthropic, OpenAI,
+ *     and Gemini all accept HTTPS URLs; check the provider's
+ *     domain allowlist if requests fail with a 404 (Anthropic was
+ *     historically stricter). For Gemini, GCS URIs (`gs://...`)
+ *     also work.
+ *
+ * Vision support is provider- AND model-dependent. Cloud picks:
+ * Anthropic Claude 4 family, OpenAI gpt-4o / gpt-5 family, Gemini
+ * 2.x. Local: `llama3.2-vision`, `llava`, `qwen2.5-vl` on Ollama.
+ * Models without vision either reject the call or ignore the image.
+ */
+export interface ImageBlock {
+  type: 'image'
+  source:
+    | { type: 'base64'; mediaType: string; data: string }
+    | { type: 'url'; url: string }
+}
+/**
+ * Document input — attaches a PDF to a user message. V1 is
+ * PDF-only: the providers that support documents currently all
+ * gate on `application/pdf`, so that is the `mediaType` to use for
+ * a `base64` source. Anthropic surfaces it as a native `document`
+ * block; Gemini accepts it via `inlineData` / `fileData` with
+ * `application/pdf` mime; OpenAI / Ollama / DeepSeek don't support
+ * PDF blocks at all (apps split the PDF to images and use
+ * `ImageBlock`s for those vendors).
+ *
+ * `source` is either inline base64 bytes or a remote URL.
+ *
+ * The optional `title` is shown to the model on Anthropic (helpful
+ * for multi-document calls — "the contract", "the invoice"); other
+ * providers ignore it.
+ */
+export interface DocumentBlock {
+  type: 'document'
+  source:
+    | { type: 'base64'; mediaType: string; data: string }
+    | { type: 'url'; url: string }
+  /** Optional title shown to the model (Anthropic uses it; other providers ignore it). */
+  title?: string
+}
+/**
+ * Audio input — attaches a sound clip to a user message. `source`
+ * has the same shape as the standalone `AudioSource` type: inline
+ * base64 bytes with a `mediaType`, or a remote URL.
+ *
+ * V1 coverage: Gemini supports audio natively via `inlineData` with
+ * audio MIMEs (`audio/mp3`, `audio/wav`, `audio/ogg`, `audio/flac`,
+ * `audio/webm`, `audio/aac`). Anthropic + OpenAI + Ollama don't
+ * accept audio in their chat APIs — OpenAI apps preprocess via
+ * Whisper; Anthropic apps wait for the audio block to land in the
+ * SDK; Ollama apps that need audio look at server-side
+ * transcription models.
+ */
+export interface AudioBlock {
+  type: 'audio'
+  source:
+    | { type: 'base64'; mediaType: string; data: string }
+    | { type: 'url'; url: string }
+}
 export type ContentBlock =
   | TextBlock
+  | ImageBlock
+  | DocumentBlock
+  | AudioBlock
   | ToolUseBlock
   | ToolResultBlock
   | MCPToolUseBlock
@@ -134,6 +205,55 @@ export type SystemPrompt =
  * escape hatch in `ChatResult` is what they reach for when they need
  * provider-specific fields.
  */
+/**
+ * Server-side tool — work the provider's backend runs on behalf
+ * of the model. Unlike framework-local tools (`Tool` /
+ * `defineTool`), the model's call doesn't round-trip through
+ * the app's process; the provider executes the tool and inlines
+ * the result in the response.
+ *
+ * The union is discriminated on `type`.
+ *
+ * V1 coverage:
+ *   - **Anthropic**: `web_search`, `code_execution`, `web_fetch`.
+ *   - **Gemini**: `web_search` (Google Search), `code_execution`,
+ *     `url_context`.
+ *   - **OpenAI / DeepSeek / Ollama**: throw `BrainError` — OpenAI's
+ *     server tools live on the Responses API (handled separately);
+ *     the compat providers don't expose them.
+ *
+ * Cross-provider portability:
+ *   - `web_search` + `code_execution` work on both Anthropic and
+ *     Gemini.
+ *   - `web_fetch` is Anthropic-only.
+ *   - `url_context` is Gemini-only.
+ *
+ * Server tools combine freely with framework-local `Tool[]` and
+ * MCP servers — the model sees all three sets in one tool list.
+ */
+export type ServerTool =
+  | {
+      type: 'web_search'
+      /** Max times the model can call this tool per turn (Anthropic; Gemini ignores it). */
+      maxUses?: number
+      /** Domain allowlist (Anthropic; Gemini ignores it). Mutually exclusive with `blockedDomains`. */
+      allowedDomains?: readonly string[]
+      /** Domain blocklist (Anthropic; Gemini ignores it). */
+      blockedDomains?: readonly string[]
+    }
+  | { type: 'code_execution' }
+  | {
+      type: 'web_fetch'
+      /** Max URL fetches per turn (Anthropic). */
+      maxUses?: number
+      /** Domain allowlist. */
+      allowedDomains?: readonly string[]
+      /** Domain blocklist. */
+      blockedDomains?: readonly string[]
+    }
+  | {
+      type: 'url_context'
+      /** No options: Gemini fetches the URL and surfaces grounded answers from it. */
+    }
 export interface ChatOptions {
   /** Override the configured default model. Wins over `tier`. */
   model?: string
@@ -168,6 +288,26 @@ export interface ChatOptions {
    * provider by config; this is the override for that.
    */
   provider?: string
+  /**
+   * Cancel the in-flight operation. Aborting between iterations of
+   * a tool loop bails before the next model call; aborting mid-call
+   * propagates the SDK's native abort error (typically a `DOMException`
+   * with `name: 'AbortError'`). Streaming iterators reject on the
+   * next `for await` step.
+   */
+  signal?: AbortSignal
+  /**
+   * Server-side tools — work the provider's backend runs (web
+   * search, code execution, URL fetching). The model's calls
+   * don't round-trip through the framework's tool loop; results
+   * land inline in the response. Combines freely with
+   * framework-local `Tool[]` and MCP servers.
+   *
+   * V1 supports Anthropic + Gemini; OpenAI / DeepSeek / Ollama
+   * throw `BrainError` (use the Responses API for OpenAI, or
+   * route to Anthropic / Gemini).
+   */
+  serverTools?: readonly ServerTool[]
 }
 /** Token usage for a single call. Cache-hit fields are populated when caching is in play. */
@@ -201,6 +341,99 @@ export type StreamEvent =
   | { type: 'text'; delta: string }
   | { type: 'stop'; stopReason: string | null; usage: ChatUsage }
+/**
+ * Per-call options for `brain.embed(...)`. Carries only the
+ * options that make sense for embeddings; chat-specific knobs
+ * (system prompt, thinking, cache, tools) don't apply and are not
+ * accepted here.
+ */
+export interface EmbedOptions {
+  /** Override the configured default embedding model. */
+  model?: string
+  /**
+   * Override the default provider. Must name a provider that
+   * implements `embed` (V1: OpenAI, Gemini, Ollama; Anthropic +
+   * DeepSeek throw with a clear "route to a different provider"
+   * message).
+   */
+  provider?: string
+  /**
+   * Optional dimensionality hint. OpenAI passes it through as
+   * `dimensions`; Gemini as `outputDimensionality`. Providers
+   * that don't support it silently drop the field.
+   */
+  dimensions?: number
+  /** Cancellation signal — same shape as `ChatOptions.signal`. */
+  signal?: AbortSignal
+}
+/**
+ * Per-call options for `brain.transcribe(...)`. Every field is
+ * optional.
+ */
+export interface TranscribeOptions {
+  /** Override the configured default transcription model. */
+  model?: string
+  /**
+   * Override the default provider. Must name a provider that
+   * implements `transcribe` (V1: OpenAI / Gemini / Ollama;
+   * Anthropic + DeepSeek throw).
+   */
+  provider?: string
+  /**
+   * Optional BCP-47 language hint (`en`, `fr`, `ja`). Improves
+   * accuracy when known; models without hint support ignore it.
+   */
+  language?: string
+  /**
+   * Optional bias prompt to steer vocabulary / style / formatting.
+   * OpenAI calls this `prompt`; Gemini-via-chat threads it into
+   * the system message; other providers ignore it.
+   */
+  prompt?: string
+  /** Cancellation signal — same shape as `ChatOptions.signal`. */
+  signal?: AbortSignal
+}
+/**
+ * Audio source — the same discriminated union as
+ * `AudioBlock.source` (inline base64 bytes with a `mediaType`, or
+ * a remote URL), named separately because `transcribe(...)` takes
+ * it directly, with no wrapping `AudioBlock` shell.
+ */
+export type AudioSource =
+  | { type: 'base64'; mediaType: string; data: string }
+  | { type: 'url'; url: string }
+/**
+ * Result of one `transcribe` call. `text` is the transcript of
+ * the audio; `language` / `duration` are surfaced when the
+ * provider returns them (OpenAI does on the `verbose_json`
+ * response format; Gemini's chat-wrap path doesn't). `raw` is the
+ * provider's full native response for fields the framework
+ * doesn't surface; its type is the `Raw` parameter, which
+ * defaults to `unknown`.
+ */
+export interface TranscribeResult<Raw = unknown> {
+  text: string
+  model: string
+  /** BCP-47 detected (or echoed) language. Present only when the provider reports it. */
+  language?: string
+  /** Audio duration in seconds. Present only when the provider reports it. */
+  duration?: number
+  raw: Raw
+}
+/**
+ * Result of one `embed` call. `embeddings[i]` is the vector for
+ * the i-th input text. `model` is the model the provider used
+ * (echoed back for logging). `usage.inputTokens` is the total
+ * tokens consumed across all inputs. The `Raw` type parameter
+ * types the `raw` field and defaults to `unknown`.
+ */
+export interface EmbedResult<Raw = unknown> {
+  embeddings: number[][]
+  model: string
+  usage: { inputTokens: number }
+  /** Provider's full native response — escape hatch for fields the framework doesn't surface. */
+  raw: Raw
+}
 /**
  * Result of a structured-output call. `value` is the parsed JSON
  * shaped to the `OutputSchema<T>` passed in. `text` is the raw JSON