@strav/brain 1.0.0-alpha.15 → 1.0.0-alpha.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tool.ts CHANGED
@@ -23,6 +23,13 @@ export interface ToolContext {
23
23
  readonly callId: string
24
24
  /** Per-run free-form context bag passed by the caller. Optional. */
25
25
  readonly context: Readonly<Record<string, unknown>>
26
+ /**
27
+ * Cancellation signal forwarded from the run's `options.signal`.
28
+ * Tools that wrap network calls (HTTP fetches, MCP servers, child
29
+ * processes) should pass this through so cancellation actually
30
+ * unwinds in-flight work.
31
+ */
32
+ readonly signal?: AbortSignal
26
33
  }
27
34
 
28
35
  export interface Tool<TInput = unknown, TOutput = unknown> {
@@ -0,0 +1,81 @@
1
+ /**
2
+ * `runToolWithRecovery` — shared helper used by every provider's
3
+ * agentic loop to execute one tool call.
4
+ *
5
+ * Encapsulates two error paths and the optional `onToolError`
6
+ * recovery callback:
7
+ *
8
+ * 1. **Tool not registered** — the model called a name that
9
+ * isn't in `toolMap`. Without recovery, throw
10
+ * `ToolExecutionError`. With recovery, the callback's return
11
+ * string becomes the `tool_result.content` (with `isError:
12
+ * true`) and the loop continues — the model sees "unknown
13
+ * tool" and adapts.
14
+ *
15
+ * 2. **`execute()` throws** — the tool's body raised. Same
16
+ * pattern: either rethrow as `ToolExecutionError` or feed
17
+ * back as an error result.
18
+ *
19
+ * The returned shape is the framework-agnostic `{ content, isError }`
20
+ * pair each provider then wraps into its own `tool_result` block
21
+ * shape (Anthropic `tool_result` with `is_error`; OpenAI tool-role
22
+ * message content; Gemini `functionResponse` with `{ error }`).
23
+ */
24
+
25
+ import type { RunWithToolsOptions } from './provider.ts'
26
+ import type { Tool, ToolContext } from './tool.ts'
27
+ import { ToolExecutionError } from './tool_execution_error.ts'
28
+
29
/**
 * Provider-neutral outcome of a single tool call. Each provider wraps this
 * pair into its own native tool-result shape.
 */
export interface ToolRunResult {
  /** Text handed back to the model as the tool's result. */
  content: string
  /** `true` when `content` carries a recovered failure rather than real output. */
  isError: boolean
}
33
+
34
/**
 * Execute one tool call on behalf of a provider's agentic loop.
 *
 * Every failure is routed through `recoverOrThrow`, so it either becomes an
 * `isError: true` result (when `options.onToolError` returns a string) or is
 * thrown as a `ToolExecutionError`. Failures covered:
 *
 * - `tool` is `undefined` — the model called a name that is not registered;
 * - `tool.execute` throws or rejects;
 * - the tool's output cannot be serialised (`JSON.stringify` throws, e.g. on
 *   circular references or `bigint` values).
 *
 * @param tool - The resolved tool, or `undefined` when the lookup failed.
 * @param toolName - Name the model asked for; used in error reporting.
 * @param callId - Identifier of this tool call, forwarded to the tool context.
 * @param input - Arguments passed verbatim to `tool.execute`.
 * @param options - Run options; `context`, `signal` and `onToolError` are read.
 * @returns The tool output as a string with `isError: false`, or the recovery
 *   string with `isError: true`.
 * @throws ToolExecutionError when a failure occurs and no recovery string is
 *   produced by `options.onToolError`.
 */
export async function runToolWithRecovery(
  tool: Tool | undefined,
  toolName: string,
  callId: string,
  input: unknown,
  options: RunWithToolsOptions,
): Promise<ToolRunResult> {
  if (!tool) {
    return recoverOrThrow(
      new ToolExecutionError(
        toolName,
        callId,
        new Error(`Tool "${toolName}" is not registered.`),
      ),
      options,
    )
  }

  const ctx: ToolContext = {
    callId,
    context: options.context ?? {},
    // Only add the key when a signal exists, so `signal` is absent rather
    // than explicitly `undefined`.
    ...(options.signal !== undefined ? { signal: options.signal } : {}),
  }

  try {
    const output = await tool.execute(input, ctx)
    // Serialise inside the `try`: a non-serialisable output is a tool
    // failure and must take the same recovery path as a throwing `execute`.
    return { content: serializeToolOutput(output), isError: false }
  } catch (cause) {
    return recoverOrThrow(new ToolExecutionError(toolName, callId, cause), options)
  }
}

/**
 * Turn a tool's return value into the string sent back to the model.
 *
 * Strings pass through untouched; everything else is JSON-encoded.
 * `JSON.stringify` yields `undefined` (not a string) for `undefined`,
 * functions and symbols, so those collapse to the JSON literal `null` to
 * keep `ToolRunResult.content` a real string.
 */
function serializeToolOutput(output: unknown): string {
  if (typeof output === 'string') return output
  return JSON.stringify(output) ?? 'null'
}
68
+
69
/**
 * Give the caller's `onToolError` hook a chance to turn a tool failure into
 * a model-visible error result.
 *
 * Exported so providers can reuse it for failures that occur outside
 * `tool.execute` — e.g., OpenAI's JSON-parse-arguments path.
 *
 * @param error - The failure to resolve.
 * @param options - Run options; only `onToolError` is consulted.
 * @returns `{ content, isError: true }` when the hook returns a string.
 * @throws The given `error` when no hook is set or the hook returns anything
 *   other than a string.
 */
export function recoverOrThrow(
  error: ToolExecutionError,
  options: RunWithToolsOptions,
): ToolRunResult {
  // Invoke as a method on `options` so the hook's `this` binding is unchanged.
  const fallback = options.onToolError?.(error)
  if (typeof fallback === 'string') {
    return { content: fallback, isError: true }
  }
  throw error
}
package/src/types.ts CHANGED
@@ -105,8 +105,79 @@ export interface MCPToolResultBlock {
105
105
  isError?: boolean
106
106
  }
107
107
 
108
/**
 * A picture attached to a user message so that vision-capable models can
 * read it next to the text.
 *
 * Vision support depends on both the provider and the model. Cloud picks:
 * Anthropic Claude 4 family, OpenAI gpt-4o / gpt-5 family, Gemini 2.x.
 * Local (Ollama): `llama3.2-vision`, `llava`, `qwen2.5-vl`. A model without
 * vision either rejects the call or ignores the image.
 */
export interface ImageBlock {
  type: 'image'
  /** Where the image comes from; discriminated on `type`. */
  source:
    | {
        /**
         * Inline bytes — uploads, screenshots, attachments the app already
         * holds in memory.
         */
        type: 'base64'
        /** IANA MIME type: `image/png`, `image/jpeg`, `image/webp`, `image/gif`. */
        mediaType: string
        /**
         * Base64-encoded image without a `data:` prefix; the provider
         * translation adds the prefix where needed.
         */
        data: string
      }
    | {
        /**
         * Remote image. Anthropic, OpenAI and Gemini all accept HTTPS URLs;
         * check the provider's domain allowlist if calls 404 (Anthropic was
         * historically stricter). Gemini also accepts GCS URIs (`gs://...`).
         */
        type: 'url'
        url: string
      }
}
136
+
137
/**
 * A document attached to a user message. V1 is PDF-only: the providers that
 * support documents currently all gate on `application/pdf`.
 *
 * Provider handling:
 * - Anthropic surfaces it as a native `document` block.
 * - Gemini accepts it via `inlineData` / `fileData` with the
 *   `application/pdf` mime.
 * - OpenAI / Ollama / DeepSeek don't support PDF blocks at all; apps split
 *   the PDF into images and send `ImageBlock`s to those vendors instead.
 */
export interface DocumentBlock {
  type: 'document'
  /** Inline base64 bytes or a remote URL; discriminated on `type`. */
  source:
    | {
        type: 'base64'
        mediaType: string
        data: string
      }
    | {
        type: 'url'
        url: string
      }
  /**
   * Optional label shown to the model on Anthropic — handy for
   * multi-document calls ("the contract", "the invoice"). Other providers
   * ignore it.
   */
  title?: string
}
158
+
159
/**
 * A sound clip attached to a user message.
 *
 * V1 coverage:
 * - Gemini supports audio natively via `inlineData` with audio MIMEs
 *   (`audio/mp3`, `audio/wav`, `audio/ogg`, `audio/flac`, `audio/webm`,
 *   `audio/aac`).
 * - Anthropic, OpenAI and Ollama don't accept audio in their chat APIs:
 *   OpenAI apps preprocess via Whisper; Anthropic apps wait for the audio
 *   block to land in the SDK; Ollama apps that need audio look at
 *   server-side transcription models.
 */
export interface AudioBlock {
  type: 'audio'
  /** Inline base64 bytes or a remote URL; discriminated on `type`. */
  source:
    | {
        type: 'base64'
        mediaType: string
        data: string
      }
    | {
        type: 'url'
        url: string
      }
}
175
+
108
176
  export type ContentBlock =
109
177
  | TextBlock
178
+ | ImageBlock
179
+ | DocumentBlock
180
+ | AudioBlock
110
181
  | ToolUseBlock
111
182
  | ToolResultBlock
112
183
  | MCPToolUseBlock
@@ -134,6 +205,55 @@ export type SystemPrompt =
134
205
  * escape hatch in `ChatResult` is what they reach for when they need
135
206
  * provider-specific fields.
136
207
  */
208
/**
 * A tool that runs on the provider's backend on behalf of the model.
 *
 * Framework-local tools (`Tool` / `defineTool`) round-trip through the
 * app's process; a server tool does not — the provider executes it and
 * inlines the result in the response.
 *
 * V1 coverage by provider:
 * - **Anthropic**: `web_search`, `code_execution`, `web_fetch`.
 * - **Gemini**: `web_search` (Google Search), `code_execution`,
 *   `url_context`.
 * - **OpenAI / DeepSeek / Ollama**: throw. OpenAI's server tools live on
 *   the Responses API (separate slice); the compat providers don't expose
 *   them.
 *
 * Portability across providers:
 * - `web_search` and `code_execution` work on both Anthropic and Gemini.
 * - `web_fetch` is Anthropic-only.
 * - `url_context` is Gemini-only.
 *
 * Server tools can be combined freely with framework-local `Tool[]` and MCP
 * servers; the model sees all three sets in a single tool list.
 */
export type ServerTool =
  | {
      type: 'web_search'
      /** Cap on calls to this tool per turn. Anthropic honours it; Gemini ignores it. */
      maxUses?: number
      /**
       * Domains the search may use. Anthropic honours it; Gemini ignores it.
       * Mutually exclusive with `blockedDomains`.
       */
      allowedDomains?: readonly string[]
      /** Domains the search must avoid. Anthropic honours it; Gemini ignores it. */
      blockedDomains?: readonly string[]
    }
  | {
      type: 'code_execution'
    }
  | {
      type: 'web_fetch'
      /** Cap on URL fetches per turn (Anthropic). */
      maxUses?: number
      /** Domain allowlist. */
      allowedDomains?: readonly string[]
      /** Domain blocklist. */
      blockedDomains?: readonly string[]
    }
  | {
      /** Gemini fetches the URL and surfaces grounded answers from it. */
      type: 'url_context'
    }
256
+
137
257
  export interface ChatOptions {
138
258
  /** Override the configured default model. Wins over `tier`. */
139
259
  model?: string
@@ -168,6 +288,26 @@ export interface ChatOptions {
168
288
  * provider by config; this is the override for that.
169
289
  */
170
290
  provider?: string
291
+ /**
292
+ * Cancel the in-flight operation. Aborting between iterations of
293
+ * a tool loop bails before the next model call; aborting mid-call
294
+ * propagates the SDK's native abort error (typically a `DOMException`
295
+ * with `name: 'AbortError'`). Streaming iterators reject on the
296
+ * next `for await` step.
297
+ */
298
+ signal?: AbortSignal
299
+ /**
300
+ * Server-side tools — work the provider's backend runs (web
301
+ * search, code execution, URL fetching). The model's calls
302
+ * don't round-trip through the framework's tool loop; results
303
+ * land inline in the response. Combines freely with
304
+ * framework-local `Tool[]` and MCP servers.
305
+ *
306
+ * V1 supports Anthropic + Gemini; OpenAI / DeepSeek / Ollama
307
+ * throw `BrainError` (use the Responses API for OpenAI, or
308
+ * route to Anthropic / Gemini).
309
+ */
310
+ serverTools?: readonly ServerTool[]
171
311
  }
172
312
 
173
313
  /** Token usage for a single call. Cache-hit fields are populated when caching is in play. */
@@ -201,6 +341,99 @@ export type StreamEvent =
201
341
  | { type: 'text'; delta: string }
202
342
  | { type: 'stop'; stopReason: string | null; usage: ChatUsage }
203
343
 
344
/**
 * Options accepted by a single `brain.embed(...)` call.
 *
 * This is the embed-relevant subset of `ChatOptions`; the chat-specific
 * knobs (system prompt, thinking, cache, tools) don't apply here.
 */
export interface EmbedOptions {
  /** Embedding model to use instead of the configured default. */
  model?: string
  /**
   * Provider to use instead of the default. It must implement `embed`
   * (V1: OpenAI, Gemini, Ollama). Anthropic and DeepSeek throw with a clear
   * "route to a different provider" message.
   */
  provider?: string
  /**
   * Optional dimensionality hint. OpenAI receives it as `dimensions`,
   * Gemini as `outputDimensionality`; providers that ignore it silently
   * drop the field.
   */
  dimensions?: number
  /** Cancellation signal — same shape as `ChatOptions.signal`. */
  signal?: AbortSignal
}
368
+
369
/** Options accepted by a single `brain.transcribe(...)` call. */
export interface TranscribeOptions {
  /** Transcription model to use instead of the configured default. */
  model?: string
  /**
   * Provider to use instead of the default. It must implement `transcribe`
   * (V1: OpenAI / Gemini / Ollama); Anthropic and DeepSeek throw.
   */
  provider?: string
  /**
   * Optional BCP-47 language hint (`en`, `fr`, `ja`). Improves accuracy
   * when the language is known; models without hint support ignore it.
   */
  language?: string
  /**
   * Optional bias prompt that steers vocabulary / style / formatting.
   * OpenAI calls this `prompt`; Gemini-via-chat threads it into the system
   * message; other providers ignore it.
   */
  prompt?: string
  /** Cancellation signal — same shape as `ChatOptions.signal`. */
  signal?: AbortSignal
}
395
+
396
/**
 * Where a piece of audio comes from. This is the same discriminated union
 * as `AudioBlock.source`; it has its own name because `transcribe(...)`
 * takes the source directly, without a wrapping `AudioBlock` shell.
 */
export type AudioSource =
  | {
      type: 'base64'
      mediaType: string
      data: string
    }
  | {
      type: 'url'
      url: string
    }
404
+
405
/**
 * What one `transcribe` call returns.
 *
 * `language` and `duration` are filled in only when the provider reports
 * them: OpenAI does on the `verbose_json` response format; Gemini's
 * chat-wrap path doesn't.
 */
export interface TranscribeResult<Raw = unknown> {
  /** The transcribed audio. */
  text: string
  model: string
  /** BCP-47 detected (or echoed) language. Optional. */
  language?: string
  /** Audio duration in seconds. Optional. */
  duration?: number
  /** The provider's full native response, for fields the framework doesn't surface. */
  raw: Raw
}
422
+
423
/** What one `embed` call returns. */
export interface EmbedResult<Raw = unknown> {
  /** One vector per input text: `embeddings[i]` belongs to the i-th input. */
  embeddings: number[][]
  /** The model the provider used, echoed back for logging. */
  model: string
  /** `inputTokens` is the total number of tokens consumed across all inputs. */
  usage: { inputTokens: number }
  /** The provider's full native response — escape hatch for fields the framework doesn't surface. */
  raw: Raw
}
436
+
204
437
  /**
205
438
  * Result of a structured-output call. `value` is the parsed JSON
206
439
  * shaped to the `OutputSchema<T>` passed in. `text` is the raw JSON