@strav/brain 1.0.0-alpha.16 → 1.0.0-alpha.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,11 +60,21 @@ import type { AgentResult } from '../agent_result.ts'
60
60
  import { BrainError } from '../brain_error.ts'
61
61
  import type { GeminiProviderConfig } from '../brain_config.ts'
62
62
  import type { MCPServer } from '../mcp_server.ts'
63
+ import type { AgentGenerateResult } from '../agent_generate_result.ts'
64
+ import type { AgentStreamEvent } from '../agent_stream_event.ts'
65
+ import type {
66
+ AudioSource,
67
+ EmbedOptions,
68
+ EmbedResult,
69
+ ServerTool,
70
+ TranscribeOptions,
71
+ TranscribeResult,
72
+ } from '../types.ts'
63
73
  import { resolveMcpTools, type ResolveMcpToolsOptions } from '../mcp/resolve_mcp_tools.ts'
64
74
  import { parseGenerated, type OutputSchema } from '../output_schema.ts'
75
+ import { runToolWithRecovery } from '../tool_runner.ts'
65
76
  import type { Provider, RunWithToolsOptions } from '../provider.ts'
66
77
  import type { Tool } from '../tool.ts'
67
- import { ToolExecutionError } from '../tool_execution_error.ts'
68
78
  import type {
69
79
  ChatOptions,
70
80
  ChatResult,
@@ -80,6 +90,7 @@ import type {
80
90
  } from '../types.ts'
81
91
 
82
92
  const DEFAULT_GEMINI_MODEL = 'gemini-2.5-flash'
93
+ const DEFAULT_GEMINI_EMBED_MODEL = 'text-embedding-004'
83
94
 
84
95
  /**
85
96
  * The slice of `GoogleGenAI` the provider exercises. Narrowed so
@@ -91,6 +102,17 @@ export interface GeminiModelsClient {
91
102
  params: GenerateContentParameters,
92
103
  ): Promise<AsyncIterable<GenerateContentResponse>>
93
104
  countTokens(params: { model: string; contents: Content[] }): Promise<{ totalTokens?: number }>
105
+ /**
106
+ * Optional on the test seam — the real SDK always provides it,
107
+ * but tests that don't exercise embed don't need to stub it.
108
+ * `embed()` checks for it before calling and throws a
109
+ * descriptive `BrainError` when it is missing.
110
+ */
111
+ embedContent?(params: {
112
+ model: string
113
+ contents: string[]
114
+ config?: { outputDimensionality?: number; abortSignal?: AbortSignal }
115
+ }): Promise<{ embeddings?: Array<{ values?: number[] }> }>
94
116
  }
95
117
 
96
118
  export interface GeminiProviderOptions {
@@ -104,12 +126,14 @@ export class GeminiProvider implements Provider {
104
126
  private readonly models: GeminiModelsClient
105
127
  private readonly defaultModel: string
106
128
  private readonly defaultMaxTokens: number
129
+ private readonly defaultEmbedModel: string
107
130
  private readonly mcpClientFactory?: ResolveMcpToolsOptions['clientFactory']
108
131
 
109
132
  constructor(name: string, config: GeminiProviderConfig, options: GeminiProviderOptions = {}) {
110
133
  this.name = name
111
134
  this.defaultModel = config.defaultModel ?? DEFAULT_GEMINI_MODEL
112
135
  this.defaultMaxTokens = config.defaultMaxTokens ?? 4096
136
+ this.defaultEmbedModel = config.defaultEmbedModel ?? DEFAULT_GEMINI_EMBED_MODEL
113
137
  this.mcpClientFactory = options.mcpClientFactory
114
138
  if (options.client) {
115
139
  this.models = options.client.models
@@ -169,6 +193,86 @@ export class GeminiProvider implements Provider {
169
193
  return response.totalTokens ?? 0
170
194
  }
171
195
 
196
/**
 * Gemini has no dedicated transcription endpoint, so we wrap a
 * chat call: an AudioBlock + a system message that tells the
 * model to transcribe verbatim. Apps that want OpenAI-style
 * Whisper transcription with `language` / `duration` metadata
 * route to OpenAI (or local Whisper via Ollama).
 *
 * `options.prompt` threads into the system instruction —
 * useful for style/vocabulary hints. `options.language` is
 * surfaced to the model in the system prompt (Gemini doesn't
 * have a dedicated language field).
 *
 * @param audio - Audio source, sent as the only block of a single user message.
 * @param options - Optional `language`, `prompt`, `model` and `signal`;
 *   `model` and `signal` are forwarded to `chat()` only when defined.
 * @returns The chat result's `text`, `model` and `raw` payload.
 */
async transcribe(
  audio: AudioSource,
  options: TranscribeOptions = {},
): Promise<TranscribeResult> {
  // Build the system instruction from the fixed directive plus whichever
  // optional hints the caller supplied (empty strings are skipped).
  const lines = [
    'Transcribe the attached audio verbatim. Output ONLY the transcribed text — no preamble, no quotes, no commentary.',
    options.language ? `Audio language: ${options.language}.` : undefined,
    options.prompt ? `Style / vocabulary hints: ${options.prompt}` : undefined,
  ].filter((s): s is string => s !== undefined)
  const system = lines.join(' ')
  const chatResult = await this.chat(
    [
      {
        role: 'user',
        content: [{ type: 'audio', source: audio }],
      },
    ],
    {
      system,
      ...(options.model !== undefined ? { model: options.model } : {}),
      ...(options.signal !== undefined ? { signal: options.signal } : {}),
    },
  )
  return {
    text: chatResult.text,
    model: chatResult.model,
    raw: chatResult.raw,
  }
}

/**
 * Gemini embeddings via `ai.models.embedContent`. Returns one
 * vector per input text. `usage.inputTokens` is `0` — Gemini's
 * embed endpoint doesn't surface token counts in the response
 * for the Gemini Developer API tier (Vertex's request-level
 * metadata exposes billable characters, but that's a different
 * accounting unit and not the framework's contract). Apps that
 * need exact embed-token usage call `countTokens` separately
 * before the call.
 *
 * @param texts - Inputs to embed; copied before being handed to the SDK.
 * @param options - Optional `model` (defaults to the provider's embed
 *   model), `dimensions` (sent as `outputDimensionality`) and `signal`
 *   (sent as `abortSignal`). `config` is omitted from the request when
 *   neither is set.
 * @returns The vectors, the model used, zeroed usage and the raw response.
 * @throws BrainError when the client has no `embedContent` (typically a
 *   test stub), or when the response does not contain exactly one
 *   embedding per input text.
 */
async embed(
  texts: readonly string[],
  options: EmbedOptions = {},
): Promise<EmbedResult<{ embeddings?: Array<{ values?: number[] }> }>> {
  const model = options.model ?? this.defaultEmbedModel
  const config: { outputDimensionality?: number; abortSignal?: AbortSignal } = {}
  if (options.dimensions !== undefined) config.outputDimensionality = options.dimensions
  if (options.signal !== undefined) config.abortSignal = options.signal
  if (!this.models.embedContent) {
    throw new BrainError(
      `GeminiProvider.embed: underlying SDK does not implement embedContent. This usually means a test stub omitted it.`,
      { context: { provider: this.name } },
    )
  }
  const response = await this.models.embedContent({
    model,
    // Spread instead of `as string[]`: no type assertion, and the SDK
    // never receives the caller's (readonly) array instance.
    contents: [...texts],
    ...(Object.keys(config).length > 0 ? { config } : {}),
  })
  const embeddings = (response.embeddings ?? []).map((e) => e.values ?? [])
  // Callers pair vectors with inputs by index; a short (or long) response
  // would silently misalign them, so fail loudly instead.
  if (embeddings.length !== texts.length) {
    throw new BrainError(
      `GeminiProvider.embed: expected ${texts.length} embedding(s) but the response contained ${embeddings.length}.`,
      { context: { provider: this.name } },
    )
  }
  return {
    embeddings,
    model,
    usage: { inputTokens: 0 },
    raw: response,
  }
}
275
+
172
276
  async runWithTools(
173
277
  messages: readonly Message[],
174
278
  tools: readonly Tool[],
@@ -205,6 +309,7 @@ export class GeminiProvider implements Provider {
205
309
  let iterations = 0
206
310
 
207
311
  while (true) {
312
+ checkAborted(options.signal)
208
313
  const params = this.buildParams(workingMessages, options, tools)
209
314
  const response = await this.models.generateContent(params)
210
315
  addUsage(aggregated, response.usageMetadata)
@@ -235,36 +340,132 @@ export class GeminiProvider implements Provider {
235
340
 
236
341
  const resultBlocks: ContentBlock[] = []
237
342
  for (const call of toolUses) {
238
- const tool = toolMap.get(call.name)
239
- if (!tool) {
240
- throw new ToolExecutionError(
241
- call.name,
242
- call.id,
243
- new Error(`Tool "${call.name}" is not registered.`),
244
- )
343
+ const { content, isError } = await runToolWithRecovery(
344
+ toolMap.get(call.name),
345
+ call.name,
346
+ call.id,
347
+ call.input,
348
+ options,
349
+ )
350
+ resultBlocks.push({
351
+ type: 'tool_result',
352
+ toolUseId: call.id,
353
+ content,
354
+ ...(isError ? { isError: true } : {}),
355
+ } satisfies ToolResultBlock)
356
+ }
357
+ workingMessages.push({ role: 'user', content: resultBlocks })
358
+
359
+ iterations++
360
+ if (iterations >= maxIterations) {
361
+ return {
362
+ text: candidateText(candidate),
363
+ messages: workingMessages,
364
+ iterations,
365
+ stopReason: 'max_iterations',
366
+ usage: aggregated,
245
367
  }
246
- let output: unknown
247
- try {
248
- output = await tool.execute(call.input, {
249
- callId: call.id,
250
- context: options.context ?? {},
368
+ }
369
+ }
370
+ }
371
+
372
/**
 * Tool-calling run whose final answer is parsed against `schema`.
 *
 * When `options.mcpServers` is non-empty the servers are resolved through
 * `resolveMcpTools` (using the provider's client factory if one was
 * configured) and their tools are appended after the caller's `tools`.
 * The resolved handle is always closed once the loop settles, whether it
 * returned or threw.
 */
async runWithToolsAndSchema<T>(
  messages: readonly Message[],
  tools: readonly Tool[],
  schema: OutputSchema<T>,
  options: RunWithToolsOptions = {},
): Promise<AgentGenerateResult<T>> {
  const servers: readonly MCPServer[] = options.mcpServers ?? []
  // No servers → a no-op stand-in, so the try/finally below stays uniform.
  const mcp =
    servers.length === 0
      ? { tools: [] as Tool[], close: async () => {} }
      : await resolveMcpTools(servers, {
          ...(this.mcpClientFactory ? { clientFactory: this.mcpClientFactory } : {}),
        })
  try {
    const combinedTools = [...tools, ...mcp.tools]
    return await this._runLoopWithSchema(combinedTools, messages, schema, options)
  } finally {
    await mcp.close()
  }
}
391
+
392
+ private async _runLoopWithSchema<T>(
393
+ tools: readonly Tool[],
394
+ messages: readonly Message[],
395
+ schema: OutputSchema<T>,
396
+ options: RunWithToolsOptions,
397
+ ): Promise<AgentGenerateResult<T>> {
398
+ const maxIterations = options.maxIterations ?? 10
399
+ const toolMap = new Map<string, Tool>(tools.map((t) => [t.name, t]))
400
+ const workingMessages: Message[] = [...messages]
401
+ const aggregated: ChatUsage = {
402
+ inputTokens: 0,
403
+ outputTokens: 0,
404
+ cacheReadTokens: 0,
405
+ cacheCreationTokens: 0,
406
+ }
407
+ let iterations = 0
408
+
409
+ while (true) {
410
+ const params = this.buildParams(workingMessages, options, tools)
411
+ params.config = {
412
+ ...(params.config ?? {}),
413
+ responseMimeType: 'application/json',
414
+ responseJsonSchema: schema.jsonSchema,
415
+ }
416
+ const response = await this.models.generateContent(params)
417
+ addUsage(aggregated, response.usageMetadata)
418
+
419
+ const candidate = response.candidates?.[0]
420
+ if (!candidate) {
421
+ throw new BrainError('GeminiProvider: response had no candidates.')
422
+ }
423
+ const parts = candidate.content?.parts ?? []
424
+ const assistantContent = fromGeminiParts(parts)
425
+ workingMessages.push({ role: 'assistant', content: assistantContent })
426
+
427
+ const toolUses = (Array.isArray(assistantContent) ? assistantContent : []).filter(
428
+ (b): b is ToolUseBlock => b.type === 'tool_use',
429
+ )
430
+
431
+ if (toolUses.length === 0) {
432
+ const text = typeof assistantContent === 'string'
433
+ ? assistantContent
434
+ : candidateText(candidate)
435
+ return {
436
+ value: parseGenerated(text, schema),
437
+ text,
438
+ messages: workingMessages,
439
+ iterations,
440
+ stopReason: candidate.finishReason ? String(candidate.finishReason) : 'stop',
441
+ usage: aggregated,
254
442
  }
255
- const resultBlock: ToolResultBlock = {
443
+ }
444
+
445
+ const resultBlocks: ContentBlock[] = []
446
+ for (const call of toolUses) {
447
+ const { content, isError } = await runToolWithRecovery(
448
+ toolMap.get(call.name),
449
+ call.name,
450
+ call.id,
451
+ call.input,
452
+ options,
453
+ )
454
+ resultBlocks.push({
256
455
  type: 'tool_result',
257
456
  toolUseId: call.id,
258
- content: typeof output === 'string' ? output : JSON.stringify(output),
259
- }
260
- resultBlocks.push(resultBlock)
457
+ content,
458
+ ...(isError ? { isError: true } : {}),
459
+ } satisfies ToolResultBlock)
261
460
  }
262
461
  workingMessages.push({ role: 'user', content: resultBlocks })
263
462
 
264
463
  iterations++
265
464
  if (iterations >= maxIterations) {
465
+ const text = candidateText(candidate)
266
466
  return {
267
- text: candidateText(candidate),
467
+ value: parseGenerated(text, schema),
468
+ text,
268
469
  messages: workingMessages,
269
470
  iterations,
270
471
  stopReason: 'max_iterations',
@@ -274,6 +475,276 @@ export class GeminiProvider implements Provider {
274
475
  }
275
476
  }
276
477
 
478
/**
 * Streaming tool-calling run. Re-yields every event produced by
 * `_streamLoop`.
 *
 * When `options.mcpServers` is non-empty the servers are resolved through
 * `resolveMcpTools` (using the provider's client factory if one was
 * configured) and their tools are appended after the caller's `tools`.
 * The resolved handle is closed in a `finally` around the delegated
 * stream.
 */
async *streamWithTools(
  messages: readonly Message[],
  tools: readonly Tool[],
  options: RunWithToolsOptions = {},
): AsyncIterable<AgentStreamEvent> {
  const servers: readonly MCPServer[] = options.mcpServers ?? []
  // No servers → a no-op stand-in, so the try/finally below stays uniform.
  const mcp =
    servers.length === 0
      ? { tools: [] as Tool[], close: async () => {} }
      : await resolveMcpTools(servers, {
          ...(this.mcpClientFactory ? { clientFactory: this.mcpClientFactory } : {}),
        })
  try {
    const combinedTools = [...tools, ...mcp.tools]
    yield* this._streamLoop(messages, combinedTools, options)
  } finally {
    await mcp.close()
  }
}
496
+
497
/**
 * Streaming agent loop. Each pass:
 *
 *  1. checks `options.signal`, then emits `iteration_start`;
 *  2. streams one `generateContentStream` call, emitting a `text` event
 *     for every non-empty text part and collecting all parts;
 *  3. adds the last usage metadata seen in the stream to the running
 *     totals and emits `iteration_end`;
 *  4. appends the assistant turn to the working transcript;
 *  5. if the turn contains no tool calls, emits `stop` and finishes;
 *  6. otherwise runs each tool call in order (emitting `tool_use` before
 *     and `tool_result` after each), appends the results as a user turn,
 *     and emits `stop` with `max_iterations` once the cap is reached.
 *
 * `maxIterations` defaults to 10.
 */
private async *_streamLoop(
  messages: readonly Message[],
  tools: readonly Tool[],
  options: RunWithToolsOptions,
): AsyncIterable<AgentStreamEvent> {
  const iterationCap = options.maxIterations ?? 10
  const toolsByName = new Map<string, Tool>(tools.map((tool) => [tool.name, tool]))
  const transcript: Message[] = [...messages]
  const totals: ChatUsage = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadTokens: 0,
    cacheCreationTokens: 0,
  }
  let round = 0

  for (;;) {
    checkAborted(options.signal)
    yield { type: 'iteration_start', iteration: round }

    const request = this.buildParams(transcript, options, tools)
    const stream = await this.models.generateContentStream(request)

    const collectedParts: Part[] = []
    let finishReason: string | null = null
    let latestUsage: ChatUsage | undefined

    for await (const chunk of stream) {
      const candidate = chunk.candidates?.[0]
      const chunkParts = candidate?.content?.parts ?? []
      for (const { text } of chunkParts) {
        // Only non-empty string text is surfaced as a delta.
        if (typeof text !== 'string' || text.length === 0) continue
        yield { type: 'text', delta: text }
      }
      collectedParts.push(...chunkParts)
      if (candidate?.finishReason) finishReason = String(candidate.finishReason)
      // Later chunks overwrite earlier ones; only the last usage is counted.
      if (chunk.usageMetadata) latestUsage = toUsage(chunk.usageMetadata)
    }
    if (latestUsage) {
      totals.inputTokens += latestUsage.inputTokens
      totals.outputTokens += latestUsage.outputTokens
      totals.cacheReadTokens += latestUsage.cacheReadTokens
    }

    yield { type: 'iteration_end', iteration: round, stopReason: finishReason }

    const assistantContent = fromGeminiParts(collectedParts)
    transcript.push({ role: 'assistant', content: assistantContent })

    const assistantBlocks = Array.isArray(assistantContent) ? assistantContent : []
    const calls = assistantBlocks.filter((b): b is ToolUseBlock => b.type === 'tool_use')

    if (calls.length === 0) {
      yield {
        type: 'stop',
        stopReason: finishReason ?? 'stop',
        iterations: round,
        usage: totals,
        messages: transcript,
      }
      return
    }

    const toolResults: ContentBlock[] = []
    for (const call of calls) {
      yield { type: 'tool_use', id: call.id, name: call.name, input: call.input }
      const outcome = await runToolWithRecovery(
        toolsByName.get(call.name),
        call.name,
        call.id,
        call.input,
        options,
      )
      toolResults.push({
        type: 'tool_result',
        toolUseId: call.id,
        content: outcome.content,
        // `isError` is only present on the block when the tool failed.
        ...(outcome.isError ? { isError: true } : {}),
      } satisfies ToolResultBlock)
      yield {
        type: 'tool_result',
        id: call.id,
        name: call.name,
        content: outcome.content,
        isError: outcome.isError,
      }
    }
    transcript.push({ role: 'user', content: toolResults })

    round++
    if (round >= iterationCap) {
      yield {
        type: 'stop',
        stopReason: 'max_iterations',
        iterations: round,
        usage: totals,
        messages: transcript,
      }
      return
    }
  }
}
601
+
602
/**
 * Streaming tool-calling run with schema-constrained output. Re-yields
 * every event produced by `_streamLoopWithSchema`.
 *
 * When `options.mcpServers` is non-empty the servers are resolved through
 * `resolveMcpTools` (using the provider's client factory if one was
 * configured) and their tools are appended after the caller's `tools`.
 * The resolved handle is closed in a `finally` around the delegated
 * stream.
 */
async *streamWithToolsAndSchema<T>(
  messages: readonly Message[],
  tools: readonly Tool[],
  schema: OutputSchema<T>,
  options: RunWithToolsOptions = {},
): AsyncIterable<AgentStreamEvent<T>> {
  const servers: readonly MCPServer[] = options.mcpServers ?? []
  // No servers → a no-op stand-in, so the try/finally below stays uniform.
  const mcp =
    servers.length === 0
      ? { tools: [] as Tool[], close: async () => {} }
      : await resolveMcpTools(servers, {
          ...(this.mcpClientFactory ? { clientFactory: this.mcpClientFactory } : {}),
        })
  try {
    const combinedTools = [...tools, ...mcp.tools]
    yield* this._streamLoopWithSchema(combinedTools, messages, schema, options)
  } finally {
    await mcp.close()
  }
}
626
+
627
/**
 * Streaming agent loop with schema-constrained output. Each pass:
 *
 *  1. checks `options.signal`, then emits `iteration_start`;
 *  2. builds the request and overrides its config with
 *     `responseMimeType: 'application/json'` and the schema's JSON schema;
 *  3. streams the call, emitting a `text` event for every non-empty text
 *     part while also buffering that text for this iteration;
 *  4. adds the last usage metadata seen to the running totals and emits
 *     `iteration_end`;
 *  5. appends the assistant turn to the working transcript;
 *  6. if the turn contains no tool calls, parses the buffered text with
 *     `parseGenerated` and emits `stop` carrying `value` and `text`;
 *  7. otherwise runs each tool call in order (emitting `tool_use` before
 *     and `tool_result` after each), appends the results as a user turn,
 *     and — once the cap is reached — parses the buffered text and emits
 *     `stop` with `max_iterations`.
 *
 * `maxIterations` defaults to 10.
 *
 * NOTE(review): the text buffer is reset every iteration, so on the
 * max-iterations path `parseGenerated` only sees text streamed during
 * the final tool-calling turn and may reject it — confirm this is the
 * intended behaviour.
 */
private async *_streamLoopWithSchema<T>(
  tools: readonly Tool[],
  messages: readonly Message[],
  schema: OutputSchema<T>,
  options: RunWithToolsOptions,
): AsyncIterable<AgentStreamEvent<T>> {
  const iterationCap = options.maxIterations ?? 10
  const toolsByName = new Map<string, Tool>(tools.map((tool) => [tool.name, tool]))
  const transcript: Message[] = [...messages]
  const totals: ChatUsage = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadTokens: 0,
    cacheCreationTokens: 0,
  }
  let round = 0

  for (;;) {
    checkAborted(options.signal)
    yield { type: 'iteration_start', iteration: round }

    const request = this.buildParams(transcript, options, tools)
    // Force JSON output shaped by the caller's schema on every turn.
    request.config = {
      ...(request.config ?? {}),
      responseMimeType: 'application/json',
      responseJsonSchema: schema.jsonSchema,
    }
    const stream = await this.models.generateContentStream(request)

    const collectedParts: Part[] = []
    let streamedText = ''
    let finishReason: string | null = null
    let latestUsage: ChatUsage | undefined

    for await (const chunk of stream) {
      const candidate = chunk.candidates?.[0]
      const chunkParts = candidate?.content?.parts ?? []
      for (const { text } of chunkParts) {
        // Only non-empty string text is buffered and surfaced as a delta.
        if (typeof text !== 'string' || text.length === 0) continue
        streamedText += text
        yield { type: 'text', delta: text }
      }
      collectedParts.push(...chunkParts)
      if (candidate?.finishReason) finishReason = String(candidate.finishReason)
      // Later chunks overwrite earlier ones; only the last usage is counted.
      if (chunk.usageMetadata) latestUsage = toUsage(chunk.usageMetadata)
    }
    if (latestUsage) {
      totals.inputTokens += latestUsage.inputTokens
      totals.outputTokens += latestUsage.outputTokens
      totals.cacheReadTokens += latestUsage.cacheReadTokens
    }

    yield { type: 'iteration_end', iteration: round, stopReason: finishReason }

    const assistantContent = fromGeminiParts(collectedParts)
    transcript.push({ role: 'assistant', content: assistantContent })

    const assistantBlocks = Array.isArray(assistantContent) ? assistantContent : []
    const calls = assistantBlocks.filter((b): b is ToolUseBlock => b.type === 'tool_use')

    if (calls.length === 0) {
      const text = streamedText
      const value = parseGenerated(text, schema)
      yield {
        type: 'stop',
        stopReason: finishReason ?? 'stop',
        iterations: round,
        usage: totals,
        messages: transcript,
        value,
        text,
      } as AgentStreamEvent<T>
      return
    }

    const toolResults: ContentBlock[] = []
    for (const call of calls) {
      yield { type: 'tool_use', id: call.id, name: call.name, input: call.input }
      const outcome = await runToolWithRecovery(
        toolsByName.get(call.name),
        call.name,
        call.id,
        call.input,
        options,
      )
      toolResults.push({
        type: 'tool_result',
        toolUseId: call.id,
        content: outcome.content,
        // `isError` is only present on the block when the tool failed.
        ...(outcome.isError ? { isError: true } : {}),
      } satisfies ToolResultBlock)
      yield {
        type: 'tool_result',
        id: call.id,
        name: call.name,
        content: outcome.content,
        isError: outcome.isError,
      }
    }
    transcript.push({ role: 'user', content: toolResults })

    round++
    if (round >= iterationCap) {
      const text = streamedText
      const value = parseGenerated(text, schema)
      yield {
        type: 'stop',
        stopReason: 'max_iterations',
        iterations: round,
        usage: totals,
        messages: transcript,
        value,
        text,
      } as AgentStreamEvent<T>
      return
    }
  }
}
747
+
277
748
  async generate<T>(
278
749
  messages: readonly Message[],
279
750
  schema: OutputSchema<T>,
@@ -317,18 +788,27 @@ export class GeminiProvider implements Provider {
317
788
  config.systemInstruction = systemText
318
789
  }
319
790
 
791
+ const configTools: NonNullable<GenerateContentConfig['tools']> = []
320
792
  if (tools.length > 0) {
321
793
  const functionDeclarations: FunctionDeclaration[] = tools.map((t) => ({
322
794
  name: t.name,
323
795
  description: t.description,
324
796
  parametersJsonSchema: t.inputSchema,
325
797
  }))
326
- config.tools = [{ functionDeclarations }]
798
+ configTools.push({ functionDeclarations })
799
+ }
800
+ if (options.serverTools && options.serverTools.length > 0) {
801
+ configTools.push(...geminiServerTools(options.serverTools))
802
+ }
803
+ if (configTools.length > 0) {
804
+ config.tools = configTools
327
805
  }
328
806
 
329
807
  const thinking = buildThinkingConfig(options)
330
808
  if (thinking !== undefined) config.thinkingConfig = thinking
331
809
 
810
+ if (options.signal !== undefined) config.abortSignal = options.signal
811
+
332
812
  return { model, contents, config }
333
813
  }
334
814
 
@@ -356,6 +836,13 @@ export class GeminiProvider implements Provider {
356
836
 
357
837
  // ─── Shape converters ─────────────────────────────────────────────────────
358
838
 
839
/**
 * Cooperative cancellation point: does nothing unless `signal` exists and
 * has already been aborted. In that case it throws the signal's own
 * `reason` when one is set, otherwise a `DOMException` named `AbortError`.
 */
function checkAborted(signal: AbortSignal | undefined): void {
  if (!signal?.aborted) return
  const fallback = (): DOMException => new DOMException('Aborted', 'AbortError')
  throw signal.reason ?? fallback()
}
845
+
359
846
  function systemPromptText(system: SystemPrompt | undefined): string {
360
847
  if (system === undefined) return ''
361
848
  if (typeof system === 'string') return system
@@ -369,6 +856,25 @@ function toGeminiParts(content: string | ContentBlock[]): Part[] {
369
856
  for (const block of content) {
370
857
  if (block.type === 'text') {
371
858
  parts.push({ text: block.text })
859
+ } else if (block.type === 'image' || block.type === 'document' || block.type === 'audio') {
860
+ // All three media block types share Gemini's inlineData /
861
+ // fileData wire shape; only the MIME differs. Base64 →
862
+ // inlineData. URL → fileData with fileUri. Gemini's
863
+ // fileData accepts public HTTPS and gs:// URIs; arbitrary
864
+ // private URLs need to be fetched and converted to base64
865
+ // by the app.
866
+ if (block.source.type === 'base64') {
867
+ parts.push({
868
+ inlineData: { mimeType: block.source.mediaType, data: block.source.data },
869
+ })
870
+ } else {
871
+ parts.push({
872
+ fileData: {
873
+ fileUri: block.source.url,
874
+ mimeType: guessMimeFromUrl(block.source.url, block.type),
875
+ },
876
+ })
877
+ }
372
878
  } else if (block.type === 'tool_use') {
373
879
  parts.push({
374
880
  functionCall: {
@@ -394,6 +900,69 @@ function toGeminiParts(content: string | ContentBlock[]): Part[] {
394
900
  return parts
395
901
  }
396
902
 
903
// Note on `guessMimeFromUrl` (defined later in this file): Gemini's
// `fileData.mimeType` is required, but our media-block URL-source
// variants don't carry it (the app may not know). That helper makes a
// best-effort guess from the file extension and otherwise falls back to
// the block type's most-common MIME (jpeg for images, pdf for documents,
// mp3 for audio).

/**
 * Map framework `ServerTool[]` entries onto Gemini's typed tool entries:
 * `web_search` → `googleSearch`, `code_execution` → `codeExecution`,
 * `url_context` → `urlContext`. Each is enabled with an empty `{}`
 * object, so per-tool settings on the input (domain allowlists,
 * max_uses, blocked_domains on `web_search`) are not forwarded.
 *
 * `web_fetch` is rejected with a `BrainError` pointing at `url_context`.
 * Any other tool type is skipped without error.
 *
 * @param serverTools - Server tools requested by the caller.
 * @returns One Gemini tool entry per recognised input, in input order.
 * @throws BrainError when a `web_fetch` tool is requested.
 */
function geminiServerTools(
  serverTools: readonly ServerTool[],
): NonNullable<GenerateContentConfig['tools']> {
  const entries: NonNullable<GenerateContentConfig['tools']> = []
  for (const tool of serverTools) {
    switch (tool.type) {
      case 'web_search':
        entries.push({ googleSearch: {} })
        break
      case 'code_execution':
        entries.push({ codeExecution: {} })
        break
      case 'url_context':
        entries.push({ urlContext: {} })
        break
      case 'web_fetch':
        throw new BrainError(
          'GeminiProvider: server tool `web_fetch` is Anthropic-only. Use `url_context` for Gemini or route the call to Anthropic.',
          { context: { provider: 'google' } },
        )
      default:
        // Unrecognised tool types produce no entry.
        break
    }
  }
  return entries
}
940
+
941
/**
 * Best-effort MIME type for a URL-sourced media block.
 *
 * Gemini's `fileData.mimeType` is required, but URL-source media blocks
 * don't carry one, so the type is guessed from the file extension
 * (case-insensitive, ignoring any `?query` or `#fragment`). When the
 * extension is not recognised the block type's most common MIME is used:
 * `image/jpeg` for images, `application/pdf` for documents, `audio/mp3`
 * for audio. Documents always map to `application/pdf`.
 *
 * @param url - The media URL (HTTPS, `gs://`, …).
 * @param kind - The media block type the URL belongs to.
 * @returns A MIME type string suitable for `fileData.mimeType`.
 */
function guessMimeFromUrl(
  url: string,
  kind: 'image' | 'document' | 'audio',
): string {
  // Cut at the first `?` or `#` so signed URLs (`a.png?sig=…`) and
  // fragment links (`a.png#page=2`) still match on their extension.
  const lower = url.toLowerCase().split(/[?#]/)[0] ?? ''
  if (kind === 'image') {
    if (lower.endsWith('.png')) return 'image/png'
    if (lower.endsWith('.webp')) return 'image/webp'
    if (lower.endsWith('.gif')) return 'image/gif'
    if (lower.endsWith('.jpg') || lower.endsWith('.jpeg')) return 'image/jpeg'
    return 'image/jpeg'
  }
  if (kind === 'document') {
    return 'application/pdf'
  }
  // audio
  if (lower.endsWith('.mp3')) return 'audio/mp3'
  if (lower.endsWith('.wav')) return 'audio/wav'
  if (lower.endsWith('.ogg')) return 'audio/ogg'
  if (lower.endsWith('.flac')) return 'audio/flac'
  if (lower.endsWith('.webm')) return 'audio/webm'
  if (lower.endsWith('.aac') || lower.endsWith('.m4a')) return 'audio/aac'
  return 'audio/mp3'
}
965
+
397
966
  function fromGeminiParts(parts: readonly Part[]): string | ContentBlock[] {
398
967
  const blocks: ContentBlock[] = []
399
968
  for (const part of parts) {