@strav/brain 1.0.0-alpha.15 → 1.0.0-alpha.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,26 +52,37 @@ import type { AgentResult } from '../agent_result.ts'
52
52
  import { BrainError } from '../brain_error.ts'
53
53
  import type { OpenAIProviderConfig } from '../brain_config.ts'
54
54
  import type { MCPServer } from '../mcp_server.ts'
55
+ import type { AgentGenerateResult } from '../agent_generate_result.ts'
56
+ import type { AgentStreamEvent } from '../agent_stream_event.ts'
55
57
  import { resolveMcpTools, type ResolveMcpToolsOptions } from '../mcp/resolve_mcp_tools.ts'
56
58
  import { parseGenerated, type OutputSchema } from '../output_schema.ts'
59
+ import { recoverOrThrow, runToolWithRecovery } from '../tool_runner.ts'
57
60
  import type { Provider, RunWithToolsOptions } from '../provider.ts'
58
61
  import type { Tool } from '../tool.ts'
59
62
  import { ToolExecutionError } from '../tool_execution_error.ts'
60
63
  import type {
64
+ AudioSource,
61
65
  ChatOptions,
62
66
  ChatResult,
63
67
  ChatUsage,
64
68
  ContentBlock,
69
+ EmbedOptions,
70
+ EmbedResult,
65
71
  GenerateResult,
72
+ ImageBlock,
66
73
  Message,
67
74
  StreamEvent,
68
75
  SystemPrompt,
69
76
  TextBlock,
70
77
  ToolResultBlock,
71
78
  ToolUseBlock,
79
+ TranscribeOptions,
80
+ TranscribeResult,
72
81
  } from '../types.ts'
73
82
 
74
83
  const DEFAULT_OPENAI_MODEL = 'gpt-5'
84
+ const DEFAULT_OPENAI_EMBED_MODEL = 'text-embedding-3-small'
85
+ const DEFAULT_OPENAI_TRANSCRIBE_MODEL = 'whisper-1'
75
86
 
76
87
  export interface OpenAIProviderOptions {
77
88
  client?: OpenAI
@@ -85,10 +96,18 @@ export interface OpenAIProviderOptions {
85
96
 
86
97
  export class OpenAIProvider implements Provider {
87
98
  readonly name: string
88
- private readonly client: OpenAI
89
- private readonly defaultModel: string
90
- private readonly defaultMaxTokens: number
91
- private readonly mcpClientFactory?: ResolveMcpToolsOptions['clientFactory']
99
+ // Protected (rather than private) so OpenAI-compatible drivers
100
+ // can subclass — see `DeepSeekProvider`. Apps that want to plug
101
+ // in Groq / Together / Fireworks follow the same pattern: extend,
102
+ // override the constructor's base URL + default model, optionally
103
+ // override `buildParams` to suppress fields the upstream API
104
+ // doesn't accept.
105
+ protected readonly client: OpenAI
106
+ protected readonly defaultModel: string
107
+ protected readonly defaultMaxTokens: number
108
+ protected readonly defaultEmbedModel: string
109
+ protected readonly defaultTranscribeModel: string
110
+ protected readonly mcpClientFactory?: ResolveMcpToolsOptions['clientFactory']
92
111
 
93
112
  constructor(
94
113
  name: string,
@@ -98,6 +117,8 @@ export class OpenAIProvider implements Provider {
98
117
  this.name = name
99
118
  this.defaultModel = config.defaultModel ?? DEFAULT_OPENAI_MODEL
100
119
  this.defaultMaxTokens = config.defaultMaxTokens ?? 4096
120
+ this.defaultEmbedModel = config.defaultEmbedModel ?? DEFAULT_OPENAI_EMBED_MODEL
121
+ this.defaultTranscribeModel = config.defaultTranscribeModel ?? DEFAULT_OPENAI_TRANSCRIBE_MODEL
101
122
  this.mcpClientFactory = options.mcpClientFactory
102
123
  this.client =
103
124
  options.client ??
@@ -110,7 +131,7 @@ export class OpenAIProvider implements Provider {
110
131
 
111
132
  async chat(messages: readonly Message[], options: ChatOptions = {}): Promise<ChatResult> {
112
133
  const params = this.buildParams(messages, options, [])
113
- const response = await this.client.chat.completions.create(params)
134
+ const response = await this.client.chat.completions.create(params, reqOpts(options))
114
135
  return this.toChatResult(response)
115
136
  }
116
137
 
@@ -123,7 +144,7 @@ export class OpenAIProvider implements Provider {
123
144
  stream: true,
124
145
  stream_options: { include_usage: true },
125
146
  }
126
- const stream = await this.client.chat.completions.create(params)
147
+ const stream = await this.client.chat.completions.create(params, reqOpts(options))
127
148
  let aggregatedUsage: OpenAI.CompletionUsage | undefined
128
149
  let finishReason: string | null = null
129
150
  for await (const chunk of stream) {
@@ -179,8 +200,9 @@ export class OpenAIProvider implements Provider {
179
200
  let iterations = 0
180
201
 
181
202
  while (true) {
203
+ checkAborted(options.signal)
182
204
  const params = this.buildParams(workingMessages, options, tools)
183
- const response = await this.client.chat.completions.create(params)
205
+ const response = await this.client.chat.completions.create(params, reqOpts(options))
184
206
  addUsage(aggregated, response.usage)
185
207
 
186
208
  const choice = response.choices[0]
@@ -210,52 +232,639 @@ export class OpenAIProvider implements Provider {
210
232
  const resultBlocks: ContentBlock[] = []
211
233
  for (const call of toolCalls) {
212
234
  if (call.type !== 'function') continue
213
- const tool = toolMap.get(call.function.name)
214
- if (!tool) {
215
- throw new ToolExecutionError(
235
+ let parsedInput: unknown
236
+ let parseFailed: { content: string; isError: boolean } | undefined
237
+ try {
238
+ parsedInput = call.function.arguments ? JSON.parse(call.function.arguments) : {}
239
+ } catch (err) {
240
+ parseFailed = recoverOrThrow(
241
+ new ToolExecutionError(
242
+ call.function.name,
243
+ call.id,
244
+ new Error(`Failed to parse tool input JSON: ${(err as Error).message}`),
245
+ ),
246
+ options,
247
+ )
248
+ }
249
+ const { content, isError } = parseFailed
250
+ ?? (await runToolWithRecovery(
251
+ toolMap.get(call.function.name),
216
252
  call.function.name,
217
253
  call.id,
218
- new Error(`Tool "${call.function.name}" is not registered.`),
219
- )
254
+ parsedInput,
255
+ options,
256
+ ))
257
+ resultBlocks.push({
258
+ type: 'tool_result',
259
+ toolUseId: call.id,
260
+ content,
261
+ ...(isError ? { isError: true } : {}),
262
+ } satisfies ToolResultBlock)
263
+ }
264
+ workingMessages.push({ role: 'user', content: resultBlocks })
265
+
266
+ iterations++
267
+ if (iterations >= maxIterations) {
268
+ return {
269
+ text: assistantMessage.content ?? '',
270
+ messages: workingMessages,
271
+ iterations,
272
+ stopReason: 'max_iterations',
273
+ usage: aggregated,
274
+ }
275
+ }
276
+ }
277
+ }
278
+
279
/**
 * Runs the schema-constrained tool loop, making tools from any MCP servers
 * listed in `options.mcpServers` available alongside `tools`.
 *
 * MCP tools are resolved only when at least one server is configured;
 * otherwise a no-op stand-in with an empty tool list is used. The
 * resolver's `close()` is awaited whether the loop returns or throws.
 *
 * @param messages Conversation handed to the loop.
 * @param tools Caller-supplied tools; MCP-resolved tools are appended after them.
 * @param schema Output schema forwarded to the loop.
 * @param options Loop options; `mcpServers` defaults to an empty list.
 * @returns The value `_runLoopWithSchema` resolves to.
 */
async runWithToolsAndSchema<T>(
  messages: readonly Message[],
  tools: readonly Tool[],
  schema: OutputSchema<T>,
  options: RunWithToolsOptions = {},
): Promise<AgentGenerateResult<T>> {
  const servers: readonly MCPServer[] = options.mcpServers ?? []
  const mcp =
    servers.length === 0
      ? { tools: [] as Tool[], close: async () => {} }
      : await resolveMcpTools(servers, {
          ...(this.mcpClientFactory ? { clientFactory: this.mcpClientFactory } : {}),
        })
  try {
    const combined = [...tools, ...mcp.tools]
    return await this._runLoopWithSchema(combined, messages, schema, options)
  } finally {
    // Always release MCP clients, including on error paths.
    await mcp.close()
  }
}
298
+
299
/**
 * Tool-calling loop that asks the model for `json_schema` structured output
 * on every request and parses the final text with `parseGenerated`.
 *
 * Each iteration: build params, attach the schema as `response_format`,
 * call chat completions, record usage, append the assistant turn, then
 * either return (no tool calls, or `finish_reason` is not `tool_calls`)
 * or run every requested function tool and append the results as a user
 * turn.
 *
 * @param tools Tools offered to the model and used to execute its calls.
 * @param messages Initial conversation; copied, never mutated.
 * @param schema Output schema (name, optional description, JSON schema).
 * @param options Loop options; `maxIterations` defaults to 10.
 * @returns Parsed value, raw text, full transcript, iteration count,
 *   stop reason and aggregated usage.
 * @throws BrainError when the response contains no choices.
 * @throws The signal's reason (or an `AbortError`) when `options.signal`
 *   has already fired at the start of an iteration.
 */
private async _runLoopWithSchema<T>(
  tools: readonly Tool[],
  messages: readonly Message[],
  schema: OutputSchema<T>,
  options: RunWithToolsOptions,
): Promise<AgentGenerateResult<T>> {
  const maxIterations = options.maxIterations ?? 10
  const toolMap = new Map<string, Tool>(tools.map((t) => [t.name, t]))
  const workingMessages: Message[] = [...messages]
  const aggregated: ChatUsage = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadTokens: 0,
    cacheCreationTokens: 0,
  }
  let iterations = 0

  while (true) {
    // Same per-iteration abort check the other tool loops perform, so a
    // signal that fired during tool execution stops the loop before the
    // next request is built.
    checkAborted(options.signal)
    const params = this.buildParams(workingMessages, options, tools)
    params.response_format = {
      type: 'json_schema',
      json_schema: {
        name: schema.name,
        ...(schema.description !== undefined ? { description: schema.description } : {}),
        schema: schema.jsonSchema,
        strict: true,
      },
    }
    const response = await this.client.chat.completions.create(params, reqOpts(options))
    addUsage(aggregated, response.usage)

    const choice = response.choices[0]
    if (!choice) {
      throw new BrainError('OpenAIProvider: response had no choices.')
    }
    const assistantMessage = choice.message
    workingMessages.push({
      role: 'assistant',
      content: fromOpenAIAssistantMessage(assistantMessage),
    })

    const toolCalls = assistantMessage.tool_calls ?? []
    if (toolCalls.length === 0 || choice.finish_reason !== 'tool_calls') {
      // Final turn: the assistant text is the structured payload.
      const text = assistantMessage.content ?? ''
      return {
        value: parseGenerated(text, schema),
        text,
        messages: workingMessages,
        iterations,
        stopReason: choice.finish_reason ?? 'stop',
        usage: aggregated,
      }
    }

    const resultBlocks: ContentBlock[] = []
    for (const call of toolCalls) {
      if (call.type !== 'function') continue
      let parsedInput: unknown
      let parseFailed: { content: string; isError: boolean } | undefined
      try {
        parsedInput = call.function.arguments ? JSON.parse(call.function.arguments) : {}
      } catch (err) {
        // Malformed arguments: `recoverOrThrow` either rethrows or yields
        // an error result to hand back to the model.
        parseFailed = recoverOrThrow(
          new ToolExecutionError(
            call.function.name,
            call.id,
            new Error(`Failed to parse tool input JSON: ${(err as Error).message}`),
          ),
          options,
        )
      }
      const { content, isError } = parseFailed
        ?? (await runToolWithRecovery(
          toolMap.get(call.function.name),
          call.function.name,
          call.id,
          parsedInput,
          options,
        ))
      resultBlocks.push({
        type: 'tool_result',
        toolUseId: call.id,
        content,
        ...(isError ? { isError: true } : {}),
      } satisfies ToolResultBlock)
    }
    workingMessages.push({ role: 'user', content: resultBlocks })

    iterations++
    if (iterations >= maxIterations) {
      // NOTE(review): `assistantMessage` here is the turn that requested
      // tools, so its `content` may be empty — confirm `parseGenerated`
      // handles that the way callers expect.
      const text = assistantMessage.content ?? ''
      return {
        value: parseGenerated(text, schema),
        text,
        messages: workingMessages,
        iterations,
        stopReason: 'max_iterations',
        usage: aggregated,
      }
    }
  }
}
401
+
402
+ async *streamWithTools(
403
+ messages: readonly Message[],
404
+ tools: readonly Tool[],
405
+ options: RunWithToolsOptions = {},
406
+ ): AsyncIterable<AgentStreamEvent> {
407
+ const mcpServers: readonly MCPServer[] = options.mcpServers ?? []
408
+ const resolved =
409
+ mcpServers.length > 0
410
+ ? await resolveMcpTools(mcpServers, {
411
+ ...(this.mcpClientFactory ? { clientFactory: this.mcpClientFactory } : {}),
236
412
  })
237
- } catch (cause) {
238
- throw new ToolExecutionError(call.function.name, call.id, cause)
413
+ : { tools: [] as Tool[], close: async () => {} }
414
+ try {
415
+ yield* this._streamLoop(messages, [...tools, ...resolved.tools], options)
416
+ } finally {
417
+ await resolved.close()
418
+ }
419
+ }
420
+
421
+ private async *_streamLoop(
422
+ messages: readonly Message[],
423
+ tools: readonly Tool[],
424
+ options: RunWithToolsOptions,
425
+ ): AsyncIterable<AgentStreamEvent> {
426
+ const maxIterations = options.maxIterations ?? 10
427
+ const toolMap = new Map<string, Tool>(tools.map((t) => [t.name, t]))
428
+ const workingMessages: Message[] = [...messages]
429
+ const aggregated: ChatUsage = {
430
+ inputTokens: 0,
431
+ outputTokens: 0,
432
+ cacheReadTokens: 0,
433
+ cacheCreationTokens: 0,
434
+ }
435
+ let iterations = 0
436
+
437
+ while (true) {
438
+ checkAborted(options.signal)
439
+ yield { type: 'iteration_start', iteration: iterations }
440
+
441
+ const params: OpenAI.Chat.ChatCompletionCreateParamsStreaming = {
442
+ ...this.buildParams(workingMessages, options, tools),
443
+ stream: true,
444
+ stream_options: { include_usage: true },
445
+ }
446
+ const stream = await this.client.chat.completions.create(params, reqOpts(options))
447
+
448
+ let textBuf = ''
449
+ // Tracks: per index, the running entry; and whether
450
+ // `tool_use_start` has already been emitted (we emit once the
451
+ // first chunk brings the id + name).
452
+ const toolCallsByIndex: Map<
453
+ number,
454
+ { id?: string; name?: string; args: string; started: boolean }
455
+ > = new Map()
456
+ let finishReason: string | null = null
457
+ let lastUsage: OpenAI.CompletionUsage | undefined
458
+
459
+ for await (const chunk of stream) {
460
+ const choice = chunk.choices[0]
461
+ const delta = choice?.delta
462
+ if (delta?.content && typeof delta.content === 'string' && delta.content.length > 0) {
463
+ textBuf += delta.content
464
+ yield { type: 'text', delta: delta.content }
465
+ }
466
+ if (delta?.tool_calls) {
467
+ for (const tc of delta.tool_calls) {
468
+ const entry = toolCallsByIndex.get(tc.index) ?? { args: '', started: false }
469
+ if (tc.id) entry.id = tc.id
470
+ if (tc.function?.name) entry.name = tc.function.name
471
+ toolCallsByIndex.set(tc.index, entry)
472
+ // Emit `tool_use_start` once id+name are both known.
473
+ // OpenAI typically delivers them in the same first
474
+ // chunk for a given tool call.
475
+ if (!entry.started && entry.id !== undefined && entry.name !== undefined) {
476
+ entry.started = true
477
+ yield { type: 'tool_use_start', id: entry.id, name: entry.name }
478
+ }
479
+ if (tc.function?.arguments) {
480
+ entry.args += tc.function.arguments
481
+ // Emit a delta only after start has fired — apps relying
482
+ // on an id wouldn't have one until then.
483
+ if (entry.started && entry.id !== undefined) {
484
+ yield {
485
+ type: 'tool_use_delta',
486
+ id: entry.id,
487
+ argsDelta: tc.function.arguments,
488
+ }
489
+ }
490
+ }
491
+ }
239
492
  }
240
- const resultBlock: ToolResultBlock = {
493
+ if (choice?.finish_reason) finishReason = choice.finish_reason
494
+ if (chunk.usage) lastUsage = chunk.usage
495
+ }
496
+
497
+ addUsage(aggregated, lastUsage)
498
+ yield { type: 'iteration_end', iteration: iterations, stopReason: finishReason }
499
+
500
+ // Materialize the assistant turn the same way runWithTools does.
501
+ const assistantBlocks: ContentBlock[] = []
502
+ if (textBuf.length > 0) assistantBlocks.push({ type: 'text', text: textBuf })
503
+ const orderedCalls = [...toolCallsByIndex.entries()]
504
+ .sort(([a], [b]) => a - b)
505
+ .map(([, v]) => v)
506
+ for (const call of orderedCalls) {
507
+ if (!call.id || !call.name) continue
508
+ let parsedInput: unknown = {}
509
+ try {
510
+ parsedInput = call.args ? JSON.parse(call.args) : {}
511
+ } catch {
512
+ parsedInput = call.args
513
+ }
514
+ assistantBlocks.push({
515
+ type: 'tool_use',
516
+ id: call.id,
517
+ name: call.name,
518
+ input: parsedInput,
519
+ } satisfies ToolUseBlock)
520
+ }
521
+ const assistantContent: string | ContentBlock[] =
522
+ assistantBlocks.length === 1 && assistantBlocks[0]?.type === 'text'
523
+ ? assistantBlocks[0].text
524
+ : assistantBlocks
525
+ workingMessages.push({ role: 'assistant', content: assistantContent })
526
+
527
+ if (finishReason !== 'tool_calls' || orderedCalls.length === 0) {
528
+ yield {
529
+ type: 'stop',
530
+ stopReason: finishReason ?? 'stop',
531
+ iterations,
532
+ usage: aggregated,
533
+ messages: workingMessages,
534
+ }
535
+ return
536
+ }
537
+
538
+ const resultBlocks: ContentBlock[] = []
539
+ for (const call of orderedCalls) {
540
+ if (!call.id || !call.name) continue
541
+ let parsedInput: unknown
542
+ let parseFailed: { content: string; isError: boolean } | undefined
543
+ try {
544
+ parsedInput = call.args ? JSON.parse(call.args) : {}
545
+ } catch (err) {
546
+ parseFailed = recoverOrThrow(
547
+ new ToolExecutionError(
548
+ call.name,
549
+ call.id,
550
+ new Error(`Failed to parse tool input JSON: ${(err as Error).message}`),
551
+ ),
552
+ options,
553
+ )
554
+ parsedInput = call.args
555
+ }
556
+ yield { type: 'tool_use', id: call.id, name: call.name, input: parsedInput }
557
+ const { content, isError } = parseFailed
558
+ ?? (await runToolWithRecovery(
559
+ toolMap.get(call.name),
560
+ call.name,
561
+ call.id,
562
+ parsedInput,
563
+ options,
564
+ ))
565
+ resultBlocks.push({
241
566
  type: 'tool_result',
242
567
  toolUseId: call.id,
243
- content: typeof output === 'string' ? output : JSON.stringify(output),
568
+ content,
569
+ ...(isError ? { isError: true } : {}),
570
+ } satisfies ToolResultBlock)
571
+ yield {
572
+ type: 'tool_result',
573
+ id: call.id,
574
+ name: call.name,
575
+ content,
576
+ isError,
244
577
  }
245
- resultBlocks.push(resultBlock)
246
578
  }
247
579
  workingMessages.push({ role: 'user', content: resultBlocks })
248
580
 
249
581
  iterations++
250
582
  if (iterations >= maxIterations) {
251
- return {
252
- text: assistantMessage.content ?? '',
583
+ yield {
584
+ type: 'stop',
585
+ stopReason: 'max_iterations',
586
+ iterations,
587
+ usage: aggregated,
253
588
  messages: workingMessages,
589
+ }
590
+ return
591
+ }
592
+ }
593
+ }
594
+
595
/**
 * Streaming variant of the schema-constrained tool loop. Tools from any
 * MCP servers listed in `options.mcpServers` are offered alongside `tools`.
 *
 * MCP tools are resolved only when at least one server is configured;
 * otherwise a no-op stand-in with an empty tool list is used. The
 * resolver's `close()` is awaited when the delegated stream finishes or
 * throws.
 *
 * @param messages Conversation handed to the loop.
 * @param tools Caller-supplied tools; MCP-resolved tools are appended after them.
 * @param schema Output schema forwarded to the loop.
 * @param options Loop options; `mcpServers` defaults to an empty list.
 * @returns The events produced by `_streamLoopWithSchema`, re-yielded as-is.
 */
async *streamWithToolsAndSchema<T>(
  messages: readonly Message[],
  tools: readonly Tool[],
  schema: OutputSchema<T>,
  options: RunWithToolsOptions = {},
): AsyncIterable<AgentStreamEvent<T>> {
  const servers: readonly MCPServer[] = options.mcpServers ?? []
  const mcp =
    servers.length === 0
      ? { tools: [] as Tool[], close: async () => {} }
      : await resolveMcpTools(servers, {
          ...(this.mcpClientFactory ? { clientFactory: this.mcpClientFactory } : {}),
        })
  try {
    const combined = [...tools, ...mcp.tools]
    yield* this._streamLoopWithSchema(combined, messages, schema, options)
  } finally {
    // Always release MCP clients, including on error paths.
    await mcp.close()
  }
}
619
+
620
+ private async *_streamLoopWithSchema<T>(
621
+ tools: readonly Tool[],
622
+ messages: readonly Message[],
623
+ schema: OutputSchema<T>,
624
+ options: RunWithToolsOptions,
625
+ ): AsyncIterable<AgentStreamEvent<T>> {
626
+ const maxIterations = options.maxIterations ?? 10
627
+ const toolMap = new Map<string, Tool>(tools.map((t) => [t.name, t]))
628
+ const workingMessages: Message[] = [...messages]
629
+ const aggregated: ChatUsage = {
630
+ inputTokens: 0,
631
+ outputTokens: 0,
632
+ cacheReadTokens: 0,
633
+ cacheCreationTokens: 0,
634
+ }
635
+ let iterations = 0
636
+
637
+ while (true) {
638
+ checkAborted(options.signal)
639
+ yield { type: 'iteration_start', iteration: iterations }
640
+
641
+ const baseParams = this.buildParams(workingMessages, options, tools)
642
+ baseParams.response_format = {
643
+ type: 'json_schema',
644
+ json_schema: {
645
+ name: schema.name,
646
+ ...(schema.description !== undefined ? { description: schema.description } : {}),
647
+ schema: schema.jsonSchema,
648
+ strict: true,
649
+ },
650
+ }
651
+ const params: OpenAI.Chat.ChatCompletionCreateParamsStreaming = {
652
+ ...baseParams,
653
+ stream: true,
654
+ stream_options: { include_usage: true },
655
+ }
656
+ const stream = await this.client.chat.completions.create(params, reqOpts(options))
657
+
658
+ let textBuf = ''
659
+ // Tracks: per index, the running entry; and whether
660
+ // `tool_use_start` has already been emitted (we emit once the
661
+ // first chunk brings the id + name).
662
+ const toolCallsByIndex: Map<
663
+ number,
664
+ { id?: string; name?: string; args: string; started: boolean }
665
+ > = new Map()
666
+ let finishReason: string | null = null
667
+ let lastUsage: OpenAI.CompletionUsage | undefined
668
+
669
+ for await (const chunk of stream) {
670
+ const choice = chunk.choices[0]
671
+ const delta = choice?.delta
672
+ if (delta?.content && typeof delta.content === 'string' && delta.content.length > 0) {
673
+ textBuf += delta.content
674
+ yield { type: 'text', delta: delta.content }
675
+ }
676
+ if (delta?.tool_calls) {
677
+ for (const tc of delta.tool_calls) {
678
+ const entry = toolCallsByIndex.get(tc.index) ?? { args: '', started: false }
679
+ if (tc.id) entry.id = tc.id
680
+ if (tc.function?.name) entry.name = tc.function.name
681
+ toolCallsByIndex.set(tc.index, entry)
682
+ // Emit `tool_use_start` once id+name are both known.
683
+ // OpenAI typically delivers them in the same first
684
+ // chunk for a given tool call.
685
+ if (!entry.started && entry.id !== undefined && entry.name !== undefined) {
686
+ entry.started = true
687
+ yield { type: 'tool_use_start', id: entry.id, name: entry.name }
688
+ }
689
+ if (tc.function?.arguments) {
690
+ entry.args += tc.function.arguments
691
+ // Emit a delta only after start has fired — apps relying
692
+ // on an id wouldn't have one until then.
693
+ if (entry.started && entry.id !== undefined) {
694
+ yield {
695
+ type: 'tool_use_delta',
696
+ id: entry.id,
697
+ argsDelta: tc.function.arguments,
698
+ }
699
+ }
700
+ }
701
+ }
702
+ }
703
+ if (choice?.finish_reason) finishReason = choice.finish_reason
704
+ if (chunk.usage) lastUsage = chunk.usage
705
+ }
706
+
707
+ addUsage(aggregated, lastUsage)
708
+ yield { type: 'iteration_end', iteration: iterations, stopReason: finishReason }
709
+
710
+ const assistantBlocks: ContentBlock[] = []
711
+ if (textBuf.length > 0) assistantBlocks.push({ type: 'text', text: textBuf })
712
+ const orderedCalls = [...toolCallsByIndex.entries()]
713
+ .sort(([a], [b]) => a - b)
714
+ .map(([, v]) => v)
715
+ for (const call of orderedCalls) {
716
+ if (!call.id || !call.name) continue
717
+ let parsedInput: unknown = {}
718
+ try {
719
+ parsedInput = call.args ? JSON.parse(call.args) : {}
720
+ } catch {
721
+ parsedInput = call.args
722
+ }
723
+ assistantBlocks.push({
724
+ type: 'tool_use',
725
+ id: call.id,
726
+ name: call.name,
727
+ input: parsedInput,
728
+ } satisfies ToolUseBlock)
729
+ }
730
+ const assistantContent: string | ContentBlock[] =
731
+ assistantBlocks.length === 1 && assistantBlocks[0]?.type === 'text'
732
+ ? assistantBlocks[0].text
733
+ : assistantBlocks
734
+ workingMessages.push({ role: 'assistant', content: assistantContent })
735
+
736
+ if (finishReason !== 'tool_calls' || orderedCalls.length === 0) {
737
+ const text = textBuf
738
+ const value = parseGenerated(text, schema)
739
+ yield {
740
+ type: 'stop',
741
+ stopReason: finishReason ?? 'stop',
254
742
  iterations,
255
- stopReason: 'max_iterations',
256
743
  usage: aggregated,
744
+ messages: workingMessages,
745
+ value,
746
+ text,
747
+ } as AgentStreamEvent<T>
748
+ return
749
+ }
750
+
751
+ const resultBlocks: ContentBlock[] = []
752
+ for (const call of orderedCalls) {
753
+ if (!call.id || !call.name) continue
754
+ let parsedInput: unknown
755
+ let parseFailed: { content: string; isError: boolean } | undefined
756
+ try {
757
+ parsedInput = call.args ? JSON.parse(call.args) : {}
758
+ } catch (err) {
759
+ parseFailed = recoverOrThrow(
760
+ new ToolExecutionError(
761
+ call.name,
762
+ call.id,
763
+ new Error(`Failed to parse tool input JSON: ${(err as Error).message}`),
764
+ ),
765
+ options,
766
+ )
767
+ parsedInput = call.args
768
+ }
769
+ yield { type: 'tool_use', id: call.id, name: call.name, input: parsedInput }
770
+ const { content, isError } = parseFailed
771
+ ?? (await runToolWithRecovery(
772
+ toolMap.get(call.name),
773
+ call.name,
774
+ call.id,
775
+ parsedInput,
776
+ options,
777
+ ))
778
+ resultBlocks.push({
779
+ type: 'tool_result',
780
+ toolUseId: call.id,
781
+ content,
782
+ ...(isError ? { isError: true } : {}),
783
+ } satisfies ToolResultBlock)
784
+ yield {
785
+ type: 'tool_result',
786
+ id: call.id,
787
+ name: call.name,
788
+ content,
789
+ isError,
257
790
  }
258
791
  }
792
+ workingMessages.push({ role: 'user', content: resultBlocks })
793
+
794
+ iterations++
795
+ if (iterations >= maxIterations) {
796
+ const text = textBuf
797
+ const value = parseGenerated(text, schema)
798
+ yield {
799
+ type: 'stop',
800
+ stopReason: 'max_iterations',
801
+ iterations,
802
+ usage: aggregated,
803
+ messages: workingMessages,
804
+ value,
805
+ text,
806
+ } as AgentStreamEvent<T>
807
+ return
808
+ }
809
+ }
810
+ }
811
+
812
/**
 * Transcribes audio through the client's audio transcriptions endpoint.
 *
 * @param audio Audio to upload; converted to a `File` by `audioSourceToFile`.
 * @param options Optional `model` (falls back to the provider's default
 *   transcribe model), `language`, `prompt` and abort `signal`.
 * @returns The transcript text (empty string when the response carries no
 *   string `text`), the model used, the raw response, and `language` /
 *   `duration` when the response includes them.
 */
async transcribe(
  audio: AudioSource,
  options: TranscribeOptions = {},
): Promise<TranscribeResult<OpenAI.Audio.TranscriptionCreateResponse>> {
  const model = options.model ?? this.defaultTranscribeModel
  const file = await audioSourceToFile(audio)

  // Optional fields are only set when the caller supplied them.
  const params: OpenAI.Audio.TranscriptionCreateParams = { file, model }
  if (options.language !== undefined) params.language = options.language
  if (options.prompt !== undefined) params.prompt = options.prompt

  const { signal } = options
  const requestOptions = signal === undefined ? undefined : { signal }
  const response = await this.client.audio.transcriptions.create(params, requestOptions)

  // No `response_format` is sent, so the SDK default applies; per the
  // original author's note that default only surfaces `text`, while
  // `language` / `duration` appear with 'verbose_json'. They are picked
  // up below whenever present, so a richer response still maps through.
  let text = ''
  if ('text' in response && typeof response.text === 'string') {
    text = response.text
  }
  const result: TranscribeResult<OpenAI.Audio.TranscriptionCreateResponse> = {
    text,
    model,
    raw: response,
  }
  if ('language' in response && typeof response.language === 'string') {
    result.language = response.language
  }
  if ('duration' in response && typeof response.duration === 'number') {
    result.duration = response.duration
  }
  return result
}
848
+
849
/**
 * Embeds a batch of texts through the client's embeddings endpoint.
 *
 * @param texts Inputs to embed; sent as one request.
 * @param options Optional `model` (falls back to the provider's default
 *   embed model), `dimensions` and abort `signal`.
 * @returns One embedding per returned item, ordered by each item's
 *   `index`; the model name reported by the response; prompt-token usage
 *   (0 when the response has no usage); and the raw response.
 */
async embed(
  texts: readonly string[],
  options: EmbedOptions = {},
): Promise<EmbedResult<OpenAI.CreateEmbeddingResponse>> {
  const model = options.model ?? this.defaultEmbedModel
  const params: OpenAI.EmbeddingCreateParams = {
    model,
    // Copy instead of asserting away `readonly`, so the SDK receives its
    // own mutable array and the caller's input stays untouched.
    input: [...texts],
    ...(options.dimensions !== undefined ? { dimensions: options.dimensions } : {}),
  }
  const response = await this.client.embeddings.create(
    params,
    options.signal !== undefined ? { signal: options.signal } : undefined,
  )
  // Align results with `texts` by the item's own `index` rather than by
  // arrival order. Sorting a copy leaves `raw` exactly as received; the
  // sort is stable, so already-ordered data is unchanged.
  const ordered = [...response.data].sort((a, b) => a.index - b.index)
  return {
    embeddings: ordered.map((d) => d.embedding),
    model: response.model,
    usage: { inputTokens: response.usage?.prompt_tokens ?? 0 },
    raw: response,
  }
}
261
870
 
@@ -274,7 +883,7 @@ export class OpenAIProvider implements Provider {
274
883
  strict: true,
275
884
  },
276
885
  }
277
- const response = await this.client.chat.completions.create(params)
886
+ const response = await this.client.chat.completions.create(params, reqOpts(options))
278
887
  const choice = response.choices[0]
279
888
  const text = choice?.message?.content ?? ''
280
889
  const value = parseGenerated(text, schema)
@@ -290,11 +899,17 @@ export class OpenAIProvider implements Provider {
290
899
 
291
900
  // ─── Param translation ──────────────────────────────────────────────────
292
901
 
293
- private buildParams(
902
+ protected buildParams(
294
903
  messages: readonly Message[],
295
904
  options: ChatOptions,
296
905
  tools: readonly Tool[],
297
906
  ): OpenAI.Chat.ChatCompletionCreateParamsNonStreaming {
907
+ if (options.serverTools && options.serverTools.length > 0) {
908
+ throw new BrainError(
909
+ "OpenAIProvider: server tools (web_search / code_execution / web_fetch / url_context) are not supported on OpenAI's chat completions API. OpenAI's server tools live on the Responses API (separate provider slice). Run them as framework-local tools, route to Anthropic / Gemini, or wait for the OpenAIResponsesProvider slice.",
910
+ { context: { provider: 'openai' } },
911
+ )
912
+ }
298
913
  const model = options.model ?? this.defaultModel
299
914
  const params: OpenAI.Chat.ChatCompletionCreateParamsNonStreaming = {
300
915
  model,
@@ -389,6 +1004,55 @@ export class OpenAIProvider implements Provider {
389
1004
 
390
1005
  // ─── Shape converters ─────────────────────────────────────────────────────
391
1006
 
1007
/**
 * Derives the per-request options object passed to the SDK.
 *
 * @param options Any options bag that may carry an abort `signal`.
 * @returns `{ signal }` when a signal is present, otherwise `undefined`.
 */
function reqOpts(options: { signal?: AbortSignal }): { signal?: AbortSignal } | undefined {
  const { signal } = options
  if (signal === undefined) return undefined
  return { signal }
}
1011
+
1012
/**
 * Turns an `AudioSource` into a `File` suitable for upload.
 *
 * - base64 sources are decoded in memory and typed with their `mediaType`;
 * - any other source is fetched from `audio.url`, typed with the response's
 *   `content-type` header (or `audio/mpeg` when the header is absent).
 *
 * `AudioSource` carries no filename, so one is synthesized as
 * `audio.<ext>` with the extension chosen by `extFromMime`.
 *
 * @param audio The audio to materialize.
 * @returns A `File` holding the audio bytes.
 * @throws BrainError when the fetch responds with a non-OK status.
 */
async function audioSourceToFile(audio: AudioSource): Promise<File> {
  if (audio.type !== 'base64') {
    const fetched = await fetch(audio.url)
    if (!fetched.ok) {
      throw new BrainError(
        `OpenAIProvider.transcribe: failed to fetch audio at ${audio.url}: ${fetched.status} ${fetched.statusText}.`,
        { context: { url: audio.url, status: fetched.status } },
      )
    }
    const payload = await fetched.arrayBuffer()
    const contentType = fetched.headers.get('content-type') ?? 'audio/mpeg'
    const remoteName = `audio.${extFromMime(contentType)}`
    return new File([payload], remoteName, { type: contentType })
  }

  const decoded = Buffer.from(audio.data, 'base64')
  const inlineName = `audio.${extFromMime(audio.mediaType)}`
  return new File([decoded], inlineName, { type: audio.mediaType })
}
1036
+
1037
/**
 * Known audio MIME types (lower-case, parameters stripped) mapped to the
 * filename extension used for the synthesized upload name.
 */
const AUDIO_EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map<string, string>([
  ['audio/mp3', 'mp3'],
  ['audio/mpeg', 'mp3'],
  ['audio/mpga', 'mp3'],
  ['audio/wav', 'wav'],
  ['audio/x-wav', 'wav'],
  ['audio/wave', 'wav'],
  ['audio/vnd.wave', 'wav'],
  ['audio/ogg', 'ogg'],
  ['application/ogg', 'ogg'],
  ['audio/flac', 'flac'],
  ['audio/x-flac', 'flac'],
  ['audio/webm', 'webm'],
  ['video/webm', 'webm'],
  ['audio/aac', 'm4a'],
  ['audio/x-aac', 'm4a'],
  ['audio/mp4', 'm4a'],
  ['audio/m4a', 'm4a'],
  ['audio/x-m4a', 'm4a'],
  ['video/mp4', 'mp4'],
])

/**
 * Picks a filename extension for an audio MIME type.
 *
 * Parameters after `;` are ignored and matching is case-insensitive.
 * Common aliases (`audio/x-m4a`, `audio/wave`, `audio/vnd.wave`,
 * `audio/x-flac`, `application/ogg`, `video/mp4`, `video/webm`) resolve to
 * their real container extension instead of the fallback, so the
 * synthesized filename agrees with the payload.
 *
 * @param mime A MIME type, optionally with parameters (e.g. `audio/mpeg; codecs=...`).
 * @returns The mapped extension, or `'mp3'` for anything unrecognized.
 */
function extFromMime(mime: string): string {
  // Strip parameters (`audio/mpeg; codecs=...` → `audio/mpeg`).
  const essence = mime.split(';')[0]?.trim().toLowerCase() ?? ''
  return AUDIO_EXTENSION_BY_MIME.get(essence) ?? 'mp3'
}
1048
+
1049
/**
 * Throws if the given signal has already been aborted; otherwise does nothing.
 *
 * The thrown value is the signal's own `reason` when it has one, and an
 * `AbortError` `DOMException` otherwise.
 *
 * @param signal Optional abort signal to inspect.
 */
function checkAborted(signal: AbortSignal | undefined): void {
  if (!signal?.aborted) return
  const cause: unknown = signal.reason
  throw cause ?? new DOMException('Aborted', 'AbortError')
}
1055
+
392
1056
  function systemPromptText(system: SystemPrompt | undefined): string {
393
1057
  if (system === undefined) return ''
394
1058
  if (typeof system === 'string') return system
@@ -425,8 +1089,47 @@ function toOpenAIMessage(message: Message): OpenAI.Chat.ChatCompletionMessagePar
425
1089
  return param
426
1090
  }
427
1091
 
428
- // User-role multi-block content flatten text. MCP blocks (which
429
- // are read-only and Anthropic-specific) are silently dropped.
1092
+ // Document / audio aren't supported by OpenAI's chat completions
1093
+ // API. Throw with vendor-specific guidance so apps don't waste a
1094
+ // 400 trying to send a PDF.
1095
+ for (const block of message.content) {
1096
+ if (block.type === 'document') {
1097
+ throw new BrainError(
1098
+ "OpenAIProvider: document blocks are not supported on OpenAI's chat completions API. For PDFs, split the document to images (one per page) and send them as ImageBlocks on a vision-capable model (gpt-5 / gpt-4o family); or route document workloads to Anthropic / Gemini, which accept PDF blocks natively.",
1099
+ { context: { provider: 'openai' } },
1100
+ )
1101
+ }
1102
+ if (block.type === 'audio') {
1103
+ throw new BrainError(
1104
+ "OpenAIProvider: audio blocks are not supported on OpenAI's chat completions API. Transcribe audio upstream via OpenAI's Whisper / gpt-4o-transcribe and send the resulting text; or route audio workloads to Gemini, which accepts audio blocks natively.",
1105
+ { context: { provider: 'openai' } },
1106
+ )
1107
+ }
1108
+ }
1109
+
1110
+ // User-role multi-block content. If any image blocks are present,
1111
+ // emit OpenAI's multi-part content array (text + image_url
1112
+ // entries). Otherwise flatten text — keeps simple text messages
1113
+ // cleanly typed as strings. MCP blocks (read-only,
1114
+ // Anthropic-specific) are silently dropped.
1115
+ const images = message.content.filter((b): b is ImageBlock => b.type === 'image')
1116
+ if (images.length > 0) {
1117
+ const parts: OpenAI.Chat.ChatCompletionContentPart[] = []
1118
+ for (const block of message.content) {
1119
+ if (block.type === 'text') {
1120
+ parts.push({ type: 'text', text: block.text })
1121
+ } else if (block.type === 'image') {
1122
+ const url =
1123
+ block.source.type === 'base64'
1124
+ ? `data:${block.source.mediaType};base64,${block.source.data}`
1125
+ : block.source.url
1126
+ parts.push({ type: 'image_url', image_url: { url } })
1127
+ }
1128
+ // tool_result / tool_use / mcp blocks dropped from user content
1129
+ // (they're handled elsewhere or aren't valid on user turns).
1130
+ }
1131
+ return { role: 'user', content: parts }
1132
+ }
430
1133
  const text = message.content
431
1134
  .filter((b): b is TextBlock => b.type === 'text')
432
1135
  .map((b) => b.text)