@strav/brain 0.4.28 → 0.4.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@strav/brain",
3
- "version": "0.4.28",
3
+ "version": "0.4.29",
4
4
  "type": "module",
5
5
  "description": "AI module for the Strav framework",
6
6
  "license": "MIT",
@@ -15,15 +15,15 @@
15
15
  "CHANGELOG.md"
16
16
  ],
17
17
  "peerDependencies": {
18
- "@strav/kernel": "0.4.28"
18
+ "@strav/kernel": "0.4.29"
19
19
  },
20
20
  "dependencies": {
21
- "@strav/mcp": "0.4.28",
22
- "@strav/workflow": "0.4.28",
21
+ "@strav/mcp": "0.4.29",
22
+ "@strav/workflow": "0.4.29",
23
23
  "zod": "^3.25 || ^4.0"
24
24
  },
25
25
  "devDependencies": {
26
- "@strav/http": "0.4.28"
26
+ "@strav/http": "0.4.29"
27
27
  },
28
28
  "scripts": {
29
29
  "test": "bun test tests/",
@@ -9,6 +9,8 @@ import type {
9
9
  CompletionResponse,
10
10
  BeforeHook,
11
11
  AfterHook,
12
+ TranscribeRequest,
13
+ TranscriptionResponse,
12
14
  } from './types.ts'
13
15
  import type { MemoryConfig, ThreadStore } from './memory/types.ts'
14
16
 
@@ -125,6 +127,26 @@ export default class BrainManager {
125
127
  return response
126
128
  }
127
129
 
130
/**
 * Run speech-to-text against a registered provider.
 *
 * The provider is resolved through `BrainManager.provider(providerName)`.
 * A provider without a `transcribe()` implementation is rejected with a
 * ConfigurationError. This path does not run any before/after hooks;
 * the request goes straight to the provider.
 *
 * @param providerName Name handed to `BrainManager.provider()`; may be undefined.
 * @param request Audio payload plus transcription options.
 * @returns Whatever the provider's `transcribe()` resolves to.
 * @throws ConfigurationError when the resolved provider has no `transcribe()`.
 */
static async transcribe(
  providerName: string | undefined,
  request: TranscribeRequest
): Promise<TranscriptionResponse> {
  const target = BrainManager.provider(providerName)

  if (target.transcribe) {
    return target.transcribe(request)
  }

  const reason = `AI provider "${target.name}" does not support transcribe(). `
  const hint = `Use the OpenAI or Google providers, or register a custom one via BrainManager.useProvider().`
  throw new ConfigurationError(reason + hint)
}
149
+
128
150
  /** Clear all providers, hooks, and stores (for testing). */
129
151
  static reset(): void {
130
152
  BrainManager._providers.clear()
package/src/helpers.ts CHANGED
@@ -24,6 +24,8 @@ import type {
24
24
  SerializedAgentState,
25
25
  SuspendedRun,
26
26
  ToolCallResult,
27
+ TranscribeRequest,
28
+ TranscriptionResponse,
27
29
  } from './types.ts'
28
30
 
29
31
  // ── Shared tool executor ─────────────────────────────────────────────────────
@@ -88,6 +90,16 @@ export interface EmbedOptions {
88
90
  model?: string
89
91
  }
90
92
 
93
/**
 * Options accepted by `brain.transcribe()`.
 *
 * Carries the same fields as `TranscribeRequest` (`audio`, `contentType`,
 * `model`, `language`, `prompt`, `filename`) plus the provider selector.
 */
export interface TranscribeOptions extends TranscribeRequest {
  /** Provider name forwarded to `BrainManager.transcribe()`. */
  provider?: string
}
102
+
91
103
  // ── brain Helper Object ─────────────────────────────────────────────────────
92
104
 
93
105
  export const brain = {
@@ -199,6 +211,33 @@ export const brain = {
199
211
  return result.embeddings
200
212
  },
201
213
 
214
/**
 * Speech-to-text convenience wrapper around `BrainManager.transcribe()`.
 *
 * `options.provider` picks the provider; the remaining fields (`audio`,
 * `contentType`, `model`, `language`, `prompt`, `filename`) are forwarded
 * unchanged as the transcription request.
 *
 * @example
 * // Voice note coming off a LINE inbound webhook
 * const { bytes, contentType } = await LineManager.client.downloadContent(messageId)
 * const { text } = await brain.transcribe({
 *   audio: bytes,
 *   contentType, // 'audio/m4a' from LINE
 *   language: 'th',
 *   prompt: 'Coffee shop menu items, Bangkok area names',
 * })
 */
async transcribe(options: TranscribeOptions): Promise<TranscriptionResponse> {
  const { provider, audio, contentType, model, language, prompt, filename } = options
  const request: TranscribeRequest = { audio, contentType, model, language, prompt, filename }
  return BrainManager.transcribe(provider, request)
},
240
+
202
241
  /** Create a fluent agent runner. */
203
242
  agent<T extends Agent>(AgentClass: new () => T): AgentRunner<T> {
204
243
  return new AgentRunner(AgentClass)
@@ -10,6 +10,8 @@ import type {
10
10
  ProviderConfig,
11
11
  Message,
12
12
  ToolCall,
13
+ TranscribeRequest,
14
+ TranscriptionResponse,
13
15
  Usage,
14
16
  } from '../types.ts'
15
17
 
@@ -178,6 +180,65 @@ export class GoogleProvider implements AIProvider {
178
180
  }
179
181
  }
180
182
 
183
/**
 * Transcribe audio by calling Gemini's `generateContent` endpoint.
 *
 * The audio is base64-encoded and sent as an `inline_data` part next to
 * a text instruction asking for a plain transcription. The model
 * defaults to `gemini-2.5-flash` and the MIME type to `audio/mpeg` when
 * the request does not specify them. Temperature is pinned to 0.
 *
 * Inline audio is capped at ~20MB across the whole request; longer
 * recordings need chunking or Gemini's Files API, neither of which is
 * handled here.
 *
 * @param request Audio payload and options; `filename` is not used.
 * @returns Concatenated text parts of the first candidate, the request's
 *   own `language` hint echoed back, and the raw response JSON.
 */
async transcribe(request: TranscribeRequest): Promise<TranscriptionResponse> {
  const { audio, language } = request
  const modelId = request.model ?? 'gemini-2.5-flash'
  const mimeType = request.contentType ?? 'audio/mpeg'

  // Normalise both accepted input shapes to raw bytes before encoding.
  let audioBytes: Uint8Array
  if (audio instanceof Blob) {
    audioBytes = new Uint8Array(await audio.arrayBuffer())
  } else {
    audioBytes = audio
  }

  const audioPart = { inline_data: { mime_type: mimeType, data: encodeBase64(audioBytes) } }
  const textPart = { text: buildTranscriptionInstruction(request) }

  const payload = {
    contents: [{ role: 'user', parts: [textPart, audioPart] }],
    // Temperature 0: transcription should not vary between runs.
    generationConfig: { temperature: 0 },
  }

  const endpoint = `${this.baseUrl}/models/${modelId}:generateContent`
  const response = await retryableFetch(
    'Google',
    endpoint,
    { method: 'POST', headers: this.buildHeaders(), body: JSON.stringify(payload) },
    this.retryOptions
  )

  const data: any = await response.json()

  return { text: extractTranscript(data), language, raw: data }
}
241
+
181
242
  // ── Private helpers ──────────────────────────────────────────────────────
182
243
 
183
244
  private buildHeaders(): Record<string, string> {
@@ -395,4 +456,41 @@ export class GoogleProvider implements AIProvider {
395
456
  private generateResponseId(): string {
396
457
  return `resp_${Math.random().toString(36).substring(2, 15)}`
397
458
  }
459
+ }
460
+
461
/**
 * Compose the text instruction sent alongside the audio part.
 *
 * Always starts with the base "transcribe only" sentence. A language
 * sentence is appended when `request.language` is set, and a vocabulary
 * hint when `request.prompt` is set. Sentences are separated by a single
 * space.
 */
function buildTranscriptionInstruction(request: TranscribeRequest): string {
  const { language, prompt } = request

  let instruction =
    'Transcribe the audio to text. Return only the transcription, without commentary, timestamps, or speaker labels.'

  if (language) {
    instruction += ` The audio is in ${language}. Preserve the original language in the output.`
  }

  if (prompt) {
    // The caller's priming hint (proper nouns, menu items, dialect
    // markers) rides along in the same text block as the instruction.
    instruction += ` Context to help with vocabulary: ${prompt}`
  }

  return instruction
}
477
+
478
/**
 * Pull the transcript text out of a `generateContent` response.
 *
 * Concatenates the string `text` of every part in the first candidate
 * and trims the result. Parts without a string `text` contribute
 * nothing. Returns an empty string when the response has no first
 * candidate, no content, or when `parts` is missing or not an array, so
 * a malformed payload never surfaces as a TypeError.
 *
 * @param data Parsed JSON body of the response (shape not validated).
 * @returns The trimmed transcript, or '' when none can be extracted.
 */
function extractTranscript(data: any): string {
  const parts: unknown = data?.candidates?.[0]?.content?.parts
  // Only a real array has `.map`; treat any other shape like "no parts".
  if (!Array.isArray(parts)) return ''
  return parts
    .map((part) => (typeof part?.text === 'string' ? part.text : ''))
    .join('')
    .trim()
}
486
+
487
/**
 * Base64-encode raw bytes.
 *
 * Uses `Buffer` when the runtime provides it; otherwise builds a binary
 * string one char code per byte and hands it to `btoa`.
 */
function encodeBase64(bytes: Uint8Array): string {
  if (typeof Buffer === 'undefined') {
    // No Buffer global: go through a binary string and btoa instead.
    const chars = Array.from(bytes, (byte) => String.fromCharCode(byte))
    return btoa(chars.join(''))
  }
  return Buffer.from(bytes).toString('base64')
}
@@ -10,6 +10,8 @@ import type {
10
10
  ProviderConfig,
11
11
  Message,
12
12
  ToolCall,
13
+ TranscribeRequest,
14
+ TranscriptionResponse,
13
15
  Usage,
14
16
  } from '../types.ts'
15
17
 
@@ -171,6 +173,53 @@ export class OpenAIProvider implements AIProvider {
171
173
  }
172
174
  }
173
175
 
176
/**
 * Speech-to-text via OpenAI's `/v1/audio/transcriptions` endpoint.
 *
 * Defaults to `whisper-1`. Pass `model: 'gpt-4o-transcribe'` or
 * `'gpt-4o-mini-transcribe'` to use the newer models.
 *
 * Response format: `verbose_json` is requested so `language` and
 * `duration` can be surfaced without a second round-trip. The
 * `gpt-4o-*-transcribe` models do not accept `verbose_json`, so `json`
 * is requested for them instead; `language` and `duration` are then
 * only populated if the response happens to include them.
 *
 * @param request Audio payload and options. A `Uint8Array` is wrapped in
 *   a Blob typed with `contentType` (or 'application/octet-stream'); a
 *   Blob is sent as-is.
 * @returns Normalized transcription plus the raw response JSON.
 */
async transcribe(request: TranscribeRequest): Promise<TranscriptionResponse> {
  const model = request.model ?? 'whisper-1'
  // Only the gpt-4o transcribe family is switched to plain `json`; every
  // other model id (including custom ones) keeps the original verbose_json.
  const responseFormat = /^gpt-4o(-mini)?-transcribe/.test(model) ? 'json' : 'verbose_json'

  const filename = request.filename ?? defaultFilename(request.contentType)
  const contentType = request.contentType ?? 'application/octet-stream'
  const blob =
    request.audio instanceof Blob
      ? request.audio
      : new Blob([request.audio], { type: contentType })

  const form = new FormData()
  form.append('file', blob, filename)
  form.append('model', model)
  form.append('response_format', responseFormat)
  if (request.language) form.append('language', request.language)
  if (request.prompt) form.append('prompt', request.prompt)

  const response = await retryableFetch(
    'OpenAI',
    `${this.baseUrl}/v1/audio/transcriptions`,
    {
      method: 'POST',
      // Don't set Content-Type — the runtime sets it with the
      // multipart boundary derived from the FormData body.
      headers: { Authorization: `Bearer ${this.apiKey}` },
      body: form,
    },
    this.retryOptions
  )

  const data: any = await response.json()
  return {
    text: String(data?.text ?? ''),
    language: typeof data?.language === 'string' ? data.language : undefined,
    duration: typeof data?.duration === 'number' ? data.duration : undefined,
    raw: data,
  }
}
222
+
174
223
  // ── Private helpers ──────────────────────────────────────────────────────
175
224
 
176
225
  private isReasoningModel(model: string): boolean {
@@ -507,3 +556,14 @@ export class OpenAIProvider implements AIProvider {
507
556
  return schema
508
557
  }
509
558
  }
559
+
560
/**
 * Choose a multipart filename for the transcription upload based on the
 * audio MIME type, so the name's extension matches the actual format.
 *
 * The subtype is taken from `type/subtype[; params]`, lower-cased, and
 * normalized so common legacy/vendor spellings map to the extension the
 * format is normally known by:
 *   - a leading `x-` is dropped (`audio/x-m4a` → `audio.m4a`)
 *   - `wave` and `vnd.wave` become `wav`
 * Any other subtype is used as the extension unchanged.
 *
 * @param contentType MIME type of the audio, if known.
 * @returns `audio.<ext>`, or `audio.bin` when no subtype can be derived.
 */
function defaultFilename(contentType?: string): string {
  if (!contentType) return 'audio.bin'

  const subtype = contentType.split(';')[0]?.split('/')[1]?.trim().toLowerCase()
  if (!subtype) return 'audio.bin'

  // Strip the legacy experimental prefix before looking up aliases.
  const bare = subtype.replace(/^x-/, '')
  const aliases: Record<string, string> = { wave: 'wav', 'vnd.wave': 'wav' }
  const ext = aliases[bare] ?? bare

  return ext ? `audio.${ext}` : 'audio.bin'
}
package/src/types.ts CHANGED
@@ -175,6 +175,52 @@ export interface EmbeddingResponse {
175
175
  usage: { totalTokens: number }
176
176
  }
177
177
 
178
+ // ── Transcription (Speech-to-Text) ───────────────────────────────────────────
179
+
180
/** Provider-agnostic input for an `AIProvider.transcribe()` call. */
export interface TranscribeRequest {
  /** Audio bytes. Most STT endpoints cap at ~25MB; chunk longer recordings. */
  audio: Uint8Array | Blob
  /**
   * MIME type of the audio. Examples: 'audio/m4a', 'audio/mpeg',
   * 'audio/wav', 'audio/ogg', 'audio/webm', 'audio/flac'.
   *
   * The OpenAI provider uses it to type the uploaded Blob and to derive a
   * default filename; when omitted it falls back to
   * 'application/octet-stream'. The Google provider sends it as the
   * inline part's `mime_type`; when omitted it falls back to 'audio/mpeg'.
   */
  contentType?: string
  /** Override the provider's default STT model. */
  model?: string
  /**
   * Language hint (e.g. 'th', 'en', 'zh'); omit for auto-detection.
   * The OpenAI provider sends it as the `language` form field (Whisper
   * accepts ISO-639-1 codes). The Google provider interpolates the value
   * verbatim into its text instruction.
   */
  language?: string
  /**
   * Optional priming prompt — gives the model vocabulary or context to
   * bias toward (proper nouns, brand names, menu items, dialect markers).
   * The OpenAI provider sends it as the `prompt` form field; the Google
   * provider appends it to the text instruction sent with the audio.
   */
  prompt?: string
  /**
   * Filename to send in the multipart form (OpenAI provider only). When
   * omitted, the OpenAI provider derives an `audio.<ext>` name from
   * `contentType`, and uses 'audio.bin' only when `contentType` is
   * missing as well.
   */
  filename?: string
}
212
+
213
/** Normalized result of an `AIProvider.transcribe()` call. */
export interface TranscriptionResponse {
  /** Transcribed text. */
  text: string
  /**
   * Language of the audio, when available. The OpenAI provider passes
   * through the `language` string from the API response; the Google
   * provider echoes the request's own `language` hint rather than a
   * detected value.
   */
  language?: string
  /**
   * Audio duration in seconds, when the provider reports one. The OpenAI
   * provider sets it from a numeric `duration` in the API response; the
   * Google provider does not set it.
   */
  duration?: number
  /** Original provider response for callers that need provider-specific fields. */
  raw: unknown
}
223
+
178
224
  // ── Provider ─────────────────────────────────────────────────────────────────
179
225
 
180
226
  export interface AIProvider {
@@ -182,6 +228,13 @@ export interface AIProvider {
182
228
  complete(request: CompletionRequest): Promise<CompletionResponse>
183
229
  stream(request: CompletionRequest): AsyncIterable<StreamChunk>
184
230
  embed?(input: string | string[], model?: string): Promise<EmbeddingResponse>
231
+ /**
232
+ * Transcribe audio to text. Implemented by providers that expose a
233
+ * speech-to-text endpoint (OpenAI Whisper, Google Gemini's multimodal
234
+ * generateContent). Throws or remains undefined for providers without
235
+ * STT (Anthropic at time of writing).
236
+ */
237
+ transcribe?(request: TranscribeRequest): Promise<TranscriptionResponse>
185
238
  }
186
239
 
187
240
  // ── Hooks ────────────────────────────────────────────────────────────────────