@strav/brain 0.4.28 → 0.4.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -5
- package/src/brain_manager.ts +22 -0
- package/src/helpers.ts +39 -0
- package/src/providers/google_provider.ts +98 -0
- package/src/providers/openai_provider.ts +60 -0
- package/src/types.ts +53 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@strav/brain",
|
|
3
|
-
"version": "0.4.28",
|
|
3
|
+
"version": "0.4.29",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "AI module for the Strav framework",
|
|
6
6
|
"license": "MIT",
|
|
@@ -15,15 +15,15 @@
|
|
|
15
15
|
"CHANGELOG.md"
|
|
16
16
|
],
|
|
17
17
|
"peerDependencies": {
|
|
18
|
-
"@strav/kernel": "0.4.28"
|
|
18
|
+
"@strav/kernel": "0.4.29"
|
|
19
19
|
},
|
|
20
20
|
"dependencies": {
|
|
21
|
-
"@strav/mcp": "0.4.28",
|
|
22
|
-
"@strav/workflow": "0.4.28",
|
|
21
|
+
"@strav/mcp": "0.4.29",
|
|
22
|
+
"@strav/workflow": "0.4.29",
|
|
23
23
|
"zod": "^3.25 || ^4.0"
|
|
24
24
|
},
|
|
25
25
|
"devDependencies": {
|
|
26
|
-
"@strav/http": "0.4.28"
|
|
26
|
+
"@strav/http": "0.4.29"
|
|
27
27
|
},
|
|
28
28
|
"scripts": {
|
|
29
29
|
"test": "bun test tests/",
|
package/src/brain_manager.ts
CHANGED
|
@@ -9,6 +9,8 @@ import type {
|
|
|
9
9
|
CompletionResponse,
|
|
10
10
|
BeforeHook,
|
|
11
11
|
AfterHook,
|
|
12
|
+
TranscribeRequest,
|
|
13
|
+
TranscriptionResponse,
|
|
12
14
|
} from './types.ts'
|
|
13
15
|
import type { MemoryConfig, ThreadStore } from './memory/types.ts'
|
|
14
16
|
|
|
@@ -125,6 +127,26 @@ export default class BrainManager {
|
|
|
125
127
|
return response
|
|
126
128
|
}
|
|
127
129
|
|
|
130
|
+
/**
 * Run speech-to-text through a registered provider.
 *
 * Looks the provider up with `BrainManager.provider(providerName)` and
 * delegates to its optional `transcribe()` method. Before/after hooks are
 * not invoked — they are shaped for chat completions and don't carry an
 * audio analogue.
 *
 * @param providerName Name handed to `BrainManager.provider()`.
 * @param request Audio payload plus optional model/language/prompt hints.
 * @returns Whatever the provider's `transcribe()` resolves to.
 * @throws ConfigurationError when the resolved provider does not implement
 *   `transcribe()` (Anthropic, at time of writing).
 */
static async transcribe(
  providerName: string | undefined,
  request: TranscribeRequest
): Promise<TranscriptionResponse> {
  const target = BrainManager.provider(providerName)

  // Happy path first: the provider exposes speech-to-text.
  if (target.transcribe) {
    return target.transcribe(request)
  }

  const problem = `AI provider "${target.name}" does not support transcribe(). `
  const remedy = `Use the OpenAI or Google providers, or register a custom one via BrainManager.useProvider().`
  throw new ConfigurationError(problem + remedy)
}
|
|
149
|
+
|
|
128
150
|
/** Clear all providers, hooks, and stores (for testing). */
|
|
129
151
|
static reset(): void {
|
|
130
152
|
BrainManager._providers.clear()
|
package/src/helpers.ts
CHANGED
|
@@ -24,6 +24,8 @@ import type {
|
|
|
24
24
|
SerializedAgentState,
|
|
25
25
|
SuspendedRun,
|
|
26
26
|
ToolCallResult,
|
|
27
|
+
TranscribeRequest,
|
|
28
|
+
TranscriptionResponse,
|
|
27
29
|
} from './types.ts'
|
|
28
30
|
|
|
29
31
|
// ── Shared tool executor ─────────────────────────────────────────────────────
|
|
@@ -88,6 +90,16 @@ export interface EmbedOptions {
|
|
|
88
90
|
model?: string
|
|
89
91
|
}
|
|
90
92
|
|
|
93
|
+
/**
 * Options accepted by `brain.transcribe()`.
 *
 * Carries every `TranscribeRequest` field (`audio`, `contentType`,
 * `language`, `prompt`, `filename`, `model`) plus the name of the provider
 * the call should be routed to.
 */
export interface TranscribeOptions extends TranscribeRequest {
  /** Provider name forwarded to `BrainManager.transcribe()`. */
  provider?: string
}
|
|
102
|
+
|
|
91
103
|
// ── brain Helper Object ─────────────────────────────────────────────────────
|
|
92
104
|
|
|
93
105
|
export const brain = {
|
|
@@ -199,6 +211,33 @@ export const brain = {
|
|
|
199
211
|
return result.embeddings
|
|
200
212
|
},
|
|
201
213
|
|
|
214
|
+
/**
 * Speech-to-text convenience wrapper around `BrainManager.transcribe()`.
 *
 * `options.provider` picks the provider (OpenAI Whisper by default; pass
 * `provider: 'google'` for Gemini's multimodal generateContent endpoint).
 * The remaining fields are forwarded unchanged as the transcription
 * request. Both providers accept a `language` hint (BCP-47) and a
 * `prompt` to bias vocabulary.
 *
 * @example
 * // Voice note coming off a LINE inbound webhook
 * const { bytes, contentType } = await LineManager.client.downloadContent(messageId)
 * const { text } = await brain.transcribe({
 *   audio: bytes,
 *   contentType, // 'audio/m4a' from LINE
 *   language: 'th',
 *   prompt: 'Coffee shop menu items, Bangkok area names',
 * })
 */
async transcribe(options: TranscribeOptions): Promise<TranscriptionResponse> {
  const { provider, audio, contentType, model, language, prompt, filename } = options

  // Everything except `provider` belongs to the provider-level request.
  const request: TranscribeRequest = { audio, contentType, model, language, prompt, filename }

  return BrainManager.transcribe(provider, request)
},
|
|
240
|
+
|
|
202
241
|
/** Create a fluent agent runner. */
|
|
203
242
|
agent<T extends Agent>(AgentClass: new () => T): AgentRunner<T> {
|
|
204
243
|
return new AgentRunner(AgentClass)
|
|
package/src/providers/google_provider.ts
CHANGED
|
@@ -10,6 +10,8 @@ import type {
|
|
|
10
10
|
ProviderConfig,
|
|
11
11
|
Message,
|
|
12
12
|
ToolCall,
|
|
13
|
+
TranscribeRequest,
|
|
14
|
+
TranscriptionResponse,
|
|
13
15
|
Usage,
|
|
14
16
|
} from '../types.ts'
|
|
15
17
|
|
|
@@ -178,6 +180,65 @@ export class GoogleProvider implements AIProvider {
|
|
|
178
180
|
}
|
|
179
181
|
}
|
|
180
182
|
|
|
183
|
+
/**
 * Speech-to-text via Gemini's multimodal generateContent endpoint.
 *
 * Gemini doesn't have a dedicated STT endpoint; instead, audio is
 * passed as an inline `audio/*` part alongside a text prompt asking
 * for a transcription. We default to `gemini-2.5-flash` (fast, cheap,
 * Thai-capable). Override `model` for `gemini-2.5-pro` when accuracy
 * matters more than latency.
 *
 * Inline audio is capped at ~20MB across the whole request. Chunk
 * longer recordings, or use Gemini's Files API (upload + reference)
 * which isn't covered here — out of scope for the typical SME
 * voice-note flow (<=60s clips).
 *
 * MIME type resolution: `request.contentType` wins; otherwise a
 * non-empty `Blob.type` is used; only when neither is available does
 * the call fall back to `'audio/mpeg'`.
 *
 * @param request Audio bytes (or Blob) plus optional model, MIME type,
 *   language hint and vocabulary prompt.
 * @returns The concatenated text parts of the first candidate, the
 *   caller's `language` hint echoed back (Gemini's response is not
 *   inspected for a language), and the parsed response body as `raw`.
 */
async transcribe(request: TranscribeRequest): Promise<TranscriptionResponse> {
  const model = request.model ?? 'gemini-2.5-flash'
  const { audio } = request

  // A typed Blob already knows what it contains; labelling it
  // 'audio/mpeg' regardless would misdescribe e.g. a WAV or OGG clip.
  const blobType = audio instanceof Blob && audio.type ? audio.type : undefined
  const contentType = request.contentType ?? blobType ?? 'audio/mpeg'

  const bytes = audio instanceof Blob ? new Uint8Array(await audio.arrayBuffer()) : audio
  const base64 = encodeBase64(bytes)

  const instruction = buildTranscriptionInstruction(request)

  const body = {
    contents: [
      {
        role: 'user',
        parts: [
          { text: instruction },
          { inline_data: { mime_type: contentType, data: base64 } },
        ],
      },
    ],
    generationConfig: {
      // Deterministic output for a transcription task.
      temperature: 0,
    },
  }

  const response = await retryableFetch(
    'Google',
    `${this.baseUrl}/models/${model}:generateContent`,
    { method: 'POST', headers: this.buildHeaders(), body: JSON.stringify(body) },
    this.retryOptions
  )

  const data: any = await response.json()
  const text = extractTranscript(data)

  return {
    text,
    language: request.language,
    raw: data,
  }
}
|
|
241
|
+
|
|
181
242
|
// ── Private helpers ──────────────────────────────────────────────────────
|
|
182
243
|
|
|
183
244
|
private buildHeaders(): Record<string, string> {
|
|
@@ -395,4 +456,41 @@ export class GoogleProvider implements AIProvider {
|
|
|
395
456
|
private generateResponseId(): string {
|
|
396
457
|
return `resp_${Math.random().toString(36).substring(2, 15)}`
|
|
397
458
|
}
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
/**
 * Compose the text instruction that accompanies the inline audio part.
 *
 * Always starts with the base "transcribe only" directive, then appends
 * (space-separated) a language note when `request.language` is set and a
 * vocabulary hint when `request.prompt` is set.
 */
function buildTranscriptionInstruction(request: TranscribeRequest): string {
  const { language, prompt } = request

  let instruction =
    'Transcribe the audio to text. Return only the transcription, without commentary, timestamps, or speaker labels.'

  if (language) {
    instruction += ` The audio is in ${language}. Preserve the original language in the output.`
  }

  if (prompt) {
    // The priming hint (proper nouns, menu items, dialect markers) rides
    // along in the same instruction text rather than a separate
    // system-instruction field.
    instruction += ` Context to help with vocabulary: ${prompt}`
  }

  return instruction
}
|
|
477
|
+
|
|
478
|
+
/**
 * Pull the transcript text out of a generateContent response body.
 *
 * Reads `candidates[0].content.parts`, concatenates the `text` of every
 * part that has a string `text`, and trims the result. Parts without a
 * string `text` contribute nothing.
 *
 * @param data Parsed JSON response body (shape not guaranteed).
 * @returns The trimmed transcript, or `''` when the first candidate has
 *   no usable `parts` array (missing candidate, missing content, or a
 *   `parts` value that is not an array).
 */
function extractTranscript(data: any): string {
  const parts: unknown = data?.candidates?.[0]?.content?.parts

  // Guard on array-ness, not truthiness: a malformed payload with a
  // non-array `parts` must yield an empty transcript, not a TypeError.
  if (!Array.isArray(parts)) return ''

  return parts
    .map((part: any) => (typeof part?.text === 'string' ? part.text : ''))
    .join('')
    .trim()
}
|
|
486
|
+
|
|
487
|
+
/**
 * Base64-encode raw bytes.
 *
 * Uses `Buffer` when the runtime provides it (Node / Bun). Otherwise the
 * bytes are mapped one-to-one onto character codes and passed to `btoa`
 * — a path not used in this codebase, kept for parity with bun-types.
 */
function encodeBase64(bytes: Uint8Array): string {
  const hasBuffer = typeof Buffer !== 'undefined'

  if (!hasBuffer) {
    const chars: string[] = []
    bytes.forEach((byte) => {
      chars.push(String.fromCharCode(byte))
    })
    return btoa(chars.join(''))
  }

  return Buffer.from(bytes).toString('base64')
}
|
|
package/src/providers/openai_provider.ts
CHANGED
|
@@ -10,6 +10,8 @@ import type {
|
|
|
10
10
|
ProviderConfig,
|
|
11
11
|
Message,
|
|
12
12
|
ToolCall,
|
|
13
|
+
TranscribeRequest,
|
|
14
|
+
TranscriptionResponse,
|
|
13
15
|
Usage,
|
|
14
16
|
} from '../types.ts'
|
|
15
17
|
|
|
@@ -171,6 +173,53 @@ export class OpenAIProvider implements AIProvider {
|
|
|
171
173
|
}
|
|
172
174
|
}
|
|
173
175
|
|
|
176
|
+
/**
 * Speech-to-text via the OpenAI audio transcription endpoint
 * (/v1/audio/transcriptions).
 *
 * Defaults to `whisper-1` — the long-standing, broadly supported model.
 * Override with `gpt-4o-transcribe` or `gpt-4o-mini-transcribe` for the
 * newer architecture (better noise/accent robustness, similar pricing).
 *
 * Response format: `verbose_json` is requested so `language` and
 * `duration` can be surfaced without a second round-trip. The
 * `gpt-4o-*-transcribe` models do not accept `verbose_json`, so plain
 * `json` is requested for them; `language` and `duration` are then
 * `undefined` on the result.
 *
 * Filename resolution: `request.filename` wins; otherwise one is derived
 * from `request.contentType`, or from a non-empty `Blob.type` when the
 * audio is a Blob and no `contentType` was given.
 *
 * @param request Audio bytes (or Blob) plus optional model, MIME type,
 *   filename, language hint and vocabulary prompt.
 * @returns `text` coerced to a string (empty when absent), `language`
 *   and `duration` when the response carries them with the expected
 *   types, and the parsed response body as `raw`.
 */
async transcribe(request: TranscribeRequest): Promise<TranscriptionResponse> {
  const { audio } = request
  const model = request.model ?? 'whisper-1'

  // A typed Blob already knows its format; use it for the filename when
  // the caller did not pass an explicit contentType.
  const blobType = audio instanceof Blob && audio.type ? audio.type : undefined
  const filename = request.filename ?? defaultFilename(request.contentType ?? blobType)

  const blob =
    audio instanceof Blob
      ? audio
      : new Blob([audio], { type: request.contentType ?? 'application/octet-stream' })

  // gpt-4o-transcribe / gpt-4o-mini-transcribe reject `verbose_json`
  // with a 400; every other model keeps the richer format.
  const responseFormat = /^gpt-4o(-mini)?-transcribe/.test(model) ? 'json' : 'verbose_json'

  const form = new FormData()
  form.append('file', blob, filename)
  form.append('model', model)
  form.append('response_format', responseFormat)
  if (request.language) form.append('language', request.language)
  if (request.prompt) form.append('prompt', request.prompt)

  const response = await retryableFetch(
    'OpenAI',
    `${this.baseUrl}/v1/audio/transcriptions`,
    {
      method: 'POST',
      // Don't set Content-Type — the runtime sets it with the
      // multipart boundary derived from the FormData body.
      headers: { Authorization: `Bearer ${this.apiKey}` },
      body: form,
    },
    this.retryOptions
  )

  const data: any = await response.json()
  return {
    text: String(data.text ?? ''),
    language: typeof data.language === 'string' ? data.language : undefined,
    duration: typeof data.duration === 'number' ? data.duration : undefined,
    raw: data,
  }
}
|
|
222
|
+
|
|
174
223
|
// ── Private helpers ──────────────────────────────────────────────────────
|
|
175
224
|
|
|
176
225
|
private isReasoningModel(model: string): boolean {
|
|
@@ -507,3 +556,14 @@ export class OpenAIProvider implements AIProvider {
|
|
|
507
556
|
return schema
|
|
508
557
|
}
|
|
509
558
|
}
|
|
559
|
+
|
|
560
|
+
/**
 * Choose a multipart filename for Whisper based on the content type.
 * Whisper sniffs the extension when no MIME is supplied; sending a name
 * that matches the actual format avoids "unsupported file" 400s.
 *
 * The extension is the MIME subtype with parameters (`; codecs=…`)
 * dropped, lower-cased, and normalized so that non-standard spellings
 * still map to a recognizable extension: a leading `x-` is stripped
 * (`audio/x-m4a` → `m4a`) and the WAV aliases `wave`, `vnd.wave` and
 * `x-pn-wav` become `wav`.
 *
 * @param contentType MIME type of the audio, e.g. `'audio/ogg; codecs=opus'`.
 * @returns `audio.<ext>`, or `'audio.bin'` when no subtype can be derived.
 */
function defaultFilename(contentType?: string): string {
  if (!contentType) return 'audio.bin'

  const subtype = contentType.split('/')[1]?.split(';')[0]?.trim().toLowerCase()
  if (!subtype) return 'audio.bin'

  // Subtypes whose conventional file extension differs from the subtype.
  const aliases: Record<string, string> = {
    wave: 'wav',
    'vnd.wave': 'wav',
    'x-pn-wav': 'wav',
  }

  const ext = aliases[subtype] ?? subtype.replace(/^x-/, '')
  return ext ? `audio.${ext}` : 'audio.bin'
}
|
package/src/types.ts
CHANGED
|
@@ -175,6 +175,52 @@ export interface EmbeddingResponse {
|
|
|
175
175
|
usage: { totalTokens: number }
|
|
176
176
|
}
|
|
177
177
|
|
|
178
|
+
// ── Transcription (Speech-to-Text) ───────────────────────────────────────────
|
|
179
|
+
|
|
180
|
+
/**
 * Provider-agnostic speech-to-text request, passed to
 * `AIProvider.transcribe()`.
 */
export interface TranscribeRequest {
  /** Audio bytes. Most STT endpoints cap at ~25MB; chunk longer recordings. */
  audio: Uint8Array | Blob
  /**
   * MIME type of the audio. Optional, but strongly recommended: the
   * OpenAI provider derives the multipart filename from it, and the
   * Google provider sends it as the `mime_type` of the inline base64
   * part. When omitted, each provider falls back to its own default.
   * Examples: 'audio/m4a', 'audio/mpeg', 'audio/wav', 'audio/ogg',
   * 'audio/webm', 'audio/flac'.
   */
  contentType?: string
  /** Override the provider's default STT model. */
  model?: string
  /**
   * BCP-47 language hint (e.g. 'th', 'en', 'zh'). Whisper accepts ISO-639-1
   * ('th'); Gemini uses BCP-47. Both improve accuracy when set; omit for
   * auto-detection.
   */
  language?: string
  /**
   * Optional priming prompt — gives the model vocabulary or context to
   * bias toward (proper nouns, brand names, menu items, dialect markers).
   * Whisper receives it as the `prompt` form field; for Gemini it is
   * appended to the transcription instruction text.
   */
  prompt?: string
  /**
   * Filename to send in the multipart form (Whisper). Used to derive the
   * audio format on the server. When omitted, the OpenAI provider builds
   * `audio.<ext>` from `contentType`, falling back to 'audio.bin' when no
   * extension can be derived.
   */
  filename?: string
}
|
|
212
|
+
|
|
213
|
+
/** Normalized speech-to-text result returned by `AIProvider.transcribe()`. */
export interface TranscriptionResponse {
  /** Transcribed text. */
  text: string
  /**
   * Language of the transcript, when available. The OpenAI provider
   * passes through the `language` string from the API response; the
   * Google provider echoes the request's `language` hint rather than a
   * detected value.
   */
  language?: string
  /**
   * Audio duration in seconds, when the provider reports one. Set by the
   * OpenAI provider from a numeric `duration` in the response; the Google
   * provider does not set it.
   */
  duration?: number
  /** Original provider response for callers that need provider-specific fields. */
  raw: unknown
}
|
|
223
|
+
|
|
178
224
|
// ── Provider ─────────────────────────────────────────────────────────────────
|
|
179
225
|
|
|
180
226
|
export interface AIProvider {
|
|
@@ -182,6 +228,13 @@ export interface AIProvider {
|
|
|
182
228
|
complete(request: CompletionRequest): Promise<CompletionResponse>
|
|
183
229
|
stream(request: CompletionRequest): AsyncIterable<StreamChunk>
|
|
184
230
|
embed?(input: string | string[], model?: string): Promise<EmbeddingResponse>
|
|
231
|
+
/**
|
|
232
|
+
* Transcribe audio to text. Implemented by providers that expose a
|
|
233
|
+
* speech-to-text endpoint (OpenAI Whisper, Google Gemini's multimodal
|
|
234
|
+
* generateContent). Throws or remains undefined for providers without
|
|
235
|
+
* STT (Anthropic at time of writing).
|
|
236
|
+
*/
|
|
237
|
+
transcribe?(request: TranscribeRequest): Promise<TranscriptionResponse>
|
|
185
238
|
}
|
|
186
239
|
|
|
187
240
|
// ── Hooks ────────────────────────────────────────────────────────────────────
|