npm - @strav/brain - Versions diffs - 0.4.28 → 0.4.29 - Mend

@strav/brain 0.4.28 → 0.4.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/package.json +5 -5
package/src/brain_manager.ts +22 -0
package/src/helpers.ts +39 -0
package/src/providers/google_provider.ts +98 -0
package/src/providers/openai_provider.ts +60 -0
package/src/types.ts +53 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@strav/brain",
-  "version": "0.4.28",
+  "version": "0.4.29",
   "type": "module",
   "description": "AI module for the Strav framework",
   "license": "MIT",
@@ -15,15 +15,15 @@
     "CHANGELOG.md"
   ],
   "peerDependencies": {
-    "@strav/kernel": "0.4.28"
+    "@strav/kernel": "0.4.29"
   },
   "dependencies": {
-    "@strav/mcp": "0.4.28",
-    "@strav/workflow": "0.4.28",
+    "@strav/mcp": "0.4.29",
+    "@strav/workflow": "0.4.29",
     "zod": "^3.25 || ^4.0"
   },
   "devDependencies": {
-    "@strav/http": "0.4.28"
+    "@strav/http": "0.4.29"
   },
   "scripts": {
     "test": "bun test tests/",

package/src/brain_manager.ts CHANGED Viewed

@@ -9,6 +9,8 @@ import type {
   CompletionResponse,
   BeforeHook,
   AfterHook,
+  TranscribeRequest,
+  TranscriptionResponse,
 } from './types.ts'
 import type { MemoryConfig, ThreadStore } from './memory/types.ts'
@@ -125,6 +127,26 @@ export default class BrainManager {
     return response
   }
+  /**
+   * Run a speech-to-text request against the provider registered under
+   * `providerName` (resolved through `BrainManager.provider()`).
+   *
+   * `transcribe()` is an optional provider capability (the original note
+   * cites Anthropic as lacking it at time of writing), so its absence is
+   * reported as a ConfigurationError instead of a raw "not a function"
+   * failure. Before/after hooks are deliberately bypassed: they are
+   * shaped for chat completions and have no audio analogue.
+   *
+   * @param providerName Provider key handed to `BrainManager.provider()`; may be undefined.
+   * @param request Audio payload plus optional hints.
+   * @returns The provider's normalized transcription result.
+   * @throws ConfigurationError when the resolved provider has no `transcribe()`.
+   */
+  static async transcribe(
+    providerName: string | undefined,
+    request: TranscribeRequest
+  ): Promise<TranscriptionResponse> {
+    const target = BrainManager.provider(providerName)
+    if (target.transcribe) {
+      return target.transcribe(request)
+    }
+    // Capability missing: name the provider and point at the alternatives.
+    const reason = `AI provider "${target.name}" does not support transcribe(). `
+    const remedy = `Use the OpenAI or Google providers, or register a custom one via BrainManager.useProvider().`
+    throw new ConfigurationError(reason + remedy)
+  }
   /** Clear all providers, hooks, and stores (for testing). */
   static reset(): void {
     BrainManager._providers.clear()

package/src/helpers.ts CHANGED Viewed

@@ -24,6 +24,8 @@ import type {
   SerializedAgentState,
   SuspendedRun,
   ToolCallResult,
+  TranscribeRequest,
+  TranscriptionResponse,
 } from './types.ts'
 // ── Shared tool executor ─────────────────────────────────────────────────────
@@ -88,6 +90,16 @@ export interface EmbedOptions {
   model?: string
 }
+/**
+ * Arguments accepted by `brain.transcribe()`: the audio payload, optional
+ * recognition hints, and optional provider/model routing.
+ */
+export interface TranscribeOptions {
+  /** Raw audio, either as bytes or as a Blob. */
+  audio: Uint8Array | Blob
+  /** MIME type of `audio`; forwarded to the provider request unchanged. */
+  contentType?: string
+  /** Filename for the upload; forwarded to the provider request unchanged. */
+  filename?: string
+  /** Language hint; forwarded to the provider request unchanged. */
+  language?: string
+  /** Vocabulary/context hint; forwarded to the provider request unchanged. */
+  prompt?: string
+  /** Provider name passed to `BrainManager.transcribe()` as its first argument. */
+  provider?: string
+  /** Model override; forwarded to the provider request unchanged. */
+  model?: string
+}
 // ── brain Helper Object ─────────────────────────────────────────────────────
 export const brain = {
@@ -199,6 +211,33 @@ export const brain = {
     return result.embeddings
   },
+  /**
+   * Transcribe audio (speech-to-text).
+   *
+   * Thin wrapper over `BrainManager.transcribe()`: `options.provider`
+   * selects the backend and every other field is forwarded as the
+   * request. When `provider` is omitted the choice is left to
+   * BrainManager (presumably its configured default; confirm against
+   * `BrainManager.provider()`). The OpenAI provider posts to the Whisper
+   * transcription endpoint; pass `provider: 'google'` to go through
+   * Gemini's multimodal generateContent endpoint instead. Both honour a
+   * `language` hint and a `prompt` that biases vocabulary.
+   *
+   * @example
+   *   // Voice note coming off a LINE inbound webhook
+   *   const { bytes, contentType } = await LineManager.client.downloadContent(messageId)
+   *   const { text } = await brain.transcribe({
+   *     audio: bytes,
+   *     contentType,                  // 'audio/m4a' from LINE
+   *     language: 'th',
+   *     prompt: 'Coffee shop menu items, Bangkok area names',
+   *   })
+   */
+  async transcribe(options: TranscribeOptions): Promise<TranscriptionResponse> {
+    // Split routing (provider) from the payload that becomes the request.
+    const { provider, audio, contentType, model, language, prompt, filename } = options
+    const request = { audio, contentType, model, language, prompt, filename }
+    return BrainManager.transcribe(provider, request)
+  },
   /** Create a fluent agent runner. */
   agent<T extends Agent>(AgentClass: new () => T): AgentRunner<T> {
     return new AgentRunner(AgentClass)

package/src/providers/google_provider.ts CHANGED Viewed

@@ -10,6 +10,8 @@ import type {
   ProviderConfig,
   Message,
   ToolCall,
+  TranscribeRequest,
+  TranscriptionResponse,
   Usage,
 } from '../types.ts'
@@ -178,6 +180,65 @@ export class GoogleProvider implements AIProvider {
     }
   }
+  /**
+   * Speech-to-text via Gemini's multimodal generateContent endpoint.
+   *
+   * Gemini doesn't have a dedicated STT endpoint; instead, audio is
+   * passed as an inline base64 part alongside a text prompt asking for a
+   * transcription. We default to `gemini-2.5-flash` (fast, cheap,
+   * Thai-capable). Override `model` for `gemini-2.5-pro` when accuracy
+   * matters more than latency.
+   *
+   * The MIME type sent with the audio is resolved in this order:
+   * `request.contentType`, then the Blob's own `type` (when `audio` is a
+   * Blob that carries one), then `'audio/mpeg'` as a last resort. Pass
+   * `contentType` explicitly for raw bytes; a wrong label can make the
+   * model misread the audio.
+   *
+   * Inline audio is capped at ~20MB across the whole request. Chunk
+   * longer recordings, or use Gemini's Files API (upload + reference)
+   * which isn't covered here — out of scope for the typical SME
+   * voice-note flow (<=60s clips).
+   *
+   * @param request Audio payload plus optional model/language/prompt hints.
+   * @returns `text` is the concatenated text parts of the first candidate
+   *   (empty string when the response has none); `language` echoes the
+   *   request's hint rather than a detected value; `raw` is the parsed
+   *   JSON response.
+   */
+  async transcribe(request: TranscribeRequest): Promise<TranscriptionResponse> {
+    const model = request.model ?? 'gemini-2.5-flash'
+    const audio = request.audio
+    let bytes: Uint8Array
+    let blobType = ''
+    if (audio instanceof Blob) {
+      bytes = new Uint8Array(await audio.arrayBuffer())
+      blobType = audio.type
+    } else {
+      bytes = audio
+    }
+    // `||` rather than `??`: Blob.type is '' when unknown, and an empty
+    // MIME type is never a usable label for the inline part.
+    const contentType = request.contentType || blobType || 'audio/mpeg'
+    const base64 = encodeBase64(bytes)
+    const instruction = buildTranscriptionInstruction(request)
+    const body = {
+      contents: [
+        {
+          role: 'user',
+          parts: [
+            { text: instruction },
+            { inline_data: { mime_type: contentType, data: base64 } },
+          ],
+        },
+      ],
+      generationConfig: {
+        // Deterministic output for a transcription task.
+        temperature: 0,
+      },
+    }
+    const response = await retryableFetch(
+      'Google',
+      `${this.baseUrl}/models/${model}:generateContent`,
+      { method: 'POST', headers: this.buildHeaders(), body: JSON.stringify(body) },
+      this.retryOptions
+    )
+    const data: any = await response.json()
+    const text = extractTranscript(data)
+    return {
+      text,
+      language: request.language,
+      raw: data,
+    }
+  }
   // ── Private helpers ──────────────────────────────────────────────────────
   private buildHeaders(): Record<string, string> {
@@ -395,4 +456,41 @@ export class GoogleProvider implements AIProvider {
   private generateResponseId(): string {
     return `resp_${Math.random().toString(36).substring(2, 15)}`
   }
+}
+/**
+ * Compose the text instruction that accompanies the audio part.
+ *
+ * Always starts with the base "transcribe only" sentence; a language
+ * sentence and a vocabulary-context sentence are appended when the
+ * request supplies `language` / `prompt`. Sentences are joined with a
+ * single space into one string.
+ */
+function buildTranscriptionInstruction(request: TranscribeRequest): string {
+  const { language, prompt } = request
+  const sentences = [
+    'Transcribe the audio to text. Return only the transcription, without commentary, timestamps, or speaker labels.',
+    language
+      ? `The audio is in ${language}. Preserve the original language in the output.`
+      : '',
+    // The priming hint (proper nouns, menu items, dialect markers) rides
+    // along in the same instruction string rather than a separate field.
+    prompt ? `Context to help with vocabulary: ${prompt}` : '',
+  ]
+  return sentences.filter((sentence) => sentence !== '').join(' ')
+}
+/**
+ * Pull the transcript out of a generateContent response: concatenate the
+ * string `text` of every part of the first candidate and trim the
+ * result. Non-text parts contribute nothing; a response without
+ * candidate parts yields an empty string.
+ */
+function extractTranscript(data: any): string {
+  const parts = data?.candidates?.[0]?.content?.parts
+  if (!parts) return ''
+  const joined: string = parts.reduce(
+    (acc: string, part: any) => (typeof part?.text === 'string' ? acc + part.text : acc),
+    ''
+  )
+  return joined.trim()
+}
+/**
+ * Base64-encode raw bytes.
+ *
+ * Uses `Buffer` when the runtime provides it (Node / Bun). Otherwise it
+ * maps each byte to a one-character binary string and hands that to
+ * `btoa` (the pure-browser path, kept for parity).
+ */
+function encodeBase64(bytes: Uint8Array): string {
+  if (typeof Buffer === 'undefined') {
+    const chars: string[] = []
+    bytes.forEach((byte) => {
+      chars.push(String.fromCharCode(byte))
+    })
+    return btoa(chars.join(''))
+  }
+  return Buffer.from(bytes).toString('base64')
 }

package/src/providers/openai_provider.ts CHANGED Viewed

@@ -10,6 +10,8 @@ import type {
   ProviderConfig,
   Message,
   ToolCall,
+  TranscribeRequest,
+  TranscriptionResponse,
   Usage,
 } from '../types.ts'
@@ -171,6 +173,53 @@ export class OpenAIProvider implements AIProvider {
     }
   }
+  /**
+   * Speech-to-text via the OpenAI transcription API
+   * (/v1/audio/transcriptions).
+   *
+   * Defaults to `whisper-1` — the long-standing, broadly supported model.
+   * Override with `gpt-4o-transcribe` or `gpt-4o-mini-transcribe` for the
+   * newer architecture (better noise/accent robustness, similar pricing).
+   *
+   * Response format: `verbose_json` is requested so `language` and
+   * `duration` can be surfaced without a second round-trip, except for
+   * the `gpt-4o*` transcription models, which only accept `json`. For
+   * those models `language` and `duration` come back undefined.
+   *
+   * Upload filename: `request.filename` wins; otherwise it is derived
+   * from `request.contentType`, or from the Blob's own `type` when the
+   * audio is a typed Blob and no content type was passed.
+   *
+   * @param request Audio payload plus optional model/language/prompt/filename.
+   * @returns Normalized transcript; `raw` is the parsed JSON response.
+   */
+  async transcribe(request: TranscribeRequest): Promise<TranscriptionResponse> {
+    const audio = request.audio
+    // Blob.type is '' when unknown; treat that the same as "not declared".
+    const blobType = audio instanceof Blob && audio.type ? audio.type : undefined
+    const declaredType = request.contentType ?? blobType
+    const filename = request.filename ?? defaultFilename(declaredType)
+    const blob =
+      audio instanceof Blob
+        ? audio
+        : new Blob([audio], { type: declaredType ?? 'application/octet-stream' })
+    const model = request.model ?? 'whisper-1'
+    // gpt-4o-transcribe / gpt-4o-mini-transcribe reject `verbose_json`
+    // with a 400; plain `json` is the richest format they support.
+    const responseFormat = model.startsWith('gpt-4o') ? 'json' : 'verbose_json'
+    const form = new FormData()
+    form.append('file', blob, filename)
+    form.append('model', model)
+    form.append('response_format', responseFormat)
+    if (request.language) form.append('language', request.language)
+    if (request.prompt) form.append('prompt', request.prompt)
+    const response = await retryableFetch(
+      'OpenAI',
+      `${this.baseUrl}/v1/audio/transcriptions`,
+      {
+        method: 'POST',
+        // Don't set Content-Type — the runtime sets it with the
+        // multipart boundary derived from the FormData body.
+        headers: { Authorization: `Bearer ${this.apiKey}` },
+        body: form,
+      },
+      this.retryOptions
+    )
+    const data: any = await response.json()
+    return {
+      text: String(data.text ?? ''),
+      language: typeof data.language === 'string' ? data.language : undefined,
+      duration: typeof data.duration === 'number' ? data.duration : undefined,
+      raw: data,
+    }
+  }
   // ── Private helpers ──────────────────────────────────────────────────────
   private isReasoningModel(model: string): boolean {
@@ -507,3 +556,14 @@ export class OpenAIProvider implements AIProvider {
     return schema
   }
 }
+/**
+ * Choose a multipart filename for Whisper based on the content type.
+ * Whisper sniffs the extension when no MIME is supplied; sending a name
+ * that matches the actual format avoids "unsupported file" 400s.
+ *
+ * The MIME subtype is normalized before it becomes the extension:
+ * parameters (`; codecs=...`) are dropped, the value is lower-cased, a
+ * leading `x-` is stripped, and common aliases are mapped to the
+ * extension the format is usually known by (`wave` → `wav`,
+ * `octet-stream` → `bin`, ...).
+ *
+ * @param contentType MIME type of the audio, if known.
+ * @returns `audio.<ext>`, or `audio.bin` when no usable subtype exists.
+ */
+const FILENAME_EXTENSION_ALIASES = new Map<string, string>([
+  ['wave', 'wav'],
+  ['vnd.wave', 'wav'],
+  ['pn-wav', 'wav'],
+  ['mp4a-latm', 'm4a'],
+  ['mpeg3', 'mp3'],
+  ['octet-stream', 'bin'],
+])
+function defaultFilename(contentType?: string): string {
+  if (!contentType) return 'audio.bin'
+  const subtype = contentType.split('/')[1]?.split(';')[0]?.trim().toLowerCase()
+  if (!subtype) return 'audio.bin'
+  // 'audio/x-m4a', 'audio/x-wav' etc. are the same formats under a legacy
+  // prefix; the prefix must not leak into the extension.
+  const bare = subtype.startsWith('x-') ? subtype.slice(2) : subtype
+  if (!bare) return 'audio.bin'
+  return `audio.${FILENAME_EXTENSION_ALIASES.get(bare) ?? bare}`
+}

package/src/types.ts CHANGED Viewed

@@ -175,6 +175,52 @@ export interface EmbeddingResponse {
   usage: { totalTokens: number }
 }
+// ── Transcription (Speech-to-Text) ───────────────────────────────────────────
+export interface TranscribeRequest {
+  /** Audio bytes. Most STT endpoints cap at ~25MB; chunk longer recordings. */
+  audio: Uint8Array | Blob
+  /**
+   * MIME type of the audio. Optional, but strongly recommended: the
+   * OpenAI provider derives the multipart filename extension from it, and
+   * the Google provider sends it as the MIME type of the inline base64
+   * part (falling back to a guessed type when it is omitted).
+   * Examples: 'audio/m4a', 'audio/mpeg', 'audio/wav', 'audio/ogg',
+   * 'audio/webm', 'audio/flac'.
+   */
+  contentType?: string
+  /** Override the provider's default STT model. */
+  model?: string
+  /**
+   * Language hint (e.g. 'th', 'en', 'zh'). The OpenAI provider sends it
+   * as the `language` form field, where Whisper accepts ISO-639-1
+   * ('th'); the Google provider interpolates it into the instruction
+   * text. Both improve accuracy when set; omit for auto-detection.
+   */
+  language?: string
+  /**
+   * Optional priming prompt — gives the model vocabulary or context to
+   * bias toward (proper nouns, brand names, menu items, dialect markers).
+   * The OpenAI provider sends it as the `prompt` form field; the Google
+   * provider appends it to the text instruction sent alongside the audio.
+   */
+  prompt?: string
+  /**
+   * Filename to send in the multipart form (Whisper). Used to derive the
+   * audio format on the server. When omitted, the OpenAI provider builds
+   * `audio.<subtype>` from `contentType`, or uses 'audio.bin' when no
+   * content type is available.
+   */
+  filename?: string
+}
+export interface TranscriptionResponse {
+  /** Transcribed text (empty string when the provider returned none). */
+  text: string
+  /**
+   * Language of the transcript, when known. The OpenAI provider reports
+   * the `language` string from the API response; the Google provider
+   * echoes back the request's `language` hint rather than a detected
+   * value.
+   */
+  language?: string
+  /**
+   * Audio duration in seconds, when the provider reports one. Only the
+   * OpenAI provider populates this, from the numeric `duration` field of
+   * the API response; the Google provider leaves it unset.
+   */
+  duration?: number
+  /** Original provider response for callers that need provider-specific fields. */
+  raw: unknown
+}
 // ── Provider ─────────────────────────────────────────────────────────────────
 export interface AIProvider {
@@ -182,6 +228,13 @@ export interface AIProvider {
   complete(request: CompletionRequest): Promise<CompletionResponse>
   stream(request: CompletionRequest): AsyncIterable<StreamChunk>
   embed?(input: string | string[], model?: string): Promise<EmbeddingResponse>
+  /**
+   * Transcribe audio to text. Optional capability: implemented by
+   * providers that expose a speech-to-text endpoint (OpenAI Whisper,
+   * Google Gemini's multimodal generateContent) and left undefined by
+   * providers without STT (Anthropic at time of writing).
+   * `BrainManager.transcribe()` checks for its presence and throws a
+   * ConfigurationError when it is missing.
+   */
+  transcribe?(request: TranscribeRequest): Promise<TranscriptionResponse>
 }
 // ── Hooks ────────────────────────────────────────────────────────────────────