voice-router-dev 0.8.0 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +169 -0
- package/README.md +21 -2
- package/dist/constants.d.mts +577 -7
- package/dist/constants.d.ts +577 -7
- package/dist/constants.js +493 -1
- package/dist/constants.mjs +482 -1
- package/dist/{field-configs-CDeDcDz_.d.mts → field-configs-DN2_WrYr.d.mts} +568 -568
- package/dist/{field-configs-CDeDcDz_.d.ts → field-configs-DN2_WrYr.d.ts} +568 -568
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/index.d.mts +2784 -1246
- package/dist/index.d.ts +2784 -1246
- package/dist/index.js +1466 -91
- package/dist/index.mjs +1458 -91
- package/dist/{provider-metadata-BHbouRC9.d.mts → provider-metadata-BnkedpXm.d.mts} +34 -4
- package/dist/{provider-metadata-Dsk2PVud.d.ts → provider-metadata-DbsSGAO7.d.ts} +34 -4
- package/dist/provider-metadata.d.mts +2 -2
- package/dist/provider-metadata.d.ts +2 -2
- package/dist/provider-metadata.js +349 -6
- package/dist/provider-metadata.mjs +345 -6
- package/dist/{transcriptWebhookNotification-Cz9RsK5D.d.mts → speechToTextChunkResponseModel-3IUnJXKx.d.mts} +975 -9
- package/dist/{transcriptWebhookNotification-D1iE2_a4.d.ts → speechToTextChunkResponseModel-DExUFZT3.d.ts} +975 -9
- package/dist/webhooks.d.mts +103 -5
- package/dist/webhooks.d.ts +103 -5
- package/dist/webhooks.js +342 -39
- package/dist/webhooks.mjs +340 -39
- package/package.json +14 -6
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,175 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.8.2] - 2026-03-15
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
- Pin Deepgram streaming SDK source to v4.11.3 tag (v5.0.0 Fern rewrite removed `TranscriptionSchema.ts`)
|
|
13
|
+
- Fix ElevenLabs FormData append for object/array fields (`additional_formats`, `webhook_metadata`, `entity_detection`)
|
|
14
|
+
- Add `elevenlabs` to `ProviderWebhookPayloadMap` for typed webhook payloads
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
|
|
18
|
+
- Documentation generation for ElevenLabs and Soniox providers (typedoc configs + scripts)
|
|
19
|
+
- Speechmatics and ElevenLabs webhook handlers added to webhook documentation
|
|
20
|
+
|
|
21
|
+
## [0.8.1] - 2026-03-13
|
|
22
|
+
|
|
23
|
+
#### ElevenLabs ScribeV2 — 9th Provider
|
|
24
|
+
|
|
25
|
+
New adapter for [ElevenLabs](https://elevenlabs.io) speech-to-text with batch and real-time streaming support:
|
|
26
|
+
|
|
27
|
+
```typescript
|
|
28
|
+
import { createElevenLabsAdapter, ElevenLabsModel, ElevenLabsRegion } from 'voice-router-dev'
|
|
29
|
+
|
|
30
|
+
const adapter = createElevenLabsAdapter({
|
|
31
|
+
apiKey: process.env.ELEVENLABS_API_KEY,
|
|
32
|
+
model: ElevenLabsModel.scribe_v2,
|
|
33
|
+
region: ElevenLabsRegion.eu // global, us, eu, in
|
|
34
|
+
})
|
|
35
|
+
|
|
36
|
+
// Batch transcription (synchronous)
|
|
37
|
+
const result = await adapter.transcribe({
|
|
38
|
+
type: 'url',
|
|
39
|
+
url: 'https://example.com/audio.mp3'
|
|
40
|
+
}, {
|
|
41
|
+
language: 'en',
|
|
42
|
+
diarization: true,
|
|
43
|
+
customVocabulary: ['ElevenLabs', 'ScribeV2'],
|
|
44
|
+
entityDetection: true,
|
|
45
|
+
elevenlabs: {
|
|
46
|
+
tag_audio_events: true,
|
|
47
|
+
no_verbatim: true,
|
|
48
|
+
temperature: 0.1
|
|
49
|
+
}
|
|
50
|
+
})
|
|
51
|
+
|
|
52
|
+
// Real-time streaming (WebSocket)
|
|
53
|
+
const session = await adapter.transcribeStream({
|
|
54
|
+
elevenlabsStreaming: {
|
|
55
|
+
model: 'scribe_v2_realtime',
|
|
56
|
+
audioFormat: 'pcm_16000',
|
|
57
|
+
commitStrategy: 'vad',
|
|
58
|
+
includeTimestamps: true
|
|
59
|
+
}
|
|
60
|
+
}, {
|
|
61
|
+
onTranscript: (event) => console.log(event.text),
|
|
62
|
+
onUtterance: (utterance) => console.log(utterance.text),
|
|
63
|
+
onError: (error) => console.error(error)
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
// Retrieve transcript by ID
|
|
67
|
+
const transcript = await adapter.getTranscript('abc123')
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Features:**
|
|
71
|
+
- Batch transcription via URL or file upload (synchronous — result returned directly)
|
|
72
|
+
- Real-time WebSocket streaming with base64-encoded audio
|
|
73
|
+
- Speaker diarization with configurable threshold and speaker count
|
|
74
|
+
- Word-level timestamps with logprob-based confidence
|
|
75
|
+
- Entity detection (PII, PHI, PCI) with character positions
|
|
76
|
+
- Audio event tagging (laughter, footsteps, music)
|
|
77
|
+
- Custom vocabulary (keyterms) boosting (up to 100 terms)
|
|
78
|
+
- 99 supported languages (ISO 639-1/3)
|
|
79
|
+
- 4 regional endpoints (global, US, EU, India)
|
|
80
|
+
- No-verbatim mode (removes filler words and false starts)
|
|
81
|
+
|
|
82
|
+
**Spec pipeline:**
|
|
83
|
+
- `fix-elevenlabs-spec.js` filters monolithic API spec (210 paths) to 2 STT endpoints, 18 schemas
|
|
84
|
+
- `generate-elevenlabs-languages.js` generates 99 language codes
|
|
85
|
+
- `generate-elevenlabs-models.js` extracts batch models from spec + adds realtime model
|
|
86
|
+
|
|
87
|
+
**New exports:**
|
|
88
|
+
|
|
89
|
+
| Export | Description |
|
|
90
|
+
|--------|-------------|
|
|
91
|
+
| `ElevenLabsModel` | Batch models: `scribe_v1`, `scribe_v2` |
|
|
92
|
+
| `ElevenLabsRealtimeModel` | Realtime: `scribe_v2_realtime` |
|
|
93
|
+
| `ElevenLabsLanguage` | 99 language codes with labels |
|
|
94
|
+
| `ElevenLabsRegion` | `global`, `us`, `eu`, `in` |
|
|
95
|
+
| `ElevenLabsAudioFormat` | PCM sample rates + ulaw |
|
|
96
|
+
| `ElevenLabsTypes` | Generated schema types |
|
|
97
|
+
| `ElevenLabsZodSchemas` | Zod schemas for runtime validation |
|
|
98
|
+
|
|
99
|
+
**Webhook handler:** Auto-detects ElevenLabs payloads via `words` + `language_code` + `logprob` fields. Extracts words, speakers, utterances, entities, and language detection results.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## [0.8.2] - 2026-02-28 <!-- FIXME(review): duplicate version heading — [0.8.2] already appears above dated 2026-03-15, and this date precedes [0.8.1] (2026-03-13). Per the release diff only 0.8.1 and 0.8.2 were published, so this section was likely an "Unreleased" block that shipped in 0.8.1; confirm and relabel (or merge into [0.8.1]) so each version has exactly one heading in descending order. -->
|
|
104
|
+
|
|
105
|
+
### Fixed
|
|
106
|
+
|
|
107
|
+
#### AssemblyAI Webhook Handler — Parse Full Transcript Body
|
|
108
|
+
|
|
109
|
+
Previously, the AssemblyAI webhook handler only accepted the lightweight notification format (`{ transcript_id, status }`) and returned just the transcript ID — no text, words, or utterances.
|
|
110
|
+
|
|
111
|
+
Now it also accepts the full transcript response body (`{ id, status, audio_url, text, words, utterances, ... }`) and extracts all available data:
|
|
112
|
+
|
|
113
|
+
- Text, confidence, duration, language
|
|
114
|
+
- Word-level timestamps (converted from ms to seconds)
|
|
115
|
+
- Speaker-segmented utterances with per-word timing
|
|
116
|
+
- Summary, speaker list, and provider metadata
|
|
117
|
+
|
|
118
|
+
The handler auto-detects which format was received. Existing notification-format integrations continue to work unchanged.
|
|
119
|
+
|
|
120
|
+
#### Speechmatics Webhook Handler — Word and Utterance Extraction
|
|
121
|
+
|
|
122
|
+
The Speechmatics webhook handler now extracts word-level data and builds speaker-grouped utterances from the transcript body:
|
|
123
|
+
|
|
124
|
+
- Words with `start`/`end` timestamps, `confidence`, and `speaker` ID
|
|
125
|
+
- Utterances grouped by consecutive speaker (same algorithm as the batch adapter)
|
|
126
|
+
|
|
127
|
+
Previously, only the full text and speaker set were extracted.
|
|
128
|
+
|
|
129
|
+
#### Speechmatics Adapter — Utterances Now Include Words
|
|
130
|
+
|
|
131
|
+
The Speechmatics batch adapter's utterance construction now includes word-level data in each utterance, matching the `Utterance.words: Word[]` contract.
|
|
132
|
+
|
|
133
|
+
#### Speechmatics Text Now Preserves Punctuation
|
|
134
|
+
|
|
135
|
+
Both the Speechmatics adapter and webhook handler were stripping all punctuation from transcript text by filtering to `type === "word"` and joining with spaces, producing `"Hello world"` instead of `"Hello, world."`.
|
|
136
|
+
|
|
137
|
+
Now uses a shared `buildTextFromSpeechmaticsResults()` helper that reads the `attaches_to` field on punctuation results:
|
|
138
|
+
- `previous` (`.`, `,`, `?`, `!`) — appended directly to previous word
|
|
139
|
+
- `next` (opening quotes) — prepended to next word
|
|
140
|
+
- `both` (hyphens) — no space on either side
|
|
141
|
+
- `none` or missing — space-separated token
|
|
142
|
+
|
|
143
|
+
#### Deepgram Webhook — Speaker Deduplication
|
|
144
|
+
|
|
145
|
+
The Deepgram webhook handler mapped every utterance 1:1 to a "speaker" entry, so 10 utterances from 2 speakers produced 10 entries with duplicate IDs. Now correctly deduplicates via `Set<string>` and returns proper `{ id, label }` objects matching other providers.
|
|
146
|
+
|
|
147
|
+
Also added `speaker` field to utterance word mappings — the generated Deepgram type has `speaker?: number` on utterance words but it wasn't being mapped through.
|
|
148
|
+
|
|
149
|
+
#### Soniox Adapter — Utterances No Longer Include Interim Tokens
|
|
150
|
+
|
|
151
|
+
`normalizeResponse` was building utterances from all tokens (including non-final/interim), while `words` and `text` correctly filtered by `is_final`. This could cause utterance text to contain partial words not present in the words array. Now filters to `is_final` tokens before building utterances.
|
|
152
|
+
|
|
153
|
+
#### Speechmatics Adapter — Utterance End Time Fix
|
|
154
|
+
|
|
155
|
+
Utterance end times used `prevResult?.end_time || result.start_time` (falling back to the *next* word's start time), which could produce incorrect values. Now uses `currentWords[last].end` — the actual end time of the last word in the utterance, matching the Speechmatics webhook and Soniox adapter behavior.
|
|
156
|
+
|
|
157
|
+
#### Deepgram Webhook — Language Field Was Returning Model ID
|
|
158
|
+
|
|
159
|
+
`data.language` was set to `response.metadata.models[0]`, which is a model UUID — not a language code. Now correctly reads `channel.detected_language` from the first channel (set when `detect_language=true`).
|
|
160
|
+
|
|
161
|
+
#### Deepgram Webhook — Utterance `id` and `channel` Now Extracted
|
|
162
|
+
|
|
163
|
+
The Deepgram schema includes `id` and `channel` on utterances but they weren't mapped through. Now populated in the unified response.
|
|
164
|
+
|
|
165
|
+
#### Consistent Speaker `label` Across All Webhooks
|
|
166
|
+
|
|
167
|
+
AssemblyAI and Gladia webhook handlers returned Speaker objects without a `label` field (`{ id }`), while Deepgram and Speechmatics included `{ id, label: "Speaker ..." }`. All handlers now consistently include the `label` field.
|
|
168
|
+
|
|
169
|
+
### Changed
|
|
170
|
+
|
|
171
|
+
#### `Utterance.words` Is Now Non-Optional
|
|
172
|
+
|
|
173
|
+
`Utterance.words` changed from `words?: Word[]` to `words: Word[]`. Providers that don't supply word-level data now return an empty array instead of `undefined`. This removes the need for null checks when iterating utterance words.
|
|
174
|
+
|
|
175
|
+
All adapters and webhook handlers have been updated accordingly (Deepgram, Gladia, OpenAI, Speechmatics, AssemblyAI).
|
|
176
|
+
|
|
8
177
|
## [0.8.0] - 2026-01-27
|
|
9
178
|
|
|
10
179
|
### Fixed
|
package/README.md
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
## Why Voice Router?
|
|
10
10
|
|
|
11
|
-
Switch between speech-to-text providers **without changing your code**. One API for Gladia, AssemblyAI, Deepgram, Azure, OpenAI Whisper, Speechmatics, and
|
|
11
|
+
Switch between speech-to-text providers **without changing your code**. One API for Gladia, AssemblyAI, Deepgram, Azure, OpenAI Whisper, Speechmatics, Soniox, and ElevenLabs.
|
|
12
12
|
|
|
13
13
|
```typescript
|
|
14
14
|
import { VoiceRouter } from 'voice-router-dev';
|
|
@@ -31,7 +31,7 @@ const result = await router.transcribe(audio, {
|
|
|
31
31
|
- **Provider-Agnostic** - Switch providers with one line
|
|
32
32
|
- **Unified API** - Same interface for all providers
|
|
33
33
|
- **Webhook Normalization** - Auto-detect and parse webhooks
|
|
34
|
-
- **Real-time Streaming** - WebSocket support (Gladia, AssemblyAI, Deepgram, Soniox, OpenAI Realtime)
|
|
34
|
+
- **Real-time Streaming** - WebSocket support (Gladia, AssemblyAI, Deepgram, Soniox, OpenAI Realtime, ElevenLabs)
|
|
35
35
|
- **Advanced Features** - Diarization, sentiment, summarization, chapters, entities
|
|
36
36
|
- **Type-Safe** - Full TypeScript support with OpenAPI-generated types
|
|
37
37
|
- **Typed Extended Data** - Access provider-specific features with full autocomplete
|
|
@@ -49,6 +49,7 @@ const result = await router.transcribe(audio, {
|
|
|
49
49
|
| **OpenAI** | Sync | Realtime | No | gpt-4o, diarization, Realtime API |
|
|
50
50
|
| **Speechmatics** | Async | No | Query params | High accuracy, summarization |
|
|
51
51
|
| **Soniox** | Yes | WebSocket | No | 60+ languages, translation, regions |
|
|
52
|
+
| **ElevenLabs** | Sync | WebSocket | Yes | Entity detection, audio events, 99 languages |
|
|
52
53
|
|
|
53
54
|
## Installation
|
|
54
55
|
|
|
@@ -376,6 +377,7 @@ Provider-specific implementations:
|
|
|
376
377
|
- `OpenAIWhisperAdapter` - OpenAI Whisper + Realtime API
|
|
377
378
|
- `SpeechmaticsAdapter` - Speechmatics transcription
|
|
378
379
|
- `SonioxAdapter` - Soniox transcription (batch + streaming)
|
|
380
|
+
- `ElevenLabsAdapter` - ElevenLabs transcription (batch + streaming)
|
|
379
381
|
|
|
380
382
|
## TypeScript Support
|
|
381
383
|
|
|
@@ -787,6 +789,23 @@ router.registerAdapter(new SonioxAdapter());
|
|
|
787
789
|
|
|
788
790
|
Get your API key: https://soniox.com
|
|
789
791
|
|
|
792
|
+
### ElevenLabs
|
|
793
|
+
```typescript
|
|
794
|
+
import { VoiceRouter, ElevenLabsAdapter, ElevenLabsRegion } from 'voice-router-dev';
|
|
795
|
+
|
|
796
|
+
const router = new VoiceRouter({
|
|
797
|
+
providers: {
|
|
798
|
+
elevenlabs: {
|
|
799
|
+
apiKey: 'YOUR_KEY',
|
|
800
|
+
region: ElevenLabsRegion.global // or 'us', 'eu', 'in'
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
});
|
|
804
|
+
router.registerAdapter(new ElevenLabsAdapter());
|
|
805
|
+
```
|
|
806
|
+
|
|
807
|
+
Get your API key: https://elevenlabs.io
|
|
808
|
+
|
|
790
809
|
## Contributing
|
|
791
810
|
|
|
792
811
|
Contributions welcome! Please read our [Contributing Guide](CONTRIBUTING.md).
|