getpatter 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-CL2U3YET.mjs +1429 -0
- package/dist/{chunk-TEW3NAZJ.mjs → chunk-LE63CSOB.mjs} +371 -1486
- package/dist/{chunk-RV7APPYE.mjs → chunk-R2T4JABZ.mjs} +13 -0
- package/dist/cli.js +48 -23
- package/dist/dashboard/ui.html +8 -8
- package/dist/index.d.mts +452 -186
- package/dist/index.d.ts +452 -186
- package/dist/index.js +1485 -979
- package/dist/index.mjs +973 -790
- package/dist/openai-realtime-2-CNFARP25.mjs +8 -0
- package/dist/{silero-vad-NSEXI4XS.mjs → silero-vad-LNDFGIY7.mjs} +1 -1
- package/dist/{test-mode-WEKKNBLD.mjs → test-mode-RS57BDM6.mjs} +2 -1
- package/package.json +1 -1
- package/src/dashboard/ui.html +8 -8
package/dist/index.d.ts
CHANGED
|
@@ -61,7 +61,11 @@ declare class Carrier {
|
|
|
61
61
|
interface RealtimeOptions {
|
|
62
62
|
/** API key. Falls back to OPENAI_API_KEY env var when omitted. */
|
|
63
63
|
apiKey?: string;
|
|
64
|
-
/**
|
|
64
|
+
/**
|
|
65
|
+
* Realtime model. Defaults to ``gpt-realtime-mini`` (bumped from the
|
|
66
|
+
* deprecated ``gpt-4o-mini-realtime-preview`` on 2026-05-25 for
|
|
67
|
+
* parity with the Python SDK and the GA Realtime API surface).
|
|
68
|
+
*/
|
|
65
69
|
model?: string;
|
|
66
70
|
/** Voice preset. Defaults to alloy. */
|
|
67
71
|
voice?: string;
|
|
@@ -1258,15 +1262,16 @@ interface AgentOptions {
|
|
|
1258
1262
|
*/
|
|
1259
1263
|
prewarm?: boolean;
|
|
1260
1264
|
/**
|
|
1261
|
-
* When ``true`` (default
|
|
1262
|
-
* ``firstMessage`` to TTS audio bytes during the ringing
|
|
1263
|
-
* streams the cached buffer immediately when the carrier
|
|
1264
|
-
* ``start``. Eliminates the 200-700 ms TTS first-byte latency
|
|
1265
|
-
*
|
|
1266
|
-
*
|
|
1267
|
-
*
|
|
1268
|
-
*
|
|
1269
|
-
*
|
|
1265
|
+
* When ``true`` (default since 0.6.2 in pipeline mode), ``Patter.call``
|
|
1266
|
+
* pre-renders ``firstMessage`` to TTS audio bytes during the ringing
|
|
1267
|
+
* window and streams the cached buffer immediately when the carrier
|
|
1268
|
+
* emits ``start``. Eliminates the 200-700 ms TTS first-byte latency
|
|
1269
|
+
* on the greeting that dominated first-turn ``p95`` on every pipeline
|
|
1270
|
+
* acceptance run. The trade-off is paying the TTS bill even if the
|
|
1271
|
+
* call is never answered (silently logged at warn level when the call
|
|
1272
|
+
* fails) — typically $0.001-$0.005 per ringing call depending on TTS
|
|
1273
|
+
* provider. Opt out by passing ``prewarmFirstMessage: false`` (e.g.
|
|
1274
|
+
* for very high-volume outbound where un-answered TTS spend matters).
|
|
1270
1275
|
*
|
|
1271
1276
|
* **Pipeline mode only.** Realtime / ConvAI provider modes never
|
|
1272
1277
|
* consume the prewarm cache (the StreamHandler for those modes runs
|
|
@@ -1563,12 +1568,35 @@ declare class ElevenLabsTTS {
|
|
|
1563
1568
|
private readonly apiKey;
|
|
1564
1569
|
private readonly voiceId;
|
|
1565
1570
|
private readonly modelId;
|
|
1566
|
-
private
|
|
1571
|
+
private _outputFormat;
|
|
1572
|
+
private readonly _outputFormatExplicit;
|
|
1567
1573
|
private readonly voiceSettings;
|
|
1568
1574
|
private readonly languageCode;
|
|
1569
1575
|
private readonly chunkSize;
|
|
1576
|
+
/**
|
|
1577
|
+
* Public view of the (possibly auto-flipped) wire format. Read by the
|
|
1578
|
+
* stream-handler to decide whether to skip the client-side resample +
|
|
1579
|
+
* mulaw encode when the bytes are already in the carrier's wire codec.
|
|
1580
|
+
*/
|
|
1581
|
+
get outputFormat(): ElevenLabsOutputFormat;
|
|
1570
1582
|
constructor(apiKey: string, voiceId?: string, modelId?: string, outputFormat?: ElevenLabsOutputFormat | string);
|
|
1571
1583
|
constructor(apiKey: string, options: ElevenLabsTTSOptions$1);
|
|
1584
|
+
/**
|
|
1585
|
+
* Hook called by ``StreamHandler.initPipeline`` to advise the carrier
|
|
1586
|
+
* wire format. When the user did NOT pass an explicit ``outputFormat``,
|
|
1587
|
+
* auto-flip to the carrier's native codec so the audio bytes ElevenLabs
|
|
1588
|
+
* returns are already in Twilio/Telnyx wire format — eliminating the
|
|
1589
|
+
* client-side 16 kHz → 8 kHz resample and PCM → μ-law encode. The
|
|
1590
|
+
* resample/encode chain was a source of audible artifacts on the
|
|
1591
|
+
* prewarmed firstMessage (see 0.6.2 acceptance notes — burst delivery
|
|
1592
|
+
* of resampled audio crackled on the carrier-side jitter buffer).
|
|
1593
|
+
*
|
|
1594
|
+
* No-op when the caller passed an explicit ``outputFormat`` (incl. via
|
|
1595
|
+
* the ``forTwilio`` / ``forTelnyx`` factories) — user wins.
|
|
1596
|
+
*
|
|
1597
|
+
* Parity with {@link ElevenLabsWebSocketTTS.setTelephonyCarrier}.
|
|
1598
|
+
*/
|
|
1599
|
+
setTelephonyCarrier(carrier: string): void;
|
|
1572
1600
|
/**
|
|
1573
1601
|
* Construct an instance pre-configured for Twilio Media Streams.
|
|
1574
1602
|
*
|
|
@@ -1695,6 +1723,20 @@ declare class ElevenLabsWebSocketTTS implements TTSAdapter {
|
|
|
1695
1723
|
* changes.
|
|
1696
1724
|
*/
|
|
1697
1725
|
private adoptedConnection;
|
|
1726
|
+
/**
|
|
1727
|
+
* Active WS for the in-flight ``synthesizeStream`` call, if any. Set
|
|
1728
|
+
* when a stream starts, cleared in its ``finally`` block. The
|
|
1729
|
+
* stream-handler calls ``cancelActiveStream()`` from ``cancelSpeaking``
|
|
1730
|
+
* to unblock the generator's inner ``await Promise<frame>`` — without
|
|
1731
|
+
* it, a barge-in on the firstMessage live path leaves the for-await
|
|
1732
|
+
* stuck waiting for the next frame; ElevenLabs never sends
|
|
1733
|
+
* ``isFinal=true`` after the consumer breaks, the 30 s frame timeout
|
|
1734
|
+
* fires post-call, and meanwhile ``initPipeline`` never returns so
|
|
1735
|
+
* the STT ``onTranscript`` callback never registers and subsequent
|
|
1736
|
+
* user turns are silently dropped (root cause of the 2026-05-20
|
|
1737
|
+
* "first message OK, then no response" symptom).
|
|
1738
|
+
*/
|
|
1739
|
+
private activeStreamWs;
|
|
1698
1740
|
/**
|
|
1699
1741
|
* The wire format requested over the ElevenLabs WS. Initially set from
|
|
1700
1742
|
* the constructor; ``setTelephonyCarrier`` may auto-flip it to the
|
|
@@ -1719,6 +1761,22 @@ declare class ElevenLabsWebSocketTTS implements TTSAdapter {
|
|
|
1719
1761
|
* the user's choice always wins.
|
|
1720
1762
|
*/
|
|
1721
1763
|
setTelephonyCarrier(carrier: string): void;
|
|
1764
|
+
/**
|
|
1765
|
+
* Force-close the WebSocket of any in-flight ``synthesizeStream`` call.
|
|
1766
|
+
* Called by the stream-handler from ``cancelSpeaking`` (barge-in) so
|
|
1767
|
+
* the generator's inner ``await Promise<frame>`` loop unblocks cleanly
|
|
1768
|
+
* via the ``onClose`` handler — instead of waiting up to 30 s for the
|
|
1769
|
+
* ``FRAME_TIMEOUT_MS`` watchdog to fire. No-op when no stream is in
|
|
1770
|
+
* flight or when the WS is already closing.
|
|
1771
|
+
*
|
|
1772
|
+
* Without this, a barge-in during the firstMessage live path left the
|
|
1773
|
+
* for-await stuck (ElevenLabs never sends ``isFinal=true`` after the
|
|
1774
|
+
* consumer breaks), ``initPipeline`` never returned, the STT
|
|
1775
|
+
* ``onTranscript`` callback never registered, and the entire remainder
|
|
1776
|
+
* of the call was silent for the user. Surfaced during the 2026-05-20
|
|
1777
|
+
* acceptance run.
|
|
1778
|
+
*/
|
|
1779
|
+
cancelActiveStream(): void;
|
|
1722
1780
|
/** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
|
|
1723
1781
|
static forTwilio(opts: Omit<ElevenLabsWebSocketTTSOptions, 'outputFormat'>): ElevenLabsWebSocketTTS;
|
|
1724
1782
|
/** Pre-configured for Telnyx (`pcm_16000`). */
|
|
@@ -2777,6 +2835,27 @@ declare function geminiLive(opts: {
|
|
|
2777
2835
|
voice?: string;
|
|
2778
2836
|
}): RealtimeConfig;
|
|
2779
2837
|
|
|
2838
|
+
/**
|
|
2839
|
+
* Default provider pricing and merge utilities.
|
|
2840
|
+
*
|
|
2841
|
+
* Pricing reflects public provider rates as of 2026. Each provider entry
|
|
2842
|
+
* carries provider-level defaults (the model Patter ships with by default)
|
|
2843
|
+
* plus an optional ``models`` map keyed by model identifier with per-model
|
|
2844
|
+
* overrides. Cost-calc functions take an optional ``model`` arg and
|
|
2845
|
+
* auto-resolve the rate via {@link resolveProviderRates} (longest-prefix
|
|
2846
|
+
* match for versioned model IDs). When the agent's adapter exposes
|
|
2847
|
+
* ``model`` and the metrics layer threads it through, the dashboard bills
|
|
2848
|
+
* with per-model accuracy out of the box — no manual override needed.
|
|
2849
|
+
*
|
|
2850
|
+
* User overrides via ``new Patter({ pricing: {...} })`` keep working as
|
|
2851
|
+
* before. To register a new model rate without touching the SDK source:
|
|
2852
|
+
*
|
|
2853
|
+
* new Patter({ pricing: { elevenlabs: { models: { my_custom: { price: 0.075 } } } } })
|
|
2854
|
+
*/
|
|
2855
|
+
/** Pricing table version identifier, updated in lockstep with the Python SDK. */
|
|
2856
|
+
declare const PRICING_VERSION = "2026.3";
|
|
2857
|
+
/** ISO date the pricing table was last refreshed against public provider rates. */
|
|
2858
|
+
declare const PRICING_LAST_UPDATED = "2026-05-08";
|
|
2780
2859
|
/**
|
|
2781
2860
|
* Billing units used by ``DEFAULT_PRICING`` entries. String values keep the
|
|
2782
2861
|
* pricing table JSON-serialisable and backwards-compatible with consumers
|
|
@@ -3067,6 +3146,21 @@ declare class CallMetricsAccumulator {
|
|
|
3067
3146
|
private _bargeinStoppedAt;
|
|
3068
3147
|
private _turnUserText;
|
|
3069
3148
|
private _turnSttAudioSeconds;
|
|
3149
|
+
/**
|
|
3150
|
+
* Guard against the recordTurnInterrupted / recordTurnComplete race.
|
|
3151
|
+
*
|
|
3152
|
+
* A VAD-path barge-in fires ``recordTurnInterrupted`` synchronously
|
|
3153
|
+
* inside ``handleAudioAsync`` while the in-flight pipeline LLM stream
|
|
3154
|
+
* keeps unwinding on its own task. When the LLM stream eventually
|
|
3155
|
+
* exits, the existing pipeline path falls through to
|
|
3156
|
+
* ``recordTurnComplete``, which would push a second turn for the same
|
|
3157
|
+
* logical exchange (this time carrying ``user_text=''`` because the
|
|
3158
|
+
* field was already reset). ``_turnAlreadyClosed`` is flipped by
|
|
3159
|
+
* ``recordTurnInterrupted`` and read by ``recordTurnComplete`` so the
|
|
3160
|
+
* late ``recordTurnComplete`` becomes a no-op until the next
|
|
3161
|
+
* ``startTurn`` re-arms the accumulator.
|
|
3162
|
+
*/
|
|
3163
|
+
private _turnAlreadyClosed;
|
|
3070
3164
|
private _totalSttAudioSeconds;
|
|
3071
3165
|
private _totalTtsCharacters;
|
|
3072
3166
|
private _totalRealtimeCost;
|
|
@@ -3202,9 +3296,26 @@ declare class CallMetricsAccumulator {
|
|
|
3202
3296
|
* to compute ``bargein_ms``.
|
|
3203
3297
|
*/
|
|
3204
3298
|
recordTtsStopped(ts?: number): void;
|
|
3205
|
-
/**
|
|
3206
|
-
|
|
3207
|
-
|
|
3299
|
+
/**
|
|
3300
|
+
* Close the current turn cleanly and append a `TurnMetrics` record.
|
|
3301
|
+
*
|
|
3302
|
+
* Returns ``null`` when ``recordTurnInterrupted`` has already closed
|
|
3303
|
+
* the current turn — this protects against the VAD-barge-in /
|
|
3304
|
+
* pipeline-LLM race where both paths try to finalise the same logical
|
|
3305
|
+
* turn and the second would otherwise push a phantom entry with
|
|
3306
|
+
* ``user_text=''``. The caller treats ``null`` as "nothing to emit";
|
|
3307
|
+
* ``emitTurnMetrics`` is already null-safe.
|
|
3308
|
+
*/
|
|
3309
|
+
recordTurnComplete(agentText: string): TurnMetrics | null;
|
|
3310
|
+
/**
|
|
3311
|
+
* Close the current turn as interrupted (barge-in) and return the
|
|
3312
|
+
* recorded metrics. Returns ``null`` when no turn is open, OR when
|
|
3313
|
+
* ``recordTurnComplete`` has already finalised the current turn —
|
|
3314
|
+
* bidirectional parity with the guard at the top of
|
|
3315
|
+
* ``recordTurnComplete``. Prevents an out-of-order interruption (e.g.
|
|
3316
|
+
* a future refactor that reorders the bargein + LLM-unwind paths)
|
|
3317
|
+
* from overwriting a turn that the complete path already emitted.
|
|
3318
|
+
*/
|
|
3208
3319
|
recordTurnInterrupted(): TurnMetrics | null;
|
|
3209
3320
|
/**
|
|
3210
3321
|
* Record the moment VAD emitted speech_end for the current utterance.
|
|
@@ -3346,6 +3457,67 @@ declare const OpenAIRealtimeAudioFormat: {
|
|
|
3346
3457
|
};
|
|
3347
3458
|
/** Union of {@link OpenAIRealtimeAudioFormat} string values. */
|
|
3348
3459
|
type OpenAIRealtimeAudioFormat = (typeof OpenAIRealtimeAudioFormat)[keyof typeof OpenAIRealtimeAudioFormat];
|
|
3460
|
+
/**
|
|
3461
|
+
* Known OpenAI Realtime API model identifiers.
|
|
3462
|
+
*
|
|
3463
|
+
* `GPT_REALTIME_2` is OpenAI's most-capable realtime voice model
|
|
3464
|
+
* (speech-to-speech with configurable reasoning effort, stronger
|
|
3465
|
+
* instruction following, 128K context). It accepts the same session
|
|
3466
|
+
* update wire format as the v1 `gpt-realtime` family but supports an
|
|
3467
|
+
* additional `reasoning.effort` field — see `reasoningEffort` on
|
|
3468
|
+
* {@link OpenAIRealtimeOptions}. Pricing differs from the mini default;
|
|
3469
|
+
* override `DEFAULT_PRICING.openai_realtime` with the values in
|
|
3470
|
+
* `DEFAULT_PRICING.openai_realtime_2` when selecting it.
|
|
3471
|
+
*/
|
|
3472
|
+
declare const OpenAIRealtimeModel: {
|
|
3473
|
+
readonly GPT_REALTIME: "gpt-realtime";
|
|
3474
|
+
readonly GPT_REALTIME_2: "gpt-realtime-2";
|
|
3475
|
+
readonly GPT_REALTIME_MINI: "gpt-realtime-mini";
|
|
3476
|
+
readonly GPT_4O_REALTIME_PREVIEW: "gpt-4o-realtime-preview";
|
|
3477
|
+
readonly GPT_4O_MINI_REALTIME_PREVIEW: "gpt-4o-mini-realtime-preview";
|
|
3478
|
+
};
|
|
3479
|
+
/** Union of {@link OpenAIRealtimeModel} string values. */
|
|
3480
|
+
type OpenAIRealtimeModel = (typeof OpenAIRealtimeModel)[keyof typeof OpenAIRealtimeModel];
|
|
3481
|
+
/** OpenAI Realtime / TTS voice identifiers. */
|
|
3482
|
+
declare const OpenAIVoice: {
|
|
3483
|
+
readonly ALLOY: "alloy";
|
|
3484
|
+
readonly ASH: "ash";
|
|
3485
|
+
readonly BALLAD: "ballad";
|
|
3486
|
+
readonly CORAL: "coral";
|
|
3487
|
+
readonly ECHO: "echo";
|
|
3488
|
+
readonly FABLE: "fable";
|
|
3489
|
+
readonly NOVA: "nova";
|
|
3490
|
+
readonly ONYX: "onyx";
|
|
3491
|
+
readonly SAGE: "sage";
|
|
3492
|
+
readonly SHIMMER: "shimmer";
|
|
3493
|
+
readonly VERSE: "verse";
|
|
3494
|
+
};
|
|
3495
|
+
/** Union of {@link OpenAIVoice} string values. */
|
|
3496
|
+
type OpenAIVoice = (typeof OpenAIVoice)[keyof typeof OpenAIVoice];
|
|
3497
|
+
/**
|
|
3498
|
+
* Models accepted by `input_audio_transcription` on Realtime sessions.
|
|
3499
|
+
*
|
|
3500
|
+
* `GPT_REALTIME_WHISPER` is OpenAI's streaming-optimised Whisper variant
|
|
3501
|
+
* designed for low-latency transcript deltas inside a Realtime session.
|
|
3502
|
+
* Billed per minute of audio (separate from the conversational model
|
|
3503
|
+
* tokens). Use it when you want faster partial transcripts than
|
|
3504
|
+
* `whisper-1` at lower cost than `gpt-4o-transcribe`.
|
|
3505
|
+
*/
|
|
3506
|
+
declare const OpenAITranscriptionModel: {
|
|
3507
|
+
readonly WHISPER_1: "whisper-1";
|
|
3508
|
+
readonly GPT_4O_TRANSCRIBE: "gpt-4o-transcribe";
|
|
3509
|
+
readonly GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe";
|
|
3510
|
+
readonly GPT_REALTIME_WHISPER: "gpt-realtime-whisper";
|
|
3511
|
+
};
|
|
3512
|
+
/** Union of {@link OpenAITranscriptionModel} string values. */
|
|
3513
|
+
type OpenAITranscriptionModel = (typeof OpenAITranscriptionModel)[keyof typeof OpenAITranscriptionModel];
|
|
3514
|
+
/** Server-side voice-activity-detection modes. */
|
|
3515
|
+
declare const OpenAIRealtimeVADType: {
|
|
3516
|
+
readonly SERVER_VAD: "server_vad";
|
|
3517
|
+
readonly SEMANTIC_VAD: "semantic_vad";
|
|
3518
|
+
};
|
|
3519
|
+
/** Union of {@link OpenAIRealtimeVADType} string values. */
|
|
3520
|
+
type OpenAIRealtimeVADType = (typeof OpenAIRealtimeVADType)[keyof typeof OpenAIRealtimeVADType];
|
|
3349
3521
|
/** Callback signature for events emitted by {@link OpenAIRealtimeAdapter}. */
|
|
3350
3522
|
type RealtimeEventCallback = (type: string, data: unknown) => void | Promise<void>;
|
|
3351
3523
|
/** Constructor options for {@link OpenAIRealtimeAdapter}. */
|
|
@@ -3493,6 +3665,17 @@ declare class OpenAIRealtimeAdapter {
|
|
|
3493
3665
|
cancelResponse(): void;
|
|
3494
3666
|
/** Inject a user text turn and request a new response. */
|
|
3495
3667
|
sendText(text: string): Promise<void>;
|
|
3668
|
+
/**
|
|
3669
|
+
* Trigger `response.create` with no new user item.
|
|
3670
|
+
*
|
|
3671
|
+
* Used by the Realtime stream-handler to drive a response after the
|
|
3672
|
+
* client-side hallucination filter accepts an
|
|
3673
|
+
* `input_audio_transcription.completed` event. The server VAD config
|
|
3674
|
+
* sets `create_response: false` so OpenAI no longer auto-creates a
|
|
3675
|
+
* response on every `input_audio_buffer.committed`; Patter is now
|
|
3676
|
+
* responsible for triggering it explicitly when a real user turn lands.
|
|
3677
|
+
*/
|
|
3678
|
+
requestResponse(): Promise<void>;
|
|
3496
3679
|
/**
|
|
3497
3680
|
* Make the AI speak ``text`` as its opening line.
|
|
3498
3681
|
*
|
|
@@ -4708,6 +4891,18 @@ interface Transcript$3 {
|
|
|
4708
4891
|
}
|
|
4709
4892
|
type TranscriptCallback$3 = (transcript: Transcript$3) => void;
|
|
4710
4893
|
type ErrorCallback$1 = (error: Error) => void;
|
|
4894
|
+
/** Known Deepgram STT models. */
|
|
4895
|
+
declare const DeepgramModel: {
|
|
4896
|
+
readonly NOVA_3: "nova-3";
|
|
4897
|
+
readonly NOVA_2: "nova-2";
|
|
4898
|
+
readonly NOVA_2_PHONECALL: "nova-2-phonecall";
|
|
4899
|
+
readonly NOVA_2_GENERAL: "nova-2-general";
|
|
4900
|
+
readonly NOVA_2_MEETING: "nova-2-meeting";
|
|
4901
|
+
readonly NOVA: "nova";
|
|
4902
|
+
readonly ENHANCED: "enhanced";
|
|
4903
|
+
readonly BASE: "base";
|
|
4904
|
+
};
|
|
4905
|
+
type DeepgramModel = (typeof DeepgramModel)[keyof typeof DeepgramModel];
|
|
4711
4906
|
/**
|
|
4712
4907
|
* Optional tuning knobs for Deepgram live transcription.
|
|
4713
4908
|
*
|
|
@@ -4834,6 +5029,202 @@ declare class DeepgramSTT {
|
|
|
4834
5029
|
close(): void;
|
|
4835
5030
|
}
|
|
4836
5031
|
|
|
5032
|
+
/**
|
|
5033
|
+
* Cartesia TTS provider — HTTP `/tts/bytes` endpoint.
|
|
5034
|
+
*
|
|
5035
|
+
* Cartesia also offers a WebSocket streaming mode with word timestamps;
|
|
5036
|
+
* this provider focuses on the chunked-bytes HTTP API which maps cleanly
|
|
5037
|
+
* onto Patter's `synthesize(text)` contract and keeps the provider
|
|
5038
|
+
* dependency-free (just `fetch`).
|
|
5039
|
+
*
|
|
5040
|
+
* Default model is `sonic-3` (GA snapshot `sonic-3-2026-01-12`) — Cartesia's
|
|
5041
|
+
* current GA model with a documented ~90 ms TTFB target. Voice IDs from the
|
|
5042
|
+
* sonic-2 generation (including the default Katie voice) remain compatible.
|
|
5043
|
+
*
|
|
5044
|
+
* **Telephony optimization** — the constructor default
|
|
5045
|
+
* `sampleRate=16000` is correct for web playback, dashboard previews, and
|
|
5046
|
+
* 16 kHz pipelines. For real phone calls, use the carrier-specific
|
|
5047
|
+
* factories instead:
|
|
5048
|
+
*
|
|
5049
|
+
* - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
|
|
5050
|
+
* Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
|
|
5051
|
+
* the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
|
|
5052
|
+
* μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
|
|
5053
|
+
* PCM at the source skips the resample step (saves ~10–30 ms first-
|
|
5054
|
+
* byte plus per-frame CPU and removes a potential aliasing source).
|
|
5055
|
+
* The PCM → μ-law transcode still happens client-side.
|
|
5056
|
+
* - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
|
|
5057
|
+
* negotiates L16/16000 on its bidirectional media WebSocket, so
|
|
5058
|
+
* 16 kHz PCM is already the format used end-to-end and no
|
|
5059
|
+
* transcoding happens. This is the same as the bare-constructor
|
|
5060
|
+
* default and exists for API symmetry with the Twilio factory.
|
|
5061
|
+
*/
|
|
5062
|
+
/** Known Cartesia TTS models. */
|
|
5063
|
+
declare const CartesiaTTSModel: {
|
|
5064
|
+
readonly SONIC_3: "sonic-3";
|
|
5065
|
+
readonly SONIC_2: "sonic-2";
|
|
5066
|
+
readonly SONIC: "sonic";
|
|
5067
|
+
};
|
|
5068
|
+
type CartesiaTTSModel = (typeof CartesiaTTSModel)[keyof typeof CartesiaTTSModel];
|
|
5069
|
+
/** Common PCM sample rates accepted by the Cartesia bytes endpoint. */
|
|
5070
|
+
declare const CartesiaTTSSampleRate: {
|
|
5071
|
+
readonly HZ_8000: 8000;
|
|
5072
|
+
readonly HZ_16000: 16000;
|
|
5073
|
+
readonly HZ_22050: 22050;
|
|
5074
|
+
readonly HZ_24000: 24000;
|
|
5075
|
+
readonly HZ_44100: 44100;
|
|
5076
|
+
};
|
|
5077
|
+
type CartesiaTTSSampleRate = (typeof CartesiaTTSSampleRate)[keyof typeof CartesiaTTSSampleRate];
|
|
5078
|
+
/** Voice-selection mode passed in the Cartesia bytes payload. */
|
|
5079
|
+
declare const CartesiaTTSVoiceMode: {
|
|
5080
|
+
readonly ID: "id";
|
|
5081
|
+
readonly EMBEDDING: "embedding";
|
|
5082
|
+
};
|
|
5083
|
+
type CartesiaTTSVoiceMode = (typeof CartesiaTTSVoiceMode)[keyof typeof CartesiaTTSVoiceMode];
|
|
5084
|
+
/** Constructor options for {@link CartesiaTTS}. */
|
|
5085
|
+
interface CartesiaTTSOptions$1 {
|
|
5086
|
+
model?: CartesiaTTSModel | string;
|
|
5087
|
+
voice?: string;
|
|
5088
|
+
language?: string;
|
|
5089
|
+
sampleRate?: CartesiaTTSSampleRate | number;
|
|
5090
|
+
speed?: string | number;
|
|
5091
|
+
emotion?: string | string[];
|
|
5092
|
+
volume?: number;
|
|
5093
|
+
baseUrl?: string;
|
|
5094
|
+
apiVersion?: string;
|
|
5095
|
+
}
|
|
5096
|
+
/** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
|
|
5097
|
+
declare class CartesiaTTS {
|
|
5098
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5099
|
+
static readonly providerKey = "cartesia_tts";
|
|
5100
|
+
private readonly apiKey;
|
|
5101
|
+
private readonly model;
|
|
5102
|
+
private readonly voice;
|
|
5103
|
+
private readonly language;
|
|
5104
|
+
private readonly sampleRate;
|
|
5105
|
+
private readonly speed?;
|
|
5106
|
+
private readonly emotion?;
|
|
5107
|
+
private readonly volume?;
|
|
5108
|
+
private readonly baseUrl;
|
|
5109
|
+
private readonly apiVersion;
|
|
5110
|
+
constructor(apiKey: string, opts?: CartesiaTTSOptions$1);
|
|
5111
|
+
/**
|
|
5112
|
+
* Construct an instance pre-configured for Twilio Media Streams.
|
|
5113
|
+
*
|
|
5114
|
+
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
5115
|
+
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
5116
|
+
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
5117
|
+
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
5118
|
+
* removes a potential aliasing source.
|
|
5119
|
+
*/
|
|
5120
|
+
static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
5121
|
+
/**
|
|
5122
|
+
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
5123
|
+
*
|
|
5124
|
+
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
5125
|
+
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
5126
|
+
* the bare-constructor default; exists for API symmetry with
|
|
5127
|
+
* {@link CartesiaTTS.forTwilio}.
|
|
5128
|
+
*/
|
|
5129
|
+
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
5130
|
+
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
5131
|
+
private buildPayload;
|
|
5132
|
+
/**
|
|
5133
|
+
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5134
|
+
*
|
|
5135
|
+
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5136
|
+
* are already up by the time the first `synthesizeStream()` POST
|
|
5137
|
+
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5138
|
+
* debug level.
|
|
5139
|
+
*
|
|
5140
|
+
* Billing safety: `GET /voices` is a free metadata read on
|
|
5141
|
+
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5142
|
+
* not consume synthesis credits. The actual synthesis is billed
|
|
5143
|
+
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5144
|
+
*
|
|
5145
|
+
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5146
|
+
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5147
|
+
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5148
|
+
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
5149
|
+
*/
|
|
5150
|
+
warmup(): Promise<void>;
|
|
5151
|
+
/** Synthesize text and return the concatenated audio buffer. */
|
|
5152
|
+
synthesize(text: string): Promise<Buffer>;
|
|
5153
|
+
/**
|
|
5154
|
+
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5155
|
+
* `sampleRate` as they arrive from Cartesia.
|
|
5156
|
+
*/
|
|
5157
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
5158
|
+
}
|
|
5159
|
+
|
|
5160
|
+
/**
|
|
5161
|
+
* Rime TTS provider — HTTP chunked endpoint.
|
|
5162
|
+
*
|
|
5163
|
+
* Supports both Arcana and Mist model families. The Arcana model can take
|
|
5164
|
+
* up to ~80% of the output audio's duration to synthesize, so its request
|
|
5165
|
+
* timeout is bumped to 4 minutes.
|
|
5166
|
+
*/
|
|
5167
|
+
/** Rime TTS model families. */
|
|
5168
|
+
declare const RimeModel: {
|
|
5169
|
+
readonly ARCANA: "arcana";
|
|
5170
|
+
readonly MIST: "mist";
|
|
5171
|
+
readonly MIST_V2: "mistv2";
|
|
5172
|
+
};
|
|
5173
|
+
type RimeModel = (typeof RimeModel)[keyof typeof RimeModel];
|
|
5174
|
+
/** Supported `Accept` header values (response audio MIME types) for Rime TTS. */
|
|
5175
|
+
declare const RimeAudioFormat: {
|
|
5176
|
+
readonly PCM: "audio/pcm";
|
|
5177
|
+
readonly MP3: "audio/mp3";
|
|
5178
|
+
readonly WAV: "audio/wav";
|
|
5179
|
+
readonly MULAW: "audio/mulaw";
|
|
5180
|
+
};
|
|
5181
|
+
type RimeAudioFormat = (typeof RimeAudioFormat)[keyof typeof RimeAudioFormat];
|
|
5182
|
+
/** Constructor options for {@link RimeTTS}. */
|
|
5183
|
+
interface RimeTTSOptions$1 {
|
|
5184
|
+
model?: string;
|
|
5185
|
+
speaker?: string;
|
|
5186
|
+
lang?: string;
|
|
5187
|
+
sampleRate?: number;
|
|
5188
|
+
repetitionPenalty?: number;
|
|
5189
|
+
temperature?: number;
|
|
5190
|
+
topP?: number;
|
|
5191
|
+
maxTokens?: number;
|
|
5192
|
+
speedAlpha?: number;
|
|
5193
|
+
reduceLatency?: boolean;
|
|
5194
|
+
pauseBetweenBrackets?: boolean;
|
|
5195
|
+
phonemizeBetweenBrackets?: boolean;
|
|
5196
|
+
baseUrl?: string;
|
|
5197
|
+
}
|
|
5198
|
+
/** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
|
|
5199
|
+
declare class RimeTTS {
|
|
5200
|
+
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5201
|
+
static readonly providerKey = "rime";
|
|
5202
|
+
private readonly apiKey;
|
|
5203
|
+
private readonly model;
|
|
5204
|
+
private readonly speaker;
|
|
5205
|
+
private readonly lang;
|
|
5206
|
+
private readonly sampleRate;
|
|
5207
|
+
private readonly repetitionPenalty?;
|
|
5208
|
+
private readonly temperature?;
|
|
5209
|
+
private readonly topP?;
|
|
5210
|
+
private readonly maxTokens?;
|
|
5211
|
+
private readonly speedAlpha?;
|
|
5212
|
+
private readonly reduceLatency?;
|
|
5213
|
+
private readonly pauseBetweenBrackets?;
|
|
5214
|
+
private readonly phonemizeBetweenBrackets?;
|
|
5215
|
+
private readonly baseUrl;
|
|
5216
|
+
private readonly totalTimeoutMs;
|
|
5217
|
+
constructor(apiKey: string, opts?: RimeTTSOptions$1);
|
|
5218
|
+
private buildPayload;
|
|
5219
|
+
/** Synthesize text and return the concatenated audio buffer. */
|
|
5220
|
+
synthesize(text: string): Promise<Buffer>;
|
|
5221
|
+
/**
|
|
5222
|
+
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5223
|
+
* `sampleRate` as they stream in.
|
|
5224
|
+
*/
|
|
5225
|
+
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
5226
|
+
}
|
|
5227
|
+
|
|
4837
5228
|
/** Deepgram streaming STT for Patter pipeline mode. */
|
|
4838
5229
|
|
|
4839
5230
|
type DeepgramSTTOptions = DeepgramSTTOptions$1 & {
|
|
@@ -5488,128 +5879,6 @@ declare class TTS$4 extends OpenAITTS {
|
|
|
5488
5879
|
constructor(opts?: OpenAITTSOptions);
|
|
5489
5880
|
}
|
|
5490
5881
|
|
|
5491
|
-
/**
|
|
5492
|
-
* Cartesia TTS provider — HTTP `/tts/bytes` endpoint.
|
|
5493
|
-
*
|
|
5494
|
-
* Cartesia also offers a WebSocket streaming mode with word timestamps;
|
|
5495
|
-
* this provider focuses on the chunked-bytes HTTP API which maps cleanly
|
|
5496
|
-
* onto Patter's `synthesize(text)` contract and keeps the provider
|
|
5497
|
-
* dependency-free (just `fetch`).
|
|
5498
|
-
*
|
|
5499
|
-
* Default model is `sonic-3` (GA snapshot `sonic-3-2026-01-12`) — Cartesia's
|
|
5500
|
-
* current GA model with a documented ~90 ms TTFB target. Voice IDs from the
|
|
5501
|
-
* sonic-2 generation (including the default Katie voice) remain compatible.
|
|
5502
|
-
*
|
|
5503
|
-
* **Telephony optimization** — the constructor default
|
|
5504
|
-
* `sampleRate=16000` is correct for web playback, dashboard previews, and
|
|
5505
|
-
* 16 kHz pipelines. For real phone calls, use the carrier-specific
|
|
5506
|
-
* factories instead:
|
|
5507
|
-
*
|
|
5508
|
-
* - {@link CartesiaTTS.forTwilio} requests `sampleRate=8000` natively from
|
|
5509
|
-
* Cartesia. Twilio's media-stream WebSocket expects μ-law @ 8 kHz, so
|
|
5510
|
-
* the SDK normally resamples 16 kHz → 8 kHz before doing the PCM →
|
|
5511
|
-
* μ-law transcode in `TwilioAudioSender`. Asking Cartesia for 8 kHz
|
|
5512
|
-
* PCM at the source skips the resample step (saves ~10–30 ms first-
|
|
5513
|
-
* byte plus per-frame CPU and removes a potential aliasing source).
|
|
5514
|
-
* The PCM → μ-law transcode still happens client-side.
|
|
5515
|
-
* - {@link CartesiaTTS.forTelnyx} requests `sampleRate=16000`. Telnyx
|
|
5516
|
-
* negotiates L16/16000 on its bidirectional media WebSocket, so
|
|
5517
|
-
* 16 kHz PCM is already the format used end-to-end and no
|
|
5518
|
-
* transcoding happens. This is the same as the bare-constructor
|
|
5519
|
-
* default and exists for API symmetry with the Twilio factory.
|
|
5520
|
-
*/
|
|
5521
|
-
/** Known Cartesia TTS models. */
|
|
5522
|
-
declare const CartesiaTTSModel: {
|
|
5523
|
-
readonly SONIC_3: "sonic-3";
|
|
5524
|
-
readonly SONIC_2: "sonic-2";
|
|
5525
|
-
readonly SONIC: "sonic";
|
|
5526
|
-
};
|
|
5527
|
-
type CartesiaTTSModel = (typeof CartesiaTTSModel)[keyof typeof CartesiaTTSModel];
|
|
5528
|
-
/** Common PCM sample rates accepted by the Cartesia bytes endpoint. */
|
|
5529
|
-
declare const CartesiaTTSSampleRate: {
|
|
5530
|
-
readonly HZ_8000: 8000;
|
|
5531
|
-
readonly HZ_16000: 16000;
|
|
5532
|
-
readonly HZ_22050: 22050;
|
|
5533
|
-
readonly HZ_24000: 24000;
|
|
5534
|
-
readonly HZ_44100: 44100;
|
|
5535
|
-
};
|
|
5536
|
-
type CartesiaTTSSampleRate = (typeof CartesiaTTSSampleRate)[keyof typeof CartesiaTTSSampleRate];
|
|
5537
|
-
/** Constructor options for {@link CartesiaTTS}. */
|
|
5538
|
-
interface CartesiaTTSOptions$1 {
|
|
5539
|
-
model?: CartesiaTTSModel | string;
|
|
5540
|
-
voice?: string;
|
|
5541
|
-
language?: string;
|
|
5542
|
-
sampleRate?: CartesiaTTSSampleRate | number;
|
|
5543
|
-
speed?: string | number;
|
|
5544
|
-
emotion?: string | string[];
|
|
5545
|
-
volume?: number;
|
|
5546
|
-
baseUrl?: string;
|
|
5547
|
-
apiVersion?: string;
|
|
5548
|
-
}
|
|
5549
|
-
/** Cartesia TTS provider backed by the HTTP `/tts/bytes` streaming endpoint. */
|
|
5550
|
-
declare class CartesiaTTS {
|
|
5551
|
-
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5552
|
-
static readonly providerKey = "cartesia_tts";
|
|
5553
|
-
private readonly apiKey;
|
|
5554
|
-
private readonly model;
|
|
5555
|
-
private readonly voice;
|
|
5556
|
-
private readonly language;
|
|
5557
|
-
private readonly sampleRate;
|
|
5558
|
-
private readonly speed?;
|
|
5559
|
-
private readonly emotion?;
|
|
5560
|
-
private readonly volume?;
|
|
5561
|
-
private readonly baseUrl;
|
|
5562
|
-
private readonly apiVersion;
|
|
5563
|
-
constructor(apiKey: string, opts?: CartesiaTTSOptions$1);
|
|
5564
|
-
/**
|
|
5565
|
-
* Construct an instance pre-configured for Twilio Media Streams.
|
|
5566
|
-
*
|
|
5567
|
-
* Sets `sampleRate=8000` so Cartesia emits PCM_S16LE @ 8 kHz directly.
|
|
5568
|
-
* Twilio's media stream uses μ-law @ 8 kHz so the SDK still does the
|
|
5569
|
-
* PCM → μ-law transcode client-side, but the 16 kHz → 8 kHz resample
|
|
5570
|
-
* step is skipped. Saves ~10–30 ms first-byte plus per-frame CPU and
|
|
5571
|
-
* removes a potential aliasing source.
|
|
5572
|
-
*/
|
|
5573
|
-
static forTwilio(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
5574
|
-
/**
|
|
5575
|
-
* Construct an instance pre-configured for Telnyx bidirectional media.
|
|
5576
|
-
*
|
|
5577
|
-
* Sets `sampleRate=16000` to match Telnyx's L16/16000 default codec —
|
|
5578
|
-
* audio flows end-to-end with zero resampling or transcoding. Same as
|
|
5579
|
-
* the bare-constructor default; exists for API symmetry with
|
|
5580
|
-
* {@link CartesiaTTS.forTwilio}.
|
|
5581
|
-
*/
|
|
5582
|
-
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions$1, 'sampleRate'>): CartesiaTTS;
|
|
5583
|
-
/** Build the JSON payload for the Cartesia bytes endpoint. */
|
|
5584
|
-
private buildPayload;
|
|
5585
|
-
/**
|
|
5586
|
-
* Pre-call HTTP warmup for the Cartesia `/tts/bytes` endpoint.
|
|
5587
|
-
*
|
|
5588
|
-
* Issues a lightweight `GET <baseUrl>/voices` so DNS, TLS, and HTTP/2
|
|
5589
|
-
* are already up by the time the first `synthesizeStream()` POST
|
|
5590
|
-
* lands. Best-effort: 5 s timeout, all exceptions swallowed at
|
|
5591
|
-
* debug level.
|
|
5592
|
-
*
|
|
5593
|
-
* Billing safety: `GET /voices` is a free metadata read on
|
|
5594
|
-
* Cartesia's REST surface (per https://docs.cartesia.ai). It does
|
|
5595
|
-
* not consume synthesis credits. The actual synthesis is billed
|
|
5596
|
-
* only when `POST /tts/bytes` runs with a non-empty `transcript`.
|
|
5597
|
-
*
|
|
5598
|
-
* Note: Cartesia TTS uses the HTTP path (vs the WebSocket variant
|
|
5599
|
-
* Cartesia also exposes) — connection warmup is therefore HTTP-GET
|
|
5600
|
-
* based, not WebSocket pre-handshake. The latency win is smaller
|
|
5601
|
-
* (~50-150 ms vs the ~200-500 ms of a WS prewarm) but still real.
|
|
5602
|
-
*/
|
|
5603
|
-
warmup(): Promise<void>;
|
|
5604
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
5605
|
-
synthesize(text: string): Promise<Buffer>;
|
|
5606
|
-
/**
|
|
5607
|
-
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5608
|
-
* `sampleRate` as they arrive from Cartesia.
|
|
5609
|
-
*/
|
|
5610
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
5611
|
-
}
|
|
5612
|
-
|
|
5613
5882
|
/** Cartesia TTS for Patter pipeline mode. */
|
|
5614
5883
|
|
|
5615
5884
|
/** Constructor options for the Cartesia `TTS` adapter. */
|
|
@@ -5658,52 +5927,6 @@ declare class TTS$3 extends CartesiaTTS {
|
|
|
5658
5927
|
static forTelnyx(apiKey: string, options?: Omit<CartesiaTTSOptions, "sampleRate">): TTS$3;
|
|
5659
5928
|
}
|
|
5660
5929
|
|
|
5661
|
-
/** Constructor options for {@link RimeTTS}. */
|
|
5662
|
-
interface RimeTTSOptions$1 {
|
|
5663
|
-
model?: string;
|
|
5664
|
-
speaker?: string;
|
|
5665
|
-
lang?: string;
|
|
5666
|
-
sampleRate?: number;
|
|
5667
|
-
repetitionPenalty?: number;
|
|
5668
|
-
temperature?: number;
|
|
5669
|
-
topP?: number;
|
|
5670
|
-
maxTokens?: number;
|
|
5671
|
-
speedAlpha?: number;
|
|
5672
|
-
reduceLatency?: boolean;
|
|
5673
|
-
pauseBetweenBrackets?: boolean;
|
|
5674
|
-
phonemizeBetweenBrackets?: boolean;
|
|
5675
|
-
baseUrl?: string;
|
|
5676
|
-
}
|
|
5677
|
-
/** Rime TTS adapter for the `users.rime.ai/v1/rime-tts` HTTP streaming endpoint. */
|
|
5678
|
-
declare class RimeTTS {
|
|
5679
|
-
/** Stable pricing/dashboard key — read by stream-handler/metrics. */
|
|
5680
|
-
static readonly providerKey = "rime";
|
|
5681
|
-
private readonly apiKey;
|
|
5682
|
-
private readonly model;
|
|
5683
|
-
private readonly speaker;
|
|
5684
|
-
private readonly lang;
|
|
5685
|
-
private readonly sampleRate;
|
|
5686
|
-
private readonly repetitionPenalty?;
|
|
5687
|
-
private readonly temperature?;
|
|
5688
|
-
private readonly topP?;
|
|
5689
|
-
private readonly maxTokens?;
|
|
5690
|
-
private readonly speedAlpha?;
|
|
5691
|
-
private readonly reduceLatency?;
|
|
5692
|
-
private readonly pauseBetweenBrackets?;
|
|
5693
|
-
private readonly phonemizeBetweenBrackets?;
|
|
5694
|
-
private readonly baseUrl;
|
|
5695
|
-
private readonly totalTimeoutMs;
|
|
5696
|
-
constructor(apiKey: string, opts?: RimeTTSOptions$1);
|
|
5697
|
-
private buildPayload;
|
|
5698
|
-
/** Synthesize text and return the concatenated audio buffer. */
|
|
5699
|
-
synthesize(text: string): Promise<Buffer>;
|
|
5700
|
-
/**
|
|
5701
|
-
* Synthesize text and yield raw PCM_S16LE chunks at the configured
|
|
5702
|
-
* `sampleRate` as they stream in.
|
|
5703
|
-
*/
|
|
5704
|
-
synthesizeStream(text: string): AsyncGenerator<Buffer>;
|
|
5705
|
-
}
|
|
5706
|
-
|
|
5707
5930
|
/** Rime TTS for Patter pipeline mode. */
|
|
5708
5931
|
|
|
5709
5932
|
/** Constructor options for the Rime `TTS` adapter. */
|
|
@@ -6743,6 +6966,41 @@ declare class OpenAIRealtime2Adapter extends OpenAIRealtimeAdapter {
|
|
|
6743
6966
|
* output}` + `output_modalities` + `session.type === "realtime"`.
|
|
6744
6967
|
*/
|
|
6745
6968
|
connect(): Promise<void>;
|
|
6969
|
+
/**
|
|
6970
|
+
* GA-API variant of {@link OpenAIRealtimeAdapter.openParkedConnection}.
|
|
6971
|
+
* Opens a fresh Realtime WS against the GA endpoint, exchanges
|
|
6972
|
+
* `session.created` → GA-shape `session.update` → `session.updated`
|
|
6973
|
+
* so the upstream session is fully primed, and returns the OPEN
|
|
6974
|
+
* socket WITHOUT taking it on `this.ws` or arming the heartbeat /
|
|
6975
|
+
* message listener.
|
|
6976
|
+
*
|
|
6977
|
+
* Used by `Patter.parkProviderConnections` during the carrier
|
|
6978
|
+
* ringing window so the per-call `StreamHandler` can adopt the
|
|
6979
|
+
* primed socket at carrier `start` — eliminating the TCP + TLS +
|
|
6980
|
+
* HTTP-101 + `session.update` ack round-trip from the critical path.
|
|
6981
|
+
* Saves ~300-600 ms of first-audible-word latency.
|
|
6982
|
+
*
|
|
6983
|
+
* Bounded by 8 s. Throws on timeout / handshake failure / GA-side
|
|
6984
|
+
* rejection. Callers treat any error as a cache miss and fall
|
|
6985
|
+
* through to the cold {@link connect} path.
|
|
6986
|
+
*
|
|
6987
|
+
* Billing safety: confirmed by OpenAI's Managing Realtime Costs
|
|
6988
|
+
* guide — `session.update` does NOT invoke the model and bills no
|
|
6989
|
+
* tokens. An idle parked socket costs $0.
|
|
6990
|
+
*/
|
|
6991
|
+
openParkedConnection(): Promise<WebSocket__default>;
|
|
6992
|
+
/**
|
|
6993
|
+
* GA-API variant of {@link OpenAIRealtimeAdapter.adoptWebSocket}. Takes
|
|
6994
|
+
* over a WS that {@link openParkedConnection} produced (already through
|
|
6995
|
+
* `session.created` + `session.update` + `session.updated`) and arms
|
|
6996
|
+
* the heartbeat + message listener so the GA event-translation shim
|
|
6997
|
+
* is wired up. Skips the cold-connect path — saves ~300-600 ms on
|
|
6998
|
+
* first audible word.
|
|
6999
|
+
*
|
|
7000
|
+
* Caller MUST verify `ws.readyState === OPEN` before calling. If the
|
|
7001
|
+
* parked WS died between park and adopt, fall back to {@link connect}.
|
|
7002
|
+
*/
|
|
7003
|
+
adoptWebSocket(ws: WebSocket__default): void;
|
|
6746
7004
|
/**
|
|
6747
7005
|
* GA-API variant of {@link OpenAIRealtimeAdapter.sendFirstMessage}. Two
|
|
6748
7006
|
* differences from the v1 path:
|
|
@@ -7430,10 +7688,18 @@ declare class TwilioAdapter {
|
|
|
7430
7688
|
/** Place an outbound call. Returns the Twilio call SID. */
|
|
7431
7689
|
initiateCall(opts: InitiateCallOptions$1): Promise<InitiateCallResult$1>;
|
|
7432
7690
|
/**
|
|
7433
|
-
* Build a
|
|
7434
|
-
*
|
|
7691
|
+
* Build a ``<Response><Connect><Stream url="...">`` TwiML document.
|
|
7692
|
+
*
|
|
7693
|
+
* ``parameters`` is forwarded as ``<Parameter name="..." value="..."/>``
|
|
7694
|
+
* children of ``<Stream>``. Twilio Media Streams strips query-string params
|
|
7695
|
+
* from the ``<Stream url=...>`` before the WS handshake, so
|
|
7696
|
+
* ``<Parameter>`` tags are the supported way to pre-populate
|
|
7697
|
+
* ``start.customParameters`` on the WS ``start`` frame. Used by the
|
|
7698
|
+
* inbound path to carry caller / callee through to the bridge.
|
|
7699
|
+
*
|
|
7700
|
+
* Mirrors the Python adapter's ``generate_stream_twiml``.
|
|
7435
7701
|
*/
|
|
7436
|
-
static generateStreamTwiml(streamUrl: string): string;
|
|
7702
|
+
static generateStreamTwiml(streamUrl: string, parameters?: Record<string, string>): string;
|
|
7437
7703
|
/** Force-complete an in-progress call. */
|
|
7438
7704
|
endCall(callSid: string): Promise<void>;
|
|
7439
7705
|
}
|
|
@@ -7665,4 +7931,4 @@ interface CallEvent {
|
|
|
7665
7931
|
readonly direction?: string;
|
|
7666
7932
|
}
|
|
7667
7933
|
|
|
7668
|
-
export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, type CartesiaTTSOptions, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type 
KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, type RealtimeOptions as OpenAIRealtimeOptions, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, 
type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, 
whisper };
|
|
7934
|
+
export { type AgentOptions, type AgentState, AllProvidersFailedError, type AnthropicConversion, LLM$3 as AnthropicLLM, type AnthropicLLMOptions, type AnthropicMessage, AssemblyAIEncoding, AssemblyAIModel, STT$1 as AssemblyAISTT, type AssemblyAISTTOptions, type AudioConfig, type AudioSource, AuthenticationError, type BackgroundAudioOptions, BackgroundAudioPlayer, type EvaluateContext as BargeInEvaluateContext, type BargeInStrategy, BuiltinAudioClip, type BuiltinAudioClipName, type BuiltinPcmSource, type CallControl, type CallEvent, type CallEventHandler, type CallMetrics, CallMetricsAccumulator, type CallRecord, type CartesiaEncoding, STT$3 as CartesiaSTT, type CartesiaSTTOptions, TTS$3 as CartesiaTTS, CartesiaTTSModel, type CartesiaTTSOptions, CartesiaTTSVoiceMode, LLM$1 as CerebrasLLM, type CerebrasLLMOptions, ChatContext, type ChatMessage, type ChatRole, CloudflareTunnel, type ConversationStateSnapshot, type CostBreakdown, DEFAULT_MIN_SENTENCE_LEN, DEFAULT_PRICING, DTMF_EVENTS, DeepFilterNetFilter, type DeepFilterNetOptions, DeepgramModel, STT$6 as DeepgramSTT, type DeepgramSTTOptions, DefaultToolExecutor, type DefaultToolExecutorOptions, type DefineToolInput, type DtmfEvent, ConvAI as ElevenLabsConvAI, ElevenLabsConvAIAdapter, type ConvAIOptions as ElevenLabsConvAIOptions, ElevenLabsModel, ElevenLabsOutputFormat, ElevenLabsTTS as ElevenLabsRestTTS, TTS$6 as ElevenLabsTTS, type ElevenLabsTTSOptions, type ElevenLabsWebSocketOptions, TTS$5 as ElevenLabsWebSocketTTS, type EouTrigger, ErrorCode, EventBus, FallbackLLMProvider, type FallbackLLMProviderOptions, type FilePcmSource, GEMINI_DEFAULT_INPUT_SR, GEMINI_DEFAULT_OUTPUT_SR, GeminiLiveAdapter, type GeminiLiveEventHandler, LLM as GoogleLLM, type GoogleLLMOptions, LLM$2 as GroqLLM, type GroqLLMOptions, Guardrail$1 as Guardrail, type GuardrailOptions, type HookContext, IVRActivity, type IVRActivityOptions, type IVRToolDefinition, type IncomingMessage, type InitTracingOptions, TTS as InworldTTS, type 
InworldTTSOptions, type JobCallback, KrispFrameDuration, KrispSampleRate, KrispVivaFilter, type KrispVivaFilterOptions, type LLMChunk, LLMLoop, type LLMProvider, LMNTAudioFormat, LMNTModel, LMNTSampleRate, TTS$1 as LMNTTTS, type LMNTTTSOptions, type LatencyBreakdown, type LocalCallOptions, type LocalConfig, type LocalOptions, type Logger, type LoopCallback, type MessageHandler, MetricsStore, MinWordsStrategy, type MinWordsStrategyOptions, type ModelPricing, Ngrok, LLM$4 as OpenAILLM, type OpenAILLMOptions, OpenAILLMProvider, type OpenAIMessage, Realtime as OpenAIRealtime, Realtime2 as OpenAIRealtime2, OpenAIRealtime2Adapter, type Realtime2Options as OpenAIRealtime2Options, OpenAIRealtimeAdapter, OpenAIRealtimeAudioFormat, OpenAIRealtimeModel, type RealtimeOptions as OpenAIRealtimeOptions, OpenAIRealtimeVADType, TTS$4 as OpenAITTS, type OpenAITTSOptions, STT$4 as OpenAITranscribeSTT, type OpenAITranscribeSTTOptions, OpenAITranscriptionModel, OpenAIVoice, PRICING_LAST_UPDATED, PRICING_VERSION, type ParamSpec, PartialStreamError, Patter, PatterConnectionError, PatterError, type PatterEventType, PatterTool, type PatterToolExecuteArgs, type PatterToolOptions, type PatterToolResult, PcmCarry, PipelineHookExecutor, type PipelineHooks, type PipelineMessageHandler, PricingUnit, type PricingUnitValue, type ProviderPricing, ProvisionError, RateLimitError, type RawPcmSource, type RealtimeConfig, RemoteMessageHandler, RimeAudioFormat, RimeModel, TTS$2 as RimeTTS, type RimeTTSOptions, SPAN_BARGEIN, SPAN_CALL, SPAN_ENDPOINT, SPAN_LLM, SPAN_STT, SPAN_TOOL, SPAN_TTS, type SSEEvent, type STTConfig, type ScheduleHandle, SentenceChunker, type ServeOptions, type SilenceCallback, type SileroSampleRate, SileroVAD, type SileroVADOptions, STT$2 as SonioxSTT, type SonioxSTTOptions$1 as SonioxSTTOptions, type Span, type SpeechEventCallback, SpeechEvents, SpeechmaticsAudioEncoding, SpeechmaticsOperatingPoint, STT as SpeechmaticsSTT, type SpeechmaticsSTTOptions, SpeechmaticsSampleRate, 
SpeechmaticsServerMessage, TurnDetectionMode as SpeechmaticsTurnDetectionMode, StatefulResampler, type StatefulResamplerOptions, Static as StaticTunnel, type TTSConfig, Carrier as Telnyx, TelnyxAdapter, type TelnyxCarrierOptions, type ConfigureNumberOptions as TelnyxConfigureNumberOptions, type EndCallOptions as TelnyxEndCallOptions, type InitiateCallOptions as TelnyxInitiateCallOptions, type InitiateCallResult as TelnyxInitiateCallResult, type ProvisionNumberOptions as TelnyxProvisionNumberOptions, type ProvisionNumberResult as TelnyxProvisionNumberResult, TelnyxSTT, TelnyxSTTInputFormat, TelnyxSTTSampleRate, type Transcript as TelnyxSTTTranscript, TelnyxTTS, TelnyxTTSSampleRate, TelnyxTTSVoice, type TelnyxTranscriptionEngine, TestSession, TfidfLoopDetector, type TfidfLoopDetectorOptions, Tool, type ToolDefinition, type ToolExecutor, type ToolHandler, type ToolOptions, type TunnelHandle, type TurnMetrics, Carrier$1 as Twilio, TwilioAdapter, type TwilioAdapterOptions, type TwilioCarrierOptions, type ConfigureNumberOptions$1 as TwilioConfigureNumberOptions, type InitiateCallOptions$1 as TwilioInitiateCallOptions, type InitiateCallResult$1 as TwilioInitiateCallResult, type ProvisionNumberOptions$1 as TwilioProvisionNumberOptions, type ProvisionNumberResult$1 as TwilioProvisionNumberResult, ULTRAVOX_DEFAULT_API_BASE, ULTRAVOX_DEFAULT_SR, type UltravoxEventHandler, UltravoxRealtimeAdapter, type UserState, STT$5 as WhisperSTT, type WhisperSTTOptions, assemblyai, builtinClipPath, calculateRealtimeCost, calculateSttCost, calculateTelephonyCost, calculateTtsCost, callsToCsv, callsToJson, cartesia, createResampler16kTo8k, createResampler24kTo16k, createResampler24kTo8k, createResampler8kTo16k, deepgram, defineTool, elevenlabs, evaluateStrategies as evaluateBargeInStrategies, filterEmoji, filterForTTS, filterMarkdown, formatDtmf, geminiLive, getLogger, guardrail, initTracing, isRemoteUrl, isTracingEnabled, isWebSocketUrl, lmnt, makeAuthMiddleware, mergePricing, mixPcm, 
mountApi, mountDashboard, mulawToPcm16, notifyDashboard, openaiTts, pcm16ToMulaw, resample16kTo8k, resample24kTo16k, resample8kTo16k, resamplePcm, resetStrategies as resetBargeInStrategies, rime, scheduleCron, scheduleInterval, scheduleOnce, selectSoundFromList, setLogger, soniox, speechmatics, startSpan, startTunnel, tool, ultravox, whisper };
|