getpatter 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,9 +1,6 @@
1
- import {
2
- notifyDashboard
3
- } from "./chunk-AFUYSNDH.mjs";
4
1
  import {
5
2
  startTunnel
6
- } from "./chunk-SEMKNPCD.mjs";
3
+ } from "./chunk-XS45BAQL.mjs";
7
4
  import {
8
5
  AuthenticationError,
9
6
  CallMetricsAccumulator,
@@ -13,6 +10,7 @@ import {
13
10
  DefaultToolExecutor,
14
11
  ElevenLabsConvAIAdapter,
15
12
  EmbeddedServer,
13
+ ErrorCode,
16
14
  EventBus,
17
15
  LLMLoop,
18
16
  MetricsStore,
@@ -43,12 +41,14 @@ import {
43
41
  callsToJson,
44
42
  createResampler16kTo8k,
45
43
  createResampler24kTo16k,
44
+ createResampler24kTo8k,
46
45
  createResampler8kTo16k,
47
46
  initTracing,
48
47
  isRemoteUrl,
49
48
  isTracingEnabled,
50
49
  isWebSocketUrl,
51
50
  makeAuthMiddleware,
51
+ mergeAbortSignals,
52
52
  mergePricing,
53
53
  mountApi,
54
54
  mountDashboard,
@@ -57,20 +57,40 @@ import {
57
57
  resample16kTo8k,
58
58
  resample24kTo16k,
59
59
  resample8kTo16k,
60
+ resolveLogRoot,
60
61
  startSpan
61
- } from "./chunk-FIFIWBL7.mjs";
62
+ } from "./chunk-JUQ5WQTQ.mjs";
62
63
  import {
63
64
  getLogger,
64
65
  setLogger
65
- } from "./chunk-VJVDG4V5.mjs";
66
- import "./chunk-QHHBUCMT.mjs";
66
+ } from "./chunk-MVOQFAEO.mjs";
67
+ import {
68
+ notifyDashboard
69
+ } from "./chunk-6GR5MHHQ.mjs";
70
+ import {
71
+ SileroVAD
72
+ } from "./chunk-X3364LSI.mjs";
73
+ import {
74
+ __dirname,
75
+ __require,
76
+ init_esm_shims
77
+ } from "./chunk-N565J3CF.mjs";
78
+
79
+ // src/index.ts
80
+ init_esm_shims();
81
+
82
+ // src/client.ts
83
+ init_esm_shims();
67
84
 
68
85
  // src/engines/openai.ts
86
+ init_esm_shims();
69
87
  var Realtime = class {
70
88
  kind = "openai_realtime";
71
89
  apiKey;
72
90
  model;
73
91
  voice;
92
+ reasoningEffort;
93
+ inputAudioTranscriptionModel;
74
94
  constructor(opts = {}) {
75
95
  const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
76
96
  if (!key) {
@@ -81,10 +101,13 @@ var Realtime = class {
81
101
  this.apiKey = key;
82
102
  this.model = opts.model ?? "gpt-4o-mini-realtime-preview";
83
103
  this.voice = opts.voice ?? "alloy";
104
+ this.reasoningEffort = opts.reasoningEffort;
105
+ this.inputAudioTranscriptionModel = opts.inputAudioTranscriptionModel;
84
106
  }
85
107
  };
86
108
 
87
109
  // src/engines/elevenlabs.ts
110
+ init_esm_shims();
88
111
  var ConvAI = class {
89
112
  kind = "elevenlabs_convai";
90
113
  apiKey;
@@ -100,7 +123,7 @@ var ConvAI = class {
100
123
  }
101
124
  if (!agent) {
102
125
  throw new Error(
103
- "ElevenLabs ConvAI requires an agentId. Pass { agentId: 'agent_...' } or set ELEVENLABS_AGENT_ID in the environment."
126
+ "ElevenLabs ConvAI requires an agentId. Create one in the ElevenLabs dashboard (https://elevenlabs.io/app/conversational-ai) \u2014 the agent ID is per-deployed-agent and cannot be derived from the API key alone. Then either pass { agentId: 'agent_...' } at construction or set ELEVENLABS_AGENT_ID in the environment."
104
127
  );
105
128
  }
106
129
  this.apiKey = key;
@@ -110,6 +133,7 @@ var ConvAI = class {
110
133
  };
111
134
 
112
135
  // src/tunnels/index.ts
136
+ init_esm_shims();
113
137
  var CloudflareTunnel = class {
114
138
  kind = "cloudflare";
115
139
  };
@@ -144,11 +168,448 @@ var Ngrok = class {
144
168
  }
145
169
  };
146
170
 
171
+ // src/tools/schema-validation.ts
172
+ init_esm_shims();
173
/** Error thrown when a tool's `parameters` JSON Schema fails local validation. */
var ToolSchemaError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "ToolSchemaError";
  }
};

/**
 * Validate the `parameters` JSON Schema of a single tool definition.
 *
 * Checks, in order: `parameters` is a plain object, its root `type` is
 * `"object"`, `properties` (when present) is an object map, `required`
 * (when present) is an array of strings, and every required name is declared
 * as an own key of `properties` (when `properties` is present). When
 * `tool2.strict === true` the strict-mode rules are enforced as well.
 *
 * @param tool2 - Tool definition; reads `name`, `parameters` and `strict`.
 * @throws ToolSchemaError on the first violation found.
 */
function validateToolSchema(tool2) {
  const params = tool2.parameters;
  const tag = `tool '${tool2.name}'`;
  if (!params || typeof params !== "object" || Array.isArray(params)) {
    // `typeof` reports "object" for both null and arrays, which made the
    // message read "must be a JSON Schema object (got object)".
    const got = params === null ? "null" : Array.isArray(params) ? "array" : typeof params;
    throw new ToolSchemaError(
      `${tag}: \`parameters\` must be a JSON Schema object (got ${got}).`
    );
  }
  if (params.type !== "object") {
    throw new ToolSchemaError(
      `${tag}: \`parameters.type\` must be "object" (got ${JSON.stringify(params.type)}). OpenAI function tools require an object root.`
    );
  }
  const props = params.properties;
  if (props !== void 0 && (typeof props !== "object" || props === null || Array.isArray(props))) {
    throw new ToolSchemaError(
      `${tag}: \`parameters.properties\` must be an object map of field \u2192 JSON Schema.`
    );
  }
  if (params.required !== void 0 && !Array.isArray(params.required)) {
    throw new ToolSchemaError(
      `${tag}: \`parameters.required\` must be an array of field names.`
    );
  }
  if (Array.isArray(params.required)) {
    for (const fieldName of params.required) {
      // Entry types are checked even when `properties` is absent.
      if (typeof fieldName !== "string") {
        throw new ToolSchemaError(
          `${tag}: \`parameters.required\` entries must be strings (got ${typeof fieldName}).`
        );
      }
      // Own-property check: the `in` operator also matches inherited keys
      // such as "toString" or "constructor", which are never declared fields.
      if (props && !Object.prototype.hasOwnProperty.call(props, fieldName)) {
        throw new ToolSchemaError(
          `${tag}: \`parameters.required\` lists "${fieldName}" but it is not declared in \`parameters.properties\`.`
        );
      }
    }
  }
  if (tool2.strict === true) {
    validateStrictModeSchema(tool2.name, params);
  }
}

/**
 * Recursively enforce the strict-mode rules on a schema node: every object
 * must set `additionalProperties: false` and list every property in
 * `required`. Recurses into object properties and array `items`.
 *
 * A node counts as an object/array when its `type` is that string or an
 * array of types containing it (e.g. `["object", "null"]`), so nullable
 * nested objects are validated too.
 *
 * @param toolName - Tool name used in error messages.
 * @param schema - Schema node to check.
 * @param pathParts - Path segments from the root `parameters` to this node.
 * @throws ToolSchemaError on the first violation found.
 */
function validateStrictModeSchema(toolName, schema, pathParts = []) {
  const tag = `tool '${toolName}'`;
  const here = pathParts.length === 0 ? "parameters" : `parameters.${pathParts.join(".")}`;
  const hasType = (expected) => schema.type === expected || Array.isArray(schema.type) && schema.type.includes(expected);
  if (hasType("object")) {
    if (schema.additionalProperties !== false) {
      throw new ToolSchemaError(
        `${tag}: strict mode requires \`${here}.additionalProperties: false\` on every object \u2014 got ${JSON.stringify(schema.additionalProperties)}.`
      );
    }
    const props = schema.properties ?? {};
    const required = Array.isArray(schema.required) ? schema.required : [];
    for (const propName of Object.keys(props)) {
      if (!required.includes(propName)) {
        throw new ToolSchemaError(
          `${tag}: strict mode requires every property to be listed in \`required\` \u2014 "${here}.${propName}" is missing. Use a nullable type (e.g. ["string", "null"]) instead of an optional field.`
        );
      }
    }
    for (const [propName, propSchema] of Object.entries(props)) {
      if (propSchema && typeof propSchema === "object") {
        validateStrictModeSchema(toolName, propSchema, [...pathParts, "properties", propName]);
      }
    }
  }
  // Separate `if` (not `else if`) so a type union containing both "object"
  // and "array" has both shapes checked; for a plain string `type` only one
  // branch can match.
  if (hasType("array") && schema.items && typeof schema.items === "object") {
    validateStrictModeSchema(toolName, schema.items, [...pathParts, "items"]);
  }
}

/**
 * Validate every tool in `tools`; a falsy `tools` value is a no-op.
 *
 * @throws ToolSchemaError for the first invalid tool.
 */
function validateAllToolSchemas(tools) {
  if (!tools) return;
  for (const tool2 of tools) {
    validateToolSchema(tool2);
  }
}
254
+
255
+ // src/_speech-events.ts
256
+ init_esm_shims();
257
+ var logger = getLogger();
258
// Cached `trace` export of "@opentelemetry/api"; stays null when the module
// cannot be loaded.
var otelTrace = null;
// Set on the first `loadOtel()` call so the load is attempted only once.
var otelLoaded = false;
/**
 * Load "@opentelemetry/api" at most once and return its `trace` export.
 *
 * @returns The cached `trace` export, or `null` when loading threw.
 */
function loadOtel() {
  if (!otelLoaded) {
    // Flip the flag before loading so a failed attempt is not retried.
    otelLoaded = true;
    try {
      otelTrace = __require("@opentelemetry/api").trace;
    } catch {
      otelTrace = null;
    }
  }
  return otelTrace;
}
271
/**
 * Add an event to the active span when one exists and is recording.
 *
 * Does nothing when `loadOtel()` returns `null`. Any exception raised while
 * touching the span is logged at debug level rather than propagated.
 *
 * @param name - Span event name.
 * @param attrs - Attributes passed through to `span.addEvent`.
 */
function recordSpanEvent(name, attrs) {
  const traceApi = loadOtel();
  if (traceApi !== null) {
    try {
      const activeSpan = traceApi.getActiveSpan?.();
      if (activeSpan && activeSpan.isRecording()) {
        activeSpan.addEvent(name, attrs);
      }
    } catch (err) {
      logger.debug?.(`Failed to record OTel span event ${name}: ${String(err)}`);
    }
  }
}
282
/** Current wall-clock time in milliseconds since the Unix epoch. */
function nowMs() {
  const epochMs = Date.now();
  return epochMs;
}
285
/**
 * Dispatcher for speech-edge events.
 *
 * Holds one optional callback slot per event, tracks a small per-side state
 * snapshot (`conversationState`) and a turn counter, and records a span
 * event through `recordSpanEvent` for every dispatched event.
 */
var SpeechEvents = class {
  // Public callback slots — any of them may be set by the user. A slot set
  // to `null` or `undefined` is treated as "no callback".
  onUserSpeechStarted = null;
  onUserSpeechEnded = null;
  onUserSpeechEos = null;
  onAgentSpeechStarted = null;
  onAgentSpeechEnded = null;
  onLlmToken = null;
  onAudioOut = null;
  // State machine — read via `conversationState`.
  userState = "listening";
  agentState = "initializing";
  // Per-turn cursors. `turnIdxValue` increments on every committed EOU.
  turnIdxValue = 0;
  firstTokenForTurn = true;
  firstAudioForTurn = true;
  // Optional call start (ms since epoch) — used to compute `audioOffsetMs`
  // payloads when the caller does not provide one.
  callStartMs = null;
  /** Snapshot of the current per-side state of the call. */
  get conversationState() {
    return { user: this.userState, agent: this.agentState };
  }
  /** Current turn index. Starts at 0 and increments on every EOU commit. */
  get turnIdx() {
    return this.turnIdxValue;
  }
  /**
   * Record the call-start wall-clock for `audioOffsetMs` math and reset the
   * per-side states to `listening` / `idle`.
   *
   * @param tsMs - Call start in ms since epoch; defaults to `nowMs()`.
   */
  markCallStarted(tsMs) {
    this.callStartMs = tsMs ?? nowMs();
    this.userState = "listening";
    this.agentState = "idle";
  }
  /** Reset per-turn cursors. Called automatically on EOU commit. */
  resetTurnState() {
    this.firstTokenForTurn = true;
    this.firstAudioForTurn = true;
  }
  // ---- User-side events -----------------------------------------------
  /** Fire on the VAD positive edge of the inbound stream.
   *
   * Do not coalesce: the runner consumes positive→negative→positive
   * transitions in order. For server-VAD engines (OpenAI Realtime, Telnyx
   * Voice AI), forward the upstream signal directly — do not re-run a VAD
   * layer on top.
   */
  async fireUserSpeechStarted(opts = {}) {
    const tsMs = opts.timestampMs ?? nowMs();
    const payload = { timestamp_ms: tsMs };
    if (opts.vadConfidence !== void 0)
      payload.vad_confidence = opts.vadConfidence;
    const offset = this.resolveOffset(opts.audioOffsetMs, tsMs);
    if (offset !== null) payload.audio_offset_ms = offset;
    this.userState = "speaking";
    await this.dispatch(this.onUserSpeechStarted, payload, {
      spanEvent: "patter.event.user_speech_started",
      spanAttrs: filterUndef({
        "patter.audio.offset_ms": payload.audio_offset_ms,
        "patter.vad.confidence": payload.vad_confidence
      })
    });
  }
  /** Fire on the VAD trailing edge (raw — *not* EOU).
   *
   * `speechDurationMs` is the length of the segment that just ended; the
   * runner uses it to compute talk-ratio.
   */
  async fireUserSpeechEnded(opts) {
    const tsMs = opts.timestampMs ?? nowMs();
    const payload = {
      timestamp_ms: tsMs,
      speech_duration_ms: opts.speechDurationMs
    };
    if (opts.vadConfidence !== void 0)
      payload.vad_confidence = opts.vadConfidence;
    const offset = this.resolveOffset(opts.audioOffsetMs, tsMs);
    if (offset !== null) payload.audio_offset_ms = offset;
    this.userState = "listening";
    await this.dispatch(this.onUserSpeechEnded, payload, {
      spanEvent: "patter.event.user_speech_ended",
      spanAttrs: { "patter.speech.duration_ms": opts.speechDurationMs }
    });
  }
  /** Fire on the committed end-of-utterance.
   *
   * This is the canonical "user finished" signal — VAD edge + trailing
   * silence + (optionally) a semantic turn-detector model agreement. The
   * runner uses the timestamp of this event to compute
   * `eos_to_first_token_ms` (Hamming AI threshold: <800 ms good, >1500 ms
   * critical).
   *
   * Side effects: increments the turn index, resets the per-turn cursors and
   * moves the agent state to `thinking` before the callback runs.
   */
  async fireUserSpeechEos(opts) {
    const tsMs = opts.timestampMs ?? nowMs();
    const payload = {
      timestamp_ms: tsMs,
      trigger: opts.trigger
    };
    if (opts.trailingSilenceMs !== void 0)
      payload.trailing_silence_ms = opts.trailingSilenceMs;
    if (opts.transcriptSoFar !== void 0)
      payload.transcript_so_far = opts.transcriptSoFar;
    this.turnIdxValue += 1;
    this.resetTurnState();
    this.userState = "listening";
    this.agentState = "thinking";
    await this.dispatch(this.onUserSpeechEos, payload, {
      spanEvent: "patter.event.user_speech_eos",
      spanAttrs: filterUndef({
        "patter.eos.trigger": opts.trigger,
        "patter.eos.trailing_silence_ms": opts.trailingSilenceMs
      })
    });
  }
  // ---- Agent-side events ----------------------------------------------
  /** Fire on the FIRST audio chunk of the current agent turn that crosses
   * to the wire (not the first chunk produced by TTS).
   *
   * The user hears the wire chunk, so this is the timestamp the runner
   * anchors barge-in latency on.
   */
  async fireAgentSpeechStarted(opts = {}) {
    const tsMs = opts.timestampMs ?? nowMs();
    const payload = {
      timestamp_ms: tsMs,
      turn_idx: this.turnIdxValue
    };
    if (opts.ttsProvider !== void 0) payload.tts_provider = opts.ttsProvider;
    if (opts.engine !== void 0) payload.engine = opts.engine;
    this.agentState = "speaking";
    await this.dispatch(this.onAgentSpeechStarted, payload, {
      spanEvent: "patter.event.agent_speech_started",
      spanAttrs: filterUndef({
        "patter.turn.idx": this.turnIdxValue,
        "patter.tts.provider": opts.ttsProvider,
        "patter.engine": opts.engine
      })
    });
  }
  /** Fire on the LAST audio chunk of the current agent turn.
   *
   * `interrupted=true` marks the turn as cancelled by barge-in; the runner
   * treats it as the `agent_speech_stopped` half of a barge-in pair.
   */
  async fireAgentSpeechEnded(opts) {
    const tsMs = opts.timestampMs ?? nowMs();
    const interrupted = opts.interrupted ?? false;
    const payload = {
      timestamp_ms: tsMs,
      turn_idx: this.turnIdxValue,
      speech_duration_ms: opts.speechDurationMs,
      interrupted
    };
    this.agentState = "idle";
    await this.dispatch(this.onAgentSpeechEnded, payload, {
      spanEvent: "patter.event.agent_speech_ended",
      spanAttrs: {
        "patter.turn.idx": this.turnIdxValue,
        "patter.speech.duration_ms": opts.speechDurationMs,
        "patter.turn.interrupted": interrupted
      }
    });
  }
  // ---- LLM / TTS events -----------------------------------------------
  /** Fire on the FIRST LLM token of the current turn (TTFT marker).
   *
   * Idempotent within a turn — guarded by `firstTokenForTurn`. Combined
   * with `on_user_speech_eos.timestamp_ms` the runner computes
   * `eos_to_first_token_ms`.
   */
  async fireLlmFirstToken(opts) {
    if (!this.firstTokenForTurn) return;
    this.firstTokenForTurn = false;
    const tsMs = opts.timestampMs ?? nowMs();
    const payload = {
      timestamp_ms: tsMs,
      turn_idx: this.turnIdxValue,
      llm_provider: opts.llmProvider,
      model: opts.model
    };
    await this.dispatch(this.onLlmToken, payload, {
      spanEvent: "patter.event.llm_first_token",
      spanAttrs: {
        "gen_ai.request.model": opts.model,
        "gen_ai.provider.name": opts.llmProvider,
        "patter.turn.idx": this.turnIdxValue
      }
    });
  }
  /** Fire on the FIRST TTS audio chunk for the current turn.
   *
   * Distinct from `fireAgentSpeechStarted`: this is the agent-side buffer
   * arrival (TTS warmup), not the wire-time chunk. Idempotent within a
   * turn — guarded by `firstAudioForTurn`.
   */
  async fireAudioOut(opts) {
    if (!this.firstAudioForTurn) return;
    this.firstAudioForTurn = false;
    const tsMs = opts.timestampMs ?? nowMs();
    const payload = {
      timestamp_ms: tsMs,
      turn_idx: this.turnIdxValue,
      tts_provider: opts.ttsProvider
    };
    await this.dispatch(this.onAudioOut, payload, {
      spanEvent: "patter.event.tts_first_audio",
      spanAttrs: {
        "patter.turn.idx": this.turnIdxValue,
        "patter.tts.provider": opts.ttsProvider
      }
    });
  }
  // ---- Internal -------------------------------------------------------
  /**
   * Pick the audio offset for a payload: the caller-supplied value when
   * given, otherwise `tsMs - callStartMs` clamped at 0, otherwise `null`
   * when no call start has been recorded.
   */
  resolveOffset(given, tsMs) {
    if (given !== void 0) return given;
    if (this.callStartMs !== null) return Math.max(0, tsMs - this.callStartMs);
    return null;
  }
  /**
   * Record the span event, then invoke the user callback (if any).
   *
   * Exceptions raised by the callback are logged as warnings and never
   * propagate to the caller.
   */
  async dispatch(cb, payload, opts) {
    recordSpanEvent(opts.spanEvent, opts.spanAttrs);
    // Nullish guard: a slot cleared with `undefined` (instead of `null`)
    // must count as "no callback" rather than raising "cb is not a function"
    // and logging a warning on every event.
    if (cb == null) return;
    try {
      await cb(payload);
    } catch (err) {
      logger.warn?.(
        `Speech-event callback ${opts.spanEvent} raised: ${String(err)}`
      );
    }
  }
};
514
/**
 * Return a shallow copy of `obj` without the entries whose value is
 * `undefined`. Other falsy values (`null`, `0`, `""`, `false`) are kept.
 */
function filterUndef(obj) {
  return Object.entries(obj).reduce((kept, [key, value]) => {
    if (value !== void 0) kept[key] = value;
    return kept;
  }, {});
}
521
+
147
522
  // src/client.ts
523
/**
 * Map the `persist` constructor option onto a log root.
 *
 * - `false`   → `null`
 * - `true`    → `resolveLogRoot("auto")`
 * - a string  → `resolveLogRoot(persist)`
 * - otherwise → `resolveLogRoot()` with no argument
 */
function resolvePersistRoot(persist) {
  switch (persist) {
    case false:
      return null;
    case true:
      return resolveLogRoot("auto");
  }
  return typeof persist === "string" ? resolveLogRoot(persist) : resolveLogRoot();
}
148
529
  var Patter = class {
149
530
  localConfig;
150
531
  embeddedServer = null;
151
532
  tunnelHandle = null;
533
+ _tunnelReadyResolve;
534
+ _tunnelReadyReject;
535
+ _tunnelReady;
536
+ _readyResolve;
537
+ _readyReject;
538
+ _ready;
539
+ /**
540
+ * True iff ``localConfig.webhookUrl`` was populated by ``serve()`` from a
541
+ * freshly-started cloudflared tunnel (rather than by the constructor from
542
+ * an explicit ``webhookUrl`` / ``StaticTunnel`` config). ``disconnect()``
543
+ * uses this flag to clear ONLY the auto-assigned hostname so a subsequent
544
+ * ``serve()`` call (e.g. from a plugin's ``ensureServing`` cycle that
545
+ * disposes + restarts on agent-identity changes) does not throw
546
+ * ``Cannot use both tunnel: true and webhookUrl``.
547
+ */
548
+ tunnelOwnsWebhookUrl = false;
549
+ /**
550
+ * Speech-edge events for turn-taking instrumentation. Public surface: the
551
+ * seven `on*` proxy accessors below plus the `conversationState` snapshot.
552
+ * Defaults are no-ops — existing users who never set a callback see exactly
553
+ * the previous behaviour.
554
+ *
555
+ * See `src/_speech-events.ts` for the full event taxonomy and the
556
+ * industry-alignment table (LiveKit / Pipecat / OpenAI Realtime).
557
+ */
558
+ speechEvents = new SpeechEvents();
559
+ // ---- Speech-edge event callback proxies ------------------------------
560
+ // The seven `on*` properties below mirror the public APIs of LiveKit
561
+ // Agents, Pipecat and OpenAI Realtime. They proxy to `speechEvents` so
562
+ // the dispatcher remains the single source of truth (state + OTel).
563
+ get onUserSpeechStarted() {
564
+ return this.speechEvents.onUserSpeechStarted;
565
+ }
566
+ set onUserSpeechStarted(cb) {
567
+ this.speechEvents.onUserSpeechStarted = cb;
568
+ }
569
+ get onUserSpeechEnded() {
570
+ return this.speechEvents.onUserSpeechEnded;
571
+ }
572
+ set onUserSpeechEnded(cb) {
573
+ this.speechEvents.onUserSpeechEnded = cb;
574
+ }
575
+ get onUserSpeechEos() {
576
+ return this.speechEvents.onUserSpeechEos;
577
+ }
578
+ set onUserSpeechEos(cb) {
579
+ this.speechEvents.onUserSpeechEos = cb;
580
+ }
581
+ get onAgentSpeechStarted() {
582
+ return this.speechEvents.onAgentSpeechStarted;
583
+ }
584
+ set onAgentSpeechStarted(cb) {
585
+ this.speechEvents.onAgentSpeechStarted = cb;
586
+ }
587
+ get onAgentSpeechEnded() {
588
+ return this.speechEvents.onAgentSpeechEnded;
589
+ }
590
+ set onAgentSpeechEnded(cb) {
591
+ this.speechEvents.onAgentSpeechEnded = cb;
592
+ }
593
+ get onLlmToken() {
594
+ return this.speechEvents.onLlmToken;
595
+ }
596
+ set onLlmToken(cb) {
597
+ this.speechEvents.onLlmToken = cb;
598
+ }
599
+ get onAudioOut() {
600
+ return this.speechEvents.onAudioOut;
601
+ }
602
+ set onAudioOut(cb) {
603
+ this.speechEvents.onAudioOut = cb;
604
+ }
605
+ /**
606
+ * Snapshot of the current per-side state of the call.
607
+ * Mirrors LiveKit's `user_state_changed` / `agent_state_changed`
608
+ * payloads. Read-only and safe to call at any time.
609
+ */
610
+ get conversationState() {
611
+ return this.speechEvents.conversationState;
612
+ }
152
613
  /**
153
614
  * Live `MetricsStore` for the embedded server. Returns `null` before
154
615
  * `serve()` is called. Exposed so integrations like `PatterTool` can
@@ -158,6 +619,42 @@ var Patter = class {
158
619
  get metricsStore() {
159
620
  return this.embeddedServer?.metricsStore ?? null;
160
621
  }
622
+ /**
623
+ * Resolves to the public webhook hostname as soon as it is known —
624
+ * either statically configured or freshly minted by the tunnel.
625
+ *
626
+ * **Prefer `phone.ready` for outbound calls.** This promise resolves
627
+ * before the embedded HTTP / WebSocket server is in `listen` state, so
628
+ * a `phone.call` placed immediately afterwards can still race the
629
+ * Twilio Media Streams upgrade and produce a "11100 Invalid URL
630
+ * format" call drop on answer.
631
+ *
632
+ * Kept as a separate signal because some integrations (e.g. webhook
633
+ * registration) only need the hostname, not the WS server.
634
+ */
635
+ get tunnelReady() {
636
+ return this._tunnelReady;
637
+ }
638
+ /**
639
+ * Resolves to the public webhook hostname once the SDK is fully ready
640
+ * to handle carrier callbacks: tunnel resolved, carrier auto-config
641
+ * complete, and the embedded HTTP / WS server in `listen` state.
642
+ *
643
+ * Use this for outbound calls instead of guessing `setTimeout` after
644
+ * `void phone.serve(...)`:
645
+ *
646
+ * ```ts
647
+ * void phone.serve({ agent, tunnel: true });
648
+ * await phone.ready;
649
+ * await phone.call({ to: '+15550001234', agent });
650
+ * ```
651
+ *
652
+ * Rejects with the underlying exception if `serve()` fails before the
653
+ * server is listening.
654
+ */
655
+ get ready() {
656
+ return this._ready;
657
+ }
161
658
  constructor(options) {
162
659
  if (options.apiKey !== void 0) {
163
660
  throw new Error(
@@ -190,10 +687,27 @@ var Patter = class {
190
687
  phoneNumber: options.phoneNumber,
191
688
  webhookUrl: normalizedWebhook,
192
689
  tunnel: options.tunnel,
193
- openaiKey: options.openaiKey
690
+ openaiKey: options.openaiKey,
691
+ persistRoot: resolvePersistRoot(options.persist)
194
692
  };
693
+ this._tunnelReady = new Promise((resolve, reject) => {
694
+ this._tunnelReadyResolve = resolve;
695
+ this._tunnelReadyReject = reject;
696
+ });
697
+ this._tunnelReady.catch(() => {
698
+ });
699
+ if (normalizedWebhook) {
700
+ this._tunnelReadyResolve(normalizedWebhook);
701
+ }
702
+ this._ready = new Promise((resolve, reject) => {
703
+ this._readyResolve = resolve;
704
+ this._readyReject = reject;
705
+ });
706
+ this._ready.catch(() => {
707
+ });
195
708
  }
196
709
  // === Agent definition ===
710
+ /** Resolve user-supplied agent options against engine defaults and return the merged config. */
197
711
  agent(opts) {
198
712
  let working = { ...opts };
199
713
  if (opts.engine) {
@@ -258,10 +772,23 @@ var Patter = class {
258
772
  if (working.variables !== void 0 && (typeof working.variables !== "object" || Array.isArray(working.variables))) {
259
773
  throw new TypeError("variables must be an object");
260
774
  }
775
+ if (working.tools) {
776
+ validateAllToolSchemas(working.tools);
777
+ }
261
778
  return working;
262
779
  }
263
780
  // === Serve / test / call ===
781
+ /** Boot the embedded HTTP/WebSocket server, configure the carrier webhook, and resolve `ready`. */
264
782
  async serve(opts) {
783
+ try {
784
+ await this._serveImpl(opts);
785
+ } catch (err) {
786
+ const e = err instanceof Error ? err : new Error(String(err));
787
+ this._readyReject(e);
788
+ throw e;
789
+ }
790
+ }
791
+ async _serveImpl(opts) {
265
792
  if (!opts.agent || typeof opts.agent !== "object") {
266
793
  throw new TypeError("agent is required. Use phone.agent() to create one.");
267
794
  }
@@ -286,31 +813,44 @@ var Patter = class {
286
813
  if (wantsCloudflared && webhookUrl) {
287
814
  throw new Error("Cannot use both tunnel: true and webhookUrl. Pick one.");
288
815
  }
289
- const { showBanner } = await import("./banner-3GNZ6VQK.mjs");
816
+ const { showBanner } = await import("./banner-UYW6UM3J.mjs");
290
817
  showBanner();
291
818
  if (wantsCloudflared) {
292
- const { startTunnel: startTunnel2 } = await import("./tunnel-UVR3PPAU.mjs");
293
- this.tunnelHandle = await startTunnel2(port);
294
- webhookUrl = this.tunnelHandle.hostname;
295
- this.localConfig = { ...this.localConfig, webhookUrl };
819
+ try {
820
+ const { startTunnel: startTunnel2 } = await import("./tunnel-43CHWPVQ.mjs");
821
+ this.tunnelHandle = await startTunnel2(port);
822
+ webhookUrl = this.tunnelHandle.hostname;
823
+ this.localConfig = { ...this.localConfig, webhookUrl };
824
+ this.tunnelOwnsWebhookUrl = true;
825
+ this._tunnelReadyResolve(webhookUrl);
826
+ } catch (err) {
827
+ const e = err instanceof Error ? err : new Error(String(err));
828
+ this._tunnelReadyReject(e);
829
+ throw e;
830
+ }
296
831
  }
297
832
  if (!webhookUrl) {
298
- throw new Error(
833
+ const err = new Error(
299
834
  "No webhookUrl configured. Either:\n - Pass webhookUrl in the Patter constructor\n - Use tunnel: true in serve() to auto-create a tunnel"
300
835
  );
836
+ this._tunnelReadyReject(err);
837
+ throw err;
301
838
  }
302
839
  const carrier = this.localConfig.carrier;
303
840
  const telephonyProvider = carrier.kind === "twilio" ? "twilio" : "telnyx";
304
- const { autoConfigureCarrier } = await import("./carrier-config-33HQ2W4V.mjs");
305
- await autoConfigureCarrier({
306
- telephonyProvider,
307
- twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
308
- twilioToken: carrier.kind === "twilio" ? carrier.authToken : void 0,
309
- telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
310
- telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
311
- phoneNumber: this.localConfig.phoneNumber,
312
- webhookHost: webhookUrl
313
- });
841
+ const wantsCarrierManagement = opts.manageWebhook !== false || wantsCloudflared;
842
+ if (wantsCarrierManagement) {
843
+ const { autoConfigureCarrier } = await import("./carrier-config-4ZKVYAWV.mjs");
844
+ await autoConfigureCarrier({
845
+ telephonyProvider,
846
+ twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
847
+ twilioToken: carrier.kind === "twilio" ? carrier.authToken : void 0,
848
+ telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
849
+ telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
850
+ phoneNumber: this.localConfig.phoneNumber,
851
+ webhookHost: webhookUrl
852
+ });
853
+ }
314
854
  this.embeddedServer = new EmbeddedServer(
315
855
  {
316
856
  twilioSid: carrier.kind === "twilio" ? carrier.accountSid : void 0,
@@ -321,7 +861,8 @@ var Patter = class {
321
861
  telephonyProvider,
322
862
  telnyxKey: carrier.kind === "telnyx" ? carrier.apiKey : void 0,
323
863
  telnyxConnectionId: carrier.kind === "telnyx" ? carrier.connectionId : void 0,
324
- telnyxPublicKey: carrier.kind === "telnyx" ? carrier.publicKey : void 0
864
+ telnyxPublicKey: carrier.kind === "telnyx" ? carrier.publicKey : void 0,
865
+ persistRoot: this.localConfig.persistRoot
325
866
  },
326
867
  opts.agent,
327
868
  opts.onCallStart,
@@ -335,10 +876,21 @@ var Patter = class {
335
876
  opts.dashboard ?? true,
336
877
  opts.dashboardToken ?? ""
337
878
  );
338
- await this.embeddedServer.start(port);
879
+ try {
880
+ await this.embeddedServer.start(port);
881
+ if (this.tunnelHandle) {
882
+ await waitForTunnelPubliclyReachable(webhookUrl);
883
+ }
884
+ this._readyResolve(webhookUrl);
885
+ } catch (err) {
886
+ const e = err instanceof Error ? err : new Error(String(err));
887
+ this._readyReject(e);
888
+ throw e;
889
+ }
339
890
  }
891
+ /** Run the agent in interactive terminal-test mode (no real telephony). */
340
892
  async test(opts) {
341
- const { TestSession: TestSession2 } = await import("./test-mode-MVJ3SKG4.mjs");
893
+ const { TestSession: TestSession2 } = await import("./test-mode-Y7YG5LFZ.mjs");
342
894
  const session = new TestSession2();
343
895
  await session.run({
344
896
  agent: opts.agent,
@@ -348,6 +900,7 @@ var Patter = class {
348
900
  onCallEnd: opts.onCallEnd
349
901
  });
350
902
  }
903
+ /** Place an outbound call via the configured carrier. */
351
904
  async call(options) {
352
905
  if (!options.to) {
353
906
  throw new Error("'to' phone number is required");
@@ -357,6 +910,10 @@ var Patter = class {
357
910
  }
358
911
  const { phoneNumber, webhookUrl, carrier } = this.localConfig;
359
912
  const effectiveRingTimeout = options.ringTimeout === void 0 ? 25 : options.ringTimeout;
913
+ const wantsAmd = options.machineDetection !== false || Boolean(options.voicemailMessage);
914
+ if (this.embeddedServer) {
915
+ this.embeddedServer.onMachineDetection = options.onMachineDetection;
916
+ }
360
917
  if (carrier.kind === "telnyx") {
361
918
  const telnyxKey = carrier.apiKey;
362
919
  const connectionId = carrier.connectionId;
@@ -365,6 +922,9 @@ var Patter = class {
365
922
  from: phoneNumber,
366
923
  to: options.to
367
924
  };
925
+ if (wantsAmd) {
926
+ telnyxPayload.answering_machine_detection = "greeting_end";
927
+ }
368
928
  if (effectiveRingTimeout !== null && effectiveRingTimeout !== void 0) {
369
929
  telnyxPayload.timeout_secs = Math.max(1, Math.floor(effectiveRingTimeout));
370
930
  }
@@ -407,12 +967,12 @@ var Patter = class {
407
967
  From: phoneNumber,
408
968
  Twiml: inlineTwiml,
409
969
  StatusCallback: statusCallbackUrl,
410
- StatusCallbackMethod: "POST",
411
- // Full lifecycle so the dashboard sees ringing/no-answer/busy/failed
412
- // transitions even when media never arrives.
413
- StatusCallbackEvent: "initiated ringing answered completed"
970
+ StatusCallbackMethod: "POST"
414
971
  });
415
- if (options.machineDetection) {
972
+ for (const evt of ["initiated", "ringing", "answered", "completed"]) {
973
+ params.append("StatusCallbackEvent", evt);
974
+ }
975
+ if (wantsAmd) {
416
976
  params.append("MachineDetection", "DetectMessageEnd");
417
977
  params.append("AsyncAmd", "true");
418
978
  params.append("AsyncAmdStatusCallback", `https://${webhookUrl}/webhooks/twilio/amd`);
@@ -445,11 +1005,22 @@ var Patter = class {
445
1005
  callee: options.to,
446
1006
  direction: "outbound"
447
1007
  });
1008
+ const notificationsPath = body.subresource_uris?.notifications;
1009
+ if (notificationsPath) {
1010
+ getLogger().info(
1011
+ `Outbound call ${callSid} placed. Twilio notifications: https://api.twilio.com${notificationsPath} (check here if the call drops with no audio).`
1012
+ );
1013
+ }
448
1014
  }
449
1015
  } catch {
450
1016
  }
451
1017
  }
452
1018
  }
1019
+ /**
1020
+ * Stop the embedded server and any running tunnel. Safe to call multiple
1021
+ * times. Leaves the instance reusable: a subsequent ``serve()`` works as
1022
+ * if the previous lifecycle never happened.
1023
+ */
453
1024
  async disconnect() {
454
1025
  if (this.tunnelHandle) {
455
1026
  this.tunnelHandle.stop();
@@ -459,10 +1030,116 @@ var Patter = class {
459
1030
  await this.embeddedServer.stop();
460
1031
  this.embeddedServer = null;
461
1032
  }
1033
+ if (this.tunnelOwnsWebhookUrl) {
1034
+ this.localConfig = { ...this.localConfig, webhookUrl: void 0 };
1035
+ this.tunnelOwnsWebhookUrl = false;
1036
+ }
1037
+ this._tunnelReady = new Promise((resolve, reject) => {
1038
+ this._tunnelReadyResolve = resolve;
1039
+ this._tunnelReadyReject = reject;
1040
+ });
1041
+ this._tunnelReady.catch(() => {
1042
+ });
1043
+ if (this.localConfig.webhookUrl) {
1044
+ this._tunnelReadyResolve(this.localConfig.webhookUrl);
1045
+ }
1046
+ this._ready = new Promise((resolve, reject) => {
1047
+ this._readyResolve = resolve;
1048
+ this._readyReject = reject;
1049
+ });
1050
+ this._ready.catch(() => {
1051
+ });
1052
+ }
1053
+ /**
1054
+ * Terminate an active call on the configured carrier.
1055
+ *
1056
+ * Posts a hangup to the carrier (Twilio
1057
+ * ``Calls(callSid).update({status:'completed'})`` or Telnyx
1058
+ * ``/v2/calls/{callControlId}/actions/hangup``) so the bridge tears down
1059
+ * gracefully — the SDK's WebSocket handler then fires ``onCallEnd`` with
1060
+ * the final ``CallMetrics`` before the WS closes.
1061
+ *
1062
+ * Use this when the host application needs to end a call programmatically
1063
+ * without going through the LLM tool-call path (e.g. an admin override,
1064
+ * a watchdog, or an integration test runner).
1065
+ *
1066
+ * @param callSid - Carrier-issued call identifier (Twilio Call SID or
1067
+ * Telnyx call_control_id) returned from a previous ``call(...)`` or
1068
+ * captured in the ``onCallStart`` callback's payload.
1069
+ * @throws Error when ``callSid`` is empty or no carrier is configured.
1070
+ */
1071
+ async endCall(callSid) {
1072
+ if (!callSid) {
1073
+ throw new Error("callSid must be a non-empty string");
1074
+ }
1075
+ const carrier = this.localConfig.carrier;
1076
+ if (carrier.kind === "twilio") {
1077
+ const auth = Buffer.from(`${carrier.accountSid}:${carrier.authToken}`).toString("base64");
1078
+ const url = `https://api.twilio.com/2010-04-01/Accounts/${carrier.accountSid}/Calls/${callSid}.json`;
1079
+ const body = new URLSearchParams({ Status: "completed" });
1080
+ const res = await fetch(url, {
1081
+ method: "POST",
1082
+ headers: {
1083
+ Authorization: `Basic ${auth}`,
1084
+ "Content-Type": "application/x-www-form-urlencoded"
1085
+ },
1086
+ body
1087
+ });
1088
+ if (!res.ok) {
1089
+ throw new Error(`Twilio hangup failed: ${res.status} ${await res.text()}`);
1090
+ }
1091
+ return;
1092
+ }
1093
+ if (carrier.kind === "telnyx") {
1094
+ const res = await fetch(`https://api.telnyx.com/v2/calls/${callSid}/actions/hangup`, {
1095
+ method: "POST",
1096
+ headers: {
1097
+ Authorization: `Bearer ${carrier.apiKey}`,
1098
+ "Content-Type": "application/json"
1099
+ }
1100
+ });
1101
+ if (!res.ok) {
1102
+ throw new Error(`Telnyx hangup failed: ${res.status} ${await res.text()}`);
1103
+ }
1104
+ return;
1105
+ }
1106
+ throw new Error(`endCall() requires a configured carrier; got kind=${carrier.kind}`);
462
1107
  }
463
1108
  };
1109
+ async function waitForTunnelPubliclyReachable(hostname, totalTimeoutMs = 6e4, graceMs = 5e3) {
1110
+ const log = getLogger();
1111
+ const { Resolver } = await import("dns/promises");
1112
+ const resolver = new Resolver({ timeout: 1500, tries: 1 });
1113
+ resolver.setServers(["1.1.1.1", "8.8.8.8"]);
1114
+ const deadline = Date.now() + totalTimeoutMs;
1115
+ let attempt = 0;
1116
+ let lastErr;
1117
+ while (Date.now() < deadline) {
1118
+ attempt += 1;
1119
+ try {
1120
+ const records = await resolver.resolve4(hostname);
1121
+ const first = records[0] ?? "<unknown>";
1122
+ log.info(
1123
+ "Tunnel DNS resolved \u2192 %s (attempt %d); waiting %d ms grace",
1124
+ first,
1125
+ attempt,
1126
+ graceMs
1127
+ );
1128
+ await new Promise((r) => setTimeout(r, graceMs));
1129
+ return;
1130
+ } catch (err) {
1131
+ lastErr = err;
1132
+ }
1133
+ const delay = Math.min(250 * Math.pow(1.6, attempt - 1), 2e3);
1134
+ await new Promise((r) => setTimeout(r, delay));
1135
+ }
1136
+ throw new Error(
1137
+ `Tunnel hostname ${hostname} did not resolve within ${totalTimeoutMs}ms. Last error: ${lastErr instanceof Error ? lastErr.message : String(lastErr)}`
1138
+ );
1139
+ }
464
1140
 
465
- // src/tool-decorator.ts
1141
+ // src/tools/tool-decorator.ts
1142
+ init_esm_shims();
466
1143
  function defineTool(input) {
467
1144
  const properties = {};
468
1145
  const required = [];
@@ -492,6 +1169,7 @@ function defineTool(input) {
492
1169
  }
493
1170
 
494
1171
  // src/text-transforms.ts
1172
+ init_esm_shims();
495
1173
  function filterMarkdown(text) {
496
1174
  let result = text;
497
1175
  result = result.replace(/```[\s\S]*?```/g, (match) => {
@@ -524,6 +1202,7 @@ function filterForTTS(text) {
524
1202
  }
525
1203
 
526
1204
  // src/providers.ts
1205
+ init_esm_shims();
527
1206
  var STTConfigImpl = class {
528
1207
  provider;
529
1208
  apiKey;
@@ -581,10 +1260,8 @@ function openaiTts(opts) {
581
1260
  function soniox(opts) {
582
1261
  return new STTConfigImpl("soniox", opts.apiKey, opts.language ?? "en");
583
1262
  }
584
- function speechmatics(_opts) {
585
- throw new Error(
586
- "speechmatics() is Python-only right now \u2014 the TS Speechmatics adapter has not shipped yet. Use the Python SDK (sdk-py) or pick another STT provider such as deepgram() / assemblyai() / soniox()."
587
- );
1263
+ function speechmatics(opts) {
1264
+ return new STTConfigImpl("speechmatics", opts.apiKey, opts.language ?? "en");
588
1265
  }
589
1266
  function assemblyai(opts) {
590
1267
  return new STTConfigImpl("assemblyai", opts.apiKey, opts.language ?? "en");
@@ -620,6 +1297,7 @@ function geminiLive(opts) {
620
1297
  }
621
1298
 
622
1299
  // src/fallback-provider.ts
1300
+ init_esm_shims();
623
1301
  var AllProvidersFailedError = class extends Error {
624
1302
  constructor(message) {
625
1303
  super(message);
@@ -698,6 +1376,7 @@ var FallbackLLMProvider = class {
698
1376
  // -----------------------------------------------------------------------
699
1377
  // LLMProvider implementation
700
1378
  // -----------------------------------------------------------------------
1379
+ /** Streaming entry point — yields chunks from the first provider that succeeds. */
701
1380
  async *stream(messages, tools) {
702
1381
  const errors = [];
703
1382
  const result = yield* this.tryProviders(
@@ -816,7 +1495,11 @@ var FallbackLLMProvider = class {
816
1495
  }
817
1496
  };
818
1497
 
1498
+ // src/integrations/index.ts
1499
+ init_esm_shims();
1500
+
819
1501
  // src/integrations/patter-tool.ts
1502
+ init_esm_shims();
820
1503
  import { EventEmitter } from "events";
821
1504
  var PARAMETERS_SCHEMA = {
822
1505
  type: "object",
@@ -970,6 +1653,7 @@ var PatterTool = class _PatterTool {
970
1653
  this.started = false;
971
1654
  }
972
1655
  // --- Execution ----------------------------------------------------------
1656
+ /** Place an outbound call and resolve once it ends with the transcript and metrics. */
973
1657
  async execute(args) {
974
1658
  if (!this.started) await this.start();
975
1659
  if (!args || typeof args.to !== "string" || !args.to.startsWith("+")) {
@@ -1079,6 +1763,7 @@ var PatterTool = class _PatterTool {
1079
1763
  };
1080
1764
 
1081
1765
  // src/providers/gemini-live.ts
1766
+ init_esm_shims();
1082
1767
  var GEMINI_DEFAULT_INPUT_SR = 16e3;
1083
1768
  var GEMINI_DEFAULT_OUTPUT_SR = 24e3;
1084
1769
  var GeminiLiveAdapter = class {
@@ -1093,6 +1778,7 @@ var GeminiLiveAdapter = class {
1093
1778
  this.outputSampleRate = options.outputSampleRate ?? GEMINI_DEFAULT_OUTPUT_SR;
1094
1779
  this.temperature = options.temperature ?? 0.8;
1095
1780
  }
1781
+ apiKey;
1096
1782
  model;
1097
1783
  voice;
1098
1784
  instructions;
@@ -1113,6 +1799,7 @@ var GeminiLiveAdapter = class {
1113
1799
  * not the call_id).
1114
1800
  */
1115
1801
  pendingToolCalls = /* @__PURE__ */ new Map();
1802
+ /** Lazily import @google/genai, open a Live session, and start the receive loop. */
1116
1803
  async connect() {
1117
1804
  let genaiModule;
1118
1805
  try {
@@ -1160,6 +1847,7 @@ var GeminiLiveAdapter = class {
1160
1847
  getLogger().error(`Gemini Live receive loop error: ${String(err)}`);
1161
1848
  });
1162
1849
  }
1850
+ /** Send a PCM audio chunk to Gemini as base64 inline data. */
1163
1851
  sendAudio(pcm) {
1164
1852
  if (!this.session || !this.running) return;
1165
1853
  const mime = `audio/pcm;rate=${this.inputSampleRate}`;
@@ -1173,6 +1861,7 @@ var GeminiLiveAdapter = class {
1173
1861
  );
1174
1862
  }
1175
1863
  }
1864
+ /** Send a text turn to Gemini and mark the turn complete. */
1176
1865
  async sendText(text) {
1177
1866
  if (!this.session) return;
1178
1867
  const sess = this.session;
@@ -1181,6 +1870,7 @@ var GeminiLiveAdapter = class {
1181
1870
  turnComplete: true
1182
1871
  });
1183
1872
  }
1873
+ /** Send a tool/function-call result back to Gemini. */
1184
1874
  async sendFunctionResult(callId, result) {
1185
1875
  if (!this.session) return;
1186
1876
  const sess = this.session;
@@ -1192,9 +1882,11 @@ var GeminiLiveAdapter = class {
1192
1882
  ]
1193
1883
  });
1194
1884
  }
1885
+ /** No-op — Gemini Live barge-in is VAD-driven, not client-cancelled. */
1195
1886
  cancelResponse() {
1196
1887
  getLogger().debug("Gemini Live: cancelResponse is implicit via VAD");
1197
1888
  }
1889
+ /** Register an event handler that receives every Gemini Live event. */
1198
1890
  onEvent(handler) {
1199
1891
  this.handlers.push(handler);
1200
1892
  }
@@ -1251,6 +1943,7 @@ var GeminiLiveAdapter = class {
1251
1943
  this.running = false;
1252
1944
  }
1253
1945
  }
1946
+ /** Close the Gemini Live session and stop the receive loop. */
1254
1947
  async close() {
1255
1948
  this.running = false;
1256
1949
  if (this.session) {
@@ -1271,6 +1964,7 @@ var GeminiLiveAdapter = class {
1271
1964
  };
1272
1965
 
1273
1966
  // src/providers/ultravox-realtime.ts
1967
+ init_esm_shims();
1274
1968
  import WebSocket from "ws";
1275
1969
  var ULTRAVOX_DEFAULT_API_BASE = "https://api.ultravox.ai/api";
1276
1970
  var ULTRAVOX_DEFAULT_SR = 16e3;
@@ -1286,6 +1980,7 @@ var UltravoxRealtimeAdapter = class {
1286
1980
  this.sampleRate = options.sampleRate ?? ULTRAVOX_DEFAULT_SR;
1287
1981
  this.firstMessage = options.firstMessage ?? "";
1288
1982
  }
1983
+ apiKey;
1289
1984
  model;
1290
1985
  voice;
1291
1986
  instructions;
@@ -1298,6 +1993,7 @@ var UltravoxRealtimeAdapter = class {
1298
1993
  handlers = [];
1299
1994
  /** Exposed for diagnostics — true while the underlying socket is open. */
1300
1995
  running = false;
1996
+ /** Create the Ultravox call, fetch the joinUrl, and open the WebSocket. */
1301
1997
  async connect() {
1302
1998
  const body = {
1303
1999
  model: this.model,
@@ -1367,14 +2063,17 @@ var UltravoxRealtimeAdapter = class {
1367
2063
  this.running = false;
1368
2064
  });
1369
2065
  }
2066
+ /** Send a binary PCM audio chunk to the Ultravox call. */
1370
2067
  sendAudio(pcm) {
1371
2068
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1372
2069
  this.ws.send(pcm, { binary: true });
1373
2070
  }
2071
+ /** Inject a user text message into the Ultravox conversation. */
1374
2072
  async sendText(text) {
1375
2073
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1376
2074
  this.ws.send(JSON.stringify({ type: "input_text_message", text }));
1377
2075
  }
2076
+ /** Send a tool/function-call result back to Ultravox. */
1378
2077
  async sendFunctionResult(callId, result) {
1379
2078
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1380
2079
  this.ws.send(
@@ -1386,10 +2085,12 @@ var UltravoxRealtimeAdapter = class {
1386
2085
  })
1387
2086
  );
1388
2087
  }
2088
+ /** Clear the playback buffer to interrupt the agent's current response. */
1389
2089
  cancelResponse() {
1390
2090
  if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1391
2091
  this.ws.send(JSON.stringify({ type: "playback_clear_buffer" }));
1392
2092
  }
2093
+ /** Register an event handler that receives every Ultravox event. */
1393
2094
  onEvent(handler) {
1394
2095
  this.handlers.push(handler);
1395
2096
  }
@@ -1436,6 +2137,7 @@ var UltravoxRealtimeAdapter = class {
1436
2137
  await this.emit("speech_started", null);
1437
2138
  }
1438
2139
  }
2140
+ /** Close the Ultravox WebSocket and mark the adapter idle. */
1439
2141
  async close() {
1440
2142
  this.running = false;
1441
2143
  if (this.ws) {
@@ -1461,6 +2163,7 @@ function toolParamsToUltravox(parameters) {
1461
2163
  }
1462
2164
 
1463
2165
  // src/scheduler.ts
2166
+ init_esm_shims();
1464
2167
  var cronModule = null;
1465
2168
  var loadError = null;
1466
2169
  async function loadCron() {
@@ -1469,7 +2172,7 @@ async function loadCron() {
1469
2172
  try {
1470
2173
  const imported = await import(
1471
2174
  /* @vite-ignore */
1472
- "./node-cron-6PRPSBG5.mjs"
2175
+ "./node-cron-JFWQQRBU.mjs"
1473
2176
  );
1474
2177
  cronModule = imported && imported.default ? imported.default : imported;
1475
2178
  return cronModule;
@@ -1576,6 +2279,7 @@ function scheduleInterval(intervalOrOpts, callback) {
1576
2279
  }
1577
2280
 
1578
2281
  // src/stt/deepgram.ts
2282
+ init_esm_shims();
1579
2283
  var STT = class extends DeepgramSTT {
1580
2284
  static providerKey = "deepgram";
1581
2285
  constructor(opts = {}) {
@@ -1602,7 +2306,11 @@ var STT = class extends DeepgramSTT {
1602
2306
  }
1603
2307
  };
1604
2308
 
2309
+ // src/stt/whisper.ts
2310
+ init_esm_shims();
2311
+
1605
2312
  // src/providers/whisper-stt.ts
2313
+ init_esm_shims();
1606
2314
  var OPENAI_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions";
1607
2315
  var DEFAULT_BUFFER_SIZE = 16e3 * 2;
1608
2316
  var ALLOWED_MODELS = /* @__PURE__ */ new Set(["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
@@ -1666,11 +2374,13 @@ var WhisperSTT = class _WhisperSTT {
1666
2374
  static forTwilio(apiKey, language = "en", model = "whisper-1") {
1667
2375
  return new _WhisperSTT(apiKey, language, model);
1668
2376
  }
2377
+ /** Reset the audio buffer and arm the adapter for incoming chunks. */
1669
2378
  async connect() {
1670
2379
  this.running = true;
1671
2380
  this.chunks = [];
1672
2381
  this.bufferedBytes = 0;
1673
2382
  }
2383
+ /** Buffer a PCM16 chunk; flushes to Whisper once `bufferSize` bytes are reached. */
1674
2384
  sendAudio(audio) {
1675
2385
  if (!this.running) return;
1676
2386
  this.chunks.push(audio);
@@ -1701,9 +2411,11 @@ var WhisperSTT = class _WhisperSTT {
1701
2411
  onTranscript(callback) {
1702
2412
  this.callbacks.add(callback);
1703
2413
  }
2414
+ /** Remove a previously registered transcript listener. */
1704
2415
  offTranscript(callback) {
1705
2416
  this.callbacks.delete(callback);
1706
2417
  }
2418
+ /** Flush any buffered audio, await pending transcriptions, and clear listeners. */
1707
2419
  async close() {
1708
2420
  this.running = false;
1709
2421
  if (this.bufferedBytes > 0) {
@@ -1781,7 +2493,11 @@ var STT2 = class extends WhisperSTT {
1781
2493
  }
1782
2494
  };
1783
2495
 
2496
+ // src/stt/openai-transcribe.ts
2497
+ init_esm_shims();
2498
+
1784
2499
  // src/providers/openai-transcribe-stt.ts
2500
+ init_esm_shims();
1785
2501
  var ALLOWED_MODELS2 = /* @__PURE__ */ new Set(["gpt-4o-transcribe", "gpt-4o-mini-transcribe"]);
1786
2502
  var DEFAULT_BUFFER_SIZE2 = 16e3 * 2;
1787
2503
  var OpenAITranscribeSTT = class extends WhisperSTT {
@@ -1817,11 +2533,37 @@ var STT3 = class extends OpenAITranscribeSTT {
1817
2533
  }
1818
2534
  };
1819
2535
 
2536
+ // src/stt/cartesia.ts
2537
+ init_esm_shims();
2538
+
1820
2539
  // src/providers/cartesia-stt.ts
2540
+ init_esm_shims();
1821
2541
  import WebSocket2 from "ws";
2542
+ var CartesiaSTTModel = {
2543
+ INK_WHISPER: "ink-whisper"
2544
+ };
2545
+ var CartesiaSTTEncoding = {
2546
+ PCM_S16LE: "pcm_s16le"
2547
+ };
2548
+ var CartesiaSTTSampleRate = {
2549
+ HZ_8000: 8e3,
2550
+ HZ_16000: 16e3,
2551
+ HZ_24000: 24e3,
2552
+ HZ_44100: 44100,
2553
+ HZ_48000: 48e3
2554
+ };
2555
+ var CartesiaSTTServerEvent = {
2556
+ TRANSCRIPT: "transcript",
2557
+ FLUSH_DONE: "flush_done",
2558
+ DONE: "done",
2559
+ ERROR: "error"
2560
+ };
2561
+ var CartesiaSTTClientFrame = {
2562
+ FINALIZE: "finalize"
2563
+ };
1822
2564
  var DEFAULT_BASE_URL = "https://api.cartesia.ai";
1823
2565
  var API_VERSION = "2025-04-16";
1824
- var USER_AGENT = "Patter/1.0 (integration=LiveKit-port; provider=Cartesia)";
2566
+ var USER_AGENT = "Patter/1.0";
1825
2567
  var KEEPALIVE_INTERVAL_MS = 3e4;
1826
2568
  var CONNECT_TIMEOUT_MS = 1e4;
1827
2569
  var CartesiaSTT = class {
@@ -1832,6 +2574,8 @@ var CartesiaSTT = class {
1832
2574
  throw new Error("CartesiaSTT requires a non-empty apiKey");
1833
2575
  }
1834
2576
  }
2577
+ apiKey;
2578
+ options;
1835
2579
  ws = null;
1836
2580
  callbacks = /* @__PURE__ */ new Set();
1837
2581
  keepaliveTimer = null;
@@ -1855,15 +2599,16 @@ var CartesiaSTT = class {
1855
2599
  }
1856
2600
  const language = opts.language ?? "en";
1857
2601
  const params = new URLSearchParams({
1858
- model: opts.model ?? "ink-whisper",
1859
- sample_rate: String(opts.sampleRate ?? 16e3),
1860
- encoding: opts.encoding ?? "pcm_s16le",
2602
+ model: opts.model ?? CartesiaSTTModel.INK_WHISPER,
2603
+ sample_rate: String(opts.sampleRate ?? CartesiaSTTSampleRate.HZ_16000),
2604
+ encoding: opts.encoding ?? CartesiaSTTEncoding.PCM_S16LE,
1861
2605
  cartesia_version: API_VERSION,
1862
2606
  api_key: this.apiKey,
1863
2607
  language
1864
2608
  });
1865
2609
  return `${base}/stt/websocket?${params.toString()}`;
1866
2610
  }
2611
+ /** Open the streaming WebSocket and arm message + keepalive handlers. */
1867
2612
  async connect() {
1868
2613
  const url = this.buildWsUrl();
1869
2614
  this.ws = new WebSocket2(url, {
@@ -1903,7 +2648,7 @@ var CartesiaSTT = class {
1903
2648
  }
1904
2649
  handleEvent(event) {
1905
2650
  const type = event.type;
1906
- if (type === "transcript") {
2651
+ if (type === CartesiaSTTServerEvent.TRANSCRIPT) {
1907
2652
  const text = (event.text ?? "").trim();
1908
2653
  const isFinal = Boolean(event.is_final);
1909
2654
  if (!text && !isFinal) return;
@@ -1915,7 +2660,7 @@ var CartesiaSTT = class {
1915
2660
  this.emit({ text, isFinal, confidence });
1916
2661
  return;
1917
2662
  }
1918
- if (type === "error") {
2663
+ if (type === CartesiaSTTServerEvent.ERROR) {
1919
2664
  getLogger().error(`Cartesia STT error: ${event.message ?? "unknown"}`);
1920
2665
  return;
1921
2666
  }
@@ -1925,10 +2670,12 @@ var CartesiaSTT = class {
1925
2670
  cb(transcript);
1926
2671
  }
1927
2672
  }
2673
+ /** Send a binary PCM16-LE audio chunk to Cartesia for transcription. */
1928
2674
  sendAudio(audio) {
1929
2675
  if (!this.ws || this.ws.readyState !== WebSocket2.OPEN) return;
1930
2676
  this.ws.send(audio);
1931
2677
  }
2678
+ /** Register a transcript listener. */
1932
2679
  onTranscript(callback) {
1933
2680
  this.callbacks.add(callback);
1934
2681
  }
@@ -1951,7 +2698,7 @@ var CartesiaSTT = class {
1951
2698
  }
1952
2699
  if (this.ws) {
1953
2700
  try {
1954
- this.ws.send("finalize");
2701
+ this.ws.send(CartesiaSTTClientFrame.FINALIZE);
1955
2702
  } catch {
1956
2703
  }
1957
2704
  this.ws.close();
@@ -1974,7 +2721,7 @@ var CartesiaSTT = class {
1974
2721
  if (ws.readyState === WebSocket2.OPEN) {
1975
2722
  try {
1976
2723
  await new Promise((resolve) => {
1977
- ws.send("finalize", (err) => {
2724
+ ws.send(CartesiaSTTClientFrame.FINALIZE, (err) => {
1978
2725
  if (err) getLogger().warn(`CartesiaSTT finalize send failed: ${String(err)}`);
1979
2726
  resolve();
1980
2727
  });
@@ -2022,12 +2769,33 @@ var STT4 = class extends CartesiaSTT {
2022
2769
  }
2023
2770
  };
2024
2771
 
2772
+ // src/stt/soniox.ts
2773
+ init_esm_shims();
2774
+
2025
2775
  // src/providers/soniox-stt.ts
2776
+ init_esm_shims();
2026
2777
  import WebSocket3 from "ws";
2027
2778
  var SONIOX_WS_URL = "wss://stt-rt.soniox.com/transcribe-websocket";
2028
- var KEEPALIVE_MESSAGE = '{"type": "keepalive"}';
2029
- var END_TOKEN = "<end>";
2030
- var FINALIZED_TOKEN = "<fin>";
2779
+ var SonioxModel = {
2780
+ STT_RT_V4: "stt-rt-v4",
2781
+ STT_RT_V3: "stt-rt-v3",
2782
+ STT_RT_V2: "stt-rt-v2"
2783
+ };
2784
+ var SonioxSampleRate = {
2785
+ HZ_8000: 8e3,
2786
+ HZ_16000: 16e3,
2787
+ HZ_24000: 24e3
2788
+ };
2789
+ var SonioxClientFrame = {
2790
+ KEEPALIVE: "keepalive"
2791
+ };
2792
+ var SonioxEndpointToken = {
2793
+ END: "<end>",
2794
+ FIN: "<fin>"
2795
+ };
2796
+ var KEEPALIVE_MESSAGE = JSON.stringify({ type: SonioxClientFrame.KEEPALIVE });
2797
+ var END_TOKEN = SonioxEndpointToken.END;
2798
+ var FINALIZED_TOKEN = SonioxEndpointToken.FIN;
2031
2799
  var KEEPALIVE_INTERVAL_MS2 = 5e3;
2032
2800
  function isEndToken(token) {
2033
2801
  return token.text === END_TOKEN || token.text === FINALIZED_TOKEN;
@@ -2082,10 +2850,10 @@ var SonioxSTT = class _SonioxSTT {
2082
2850
  throw new Error("maxEndpointDelayMs must be between 500 and 3000");
2083
2851
  }
2084
2852
  this.apiKey = apiKey;
2085
- this.model = options.model ?? "stt-rt-v4";
2853
+ this.model = options.model ?? SonioxModel.STT_RT_V4;
2086
2854
  this.languageHints = options.languageHints;
2087
2855
  this.languageHintsStrict = options.languageHintsStrict ?? false;
2088
- this.sampleRate = options.sampleRate ?? 16e3;
2856
+ this.sampleRate = options.sampleRate ?? SonioxSampleRate.HZ_16000;
2089
2857
  this.numChannels = options.numChannels ?? 1;
2090
2858
  this.enableSpeakerDiarization = options.enableSpeakerDiarization ?? false;
2091
2859
  this.enableLanguageIdentification = options.enableLanguageIdentification ?? true;
@@ -2095,7 +2863,10 @@ var SonioxSTT = class _SonioxSTT {
2095
2863
  }
2096
2864
  /** Factory for Twilio-style 8 kHz linear PCM. */
2097
2865
  static forTwilio(apiKey, languageHints) {
2098
- return new _SonioxSTT(apiKey, { sampleRate: 8e3, languageHints });
2866
+ return new _SonioxSTT(apiKey, {
2867
+ sampleRate: SonioxSampleRate.HZ_8000,
2868
+ languageHints
2869
+ });
2099
2870
  }
2100
2871
  buildConfig() {
2101
2872
  const config = {
@@ -2118,6 +2889,7 @@ var SonioxSTT = class _SonioxSTT {
2118
2889
  }
2119
2890
  return config;
2120
2891
  }
2892
+ /** Open the streaming WebSocket and send the initial config payload. */
2121
2893
  async connect() {
2122
2894
  this.final.reset();
2123
2895
  this.ws = new WebSocket3(this.baseUrl);
@@ -2211,11 +2983,13 @@ var SonioxSTT = class _SonioxSTT {
2211
2983
  cb(transcript);
2212
2984
  }
2213
2985
  }
2986
+ /** Send a binary PCM16-LE audio chunk to Soniox for transcription. */
2214
2987
  sendAudio(audio) {
2215
2988
  if (!this.ws || this.ws.readyState !== WebSocket3.OPEN) return;
2216
2989
  if (audio.length === 0) return;
2217
2990
  this.ws.send(audio);
2218
2991
  }
2992
+ /** Register a transcript listener (max 10 concurrent listeners). */
2219
2993
  onTranscript(callback) {
2220
2994
  if (this.callbacks.length >= 10) {
2221
2995
  getLogger().warn(
@@ -2226,6 +3000,7 @@ var SonioxSTT = class _SonioxSTT {
2226
3000
  }
2227
3001
  this.callbacks.push(callback);
2228
3002
  }
3003
+ /** Send the empty-frame stream terminator and close the WebSocket. */
2229
3004
  close() {
2230
3005
  this.clearKeepalive();
2231
3006
  if (this.ws) {
@@ -2258,8 +3033,41 @@ var STT5 = class extends SonioxSTT {
2258
3033
  }
2259
3034
  };
2260
3035
 
3036
+ // src/stt/assemblyai.ts
3037
+ init_esm_shims();
3038
+
2261
3039
  // src/providers/assemblyai-stt.ts
3040
+ init_esm_shims();
2262
3041
  import WebSocket4 from "ws";
3042
+ var AssemblyAIEncoding = {
3043
+ PCM_S16LE: "pcm_s16le",
3044
+ PCM_MULAW: "pcm_mulaw"
3045
+ };
3046
+ var AssemblyAIModel = {
3047
+ UNIVERSAL_STREAMING_ENGLISH: "universal-streaming-english",
3048
+ UNIVERSAL_STREAMING_MULTILINGUAL: "universal-streaming-multilingual",
3049
+ U3_RT_PRO: "u3-rt-pro",
3050
+ WHISPER_RT: "whisper-rt"
3051
+ };
3052
+ var AssemblyAIDomain = {
3053
+ GENERAL: "general",
3054
+ MEDICAL_V1: "medical-v1"
3055
+ };
3056
+ var AssemblyAISampleRate = {
3057
+ HZ_8000: 8e3,
3058
+ HZ_16000: 16e3
3059
+ };
3060
+ var AssemblyAIEventType = {
3061
+ BEGIN: "Begin",
3062
+ TURN: "Turn",
3063
+ SPEECH_STARTED: "SpeechStarted",
3064
+ TERMINATION: "Termination"
3065
+ };
3066
+ var AssemblyAIClientFrame = {
3067
+ UPDATE_CONFIGURATION: "UpdateConfiguration",
3068
+ FORCE_ENDPOINT: "ForceEndpoint",
3069
+ TERMINATE: "Terminate"
3070
+ };
2263
3071
  var DEFAULT_BASE_URL2 = "wss://streaming.assemblyai.com";
2264
3072
  var DEFAULT_MIN_TURN_SILENCE_MS = 400;
2265
3073
  var CONNECT_TIMEOUT_MS2 = 1e4;
@@ -2267,7 +3075,10 @@ var TERMINATION_WAIT_TIMEOUT_MS = 500;
2267
3075
  var MIN_CHUNK_DURATION_MS = 50;
2268
3076
  var MAX_CHUNK_DURATION_MS = 1e3;
2269
3077
  var RECONNECT_ERROR_CODES = /* @__PURE__ */ new Set([3005, 3008]);
2270
- var VALID_DOMAINS = /* @__PURE__ */ new Set(["general", "medical-v1"]);
3078
+ var VALID_DOMAINS = /* @__PURE__ */ new Set([
3079
+ AssemblyAIDomain.GENERAL,
3080
+ AssemblyAIDomain.MEDICAL_V1
3081
+ ]);
2271
3082
  var AssemblyAISTTNotConnectedError = class extends Error {
2272
3083
  constructor(message = "AssemblyAISTT is not connected") {
2273
3084
  super(message);
@@ -2290,38 +3101,56 @@ var AssemblyAISTT = class _AssemblyAISTT {
2290
3101
  );
2291
3102
  }
2292
3103
  }
3104
+ apiKey;
3105
+ options;
2293
3106
  ws = null;
2294
3107
  callbacks = /* @__PURE__ */ new Set();
2295
3108
  closing = false;
2296
3109
  reconnectAttempts = 0;
2297
3110
  terminationResolve = null;
2298
- /** AssemblyAI session id — set when the `Begin` message arrives. */
3111
+ /**
3112
+ * Coalescing buffer for inbound audio frames. AssemblyAI's v3
3113
+ * streaming endpoint requires each ws frame to carry 50–1000 ms of
3114
+ * audio (server emits error 3007 below 50 ms — observed in the
3115
+ * field as a fully-billed call with zero transcripts). Twilio sends
3116
+ * 20 ms frames, so the SDK must batch ~3 frames before forwarding.
3117
+ *
3118
+ * We accumulate raw bytes here until the cumulative duration crosses
3119
+ * the configured target (default 60 ms — comfortably above the 50 ms
3120
+ * floor with one frame of headroom against jitter), then flush in a
3121
+ * single `ws.send()`.
3122
+ */
3123
+ chunkBuffer = [];
3124
+ chunkBufferBytes = 0;
3125
+ /** Target send size in bytes — recomputed lazily once encoding/sample-rate is known. */
3126
+ chunkBufferTargetBytes = 0;
3127
+ /** AssemblyAI session id — set when the `Begin` message arrives. */
2299
3128
  sessionId = null;
2300
3129
  /** Unix timestamp when the AssemblyAI session expires. */
2301
3130
  expiresAt = null;
2302
3131
  /** Factory for Twilio calls — mulaw 8 kHz. */
2303
- static forTwilio(apiKey, model = "universal-streaming-english") {
3132
+ static forTwilio(apiKey, model = AssemblyAIModel.UNIVERSAL_STREAMING_ENGLISH) {
2304
3133
  return new _AssemblyAISTT(apiKey, {
2305
3134
  model,
2306
- encoding: "pcm_mulaw",
2307
- sampleRate: 8e3
3135
+ encoding: AssemblyAIEncoding.PCM_MULAW,
3136
+ sampleRate: AssemblyAISampleRate.HZ_8000
2308
3137
  });
2309
3138
  }
2310
3139
  buildUrl() {
2311
3140
  const opts = this.options;
2312
- const model = opts.model ?? "universal-streaming-english";
2313
- const encoding = opts.encoding ?? "pcm_s16le";
2314
- const sampleRate = opts.sampleRate ?? 16e3;
3141
+ const model = opts.model ?? AssemblyAIModel.UNIVERSAL_STREAMING_ENGLISH;
3142
+ const encoding = opts.encoding ?? AssemblyAIEncoding.PCM_S16LE;
3143
+ const sampleRate = opts.sampleRate ?? AssemblyAISampleRate.HZ_16000;
2315
3144
  let minSilence;
2316
3145
  let maxSilence;
2317
- if (model === "u3-rt-pro") {
3146
+ if (model === AssemblyAIModel.U3_RT_PRO) {
2318
3147
  minSilence = opts.minTurnSilence ?? 100;
2319
3148
  maxSilence = opts.maxTurnSilence ?? minSilence;
2320
3149
  } else {
2321
3150
  minSilence = opts.minTurnSilence ?? DEFAULT_MIN_TURN_SILENCE_MS;
2322
3151
  maxSilence = opts.maxTurnSilence;
2323
3152
  }
2324
- const languageDetection = opts.languageDetection ?? (model.includes("multilingual") || model === "u3-rt-pro");
3153
+ const languageDetection = opts.languageDetection ?? (model.includes("multilingual") || model === AssemblyAIModel.U3_RT_PRO);
2325
3154
  const raw = {
2326
3155
  sample_rate: sampleRate,
2327
3156
  encoding,
@@ -2363,6 +3192,7 @@ var AssemblyAISTT = class _AssemblyAISTT {
2363
3192
  }
2364
3193
  return headers;
2365
3194
  }
3195
+ /** Open the streaming WebSocket and arm message handlers. */
2366
3196
  async connect() {
2367
3197
  this.closing = false;
2368
3198
  const url = this.buildUrl();
@@ -2416,28 +3246,28 @@ var AssemblyAISTT = class _AssemblyAISTT {
2416
3246
  }
2417
3247
  handleEvent(event) {
2418
3248
  const type = event.type;
2419
- if (type === "Begin") {
3249
+ if (type === AssemblyAIEventType.BEGIN) {
2420
3250
  this.sessionId = event.id ?? null;
2421
3251
  this.expiresAt = event.expires_at ?? null;
2422
3252
  return;
2423
3253
  }
2424
- if (type === "Termination") {
3254
+ if (type === AssemblyAIEventType.TERMINATION) {
2425
3255
  if (this.terminationResolve) {
2426
3256
  this.terminationResolve();
2427
3257
  this.terminationResolve = null;
2428
3258
  }
2429
3259
  return;
2430
3260
  }
2431
- if (type === "SpeechStarted") {
3261
+ if (type === AssemblyAIEventType.SPEECH_STARTED) {
2432
3262
  this.emit({
2433
3263
  text: "",
2434
3264
  isFinal: false,
2435
3265
  confidence: 0,
2436
- eventType: "SpeechStarted"
3266
+ eventType: AssemblyAIEventType.SPEECH_STARTED
2437
3267
  });
2438
3268
  return;
2439
3269
  }
2440
- if (type !== "Turn") {
3270
+ if (type !== AssemblyAIEventType.TURN) {
2441
3271
  return;
2442
3272
  }
2443
3273
  const endOfTurn = Boolean(event.end_of_turn);
@@ -2468,25 +3298,49 @@ var AssemblyAISTT = class _AssemblyAISTT {
2468
3298
  cb(transcript);
2469
3299
  }
2470
3300
  }
3301
+ /** Send a binary PCM/mu-law audio chunk to AssemblyAI for transcription. */
2471
3302
  sendAudio(audio) {
2472
3303
  if (!this.ws || this.ws.readyState !== WebSocket4.OPEN) {
2473
- throw new AssemblyAISTTNotConnectedError(
2474
- "AssemblyAISTT.sendAudio: WebSocket is not open"
2475
- );
3304
+ return;
2476
3305
  }
2477
- const durationMs = this.estimateChunkDurationMs(audio.length);
3306
+ if (this.chunkBufferTargetBytes === 0) {
3307
+ this.chunkBufferTargetBytes = this.computeTargetChunkBytes();
3308
+ }
3309
+ this.chunkBuffer.push(audio);
3310
+ this.chunkBufferBytes += audio.length;
3311
+ if (this.chunkBufferBytes < this.chunkBufferTargetBytes) {
3312
+ return;
3313
+ }
3314
+ const merged = Buffer.concat(this.chunkBuffer, this.chunkBufferBytes);
3315
+ this.chunkBuffer = [];
3316
+ this.chunkBufferBytes = 0;
3317
+ const durationMs = this.estimateChunkDurationMs(merged.length);
2478
3318
  if (durationMs !== null && (durationMs < MIN_CHUNK_DURATION_MS || durationMs > MAX_CHUNK_DURATION_MS)) {
2479
3319
  getLogger().warn(
2480
3320
  `AssemblyAISTT: audio chunk duration ${durationMs.toFixed(1)}ms outside 50-1000ms bounds (may trigger error 3007).`
2481
3321
  );
2482
3322
  }
2483
- this.ws.send(audio);
3323
+ this.ws.send(merged);
3324
+ }
3325
+ /**
3326
+ * Compute the byte count corresponding to ~60 ms of audio for the
3327
+ * configured encoding / sample rate. Sits one Twilio frame (20 ms)
3328
+ * above AssemblyAI's 50 ms floor so jitter never dips below.
3329
+ */
3330
+ computeTargetChunkBytes() {
3331
+ const targetMs = 60;
3332
+ const encoding = this.options.encoding ?? AssemblyAIEncoding.PCM_S16LE;
3333
+ const sampleRate = this.options.sampleRate ?? AssemblyAISampleRate.HZ_16000;
3334
+ if (encoding === AssemblyAIEncoding.PCM_MULAW) {
3335
+ return Math.ceil(sampleRate * targetMs / 1e3);
3336
+ }
3337
+ return Math.ceil(sampleRate * targetMs / 1e3) * 2;
2484
3338
  }
2485
3339
  estimateChunkDurationMs(byteLength) {
2486
3340
  if (byteLength <= 0) return null;
2487
- const sampleRate = this.options.sampleRate ?? 16e3;
3341
+ const sampleRate = this.options.sampleRate ?? AssemblyAISampleRate.HZ_16000;
2488
3342
  if (sampleRate <= 0) return null;
2489
- const bytesPerSample = (this.options.encoding ?? "pcm_s16le") === "pcm_s16le" ? 2 : 1;
3343
+ const bytesPerSample = (this.options.encoding ?? AssemblyAIEncoding.PCM_S16LE) === AssemblyAIEncoding.PCM_S16LE ? 2 : 1;
2490
3344
  const samples = byteLength / bytesPerSample;
2491
3345
  return samples / sampleRate * 1e3;
2492
3346
  }
@@ -2500,7 +3354,9 @@ var AssemblyAISTT = class _AssemblyAISTT {
2500
3354
  "AssemblyAISTT.updateConfiguration: WebSocket is not open"
2501
3355
  );
2502
3356
  }
2503
- const payload = { type: "UpdateConfiguration" };
3357
+ const payload = {
3358
+ type: AssemblyAIClientFrame.UPDATE_CONFIGURATION
3359
+ };
2504
3360
  if (params.keytermsPrompt !== void 0) {
2505
3361
  payload.keyterms_prompt = JSON.stringify(params.keytermsPrompt);
2506
3362
  }
@@ -2522,19 +3378,21 @@ var AssemblyAISTT = class _AssemblyAISTT {
2522
3378
  "AssemblyAISTT.forceEndpoint: WebSocket is not open"
2523
3379
  );
2524
3380
  }
2525
- this.ws.send(JSON.stringify({ type: "ForceEndpoint" }));
3381
+ this.ws.send(JSON.stringify({ type: AssemblyAIClientFrame.FORCE_ENDPOINT }));
2526
3382
  }
3383
+ /** Register a transcript listener. Returns an unsubscribe function. */
2527
3384
  onTranscript(callback) {
2528
3385
  this.callbacks.add(callback);
2529
3386
  return () => {
2530
3387
  this.callbacks.delete(callback);
2531
3388
  };
2532
3389
  }
3390
+ /** Send a Terminate frame, wait briefly for ack, and close the socket. */
2533
3391
  async close() {
2534
3392
  this.closing = true;
2535
3393
  if (!this.ws) return;
2536
3394
  try {
2537
- this.ws.send(JSON.stringify({ type: "Terminate" }));
3395
+ this.ws.send(JSON.stringify({ type: AssemblyAIClientFrame.TERMINATE }));
2538
3396
  } catch {
2539
3397
  }
2540
3398
  await new Promise((resolve) => {
@@ -2573,13 +3431,361 @@ var STT6 = class extends AssemblyAISTT {
2573
3431
  "AssemblyAI STT requires an apiKey. Pass { apiKey: '...' } or set ASSEMBLYAI_API_KEY in the environment."
2574
3432
  );
2575
3433
  }
2576
- const { apiKey: _ignored, ...rest } = opts;
3434
+ const { apiKey: _ignored, language: _lang, ...rest } = opts;
2577
3435
  void _ignored;
3436
+ void _lang;
2578
3437
  super(key, rest);
2579
3438
  }
2580
3439
  };
2581
3440
 
3441
+ // src/stt/speechmatics.ts
3442
+ init_esm_shims();
3443
+
3444
+ // src/providers/speechmatics-stt.ts
3445
+ init_esm_shims();
3446
+ import WebSocket5 from "ws";
3447
/** Default real-time endpoint; used when no `baseUrl` option is supplied. */
var SPEECHMATICS_RT_URL = "wss://eu.rt.speechmatics.com/v2";

/** Milliseconds to wait for the WebSocket handshake before giving up. */
var CONNECT_TIMEOUT_MS3 = 10000;

/** Values accepted for `conversation_config.end_of_utterance_mode`. */
var TurnDetectionMode = {
  EXTERNAL: "external",
  FIXED: "fixed",
  ADAPTIVE: "adaptive",
  SMART_TURN: "smart_turn"
};

/** Named sample rates (Hz) for the `audio_format.sample_rate` field. */
var SpeechmaticsSampleRate = {
  HZ_8000: 8000,
  HZ_16000: 16000,
  HZ_44100: 44100
};

/** Audio encodings sent in `audio_format.encoding`. */
var SpeechmaticsAudioEncoding = {
  PCM_S16LE: "pcm_s16le"
};

/** Values for `transcription_config.operating_point`. */
var SpeechmaticsOperatingPoint = {
  ENHANCED: "enhanced",
  STANDARD: "standard"
};

/** `message` discriminators of the server frames the client dispatches on. */
var SpeechmaticsServerMessage = {
  RECOGNITION_STARTED: "RecognitionStarted",
  ADD_PARTIAL_TRANSCRIPT: "AddPartialTranscript",
  ADD_TRANSCRIPT: "AddTranscript",
  END_OF_UTTERANCE: "EndOfUtterance",
  END_OF_TRANSCRIPT: "EndOfTranscript",
  AUDIO_ADDED: "AudioAdded",
  INFO: "Info",
  WARNING: "Warning",
  ERROR: "Error"
};
3478
/**
 * Streaming speech-to-text client for the Speechmatics real-time WebSocket API.
 *
 * Lifecycle: `connect()` opens the socket and sends `StartRecognition`,
 * `sendAudio()` forwards binary audio, transcripts and errors are delivered to
 * registered listeners, and `close()` sends `EndOfStream` and closes the socket.
 */
var SpeechmaticsSTT = class {
  /** Socket of the current session, or `null` when no session is active. */
  ws = null;
  transcriptCallbacks = /* @__PURE__ */ new Set();
  errorCallbacks = /* @__PURE__ */ new Set();
  running = false;
  /**
   * Number of audio chunks successfully handed to the socket on the current
   * connection; reported as `last_seq_no` in the `EndOfStream` frame.
   */
  lastSeqNo = 0;
  apiKey;
  baseUrl;
  language;
  turnDetectionMode;
  sampleRate;
  enableDiarization;
  maxDelay;
  endOfUtteranceSilenceTrigger;
  endOfUtteranceMaxDelay;
  includePartials;
  additionalVocab;
  operatingPoint;
  domain;
  outputLocale;
  /**
   * @param apiKey Speechmatics API key; sent as a Bearer token on connect.
   * @param options Optional overrides for endpoint, language, turn detection,
   *   sample rate, diarization, delays, partials, vocabulary, operating point,
   *   domain and output locale.
   * @throws Error when `apiKey` is falsy or a delay option is out of range.
   */
  constructor(apiKey, options = {}) {
    if (!apiKey) {
      throw new Error("Speechmatics apiKey is required");
    }
    const eouSilence = options.endOfUtteranceSilenceTrigger;
    const eouMax = options.endOfUtteranceMaxDelay;
    const maxDelay = options.maxDelay;
    // Negated range checks so NaN is rejected as well.
    if (eouSilence !== void 0 && !(eouSilence > 0 && eouSilence < 2)) {
      throw new Error("endOfUtteranceSilenceTrigger must be between 0 and 2");
    }
    if (eouMax !== void 0 && eouSilence !== void 0 && eouMax <= eouSilence) {
      throw new Error(
        "endOfUtteranceMaxDelay must be greater than endOfUtteranceSilenceTrigger"
      );
    }
    if (maxDelay !== void 0 && !(maxDelay >= 0.7 && maxDelay <= 4)) {
      throw new Error("maxDelay must be between 0.7 and 4.0");
    }
    this.apiKey = apiKey;
    this.baseUrl = options.baseUrl ?? SPEECHMATICS_RT_URL;
    this.language = options.language ?? "en";
    this.turnDetectionMode = options.turnDetectionMode ?? TurnDetectionMode.ADAPTIVE;
    this.sampleRate = options.sampleRate ?? SpeechmaticsSampleRate.HZ_16000;
    this.enableDiarization = options.enableDiarization ?? false;
    this.maxDelay = maxDelay;
    this.endOfUtteranceSilenceTrigger = eouSilence;
    this.endOfUtteranceMaxDelay = eouMax;
    this.includePartials = options.includePartials ?? true;
    this.additionalVocab = options.additionalVocab ?? [];
    this.operatingPoint = options.operatingPoint;
    this.domain = options.domain;
    this.outputLocale = options.outputLocale;
  }
  /** Build the JSON `StartRecognition` payload sent on connect. */
  buildStartRecognition() {
    const transcriptionConfig = {
      language: this.language,
      enable_partials: this.includePartials,
      diarization: this.enableDiarization ? "speaker" : "none"
    };
    // Optional fields are only included when the caller configured them.
    if (this.maxDelay !== void 0) transcriptionConfig.max_delay = this.maxDelay;
    if (this.operatingPoint !== void 0) {
      transcriptionConfig.operating_point = this.operatingPoint;
    }
    if (this.domain !== void 0) transcriptionConfig.domain = this.domain;
    if (this.outputLocale !== void 0) {
      transcriptionConfig.output_locale = this.outputLocale;
    }
    if (this.additionalVocab.length > 0) {
      transcriptionConfig.additional_vocab = [...this.additionalVocab];
    }
    const conversationConfig = {
      end_of_utterance_mode: this.turnDetectionMode
    };
    if (this.endOfUtteranceSilenceTrigger !== void 0) {
      conversationConfig.end_of_utterance_silence_trigger = this.endOfUtteranceSilenceTrigger;
    }
    if (this.endOfUtteranceMaxDelay !== void 0) {
      conversationConfig.end_of_utterance_max_delay = this.endOfUtteranceMaxDelay;
    }
    transcriptionConfig.conversation_config = conversationConfig;
    return {
      message: "StartRecognition",
      audio_format: {
        type: "raw",
        encoding: SpeechmaticsAudioEncoding.PCM_S16LE,
        sample_rate: this.sampleRate
      },
      transcription_config: transcriptionConfig
    };
  }
  /**
   * Open the streaming WebSocket and send the `StartRecognition` frame.
   *
   * No-op when a socket is already held. On any failure the socket is torn
   * down and released, so a later `connect()` can retry.
   *
   * @throws AuthenticationError on HTTP 401/403 during the upgrade.
   * @throws RateLimitError on HTTP 429 during the upgrade.
   * @throws PatterConnectionError on timeout, other upgrade failures, or when
   *   `StartRecognition` cannot be sent.
   */
  async connect() {
    if (this.ws !== null) return;
    const ws = new WebSocket5(this.baseUrl, {
      headers: { Authorization: `Bearer ${this.apiKey}` }
    });
    this.ws = ws;
    // Sequence numbers are per connection.
    this.lastSeqNo = 0;
    try {
      await this.awaitOpen(ws);
      ws.on("message", (raw) => this.handleMessage(raw.toString()));
      ws.on("close", () => this.handleClose(ws));
      ws.on("error", (err) => this.handleError(err));
      try {
        ws.send(JSON.stringify(this.buildStartRecognition()));
      } catch (err) {
        throw new PatterConnectionError(
          `Speechmatics StartRecognition send failed: ${String(err)}`
        );
      }
    } catch (err) {
      this.discardSocket(ws);
      throw err;
    }
    this.running = true;
  }
  /** Resolve once `ws` is open; reject on timeout, socket error, or a failed HTTP upgrade. */
  awaitOpen(ws) {
    return new Promise((resolve, reject) => {
      let settled = false;
      // Only the first outcome wins; it also cancels the timeout.
      const settle = (fn) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        fn();
      };
      const timer = setTimeout(
        () => settle(
          () => reject(new PatterConnectionError("Speechmatics connect timeout"))
        ),
        CONNECT_TIMEOUT_MS3
      );
      ws.once("open", () => settle(resolve));
      ws.once("error", (err) => settle(() => reject(err)));
      ws.once("unexpected-response", (_req, res) => {
        const status = res?.statusCode ?? 0;
        settle(() => {
          if (status === 401 || status === 403) {
            reject(
              new AuthenticationError(
                `Speechmatics rejected the API key (HTTP ${status}).`
              )
            );
            return;
          }
          if (status === 429) {
            reject(
              new RateLimitError("Speechmatics rate limit exceeded (HTTP 429).")
            );
            return;
          }
          reject(
            new PatterConnectionError(
              `Speechmatics WebSocket upgrade failed (HTTP ${status}).`
            )
          );
        });
      });
    });
  }
  /** Release a socket whose connection attempt failed so it neither leaks nor blocks a retry. */
  discardSocket(ws) {
    if (this.ws === ws) this.ws = null;
    ws.removeAllListeners();
    // Tearing down a half-open socket can still emit 'error'; keep a sink so it
    // cannot surface as an uncaught exception.
    ws.on("error", () => {
    });
    try {
      ws.terminate();
    } catch {
    }
  }
  /**
   * Send a binary PCM16-LE audio chunk to Speechmatics for transcription.
   * Silently ignored when the socket is not open or the chunk is empty.
   */
  sendAudio(audio) {
    const ws = this.ws;
    if (!ws || ws.readyState !== WebSocket5.OPEN) {
      return;
    }
    if (audio.length === 0) {
      return;
    }
    try {
      ws.send(audio);
      // Count only chunks that were actually handed to the socket.
      this.lastSeqNo += 1;
    } catch (err) {
      getLogger().error(`SpeechmaticsSTT sendAudio failed: ${String(err)}`);
    }
  }
  /** Register a transcript listener. */
  onTranscript(callback) {
    this.transcriptCallbacks.add(callback);
  }
  /** Remove a previously registered transcript listener. */
  offTranscript(callback) {
    this.transcriptCallbacks.delete(callback);
  }
  /** Register an error listener for socket / API failures. */
  onError(callback) {
    this.errorCallbacks.add(callback);
  }
  /** Remove a previously registered error listener. */
  offError(callback) {
    this.errorCallbacks.delete(callback);
  }
  /** Parse one server frame and dispatch it by its `message` field; malformed JSON is ignored. */
  handleMessage(raw) {
    let data;
    try {
      data = JSON.parse(raw);
    } catch {
      return;
    }
    const event = data.message;
    if (!event) return;
    switch (event) {
      case SpeechmaticsServerMessage.RECOGNITION_STARTED:
      case SpeechmaticsServerMessage.AUDIO_ADDED:
      case SpeechmaticsServerMessage.END_OF_UTTERANCE:
      case SpeechmaticsServerMessage.END_OF_TRANSCRIPT:
      case SpeechmaticsServerMessage.INFO:
        return;
      case SpeechmaticsServerMessage.WARNING:
        getLogger().warn(`SpeechmaticsSTT warning: ${JSON.stringify(data)}`);
        return;
      case SpeechmaticsServerMessage.ERROR: {
        const message = data.reason ?? data.type ?? "Speechmatics returned an Error frame";
        getLogger().error(`SpeechmaticsSTT error: ${message}`);
        this.emitError(new PatterConnectionError(`Speechmatics: ${message}`));
        return;
      }
      case SpeechmaticsServerMessage.ADD_PARTIAL_TRANSCRIPT:
      case SpeechmaticsServerMessage.ADD_TRANSCRIPT: {
        const isFinal = event === SpeechmaticsServerMessage.ADD_TRANSCRIPT;
        const transcript = this.toTranscript(data, isFinal);
        if (transcript !== null) this.emitTranscript(transcript);
        return;
      }
      default:
        return;
    }
  }
  /**
   * Translate a Speechmatics transcript message into a Patter `Transcript`.
   *
   * Uses `metadata.transcript` when present; otherwise joins the first
   * alternative of each result with spaces. Confidence is the mean of the
   * per-result confidences, or 1 when none are reported.
   *
   * @returns `{ text, isFinal, confidence }`, or `null` when the text is empty.
   */
  toTranscript(message, isFinal) {
    const rendered = (message.metadata?.transcript ?? "").trim();
    const results = message.results ?? [];
    let text = rendered;
    const confidences = [];
    for (const result of results) {
      const best = result.alternatives?.[0];
      if (!best) continue;
      const content = best.content;
      const confidence2 = best.confidence;
      if (!rendered && typeof content === "string" && content.length > 0) {
        text = text ? `${text} ${content}` : content;
      }
      if (typeof confidence2 === "number") {
        confidences.push(confidence2);
      }
    }
    text = text.trim();
    if (!text) return null;
    const confidence = confidences.length > 0 ? confidences.reduce((sum, c) => sum + c, 0) / confidences.length : 1;
    return { text, isFinal, confidence };
  }
  /** Deliver a transcript to every listener; a throwing listener is logged and skipped. */
  emitTranscript(transcript) {
    for (const cb of this.transcriptCallbacks) {
      try {
        cb(transcript);
      } catch (err) {
        getLogger().error(`SpeechmaticsSTT transcript callback threw: ${String(err)}`);
      }
    }
  }
  /** Deliver an error to every listener; a throwing listener is logged and skipped. */
  emitError(err) {
    for (const cb of this.errorCallbacks) {
      try {
        cb(err);
      } catch (cbErr) {
        getLogger().error(`SpeechmaticsSTT error callback threw: ${String(cbErr)}`);
      }
    }
  }
  /** Log a socket-level error and forward it to error listeners. */
  handleError(err) {
    getLogger().error(`SpeechmaticsSTT WebSocket error: ${err.message}`);
    this.emitError(err);
  }
  /**
   * React to a socket close event.
   *
   * @param ws Socket that closed. When given, the event is ignored unless it is
   *   still the active socket, and the socket is released so `connect()` can
   *   open a new session. When omitted, only the running flag is cleared.
   */
  handleClose(ws) {
    if (ws !== void 0) {
      if (this.ws !== ws) return;
      this.ws = null;
    }
    this.running = false;
  }
  /** Send `EndOfStream` and close the WebSocket. Idempotent. */
  close() {
    this.running = false;
    const ws = this.ws;
    if (!ws) return;
    this.ws = null;
    if (ws.readyState === WebSocket5.OPEN) {
      try {
        ws.send(
          JSON.stringify({ message: "EndOfStream", last_seq_no: this.lastSeqNo })
        );
      } catch {
      }
    }
    try {
      ws.close();
    } catch {
    }
  }
};
3769
+
3770
+ // src/stt/speechmatics.ts
3771
/**
 * Speechmatics STT wrapper that takes the API key from `opts.apiKey` or,
 * failing that, from the `SPEECHMATICS_API_KEY` environment variable.
 */
var STT7 = class extends SpeechmaticsSTT {
  static providerKey = "speechmatics";
  /**
   * @param opts Options forwarded to `SpeechmaticsSTT`; `apiKey` is optional.
   * @throws Error when no API key is available from either source.
   */
  constructor(opts = {}) {
    const resolvedKey = opts.apiKey ?? process.env.SPEECHMATICS_API_KEY;
    if (resolvedKey) {
      super(resolvedKey, opts);
      return;
    }
    throw new Error(
      "Speechmatics STT requires an apiKey. Pass { apiKey: 'sm_...' } or set SPEECHMATICS_API_KEY in the environment."
    );
  }
};
3783
+
3784
+ // src/tts/elevenlabs.ts
3785
+ init_esm_shims();
3786
+
2582
3787
  // src/providers/elevenlabs-tts.ts
3788
+ init_esm_shims();
2583
3789
  var ELEVENLABS_BASE_URL = "https://api.elevenlabs.io/v1";
2584
3790
  var ELEVENLABS_VOICE_ID_BY_NAME = {
2585
3791
  rachel: "21m00Tcm4TlvDq8ikWAM",
@@ -2636,6 +3842,27 @@ function resolveVoiceId(voice) {
2636
3842
  if (VOICE_ID_PATTERN.test(voice)) return voice;
2637
3843
  return ELEVENLABS_VOICE_ID_BY_NAME[voice.toLowerCase()] ?? voice;
2638
3844
  }
3845
/** Named ElevenLabs model identifiers accepted as `modelId`. */
var ElevenLabsModel = {
  V3: "eleven_v3",
  FLASH_V2_5: "eleven_flash_v2_5",
  TURBO_V2_5: "eleven_turbo_v2_5",
  MULTILINGUAL_V2: "eleven_multilingual_v2",
  MONOLINGUAL_V1: "eleven_monolingual_v1"
};

/** Named ElevenLabs output formats accepted as `outputFormat`. */
var ElevenLabsOutputFormat = {
  // MP3 variants
  MP3_22050_32: "mp3_22050_32",
  MP3_44100_32: "mp3_44100_32",
  MP3_44100_64: "mp3_44100_64",
  MP3_44100_96: "mp3_44100_96",
  MP3_44100_128: "mp3_44100_128",
  MP3_44100_192: "mp3_44100_192",
  // Raw PCM variants
  PCM_8000: "pcm_8000",
  PCM_16000: "pcm_16000",
  PCM_22050: "pcm_22050",
  PCM_24000: "pcm_24000",
  PCM_44100: "pcm_44100",
  // mu-law
  ULAW_8000: "ulaw_8000"
};
2639
3866
  var ElevenLabsTTS = class _ElevenLabsTTS {
2640
3867
  apiKey;
2641
3868
  voiceId;
@@ -2644,13 +3871,13 @@ var ElevenLabsTTS = class _ElevenLabsTTS {
2644
3871
  voiceSettings;
2645
3872
  languageCode;
2646
3873
  chunkSize;
2647
- constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = "eleven_flash_v2_5", outputFormat = "pcm_16000") {
3874
+ constructor(apiKey, voiceIdOrOptions = "21m00Tcm4TlvDq8ikWAM", modelId = ElevenLabsModel.FLASH_V2_5, outputFormat = ElevenLabsOutputFormat.PCM_16000) {
2648
3875
  this.apiKey = apiKey;
2649
3876
  if (typeof voiceIdOrOptions === "object") {
2650
3877
  const o = voiceIdOrOptions;
2651
3878
  this.voiceId = resolveVoiceId(o.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
2652
- this.modelId = o.modelId ?? "eleven_flash_v2_5";
2653
- this.outputFormat = o.outputFormat ?? "pcm_16000";
3879
+ this.modelId = o.modelId ?? ElevenLabsModel.FLASH_V2_5;
3880
+ this.outputFormat = o.outputFormat ?? ElevenLabsOutputFormat.PCM_16000;
2654
3881
  this.voiceSettings = o.voiceSettings;
2655
3882
  this.languageCode = o.languageCode;
2656
3883
  this.chunkSize = o.chunkSize ?? 4096;
@@ -2688,7 +3915,7 @@ var ElevenLabsTTS = class _ElevenLabsTTS {
2688
3915
  return new _ElevenLabsTTS(apiKey, {
2689
3916
  ...options,
2690
3917
  voiceSettings,
2691
- outputFormat: "ulaw_8000"
3918
+ outputFormat: ElevenLabsOutputFormat.ULAW_8000
2692
3919
  });
2693
3920
  }
2694
3921
  /**
@@ -2705,7 +3932,7 @@ var ElevenLabsTTS = class _ElevenLabsTTS {
2705
3932
  static forTelnyx(apiKey, options = {}) {
2706
3933
  return new _ElevenLabsTTS(apiKey, {
2707
3934
  ...options,
2708
- outputFormat: "pcm_16000"
3935
+ outputFormat: ElevenLabsOutputFormat.PCM_16000
2709
3936
  });
2710
3937
  }
2711
3938
  /**
@@ -2783,12 +4010,13 @@ function resolveApiKey(apiKey) {
2783
4010
  var TTS = class _TTS extends ElevenLabsTTS {
2784
4011
  static providerKey = "elevenlabs";
2785
4012
  constructor(opts = {}) {
2786
- super(
2787
- resolveApiKey(opts.apiKey),
2788
- opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
2789
- opts.modelId ?? "eleven_flash_v2_5",
2790
- opts.outputFormat ?? "pcm_16000"
2791
- );
4013
+ super(resolveApiKey(opts.apiKey), {
4014
+ voiceId: opts.voiceId ?? "EXAVITQu4vr4xnSDxMaL",
4015
+ modelId: opts.modelId ?? "eleven_flash_v2_5",
4016
+ outputFormat: opts.outputFormat ?? "pcm_16000",
4017
+ languageCode: opts.languageCode,
4018
+ voiceSettings: opts.voiceSettings
4019
+ });
2792
4020
  }
2793
4021
  static forTwilio(arg1, arg2) {
2794
4022
  const opts = typeof arg1 === "string" ? { apiKey: arg1, ...arg2 ?? {} } : arg1 ?? {};
@@ -2800,22 +4028,364 @@ var TTS = class _TTS extends ElevenLabsTTS {
2800
4028
  }
2801
4029
  };
2802
4030
 
4031
+ // src/tts/elevenlabs-ws.ts
4032
+ init_esm_shims();
4033
+
4034
+ // src/providers/elevenlabs-ws-tts.ts
4035
+ init_esm_shims();
4036
+ import WebSocket6 from "ws";
4037
/** Base URL of the ElevenLabs text-to-speech WebSocket API. */
var WS_BASE = "wss://api.elevenlabs.io/v1/text-to-speech";

/** Default `inactivity_timeout` query parameter value. */
var DEFAULT_INACTIVITY_TIMEOUT = 60;

/** Default maximum size, in bytes, of each audio slice yielded to the caller. */
var DEFAULT_CHUNK_SIZE = 4096;

/** Milliseconds allowed for the WebSocket connection to open. */
var CONNECT_TIMEOUT_MS4 = 5000;

/** Milliseconds the stream may stay idle before it is aborted. */
var FRAME_TIMEOUT_MS = 30000;

/** Upper bound (512 KiB) on a single audio frame's size. */
var MAX_AUDIO_B64_BYTES = 512 * 1024;
4043
/** Error raised for failures reported on the ElevenLabs WebSocket stream. */
var ElevenLabsTTSError = class extends Error {
  // Initialised right after `super()` returns, like an assignment in the constructor body.
  name = "ElevenLabsTTSError";
  /** @param message Human-readable description of the failure. */
  constructor(message) {
    super(message);
  }
};
4049
/** Specialised TTS error used when the WebSocket endpoint reports `payment_required`. */
var ElevenLabsPlanError = class extends ElevenLabsTTSError {
  // Runs after the base class has set its own name, so this value wins.
  name = "ElevenLabsPlanError";
  /** @param message Human-readable description of the failure. */
  constructor(message) {
    super(message);
  }
};
4055
/** Message attached to `ElevenLabsPlanError` when the server reports `payment_required`. */
var PLAN_REQUIRED_MSG = [
  "ElevenLabs WS streaming requires a Pro plan or higher",
  "(the WS endpoint returned `payment_required`).",
  "Either upgrade at https://elevenlabs.io/pricing,",
  "or use the HTTP `ElevenLabsTTS` class which works on all plans (drop-in API)."
].join(" ");
4056
/**
 * Make an untrusted value safe to embed in a single log line.
 *
 * Every ASCII control character (including CR, LF, NUL, TAB and ESC), DEL and
 * the Unicode line/paragraph separators are replaced by a space, so the value
 * cannot forge extra log lines or inject terminal escape sequences. The
 * result is then truncated.
 *
 * @param value Anything; converted with `String(value)`.
 * @param limit Maximum length of the returned string (default 200).
 * @returns The sanitised, truncated string.
 */
function sanitiseLogStr(value, limit = 200) {
  return String(value).replace(/[\u0000-\u001f\u007f\u2028\u2029]/g, " ").slice(0, limit);
}
4059
/**
 * Output format applied per telephony carrier when the caller did not pick one.
 * Carriers missing from this table leave the format untouched.
 */
var CARRIER_NATIVE_FORMAT = {
  telnyx: "pcm_16000",
  twilio: "ulaw_8000"
};
4063
/**
 * ElevenLabs text-to-speech over the WebSocket `stream-input` endpoint.
 * A fresh socket is opened for every `synthesizeStream` call.
 */
var ElevenLabsWebSocketTTS = class _ElevenLabsWebSocketTTS {
  static providerKey = "elevenlabs_ws";
  apiKey;
  voiceId;
  modelId;
  voiceSettings;
  languageCode;
  autoMode;
  inactivityTimeout;
  chunkLengthSchedule;
  chunkSize;
  /**
   * The wire format requested over the ElevenLabs WS. Initially set from
   * the constructor; ``setTelephonyCarrier`` may auto-flip it to the
   * carrier's native codec when the caller did NOT pass ``outputFormat``
   * explicitly.
   */
  _outputFormat;
  _outputFormatExplicit;
  /** Public read-only view of the (possibly auto-flipped) wire format. */
  get outputFormat() {
    return this._outputFormat;
  }
  /**
   * @param opts Requires `apiKey`; optional `voiceId`, `modelId`,
   *   `outputFormat`, `voiceSettings`, `languageCode`, `autoMode`,
   *   `inactivityTimeout`, `chunkLengthSchedule` and `chunkSize`.
   * @throws Error when `modelId` is `eleven_v3`, or when `chunkSize` is not a
   *   positive number (a non-positive size would make the yield loop spin
   *   forever).
   */
  constructor(opts) {
    if (opts.modelId === "eleven_v3") {
      throw new Error(
        "eleven_v3 is not supported by the WebSocket stream-input endpoint \u2014 use the HTTP ElevenLabsTTS class instead."
      );
    }
    const chunkSize = opts.chunkSize ?? DEFAULT_CHUNK_SIZE;
    if (typeof chunkSize !== "number" || !(chunkSize > 0)) {
      throw new Error("ElevenLabsWebSocketTTS: chunkSize must be a positive number");
    }
    this.apiKey = opts.apiKey;
    this.voiceId = resolveVoiceId(opts.voiceId ?? "21m00Tcm4TlvDq8ikWAM");
    this.modelId = opts.modelId ?? "eleven_flash_v2_5";
    this._outputFormatExplicit = opts.outputFormat !== void 0;
    this._outputFormat = opts.outputFormat ?? "pcm_16000";
    this.voiceSettings = opts.voiceSettings;
    this.languageCode = opts.languageCode;
    this.autoMode = opts.autoMode ?? true;
    this.inactivityTimeout = opts.inactivityTimeout ?? DEFAULT_INACTIVITY_TIMEOUT;
    this.chunkLengthSchedule = opts.chunkLengthSchedule;
    this.chunkSize = chunkSize;
  }
  /**
   * Hook called by ``StreamHandler`` to advise the carrier wire format.
   *
   * When the user did NOT pass an explicit ``outputFormat`` in the
   * constructor options, this flips the format to the carrier's native
   * wire codec, saving a client-side transcode step. Calling with an
   * unknown carrier (``""`` / ``"custom"``) is a no-op.
   *
   * When ``outputFormat`` was explicitly passed (incl. via the
   * ``forTwilio`` / ``forTelnyx`` factories), this method is a no-op;
   * the user's choice always wins.
   */
  setTelephonyCarrier(carrier) {
    if (this._outputFormatExplicit) return;
    const native = CARRIER_NATIVE_FORMAT[carrier];
    if (!native) return;
    this._outputFormat = native;
  }
  /** Pre-configured for Twilio Media Streams (`ulaw_8000`). */
  static forTwilio(opts) {
    return new _ElevenLabsWebSocketTTS({
      ...opts,
      outputFormat: "ulaw_8000",
      voiceSettings: opts.voiceSettings ?? {
        stability: 0.6,
        similarity_boost: 0.75,
        use_speaker_boost: false
      }
    });
  }
  /** Pre-configured for Telnyx (`pcm_16000`). */
  static forTelnyx(opts) {
    return new _ElevenLabsWebSocketTTS({
      ...opts,
      outputFormat: "pcm_16000"
    });
  }
  /** Build the `stream-input` URL for the configured voice, model and format. */
  buildUrl() {
    const params = new URLSearchParams({
      model_id: this.modelId,
      output_format: this.outputFormat,
      inactivity_timeout: String(this.inactivityTimeout)
    });
    if (this.autoMode) params.set("auto_mode", "true");
    if (this.languageCode) params.set("language_code", this.languageCode);
    return `${WS_BASE}/${encodeURIComponent(this.voiceId)}/stream-input?${params.toString()}`;
  }
  /**
   * Single-shot synthesis: open WS, send text, yield bytes, close.
   *
   * Resilience contract:
   *   - Connection bounded by ``CONNECT_TIMEOUT_MS4``.
   *   - Each idle wait bounded by ``FRAME_TIMEOUT_MS`` so a stalled server
   *     cannot keep the generator alive indefinitely.
   *   - An error handler is attached BEFORE the open await, and a no-op one
   *     stays attached after teardown, so a late socket ``error`` can never
   *     become an uncaught exception.
   *   - All other listeners are removed in ``finally``.
   *   - A server-reported ``error`` raises ``ElevenLabsTTSError``
   *     (``ElevenLabsPlanError`` for ``payment_required``).
   *   - Per-frame audio payload capped at ``MAX_AUDIO_B64_BYTES``.
   *   - Best-effort EOS ``{"text":""}`` is sent in ``finally`` rather than
   *     right after the flush.
   *
   * @param text Text to synthesise.
   * @yields Audio slices of at most ``chunkSize`` bytes.
   */
  async *synthesizeStream(text) {
    const ws = new WebSocket6(this.buildUrl(), {
      headers: { "xi-api-key": this.apiKey }
    });
    const queue = [];
    let done = false;
    let pendingError = null;
    let resolveWaiter = null;
    let connectTimer;
    // Wake the consumer loop if it is currently parked waiting for a frame.
    const wakeWaiter = () => {
      const r = resolveWaiter;
      resolveWaiter = null;
      r?.();
    };
    const onMessage = (raw) => {
      // Binary frames that do not start like JSON are treated as raw audio.
      if (Buffer.isBuffer(raw) && !looksLikeJson(raw)) {
        if (raw.length > MAX_AUDIO_B64_BYTES) {
          getLogger().warn(
            `ElevenLabs WS binary frame too large (${raw.length} bytes), skipping`
          );
          return;
        }
        queue.push(raw);
        wakeWaiter();
        return;
      }
      const txt = raw.toString("utf8");
      let msg;
      try {
        msg = JSON.parse(txt);
      } catch {
        getLogger().warn("ElevenLabs WS sent non-JSON text frame");
        return;
      }
      if (msg.error) {
        const sanitised = sanitiseLogStr(msg.error);
        getLogger().error("ElevenLabs WS reported error:", sanitised);
        if (sanitised === "payment_required" || /payment[_ ]required/i.test(sanitised)) {
          pendingError = new ElevenLabsPlanError(PLAN_REQUIRED_MSG);
        } else {
          pendingError = new ElevenLabsTTSError(`ElevenLabs WS error: ${sanitised}`);
        }
        done = true;
        wakeWaiter();
        return;
      }
      if (msg.audio) {
        if (typeof msg.audio !== "string" || msg.audio.length > MAX_AUDIO_B64_BYTES) {
          getLogger().warn("ElevenLabs WS audio frame too large or malformed, skipping");
        } else {
          try {
            queue.push(Buffer.from(msg.audio, "base64"));
          } catch {
            getLogger().warn("ElevenLabs WS sent malformed base64 audio");
          }
        }
      }
      if (msg.isFinal) {
        done = true;
      }
      wakeWaiter();
    };
    const onClose = () => {
      done = true;
      wakeWaiter();
    };
    const onError = (err) => {
      pendingError = err;
      done = true;
      wakeWaiter();
    };
    ws.on("error", onError);
    try {
      await new Promise((resolve, reject) => {
        connectTimer = setTimeout(
          () => reject(new Error("ElevenLabs WS connect timeout")),
          CONNECT_TIMEOUT_MS4
        );
        ws.once("open", () => {
          if (connectTimer) clearTimeout(connectTimer);
          connectTimer = void 0;
          resolve();
        });
        ws.once("error", (err) => {
          if (connectTimer) clearTimeout(connectTimer);
          connectTimer = void 0;
          reject(err);
        });
      });
      const init = { text: " " };
      if (this.voiceSettings) init["voice_settings"] = this.voiceSettings;
      if (!this.autoMode && this.chunkLengthSchedule) {
        init["generation_config"] = { chunk_length_schedule: this.chunkLengthSchedule };
      }
      ws.send(JSON.stringify(init));
      ws.send(JSON.stringify({ text: text + " ", flush: true }));
      ws.on("message", onMessage);
      ws.on("close", onClose);
      while (true) {
        if (queue.length > 0) {
          const buf = queue.shift();
          for (let off = 0; off < buf.length; off += this.chunkSize) {
            yield buf.subarray(off, Math.min(off + this.chunkSize, buf.length));
          }
          continue;
        }
        // Queue drained: finish (or surface the recorded error) once the stream ended.
        if (done) {
          if (pendingError) throw pendingError;
          return;
        }
        let frameTimer;
        try {
          await new Promise((res, rej) => {
            resolveWaiter = res;
            frameTimer = setTimeout(
              () => rej(new ElevenLabsTTSError(`ElevenLabs WS no frame for ${FRAME_TIMEOUT_MS}ms`)),
              FRAME_TIMEOUT_MS
            );
          });
        } finally {
          if (frameTimer) clearTimeout(frameTimer);
        }
      }
    } finally {
      if (connectTimer) clearTimeout(connectTimer);
      try {
        if (ws.readyState === WebSocket6.OPEN) {
          ws.send(JSON.stringify({ text: "" }));
        }
      } catch {
      }
      ws.removeAllListeners();
      // Closing a socket (especially one still connecting) can emit 'error'
      // on a later tick; with no listener that would crash the process.
      ws.on("error", () => {
      });
      try {
        if (ws.readyState === WebSocket6.OPEN || ws.readyState === WebSocket6.CONNECTING) {
          ws.close();
        }
      } catch {
      }
    }
  }
  /** No-op: connections are per-utterance and torn down inside synthesizeStream. */
  async close() {
  }
};
4312
/**
 * Cheap check for whether a binary frame is probably a JSON document.
 *
 * @param buf Byte buffer to inspect.
 * @returns `true` when the first byte is `{` (0x7b) or `[` (0x5b); `false`
 *   for an empty buffer or any other leading byte.
 */
function looksLikeJson(buf) {
  if (buf.length === 0) return false;
  switch (buf[0]) {
    case 0x7b: // "{"
    case 0x5b: // "["
      return true;
    default:
      return false;
  }
}
4317
+
4318
+ // src/tts/elevenlabs-ws.ts
4319
/**
 * Pick the ElevenLabs API key to use.
 *
 * @param apiKey Explicit key; when `null`/`undefined` the
 *   `ELEVENLABS_API_KEY` environment variable is consulted instead.
 * @returns The resolved, non-empty key.
 * @throws Error when the resolved value is missing or empty.
 */
function resolveApiKey2(apiKey) {
  const resolved = apiKey ?? process.env.ELEVENLABS_API_KEY;
  if (resolved) return resolved;
  throw new Error(
    "ElevenLabs WebSocket TTS requires an apiKey. Pass { apiKey: '...' } or set ELEVENLABS_API_KEY in the environment."
  );
}
4328
/**
 * Translate the wrapper's options into the options object expected by
 * `ElevenLabsWebSocketTTS`.
 *
 * `outputFormat` is forwarded only when the caller supplied one. The base
 * class treats a supplied `outputFormat` as an explicit choice and then
 * refuses to auto-select the carrier's native codec, so always passing a
 * default here would switch that feature off for every wrapper instance.
 * The base class applies the same `pcm_16000` default by itself.
 *
 * @param opts Wrapper options; `apiKey` may be omitted when the environment
 *   variable is set.
 * @returns Options for the `ElevenLabsWebSocketTTS` constructor.
 * @throws Error (from `resolveApiKey2`) when no API key can be resolved.
 */
function buildOpts(opts) {
  const out = {
    apiKey: resolveApiKey2(opts.apiKey),
    modelId: opts.modelId ?? "eleven_flash_v2_5",
    autoMode: opts.autoMode ?? true
  };
  // Pass-through fields: copied only when the caller actually set them.
  const optional = [
    "outputFormat",
    "voiceId",
    "voiceSettings",
    "languageCode",
    "inactivityTimeout",
    "chunkLengthSchedule"
  ];
  for (const field of optional) {
    if (opts[field] !== void 0) out[field] = opts[field];
  }
  return out;
}
4342
/**
 * ElevenLabs WebSocket TTS wrapper: resolves the API key and applies defaults
 * via `buildOpts` before delegating to `ElevenLabsWebSocketTTS`.
 */
var TTS2 = class _TTS extends ElevenLabsWebSocketTTS {
  static providerKey = "elevenlabs_ws";
  /** @param opts Wrapper options; every field is optional. */
  constructor(opts = {}) {
    const resolved = buildOpts(opts);
    super(resolved);
  }
  /** Build an instance whose output format is fixed to `ulaw_8000` (Twilio Media Streams). */
  static forTwilio(opts = {}) {
    const twilioOpts = Object.assign({}, opts, { outputFormat: "ulaw_8000" });
    return new _TTS(twilioOpts);
  }
  /** Build an instance whose output format is fixed to `pcm_16000` (Telnyx). */
  static forTelnyx(opts = {}) {
    const telnyxOpts = Object.assign({}, opts, { outputFormat: "pcm_16000" });
    return new _TTS(telnyxOpts);
  }
};
4356
+
4357
+ // src/tts/openai.ts
4358
+ init_esm_shims();
4359
+
2803
4360
  // src/providers/openai-tts.ts
4361
+ init_esm_shims();
2804
4362
/** OpenAI speech-synthesis endpoint. */
var OPENAI_TTS_URL = "https://api.openai.com/v1/audio/speech";

/** Model-name prefix (presumably marks models that accept `instructions`; confirm at the use site). */
var INSTRUCTIONS_PREFIX = "gpt-4o-mini-tts";

/** Single-pole low-pass coefficient used by the streaming resampler for 16 kHz output. */
var LPF_ALPHA = 0.78;

/** Single-pole low-pass coefficient used by the streaming resampler for 8 kHz output. */
var LPF_ALPHA_8K = 0.45;
2807
4366
  var OpenAITTS = class _OpenAITTS {
2808
- constructor(apiKey, voice = "alloy", model = "gpt-4o-mini-tts", instructions = null, speed = null, antiAlias = true) {
4367
+ constructor(apiKey, voice = "alloy", model = "gpt-4o-mini-tts", instructions = null, speed = null, antiAlias = true, targetSampleRate = 16e3) {
2809
4368
  this.apiKey = apiKey;
2810
4369
  this.voice = voice;
2811
4370
  this.model = model;
2812
4371
  this.instructions = instructions;
2813
4372
  this.speed = speed;
2814
4373
  this.antiAlias = antiAlias;
4374
+ this.targetSampleRate = targetSampleRate;
2815
4375
  if (speed !== null && speed !== void 0 && (speed < 0.25 || speed > 4)) {
2816
4376
  throw new Error("OpenAITTS: speed must be in [0.25, 4.0]");
2817
4377
  }
4378
+ if (targetSampleRate !== 8e3 && targetSampleRate !== 16e3) {
4379
+ throw new Error("OpenAITTS: targetSampleRate must be 8000 or 16000");
4380
+ }
2818
4381
  }
4382
+ apiKey;
4383
+ voice;
4384
+ model;
4385
+ instructions;
4386
+ speed;
4387
+ antiAlias;
4388
+ targetSampleRate;
2819
4389
  /**
2820
4390
  * Synthesise text to speech and return the full audio as a single Buffer.
2821
4391
  *
@@ -2871,7 +4441,8 @@ var OpenAITTS = class _OpenAITTS {
2871
4441
  carryByte: null,
2872
4442
  leftover: [],
2873
4443
  lpfPrev: 0,
2874
- lpfEnabled: this.antiAlias
4444
+ lpfEnabled: this.antiAlias,
4445
+ targetSampleRate: this.targetSampleRate
2875
4446
  };
2876
4447
  const reader = response.body.getReader();
2877
4448
  try {
@@ -2897,14 +4468,17 @@ var OpenAITTS = class _OpenAITTS {
2897
4468
  }
2898
4469
  }
2899
4470
  /**
2900
- * Streaming 24 kHz → 16 kHz resampler (PCM16-LE). Applies a single-pole
2901
- * lowpass ahead of the 3:2 decimation and carries filter + sample state
2902
- * across chunks so the cadence doesn't reset at every network read.
4471
+ * Streaming 24 kHz → {16, 8} kHz resampler (PCM16-LE). Applies a single-pole
4472
+ * lowpass ahead of the decimation and carries filter + sample state across
4473
+ * chunks so the cadence doesn't reset at every network read.
4474
+ *
4475
+ * Output rate is selected by ``ctx.targetSampleRate``:
4476
+ * 16000 → 3:2 decimation (sample 0 + mid(1,2)) [default]
4477
+ * 8000 → 3:1 decimation (sample 0 only) [fix #46]
2903
4478
  *
2904
- * ``ctx.lpfEnabled`` (default true on the streaming path, false for the
2905
- * legacy static helper) controls whether the LPF is engaged — we keep
2906
- * the helper bit-exact for the downsample-only tests while the real
2907
- * streaming path gets anti-alias filtering.
4479
+ * ``ctx.lpfEnabled`` controls whether the LPF is engaged — kept disabled
4480
+ * for the legacy static helper so the bit-exact downsample-only tests
4481
+ * remain valid; the streaming path engages it unless ``antiAlias`` is off.
2908
4482
  */
2909
4483
  static resampleStreaming(audio, ctx) {
2910
4484
  let buf;
@@ -2921,6 +4495,8 @@ var OpenAITTS = class _OpenAITTS {
2921
4495
  if (buf.length === 0 && ctx.leftover.length === 0) {
2922
4496
  return Buffer.alloc(0);
2923
4497
  }
4498
+ const direct8k = ctx.targetSampleRate === 8e3;
4499
+ const lpfAlpha = direct8k ? LPF_ALPHA_8K : LPF_ALPHA;
2924
4500
  const sampleCount = buf.length / 2;
2925
4501
  const samples = ctx.leftover.slice();
2926
4502
  const lpf = ctx.lpfEnabled !== false;
@@ -2928,7 +4504,7 @@ var OpenAITTS = class _OpenAITTS {
2928
4504
  for (let i2 = 0; i2 < sampleCount; i2++) {
2929
4505
  const x = buf.readInt16LE(i2 * 2);
2930
4506
  if (lpf) {
2931
- y = LPF_ALPHA * x + (1 - LPF_ALPHA) * y;
4507
+ y = lpfAlpha * x + (1 - lpfAlpha) * y;
2932
4508
  let s = Math.round(y);
2933
4509
  if (s > 32767) s = 32767;
2934
4510
  else if (s < -32768) s = -32768;
@@ -2940,10 +4516,17 @@ var OpenAITTS = class _OpenAITTS {
2940
4516
  if (lpf) ctx.lpfPrev = y;
2941
4517
  const out = [];
2942
4518
  let i = 0;
2943
- while (i + 2 < samples.length) {
2944
- out.push(samples[i]);
2945
- out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
2946
- i += 3;
4519
+ if (direct8k) {
4520
+ while (i + 2 < samples.length) {
4521
+ out.push(samples[i]);
4522
+ i += 3;
4523
+ }
4524
+ } else {
4525
+ while (i + 2 < samples.length) {
4526
+ out.push(samples[i]);
4527
+ out.push(Math.round((samples[i + 1] + samples[i + 2]) / 2));
4528
+ i += 3;
4529
+ }
2947
4530
  }
2948
4531
  ctx.leftover = samples.slice(i);
2949
4532
  const buffer = Buffer.alloc(out.length * 2);
@@ -2954,7 +4537,13 @@ var OpenAITTS = class _OpenAITTS {
2954
4537
  }
2955
4538
  /** @deprecated use {@link resampleStreaming} with persistent state. */
2956
4539
  static resample24kTo16k(audio) {
2957
- const ctx = { carryByte: null, leftover: [], lpfPrev: 0, lpfEnabled: false };
4540
+ const ctx = {
4541
+ carryByte: null,
4542
+ leftover: [],
4543
+ lpfPrev: 0,
4544
+ lpfEnabled: false,
4545
+ targetSampleRate: 16e3
4546
+ };
2958
4547
  const out = _OpenAITTS.resampleStreaming(audio, ctx);
2959
4548
  if (ctx.leftover.length === 0) return out;
2960
4549
  const tail = Buffer.alloc(ctx.leftover.length * 2);
@@ -2966,7 +4555,7 @@ var OpenAITTS = class _OpenAITTS {
2966
4555
  };
2967
4556
 
2968
4557
  // src/tts/openai.ts
2969
- var TTS2 = class extends OpenAITTS {
4558
+ var TTS3 = class extends OpenAITTS {
2970
4559
  static providerKey = "openai_tts";
2971
4560
  constructor(opts = {}) {
2972
4561
  const key = opts.apiKey ?? process.env.OPENAI_API_KEY;
@@ -2986,10 +4575,41 @@ var TTS2 = class extends OpenAITTS {
2986
4575
  }
2987
4576
  };
2988
4577
 
4578
+ // src/tts/cartesia.ts
4579
+ init_esm_shims();
4580
+
2989
4581
  // src/providers/cartesia-tts.ts
4582
+ init_esm_shims();
2990
4583
  var CARTESIA_BASE_URL = "https://api.cartesia.ai";
2991
4584
  var CARTESIA_API_VERSION = "2025-04-16";
2992
4585
  var CARTESIA_DEFAULT_VOICE_ID = "f786b574-daa5-4673-aa0c-cbe3e8534c02";
4586
+ var CartesiaTTSModel = {
4587
+ SONIC_3: "sonic-3",
4588
+ SONIC_2: "sonic-2",
4589
+ SONIC: "sonic"
4590
+ };
4591
+ var CartesiaTTSContainer = {
4592
+ RAW: "raw",
4593
+ WAV: "wav",
4594
+ MP3: "mp3"
4595
+ };
4596
+ var CartesiaTTSEncoding = {
4597
+ PCM_S16LE: "pcm_s16le",
4598
+ PCM_F32LE: "pcm_f32le",
4599
+ PCM_MULAW: "pcm_mulaw",
4600
+ PCM_ALAW: "pcm_alaw"
4601
+ };
4602
+ var CartesiaTTSSampleRate = {
4603
+ HZ_8000: 8e3,
4604
+ HZ_16000: 16e3,
4605
+ HZ_22050: 22050,
4606
+ HZ_24000: 24e3,
4607
+ HZ_44100: 44100
4608
+ };
4609
+ var CartesiaTTSVoiceMode = {
4610
+ ID: "id",
4611
+ EMBEDDING: "embedding"
4612
+ };
2993
4613
  var CartesiaTTS = class _CartesiaTTS {
2994
4614
  apiKey;
2995
4615
  model;
@@ -3003,10 +4623,10 @@ var CartesiaTTS = class _CartesiaTTS {
3003
4623
  apiVersion;
3004
4624
  constructor(apiKey, opts = {}) {
3005
4625
  this.apiKey = apiKey;
3006
- this.model = opts.model ?? "sonic-3";
4626
+ this.model = opts.model ?? CartesiaTTSModel.SONIC_3;
3007
4627
  this.voice = opts.voice ?? CARTESIA_DEFAULT_VOICE_ID;
3008
4628
  this.language = opts.language ?? "en";
3009
- this.sampleRate = opts.sampleRate ?? 16e3;
4629
+ this.sampleRate = opts.sampleRate ?? CartesiaTTSSampleRate.HZ_16000;
3010
4630
  this.speed = opts.speed;
3011
4631
  this.emotion = typeof opts.emotion === "string" ? [opts.emotion] : opts.emotion;
3012
4632
  this.volume = opts.volume;
@@ -3023,7 +4643,10 @@ var CartesiaTTS = class _CartesiaTTS {
3023
4643
  * removes a potential aliasing source.
3024
4644
  */
3025
4645
  static forTwilio(apiKey, options = {}) {
3026
- return new _CartesiaTTS(apiKey, { ...options, sampleRate: 8e3 });
4646
+ return new _CartesiaTTS(apiKey, {
4647
+ ...options,
4648
+ sampleRate: CartesiaTTSSampleRate.HZ_8000
4649
+ });
3027
4650
  }
3028
4651
  /**
3029
4652
  * Construct an instance pre-configured for Telnyx bidirectional media.
@@ -3034,17 +4657,20 @@ var CartesiaTTS = class _CartesiaTTS {
3034
4657
  * {@link CartesiaTTS.forTwilio}.
3035
4658
  */
3036
4659
  static forTelnyx(apiKey, options = {}) {
3037
- return new _CartesiaTTS(apiKey, { ...options, sampleRate: 16e3 });
4660
+ return new _CartesiaTTS(apiKey, {
4661
+ ...options,
4662
+ sampleRate: CartesiaTTSSampleRate.HZ_16000
4663
+ });
3038
4664
  }
3039
4665
  /** Build the JSON payload for the Cartesia bytes endpoint. */
3040
4666
  buildPayload(text) {
3041
4667
  const payload = {
3042
4668
  model_id: this.model,
3043
- voice: { mode: "id", id: this.voice },
4669
+ voice: { mode: CartesiaTTSVoiceMode.ID, id: this.voice },
3044
4670
  transcript: text,
3045
4671
  output_format: {
3046
- container: "raw",
3047
- encoding: "pcm_s16le",
4672
+ container: CartesiaTTSContainer.RAW,
4673
+ encoding: CartesiaTTSEncoding.PCM_S16LE,
3048
4674
  sample_rate: this.sampleRate
3049
4675
  },
3050
4676
  language: this.language
@@ -3108,7 +4734,7 @@ var CartesiaTTS = class _CartesiaTTS {
3108
4734
  };
3109
4735
 
3110
4736
  // src/tts/cartesia.ts
3111
- function resolveApiKey2(apiKey) {
4737
+ function resolveApiKey3(apiKey) {
3112
4738
  const key = apiKey ?? process.env.CARTESIA_API_KEY;
3113
4739
  if (!key) {
3114
4740
  throw new Error(
@@ -3117,10 +4743,10 @@ function resolveApiKey2(apiKey) {
3117
4743
  }
3118
4744
  return key;
3119
4745
  }
3120
- var TTS3 = class _TTS extends CartesiaTTS {
4746
+ var TTS4 = class _TTS extends CartesiaTTS {
3121
4747
  static providerKey = "cartesia_tts";
3122
4748
  constructor(opts = {}) {
3123
- const key = resolveApiKey2(opts.apiKey);
4749
+ const key = resolveApiKey3(opts.apiKey);
3124
4750
  const { apiKey: _ignored, ...rest } = opts;
3125
4751
  void _ignored;
3126
4752
  super(key, rest);
@@ -3135,15 +4761,30 @@ var TTS3 = class _TTS extends CartesiaTTS {
3135
4761
  }
3136
4762
  };
3137
4763
 
4764
+ // src/tts/rime.ts
4765
+ init_esm_shims();
4766
+
3138
4767
  // src/providers/rime-tts.ts
4768
+ init_esm_shims();
3139
4769
  var RIME_BASE_URL = "https://users.rime.ai/v1/rime-tts";
4770
+ var RimeModel = {
4771
+ ARCANA: "arcana",
4772
+ MIST: "mist",
4773
+ MIST_V2: "mistv2"
4774
+ };
4775
+ var RimeAudioFormat = {
4776
+ PCM: "audio/pcm",
4777
+ MP3: "audio/mp3",
4778
+ WAV: "audio/wav",
4779
+ MULAW: "audio/mulaw"
4780
+ };
3140
4781
  var ARCANA_MODEL_TIMEOUT_MS = 60 * 4 * 1e3;
3141
4782
  var MIST_MODEL_TIMEOUT_MS = 30 * 1e3;
3142
4783
  function isMistModel(model) {
3143
- return model.includes("mist");
4784
+ return model.includes(RimeModel.MIST);
3144
4785
  }
3145
4786
  function timeoutForModel(model) {
3146
- if (model === "arcana") return ARCANA_MODEL_TIMEOUT_MS;
4787
+ if (model === RimeModel.ARCANA) return ARCANA_MODEL_TIMEOUT_MS;
3147
4788
  return MIST_MODEL_TIMEOUT_MS;
3148
4789
  }
3149
4790
  var RimeTTS = class {
@@ -3164,7 +4805,7 @@ var RimeTTS = class {
3164
4805
  totalTimeoutMs;
3165
4806
  constructor(apiKey, opts = {}) {
3166
4807
  this.apiKey = apiKey;
3167
- this.model = opts.model ?? "arcana";
4808
+ this.model = opts.model ?? RimeModel.ARCANA;
3168
4809
  const defaultSpeaker = isMistModel(this.model) ? "cove" : "astra";
3169
4810
  this.speaker = opts.speaker ?? defaultSpeaker;
3170
4811
  this.lang = opts.lang ?? "eng";
@@ -3186,7 +4827,7 @@ var RimeTTS = class {
3186
4827
  text,
3187
4828
  modelId: this.model
3188
4829
  };
3189
- if (this.model === "arcana") {
4830
+ if (this.model === RimeModel.ARCANA) {
3190
4831
  if (this.repetitionPenalty !== void 0)
3191
4832
  payload.repetition_penalty = this.repetitionPenalty;
3192
4833
  if (this.temperature !== void 0) payload.temperature = this.temperature;
@@ -3198,7 +4839,7 @@ var RimeTTS = class {
3198
4839
  payload.lang = this.lang;
3199
4840
  payload.samplingRate = this.sampleRate;
3200
4841
  if (this.speedAlpha !== void 0) payload.speedAlpha = this.speedAlpha;
3201
- if (this.model === "mistv2" && this.reduceLatency !== void 0) {
4842
+ if (this.model === RimeModel.MIST_V2 && this.reduceLatency !== void 0) {
3202
4843
  payload.reduceLatency = this.reduceLatency;
3203
4844
  }
3204
4845
  if (this.pauseBetweenBrackets !== void 0) {
@@ -3210,6 +4851,7 @@ var RimeTTS = class {
3210
4851
  }
3211
4852
  return payload;
3212
4853
  }
4854
+ /** Synthesize text and return the concatenated audio buffer. */
3213
4855
  async synthesize(text) {
3214
4856
  const chunks = [];
3215
4857
  for await (const chunk of this.synthesizeStream(text)) {
@@ -3225,7 +4867,7 @@ var RimeTTS = class {
3225
4867
  const response = await fetch(this.baseUrl, {
3226
4868
  method: "POST",
3227
4869
  headers: {
3228
- accept: "audio/pcm",
4870
+ accept: RimeAudioFormat.PCM,
3229
4871
  Authorization: `Bearer ${this.apiKey}`,
3230
4872
  "content-type": "application/json"
3231
4873
  },
@@ -3263,7 +4905,7 @@ var RimeTTS = class {
3263
4905
  };
3264
4906
 
3265
4907
  // src/tts/rime.ts
3266
- var TTS4 = class extends RimeTTS {
4908
+ var TTS5 = class extends RimeTTS {
3267
4909
  static providerKey = "rime";
3268
4910
  constructor(opts = {}) {
3269
4911
  const key = opts.apiKey ?? process.env.RIME_API_KEY;
@@ -3278,8 +4920,28 @@ var TTS4 = class extends RimeTTS {
3278
4920
  }
3279
4921
  };
3280
4922
 
4923
+ // src/tts/lmnt.ts
4924
+ init_esm_shims();
4925
+
3281
4926
  // src/providers/lmnt-tts.ts
4927
+ init_esm_shims();
3282
4928
  var LMNT_BASE_URL = "https://api.lmnt.com/v1/ai/speech/bytes";
4929
+ var LMNTAudioFormat = {
4930
+ AAC: "aac",
4931
+ MP3: "mp3",
4932
+ MULAW: "mulaw",
4933
+ RAW: "raw",
4934
+ WAV: "wav"
4935
+ };
4936
+ var LMNTModel = {
4937
+ BLIZZARD: "blizzard",
4938
+ AURORA: "aurora"
4939
+ };
4940
+ var LMNTSampleRate = {
4941
+ HZ_8000: 8e3,
4942
+ HZ_16000: 16e3,
4943
+ HZ_24000: 24e3
4944
+ };
3283
4945
  var LMNTTTS = class {
3284
4946
  apiKey;
3285
4947
  model;
@@ -3292,11 +4954,11 @@ var LMNTTTS = class {
3292
4954
  baseUrl;
3293
4955
  constructor(apiKey, opts = {}) {
3294
4956
  this.apiKey = apiKey;
3295
- this.model = opts.model ?? "blizzard";
4957
+ this.model = opts.model ?? LMNTModel.BLIZZARD;
3296
4958
  this.voice = opts.voice ?? "leah";
3297
- this.language = opts.language ?? (this.model === "blizzard" ? "auto" : "en");
3298
- this.format = opts.format ?? "raw";
3299
- this.sampleRate = opts.sampleRate ?? 16e3;
4959
+ this.language = opts.language ?? (this.model === LMNTModel.BLIZZARD ? "auto" : "en");
4960
+ this.format = opts.format ?? LMNTAudioFormat.RAW;
4961
+ this.sampleRate = opts.sampleRate ?? LMNTSampleRate.HZ_16000;
3300
4962
  this.temperature = opts.temperature ?? 1;
3301
4963
  this.topP = opts.topP ?? 0.8;
3302
4964
  this.baseUrl = opts.baseUrl ?? LMNT_BASE_URL;
@@ -3313,6 +4975,131 @@ var LMNTTTS = class {
3313
4975
  top_p: this.topP
3314
4976
  };
3315
4977
  }
4978
+ /** Synthesize text and return the concatenated audio buffer. */
4979
+ async synthesize(text) {
4980
+ const chunks = [];
4981
+ for await (const chunk of this.synthesizeStream(text)) {
4982
+ chunks.push(chunk);
4983
+ }
4984
+ return Buffer.concat(chunks);
4985
+ }
4986
+ /** Yield audio chunks as they arrive — raw PCM_S16LE by default. */
4987
+ async *synthesizeStream(text) {
4988
+ const response = await fetch(this.baseUrl, {
4989
+ method: "POST",
4990
+ headers: {
4991
+ "Content-Type": "application/json",
4992
+ "X-API-Key": this.apiKey
4993
+ },
4994
+ body: JSON.stringify(this.buildPayload(text)),
4995
+ signal: AbortSignal.timeout(3e4)
4996
+ });
4997
+ if (!response.ok) {
4998
+ const body = await response.text();
4999
+ throw new Error(`LMNT TTS error ${response.status}: ${body}`);
5000
+ }
5001
+ if (!response.body) {
5002
+ throw new Error("LMNT TTS: no response body");
5003
+ }
5004
+ const reader = response.body.getReader();
5005
+ try {
5006
+ while (true) {
5007
+ const { done, value } = await reader.read();
5008
+ if (done) break;
5009
+ if (value && value.length > 0) {
5010
+ yield Buffer.from(value);
5011
+ }
5012
+ }
5013
+ } finally {
5014
+ if (typeof reader.cancel === "function")
5015
+ await reader.cancel().catch(() => {
5016
+ });
5017
+ reader.releaseLock();
5018
+ }
5019
+ }
5020
+ };
5021
+
5022
+ // src/tts/lmnt.ts
5023
+ var TTS6 = class extends LMNTTTS {
5024
+ static providerKey = "lmnt";
5025
+ constructor(opts = {}) {
5026
+ const key = opts.apiKey ?? process.env.LMNT_API_KEY;
5027
+ if (!key) {
5028
+ throw new Error(
5029
+ "LMNT TTS requires an apiKey. Pass { apiKey: '...' } or set LMNT_API_KEY in the environment."
5030
+ );
5031
+ }
5032
+ const { apiKey: _ignored, ...rest } = opts;
5033
+ void _ignored;
5034
+ super(key, rest);
5035
+ }
5036
+ };
5037
+
5038
+ // src/tts/inworld.ts
5039
+ init_esm_shims();
5040
+
5041
+ // src/providers/inworld-tts.ts
5042
+ init_esm_shims();
5043
+ var INWORLD_BASE_URL = "https://api.inworld.ai/tts/v1/voice:stream";
5044
+ var InworldModel = {
5045
+ TTS_2: "inworld-tts-2",
5046
+ TTS_1_5_MAX: "inworld-tts-1.5-max",
5047
+ TTS_1_5_MINI: "inworld-tts-1.5-mini",
5048
+ TTS_1_MAX: "inworld-tts-1-max",
5049
+ TTS_1: "inworld-tts-1"
5050
+ };
5051
+ var InworldAudioEncoding = {
5052
+ PCM: "PCM",
5053
+ LINEAR16: "LINEAR16",
5054
+ OGG_OPUS: "OGG_OPUS",
5055
+ MP3: "MP3"
5056
+ };
5057
+ var InworldTTS = class {
5058
+ authToken;
5059
+ model;
5060
+ voice;
5061
+ language;
5062
+ audioEncoding;
5063
+ sampleRate;
5064
+ bitrate;
5065
+ temperature;
5066
+ speakingRate;
5067
+ deliveryMode;
5068
+ baseUrl;
5069
+ constructor(authToken, opts = {}) {
5070
+ if (!authToken) {
5071
+ throw new Error("Inworld TTS: authToken is required");
5072
+ }
5073
+ this.authToken = authToken;
5074
+ this.model = opts.model ?? InworldModel.TTS_2;
5075
+ this.voice = opts.voice ?? "Ashley";
5076
+ this.language = opts.language;
5077
+ this.audioEncoding = opts.audioEncoding ?? InworldAudioEncoding.PCM;
5078
+ this.sampleRate = opts.sampleRate ?? 16e3;
5079
+ this.bitrate = opts.bitrate ?? 64e3;
5080
+ this.temperature = opts.temperature;
5081
+ this.speakingRate = opts.speakingRate ?? 1;
5082
+ this.deliveryMode = opts.deliveryMode;
5083
+ this.baseUrl = opts.baseUrl ?? INWORLD_BASE_URL;
5084
+ }
5085
+ buildPayload(text) {
5086
+ const payload = {
5087
+ text,
5088
+ voiceId: this.voice,
5089
+ modelId: this.model,
5090
+ audioConfig: {
5091
+ audioEncoding: this.audioEncoding,
5092
+ bitrate: this.bitrate,
5093
+ sampleRateHertz: this.sampleRate
5094
+ },
5095
+ speakingRate: this.speakingRate
5096
+ };
5097
+ if (this.language !== void 0) payload.language = this.language;
5098
+ if (this.temperature !== void 0) payload.temperature = this.temperature;
5099
+ if (this.deliveryMode !== void 0) payload.deliveryMode = this.deliveryMode;
5100
+ return payload;
5101
+ }
5102
+ /** Synthesize text and return the concatenated audio buffer. */
3316
5103
  async synthesize(text) {
3317
5104
  const chunks = [];
3318
5105
  for await (const chunk of this.synthesizeStream(text)) {
@@ -3320,32 +5107,51 @@ var LMNTTTS = class {
3320
5107
  }
3321
5108
  return Buffer.concat(chunks);
3322
5109
  }
3323
- /** Yield audio chunks as they arrive — raw PCM_S16LE by default. */
5110
+ /**
5111
+ * Yield audio chunks as they arrive. With the default `audioEncoding=PCM`
5112
+ * these are raw PCM_S16LE bytes at `sampleRate`.
5113
+ */
3324
5114
  async *synthesizeStream(text) {
3325
5115
  const response = await fetch(this.baseUrl, {
3326
5116
  method: "POST",
3327
5117
  headers: {
3328
5118
  "Content-Type": "application/json",
3329
- "X-API-Key": this.apiKey
5119
+ Authorization: `Basic ${this.authToken}`
3330
5120
  },
3331
5121
  body: JSON.stringify(this.buildPayload(text)),
3332
- signal: AbortSignal.timeout(3e4)
5122
+ signal: AbortSignal.timeout(6e4)
3333
5123
  });
3334
5124
  if (!response.ok) {
3335
5125
  const body = await response.text();
3336
- throw new Error(`LMNT TTS error ${response.status}: ${body}`);
5126
+ throw new Error(`Inworld TTS error ${response.status}: ${body}`);
3337
5127
  }
3338
5128
  if (!response.body) {
3339
- throw new Error("LMNT TTS: no response body");
5129
+ throw new Error("Inworld TTS: no response body");
3340
5130
  }
3341
5131
  const reader = response.body.getReader();
5132
+ const decoder = new TextDecoder("utf-8");
5133
+ let buffered = "";
3342
5134
  try {
3343
5135
  while (true) {
3344
5136
  const { done, value } = await reader.read();
3345
- if (done) break;
3346
- if (value && value.length > 0) {
3347
- yield Buffer.from(value);
5137
+ if (done) {
5138
+ buffered += decoder.decode();
5139
+ break;
3348
5140
  }
5141
+ buffered += decoder.decode(value, { stream: true });
5142
+ let newlineIdx;
5143
+ while ((newlineIdx = buffered.indexOf("\n")) >= 0) {
5144
+ const line = buffered.slice(0, newlineIdx).trim();
5145
+ buffered = buffered.slice(newlineIdx + 1);
5146
+ if (!line) continue;
5147
+ const audio = decodeNdjsonLine(line);
5148
+ if (audio && audio.length > 0) yield audio;
5149
+ }
5150
+ }
5151
+ const tail = buffered.trim();
5152
+ if (tail) {
5153
+ const audio = decodeNdjsonLine(tail);
5154
+ if (audio && audio.length > 0) yield audio;
3349
5155
  }
3350
5156
  } finally {
3351
5157
  if (typeof reader.cancel === "function")
@@ -3355,15 +5161,28 @@ var LMNTTTS = class {
3355
5161
  }
3356
5162
  }
3357
5163
  };
5164
+ function decodeNdjsonLine(line) {
5165
+ let parsed;
5166
+ try {
5167
+ parsed = JSON.parse(line);
5168
+ } catch {
5169
+ return null;
5170
+ }
5171
+ if (typeof parsed !== "object" || parsed === null) return null;
5172
+ const result = parsed.result;
5173
+ const audioB64 = result?.audioContent;
5174
+ if (typeof audioB64 !== "string" || audioB64.length === 0) return null;
5175
+ return Buffer.from(audioB64, "base64");
5176
+ }
3358
5177
 
3359
- // src/tts/lmnt.ts
3360
- var TTS5 = class extends LMNTTTS {
3361
- static providerKey = "lmnt";
5178
+ // src/tts/inworld.ts
5179
+ var TTS7 = class extends InworldTTS {
5180
+ static providerKey = "inworld";
3362
5181
  constructor(opts = {}) {
3363
- const key = opts.apiKey ?? process.env.LMNT_API_KEY;
5182
+ const key = opts.apiKey ?? process.env.INWORLD_API_KEY;
3364
5183
  if (!key) {
3365
5184
  throw new Error(
3366
- "LMNT TTS requires an apiKey. Pass { apiKey: '...' } or set LMNT_API_KEY in the environment."
5185
+ "Inworld TTS requires an apiKey. Pass { apiKey: '...' } or set INWORLD_API_KEY in the environment."
3367
5186
  );
3368
5187
  }
3369
5188
  const { apiKey: _ignored, ...rest } = opts;
@@ -3373,6 +5192,7 @@ var TTS5 = class extends LMNTTTS {
3373
5192
  };
3374
5193
 
3375
5194
  // src/llm/openai.ts
5195
+ init_esm_shims();
3376
5196
  var LLM = class extends OpenAILLMProvider {
3377
5197
  static providerKey = "openai";
3378
5198
  constructor(opts = {}) {
@@ -3397,10 +5217,24 @@ var LLM = class extends OpenAILLMProvider {
3397
5217
  }
3398
5218
  };
3399
5219
 
5220
+ // src/llm/anthropic.ts
5221
+ init_esm_shims();
5222
+
3400
5223
  // src/providers/anthropic-llm.ts
5224
+ init_esm_shims();
3401
5225
  var DEFAULT_ANTHROPIC_URL = "https://api.anthropic.com/v1/messages";
3402
5226
  var DEFAULT_ANTHROPIC_VERSION = "2023-06-01";
3403
- var DEFAULT_MODEL = "claude-haiku-4-5-20251001";
5227
+ var AnthropicModel = {
5228
+ CLAUDE_HAIKU_4_5_ALIAS: "claude-haiku-4-5",
5229
+ CLAUDE_SONNET_4_6_ALIAS: "claude-sonnet-4-6",
5230
+ CLAUDE_OPUS_4_7_ALIAS: "claude-opus-4-7",
5231
+ CLAUDE_3_5_SONNET_ALIAS: "claude-3-5-sonnet-latest",
5232
+ CLAUDE_3_5_HAIKU_ALIAS: "claude-3-5-haiku-latest",
5233
+ CLAUDE_HAIKU_4_5_20251001: "claude-haiku-4-5-20251001",
5234
+ CLAUDE_3_5_SONNET_20241022: "claude-3-5-sonnet-20241022",
5235
+ CLAUDE_3_5_HAIKU_20241022: "claude-3-5-haiku-20241022"
5236
+ };
5237
+ var DEFAULT_MODEL = AnthropicModel.CLAUDE_HAIKU_4_5_20251001;
3404
5238
  var DEFAULT_MAX_TOKENS = 1024;
3405
5239
  var PROMPT_CACHING_BETA = "prompt-caching-2024-07-31";
3406
5240
  var AnthropicLLMProvider = class {
@@ -3425,7 +5259,8 @@ var AnthropicLLMProvider = class {
3425
5259
  this.anthropicVersion = options.anthropicVersion ?? DEFAULT_ANTHROPIC_VERSION;
3426
5260
  this.promptCaching = options.promptCaching ?? true;
3427
5261
  }
3428
- async *stream(messages, tools) {
5262
+ /** Stream Patter-format LLM chunks for the given OpenAI-style chat history. */
5263
+ async *stream(messages, tools, opts) {
3429
5264
  const { system, messages: anthropicMessages } = toAnthropicMessages(messages);
3430
5265
  const anthropicTools = tools ? toAnthropicTools(tools) : null;
3431
5266
  const body = {
@@ -3473,7 +5308,7 @@ var AnthropicLLMProvider = class {
3473
5308
  method: "POST",
3474
5309
  headers,
3475
5310
  body: JSON.stringify(body),
3476
- signal: AbortSignal.timeout(3e4)
5311
+ signal: mergeAbortSignals(opts?.signal, AbortSignal.timeout(3e4))
3477
5312
  });
3478
5313
  if (!response.ok) {
3479
5314
  const errText = await response.text();
@@ -3636,12 +5471,28 @@ var LLM2 = class extends AnthropicLLMProvider {
3636
5471
  }
3637
5472
  };
3638
5473
 
5474
+ // src/llm/groq.ts
5475
+ init_esm_shims();
5476
+
5477
+ // src/providers/groq-llm.ts
5478
+ init_esm_shims();
5479
+
3639
5480
  // src/version.ts
3640
- var VERSION = "0.5.3";
5481
+ init_esm_shims();
5482
+ var VERSION = "0.5.5";
3641
5483
 
3642
5484
  // src/providers/groq-llm.ts
3643
5485
  var GROQ_BASE_URL = "https://api.groq.com/openai/v1";
3644
- var DEFAULT_MODEL2 = "llama-3.3-70b-versatile";
5486
+ var GroqModel = {
5487
+ LLAMA_3_3_70B_VERSATILE: "llama-3.3-70b-versatile",
5488
+ LLAMA_3_1_8B_INSTANT: "llama-3.1-8b-instant",
5489
+ LLAMA_3_3_70B_SPECDEC: "llama-3.3-70b-specdec",
5490
+ LLAMA_3_70B: "llama3-70b-8192",
5491
+ LLAMA_3_8B: "llama3-8b-8192",
5492
+ MIXTRAL_8X7B: "mixtral-8x7b-32768",
5493
+ GEMMA2_9B: "gemma2-9b-it"
5494
+ };
5495
+ var DEFAULT_MODEL2 = GroqModel.LLAMA_3_3_70B_VERSATILE;
3645
5496
  var GroqLLMProvider = class {
3646
5497
  apiKey;
3647
5498
  model;
@@ -3676,7 +5527,8 @@ var GroqLLMProvider = class {
3676
5527
  this.presencePenalty = options.presencePenalty;
3677
5528
  this.stop = options.stop;
3678
5529
  }
3679
- async *stream(messages, tools) {
5530
+ /** Stream Patter-format LLM chunks from the Groq chat completions API. */
5531
+ async *stream(messages, tools, opts) {
3680
5532
  const body = {
3681
5533
  model: this.model,
3682
5534
  messages,
@@ -3704,7 +5556,7 @@ var GroqLLMProvider = class {
3704
5556
  "User-Agent": `getpatter/${VERSION}`
3705
5557
  },
3706
5558
  body: JSON.stringify(body),
3707
- signal: AbortSignal.timeout(3e4)
5559
+ signal: mergeAbortSignals(opts?.signal, AbortSignal.timeout(3e4))
3708
5560
  });
3709
5561
  if (!response.ok) {
3710
5562
  const errText = await response.text();
@@ -3794,9 +5646,20 @@ var LLM3 = class extends GroqLLMProvider {
3794
5646
  }
3795
5647
  };
3796
5648
 
5649
+ // src/llm/cerebras.ts
5650
+ init_esm_shims();
5651
+
3797
5652
  // src/providers/cerebras-llm.ts
5653
+ init_esm_shims();
3798
5654
  var CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1";
3799
- var DEFAULT_MODEL3 = "llama3.1-8b";
5655
+ var CerebrasModel = {
5656
+ GPT_OSS_120B: "gpt-oss-120b",
5657
+ LLAMA_3_1_8B: "llama3.1-8b",
5658
+ LLAMA_3_3_70B: "llama-3.3-70b",
5659
+ QWEN_3_235B_INSTRUCT: "qwen-3-235b-a22b-instruct-2507",
5660
+ ZAI_GLM_4_7: "zai-glm-4.7"
5661
+ };
5662
+ var DEFAULT_MODEL3 = CerebrasModel.GPT_OSS_120B;
3800
5663
  var RETRY_BACKOFF_BASE_MS = 500;
3801
5664
  var CerebrasLLMProvider = class {
3802
5665
  apiKey;
@@ -3834,7 +5697,8 @@ var CerebrasLLMProvider = class {
3834
5697
  this.presencePenalty = options.presencePenalty;
3835
5698
  this.stop = options.stop;
3836
5699
  }
3837
- async *stream(messages, tools) {
5700
+ /** Stream Patter-format LLM chunks from the Cerebras chat completions API. */
5701
+ async *stream(messages, tools, opts) {
3838
5702
  const body = {
3839
5703
  model: this.model,
3840
5704
  messages,
@@ -3876,7 +5740,7 @@ var CerebrasLLMProvider = class {
3876
5740
  method: "POST",
3877
5741
  headers,
3878
5742
  body: payload,
3879
- signal: AbortSignal.timeout(3e4)
5743
+ signal: mergeAbortSignals(opts?.signal, AbortSignal.timeout(3e4))
3880
5744
  });
3881
5745
  if (response.ok) {
3882
5746
  yield* parseOpenAISseStream(response);
@@ -3979,8 +5843,20 @@ var LLM4 = class extends CerebrasLLMProvider {
3979
5843
  }
3980
5844
  };
3981
5845
 
5846
+ // src/llm/google.ts
5847
+ init_esm_shims();
5848
+
3982
5849
  // src/providers/google-llm.ts
3983
- var DEFAULT_MODEL4 = "gemini-2.5-flash";
5850
+ init_esm_shims();
5851
+ var GoogleModel = {
5852
+ GEMINI_2_5_FLASH: "gemini-2.5-flash",
5853
+ GEMINI_2_5_PRO: "gemini-2.5-pro",
5854
+ GEMINI_2_0_FLASH: "gemini-2.0-flash",
5855
+ GEMINI_2_0_FLASH_LITE: "gemini-2.0-flash-lite",
5856
+ GEMINI_1_5_FLASH: "gemini-1.5-flash",
5857
+ GEMINI_1_5_PRO: "gemini-1.5-pro"
5858
+ };
5859
+ var DEFAULT_MODEL4 = GoogleModel.GEMINI_2_5_FLASH;
3984
5860
  var DEFAULT_BASE_URL3 = "https://generativelanguage.googleapis.com/v1beta";
3985
5861
  var GoogleLLMProvider = class {
3986
5862
  apiKey;
@@ -4000,7 +5876,8 @@ var GoogleLLMProvider = class {
4000
5876
  this.temperature = options.temperature;
4001
5877
  this.maxOutputTokens = options.maxOutputTokens;
4002
5878
  }
4003
- async *stream(messages, tools) {
5879
+ /** Stream Patter-format LLM chunks from the Gemini SSE endpoint. */
5880
+ async *stream(messages, tools, opts) {
4004
5881
  const { systemInstruction, contents } = toGeminiContents(messages);
4005
5882
  const geminiTools = tools ? toGeminiTools(tools) : null;
4006
5883
  const body = { contents };
@@ -4018,7 +5895,7 @@ var GoogleLLMProvider = class {
4018
5895
  method: "POST",
4019
5896
  headers: { "Content-Type": "application/json" },
4020
5897
  body: JSON.stringify(body),
4021
- signal: AbortSignal.timeout(3e4)
5898
+ signal: mergeAbortSignals(opts?.signal, AbortSignal.timeout(3e4))
4022
5899
  });
4023
5900
  if (!response.ok) {
4024
5901
  const errText = await response.text();
@@ -4188,280 +6065,8 @@ var LLM5 = class extends GoogleLLMProvider {
4188
6065
  }
4189
6066
  };
4190
6067
 
4191
- // src/providers/silero-vad.ts
4192
- import { createRequire } from "module";
4193
- import * as fs from "fs";
4194
- import * as path from "path";
4195
- import { fileURLToPath } from "url";
4196
- var SUPPORTED_SAMPLE_RATES = [8e3, 16e3];
4197
- function resolveModuleDir() {
4198
- try {
4199
- const cjsDir = new Function("return typeof __dirname !== 'undefined' ? __dirname : null")();
4200
- if (typeof cjsDir === "string") return cjsDir;
4201
- } catch {
4202
- }
4203
- try {
4204
- const url = import.meta.url;
4205
- if (url) return path.dirname(fileURLToPath(url));
4206
- } catch {
4207
- }
4208
- return process.cwd();
4209
- }
4210
- var MODULE_DIR = resolveModuleDir();
4211
- function resolveDefaultModelPath() {
4212
- const candidates = [
4213
- path.join(MODULE_DIR, "resources", "silero_vad.onnx"),
4214
- path.join(MODULE_DIR, "..", "resources", "silero_vad.onnx")
4215
- ];
4216
- for (const c of candidates) if (fs.existsSync(c)) return c;
4217
- return candidates[0];
4218
- }
4219
- var DEFAULT_MODEL_PATH = resolveDefaultModelPath();
4220
- async function loadOnnxRuntime() {
4221
- let firstErr;
4222
- try {
4223
- const mod = await import("./dist-YRCCJQ26.mjs");
4224
- return mod;
4225
- } catch (e) {
4226
- firstErr = e;
4227
- }
4228
- try {
4229
- const req = createRequire(path.join(process.cwd(), "package.json"));
4230
- return req("onnxruntime-node");
4231
- } catch (e) {
4232
- const detail = e?.message ?? String(e);
4233
- const original = firstErr?.message ?? String(firstErr);
4234
- throw new Error(
4235
- `
4236
- SileroVAD requires the "onnxruntime-node" package, which could not be resolved.
4237
-
4238
- Install: npm install onnxruntime-node
4239
-
4240
- This is an optional peer dependency of getpatter (~210 MB) \u2014 it is only
4241
- needed when you use SileroVAD in pipeline mode.
4242
-
4243
- import() failed: ${original}
4244
- cwd-require failed: ${detail}
4245
- `
4246
- );
4247
- }
4248
- }
4249
- var ExpFilter = class {
4250
- constructor(alpha) {
4251
- this.alpha = alpha;
4252
- if (!(alpha > 0 && alpha <= 1)) {
4253
- throw new Error("alpha must be in (0, 1].");
4254
- }
4255
- }
4256
- filtered = null;
4257
- apply(exp, sample) {
4258
- if (this.filtered === null) {
4259
- this.filtered = sample;
4260
- } else {
4261
- const a = Math.pow(this.alpha, exp);
4262
- this.filtered = a * this.filtered + (1 - a) * sample;
4263
- }
4264
- return this.filtered;
4265
- }
4266
- reset() {
4267
- this.filtered = null;
4268
- }
4269
- };
4270
- var OnnxModel = class {
4271
- constructor(runtime, session, sampleRate) {
4272
- this.runtime = runtime;
4273
- this.session = session;
4274
- if (!SUPPORTED_SAMPLE_RATES.includes(sampleRate)) {
4275
- throw new Error("Silero VAD only supports 8KHz and 16KHz sample rates");
4276
- }
4277
- this.sampleRate = sampleRate;
4278
- this.windowSizeSamples = sampleRate === 8e3 ? 256 : 512;
4279
- this.contextSize = sampleRate === 8e3 ? 32 : 64;
4280
- this.context = new Float32Array(this.contextSize);
4281
- this.rnnState = new Float32Array(2 * 1 * 128);
4282
- this.inputBuffer = new Float32Array(this.contextSize + this.windowSizeSamples);
4283
- this.sampleRateTensor = BigInt64Array.from([BigInt(sampleRate)]);
4284
- }
4285
- sampleRate;
4286
- windowSizeSamples;
4287
- contextSize;
4288
- context;
4289
- rnnState;
4290
- inputBuffer;
4291
- sampleRateTensor;
4292
- async run(window) {
4293
- if (window.length !== this.windowSizeSamples) {
4294
- throw new Error(
4295
- `window must have exactly ${this.windowSizeSamples} samples, got ${window.length}`
4296
- );
4297
- }
4298
- this.inputBuffer.set(this.context, 0);
4299
- this.inputBuffer.set(window, this.contextSize);
4300
- const { Tensor } = this.runtime;
4301
- const feeds = {
4302
- input: new Tensor("float32", this.inputBuffer, [1, this.inputBuffer.length]),
4303
- state: new Tensor("float32", this.rnnState, [2, 1, 128]),
4304
- sr: new Tensor("int64", this.sampleRateTensor, [])
4305
- };
4306
- const results = await this.session.run(feeds);
4307
- const outputKey = Object.keys(results).find((k) => k !== "stateN") ?? "output";
4308
- const stateKey = "stateN" in results ? "stateN" : Object.keys(results).find((k) => k !== outputKey);
4309
- const out = results[outputKey];
4310
- const newState = stateKey ? results[stateKey] : void 0;
4311
- if (newState && newState.data instanceof Float32Array) {
4312
- this.rnnState = Float32Array.from(newState.data);
4313
- }
4314
- this.context = this.inputBuffer.slice(-this.contextSize);
4315
- const data = out.data;
4316
- return data[0] ?? 0;
4317
- }
4318
- };
4319
- var SileroVAD = class _SileroVAD {
4320
- constructor(model, opts) {
4321
- this.model = model;
4322
- this.opts = opts;
4323
- }
4324
- pending = new Float32Array(0);
4325
- expFilter = new ExpFilter(0.35);
4326
- pubSpeaking = false;
4327
- speechThresholdDuration = 0;
4328
- silenceThresholdDuration = 0;
4329
- closed = false;
4330
- /**
4331
- * Load the Silero VAD model. Defaults match the LiveKit Silero plugin.
4332
- * Throws if `onnxruntime-node` is not installed.
4333
- */
4334
- static async load(options = {}) {
4335
- const sampleRate = options.sampleRate ?? 16e3;
4336
- if (!SUPPORTED_SAMPLE_RATES.includes(sampleRate)) {
4337
- throw new Error("Silero VAD only supports 8KHz and 16KHz sample rates");
4338
- }
4339
- const activationThreshold = options.activationThreshold ?? 0.5;
4340
- const deactivationThreshold = options.deactivationThreshold ?? Math.max(activationThreshold - 0.15, 0.01);
4341
- if (deactivationThreshold <= 0) {
4342
- throw new Error("deactivationThreshold must be greater than 0");
4343
- }
4344
- const runtime = await loadOnnxRuntime();
4345
- const modelPath = options.onnxFilePath ?? DEFAULT_MODEL_PATH;
4346
- const session = await runtime.InferenceSession.create(modelPath, {
4347
- interOpNumThreads: 1,
4348
- intraOpNumThreads: 1,
4349
- executionMode: "sequential",
4350
- executionProviders: options.forceCpu === false ? void 0 : ["cpu"]
4351
- });
4352
- const model = new OnnxModel(runtime, session, sampleRate);
4353
- return new _SileroVAD(model, {
4354
- minSpeechDuration: options.minSpeechDuration ?? 0.05,
4355
- minSilenceDuration: options.minSilenceDuration ?? 0.55,
4356
- prefixPaddingDuration: options.prefixPaddingDuration ?? 0.5,
4357
- activationThreshold,
4358
- deactivationThreshold,
4359
- sampleRate
4360
- });
4361
- }
4362
- /**
4363
- * Internal factory used by tests — bypasses onnxruntime-node loading.
4364
- * @internal
4365
- */
4366
- static fromOnnxModel(runtime, session, options) {
4367
- const model = new OnnxModel(runtime, session, options.sampleRate);
4368
- return new _SileroVAD(model, options);
4369
- }
4370
- get sampleRate() {
4371
- return this.opts.sampleRate;
4372
- }
4373
- /**
4374
- * Number of int16 PCM samples that must be provided per call to
4375
- * processFrame for the model to run one inference window.
4376
- *
4377
- * Constraint (ported from LiveKit Agents / Silero ONNX spec):
4378
- * - 16 000 Hz → 512 samples (32 ms)
4379
- * - 8 000 Hz → 256 samples (32 ms)
4380
- *
4381
- * Callers that feed raw audio in fixed-size chunks (e.g. WebSocket frames)
4382
- * should buffer incoming audio until at least numFramesRequired() int16
4383
- * samples are available before calling processFrame. The provider
4384
- * internally buffers partial windows so smaller chunks are also safe, but
4385
- * passing exactly one window per call minimises heap allocation.
4386
- */
4387
- numFramesRequired() {
4388
- return this.opts.sampleRate === 8e3 ? 256 : 512;
4389
- }
4390
- async processFrame(pcmChunk, sampleRate) {
4391
- if (this.closed) {
4392
- throw new Error("SileroVAD is closed");
4393
- }
4394
- if (sampleRate !== this.opts.sampleRate) {
4395
- throw new Error(
4396
- `input sampleRate ${sampleRate} does not match model sampleRate ${this.opts.sampleRate}; resampling is not implemented in the Patter port`
4397
- );
4398
- }
4399
- if (pcmChunk.length === 0) {
4400
- return null;
4401
- }
4402
- const numSamples = Math.floor(pcmChunk.length / 2);
4403
- if (numSamples === 0) {
4404
- return null;
4405
- }
4406
- const samples = new Float32Array(numSamples);
4407
- for (let i = 0; i < numSamples; i++) {
4408
- samples[i] = pcmChunk.readInt16LE(i * 2) / 32767;
4409
- }
4410
- const merged = new Float32Array(this.pending.length + samples.length);
4411
- merged.set(this.pending, 0);
4412
- merged.set(samples, this.pending.length);
4413
- this.pending = merged;
4414
- const windowSize = this.model.windowSizeSamples;
4415
- let event = null;
4416
- while (this.pending.length >= windowSize) {
4417
- const window = this.pending.slice(0, windowSize);
4418
- this.pending = this.pending.slice(windowSize);
4419
- const rawP = await this.model.run(window);
4420
- const p = this.expFilter.apply(1, rawP);
4421
- const windowDuration = windowSize / this.opts.sampleRate;
4422
- const transition = this.advanceState(p, windowDuration);
4423
- if (transition !== null) {
4424
- event = transition;
4425
- }
4426
- }
4427
- return event;
4428
- }
4429
- advanceState(p, windowDuration) {
4430
- const opts = this.opts;
4431
- if (p >= opts.activationThreshold || this.pubSpeaking && p > opts.deactivationThreshold) {
4432
- this.speechThresholdDuration += windowDuration;
4433
- this.silenceThresholdDuration = 0;
4434
- if (!this.pubSpeaking) {
4435
- if (this.speechThresholdDuration >= opts.minSpeechDuration) {
4436
- this.pubSpeaking = true;
4437
- return {
4438
- type: "speech_start",
4439
- confidence: p,
4440
- durationMs: this.speechThresholdDuration * 1e3
4441
- };
4442
- }
4443
- }
4444
- } else {
4445
- this.silenceThresholdDuration += windowDuration;
4446
- this.speechThresholdDuration = 0;
4447
- if (this.pubSpeaking && this.silenceThresholdDuration >= opts.minSilenceDuration) {
4448
- this.pubSpeaking = false;
4449
- return {
4450
- type: "speech_end",
4451
- confidence: p,
4452
- durationMs: this.silenceThresholdDuration * 1e3
4453
- };
4454
- }
4455
- }
4456
- return null;
4457
- }
4458
- async close() {
4459
- if (this.closed) return;
4460
- this.closed = true;
4461
- }
4462
- };
4463
-
4464
- // src/carriers/twilio.ts
6068
+ // src/telephony/twilio.ts
6069
+ init_esm_shims();
4465
6070
  var Carrier = class {
4466
6071
  kind = "twilio";
4467
6072
  accountSid;
@@ -4484,7 +6089,8 @@ var Carrier = class {
4484
6089
  }
4485
6090
  };
4486
6091
 
4487
- // src/carriers/telnyx.ts
6092
+ // src/telephony/telnyx.ts
6093
+ init_esm_shims();
4488
6094
  var Carrier2 = class {
4489
6095
  kind = "telnyx";
4490
6096
  apiKey;
@@ -4511,6 +6117,7 @@ var Carrier2 = class {
4511
6117
  };
4512
6118
 
4513
6119
  // src/public-api.ts
6120
+ init_esm_shims();
4514
6121
  var DEFAULT_GUARDRAIL_REPLACEMENT = "I'm sorry, I can't respond to that.";
4515
6122
  var Guardrail = class {
4516
6123
  name;
@@ -4560,6 +6167,7 @@ function tool(opts) {
4560
6167
  }
4561
6168
 
4562
6169
  // src/chat-context.ts
6170
+ init_esm_shims();
4563
6171
  import { randomUUID } from "crypto";
4564
6172
  function generateId() {
4565
6173
  return randomUUID().replace(/-/g, "").slice(0, 12);
@@ -4585,21 +6193,25 @@ var ChatContext = class _ChatContext {
4585
6193
  // -------------------------------------------------------------------------
4586
6194
  // Add messages
4587
6195
  // -------------------------------------------------------------------------
6196
+ /** Append a user message and return the created `ChatMessage`. */
4588
6197
  addUser(content) {
4589
6198
  const msg = createMessage("user", content);
4590
6199
  this.items = [...this.items, msg];
4591
6200
  return msg;
4592
6201
  }
6202
+ /** Append an assistant message and return the created `ChatMessage`. */
4593
6203
  addAssistant(content) {
4594
6204
  const msg = createMessage("assistant", content);
4595
6205
  this.items = [...this.items, msg];
4596
6206
  return msg;
4597
6207
  }
6208
+ /** Append a system message and return the created `ChatMessage`. */
4598
6209
  addSystem(content) {
4599
6210
  const msg = createMessage("system", content);
4600
6211
  this.items = [...this.items, msg];
4601
6212
  return msg;
4602
6213
  }
6214
+ /** Append a tool-result message tied to a tool-call id. */
4603
6215
  addToolResult(content, toolCallId) {
4604
6216
  const msg = createMessage("tool", content, { toolCallId });
4605
6217
  this.items = [...this.items, msg];
@@ -4608,13 +6220,16 @@ var ChatContext = class _ChatContext {
4608
6220
  // -------------------------------------------------------------------------
4609
6221
  // Access
4610
6222
  // -------------------------------------------------------------------------
6223
+ /** Return a snapshot of all messages currently in the context. */
4611
6224
  getMessages() {
4612
6225
  return [...this.items];
4613
6226
  }
6227
+ /** Return the last `n` messages (or `[]` when `n <= 0`). */
4614
6228
  getLastN(n) {
4615
6229
  if (n <= 0) return [];
4616
6230
  return [...this.items.slice(-n)];
4617
6231
  }
6232
+ /** Number of messages currently in the context. */
4618
6233
  get length() {
4619
6234
  return this.items.length;
4620
6235
  }
@@ -4641,6 +6256,7 @@ var ChatContext = class _ChatContext {
4641
6256
  // -------------------------------------------------------------------------
4642
6257
  // Provider format conversion
4643
6258
  // -------------------------------------------------------------------------
6259
+ /** Convert the conversation to the OpenAI Chat Completions message format. */
4644
6260
  toOpenAI() {
4645
6261
  return this.items.map((msg) => {
4646
6262
  const result = {
@@ -4678,6 +6294,7 @@ var ChatContext = class _ChatContext {
4678
6294
  // -------------------------------------------------------------------------
4679
6295
  // Copy
4680
6296
  // -------------------------------------------------------------------------
6297
+ /** Return a new `ChatContext` with the same messages (independent storage). */
4681
6298
  copy() {
4682
6299
  const ctx = new _ChatContext();
4683
6300
  ctx.items = this.items.map((msg) => ({ ...msg }));
@@ -4686,9 +6303,11 @@ var ChatContext = class _ChatContext {
4686
6303
  // -------------------------------------------------------------------------
4687
6304
  // Serialization
4688
6305
  // -------------------------------------------------------------------------
6306
+ /** Serialize the context to a JSON-safe object. */
4689
6307
  toJSON() {
4690
6308
  return { messages: [...this.items] };
4691
6309
  }
6310
+ /** Reconstruct a `ChatContext` from the result of `toJSON()`. */
4692
6311
  static fromJSON(data) {
4693
6312
  const ctx = new _ChatContext();
4694
6313
  ctx.items = (data.messages ?? []).map((msg) => Object.freeze({ ...msg }));
@@ -4697,6 +6316,7 @@ var ChatContext = class _ChatContext {
4697
6316
  };
4698
6317
 
4699
6318
  // src/services/ivr.ts
6319
+ init_esm_shims();
4700
6320
  var DTMF_EVENTS = [
4701
6321
  "0",
4702
6322
  "1",
@@ -4772,16 +6392,19 @@ var TfidfLoopDetector = class {
4772
6392
  this.similarityThreshold = similarityThreshold;
4773
6393
  this.consecutiveThreshold = consecutiveThreshold;
4774
6394
  }
6395
+ /** Forget all previously observed chunks and reset the consecutive-hit counter. */
4775
6396
  reset() {
4776
6397
  this.chunks = [];
4777
6398
  this.consecutiveSimilar = 0;
4778
6399
  }
6400
+ /** Record a new transcript chunk in the rolling window. */
4779
6401
  addChunk(text) {
4780
6402
  this.chunks.push({ text, vec: bagOfWords(text) });
4781
6403
  if (this.chunks.length > this.windowSize) {
4782
6404
  this.chunks = this.chunks.slice(-this.windowSize);
4783
6405
  }
4784
6406
  }
6407
+ /** Returns true once the most recent chunks look like a repeated IVR prompt. */
4785
6408
  checkLoopDetection() {
4786
6409
  if (this.chunks.length < 2) return false;
4787
6410
  const last = this.chunks[this.chunks.length - 1];
@@ -4803,6 +6426,8 @@ var DebouncedCall = class {
4803
6426
  this.callback = callback;
4804
6427
  this.delayMs = delayMs;
4805
6428
  }
6429
+ callback;
6430
+ delayMs;
4806
6431
  timer = null;
4807
6432
  schedule() {
4808
6433
  this.cancel();
@@ -4848,13 +6473,16 @@ var IVRActivity = class {
4848
6473
  this.maxSilenceDurationMs
4849
6474
  );
4850
6475
  }
6476
+ /** Begin tracking transcripts and silence; call once per call. */
4851
6477
  async start() {
4852
6478
  this.started = true;
4853
6479
  }
6480
+ /** Stop tracking and cancel any pending silence timer. */
4854
6481
  async stop() {
4855
6482
  this.debouncedSilence.cancel();
4856
6483
  this.started = false;
4857
6484
  }
6485
+ /** Feed a final user-side transcript chunk into the loop detector. */
4858
6486
  async onUserTranscribed(text) {
4859
6487
  if (!this.started || !text) return;
4860
6488
  if (this.loopDetector !== null) {
@@ -4871,14 +6499,17 @@ var IVRActivity = class {
4871
6499
  }
4872
6500
  }
4873
6501
  }
6502
+ /** Record the current user-turn state (e.g. `"listening"`, `"away"`). */
4874
6503
  noteUserState(state) {
4875
6504
  this.currentUserState = state;
4876
6505
  this.scheduleSilenceCheck();
4877
6506
  }
6507
+ /** Record the current agent-turn state (e.g. `"idle"`, `"listening"`). */
4878
6508
  noteAgentState(state) {
4879
6509
  this.currentAgentState = state;
4880
6510
  this.scheduleSilenceCheck();
4881
6511
  }
6512
+ /** Tool definitions to expose to the LLM (currently only `send_dtmf_events`). */
4882
6513
  get tools() {
4883
6514
  return [this.buildSendDtmfTool()];
4884
6515
  }
@@ -4952,10 +6583,11 @@ var IVRActivity = class {
4952
6583
  }
4953
6584
  };
4954
6585
 
4955
- // src/services/background-audio.ts
4956
- import { promises as fs2 } from "fs";
4957
- import path2 from "path";
4958
- import { fileURLToPath as fileURLToPath2 } from "url";
6586
+ // src/audio/background-audio.ts
6587
+ init_esm_shims();
6588
+ import { promises as fs } from "fs";
6589
+ import path from "path";
6590
+ import { fileURLToPath } from "url";
4959
6591
  var BuiltinAudioClip = {
4960
6592
  CITY_AMBIENCE: "city-ambience.ogg",
4961
6593
  FOREST_AMBIENCE: "forest-ambience.ogg",
@@ -4967,8 +6599,8 @@ var BuiltinAudioClip = {
4967
6599
  };
4968
6600
  function builtinClipPath(clip) {
4969
6601
  const meta = typeof import.meta !== "undefined" ? import.meta : void 0;
4970
- const here = meta?.url ? path2.dirname(fileURLToPath2(meta.url)) : typeof __dirname !== "undefined" ? __dirname : process.cwd();
4971
- return path2.resolve(here, "..", "resources", "audio", clip);
6602
+ const here = meta?.url ? path.dirname(fileURLToPath(meta.url)) : typeof __dirname !== "undefined" ? __dirname : process.cwd();
6603
+ return path.resolve(here, "..", "resources", "audio", clip);
4972
6604
  }
4973
6605
  var INT16_MIN = -32768;
4974
6606
  var INT16_MAX = 32767;
@@ -5137,7 +6769,7 @@ var BackgroundAudioPlayer = class {
5137
6769
  return source.decode(source.path);
5138
6770
  case "builtin": {
5139
6771
  const p = builtinClipPath(source.clip);
5140
- const header = await fs2.readFile(p, { flag: "r" }).then((buf) => buf.subarray(0, 4));
6772
+ const header = await fs.readFile(p, { flag: "r" }).then((buf) => buf.subarray(0, 4));
5141
6773
  if (header.toString("ascii") !== "OggS") {
5142
6774
  throw new Error(`Bundled clip ${source.clip} is not a valid Ogg file`);
5143
6775
  }
@@ -5169,6 +6801,7 @@ function isAudioConfig(value) {
5169
6801
  }
5170
6802
 
5171
6803
  // src/providers/twilio-adapter.ts
6804
+ init_esm_shims();
5172
6805
  var TWILIO_API_BASE = "https://api.twilio.com/2010-04-01";
5173
6806
  var TwilioAdapter = class _TwilioAdapter {
5174
6807
  accountSid;
@@ -5183,8 +6816,8 @@ var TwilioAdapter = class _TwilioAdapter {
5183
6816
  this.baseUrl = opts.region ? `https://api.${opts.region}.twilio.com/2010-04-01` : TWILIO_API_BASE;
5184
6817
  this.authHeader = `Basic ${Buffer.from(`${accountSid}:${authToken}`).toString("base64")}`;
5185
6818
  }
5186
- async request(method, path3, body) {
5187
- const url = `${this.baseUrl}/Accounts/${encodeURIComponent(this.accountSid)}${path3}`;
6819
+ async request(method, path2, body) {
6820
+ const url = `${this.baseUrl}/Accounts/${encodeURIComponent(this.accountSid)}${path2}`;
5188
6821
  const headers = { Authorization: this.authHeader };
5189
6822
  if (body) headers["Content-Type"] = "application/x-www-form-urlencoded";
5190
6823
  const response = await fetch(url, {
@@ -5195,7 +6828,7 @@ var TwilioAdapter = class _TwilioAdapter {
5195
6828
  });
5196
6829
  const text = await response.text();
5197
6830
  if (!response.ok) {
5198
- throw new Error(`Twilio ${method} ${path3} failed: ${response.status} ${text}`);
6831
+ throw new Error(`Twilio ${method} ${path2} failed: ${response.status} ${text}`);
5199
6832
  }
5200
6833
  if (!text) return {};
5201
6834
  try {
@@ -5213,8 +6846,8 @@ var TwilioAdapter = class _TwilioAdapter {
5213
6846
  const country = encodeURIComponent(opts.countryCode);
5214
6847
  const queryParts = ["PageSize=1"];
5215
6848
  if (opts.areaCode) queryParts.push(`AreaCode=${encodeURIComponent(opts.areaCode)}`);
5216
- const path3 = `/AvailablePhoneNumbers/${country}/Local.json?${queryParts.join("&")}`;
5217
- const available = await this.request("GET", path3);
6849
+ const path2 = `/AvailablePhoneNumbers/${country}/Local.json?${queryParts.join("&")}`;
6850
+ const available = await this.request("GET", path2);
5218
6851
  const first = available.available_phone_numbers?.[0]?.phone_number;
5219
6852
  if (!first) {
5220
6853
  throw new Error(`TwilioAdapter: no numbers available for country ${opts.countryCode}`);
@@ -5297,6 +6930,7 @@ var TwilioAdapter = class _TwilioAdapter {
5297
6930
  };
5298
6931
 
5299
6932
  // src/providers/telnyx-adapter.ts
6933
+ init_esm_shims();
5300
6934
  import { randomUUID as randomUUID2 } from "crypto";
5301
6935
  var TELNYX_API_BASE = "https://api.telnyx.com/v2";
5302
6936
  var TelnyxAdapter = class {
@@ -5308,8 +6942,8 @@ var TelnyxAdapter = class {
5308
6942
  this.apiKey = apiKey;
5309
6943
  this.connectionId = connectionId;
5310
6944
  }
5311
- async request(method, path3, body) {
5312
- const url = `${this.baseUrl}${path3}`;
6945
+ async request(method, path2, body) {
6946
+ const url = `${this.baseUrl}${path2}`;
5313
6947
  const headers = {
5314
6948
  Authorization: `Bearer ${this.apiKey}`
5315
6949
  };
@@ -5322,7 +6956,7 @@ var TelnyxAdapter = class {
5322
6956
  });
5323
6957
  const text = await response.text();
5324
6958
  if (!response.ok) {
5325
- throw new Error(`Telnyx ${method} ${path3} failed: ${response.status} ${text}`);
6959
+ throw new Error(`Telnyx ${method} ${path2} failed: ${response.status} ${text}`);
5326
6960
  }
5327
6961
  if (!text) return {};
5328
6962
  try {
@@ -5416,6 +7050,245 @@ var TelnyxAdapter = class {
5416
7050
  }
5417
7051
  }
5418
7052
  };
7053
+
7054
+ // src/providers/telnyx-stt.ts
7055
+ init_esm_shims();
7056
+ import WebSocket7 from "ws";
7057
/** Named sample-rate values, in Hz, for the Telnyx speech-to-text client. */
var TelnyxSTTSampleRate = { HZ_8000: 8000, HZ_16000: 16000, HZ_24000: 24000 };

/** Named audio container formats; WAV is the only one defined. */
var TelnyxSTTInputFormat = { WAV: "wav" };

/** WebSocket endpoint used when the caller does not supply a base URL. */
var TELNYX_STT_WS_URL = "wss://api.telnyx.com/v2/speech-to-text/transcription";

/** Sample rate used when the caller does not supply one. */
var DEFAULT_SAMPLE_RATE = TelnyxSTTSampleRate.HZ_16000;

/** Channel count written into the streaming WAV header. */
var NUM_CHANNELS = 1;
7068
/**
 * Build the 44-byte RIFF/WAVE header that precedes a 16-bit PCM stream.
 *
 * The total length of a live stream is not known up front, so the `data`
 * chunk size is filled with the placeholder 0x7fffffff and the RIFF size with
 * 36 + that placeholder.
 *
 * @param sampleRate  Samples per second, written to the `fmt ` chunk.
 * @param numChannels Channel count, written to the `fmt ` chunk.
 * @returns A new 44-byte Buffer holding the header.
 */
function createStreamingWavHeader(sampleRate, numChannels) {
  const PLACEHOLDER_DATA_BYTES = 2147483647;
  const SAMPLE_BYTES = 2;
  const bytesPerSecond = sampleRate * numChannels * SAMPLE_BYTES;
  const bytesPerFrame = numChannels * SAMPLE_BYTES;

  const out = Buffer.alloc(44);
  let cursor = 0;
  // Tiny writers that advance a shared cursor so fields are laid out in order.
  const tag = (fourCc) => {
    out.write(fourCc, cursor);
    cursor += 4;
  };
  const u32 = (value) => {
    out.writeUInt32LE(value, cursor);
    cursor += 4;
  };
  const u16 = (value) => {
    out.writeUInt16LE(value, cursor);
    cursor += 2;
  };

  tag("RIFF");
  u32(36 + PLACEHOLDER_DATA_BYTES);
  tag("WAVE");
  tag("fmt ");
  u32(16); // size of the fmt chunk body
  u16(1); // audio format 1 = PCM
  u16(numChannels);
  u32(sampleRate);
  u32(bytesPerSecond);
  u16(bytesPerFrame);
  u16(16); // bits per sample
  tag("data");
  u32(PLACEHOLDER_DATA_BYTES);
  return out;
}
7090
+ var TelnyxSTT = class {
7091
+ constructor(apiKey, language = "en", transcriptionEngine = "telnyx", sampleRate = DEFAULT_SAMPLE_RATE, baseUrl = TELNYX_STT_WS_URL) {
7092
+ this.apiKey = apiKey;
7093
+ this.language = language;
7094
+ this.transcriptionEngine = transcriptionEngine;
7095
+ this.sampleRate = sampleRate;
7096
+ this.baseUrl = baseUrl;
7097
+ }
7098
+ apiKey;
7099
+ language;
7100
+ transcriptionEngine;
7101
+ sampleRate;
7102
+ baseUrl;
7103
+ ws = null;
7104
+ callbacks = [];
7105
+ headerSent = false;
7106
+ /** Open the streaming WebSocket and arm message handlers. */
7107
+ async connect() {
7108
+ const params = new URLSearchParams({
7109
+ transcription_engine: this.transcriptionEngine,
7110
+ language: this.language,
7111
+ input_format: "wav"
7112
+ });
7113
+ const url = `${this.baseUrl}?${params.toString()}`;
7114
+ this.ws = new WebSocket7(url, {
7115
+ headers: { Authorization: `Bearer ${this.apiKey}` }
7116
+ });
7117
+ await new Promise((resolve, reject) => {
7118
+ const timer = setTimeout(() => reject(new Error("Telnyx STT connect timeout")), 1e4);
7119
+ this.ws.once("open", () => {
7120
+ clearTimeout(timer);
7121
+ resolve();
7122
+ });
7123
+ this.ws.once("error", (err) => {
7124
+ clearTimeout(timer);
7125
+ reject(err);
7126
+ });
7127
+ });
7128
+ this.ws.on("message", (raw) => {
7129
+ let data;
7130
+ try {
7131
+ data = JSON.parse(raw.toString());
7132
+ } catch {
7133
+ return;
7134
+ }
7135
+ const text = (data.transcript ?? "").trim();
7136
+ if (!text) return;
7137
+ const transcript = {
7138
+ text,
7139
+ isFinal: Boolean(data.is_final),
7140
+ confidence: data.confidence ?? 0
7141
+ };
7142
+ for (const cb of this.callbacks) {
7143
+ cb(transcript);
7144
+ }
7145
+ });
7146
+ this.ws.on("error", (err) => {
7147
+ getLogger().warn(`TelnyxSTT WebSocket error: ${String(err)}`);
7148
+ });
7149
+ }
7150
+ /** Send a binary PCM16 audio chunk; emits the WAV header on the first call. */
7151
+ sendAudio(audio) {
7152
+ if (!this.ws || this.ws.readyState !== WebSocket7.OPEN) return;
7153
+ if (!this.headerSent) {
7154
+ const header = createStreamingWavHeader(this.sampleRate, NUM_CHANNELS);
7155
+ this.ws.send(header);
7156
+ this.headerSent = true;
7157
+ }
7158
+ this.ws.send(audio);
7159
+ }
7160
+ /** Register a transcript listener (max 10 concurrent listeners). */
7161
+ onTranscript(callback) {
7162
+ if (this.callbacks.length >= 10) {
7163
+ getLogger().warn("TelnyxSTT: maximum of 10 onTranscript callbacks reached; replacing the last callback.");
7164
+ this.callbacks[this.callbacks.length - 1] = callback;
7165
+ return;
7166
+ }
7167
+ this.callbacks.push(callback);
7168
+ }
7169
+ /** Close the streaming WebSocket. */
7170
+ close() {
7171
+ if (this.ws) {
7172
+ try {
7173
+ this.ws.close();
7174
+ } catch {
7175
+ }
7176
+ this.ws = null;
7177
+ }
7178
+ }
7179
+ };
7180
+
7181
+ // src/providers/telnyx-tts.ts
7182
+ init_esm_shims();
7183
+ import WebSocket8 from "ws";
7184
/** WebSocket endpoint used when the caller does not supply a base URL. */
var TELNYX_TTS_WS_URL = "wss://api.telnyx.com/v2/text-to-speech/speech";

/** Named voice identifiers passed as the `voice` query parameter. */
var TelnyxTTSVoice = {
  NATURAL_HD_ASTRA: "Telnyx.NaturalHD.astra",
  NATURAL_HD_LUNA: "Telnyx.NaturalHD.luna",
  NATURAL_HD_ATLAS: "Telnyx.NaturalHD.atlas",
  NATURAL_HD_HERA: "Telnyx.NaturalHD.hera",
  NATURAL_HD_ZEUS: "Telnyx.NaturalHD.zeus"
};

/** Named sample-rate values, in Hz, for the Telnyx text-to-speech client. */
var TelnyxTTSSampleRate = { HZ_8000: 8000, HZ_16000: 16000, HZ_24000: 24000 };

/** Voice used when the caller does not supply one. */
var DEFAULT_VOICE = TelnyxTTSVoice.NATURAL_HD_ASTRA;
7198
+ var TelnyxTTS = class {
7199
+ constructor(apiKey, voice = DEFAULT_VOICE, baseUrl = TELNYX_TTS_WS_URL) {
7200
+ this.apiKey = apiKey;
7201
+ this.voice = voice;
7202
+ this.baseUrl = baseUrl;
7203
+ }
7204
+ apiKey;
7205
+ voice;
7206
+ baseUrl;
7207
+ /** Collect every audio chunk into a single Buffer. */
7208
+ async synthesize(text) {
7209
+ const chunks = [];
7210
+ for await (const chunk of this.synthesizeStream(text)) {
7211
+ chunks.push(chunk);
7212
+ }
7213
+ return Buffer.concat(chunks);
7214
+ }
7215
+ /**
7216
+ * Stream MP3-encoded audio chunks as they arrive from Telnyx.
7217
+ *
7218
+ * The server sends JSON frames of the shape `{"audio": "<base64-mp3>"}`.
7219
+ * Callers that need PCM must decode the MP3 bytes (e.g. via `ffmpeg`).
7220
+ */
7221
+ async *synthesizeStream(text) {
7222
+ const url = `${this.baseUrl}?voice=${encodeURIComponent(this.voice)}`;
7223
+ const ws = new WebSocket8(url, {
7224
+ headers: { Authorization: `Bearer ${this.apiKey}` }
7225
+ });
7226
+ await new Promise((resolve, reject) => {
7227
+ const timer = setTimeout(() => reject(new Error("Telnyx TTS connect timeout")), 1e4);
7228
+ ws.once("open", () => {
7229
+ clearTimeout(timer);
7230
+ resolve();
7231
+ });
7232
+ ws.once("error", (err) => {
7233
+ clearTimeout(timer);
7234
+ reject(err);
7235
+ });
7236
+ });
7237
+ const queue = [];
7238
+ const waiters = [];
7239
+ function push(item) {
7240
+ const w = waiters.shift();
7241
+ if (w) {
7242
+ w(item);
7243
+ } else {
7244
+ queue.push(item);
7245
+ }
7246
+ }
7247
+ ws.on("message", (raw) => {
7248
+ let data;
7249
+ try {
7250
+ data = JSON.parse(raw.toString());
7251
+ } catch {
7252
+ getLogger().warn("TelnyxTTS: received invalid JSON");
7253
+ return;
7254
+ }
7255
+ const audioB64 = data.audio;
7256
+ if (!audioB64) return;
7257
+ try {
7258
+ const audioBytes = Buffer.from(audioB64, "base64");
7259
+ if (audioBytes.length > 0) {
7260
+ push(audioBytes);
7261
+ }
7262
+ } catch {
7263
+ }
7264
+ });
7265
+ ws.on("close", () => {
7266
+ push(null);
7267
+ });
7268
+ ws.on("error", (err) => {
7269
+ push({ error: err instanceof Error ? err : new Error(String(err)) });
7270
+ });
7271
+ ws.send(JSON.stringify({ text: " " }));
7272
+ ws.send(JSON.stringify({ text }));
7273
+ ws.send(JSON.stringify({ text: "" }));
7274
+ try {
7275
+ while (true) {
7276
+ const item = queue.length > 0 ? queue.shift() : await new Promise((resolve) => waiters.push(resolve));
7277
+ if (item === null) return;
7278
+ if (typeof item === "object" && "error" in item) throw item.error;
7279
+ yield item;
7280
+ }
7281
+ } finally {
7282
+ try {
7283
+ ws.close();
7284
+ } catch {
7285
+ }
7286
+ }
7287
+ }
7288
+ };
7289
+
7290
+ // src/observability/index.ts
7291
+ init_esm_shims();
5419
7292
  export {
5420
7293
  AllProvidersFailedError,
5421
7294
  LLM2 as AnthropicLLM,
@@ -5425,7 +7298,7 @@ export {
5425
7298
  BuiltinAudioClip,
5426
7299
  CallMetricsAccumulator,
5427
7300
  STT4 as CartesiaSTT,
5428
- TTS3 as CartesiaTTS,
7301
+ TTS4 as CartesiaTTS,
5429
7302
  LLM4 as CerebrasLLM,
5430
7303
  ChatContext,
5431
7304
  CloudflareTunnel,
@@ -5437,6 +7310,8 @@ export {
5437
7310
  ConvAI as ElevenLabsConvAI,
5438
7311
  ElevenLabsConvAIAdapter,
5439
7312
  TTS as ElevenLabsTTS,
7313
+ TTS2 as ElevenLabsWebSocketTTS,
7314
+ ErrorCode,
5440
7315
  EventBus,
5441
7316
  FallbackLLMProvider,
5442
7317
  GEMINI_DEFAULT_INPUT_SR,
@@ -5446,15 +7321,16 @@ export {
5446
7321
  LLM3 as GroqLLM,
5447
7322
  Guardrail,
5448
7323
  IVRActivity,
7324
+ TTS7 as InworldTTS,
5449
7325
  LLMLoop,
5450
- TTS5 as LMNTTTS,
7326
+ TTS6 as LMNTTTS,
5451
7327
  MetricsStore,
5452
7328
  Ngrok,
5453
7329
  LLM as OpenAILLM,
5454
7330
  OpenAILLMProvider,
5455
7331
  Realtime as OpenAIRealtime,
5456
7332
  OpenAIRealtimeAdapter,
5457
- TTS2 as OpenAITTS,
7333
+ TTS3 as OpenAITTS,
5458
7334
  STT3 as OpenAITranscribeSTT,
5459
7335
  PartialStreamError,
5460
7336
  Patter,
@@ -5466,7 +7342,7 @@ export {
5466
7342
  ProvisionError,
5467
7343
  RateLimitError,
5468
7344
  RemoteMessageHandler,
5469
- TTS4 as RimeTTS,
7345
+ TTS5 as RimeTTS,
5470
7346
  SPAN_BARGEIN,
5471
7347
  SPAN_CALL,
5472
7348
  SPAN_ENDPOINT,
@@ -5477,10 +7353,23 @@ export {
5477
7353
  SentenceChunker,
5478
7354
  SileroVAD,
5479
7355
  STT5 as SonioxSTT,
7356
+ SpeechEvents,
7357
+ SpeechmaticsAudioEncoding,
7358
+ SpeechmaticsOperatingPoint,
7359
+ STT7 as SpeechmaticsSTT,
7360
+ SpeechmaticsSampleRate,
7361
+ SpeechmaticsServerMessage,
7362
+ TurnDetectionMode as SpeechmaticsTurnDetectionMode,
5480
7363
  StatefulResampler,
5481
7364
  Static as StaticTunnel,
5482
7365
  Carrier2 as Telnyx,
5483
7366
  TelnyxAdapter,
7367
+ TelnyxSTT,
7368
+ TelnyxSTTInputFormat,
7369
+ TelnyxSTTSampleRate,
7370
+ TelnyxTTS,
7371
+ TelnyxTTSSampleRate,
7372
+ TelnyxTTSVoice,
5484
7373
  TestSession,
5485
7374
  TfidfLoopDetector,
5486
7375
  Tool,
@@ -5501,6 +7390,7 @@ export {
5501
7390
  cartesia,
5502
7391
  createResampler16kTo8k,
5503
7392
  createResampler24kTo16k,
7393
+ createResampler24kTo8k,
5504
7394
  createResampler8kTo16k,
5505
7395
  deepgram,
5506
7396
  defineTool,