getpatter 0.6.5 → 0.6.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,105 @@ import express from "express";
29
29
  import { createServer } from "http";
30
30
  import { WebSocketServer } from "ws";
31
31
 
32
+ // src/telemetry/call-metrics.ts
33
+ init_esm_shims();
34
/**
 * Map a provider mode to the coarse engine family used as a telemetry dimension.
 *
 * @param {*} mode - Provider mode identifier (compared with strict equality).
 * @returns {string} "realtime", "convai", "pipeline", or "other" for any
 *   unrecognized value.
 */
function engineFromMode(mode) {
  switch (mode) {
    case "openai_realtime":
    case "openai_realtime_2":
      return "realtime";
    case "elevenlabs_convai":
      return "convai";
    case "pipeline":
      return "pipeline";
    default:
      return "other";
  }
}
40
/**
 * Derive the provider dimension from a call-metrics object.
 *
 * The provider mode wins when it names a known realtime/convai provider;
 * otherwise the first non-empty string among `llm_provider`, `stt_provider`,
 * `tts_provider` (in that order) is returned lowercased.
 *
 * @param {object} m - Call metrics.
 * @returns {string} Provider name, or "other" when nothing usable is present.
 */
function providerFromMetrics(m) {
  switch (m.provider_mode) {
    case "openai_realtime":
    case "openai_realtime_2":
      return "openai";
    case "elevenlabs_convai":
      return "elevenlabs";
    default:
      break;
  }
  const providerKeys = ["llm_provider", "stt_provider", "tts_provider"];
  const firstNamedKey = providerKeys.find(
    (key) => typeof m[key] === "string" && m[key] !== ""
  );
  return firstNamedKey === undefined ? "other" : m[firstNamedKey].toLowerCase();
}
50
/**
 * Derive the provider dimension from the provider mode alone.
 *
 * @param {*} mode - Provider mode identifier.
 * @returns {string} "openai", "elevenlabs", or "other".
 */
function providerFromMode(mode) {
  if (mode === "elevenlabs_convai") return "elevenlabs";
  const isOpenAiRealtime = mode === "openai_realtime" || mode === "openai_realtime_2";
  return isOpenAiRealtime ? "openai" : "other";
}
55
/**
 * Normalize a telephony provider name into the carrier dimension.
 *
 * @param {*} tp - Telephony provider name.
 * @returns {string} The lowercased name, or "none" when `tp` is not a
 *   non-empty string.
 */
function carrierFamily(tp) {
  if (typeof tp !== "string" || tp === "") return "none";
  return tp.toLowerCase();
}
58
/**
 * Normalize a call direction value.
 *
 * @param {*} value - Candidate direction.
 * @returns {string|undefined} "inbound" or "outbound" (case-insensitive match),
 *   otherwise `undefined`.
 */
function direction(value) {
  if (typeof value !== "string") return undefined;
  const normalized = value.toLowerCase();
  const isKnown = normalized === "inbound" || normalized === "outbound";
  return isKnown ? normalized : undefined;
}
62
/**
 * Bucket a turn count into a coarse label for telemetry.
 *
 * Values that are not a positive number (including NaN, `undefined`, and
 * non-numeric strings) fall into the "0" bucket, so a malformed count is never
 * reported as a long call.
 *
 * @param {number} n - Number of turns in the call.
 * @returns {string} One of "0", "1", "2_3", "4_6", "7_12", "13_plus".
 */
function turnCountBucket(n) {
  const count = Number(n);
  // `!(count > 0)` also catches NaN, which fails every ordinary comparison.
  if (!(count > 0)) return "0";
  if (count === 1) return "1";
  if (count <= 3) return "2_3";
  if (count <= 6) return "4_6";
  if (count <= 12) return "7_12";
  return "13_plus";
}
70
/**
 * Read the p95 agent-response latency from a call-metrics object.
 *
 * Only finite numbers are returned; NaN/Infinity (and non-number values) yield
 * `undefined` so a caller that rounds the result never produces a NaN metric.
 *
 * @param {object} m - Call metrics; `m.latency_p95.agent_response_ms` is read
 *   when `latency_p95` is an object.
 * @returns {number|undefined} Latency in milliseconds, or `undefined` when
 *   absent or not a finite number.
 */
function latencyMs(m) {
  const p95 = m.latency_p95;
  if (!p95 || typeof p95 !== "object") return undefined;
  const value = p95.agent_response_ms;
  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
77
/**
 * Emit a best-effort `call_started` telemetry event.
 *
 * Dimensions: engine and provider (from `opts.providerMode`), carrier (from
 * `opts.telephonyProvider`), and direction when it normalizes to
 * inbound/outbound. Any failure is swallowed so telemetry can never break a call.
 *
 * @param {{record: Function}|null|undefined} telemetry - Telemetry sink; no-op when falsy.
 * @param {object} opts - `{ providerMode, telephonyProvider, direction }`.
 * @returns {void}
 */
function recordCallStarted(telemetry, opts) {
  if (!telemetry) return;
  try {
    const baseDims = {
      engine: engineFromMode(opts.providerMode),
      provider: providerFromMode(opts.providerMode),
      carrier: carrierFamily(opts.telephonyProvider)
    };
    const callDirection = direction(opts.direction);
    const dims = callDirection === undefined
      ? baseDims
      : { ...baseDims, direction: callDirection };
    telemetry.record("call_started", dims);
  } catch {
    // Telemetry is best-effort: never let it surface into call handling.
  }
}
91
/**
 * Emit a best-effort `call_completed` telemetry event.
 *
 * Always reports `outcome` (and `direction` when it normalizes). When
 * `opts.metrics` is an object it additionally reports engine, provider,
 * carrier, rounded duration, rounded p95 latency, cost rounded to 4 decimals,
 * a turn-count bucket, and the error code. A non-empty `error_code` forces
 * `outcome` to "error". Without metrics, `opts.carrier` (if defined) supplies
 * the carrier dimension.
 *
 * Numeric dimensions are only emitted for finite numbers, so NaN/Infinity in
 * the metrics never reach the telemetry payload. Any failure is swallowed so
 * telemetry can never break call teardown.
 *
 * @param {{record: Function}|null|undefined} telemetry - Telemetry sink; no-op when falsy.
 * @param {object} opts - `{ outcome, direction, metrics, carrier }`.
 * @returns {void}
 */
function recordCallCompleted(telemetry, opts) {
  if (!telemetry) return;
  try {
    const dims = { outcome: opts.outcome };
    const callDirection = direction(opts.direction);
    if (callDirection !== undefined) dims.direction = callDirection;

    const metrics = opts.metrics;
    if (metrics && typeof metrics === "object") {
      dims.engine = engineFromMode(metrics.provider_mode);
      dims.provider = providerFromMetrics(metrics);
      dims.carrier = carrierFamily(metrics.telephony_provider);

      const duration = metrics.duration_seconds;
      if (typeof duration === "number" && Number.isFinite(duration)) {
        dims.duration_seconds = Math.max(0, Math.round(duration));
      }

      const latency = latencyMs(metrics);
      if (typeof latency === "number" && Number.isFinite(latency)) {
        dims.latency_ms = Math.max(0, Math.round(latency));
      }

      const cost = metrics.cost;
      if (cost && typeof cost === "object") {
        const total = cost.total;
        if (typeof total === "number" && Number.isFinite(total)) {
          // Round to 4 decimal places (hundredths of a cent).
          dims.cost_usd = Math.max(0, Math.round(total * 1e4) / 1e4);
        }
      }

      if (Array.isArray(metrics.turns)) {
        dims.turn_count_bucket = turnCountBucket(metrics.turns.length);
      }

      const errorCode = metrics.error_code;
      if (typeof errorCode === "string" && errorCode) {
        dims.error_code = errorCode;
        // A recorded error code overrides whatever outcome the caller passed.
        dims.outcome = "error";
      }
    } else if (opts.carrier !== undefined) {
      dims.carrier = carrierFamily(opts.carrier);
    }

    telemetry.record("call_completed", dims);
  } catch {
    // Telemetry is best-effort: never let it surface into call teardown.
  }
}
130
+
32
131
  // src/providers/elevenlabs-convai.ts
33
132
  init_esm_shims();
34
133
  import WebSocket from "ws";
@@ -2826,6 +2925,9 @@ var CallMetricsAccumulator = class {
2826
2925
  ttsModel;
2827
2926
  realtimeModel;
2828
2927
  _pricing;
2928
+ // Terminal error code (lowercased ErrorCode value or "other"); set by
2929
+ // recordError when the call ends abnormally. Empty for a clean call.
2930
+ _errorCode = "";
2829
2931
  _callStart;
2830
2932
  _turns = [];
2831
2933
  // mutable internal array; immutable when exposed via TurnMetrics[] → readonly TurnMetrics[]
@@ -3396,11 +3498,35 @@ var CallMetricsAccumulator = class {
3396
3498
  telephony_provider: this.telephonyProvider,
3397
3499
  stt_model: this.sttModel,
3398
3500
  tts_model: this.ttsModel,
3399
- llm_model: this._llmModel
3501
+ llm_model: this._llmModel,
3502
+ error_code: this._errorCode
3400
3503
  };
3401
3504
  this._eventBus?.emit("call_ended", { callId: this.callId, metrics });
3402
3505
  return metrics;
3403
3506
  }
3507
+ /**
3508
+ * Record the call's terminal error as a coarse, anonymous code. Stores the
3509
+ * PatterError `.code` lowercased; maps common timeout/connection errors; falls
3510
+ * back to "other". Never stores the message. Last write wins.
3511
+ */
3512
+ recordError(err) {
3513
+ const code = err?.code;
3514
+ const name = err?.name;
3515
+ const sys = typeof code === "string" ? code : "";
3516
+ if (sys.startsWith("ECONN") || sys === "EHOSTUNREACH" || sys === "ENETUNREACH" || sys === "EPIPE") {
3517
+ this._errorCode = "connection";
3518
+ return;
3519
+ }
3520
+ if (typeof code === "string" && code) {
3521
+ this._errorCode = code.toLowerCase();
3522
+ return;
3523
+ }
3524
+ if (name === "TimeoutError" || name === "AbortError") {
3525
+ this._errorCode = "timeout";
3526
+ } else {
3527
+ this._errorCode = "other";
3528
+ }
3529
+ }
3404
3530
  /** Return the cost breakdown for the call so far without ending it. */
3405
3531
  getCostSoFar() {
3406
3532
  const duration = (hrTimeMs() - this._callStart) / 1e3;
@@ -4879,6 +5005,28 @@ function isSttHallucination(text) {
4879
5005
  const pieces = stripped.split(/[.!?…。!?]+/u).map((p) => p.trim()).filter((p) => p.length > 0);
4880
5006
  return pieces.length > 1 && pieces.every((p) => HALLUCINATIONS.has(p));
4881
5007
  }
5008
+ var ECHO_WORD_OVERLAP_THRESHOLD = 0.6;
5009
+ var ECHO_MIN_CANDIDATE_WORDS = 4;
5010
/**
 * Normalize text for echo comparison: lowercase, replace every character that
 * is not a Unicode letter, number, or whitespace with a space, then trim and
 * collapse whitespace runs to single spaces.
 *
 * @param {string} text - Raw transcript or agent text.
 * @returns {string} Space-separated lowercase words ("" when nothing remains).
 */
function normalizeForEcho(text) {
  // A single global collapse after trimming is sufficient; a non-global
  // /\s+/ replace would only touch the first run and adds nothing.
  return text
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s]/gu, " ")
    .trim()
    .replace(/\s+/gu, " ");
}
5013
/**
 * Heuristically decide whether a transcript is the agent's own speech echoed
 * back through STT.
 *
 * After normalization, a candidate with at least `ECHO_MIN_CANDIDATE_WORDS`
 * words counts as an echo when it appears verbatim inside the agent text, or
 * when the share of its words found in the agent text reaches
 * `ECHO_WORD_OVERLAP_THRESHOLD`.
 *
 * @param {string} candidate - Transcript under test.
 * @param {string} agentText - Text the agent has spoken this turn.
 * @returns {boolean} True when the candidate looks like an echo.
 */
function looksLikeEcho(candidate, agentText) {
  const agentNorm = normalizeForEcho(agentText);
  const candidateNorm = normalizeForEcho(candidate);
  if (agentNorm === "" || candidateNorm === "") return false;

  const candidateWords = candidateNorm.split(" ").filter((word) => word !== "");
  // Short utterances ("yes", "no thanks") overlap with almost anything.
  if (candidateWords.length < ECHO_MIN_CANDIDATE_WORDS) return false;
  if (agentNorm.includes(candidateNorm)) return true;

  const agentVocabulary = new Set(agentNorm.split(" "));
  let sharedWords = 0;
  for (const word of candidateWords) {
    if (agentVocabulary.has(word)) sharedWords += 1;
  }
  return sharedWords / candidateWords.length >= ECHO_WORD_OVERLAP_THRESHOLD;
}
5024
/**
 * Report whether two strings are equal, or one is the other followed by a
 * space and more text (i.e. the shorter is a space-delimited prefix of the
 * longer).
 *
 * @param {string} a - First text.
 * @param {string} b - Second text.
 * @returns {boolean} False when either input is empty/falsy.
 */
function isNearDuplicate(a, b) {
  if (!a || !b) return false;
  if (a === b) return true;
  const aIsShorter = a.length <= b.length;
  const shorter = aIsShorter ? a : b;
  const longer = aIsShorter ? b : a;
  return longer.startsWith(shorter + " ");
}
4882
5030
  var StreamHandler = class _StreamHandler {
4883
5031
  deps;
4884
5032
  ws;
@@ -4891,6 +5039,17 @@ var StreamHandler = class _StreamHandler {
4891
5039
  stt = null;
4892
5040
  tts = null;
4893
5041
  isSpeaking = false;
5042
+ /**
5043
+ * True only while the post-TTS tail-grace window is pending: the agent has
5044
+ * finished its turn but ``isSpeaking`` is still held for
5045
+ * ``PATTER_TTS_TAIL_GRACE_MS`` to swallow the fading echo tail. A VAD
5046
+ * ``speech_start`` (or a transcript) during this window is the user's NEXT
5047
+ * turn, not a barge-in — there is nothing left to interrupt. Set by
5048
+ * ``endSpeakingWithGrace``; cleared by ``beginSpeaking``, the grace flip,
5049
+ * ``cancelSpeaking``, and ``endTailGraceForNewTurn``. Parity with Python
5050
+ * ``_tail_grace_active``.
5051
+ */
5052
+ tailGraceActive = false;
4894
5053
  /**
4895
5054
  * Ring buffer of inbound PCM16 16 kHz frames captured while the agent
4896
5055
  * is speaking and the self-hearing guard is dropping audio. On
@@ -4966,6 +5125,35 @@ var StreamHandler = class _StreamHandler {
4966
5125
  * ``isSpeaking=false``, and silently cut the agent's first turn.
4967
5126
  */
4968
5127
  firstAudioSentAt = null;
5128
+ /**
5129
+ * Estimated wall-clock (ms) when the LAST audio byte pushed to the carrier
5130
+ * finishes PLAYING on the phone. The pipeline pushes TTS audio as fast as
5131
+ * the provider synthesizes it (no pacing) and the carrier buffers + plays
5132
+ * at realtime, so "we finished pushing" and "the caller finished hearing"
5133
+ * can diverge by tens of seconds — especially with agent-runtime LLMs
5134
+ * (Hermes/OpenClaw) that deliver a long reply all at once after a thinking
5135
+ * pause. ``endSpeakingWithGrace`` holds ``isSpeaking=true`` (with
5136
+ * ``tailGraceActive=false``) until this cursor passes, so a barge-in during
5137
+ * the audible backlog still takes the cancel path (``sendClear`` drops the
5138
+ * carrier buffer) instead of being treated as a calm next turn. Advanced by
5139
+ * ``trackOutboundPlayback``; reset by ``cancelSpeaking`` (the buffer is
5140
+ * cleared) and ``endTailGraceForNewTurn``.
5141
+ */
5142
+ playbackBufferedUntil = 0;
5143
+ /**
5144
+ * Per-turn playback timeline used to estimate the response prefix the
5145
+ * caller actually HEARD when a barge-in lands. ``turnPlaybackTotalMs``
5146
+ * accumulates the playout duration of every chunk pushed this turn
5147
+ * (including filler audio, which keeps the timeline aligned);
5148
+ * ``turnSpokenSegments`` records ``{text, startMs}`` for each RESPONSE
5149
+ * sentence at its first audible chunk (filler / error-fallback audio
5150
+ * advances the clock but adds no segment). ``heard = total - backlog``
5151
+ * then maps to a sentence-granular prefix — see ``heardResponsePrefix``.
5152
+ * Both reset at ``beginSpeaking``. Mirrors Python
5153
+ * ``_turn_playback_total_s`` / ``_turn_spoken_segments``.
5154
+ */
5155
+ turnPlaybackTotalMs = 0;
5156
+ turnSpokenSegments = [];
4969
5157
  /**
4970
5158
  * Optional barge-in confirmation strategies. With an empty array the
4971
5159
  * SDK falls back to the legacy "cancel on first VAD speech_start"
@@ -5083,11 +5271,15 @@ var StreamHandler = class _StreamHandler {
5083
5271
  }
5084
5272
  this.speakingGeneration++;
5085
5273
  this.isSpeaking = true;
5274
+ this.tailGraceActive = false;
5086
5275
  this.speakingStartedAt = Date.now();
5087
5276
  this.suppressedSpeechPending = false;
5088
5277
  void isFirstMessage;
5089
5278
  this.firstAudioSentAt = Date.now();
5090
5279
  this.inboundAudioRing = [];
5280
+ this.currentAgentSpokenText = "";
5281
+ this.turnPlaybackTotalMs = 0;
5282
+ this.turnSpokenSegments = [];
5091
5283
  this.resetVad();
5092
5284
  }
5093
5285
  /**
@@ -5102,6 +5294,87 @@ var StreamHandler = class _StreamHandler {
5102
5294
  this.firstAudioSentAt = Date.now();
5103
5295
  }
5104
5296
  }
5297
+ /**
5298
+ * Advance ``playbackBufferedUntil`` by the playout duration of an outbound
5299
+ * TTS chunk. ``numBytes`` is the size of the chunk BEFORE carrier encoding
5300
+ * (the same buffer handed to ``encodePipelineAudio``): PCM16 @ 16 kHz in
5301
+ * the default path (32 bytes/ms), or the carrier's native μ-law @ 8 kHz
5302
+ * (8 bytes/ms) when the TTS adapter emits wire format directly
5303
+ * (``ttsOutputFormatNativeForCarrier`` — Twilio/Plivo ``ulaw_8000``;
5304
+ * Telnyx native is ``pcm_16000`` so it stays at 32 bytes/ms).
5305
+ */
5306
+ trackOutboundPlayback(numBytes) {
5307
+ if (numBytes <= 0) return;
5308
+ const bytesPerMs = this.ttsOutputFormatNativeForCarrier && this.deps.bridge.telephonyProvider !== "telnyx" ? 8 : 32;
5309
+ const now = Date.now();
5310
+ const chunkMs = numBytes / bytesPerMs;
5311
+ const base = this.playbackBufferedUntil > now ? this.playbackBufferedUntil : now;
5312
+ this.playbackBufferedUntil = base + chunkMs;
5313
+ this.turnPlaybackTotalMs += chunkMs;
5314
+ }
5315
+ /**
5316
+ * Estimate the response prefix the caller actually HEARD this turn.
5317
+ *
5318
+ * The pipeline pushes audio faster than realtime, so at barge-in time
5319
+ * ``heard = totalPushed - carrierBacklog`` ms of audio have actually
5320
+ * played. Mapped at sentence granularity against ``turnSpokenSegments``:
5321
+ * a sentence counts as heard once its playback has STARTED
5322
+ * (``startMs <= heardMs``), so the sentence playing at the moment of
5323
+ * interruption is included.
5324
+ *
5325
+ * Returns ``null`` when no segments were tracked this turn (nothing
5326
+ * synthesized through the tracked path — callers fall back to the legacy
5327
+ * full-text behaviour). Mirrors Python ``_heard_response_prefix``.
5328
+ */
5329
+ heardResponsePrefix() {
5330
+ if (this.turnSpokenSegments.length === 0) return null;
5331
+ const remainingMs = Math.max(0, this.playbackBufferedUntil - Date.now());
5332
+ const heardMs = Math.max(0, this.turnPlaybackTotalMs - remainingMs);
5333
+ const heard = this.turnSpokenSegments.filter((s) => s.startMs <= heardMs);
5334
+ return {
5335
+ text: heard.map((s) => s.text).join(" "),
5336
+ heardEverything: heard.length === this.turnSpokenSegments.length
5337
+ };
5338
+ }
5339
+ /**
5340
+ * Replace the text of the most recent assistant entry in the conversation
5341
+ * history. No-op when the last entry is not an assistant turn (e.g. the
5342
+ * caller's next turn was already committed).
5343
+ */
5344
+ rewriteLastAssistantEntry(text) {
5345
+ const entries = this.history.entries;
5346
+ const last = entries[entries.length - 1];
5347
+ if (last && last.role === "assistant") {
5348
+ entries[entries.length - 1] = { ...last, text };
5349
+ }
5350
+ }
5351
+ /**
5352
+ * LiveKit-style "heard prefix" semantics for a barge-in that lands AFTER
5353
+ * the turn completed, while the carrier is still playing the buffered
5354
+ * tail.
5355
+ *
5356
+ * The completed turn already recorded its FULL reply in history, but the
5357
+ * caller only heard part of it before interrupting — a stateful agent
5358
+ * runtime (Hermes / OpenClaw) would otherwise "remember saying" things
5359
+ * the caller never heard. Rewrites the last assistant entry to the heard
5360
+ * prefix + ``[interrupted by caller]``.
5361
+ *
5362
+ * MUST run BEFORE ``cancelSpeaking`` resets ``playbackBufferedUntil``
5363
+ * (the backlog is the heard-prefix input). No-op when a turn is still in
5364
+ * flight (the streaming path applies its own marker), when there is no
5365
+ * backlog, or when everything was already heard. Mirrors Python
5366
+ * ``_maybe_truncate_completed_turn_history``.
5367
+ */
5368
+ maybeTruncateCompletedTurnHistory() {
5369
+ if (this.dispatchTask !== null) return;
5370
+ const remainingMs = this.playbackBufferedUntil - Date.now();
5371
+ if (remainingMs <= 0) return;
5372
+ const heard = this.heardResponsePrefix();
5373
+ if (heard === null || heard.heardEverything) return;
5374
+ this.rewriteLastAssistantEntry(
5375
+ heard.text ? `${heard.text} [interrupted by caller]` : "[interrupted by caller]"
5376
+ );
5377
+ }
5105
5378
  /**
5106
5379
  * Atomically end speaking AND invalidate any pending grace timer.
5107
5380
  * Use instead of ``this.isSpeaking = false`` at barge-in sites.
@@ -5112,10 +5385,12 @@ var StreamHandler = class _StreamHandler {
5112
5385
  cancelSpeaking() {
5113
5386
  this.speakingGeneration++;
5114
5387
  this.isSpeaking = false;
5388
+ this.tailGraceActive = false;
5115
5389
  this.speakingStartedAt = null;
5116
5390
  this.firstAudioSentAt = null;
5117
5391
  this.lastCancelAt = Date.now();
5118
5392
  this.suppressedSpeechPending = false;
5393
+ this.playbackBufferedUntil = 0;
5119
5394
  this.drainPendingMarks();
5120
5395
  if (this.llmAbort !== null) {
5121
5396
  try {
@@ -5188,23 +5463,37 @@ var StreamHandler = class _StreamHandler {
5188
5463
  if (grace > 0) {
5189
5464
  const gen = this.speakingGeneration;
5190
5465
  this.clearGraceTimer();
5191
- this.graceTimer = setTimeout(() => {
5192
- this.graceTimer = null;
5193
- if (this.speakingGeneration === gen) {
5194
- this.isSpeaking = false;
5195
- this.speakingStartedAt = null;
5196
- this.firstAudioSentAt = null;
5197
- this.clearPendingBargeIn();
5198
- void this.resetBargeInStrategies();
5199
- if (this.suppressedSpeechPending) {
5200
- this.suppressedSpeechPending = false;
5201
- this.flushInboundAudioRing();
5466
+ const startTailGrace = () => {
5467
+ this.tailGraceActive = true;
5468
+ this.graceTimer = setTimeout(() => {
5469
+ this.graceTimer = null;
5470
+ if (this.speakingGeneration === gen) {
5471
+ this.isSpeaking = false;
5472
+ this.tailGraceActive = false;
5473
+ this.speakingStartedAt = null;
5474
+ this.firstAudioSentAt = null;
5475
+ this.clearPendingBargeIn();
5476
+ void this.resetBargeInStrategies();
5477
+ if (this.suppressedSpeechPending) {
5478
+ this.suppressedSpeechPending = false;
5479
+ this.flushInboundAudioRing();
5480
+ }
5481
+ this.resetVad();
5202
5482
  }
5203
- this.resetVad();
5204
- }
5205
- }, grace);
5483
+ }, grace);
5484
+ };
5485
+ const bufferedMs = Math.max(0, this.playbackBufferedUntil - Date.now());
5486
+ if (bufferedMs <= 0) {
5487
+ startTailGrace();
5488
+ } else {
5489
+ this.graceTimer = setTimeout(() => {
5490
+ this.graceTimer = null;
5491
+ if (this.speakingGeneration === gen) startTailGrace();
5492
+ }, bufferedMs);
5493
+ }
5206
5494
  } else {
5207
5495
  this.isSpeaking = false;
5496
+ this.tailGraceActive = false;
5208
5497
  this.speakingStartedAt = null;
5209
5498
  this.firstAudioSentAt = null;
5210
5499
  this.clearPendingBargeIn();
@@ -5216,6 +5505,35 @@ var StreamHandler = class _StreamHandler {
5216
5505
  this.resetVad();
5217
5506
  }
5218
5507
  }
5508
+ /**
5509
+ * End the post-TTS tail-grace window because the user has begun their next
5510
+ * turn. Unlike a barge-in, the agent's response already played out in full
5511
+ * — there is nothing to cancel and no turn was interrupted. We flip the
5512
+ * speaking flag off (bumping ``speakingGeneration`` so the scheduled grace
5513
+ * timer no-ops), recover any leading audio the self-hearing guard captured
5514
+ * into the ring (the user's first ~250 ms, which VAD needed before it could
5515
+ * emit ``speech_start``), and let the live STT stream take over. We do NOT
5516
+ * call ``sendClear``, ``recordBargeinDetected`` or ``recordTurnInterrupted``
5517
+ * — none apply to a turn that completed normally.
5518
+ *
5519
+ * Without this, fast next-turn speech (humans reply in 200-700 ms, well
5520
+ * inside the 1500 ms default grace) is withheld from STT and recorded as an
5521
+ * empty ``[interrupted]`` turn, after which the agent goes silent for the
5522
+ * rest of the call. Parity with Python ``_end_tail_grace_for_new_turn``.
5523
+ */
5524
+ endTailGraceForNewTurn() {
5525
+ this.isSpeaking = false;
5526
+ this.tailGraceActive = false;
5527
+ this.speakingStartedAt = null;
5528
+ this.firstAudioSentAt = null;
5529
+ this.playbackBufferedUntil = 0;
5530
+ this.speakingGeneration++;
5531
+ this.clearGraceTimer();
5532
+ this.clearPendingBargeIn();
5533
+ void this.resetBargeInStrategies();
5534
+ this.suppressedSpeechPending = false;
5535
+ this.flushInboundAudioRing();
5536
+ }
5219
5537
  async resetBargeInStrategies() {
5220
5538
  if (this.bargeInStrategies.length === 0) return;
5221
5539
  const { resetStrategies } = await import("./barge-in-strategies-X6ARMGIQ.mjs");
@@ -5351,9 +5669,43 @@ var StreamHandler = class _StreamHandler {
5351
5669
  maxDurationTimer = null;
5352
5670
  transcriptProcessing = false;
5353
5671
  transcriptQueue = [];
5672
+ /**
5673
+ * The in-flight turn dispatch (LLM + TTS) runs as a SINGLE tracked promise
5674
+ * so the transcript drain loop keeps running ``handleBargeIn`` against the
5675
+ * LIVE turn during a long (30-90 s) agent-runtime response, instead of
5676
+ * head-of-line-blocking on it. Exactly one is in flight: the launcher awaits
5677
+ * the previous one to settle (fast — a barge-in already aborted it) before
5678
+ * starting the next, preserving history/metrics ordering. Parity with
5679
+ * Python ``_dispatch_task``.
5680
+ */
5681
+ dispatchTask = null;
5682
+ /**
5683
+ * Cap (ms) on how long teardown waits for the backgrounded dispatch to
5684
+ * settle. JS promises are not cancellable, so a user-supplied ``onMessage``
5685
+ * (which receives no AbortSignal) parked on a hung external call could block
5686
+ * call cleanup indefinitely — `llmAbort.abort()` only unblocks the built-in
5687
+ * LLM/TTS paths. We bound the WAIT (Python hard-cancels the task instead).
5688
+ * 30 s matches the webhook ceiling.
5689
+ */
5690
+ static DISPATCH_SETTLE_TIMEOUT_MS = 3e4;
5691
+ /**
5692
+ * Opt-in (default OFF): forward inbound audio to STT even while the agent is
5693
+ * speaking, so the transcript barge-in path can receive a transcript on
5694
+ * echo-masked PSTN links where the VAD never fires. ECHO RISK without AEC.
5695
+ * Parity with Python ``_forward_stt_while_speaking``.
5696
+ */
5697
+ forwardSttWhileSpeaking = ["1", "true", "yes"].includes(
5698
+ (process.env.PATTER_FORWARD_STT_WHILE_SPEAKING ?? "").trim().toLowerCase()
5699
+ );
5354
5700
  // Throttle state for back-to-back STT finals — see ``commitTranscript``.
5355
5701
  lastCommitText = "";
5356
5702
  lastCommitAt = 0;
5703
+ /** The agent's spoken text for the CURRENT turn, accumulated as tokens stream.
5704
+ * The echo guard rejects transcripts matching it (the agent's own TTS bleeding
5705
+ * back into STT when audio is forwarded during TTS without effective AEC).
5706
+ * Reset in ``beginSpeaking``; only consulted while ``forwardSttWhileSpeaking``.
5707
+ * Parity with Python ``_current_agent_spoken_text``. */
5708
+ currentAgentSpokenText = "";
5357
5709
  // PCM16 byte-alignment carry for TTS streaming (pipeline mode).
5358
5710
  // HTTP streams from ElevenLabs / OpenAI / Cartesia can yield chunks of any
5359
5711
  // size, including odd byte counts. Silently dropping the trailing odd byte
@@ -5373,6 +5725,11 @@ var StreamHandler = class _StreamHandler {
5373
5725
  this.ws = ws;
5374
5726
  this.caller = caller;
5375
5727
  this.callee = callee;
5728
+ if (this.forwardSttWhileSpeaking) {
5729
+ getLogger().warn(
5730
+ "PATTER_FORWARD_STT_WHILE_SPEAKING=on: inbound audio is sent to STT during TTS so transcript barge-in works on echo-masked links. Without AEC the agent's own voice may be transcribed as a phantom interruption \u2014 pair with agent.bargeInStrategies."
5731
+ );
5732
+ }
5376
5733
  this.bargeInStrategies = (deps.agent.bargeInStrategies ?? []).slice();
5377
5734
  const confirmMs = deps.agent.bargeInConfirmMs;
5378
5735
  this.bargeInConfirmMs = typeof confirmMs === "number" && Number.isFinite(confirmMs) && confirmMs > 0 ? confirmMs : 1500;
@@ -5572,12 +5929,12 @@ var StreamHandler = class _StreamHandler {
5572
5929
  } catch {
5573
5930
  }
5574
5931
  if (this.deps.onCallStart) {
5575
- const direction = this.deps.metricsStore.getActive(callId)?.direction ?? "inbound";
5932
+ const direction2 = this.deps.metricsStore.getActive(callId)?.direction ?? "inbound";
5576
5933
  await this.deps.onCallStart({
5577
5934
  call_id: callId,
5578
5935
  caller: this.caller,
5579
5936
  callee: this.callee,
5580
- direction,
5937
+ direction: direction2,
5581
5938
  telephony_provider: this.deps.bridge.telephonyProvider,
5582
5939
  ...Object.keys(customParams).length > 0 ? { custom_params: customParams } : {}
5583
5940
  });
@@ -5644,6 +6001,17 @@ var StreamHandler = class _StreamHandler {
5644
6001
  setStreamSid(sid) {
5645
6002
  this.streamSid = sid;
5646
6003
  }
6004
+ /**
6005
+ * Record a terminal/processing error as a coarse, anonymous code on the call
6006
+ * metrics (code only, never the message). Surfaced via `call_completed`
6007
+ * telemetry. Safe to call with any value; last write wins.
6008
+ */
6009
+ recordError(err) {
6010
+ try {
6011
+ this.metricsAcc.recordError(err);
6012
+ } catch {
6013
+ }
6014
+ }
5647
6015
  /** Handle an incoming audio chunk (already decoded from base64). */
5648
6016
  /** Forward inbound audio bytes to the AI adapter and (in pipeline mode) the STT provider. */
5649
6017
  async handleAudio(audioBuffer) {
@@ -5670,6 +6038,9 @@ var StreamHandler = class _StreamHandler {
5670
6038
  );
5671
6039
  }
5672
6040
  if (evt?.type === "speech_start") {
6041
+ if (this.isSpeaking && this.tailGraceActive) {
6042
+ this.endTailGraceForNewTurn();
6043
+ }
5673
6044
  const phantomSuppressed = this.isSpeaking && !this.canBargeIn();
5674
6045
  if (phantomSuppressed) {
5675
6046
  getLogger().info(
@@ -5677,7 +6048,8 @@ var StreamHandler = class _StreamHandler {
5677
6048
  );
5678
6049
  this.suppressedSpeechPending = true;
5679
6050
  } else if (this.isSpeaking) {
5680
- if (this.bargeInStrategies.length > 0) {
6051
+ const deferCancel = this.bargeInStrategies.length > 0 || this.forwardSttWhileSpeaking && !this.aec;
6052
+ if (deferCancel) {
5681
6053
  this.startPendingBargeIn();
5682
6054
  this.metricsAcc.anchorUserSpeechStart();
5683
6055
  return;
@@ -5687,6 +6059,7 @@ var StreamHandler = class _StreamHandler {
5687
6059
  this.metricsAcc.recordBargeinDetected();
5688
6060
  const bargeinSpan = startSpan(SPAN_BARGEIN, { "patter.call.id": this.callId });
5689
6061
  try {
6062
+ this.maybeTruncateCompletedTurnHistory();
5690
6063
  this.cancelSpeaking();
5691
6064
  try {
5692
6065
  this.deps.bridge.sendClear(this.ws, this.streamSid);
@@ -5731,9 +6104,10 @@ var StreamHandler = class _StreamHandler {
5731
6104
  if (this.inboundAudioRing.length > _StreamHandler.INBOUND_AUDIO_RING_FRAMES) {
5732
6105
  this.inboundAudioRing.shift();
5733
6106
  }
6107
+ if (!this.forwardSttWhileSpeaking) return;
6108
+ } else if ((this.deps.agent.bargeInThresholdMs ?? 300) === 0) {
5734
6109
  return;
5735
6110
  }
5736
- if ((this.deps.agent.bargeInThresholdMs ?? 300) === 0) return;
5737
6111
  }
5738
6112
  const hooks = this.deps.agent.hooks;
5739
6113
  if (hooks?.beforeSendToStt) {
@@ -5795,6 +6169,27 @@ var StreamHandler = class _StreamHandler {
5795
6169
  }
5796
6170
  }
5797
6171
  }
6172
+ /**
6173
+ * Await the backgrounded turn dispatch during teardown, but never block
6174
+ * longer than ``DISPATCH_SETTLE_TIMEOUT_MS``. The earlier ``llmAbort.abort()``
6175
+ * settles the built-in LLM/TTS paths immediately; the cap only bites a
6176
+ * misbehaving user ``onMessage`` parked on a hung external call (JS promises
6177
+ * can't be cancelled). No-op when nothing is in flight.
6178
+ */
6179
+ async settleDispatchForTeardown() {
6180
+ if (!this.dispatchTask) return;
6181
+ const settle = this.dispatchTask.catch(() => {
6182
+ });
6183
+ let timer;
6184
+ const cap = new Promise((resolve2) => {
6185
+ timer = setTimeout(resolve2, _StreamHandler.DISPATCH_SETTLE_TIMEOUT_MS);
6186
+ });
6187
+ try {
6188
+ await Promise.race([settle, cap]);
6189
+ } finally {
6190
+ if (timer) clearTimeout(timer);
6191
+ }
6192
+ }
5798
6193
  /** Handle call stop / stream end. */
5799
6194
  /** Handle a carrier-emitted `stop` event signalling the call has ended. */
5800
6195
  async handleStop() {
@@ -5811,6 +6206,7 @@ var StreamHandler = class _StreamHandler {
5811
6206
  } catch {
5812
6207
  }
5813
6208
  }
6209
+ await this.settleDispatchForTeardown();
5814
6210
  this.clearPendingBargeIn();
5815
6211
  this.drainPendingMarks();
5816
6212
  this.clearGraceTimer();
@@ -5838,6 +6234,7 @@ var StreamHandler = class _StreamHandler {
5838
6234
  } catch {
5839
6235
  }
5840
6236
  }
6237
+ await this.settleDispatchForTeardown();
5841
6238
  this.clearPendingBargeIn();
5842
6239
  this.drainPendingMarks();
5843
6240
  this.clearGraceTimer();
@@ -6232,7 +6629,7 @@ var StreamHandler = class _StreamHandler {
6232
6629
  };
6233
6630
  }
6234
6631
  /** Synthesize a single sentence through TTS with hooks, sending audio to telephony. */
6235
- async synthesizeSentence(sentence, hookExecutor, hookCtx, ttsFirstByteSent) {
6632
+ async synthesizeSentence(sentence, hookExecutor, hookCtx, ttsFirstByteSent, recordSegment = true) {
6236
6633
  if (!this.tts || !this.isSpeaking) return;
6237
6634
  let transformed = sentence;
6238
6635
  const transforms = this.deps.agent.textTransforms;
@@ -6258,8 +6655,16 @@ var StreamHandler = class _StreamHandler {
6258
6655
  if (this.aec) {
6259
6656
  this.aec.pushFarEnd(processedAudio);
6260
6657
  }
6658
+ if (recordSegment) {
6659
+ this.turnSpokenSegments.push({
6660
+ text: processedText,
6661
+ startMs: this.turnPlaybackTotalMs
6662
+ });
6663
+ recordSegment = false;
6664
+ }
6261
6665
  const encoded = this.encodePipelineAudio(processedAudio);
6262
6666
  this.deps.bridge.sendAudio(this.ws, encoded, this.streamSid);
6667
+ this.trackOutboundPlayback(processedAudio.length);
6263
6668
  this.markFirstAudioSent();
6264
6669
  }
6265
6670
  } catch (e) {
@@ -6334,64 +6739,101 @@ var StreamHandler = class _StreamHandler {
6334
6739
  return;
6335
6740
  }
6336
6741
  this.history.push({ role: "user", text: filteredTranscript, timestamp: Date.now() });
6337
- let responseText = "";
6338
6742
  this.metricsAcc.recordOnUserTurnCompletedDelay(0);
6339
6743
  this.metricsAcc.recordTurnCommitted();
6340
6744
  closeEndpointSpan();
6341
- if (this.deps.onMessage && typeof this.deps.onMessage === "function") {
6342
- try {
6343
- responseText = await this.deps.onMessage({
6745
+ await this.dispatchTask?.catch(() => {
6746
+ });
6747
+ const historySnapshot = [...this.history.entries];
6748
+ this.dispatchTask = this.dispatchTurn(
6749
+ filteredTranscript,
6750
+ hookExecutor,
6751
+ hookCtx,
6752
+ interrupted,
6753
+ historySnapshot
6754
+ );
6755
+ }
6756
+ /**
6757
+ * Post-commit turn body (LLM dispatch → TTS → turn-complete) run as a
6758
+ * tracked background task so the transcript drain loop is not blocked for
6759
+ * the whole (possibly 30-90 s) agent-runtime turn. A barge-in — transcript
6760
+ * (now reachable mid-turn) or VAD — aborts the in-flight ``llmAbort`` and
6761
+ * flips ``isSpeaking``, which the LLM/TTS loops here observe and break on.
6762
+ * Parity with Python ``_dispatch_turn``.
6763
+ */
6764
+ async dispatchTurn(filteredTranscript, hookExecutor, hookCtx, interrupted, historySnapshot) {
6765
+ const label = this.deps.bridge.label;
6766
+ let responseText = "";
6767
+ try {
6768
+ if (this.deps.onMessage && typeof this.deps.onMessage === "function") {
6769
+ try {
6770
+ responseText = await this.deps.onMessage({
6771
+ text: filteredTranscript,
6772
+ call_id: this.callId,
6773
+ caller: this.caller,
6774
+ callee: this.callee,
6775
+ history: historySnapshot
6776
+ });
6777
+ } catch (e) {
6778
+ getLogger().error(`onMessage error (${label}):`, e);
6779
+ return;
6780
+ }
6781
+ if (!responseText) {
6782
+ getLogger().warn(
6783
+ `onMessage returned empty/void (${label}) \u2014 no TTS will play. If you intended to observe transcripts, use onTranscript instead; if you meant to answer via the built-in LLM, remove onMessage and pass openaiKey.`
6784
+ );
6785
+ }
6786
+ } else if (this.deps.onMessage && isRemoteUrl(this.deps.onMessage)) {
6787
+ const msgData = {
6344
6788
  text: filteredTranscript,
6345
6789
  call_id: this.callId,
6346
6790
  caller: this.caller,
6347
6791
  callee: this.callee,
6348
- history: [...this.history.entries]
6349
- });
6350
- } catch (e) {
6351
- getLogger().error(`onMessage error (${label}):`, e);
6352
- return;
6353
- }
6354
- if (!responseText) {
6792
+ history: historySnapshot
6793
+ };
6794
+ if (isWebSocketUrl(this.deps.onMessage)) {
6795
+ await this.handleWebSocketResponse(msgData);
6796
+ return;
6797
+ }
6798
+ try {
6799
+ responseText = await this.deps.remoteHandler.callWebhook(this.deps.onMessage, msgData);
6800
+ } catch (e) {
6801
+ getLogger().error(`Webhook remote error (${label}):`, e);
6802
+ return;
6803
+ }
6804
+ } else if (this.llmLoop) {
6805
+ const llmResult = await this.runPipelineLlm(
6806
+ filteredTranscript,
6807
+ hookExecutor,
6808
+ hookCtx,
6809
+ historySnapshot
6810
+ );
6811
+ responseText = llmResult.text;
6812
+ interrupted = interrupted || llmResult.interrupted;
6813
+ } else {
6355
6814
  getLogger().warn(
6356
- `onMessage returned empty/void (${label}) \u2014 no TTS will play. If you intended to observe transcripts, use onTranscript instead; if you meant to answer via the built-in LLM, remove onMessage and pass openaiKey.`
6815
+ `Pipeline (${label}) has no llm/onMessage handler \u2014 transcript "${sanitizeLogValue(filteredTranscript.slice(0, 60))}" dropped. Check that agent.llm or onMessage is configured.`
6357
6816
  );
6358
- }
6359
- } else if (this.deps.onMessage && isRemoteUrl(this.deps.onMessage)) {
6360
- const msgData = {
6361
- text: filteredTranscript,
6362
- call_id: this.callId,
6363
- caller: this.caller,
6364
- callee: this.callee,
6365
- history: [...this.history.entries]
6366
- };
6367
- if (isWebSocketUrl(this.deps.onMessage)) {
6368
- await this.handleWebSocketResponse(msgData);
6369
6817
  return;
6370
6818
  }
6371
- try {
6372
- responseText = await this.deps.remoteHandler.callWebhook(this.deps.onMessage, msgData);
6373
- } catch (e) {
6374
- getLogger().error(`Webhook remote error (${label}):`, e);
6375
- return;
6819
+ if (!responseText) return;
6820
+ if (this.llmLoop) {
6821
+ let spokenText = responseText;
6822
+ if (interrupted) {
6823
+ const heard = this.heardResponsePrefix();
6824
+ spokenText = heard === null ? `${responseText} [interrupted by caller]` : heard.text ? `${heard.text} [interrupted by caller]` : "[interrupted by caller]";
6825
+ }
6826
+ await this.emitAssistantTranscript(spokenText);
6827
+ if (!interrupted) this.metricsAcc.recordTtsComplete(responseText);
6828
+ } else {
6829
+ interrupted = await this.runRegularLlm(responseText, hookExecutor, hookCtx) || interrupted;
6830
+ responseText = this.history.entries[this.history.entries.length - 1]?.text ?? responseText;
6376
6831
  }
6377
- } else if (this.llmLoop) {
6378
- responseText = await this.runPipelineLlm(filteredTranscript, hookExecutor, hookCtx);
6379
- } else {
6380
- getLogger().warn(
6381
- `Pipeline (${label}) has no llm/onMessage handler \u2014 transcript "${sanitizeLogValue(filteredTranscript.slice(0, 60))}" dropped. Check that agent.llm or onMessage is configured.`
6382
- );
6383
- return;
6384
- }
6385
- if (!responseText) return;
6386
- if (this.llmLoop) {
6387
- await this.emitAssistantTranscript(responseText);
6388
- this.metricsAcc.recordTtsComplete(responseText);
6389
- } else {
6390
- interrupted = await this.runRegularLlm(responseText, hookExecutor, hookCtx) || interrupted;
6391
- responseText = this.history.entries[this.history.entries.length - 1]?.text ?? responseText;
6392
- }
6393
- if (!interrupted) {
6394
- await this.emitTurnMetrics(this.metricsAcc.recordTurnComplete(responseText));
6832
+ if (!interrupted) {
6833
+ await this.emitTurnMetrics(this.metricsAcc.recordTurnComplete(responseText));
6834
+ }
6835
+ } finally {
6836
+ this.dispatchTask = null;
6395
6837
  }
6396
6838
  }
6397
6839
  /**
@@ -6402,6 +6844,18 @@ var StreamHandler = class _StreamHandler {
6402
6844
  */
6403
6845
  async handleBargeInAsync(transcript) {
6404
6846
  if (!transcript.text || !this.isSpeaking) return false;
6847
+ if (this.tailGraceActive) {
6848
+ this.endTailGraceForNewTurn();
6849
+ return false;
6850
+ }
6851
+ if (this.forwardSttWhileSpeaking && looksLikeEcho(transcript.text, this.currentAgentSpokenText)) {
6852
+ getLogger().info(
6853
+ `Barge-in suppressed: transcript matches agent's own speech (echo) \u2014 ${sanitizeLogValue(
6854
+ transcript.text.slice(0, 40)
6855
+ )}`
6856
+ );
6857
+ return false;
6858
+ }
6405
6859
  if (!this.canBargeIn()) {
6406
6860
  getLogger().info(
6407
6861
  `Barge-in transcript suppressed (agent speaking < gate, aec=${this.aec ? "on" : "off"})`
@@ -6441,6 +6895,18 @@ var StreamHandler = class _StreamHandler {
6441
6895
  */
6442
6896
  handleBargeIn(transcript) {
6443
6897
  if (!transcript.text || !this.isSpeaking) return false;
6898
+ if (this.tailGraceActive) {
6899
+ this.endTailGraceForNewTurn();
6900
+ return false;
6901
+ }
6902
+ if (this.forwardSttWhileSpeaking && looksLikeEcho(transcript.text, this.currentAgentSpokenText)) {
6903
+ getLogger().info(
6904
+ `Barge-in suppressed: transcript matches agent's own speech (echo) \u2014 ${sanitizeLogValue(
6905
+ transcript.text.slice(0, 40)
6906
+ )}`
6907
+ );
6908
+ return false;
6909
+ }
6444
6910
  if (this.bargeInStrategies.length === 0) {
6445
6911
  if (!this.canBargeIn()) {
6446
6912
  getLogger().info(
@@ -6472,6 +6938,7 @@ var StreamHandler = class _StreamHandler {
6472
6938
  this.metricsAcc.recordBargeinDetected();
6473
6939
  const bargeinSpan = startSpan(SPAN_BARGEIN, { "patter.call.id": this.callId });
6474
6940
  try {
6941
+ this.maybeTruncateCompletedTurnHistory();
6475
6942
  this.cancelSpeaking();
6476
6943
  try {
6477
6944
  this.deps.bridge.sendClear(this.ws, this.streamSid);
@@ -6535,15 +7002,21 @@ var StreamHandler = class _StreamHandler {
6535
7002
  getLogger().debug(`Dropped likely STT hallucination: ${sanitizeLogValue(normalised.slice(0, 40))}`);
6536
7003
  return false;
6537
7004
  }
7005
+ if (this.forwardSttWhileSpeaking && this.isSpeaking && looksLikeEcho(text, this.currentAgentSpokenText)) {
7006
+ getLogger().debug(
7007
+ `Dropped agent-echo transcript (not a user turn): ${sanitizeLogValue(normalised.slice(0, 40))}`
7008
+ );
7009
+ return false;
7010
+ }
6538
7011
  if (sinceLastMs < 2e3 && normalised === this.lastCommitText) {
6539
7012
  getLogger().debug(
6540
7013
  `Dropped duplicate final transcript (${(sinceLastMs / 1e3).toFixed(1)}s since last): ${sanitizeLogValue(normalised.slice(0, 40))}`
6541
7014
  );
6542
7015
  return false;
6543
7016
  }
6544
- if (sinceLastMs < 500) {
7017
+ if (sinceLastMs < 500 && isNearDuplicate(normalised, this.lastCommitText)) {
6545
7018
  getLogger().debug(
6546
- `Dropped back-to-back final transcript (${(sinceLastMs / 1e3).toFixed(2)}s since last): ${sanitizeLogValue(normalised.slice(0, 40))}`
7019
+ `Dropped back-to-back near-duplicate final (${(sinceLastMs / 1e3).toFixed(2)}s since last): ${sanitizeLogValue(normalised.slice(0, 40))}`
6547
7020
  );
6548
7021
  return false;
6549
7022
  }
@@ -6551,11 +7024,63 @@ var StreamHandler = class _StreamHandler {
6551
7024
  this.lastCommitAt = now;
6552
7025
  return true;
6553
7026
  }
7027
+ /**
7028
+ * Schedule the opt-in long-turn filler and return its async ``clear()``.
7029
+ *
7030
+ * When ``agent.longTurnMessage`` is unset / empty the returned clear is a
7031
+ * no-op (byte-identical to today's behaviour). Otherwise a one-shot timer
7032
+ * fires after ``agent.longTurnMessageAfterS`` seconds and, IFF no audio has
7033
+ * reached the carrier this turn (``!ttsFirstByteSent.value``) AND we still own
7034
+ * the floor (``this.isSpeaking``), synthesizes the filler ONCE via the same
7035
+ * per-sentence TTS primitive every sentence uses.
7036
+ *
7037
+ * The returned ``clear()`` is **async**: it stops the timer AND, if the filler
7038
+ * already started synthesizing (its ``setTimeout`` callback runs in a separate
7039
+ * macro-task, so it can fire just before the first real sentence), AWAITS the
7040
+ * in-flight synthesis so the filler audio can never interleave with the real
7041
+ * sentence that follows. Idempotent; self-synthesis failure degrades to
7042
+ * silence (never crashes the turn). The caller must clear on first real audio,
7043
+ * on the error branch, and in the finally.
7044
+ */
7045
+ scheduleLongTurnFiller(ttsFirstByteSent, hookExecutor, hookCtx, label) {
7046
+ const message = this.deps.agent.longTurnMessage;
7047
+ if (!message) return async () => {
7048
+ };
7049
+ const afterS = this.deps.agent.longTurnMessageAfterS ?? 4;
7050
+ let cancelled = false;
7051
+ let inFlight = null;
7052
+ const timer = setTimeout(() => {
7053
+ if (cancelled || ttsFirstByteSent.value || !this.isSpeaking) return;
7054
+ inFlight = this.synthesizeSentence(
7055
+ message,
7056
+ hookExecutor,
7057
+ hookCtx,
7058
+ ttsFirstByteSent,
7059
+ false
7060
+ ).catch((err) => {
7061
+ getLogger().error(
7062
+ `longTurnMessage filler synthesis failed (${label}):`,
7063
+ err
7064
+ );
7065
+ });
7066
+ }, Math.max(0, afterS * 1e3));
7067
+ return async () => {
7068
+ cancelled = true;
7069
+ clearTimeout(timer);
7070
+ if (inFlight !== null) {
7071
+ const pending = inFlight;
7072
+ inFlight = null;
7073
+ await pending;
7074
+ }
7075
+ };
7076
+ }
6554
7077
  /**
6555
7078
  * Streaming built-in LLM path with sentence chunking and per-sentence
6556
- * guardrails/TTS. Returns the concatenated response text.
7079
+ * guardrails/TTS. Returns the concatenated (plain) response text plus whether
7080
+ * the turn was cut short by a barge-in — the caller applies the interrupted
7081
+ * marker to history only, keeping metrics on the plain text.
6557
7082
  */
6558
- async runPipelineLlm(filteredTranscript, hookExecutor, hookCtx) {
7083
+ async runPipelineLlm(filteredTranscript, hookExecutor, hookCtx, historySnapshot) {
6559
7084
  const label = this.deps.bridge.label;
6560
7085
  const callCtx = { call_id: this.callId, caller: this.caller, callee: this.callee };
6561
7086
  const chunker = new SentenceChunker({
@@ -6568,6 +7093,12 @@ var StreamHandler = class _StreamHandler {
6568
7093
  this.llmAbort = new AbortController();
6569
7094
  const llmSignal = this.llmAbort.signal;
6570
7095
  let llmError = false;
7096
+ const clearLongTurnFiller = this.scheduleLongTurnFiller(
7097
+ ttsFirstByteSent,
7098
+ hookExecutor,
7099
+ hookCtx,
7100
+ label
7101
+ );
6571
7102
  const llmSpan = startSpan(SPAN_LLM, { "patter.call.id": this.callId });
6572
7103
  const guardAndSpeak = async (sentence, isFirst) => {
6573
7104
  if (isFirst) this.metricsAcc.recordLlmFirstSentenceComplete();
@@ -6578,6 +7109,7 @@ var StreamHandler = class _StreamHandler {
6578
7109
  if (transformed === null) return;
6579
7110
  sentenceText = transformed;
6580
7111
  }
7112
+ await clearLongTurnFiller();
6581
7113
  await this.synthesizeSentence(sentenceText, hookExecutor, hookCtx, ttsFirstByteSent);
6582
7114
  };
6583
7115
  let firstSentenceEmitted = false;
@@ -6585,7 +7117,7 @@ var StreamHandler = class _StreamHandler {
6585
7117
  try {
6586
7118
  for await (const token of this.llmLoop.run(
6587
7119
  filteredTranscript,
6588
- this.history.entries,
7120
+ historySnapshot,
6589
7121
  callCtx,
6590
7122
  this.metricsAcc,
6591
7123
  hookExecutor,
@@ -6596,6 +7128,7 @@ var StreamHandler = class _StreamHandler {
6596
7128
  this.metricsAcc.recordLlmFirstToken();
6597
7129
  await this.emitLlmFirstToken();
6598
7130
  allParts.push(token);
7131
+ this.currentAgentSpokenText = allParts.join("");
6599
7132
  for (const sentence of chunker.push(token)) {
6600
7133
  if (!this.isSpeaking) break;
6601
7134
  await guardAndSpeak(sentence, !firstSentenceEmitted);
@@ -6605,6 +7138,7 @@ var StreamHandler = class _StreamHandler {
6605
7138
  }
6606
7139
  } catch (e) {
6607
7140
  const isAbort = e?.name === "AbortError" || llmSignal.aborted;
7141
+ await clearLongTurnFiller();
6608
7142
  if (!isAbort) {
6609
7143
  llmError = true;
6610
7144
  chunker.reset();
@@ -6613,7 +7147,7 @@ var StreamHandler = class _StreamHandler {
6613
7147
  const fallback = this.deps.agent.llmErrorMessage;
6614
7148
  if (fallback && !ttsFirstByteSent.value && this.isSpeaking) {
6615
7149
  try {
6616
- await this.synthesizeSentence(fallback, hookExecutor, hookCtx, ttsFirstByteSent);
7150
+ await this.synthesizeSentence(fallback, hookExecutor, hookCtx, ttsFirstByteSent, false);
6617
7151
  } catch (err) {
6618
7152
  getLogger().error(`llmErrorMessage fallback synthesis failed (${label}):`, err);
6619
7153
  }
@@ -6629,6 +7163,7 @@ var StreamHandler = class _StreamHandler {
6629
7163
  }
6630
7164
  }
6631
7165
  } finally {
7166
+ await clearLongTurnFiller();
6632
7167
  this.endSpeakingWithGrace();
6633
7168
  this.llmAbort = null;
6634
7169
  try {
@@ -6636,7 +7171,7 @@ var StreamHandler = class _StreamHandler {
6636
7171
  } catch {
6637
7172
  }
6638
7173
  }
6639
- return allParts.join("");
7174
+ return { text: allParts.join(""), interrupted: llmSignal.aborted };
6640
7175
  }
6641
7176
  /**
6642
7177
  * Non-streaming path (onMessage function / webhook): apply output guardrails,
@@ -7764,13 +8299,14 @@ function isLoopbackHost(value) {
7764
8299
  }
7765
8300
  return false;
7766
8301
  }
8302
+ var TELNYX_FUTURE_SKEW_MS = 3e4;
7767
8303
  function validateTelnyxSignature(rawBody, signature, timestamp, publicKey, toleranceSec = 300) {
7768
8304
  try {
7769
8305
  const ts = parseInt(timestamp, 10);
7770
8306
  if (!Number.isFinite(ts)) return false;
7771
8307
  const tsMs = ts < 1e12 ? ts * 1e3 : ts;
7772
8308
  const ageMs = Date.now() - tsMs;
7773
- if (ageMs < 0 || ageMs > toleranceSec * 1e3) return false;
8309
+ if (ageMs > toleranceSec * 1e3 || ageMs < -TELNYX_FUTURE_SKEW_MS) return false;
7774
8310
  const payload = `${timestamp}|${rawBody}`;
7775
8311
  const keyBuffer = Buffer.from(publicKey, "base64");
7776
8312
  const keyObject = crypto5.createPublicKey({
@@ -7816,7 +8352,7 @@ function sanitizeVariables(raw) {
7816
8352
  for (const key of Object.keys(raw)) {
7817
8353
  if (BLOCKED_KEYS.has(key)) continue;
7818
8354
  const val = raw[key];
7819
- safe[key] = typeof val === "string" ? val : String(val ?? "");
8355
+ safe[key] = (typeof val === "string" ? val : String(val ?? "")).replace(/[\x00-\x1f\x7f]/g, "").slice(0, 500);
7820
8356
  }
7821
8357
  return safe;
7822
8358
  }
@@ -8212,6 +8748,9 @@ var EmbeddedServer = class {
8212
8748
  twilioTokenWarningLogged = false;
8213
8749
  telnyxSigWarningLogged = false;
8214
8750
  metricsStore;
8751
+ /** Anonymous telemetry client, set by ``client.ts`` ``serve()``; emits the
8752
+ * per-call ``call_completed`` event from the call-end path. */
8753
+ telemetry;
8215
8754
  pricing;
8216
8755
  remoteHandler = new RemoteMessageHandler();
8217
8756
  /**
@@ -8315,6 +8854,12 @@ var EmbeddedServer = class {
8315
8854
  * Mirrors Python's ``_resolve_completion``.
8316
8855
  */
8317
8856
  resolveCompletion(callId, args) {
8857
+ if (args.outcome === "no_answer" || args.outcome === "busy" || args.outcome === "failed") {
8858
+ recordCallCompleted(this.telemetry, {
8859
+ outcome: args.outcome,
8860
+ carrier: this.config.telephonyProvider
8861
+ });
8862
+ }
8318
8863
  const entry = this.completions.get(callId);
8319
8864
  if (!entry || entry.done) return;
8320
8865
  const data = args.data;
@@ -9063,7 +9608,13 @@ var EmbeddedServer = class {
9063
9608
  return Object.fromEntries(Object.entries(snap).filter(([, v]) => v !== void 0));
9064
9609
  };
9065
9610
  const store = this.metricsStore;
9611
+ const telemetry = this.telemetry;
9066
9612
  const wrappedStart = async (data) => {
9613
+ recordCallStarted(telemetry, {
9614
+ providerMode: agent.provider ?? void 0,
9615
+ telephonyProvider: bridge.telephonyProvider,
9616
+ direction: data.direction
9617
+ });
9067
9618
  if (logger.enabled) {
9068
9619
  const callId = typeof data.call_id === "string" ? data.call_id : "";
9069
9620
  const dataCaller = typeof data.caller === "string" ? data.caller : "";
@@ -9094,6 +9645,11 @@ var EmbeddedServer = class {
9094
9645
  if (userMetrics) await userMetrics(data);
9095
9646
  };
9096
9647
  const wrappedEnd = async (data) => {
9648
+ recordCallCompleted(this.telemetry, {
9649
+ outcome: "completed",
9650
+ metrics: data.metrics,
9651
+ direction: data.direction
9652
+ });
9097
9653
  if (logger.enabled) {
9098
9654
  const callId = typeof data.call_id === "string" ? data.call_id : "";
9099
9655
  const metricsObj = data.metrics ?? null;
@@ -9149,7 +9705,7 @@ var EmbeddedServer = class {
9149
9705
  await handler.handleCallStart(callSid, customParameters);
9150
9706
  } else if (event === "media") {
9151
9707
  const payload = data.media?.payload ?? "";
9152
- handler.handleAudio(Buffer.from(payload, "base64"));
9708
+ await handler.handleAudio(Buffer.from(payload, "base64"));
9153
9709
  } else if (event === "mark") {
9154
9710
  const markName = String(data.mark?.name ?? "");
9155
9711
  if (markName) await handler.onMark(markName);
@@ -9161,6 +9717,7 @@ var EmbeddedServer = class {
9161
9717
  }
9162
9718
  } catch (err) {
9163
9719
  getLogger().error("Stream handler error:", err);
9720
+ handler.recordError(err);
9164
9721
  }
9165
9722
  });
9166
9723
  ws.on("close", async () => {
@@ -9205,7 +9762,7 @@ var EmbeddedServer = class {
9205
9762
  if (track !== "inbound") return;
9206
9763
  const audioChunk = data.media?.payload ?? "";
9207
9764
  if (!audioChunk) return;
9208
- handler.handleAudio(Buffer.from(audioChunk, "base64"));
9765
+ await handler.handleAudio(Buffer.from(audioChunk, "base64"));
9209
9766
  } else if (event === "dtmf") {
9210
9767
  const digit = String(data.dtmf?.digit ?? "").trim();
9211
9768
  if (digit) {
@@ -9219,9 +9776,11 @@ var EmbeddedServer = class {
9219
9776
  }
9220
9777
  } catch (err) {
9221
9778
  getLogger().error("Stream handler error (Telnyx):", err);
9779
+ handler.recordError(err);
9222
9780
  }
9223
9781
  });
9224
9782
  ws.on("close", async () => {
9783
+ this.activeCallIds.delete(ws);
9225
9784
  await handler.handleWsClose();
9226
9785
  });
9227
9786
  }
@@ -9250,7 +9809,7 @@ var EmbeddedServer = class {
9250
9809
  await handler.handleCallStart(callId);
9251
9810
  } else if (event === "media") {
9252
9811
  const payload = data.media?.payload ?? "";
9253
- if (payload) handler.handleAudio(Buffer.from(payload, "base64"));
9812
+ if (payload) await handler.handleAudio(Buffer.from(payload, "base64"));
9254
9813
  } else if (event === "playedStream") {
9255
9814
  const markName = String(data.name ?? "");
9256
9815
  if (markName) await handler.onMark(markName);
@@ -9264,6 +9823,7 @@ var EmbeddedServer = class {
9264
9823
  }
9265
9824
  } catch (err) {
9266
9825
  getLogger().error("Stream handler error (Plivo):", err);
9826
+ handler.recordError(err);
9267
9827
  }
9268
9828
  });
9269
9829
  ws.on("close", async () => {
@@ -9733,7 +10293,7 @@ var OpenAILLMProvider = class {
9733
10293
  });
9734
10294
  if (!response.ok) {
9735
10295
  const errText = await response.text();
9736
- getLogger().error(`LLM API error: ${response.status} ${errText}`);
10296
+ getLogger().error(`LLM API error: ${response.status} ${errText.slice(0, 200)}`);
9737
10297
  throw new PatterConnectionError(
9738
10298
  `LLM API returned ${response.status}: ${errText.slice(0, 200)}`
9739
10299
  );
@@ -9902,7 +10462,15 @@ ${systemPrompt}` : DEFAULT_PHONE_PREAMBLE;
9902
10462
  const hasAfterLlmChunk = Boolean(hookExecutor?.hasAfterLlmChunk());
9903
10463
  const allEmittedText = [];
9904
10464
  const callId = callContext.call_id;
9905
- const streamOpts = typeof callId === "string" && callId.length > 0 ? { ...opts, callId } : opts;
10465
+ const caller = callContext.caller;
10466
+ const callee = callContext.callee;
10467
+ const hasContext = typeof callId === "string" && callId.length > 0 || typeof caller === "string" && caller.length > 0 || typeof callee === "string" && callee.length > 0;
10468
+ const streamOpts = hasContext ? {
10469
+ ...opts,
10470
+ ...typeof callId === "string" && callId.length > 0 ? { callId } : {},
10471
+ ...typeof caller === "string" && caller.length > 0 ? { caller } : {},
10472
+ ...typeof callee === "string" && callee.length > 0 ? { callee } : {}
10473
+ } : opts;
9906
10474
  for (let iter = 0; iter < maxIterations; iter++) {
9907
10475
  const toolCallsAccumulated = /* @__PURE__ */ new Map();
9908
10476
  const textParts = [];
@@ -10036,6 +10604,7 @@ ${systemPrompt}` : DEFAULT_PHONE_PREAMBLE;
10036
10604
  { role: "system", content: this.systemPrompt }
10037
10605
  ];
10038
10606
  for (const entry of history) {
10607
+ if (entry.role === "tool") continue;
10039
10608
  messages.push({
10040
10609
  role: entry.role === "assistant" ? "assistant" : "user",
10041
10610
  content: entry.text