getpatter 0.6.3 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,6 +47,45 @@ var OpenAIRealtimeVADType = {
47
47
  SERVER_VAD: "server_vad",
48
48
  SEMANTIC_VAD: "semantic_vad"
49
49
  };
50
/**
 * Validate a caller-supplied Realtime turn-detection config before it is
 * converted into a wire payload.
 *
 * Rules enforced (each violation throws):
 *  - `type`, when set, must be `'server_vad'` or `'semantic_vad'`.
 *  - `eagerness`, when set, must be `low`, `medium`, `high` or `auto`.
 *  - `eagerness` may only accompany `type: 'semantic_vad'`.
 *
 * @param {object|null|undefined} td - Turn-detection options. A missing
 *   config (`undefined` or `null`) is accepted without error.
 * @returns {void}
 * @throws {Error} When any of the rules above is violated.
 */
function validateRealtimeTurnDetection(td) {
  // `== null` matches both undefined and null. The builder reads this same
  // value through `td?.…`, so an explicit null must not crash here with a
  // TypeError on `td.type`.
  if (td == null) return;
  const { type, eagerness } = td;
  if (type !== void 0 && type !== "server_vad" && type !== "semantic_vad") {
    throw new Error(
      `RealtimeTurnDetection.type must be 'server_vad' or 'semantic_vad', got ${JSON.stringify(type)}`
    );
  }
  if (eagerness === void 0) return;
  if (eagerness !== "low" && eagerness !== "medium" && eagerness !== "high" && eagerness !== "auto") {
    throw new Error(
      `RealtimeTurnDetection.eagerness must be one of low|medium|high|auto, got ${JSON.stringify(eagerness)}`
    );
  }
  // A valid eagerness value is still rejected unless the type is
  // explicitly semantic_vad (an omitted type does not qualify).
  if (type !== "semantic_vad") {
    throw new Error(
      "RealtimeTurnDetection.eagerness is only valid when type='semantic_vad'"
    );
  }
}
68
/**
 * Build the `turn_detection` object sent in a Realtime `session.update`.
 *
 * - `semantic_vad`: emits only `type` plus an optional `eagerness`; the
 *   threshold / padding / silence fields are not sent.
 * - otherwise: emits `type`, `threshold`, `prefix_padding_ms` and
 *   `silence_duration_ms`, each falling back to a default when the caller
 *   left it unset.
 *
 * @param {object|null|undefined} td - Caller-supplied turn-detection options
 *   (`type`, `eagerness`, `threshold`, `prefixPaddingMs`, `silenceDurationMs`).
 * @param {object} opts - Builder settings.
 * @param {string} opts.defaultType - VAD type used when `td.type` is unset.
 * @param {number} opts.defaultSilenceMs - Fallback `silence_duration_ms`.
 * @param {boolean} opts.includeResponseGating - When truthy, also emit
 *   `create_response` / `interrupt_response`.
 * @param {boolean} [opts.gateResponseOnTranscript=false] - When true, both
 *   gating keys are emitted as `false`; otherwise both are `true`.
 * @returns {object} The `turn_detection` payload.
 * @throws {Error} When `td` fails `validateRealtimeTurnDetection`.
 */
function buildTurnDetection(td, opts) {
  // Every read below already tolerates a nullish config via `?.`, so treat
  // an explicit null exactly like an omitted config before validating it.
  const cfg = td ?? void 0;
  validateRealtimeTurnDetection(cfg);
  let detection;
  if (cfg?.type === "semantic_vad") {
    detection = { type: "semantic_vad" };
    if (cfg.eagerness !== void 0) detection.eagerness = cfg.eagerness;
  } else {
    detection = {
      type: cfg?.type ?? opts.defaultType,
      threshold: cfg?.threshold ?? 0.5,
      prefix_padding_ms: cfg?.prefixPaddingMs ?? 300,
      silence_duration_ms: cfg?.silenceDurationMs ?? opts.defaultSilenceMs
    };
  }
  if (opts.includeResponseGating) {
    // Both keys are driven by one switch so they can never disagree.
    const serverManaged = !(opts.gateResponseOnTranscript ?? false);
    detection.create_response = serverManaged;
    detection.interrupt_response = serverManaged;
  }
  return detection;
}
50
89
  var OpenAIRealtimeAdapter = class {
51
90
  constructor(apiKey, model = OpenAIRealtimeModel.GPT_REALTIME_MINI, voice = OpenAIVoice.ALLOY, instructions = "", tools, audioFormat = OpenAIRealtimeAudioFormat.G711_ULAW, options = {}) {
52
91
  this.apiKey = apiKey;
@@ -56,6 +95,7 @@ var OpenAIRealtimeAdapter = class {
56
95
  this.tools = tools;
57
96
  this.audioFormat = audioFormat;
58
97
  this.options = options;
98
+ this.gateResponseOnTranscript = options.gateResponseOnTranscript ?? false;
59
99
  }
60
100
  apiKey;
61
101
  model;
@@ -85,6 +125,23 @@ var OpenAIRealtimeAdapter = class {
85
125
  // could have produced, which is what the user actually heard.
86
126
  currentResponseFirstAudioAt = null;
87
127
  options;
128
+ // When true, the stream handler waits for the Whisper ``transcript_input``
129
+ // event before requesting the model response (legacy behavior). When false
130
+ // (default) the response is requested on ``speech_stopped`` and the
131
+ // transcript is display-only. Read by the stream handler via
132
+ // ``getGateResponseOnTranscript()``.
133
+ gateResponseOnTranscript;
134
+ /**
135
+ * Whether the stream handler should gate the model response on the Whisper
136
+ * transcript (legacy) or fire it on `speech_stopped` (default, decoupled).
137
+ *
138
+ * `false` (default) — the response is requested on `speech_stopped`,
139
+ * independently of Whisper. `true` — the response is requested only after
140
+ * `transcript_input` passes the hallucination filter.
141
+ */
142
+ getGateResponseOnTranscript() {
143
+ return this.gateResponseOnTranscript;
144
+ }
88
145
  /**
89
146
  * Build the production session.update body. Mirrors the body sent
90
147
  * inside `connect()` so warmup can apply identical configuration to
@@ -96,16 +153,26 @@ var OpenAIRealtimeAdapter = class {
96
153
  output_audio_format: this.audioFormat,
97
154
  voice: this.voice,
98
155
  instructions: this.instructions || "You are a helpful voice assistant. Be concise.",
99
- turn_detection: {
100
- type: this.options.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
101
- threshold: 0.5,
102
- prefix_padding_ms: 300,
103
- silence_duration_ms: this.options.silenceDurationMs ?? 300
104
- },
156
+ // v1 turn_detection carries NO create_response / interrupt_response
157
+ // keys. The v1 server defaults (`create_response: true`,
158
+ // `interrupt_response: true`) ARE the server-managed behaviour we want by
159
+ // default, so omitting them is equivalent to sending `true` — gating
160
+ // disabled here. `gateResponseOnTranscript` is still threaded through for
161
+ // symmetry with the GA builder, but has no wire effect while
162
+ // includeResponseGating is false.
163
+ turn_detection: buildTurnDetection(this.options.turnDetection, {
164
+ defaultType: this.options.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
165
+ defaultSilenceMs: this.options.silenceDurationMs ?? 300,
166
+ includeResponseGating: false,
167
+ gateResponseOnTranscript: this.gateResponseOnTranscript
168
+ }),
105
169
  input_audio_transcription: {
106
170
  model: this.options.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
107
171
  }
108
172
  };
173
+ if (this.options.noiseReduction !== void 0) {
174
+ config.input_audio_noise_reduction = { type: this.options.noiseReduction };
175
+ }
109
176
  if (this.options.temperature !== void 0) config.temperature = this.options.temperature;
110
177
  if (this.options.maxResponseOutputTokens !== void 0) {
111
178
  config.max_response_output_tokens = this.options.maxResponseOutputTokens;
@@ -369,6 +436,10 @@ var OpenAIRealtimeAdapter = class {
369
436
  };
370
437
  const timer = setTimeout(() => {
371
438
  cleanup();
439
+ try {
440
+ ws.close();
441
+ } catch {
442
+ }
372
443
  reject(new Error("OpenAI Realtime park connect timeout"));
373
444
  }, 8e3);
374
445
  ws.on("message", onMessage);
@@ -463,20 +534,33 @@ var OpenAIRealtimeAdapter = class {
463
534
  dispatch("error", { type: "socket_error", message: err?.message ?? String(err) });
464
535
  });
465
536
  }
466
- /** Truncate the in-flight assistant turn and cancel the active response.
537
+ /** Truncate the in-flight assistant turn's playback offset on the server.
538
+ *
539
+ * Sends ONLY ``conversation.item.truncate`` — no ``response.cancel``. This
540
+ * is the half of barge-in handling that a WebSocket transport MUST always
541
+ * perform: per OpenAI's docs, the GA server auto-truncates on barge-in only
542
+ * over WebRTC / SIP; on the WebSocket transport the client is responsible
543
+ * for telling the server how much of the assistant turn was actually heard.
544
+ * In server-managed mode (``interrupt_response: true``) the server already
545
+ * cancels the response itself, so issuing ``response.cancel`` here would be
546
+ * redundant / rejected — call this method, not {@link cancelResponse}.
467
547
  *
468
548
  * ``audio_end_ms`` MUST reflect what the caller actually heard, not what
469
549
  * the server generated. OpenAI streams audio at 5-10x real-time, so the
470
550
  * byte-derived counter overstates playback whenever the consumer cleared
471
- * its playout buffer (e.g. ``send_clear``) before the audio reached the
551
+ * its playout buffer (e.g. ``sendClear``) before the audio reached the
472
552
  * speaker. We bound the truncate point by wall-clock time since the first
473
553
  * chunk of this response — that's the physical maximum a 1x real-time
474
554
  * playback could have produced. Without this cap, OpenAI keeps the full
475
555
  * generated assistant text on the transcript, and the model replays /
476
556
  * resumes from it on the next turn — manifesting as re-greetings and
477
557
  * mid-sentence fragments after a barge-in storm.
558
+ *
559
+ * No-op when no response is in flight, keeping it idempotent across stale
560
+ * callers. Resets per-response tracking so post-truncate late frames and
561
+ * the next response start clean.
478
562
  */
479
- cancelResponse() {
563
+ truncate() {
480
564
  if (!this.ws) return;
481
565
  if (!this.currentResponseItemId) {
482
566
  return;
@@ -496,11 +580,31 @@ var OpenAIRealtimeAdapter = class {
496
580
  } catch (err) {
497
581
  getLogger().debug?.(`conversation.item.truncate failed: ${String(err)}`);
498
582
  }
499
- this.ws.send(JSON.stringify({ type: "response.cancel" }));
500
583
  this.currentResponseItemId = null;
501
584
  this.currentResponseAudioMs = 0;
502
585
  this.currentResponseFirstAudioAt = null;
503
586
  }
587
+ /** Truncate the in-flight assistant turn AND cancel the active response.
588
+ *
589
+ * Sends BOTH ``conversation.item.truncate`` (the played-offset bookkeeping)
590
+ * AND ``response.cancel``. Use this on the LEGACY client-managed barge-in
591
+ * path (``gateResponseOnTranscript`` true → ``interrupt_response: false``,
592
+ * so the server does NOT cancel for us) and for explicit cancels driven by
593
+ * Patter (e.g. on transfer / hangup). In server-managed mode call
594
+ * {@link truncate} instead — the server already cancels the response, and an
595
+ * extra ``response.cancel`` would be redundant / rejected.
596
+ *
597
+ * Truncation bounding semantics are identical to {@link truncate}; see its
598
+ * doc comment for the ``audio_end_ms`` wall-clock cap rationale.
599
+ */
600
+ cancelResponse() {
601
+ if (!this.ws) return;
602
+ if (!this.currentResponseItemId) {
603
+ return;
604
+ }
605
+ this.truncate();
606
+ this.ws.send(JSON.stringify({ type: "response.cancel" }));
607
+ }
504
608
  /** Inject a user text turn and request a new response. */
505
609
  async sendText(text) {
506
610
  this.ws?.send(JSON.stringify({
@@ -545,6 +649,32 @@ var OpenAIRealtimeAdapter = class {
545
649
  }
546
650
  }));
547
651
  }
652
+ /**
653
+ * Speak a short reassurance filler WITHOUT injecting a `role:user` turn.
654
+ *
655
+ * Same no-fake-turn shape as {@link sendFirstMessage}: a bare
656
+ * `response.create` carrying explicit `instructions`, so the filler is the
657
+ * assistant's own in-band audio. The reassurance scheduler in the
658
+ * stream-handler routes here instead of {@link sendText} — which would emit
659
+ * a `conversation.item.create` with `role:'user'` and falsely show the
660
+ * caller saying "One moment." in the transcript. Fillers must not imply
661
+ * success or failure.
662
+ *
663
+ * Uses `modalities: ['audio', 'text']` (v1-beta shape). The GA subclass
664
+ * {@link OpenAIRealtime2Adapter} overrides this with `output_modalities`
665
+ * and re-injects `audio.output.voice` so the GA endpoint does not reject
666
+ * the request. Mirrors Python `OpenAIRealtimeAdapter.send_reassurance` in
667
+ * `providers/openai_realtime.py`.
668
+ */
669
+ async sendReassurance(text) {
670
+ this.ws?.send(JSON.stringify({
671
+ type: "response.create",
672
+ response: {
673
+ modalities: ["audio", "text"],
674
+ instructions: `Say exactly this and nothing else: "${text}"`
675
+ }
676
+ }));
677
+ }
548
678
  /** Submit a tool/function-call result and request the next response. */
549
679
  async sendFunctionResult(callId, result) {
550
680
  this.ws?.send(JSON.stringify({
@@ -727,7 +857,12 @@ var StatefulResampler = class {
727
857
  * Resets all state after flushing.
728
858
  */
729
859
  flush() {
730
- this.carry.flush();
860
+ const carryTail = this.carry.flush();
861
+ if (carryTail.length > 0) {
862
+ getLogger().warn(
863
+ "[patter] StatefulResampler.flush: trailing odd byte discarded \u2014 upstream produced odd-length PCM stream"
864
+ );
865
+ }
731
866
  if (this.srcRate === 16e3 && this.dstRate === 8e3 && this.firPendingSample !== null) {
732
867
  const s = this.firPendingSample;
733
868
  const tmp = Buffer.alloc(4);
@@ -1012,44 +1147,46 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1012
1147
  buildGASessionConfig() {
1013
1148
  const opts = this.options;
1014
1149
  const fmt = { type: "audio/pcm", rate: 24e3 };
1150
+ const audioInput = {
1151
+ format: fmt,
1152
+ transcription: {
1153
+ model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
1154
+ },
1155
+ // Response creation + barge-in cancellation (issue #154 — hand
1156
+ // turn-taking to the server by default):
1157
+ // - DEFAULT (`gateResponseOnTranscript` false → SERVER-MANAGED):
1158
+ // `create_response: true` lets the SERVER auto-create the response
1159
+ // when it commits the user's audio buffer
1160
+ // (`input_audio_buffer.committed`). `interrupt_response: true` lets the
1161
+ // SERVER cancel the in-flight response on its own VAD `speech_started`.
1162
+ // The e2e model replies immediately, in parallel with the Whisper
1163
+ // transcript — no transcript wait (~500 ms reclaimed), no client-side
1164
+ // race. On a WebSocket transport the client STILL must clear the
1165
+ // carrier buffer (`sendClear`) and `conversation.item.truncate` the
1166
+ // played offset on barge-in (the server only auto-truncates on
1167
+ // WebRTC/SIP), but it does NOT send `response.cancel`. Whisper is
1168
+ // display-only — it can never trigger / gate / cancel the response.
1169
+ // - LEGACY (`gateResponseOnTranscript` true → CLIENT-MANAGED opt-out):
1170
+ // `create_response: false` + `interrupt_response: false` so the stream
1171
+ // handler drives `response.create` (after the hallucination filter)
1172
+ // and `response.cancel` (on barge-in) itself. Escape hatch for no-AEC
1173
+ // PSTN self-interruption. Both keys are tied to the same switch inside
1174
+ // `buildTurnDetection`.
1175
+ turn_detection: buildTurnDetection(opts.turnDetection, {
1176
+ defaultType: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
1177
+ defaultSilenceMs: opts.silenceDurationMs ?? 300,
1178
+ includeResponseGating: true,
1179
+ gateResponseOnTranscript: this.getGateResponseOnTranscript()
1180
+ })
1181
+ };
1182
+ if (opts.noiseReduction !== void 0) {
1183
+ audioInput.noise_reduction = { type: opts.noiseReduction };
1184
+ }
1015
1185
  const config = {
1016
1186
  type: "realtime",
1017
1187
  output_modalities: opts.modalities ?? ["audio"],
1018
1188
  audio: {
1019
- input: {
1020
- format: fmt,
1021
- transcription: {
1022
- model: opts.inputAudioTranscriptionModel ?? OpenAITranscriptionModel.WHISPER_1
1023
- },
1024
- // VAD threshold raised back to the OpenAI default (0.5) on
1025
- // 2026-05-22. The earlier 0.1 tuning (motivated by the
1026
- // upsampled telephony-band loss in high frequencies) made the
1027
- // server VAD trigger on the carrier-loopback echo of the
1028
- // agent's OWN outbound audio in PSTN no-AEC scenarios.
1029
- // Combined with the default ``turn_detection.create_response:
1030
- // true``, every phantom ``speech_started`` ended a turn early
1031
- // and auto-created a new response that the agent immediately
1032
- // spoke over, leading to a runaway loop where the first
1033
- // message was repeatedly cut and re-generated.
1034
- turn_detection: {
1035
- type: opts.vadType ?? OpenAIRealtimeVADType.SERVER_VAD,
1036
- threshold: 0.5,
1037
- prefix_padding_ms: 300,
1038
- silence_duration_ms: opts.silenceDurationMs ?? 500,
1039
- // Defer ``response.create`` to the application: when OpenAI's
1040
- // server VAD commits an ``input_audio_buffer.committed`` segment
1041
- // that turns out to be a Whisper hallucination on silence/echo,
1042
- // auto-creating a response would generate a phantom turn (the
1043
- // model reads the hallucinated text as user input). Patter
1044
- // triggers ``response.create`` explicitly in the Realtime
1045
- // stream-handler AFTER validating ``transcript_input`` against
1046
- // the hallucination filter. Pair with ``interrupt_response:
1047
- // false`` so server VAD also leaves in-flight responses alone —
1048
- // barge-in is gated client-side.
1049
- create_response: false,
1050
- interrupt_response: false
1051
- }
1052
- },
1189
+ input: audioInput,
1053
1190
  output: {
1054
1191
  format: fmt,
1055
1192
  voice: this.voice
@@ -1102,14 +1239,7 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1102
1239
  if (t && t in GA_TO_V1_EVENT_NAMES) {
1103
1240
  const newType = GA_TO_V1_EVENT_NAMES[t];
1104
1241
  if (t === "response.output_audio.delta" && typeof parsed.delta === "string") {
1105
- const mulaw = this.transcodeOutboundPcm24ToMulaw8Buffer(parsed.delta);
1106
- const FRAME_BYTES = 160;
1107
- if (mulaw.length === 0) return;
1108
- for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
1109
- const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
1110
- const frame = { ...parsed, type: newType, delta: slice.toString("base64") };
1111
- handler(Buffer.from(JSON.stringify(frame)), ...rest);
1112
- }
1242
+ this.translateGaAudioDelta(parsed, handler, rest);
1113
1243
  return;
1114
1244
  }
1115
1245
  parsed.type = newType;
@@ -1138,6 +1268,7 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1138
1268
  sessionCreated = true;
1139
1269
  ws.send(JSON.stringify({ type: "session.update", session: this.buildGASessionConfig() }));
1140
1270
  } else if (msg.type === "session.updated") {
1271
+ this.warnIfOutputFormatUnexpected(msg);
1141
1272
  cleanup();
1142
1273
  resolve();
1143
1274
  } else if (msg.type === "error") {
@@ -1243,6 +1374,10 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1243
1374
  };
1244
1375
  const timer = setTimeout(() => {
1245
1376
  cleanup();
1377
+ try {
1378
+ ws.close();
1379
+ } catch {
1380
+ }
1246
1381
  reject(new Error("OpenAI Realtime 2 park connect timeout"));
1247
1382
  }, 8e3);
1248
1383
  ws.on("message", onMessage);
@@ -1290,8 +1425,12 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1290
1425
  const parsed = JSON.parse(text);
1291
1426
  const t = parsed.type;
1292
1427
  if (t && Object.prototype.hasOwnProperty.call(GA_TO_V1_EVENT_NAMES, t)) {
1428
+ if (t === "response.output_audio.delta" && typeof parsed.delta === "string") {
1429
+ this.translateGaAudioDelta(parsed, handler, rest);
1430
+ return;
1431
+ }
1293
1432
  parsed.type = GA_TO_V1_EVENT_NAMES[t];
1294
- handler(JSON.stringify(parsed), ...rest);
1433
+ handler(Buffer.from(JSON.stringify(parsed)), ...rest);
1295
1434
  return;
1296
1435
  }
1297
1436
  } catch {
@@ -1376,6 +1515,55 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1376
1515
  }
1377
1516
  return out;
1378
1517
  }
1518
+ /**
1519
+ * Log-only safety net for issue #154. The GA server echoes the *effective*
1520
+ * session config in `session.updated`; we request `audio/pcm` @ 24 kHz and
1521
+ * transcode PCM24→mulaw8 ourselves (see
1522
+ * `transcodeOutboundPcm24ToMulaw8Buffer`). If a future GA schema change ever
1523
+ * made the server return a different output format, that transcode — which
1524
+ * assumes PCM16-LE @ 24 kHz — would silently corrupt audio, exactly the
1525
+ * v1-beta failure mode #154 fixed. Warn so the drift surfaces in logs instead
1526
+ * of as static. Never gates audio.
1527
+ */
1528
+ warnIfOutputFormatUnexpected(msg) {
1529
+ const fmt = msg?.session?.audio?.output?.format;
1530
+ if (!fmt || typeof fmt !== "object") return;
1531
+ if (fmt.type !== "audio/pcm" || fmt.rate != null && fmt.rate !== 24e3) {
1532
+ getLogger().warn(
1533
+ `OpenAI Realtime 2: server-echoed output format ${JSON.stringify(fmt)} differs from the requested audio/pcm@24000 \u2014 the outbound PCM24\u2192mulaw8 transcode assumes PCM16-LE 24 kHz, so carrier audio may be garbled (issue #154). Informational only; audio is not gated on this.`
1534
+ );
1535
+ }
1536
+ }
1537
+ /**
1538
+ * Shared audio-delta translation helper. Transcodes a GA
1539
+ * `response.output_audio.delta` payload (base64 PCM-16-LE 24 kHz)
1540
+ * into mulaw 8 kHz and splits the result into 160-byte (20 ms) frames,
1541
+ * dispatching one synthetic `response.audio.delta` event per frame.
1542
+ *
1543
+ * Called from BOTH the `connect()` shim and the `adoptWebSocket()` shim
1544
+ * so that warm-path (prewarm/adopted) calls receive identical transcoding
1545
+ * to cold-path calls. Without this, adopted sockets forwarded raw PCM-24
1546
+ * to Twilio/Telnyx, producing garbled or silent audio on every warm call.
1547
+ *
1548
+ * @param parsed - The parsed GA event object (type already checked to be
1549
+ * `response.output_audio.delta` with a string `delta`).
1550
+ * @param handler - The downstream message listener to dispatch each frame to.
1551
+ * @param rest - Extra arguments forwarded from the original `message` event.
1552
+ * @returns `true` if frames were dispatched (caller should return early),
1553
+ * `false` if the resampler is still warming up (zero output bytes).
1554
+ */
1555
+ translateGaAudioDelta(parsed, handler, rest) {
1556
+ const newType = GA_TO_V1_EVENT_NAMES["response.output_audio.delta"];
1557
+ const mulaw = this.transcodeOutboundPcm24ToMulaw8Buffer(parsed.delta);
1558
+ const FRAME_BYTES = 160;
1559
+ if (mulaw.length === 0) return false;
1560
+ for (let off = 0; off < mulaw.length; off += FRAME_BYTES) {
1561
+ const slice = mulaw.subarray(off, Math.min(off + FRAME_BYTES, mulaw.length));
1562
+ const frame = { ...parsed, type: newType, delta: slice.toString("base64") };
1563
+ handler(Buffer.from(JSON.stringify(frame)), ...rest);
1564
+ }
1565
+ return true;
1566
+ }
1379
1567
  /**
1380
1568
  * Base64 PCM-16-LE 24 kHz → Base64 mulaw 8 kHz. Used by the WS
1381
1569
  * translation shim on each `response.output_audio.delta`. The stateful
@@ -1405,6 +1593,34 @@ var OpenAIRealtime2Adapter = class extends OpenAIRealtimeAdapter {
1405
1593
  }
1406
1594
  this.ws?.send(JSON.stringify({ type: "response.create", response: responseBody }));
1407
1595
  }
1596
+ /**
1597
+ * Speak a short reassurance filler WITHOUT injecting a `role:user` turn.
1598
+ *
1599
+ * GA-shape sibling of {@link sendFirstMessage} (and override of the base v1
1600
+ * {@link OpenAIRealtimeAdapter.sendReassurance}): a bare `response.create`
1601
+ * carrying explicit `instructions` so the filler is the assistant's own
1602
+ * in-band audio. No `conversation.item.create` with `role:"user"` is
1603
+ * emitted, so the transcript shows no phantom caller line. The GA endpoint
1604
+ * rejects `response.modalities` and does not inherit `audio.output.voice`
1605
+ * for an explicit `response.create`, so — exactly as in
1606
+ * {@link sendFirstMessage} — we send `output_modalities` and re-inject the
1607
+ * voice. Fillers must not imply success or failure.
1608
+ *
1609
+ * Mirrors Python `OpenAIRealtime2Adapter.send_reassurance` in
1610
+ * `providers/openai_realtime_2.py`.
1611
+ */
1612
+ async sendReassurance(text) {
1613
+ if (!this.ws) return;
1614
+ const responseBody = {
1615
+ output_modalities: ["audio"],
1616
+ audio: { output: { voice: this.voice } },
1617
+ instructions: `Say exactly this and nothing else: "${text}"`
1618
+ };
1619
+ if (this.options.reasoningEffort !== void 0) {
1620
+ responseBody.reasoning = { effort: this.options.reasoningEffort };
1621
+ }
1622
+ this.ws.send(JSON.stringify({ type: "response.create", response: responseBody }));
1623
+ }
1408
1624
  };
1409
1625
 
1410
1626
  export {
@@ -1413,6 +1629,7 @@ export {
1413
1629
  OpenAIVoice,
1414
1630
  OpenAITranscriptionModel,
1415
1631
  OpenAIRealtimeVADType,
1632
+ validateRealtimeTurnDetection,
1416
1633
  OpenAIRealtimeAdapter,
1417
1634
  mulawToPcm16,
1418
1635
  pcm16ToMulaw,
package/dist/cli.js CHANGED
@@ -185,14 +185,49 @@ var MetricsStore = class extends import_events.EventEmitter {
185
185
  } else {
186
186
  for (let i = this.calls.length - 1; i >= 0; i--) {
187
187
  if (this.calls[i].call_id === callId) {
188
- this.calls[i].status = status;
189
- Object.assign(this.calls[i], extra);
188
+ this.calls[i] = { ...this.calls[i], status, ...extra };
190
189
  break;
191
190
  }
192
191
  }
193
192
  }
194
193
  this.publish("call_status", { call_id: callId, status, ...extra });
195
194
  }
195
+ /**
196
+ * Record a single transcript line (user/assistant) as it becomes known.
197
+ *
198
+ * FIX-5 (issue #154): the live forward path for the dashboard transcript.
199
+ * The Realtime stream handler calls this the moment each line is known — the
200
+ * user line right after the hallucination filter accepts it, the assistant
201
+ * line when its turn flushes — keyed by the monotonic ``turnIndex`` reserved
202
+ * at turn-open (``reserveTurnIndex``). Each line is appended to the active
203
+ * call's ``transcript`` array and broadcast over SSE as a ``transcript_line``
204
+ * event so the dashboard can render lines as they arrive and re-sort by
205
+ * ``(turnIndex, user<assistant)`` — making a late-arriving user line land
206
+ * ABOVE its agent line. ``recordTurn`` de-dups against the lines pushed here
207
+ * by ``(turnIndex, role)`` so the metrics path never double-pushes the same
208
+ * text. Parity with Python ``record_transcript_line``.
209
+ */
210
+ recordTranscriptLine(data) {
211
+ const callId = data.call_id || "";
212
+ const { role, text, turnIndex } = data;
213
+ if (!callId || role !== "user" && role !== "assistant" || !text) return;
214
+ const active = this.activeCalls.get(callId);
215
+ if (active) {
216
+ if (!active.transcript) active.transcript = [];
217
+ active.transcript.push({
218
+ role,
219
+ text,
220
+ timestamp: Date.now() / 1e3,
221
+ turnIndex
222
+ });
223
+ }
224
+ this.publish("transcript_line", {
225
+ call_id: callId,
226
+ turnIndex,
227
+ role,
228
+ text
229
+ });
230
+ }
196
231
  /** Append a single conversation turn to an active call and broadcast it via SSE. */
197
232
  recordTurn(data) {
198
233
  const callId = data.call_id || "";
@@ -207,14 +242,19 @@ var MetricsStore = class extends import_events.EventEmitter {
207
242
  const userText = typeof turnRecord.user_text === "string" ? turnRecord.user_text : "";
208
243
  const agentText = typeof turnRecord.agent_text === "string" ? turnRecord.agent_text : "";
209
244
  const ts = typeof turnRecord.timestamp === "number" ? turnRecord.timestamp : Date.now() / 1e3;
210
- if (userText.length > 0) {
211
- active.transcript.push({ role: "user", text: userText, timestamp: ts });
245
+ const turnIndex = typeof turnRecord.turn_index === "number" ? turnRecord.turn_index : void 0;
246
+ const alreadyLive = (role) => turnIndex !== void 0 && (active.transcript ?? []).some(
247
+ (e) => e.turnIndex === turnIndex && e.role === role
248
+ );
249
+ if (userText.length > 0 && !alreadyLive("user")) {
250
+ active.transcript.push({ role: "user", text: userText, timestamp: ts, turnIndex });
212
251
  }
213
- if (agentText.length > 0 && agentText !== "[interrupted]") {
252
+ if (agentText.length > 0 && agentText !== "[interrupted]" && !alreadyLive("assistant")) {
214
253
  active.transcript.push({
215
254
  role: "assistant",
216
255
  text: agentText,
217
- timestamp: ts
256
+ timestamp: ts,
257
+ turnIndex
218
258
  });
219
259
  }
220
260
  }
@@ -287,7 +327,7 @@ var MetricsStore = class extends import_events.EventEmitter {
287
327
  getCall(callId) {
288
328
  if (this.deletedCallIds.has(callId)) return null;
289
329
  for (let i = this.calls.length - 1; i >= 0; i--) {
290
- if (this.calls[i].call_id === callId) return this.calls[i];
330
+ if (this.calls[i].call_id === callId) return { ...this.calls[i] };
291
331
  }
292
332
  return null;
293
333
  }
@@ -329,7 +369,9 @@ var MetricsStore = class extends import_events.EventEmitter {
329
369
  }
330
370
  if (accepted.length === 0) return [];
331
371
  accepted.sort();
332
- this.persistDeletedIds();
372
+ this.persistDeletedIds().catch(
373
+ (err) => getLogger().debug(`MetricsStore.deleteCalls: persistDeletedIds failed: ${String(err)}`)
374
+ );
333
375
  this.publish("calls_deleted", { call_ids: accepted });
334
376
  return accepted;
335
377
  }
@@ -341,19 +383,19 @@ var MetricsStore = class extends import_events.EventEmitter {
341
383
  getDeletedCallIds() {
342
384
  return Array.from(this.deletedCallIds).sort();
343
385
  }
344
- /** Atomically persist the deleted-ids set to disk. Best-effort. */
345
- persistDeletedIds() {
386
+ /** Atomically persist the deleted-ids set to disk. Best-effort async. */
387
+ async persistDeletedIds() {
346
388
  if (this.deletedIdsPath === null) return;
347
389
  try {
348
390
  const dir = path2.dirname(this.deletedIdsPath);
349
- fs2.mkdirSync(dir, { recursive: true });
391
+ await fs2.promises.mkdir(dir, { recursive: true });
350
392
  const tmp = this.deletedIdsPath + ".tmp";
351
393
  const payload = {
352
394
  version: 1,
353
395
  deleted_call_ids: Array.from(this.deletedCallIds).sort()
354
396
  };
355
- fs2.writeFileSync(tmp, JSON.stringify(payload, null, 2), "utf8");
356
- fs2.renameSync(tmp, this.deletedIdsPath);
397
+ await fs2.promises.writeFile(tmp, JSON.stringify(payload, null, 2), "utf8");
398
+ await fs2.promises.rename(tmp, this.deletedIdsPath);
357
399
  } catch (err) {
358
400
  getLogger().debug(
359
401
  `MetricsStore.persistDeletedIds: ${String(err)}`
@@ -362,7 +404,8 @@ var MetricsStore = class extends import_events.EventEmitter {
362
404
  }
363
405
  /** Look up an active call by id (returns undefined if not active or unknown). */
364
406
  getActive(callId) {
365
- return this.activeCalls.get(callId);
407
+ const rec = this.activeCalls.get(callId);
408
+ return rec !== void 0 ? { ...rec } : void 0;
366
409
  }
367
410
  /** Return all currently active (not yet ended) calls. */
368
411
  getActiveCalls() {
@@ -607,8 +650,8 @@ function loadTranscriptJsonl(filePath) {
607
650
  } catch {
608
651
  continue;
609
652
  }
610
- const tsIso = typeof row.ts === "string" ? Date.parse(row.ts) : NaN;
611
- const tsNumeric = typeof row.timestamp === "number" ? row.timestamp * 1e3 : NaN;
653
+ const tsIso = typeof row.ts === "string" ? Date.parse(row.ts) / 1e3 : NaN;
654
+ const tsNumeric = typeof row.timestamp === "number" ? row.timestamp : NaN;
612
655
  const timestamp = Number.isFinite(tsIso) ? tsIso : Number.isFinite(tsNumeric) ? tsNumeric : 0;
613
656
  const userText = typeof row.user_text === "string" ? row.user_text : "";
614
657
  const agentText = typeof row.agent_text === "string" ? row.agent_text : "";
@@ -759,8 +802,8 @@ function mountDashboard(app, store, token = "") {
759
802
  res.type("text/html").send(DASHBOARD_HTML);
760
803
  });
761
804
  app.get("/api/dashboard/calls", auth, (req, res) => {
762
- const limit = Math.min(parseInt(req.query.limit || "50", 10) || 50, 1e3);
763
- const offset = parseInt(req.query.offset || "0", 10) || 0;
805
+ const limit = Math.min(Math.max(0, parseInt(req.query.limit || "50", 10) || 50), 1e3);
806
+ const offset = Math.max(0, parseInt(req.query.offset || "0", 10) || 0);
764
807
  res.json(store.getCalls(limit, offset));
765
808
  });
766
809
  app.get("/api/dashboard/calls/:callId", auth, (req, res) => {
@@ -850,8 +893,8 @@ data: ${data}
850
893
  function mountApi(app, store, token = "") {
851
894
  const auth = makeAuthMiddleware(token);
852
895
  app.get("/api/v1/calls", auth, (req, res) => {
853
- const limit = Math.min(parseInt(req.query.limit || "50", 10) || 50, 1e3);
854
- const offset = parseInt(req.query.offset || "0", 10) || 0;
896
+ const limit = Math.min(Math.max(0, parseInt(req.query.limit || "50", 10) || 50), 1e3);
897
+ const offset = Math.max(0, parseInt(req.query.offset || "0", 10) || 0);
855
898
  const calls = store.getCalls(limit, offset);
856
899
  res.json({
857
900
  data: calls,